diff options
| author | Harsh Aggarwal (NVIDIA) <haaggarwal@nvidia.com> | 2025-08-01 14:48:53 +0530 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-08-01 09:18:53 +0000 |
| commit | bdda8a90cdd44ca822b21233ac988f92d1f20826 (patch) | |
| tree | 4f628776a047b2719aee676287b3adc9d4f68f7a /prelude | |
| parent | 7a133445ef82232a6961c25d09b79f45cd92e709 (diff) | |
Fix 7441: CUDA boolean vector layout to use 1-byte elements (#7862)
* Fix 7441: CUDA boolean vector layout to use 1-byte elements
Boolean vectors (bool1, bool2, bool3, bool4) were incorrectly implemented
as integer-based types using 4 bytes per element instead of actual 1-byte
boolean elements on CUDA targets.
Changes:
- Update CUDA prelude to define boolean vectors as structs with bool fields
instead of typedef aliases to integer vectors
- Implement CUDALayoutRulesImpl::GetVectorLayout to use 1-byte alignment
for boolean vectors, matching actual CUDA memory layout behavior
- Update make_bool functions to populate struct fields correctly
This ensures boolean vectors have the same memory layout as bool[4] arrays:
- bool1: 1 byte (was 4 bytes)
- bool2: 2 bytes (was 8 bytes)
- bool3: 3 bytes (was 12 bytes)
- bool4: 4 bytes (was 16 bytes)
Fixes memory layout mismatch between Slang reflection API and actual
CUDA compilation, achieving 75% memory savings for boolean vector usage.
* Fix CI issues -
Add and update associated functions and operators
* Make boolX same as uchar
* Use align construct on struct for boolX
* Improve Test case for robust alignment checks
* Formatting
* Disable selected slangpy tests
* add metal check which is slightly different than cuda
* Test-1
* Test-2
* Test-3
* Test-4
* ReflectionChange
* cleanup and update
* _slang_select with plain bool is needed for reverse-loop-checkpoint-test
Diffstat (limited to 'prelude')
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 85 |
1 files changed, 72 insertions, 13 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 3ebdbe777..178c12f5f 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -190,10 +190,65 @@ typedef size_t NonUniformResourceIndex; template<typename T, int ROWS, int COLS> struct Matrix; -typedef int1 bool1; -typedef int2 bool2; -typedef int3 bool3; -typedef int4 bool4; +// Boolean vector types should follow CUDA's builtin vector alignment rules +// Align boolX the same as charX according to CUDA spec: +// char1/uchar1: 1-byte aligned, char2/uchar2: 2-byte aligned +// char3/uchar3: 1-byte aligned, char4/uchar4: 4-byte aligned +struct __align__(1) bool1 +{ + bool x; + + SLANG_FORCE_INLINE SLANG_CUDA_CALL bool& operator[](int idx) + { + return (&x)[idx]; + } + SLANG_FORCE_INLINE SLANG_CUDA_CALL const bool& operator[](int idx) const + { + return (&x)[idx]; + } +}; + +struct __align__(2) bool2 +{ + bool x, y; + + SLANG_FORCE_INLINE SLANG_CUDA_CALL bool& operator[](int idx) + { + return (&x)[idx]; + } + SLANG_FORCE_INLINE SLANG_CUDA_CALL const bool& operator[](int idx) const + { + return (&x)[idx]; + } +}; + +struct __align__(1) bool3 +{ + bool x, y, z; + + SLANG_FORCE_INLINE SLANG_CUDA_CALL bool& operator[](int idx) + { + return (&x)[idx]; + } + SLANG_FORCE_INLINE SLANG_CUDA_CALL const bool& operator[](int idx) const + { + return (&x)[idx]; + } +}; + +struct __align__(4) bool4 +{ + bool x, y, z, w; + + SLANG_FORCE_INLINE SLANG_CUDA_CALL bool& operator[](int idx) + { + return (&x)[idx]; + } + SLANG_FORCE_INLINE SLANG_CUDA_CALL const bool& operator[](int idx) const + { + return (&x)[idx]; + } +}; #if SLANG_CUDA_RTC @@ -282,6 +337,7 @@ struct __align__(4) __half4 return ((T*)(&x))[index]; \ } SLANG_VECTOR_GET_ELEMENT(int) +SLANG_VECTOR_GET_ELEMENT(bool) SLANG_VECTOR_GET_ELEMENT(uint) SLANG_VECTOR_GET_ELEMENT(short) SLANG_VECTOR_GET_ELEMENT(ushort) @@ -310,6 +366,7 @@ SLANG_VECTOR_GET_ELEMENT(double) return ((T*)(x)) + index; \ } SLANG_VECTOR_GET_ELEMENT_PTR(int) +SLANG_VECTOR_GET_ELEMENT_PTR(bool) SLANG_VECTOR_GET_ELEMENT_PTR(uint) SLANG_VECTOR_GET_ELEMENT_PTR(short) SLANG_VECTOR_GET_ELEMENT_PTR(ushort) @@ -334,15 +391,14 @@ SLANG_VECTOR_GET_ELEMENT_PTR(__half) _slang_vector_get_element(thisVal, i) op _slang_vector_get_element(other, i); \ return result; \ } -#define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op) \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL bool##n operator op(T##n thisVal, T##n other) \ - { \ - bool##n result; \ - for (int i = 0; i < n; i++) \ - *_slang_vector_get_element_ptr(&result, i) = \ - (int)(_slang_vector_get_element(thisVal, i) \ - op _slang_vector_get_element(other, i)); \ - return result; \ +#define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL bool##n operator op(T##n thisVal, T##n other) \ + { \ + bool##n result; \ + for (int i = 0; i < n; i++) \ + *_slang_vector_get_element_ptr(&result, i) = \ + (_slang_vector_get_element(thisVal, i) op _slang_vector_get_element(other, i)); \ + return result; \ } #define SLANG_CUDA_VECTOR_UNARY_OP(T, n, op) \ SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal) \ @@ -382,6 +438,7 @@ SLANG_VECTOR_GET_ELEMENT_PTR(__half) SLANG_CUDA_VECTOR_INT_OP(T, 4) SLANG_CUDA_VECTOR_INT_OPS(int) +SLANG_CUDA_VECTOR_INT_OPS(bool) SLANG_CUDA_VECTOR_INT_OPS(uint) SLANG_CUDA_VECTOR_INT_OPS(ushort) SLANG_CUDA_VECTOR_INT_OPS(short) @@ -594,6 +651,7 @@ struct GetVectorTypeImpl GET_VECTOR_TYPE_IMPL(T, 4) GET_VECTOR_TYPE_IMPL_N(int) +GET_VECTOR_TYPE_IMPL_N(bool) GET_VECTOR_TYPE_IMPL_N(uint) GET_VECTOR_TYPE_IMPL_N(short) GET_VECTOR_TYPE_IMPL_N(ushort) @@ -1039,6 +1097,7 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<__half, R, C> operator%( SLANG_SELECT_IMPL(T, 4) SLANG_SELECT_T(int) +SLANG_SELECT_T(bool) SLANG_SELECT_T(uint) SLANG_SELECT_T(short) SLANG_SELECT_T(ushort) |
