Fix 7441: CUDA boolean vector layout to use 1-byte elements (#7862)

* Fix 7441: CUDA boolean vector layout to use 1-byte elements Boolean vectors (bool1, bool2, bool3, bool4) were incorrectly implemented as integer-based types using 4 bytes per element instead of actual 1-byte boolean elements on CUDA targets. Changes: - Update CUDA prelude to define boolean vectors as structs with bool fields instead of typedef aliases to integer vectors - Implement CUDALayoutRulesImpl::GetVectorLayout to use 1-byte alignment for boolean vectors, matching actual CUDA memory layout behavior - Update make_bool functions to populate struct fields correctly This ensures boolean vectors have the same memory layout as bool[4] arrays: - bool1: 1 byte (was 4 bytes) - bool2: 2 bytes (was 8 bytes) - bool3: 3 bytes (was 12 bytes) - bool4: 4 bytes (was 16 bytes) Fixes memory layout mismatch between Slang reflection API and actual CUDA compilation, achieving 75% memory savings for boolean vector usage. * Fix CI issues - Add and update associated functions and operators * Make boolX same as uchar * Use align construct on struct for boolX * Improve Test case for robust alignment checks * Formatting * Disable selected slangpy tests * add metal check which is slightly different than cuda * Test-1 * Test-2 * Test-3 * Test-4 * ReflectionChange * cleanup and update * _slang_select with plain bool is needed for reverse-loop-checkpoint-test
author: Harsh Aggarwal (NVIDIA) <haaggarwal@nvidia.com> 2025-08-01 14:48:53 +0530
committer: GitHub <noreply@github.com> 2025-08-01 09:18:53 +0000
commit: bdda8a90cdd44ca822b21233ac988f92d1f20826 (patch)
tree: 4f628776a047b2719aee676287b3adc9d4f68f7a /prelude
parent: 7a133445ef82232a6961c25d09b79f45cd92e709 (diff)
1 files changed, 72 insertions, 13 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 3ebdbe777..178c12f5f 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -190,10 +190,65 @@ typedef size_t NonUniformResourceIndex;
 template<typename T, int ROWS, int COLS>
 struct Matrix;
 
-typedef int1 bool1;
-typedef int2 bool2;
-typedef int3 bool3;
-typedef int4 bool4;
+// Boolean vector types should follow CUDA's builtin vector alignment rules
+// Align boolX the same as charX according to CUDA spec:
+// char1/uchar1: 1-byte aligned, char2/uchar2: 2-byte aligned
+// char3/uchar3: 1-byte aligned, char4/uchar4: 4-byte aligned
+struct __align__(1) bool1
+{
+    bool x;
+
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL bool& operator[](int idx)
+    {
+        return (&x)[idx];
+    }
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL const bool& operator[](int idx) const
+    {
+        return (&x)[idx];
+    }
+};
+
+struct __align__(2) bool2
+{
+    bool x, y;
+
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL bool& operator[](int idx)
+    {
+        return (&x)[idx];
+    }
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL const bool& operator[](int idx) const
+    {
+        return (&x)[idx];
+    }
+};
+
+struct __align__(1) bool3
+{
+    bool x, y, z;
+
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL bool& operator[](int idx)
+    {
+        return (&x)[idx];
+    }
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL const bool& operator[](int idx) const
+    {
+        return (&x)[idx];
+    }
+};
+
+struct __align__(4) bool4
+{
+    bool x, y, z, w;
+
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL bool& operator[](int idx)
+    {
+        return (&x)[idx];
+    }
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL const bool& operator[](int idx) const
+    {
+        return (&x)[idx];
+    }
+};
 
 #if SLANG_CUDA_RTC
 
@@ -282,6 +337,7 @@ struct __align__(4) __half4
         return ((T*)(&x))[index];                                                     \
     }
 SLANG_VECTOR_GET_ELEMENT(int)
+SLANG_VECTOR_GET_ELEMENT(bool)
 SLANG_VECTOR_GET_ELEMENT(uint)
 SLANG_VECTOR_GET_ELEMENT(short)
 SLANG_VECTOR_GET_ELEMENT(ushort)
@@ -310,6 +366,7 @@ SLANG_VECTOR_GET_ELEMENT(double)
         return ((T*)(x)) + index;                                                            \
     }
 SLANG_VECTOR_GET_ELEMENT_PTR(int)
+SLANG_VECTOR_GET_ELEMENT_PTR(bool)
 SLANG_VECTOR_GET_ELEMENT_PTR(uint)
 SLANG_VECTOR_GET_ELEMENT_PTR(short)
 SLANG_VECTOR_GET_ELEMENT_PTR(ushort)
@@ -334,15 +391,14 @@ SLANG_VECTOR_GET_ELEMENT_PTR(__half)
                 _slang_vector_get_element(thisVal, i) op _slang_vector_get_element(other, i); \
         return result;                                                                        \
     }
-#define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op)                                \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL bool##n operator op(T##n thisVal, T##n other) \
-    {                                                                                \
-        bool##n result;                                                              \
-        for (int i = 0; i < n; i++)                                                  \
-            *_slang_vector_get_element_ptr(&result, i) =                             \
-                (int)(_slang_vector_get_element(thisVal, i)                          \
-                          op _slang_vector_get_element(other, i));                   \
-        return result;                                                               \
+#define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op)                                           \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL bool##n operator op(T##n thisVal, T##n other)            \
+    {                                                                                           \
+        bool##n result;                                                                         \
+        for (int i = 0; i < n; i++)                                                             \
+            *_slang_vector_get_element_ptr(&result, i) =                                        \
+                (_slang_vector_get_element(thisVal, i) op _slang_vector_get_element(other, i)); \
+        return result;                                                                          \
     }
 #define SLANG_CUDA_VECTOR_UNARY_OP(T, n, op)                                                       \
     SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal)                              \
@@ -382,6 +438,7 @@ SLANG_VECTOR_GET_ELEMENT_PTR(__half)
     SLANG_CUDA_VECTOR_INT_OP(T, 4)
 
 SLANG_CUDA_VECTOR_INT_OPS(int)
+SLANG_CUDA_VECTOR_INT_OPS(bool)
 SLANG_CUDA_VECTOR_INT_OPS(uint)
 SLANG_CUDA_VECTOR_INT_OPS(ushort)
 SLANG_CUDA_VECTOR_INT_OPS(short)
@@ -594,6 +651,7 @@ struct GetVectorTypeImpl
     GET_VECTOR_TYPE_IMPL(T, 4)
 
 GET_VECTOR_TYPE_IMPL_N(int)
+GET_VECTOR_TYPE_IMPL_N(bool)
 GET_VECTOR_TYPE_IMPL_N(uint)
 GET_VECTOR_TYPE_IMPL_N(short)
 GET_VECTOR_TYPE_IMPL_N(ushort)
@@ -1039,6 +1097,7 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<__half, R, C> operator%(
     SLANG_SELECT_IMPL(T, 4)
 
 SLANG_SELECT_T(int)
+SLANG_SELECT_T(bool)
 SLANG_SELECT_T(uint)
 SLANG_SELECT_T(short)
 SLANG_SELECT_T(ushort)
author	Harsh Aggarwal (NVIDIA) <haaggarwal@nvidia.com>	2025-08-01 14:48:53 +0530
committer	GitHub <noreply@github.com>	2025-08-01 09:18:53 +0000
commit	bdda8a90cdd44ca822b21233ac988f92d1f20826 (patch)
tree	4f628776a047b2719aee676287b3adc9d4f68f7a /prelude
parent	7a133445ef82232a6961c25d09b79f45cd92e709 (diff)