12 files changed, 656 insertions, 81 deletions
diff --git a/prelude/slang-cpp-prelude.h b/prelude/slang-cpp-prelude.h
index 725be4b42..ffd18cf32 100644
--- a/prelude/slang-cpp-prelude.h
+++ b/prelude/slang-cpp-prelude.h
@@ -36,6 +36,81 @@
 #   define SLANG_INFINITY   INFINITY
 #endif
 
+// Detect the compiler type
+
+#ifndef SLANG_COMPILER
+#    define SLANG_COMPILER
+
+/*
+Compiler defines, see http://sourceforge.net/p/predef/wiki/Compilers/
+NOTE that SLANG_VC holds the compiler version - not just 1 or 0
+*/
+#    if defined(_MSC_VER)
+#        if _MSC_VER >= 1900
+#            define SLANG_VC 14
+#        elif _MSC_VER >= 1800
+#            define SLANG_VC 12
+#        elif _MSC_VER >= 1700
+#            define SLANG_VC 11
+#        elif _MSC_VER >= 1600
+#            define SLANG_VC 10
+#        elif _MSC_VER >= 1500
+#            define SLANG_VC 9
+#        else
+#            error "unknown version of Visual C++ compiler"
+#        endif
+#    elif defined(__clang__)
+#        define SLANG_CLANG 1
+#    elif defined(__SNC__)
+#        define SLANG_SNC 1
+#    elif defined(__ghs__)
+#        define SLANG_GHS 1
+#    elif defined(__GNUC__) /* note: __clang__, __SNC__, or __ghs__ imply __GNUC__ */
+#        define SLANG_GCC 1
+#    else
+#        error "unknown compiler"
+#    endif
+/*
+Any compilers not detected by the above logic are now explicitly zeroed out.
+*/
+#    ifndef SLANG_VC
+#        define SLANG_VC 0
+#    endif
+#    ifndef SLANG_CLANG
+#        define SLANG_CLANG 0
+#    endif
+#    ifndef SLANG_SNC
+#        define SLANG_SNC 0
+#    endif
+#    ifndef SLANG_GHS
+#        define SLANG_GHS 0
+#    endif
+#    ifndef SLANG_GCC
+#        define SLANG_GCC 0
+#    endif
+#endif /* SLANG_COMPILER */
+
+#define SLANG_GCC_FAMILY (SLANG_CLANG || SLANG_SNC || SLANG_GHS || SLANG_GCC)
+
+// GCC Specific
+#if SLANG_GCC_FAMILY
+#	define SLANG_ALIGN_OF(T)	__alignof__(T)
+// Use this macro instead of offsetof, because gcc produces warning if offsetof is used on a 
+// non POD type, even though it produces the correct result
+#   define SLANG_OFFSET_OF(T, ELEMENT) (size_t(&((T*)1)->ELEMENT) - 1)
+#endif // SLANG_GCC_FAMILY
+
+// Microsoft VC specific
+#if SLANG_VC
+#   define SLANG_ALIGN_OF(T) __alignof(T)
+#endif // SLANG_VC
+
+// Default impls
+
+#ifndef SLANG_OFFSET_OF
+#   define SLANG_OFFSET_OF(X, Y) offsetof(X, Y)
+#endif
+
 #include "slang-cpp-types.h"
 #include "slang-cpp-scalar-intrinsics.h"
 
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 91094a75e..01c658e0b 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -15,6 +15,15 @@
 #include <optix.h>
 #endif
 
+// Define slang offsetof implementation 
+#ifndef SLANG_OFFSET_OF
+#   define SLANG_OFFSET_OF(type, member) (size_t)((char*)&(((type *)0)->member) - (char*)0)
+#endif
+
+#ifndef SLANG_ALIGN_OF
+#   define SLANG_ALIGN_OF(type) __alignof__(type)
+#endif
+
 // Must be large enough to cause overflow and therefore infinity
 #ifndef SLANG_INFINITY
 #   define SLANG_INFINITY   ((float)(1e+300 * 1e+300))
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index afdd96029..761316d86 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -1962,6 +1962,35 @@ __target_intrinsic(cuda, " @ ")
 __target_intrinsic(cpp, " @ ")
 int __SyntaxError();
 
+/// For downstream compilers that allow sizeof/alignof/offsetof
+/// Can't be called in the C/C++ style. Need to use __sizeOf<some_type>() as opposed to sizeof(some_type).
+__generic<T>
+__target_intrinsic(cuda, "sizeof($G0)")
+__target_intrinsic(cpp, "sizeof($G0)")
+int __sizeOf();
+
+__generic<T>
+__target_intrinsic(cuda, "sizeof($T0)")
+__target_intrinsic(cpp, "sizeof($T0)")
+int __sizeOf(T v);
+
+__generic<T>
+__target_intrinsic(cuda, "SLANG_ALIGN_OF($G0)")
+__target_intrinsic(cpp, "SLANG_ALIGN_OF($G0)")
+int __alignOf();
+
+__generic<T>
+__target_intrinsic(cuda, "SLANG_ALIGN_OF($T0)")
+__target_intrinsic(cpp, "SLANG_ALIGN_OF($T0)")
+int __alignOf(T v);
+
+// It would be nice to have an offsetof equivalent, but it's not clear how that would work in terms of the Slang language.
+// Here we allow calculating the offset in bytes of a field from an *instance* of the type: (address of `field`) - (address of `t`).
+__generic<T,F>
+__target_intrinsic(cuda, "int(((char*)&($1)) - ((char*)&($0)))")
+__target_intrinsic(cpp, "int(((char*)&($1)) - ((char*)&($0)))")
+int __offsetOf(in T t, in F field);
+
 /// Mark beginning of "interlocked" operations in a fragment shader.
 __target_intrinsic(glsl, "beginInvocationInterlockARB")
 __glsl_extension(GL_ARB_fragment_shader_interlock)
diff --git a/source/slang/slang-intrinsic-expand.cpp b/source/slang/slang-intrinsic-expand.cpp
index bd2e17b28..045b7e6c5 100644
--- a/source/slang/slang-intrinsic-expand.cpp
+++ b/source/slang/slang-intrinsic-expand.cpp
@@ -236,6 +236,35 @@ const char* IntrinsicExpandContext::_emitSpecial(const char* cursor)
         }
         break;
 
+        case 'G':
+        {
+            // Get the type/value at the index of the specialization of this generic
+            // The character following 'G' must be a single decimal digit: the index of the generic argument to emit.
+            SLANG_RELEASE_ASSERT(*cursor >= '0' && *cursor <= '9');
+            Index argIndex = (*cursor++) - '0';
+            // The callee must be a specialize inst. It is dereferenced below, so check in release builds too.
+            IRSpecialize* specialize = as<IRSpecialize>(m_callInst->getCallee());
+            SLANG_RELEASE_ASSERT(specialize);
+
+            {
+                auto argCount = Index(specialize->getArgCount());
+                SLANG_UNUSED(argCount);
+                SLANG_RELEASE_ASSERT(argIndex < argCount);
+
+                auto arg = specialize->getArg(argIndex);
+
+                if (auto type = as<IRType>(arg))
+                {
+                    m_emitter->emitType(type);
+                }
+                else
+                {
+                    m_emitter->emitVal(arg, getInfo(EmitOp::General));
+                }
+            }
+        }
+        break;
+
         case 'T':
             // Get the the 'element' type for the type of the param at the index
         {
diff --git a/source/slang/slang-type-layout.cpp b/source/slang/slang-type-layout.cpp
index ad38e11cb..0fc7958d0 100644
--- a/source/slang/slang-type-layout.cpp
+++ b/source/slang/slang-type-layout.cpp
@@ -8,25 +8,30 @@
 
 namespace Slang {
 
-size_t RoundToAlignment(size_t offset, size_t alignment)
+static bool _isPow2(size_t v)
 {
-    size_t remainder = offset % alignment;
-    if (remainder == 0)
-        return offset;
-    else
-        return offset + (alignment - remainder);
+    return v > 0 && ((v - 1) & v) == 0;
+}
+
+static size_t _roundToAlignment(size_t offset, size_t alignment)
+{
+    // Must also be a power of 2
+    SLANG_ASSERT(_isPow2(alignment));
+
+    const size_t mask = alignment - 1;
+    return (offset + mask) & ~mask;
 }
 
-LayoutSize RoundToAlignment(LayoutSize offset, size_t alignment)
+static LayoutSize _roundToAlignment(LayoutSize offset, size_t alignment)
 {
     // An infinite size is assumed to be maximally aligned.
     if(offset.isInfinite())
         return LayoutSize::infinite();
 
-    return RoundToAlignment(offset.getFiniteValue(), alignment);
+    return _roundToAlignment(offset.getFiniteValue(), alignment);
 }
 
-static size_t RoundUpToPowerOfTwo( size_t value )
+static size_t _roundUpToPowerOfTwo( size_t value )
 {
     // TODO(tfoley): I know this isn't a fast approach
     size_t result = 1;
@@ -35,6 +40,21 @@ static size_t RoundUpToPowerOfTwo( size_t value )
     return result;
 }
 
+static bool _isAligned(size_t size, size_t alignment)
+{
+    SLANG_ASSERT(_isPow2(alignment));
+    return ((alignment - 1) & size) == 0;
+}
+
+// This is a workaround to keep functions from causing warnings in release builds, and therefore causing compilation to fail.
+void _typeLayout_keepFunctions()
+{
+    auto a = _isAligned;
+    auto b = _isPow2;
+    SLANG_UNUSED(a);
+    SLANG_UNUSED(b);
+}
+
 //
 
 struct DefaultLayoutRulesImpl : SimpleLayoutRulesImpl
@@ -81,7 +101,7 @@ struct DefaultLayoutRulesImpl : SimpleLayoutRulesImpl
         SLANG_RELEASE_ASSERT(elementInfo.size.isFinite());
         auto elementSize = elementInfo.size.getFiniteValue();
         auto elementAlignment = elementInfo.alignment;
-        auto elementStride = RoundToAlignment(elementSize, elementAlignment);
+        auto elementStride = _roundToAlignment(elementSize, elementAlignment);
 
         // An array with no elements will have zero size.
         //
@@ -155,7 +175,7 @@ struct DefaultLayoutRulesImpl : SimpleLayoutRulesImpl
         auto fieldBaseOffset = ioStructInfo->size;
 
         // We need to ensure that the offset for the field will respect its alignment
-        auto fieldOffset = RoundToAlignment(fieldBaseOffset, fieldInfo.alignment);
+        auto fieldOffset = _roundToAlignment(fieldBaseOffset, fieldInfo.alignment);
 
         // The size of the struct must be adjusted to cover the bytes consumed
         // by this field.
@@ -222,7 +242,7 @@ struct GLSLBaseLayoutRulesImpl : DefaultLayoutRulesImpl
         SimpleLayoutInfo vectorInfo(
             LayoutResourceKind::Uniform,
             size,
-            RoundUpToPowerOfTwo(size));
+            _roundUpToPowerOfTwo(size));
         return vectorInfo;
     }
 
@@ -231,7 +251,7 @@ struct GLSLBaseLayoutRulesImpl : DefaultLayoutRulesImpl
         // The size of an array must be rounded up to be a multiple of its alignment.
         //
         auto info = Super::GetArrayLayout(elementInfo, elementCount);
-        info.size = RoundToAlignment(info.size, info.alignment);
+        info.size = _roundToAlignment(info.size, info.alignment);
         return info;
     }
 
@@ -239,7 +259,7 @@ struct GLSLBaseLayoutRulesImpl : DefaultLayoutRulesImpl
     {
         // The size of a `struct` must be rounded up to be a multiple of its alignment.
         //
-        ioStructInfo->size = RoundToAlignment(ioStructInfo->size, ioStructInfo->alignment);
+        ioStructInfo->size = _roundToAlignment(ioStructInfo->size, ioStructInfo->alignment);
     }
 };
 
@@ -329,7 +349,7 @@ struct HLSLConstantBufferLayoutRulesImpl : DefaultLayoutRulesImpl
             return ioStructInfo->size;
 
         ioStructInfo->alignment = std::max(ioStructInfo->alignment, fieldInfo.alignment);
-        ioStructInfo->size = RoundToAlignment(ioStructInfo->size, fieldInfo.alignment);
+        ioStructInfo->size = _roundToAlignment(ioStructInfo->size, fieldInfo.alignment);
 
         LayoutSize fieldOffset = ioStructInfo->size;
         LayoutSize fieldSize = fieldInfo.size;
@@ -340,7 +360,7 @@ struct HLSLConstantBufferLayoutRulesImpl : DefaultLayoutRulesImpl
         auto endRegister = (fieldOffset + fieldSize - 1) / registerSize;
         if (startRegister != endRegister)
         {
-            ioStructInfo->size = RoundToAlignment(ioStructInfo->size, size_t(registerSize));
+            ioStructInfo->size = _roundToAlignment(ioStructInfo->size, size_t(registerSize));
             fieldOffset = ioStructInfo->size;
         }
 
@@ -396,10 +416,38 @@ struct CPULayoutRulesImpl : DefaultLayoutRulesImpl
     void EndStructLayout(UniformLayoutInfo* ioStructInfo) override
     {
         // Conform to C/C++ size is adjusted to the largest alignment
-        ioStructInfo->size = RoundToAlignment(ioStructInfo->size, ioStructInfo->alignment);
+        ioStructInfo->size = _roundToAlignment(ioStructInfo->size, ioStructInfo->alignment);
     }
 };
 
+// The CUDA compiler NVRTC only works on 64 bit operating systems.
+// So instead of using native host type sizes we use these types instead
+//
+// NOTE! This implies that our CUDA reflection (even if produced on 32 bit host environment) is always 64 bit.
+// This is unlikely to be a problem in practice.
+
+// NOTE! For the moment, in the CUDA prelude we use size_t - but that's ok as we currently use these types for
+// sizes
+
+// Memory sizes, and memory offsets (signed)
+typedef int64_t CUDASize;
+typedef int64_t CUDAOffset;
+
+// TODO(JS): This could be better as CudaUSize if we accepted LowerCamel Acronyms...
+typedef uint64_t CUDAUSize;
+
+// A type that is the size of a pointer
+typedef CUDASize CUDAPtr;
+// For CUtexObject and CUsurfObject
+typedef CUDAPtr CUDAHandle;
+
+// This is not strictly speaking needed - but exists to be consistent with cuda-prelude.h and the current CUDA emit.
+typedef CUDAPtr CUDASamplerState;
+
+// TODO(JS): Perhaps there is an argument these should be 32 bit?
+typedef CUDASize CUDACount;
+typedef CUDASize CUDAIndex;
+
 struct CUDALayoutRulesImpl : DefaultLayoutRulesImpl
 {
     typedef DefaultLayoutRulesImpl Super;
@@ -421,54 +469,23 @@ struct CUDALayoutRulesImpl : DefaultLayoutRulesImpl
     SimpleArrayLayoutInfo GetArrayLayout(SimpleLayoutInfo elementInfo, LayoutSize elementCount) override
     {
         SLANG_RELEASE_ASSERT(elementInfo.size.isFinite());
-        auto elementSize = elementInfo.size.getFiniteValue();
-        auto elementAlignment = elementInfo.alignment;
-        auto elementStride = RoundToAlignment(elementSize, elementAlignment);
-
+        
         if (elementCount.isInfinite())
         {
             // This is an unsized array, get information for element
             auto info = Super::GetArrayLayout(elementInfo, LayoutSize(1));
 
             // So it is actually a Array<T> on CUDA which is a pointer and a size
-            info.size = sizeof(void*) * 2;
-            info.alignment = SLANG_ALIGN_OF(void*);
+            info.size = _roundToAlignment(sizeof(CUDAPtr) + sizeof(CUDACount), sizeof(CUDAPtr));
+            info.alignment = sizeof(CUDAPtr);
             return info;
         }
+        
+        // It's fine to use the Default impl, as long as every element's size is rounded up to its alignment (as happens in EndStructLayout).
+        // If that weren't the case the array may be smaller than elementSize * elementCount which would be wrong for CUDA.
+        SLANG_ASSERT(_isAligned(elementInfo.size.getFiniteValue(), elementInfo.alignment));
 
-        // An array with no elements will have zero size.
-        //
-        LayoutSize arraySize = 0;
-        //
-        // Any array with a non-zero number of elements will need
-        // to have space for N elements of size `elementSize`, with
-        // the constraints that there must be `elementStride` bytes
-        // between consecutive elements.
-        //
-        if (elementCount > 0)
-        {
-            // We can think of this as either allocating (N-1)
-            // chunks of size `elementStride` (for most of the elements)
-            // and then one final chunk of size `elementSize`  for
-            // the last element, or equivalently as allocating
-            // N chunks of size `elementStride` and then "giving back"
-            // the final `elementStride - elementSize` bytes.
-            //
-            arraySize = (elementStride * (elementCount - 1)) + elementSize;
-        }
-
-        SimpleArrayLayoutInfo arrayInfo;
-        arrayInfo.kind = elementInfo.kind;
-        arrayInfo.size = arraySize;
-        arrayInfo.alignment = elementAlignment;
-        arrayInfo.elementStride = elementStride;
-        return arrayInfo;
-    }
-
-    // Computes the alignment of a vector type given element size and element count.
-    uint32_t getVectorAlignment(uint32_t elementSize, uint32_t elementCount)
-    {
-        return elementCount == 3 ? elementSize : elementSize * elementCount;
+        return Super::GetArrayLayout(elementInfo, elementCount);
     }
 
     SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount) override
@@ -478,28 +495,53 @@ struct CUDALayoutRulesImpl : DefaultLayoutRulesImpl
         {
             SimpleLayoutInfo fixInfo(elementInfo);
             fixInfo.size = sizeof(int32_t);
-            fixInfo.alignment = SLANG_ALIGN_OF(int32_t);
+            fixInfo.alignment = sizeof(int32_t);
             return GetVectorLayout(BaseType::Int, fixInfo, elementCount);
         }
+        
+        const auto elementSize = elementInfo.size.getFiniteValue();
+
+        // These rules can largely be determined by looking at
+        // 'vector_types.h' in the CUDA SDK
+
+        // Size in bytes of vector
+        size_t size = elementSize * elementCount;
+        // Special case 3, as uses alignment of the elementSize
+        size_t alignment = (elementCount == 3) ? elementSize : size;
+
+        // special case half
+        if (elementType == BaseType::Half && elementCount >= 3)
+        {
+            alignment = elementSize * 2;
+            size = _roundToAlignment(size, alignment);
+        }
+
+        // Nothing is aligned more than 16
+        alignment = std::min(alignment, size_t(16));
+
+        // TODO(JS): It's not 100% clear what is right in terms of size in respect of *alignment*. If the size is the 'used' bytes, then
+        // it can be less than the aligned size. If that's the case the GetArrayLayout (and MatrixLayout) is *wrong* in that on the last element
+        // it uses the size (not the aligned size/stride).
+        //
+        // Here I am assuming it's reasonable for the size to be the aligned size. That being the case the GetArrayLayout/GetMatrixLayout will be
+        // correct without special handling.
+        // 
+        // The assert below checks that is indeed the case.
+
+        // The size must be a multiple of the alignment
+        SLANG_ASSERT(_isAligned(size, alignment));
 
         SimpleLayoutInfo vectorInfo;
         vectorInfo.kind = elementInfo.kind;
-        vectorInfo.size = elementInfo.size * elementCount;
-        vectorInfo.alignment = getVectorAlignment((uint32_t)elementInfo.size.getFiniteValue(), (uint32_t)elementCount);
+        vectorInfo.size = size;
+        vectorInfo.alignment = alignment;
+     
         return vectorInfo;
     }
 
     SimpleArrayLayoutInfo GetMatrixLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) override
     {
-        // Special case bool
-        if (elementType == BaseType::Bool)
-        {
-            SimpleLayoutInfo fixInfo(elementInfo);
-            fixInfo.size = sizeof(int32_t);
-            fixInfo.alignment = SLANG_ALIGN_OF(int32_t);
-            return GetMatrixLayout(BaseType::Int, fixInfo, rowCount, columnCount);
-        }
-
+        // The default behavior is to calculate the size as an array of rowCount vectors, which is correct here
         return Super::GetMatrixLayout(elementType, elementInfo, rowCount, columnCount);
     }
 
@@ -511,7 +553,7 @@ struct CUDALayoutRulesImpl : DefaultLayoutRulesImpl
     void EndStructLayout(UniformLayoutInfo* ioStructInfo) override
     {
         // Conform to CUDA/C/C++ size is adjusted to the largest alignment
-        ioStructInfo->size = RoundToAlignment(ioStructInfo->size, ioStructInfo->alignment);
+        ioStructInfo->size = _roundToAlignment(ioStructInfo->size, ioStructInfo->alignment);
     }
 };
 
@@ -874,9 +916,10 @@ struct CUDAObjectLayoutRulesImpl : CPUObjectLayoutRulesImpl
         switch (kind)
         {
             case ShaderParameterKind::ConstantBuffer:
+            {
                 // It's a pointer to the actual uniform data
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), SLANG_ALIGN_OF(void*));
-
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(CUDAPtr), sizeof(CUDAPtr));
+            }
             case ShaderParameterKind::TextureSampler:
             case ShaderParameterKind::MutableTextureSampler:
                 // That there is no distinct Sampler on CUDA, so TextureSampler is the same as a Texture
@@ -884,29 +927,37 @@ struct CUDAObjectLayoutRulesImpl : CPUObjectLayoutRulesImpl
             case ShaderParameterKind::MutableTexture:
             case ShaderParameterKind::TextureUniformBuffer:
             case ShaderParameterKind::Texture:
-                // It's a pointer to a texture interface 
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(ObjectHandle), SLANG_ALIGN_OF(ObjectHandle));
+            {
+                // It's a CUtexObject or CUsurfObject which is an opaque CUDAHandle sized
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(CUDAHandle), sizeof(CUDAPtr));
+            }
 
             case ShaderParameterKind::StructuredBuffer:
             case ShaderParameterKind::MutableStructuredBuffer:
-                // It's a pointer and a size
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*) * 2, SLANG_ALIGN_OF(void*));
-
+            {
+                // It's a ptr and a count of the amount of elements
+                const size_t size = _roundToAlignment(sizeof(CUDAPtr) + sizeof(CUDACount), sizeof(CUDAPtr));
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, size, sizeof(CUDAPtr));
+            }
             case ShaderParameterKind::RawBuffer:
             case ShaderParameterKind::Buffer:
             case ShaderParameterKind::MutableRawBuffer:
             case ShaderParameterKind::MutableBuffer:
-                // It's a pointer and a size in bytes
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*) * 2, SLANG_ALIGN_OF(void*));
-
+            {
+                // It's a ptr and a count of the amount of elements
+                const size_t size = _roundToAlignment(sizeof(CUDAPtr) + sizeof(CUDACount), sizeof(CUDAPtr));
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, size, sizeof(CUDAPtr));
+            }
             case ShaderParameterKind::SamplerState:
+            {
                 // In CUDA it seems that sampler states are combined into texture objects.
                 // So it's a binding issue to combine a sampler with a texture - and sampler are ignored
                 // For simplicity here though - we do create a variable and that variable takes up
                 // uniform binding space.
                 // TODO(JS): If we wanted to remove these variables we'd want to do it as a pass. The pass
                 // would presumably have to remove use of variables of this kind throughout IR. 
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), SLANG_ALIGN_OF(void*));
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(CUDASamplerState), sizeof(CUDAPtr));
+            }
 
             case ShaderParameterKind::InputRenderTarget:
                 // TODO: how to handle these?
@@ -4007,7 +4058,7 @@ static TypeLayoutResult _createTypeLayout(
             // The tag is always a `uint` for now.
             //
             auto tagInfo = context.rules->GetScalarLayout(BaseType::UInt);
-            info.size = RoundToAlignment(info.size, tagInfo.alignment);
+            info.size = _roundToAlignment(info.size, tagInfo.alignment);
 
             taggedUnionLayout->tagOffset = info.size;
 
diff --git a/tests/cuda/cuda-array-layout.slang b/tests/cuda/cuda-array-layout.slang
new file mode 100644
index 000000000..7fee3b192
--- /dev/null
+++ b/tests/cuda/cuda-array-layout.slang
@@ -0,0 +1,32 @@
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -output-using-type -shaderobj
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<int> outputBuffer : register(u0);
+
+struct PadLadenStruct
+{
+    double a;
+    uint8_t b;
+};
+
+// This is to check if the last half can be inserted 'inside' the spare padding of a. It should not be
+struct StructWithArray
+{
+    PadLadenStruct a[1];
+    uint8_t b;
+    
+    matrix<half, 3, 3> c;
+    uint8_t d;
+};
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    StructWithArray s;
+    outputBuffer[0] = __sizeOf(s);
+    
+    outputBuffer[1] = __offsetOf(s, s.a);
+    outputBuffer[2] = __offsetOf(s, s.b);
+    outputBuffer[3] = __offsetOf(s, s.c);
+    outputBuffer[4] = __offsetOf(s, s.d);
+}
diff --git a/tests/cuda/cuda-array-layout.slang.expected.txt b/tests/cuda/cuda-array-layout.slang.expected.txt
new file mode 100644
index 000000000..bc3e8bd6c
--- /dev/null
+++ b/tests/cuda/cuda-array-layout.slang.expected.txt
@@ -0,0 +1,9 @@
+type: int32_t
+48
+0
+16
+20
+44
+0
+0
+0
diff --git a/tests/cuda/cuda-layout.slang b/tests/cuda/cuda-layout.slang
new file mode 100644
index 000000000..725bf798e
--- /dev/null
+++ b/tests/cuda/cuda-layout.slang
@@ -0,0 +1,24 @@
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -output-using-type -shaderobj
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<int> outputBuffer : register(u0);
+
+#define WRITE_TYPE_ALIGN(base, type) \
+    outputBuffer[base * 4 + 0] = __alignOf<type>(); \
+    outputBuffer[base * 4 + 1] = __alignOf<vector<type, 2> >(); \
+    outputBuffer[base * 4 + 2] = __alignOf<vector<type, 3> >(); \
+    outputBuffer[base * 4 + 3] = __alignOf<vector<type, 4> >();
+    
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    WRITE_TYPE_ALIGN(0, uint8_t)
+    WRITE_TYPE_ALIGN(1, uint16_t)
+    WRITE_TYPE_ALIGN(2, int)
+    WRITE_TYPE_ALIGN(3, int64_t)
+    
+    WRITE_TYPE_ALIGN(4, half)
+    WRITE_TYPE_ALIGN(5, float)
+    WRITE_TYPE_ALIGN(6, double)   
+}
diff --git a/tests/cuda/cuda-layout.slang.expected.txt b/tests/cuda/cuda-layout.slang.expected.txt
new file mode 100644
index 000000000..bf4edf064
--- /dev/null
+++ b/tests/cuda/cuda-layout.slang.expected.txt
@@ -0,0 +1,33 @@
+type: int32_t
+1
+2
+1
+4
+2
+4
+2
+8
+4
+8
+4
+16
+8
+16
+8
+16
+2
+4
+4
+4
+4
+8
+4
+16
+8
+16
+8
+16
+0
+0
+0
+0
diff --git a/tests/cuda/cuda-reflection.slang b/tests/cuda/cuda-reflection.slang
new file mode 100644
index 000000000..95bf591c9
--- /dev/null
+++ b/tests/cuda/cuda-reflection.slang
@@ -0,0 +1,28 @@
+// cuda-reflection.slang
+
+//TEST:REFLECTION:-stage compute -entry main -target cuda
+
+struct PadLadenStruct
+{
+    double a;
+    uint8_t b;
+};
+
+// This is to check if the last half can be inserted 'inside' the spare padding of a. It should not be
+struct StructWithArray
+{
+    PadLadenStruct a[1];
+    uint8_t c;
+    
+    matrix<half, 3, 3> d;
+    uint8_t e;
+};
+
+ConstantBuffer<StructWithArray> cb;
+RWStructuredBuffer<StructWithArray> sb;
+
+[numthreads(1, 1, 1)]
+void main(
+    uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+}
+\ No newline at end of file
diff --git a/tests/cuda/cuda-reflection.slang.expected b/tests/cuda/cuda-reflection.slang.expected
new file mode 100644
index 000000000..d27d99557
--- /dev/null
+++ b/tests/cuda/cuda-reflection.slang.expected
@@ -0,0 +1,250 @@
+result code = 0
+standard error = {
+}
+standard output = {
+{
+    "parameters": [
+        {
+            "name": "cb",
+            "binding": {"kind": "uniform", "offset": 0, "size": 8},
+            "type": {
+                "kind": "constantBuffer",
+                "elementType": {
+                    "kind": "struct",
+                    "name": "StructWithArray",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "type": {
+                                "kind": "array",
+                                "elementCount": 1,
+                                "elementType": {
+                                    "kind": "struct",
+                                    "name": "PadLadenStruct",
+                                    "fields": [
+                                        {
+                                            "name": "a",
+                                            "type": {
+                                                "kind": "scalar",
+                                                "scalarType": "float64"
+                                            },
+                                            "binding": {"kind": "uniform", "offset": 0, "size": 8}
+                                        },
+                                        {
+                                            "name": "b",
+                                            "type": {
+                                                "kind": "scalar",
+                                                "scalarType": "uint8"
+                                            },
+                                            "binding": {"kind": "uniform", "offset": 8, "size": 1}
+                                        }
+                                    ]
+                                },
+                                "uniformStride": 16
+                            },
+                            "binding": {"kind": "uniform", "offset": 0, "size": 16}
+                        },
+                        {
+                            "name": "c",
+                            "type": {
+                                "kind": "scalar",
+                                "scalarType": "uint8"
+                            },
+                            "binding": {"kind": "uniform", "offset": 16, "size": 1}
+                        },
+                        {
+                            "name": "d",
+                            "type": {
+                                "kind": "matrix",
+                                "rowCount": 3,
+                                "columnCount": 3,
+                                "elementType": {
+                                    "kind": "scalar",
+                                    "scalarType": "float16"
+                                }
+                            },
+                            "binding": {"kind": "uniform", "offset": 20, "size": 24}
+                        },
+                        {
+                            "name": "e",
+                            "type": {
+                                "kind": "scalar",
+                                "scalarType": "uint8"
+                            },
+                            "binding": {"kind": "uniform", "offset": 44, "size": 1}
+                        }
+                    ]
+                },
+                "containerVarLayout": {
+                    "binding": {"kind": "uniform", "offset": 0, "size": 8}
+                },
+                "elementVarLayout": {
+                    "type": {
+                        "kind": "struct",
+                        "name": "StructWithArray",
+                        "fields": [
+                            {
+                                "name": "a",
+                                "type": {
+                                    "kind": "array",
+                                    "elementCount": 1,
+                                    "elementType": {
+                                        "kind": "struct",
+                                        "name": "PadLadenStruct",
+                                        "fields": [
+                                            {
+                                                "name": "a",
+                                                "type": {
+                                                    "kind": "scalar",
+                                                    "scalarType": "float64"
+                                                },
+                                                "binding": {"kind": "uniform", "offset": 0, "size": 8}
+                                            },
+                                            {
+                                                "name": "b",
+                                                "type": {
+                                                    "kind": "scalar",
+                                                    "scalarType": "uint8"
+                                                },
+                                                "binding": {"kind": "uniform", "offset": 8, "size": 1}
+                                            }
+                                        ]
+                                    },
+                                    "uniformStride": 16
+                                },
+                                "binding": {"kind": "uniform", "offset": 0, "size": 16}
+                            },
+                            {
+                                "name": "c",
+                                "type": {
+                                    "kind": "scalar",
+                                    "scalarType": "uint8"
+                                },
+                                "binding": {"kind": "uniform", "offset": 16, "size": 1}
+                            },
+                            {
+                                "name": "d",
+                                "type": {
+                                    "kind": "matrix",
+                                    "rowCount": 3,
+                                    "columnCount": 3,
+                                    "elementType": {
+                                        "kind": "scalar",
+                                        "scalarType": "float16"
+                                    }
+                                },
+                                "binding": {"kind": "uniform", "offset": 20, "size": 24}
+                            },
+                            {
+                                "name": "e",
+                                "type": {
+                                    "kind": "scalar",
+                                    "scalarType": "uint8"
+                                },
+                                "binding": {"kind": "uniform", "offset": 44, "size": 1}
+                            }
+                        ]
+                    },
+                    "binding": {"kind": "uniform", "offset": 0, "size": 48}
+                }
+            }
+        },
+        {
+            "name": "sb",
+            "binding": {"kind": "uniform", "offset": 8, "size": 16},
+            "type": {
+                "kind": "resource",
+                "baseShape": "structuredBuffer",
+                "access": "readWrite",
+                "resultType": {
+                    "kind": "struct",
+                    "name": "StructWithArray",
+                    "fields": [
+                        {
+                            "name": "a",
+                            "type": {
+                                "kind": "array",
+                                "elementCount": 1,
+                                "elementType": {
+                                    "kind": "struct",
+                                    "name": "PadLadenStruct",
+                                    "fields": [
+                                        {
+                                            "name": "a",
+                                            "type": {
+                                                "kind": "scalar",
+                                                "scalarType": "float64"
+                                            },
+                                            "binding": {"kind": "uniform", "offset": 0, "size": 8}
+                                        },
+                                        {
+                                            "name": "b",
+                                            "type": {
+                                                "kind": "scalar",
+                                                "scalarType": "uint8"
+                                            },
+                                            "binding": {"kind": "uniform", "offset": 8, "size": 1}
+                                        }
+                                    ]
+                                },
+                                "uniformStride": 16
+                            },
+                            "binding": {"kind": "uniform", "offset": 0, "size": 16}
+                        },
+                        {
+                            "name": "c",
+                            "type": {
+                                "kind": "scalar",
+                                "scalarType": "uint8"
+                            },
+                            "binding": {"kind": "uniform", "offset": 16, "size": 1}
+                        },
+                        {
+                            "name": "d",
+                            "type": {
+                                "kind": "matrix",
+                                "rowCount": 3,
+                                "columnCount": 3,
+                                "elementType": {
+                                    "kind": "scalar",
+                                    "scalarType": "float16"
+                                }
+                            },
+                            "binding": {"kind": "uniform", "offset": 20, "size": 24}
+                        },
+                        {
+                            "name": "e",
+                            "type": {
+                                "kind": "scalar",
+                                "scalarType": "uint8"
+                            },
+                            "binding": {"kind": "uniform", "offset": 44, "size": 1}
+                        }
+                    ]
+                }
+            }
+        }
+    ],
+    "entryPoints": [
+        {
+            "name": "main",
+            "stage:": "compute",
+            "parameters": [
+                {
+                    "name": "dispatchThreadID",
+                    "semanticName": "SV_DISPATCHTHREADID",
+                    "type": {
+                        "kind": "vector",
+                        "elementCount": 3,
+                        "elementType": {
+                            "kind": "scalar",
+                            "scalarType": "uint32"
+                        }
+                    }
+                }
+            ],
+            "threadGroupSize": [1, 1, 1]
+        }
+    ]
+}
+}
diff --git a/tools/slang-reflection-test/slang-reflection-test-main.cpp b/tools/slang-reflection-test/slang-reflection-test-main.cpp
index 655b4e41d..0b8e88d68 100644
--- a/tools/slang-reflection-test/slang-reflection-test-main.cpp
+++ b/tools/slang-reflection-test/slang-reflection-test-main.cpp
@@ -489,10 +489,16 @@ static void emitReflectionScalarTypeInfoJSON(
 #define CASE(TAG, ID) case slang::TypeReflection::ScalarType::TAG: write(writer, #ID); break
         CASE(Void, void);
         CASE(Bool, bool);
+
+        CASE(Int8, int8);
+        CASE(UInt8, uint8);
+        CASE(Int16, int16);
+        CASE(UInt16, uint16);
         CASE(Int32, int32);
         CASE(UInt32, uint32);
         CASE(Int64, int64);
         CASE(UInt64, uint64);
+
         CASE(Float16, float16);
         CASE(Float32, float32);
         CASE(Float64, float64);