7 files changed, 173 insertions, 32 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 74a9d0cec..89f33e24b 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -207,6 +207,12 @@ union Union64
     double d;
 };
 
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float make_float(T val)
+{
+    return (float)val;
+}
+
 SLANG_FORCE_INLINE SLANG_CUDA_CALL float _slang_fmod(float x, float y)
 {
     return ::fmodf(x, y);
diff --git a/source/slang/slang-compiler.h b/source/slang/slang-compiler.h
index 2409cedfb..7fc43d778 100755
--- a/source/slang/slang-compiler.h
+++ b/source/slang/slang-compiler.h
@@ -1769,6 +1769,8 @@ namespace Slang
     /// Are we generating code for a CUDA API (CUDA / OptiX)?
     bool isCUDATarget(TargetRequest* targetReq);
 
+    /// Are we generating code for a CPU target?
+    bool isCPUTarget(TargetRequest* targetReq);
 
         /// A request to generate output in some target format.
     class TargetRequest : public RefObject
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index 679d8ce88..678b4137a 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -1267,15 +1267,7 @@ Result linkAndOptimizeIR(
     if (requiredLoweringPassSet.meshOutput)
         legalizeMeshOutputTypes(irModule);
 
-    if (options.shouldLegalizeExistentialAndResourceTypes)
-    {
-        if (!isMetalTarget(targetRequest))
-        {
-            // We need to lower any types used in a buffer resource (e.g. ContantBuffer or StructuredBuffer) into
-            // a simple storage type that has target independent layout based on the kind of buffer resource.
-            lowerBufferElementTypeToStorageType(targetProgram, irModule);
-        }
-    }
+    lowerBufferElementTypeToStorageType(targetProgram, irModule);
 
     // Rewrite functions that return arrays to return them via `out` parameter,
     // since our target languages doesn't allow returning arrays.
diff --git a/source/slang/slang-ir-lower-buffer-element-type.cpp b/source/slang/slang-ir-lower-buffer-element-type.cpp
index 981e29697..d042aae43 100644
--- a/source/slang/slang-ir-lower-buffer-element-type.cpp
+++ b/source/slang/slang-ir-lower-buffer-element-type.cpp
@@ -877,7 +877,9 @@ namespace Slang
     void lowerBufferElementTypeToStorageType(TargetProgram* target, IRModule* module, bool lowerBufferPointer)
     {
         SlangMatrixLayoutMode defaultMatrixMode = (SlangMatrixLayoutMode)target->getOptionSet().getMatrixLayoutMode();
-        if (defaultMatrixMode == SLANG_MATRIX_LAYOUT_MODE_UNKNOWN)
+        if ((isCPUTarget(target->getTargetReq()) || isCUDATarget(target->getTargetReq()) || isMetalTarget(target->getTargetReq())))
+            defaultMatrixMode = SLANG_MATRIX_LAYOUT_ROW_MAJOR;
+        else if (defaultMatrixMode == SLANG_MATRIX_LAYOUT_MODE_UNKNOWN)
             defaultMatrixMode = SLANG_MATRIX_LAYOUT_ROW_MAJOR;
         LoweredElementTypeContext context(target, lowerBufferPointer, defaultMatrixMode);
         context.processModule(module);
diff --git a/tests/compute/column-major.slang b/tests/compute/column-major.slang
index 19d863260..1cd08434b 100644
--- a/tests/compute/column-major.slang
+++ b/tests/compute/column-major.slang
@@ -1,33 +1,59 @@
 // column-major.slang
 
-// Unfortunately CPU and CUDA only work with row layout, so they have to be disabled here.
-
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -output-using-type -compile-arg -O3 -shaderobj
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -output-using-type -shaderobj -Xslang -matrix-layout-column-major
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -output-using-type -dx12 -shaderobj -Xslang -matrix-layout-column-major
-//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -output-using-type -shaderobj -Xslang -matrix-layout-column-major
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -output-using-type -shaderobj
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -output-using-type -mtl -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-cpu -compute -compile-arg -O3 -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-slang -compute -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-slang -compute -dx12 -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-vk -compute -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-vk -compute -emit-spirv-via-glsl -Xslang -matrix-layout-column-major
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-cuda -compute -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-mtl -compute -shaderobj -Xslang -matrix-layout-column-major
 
 // This data is in column major layout order.... 
 //TEST_INPUT:cbuffer(data=[1.0 0.0 0.0 10.0  0.0 1.0 0.0 20.0  0.0 0.0 1.0 30.0  0.0 0.0 0.0 1.0]):name matrixBuffer
 
 ConstantBuffer<float4x4> matrixBuffer;
 
-//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name output
-RWStructuredBuffer<float> output;
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name output
+RWStructuredBuffer<uint> output;
+
+bool floatCheck(float data, float valueToCheckFor)
+{
+    return data < (valueToCheckFor + 0.001) && data > valueToCheckFor - 0.001;
+}
 
 [numthreads(1, 1, 1)]
 void computeMain(uint3 tid : SV_DispatchThreadID)
 {
     float4 v = float4(1, 2, 3, 1);
 
-    float4x4 M = matrixBuffer;
+    float4x4 M1 = matrixBuffer;
     
-    float4 r = mul(v, M);
-
-    output[0] = r.x;
-    output[1] = r.y;
-    output[2] = r.z;
-    output[3] = r.w;
+    float4 r = mul(v, M1);
+
+    float4x4 M2 = mul(M1, M1);
+
+    float4x4 M3 = float4x4(
+            1.0, 0.0, 0.0, 10.0, 
+            0.0, 1.0, 0.0, 20.0,
+            0.0, 0.0, 1.0, 30.0,
+            0.0, 0.0, 0.0, 1.0
+        );
+
+    output[0] = uint(true
+            && floatCheck(r.x, 11)
+            && floatCheck(r.y, 22)
+            && floatCheck(r.z, 33)
+            && floatCheck(r.w, 1)
+
+            && floatCheck(M1[3][0], 10)
+
+            && floatCheck(M2[3][0], 20)
+            && floatCheck(M2._41, 20)
+            && floatCheck(M2._41_32[0], 20)
+            && floatCheck(M2._33_42[0], 1)
+            && floatCheck(M2._42_33[0], 40)
+
+            && floatCheck(M3[0][3], 10)
+        );
+    //BUF: 1
 }
diff --git a/tests/compute/column-major.slang.expected.txt b/tests/compute/column-major.slang.expected.txt
deleted file mode 100644
index 1e24f3253..000000000
--- a/tests/compute/column-major.slang.expected.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-type: float
-11.000000
-22.000000
-33.000000
-1.000000
diff --git a/tests/compute/constant-buffer-memory-packing.slang b/tests/compute/constant-buffer-memory-packing.slang
new file mode 100644
index 000000000..5246c4d33
--- /dev/null
+++ b/tests/compute/constant-buffer-memory-packing.slang
@@ -0,0 +1,118 @@
+// constant-buffer-memory-packing.slang
+
+// Metal/CPP/CUDA do not currently apply constant-buffer packing, so they read different values than the packing targets.
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-cpu -compute -xslang -DTARGET_WITHOUT_PACKING
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-cuda -compute -xslang -DTARGET_WITHOUT_PACKING
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-mtl -compute -xslang -DTARGET_WITHOUT_PACKING
+
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-slang -compute -dx12
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-vk -compute
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-vk -compute -emit-spirv-via-glsl
+
+// CPP/Metal/CUDA, due to not having memory packing, will receive the following ROW matrix:
+// {1,2,3}
+// {0,4,5}
+// {6,0,7}
+
+// GLSL/SPIRV/HLSL, due to having memory packing, will receive the following ROW/COL matrix:
+// {1,2,3}
+// {4,5,6}
+// {7,8,9}
+
+//TEST_INPUT:cbuffer(data=[1.0 2.0 3.0 0.0  4.0 5.0 6.0 0.0  7.0 8.0 9.0 0]):name matrixTestCBuf1
+ConstantBuffer<row_major float3x3> matrixTestCBuf1;
+
+// CPP/Metal/CUDA, due to not having memory packing, will receive the following COL matrix post-transpose:
+// {1,0,8}
+// {4,2,0}
+// {7,5,3}
+
+//TEST_INPUT:cbuffer(data=[1.0 4.0 7.0 0.0  2.0 5.0 8.0 0.0  3.0 6.0 9.0 0.0]):name matrixTestCBuf2
+ConstantBuffer<column_major float3x3> matrixTestCBuf2;
+
+//TEST_INPUT:cbuffer(data=[1.0 2.0 3.0 0.0  4.0 5.0 6.0 0.0]):name NeedsPadding
+cbuffer NeedsPadding
+{
+    float3 data1;
+    float3 data2;
+};
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name output
+RWStructuredBuffer<uint> output;
+
+bool floatCheck(float data, float valueToCheckFor)
+{
+    return data < (valueToCheckFor + 0.001) && data > valueToCheckFor - 0.001;
+}
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 tid : SV_DispatchThreadID)
+{
+    float3x3 matrixTest1;
+    matrixTest1 = matrixTestCBuf1;
+
+    float3x3 matrixTest2;
+    matrixTest2 = matrixTestCBuf2;
+
+    output[0] = uint(true
+#ifndef TARGET_WITHOUT_PACKING
+            && floatCheck(matrixTest1[0][0], 1)
+            && floatCheck(matrixTest1[0][1], 2)
+            && floatCheck(matrixTest1[0][2], 3)
+            && floatCheck(matrixTest1[1][0], 4)
+            && floatCheck(matrixTest1[1][1], 5)
+            && floatCheck(matrixTest1[1][2], 6)
+            && floatCheck(matrixTest1[2][0], 7)
+            && floatCheck(matrixTest1[2][1], 8)
+            && floatCheck(matrixTest1[2][2], 9)
+
+            && floatCheck(matrixTest2[0][0], 1)
+            && floatCheck(matrixTest2[0][1], 2)
+            && floatCheck(matrixTest2[0][2], 3)
+            && floatCheck(matrixTest2[1][0], 4)
+            && floatCheck(matrixTest2[1][1], 5)
+            && floatCheck(matrixTest2[1][2], 6)
+            && floatCheck(matrixTest2[2][0], 7)
+            && floatCheck(matrixTest2[2][1], 8)
+            && floatCheck(matrixTest2[2][2], 9)
+
+            && floatCheck(data1[0], 1)
+            && floatCheck(data1[1], 2)
+            && floatCheck(data1[2], 3)
+            && floatCheck(data2[0], 4)
+            && floatCheck(data2[1], 5)
+            && floatCheck(data2[2], 6)
+#else
+            && floatCheck(matrixTest1[0][0], 1)
+            && floatCheck(matrixTest1[0][1], 2)
+            && floatCheck(matrixTest1[0][2], 3)
+            && floatCheck(matrixTest1[1][0], 0)
+            && floatCheck(matrixTest1[1][1], 4)
+            && floatCheck(matrixTest1[1][2], 5)
+            && floatCheck(matrixTest1[2][0], 6)
+            && floatCheck(matrixTest1[2][1], 0)
+            && floatCheck(matrixTest1[2][2], 7)
+
+            && floatCheck(matrixTest2[0][0], 1)
+            && floatCheck(matrixTest2[0][1], 0)
+            && floatCheck(matrixTest2[0][2], 8)
+            && floatCheck(matrixTest2[1][0], 4)
+            && floatCheck(matrixTest2[1][1], 2)
+            && floatCheck(matrixTest2[1][2], 0)
+            && floatCheck(matrixTest2[2][0], 7)
+            && floatCheck(matrixTest2[2][1], 5)
+            && floatCheck(matrixTest2[2][2], 3)
+
+            && floatCheck(data1[0], 1)
+            && floatCheck(data1[1], 2)
+            && floatCheck(data1[2], 3)
+            && floatCheck(data2[0], 0)
+            && floatCheck(data2[1], 4)
+            && floatCheck(data2[2], 5)
+#endif
+        );
+    output[1] = (uint)matrixTest2[0][0];
+    output[2] = (uint)matrixTest2[0][1];
+    output[3] = (uint)matrixTest2[0][2];
+    //BUF: 1
+}