4 files changed, 931 insertions, 121 deletions
diff --git a/prelude/slang-cpp-prelude.h b/prelude/slang-cpp-prelude.h
index 84a61f929..d15abdb88 100644
--- a/prelude/slang-cpp-prelude.h
+++ b/prelude/slang-cpp-prelude.h
@@ -296,8 +296,8 @@ struct ISlangUnknown
 
 // Includes
 
-#include "slang-cpp-types.h"
 #include "slang-cpp-scalar-intrinsics.h"
+#include "slang-cpp-types.h"
 
 // TODO(JS): Hack! Output C++ code from slang can copy uninitialized variables. 
 #if defined(_MSC_VER)
diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h
index 66035260d..2b9e7f777 100644
--- a/prelude/slang-cpp-scalar-intrinsics.h
+++ b/prelude/slang-cpp-scalar-intrinsics.h
@@ -490,6 +490,17 @@ void InterlockedAdd(uint32_t* dest, uint32_t value, uint32_t* oldValue)
 
 #endif // SLANG_LLVM
 
+
+// ----------------------- fmod --------------------------
+SLANG_FORCE_INLINE float _slang_fmod(float x, float y)
+{
+    return F32_fmod(x, y);
+}
+SLANG_FORCE_INLINE double _slang_fmod(double x, double y)
+{
+    return F64_fmod(x, y);
+}
+
 #ifdef SLANG_PRELUDE_NAMESPACE
 } 
 #endif
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h
index c15c5ec40..28fe3dd8d 100644
--- a/prelude/slang-cpp-types.h
+++ b/prelude/slang-cpp-types.h
@@ -86,26 +86,159 @@ template <typename T>
 struct Vector<T, 1>
 {
     T x;
+    const T& operator[](size_t /*index*/) const { return x; }
+    T& operator[](size_t /*index*/) { return x; }
+    operator T() const { return x; }
+    Vector() = default;
+    Vector(T scalar)
+    {
+        x = scalar;
+    }
+    template <typename U>
+    Vector(Vector<U, 1> other)
+    {
+        x = (T)other.x;
+    }
+    template <typename U, int otherSize>
+    Vector(Vector<U, otherSize> other)
+    {
+        int minSize = 1;
+        if (otherSize < minSize) minSize = otherSize;
+        for (int i = 0; i < minSize; i++)
+            (*this)[i] = (T)other[i];
+    }
 };
 
 template <typename T>
 struct Vector<T, 2>
 {
     T x, y;
+    const T& operator[](size_t index) const { return index == 0 ? x : y; }
+    T& operator[](size_t index) { return index == 0 ? x : y; }
+    Vector() = default;
+    Vector(T scalar)
+    {
+        x = y = scalar;
+    }
+    Vector(T _x, T _y)
+    {
+        x = _x;
+        y = _y;
+    }
+    template <typename U>
+    Vector(Vector<U, 2> other)
+    {
+        x = (T)other.x;
+        y = (T)other.y;
+    }
+    template <typename U, int otherSize>
+    Vector(Vector<U, otherSize> other)
+    {
+        int minSize = 2;
+        if (otherSize < minSize) minSize = otherSize;
+        for (int i = 0; i < minSize; i++)
+            (*this)[i] = (T)other[i];
+    }
 };
 
 template <typename T>
 struct Vector<T, 3>
 {
     T x, y, z;
+    const T& operator[](size_t index) const { return *((T*)(this) + index); }
+    T& operator[](size_t index) { return *((T*)(this) + index); }
+
+    Vector() = default;
+    Vector(T scalar)
+    {
+        x = y = z = scalar;
+    }
+    Vector(T _x, T _y, T _z)
+    {
+        x = _x;
+        y = _y;
+        z = _z;
+    }
+    template <typename U>
+    Vector(Vector<U, 3> other)
+    {
+        x = (T)other.x;
+        y = (T)other.y;
+        z = (T)other.z;
+    }
+    template <typename U, int otherSize>
+    Vector(Vector<U, otherSize> other)
+    {
+        int minSize = 3;
+        if (otherSize < minSize) minSize = otherSize;
+        for (int i = 0; i < minSize; i++)
+            (*this)[i] = (T)other[i];
+    }
 };
 
 template <typename T>
 struct Vector<T, 4>
 {
     T x, y, z, w;
+
+    const T& operator[](size_t index) const { return *((T*)(this) + index); }
+    T& operator[](size_t index) { return *((T*)(this) + index); }
+    Vector() = default;
+    Vector(T scalar)
+    {
+        x = y = z = w = scalar;
+    }
+    Vector(T _x, T _y, T _z, T _w)
+    {
+        x = _x;
+        y = _y;
+        z = _z;
+        w = _w;
+    }
+    template <typename U, int otherSize>
+    Vector(Vector<U, otherSize> other)
+    {
+        int minSize = 4;
+        if (otherSize < minSize) minSize = otherSize;
+        for (int i = 0; i < minSize; i++)
+            (*this)[i] = (T)other[i];
+    }
+ 
 };
 
+template<typename T, int N>
+SLANG_FORCE_INLINE T _slang_vector_get_element(Vector<T, N> x, int index)
+{
+    return x[index];
+}
+
+template<typename T, int N>
+SLANG_FORCE_INLINE const T* _slang_vector_get_element_ptr(const Vector<T, N>* x, int index)
+{
+    return &((*const_cast<Vector<T,N>*>(x))[index]);
+}
+
+template<typename T, int N>
+SLANG_FORCE_INLINE T* _slang_vector_get_element_ptr(Vector<T, N>* x, int index)
+{
+    return &((*x)[index]);
+}
+
+template<typename T, int n, typename OtherT, int m>
+SLANG_FORCE_INLINE Vector<T, n> _slang_vector_reshape(const Vector<OtherT, m> other)
+{
+    Vector<T, n> result;
+    for (int i = 0; i < n; i++)
+    {
+        OtherT otherElement = T(0);
+        if (i < m)
+            otherElement = _slang_vector_get_element(other, i);
+        *_slang_vector_get_element_ptr(&result, i) = (T)otherElement;
+    }
+    return result;
+}
+
+typedef uint32_t uint;
 
 typedef Vector<float, 2> float2;
 typedef Vector<float, 3> float3;
@@ -119,12 +252,320 @@ typedef Vector<uint32_t, 2> uint2;
 typedef Vector<uint32_t, 3> uint3;
 typedef Vector<uint32_t, 4> uint4;
 
+#define SLANG_VECTOR_BINARY_OP(T, op) \
+    template<int n> \
+    SLANG_FORCE_INLINE Vector<T, n> operator op(const Vector<T, n>& thisVal, const Vector<T, n>& other) \
+    { \
+        Vector<T, n> result;\
+        for (int i = 0; i < n; i++) \
+            result[i] = thisVal[i] op other[i]; \
+        return result;\
+    }
+#define SLANG_VECTOR_BINARY_COMPARE_OP(T, op) \
+    template<int n> \
+    SLANG_FORCE_INLINE Vector<bool, n> operator op(const Vector<T, n>& thisVal, const Vector<T, n>& other) \
+    { \
+        Vector<bool, n> result;\
+        for (int i = 0; i < n; i++) \
+            result[i] = thisVal[i] op other[i]; \
+        return result;\
+    }
+
+#define SLANG_VECTOR_UNARY_OP(T, op) \
+    template<int n> \
+    SLANG_FORCE_INLINE Vector<T, n> operator op(const Vector<T, n>& thisVal) \
+    { \
+        Vector<T, n> result;\
+        for (int i = 0; i < n; i++) \
+            result[i] = op thisVal[i]; \
+        return result;\
+    }
+#define SLANG_INT_VECTOR_OPS(T) \
+    SLANG_VECTOR_BINARY_OP(T, +)\
+    SLANG_VECTOR_BINARY_OP(T, -)\
+    SLANG_VECTOR_BINARY_OP(T, *)\
+    SLANG_VECTOR_BINARY_OP(T, / )\
+    SLANG_VECTOR_BINARY_OP(T, &)\
+    SLANG_VECTOR_BINARY_OP(T, |)\
+    SLANG_VECTOR_BINARY_OP(T, &&)\
+    SLANG_VECTOR_BINARY_OP(T, ||)\
+    SLANG_VECTOR_BINARY_OP(T, ^)\
+    SLANG_VECTOR_BINARY_OP(T, %)\
+    SLANG_VECTOR_BINARY_OP(T, >>)\
+    SLANG_VECTOR_BINARY_OP(T, <<)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, >)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, <)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, >=)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, <=)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, ==)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, !=)\
+    SLANG_VECTOR_UNARY_OP(T, !)\
+    SLANG_VECTOR_UNARY_OP(T, ~)
+#define SLANG_FLOAT_VECTOR_OPS(T) \
+    SLANG_VECTOR_BINARY_OP(T, +)\
+    SLANG_VECTOR_BINARY_OP(T, -)\
+    SLANG_VECTOR_BINARY_OP(T, *)\
+    SLANG_VECTOR_BINARY_OP(T, /)\
+    SLANG_VECTOR_UNARY_OP(T, -)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, >)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, <)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, >=)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, <=)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, ==)\
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, !=)
+
+SLANG_INT_VECTOR_OPS(bool)
+SLANG_INT_VECTOR_OPS(int)
+SLANG_INT_VECTOR_OPS(int8_t)
+SLANG_INT_VECTOR_OPS(int16_t)
+SLANG_INT_VECTOR_OPS(int64_t)
+SLANG_INT_VECTOR_OPS(uint)
+SLANG_INT_VECTOR_OPS(uint8_t)
+SLANG_INT_VECTOR_OPS(uint16_t)
+SLANG_INT_VECTOR_OPS(uint64_t)
+
+SLANG_FLOAT_VECTOR_OPS(float)
+SLANG_FLOAT_VECTOR_OPS(double)
+
+#define SLANG_VECTOR_INT_NEG_OP(T) \
+    template<int N>\
+    Vector<T, N> operator-(const Vector<T, N>& thisVal) \
+    { \
+        Vector<T, N> result;\
+        for (int i = 0; i < N; i++) \
+            result[i] = 0 - thisVal[i]; \
+        return result;\
+    }
+SLANG_VECTOR_INT_NEG_OP(int)
+SLANG_VECTOR_INT_NEG_OP(int8_t)
+SLANG_VECTOR_INT_NEG_OP(int16_t)
+SLANG_VECTOR_INT_NEG_OP(int64_t)
+SLANG_VECTOR_INT_NEG_OP(uint)
+SLANG_VECTOR_INT_NEG_OP(uint8_t)
+SLANG_VECTOR_INT_NEG_OP(uint16_t)
+SLANG_VECTOR_INT_NEG_OP(uint64_t)
+
+#define SLANG_FLOAT_VECTOR_MOD(T)\
+    template<int N> \
+    Vector<T, N> operator%(const Vector<T, N>& left, const Vector<T, N>& right) \
+    {\
+        Vector<T, N> result;\
+        for (int i = 0; i < N; i++) \
+            result[i] = _slang_fmod(left[i], right[i]); \
+        return result;\
+    }
+
+SLANG_FLOAT_VECTOR_MOD(float)
+SLANG_FLOAT_VECTOR_MOD(double)
+#undef SLANG_FLOAT_VECTOR_MOD
+#undef SLANG_VECTOR_BINARY_OP
+#undef SLANG_VECTOR_UNARY_OP
+#undef SLANG_INT_VECTOR_OPS
+#undef SLANG_FLOAT_VECTOR_OPS
+#undef SLANG_VECTOR_INT_NEG_OP
+#undef SLANG_FLOAT_VECTOR_MOD
+
 template <typename T, int ROWS, int COLS>
 struct Matrix
 {
     Vector<T, COLS> rows[ROWS];
+    Vector<T, COLS>& operator[](size_t index) { return rows[index]; }
+    Matrix() = default;
+    Matrix(T scalar)
+    {
+        for (int i = 0; i < ROWS; i++)
+            rows[i] = Vector<T, COLS>(scalar);
+    }
+    Matrix(const Vector<T, COLS>& row0)
+    {
+        rows[0] = row0;
+    }
+    Matrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1)
+    {
+        rows[0] = row0;
+        rows[1] = row1;
+    }
+    Matrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2)
+    {
+        rows[0] = row0;
+        rows[1] = row1;
+        rows[2] = row2;
+    }
+    Matrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2, const Vector<T, COLS>& row3)
+    {
+        rows[0] = row0;
+        rows[1] = row1;
+        rows[2] = row2;
+        rows[3] = row3;
+    }
+    template<typename U, int otherRow, int otherCol>
+    Matrix(const Matrix<U, otherRow, otherCol>& other)
+    {
+        int minRow = ROWS;
+        int minCol = COLS;
+        if (minRow > otherRow) minRow = otherRow;
+        if (minCol > otherCol) minCol = otherCol;
+        for (int i = 0; i < minRow; i++)
+            for (int j = 0; j < minCol; j++)
+                rows[i][j] = (T)other.rows[i][j];
+    }
+    Matrix(T v0, T v1, T v2, T v3)
+    {
+        rows[0][0] = v0;  rows[0][1] = v1;
+        rows[1][0] = v2;  rows[1][1] = v3;
+    }
+    Matrix(T v0, T v1, T v2, T v3, T v4, T v5)
+    {
+        if (COLS == 3)
+        {
+            rows[0][0] = v0;  rows[0][1] = v1; rows[0][2] = v2;
+            rows[1][0] = v3;  rows[1][1] = v4; rows[1][2] = v5;
+        }
+        else
+        {
+            rows[0][0] = v0;  rows[0][1] = v1;
+            rows[1][0] = v2;  rows[1][1] = v3;
+            rows[2][0] = v4;  rows[2][1] = v5;
+        }
+    }
+    Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7)
+    {
+        if (COLS == 4)
+        {
+            rows[0][0] = v0;  rows[0][1] = v1; rows[0][2] = v2; rows[0][3] = v3;
+            rows[1][0] = v4;  rows[1][1] = v5; rows[1][2] = v6; rows[1][3] = v7;
+        }
+        else
+        {
+            rows[0][0] = v0;  rows[0][1] = v1;
+            rows[1][0] = v2;  rows[1][1] = v3;
+            rows[2][0] = v4;  rows[2][1] = v5;
+            rows[3][0] = v6;  rows[3][1] = v7;
+        }
+    }
+    Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8)
+    {
+        rows[0][0] = v0;  rows[0][1] = v1;  rows[0][2] = v2;
+        rows[1][0] = v3;  rows[1][1] = v4;  rows[1][2] = v5;
+        rows[2][0] = v6;  rows[2][1] = v7;  rows[2][2] = v8;
+    }
+    Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11)
+    {
+        if (COLS == 4)
+        {
+            rows[0][0] = v0;  rows[0][1] = v1;  rows[0][2] = v2;  rows[0][3] = v3;
+            rows[1][0] = v4;  rows[1][1] = v5;  rows[1][2] = v6;  rows[1][3] = v7;
+            rows[2][0] = v8;  rows[2][1] = v9;  rows[2][2] = v10; rows[2][3] = v11;
+        }
+        else
+        {
+            rows[0][0] = v0;  rows[0][1] = v1;  rows[0][2] = v2;
+            rows[1][0] = v3;  rows[1][1] = v4;  rows[1][2] = v5;
+            rows[2][0] = v6;  rows[2][1] = v7;  rows[2][2] = v8;
+            rows[3][0] = v9;  rows[3][1] = v10; rows[3][2] = v11;
+        }
+    }
+    Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
+    {
+        rows[0][0] = v0;  rows[0][1] = v1;  rows[0][2] = v2;  rows[0][3] = v3;
+        rows[1][0] = v4;  rows[1][1] = v5;  rows[1][2] = v6;  rows[1][3] = v7;
+        rows[2][0] = v8;  rows[2][1] = v9;  rows[2][2] = v10; rows[2][3] = v11;
+        rows[3][0] = v12; rows[3][1] = v13; rows[3][2] = v14; rows[3][3] = v15;
+    }
 };
 
+#define SLANG_MATRIX_BINARY_OP(T, op) \
+    template<int R, int C> \
+    Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal, const Matrix<T, R, C>& other) \
+    { \
+        Matrix<T, R, C> result;\
+        for (int i = 0; i < R; i++) \
+            for (int j = 0; j < C; j++) \
+                result.rows[i][j] = thisVal.rows[i][j] op other.rows[i][j]; \
+        return result;\
+    }
+
+#define SLANG_MATRIX_UNARY_OP(T, op) \
+    template<int R, int C> \
+    Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal) \
+    { \
+        Matrix<T, R, C> result;\
+        for (int i = 0; i < R; i++) \
+            for (int j = 0; j < C; j++) \
+                result[i].rows[i][j] = op thisVal.rows[i][j]; \
+        return result;\
+    }
+#define SLANG_INT_MATRIX_OPS(T) \
+    SLANG_MATRIX_BINARY_OP(T, +)\
+    SLANG_MATRIX_BINARY_OP(T, -)\
+    SLANG_MATRIX_BINARY_OP(T, *)\
+    SLANG_MATRIX_BINARY_OP(T, / )\
+    SLANG_MATRIX_BINARY_OP(T, &)\
+    SLANG_MATRIX_BINARY_OP(T, |)\
+    SLANG_MATRIX_BINARY_OP(T, &&)\
+    SLANG_MATRIX_BINARY_OP(T, ||)\
+    SLANG_MATRIX_BINARY_OP(T, ^)\
+    SLANG_MATRIX_BINARY_OP(T, %)\
+    SLANG_MATRIX_UNARY_OP(T, !)\
+    SLANG_MATRIX_UNARY_OP(T, ~)
+#define SLANG_FLOAT_MATRIX_OPS(T) \
+    SLANG_MATRIX_BINARY_OP(T, +)\
+    SLANG_MATRIX_BINARY_OP(T, -)\
+    SLANG_MATRIX_BINARY_OP(T, *)\
+    SLANG_MATRIX_BINARY_OP(T, /)\
+    SLANG_MATRIX_UNARY_OP(T, -)
+SLANG_INT_MATRIX_OPS(int)
+SLANG_INT_MATRIX_OPS(int8_t)
+SLANG_INT_MATRIX_OPS(int16_t)
+SLANG_INT_MATRIX_OPS(int64_t)
+SLANG_INT_MATRIX_OPS(uint)
+SLANG_INT_MATRIX_OPS(uint8_t)
+SLANG_INT_MATRIX_OPS(uint16_t)
+SLANG_INT_MATRIX_OPS(uint64_t)
+
+SLANG_FLOAT_MATRIX_OPS(float)
+SLANG_FLOAT_MATRIX_OPS(double)
+
+#define SLANG_MATRIX_INT_NEG_OP(T) \
+    template<int R, int C>\
+    SLANG_FORCE_INLINE Matrix<T, R, C> operator-(Matrix<T, R, C> thisVal) \
+    { \
+        Matrix<T, R, C> result;\
+        for (int i = 0; i < R; i++) \
+            for (int j = 0; j < C; j++) \
+            result.rows[i][j] = 0 - thisVal.rows[i][j]; \
+        return result;\
+    }
+    SLANG_MATRIX_INT_NEG_OP(int)
+    SLANG_MATRIX_INT_NEG_OP(int8_t)
+    SLANG_MATRIX_INT_NEG_OP(int16_t)
+    SLANG_MATRIX_INT_NEG_OP(int64_t)
+    SLANG_MATRIX_INT_NEG_OP(uint)
+    SLANG_MATRIX_INT_NEG_OP(uint8_t)
+    SLANG_MATRIX_INT_NEG_OP(uint16_t)
+    SLANG_MATRIX_INT_NEG_OP(uint64_t)
+
+#define SLANG_FLOAT_MATRIX_MOD(T)\
+    template<int R, int C> \
+    SLANG_FORCE_INLINE Matrix<T, R, C> operator%(Matrix<T, R, C> left, Matrix<T, R, C> right) \
+    {\
+        Matrix<T, R, C> result;\
+        for (int i = 0; i < R; i++) \
+            for (int j = 0; j < C; j++) \
+                result.rows[i][j] = _slang_fmod(left.rows[i][j], right.rows[i][j]); \
+        return result;\
+    }
+
+    SLANG_FLOAT_MATRIX_MOD(float)
+    SLANG_FLOAT_MATRIX_MOD(double)
+#undef SLANG_FLOAT_MATRIX_MOD
+#undef SLANG_MATRIX_BINARY_OP
+#undef SLANG_MATRIX_UNARY_OP
+#undef SLANG_INT_MATRIX_OPS
+#undef SLANG_FLOAT_MATRIX_OPS
+#undef SLANG_MATRIX_INT_NEG_OP
+#undef SLANG_FLOAT_MATRIX_MOD
+
 // We can just map `NonUniformResourceIndex` type directly to the index type on CPU, as CPU does not require
 // any special handling around such accesses.
 typedef size_t NonUniformResourceIndex;
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 448b69c63..cb1bb188b 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -149,12 +149,11 @@ typedef size_t NonUniformResourceIndex;
 template <typename T, int ROWS, int COLS>
 struct Matrix;
 
-typedef bool bool1;
+typedef int1 bool1;
 typedef int2 bool2;
 typedef int3 bool3;
 typedef int4 bool4; 
 
-
 typedef signed char int8_t;
 typedef short int16_t;
 typedef int int32_t;
@@ -186,163 +185,522 @@ union Union64
     double d;
 };
 
-//
-// Half support
-// 
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float _slang_fmod(float x, float y)
+{
+    return ::fmodf(x, y);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double _slang_fmod(double x, double y)
+{
+    return ::fmod(x, y);
+}
 
 #if SLANG_CUDA_ENABLE_HALF
 
 // Add the other vector half types
-struct __half3 { __half2 xy; __half z; };
-struct __half4 { __half2 xy; __half2 zw; };
-
-// *** convert ***
-
-// half -> other
-
-// float
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 convert_float2(const __half2& v) { return __half22float2(v); } 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float3 convert_float3(const __half3& v) { const float2 xy = __half22float2(v.xy); return float3{xy.x, xy.y, __half2float(v.z)}; } 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 convert_float4(const __half4& v) { const float2 xy = __half22float2(v.xy); const float2 zw = __half22float2(v.zw); return float4{xy.x, xy.y, zw.x, zw.y}; }
-
-// double
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double2 convert_double2(const __half2& v) { const float2 xy = __half22float2(v); return double2{ xy.x, xy.y }; } 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double3 convert_double3(const __half3& v) { const float2 xy = __half22float2(v.xy); return double3{ xy.x, xy.y, __half2float(v.z)}; } 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double4 convert_double4(const __half4& v) { const float2 xy = __half22float2(v.xy); const float2 zw = __half22float2(v.zw); return double4{xy.x, xy.y, zw.x, zw.y}; }
-
-// int
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int2 convert_int2(const __half2& v) { return int2 { __half2int_rz(v.x), __half2int_rz(v.y) }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int3 convert_int3(const __half3& v) { return int3 { __half2int_rz(v.xy.x), __half2int_rz(v.xy.y), __half2int_rz(v.z) }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int4 convert_int4(const __half4& v) { return int4 { __half2int_rz(v.xy.x), __half2int_rz(v.xy.y), __half2int_rz(v.zw.x), __half2int_rz(v.zw.y)}; }
-
-// uint
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 convert_uint2(const __half2& v) { return uint2 { __half2uint_rz(v.x), __half2uint_rz(v.y) }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint3 convert_uint3(const __half3& v) { return uint3 { __half2uint_rz(v.xy.x), __half2uint_rz(v.xy.y), __half2uint_rz(v.z) }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 convert_uint4(const __half4& v) { return uint4 { __half2uint_rz(v.xy.x), __half2uint_rz(v.xy.y), __half2uint_rz(v.zw.x), __half2uint_rz(v.zw.y)}; }
+struct __half1 { __half x; };
+struct __align__(4) __half3 { __half x, y, z; };
+struct __align__(4) __half4 { __half x, y, z, w; };
+#endif
 
-// other -> half
+#define SLANG_VECTOR_GET_ELEMENT(T) \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##1 x, int index) { return ((T*)(&x))[index]; }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##2 x, int index) { return ((T*)(&x))[index]; }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##3 x, int index) { return ((T*)(&x))[index]; }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##4 x, int index) { return ((T*)(&x))[index]; }
+SLANG_VECTOR_GET_ELEMENT(int)
+SLANG_VECTOR_GET_ELEMENT(uint)
+SLANG_VECTOR_GET_ELEMENT(short)
+SLANG_VECTOR_GET_ELEMENT(ushort)
+SLANG_VECTOR_GET_ELEMENT(char)
+SLANG_VECTOR_GET_ELEMENT(uchar)
+SLANG_VECTOR_GET_ELEMENT(longlong)
+SLANG_VECTOR_GET_ELEMENT(ulonglong)
+SLANG_VECTOR_GET_ELEMENT(float)
+SLANG_VECTOR_GET_ELEMENT(double)
+
+#define SLANG_VECTOR_GET_ELEMENT_PTR(T) \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##1* x, int index) { return ((T*)(x)) + index; }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##2* x, int index) { return ((T*)(x)) + index; }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##3* x, int index) { return ((T*)(x)) + index; }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##4* x, int index) { return ((T*)(x)) + index; }
+SLANG_VECTOR_GET_ELEMENT_PTR(int)
+SLANG_VECTOR_GET_ELEMENT_PTR(uint)
+SLANG_VECTOR_GET_ELEMENT_PTR(short)
+SLANG_VECTOR_GET_ELEMENT_PTR(ushort)
+SLANG_VECTOR_GET_ELEMENT_PTR(char)
+SLANG_VECTOR_GET_ELEMENT_PTR(uchar)
+SLANG_VECTOR_GET_ELEMENT_PTR(longlong)
+SLANG_VECTOR_GET_ELEMENT_PTR(ulonglong)
+SLANG_VECTOR_GET_ELEMENT_PTR(float)
+SLANG_VECTOR_GET_ELEMENT_PTR(double)
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const float2& v) { return __float22half2_rn(v); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const float3& v) { return __half3{ __float22half2_rn(float2{v.x, v.y}), __float2half_rn(v.z) }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const float4& v) { return __half4{ __float22half2_rn(float2{v.x, v.y}), __float22half2_rn(float2{v.z, v.w}) }; }
+#if SLANG_CUDA_ENABLE_HALF
+SLANG_VECTOR_GET_ELEMENT(__half)
+SLANG_VECTOR_GET_ELEMENT_PTR(__half)
+#endif
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const int2& v) { return __half2{ __int2half_rz(v.x), __int2half_rz(v.y) }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const int3& v) { return __half3{ __half2{__int2half_rz(v.x), __int2half_rz(v.y)}, __int2half_rz(v.z) }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const int4& v) { return __half4{ __half2{__int2half_rz(v.x), __int2half_rz(v.y)}, __half2{__int2half_rz(v.z),  __int2half_rz(v.w)} }; }
+#define SLANG_CUDA_VECTOR_BINARY_OP(T, n, op) \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal, T##n other) \
+    { \
+        T##n result;\
+        for (int i = 0; i < n; i++) \
+            *_slang_vector_get_element_ptr(&result, i) = _slang_vector_get_element(thisVal,i) op _slang_vector_get_element(other,i); \
+        return result;\
+    }
+#define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op) \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL bool##n operator op(T##n thisVal, T##n other) \
+    { \
+        bool##n result;\
+        for (int i = 0; i < n; i++) \
+            *_slang_vector_get_element_ptr(&result, i) = (int)(_slang_vector_get_element(thisVal,i) op _slang_vector_get_element(other,i)); \
+        return result;\
+    }
+#define SLANG_CUDA_VECTOR_UNARY_OP(T, n, op) \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal) \
+    { \
+        T##n result;\
+        for (int i = 0; i < n; i++) \
+            *_slang_vector_get_element_ptr(&result, i) = op _slang_vector_get_element(thisVal,i); \
+        return result;\
+    }
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const uint2& v) { return __half2{ __uint2half_rz(v.x), __uint2half_rz(v.y) }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const uint3& v) { return __half3{ __half2{__uint2half_rz(v.x), __uint2half_rz(v.y)}, __uint2half_rz(v.z) }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const uint4& v) { return __half4{ __half2{__uint2half_rz(v.x), __uint2half_rz(v.y)}, __half2{__uint2half_rz(v.z),  __uint2half_rz(v.w)} }; }
+#define SLANG_CUDA_VECTOR_INT_OP(T, n) \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, %)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, ^)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, &)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, |)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, >>)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, <<)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=)\
+    SLANG_CUDA_VECTOR_UNARY_OP(T, n, !)\
+    SLANG_CUDA_VECTOR_UNARY_OP(T, n, -)\
+    SLANG_CUDA_VECTOR_UNARY_OP(T, n, ~)
+
+#define SLANG_CUDA_VECTOR_INT_OPS(T) \
+    SLANG_CUDA_VECTOR_INT_OP(T, 2) \
+    SLANG_CUDA_VECTOR_INT_OP(T, 3) \
+    SLANG_CUDA_VECTOR_INT_OP(T, 4)
+
+SLANG_CUDA_VECTOR_INT_OPS(int)
+SLANG_CUDA_VECTOR_INT_OPS(uint)
+SLANG_CUDA_VECTOR_INT_OPS(ushort)
+SLANG_CUDA_VECTOR_INT_OPS(short)
+SLANG_CUDA_VECTOR_INT_OPS(char)
+SLANG_CUDA_VECTOR_INT_OPS(uchar)
+SLANG_CUDA_VECTOR_INT_OPS(longlong)
+SLANG_CUDA_VECTOR_INT_OPS(ulonglong)
+
+#define SLANG_CUDA_VECTOR_FLOAT_OP(T, n) \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)\
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==)\
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=)\
+    SLANG_CUDA_VECTOR_UNARY_OP(T, n, -)
+#define SLANG_CUDA_VECTOR_FLOAT_OPS(T) \
+    SLANG_CUDA_VECTOR_FLOAT_OP(T, 2) \
+    SLANG_CUDA_VECTOR_FLOAT_OP(T, 3) \
+    SLANG_CUDA_VECTOR_FLOAT_OP(T, 4)
+
+SLANG_CUDA_VECTOR_FLOAT_OPS(float)
+SLANG_CUDA_VECTOR_FLOAT_OPS(double)
+#if SLANG_CUDA_ENABLE_HALF
+SLANG_CUDA_VECTOR_FLOAT_OPS(__half)
+#endif
+#define SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, n)\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator%(const T##n& left, const T##n& right) \
+    {\
+        T##n result;\
+        for (int i = 0; i < n; i++) \
+            *_slang_vector_get_element_ptr(&result, i) = _slang_fmod(_slang_vector_get_element(left,i), _slang_vector_get_element(right,i)); \
+        return result;\
+    }
+#define SLANG_CUDA_FLOAT_VECTOR_MOD(T) \
+    SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 2)\
+    SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 3)\
+    SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 4)
+
+SLANG_CUDA_FLOAT_VECTOR_MOD(float)
+SLANG_CUDA_FLOAT_VECTOR_MOD(double)
+
+#define SLANG_MAKE_VECTOR(T) \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x, T y) { return T##2{x, y}; }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x, T y, T z) { return T##3{ x, y, z }; }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x, T y, T z, T w) { return T##4{ x, y, z, w }; }
+SLANG_MAKE_VECTOR(int)
+SLANG_MAKE_VECTOR(uint)
+SLANG_MAKE_VECTOR(short)
+SLANG_MAKE_VECTOR(ushort)
+SLANG_MAKE_VECTOR(char)
+SLANG_MAKE_VECTOR(uchar)
+SLANG_MAKE_VECTOR(float)
+SLANG_MAKE_VECTOR(double)
+SLANG_MAKE_VECTOR(longlong)
+SLANG_MAKE_VECTOR(ulonglong)
+#if SLANG_CUDA_ENABLE_HALF
+SLANG_MAKE_VECTOR(__half)
+#endif
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const double2& v) { return __float22half2_rn(float2{v.x, v.y}); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const double3& v) { return __half3{ __float22half2_rn(float2{v.x, v.y}), __float2half_rn(v.z) }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const double4& v) { return __half4{ __float22half2_rn(float2{v.x, v.y}), __float22half2_rn(float2{v.z, v.w}) }; }
+#define SLANG_MAKE_VECTOR_FROM_SCALAR(T) \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##1 make_##T##1(T x) { return T##1{x}; }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) { return make_##T##2(x, x); }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) { return make_##T##3(x, x, x); }\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) { return make_##T##4(x, x, x, x); }
+SLANG_MAKE_VECTOR_FROM_SCALAR(int)
+SLANG_MAKE_VECTOR_FROM_SCALAR(uint)
+SLANG_MAKE_VECTOR_FROM_SCALAR(short)
+SLANG_MAKE_VECTOR_FROM_SCALAR(ushort)
+SLANG_MAKE_VECTOR_FROM_SCALAR(char)
+SLANG_MAKE_VECTOR_FROM_SCALAR(uchar)
+SLANG_MAKE_VECTOR_FROM_SCALAR(longlong)
+SLANG_MAKE_VECTOR_FROM_SCALAR(ulonglong)
+SLANG_MAKE_VECTOR_FROM_SCALAR(float)
+SLANG_MAKE_VECTOR_FROM_SCALAR(double)
+#if SLANG_CUDA_ENABLE_HALF
+SLANG_MAKE_VECTOR_FROM_SCALAR(__half)
+#endif
 
-// *** make ***
+template<typename T, int n>
+struct GetVectorTypeImpl {};
 
-// Mechanism to make half vectors
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 make___half2(__half x, __half y) { return __halves2half2(x, y); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 make___half3(__half x, __half y, __half z) { return __half3{ __halves2half2(x, y), z }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 make___half4(__half x, __half y, __half z, __half w) { return __half4{ __halves2half2(x, y), __halves2half2(z, w)}; }
+#define GET_VECTOR_TYPE_IMPL(T, n)\
+struct GetVectorTypeImpl<T,n>\
+{\
+    typedef T##n type;\
+    static SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n fromScalar(T v) { return make_##T##n(v); } \
+};
+#define GET_VECTOR_TYPE_IMPL_N(T)\
+    GET_VECTOR_TYPE_IMPL(T, 1)\
+    GET_VECTOR_TYPE_IMPL(T, 2)\
+    GET_VECTOR_TYPE_IMPL(T, 3)\
+    GET_VECTOR_TYPE_IMPL(T, 4)
+
+GET_VECTOR_TYPE_IMPL_N(int)
+GET_VECTOR_TYPE_IMPL_N(uint)
+GET_VECTOR_TYPE_IMPL_N(short)
+GET_VECTOR_TYPE_IMPL_N(ushort)
+GET_VECTOR_TYPE_IMPL_N(char)
+GET_VECTOR_TYPE_IMPL_N(uchar)
+GET_VECTOR_TYPE_IMPL_N(longlong)
+GET_VECTOR_TYPE_IMPL_N(ulonglong)
+GET_VECTOR_TYPE_IMPL_N(float)
+GET_VECTOR_TYPE_IMPL_N(double)
+#if SLANG_CUDA_ENABLE_HALF
+GET_VECTOR_TYPE_IMPL_N(__half)
+#endif
+template<typename T, int n>
+using Vector = typename GetVectorTypeImpl<T, n>::type;
 
-// *** constructFromScalar ***
+template<typename T, int n, typename OtherT, int m>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, n> _slang_vector_reshape(const Vector<OtherT, m> other)
+{
+    Vector<T, n> result;
+    for (int i = 0; i < n; i++)
+    {
+        OtherT otherElement = T(0);
+        if (i < m)
+            otherElement = _slang_vector_get_element(other, i);
+        *_slang_vector_get_element_ptr(&result, i) = (T)otherElement;
+    }
+    return result;
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 constructFromScalar___half2(half x) { return __half2half2(x); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 constructFromScalar___half3(half x) { return __half3{__half2half2(x), x}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 constructFromScalar___half4(half x) { const __half2 v = __half2half2(x); return __half4{v, v}; }
+template <typename T, int ROWS, int COLS>
+struct Matrix
+{
+    Vector<T, COLS> rows[ROWS];
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, COLS>& operator[](size_t index) { return rows[index]; }
+};
 
-// *** half2 ***
 
-// half2 maths ops
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T scalar)
+{
+    Matrix<T, ROWS, COLS> result;
+    for (int i = 0; i < ROWS; i++)
+        result.rows[i] = GetVectorTypeImpl<T, COLS>::fromScalar(scalar);
+    return result;
 
-// NOTE! That by default these are in cuda_fp16.hpp, but we disable them, because we need to define the comparison operators
-// as we need versions that will return vector<bool>
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator+(const __half2& lh, const __half2& rh) { return __hadd2(lh, rh); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator-(const __half2& lh, const __half2& rh) { return __hsub2(lh, rh); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator*(const __half2& lh, const __half2& rh) { return __hmul2(lh, rh); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator/(const __half2& lh, const __half2& rh) { return __h2div(lh, rh); }
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0)
+{
+    Matrix<T, ROWS, COLS> result;
+    result.rows[0] = row0;
+    return result;
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2& operator+=(__half2& lh, const __half2& rh) { lh = __hadd2(lh, rh); return lh; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2& operator-=(__half2& lh, const __half2& rh) { lh = __hsub2(lh, rh); return lh; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2& operator*=(__half2& lh, const __half2& rh) { lh = __hmul2(lh, rh); return lh; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2& operator/=(__half2& lh, const __half2& rh) { lh = __h2div(lh, rh); return lh; }
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1)
+{
+    Matrix<T, ROWS, COLS> result;
+    result.rows[0] = row0;
+    result.rows[1] = row1;
+    return result;
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 &operator++(__half2 &h)      { __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hadd2(h, one); return h; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 &operator--(__half2 &h)      { __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hsub2(h, one); return h; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2  operator++(__half2 &h, int) { __half2 ret = h; __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hadd2(h, one); return ret; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2  operator--(__half2 &h, int) { __half2 ret = h; __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hsub2(h, one); return ret; }
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2)
+{
+    Matrix<T, ROWS, COLS> result;
+    result.rows[0] = row0;
+    result.rows[1] = row1;
+    result.rows[2] = row2;
+    return result;
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator+(const __half2 &h) { return h; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator-(const __half2 &h) { return __hneg2(h); }
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2, const Vector<T, COLS>& row3)
+{
+    Matrix<T, ROWS, COLS> result;
+    result.rows[0] = row0;
+    result.rows[1] = row1;
+    result.rows[2] = row2;
+    result.rows[3] = row3;
+    return result;
+}
 
-// vec op scalar
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator+(const __half2& lh, __half rh) { return __hadd2(lh, __half2half2(rh)); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator-(const __half2& lh, __half rh) { return __hsub2(lh, __half2half2(rh)); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator*(const __half2& lh, __half rh) { return __hmul2(lh, __half2half2(rh)); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator/(const __half2& lh, __half rh) { return __h2div(lh, __half2half2(rh)); }
+template<typename T, int ROWS, int COLS, typename U, int otherRow, int otherCol>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Matrix<U, otherRow, otherCol>& other)
+{
+    Matrix<T, ROWS, COLS> result;
+    int minRow = ROWS;
+    int minCol = COLS;
+    if (minRow > otherRow) minRow = otherRow;
+    if (minCol > otherCol) minCol = otherCol;
+    for (int i = 0; i < minRow; i++)
+        for (int j = 0; j < minCol; j++)
+            *_slang_vector_get_element_ptr(result.rows + i, j) = (T)_slang_vector_get_element(other.rows[i], j);
+    return result;
+}
 
-// scalar op vec
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator+(__half lh, const __half2& rh) { return __hadd2(__half2half2(lh), rh); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator-(__half lh, const __half2& rh) { return __hsub2(__half2half2(lh), rh); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator*(__half lh, const __half2& rh) { return __hmul2(__half2half2(lh), rh); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator/(__half lh, const __half2& rh) { return __h2div(__half2half2(lh), rh); }
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3)
+{
+    Matrix<T, ROWS, COLS> rs;
+    rs.rows[0].x = v0;  rs.rows[0].y = v1;
+    rs.rows[1].x = v2;  rs.rows[1].y = v3;
+    return rs;
+}
 
-// *** half3 ***
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5)
+{
+    Matrix<T, ROWS, COLS> rs;
+    if (COLS == 3)
+    {
+        rs.rows[0].x = v0;  rs.rows[0].y = v1; rs.rows[0].z = v2;
+        rs.rows[1].x = v3;  rs.rows[1].y = v4; rs.rows[1].z = v5;
+    }
+    else
+    {
+        rs.rows[0].x = v0;  rs.rows[0].y = v1;
+        rs.rows[1].x = v2;  rs.rows[1].y = v3;
+        rs.rows[2].x = v4;  rs.rows[2].y = v5;
+    }
+    return rs;
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3& lh, const __half3& rh) { return __half3{__hadd2(lh.xy, rh.xy), __hadd(lh.z, rh.z)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3& lh, const __half3& rh) { return __half3{__hsub2(lh.xy, rh.xy), __hsub(lh.z, rh.z)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(const __half3& lh, const __half3& rh) { return __half3{__hmul2(lh.xy, rh.xy), __hmul(lh.z, rh.z)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(const __half3& lh, const __half3& rh) { return __half3{__h2div(lh.xy, rh.xy), __hdiv(lh.z, rh.z)}; }
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3& h) { return __half3{__hneg2(h.xy), __hneg(h.z)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3& h) { return h; }
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7)
+{
+    Matrix<T, ROWS, COLS> rs;
+    if (COLS == 4)
+    {
+        rs.rows[0].x = v0;  rs.rows[0].y = v1; rs.rows[0].z = v2; rs.rows[0].w = v3;
+        rs.rows[1].x = v4;  rs.rows[1].y = v5; rs.rows[1].z = v6; rs.rows[1].w = v7;
+    }
+    else
+    {
+        rs.rows[0].x = v0;  rs.rows[0].y = v1;
+        rs.rows[1].x = v2;  rs.rows[1].y = v3;
+        rs.rows[2].x = v4;  rs.rows[2].y = v5;
+        rs.rows[3].x = v6;  rs.rows[3].y = v7;
+    }
+    return rs;
+}
 
-// vec op scalar
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3& lh, __half rh) { return __half3{__hadd2(lh.xy, __half2half2(rh)), __hadd(lh.z, rh)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3& lh, __half rh) { return __half3{__hsub2(lh.xy, __half2half2(rh)), __hsub(lh.z, rh)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(const __half3& lh, __half rh) { return __half3{__hmul2(lh.xy, __half2half2(rh)), __hmul(lh.z, rh)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(const __half3& lh, __half rh) { return __half3{__h2div(lh.xy, __half2half2(rh)), __hdiv(lh.z, rh)}; }
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8)
+{
+    Matrix<T, ROWS, COLS> rs;
+    rs.rows[0].x = v0;  rs.rows[0].y = v1;  rs.rows[0].z = v2;
+    rs.rows[1].x = v3;  rs.rows[1].y = v4;  rs.rows[1].z = v5;
+    rs.rows[2].x = v6;  rs.rows[2].y = v7;  rs.rows[2].z = v8;
+    return rs;
+}
 
-// scalar op vec
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(__half lh, const __half3& rh) { return __half3{__hadd2(__half2half2(lh), rh.xy), __hadd(lh, rh.z)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(__half lh, const __half3& rh) { return __half3{__hsub2(__half2half2(lh), rh.xy), __hsub(lh, rh.z)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(__half lh, const __half3& rh) { return __half3{__hmul2(__half2half2(lh), rh.xy), __hmul(lh, rh.z)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(__half lh, const __half3& rh) { return __half3{__h2div(__half2half2(lh), rh.xy), __hdiv(lh, rh.z)}; }
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11)
+{
+    Matrix<T, ROWS, COLS> rs;
+    if (COLS == 4)
+    {
+        rs.rows[0].x = v0;  rs.rows[0].y = v1;  rs.rows[0].z = v2;  rs.rows[0].w = v3;
+        rs.rows[1].x = v4;  rs.rows[1].y = v5;  rs.rows[1].z = v6;  rs.rows[1].w = v7;
+        rs.rows[2].x = v8;  rs.rows[2].y = v9;  rs.rows[2].z = v10; rs.rows[2].w = v11;
+    }
+    else
+    {
+        rs.rows[0].x = v0;  rs.rows[0].y = v1;  rs.rows[0].z = v2;
+        rs.rows[1].x = v3;  rs.rows[1].y = v4;  rs.rows[1].z = v5;
+        rs.rows[2].x = v6;  rs.rows[2].y = v7;  rs.rows[2].z = v8;
+        rs.rows[3].x = v9;  rs.rows[3].y = v10; rs.rows[3].z = v11;
+    }
+    return rs;
+}
 
-// *** half4 ***
+template<typename T, int ROWS, int COLS>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
+{
+    Matrix<T, ROWS, COLS> rs;
+    rs.rows[0].x = v0;  rs.rows[0].y = v1;  rs.rows[0].z = v2;  rs.rows[0].w = v3;
+    rs.rows[1].x = v4;  rs.rows[1].y = v5;  rs.rows[1].z = v6;  rs.rows[1].w = v7;
+    rs.rows[2].x = v8;  rs.rows[2].y = v9;  rs.rows[2].z = v10; rs.rows[2].w = v11;
+    rs.rows[3].x = v12; rs.rows[3].y = v13; rs.rows[3].z = v14; rs.rows[3].w = v15;
+    return rs;
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4& lh, const __half4& rh) { return __half4{__hadd2(lh.xy, rh.xy), __hadd2(lh.zw, rh.zw)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4& lh, const __half4& rh) { return __half4{__hsub2(lh.xy, rh.xy), __hsub2(lh.zw, rh.zw)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(const __half4& lh, const __half4& rh) { return __half4{__hmul2(lh.xy, rh.xy), __hmul2(lh.zw, rh.zw)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(const __half4& lh, const __half4& rh) { return __half4{__h2div(lh.xy, rh.xy), __h2div(lh.zw, rh.zw)}; }
+#define SLANG_MATRIX_BINARY_OP(T, op) \
+    template<int R, int C> \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal, const Matrix<T, R, C>& other) \
+    { \
+        Matrix<T, R, C> result;\
+        for (int i = 0; i < R; i++) \
+            for (int j = 0; j < C; j++) \
+                *_slang_vector_get_element_ptr(result.rows+i,j) = _slang_vector_get_element(thisVal.rows[i], j) op _slang_vector_get_element(other.rows[i], j); \
+        return result;\
+    }
 
-// vec op scalar
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__hadd2(lh.xy, rhv), __hadd2(lh.zw, rhv)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__hsub2(lh.xy, rhv), __hsub2(lh.zw, rhv)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__hmul2(lh.xy, rhv), __hmul2(lh.zw, rhv)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__h2div(lh.xy, rhv), __h2div(lh.zw, rhv)}; }
+#define SLANG_MATRIX_UNARY_OP(T, op) \
+    template<int R, int C> \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal) \
+    { \
+        Matrix<T, R, C> result;\
+        for (int i = 0; i < R; i++) \
+            for (int j = 0; j < C; j++) \
+                *_slang_vector_get_element_ptr(result.rows+i,j) = op _slang_vector_get_element(thisVal.rows[i], j); \
+        return result;\
+    }
+#define SLANG_INT_MATRIX_OPS(T) \
+    SLANG_MATRIX_BINARY_OP(T, +)\
+    SLANG_MATRIX_BINARY_OP(T, -)\
+    SLANG_MATRIX_BINARY_OP(T, *)\
+    SLANG_MATRIX_BINARY_OP(T, / )\
+    SLANG_MATRIX_BINARY_OP(T, &)\
+    SLANG_MATRIX_BINARY_OP(T, |)\
+    SLANG_MATRIX_BINARY_OP(T, &&)\
+    SLANG_MATRIX_BINARY_OP(T, ||)\
+    SLANG_MATRIX_BINARY_OP(T, ^)\
+    SLANG_MATRIX_BINARY_OP(T, %)\
+    SLANG_MATRIX_UNARY_OP(T, !)\
+    SLANG_MATRIX_UNARY_OP(T, ~)
+#define SLANG_FLOAT_MATRIX_OPS(T) \
+    SLANG_MATRIX_BINARY_OP(T, +)\
+    SLANG_MATRIX_BINARY_OP(T, -)\
+    SLANG_MATRIX_BINARY_OP(T, *)\
+    SLANG_MATRIX_BINARY_OP(T, /)\
+    SLANG_MATRIX_UNARY_OP(T, -)
+SLANG_INT_MATRIX_OPS(int)
+SLANG_INT_MATRIX_OPS(uint)
+SLANG_INT_MATRIX_OPS(short)
+SLANG_INT_MATRIX_OPS(ushort)
+SLANG_INT_MATRIX_OPS(char)
+SLANG_INT_MATRIX_OPS(uchar)
+SLANG_INT_MATRIX_OPS(longlong)
+SLANG_INT_MATRIX_OPS(ulonglong)
+SLANG_FLOAT_MATRIX_OPS(float)
+SLANG_FLOAT_MATRIX_OPS(double)
+#if SLANG_CUDA_ENABLE_HALF
+SLANG_FLOAT_MATRIX_OPS(__half)
+#endif
+#define SLANG_MATRIX_INT_NEG_OP(T) \
+    template<int R, int C>\
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator-(Matrix<T, R, C> thisVal) \
+    { \
+        Matrix<T, R, C> result;\
+        for (int i = 0; i < R; i++) \
+            for (int j = 0; j < C; j++) \
+                *_slang_vector_get_element_ptr(result.rows+i,j) = 0 - _slang_vector_get_element(thisVal.rows[i], j); \
+        return result;\
+    }
+    SLANG_MATRIX_INT_NEG_OP(int)
+    SLANG_MATRIX_INT_NEG_OP(uint)
+    SLANG_MATRIX_INT_NEG_OP(short)
+    SLANG_MATRIX_INT_NEG_OP(ushort)
+    SLANG_MATRIX_INT_NEG_OP(char)
+    SLANG_MATRIX_INT_NEG_OP(uchar)
+    SLANG_MATRIX_INT_NEG_OP(longlong)
+    SLANG_MATRIX_INT_NEG_OP(ulonglong)
+
+#define SLANG_FLOAT_MATRIX_MOD(T)\
+    template<int R, int C> \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator%(Matrix<T, R, C> left, Matrix<T, R, C> right) \
+    {\
+        Matrix<T, R, C> result;\
+        for (int i = 0; i < R; i++) \
+            for (int j = 0; j < C; j++) \
+                *_slang_vector_get_element_ptr(result.rows+i,j) = _slang_fmod(_slang_vector_get_element(left.rows[i], j), _slang_vector_get_element(right.rows[i], j)); \
+        return result;\
+    }
 
-// scalar op vec
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__hadd2(lhv, rh.xy), __hadd2(lhv, rh.zw)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__hsub2(lhv, rh.xy), __hsub2(lhv, rh.zw)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__hmul2(lhv, rh.xy), __hmul2(lhv, rh.zw)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__h2div(lhv, rh.xy), __h2div(lhv, rh.zw)}; }
+    SLANG_FLOAT_MATRIX_MOD(float)
+    SLANG_FLOAT_MATRIX_MOD(double)
+#if SLANG_CUDA_ENABLE_HALF
+    template<int R, int C> 
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<__half, R, C> operator%(Matrix<__half, R, C> left, Matrix<__half, R, C> right)
+    {
+        Matrix<__half, R, C> result;
+        for (int i = 0; i < R; i++) 
+            for (int j = 0; j < C; j++) 
+                * _slang_vector_get_element_ptr(result.rows + i, j) = __float2half(_slang_fmod(__half2float(_slang_vector_get_element(left.rows[i], j)), __half2float(_slang_vector_get_element(right.rows[i], j))));
+        return result;
+    }
+#endif
+#undef SLANG_FLOAT_MATRIX_MOD
+#undef SLANG_MATRIX_BINARY_OP
+#undef SLANG_MATRIX_UNARY_OP
+#undef SLANG_INT_MATRIX_OPS
+#undef SLANG_FLOAT_MATRIX_OPS
+#undef SLANG_MATRIX_INT_NEG_OP
+#undef SLANG_FLOAT_MATRIX_MOD
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4& h) { return __half4{__hneg2(h.xy), __hneg2(h.zw)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4& h) { return h; }
+//
+// Half support
+// 
 
+#if SLANG_CUDA_ENABLE_HALF
 // Convenience functions ushort -> half
 
 SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __ushort_as_half(const ushort2& i) { return __halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y)); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __ushort_as_half(const ushort3& i) { return __half3{__halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y)), __ushort_as_half(i.z)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __ushort_as_half(const ushort4& i) { return __half4{ __halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y)), __halves2half2(__ushort_as_half(i.z), __ushort_as_half(i.w)) }; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __ushort_as_half(const ushort3& i) { return __half3{__ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z)}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __ushort_as_half(const ushort4& i) { return __half4{ __ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z), __ushort_as_half(i.w) }; }
 
 // Convenience functions half -> ushort
 
 SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort2 __half_as_ushort(const __half2& i) { return make_ushort2(__half_as_ushort(i.x), __half_as_ushort(i.y)); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort3 __half_as_ushort(const __half3& i) { return make_ushort3(__half_as_ushort(i.xy.x), __half_as_ushort(i.xy.y), __half_as_ushort(i.z)); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort4 __half_as_ushort(const __half4& i) { return make_ushort4(__half_as_ushort(i.xy.x), __half_as_ushort(i.xy.y), __half_as_ushort(i.zw.x), __half_as_ushort(i.zw.y)); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort3 __half_as_ushort(const __half3& i) { return make_ushort3(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z)); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort4 __half_as_ushort(const __half4& i) { return make_ushort4(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z), __half_as_ushort(i.w)); }
 
 // This is a little bit of a hack. Fortunately CUDA has the definitions of the templated types in 
 // include/surface_indirect_functions.h
@@ -438,7 +796,7 @@ template <> \
 SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 FUNC_NAME##_convert<float4>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
 { \
     const __half4 v = __ushort_as_half(FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
-    return float4{v.xy.x, v.xy.y, v.zw.x, v.zw.y}; \
+    return float4{v.x, v.y, v.z, v.w}; \
 }
 
 SLANG_SURFACE_READ_HALF_CONVERT(surf1Dread, (int x), (x))