diff options
| author | Yong He <yonghe@outlook.com> | 2023-02-16 13:55:32 -0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-02-16 13:55:32 -0800 |
| commit | 4c4826d47eeef4675daae4ae53ff76f4d5ebd84a (patch) | |
| tree | ed4af0ded878e4f06e9641ce61d26ffd7c89ccbc /prelude | |
| parent | eda88e513e8b1e2abc05e9dc8555f237d96472df (diff) | |
Overhaul global inst deduplication and cpp/cuda backend. (#2654)
* Overhaul global inst deduplication and cpp/cuda backend.
* Update IR documentation.
---------
Co-authored-by: Yong He <yhe@nvidia.com>
Diffstat (limited to 'prelude')
| -rw-r--r-- | prelude/slang-cpp-prelude.h | 2 | ||||
| -rw-r--r-- | prelude/slang-cpp-scalar-intrinsics.h | 11 | ||||
| -rw-r--r-- | prelude/slang-cpp-types.h | 441 | ||||
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 598 |
4 files changed, 931 insertions, 121 deletions
diff --git a/prelude/slang-cpp-prelude.h b/prelude/slang-cpp-prelude.h index 84a61f929..d15abdb88 100644 --- a/prelude/slang-cpp-prelude.h +++ b/prelude/slang-cpp-prelude.h @@ -296,8 +296,8 @@ struct ISlangUnknown // Includes -#include "slang-cpp-types.h" #include "slang-cpp-scalar-intrinsics.h" +#include "slang-cpp-types.h" // TODO(JS): Hack! Output C++ code from slang can copy uninitialized variables. #if defined(_MSC_VER) diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h index 66035260d..2b9e7f777 100644 --- a/prelude/slang-cpp-scalar-intrinsics.h +++ b/prelude/slang-cpp-scalar-intrinsics.h @@ -490,6 +490,17 @@ void InterlockedAdd(uint32_t* dest, uint32_t value, uint32_t* oldValue) #endif // SLANG_LLVM + +// ----------------------- fmod -------------------------- +SLANG_FORCE_INLINE float _slang_fmod(float x, float y) +{ + return F32_fmod(x, y); +} +SLANG_FORCE_INLINE double _slang_fmod(double x, double y) +{ + return F64_fmod(x, y); +} + #ifdef SLANG_PRELUDE_NAMESPACE } #endif diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h index c15c5ec40..28fe3dd8d 100644 --- a/prelude/slang-cpp-types.h +++ b/prelude/slang-cpp-types.h @@ -86,26 +86,159 @@ template <typename T> struct Vector<T, 1> { T x; + const T& operator[](size_t /*index*/) const { return x; } + T& operator[](size_t /*index*/) { return x; } + operator T() const { return x; } + Vector() = default; + Vector(T scalar) + { + x = scalar; + } + template <typename U> + Vector(Vector<U, 1> other) + { + x = (T)other.x; + } + template <typename U, int otherSize> + Vector(Vector<U, otherSize> other) + { + int minSize = 1; + if (otherSize < minSize) minSize = otherSize; + for (int i = 0; i < minSize; i++) + (*this)[i] = (T)other[i]; + } }; template <typename T> struct Vector<T, 2> { T x, y; + const T& operator[](size_t index) const { return index == 0 ? x : y; } + T& operator[](size_t index) { return index == 0 ? x : y; } + Vector() = default; + Vector(T scalar) + { + x = y = scalar; + } + Vector(T _x, T _y) + { + x = _x; + y = _y; + } + template <typename U> + Vector(Vector<U, 2> other) + { + x = (T)other.x; + y = (T)other.y; + } + template <typename U, int otherSize> + Vector(Vector<U, otherSize> other) + { + int minSize = 2; + if (otherSize < minSize) minSize = otherSize; + for (int i = 0; i < minSize; i++) + (*this)[i] = (T)other[i]; + } }; template <typename T> struct Vector<T, 3> { T x, y, z; + const T& operator[](size_t index) const { return *((T*)(this) + index); } + T& operator[](size_t index) { return *((T*)(this) + index); } + + Vector() = default; + Vector(T scalar) + { + x = y = z = scalar; + } + Vector(T _x, T _y, T _z) + { + x = _x; + y = _y; + z = _z; + } + template <typename U> + Vector(Vector<U, 3> other) + { + x = (T)other.x; + y = (T)other.y; + z = (T)other.z; + } + template <typename U, int otherSize> + Vector(Vector<U, otherSize> other) + { + int minSize = 3; + if (otherSize < minSize) minSize = otherSize; + for (int i = 0; i < minSize; i++) + (*this)[i] = (T)other[i]; + } }; template <typename T> struct Vector<T, 4> { T x, y, z, w; + + const T& operator[](size_t index) const { return *((T*)(this) + index); } + T& operator[](size_t index) { return *((T*)(this) + index); } + Vector() = default; + Vector(T scalar) + { + x = y = z = w = scalar; + } + Vector(T _x, T _y, T _z, T _w) + { + x = _x; + y = _y; + z = _z; + w = _w; + } + template <typename U, int otherSize> + Vector(Vector<U, otherSize> other) + { + int minSize = 4; + if (otherSize < minSize) minSize = otherSize; + for (int i = 0; i < minSize; i++) + (*this)[i] = (T)other[i]; + } + }; +template<typename T, int N> +SLANG_FORCE_INLINE T _slang_vector_get_element(Vector<T, N> x, int index) +{ + return x[index]; +} + +template<typename T, int N> +SLANG_FORCE_INLINE const T* _slang_vector_get_element_ptr(const Vector<T, N>* x, int index) +{ + return &((*const_cast<Vector<T,N>*>(x))[index]); +} + +template<typename T, int N> +SLANG_FORCE_INLINE T* _slang_vector_get_element_ptr(Vector<T, N>* x, int index) +{ + return &((*x)[index]); +} + +template<typename T, int n, typename OtherT, int m> +SLANG_FORCE_INLINE Vector<T, n> _slang_vector_reshape(const Vector<OtherT, m> other) +{ + Vector<T, n> result; + for (int i = 0; i < n; i++) + { + OtherT otherElement = T(0); + if (i < m) + otherElement = _slang_vector_get_element(other, i); + *_slang_vector_get_element_ptr(&result, i) = (T)otherElement; + } + return result; +} + +typedef uint32_t uint; typedef Vector<float, 2> float2; typedef Vector<float, 3> float3; @@ -119,12 +252,320 @@ typedef Vector<uint32_t, 2> uint2; typedef Vector<uint32_t, 3> uint3; typedef Vector<uint32_t, 4> uint4; +#define SLANG_VECTOR_BINARY_OP(T, op) \ + template<int n> \ + SLANG_FORCE_INLINE Vector<T, n> operator op(const Vector<T, n>& thisVal, const Vector<T, n>& other) \ + { \ + Vector<T, n> result;\ + for (int i = 0; i < n; i++) \ + result[i] = thisVal[i] op other[i]; \ + return result;\ + } +#define SLANG_VECTOR_BINARY_COMPARE_OP(T, op) \ + template<int n> \ + SLANG_FORCE_INLINE Vector<bool, n> operator op(const Vector<T, n>& thisVal, const Vector<T, n>& other) \ + { \ + Vector<bool, n> result;\ + for (int i = 0; i < n; i++) \ + result[i] = thisVal[i] op other[i]; \ + return result;\ + } + +#define SLANG_VECTOR_UNARY_OP(T, op) \ + template<int n> \ + SLANG_FORCE_INLINE Vector<T, n> operator op(const Vector<T, n>& thisVal) \ + { \ + Vector<T, n> result;\ + for (int i = 0; i < n; i++) \ + result[i] = op thisVal[i]; \ + return result;\ + } +#define SLANG_INT_VECTOR_OPS(T) \ + SLANG_VECTOR_BINARY_OP(T, +)\ + SLANG_VECTOR_BINARY_OP(T, -)\ + SLANG_VECTOR_BINARY_OP(T, *)\ + SLANG_VECTOR_BINARY_OP(T, / )\ + SLANG_VECTOR_BINARY_OP(T, &)\ + SLANG_VECTOR_BINARY_OP(T, |)\ + SLANG_VECTOR_BINARY_OP(T, &&)\ + SLANG_VECTOR_BINARY_OP(T, ||)\ + SLANG_VECTOR_BINARY_OP(T, ^)\ + SLANG_VECTOR_BINARY_OP(T, %)\ + SLANG_VECTOR_BINARY_OP(T, >>)\ + SLANG_VECTOR_BINARY_OP(T, <<)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, >)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, <)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, >=)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, <=)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, ==)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, !=)\ + SLANG_VECTOR_UNARY_OP(T, !)\ + SLANG_VECTOR_UNARY_OP(T, ~) +#define SLANG_FLOAT_VECTOR_OPS(T) \ + SLANG_VECTOR_BINARY_OP(T, +)\ + SLANG_VECTOR_BINARY_OP(T, -)\ + SLANG_VECTOR_BINARY_OP(T, *)\ + SLANG_VECTOR_BINARY_OP(T, /)\ + SLANG_VECTOR_UNARY_OP(T, -)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, >)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, <)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, >=)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, <=)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, ==)\ + SLANG_VECTOR_BINARY_COMPARE_OP(T, !=) + +SLANG_INT_VECTOR_OPS(bool) +SLANG_INT_VECTOR_OPS(int) +SLANG_INT_VECTOR_OPS(int8_t) +SLANG_INT_VECTOR_OPS(int16_t) +SLANG_INT_VECTOR_OPS(int64_t) +SLANG_INT_VECTOR_OPS(uint) +SLANG_INT_VECTOR_OPS(uint8_t) +SLANG_INT_VECTOR_OPS(uint16_t) +SLANG_INT_VECTOR_OPS(uint64_t) + +SLANG_FLOAT_VECTOR_OPS(float) +SLANG_FLOAT_VECTOR_OPS(double) + +#define SLANG_VECTOR_INT_NEG_OP(T) \ + template<int N>\ + Vector<T, N> operator-(const Vector<T, N>& thisVal) \ + { \ + Vector<T, N> result;\ + for (int i = 0; i < N; i++) \ + result[i] = 0 - thisVal[i]; \ + return result;\ + } +SLANG_VECTOR_INT_NEG_OP(int) +SLANG_VECTOR_INT_NEG_OP(int8_t) +SLANG_VECTOR_INT_NEG_OP(int16_t) +SLANG_VECTOR_INT_NEG_OP(int64_t) +SLANG_VECTOR_INT_NEG_OP(uint) +SLANG_VECTOR_INT_NEG_OP(uint8_t) +SLANG_VECTOR_INT_NEG_OP(uint16_t) +SLANG_VECTOR_INT_NEG_OP(uint64_t) + +#define SLANG_FLOAT_VECTOR_MOD(T)\ + template<int N> \ + Vector<T, N> operator%(const Vector<T, N>& left, const Vector<T, N>& right) \ + {\ + Vector<T, N> result;\ + for (int i = 0; i < N; i++) \ + result[i] = _slang_fmod(left[i], right[i]); \ + return result;\ + } + +SLANG_FLOAT_VECTOR_MOD(float) +SLANG_FLOAT_VECTOR_MOD(double) +#undef SLANG_FLOAT_VECTOR_MOD +#undef SLANG_VECTOR_BINARY_OP +#undef SLANG_VECTOR_UNARY_OP +#undef SLANG_INT_VECTOR_OPS +#undef SLANG_FLOAT_VECTOR_OPS +#undef SLANG_VECTOR_INT_NEG_OP +#undef SLANG_FLOAT_VECTOR_MOD + template <typename T, int ROWS, int COLS> struct Matrix { Vector<T, COLS> rows[ROWS]; + Vector<T, COLS>& operator[](size_t index) { return rows[index]; } + Matrix() = default; + Matrix(T scalar) + { + for (int i = 0; i < ROWS; i++) + rows[i] = Vector<T, COLS>(scalar); + } + Matrix(const Vector<T, COLS>& row0) + { + rows[0] = row0; + } + Matrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1) + { + rows[0] = row0; + rows[1] = row1; + } + Matrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2) + { + rows[0] = row0; + rows[1] = row1; + rows[2] = row2; + } + Matrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2, const Vector<T, COLS>& row3) + { + rows[0] = row0; + rows[1] = row1; + rows[2] = row2; + rows[3] = row3; + } + template<typename U, int otherRow, int otherCol> + Matrix(const Matrix<U, otherRow, otherCol>& other) + { + int minRow = ROWS; + int minCol = COLS; + if (minRow > otherRow) minRow = otherRow; + if (minCol > otherCol) minCol = otherCol; + for (int i = 0; i < minRow; i++) + for (int j = 0; j < minCol; j++) + rows[i][j] = (T)other.rows[i][j]; + } + Matrix(T v0, T v1, T v2, T v3) + { + rows[0][0] = v0; rows[0][1] = v1; + rows[1][0] = v2; rows[1][1] = v3; + } + Matrix(T v0, T v1, T v2, T v3, T v4, T v5) + { + if (COLS == 3) + { + rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; + rows[1][0] = v3; rows[1][1] = v4; rows[1][2] = v5; + } + else + { + rows[0][0] = v0; rows[0][1] = v1; + rows[1][0] = v2; rows[1][1] = v3; + rows[2][0] = v4; rows[2][1] = v5; + } + } + Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) + { + if (COLS == 4) + { + rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; rows[0][3] = v3; + rows[1][0] = v4; rows[1][1] = v5; rows[1][2] = v6; rows[1][3] = v7; + } + else + { + rows[0][0] = v0; rows[0][1] = v1; + rows[1][0] = v2; rows[1][1] = v3; + rows[2][0] = v4; rows[2][1] = v5; + rows[3][0] = v6; rows[3][1] = v7; + } + } + Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8) + { + rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; + rows[1][0] = v3; rows[1][1] = v4; rows[1][2] = v5; + rows[2][0] = v6; rows[2][1] = v7; rows[2][2] = v8; + } + Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11) + { + if (COLS == 4) + { + rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; rows[0][3] = v3; + rows[1][0] = v4; rows[1][1] = v5; rows[1][2] = v6; rows[1][3] = v7; + rows[2][0] = v8; rows[2][1] = v9; rows[2][2] = v10; rows[2][3] = v11; + } + else + { + rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; + rows[1][0] = v3; rows[1][1] = v4; rows[1][2] = v5; + rows[2][0] = v6; rows[2][1] = v7; rows[2][2] = v8; + rows[3][0] = v9; rows[3][1] = v10; rows[3][2] = v11; + } + } + Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) + { + rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; rows[0][3] = v3; + rows[1][0] = v4; rows[1][1] = v5; rows[1][2] = v6; rows[1][3] = v7; + rows[2][0] = v8; rows[2][1] = v9; rows[2][2] = v10; rows[2][3] = v11; + rows[3][0] = v12; rows[3][1] = v13; rows[3][2] = v14; rows[3][3] = v15; + } }; +#define SLANG_MATRIX_BINARY_OP(T, op) \ + template<int R, int C> \ + Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal, const Matrix<T, R, C>& other) \ + { \ + Matrix<T, R, C> result;\ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + result.rows[i][j] = thisVal.rows[i][j] op other.rows[i][j]; \ + return result;\ + } + +#define SLANG_MATRIX_UNARY_OP(T, op) \ + template<int R, int C> \ + Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal) \ + { \ + Matrix<T, R, C> result;\ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + result[i].rows[i][j] = op thisVal.rows[i][j]; \ + return result;\ + } +#define SLANG_INT_MATRIX_OPS(T) \ + SLANG_MATRIX_BINARY_OP(T, +)\ + SLANG_MATRIX_BINARY_OP(T, -)\ + SLANG_MATRIX_BINARY_OP(T, *)\ + SLANG_MATRIX_BINARY_OP(T, / )\ + SLANG_MATRIX_BINARY_OP(T, &)\ + SLANG_MATRIX_BINARY_OP(T, |)\ + SLANG_MATRIX_BINARY_OP(T, &&)\ + SLANG_MATRIX_BINARY_OP(T, ||)\ + SLANG_MATRIX_BINARY_OP(T, ^)\ + SLANG_MATRIX_BINARY_OP(T, %)\ + SLANG_MATRIX_UNARY_OP(T, !)\ + SLANG_MATRIX_UNARY_OP(T, ~) +#define SLANG_FLOAT_MATRIX_OPS(T) \ + SLANG_MATRIX_BINARY_OP(T, +)\ + SLANG_MATRIX_BINARY_OP(T, -)\ + SLANG_MATRIX_BINARY_OP(T, *)\ + SLANG_MATRIX_BINARY_OP(T, /)\ + SLANG_MATRIX_UNARY_OP(T, -) +SLANG_INT_MATRIX_OPS(int) +SLANG_INT_MATRIX_OPS(int8_t) +SLANG_INT_MATRIX_OPS(int16_t) +SLANG_INT_MATRIX_OPS(int64_t) +SLANG_INT_MATRIX_OPS(uint) +SLANG_INT_MATRIX_OPS(uint8_t) +SLANG_INT_MATRIX_OPS(uint16_t) +SLANG_INT_MATRIX_OPS(uint64_t) + +SLANG_FLOAT_MATRIX_OPS(float) +SLANG_FLOAT_MATRIX_OPS(double) + +#define SLANG_MATRIX_INT_NEG_OP(T) \ + template<int R, int C>\ + SLANG_FORCE_INLINE Matrix<T, R, C> operator-(Matrix<T, R, C> thisVal) \ + { \ + Matrix<T, R, C> result;\ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + result.rows[i][j] = 0 - thisVal.rows[i][j]; \ + return result;\ + } + SLANG_MATRIX_INT_NEG_OP(int) + SLANG_MATRIX_INT_NEG_OP(int8_t) + SLANG_MATRIX_INT_NEG_OP(int16_t) + SLANG_MATRIX_INT_NEG_OP(int64_t) + SLANG_MATRIX_INT_NEG_OP(uint) + SLANG_MATRIX_INT_NEG_OP(uint8_t) + SLANG_MATRIX_INT_NEG_OP(uint16_t) + SLANG_MATRIX_INT_NEG_OP(uint64_t) + +#define SLANG_FLOAT_MATRIX_MOD(T)\ + template<int R, int C> \ + SLANG_FORCE_INLINE Matrix<T, R, C> operator%(Matrix<T, R, C> left, Matrix<T, R, C> right) \ + {\ + Matrix<T, R, C> result;\ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + result.rows[i][j] = _slang_fmod(left.rows[i][j], right.rows[i][j]); \ + return result;\ + } + + SLANG_FLOAT_MATRIX_MOD(float) + SLANG_FLOAT_MATRIX_MOD(double) +#undef SLANG_FLOAT_MATRIX_MOD +#undef SLANG_MATRIX_BINARY_OP +#undef SLANG_MATRIX_UNARY_OP +#undef SLANG_INT_MATRIX_OPS +#undef SLANG_FLOAT_MATRIX_OPS +#undef SLANG_MATRIX_INT_NEG_OP +#undef SLANG_FLOAT_MATRIX_MOD + // We can just map `NonUniformResourceIndex` type directly to the index type on CPU, as CPU does not require // any special handling around such accesses. typedef size_t NonUniformResourceIndex; diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 448b69c63..cb1bb188b 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -149,12 +149,11 @@ typedef size_t NonUniformResourceIndex; template <typename T, int ROWS, int COLS> struct Matrix; -typedef bool bool1; +typedef int1 bool1; typedef int2 bool2; typedef int3 bool3; typedef int4 bool4; - typedef signed char int8_t; typedef short int16_t; typedef int int32_t; @@ -186,163 +185,522 @@ union Union64 double d; }; -// -// Half support -// +SLANG_FORCE_INLINE SLANG_CUDA_CALL float _slang_fmod(float x, float y) +{ + return ::fmodf(x, y); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double _slang_fmod(double x, double y) +{ + return ::fmod(x, y); +} #if SLANG_CUDA_ENABLE_HALF // Add the other vector half types -struct __half3 { __half2 xy; __half z; }; -struct __half4 { __half2 xy; __half2 zw; }; - -// *** convert *** - -// half -> other - -// float -SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 convert_float2(const __half2& v) { return __half22float2(v); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float3 convert_float3(const __half3& v) { const float2 xy = __half22float2(v.xy); return float3{xy.x, xy.y, __half2float(v.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 convert_float4(const __half4& v) { const float2 xy = __half22float2(v.xy); const float2 zw = __half22float2(v.zw); return float4{xy.x, xy.y, zw.x, zw.y}; } - -// double -SLANG_FORCE_INLINE SLANG_CUDA_CALL double2 convert_double2(const __half2& v) { const float2 xy = __half22float2(v); return double2{ xy.x, xy.y }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double3 convert_double3(const __half3& v) { const float2 xy = __half22float2(v.xy); return double3{ xy.x, xy.y, __half2float(v.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double4 convert_double4(const __half4& v) { const float2 xy = __half22float2(v.xy); const float2 zw = __half22float2(v.zw); return double4{xy.x, xy.y, zw.x, zw.y}; } - -// int -SLANG_FORCE_INLINE SLANG_CUDA_CALL int2 convert_int2(const __half2& v) { return int2 { __half2int_rz(v.x), __half2int_rz(v.y) }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL int3 convert_int3(const __half3& v) { return int3 { __half2int_rz(v.xy.x), __half2int_rz(v.xy.y), __half2int_rz(v.z) }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL int4 convert_int4(const __half4& v) { return int4 { __half2int_rz(v.xy.x), __half2int_rz(v.xy.y), __half2int_rz(v.zw.x), __half2int_rz(v.zw.y)}; } - -// uint -SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 convert_uint2(const __half2& v) { return uint2 { __half2uint_rz(v.x), __half2uint_rz(v.y) }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL uint3 convert_uint3(const __half3& v) { return uint3 { __half2uint_rz(v.xy.x), __half2uint_rz(v.xy.y), __half2uint_rz(v.z) }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 convert_uint4(const __half4& v) { return uint4 { __half2uint_rz(v.xy.x), __half2uint_rz(v.xy.y), __half2uint_rz(v.zw.x), __half2uint_rz(v.zw.y)}; } +struct __half1 { __half x; }; +struct __align__(4) __half3 { __half x, y, z; }; +struct __align__(4) __half4 { __half x, y, z, w; }; +#endif -// other -> half +#define SLANG_VECTOR_GET_ELEMENT(T) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##1 x, int index) { return ((T*)(&x))[index]; }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##2 x, int index) { return ((T*)(&x))[index]; }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##3 x, int index) { return ((T*)(&x))[index]; }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##4 x, int index) { return ((T*)(&x))[index]; } +SLANG_VECTOR_GET_ELEMENT(int) +SLANG_VECTOR_GET_ELEMENT(uint) +SLANG_VECTOR_GET_ELEMENT(short) +SLANG_VECTOR_GET_ELEMENT(ushort) +SLANG_VECTOR_GET_ELEMENT(char) +SLANG_VECTOR_GET_ELEMENT(uchar) +SLANG_VECTOR_GET_ELEMENT(longlong) +SLANG_VECTOR_GET_ELEMENT(ulonglong) +SLANG_VECTOR_GET_ELEMENT(float) +SLANG_VECTOR_GET_ELEMENT(double) + +#define SLANG_VECTOR_GET_ELEMENT_PTR(T) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##1* x, int index) { return ((T*)(x)) + index; }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##2* x, int index) { return ((T*)(x)) + index; }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##3* x, int index) { return ((T*)(x)) + index; }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##4* x, int index) { return ((T*)(x)) + index; } +SLANG_VECTOR_GET_ELEMENT_PTR(int) +SLANG_VECTOR_GET_ELEMENT_PTR(uint) +SLANG_VECTOR_GET_ELEMENT_PTR(short) +SLANG_VECTOR_GET_ELEMENT_PTR(ushort) +SLANG_VECTOR_GET_ELEMENT_PTR(char) +SLANG_VECTOR_GET_ELEMENT_PTR(uchar) +SLANG_VECTOR_GET_ELEMENT_PTR(longlong) +SLANG_VECTOR_GET_ELEMENT_PTR(ulonglong) +SLANG_VECTOR_GET_ELEMENT_PTR(float) +SLANG_VECTOR_GET_ELEMENT_PTR(double) -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const float2& v) { return __float22half2_rn(v); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const float3& v) { return __half3{ __float22half2_rn(float2{v.x, v.y}), __float2half_rn(v.z) }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const float4& v) { return __half4{ __float22half2_rn(float2{v.x, v.y}), __float22half2_rn(float2{v.z, v.w}) }; } +#if SLANG_CUDA_ENABLE_HALF +SLANG_VECTOR_GET_ELEMENT(__half) +SLANG_VECTOR_GET_ELEMENT_PTR(__half) +#endif -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const int2& v) { return __half2{ __int2half_rz(v.x), __int2half_rz(v.y) }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const int3& v) { return __half3{ __half2{__int2half_rz(v.x), __int2half_rz(v.y)}, __int2half_rz(v.z) }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const int4& v) { return __half4{ __half2{__int2half_rz(v.x), __int2half_rz(v.y)}, __half2{__int2half_rz(v.z), __int2half_rz(v.w)} }; } +#define SLANG_CUDA_VECTOR_BINARY_OP(T, n, op) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal, T##n other) \ + { \ + T##n result;\ + for (int i = 0; i < n; i++) \ + *_slang_vector_get_element_ptr(&result, i) = _slang_vector_get_element(thisVal,i) op _slang_vector_get_element(other,i); \ + return result;\ + } +#define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL bool##n operator op(T##n thisVal, T##n other) \ + { \ + bool##n result;\ + for (int i = 0; i < n; i++) \ + *_slang_vector_get_element_ptr(&result, i) = (int)(_slang_vector_get_element(thisVal,i) op _slang_vector_get_element(other,i)); \ + return result;\ + } +#define SLANG_CUDA_VECTOR_UNARY_OP(T, n, op) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal) \ + { \ + T##n result;\ + for (int i = 0; i < n; i++) \ + *_slang_vector_get_element_ptr(&result, i) = op _slang_vector_get_element(thisVal,i); \ + return result;\ + } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const uint2& v) { return __half2{ __uint2half_rz(v.x), __uint2half_rz(v.y) }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const uint3& v) { return __half3{ __half2{__uint2half_rz(v.x), __uint2half_rz(v.y)}, __uint2half_rz(v.z) }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const uint4& v) { return __half4{ __half2{__uint2half_rz(v.x), __uint2half_rz(v.y)}, __half2{__uint2half_rz(v.z), __uint2half_rz(v.w)} }; } +#define SLANG_CUDA_VECTOR_INT_OP(T, n) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, %)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, ^)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, &)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, |)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, >>)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, <<)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=)\ + SLANG_CUDA_VECTOR_UNARY_OP(T, n, !)\ + SLANG_CUDA_VECTOR_UNARY_OP(T, n, -)\ + SLANG_CUDA_VECTOR_UNARY_OP(T, n, ~) + +#define SLANG_CUDA_VECTOR_INT_OPS(T) \ + SLANG_CUDA_VECTOR_INT_OP(T, 2) \ + SLANG_CUDA_VECTOR_INT_OP(T, 3) \ + SLANG_CUDA_VECTOR_INT_OP(T, 4) + +SLANG_CUDA_VECTOR_INT_OPS(int) +SLANG_CUDA_VECTOR_INT_OPS(uint) +SLANG_CUDA_VECTOR_INT_OPS(ushort) +SLANG_CUDA_VECTOR_INT_OPS(short) +SLANG_CUDA_VECTOR_INT_OPS(char) +SLANG_CUDA_VECTOR_INT_OPS(uchar) +SLANG_CUDA_VECTOR_INT_OPS(longlong) +SLANG_CUDA_VECTOR_INT_OPS(ulonglong) + +#define SLANG_CUDA_VECTOR_FLOAT_OP(T, n) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)\ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==)\ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=)\ + SLANG_CUDA_VECTOR_UNARY_OP(T, n, -) +#define SLANG_CUDA_VECTOR_FLOAT_OPS(T) \ + SLANG_CUDA_VECTOR_FLOAT_OP(T, 2) \ + SLANG_CUDA_VECTOR_FLOAT_OP(T, 3) \ + SLANG_CUDA_VECTOR_FLOAT_OP(T, 4) + +SLANG_CUDA_VECTOR_FLOAT_OPS(float) +SLANG_CUDA_VECTOR_FLOAT_OPS(double) +#if SLANG_CUDA_ENABLE_HALF +SLANG_CUDA_VECTOR_FLOAT_OPS(__half) +#endif +#define SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, n)\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator%(const T##n& left, const T##n& right) \ + {\ + T##n result;\ + for (int i = 0; i < n; i++) \ + *_slang_vector_get_element_ptr(&result, i) = _slang_fmod(_slang_vector_get_element(left,i), _slang_vector_get_element(right,i)); \ + return result;\ + } +#define SLANG_CUDA_FLOAT_VECTOR_MOD(T) \ + SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 2)\ + SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 3)\ + SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 4) + +SLANG_CUDA_FLOAT_VECTOR_MOD(float) +SLANG_CUDA_FLOAT_VECTOR_MOD(double) + +#define SLANG_MAKE_VECTOR(T) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x, T y) { return T##2{x, y}; }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x, T y, T z) { return T##3{ x, y, z }; }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x, T y, T z, T w) { return T##4{ x, y, z, w }; } +SLANG_MAKE_VECTOR(int) +SLANG_MAKE_VECTOR(uint) +SLANG_MAKE_VECTOR(short) +SLANG_MAKE_VECTOR(ushort) +SLANG_MAKE_VECTOR(char) +SLANG_MAKE_VECTOR(uchar) +SLANG_MAKE_VECTOR(float) +SLANG_MAKE_VECTOR(double) +SLANG_MAKE_VECTOR(longlong) +SLANG_MAKE_VECTOR(ulonglong) +#if SLANG_CUDA_ENABLE_HALF +SLANG_MAKE_VECTOR(__half) +#endif -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const double2& v) { return __float22half2_rn(float2{v.x, v.y}); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const double3& v) { return __half3{ __float22half2_rn(float2{v.x, v.y}), __float2half_rn(v.z) }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const double4& v) { return __half4{ __float22half2_rn(float2{v.x, v.y}), __float22half2_rn(float2{v.z, v.w}) }; } +#define SLANG_MAKE_VECTOR_FROM_SCALAR(T) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##1 make_##T##1(T x) { return T##1{x}; }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) { return make_##T##2(x, x); }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) { return make_##T##3(x, x, x); }\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) { return make_##T##4(x, x, x, x); } +SLANG_MAKE_VECTOR_FROM_SCALAR(int) +SLANG_MAKE_VECTOR_FROM_SCALAR(uint) +SLANG_MAKE_VECTOR_FROM_SCALAR(short) +SLANG_MAKE_VECTOR_FROM_SCALAR(ushort) +SLANG_MAKE_VECTOR_FROM_SCALAR(char) +SLANG_MAKE_VECTOR_FROM_SCALAR(uchar) +SLANG_MAKE_VECTOR_FROM_SCALAR(longlong) +SLANG_MAKE_VECTOR_FROM_SCALAR(ulonglong) +SLANG_MAKE_VECTOR_FROM_SCALAR(float) +SLANG_MAKE_VECTOR_FROM_SCALAR(double) +#if SLANG_CUDA_ENABLE_HALF +SLANG_MAKE_VECTOR_FROM_SCALAR(__half) +#endif -// *** make *** +template<typename T, int n> +struct GetVectorTypeImpl {}; -// Mechanism to make half vectors -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 make___half2(__half x, __half y) { return __halves2half2(x, y); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 make___half3(__half x, __half y, __half z) { return __half3{ __halves2half2(x, y), z }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 make___half4(__half x, __half y, __half z, __half w) { return __half4{ __halves2half2(x, y), __halves2half2(z, w)}; } +#define GET_VECTOR_TYPE_IMPL(T, n)\ +struct GetVectorTypeImpl<T,n>\ +{\ + typedef T##n type;\ + static SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n fromScalar(T v) { return make_##T##n(v); } \ +}; +#define GET_VECTOR_TYPE_IMPL_N(T)\ + GET_VECTOR_TYPE_IMPL(T, 1)\ + GET_VECTOR_TYPE_IMPL(T, 2)\ + GET_VECTOR_TYPE_IMPL(T, 3)\ + GET_VECTOR_TYPE_IMPL(T, 4) + +GET_VECTOR_TYPE_IMPL_N(int) +GET_VECTOR_TYPE_IMPL_N(uint) +GET_VECTOR_TYPE_IMPL_N(short) +GET_VECTOR_TYPE_IMPL_N(ushort) +GET_VECTOR_TYPE_IMPL_N(char) +GET_VECTOR_TYPE_IMPL_N(uchar) +GET_VECTOR_TYPE_IMPL_N(longlong) +GET_VECTOR_TYPE_IMPL_N(ulonglong) +GET_VECTOR_TYPE_IMPL_N(float) +GET_VECTOR_TYPE_IMPL_N(double) +#if SLANG_CUDA_ENABLE_HALF +GET_VECTOR_TYPE_IMPL_N(__half) +#endif +template<typename T, int n> +using Vector = typename GetVectorTypeImpl<T, n>::type; -// *** constructFromScalar *** +template<typename T, int n, typename OtherT, int m> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, n> _slang_vector_reshape(const Vector<OtherT, m> other) +{ + Vector<T, n> result; + for (int i = 0; i < n; i++) + { + OtherT otherElement = T(0); + if (i < m) + otherElement = _slang_vector_get_element(other, i); + *_slang_vector_get_element_ptr(&result, i) = (T)otherElement; + } + return result; +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 constructFromScalar___half2(half x) { return __half2half2(x); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 constructFromScalar___half3(half x) { return __half3{__half2half2(x), x}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 constructFromScalar___half4(half x) { const __half2 v = __half2half2(x); return __half4{v, v}; } +template <typename T, int ROWS, int COLS> +struct Matrix +{ + Vector<T, COLS> rows[ROWS]; + SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, COLS>& operator[](size_t index) { return rows[index]; } +}; -// *** half2 *** -// half2 maths ops +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T scalar) +{ + Matrix<T, ROWS, COLS> result; + for (int i = 0; i < ROWS; i++) + result.rows[i] = GetVectorTypeImpl<T, COLS>::fromScalar(scalar); + return result; -// NOTE! That by default these are in cuda_fp16.hpp, but we disable them, because we need to define the comparison operators -// as we need versions that will return vector<bool> +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator+(const __half2& lh, const __half2& rh) { return __hadd2(lh, rh); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator-(const __half2& lh, const __half2& rh) { return __hsub2(lh, rh); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator*(const __half2& lh, const __half2& rh) { return __hmul2(lh, rh); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator/(const __half2& lh, const __half2& rh) { return __h2div(lh, rh); } +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0) +{ + Matrix<T, ROWS, COLS> result; + result.rows[0] = row0; + return result; +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2& operator+=(__half2& lh, const __half2& rh) { lh = __hadd2(lh, rh); return lh; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2& operator-=(__half2& lh, const __half2& rh) { lh = __hsub2(lh, rh); return lh; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2& operator*=(__half2& lh, const __half2& rh) { lh = __hmul2(lh, rh); return lh; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2& operator/=(__half2& lh, const __half2& rh) { lh = __h2div(lh, rh); return lh; } +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1) +{ + Matrix<T, ROWS, COLS> result; + result.rows[0] = row0; + result.rows[1] = row1; + return result; +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hadd2(h, one); return h; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hsub2(h, one); return h; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator++(__half2 &h, int) { __half2 ret = h; __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hadd2(h, one); return ret; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator--(__half2 &h, int) { __half2 ret = h; __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hsub2(h, one); return ret; } +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2) +{ + Matrix<T, ROWS, COLS> result; + result.rows[0] = row0; + result.rows[1] = row1; + result.rows[2] = row2; + return result; +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator+(const __half2 &h) { return h; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator-(const __half2 &h) { return __hneg2(h); } +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2, const Vector<T, COLS>& row3) +{ + Matrix<T, ROWS, COLS> result; + result.rows[0] = row0; + result.rows[1] = row1; + result.rows[2] = row2; + result.rows[3] = row3; + return result; +} -// vec op scalar -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator+(const __half2& lh, __half rh) { return __hadd2(lh, __half2half2(rh)); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator-(const __half2& lh, __half rh) { return __hsub2(lh, __half2half2(rh)); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator*(const __half2& lh, __half rh) { return __hmul2(lh, __half2half2(rh)); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator/(const __half2& lh, __half rh) { return __h2div(lh, __half2half2(rh)); } +template<typename T, int ROWS, int COLS, typename U, int otherRow, int otherCol> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Matrix<U, otherRow, otherCol>& other) +{ + Matrix<T, ROWS, COLS> result; + int minRow = ROWS; + int minCol = COLS; + if (minRow > otherRow) minRow = otherRow; + if (minCol > otherCol) minCol = otherCol; + for (int i = 0; i < minRow; i++) + for (int j = 0; j < minCol; j++) + *_slang_vector_get_element_ptr(result.rows + i, j) = (T)_slang_vector_get_element(other.rows[i], j); + return result; +} -// scalar op vec -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator+(__half lh, const __half2& rh) { return __hadd2(__half2half2(lh), rh); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator-(__half lh, const __half2& rh) { return __hsub2(__half2half2(lh), rh); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator*(__half lh, const __half2& rh) { return __hmul2(__half2half2(lh), rh); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator/(__half lh, const __half2& rh) { return __h2div(__half2half2(lh), rh); } +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3) +{ + Matrix<T, ROWS, COLS> rs; + rs.rows[0].x = v0; rs.rows[0].y = v1; + rs.rows[1].x = v2; rs.rows[1].y = v3; + return rs; +} -// *** half3 *** +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5) +{ + Matrix<T, ROWS, COLS> rs; + if (COLS == 3) + { + rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; + rs.rows[1].x = v3; rs.rows[1].y = v4; rs.rows[1].z = v5; + } + else + { + rs.rows[0].x = v0; rs.rows[0].y = v1; + rs.rows[1].x = v2; rs.rows[1].y = v3; + rs.rows[2].x = v4; rs.rows[2].y = v5; + } + return rs; -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3& lh, const __half3& rh) { return __half3{__hadd2(lh.xy, rh.xy), __hadd(lh.z, rh.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3& lh, const __half3& rh) { return __half3{__hsub2(lh.xy, rh.xy), __hsub(lh.z, rh.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(const __half3& lh, const __half3& rh) { return __half3{__hmul2(lh.xy, rh.xy), __hmul(lh.z, rh.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(const __half3& lh, const __half3& rh) { return __half3{__h2div(lh.xy, rh.xy), __hdiv(lh.z, rh.z)}; } +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3& h) { return __half3{__hneg2(h.xy), __hneg(h.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3& h) { return h; } +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) +{ + Matrix<T, ROWS, COLS> rs; + if (COLS == 4) + { + rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; rs.rows[0].w = v3; + rs.rows[1].x = v4; rs.rows[1].y = v5; rs.rows[1].z = v6; rs.rows[1].w = v7; + } + else + { + rs.rows[0].x = v0; rs.rows[0].y = v1; + rs.rows[1].x = v2; rs.rows[1].y = v3; + rs.rows[2].x = v4; rs.rows[2].y = v5; + rs.rows[3].x = v6; rs.rows[3].y = v7; + } + return rs; +} -// vec op scalar -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3& lh, __half rh) { return __half3{__hadd2(lh.xy, __half2half2(rh)), __hadd(lh.z, rh)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3& lh, __half rh) { return __half3{__hsub2(lh.xy, __half2half2(rh)), __hsub(lh.z, rh)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(const __half3& lh, __half rh) { return __half3{__hmul2(lh.xy, __half2half2(rh)), __hmul(lh.z, rh)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(const __half3& lh, __half rh) { return __half3{__h2div(lh.xy, __half2half2(rh)), __hdiv(lh.z, rh)}; } +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8) +{ + Matrix<T, ROWS, COLS> rs; + rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; + rs.rows[1].x = v3; rs.rows[1].y = v4; rs.rows[1].z = v5; + rs.rows[2].x = v6; rs.rows[2].y = v7; rs.rows[2].z = v8; + return rs; +} -// scalar op vec -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(__half lh, const __half3& rh) { return __half3{__hadd2(__half2half2(lh), rh.xy), __hadd(lh, rh.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(__half lh, const __half3& rh) { return __half3{__hsub2(__half2half2(lh), rh.xy), __hsub(lh, rh.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(__half lh, const __half3& rh) { return __half3{__hmul2(__half2half2(lh), rh.xy), __hmul(lh, rh.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(__half lh, const __half3& rh) { return __half3{__h2div(__half2half2(lh), rh.xy), __hdiv(lh, rh.z)}; } +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11) +{ + Matrix<T, ROWS, COLS> rs; + if (COLS == 4) + { + rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; rs.rows[0].w = v3; + rs.rows[1].x = v4; rs.rows[1].y = v5; rs.rows[1].z = v6; rs.rows[1].w = v7; + rs.rows[2].x = v8; rs.rows[2].y = v9; rs.rows[2].z = v10; rs.rows[2].w = v11; + } + else + { + rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; + rs.rows[1].x = v3; rs.rows[1].y = v4; rs.rows[1].z = v5; + rs.rows[2].x = v6; rs.rows[2].y = v7; rs.rows[2].z = v8; + rs.rows[3].x = v9; rs.rows[3].y = v10; rs.rows[3].z = v11; + } + return rs; +} -// *** half4 *** +template<typename T, int ROWS, int COLS> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) +{ + Matrix<T, ROWS, COLS> rs; + rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; rs.rows[0].w = v3; + rs.rows[1].x = v4; rs.rows[1].y = v5; rs.rows[1].z = v6; rs.rows[1].w = v7; + rs.rows[2].x = v8; rs.rows[2].y = v9; rs.rows[2].z = v10; rs.rows[2].w = v11; + rs.rows[3].x = v12; rs.rows[3].y = v13; rs.rows[3].z = v14; rs.rows[3].w = v15; + return rs; +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4& lh, const __half4& rh) { return __half4{__hadd2(lh.xy, rh.xy), __hadd2(lh.zw, rh.zw)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4& lh, const __half4& rh) { return __half4{__hsub2(lh.xy, rh.xy), __hsub2(lh.zw, rh.zw)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(const __half4& lh, const __half4& rh) { return __half4{__hmul2(lh.xy, rh.xy), __hmul2(lh.zw, rh.zw)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(const __half4& lh, const __half4& rh) { return __half4{__h2div(lh.xy, rh.xy), __h2div(lh.zw, rh.zw)}; } +#define SLANG_MATRIX_BINARY_OP(T, op) \ + template<int R, int C> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal, const Matrix<T, R, C>& other) \ + { \ + Matrix<T, R, C> result;\ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + *_slang_vector_get_element_ptr(result.rows+i,j) = _slang_vector_get_element(thisVal.rows[i], j) op _slang_vector_get_element(other.rows[i], j); \ + return result;\ + } -// vec op scalar -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__hadd2(lh.xy, rhv), __hadd2(lh.zw, rhv)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__hsub2(lh.xy, rhv), __hsub2(lh.zw, rhv)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__hmul2(lh.xy, rhv), __hmul2(lh.zw, rhv)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__h2div(lh.xy, rhv), __h2div(lh.zw, rhv)}; } +#define SLANG_MATRIX_UNARY_OP(T, op) \ + template<int R, int C> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal) \ + { \ + Matrix<T, R, C> result;\ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + *_slang_vector_get_element_ptr(result.rows+i,j) = op _slang_vector_get_element(thisVal.rows[i], j); \ + return result;\ + } +#define SLANG_INT_MATRIX_OPS(T) \ + SLANG_MATRIX_BINARY_OP(T, +)\ + SLANG_MATRIX_BINARY_OP(T, -)\ + SLANG_MATRIX_BINARY_OP(T, *)\ + SLANG_MATRIX_BINARY_OP(T, / )\ + SLANG_MATRIX_BINARY_OP(T, &)\ + SLANG_MATRIX_BINARY_OP(T, |)\ + SLANG_MATRIX_BINARY_OP(T, &&)\ + SLANG_MATRIX_BINARY_OP(T, ||)\ + SLANG_MATRIX_BINARY_OP(T, ^)\ + SLANG_MATRIX_BINARY_OP(T, %)\ + SLANG_MATRIX_UNARY_OP(T, !)\ + SLANG_MATRIX_UNARY_OP(T, ~) +#define SLANG_FLOAT_MATRIX_OPS(T) \ + SLANG_MATRIX_BINARY_OP(T, +)\ + SLANG_MATRIX_BINARY_OP(T, -)\ + SLANG_MATRIX_BINARY_OP(T, *)\ + SLANG_MATRIX_BINARY_OP(T, /)\ + SLANG_MATRIX_UNARY_OP(T, -) +SLANG_INT_MATRIX_OPS(int) +SLANG_INT_MATRIX_OPS(uint) +SLANG_INT_MATRIX_OPS(short) +SLANG_INT_MATRIX_OPS(ushort) +SLANG_INT_MATRIX_OPS(char) +SLANG_INT_MATRIX_OPS(uchar) +SLANG_INT_MATRIX_OPS(longlong) +SLANG_INT_MATRIX_OPS(ulonglong) +SLANG_FLOAT_MATRIX_OPS(float) +SLANG_FLOAT_MATRIX_OPS(double) +#if SLANG_CUDA_ENABLE_HALF +SLANG_FLOAT_MATRIX_OPS(__half) +#endif +#define SLANG_MATRIX_INT_NEG_OP(T) \ + template<int R, int C>\ + SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator-(Matrix<T, R, C> thisVal) \ + { \ + Matrix<T, R, C> result;\ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + *_slang_vector_get_element_ptr(result.rows+i,j) = 0 - _slang_vector_get_element(thisVal.rows[i], j); \ + return result;\ + } + SLANG_MATRIX_INT_NEG_OP(int) + SLANG_MATRIX_INT_NEG_OP(uint) + SLANG_MATRIX_INT_NEG_OP(short) + SLANG_MATRIX_INT_NEG_OP(ushort) + SLANG_MATRIX_INT_NEG_OP(char) + SLANG_MATRIX_INT_NEG_OP(uchar) + SLANG_MATRIX_INT_NEG_OP(longlong) + SLANG_MATRIX_INT_NEG_OP(ulonglong) + +#define SLANG_FLOAT_MATRIX_MOD(T)\ + template<int R, int C> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator%(Matrix<T, R, C> left, Matrix<T, R, C> right) \ + {\ + Matrix<T, R, C> result;\ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + *_slang_vector_get_element_ptr(result.rows+i,j) = _slang_fmod(_slang_vector_get_element(left.rows[i], j), _slang_vector_get_element(right.rows[i], j)); \ + return result;\ + } -// scalar op vec -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__hadd2(lhv, rh.xy), __hadd2(lhv, rh.zw)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__hsub2(lhv, rh.xy), __hsub2(lhv, rh.zw)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__hmul2(lhv, rh.xy), __hmul2(lhv, rh.zw)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__h2div(lhv, rh.xy), __h2div(lhv, rh.zw)}; } + SLANG_FLOAT_MATRIX_MOD(float) + SLANG_FLOAT_MATRIX_MOD(double) +#if SLANG_CUDA_ENABLE_HALF + template<int R, int C> + SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<__half, R, C> operator%(Matrix<__half, R, C> left, Matrix<__half, R, C> right) + { + Matrix<__half, R, C> result; + for (int i = 0; i < R; i++) + for (int j = 0; j < C; j++) + * _slang_vector_get_element_ptr(result.rows + i, j) = __float2half(_slang_fmod(__half2float(_slang_vector_get_element(left.rows[i], j)), __half2float(_slang_vector_get_element(right.rows[i], j)))); + return result; + } +#endif +#undef SLANG_FLOAT_MATRIX_MOD +#undef SLANG_MATRIX_BINARY_OP +#undef SLANG_MATRIX_UNARY_OP +#undef SLANG_INT_MATRIX_OPS +#undef SLANG_FLOAT_MATRIX_OPS +#undef SLANG_MATRIX_INT_NEG_OP +#undef SLANG_FLOAT_MATRIX_MOD -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4& h) { return __half4{__hneg2(h.xy), __hneg2(h.zw)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4& h) { return h; } +// +// Half support +// +#if SLANG_CUDA_ENABLE_HALF // Convenience functions ushort -> half SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __ushort_as_half(const ushort2& i) { return __halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y)); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __ushort_as_half(const ushort3& i) { return __half3{__halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y)), __ushort_as_half(i.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __ushort_as_half(const ushort4& i) { return __half4{ __halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y)), __halves2half2(__ushort_as_half(i.z), __ushort_as_half(i.w)) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __ushort_as_half(const ushort3& i) { return __half3{__ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __ushort_as_half(const ushort4& i) { return __half4{ __ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z), __ushort_as_half(i.w) }; } // Convenience functions half -> ushort SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort2 __half_as_ushort(const __half2& i) { return make_ushort2(__half_as_ushort(i.x), __half_as_ushort(i.y)); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort3 __half_as_ushort(const __half3& i) { return make_ushort3(__half_as_ushort(i.xy.x), __half_as_ushort(i.xy.y), __half_as_ushort(i.z)); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort4 __half_as_ushort(const __half4& i) { return make_ushort4(__half_as_ushort(i.xy.x), __half_as_ushort(i.xy.y), __half_as_ushort(i.zw.x), __half_as_ushort(i.zw.y)); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort3 __half_as_ushort(const __half3& i) { return make_ushort3(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z)); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort4 __half_as_ushort(const __half4& i) { return make_ushort4(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z), __half_as_ushort(i.w)); } // This is a little bit of a hack. Fortunately CUDA has the definitions of the templated types in // include/surface_indirect_functions.h @@ -438,7 +796,7 @@ template <> \ SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 FUNC_NAME##_convert<float4>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ { \ const __half4 v = __ushort_as_half(FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ - return float4{v.xy.x, v.xy.y, v.zw.x, v.zw.y}; \ + return float4{v.x, v.y, v.z, v.w}; \ } SLANG_SURFACE_READ_HALF_CONVERT(surf1Dread, (int x), (x)) |
