diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2021-05-04 14:44:20 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2021-05-04 14:44:20 -0400 |
| commit | 1c643167a9417e75082b3898425ab9d2d999f583 (patch) | |
| tree | ad7459f186e0cf6b0d938763bae13aa57d23d318 | |
| parent | 7d52d3bd8905dfdf3018c41c9cad4685a98eb009 (diff) | |
More CUDA Half support (#1833)
* #include an absolute path didn't work - because paths were taken to always be relative.
* Split out StringEscapeUtil.
* Added StringEscapeUtil.
* Fix typo in unix quoting type.
* Small comment improvements.
* Try to fix linux linking issue.
* Fix typo.
* Attempt to fix linux link issue.
* Update VS proj even though nothing really changed.
* Fix another typo issue.
* Fix for windows issue.
Fixed bug.
* Make separate Utils for escaping.
* Fix typo.
* Split out into StringEscapeHandler.
* Windows shell does handle removing quotes (so remove code to remove them).
* Handle unescaping if not initiating using the shell.
* Slight improvement around shell like decoding.
* Simplify command extraction.
* Add shared-library category type.
* Fix bug in command extraction.
* Typo in transcendental category.
* Enable unit-test on in smoke test category.
* Make parsing failing output as a failing test.
* Fixes for transcendental tests. Disable tests that do not work.
* Changed category parsing.
* Removed the TestResult parameter from _gatherTestsForFile.
Made testsList only output.
* Remove testing if all tests were disabled.
* Make args of CommandLine always unescaped.
* Add category.
* Don't need escaping on unix/linux.
* Remove some no longer used functions.
* Add requireSMVersion to CUDAExtensionTracker.
* half-calc.slang now works for CUDA.
* bit-cast-16-bit works on CUDA.
* WIP handling of CUDA vector<half> types.
* Half swizzle CUDA.
* Half vector test.
* Fix swizzle half bug.
* Fix compilation issue with narrowing to Index.
* Add unary ops.
* Add some vector scalar maths ops.
* Add half vector conversions for CUDA.
* Fix erroneous comment.
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 225 | ||||
| -rw-r--r-- | source/slang/slang-emit-cuda.cpp | 28 | ||||
| -rw-r--r-- | tests/compute/half-vector-calc.slang | 12 | ||||
| -rw-r--r-- | tests/compute/half-vector-calc.slang.expected.txt | 8 |
4 files changed, 197 insertions, 76 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 61702824c..05b978cf6 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -52,78 +52,6 @@ #ifndef SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK # define SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, count) SLANG_PRELUDE_ASSERT(index < count); #endif - -// -// Half support -// - -#if SLANG_CUDA_ENABLE_HALF - -// Add the other vector half types -struct __half3 { __half2 xy; __half z; }; -struct __half4 { __half2 xy; __half2 zw; }; - -// Mechanism to make half vectors -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 make___half2(__half x, __half y) { return __halves2half2(x, y); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 make___half3(__half x, __half y, __half z) { return __half3{ __halves2half2(x, y), z }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 make___half4(__half x, __half y, __half z, __half w) { return __half4{ __halves2half2(x, y), __halves2half2(z, w)}; } - -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 constructFromScalar___half2(half x) { return __half2half2(x); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 constructFromScalar___half3(half x) { return __half3{__half2half2(x), x}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 constructFromScalar___half4(half x) { const __half2 v = __half2half2(x); return __half4{v, v}; } - -// Half3 maths ops - -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3 &lh, const __half3 &rh) { return __half3{__hadd2(lh.xy, rh.xy), __hadd(lh.z, rh.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3 &lh, const __half3 &rh) { return __half3{__hsub2(lh.xy, rh.xy), __hsub(lh.z, rh.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(const __half3 &lh, const __half3 &rh) { return __half3{__hmul2(lh.xy, rh.xy), __hmul(lh.z, rh.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(const __half3 &lh, const __half3 &rh) { return __half3{__h2div(lh.xy, rh.xy), __hdiv(lh.z, rh.z)}; } - -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3 &h) { return __half3{__hneg2(h.xy), __hneg(h.z)}; } - -#if 0 -// We need to return the vector<bool> type -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator==(const __half3 &lh, const __half3 &rh) { return __hbeq2(lh.xy, rh.xy) && __heq(lh.z, rh.z); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator!=(const __half3 &lh, const __half3 &rh) { return __hbneu2(lh.xy, rh.xy) && __hneu(lh.z, rh.z); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>(const __half3 &lh, const __half3 &rh) { return __hbgt2(lh.xy, rh.xy) && __hgt(lh.z, rh.z); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<(const __half3 &lh, const __half3 &rh) { return __hblt2(lh.xy, rh.xy) && __hlt(lh.z, rh.z); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>=(const __half3 &lh, const __half3 &rh) { return __hbge2(lh.xy, rh.xy) && __hge(lh.z, rh.z); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<=(const __half3 &lh, const __half3 &rh) { return __hble2(lh.xy, rh.xy) && __hle(lh.z, rh.z); } -#endif - -// Half4 maths ops - -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4 &lh, const __half4 &rh) { return __half4{__hadd2(lh.xy, rh.xy), __hadd2(lh.zw, rh.zw)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4 &lh, const __half4 &rh) { return __half4{__hsub2(lh.xy, rh.xy), __hsub2(lh.zw, rh.zw)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(const __half4 &lh, const __half4 &rh) { return __half4{__hmul2(lh.xy, rh.xy), __hmul2(lh.zw, rh.zw)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(const __half4 &lh, const __half4 &rh) { return __half4{__h2div(lh.xy, rh.xy), __h2div(lh.zw, rh.zw)}; } - -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4 &h) { return __half4{__hneg2(h.xy), __hneg2(h.zw)}; } - -#if 0 -// We need to return vector<bool> type -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator==(const __half4 &lh, const __half4 &rh) { return __hbeq2(lh.xy, rh.xy) && __hbeq2(lh.zw, rh.zw); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator!=(const __half4 &lh, const __half4 &rh) { return __hbneu2(lh.xy, rh.xy) && __hbneu2(lh.zw, rh.zw); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>(const __half4 &lh, const __half4 &rh) { return __hbgt2(lh.xy, rh.xy) && __hbgt2(lh.zw, rh.zw); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<(const __half4 &lh, const __half4 &rh) { return __hblt2(lh.xy, rh.xy) && __hblt2(lh.zw, rh.zw); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>=(const __half4 &lh, const __half4 &rh) { return __hbge2(lh.xy, rh.xy) && __hbge2(lh.zw, rh.zw); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<=(const __half4 &lh, const __half4 &rh) { return __hble2(lh.xy, rh.xy) && __hble2(lh.zw, rh.zw); } -#endif - -// Use the round nearest as the default - it is the only one defined -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __float22half2(const float2 a) { return __float22half2_rn(a); } - -// Implement the vector versions -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __float2half(float2 a) { return __float22half2(a); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __float2half(float3 a) { __half3 o; o.xy = __float22half2(make_float2(a.x, a.y)); o.z = __float2half(a.z); return o; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __float2half(float4 a) { __half4 o; o.xy = __float22half2(make_float2(a.x, a.y)); o.zw = __float22half2(make_float2(a.z, a.w)); return o; } - -SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 __half2float(__half2 a) { return __half22float2(a); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float3 __half2float(__half3 a) { float2 xy = __half22float2(a.xy); float z = __half2float(a.z); return make_float3(xy.x, xy.y, z); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 __half2float(__half4 a) { float2 xy = __half22float2(a.xy); float2 zw = __half22float2(a.zw); return make_float4(xy.x, xy.y, zw.x, zw.y); } - -#endif - // This macro handles how out-of-range surface coordinates are handled; // I can equal // cudaBoundaryModeClamp, in which case out-of-range coordinates are clamped to the valid range @@ -217,6 +145,159 @@ union Union64 double d; }; +// +// Half support +// + +#if SLANG_CUDA_ENABLE_HALF + +// Add the other vector half types +struct __half3 { __half2 xy; __half z; }; +struct __half4 { __half2 xy; __half2 zw; }; + + +// half -> other + +// float +SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 convert_float2(const __half2& v) { return __half22float2(v); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL float3 convert_float3(const __half3& v) { const float2 xy = __half22float2(v.xy); return float3{xy.x, xy.y, __half2float(v.z)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 convert_float4(const __half4& v) { const float2 xy = __half22float2(v.xy); const float2 zw = __half22float2(v.zw); return float4{xy.x, xy.y, zw.x, zw.y}; } + +// double +SLANG_FORCE_INLINE SLANG_CUDA_CALL double2 convert_double2(const __half2& v) { const float2 xy = __half22float2(v); return double2{ xy.x, xy.y }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL double3 convert_double3(const __half3& v) { const float2 xy = __half22float2(v.xy); return double3{ xy.x, xy.y, __half2float(v.z)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL double4 convert_double4(const __half4& v) { const float2 xy = __half22float2(v.xy); const float2 zw = __half22float2(v.zw); return double4{xy.x, xy.y, zw.x, zw.y}; } + +// int +SLANG_FORCE_INLINE SLANG_CUDA_CALL int2 convert_int2(const __half2& v) { return int2 { __half2int_rz(v.x), __half2int_rz(v.y) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL int3 convert_int3(const __half3& v) { return int3 { __half2int_rz(v.xy.x), __half2int_rz(v.xy.y), __half2int_rz(v.z) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL int4 convert_int4(const __half4& v) { return int4 { __half2int_rz(v.xy.x), __half2int_rz(v.xy.y), __half2int_rz(v.zw.x), __half2int_rz(v.zw.y)}; } + +// uint +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 convert_uint2(const __half2& v) { return uint2 { __half2uint_rz(v.x), __half2uint_rz(v.y) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint3 convert_uint3(const __half3& v) { return uint3 { __half2uint_rz(v.xy.x), __half2uint_rz(v.xy.y), __half2uint_rz(v.z) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 convert_uint4(const __half4& v) { return uint4 { __half2uint_rz(v.xy.x), __half2uint_rz(v.xy.y), __half2uint_rz(v.zw.x), __half2uint_rz(v.zw.y)}; } + +// other -> half + +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const float2& v) { return __float22half2_rn(v); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const float3& v) { return __half3{ __float22half2_rn(float2{v.x, v.y}), __float2half_rn(v.z) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const float4& v) { return __half4{ __float22half2_rn(float2{v.x, v.y}), __float22half2_rn(float2{v.z, v.w}) }; } + +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const int2& v) { return __half2{ __int2half_rz(v.x), __int2half_rz(v.y) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const int3& v) { return __half3{ __half2{__int2half_rz(v.x), __int2half_rz(v.y)}, __int2half_rz(v.z) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const int4& v) { return __half4{ __half2{__int2half_rz(v.x), __int2half_rz(v.y)}, __half2{__int2half_rz(v.z), __int2half_rz(v.w)} }; } + +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const uint2& v) { return __half2{ __uint2half_rz(v.x), __uint2half_rz(v.y) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const uint3& v) { return __half3{ __half2{__uint2half_rz(v.x), __uint2half_rz(v.y)}, __uint2half_rz(v.z) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const uint4& v) { return __half4{ __half2{__uint2half_rz(v.x), __uint2half_rz(v.y)}, __half2{__uint2half_rz(v.z), __uint2half_rz(v.w)} }; } + +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 convert___half2(const double2& v) { return __float22half2_rn(float2{v.x, v.y}); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 convert___half3(const double3& v) { return __half3{ __float22half2_rn(float2{v.x, v.y}), __float2half_rn(v.z) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 convert___half4(const double4& v) { return __half4{ __float22half2_rn(float2{v.x, v.y}), __float22half2_rn(float2{v.z, v.w}) }; } + +// half2 + +// vec op scalar +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator+(const __half2& lh, __half rh) { return __hadd2(lh, __half2half2(rh)); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator-(const __half2& lh, __half rh) { return __hsub2(lh, __half2half2(rh)); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator*(const __half2& lh, __half rh) { return __hmul2(lh, __half2half2(rh)); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator/(const __half2& lh, __half rh) { return __h2div(lh, __half2half2(rh)); } + +// scalar op vec +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator+(__half lh, const __half2& rh) { return __hadd2(__half2half2(lh), rh); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator-(__half lh, const __half2& rh) { return __hsub2(__half2half2(lh), rh); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator*(__half lh, const __half2& rh) { return __hmul2(__half2half2(lh), rh); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 operator/(__half lh, const __half2& rh) { return __h2div(__half2half2(lh), rh); } + +// Mechanism to make half vectors +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 make___half2(__half x, __half y) { return __halves2half2(x, y); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 make___half3(__half x, __half y, __half z) { return __half3{ __halves2half2(x, y), z }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 make___half4(__half x, __half y, __half z, __half w) { return __half4{ __halves2half2(x, y), __halves2half2(z, w)}; } + +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 constructFromScalar___half2(half x) { return __half2half2(x); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 constructFromScalar___half3(half x) { return __half3{__half2half2(x), x}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 constructFromScalar___half4(half x) { const __half2 v = __half2half2(x); return __half4{v, v}; } + +// Half3 maths ops + +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3& lh, const __half3& rh) { return __half3{__hadd2(lh.xy, rh.xy), __hadd(lh.z, rh.z)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3& lh, const __half3& rh) { return __half3{__hsub2(lh.xy, rh.xy), __hsub(lh.z, rh.z)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(const __half3& lh, const __half3& rh) { return __half3{__hmul2(lh.xy, rh.xy), __hmul(lh.z, rh.z)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(const __half3& lh, const __half3& rh) { return __half3{__h2div(lh.xy, rh.xy), __hdiv(lh.z, rh.z)}; } + +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3& h) { return __half3{__hneg2(h.xy), __hneg(h.z)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3& h) { return h; } + +// vec op scalar +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(const __half3& lh, __half rh) { return __half3{__hadd2(lh.xy, __half2half2(rh)), __hadd(lh.z, rh)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(const __half3& lh, __half rh) { return __half3{__hsub2(lh.xy, __half2half2(rh)), __hsub(lh.z, rh)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(const __half3& lh, __half rh) { return __half3{__hmul2(lh.xy, __half2half2(rh)), __hmul(lh.z, rh)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(const __half3& lh, __half rh) { return __half3{__h2div(lh.xy, __half2half2(rh)), __hdiv(lh.z, rh)}; } + +// scalar op vec +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator+(__half lh, const __half3& rh) { return __half3{__hadd2(__half2half2(lh), rh.xy), __hadd(lh, rh.z)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator-(__half lh, const __half3& rh) { return __half3{__hsub2(__half2half2(lh), rh.xy), __hsub(lh, rh.z)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator*(__half lh, const __half3& rh) { return __half3{__hmul2(__half2half2(lh), rh.xy), __hmul(lh, rh.z)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 operator/(__half lh, const __half3& rh) { return __half3{__h2div(__half2half2(lh), rh.xy), __hdiv(lh, rh.z)}; } + + +#if 0 +// We need to return the vector<bool> type +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator==(const __half3& lh, const __half3& rh) { return __hbeq2(lh.xy, rh.xy) && __heq(lh.z, rh.z); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator!=(const __half3& lh, const __half3& rh) { return __hbneu2(lh.xy, rh.xy) && __hneu(lh.z, rh.z); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>(const __half3& lh, const __half3& rh) { return __hbgt2(lh.xy, rh.xy) && __hgt(lh.z, rh.z); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<(const __half3& lh, const __half3& rh) { return __hblt2(lh.xy, rh.xy) && __hlt(lh.z, rh.z); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>=(const __half3& lh, const __half3& rh) { return __hbge2(lh.xy, rh.xy) && __hge(lh.z, rh.z); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<=(const __half3& lh, const __half3& rh) { return __hble2(lh.xy, rh.xy) && __hle(lh.z, rh.z); } +#endif + +// Half4 maths ops + +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4& lh, const __half4& rh) { return __half4{__hadd2(lh.xy, rh.xy), __hadd2(lh.zw, rh.zw)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4& lh, const __half4& rh) { return __half4{__hsub2(lh.xy, rh.xy), __hsub2(lh.zw, rh.zw)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(const __half4& lh, const __half4& rh) { return __half4{__hmul2(lh.xy, rh.xy), __hmul2(lh.zw, rh.zw)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(const __half4& lh, const __half4& rh) { return __half4{__h2div(lh.xy, rh.xy), __h2div(lh.zw, rh.zw)}; } + +// vec op scalar +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__hadd2(lh.xy, rhv), __hadd2(lh.zw, rhv)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__hsub2(lh.xy, rhv), __hsub2(lh.zw, rhv)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__hmul2(lh.xy, rhv), __hmul2(lh.zw, rhv)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(const __half4& lh, __half rh) { const __half2 rhv = __half2half2(rh); return __half4{__h2div(lh.xy, rhv), __h2div(lh.zw, rhv)}; } + +// scalar op vec +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__hadd2(lhv, rh.xy), __hadd2(lhv, rh.zw)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__hsub2(lhv, rh.xy), __hsub2(lhv, rh.zw)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator*(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__hmul2(lhv, rh.xy), __hmul2(lhv, rh.zw)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator/(__half lh, const __half4& rh) { const __half2 lhv = __half2half2(lh); return __half4{__h2div(lhv, rh.xy), __h2div(lhv, rh.zw)}; } + +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator-(const __half4& h) { return __half4{__hneg2(h.xy), __hneg2(h.zw)}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 operator+(const __half4& h) { return h; } + +#if 0 +// We need to return vector<bool> type +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator==(const __half4& lh, const __half4& rh) { return __hbeq2(lh.xy, rh.xy) && __hbeq2(lh.zw, rh.zw); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator!=(const __half4& lh, const __half4& rh) { return __hbneu2(lh.xy, rh.xy) && __hbneu2(lh.zw, rh.zw); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>(const __half4& lh, const __half4& rh) { return __hbgt2(lh.xy, rh.xy) && __hbgt2(lh.zw, rh.zw); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<(const __half4& lh, const __half4& rh) { return __hblt2(lh.xy, rh.xy) && __hblt2(lh.zw, rh.zw); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator>=(const __half4& lh, const __half4& rh) { return __hbge2(lh.xy, rh.xy) && __hbge2(lh.zw, rh.zw); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool operator<=(const __half4& lh, const __half4& rh) { return __hble2(lh.xy, rh.xy) && __hble2(lh.zw, rh.zw); } +#endif + +// Use the round nearest as the default - it is the only one defined +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __float22half2(const float2 a) { return __float22half2_rn(a); } + +// Implement the vector versions +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __float2half(float2 a) { return __float22half2(a); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __float2half(float3 a) { __half3 o; o.xy = __float22half2(make_float2(a.x, a.y)); o.z = __float2half(a.z); return o; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __float2half(float4 a) { __half4 o; o.xy = __float22half2(make_float2(a.x, a.y)); o.zw = __float22half2(make_float2(a.z, a.w)); return o; } + +SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 __half2float(__half2 a) { return __half22float2(a); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL float3 __half2float(__half3 a) { float2 xy = __half22float2(a.xy); float z = __half2float(a.z); return make_float3(xy.x, xy.y, z); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 __half2float(__half4 a) { float2 xy = __half22float2(a.xy); float2 zw = __half22float2(a.zw); return make_float4(xy.x, xy.y, zw.x, zw.y); } + +#endif + // ----------------------------- F32 ----------------------------------------- // Unary diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp index 5f7eada68..b0c2cc02b 100644 --- a/source/slang/slang-emit-cuda.cpp +++ b/source/slang/slang-emit-cuda.cpp @@ -173,6 +173,33 @@ void CUDASourceEmitter::emitSpecializedOperationDefinition(const HLSLIntrinsic* if (auto vecType = as <IRVectorType>(specOp->returnType)) { + // Converting to or from half vector types is implemented prelude as convert___half functions + // Get the from type -> if it's half we ignore + + if (specOp->op == Op::ConstructConvert) + { + auto signatureType = specOp->signatureType; + + // Need to have impl of convert_float, double, int, uint, in prelude + + const auto paramCount = signatureType->getParamCount(); + SLANG_UNUSED(paramCount); + + // We have 2 'params' and param 1 is the source type + SLANG_ASSERT(paramCount == 2); + IRType* paramType = signatureType->getParamType(1); + + auto vecParamType = as<IRVectorType>(paramType); + + if (auto baseType = as<IRBasicType>(vecParamType->getElementType())) + { + if (baseType->getBaseType() == BaseType::Half) + { + return; + } + } + } + if (auto baseType = as<IRBasicType>(vecType->getElementType())) { if (baseType->getBaseType() == BaseType::Half) @@ -187,6 +214,7 @@ void CUDASourceEmitter::emitSpecializedOperationDefinition(const HLSLIntrinsic* case Op::Neg: case Op::ConstructFromScalar: + case Op::ConstructConvert: case Op::Leq: case Op::Less: diff --git a/tests/compute/half-vector-calc.slang b/tests/compute/half-vector-calc.slang index 5594c38fd..3ae204796 100644 --- a/tests/compute/half-vector-calc.slang +++ b/tests/compute/half-vector-calc.slang @@ -23,6 +23,18 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) v1 += v2.wzy; v2 += v0.xyxy; + // Unary + v2 = +v2.yxwz; + v2 = -v2.zwxy; + + // Scalar vector + v1 = v1 + v2.x; + v2 = v2 * half(2.0f); + v0 = half(2.0f) * v0; + v2 = v2 / half(2.0f); + + v0 *= half(2.0f); + v0 = v0 + v0 * v0; v1 = v1 + v1 * v1; v2 = v2 + v2 * v2; diff --git a/tests/compute/half-vector-calc.slang.expected.txt b/tests/compute/half-vector-calc.slang.expected.txt index 64beb1dd1..49c339529 100644 --- a/tests/compute/half-vector-calc.slang.expected.txt +++ b/tests/compute/half-vector-calc.slang.expected.txt @@ -1,5 +1,5 @@ type: float -20.000000 -98.500000 -292.000000 -600.500000 +30.000000 +161.500000 +492.000000 +1021.500000 |
