diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2021-04-23 11:32:07 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2021-04-23 11:32:07 -0400 |
| commit | 79e722338cd59aab74b4c57600c5ac6bce3bcd25 (patch) | |
| tree | af77066235c4038bd15c6297ef4f48d3e562171d /prelude | |
| parent | a47e7751c2738543e872452debc7494369c9fb35 (diff) | |
Preliminary CUDA Half support (#1808)
* #include an absolute path didn't work - because paths were taken to always be relative.
* WIP CUDA half support.
* Working support for half on CUDA - requires cuda_fp16.h and associated files can be found.
* Fix for win32 for unused funcs.
* Fix for Clang.
* Hack to disable unused local function warning.
Diffstat (limited to 'prelude')
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 40 |
1 files changed, 39 insertions, 1 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index a975ec99c..c6de56641 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -1,8 +1,17 @@ +// Define SLANG_CUDA_ENABLE_HALF to use the cuda_fp16 include to add half support. +// For this to work NVRTC needs to have the path to the CUDA SDK. +// +// As it stands the includes paths defined for Slang are passed down to NVRTC. Similarly defines defined for the Slang compile +// are passed down. + +#ifdef SLANG_CUDA_ENABLE_HALF +#include <cuda_fp16.h> +#endif + #ifdef SLANG_CUDA_ENABLE_OPTIX #include <optix.h> #endif - // Must be large enough to cause overflow and therefore infinity #ifndef SLANG_INFINITY # define SLANG_INFINITY ((float)(1e+300 * 1e+300)) @@ -44,6 +53,35 @@ # define SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, count) SLANG_PRELUDE_ASSERT(index < count); #endif +// +// Half support +// + +#if SLANG_CUDA_ENABLE_HALF + +// Add the other vector half types +struct __half3 { __half2 xy; __half z; }; +struct __half4 { __half2 xy; __half2 zw; }; + +// Mechanism to make half vectors +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 make___half2(__half x, __half y) { return __halves2half2(x, y); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 make___half3(__half x, __half y, __half z) { __half3 o; o.xy = __halves2half2(x, y); o.z = z; return o; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 make___half4(__half x, __half y, __half z, __half w) { __half4 o; o.xy = __halves2half2(x, y); o.zw = __halves2half2(z, w); return o; } + +// Use the round nearest as the default - it is the only one defined +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __float22half2(const float2 a) { return __float22half2_rn(a); } + +// Implement the vector versions +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __float2half(float2 a) { return __float22half2(a); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __float2half(float3 a) { __half3 o; o.xy = __float22half2(make_float2(a.x, a.y)); o.z = __float2half(a.z); return o; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __float2half(float4 a) { __half4 o; o.xy = __float22half2(make_float2(a.x, a.y)); o.zw = __float22half2(make_float2(a.z, a.w)); return o; } + +SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 __half2float(__half2 a) { return __half22float2(a); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL float3 __half2float(__half3 a) { float2 xy = __half22float2(a.xy); float z = __half2float(a.z); return make_float3(xy.x, xy.y, z); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 __half2float(__half4 a) { float2 xy = __half22float2(a.xy); float2 zw = __half22float2(a.zw); return make_float4(xy.x, xy.y, zw.x, zw.y); } + +#endif + // This macro handles how out-of-range surface coordinates are handled; // I can equal // cudaBoundaryModeClamp, in which case out-of-range coordinates are clamped to the valid range |
