Preliminary CUDA Half support (#1808)

* #include an absolute path didn't work - because paths were taken to always be relative. * WIP CUDA half support. * Working support for half on CUDA - requires cuda_fp16.h and associated files can be found. * Fix for win32 for unused funcs. * Fix for Clang. * Hack to disable unused local function warning.
author: jsmall-nvidia <jsmall@nvidia.com> 2021-04-23 11:32:07 -0400
committer: GitHub <noreply@github.com> 2021-04-23 11:32:07 -0400
commit: 79e722338cd59aab74b4c57600c5ac6bce3bcd25 (patch)
tree: af77066235c4038bd15c6297ef4f48d3e562171d /prelude
parent: a47e7751c2738543e872452debc7494369c9fb35 (diff)
1 files changed, 39 insertions, 1 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index a975ec99c..c6de56641 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -1,8 +1,17 @@
+// Define SLANG_CUDA_ENABLE_HALF to use the cuda_fp16 include to add half support. 
+// For this to work NVRTC needs to have the path to the CUDA SDK.
+//
+// As it stands the includes paths defined for Slang are passed down to NVRTC. Similarly defines defined for the Slang compile
+// are passed down. 
+
+#ifdef SLANG_CUDA_ENABLE_HALF
+#include <cuda_fp16.h>
+#endif
+
 #ifdef SLANG_CUDA_ENABLE_OPTIX
 #include <optix.h>
 #endif
 
-
 // Must be large enough to cause overflow and therefore infinity
 #ifndef SLANG_INFINITY
 #   define SLANG_INFINITY   ((float)(1e+300 * 1e+300))
@@ -44,6 +53,35 @@
 #   define SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, count) SLANG_PRELUDE_ASSERT(index < count); 
 #endif
 
+//
+// Half support
+// 
+
+#if SLANG_CUDA_ENABLE_HALF
+
+// Add the other vector half types
+struct __half3 { __half2 xy; __half z; };
+struct __half4 { __half2 xy; __half2 zw; };
+
+// Mechanism to make half vectors
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 make___half2(__half x, __half y) { return __halves2half2(x, y); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 make___half3(__half x, __half y, __half z) { __half3 o; o.xy = __halves2half2(x, y); o.z = z; return o; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 make___half4(__half x, __half y, __half z, __half w) { __half4 o; o.xy = __halves2half2(x, y); o.zw = __halves2half2(z, w); return o; }
+
+// Use the round nearest as the default - it is the only one defined
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __float22half2(const float2 a) { return __float22half2_rn(a); }
+
+// Implement the vector versions
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __float2half(float2 a) { return __float22half2(a); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __float2half(float3 a) { __half3 o; o.xy = __float22half2(make_float2(a.x, a.y)); o.z = __float2half(a.z); return o; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __float2half(float4 a) { __half4 o; o.xy = __float22half2(make_float2(a.x, a.y)); o.zw = __float22half2(make_float2(a.z, a.w)); return o; }
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 __half2float(__half2 a) { return __half22float2(a); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float3 __half2float(__half3 a) { float2 xy = __half22float2(a.xy); float z = __half2float(a.z); return make_float3(xy.x, xy.y, z); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 __half2float(__half4 a) { float2 xy = __half22float2(a.xy); float2 zw = __half22float2(a.zw); return make_float4(xy.x, xy.y, zw.x, zw.y); }
+
+#endif
+
  // This macro handles how out-of-range surface coordinates are handled; 
  // I can equal
  // cudaBoundaryModeClamp, in which case out-of-range coordinates are clamped to the valid range
author	jsmall-nvidia <jsmall@nvidia.com>	2021-04-23 11:32:07 -0400
committer	GitHub <noreply@github.com>	2021-04-23 11:32:07 -0400
commit	79e722338cd59aab74b4c57600c5ac6bce3bcd25 (patch)
tree	af77066235c4038bd15c6297ef4f48d3e562171d /prelude
parent	a47e7751c2738543e872452debc7494369c9fb35 (diff)