From 79e722338cd59aab74b4c57600c5ac6bce3bcd25 Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Fri, 23 Apr 2021 11:32:07 -0400 Subject: Preliminary CUDA Half support (#1808) * #include an absolute path didn't work - because paths were taken to always be relative. * WIP CUDA half support. * Working support for half on CUDA - requires cuda_fp16.h and associated files can be found. * Fix for win32 for unused funcs. * Fix for Clang. * Hack to disable unused local function warning. --- source/slang/hlsl.meta.slang | 46 ++++++++++++++++++++++++++++++++++++++-- source/slang/slang-compiler.cpp | 5 +++++ source/slang/slang-emit-cuda.cpp | 24 ++++++++++----------- source/slang/slang-emit-cuda.h | 14 ++++++++++-- 4 files changed, 73 insertions(+), 16 deletions(-) (limited to 'source/slang') diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 761016866..754b3ac63 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -1066,6 +1066,7 @@ matrix asuint(matrix x) __target_intrinsic(hlsl) __target_intrinsic(glsl, "uint16_t(packHalf2x16(vec2($0, 0.0)))") +__target_intrinsic(cuda, "__half_as_ushort") uint16_t asuint16(float16_t value); vector asuint16(vector value) @@ -1078,6 +1079,7 @@ matrix asuint16(matrix va __target_intrinsic(hlsl) __target_intrinsic(glsl, "float16_t(unpackHalf2x16($0).x)") +__target_intrinsic(cuda, "__ushort_as_half") float16_t asfloat16(uint16_t value); vector asfloat16(vector value) @@ -1088,11 +1090,16 @@ matrix asfloat16(matrix v // Float<->signed cases: -__target_intrinsic(hlsl) [__unsafeForceInlineEarly] int16_t asint16(float16_t value) { return asuint16(value); } +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "__half_as_short") +[__unsafeForceInlineEarly] int16_t asint16(float16_t value) { return asuint16(value); } __target_intrinsic(hlsl) [__unsafeForceInlineEarly] vector asint16(vector value) { return asuint16(value); } __target_intrinsic(hlsl) [__unsafeForceInlineEarly] matrix asint16(matrix value) { return asuint16(value); } -__target_intrinsic(hlsl) [__unsafeForceInlineEarly] float16_t asfloat16(int16_t value) { return asfloat16(asuint16(value)); } +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "__short_as_half") +[__unsafeForceInlineEarly] float16_t asfloat16(int16_t value) { return asfloat16(asuint16(value)); } + __target_intrinsic(hlsl) [__unsafeForceInlineEarly] vector asfloat16(vector value) { return asfloat16(asuint16(value)); } __target_intrinsic(hlsl) [__unsafeForceInlineEarly] matrix asfloat16(matrix value) { return asfloat16(asuint16(value)); } @@ -1593,6 +1600,8 @@ vector f16tof32(vector value) VECTOR_MAP_UNARY(float, N, f16tof32, value); } + + // Convert to 16-bit float stored in low bits of integer __target_intrinsic(glsl, "packHalf2x16(vec2($0,0.0))") __glsl_version(420) @@ -1606,6 +1615,39 @@ vector f32tof16(vector value) VECTOR_MAP_UNARY(uint, N, f32tof16, value); } +// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +// The following is Slang specific and NOT part of standard HLSL +// It's not clear what happens with float16 time in HLSL -> can the float16 coerce to uint for example? If so that would +// give the wrong result + +__target_intrinsic(glsl, "unpackHalf2x16($0).x") +__target_intrinsic(cuda, "__half2float") +__glsl_version(420) +float f16tof32(float16_t value); + +__generic +__target_intrinsic(hlsl) +__target_intrinsic(cuda, "__half2float") +vector f16tof32(vector value) +{ + VECTOR_MAP_UNARY(float, N, f16tof32, value); +} + +// Convert to float16_t +__target_intrinsic(glsl, "packHalf2x16(vec2($0,0.0))") +__glsl_version(420) +__target_intrinsic(cuda, "__float2half") +float16_t f32tof16_(float value); + +__generic +__target_intrinsic(cuda, "__float2half") +vector f32tof16_(vector value) +{ + VECTOR_MAP_UNARY(uint, N, f32tof16, value); +} + +// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // Flip surface normal to face forward, if needed __generic __target_intrinsic(hlsl) diff --git a/source/slang/slang-compiler.cpp b/source/slang/slang-compiler.cpp index 19a5fddf8..1d416634a 100755 --- a/source/slang/slang-compiler.cpp +++ b/source/slang/slang-compiler.cpp @@ -1421,6 +1421,11 @@ SlangResult dissassembleDXILUsingDXC( options.requiredCapabilityVersions.add(version); } + + if (cudaTracker->isBaseTypeRequired(BaseType::Half)) + { + options.flags |= CompileOptions::Flag::EnableFloat16; + } } options.sourceContents = source.source; diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp index 2f5a9917d..a259ea933 100644 --- a/source/slang/slang-emit-cuda.cpp +++ b/source/slang/slang-emit-cuda.cpp @@ -27,7 +27,7 @@ static bool _isSingleNameBasicType(IROp op) } } -/* static */ UnownedStringSlice CUDASourceEmitter::getBuiltinTypeName(IROp op) +UnownedStringSlice CUDASourceEmitter::getBuiltinTypeName(IROp op) { switch (op) { @@ -44,8 +44,11 @@ static bool _isSingleNameBasicType(IROp op) case kIROp_UIntType: return UnownedStringSlice("uint"); case kIROp_UInt64Type: return UnownedStringSlice("ulonglong"); - // Not clear just yet how we should handle half... we want all processing as float probly, but when reading/writing to memory converting - case kIROp_HalfType: return UnownedStringSlice("half"); + case kIROp_HalfType: + { + m_extensionTracker->requireBaseType(BaseType::Half); + return UnownedStringSlice("__half"); + } case kIROp_FloatType: return UnownedStringSlice("float"); case kIROp_DoubleType: return UnownedStringSlice("double"); @@ -54,7 +57,7 @@ static bool _isSingleNameBasicType(IROp op) } -/* static */ UnownedStringSlice CUDASourceEmitter::getVectorPrefix(IROp op) +UnownedStringSlice CUDASourceEmitter::getVectorPrefix(IROp op) { switch (op) { @@ -70,8 +73,11 @@ static bool _isSingleNameBasicType(IROp op) case kIROp_UIntType: return UnownedStringSlice("uint"); case kIROp_UInt64Type: return UnownedStringSlice("ulonglong"); - // Not clear just yet how we should handle half... we want all processing as float probly, but when reading/writing to memory converting - case kIROp_HalfType: return UnownedStringSlice("half"); + case kIROp_HalfType: + { + m_extensionTracker->requireBaseType(BaseType::Half); + return UnownedStringSlice("__half"); + } case kIROp_FloatType: return UnownedStringSlice("float"); case kIROp_DoubleType: return UnownedStringSlice("double"); @@ -160,12 +166,6 @@ SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, switch (type->getOp()) { - case kIROp_HalfType: - { - // Special case half - out << getBuiltinTypeName(kIROp_FloatType); - return SLANG_OK; - } case kIROp_VectorType: { auto vecType = static_cast(type); diff --git a/source/slang/slang-emit-cuda.h b/source/slang/slang-emit-cuda.h index fefa40a11..a5d227c6b 100644 --- a/source/slang/slang-emit-cuda.h +++ b/source/slang/slang-emit-cuda.h @@ -11,7 +11,17 @@ class CUDAExtensionTracker : public RefObject { public: + typedef uint32_t BaseTypeFlags; + SemanticVersion m_smVersion; + + void requireBaseType(BaseType baseType) { m_baseTypeFlags |= _getFlag(baseType); } + bool isBaseTypeRequired(BaseType baseType) { return (m_baseTypeFlags & _getFlag(baseType)) != 0; } + +protected: + static BaseTypeFlags _getFlag(BaseType baseType) { return BaseTypeFlags(1) << int(baseType); } + + BaseTypeFlags m_baseTypeFlags = 0; }; class CUDASourceEmitter : public CPPSourceEmitter @@ -30,8 +40,8 @@ public: }; }; - static UnownedStringSlice getBuiltinTypeName(IROp op); - static UnownedStringSlice getVectorPrefix(IROp op); + UnownedStringSlice getBuiltinTypeName(IROp op); + UnownedStringSlice getVectorPrefix(IROp op); virtual RefObject* getExtensionTracker() SLANG_OVERRIDE { return m_extensionTracker; } virtual void emitTempModifiers(IRInst* temp) SLANG_OVERRIDE; -- cgit v1.2.3