Read half->float RWTexture conversion (#1842)

* #include of an absolute path didn't work, because paths were always taken to be relative. * Fix for writing to RWTexture with half types on CUDA. * CUDA half functionality doc updates. * First pass support for sust.p RWTexture format conversion on write. * Tidy up implementation of $C. Made clamping mode #define-able. * A simple test for RWTexture CUDA format conversion. * Add support for float2 and float4. * WIP conversion testing. * Use $E to fix byte addressing in X in CUDA. * Do not scale when accessing via _convert versions of surface functions. * Revert to previous test. * Test with half/float convert write/read. * More broad half->float read conversion testing. * Improve documentation around half and RWTexture conversion.
author: jsmall-nvidia <jsmall@nvidia.com> 2021-05-15 11:45:58 -0400
committer: GitHub <noreply@github.com> 2021-05-15 11:45:58 -0400
commit: d5e8044d0a9723bb0bbd7ae1738d1157265da783 (patch)
tree: d330e87e67646fd6e978e4debad17b4f7fbe2c40 /source
parent: bfe75618be81566882be8570b8db82ad5a2f8fe4 (diff)
2 files changed, 64 insertions, 24 deletions
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 9e5cf80c8..6b73630a3 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -1083,7 +1083,7 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                             }
 
                             sb << (isArray ? "Layered" : "");
-                            sb << "read<$T0>($0";
+                            sb << "read$C<$T0>($0";
                                 
                             for (int i = 0; i < vecCount; ++i)
                             {
diff --git a/source/slang/slang-intrinsic-expand.cpp b/source/slang/slang-intrinsic-expand.cpp
index c5bedbc37..bd2e17b28 100644
--- a/source/slang/slang-intrinsic-expand.cpp
+++ b/source/slang/slang-intrinsic-expand.cpp
@@ -1,6 +1,8 @@
 // slang-intrinsic-expand.cpp
 #include "slang-intrinsic-expand.h"
 
+#include "slang-emit-cuda.h"
+
 namespace Slang {
 
 void IntrinsicExpandContext::emit(IRCall* inst, IRUse* args, Int argCount, const UnownedStringSlice& intrinsicText)
@@ -101,13 +103,13 @@ static BaseType _getBaseTypeFromScalarType(SlangScalarType type)
 // The VK back-end gets away with this kind of coincidentally, since the "legalization" we have to do for resources means that there wouldn't be a single f() function any more.
 // But for CUDA and C++ that's not the case or generally desirable.
 
-static IRFormatDecoration* _findImageFormatDecoration(IRInst* inst)
+static IRFormatDecoration* _findImageFormatDecoration(IRInst* resourceInst)
 {
     // JS(TODO):
     // There could perhaps be other situations, that need to be covered
 
     // If this is a load, we need to get the decoration from the field key
-    if (IRLoad* load = as<IRLoad>(inst))
+    if (IRLoad* load = as<IRLoad>(resourceInst))
     {
         if (IRFieldAddress* fieldAddress = as<IRFieldAddress>(load->getOperand(0)))
         {
@@ -116,7 +118,7 @@ static IRFormatDecoration* _findImageFormatDecoration(IRInst* inst)
         }
     }
     // Otherwise just try on the instruction
-    return inst->findDecoration<IRFormatDecoration>();
+    return resourceInst->findDecoration<IRFormatDecoration>();
 }
 
 // Returns true if dataType and imageFormat are compatible - that they have the same representation,
@@ -149,36 +151,26 @@ static bool _isImageFormatCompatible(ImageFormat imageFormat, IRType* dataType)
     return formatBaseType == baseType;
 }
 
-static bool _isConvertRequired(ImageFormat imageFormat, IRInst* resourceVar)
+static bool _isConvertRequired(ImageFormat imageFormat, IRInst* callee)
 {
-    auto textureType = as<IRTextureTypeBase>(resourceVar->getDataType());
+    auto textureType = as<IRTextureTypeBase>(callee->getDataType());
     IRType* elementType = textureType ? textureType->getElementType() : nullptr;
     return elementType && !_isImageFormatCompatible(imageFormat, elementType);
 }
 
-static size_t _calcBackingElementSizeInBytes(IRInst* resourceVar)
+static size_t _calcBackingElementSizeInBytes(IRInst* resourceInst)
 {
     // First see if there is a format associated with the resource
-    if (IRFormatDecoration* formatDecoration = _findImageFormatDecoration(resourceVar))
+    if (IRFormatDecoration* formatDecoration = _findImageFormatDecoration(resourceInst))
     {
-        const ImageFormat imageFormat = formatDecoration->getFormat();
-
-        if (_isConvertRequired(imageFormat, resourceVar))
-        {
-            // If the access is a converting access then the x coordinate is *NOT* scaled
-            // This is a CUDA specific issue(!).
-            return 1;
-        }
-
-        const auto& imageFormatInfo = getImageFormatInfo(imageFormat);
-        return imageFormatInfo.sizeInBytes;
+        return getImageFormatInfo(formatDecoration->getFormat()).sizeInBytes;
     }
     else
     {
         // If not we *assume* the backing format is the same as the element type used for access.
         /// Ie in RWTexture<T>, this would return sizeof(T)
 
-        auto textureType = as<IRTextureTypeBase>(resourceVar->getDataType());
+        auto textureType = as<IRTextureTypeBase>(resourceInst->getDataType());
         IRType* elementType = textureType ? textureType->getElementType() : nullptr;
 
         if (elementType)
@@ -206,6 +198,18 @@ static size_t _calcBackingElementSizeInBytes(IRInst* resourceVar)
     return 4;
 }
 
+static bool _isResourceRead(IRCall* call)
+{
+    IRType* returnType = call->getDataType();
+    return returnType && (as<IRVoidType>(returnType) == nullptr);
+}
+
+static bool _isResourceWrite(IRCall* call)
+{
+    IRType* returnType = call->getDataType();
+    return returnType && (as<IRVoidType>(returnType) != nullptr);
+}
+
 const char* IntrinsicExpandContext::_emitSpecial(const char* cursor)
 {
     const char*const end = m_text.end();
@@ -323,13 +327,35 @@ const char* IntrinsicExpandContext::_emitSpecial(const char* cursor)
             // writes that will do a format conversion.
             if (m_emitter->getTarget() == CodeGenTarget::CUDASource)
             {
-                IRInst* arg0 = m_callInst->getArg(0);
+                IRInst* resourceInst = m_callInst->getArg(0);
 
-                if (IRFormatDecoration* formatDecoration = _findImageFormatDecoration(arg0))
+                if (IRFormatDecoration* formatDecoration = _findImageFormatDecoration(resourceInst))
                 {
                     const ImageFormat imageFormat = formatDecoration->getFormat();
-                    if (_isConvertRequired(imageFormat, arg0))
+                    if (_isConvertRequired(imageFormat, resourceInst))
                     {
+                        // If the function returns something it's a reader so we may need to convert
+                        // and in doing so require half
+                        if (_isResourceRead(m_callInst))
+                        {
+                            // If the source format is half-derived, then we need to enable half
+                            switch (imageFormat)
+                            {
+                                case ImageFormat::r16f:
+                                case ImageFormat::rg16f:
+                                case ImageFormat::rgba16f:
+                                {
+                                    CUDAExtensionTracker* extensionTracker = as<CUDAExtensionTracker>(m_emitter->getExtensionTracker());
+                                    if (extensionTracker)
+                                    {
+                                        extensionTracker->requireBaseType(BaseType::Half);
+                                    }
+                                    break;
+                                }
+                                default: break;
+                            }
+                        }
+
                         // Append _convert on the name to signify we need to use a code path, that will automatically
                         // do the format conversion.
                         m_writer->emit("_convert");
@@ -344,7 +370,21 @@ const char* IntrinsicExpandContext::_emitSpecial(const char* cursor)
             /// Sometimes accesses need to be scaled. For example in CUDA the x coordinate for surface
             /// access is byte addressed.
             /// $E will return the byte size of the *backing element*.
-            size_t elemSizeInBytes = _calcBackingElementSizeInBytes(m_callInst->getArg(0));
+
+            IRInst* resourceInst = m_callInst->getArg(0);
+            size_t elemSizeInBytes = _calcBackingElementSizeInBytes(resourceInst);
+
+            // If we have a format conversion and it's a *write* we don't need to scale
+            if (IRFormatDecoration* formatDecoration = _findImageFormatDecoration(resourceInst))
+            {
+                const ImageFormat imageFormat = formatDecoration->getFormat();
+                if (_isConvertRequired(imageFormat, resourceInst) && _isResourceWrite(m_callInst))
+                {
+                    // If there is a conversion *and* it's a write we don't need to scale.
+                    elemSizeInBytes = 1;
+                }
+            }
+
             SLANG_ASSERT(elemSizeInBytes > 0);
             m_writer->emitUInt64(UInt64(elemSizeInBytes));
             break;
author	jsmall-nvidia <jsmall@nvidia.com>	2021-05-15 11:45:58 -0400
committer	GitHub <noreply@github.com>	2021-05-15 11:45:58 -0400
commit	d5e8044d0a9723bb0bbd7ae1738d1157265da783 (patch)
tree	d330e87e67646fd6e978e4debad17b4f7fbe2c40 /source
parent	bfe75618be81566882be8570b8db82ad5a2f8fe4 (diff)