Surface access on CUDA is byte addressed in X (#1841)

* #include of an absolute path didn't work, because paths were always taken to be relative.
* Fix for writing to RWTexture with half types on CUDA.
* CUDA half functionality doc updates.
* First pass support for sust.p RWTexture format conversion on write.
* Tidy up implementation of $C. Made clamping mode #define-able.
* A simple test for RWTexture CUDA format conversion.
* Use $E to fix byte addressing in X in CUDA.
* Do not scale when accessing via _convert versions of surface functions.
author: jsmall-nvidia <jsmall@nvidia.com> 2021-05-15 11:22:14 -0400
committer: GitHub <noreply@github.com> 2021-05-15 11:22:14 -0400
commit: bfe75618be81566882be8570b8db82ad5a2f8fe4 (patch)
tree: 1a319bee77f8faa4c09f385287d2dffdc569499e
parent: 1027225ac7ec8da0e471b633f358333c8a95b010 (diff)
3 files changed, 88 insertions, 6 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index a439d274a..a18da027b 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -385,6 +385,9 @@ SLANG_SURFACE_WRITE(surfCubemapLayeredwrite, (int x, int y, int layerFace), (x,
 
 // Support for doing format conversion when writing to a surface/RWTexture
 
+// NOTE! For normal surface access x values are *byte* addressed.
+// For the _convert versions they are *not*. They don't need to be because sust.p does not require it.
+
 template <typename T>
 SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode);
 template <typename T>
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index c268c2a58..9e5cf80c8 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -1091,6 +1091,11 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                                 if (vecCount > 1)
                                 {
                                     sb << '.' << char(i + 'x');
+                                    // Surface access is *byte* addressed in x in CUDA
+                                    if (i == 0)
+                                    {
+                                        sb << " * $E";
+                                    }
                                 }
                             }
 
@@ -1140,6 +1145,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                                 {
                                     sb << '.' << char(i + 'x');
                                 }
+
+                                // Surface access is *byte* addressed in x in CUDA
+                                if (i == 0)
+                                {
+                                    sb << " * $E";
+                                }
                             }
 
                             sb << ", SLANG_CUDA_BOUNDARY_MODE)\")\n";
diff --git a/source/slang/slang-intrinsic-expand.cpp b/source/slang/slang-intrinsic-expand.cpp
index c1e886621..c5bedbc37 100644
--- a/source/slang/slang-intrinsic-expand.cpp
+++ b/source/slang/slang-intrinsic-expand.cpp
@@ -101,7 +101,7 @@ static BaseType _getBaseTypeFromScalarType(SlangScalarType type)
 // The VK back-end gets away with this kind of coincidentally, since the "legalization" we have to do for resources means that there wouldn't be a single f() function any more.
 // But for CUDA and C++ that's not the case or generally desirable.
 
-IRFormatDecoration* _findImageFormatDecoration(IRInst* inst)
+static IRFormatDecoration* _findImageFormatDecoration(IRInst* inst)
 {
     // JS(TODO):
     // There could perhaps be other situations, that need to be covered
@@ -119,7 +119,9 @@ IRFormatDecoration* _findImageFormatDecoration(IRInst* inst)
     return inst->findDecoration<IRFormatDecoration>();
 }
 
-bool _isImageFormatCompatible(ImageFormat imageFormat, IRType* dataType)
+// Returns true if dataType and imageFormat are compatible - that they have the same representation,
+// and no conversion is required.
+static bool _isImageFormatCompatible(ImageFormat imageFormat, IRType* dataType)
 {
     int numElems = 1;
 
@@ -147,6 +149,63 @@ bool _isImageFormatCompatible(ImageFormat imageFormat, IRType* dataType)
     return formatBaseType == baseType;
 }
 
+// Returns true if resourceVar is a texture whose element type is not compatible with imageFormat (so a converting access is needed).
+static bool _isConvertRequired(ImageFormat imageFormat, IRInst* resourceVar)
+{
+    auto textureType = as<IRTextureTypeBase>(resourceVar->getDataType());
+    IRType* elementType = textureType ? textureType->getElementType() : nullptr;
+    return elementType && !_isImageFormatCompatible(imageFormat, elementType);
+}
+// Returns the scale that $E applies to the x coordinate of a CUDA surface access: the byte size of the
+// resource's backing element, or 1 when the access uses a converting (_convert) function.
+static size_t _calcBackingElementSizeInBytes(IRInst* resourceVar)
+{
+    // First see if there is a format associated with the resource
+    if (IRFormatDecoration* formatDecoration = _findImageFormatDecoration(resourceVar))
+    {
+        const ImageFormat imageFormat = formatDecoration->getFormat();
+
+        if (_isConvertRequired(imageFormat, resourceVar))
+        {
+            // A converting access does *NOT* scale the x coordinate (a CUDA specific issue), so the factor is 1.
+            return 1;
+        }
+
+        const auto& imageFormatInfo = getImageFormatInfo(imageFormat);
+        return imageFormatInfo.sizeInBytes;
+    }
+    else
+    {
+        // No format decoration: we *assume* the backing format matches the element type used for access,
+        // i.e. for RWTexture<T> this returns sizeof(T).
+        auto textureType = as<IRTextureTypeBase>(resourceVar->getDataType());
+        IRType* elementType = textureType ? textureType->getElementType() : nullptr;
+
+        if (elementType)
+        {
+            int numElems = 1;
+            // A vector element contributes its element count; the size is then taken from its scalar type.
+            if (auto vecType = as<IRVectorType>(elementType))
+            {
+                numElems = int(getIntVal(vecType->getElementCount()));
+                elementType = vecType->getElementType();
+            }
+
+            BaseType baseType = BaseType::Void;
+            if (auto basicType = as<IRBasicType>(elementType))
+            {
+                baseType = basicType->getBaseType();
+            }
+            // NOTE(review): baseType stays Void for a non-basic element type; if its size is 0 the $E assert fires - confirm.
+            const auto& info = BaseTypeInfo::getInfo(baseType);
+            return info.sizeInBytes * numElems; 
+        }
+    }
+
+    // When in doubt 4 is not a terrible guess based on limitations around DX11 etc
+    return 4;
+}
+
 const char* IntrinsicExpandContext::_emitSpecial(const char* cursor)
 {
     const char*const end = m_text.end();
@@ -269,10 +328,7 @@ const char* IntrinsicExpandContext::_emitSpecial(const char* cursor)
                 if (IRFormatDecoration* formatDecoration = _findImageFormatDecoration(arg0))
                 {
                     const ImageFormat imageFormat = formatDecoration->getFormat();
-                    auto textureType = as<IRTextureTypeBase>(arg0->getDataType());
-                    IRType* elementType = textureType ? textureType->getElementType() : nullptr;
-
-                    if (elementType && ! _isImageFormatCompatible(imageFormat, elementType))
+                    if (_isConvertRequired(imageFormat, arg0))
                     {
                         // Append _convert on the name to signify we need to use a code path, that will automatically
                         // do the format conversion.
@@ -282,6 +338,18 @@ const char* IntrinsicExpandContext::_emitSpecial(const char* cursor)
             }
             break;
         }
+
+        case 'E':
+        {
+            /// Sometimes accesses need to be scaled. For example in CUDA the x coordinate for surface
+            /// access is byte addressed.
+            /// $E will return the byte size of the *backing element*.
+            size_t elemSizeInBytes = _calcBackingElementSizeInBytes(m_callInst->getArg(0));
+            SLANG_ASSERT(elemSizeInBytes > 0);
+            m_writer->emitUInt64(UInt64(elemSizeInBytes));
+            break;
+        }
+
         case 'c':
         {
             // When doing texture access in glsl the result may need to be cast.
author	jsmall-nvidia <jsmall@nvidia.com>	2021-05-15 11:22:14 -0400
committer	GitHub <noreply@github.com>	2021-05-15 11:22:14 -0400
commit	bfe75618be81566882be8570b8db82ad5a2f8fe4 (patch)
tree	1a319bee77f8faa4c09f385287d2dffdc569499e
parent	1027225ac7ec8da0e471b633f358333c8a95b010 (diff)