From 1027225ac7ec8da0e471b633f358333c8a95b010 Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Sat, 15 May 2021 10:52:55 -0400 Subject: Support for HW format conversions for RWTexture on CUDA (#1840) * #include an absolute path didn't work - because paths were taken to always be relative. * Fix for writing to RWTexture with half types on CUDA. * CUDA half functionality doc updates. * First pass support for sust.p RWTexture format conversion on write. * Tidy up implementation of $C. Made clamping mode #define able. * A simple test for RWTexture CUDA format conversion. --- prelude/slang-cuda-prelude.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'prelude') diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index a00e8f744..a439d274a 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -63,6 +63,15 @@ #ifndef SLANG_CUDA_BOUNDARY_MODE # define SLANG_CUDA_BOUNDARY_MODE cudaBoundaryModeZero + +// Can be one of SLANG_CUDA_PTX_BOUNDARY_MODE. Only applies *PTX* emitted CUDA operations +// which currently is just RWTextureRW format writes +// +// .trap causes an execution trap on out-of-bounds addresses +// .clamp stores data at the nearest surface location (sized appropriately) +// .zero drops stores to out-of-bounds addresses + +# define SLANG_PTX_BOUNDARY_MODE "zero" #endif struct TypeInfo @@ -371,9 +380,31 @@ SLANG_SURFACE_WRITE(surf1DLayeredwrite, (int x, int layer), (x, layer)) SLANG_SURFACE_WRITE(surf2DLayeredwrite, (int x, int y, int layer), (x, y, layer)) SLANG_SURFACE_WRITE(surfCubemapwrite, (int x, int y, int face), (x, y, face)) SLANG_SURFACE_WRITE(surfCubemapLayeredwrite, (int x, int y, int layerFace), (x, y, layerFace)) - + #endif +// Support for doing format conversion when writing to a surface/RWTexture + +template +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode); +template +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode); + +// https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust + +template <> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(float v, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode) +{ + asm volatile ( "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};}\n\t" :: "l"(surfObj),"r"(x),"f"(v)); +} + +template <> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(float v, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode) +{ + asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"f"(v)); +} + // ----------------------------- F32 ----------------------------------------- // Unary -- cgit v1.2.3