summaryrefslogtreecommitdiffstats
path: root/prelude
diff options
context:
space:
mode:
Diffstat (limited to 'prelude')
-rw-r--r--prelude/slang-cuda-prelude.h1077
1 files changed, 388 insertions, 689 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 6c68cdb71..2c8faf922 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -1323,419 +1323,237 @@ SLANG_SURFACE_READ_HALF_CONVERT(surf3Dread, (int x, int y, int z), (x, y, z))
// For the _convert versions they are *not*. They don't need to be because sust.p does not require
// it.
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(
- T,
- cudaSurfaceObject_t surfObj,
- int x,
- cudaSurfaceBoundaryMode boundaryMode);
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(
- T,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- cudaSurfaceBoundaryMode boundaryMode);
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert(
- T,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- int z,
- cudaSurfaceBoundaryMode boundaryMode);
-
// https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust
-// Float
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float>(
- float v,
- cudaSurfaceObject_t surfObj,
- int x,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- asm volatile(
- "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};}\n\t" ::"l"(surfObj),
- "r"(x),
- "f"(v));
-}
+// surf1Dwrite_convert
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float>(
- float v,
// Primary template for formatted 1D surface writes. It is only declared here (no body);
// the specializations for float / uint / int and their 2- and 4-component vector forms
// are generated by SLANG_SURF1DWRITE_CONVERT_IMPL further down.
//   v            - value to store
//   surfObj      - surface object handle
//   x            - coordinate operand handed to the sust instruction
//   boundaryMode - part of the signature; the generated specializations take the boundary
//                  suffix from SLANG_PTX_BOUNDARY_MODE instead of reading this argument
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(
+ T v,
cudaSurfaceObject_t surfObj,
int x,
- int y,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- asm volatile(
- "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "f"(v));
-}
+ cudaSurfaceBoundaryMode boundaryMode);
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float>(
- float v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- int z,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- asm volatile(
- "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(z),
- "r"(0),
- "f"(v));
-}
// SLANG_SURF1DWRITE_CONVERT_IMPL(T, c)
// Expands to three explicit specializations of surf1Dwrite_convert: for T, T##2 and T##4.
//   T - scalar element type; the vector type names are formed by pasting 2 / 4 onto it.
//   c - inline-asm constraint string applied to every value operand (the instantiations
//       below pass "f" for float and "r" for uint / int).
// Each specialization issues a single `sust.p.1d[.v2|.v4].b32` store. Operand %0 is the
// surface handle ("l"), %1 is x ("r"), and the remaining operands are the value components.
// The boundary suffix comes from the SLANG_PTX_BOUNDARY_MODE string macro; the
// `boundaryMode` argument is not referenced in the generated bodies.
// The vector forms copy the components into local scalars and bind each local to its own
// asm operand.
+#define SLANG_SURF1DWRITE_CONVERT_IMPL(T, c) \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<T>( \
+ T v, \
+ cudaSurfaceObject_t surfObj, \
+ int x, \
+ cudaSurfaceBoundaryMode boundaryMode) \
+ { \
+ asm volatile( \
+ "sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};" ::"l"(surfObj), \
+ "r"(x), \
+ c(v)); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<T##2>( \
+ T##2 v, \
+ cudaSurfaceObject_t surfObj, \
+ int x, \
+ cudaSurfaceBoundaryMode boundaryMode) \
+ { \
+ const T vx = v.x, vy = v.y; \
+ asm volatile( \
+ "sust.p.1d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2, %3};" ::"l"(surfObj), \
+ "r"(x), \
+ c(vx), \
+ c(vy)); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<T##4>( \
+ T##4 v, \
+ cudaSurfaceObject_t surfObj, \
+ int x, \
+ cudaSurfaceBoundaryMode boundaryMode) \
+ { \
+ const T vx = v.x, vy = v.y, vz = v.z, vw = v.w; \
+ asm volatile( \
+ "sust.p.1d.v4.b32." SLANG_PTX_BOUNDARY_MODE \
+ " [%0, {%1}], {%2, %3, %4, %5};" ::"l"(surfObj), \
+ "r"(x), \
+ c(vx), \
+ c(vy), \
+ c(vz), \
+ c(vw)); \
+ }
-// Float2
+SLANG_SURF1DWRITE_CONVERT_IMPL(float, "f")
+SLANG_SURF1DWRITE_CONVERT_IMPL(uint, "r")
+SLANG_SURF1DWRITE_CONVERT_IMPL(int, "r")
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float2>(
- float2 v,
- cudaSurfaceObject_t surfObj,
- int x,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const float vx = v.x, vy = v.y;
- asm volatile(
- "{sust.p.1d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3};}\n\t" ::"l"(surfObj),
- "r"(x),
- "f"(vx),
- "f"(vy));
-}
+// surf1DLayeredwrite_convert (not supported)
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float2>(
- float2 v,
+// Placeholder for formatted writes to 1D layered (array) surfaces, which are not
+// supported: instantiating this template for any T is a compile-time error.
+// v - value that would be stored
+// surfObj - surface object handle
+// x - coordinate within the layer
+// layer - array layer index
+// boundaryMode - boundary handling mode (unused)
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1DLayeredwrite_convert(
+ T v,
cudaSurfaceObject_t surfObj,
int x,
- int y,
+ int layer,
cudaSurfaceBoundaryMode boundaryMode)
{
- const float vx = v.x, vy = v.y;
- asm volatile(
- "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "f"(vx),
- "f"(vy));
+ // The condition depends on T, so it is only evaluated when the template is actually
+ // instantiated. A plain `static_assert(false, ...)` is not dependent and may be
+ // rejected when the header is parsed by compilers that predate the CWG 2518 change.
+ static_assert(
+ sizeof(T) == 0,
+ "CUDA doesn't support formatted surface writes on 1D array surfaces");
}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float2>(
- float2 v,
+// surf2Dwrite_convert
+
// Primary template for formatted 2D surface writes. It is only declared here (no body);
// the specializations for float / uint / int and their 2- and 4-component vector forms
// are generated by SLANG_SURF2DWRITE_CONVERT_IMPL further down.
//   v            - value to store
//   surfObj      - surface object handle
//   x, y         - coordinate operands handed to the sust instruction
//   boundaryMode - part of the signature; the generated specializations take the boundary
//                  suffix from SLANG_PTX_BOUNDARY_MODE instead of reading this argument
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(
+ T v,
cudaSurfaceObject_t surfObj,
int x,
int y,
- int z,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const float vx = v.x, vy = v.y;
- asm volatile(
- "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
- " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(z),
- "r"(0),
- "f"(vx),
- "f"(vy));
-}
+ cudaSurfaceBoundaryMode boundaryMode);
-// Float4
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float4>(
- float4 v,
- cudaSurfaceObject_t surfObj,
- int x,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
- asm volatile(
- "{sust.p.1d.v4.b32." SLANG_PTX_BOUNDARY_MODE
- " [%0, {%1}], {%2,%3,%4,%5};}\n\t" ::"l"(surfObj),
- "r"(x),
- "f"(vx),
- "f"(vy),
- "f"(vz),
- "f"(vw));
-}
// SLANG_SURF2DWRITE_CONVERT_IMPL(T, c)
// Expands to three explicit specializations of surf2Dwrite_convert: for T, T##2 and T##4.
//   T - scalar element type; the vector type names are formed by pasting 2 / 4 onto it.
//   c - inline-asm constraint string applied to every value operand (the instantiations
//       below pass "f" for float and "r" for uint / int).
// Each specialization issues a single `sust.p.2d[.v2|.v4].b32` store. Operand %0 is the
// surface handle ("l"), %1 / %2 are x / y ("r"), and the remaining operands are the value
// components. The boundary suffix comes from the SLANG_PTX_BOUNDARY_MODE string macro; the
// `boundaryMode` argument is not referenced in the generated bodies.
// The vector forms copy the components into local scalars and bind each local to its own
// asm operand.
+#define SLANG_SURF2DWRITE_CONVERT_IMPL(T, c) \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<T>( \
+ T v, \
+ cudaSurfaceObject_t surfObj, \
+ int x, \
+ int y, \
+ cudaSurfaceBoundaryMode boundaryMode) \
+ { \
+ asm volatile( \
+ "sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1, %2}], {%3};" ::"l"(surfObj), \
+ "r"(x), \
+ "r"(y), \
+ c(v)); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<T##2>( \
+ T##2 v, \
+ cudaSurfaceObject_t surfObj, \
+ int x, \
+ int y, \
+ cudaSurfaceBoundaryMode boundaryMode) \
+ { \
+ const T vx = v.x, vy = v.y; \
+ asm volatile( \
+ "sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE \
+ " [%0, {%1, %2}], {%3, %4};" ::"l"(surfObj), \
+ "r"(x), \
+ "r"(y), \
+ c(vx), \
+ c(vy)); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<T##4>( \
+ T##4 v, \
+ cudaSurfaceObject_t surfObj, \
+ int x, \
+ int y, \
+ cudaSurfaceBoundaryMode boundaryMode) \
+ { \
+ const T vx = v.x, vy = v.y, vz = v.z, vw = v.w; \
+ asm volatile( \
+ "sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE \
+ " [%0, {%1, %2}], {%3, %4, %5, %6};" ::"l"(surfObj), \
+ "r"(x), \
+ "r"(y), \
+ c(vx), \
+ c(vy), \
+ c(vz), \
+ c(vw)); \
+ }
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float4>(
- float4 v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
- asm volatile(
- "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
- " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "f"(vx),
- "f"(vy),
- "f"(vz),
- "f"(vw));
-}
+SLANG_SURF2DWRITE_CONVERT_IMPL(float, "f")
+SLANG_SURF2DWRITE_CONVERT_IMPL(uint, "r")
+SLANG_SURF2DWRITE_CONVERT_IMPL(int, "r")
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float4>(
- float4 v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- int z,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
- asm volatile(
- "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
- " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(z),
- "r"(0),
- "f"(vx),
- "f"(vy),
- "f"(vz),
- "f"(vw));
-}
+// surf2DLayeredwrite_convert (not supported)
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint>(
- uint v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- asm volatile(
- "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(v));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint>(
- uint v,
+// Placeholder for formatted writes to 2D layered (array) surfaces, which are not
+// supported: instantiating this template for any T is a compile-time error.
+// v - value that would be stored
+// surfObj - surface object handle
+// x, y - coordinates within the layer
+// layer - array layer index
+// boundaryMode - boundary handling mode (unused)
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2DLayeredwrite_convert(
+ T v,
cudaSurfaceObject_t surfObj,
int x,
int y,
- int z,
+ int layer,
cudaSurfaceBoundaryMode boundaryMode)
{
- asm volatile(
- "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(z),
- "r"(0),
- "r"(v));
+ // The condition depends on T, so it is only evaluated when the template is actually
+ // instantiated. A plain `static_assert(false, ...)` is not dependent and may be
+ // rejected when the header is parsed by compilers that predate the CWG 2518 change.
+ static_assert(
+ sizeof(T) == 0,
+ "CUDA doesn't support formatted surface writes on 2D array surfaces");
}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint2>(
- uint2 v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const uint vx = v.x, vy = v.y;
- asm volatile(
- "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(vx),
- "r"(vy));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint2>(
- uint2 v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- int z,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const uint vx = v.x, vy = v.y;
- asm volatile(
- "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
- " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(z),
- "r"(0),
- "r"(vx),
- "r"(vy));
-}
+// surf3Dwrite_convert
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<uint4>(
- uint4 v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const uint vx = v.x, vy = v.y, vz = v.z, vw = v.w;
- asm volatile(
- "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
- " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(vx),
- "r"(vy),
- "r"(vz),
- "r"(vw));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<uint4>(
- uint4 v,
// Primary template for formatted 3D surface writes. It is only declared here (no body);
// the specializations for float / uint / int and their 2- and 4-component vector forms
// are generated by SLANG_SURF3DWRITE_CONVERT_IMPL further down.
//   v            - value to store
//   surfObj      - surface object handle
//   x, y, z      - coordinate operands handed to the sust instruction
//   boundaryMode - part of the signature; the generated specializations take the boundary
//                  suffix from SLANG_PTX_BOUNDARY_MODE instead of reading this argument
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert(
+ T v,
cudaSurfaceObject_t surfObj,
int x,
int y,
int z,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const uint vx = v.x, vy = v.y, vz = v.z, vw = v.w;
- asm volatile(
- "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
- " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(z),
- "r"(0),
- "r"(vx),
- "r"(vy),
- "r"(vz),
- "r"(vw));
-}
+ cudaSurfaceBoundaryMode boundaryMode);
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int>(
- int v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- asm volatile(
- "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(v));
-}
-// Int2
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int2>(
- int2 v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const int vx = v.x, vy = v.y;
- asm volatile(
- "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(vx),
- "r"(vy));
-}
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<int4>(
- int4 v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const int vx = v.x, vy = v.y, vz = v.z, vw = v.w;
- asm volatile(
- "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
- " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(vx),
- "r"(vy),
- "r"(vz),
- "r"(vw));
-}
// SLANG_SURF3DWRITE_CONVERT_IMPL(T, c)
// Expands to three explicit specializations of surf3Dwrite_convert: for T, T##2 and T##4.
//   T - scalar element type; the vector type names are formed by pasting 2 / 4 onto it.
//   c - inline-asm constraint string applied to every value operand (the instantiations
//       below pass "f" for float and "r" for uint / int).
// Each specialization issues a single `sust.p.3d[.v2|.v4].b32` store. Operand %0 is the
// surface handle ("l"); the coordinate vector has four slots, filled with x, y, z and a
// literal 0 for the fourth slot (%1..%4); the remaining operands are the value components.
// The boundary suffix comes from the SLANG_PTX_BOUNDARY_MODE string macro; the
// `boundaryMode` argument is not referenced in the generated bodies.
// The vector forms copy the components into local scalars and bind each local to its own
// asm operand.
+#define SLANG_SURF3DWRITE_CONVERT_IMPL(T, c) \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<T>( \
+ T v, \
+ cudaSurfaceObject_t surfObj, \
+ int x, \
+ int y, \
+ int z, \
+ cudaSurfaceBoundaryMode boundaryMode) \
+ { \
+ asm volatile( \
+ "sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE \
+ " [%0, {%1, %2, %3, %4}], {%5};" ::"l"(surfObj), \
+ "r"(x), \
+ "r"(y), \
+ "r"(z), \
+ "r"(0), \
+ c(v)); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<T##2>( \
+ T##2 v, \
+ cudaSurfaceObject_t surfObj, \
+ int x, \
+ int y, \
+ int z, \
+ cudaSurfaceBoundaryMode boundaryMode) \
+ { \
+ const T vx = v.x, vy = v.y; \
+ asm volatile( \
+ "sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE \
+ " [%0, {%1, %2, %3, %4}], {%5, %6};" ::"l"(surfObj), \
+ "r"(x), \
+ "r"(y), \
+ "r"(z), \
+ "r"(0), \
+ c(vx), \
+ c(vy)); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<T##4>( \
+ T##4 v, \
+ cudaSurfaceObject_t surfObj, \
+ int x, \
+ int y, \
+ int z, \
+ cudaSurfaceBoundaryMode boundaryMode) \
+ { \
+ const T vx = v.x, vy = v.y, vz = v.z, vw = v.w; \
+ asm volatile( \
+ "sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE \
+ " [%0, {%1, %2, %3, %4}], {%5, %6, %7, %8};" ::"l"(surfObj), \
+ "r"(x), \
+ "r"(y), \
+ "r"(z), \
+ "r"(0), \
+ c(vx), \
+ c(vy), \
+ c(vz), \
+ c(vw)); \
+ }
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int>(
- int v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- int z,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- asm volatile(
- "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%4}], {%5};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(z),
- "r"(0),
- "r"(v));
-}
-// Int2
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int2>(
- int2 v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- int z,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const int vx = v.x, vy = v.y;
- asm volatile(
- "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE
- " [%0, {%1,%2,%3,%4}], {%5,%6};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(z),
- "r"(0),
- "r"(vx),
- "r"(vy));
-}
-// Int4
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<int4>(
- int4 v,
- cudaSurfaceObject_t surfObj,
- int x,
- int y,
- int z,
- cudaSurfaceBoundaryMode boundaryMode)
-{
- const int vx = v.x, vy = v.y, vz = v.z, vw = v.w;
- asm volatile(
- "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
- " [%0, {%1,%2,%3,%4}], {%5,%6,%7,%8};}\n\t" ::"l"(surfObj),
- "r"(x),
- "r"(y),
- "r"(z),
- "r"(0),
- "r"(vx),
- "r"(vy),
- "r"(vz),
- "r"(vw));
-}
+SLANG_SURF3DWRITE_CONVERT_IMPL(float, "f")
+SLANG_SURF3DWRITE_CONVERT_IMPL(uint, "r")
+SLANG_SURF3DWRITE_CONVERT_IMPL(int, "r")
// ----------------------------- F32 -----------------------------------------
@@ -4413,337 +4231,218 @@ struct TensorView
};
// Implementations for texture fetch/load functions using tex PTX intrinsics
-// These are used for read-only texture access with integer coordinates
-// See #6781 for details.
+// These are used for read-only texture access with integer coordinates.
-// 1D is not supported via PTX. Keeping this placeholder in case it ever gets
-// supported.
+// 1D is not supported via PTX. Keeping the implementation below in case it ever gets supported.
template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x)
-{
- T result;
- float stub;
- asm("tex.1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5}];"
- : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
- : "l"(texObj), "r"(x));
- return result;
-}
-
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
- T result;
- float stub;
- asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
- : "l"(texObj), "r"(x), "r"(y));
- return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
- float result_x, result_y;
- float stub;
- asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub)
- : "l"(texObj), "r"(x), "r"(y));
- return make_float2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
- float result_x, result_y, result_z, result_w;
- asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w)
- : "l"(texObj), "r"(x), "r"(y));
- return make_float4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
- uint result;
- uint stub;
- asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y));
- return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
- uint result_x, result_y;
- uint stub;
- asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y));
- return make_uint2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
- uint result_x, result_y, result_z, result_w;
- asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
- : "l"(texObj), "r"(x), "r"(y));
- return make_uint4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
- int result;
- int stub;
- asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y));
- return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int2 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
- int result_x, result_y;
- int stub;
- asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y));
- return make_int2(result_x, result_y);
-}
+// Placeholder for integer-coordinate fetches from 1D textures, which are not supported:
+// instantiating this template for any T is a compile-time error.
+// texObj - texture object handle
+// x - integer texel coordinate
+// mip - mip level
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x, int mip)
+{
+ // The condition depends on T, so it is only evaluated when the template is actually
+ // instantiated. A plain `static_assert(false, ...)` is not dependent and may be
+ // rejected when the header is parsed by compilers that predate the CWG 2518 change.
+ static_assert(sizeof(T) == 0, "CUDA does not support fetching from 1D textures");
+}
+
+#if 0
// SLANG_TEX1DFETCH_INT_IMPL(T, dtype, c)
// NOTE: in this listing the definition and its instantiations sit between `#if 0` and
// `#endif`, so they are compiled out.
// Expands to explicit specializations of tex1Dfetch_int returning T, T##2 and T##4.
//   T     - scalar result type (also used to build the make_<T>2 / make_<T>4 names).
//   dtype - PTX destination type suffix pasted into the instruction string
//           ("f32", "u32" or "s32" in the instantiations below).
//   c     - inline-asm output constraint for the result operands ("=f" or "=r").
// Every variant issues `tex.level.1d.v4.<dtype>.s32`: a four-component fetch with the
// .s32 coordinate {x} and `mip` as the trailing operand (%6). The scalar and two-component
// variants bind the components they do not return to a throwaway `stub`.
+#define SLANG_TEX1DFETCH_INT_IMPL(T, dtype, c) \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x, int mip) \
+ { \
+ T result; \
+ T stub; \
+ asm("tex.level.1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5}], %6;" \
+ : c(result), c(stub), c(stub), c(stub) \
+ : "l"(texObj), "r"(x), "r"(mip)); \
+ return result; \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 tex1Dfetch_int(CUtexObject texObj, int x, int mip) \
+ { \
+ T result_x, result_y; \
+ T stub; \
+ asm("tex.level.1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5}], %6;" \
+ : c(result_x), c(result_y), c(stub), c(stub) \
+ : "l"(texObj), "r"(x), "r"(mip)); \
+ return make_##T##2(result_x, result_y); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 tex1Dfetch_int(CUtexObject texObj, int x, int mip) \
+ { \
+ T result_x, result_y, result_z, result_w; \
+ asm("tex.level.1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5}], %6;" \
+ : c(result_x), c(result_y), c(result_z), c(result_w) \
+ : "l"(texObj), "r"(x), "r"(mip)); \
+ return make_##T##4(result_x, result_y, result_z, result_w); \
+ }
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int4 tex2Dfetch_int(CUtexObject texObj, int x, int y)
-{
- int result_x, result_y, result_z, result_w;
- asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
- : "l"(texObj), "r"(x), "r"(y));
- return make_int4(result_x, result_y, result_z, result_w);
-}
+SLANG_TEX1DFETCH_INT_IMPL(float, "f32", "=f")
+SLANG_TEX1DFETCH_INT_IMPL(uint, "u32", "=r")
+SLANG_TEX1DFETCH_INT_IMPL(int, "s32", "=r")
+#endif
template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
- T result;
- float stub;
- asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
- // Note: The repeated z is a stub used as the fourth operand in ptx.
- // From the docs:
- // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
- // Operand c is a scalar or singleton tuple for 1d textures; is a two-element vector for 2d
- // textures; and is a four-element vector for 3d textures.
- return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
- float result_x, result_y;
- float stub;
- asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
- return make_float2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
- float result_x, result_y, result_z, result_w;
- asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w)
- : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
- return make_float4(result_x, result_y, result_z, result_w);
-}
// Primary template declaration (no body) for fetching from a 2D texture with integer
// coordinates (x, y) and mip level `mip`. The specializations for float / uint / int and
// their 2- and 4-component vector forms are generated by SLANG_TEX2DFETCH_INT_IMPL.
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip);
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
- uint result;
- uint stub;
- asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
- return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
- uint result_x, result_y;
- uint stub;
- asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
- return make_uint2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
- uint result_x, result_y, result_z, result_w;
- asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
- : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
- return make_uint4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
- int result;
- int stub;
- asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
- return result;
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
- int result_x, result_y;
- int stub;
- asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
- return make_int2(result_x, result_y);
-}
// SLANG_TEX2DFETCH_INT_IMPL(T, dtype, c)
// Expands to explicit specializations of tex2Dfetch_int returning T, T##2 and T##4.
//   T     - scalar result type (also used to build the make_<T>2 / make_<T>4 names).
//   dtype - PTX destination type suffix pasted into the instruction string
//           ("f32", "u32" or "s32" in the instantiations below).
//   c     - inline-asm output constraint for the result operands ("=f" or "=r").
// Every variant issues `tex.level.2d.v4.<dtype>.s32`: a four-component fetch with .s32
// coordinates {x, y} and `mip` as the trailing operand (%7). The scalar and two-component
// variants bind the components they do not return to a throwaway `stub`.
+#define SLANG_TEX2DFETCH_INT_IMPL(T, dtype, c) \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip) \
+ { \
+ T result; \
+ T stub; \
+ asm("tex.level.2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+ : c(result), c(stub), c(stub), c(stub) \
+ : "l"(texObj), "r"(x), "r"(y), "r"(mip)); \
+ return result; \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL \
+ T##2 tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip) \
+ { \
+ T result_x, result_y; \
+ T stub; \
+ asm("tex.level.2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+ : c(result_x), c(result_y), c(stub), c(stub) \
+ : "l"(texObj), "r"(x), "r"(y), "r"(mip)); \
+ return make_##T##2(result_x, result_y); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL \
+ T##4 tex2Dfetch_int(CUtexObject texObj, int x, int y, int mip) \
+ { \
+ T result_x, result_y, result_z, result_w; \
+ asm("tex.level.2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+ : c(result_x), c(result_y), c(result_z), c(result_w) \
+ : "l"(texObj), "r"(x), "r"(y), "r"(mip)); \
+ return make_##T##4(result_x, result_y, result_z, result_w); \
+ }
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
-{
- int result_x, result_y, result_z, result_w;
- asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
- : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
- return make_int4(result_x, result_y, result_z, result_w);
-}
+SLANG_TEX2DFETCH_INT_IMPL(float, "f32", "=f")
+SLANG_TEX2DFETCH_INT_IMPL(uint, "u32", "=r")
+SLANG_TEX2DFETCH_INT_IMPL(int, "s32", "=r")
-template<typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1DArrayfetch_int(CUtexObject texObj, int x, int layer)
-{
- T result;
- float stub;
- asm("tex.a1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
- : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
- : "l"(texObj), "r"(x), "r"(layer));
- return result;
-}
// Primary template declaration (no body) for fetching from a 3D texture with integer
// coordinates (x, y, z) and mip level `mip`. The specializations for float / uint / int
// and their 2- and 4-component vector forms are generated by SLANG_TEX3DFETCH_INT_IMPL.
template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
- T result;
- float stub;
- asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=f"(result), "=f"(stub), "=f"(stub), "=f"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
- return result;
-}
+tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip);
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
- float result_x, result_y;
- float stub;
- asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=f"(result_x), "=f"(result_y), "=f"(stub), "=f"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
- return make_float2(result_x, result_y);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
- float result_x, result_y, result_z, result_w;
- asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=f"(result_x), "=f"(result_y), "=f"(result_z), "=f"(result_w)
- : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
- return make_float4(result_x, result_y, result_z, result_w);
-}
-
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
- uint result;
- uint stub;
- asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
- return result;
-}
// SLANG_TEX3DFETCH_INT_IMPL(T, dtype, c)
// Expands to explicit specializations of tex3Dfetch_int returning T, T##2 and T##4.
//   T     - scalar result type (also used to build the make_<T>2 / make_<T>4 names).
//   dtype - PTX destination type suffix pasted into the instruction string
//           ("f32", "u32" or "s32" in the instantiations below).
//   c     - inline-asm output constraint for the result operands ("=f" or "=r").
// Every variant issues `tex.level.3d.v4.<dtype>.s32`: a four-component fetch whose
// coordinate vector has four .s32 slots, filled with x, y, z and z again (the fourth slot
// is marked as ignored), followed by `mip` as the trailing operand (%9). The scalar and
// two-component variants bind the components they do not return to a throwaway `stub`.
+#define SLANG_TEX3DFETCH_INT_IMPL(T, dtype, c) \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL T \
+ tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip) \
+ { \
+ T result; \
+ T stub; \
+ asm("tex.level.3d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+ : c(result), c(stub), c(stub), c(stub) \
+ : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z) /* ignored */, "r"(mip)); \
+ return result; \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL \
+ T##2 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip) \
+ { \
+ T result_x, result_y; \
+ T stub; \
+ asm("tex.level.3d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+ : c(result_x), c(result_y), c(stub), c(stub) \
+ : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z) /* ignored */, "r"(mip)); \
+ return make_##T##2(result_x, result_y); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL \
+ T##4 tex3Dfetch_int(CUtexObject texObj, int x, int y, int z, int mip) \
+ { \
+ T result_x, result_y, result_z, result_w; \
+ asm("tex.level.3d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+ : c(result_x), c(result_y), c(result_z), c(result_w) \
+ : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z) /* ignored */, "r"(mip)); \
+ return make_##T##4(result_x, result_y, result_z, result_w); \
+ }
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint2
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
- uint result_x, result_y;
- uint stub;
- asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
- return make_uint2(result_x, result_y);
-}
+SLANG_TEX3DFETCH_INT_IMPL(float, "f32", "=f")
+SLANG_TEX3DFETCH_INT_IMPL(uint, "u32", "=r")
+SLANG_TEX3DFETCH_INT_IMPL(int, "s32", "=r")
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint4
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
- uint result_x, result_y, result_z, result_w;
- asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
- : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
- return make_uint4(result_x, result_y, result_z, result_w);
-}
// Primary template declaration (no body) for fetching from a 1D texture array with integer
// coordinate x, array layer `layer` and mip level `mip`. The specializations for
// float / uint / int and their 2- and 4-component vector forms are generated by
// SLANG_TEX1DARRAYFETCH_INT_IMPL.
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T
+tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip);
+
// SLANG_TEX1DARRAYFETCH_INT_IMPL(T, dtype, c)
// Expands to explicit specializations of tex1DArrayfetch_int returning T, T##2 and T##4.
//   T     - scalar result type (also used to build the make_<T>2 / make_<T>4 names).
//   dtype - PTX destination type suffix pasted into the instruction string
//           ("f32", "u32" or "s32" in the instantiations below).
//   c     - inline-asm output constraint for the result operands ("=f" or "=r").
// Every variant issues `tex.level.a1d.v4.<dtype>.s32`: a four-component fetch whose
// coordinate vector is passed as {layer, x} (note: layer first, unlike the C++ parameter
// order), followed by `mip` as the trailing operand (%7). The scalar and two-component
// variants bind the components they do not return to a throwaway `stub`.
+#define SLANG_TEX1DARRAYFETCH_INT_IMPL(T, dtype, c) \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL T \
+ tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip) \
+ { \
+ T result; \
+ T stub; \
+ asm("tex.level.a1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+ : c(result), c(stub), c(stub), c(stub) \
+ : "l"(texObj), "r"(layer), "r"(x), "r"(mip)); \
+ return result; \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL \
+ T##2 tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip) \
+ { \
+ T result_x, result_y; \
+ T stub; \
+ asm("tex.level.a1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+ : c(result_x), c(result_y), c(stub), c(stub) \
+ : "l"(texObj), "r"(layer), "r"(x), "r"(mip)); \
+ return make_##T##2(result_x, result_y); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL \
+ T##4 tex1DArrayfetch_int(CUtexObject texObj, int x, int layer, int mip) \
+ { \
+ T result_x, result_y, result_z, result_w; \
+ asm("tex.level.a1d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6}], %7;" \
+ : c(result_x), c(result_y), c(result_z), c(result_w) \
+ : "l"(texObj), "r"(layer), "r"(x), "r"(mip)); \
+ return make_##T##4(result_x, result_y, result_z, result_w); \
+ }
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int tex2DArrayfetch_int(
- CUtexObject texObj,
- int x,
- int y,
- int layer)
-{
- int result;
- int stub;
- asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result), "=r"(stub), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
- return result;
-}
+SLANG_TEX1DARRAYFETCH_INT_IMPL(float, "f32", "=f")
+SLANG_TEX1DARRAYFETCH_INT_IMPL(uint, "u32", "=r")
+SLANG_TEX1DARRAYFETCH_INT_IMPL(int, "s32", "=r")
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int2
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
- int result_x, result_y;
- int stub;
- asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result_x), "=r"(result_y), "=r"(stub), "=r"(stub)
- : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
- return make_int2(result_x, result_y);
-}
// Primary template declaration (no body) for fetching from a 2D texture array with integer
// coordinates (x, y), array layer `layer` and mip level `mip`. The specializations for
// float / uint / int and their 2- and 4-component vector forms are generated by
// SLANG_TEX2DARRAYFETCH_INT_IMPL.
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T
+tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip);
+
// SLANG_TEX2DARRAYFETCH_INT_IMPL(T, dtype, c)
// Expands to explicit specializations of tex2DArrayfetch_int returning T, T##2 and T##4.
//   T     - scalar result type (also used to build the make_<T>2 / make_<T>4 names).
//   dtype - PTX destination type suffix pasted into the instruction string
//           ("f32", "u32" or "s32" in the instantiations below).
//   c     - inline-asm output constraint for the result operands ("=f" or "=r").
// Every variant issues `tex.level.a2d.v4.<dtype>.s32`: a four-component fetch whose
// coordinate vector has four .s32 slots, passed as {layer, x, y, layer} (layer first; the
// fourth slot is marked as ignored), followed by `mip` as the trailing operand (%9). The
// scalar and two-component variants bind the components they do not return to a throwaway
// `stub`.
+#define SLANG_TEX2DARRAYFETCH_INT_IMPL(T, dtype, c) \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL T \
+ tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip) \
+ { \
+ T result; \
+ T stub; \
+ asm("tex.level.a2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+ : c(result), c(stub), c(stub), c(stub) \
+ : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(layer) /* ignored */, "r"(mip)); \
+ return result; \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL \
+ T##2 tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip) \
+ { \
+ T result_x, result_y; \
+ T stub; \
+ asm("tex.level.a2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+ : c(result_x), c(result_y), c(stub), c(stub) \
+ : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(layer) /* ignored */, "r"(mip)); \
+ return make_##T##2(result_x, result_y); \
+ } \
+ template<> \
+ SLANG_FORCE_INLINE SLANG_CUDA_CALL \
+ T##4 tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer, int mip) \
+ { \
+ T result_x, result_y, result_z, result_w; \
+ asm("tex.level.a2d.v4." dtype ".s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;" \
+ : c(result_x), c(result_y), c(result_z), c(result_w) \
+ : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(layer) /* ignored */, "r"(mip)); \
+ return make_##T##4(result_x, result_y, result_z, result_w); \
+ }
-template<>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int4
-tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
-{
- int result_x, result_y, result_z, result_w;
- asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
- : "=r"(result_x), "=r"(result_y), "=r"(result_z), "=r"(result_w)
- : "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
- return make_int4(result_x, result_y, result_z, result_w);
-}
+SLANG_TEX2DARRAYFETCH_INT_IMPL(float, "f32", "=f")
+SLANG_TEX2DARRAYFETCH_INT_IMPL(uint, "u32", "=r")
+SLANG_TEX2DARRAYFETCH_INT_IMPL(int, "s32", "=r")
// Wave rotate helper functions - templated approach
#define SLANG_WARP_FULL_MASK 0xFFFFFFFF