diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-02-12 09:15:47 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-02-12 09:15:47 -0500 |
| commit | fe9d27af9de047ea75db1334c961bb025fb732f6 (patch) | |
| tree | 3d7a3e1e693f0fe66930db87a5833eaea993620a | |
| parent | 9b3e768bceae562deeb330067f3ef5febc2e5244 (diff) | |
CUDA barrier/atomic support (#1218)
* * Improved fastRemoveAt
* Fixed off by one bug
* Fixed const safeness with List<>
* Made List begin and end const safe.
* Revert to previous RefPtr usage.
* Fix bug with casting.
* Tabs -> spaces.
Small fixes/improvements to List.
* Improve comment on List.
* Group shared/atomic test works on CUDA.
* * Enabled CUDA tests for atomics tests
* Enabled DX12 test for atomics-buffer.slang
Not clear just yet how to implement that for CUDA - it will work with StructuredBuffer.
* hasContent -> isNonEmpty
* Remove unneeded comment.
| -rw-r--r-- | source/slang/hlsl.meta.slang | 38 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang.h | 40 | ||||
| -rw-r--r-- | source/slang/slang-emit-cuda.cpp | 16 | ||||
| -rw-r--r-- | tests/compute/atomics-buffer.slang | 8 | ||||
| -rw-r--r-- | tests/compute/atomics-groupshared.slang | 3 | ||||
| -rw-r--r-- | tests/compute/atomics.slang | 3 | ||||
| -rw-r--r-- | tests/compute/groupshared.slang | 6 |
7 files changed, 98 insertions, 16 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index a646c1b77..669fbb440 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -347,10 +347,13 @@ __generic<T : __BuiltinType, let N : int, let M : int> bool all(matrix<T,N,M> x) // Barrier for writes to all memory spaces (HLSL SM 5.0) __target_intrinsic(glsl, "memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()") +// TODO(JS): Doesn't seem to be weaker form of sync, so use this? +__target_intrinsic(cuda, "__syncthreads()") void AllMemoryBarrier(); // Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0) __target_intrinsic(glsl, "memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()") +__target_intrinsic(cuda, "__syncthreads()") void AllMemoryBarrierWithGroupSync(); // Test if any components is non-zero (HLSL SM 1.0) @@ -648,6 +651,7 @@ __target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBu void DeviceMemoryBarrier(); __target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()") +__target_intrinsic(glsl, "__syncthreads()") void DeviceMemoryBarrierWithGroupSync(); // Vector distance @@ -812,99 +816,131 @@ float2 GetRenderTargetSamplePosition(int Index); __target_intrinsic(glsl, "groupMemoryBarrier") void GroupMemoryBarrier(); + __target_intrinsic(glsl, "groupMemoryBarrier(), barrier()") +__target_intrinsic(cuda, "__syncthreads()") void GroupMemoryBarrierWithGroupSync(); // Atomics __target_intrinsic(glsl, "$atomicAdd($A, $1)") +__target_intrinsic(cuda, "atomicAdd(&$0, $1)") void InterlockedAdd(__ref int dest, int value); __target_intrinsic(glsl, "$atomicAdd($A, $1)") +__target_intrinsic(cuda, "atomicAdd((uint*)&$0, $1)") void InterlockedAdd(__ref uint dest, uint value); __target_intrinsic(glsl, "($2 = $atomicAdd($A, $1))") +__target_intrinsic(cuda, "($2 = atomicAdd(&$0, $1))") void InterlockedAdd(__ref int dest, int value, out int original_value); __target_intrinsic(glsl, "($2 = $atomicAdd($A, $1))") +__target_intrinsic(cuda, "($2 = (uint)atomicAdd((uint*)&$0, $1))") void InterlockedAdd(__ref uint dest, uint value, out uint original_value); __target_intrinsic(glsl, "$atomicAnd($A, $1)") +__target_intrinsic(cuda, "atomicAnd(&$0, $1)") void InterlockedAnd(__ref int dest, int value); __target_intrinsic(glsl, "$atomicAnd($A, $1)") +__target_intrinsic(cuda, "atomicAnd((int*)&$0, $1)") void InterlockedAnd(__ref uint dest, uint value); __target_intrinsic(glsl, "($2 = $atomicAnd($A, $1))") +__target_intrinsic(cuda, "($2 = atomicAnd(&$0, $1))") void InterlockedAnd(__ref int dest, int value, out int original_value); __target_intrinsic(glsl, "($2 = $atomicAnd($A, $1))") +__target_intrinsic(cuda, "($2 = atomicAnd((int*)&$0, $1))") void InterlockedAnd(__ref uint dest, uint value, out uint original_value); __target_intrinsic(glsl, "($3 = $atomicCompSwap($A, $1, $2))") +__target_intrinsic(cuda, "($3 = atomicCAS(&$0, $1, $2))") void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value); __target_intrinsic(glsl, "($3 = $atomicCompSwap($A, $1, $2))") +__target_intrinsic(cuda, "($3 = (uint)atomicCAS((int*)&$0, $1, $2))") void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value); __target_intrinsic(glsl, "$atomicCompSwap($A, $1, $2)") -void InterlockedCompareStore(__ref int dest, int compare_value, int value); +__target_intrinsic(cuda, "atomicCAS(&$0, $1, $2)") +void InterlockedCompareStore(__ref int dest, int compare_value, int value); __target_intrinsic(glsl, "$atomicCompSwap($A, $1, $2)") +__target_intrinsic(cuda, "atomicCAS((int*)&$0, $1, $2)") void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value); __target_intrinsic(glsl, "($2 = $atomicExchange($A, $1))") +__target_intrinsic(cuda, "($2 = atomicExch(&$0, $1))") void InterlockedExchange(__ref int dest, int value, out int original_value); __target_intrinsic(glsl, "($2 = $atomicExchange($A, $1))") +__target_intrinsic(cuda, "($2 = (uint)atomicExch((int*)&$0, $1))") void InterlockedExchange(__ref uint dest, uint value, out uint original_value); __target_intrinsic(glsl, "$atomicMax($A, $1)") +__target_intrinsic(cuda, "atomicMax(&$0, $1)") void InterlockedMax(__ref int dest, int value); __target_intrinsic(glsl, "$atomicMax($A, $1)") +__target_intrinsic(cuda, "atomicMax((int*)&$0, $1)") void InterlockedMax(__ref uint dest, uint value); __target_intrinsic(glsl, "($2 = $atomicMax($A, $1))") +__target_intrinsic(cuda, "($2 = atomicMax(&$0, $1))") void InterlockedMax(__ref int dest, int value, out int original_value); __target_intrinsic(glsl, "($2 = $atomicMax($A, $1))") +__target_intrinsic(cuda, "($2 = (uint)atomicMax((int*)&$0, $1))") void InterlockedMax(__ref uint dest, uint value, out uint original_value); __target_intrinsic(glsl, "$atomicMin($A, $1)") +__target_intrinsic(cuda, "atomicMin(&$0, $1)") void InterlockedMin(__ref int dest, int value); __target_intrinsic(glsl, "$atomicMin($A, $1)") +__target_intrinsic(cuda, "atomicMin((int*)&$0, $1)") void InterlockedMin(__ref uint dest, uint value); __target_intrinsic(glsl, "($2 = $atomicMin($A, $1))") +__target_intrinsic(cuda, "($2 = atomicMin(&$0, $1))") void InterlockedMin(__ref int dest, int value, out int original_value); __target_intrinsic(glsl, "($2 = $atomicMin($A, $1))") +__target_intrinsic(cuda, "($2 = (uint)atomicMin((int*)&$0, $1))") void InterlockedMin(__ref uint dest, uint value, out uint original_value); __target_intrinsic(glsl, "$atomicOr($A, $1)") +__target_intrinsic(cuda, "atomicOr(&$0, $1)") void InterlockedOr(__ref int dest, int value); __target_intrinsic(glsl, "$atomicOr($A, $1)") +__target_intrinsic(cuda, "atomicOr((int*)&$0, $1)") void InterlockedOr(__ref uint dest, uint value); __target_intrinsic(glsl, "($2 = $atomicOr($A, $1))") +__target_intrinsic(cuda, "($2 = atomicOr(&$0, $1))") void InterlockedOr(__ref int dest, int value, out int original_value); __target_intrinsic(glsl, "($2 = $atomicOr($A, $1))") +__target_intrinsic(cuda, "($2 = (uint)atomicOr((int*)&$0, $1))") void InterlockedOr(__ref uint dest, uint value, out uint original_value); __target_intrinsic(glsl, "$atomicXor($A, $1)") +__target_intrinsic(cuda, "atomicXor(&$0, $1)") void InterlockedXor(__ref int dest, int value); __target_intrinsic(glsl, "$atomicXor($A, $1)") +__target_intrinsic(cuda, "atomicXor((int*)&$0, $1)") void InterlockedXor(__ref uint dest, uint value); __target_intrinsic(glsl, "($2 = $atomicXor($A, $1))") +__target_intrinsic(cuda, "($2 = atomicXor(&$0, $1))") void InterlockedXor(__ref int dest, int value, out int original_value); __target_intrinsic(glsl, "($2 = $atomicXor($A, $1))") +__target_intrinsic(cuda, "($2 = (uint)atomicXor((int*)&$0, $1))") void InterlockedXor(__ref uint dest, uint value, out uint original_value); // Is floating-point value finite? diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h index b66f43103..f69d0cb3d 100644 --- a/source/slang/hlsl.meta.slang.h +++ b/source/slang/hlsl.meta.slang.h @@ -396,10 +396,13 @@ SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> bool all(matri SLANG_RAW("\n") SLANG_RAW("// Barrier for writes to all memory spaces (HLSL SM 5.0)\n") SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n") +SLANG_RAW("// TODO(JS): Doesn't seem to be weaker form of sync, so use this?\n") +SLANG_RAW("__target_intrinsic(cuda, \"__syncthreads()\")\n") SLANG_RAW("void AllMemoryBarrier();\n") SLANG_RAW("\n") SLANG_RAW("// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)\n") SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"__syncthreads()\")\n") SLANG_RAW("void AllMemoryBarrierWithGroupSync();\n") SLANG_RAW("\n") SLANG_RAW("// Test if any components is non-zero (HLSL SM 1.0)\n") @@ -724,6 +727,7 @@ SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), mem SLANG_RAW("void DeviceMemoryBarrier();\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()\")\n") +SLANG_RAW("__target_intrinsic(glsl, \"__syncthreads()\")\n") SLANG_RAW("void DeviceMemoryBarrierWithGroupSync();\n") SLANG_RAW("\n") SLANG_RAW("// Vector distance\n") @@ -888,99 +892,131 @@ SLANG_RAW("// Group memory barrier\n") SLANG_RAW("__target_intrinsic(glsl, \"groupMemoryBarrier\")\n") SLANG_RAW("void GroupMemoryBarrier();\n") SLANG_RAW("\n") +SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"groupMemoryBarrier(), barrier()\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"__syncthreads()\")\n") SLANG_RAW("void GroupMemoryBarrierWithGroupSync();\n") SLANG_RAW("\n") SLANG_RAW("// Atomics\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicAdd($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicAdd(&$0, $1)\")\n") SLANG_RAW("void InterlockedAdd(__ref int dest, int value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicAdd($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicAdd((uint*)&$0, $1)\")\n") SLANG_RAW("void InterlockedAdd(__ref uint dest, uint value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicAdd($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = atomicAdd(&$0, $1))\")\n") SLANG_RAW("void InterlockedAdd(__ref int dest, int value, out int original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicAdd($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = (uint)atomicAdd((uint*)&$0, $1))\")\n") SLANG_RAW("void InterlockedAdd(__ref uint dest, uint value, out uint original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicAnd($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicAnd(&$0, $1)\")\n") SLANG_RAW("void InterlockedAnd(__ref int dest, int value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicAnd($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicAnd((int*)&$0, $1)\")\n") SLANG_RAW("void InterlockedAnd(__ref uint dest, uint value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicAnd($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = atomicAnd(&$0, $1))\")\n") SLANG_RAW("void InterlockedAnd(__ref int dest, int value, out int original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicAnd($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = atomicAnd((int*)&$0, $1))\")\n") SLANG_RAW("void InterlockedAnd(__ref uint dest, uint value, out uint original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($3 = $atomicCompSwap($A, $1, $2))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($3 = atomicCAS(&$0, $1, $2))\")\n") SLANG_RAW("void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($3 = $atomicCompSwap($A, $1, $2))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($3 = (uint)atomicCAS((int*)&$0, $1, $2))\")\n") SLANG_RAW("void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicCompSwap($A, $1, $2)\")\n") -SLANG_RAW("void InterlockedCompareStore(__ref int dest, int compare_value, int value);\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicCAS(&$0, $1, $2)\")\n") +SLANG_RAW("void InterlockedCompareStore(__ref int dest, int compare_value, int value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicCompSwap($A, $1, $2)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicCAS((int*)&$0, $1, $2)\")\n") SLANG_RAW("void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicExchange($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = atomicExch(&$0, $1))\")\n") SLANG_RAW("void InterlockedExchange(__ref int dest, int value, out int original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicExchange($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = (uint)atomicExch((int*)&$0, $1))\")\n") SLANG_RAW("void InterlockedExchange(__ref uint dest, uint value, out uint original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicMax($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicMax(&$0, $1)\")\n") SLANG_RAW("void InterlockedMax(__ref int dest, int value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicMax($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicMax((int*)&$0, $1)\")\n") SLANG_RAW("void InterlockedMax(__ref uint dest, uint value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicMax($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = atomicMax(&$0, $1))\")\n") SLANG_RAW("void InterlockedMax(__ref int dest, int value, out int original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicMax($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = (uint)atomicMax((int*)&$0, $1))\")\n") SLANG_RAW("void InterlockedMax(__ref uint dest, uint value, out uint original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicMin($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicMin(&$0, $1)\")\n") SLANG_RAW("void InterlockedMin(__ref int dest, int value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicMin($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicMin((int*)&$0, $1)\")\n") SLANG_RAW("void InterlockedMin(__ref uint dest, uint value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicMin($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = atomicMin(&$0, $1))\")\n") SLANG_RAW("void InterlockedMin(__ref int dest, int value, out int original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicMin($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = (uint)atomicMin((int*)&$0, $1))\")\n") SLANG_RAW("void InterlockedMin(__ref uint dest, uint value, out uint original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicOr($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicOr(&$0, $1)\")\n") SLANG_RAW("void InterlockedOr(__ref int dest, int value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicOr($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicOr((int*)&$0, $1)\")\n") SLANG_RAW("void InterlockedOr(__ref uint dest, uint value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicOr($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = atomicOr(&$0, $1))\")\n") SLANG_RAW("void InterlockedOr(__ref int dest, int value, out int original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicOr($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = (uint)atomicOr((int*)&$0, $1))\")\n") SLANG_RAW("void InterlockedOr(__ref uint dest, uint value, out uint original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicXor($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicXor(&$0, $1)\")\n") SLANG_RAW("void InterlockedXor(__ref int dest, int value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"$atomicXor($A, $1)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"atomicXor((int*)&$0, $1)\")\n") SLANG_RAW("void InterlockedXor(__ref uint dest, uint value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicXor($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = atomicXor(&$0, $1))\")\n") SLANG_RAW("void InterlockedXor(__ref int dest, int value, out int original_value);\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"($2 = $atomicXor($A, $1))\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"($2 = (uint)atomicXor((int*)&$0, $1))\")\n") SLANG_RAW("void InterlockedXor(__ref uint dest, uint value, out uint original_value);\n") SLANG_RAW("\n") SLANG_RAW("// Is floating-point value finite?\n") @@ -1589,7 +1625,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa) sb << "};\n"; } -SLANG_RAW("#line 1516 \"hlsl.meta.slang\"") +SLANG_RAW("#line 1552 \"hlsl.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("\n") diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp index c628e5caf..0bbaafa5b 100644 --- a/source/slang/slang-emit-cuda.cpp +++ b/source/slang/slang-emit-cuda.cpp @@ -470,7 +470,7 @@ void CUDASourceEmitter::emitRateQualifiersImpl(IRRate* rate) { if (as<IRGroupSharedRate>(rate)) { - m_writer->emit("groupshared "); + m_writer->emit("__shared__ "); } } @@ -599,6 +599,18 @@ void CUDASourceEmitter::emitModuleImpl(IRModule* module) m_writer->emit("\n};\n\n"); } + // Output group shared variables + + { + for (auto action : actions) + { + if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalVar && as<IRGroupSharedRate>(action.inst->getRate())) + { + emitGlobalInst(action.inst); + } + } + } + // Output the 'Context' which will be used for execution { m_writer->emit("struct Context\n{\n"); @@ -614,7 +626,7 @@ void CUDASourceEmitter::emitModuleImpl(IRModule* module) // Output all the thread locals for (auto action : actions) { - if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalVar) + if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalVar && !as<IRGroupSharedRate>(action.inst->getRate())) { emitGlobalInst(action.inst); } diff --git a/tests/compute/atomics-buffer.slang b/tests/compute/atomics-buffer.slang index 32b9e7bbc..1739d4bbc 100644 --- a/tests/compute/atomics-buffer.slang +++ b/tests/compute/atomics-buffer.slang @@ -1,13 +1,9 @@ // atomics-buffer.slang //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -// Note: not enabling D3D12 test yet because change -// was developed on a machine that can run D3D12 -// -//TEST_DISABLED(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 - -//TEST_INPUT:ubuffer(format=R_UInt32, data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]):out +//TEST_INPUT:ubuffer(format=R_UInt32, data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]):out,name outputBuffer RWBuffer<uint> outputBuffer; diff --git a/tests/compute/atomics-groupshared.slang b/tests/compute/atomics-groupshared.slang index e7ebb2269..9e237bee1 100644 --- a/tests/compute/atomics-groupshared.slang +++ b/tests/compute/atomics-groupshared.slang @@ -2,8 +2,9 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cuda -//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out +//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer RWStructuredBuffer<uint> outputBuffer; diff --git a/tests/compute/atomics.slang b/tests/compute/atomics.slang index dc81e8a09..ddb5523e9 100644 --- a/tests/compute/atomics.slang +++ b/tests/compute/atomics.slang @@ -2,8 +2,9 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cuda -//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out +//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out, name outputBuffer RWStructuredBuffer<uint> outputBuffer; diff --git a/tests/compute/groupshared.slang b/tests/compute/groupshared.slang index 8712ee878..2ebf325ab 100644 --- a/tests/compute/groupshared.slang +++ b/tests/compute/groupshared.slang @@ -3,8 +3,10 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out +//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out, name=gBuffer +RWStructuredBuffer<int> gBuffer; #define THREAD_COUNT 4 @@ -26,8 +28,6 @@ int test(int val) return val; } -RWStructuredBuffer<int> gBuffer; - [numthreads(THREAD_COUNT, 1, 1)] void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) { |
