summaryrefslogtreecommitdiffstats
path: root/source/slang
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2020-08-24 15:23:40 -0400
committerGitHub <noreply@github.com>2020-08-24 15:23:40 -0400
commit4804753d4a2ec389cc6ecd759f7ea712848fddf0 (patch)
tree0ac88b3da2aae7842cb8f71f55b79412716e8a60 /source/slang
parent67ca54997d445e15891965b8d77561b9d10bb18c (diff)
RWByteAddressBuffer::InterlockedCompareExchangeU64 (#1513)
* First pass at incorporating NVAPI into test harness. * D3D12 Atomic Float Add via NVAPI working * Dx12 atomic float appears to work. * Atomic float add on Dx12. * Added atomic64 feature addition to vk. Fix correct output for atomic-float-byte-address.slang * Disable atomic float failing tests. * Upgraded VK headers. * Detect atomic float availability on VK. * Try to get test working for int64 atomic. * Made HLSL prelude controlled via the render-test requirements. * Added -enable-nvapi to premake. * Fix D3D12Renderer when NVAPI is not available. * Small improvements to VKRenderer. * Improve atomic documentation in target-compatibility.md. * Fixed NVAPI working on D3D12. * Test for specific NVAPI features. * Remove requiredFeatures from Renderer::Desc as it was ignored. Tried to document more around nvapiExtnSlot. * Re-added requiredFeatures to Renderer::Desc * Improve comments in the tests. * Rename Fp32 -> F32 Added cas-int64-byte-address-buffer.slang test Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
Diffstat (limited to 'source/slang')
-rw-r--r--source/slang/hlsl.meta.slang77
1 files changed, 57 insertions, 20 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 8ec5c2c67..36fe9f216 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -48,6 +48,7 @@ struct ByteAddressBuffer
}
};
+// AtomicAdd
// Make the GLSL atomicAdd available.
// We have separate int/float implementations, as the float version requires some specific extensions
@@ -58,10 +59,11 @@ __glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_float)
float __atomicAdd(__ref float value, float amount);
-// Helper for hlsl, using nvAPI
+// Helper for hlsl, using NVAPI
__target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)")
uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2);
+
// Int versions require glsl 4.30
// https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml
@@ -78,6 +80,32 @@ __glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
int64_t __atomicAdd(__ref int64_t value, int64_t amount);
+// CAS - Compare and swap
+
+// Helper for HLSL, using NVAPI.
+// Emitted as NvInterlockedCompareExchangeUint64(buf, offset, compareValue, value).
+// The 64-bit operands and the result are carried as uint2; callers in this file
+// convert with __asuint2 / __asuint64.
+
+__target_intrinsic(hlsl, "NvInterlockedCompareExchangeUint64($0, $1, $2, $3)")
+uint2 __cas(RWByteAddressBuffer buf, uint offset, uint2 compareValue, uint2 value);
+
+// GLSL compare-and-swap on a uint64_t passed by reference.
+// Emitted as atomicCompSwap(ioValue, compareValue, newValue); declared with
+// GLSL version 430 and the GL_EXT_shader_atomic_int64 extension.
+__target_intrinsic(glsl, "atomicCompSwap($0, $1, $2)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_int64)
+uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue);
+
+// Conversion between uint64_t and uint2
+
+// Splits a uint64_t into a uint2: x receives the low 32 bits, y the high 32 bits.
+uint2 __asuint2(uint64_t i)
+{
+ // uint(i) keeps the low word; shifting right by 32 first selects the high word.
+ return uint2(uint(i), uint(uint64_t(i) >> 32));
+}
+
+// Reassembles a uint64_t from a uint2: y supplies the high 32 bits, x the low 32 bits.
+// This is the inverse of __asuint2.
+uint64_t __asuint64(uint2 i)
+{
+ // y is widened to 64 bits before the shift so the high word is not shifted out.
+ return (uint64_t(i.y) << 32) | i.x;
+}
+
+//
+
__intrinsic_op($(kIROp_ByteAddressBufferLoad))
T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset);
@@ -202,16 +230,15 @@ ${{{{
// https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float
// https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html
- // Fp32
+ // Fp32 Add
+ // Float add on this buffer at byteAddress; the value produced by the target
+ // intrinsic is written to originalValue ($3).
+ // HLSL is emitted as NVAPI's NvInterlockedAddFp32 (the NVAPI name keeps its
+ // Fp32 spelling; only the Slang-side method is named F32). CUDA is emitted as
+ // atomicAdd on a float pointer obtained from _getPtrAt(byteAddress).
__target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))")
 __cuda_sm_version(2.0)
 __target_intrinsic(cuda, "(*$3 = atomicAdd((float*)$0._getPtrAt($1), $2))")
- void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue);
+ void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue);
-
__specialized_for_target(glsl)
- void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue)
+ void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue)
{
RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this);
originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd);
@@ -222,29 +249,24 @@ ${{{{
__target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))")
__cuda_sm_version(2.0)
__target_intrinsic(cuda, "atomicAdd((float*)$0._getPtrAt($1), $2)")
- void InterlockedAddFp32(uint byteAddress, float valueToAdd);
+ void InterlockedAddF32(uint byteAddress, float valueToAdd);
 __specialized_for_target(glsl)
- void InterlockedAddFp32(uint byteAddress, float valueToAdd)
+ void InterlockedAddF32(uint byteAddress, float valueToAdd)
{
+ // GLSL path: view this buffer as RWStructuredBuffer<float> so an element can be
+ // handed to __atomicAdd, which takes its target by reference.
 RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this);
+ // byteAddress / 4 converts the byte offset into a float element index
+ // (assumes byteAddress is a multiple of 4 -- TODO confirm).
+ // The value returned by __atomicAdd is discarded in this overload.
 __atomicAdd(buf[byteAddress / 4], valueToAdd);
 }
- // Int64
+ // Int64 Add
+ // CUDA path for the 64-bit integer add, declared with SM version 6.0.
+ // Emitted as atomicAdd on a uint64_t pointer obtained from _getPtrAt(byteAddress);
+ // the value atomicAdd returns is stored through originalValue ($3).
__cuda_sm_version(6.0)
 __target_intrinsic(cuda, "(*$3 = atomicAdd((uint64_t*)$0._getPtrAt($1), $2))")
 void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue);
__specialized_for_target(hlsl)
- void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd, out int64_t outOriginalValue)
+ void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue)
{
- uint2 valueToAdd;
- valueToAdd.x = uint(inValueToAdd);
- valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32);
-
- const uint2 originalValue = __atomicAdd(this, byteAddress, valueToAdd);
- outOriginalValue = (int64_t(originalValue.y) << 32) | originalValue.x;
+ // HLSL path: the NVAPI-backed __atomicAdd overload takes and returns uint2, so the
+ // operand is split with __asuint2 and the returned value is rejoined with __asuint64.
+ // This replaces the hand-written split/join removed above.
+ outOriginalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd)));
 }
__specialized_for_target(glsl)
@@ -260,12 +282,9 @@ ${{{{
void InterlockedAddI64(uint byteAddress, int64_t valueToAdd);
 __specialized_for_target(hlsl)
- void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd)
+ void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
{
- uint2 valueToAdd;
- valueToAdd.x = uint(inValueToAdd);
- valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32);
- __atomicAdd(this, byteAddress, valueToAdd);
+ // HLSL path without an out value: split the operand into uint2 with __asuint2 for
+ // the NVAPI-backed __atomicAdd overload and ignore what it returns.
+ __atomicAdd(this, byteAddress, __asuint2(valueToAdd));
 }
__specialized_for_target(glsl)
@@ -275,6 +294,24 @@ ${{{{
__atomicAdd(buf[byteAddress / 8], valueToAdd);
}
+ // CAS UInt64
+
+ // CUDA path: emitted as atomicCAS(ptr, compareValue, value) on a uint64_t pointer
+ // obtained from _getPtrAt(byteAddress); the value atomicCAS returns is stored
+ // through outOriginalValue ($4).
+ // NOTE(review): unlike the Int64 add declaration, no __cuda_sm_version is attached
+ // here -- confirm that is intended.
+ __target_intrinsic(cuda, "(*$4 = atomicCAS((uint64_t*)$0._getPtrAt($1), $2, $3))")
+ void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue);
+
+ // HLSL path: the NVAPI-backed __cas overload takes and returns uint2, so both
+ // operands are split with __asuint2 and the returned value is rejoined with __asuint64.
+ __specialized_for_target(hlsl)
+ void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
+ {
+ outOriginalValue = __asuint64(__cas(this, byteAddress, __asuint2(compareValue), __asuint2(value)));
+ }
+
+ // GLSL path: view this buffer as RWStructuredBuffer<uint64_t> and run the
+ // atomicCompSwap-backed __cas on the addressed element.
+ __specialized_for_target(glsl)
+ void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
+ {
+ RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+ // byteAddress / 8 converts the byte offset into a uint64_t element index
+ // (assumes byteAddress is a multiple of 8 -- TODO confirm).
+ outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
+ }
+
${{{{
}
}}}}