summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Yong He <yonghe@outlook.com> 2023-08-31 13:49:40 -0700
committer GitHub <noreply@github.com> 2023-08-31 13:49:40 -0700
commit cc412af89e54b04ead508ca84825a18d001b92d0 (patch)
tree da7bb020c494cc4dc62a9c641fb88d7b0b9f89f2
parent 1996785e1c5d76254a102c1ec0df5dd7e2e4d68a (diff)
Add SPIRV atomics intrinsics and fix buffer layout lowering. (#3170)
* Fix atomics intrinsics and buffer layout lowering.
* Fix.
* Add more test.
* Fix.
---------
Co-authored-by: Yong He <yhe@nvidia.com>
-rw-r--r-- source/compiler-core/slang-dxc-compiler.cpp | 4
-rw-r--r-- source/slang/hlsl.meta.slang | 1075
-rw-r--r-- source/slang/slang-check-expr.cpp | 3
-rw-r--r-- source/slang/slang-emit-spirv.cpp | 1
-rw-r--r-- source/slang/slang-ir-inline.cpp | 30
-rw-r--r-- source/slang/slang-ir-layout.cpp | 23
-rw-r--r-- source/slang/slang-ir-layout.h | 1
-rw-r--r-- source/slang/slang-ir-lower-buffer-element-type.cpp | 41
-rw-r--r-- source/slang/slang-ir-specialize-function-call.cpp | 4
-rw-r--r-- source/slang/slang-ir-specialize-resources.cpp | 3
-rw-r--r-- source/slang/slang-ir-specialize-target-switch.cpp | 10
-rw-r--r-- source/slang/slang-ir-spirv-legalize.cpp | 255
-rw-r--r-- source/slang/slang-ir-use-uninitialized-out-param.cpp | 1
-rw-r--r-- source/slang/slang-parser.cpp | 26
-rw-r--r-- source/slang/slang-spirv-val.cpp | 2
-rw-r--r-- tests/expected-failure.txt | 10
-rw-r--r-- tests/hlsl-intrinsic/byte-address-buffer-atomics.slang | 64
17 files changed, 1264 insertions, 289 deletions
diff --git a/source/compiler-core/slang-dxc-compiler.cpp b/source/compiler-core/slang-dxc-compiler.cpp
index 9428d0d26..6956a3627 100644
--- a/source/compiler-core/slang-dxc-compiler.cpp
+++ b/source/compiler-core/slang-dxc-compiler.cpp
@@ -26,8 +26,12 @@
// Enable DXIL by default unless told not to
#ifndef SLANG_ENABLE_DXIL_SUPPORT
+#if SLANG_APPLE_FAMILY
+# define SLANG_ENABLE_DXIL_SUPPORT 0
+#else
# define SLANG_ENABLE_DXIL_SUPPORT 1
#endif
+#endif
// Enable calling through to `dxc` to
// generate code on Windows.
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 4c4c7cd59..d56c4ffcd 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -118,10 +118,22 @@ struct ByteAddressBuffer
// We have separate int/float implementations, as the float version requires some specific extensions
// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_atomic_float.txt
-__target_intrinsic(glsl, "atomicAdd($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_float)
-float __atomicAdd(__ref float value, float amount);
+float __atomicAdd(__ref float value, float amount)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicAdd($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ OpExtension "SPV_EXT_shader_atomic_float_add";
+ OpCapability AtomicFloat32AddEXT;
+ result:$$float = OpAtomicFAddEXT &value Device None $amount
+ };
+ }
+}
// Helper for hlsl, using NVAPI
__target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)")
@@ -137,23 +149,66 @@ void __atomicAdd(RWByteAddressBuffer buf, uint offset, uint64_t value, out uint6
// Int versions require glsl 4.30
// https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml
-__target_intrinsic(glsl, "atomicAdd($0, $1)")
__glsl_version(430)
-int __atomicAdd(__ref int value, int amount);
+int __atomicAdd(__ref int value, int amount)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicAdd($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ result:$$int = OpAtomicIAdd &value Device None $amount;
+ };
+ }
+}
-__target_intrinsic(glsl, "atomicAdd($0, $1)")
__glsl_version(430)
-uint __atomicAdd(__ref uint value, uint amount);
+uint __atomicAdd(__ref uint value, uint amount)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicAdd($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ result:$$uint = OpAtomicIAdd &value Device None $amount;
+ };
+ }
+}
-__target_intrinsic(glsl, "atomicAdd($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
-int64_t __atomicAdd(__ref int64_t value, int64_t amount);
+int64_t __atomicAdd(__ref int64_t value, int64_t amount)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicAdd($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ OpCapability Int64Atomics;
+ result:$$int64_t = OpAtomicIAdd &value Device None $amount
+ };
+ }
+}
__target_intrinsic(glsl, "atomicAdd($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
-uint64_t __atomicAdd(__ref uint64_t value, uint64_t amount);
+uint64_t __atomicAdd(__ref uint64_t value, uint64_t amount)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicAdd($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ OpCapability Int64Atomics;
+ result:$$uint64_t = OpAtomicIAdd &value Device None $amount
+ };
+ }
+}
// Cas - Compare and swap
@@ -169,15 +224,37 @@ void __cas(RWByteAddressBuffer buf, uint offset, in int64_t compare_value, in in
__target_intrinsic(hlsl, "$0.InterlockedCompareExchange64($1, $2, $3, $4)")
void __cas(RWByteAddressBuffer buf, uint offset, in uint64_t compare_value, in uint64_t value, out uint64_t original_value);
-__target_intrinsic(glsl, "atomicCompSwap($0, $1, $2)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
-uint64_t __cas(__ref int64_t ioValue, int64_t compareValue, int64_t newValue);
+int64_t __cas(__ref int64_t ioValue, int64_t compareValue, int64_t newValue)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicCompSwap($0, $1, $2)";
+ case spirv:
+ return spirv_asm
+ {
+ OpCapability Int64Atomics;
+ result:$$int64_t = OpAtomicCompareExchange &ioValue Device None None $newValue $compareValue
+ };
+ }
+}
-__target_intrinsic(glsl, "atomicCompSwap($0, $1, $2)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
-uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue);
+uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicCompSwap($0, $1, $2)";
+ case spirv:
+ return spirv_asm
+ {
+ OpCapability Int64Atomics;
+ result:$$uint64_t = OpAtomicCompareExchange &ioValue Device None None $newValue $compareValue
+ };
+ }
+}
// Max
@@ -185,10 +262,21 @@ __target_intrinsic(hlsl, "NvInterlockedMaxUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicMax(RWByteAddressBuffer buf, uint offset, uint2 value);
-__target_intrinsic(glsl, "atomicMax($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
-uint64_t __atomicMax(__ref uint64_t ioValue, uint64_t value);
+uint64_t __atomicMax(__ref uint64_t ioValue, uint64_t value)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicMax($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ OpCapability Int64Atomics;
+ result:$$uint64_t = OpAtomicUMax &ioValue Device None $value
+ };
+ }
+}
// Min
@@ -196,10 +284,21 @@ __target_intrinsic(hlsl, "NvInterlockedMinUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicMin(RWByteAddressBuffer buf, uint offset, uint2 value);
-__target_intrinsic(glsl, "atomicMin($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
-uint64_t __atomicMin(__ref uint64_t ioValue, uint64_t value);
+uint64_t __atomicMin(__ref uint64_t ioValue, uint64_t value)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicMin($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ OpCapability Int64Atomics;
+ result:$$uint64_t = OpAtomicUMin &ioValue Device None $value
+ };
+ }
+}
// And
@@ -207,10 +306,21 @@ __target_intrinsic(hlsl, "NvInterlockedAndUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicAnd(RWByteAddressBuffer buf, uint offset, uint2 value);
-__target_intrinsic(glsl, "atomicAnd($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
-uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value);
+uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicAnd($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ OpCapability Int64Atomics;
+ result:$$uint64_t = OpAtomicAnd &ioValue Device None $value
+ };
+ }
+}
// Or
@@ -218,10 +328,21 @@ __target_intrinsic(hlsl, "NvInterlockedOrUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicOr(RWByteAddressBuffer buf, uint offset, uint2 value);
-__target_intrinsic(glsl, "atomicOr($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
-uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value);
+uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicOr($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ OpCapability Int64Atomics;
+ result:$$uint64_t = OpAtomicOr &ioValue Device None $value
+ };
+ }
+}
// Xor
@@ -229,10 +350,21 @@ __target_intrinsic(hlsl, "NvInterlockedXorUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicXor(RWByteAddressBuffer buf, uint offset, uint2 value);
-__target_intrinsic(glsl, "atomicXor($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
-uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value);
+uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicXor($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ OpCapability Int64Atomics;
+ result:$$uint64_t = OpAtomicXor &ioValue Device None $value
+ };
+ }
+}
// Exchange
@@ -240,10 +372,21 @@ __target_intrinsic(hlsl, "NvInterlockedExchangeUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicExchange(RWByteAddressBuffer buf, uint offset, uint2 value);
-__target_intrinsic(glsl, "atomicExchange($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
-uint64_t __atomicExchange(__ref uint64_t ioValue, uint64_t value);
+uint64_t __atomicExchange(__ref uint64_t ioValue, uint64_t value)
+{
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicExchange($0, $1)";
+ case spirv:
+ return spirv_asm
+ {
+ OpCapability Int64Atomics;
+ result:$$uint64_t = OpAtomicExchange &ioValue Device None $value
+ };
+ }
+}
// Conversion between uint64_t and uint2
@@ -441,50 +584,60 @@ ${{{{
// F32 Add
- __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))")
__cuda_sm_version(2.0)
- __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt<float>($1), $2))")
[__requiresNVAPI]
- void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue);
-
- __specialized_for_target(glsl)
void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue)
{
- RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this);
- originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd);
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm "($3 = NvInterlockedAddFp32($0, $1, $2))";
+ case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<float>($1), $2))";
+ case glsl:
+ case spirv:
+ {
+ let buf = __getEquivalentStructuredBuffer<float>(this);
+ originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd);
+ return;
+ }
+ }
}
// Without returning original value
- __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))")
[__requiresNVAPI]
__cuda_sm_version(2.0)
- __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<float>($1), $2)")
- void InterlockedAddF32(uint byteAddress, float valueToAdd);
-
- __specialized_for_target(glsl)
void InterlockedAddF32(uint byteAddress, float valueToAdd)
{
- RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this);
- __atomicAdd(buf[byteAddress / 4], valueToAdd);
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm "(NvInterlockedAddFp32($0, $1, $2))";
+ case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<float>($1), $2)";
+ case glsl:
+ case spirv:
+ {
+ let buf = __getEquivalentStructuredBuffer<float>(this);
+ __atomicAdd(buf[byteAddress / 4], valueToAdd);
+ return;
+ }
+ }
}
// Int64 Add
__cuda_sm_version(6.0)
- __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt<uint64_t>($1), $2))")
- void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue);
-
- __specialized_for_target(hlsl)
- void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue)
- {
- outOriginalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd)));
- }
-
- __specialized_for_target(glsl)
void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue)
{
- RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this);
- originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
+ __target_switch
+ {
+ case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint64_t>($1), $2))";
+ case hlsl:
+ originalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd)));
+ case glsl:
+ case spirv:
+ {
+ let buf = __getEquivalentStructuredBuffer<int64_t>(this);
+ originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
+ }
+ }
}
// Without returning original value
@@ -499,9 +652,10 @@ ${{{{
}
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
{
- RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<int64_t>(this);
__atomicAdd(buf[byteAddress / 8], valueToAdd);
}
@@ -517,9 +671,10 @@ ${{{{
}
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
{
- RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
}
@@ -533,9 +688,10 @@ ${{{{
uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMax(this, byteAddress, __asuint2(value))); }
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value)
{
- RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
return __atomicMax(buf[byteAddress / 8], value);
}
@@ -549,9 +705,10 @@ ${{{{
uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMin(this, byteAddress, __asuint2(value))); }
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
uint64_t InterlockedMinU64(uint byteAddress, uint64_t value)
{
- RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
return __atomicMin(buf[byteAddress / 8], value);
}
@@ -564,9 +721,10 @@ ${{{{
uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicAnd(this, byteAddress, __asuint2(value))); }
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
uint64_t InterlockedAndU64(uint byteAddress, uint64_t value)
{
- RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
return __atomicAnd(buf[byteAddress / 8], value);
}
@@ -579,9 +737,10 @@ ${{{{
uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicOr(this, byteAddress, __asuint2(value))); }
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
uint64_t InterlockedOrU64(uint byteAddress, uint64_t value)
{
- RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
return __atomicOr(buf[byteAddress / 8], value);
}
@@ -594,9 +753,10 @@ ${{{{
uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicXor(this, byteAddress, __asuint2(value))); }
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
uint64_t InterlockedXorU64(uint byteAddress, uint64_t value)
{
- RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
return __atomicXor(buf[byteAddress / 8], value);
}
@@ -609,9 +769,10 @@ ${{{{
uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicExchange(this, byteAddress, __asuint2(value))); }
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value)
{
- RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
return __atomicExchange(buf[byteAddress / 8], value);
}
@@ -622,9 +783,10 @@ ${{{{
__atomicAdd(this, byteAddress, valueToAdd, outOriginalValue);
}
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
void InterlockedAdd64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue)
{
- RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<int64_t>(this);
originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
}
__specialized_for_target(hlsl)
@@ -633,9 +795,10 @@ ${{{{
__atomicAdd(this, byteAddress, valueToAdd, outOriginalValue);
}
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
void InterlockedAdd64(uint byteAddress, uint64_t valueToAdd, out uint64_t originalValue)
{
- RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
}
__specialized_for_target(hlsl)
@@ -644,9 +807,10 @@ ${{{{
__cas(this, byteAddress, compareValue, value, outOriginalValue);
}
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
void InterlockedCompareExchange64(uint byteAddress, int64_t compareValue, int64_t value, out int64_t outOriginalValue)
{
- RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<int64_t>(this);
outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
}
__specialized_for_target(hlsl)
@@ -655,101 +819,251 @@ ${{{{
__cas(this, byteAddress, compareValue, value, outOriginalValue);
}
__specialized_for_target(glsl)
+ __specialized_for_target(spirv)
void InterlockedCompareExchange64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
{
- RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+ let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
}
${{{{
- }
+ } // endif (type == RWByteAddressBuffer)
}}}}
// Added operations:
-
- __target_intrinsic(glsl, "($3 = atomicAdd($0._data[$1/4], $2))")
void InterlockedAdd(
UINT dest,
UINT value,
- out UINT original_value);
+ out UINT original_value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "($3 = atomicAdd($0._data[$1/4], $2))";
+ case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint32_t>($1), $2))";
+ case hlsl: __intrinsic_asm ".InterlockedAdd";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedAdd(buf[dest / 4], value, original_value);
+ }
+ }
- __target_intrinsic(glsl, "atomicAdd($0._data[$1/4], $2)")
void InterlockedAdd(
UINT dest,
- UINT value);
+ UINT value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicAdd($0._data[$1/4], $2)";
+ case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<uint32_t>($1), $2)";
+ case hlsl: __intrinsic_asm ".InterlockedAdd";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedAdd(buf[dest / 4], value);
+ }
+ }
- __target_intrinsic(glsl, "($3 = atomicAnd($0._data[$1/4], $2))")
void InterlockedAnd(
UINT dest,
UINT value,
- out UINT original_value);
+ out UINT original_value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "($3 = atomicAnd($0._data[$1/4], $2))"; // parenthesized like the sibling overloads so the assignment stays a single expression
+ case cuda: __intrinsic_asm "(*$3 = atomicAnd($0._getPtrAt<uint32_t>($1), $2))";
+ case hlsl: __intrinsic_asm ".InterlockedAnd";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedAnd(buf[dest / 4], value, original_value);
+ }
+ }
- __target_intrinsic(glsl, "atomicAnd($0._data[$1/4], $2)")
void InterlockedAnd(
UINT dest,
- UINT value);
+ UINT value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicAnd($0._data[$1/4], $2)";
+ case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt<uint32_t>($1), $2)";
+ case hlsl: __intrinsic_asm ".InterlockedAnd";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedAnd(buf[dest / 4], value);
+ }
+ }
- __target_intrinsic(glsl, "($4 = atomicCompSwap($0._data[$1/4], $2, $3))")
void InterlockedCompareExchange(
UINT dest,
UINT compare_value,
UINT value,
- out UINT original_value);
+ out UINT original_value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "($4 = atomicCompSwap($0._data[$1/4], $2, $3))";
+ case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3))";
+ case hlsl: __intrinsic_asm ".InterlockedCompareExchange";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedCompareExchange(buf[dest / 4], compare_value, value, original_value);
+ }
+ }
- __target_intrinsic(glsl, "atomicCompSwap($0._data[$1/4], $2, $3)")
void InterlockedCompareStore(
UINT dest,
UINT compare_value,
- UINT value);
+ UINT value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicCompSwap($0._data[$1/4], $2, $3)";
+ case cuda: __intrinsic_asm "atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3)";
+ case hlsl: __intrinsic_asm ".InterlockedCompareStore";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedCompareStore(buf[dest / 4], compare_value, value);
+ }
+ }
- __target_intrinsic(glsl, "($3 = atomicExchange($0._data[$1/4], $2))")
void InterlockedExchange(
UINT dest,
UINT value,
- out UINT original_value);
+ out UINT original_value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "($3 = atomicExchange($0._data[$1/4], $2))";
+ case cuda: __intrinsic_asm "(*$3 = atomicExch($0._getPtrAt<uint32_t>($1), $2))";
+ case hlsl: __intrinsic_asm ".InterlockedExchange";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedExchange(buf[dest / 4], value, original_value);
+ }
+ }
- __target_intrinsic(glsl, "($3 = atomicMax($0._data[$1/4], $2))")
void InterlockedMax(
UINT dest,
UINT value,
- out UINT original_value);
+ out UINT original_value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "($3 = atomicMax($0._data[$1/4], $2))";
+ case cuda: __intrinsic_asm "(*$3 = atomicMax($0._getPtrAt<uint32_t>($1), $2))";
+ case hlsl: __intrinsic_asm ".InterlockedMax";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedMax(buf[dest / 4], value, original_value);
+ }
+ }
- __target_intrinsic(glsl, "atomicMax($0._data[$1/4], $2)")
void InterlockedMax(
UINT dest,
- UINT value);
+ UINT value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicMax($0._data[$1/4], $2)";
+ case cuda: __intrinsic_asm "atomicMax($0._getPtrAt<uint32_t>($1), $2)";
+ case hlsl: __intrinsic_asm ".InterlockedMax";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedMax(buf[dest / 4], value);
+ }
+ }
- __target_intrinsic(glsl, "($3 = atomicMin($0._data[$1/4], $2))")
void InterlockedMin(
UINT dest,
UINT value,
- out UINT original_value);
+ out UINT original_value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "($3 = atomicMin($0._data[$1/4], $2))";
+ case cuda: __intrinsic_asm "(*$3 = atomicMin($0._getPtrAt<uint32_t>($1), $2))";
+ case hlsl: __intrinsic_asm ".InterlockedMin";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedMin(buf[dest / 4], value, original_value);
+ }
+ }
- __target_intrinsic(glsl, "atomicMin($0._data[$1/4], $2)")
void InterlockedMin(
UINT dest,
- UINT value);
+ UINT value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicMin($0._data[$1/4], $2)";
+ case cuda: __intrinsic_asm "atomicMin($0._getPtrAt<uint32_t>($1), $2)";
+ case hlsl: __intrinsic_asm ".InterlockedMin";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedMin(buf[dest / 4], value);
+ }
+ }
- __target_intrinsic(glsl, "($3 = atomicOr($0._data[$1/4], $2))")
void InterlockedOr(
UINT dest,
UINT value,
- out UINT original_value);
+ out UINT original_value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "($3 = atomicOr($0._data[$1/4], $2))";
+ case cuda: __intrinsic_asm "(*$3 = atomicOr($0._getPtrAt<uint32_t>($1), $2))";
+ case hlsl: __intrinsic_asm ".InterlockedOr";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedOr(buf[dest / 4], value, original_value);
+ }
+ }
- __target_intrinsic(glsl, "atomicOr($0._data[$1/4], $2)")
void InterlockedOr(
UINT dest,
- UINT value);
+ UINT value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicOr($0._data[$1/4], $2)";
+ case cuda: __intrinsic_asm "atomicOr($0._getPtrAt<uint32_t>($1), $2)";
+ case hlsl: __intrinsic_asm ".InterlockedOr";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedOr(buf[dest / 4], value);
+ }
+ }
- __target_intrinsic(glsl, "($3 = atomicXor($0._data[$1/4], $2))")
void InterlockedXor(
UINT dest,
UINT value,
- out UINT original_value);
+ out UINT original_value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "($3 = atomicXor($0._data[$1/4], $2))";
+ case cuda: __intrinsic_asm "(*$3 = atomicXor($0._getPtrAt<uint32_t>($1), $2))";
+ case hlsl: __intrinsic_asm ".InterlockedXor";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedXor(buf[dest / 4], value, original_value);
+ }
+ }
- __target_intrinsic(glsl, "atomicXor($0._data[$1/4], $2)")
void InterlockedXor(
UINT dest,
- UINT value);
+ UINT value)
+ {
+ __target_switch
+ {
+ case glsl: __intrinsic_asm "atomicXor($0._data[$1/4], $2)";
+ case cuda: __intrinsic_asm "atomicXor($0._getPtrAt<uint32_t>($1), $2)";
+ case hlsl: __intrinsic_asm ".InterlockedXor";
+ case spirv:
+ let buf = __getEquivalentStructuredBuffer<uint>(this);
+ ::InterlockedXor(buf[dest / 4], value);
+ }
+ }
__target_intrinsic(hlsl)
[ForceInline]
@@ -2325,7 +2639,7 @@ __target_intrinsic(spirv, "OpFConvert resultType resultId _0")
[__readNone]
vector<float16_t, N> f32tof16_(vector<float, N> value)
{
- VECTOR_MAP_UNARY(uint, N, f32tof16, value);
+ VECTOR_MAP_UNARY(float16_t, N, f32tof16, value);
}
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -2696,132 +3010,507 @@ void GroupMemoryBarrierWithGroupSync()
// Atomics
-__target_intrinsic(glsl, "$atomicAdd($A, $1)")
-__target_intrinsic(cuda, "atomicAdd($0, $1)")
-void InterlockedAdd(__ref int dest, int value);
+__glsl_version(430)
+void InterlockedAdd(__ref int dest, int value) // Atomic add of `value` into `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedAdd";
+ case cuda: __intrinsic_asm "atomicAdd($0, $1)";
+ case glsl: __intrinsic_asm "$atomicAdd($A, $1)";
+ case spirv: // OpAtomicIAdd on &dest with Device scope and None memory semantics.
+ spirv_asm
+ {
+ result:$$int = OpAtomicIAdd &dest Device None $value
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicAdd($A, $1)")
-__target_intrinsic(cuda, "atomicAdd((uint*)$0, $1)")
-void InterlockedAdd(__ref uint dest, uint value);
+__glsl_version(430)
+void InterlockedAdd(__ref uint dest, uint value) // Atomic add of `value` into `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedAdd";
+ case cuda: __intrinsic_asm "atomicAdd((int*)$0, $1)"; // The uint destination is passed to atomicAdd as an int*.
+ case glsl: __intrinsic_asm "$atomicAdd($A, $1)";
+ case spirv: // OpAtomicIAdd on &dest with Device scope and None memory semantics.
+ spirv_asm
+ {
+ result:$$uint = OpAtomicIAdd &dest Device None $value
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicAdd($A, $1)")
-__target_intrinsic(cuda, "atomicAdd((uint*)$0, $1)")
-void InterlockedAdd(__ref uint dest, int value);
+[ForceInline]
+void InterlockedAdd(__ref uint dest, int value) // Mixed-sign overload: uint destination, int addend.
+{
+ InterlockedAdd(dest, (uint)value); // Casts the addend to uint and forwards to the (uint, uint) overload.
+}
-__target_intrinsic(glsl, "($2 = $atomicAdd($A, $1))")
-__target_intrinsic(cuda, "(*$2 = atomicAdd($0, $1))")
-void InterlockedAdd(__ref int dest, int value, out int original_value);
+__glsl_version(430)
+void InterlockedAdd(__ref int dest, int value, out int original_value) // Atomic add of `value` into `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedAdd";
+ case cuda: __intrinsic_asm "(*$2 = atomicAdd($0, $1))";
+ case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))";
+ case spirv: // OpAtomicIAdd (Device scope, None semantics), then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %original:$$int = OpAtomicIAdd &dest Device None $value;
+ OpStore &original_value %original
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicAdd($A, $1))")
-__target_intrinsic(cuda, "(*$2 = (uint)atomicAdd((uint*)$0, $1))")
-__target_intrinsic(spirv, "%old = OpAtomicIAdd _type(uint) resultId _0"
- "const(int, ScopeDevice) const(int, MemorySemanticsMaskNone) _1;"
- "OpStore _2 %old;")
-void InterlockedAdd(__ref uint dest, uint value, out uint original_value);
+__glsl_version(430)
+void InterlockedAdd(__ref uint dest, uint value, out uint original_value) // Atomic add of `value` into `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedAdd";
+ case cuda: __intrinsic_asm "(*$2 = (uint)atomicAdd((int*)$0, $1))"; // Operates through an int* and casts the result back to uint.
+ case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))";
+ case spirv: // OpAtomicIAdd (Device scope, None semantics), then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %original:$$uint = OpAtomicIAdd &dest Device None $value;
+ OpStore &original_value %original
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicAnd($A, $1)")
-__target_intrinsic(cuda, "atomicAnd($0, $1)")
-void InterlockedAnd(__ref int dest, int value);
+__glsl_version(430)
+void InterlockedAnd(__ref int dest, int value) // Atomic bitwise AND of `value` into `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedAnd";
+ case cuda: __intrinsic_asm "atomicAnd($0, $1)";
+ case glsl: __intrinsic_asm "$atomicAnd($A, $1)";
+ case spirv: // OpAtomicAnd on &dest with Device scope and None memory semantics.
+ spirv_asm
+ {
+ result:$$int = OpAtomicAnd &dest Device None $value;
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicAnd($A, $1)")
-__target_intrinsic(cuda, "atomicAnd((int*)$0, $1)")
-void InterlockedAnd(__ref uint dest, uint value);
+__glsl_version(430)
+void InterlockedAnd(__ref uint dest, uint value) // Atomic bitwise AND of `value` into `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedAnd";
+ case cuda: __intrinsic_asm "atomicAnd((int*)$0, $1)"; // The uint destination is passed to atomicAnd as an int*.
+ case glsl: __intrinsic_asm "$atomicAnd($A, $1)";
+ case spirv: // OpAtomicAnd on &dest with Device scope and None memory semantics.
+ spirv_asm
+ {
+ result:$$uint = OpAtomicAnd &dest Device None $value;
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicAnd($A, $1))")
-__target_intrinsic(cuda, "(*$2 = atomicAnd($0, $1))")
-void InterlockedAnd(__ref int dest, int value, out int original_value);
+__glsl_version(430)
+void InterlockedAnd(__ref int dest, int value, out int original_value) // Atomic bitwise AND of `value` into `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedAnd";
+ case cuda: __intrinsic_asm "(*$2 = atomicAnd($0, $1))";
+ case glsl: __intrinsic_asm "($2 = $atomicAnd($A, $1))";
+ case spirv: // OpAtomicAnd (Device scope, None semantics), then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %original:$$int = OpAtomicAnd &dest Device None $value;
+ OpStore &original_value %original
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicAnd($A, $1))")
-__target_intrinsic(cuda, "(*$2 = atomicAnd((int*)$0, $1))")
-void InterlockedAnd(__ref uint dest, uint value, out uint original_value);
+__glsl_version(430)
+void InterlockedAnd(__ref uint dest, uint value, out uint original_value) // Atomic bitwise AND of `value` into `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedAnd";
+ case glsl: __intrinsic_asm "($2 = $atomicAnd($A, $1))"; // Same `$atomicAnd($A, $1)` form as the other InterlockedAnd overloads, not a plain atomicAnd($0, ...).
+ case cuda: __intrinsic_asm "(*$2 = atomicAnd((int*)$0, $1))"; // The uint destination is passed to atomicAnd as an int*.
+ case spirv: // OpAtomicAnd (Device scope, None semantics), then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %original:$$uint = OpAtomicAnd &dest Device None $value;
+ OpStore &original_value %original
+ };
+ }
+}
-__target_intrinsic(glsl, "($3 = $atomicCompSwap($A, $1, $2))")
-__target_intrinsic(cuda, "(*$3 = atomicCAS($0, $1, $2))")
-void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value);
+__glsl_version(430)
+void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value) // Atomic compare-and-exchange on `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedCompareExchange";
+ case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))";
+ case cuda: __intrinsic_asm "(*$3 = atomicCAS($0, $1, $2))";
+ case spirv: // Note the operand order: `$value` comes before `$compare_value`; both memory-semantics operands are None.
+ spirv_asm
+ {
+ %original:$$int = OpAtomicCompareExchange &dest Device None None $value $compare_value;
+ OpStore &original_value %original
+ };
+ }
+}
-__target_intrinsic(glsl, "($3 = $atomicCompSwap($A, $1, $2))")
-__target_intrinsic(cuda, "(*$3 = (uint)atomicCAS((int*)$0, $1, $2))")
-void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value);
+__glsl_version(430)
+void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value) // Atomic compare-and-exchange on `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedCompareExchange";
+ case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))";
+ case cuda: __intrinsic_asm "(*$3 = (uint)atomicCAS((int*)$0, $1, $2))"; // Operates through an int* and casts the result back to uint.
+ case spirv: // Note the operand order: `$value` comes before `$compare_value`; both memory-semantics operands are None.
+ spirv_asm
+ {
+ %original:$$uint = OpAtomicCompareExchange &dest Device None None $value $compare_value;
+ OpStore &original_value %original
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicCompSwap($A, $1, $2)")
-__target_intrinsic(cuda, "atomicCAS($0, $1, $2)")
-void InterlockedCompareStore(__ref int dest, int compare_value, int value);
+__glsl_version(430)
+void InterlockedCompareStore(__ref int dest, int compare_value, int value) // Atomic compare-and-exchange on `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedCompareStore";
+ case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)";
+ case cuda: __intrinsic_asm "atomicCAS($0, $1, $2)";
+ case spirv: // Same OpAtomicCompareExchange as InterlockedCompareExchange, without the OpStore of the result.
+ spirv_asm
+ {
+ result:$$int = OpAtomicCompareExchange &dest Device None None $value $compare_value;
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicCompSwap($A, $1, $2)")
-__target_intrinsic(cuda, "atomicCAS((int*)$0, $1, $2)")
-void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value);
+__glsl_version(430)
+void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value) // Atomic compare-and-exchange on `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedCompareStore";
+ case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)";
+ case cuda: __intrinsic_asm "atomicCAS((int*)$0, $1, $2)"; // The uint destination is passed to atomicCAS as an int*.
+ case spirv: // Same OpAtomicCompareExchange as InterlockedCompareExchange, without the OpStore of the result.
+ spirv_asm
+ {
+ result:$$uint = OpAtomicCompareExchange &dest Device None None $value $compare_value;
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicExchange($A, $1))")
-__target_intrinsic(cuda, "(*$2 = atomicExch($0, $1))")
-void InterlockedExchange(__ref int dest, int value, out int original_value);
+__glsl_version(430)
+void InterlockedExchange(__ref int dest, int value, out int original_value) // Atomic exchange of `value` into `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedExchange";
+ case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))";
+ case cuda: __intrinsic_asm "(*$2 = atomicExch($0, $1))";
+ case spirv: // OpAtomicExchange (Device scope, None semantics), then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %r:$$int = OpAtomicExchange &dest Device None $value;
+ OpStore &original_value %r
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicExchange($A, $1))")
-__target_intrinsic(cuda, "(*$2 = (uint)atomicExch((int*)$0, $1))")
-void InterlockedExchange(__ref uint dest, uint value, out uint original_value);
+__glsl_version(430)
+void InterlockedExchange(__ref uint dest, uint value, out uint original_value) // Atomic exchange of `value` into `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedExchange";
+ case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))";
+ case cuda: __intrinsic_asm "(*$2 = (uint)atomicExch((int*)$0, $1))"; // Operates through an int* and casts the result back to uint.
+ case spirv: // OpAtomicExchange (Device scope, None semantics), then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %r:$$uint = OpAtomicExchange &dest Device None $value;
+ OpStore &original_value %r
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicMax($A, $1)")
-__target_intrinsic(cuda, "atomicMax($0, $1)")
-void InterlockedMax(__ref int dest, int value);
+__glsl_version(430)
+void InterlockedMax(__ref int dest, int value) // Atomic max of `dest` and `value`, stored in `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedMax";
+ case glsl: __intrinsic_asm "$atomicMax($A, $1)";
+ case cuda: __intrinsic_asm "atomicMax($0, $1)";
+ case spirv: // The int overload uses the signed opcode OpAtomicSMax.
+ spirv_asm
+ {
+ result:$$int = OpAtomicSMax &dest Device None $value;
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicMax($A, $1)")
-__target_intrinsic(cuda, "atomicMax((int*)$0, $1)")
-void InterlockedMax(__ref uint dest, uint value);
+__glsl_version(430)
+void InterlockedMax(__ref uint dest, uint value) // Atomic max of `dest` and `value`, stored in `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedMax";
+ case glsl: __intrinsic_asm "$atomicMax($A, $1)";
+ case cuda: __intrinsic_asm "atomicMax((int*)$0, $1)"; // NOTE(review): the uint destination is passed as an int*; confirm this selects the intended (unsigned) comparison on CUDA.
+ case spirv: // The uint overload uses the unsigned opcode OpAtomicUMax.
+ spirv_asm
+ {
+ result:$$uint = OpAtomicUMax &dest Device None $value;
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicMax($A, $1))")
-__target_intrinsic(cuda, "(*$2 = atomicMax($0, $1))")
-void InterlockedMax(__ref int dest, int value, out int original_value);
+__glsl_version(430)
+void InterlockedMax(__ref int dest, int value, out int original_value) // Atomic max of `dest` and `value`, stored in `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedMax";
+ case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))";
+ case cuda: __intrinsic_asm "(*$2 = atomicMax($0, $1))";
+ case spirv: // Signed opcode OpAtomicSMax, then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %v:$$int = OpAtomicSMax &dest Device None $value;
+ OpStore &original_value %v
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicMax($A, $1))")
-__target_intrinsic(cuda, "(*$2 = (uint)atomicMax((int*)$0, $1))")
-void InterlockedMax(__ref uint dest, uint value, out uint original_value);
+__glsl_version(430)
+void InterlockedMax(__ref uint dest, uint value, out uint original_value) // Atomic max of `dest` and `value`, stored in `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedMax";
+ case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))";
+ case cuda: __intrinsic_asm "(*$2 = (uint)atomicMax((int*)$0, $1))"; // NOTE(review): operates through an int*; confirm this selects the intended (unsigned) comparison on CUDA.
+ case spirv: // Unsigned opcode OpAtomicUMax, then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %v:$$uint = OpAtomicUMax &dest Device None $value;
+ OpStore &original_value %v
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicMin($A, $1)")
-__target_intrinsic(cuda, "atomicMin($0, $1)")
-void InterlockedMin(__ref int dest, int value);
+__glsl_version(430)
+void InterlockedMin(__ref int dest, int value) // Atomic min of `dest` and `value`, stored in `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedMin";
+ case glsl: __intrinsic_asm "$atomicMin($A, $1)";
+ case cuda: __intrinsic_asm "atomicMin($0, $1)";
+ case spirv: // The int overload uses the signed opcode OpAtomicSMin.
+ spirv_asm
+ {
+ result:$$int = OpAtomicSMin &dest Device None $value;
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicMin($A, $1)")
-__target_intrinsic(cuda, "atomicMin((int*)$0, $1)")
-void InterlockedMin(__ref uint dest, uint value);
+__glsl_version(430)
+void InterlockedMin(__ref uint dest, uint value) // Atomic min of `dest` and `value`, stored in `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedMin";
+ case glsl: __intrinsic_asm "$atomicMin($A, $1)";
+ case cuda: __intrinsic_asm "atomicMin((int*)$0, $1)"; // NOTE(review): the uint destination is passed as an int*; confirm this selects the intended (unsigned) comparison on CUDA.
+ case spirv: // The uint overload uses the unsigned opcode OpAtomicUMin.
+ spirv_asm
+ {
+ result:$$uint = OpAtomicUMin &dest Device None $value;
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicMin($A, $1))")
-__target_intrinsic(cuda, "(*$2 = atomicMin($0, $1))")
-void InterlockedMin(__ref int dest, int value, out int original_value);
+__glsl_version(430)
+void InterlockedMin(__ref int dest, int value, out int original_value) // Atomic min of `dest` and `value`, stored in `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedMin";
+ case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))";
+ case cuda: __intrinsic_asm "(*$2 = atomicMin($0, $1))";
+ case spirv: // Signed opcode OpAtomicSMin, then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %v:$$int = OpAtomicSMin &dest Device None $value;
+ OpStore &original_value %v
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicMin($A, $1))")
-__target_intrinsic(cuda, "(*$2 = (uint)atomicMin((int*)$0, $1))")
-void InterlockedMin(__ref uint dest, uint value, out uint original_value);
+__glsl_version(430)
+void InterlockedMin(__ref uint dest, uint value, out uint original_value) // Atomic min of `dest` and `value`, stored in `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedMin";
+ case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))";
+ case cuda: __intrinsic_asm "(*$2 = (uint)atomicMin((int*)$0, $1))"; // NOTE(review): operates through an int*; confirm this selects the intended (unsigned) comparison on CUDA.
+ case spirv: // Unsigned opcode OpAtomicUMin, then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %v:$$uint = OpAtomicUMin &dest Device None $value;
+ OpStore &original_value %v
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicOr($A, $1)")
-__target_intrinsic(cuda, "atomicOr($0, $1)")
-void InterlockedOr(__ref int dest, int value);
+__glsl_version(430)
+void InterlockedOr(__ref int dest, int value) // Atomic bitwise OR of `value` into `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedOr";
+ case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)";
+ case glsl: __intrinsic_asm "$atomicOr($A, $1)";
+ case spirv: // OpAtomicOr on &dest with Device scope and None memory semantics.
+ spirv_asm
+ {
+ result:$$int = OpAtomicOr &dest Device None $value;
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicOr($A, $1)")
-__target_intrinsic(cuda, "atomicOr((int*)$0, $1)")
-void InterlockedOr(__ref uint dest, uint value);
+__glsl_version(430)
+void InterlockedOr(__ref uint dest, uint value) // Atomic bitwise OR of `value` into `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedOr";
+ case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)"; // The uint destination is passed to atomicOr as an int*.
+ case glsl: __intrinsic_asm "$atomicOr($A, $1)";
+ case spirv: // OpAtomicOr on &dest with Device scope and None memory semantics.
+ spirv_asm
+ {
+ result:$$uint = OpAtomicOr &dest Device None $value;
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicOr($A, $1))")
-__target_intrinsic(cuda, "(*$2 = atomicOr($0, $1))")
-void InterlockedOr(__ref int dest, int value, out int original_value);
+__glsl_version(430)
+void InterlockedOr(__ref int dest, int value, out int original_value) // Atomic bitwise OR of `value` into `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedOr";
+ case glsl: __intrinsic_asm "($2 = $atomicOr($A, $1))"; // Same `$atomicOr($A, $1)` form as the InterlockedOr overloads without an out parameter, not a plain atomicOr($0, ...).
+ case cuda: __intrinsic_asm "(*$2 = atomicOr($0, $1))";
+ case spirv: // OpAtomicOr (Device scope, None semantics), then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %original:$$int = OpAtomicOr &dest Device None $value;
+ OpStore &original_value %original
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicOr($A, $1))")
-__target_intrinsic(cuda, "(*$2 = (uint)atomicOr((int*)$0, $1))")
-void InterlockedOr(__ref uint dest, uint value, out uint original_value);
+__glsl_version(430)
+void InterlockedOr(__ref uint dest, uint value, out uint original_value) // Atomic bitwise OR of `value` into `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedOr";
+ case glsl: __intrinsic_asm "($2 = $atomicOr($A, $1))"; // Same `$atomicOr($A, $1)` form as the InterlockedOr overloads without an out parameter, not a plain atomicOr($0, ...).
+ case cuda: __intrinsic_asm "(*$2 = atomicOr((int*)$0, $1))"; // The uint destination is passed to atomicOr as an int*.
+ case spirv: // OpAtomicOr (Device scope, None semantics), then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %original:$$uint = OpAtomicOr &dest Device None $value;
+ OpStore &original_value %original
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicXor($A, $1)")
-__target_intrinsic(cuda, "atomicXor($0, $1)")
-void InterlockedXor(__ref int dest, int value);
+__glsl_version(430)
+void InterlockedXor(__ref int dest, int value) // Atomic bitwise XOR of `value` into `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedXor";
+ case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)";
+ case glsl: __intrinsic_asm "$atomicXor($A, $1)";
+ case spirv: // OpAtomicXor on &dest with Device scope and None memory semantics.
+ spirv_asm
+ {
+ result:$$int = OpAtomicXor &dest Device None $value;
+ };
+ }
+}
-__target_intrinsic(glsl, "$atomicXor($A, $1)")
-__target_intrinsic(cuda, "atomicXor((int*)$0, $1)")
-void InterlockedXor(__ref uint dest, uint value);
+__glsl_version(430)
+void InterlockedXor(__ref uint dest, uint value) // Atomic bitwise XOR of `value` into `dest`; no result is reported to the caller.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedXor";
+ case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)"; // The uint destination is passed to atomicXor as an int*.
+ case glsl: __intrinsic_asm "$atomicXor($A, $1)";
+ case spirv: // OpAtomicXor on &dest with Device scope and None memory semantics.
+ spirv_asm
+ {
+ result:$$uint = OpAtomicXor &dest Device None $value;
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicXor($A, $1))")
-__target_intrinsic(cuda, "(*$2 = atomicXor($0, $1))")
-void InterlockedXor(__ref int dest, int value, out int original_value);
+__glsl_version(430)
+void InterlockedXor(__ref int dest, int value, out int original_value) // Atomic bitwise XOR of `value` into `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedXor";
+ case glsl: __intrinsic_asm "($2 = $atomicXor($A, $1))"; // Same `$atomicXor($A, $1)` form as the InterlockedXor overloads without an out parameter, not a plain atomicXor($0, ...).
+ case cuda: __intrinsic_asm "(*$2 = atomicXor($0, $1))";
+ case spirv: // OpAtomicXor (Device scope, None semantics), then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %original:$$int = OpAtomicXor &dest Device None $value;
+ OpStore &original_value %original
+ };
+ }
+}
-__target_intrinsic(glsl, "($2 = $atomicXor($A, $1))")
-__target_intrinsic(cuda, "(*$2 = (uint)atomicXor((int*)$0, $1))")
-void InterlockedXor(__ref uint dest, uint value, out uint original_value);
+__glsl_version(430)
+void InterlockedXor(__ref uint dest, uint value, out uint original_value) // Atomic bitwise XOR of `value` into `dest`; the atomic's result is written to `original_value`.
+{
+ __target_switch // One implementation per code generation target.
+ {
+ case hlsl: __intrinsic_asm "InterlockedXor";
+ case glsl: __intrinsic_asm "($2 = $atomicXor($A, $1))"; // Same `$atomicXor($A, $1)` form as the InterlockedXor overloads without an out parameter, not a plain atomicXor($0, ...).
+ case cuda: __intrinsic_asm "(*$2 = (uint)atomicXor((int*)$0, $1))"; // Operates through an int* and casts the result back to uint.
+ case spirv: // OpAtomicXor (Device scope, None semantics), then OpStore of its result into original_value.
+ spirv_asm
+ {
+ %original:$$uint = OpAtomicXor &dest Device None $value;
+ OpStore &original_value %original
+ };
+ }
+}
// Is floating-point value finite?
@@ -3315,7 +4004,7 @@ uint4 msad4(uint reference, uint2 source, uint4 accum)
int4 bytesX = (source.x >> uint4(24, 16, 8, 0)) & 0xFF;
int4 bytesY = (source.y >> uint4(24, 16, 8, 0)) & 0xFF;
- uint4 mask = bytesRef == 0 ? 0 : 0xFFFFFFFFu;
+ uint4 mask = select(bytesRef == 0, 0, 0xFFFFFFFFu);
uint4 result = accum;
result += mask.x & abs(bytesRef - int4(bytesX.x, bytesY.y, bytesY.z, bytesY.w));
diff --git a/source/slang/slang-check-expr.cpp b/source/slang/slang-check-expr.cpp
index bf7950310..a00b1bea2 100644
--- a/source/slang/slang-check-expr.cpp
+++ b/source/slang/slang-check-expr.cpp
@@ -2785,6 +2785,9 @@ namespace Slang
Expr* SemanticsExprVisitor::visitTypeCastExpr(TypeCastExpr * expr)
{
+ if (expr->type)
+ return expr;
+
// Check the term we are applying first
auto funcExpr = expr->functionExpr;
funcExpr = CheckTerm(funcExpr);
diff --git a/source/slang/slang-emit-spirv.cpp b/source/slang/slang-emit-spirv.cpp
index a203f4a80..846c4b5b4 100644
--- a/source/slang/slang-emit-spirv.cpp
+++ b/source/slang/slang-emit-spirv.cpp
@@ -1292,7 +1292,6 @@ struct SPIRVEmitContext
SpvLiteralInteger::from32(stride));
return arrayType;
}
-
case kIROp_TextureType:
{
const auto texTypeInst = as<IRTextureType>(inst);
diff --git a/source/slang/slang-ir-inline.cpp b/source/slang/slang-ir-inline.cpp
index 4308d16f0..5479a98ff 100644
--- a/source/slang/slang-ir-inline.cpp
+++ b/source/slang/slang-ir-inline.cpp
@@ -896,15 +896,39 @@ struct IntrinsicFunctionInliningPass : InliningPassBase
auto returnInst = as<IRReturn>(func->getFirstBlock()->getTerminator());
if (!returnInst)
return false;
- auto firstInst = as<IRSPIRVAsm>(func->getFirstBlock()->getFirstOrdinaryInst());
- return returnInst->getVal() == firstInst;
+
+ // If a function body has only asm blocks + trivial insts (load/store),
+ // this is considered as a pure asm function, and we can inline it.
+ bool hasSpvAsm = false;
+ for (auto inst = func->getFirstBlock()->getFirstOrdinaryInst(); inst != returnInst; inst = inst->getNextInst())
+ {
+ switch (inst->getOp())
+ {
+ case kIROp_SPIRVAsmOperandInst:
+ case kIROp_SPIRVAsm:
+ hasSpvAsm = true;
+ continue;
+ case kIROp_Load:
+ case kIROp_swizzle:
+ case kIROp_Store:
+ continue;
+ default:
+ return false;
+ }
+ }
+ return hasSpvAsm;
}
};
void performIntrinsicFunctionFunctionInlining(IRModule* module)
{
IntrinsicFunctionInliningPass pass(module);
- pass.considerAllCallSites();
+ bool changed = true; // Starts true so the loop below runs at least once.
+
+ while (changed) // Re-run the pass until considerAllCallSites() returns false.
+ {
+ changed = pass.considerAllCallSites();
+ }
}
struct CustomInliningPass : InliningPassBase
diff --git a/source/slang/slang-ir-layout.cpp b/source/slang/slang-ir-layout.cpp
index cba6894a9..39210ea1d 100644
--- a/source/slang/slang-ir-layout.cpp
+++ b/source/slang/slang-ir-layout.cpp
@@ -139,11 +139,12 @@ case kIROp_##TYPE##Type: \
{
auto structType = cast<IRStructType>(type);
IRSizeAndAlignment structLayout;
+ IRIntegerValue offset = 0;
for (auto field : structType->getFields())
{
IRSizeAndAlignment fieldTypeLayout;
SLANG_RETURN_ON_FAIL(getSizeAndAlignment(rules, field->getFieldType(), &fieldTypeLayout));
- structLayout.size = align(structLayout.size, fieldTypeLayout.alignment);
+ structLayout.size = align(offset, fieldTypeLayout.alignment);
structLayout.alignment = std::max(structLayout.alignment, fieldTypeLayout.alignment);
IRIntegerValue fieldOffset = structLayout.size;
@@ -165,6 +166,11 @@ case kIROp_##TYPE##Type: \
}
structLayout.size += fieldTypeLayout.size;
+ offset = structLayout.size;
+ if (as<IRMatrixType>(field->getFieldType()) || as<IRArrayTypeBase>(field->getFieldType()) || as<IRStructType>(field->getFieldType()))
+ {
+ offset = rules->adjustOffsetForNextAggregateMember(offset, fieldTypeLayout.alignment);
+ }
}
*outSizeAndAlignment = rules->alignCompositeElement(structLayout);
return SLANG_OK;
@@ -380,7 +386,11 @@ struct NaturalLayoutRules : IRTypeLayoutRules
{
ruleName = IRTypeLayoutRuleName::Natural;
}
-
+ virtual IRIntegerValue adjustOffsetForNextAggregateMember(IRIntegerValue currentSize, IRIntegerValue lastElementAlignment) // Natural rules: the running offset is returned unchanged.
+ {
+ SLANG_UNUSED(lastElementAlignment); // The alignment of the preceding member is ignored under these rules.
+ return currentSize;
+ }
virtual IRSizeAndAlignment alignCompositeElement(IRSizeAndAlignment elementSize)
{
return elementSize;
@@ -402,6 +412,11 @@ struct Std430LayoutRules : IRTypeLayoutRules
{
return elementSize;
}
+ virtual IRIntegerValue adjustOffsetForNextAggregateMember(IRIntegerValue currentSize, IRIntegerValue lastElementAlignment) // Std430 rules: round the running offset up to the preceding member's alignment.
+ {
+ return align(currentSize, (int)lastElementAlignment); // The alignment is narrowed to int before calling align().
+ }
+
virtual IRSizeAndAlignment getVectorSizeAndAlignment(IRSizeAndAlignment element, IRIntegerValue count)
{
IRIntegerValue countForAlignment = count;
@@ -418,6 +433,10 @@ struct Std140LayoutRules : IRTypeLayoutRules
ruleName = IRTypeLayoutRuleName::Std140;
}
+ virtual IRIntegerValue adjustOffsetForNextAggregateMember(IRIntegerValue currentSize, IRIntegerValue lastElementAlignment) // Std140 rules: round the running offset up to the preceding member's alignment.
+ {
+ return align(currentSize, (int)lastElementAlignment); // The alignment is narrowed to int before calling align().
+ }
virtual IRSizeAndAlignment alignCompositeElement(IRSizeAndAlignment elementSize)
{
elementSize.alignment = (int)align(elementSize.alignment, 16);
diff --git a/source/slang/slang-ir-layout.h b/source/slang/slang-ir-layout.h
index 12da047d2..cae8d788a 100644
--- a/source/slang/slang-ir-layout.h
+++ b/source/slang/slang-ir-layout.h
@@ -56,6 +56,7 @@ public:
IRTypeLayoutRuleName ruleName;
virtual IRSizeAndAlignment alignCompositeElement(IRSizeAndAlignment elementSize) = 0;
virtual IRSizeAndAlignment getVectorSizeAndAlignment(IRSizeAndAlignment element, IRIntegerValue count) = 0;
+ virtual IRIntegerValue adjustOffsetForNextAggregateMember(IRIntegerValue currentSize, IRIntegerValue lastElementAlignment) = 0;
static IRTypeLayoutRules* getStd430();
static IRTypeLayoutRules* getStd140();
static IRTypeLayoutRules* getNatural();
diff --git a/source/slang/slang-ir-lower-buffer-element-type.cpp b/source/slang/slang-ir-lower-buffer-element-type.cpp
index bf87d72fe..bc8e47e91 100644
--- a/source/slang/slang-ir-lower-buffer-element-type.cpp
+++ b/source/slang/slang-ir-lower-buffer-element-type.cpp
@@ -165,7 +165,9 @@ namespace Slang
for (IRIntegerValue ii = 0; ii < count; ++ii)
{
auto packedElement = builder.emitElementExtract(packedArray, ii);
- auto originalElement = builder.emitCallInst(innerTypeInfo.originalType, innerTypeInfo.convertLoweredToOriginal, 1, &packedElement);
+ auto originalElement = innerTypeInfo.convertLoweredToOriginal
+ ? builder.emitCallInst(innerTypeInfo.originalType, innerTypeInfo.convertLoweredToOriginal, 1, &packedElement)
+ : packedElement;
args[(Index)ii] = originalElement;
}
auto result = builder.emitMakeArray(arrayType, (UInt)args.getCount(), args.getBuffer());
@@ -194,7 +196,9 @@ namespace Slang
for (IRIntegerValue ii = 0; ii < count; ++ii)
{
auto originalElement = builder.emitElementExtract(originalParam, ii);
- auto packedElement = builder.emitCallInst(innerTypeInfo.loweredType, innerTypeInfo.convertOriginalToLowered, 1, &originalElement);
+ auto packedElement = innerTypeInfo.convertOriginalToLowered
+ ? builder.emitCallInst(innerTypeInfo.loweredType, innerTypeInfo.convertOriginalToLowered, 1, &originalElement)
+ : originalElement;
args[(Index)ii] = packedElement;
}
auto packedArray = builder.emitMakeArray(innerArrayType, (UInt)args.getCount(), args.getBuffer());
@@ -259,7 +263,7 @@ namespace Slang
auto arrayType = builder.getArrayType(
vectorType,
isColMajor?matrixType->getColumnCount():matrixType->getRowCount(),
- builder.getIntValue(builder.getIntType(), elementSizeAlignment.size));
+ builder.getIntValue(builder.getIntType(), elementSizeAlignment.getStride()));
builder.createStructField(loweredType, structKey, arrayType);
info.loweredType = loweredType;
@@ -272,10 +276,16 @@ namespace Slang
else if (auto arrayType = as<IRArrayType>(type))
{
auto loweredInnerTypeInfo = getLoweredTypeInfo(arrayType->getElementType(), rules);
- if (!loweredInnerTypeInfo.convertLoweredToOriginal)
+ // For spirv backend, we always want to lower all array types, even if the element type
+ // comes out the same. This is because different layout rules may have different array
+ // stride requirements.
+ if (!target->shouldEmitSPIRVDirectly())
{
- info.loweredType = type;
- return info;
+ if (!loweredInnerTypeInfo.convertLoweredToOriginal)
+ {
+ info.loweredType = type;
+ return info;
+ }
}
auto loweredType = builder.createStructType();
info.loweredType = loweredType;
@@ -287,12 +297,12 @@ namespace Slang
auto structKey = builder.createStructKey();
builder.addNameHintDecoration(structKey, UnownedStringSlice("data"));
IRSizeAndAlignment elementSizeAlignment;
- getSizeAndAlignment(rules, loweredType, &elementSizeAlignment);
+ getSizeAndAlignment(rules, loweredInnerTypeInfo.loweredType, &elementSizeAlignment);
elementSizeAlignment = rules->alignCompositeElement(elementSizeAlignment);
auto innerArrayType = builder.getArrayType(
loweredInnerTypeInfo.loweredType,
arrayType->getElementCount(),
- builder.getIntValue(builder.getIntType(), elementSizeAlignment.size));
+ builder.getIntValue(builder.getIntType(), elementSizeAlignment.getStride()));
builder.createStructField(loweredType, structKey, innerArrayType);
info.loweredInnerArrayType = innerArrayType;
info.loweredInnerStructKey = structKey;
@@ -312,12 +322,19 @@ namespace Slang
if (loweredFieldTypeInfo.convertLoweredToOriginal || rules->ruleName != IRTypeLayoutRuleName::Natural)
isTrivial = false;
}
- if (isTrivial)
+
+ // For spirv backend, we always want to lower all struct types, even if the field types
+ // come out the same. This is because different layout rules may have different layout
+ // requirements for the fields.
+ if (!target->shouldEmitSPIRVDirectly())
{
- info.loweredType = type;
- return info;
+ // For non-spirv target, we skip lowering this type if all field types are unchanged.
+ if (isTrivial)
+ {
+ info.loweredType = type;
+ return info;
+ }
}
-
auto loweredType = builder.createStructType();
StringBuilder nameSB;
getTypeNameHint(nameSB, type);
diff --git a/source/slang/slang-ir-specialize-function-call.cpp b/source/slang/slang-ir-specialize-function-call.cpp
index 2fa93087f..f9e106920 100644
--- a/source/slang/slang-ir-specialize-function-call.cpp
+++ b/source/slang/slang-ir-specialize-function-call.cpp
@@ -172,7 +172,9 @@ struct FunctionParameterSpecializationContext
return false;
if(!func->isDefinition())
return false;
-
+ UnownedStringSlice def;
+ if (findTargetIntrinsicDefinition(func, targetRequest->getTargetCaps(), def))
+ return false;
// With the basic checks out of the way, there are
// two conditions we care about:
//
diff --git a/source/slang/slang-ir-specialize-resources.cpp b/source/slang/slang-ir-specialize-resources.cpp
index 09c8f1f22..1a96389cb 100644
--- a/source/slang/slang-ir-specialize-resources.cpp
+++ b/source/slang/slang-ir-specialize-resources.cpp
@@ -288,6 +288,9 @@ struct ResourceOutputSpecializationPass
//
if(!func->isDefinition())
return false;
+ UnownedStringSlice def;
+ if (findTargetIntrinsicDefinition(func, targetRequest->getTargetCaps(), def))
+ return false;
// If any of the parameters of the function are `out`
// or `inout` parameters of a resource type, then we
diff --git a/source/slang/slang-ir-specialize-target-switch.cpp b/source/slang/slang-ir-specialize-target-switch.cpp
index 2593389b1..2be7c8194 100644
--- a/source/slang/slang-ir-specialize-target-switch.cpp
+++ b/source/slang/slang-ir-specialize-target-switch.cpp
@@ -30,10 +30,16 @@ namespace Slang
bestCapSet = capSet;
}
}
- SLANG_ASSERT(targetBlock);
IRBuilder builder(targetSwitch);
builder.setInsertBefore(targetSwitch);
- builder.emitBranch(targetBlock);
+ if (targetBlock)
+ {
+ builder.emitBranch(targetBlock);
+ }
+ else
+ {
+ builder.emitMissingReturn();
+ }
targetSwitch->removeAndDeallocate();
changed = true;
}
diff --git a/source/slang/slang-ir-spirv-legalize.cpp b/source/slang/slang-ir-spirv-legalize.cpp
index 32386e53f..a4b33324b 100644
--- a/source/slang/slang-ir-spirv-legalize.cpp
+++ b/source/slang/slang-ir-spirv-legalize.cpp
@@ -25,6 +25,67 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
SPIRVEmitSharedContext* m_sharedContext;
IRModule* m_module;
+
+ struct LoweredStructuredBufferTypeInfo // Result of lowerStructuredBufferType() for one structured buffer type.
+ {
+ IRType* structType; // The created wrapper struct type.
+ IRStructKey* arrayKey; // Key of the wrapper struct's single field.
+ IRArrayTypeBase* runtimeArrayType; // The unsized array type stored in that field.
+ };
+ Dictionary<IRType*, LoweredStructuredBufferTypeInfo> m_loweredStructuredBufferTypes; // Cache of lowering results, keyed by the structured buffer type.
+
+ LoweredStructuredBufferTypeInfo lowerStructuredBufferType(IRHLSLStructuredBufferTypeBase* inst) // Wraps the buffer's element type in an unsized array inside a new struct; results are cached per type.
+ {
+ LoweredStructuredBufferTypeInfo result;
+ if (m_loweredStructuredBufferTypes.tryGetValue(inst, result)) // Already lowered: return the cached result.
+ return result;
+
+ auto layoutRules = getTypeLayoutRuleForBuffer(m_sharedContext->m_targetRequest, inst);
+
+ IRBuilder builder(m_sharedContext->m_irModule);
+
+ builder.setInsertBefore(inst); // New types are created right before the buffer type.
+ auto elementType = inst->getElementType();
+ IRSizeAndAlignment elementSize;
+ getSizeAndAlignment(layoutRules, elementType, &elementSize);
+ elementSize = layoutRules->alignCompositeElement(elementSize);
+
+ const auto arrayType = builder.getUnsizedArrayType(inst->getElementType(), builder.getIntValue(builder.getIntType(), elementSize.getStride())); // The element stride is passed along with the unsized array type.
+ const auto structType = builder.createStructType();
+ const auto arrayKey = builder.createStructKey();
+ builder.createStructField(structType, arrayKey, arrayType);
+ IRSizeAndAlignment structSize; // Not read afterwards in this function.
+ getSizeAndAlignment(layoutRules, structType, &structSize); // NOTE(review): presumably called for its side effects on structType's layout -- confirm.
+
+ StringBuilder nameSb;
+ switch (inst->getOp()) // Pick the name hint from the kind of structured buffer.
+ {
+ case kIROp_HLSLRWStructuredBufferType:
+ nameSb << "RWStructuredBuffer";
+ break;
+ case kIROp_HLSLAppendStructuredBufferType:
+ nameSb << "AppendStructuredBuffer";
+ break;
+ case kIROp_HLSLConsumeStructuredBufferType:
+ nameSb << "ConsumeStructuredBuffer";
+ break;
+ case kIROp_HLSLRasterizerOrderedStructuredBufferType:
+ nameSb << "RasterizerOrderedStructuredBuffer";
+ break;
+ default: // Any other opcode is named plain "StructuredBuffer".
+ nameSb << "StructuredBuffer";
+ break;
+ }
+ builder.addNameHintDecoration(structType, nameSb.getUnownedSlice());
+ builder.addDecoration(structType, kIROp_SPIRVBlockDecoration); // Marks the wrapper struct with the SPIRV block decoration.
+
+ result.structType = structType;
+ result.arrayKey = arrayKey;
+ result.runtimeArrayType = arrayType;
+ m_loweredStructuredBufferTypes[inst] = result; // Remember the result for later lookups.
+ return result;
+ }
+
// We will use a single work list of instructions that need
// to be considered for specialization or simplification,
// whether generic, existential, etc.
@@ -84,13 +145,74 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
return structType;
}
+    /// Rewrites one use of a value that now lives behind the pointer `addrInst`, pushing the
+    /// load as far down the use chain as possible.
+    ///
+    /// Starting at `inUse`, every `GetElement` / `FieldExtract` that extracts from the value
+    /// is replaced by the matching element / field address computed on the pointer, and the
+    /// users of that extract are visited in turn. Any other user gets a `load` of the current
+    /// address emitted right before it and has its operand switched to the loaded value.
+    /// Extracts that end up without uses are deleted.
+    ///
+    /// @param addrInst Pointer-typed inst through which the value must now be read.
+    /// @param inUse    The use of the value that is to be rewritten.
+    static void insertLoadAtLatestLocation(IRInst* addrInst, IRUse* inUse)
+    {
+        struct WorkItem { IRInst* addr; IRUse* use; };
+        List<WorkItem> workList;
+        List<IRInst*> instsToRemove;
+        workList.add(WorkItem{ addrInst, inUse });
+
+        // The list grows while it is walked, so iterate by index rather than by iterator.
+        for (Index i = 0; i < workList.getCount(); i++)
+        {
+            auto use = workList[i].use;
+            auto addr = workList[i].addr;
+            auto user = use->getUser();
+            IRBuilder builder(user);
+            builder.setInsertBefore(user);
+
+            // Only look through an extract when the value is the thing being extracted
+            // from, i.e. operand 0. If the value is merely the *index* of a GetElement
+            // (`arr[value]`), it has to be loaded like for any other user; turning that
+            // into an element address on `addr` would index the wrong object.
+            const auto userOp = user->getOp();
+            const bool isExtractFromValue =
+                (userOp == kIROp_GetElement || userOp == kIROp_FieldExtract) &&
+                user->getOperand(0) == use->get();
+
+            if (isExtractFromValue)
+            {
+                // The sub-address points at the extract's result type and inherits the
+                // address space of the base pointer when it has one.
+                auto basePtrType = as<IRPtrTypeBase>(addr->getDataType());
+                IRType* ptrType = nullptr;
+                if (basePtrType->hasAddressSpace())
+                    ptrType = builder.getPtrType(kIROp_PtrType, user->getDataType(), basePtrType->getAddressSpace());
+                else
+                    ptrType = builder.getPtrType(kIROp_PtrType, user->getDataType());
+
+                IRInst* subAddr = nullptr;
+                if (userOp == kIROp_GetElement)
+                    subAddr = builder.emitElementAddress(ptrType, addr, as<IRGetElement>(user)->getIndex());
+                else
+                    subAddr = builder.emitFieldAddress(ptrType, addr, as<IRFieldExtract>(user)->getField());
+
+                // Every use of the extracted value is now a use of `*subAddr`.
+                for (auto u = user->firstUse; u; u = u->nextUse)
+                {
+                    workList.add(WorkItem{ subAddr, u });
+                }
+                instsToRemove.add(user);
+            }
+            else
+            {
+                // Leaf user: materialize the value right here.
+                auto val = builder.emitLoad(addr);
+                builder.replaceOperand(use, val);
+            }
+        }
+
+        // Extracts were queued parent-first. Delete them child-first so that a parent whose
+        // only remaining user was a (now deleted) child extract is cleaned up as well
+        // instead of being left behind as a dead inst.
+        for (Index i = instsToRemove.getCount(); i > 0; i--)
+        {
+            auto deadInst = instsToRemove[i - 1];
+            if (!deadInst->hasUses())
+                deadInst->removeAndDeallocate();
+        }
+    }
+
void processGlobalParam(IRGlobalParam* inst)
{
// If the global param is not a pointer type, make it so and insert explicit load insts.
auto ptrType = as<IRPtrTypeBase>(inst->getDataType());
if (!ptrType)
{
- if (as<IRResourceTypeBase>(inst))
+ auto innerType = inst->getFullType();
+
+ auto arrayType = as<IRArrayType>(inst->getDataType());
+ IRInst* arraySize = nullptr;
+ if (arrayType)
+ {
+ arraySize = arrayType->getElementCount();
+ innerType = arrayType->getElementType();
+ }
+
+ if (as<IRResourceTypeBase>(innerType))
return;
SpvStorageClass storageClass = SpvStorageClassPrivate;
@@ -112,7 +234,6 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
// Strip any HLSL wrappers
IRBuilder builder(m_sharedContext->m_irModule);
bool needLoad = true;
- auto innerType = inst->getFullType();
auto cbufferType = as<IRConstantBufferType>(innerType);
auto paramBlockType = as<IRParameterBlockType>(innerType);
if (cbufferType || paramBlockType)
@@ -165,6 +286,21 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
varLayoutInst->removeAndDeallocate();
}
}
+ else
+ {
+ if (auto structuredBufferType = as<IRHLSLStructuredBufferTypeBase>(innerType))
+ {
+ innerType = lowerStructuredBufferType(structuredBufferType).structType;
+ storageClass = SpvStorageClassStorageBuffer;
+ needLoad = false;
+ }
+ }
+
+ auto innerElementType = innerType;
+ if (arraySize)
+ {
+ innerType = builder.getArrayType(innerType, arraySize);
+ }
// Make a pointer type of storageClass.
builder.setInsertBefore(inst);
@@ -173,17 +309,27 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
if (needLoad)
{
// Insert an explicit load at each use site.
- List<IRUse*> uses;
- for (auto use = inst->firstUse; use; use = use->nextUse)
- {
- uses.add(use);
- }
- for (auto use : uses)
- {
- builder.setInsertBefore(use->getUser());
- auto loadedValue = builder.emitLoad(inst);
- use->set(loadedValue);
- }
+ traverseUses(inst, [&](IRUse* use)
+ {
+ insertLoadAtLatestLocation(inst, use);
+ });
+ }
+ else if (arraySize)
+ {
+ traverseUses(inst, [&](IRUse* use)
+ {
+ auto user = use->getUser();
+ if (auto getElement = as<IRGetElement>(user))
+ {
+ // For array resources, getElement(r, index) ==> getElementPtr(r, index).
+ IRBuilder builder(getElement);
+ builder.setInsertBefore(user);
+ auto newAddr = builder.emitElementAddress(builder.getPtrType(kIROp_PtrType, innerElementType, storageClass), inst, getElement->getIndex());
+ user->replaceUsesWith(newAddr);
+ user->removeAndDeallocate();
+ return;
+ }
+ });
}
}
processGlobalVar(inst);
@@ -535,53 +681,6 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
}
}
- void processStructuredBufferType(IRHLSLStructuredBufferTypeBase * inst)
- {
- auto layoutRules = getTypeLayoutRuleForBuffer(m_sharedContext->m_targetRequest, inst);
-
- IRBuilder builder(m_sharedContext->m_irModule);
-
- builder.setInsertBefore(inst);
- auto elementType = inst->getElementType();
- IRSizeAndAlignment elementSize;
- getSizeAndAlignment(layoutRules, elementType, &elementSize);
- elementSize = layoutRules->alignCompositeElement(elementSize);
-
- const auto arrayType = builder.getUnsizedArrayType(inst->getElementType(), builder.getIntValue(builder.getIntType(), elementSize.getStride()));
- const auto structType = builder.createStructType();
- const auto arrayKey = builder.createStructKey();
- builder.createStructField(structType, arrayKey, arrayType);
- IRSizeAndAlignment structSize;
- getSizeAndAlignment(layoutRules, structType, &structSize);
-
- const auto ptrType = builder.getPtrType(kIROp_PtrType, structType, SpvStorageClassStorageBuffer);
-
- StringBuilder nameSb;
- switch (inst->getOp())
- {
- case kIROp_HLSLRWStructuredBufferType:
- nameSb << "RWStructuredBuffer";
- break;
- case kIROp_HLSLAppendStructuredBufferType:
- nameSb << "AppendStructuredBuffer";
- break;
- case kIROp_HLSLConsumeStructuredBufferType:
- nameSb << "ConsumeStructuredBuffer";
- break;
- case kIROp_HLSLRasterizerOrderedStructuredBufferType:
- nameSb << "RasterizerOrderedStructuredBuffer";
- break;
- default:
- nameSb << "StructuredBuffer";
- break;
- }
- builder.addNameHintDecoration(structType, nameSb.getUnownedSlice());
- builder.addDecoration(structType, kIROp_SPIRVBlockDecoration);
- inst->replaceUsesWith(ptrType);
- inst->removeAndDeallocate();
- addUsersToWorkList(ptrType);
- }
-
void duplicateMergeBlockIfNeeded(IRUse* breakBlockUse)
{
auto breakBlock = as<IRBlock>(breakBlockUse->get());
@@ -778,7 +877,20 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
void processModule()
{
- addToWorkList(m_module->getModuleInst());
+ // Process global params before anything else, so we don't generate inefficient
+ // array marshalling code for array-typed global params.
+ for (auto globalInst : m_module->getGlobalInsts())
+ {
+ if (auto globalParam = as<IRGlobalParam>(globalInst))
+ {
+ processGlobalParam(globalParam);
+ }
+ else
+ {
+ addToWorkList(globalInst);
+ }
+ }
+
while (workList.getCount() != 0)
{
IRInst* inst = workList.getLast();
@@ -815,10 +927,6 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
case kIROp_RWStructuredBufferStore:
processRWStructuredBufferStore(inst);
break;
- case kIROp_HLSLStructuredBufferType:
- case kIROp_HLSLRWStructuredBufferType:
- processStructuredBufferType(as<IRHLSLStructuredBufferTypeBase>(inst));
- break;
case kIROp_loop:
processLoop(as<IRLoop>(inst));
break;
@@ -837,6 +945,23 @@ struct SPIRVLegalizationContext : public SourceEmitterBase
}
}
+ // Translate types.
+ List<IRHLSLStructuredBufferTypeBase*> instsToProcess;
+ for (auto globalInst : m_module->getGlobalInsts())
+ {
+ if (auto t = as<IRHLSLStructuredBufferTypeBase>(globalInst))
+ {
+ instsToProcess.add(t);
+ }
+ }
+ for (auto t : instsToProcess)
+ {
+ auto lowered = lowerStructuredBufferType(t);
+ IRBuilder builder(t);
+ builder.setInsertBefore(t);
+ t->replaceUsesWith(builder.getPtrType(kIROp_PtrType, lowered.structType, SpvStorageClassStorageBuffer));
+ }
+
// SPIRV requires a dominator block to appear before dominated blocks.
// After legalizing the control flow, we need to sort our blocks to ensure this is true.
for (auto globalInst : m_module->getGlobalInsts())
diff --git a/source/slang/slang-ir-use-uninitialized-out-param.cpp b/source/slang/slang-ir-use-uninitialized-out-param.cpp
index 977876c6b..479538441 100644
--- a/source/slang/slang-ir-use-uninitialized-out-param.cpp
+++ b/source/slang/slang-ir-use-uninitialized-out-param.cpp
@@ -69,6 +69,7 @@ namespace Slang
stores.add(StoreSite{ use->getUser(), addr });
break;
case kIROp_Call:
+ case kIROp_SPIRVAsm:
// If we see a call using this address, treat it as a store.
stores.add(StoreSite{ use->getUser(), addr });
break;
diff --git a/source/slang/slang-parser.cpp b/source/slang/slang-parser.cpp
index 3a1b627bd..b270ba713 100644
--- a/source/slang/slang-parser.cpp
+++ b/source/slang/slang-parser.cpp
@@ -6071,6 +6071,32 @@ namespace Slang
parser->ReadToken(TokenType::Identifier);
return varExpr;
}
+ case TokenType::Scope:
+ {
+ parser->ReadToken(TokenType::Scope);
+ VarExpr* varExpr = parser->astBuilder->create<VarExpr>();
+ varExpr->scope = parser->currentScope;
+ while (varExpr->scope && !as<ModuleDecl>(varExpr->scope->containerDecl))
+ varExpr->scope = varExpr->scope->parent;
+ parser->FillPosition(varExpr);
+
+ auto nameToken = peekToken(parser);
+ auto nameAndLoc = NameLoc(nameToken);
+ varExpr->name = nameAndLoc.name;
+ if (nameToken.type == TokenType::CompletionRequest)
+ {
+ parser->hasSeenCompletionToken = true;
+ }
+ else
+ {
+ parser->ReadToken(TokenType::Identifier);
+ if (peekTokenType(parser) == TokenType::OpLess)
+ {
+ return maybeParseGenericApp(parser, varExpr);
+ }
+ }
+ return varExpr;
+ }
case TokenType::Identifier:
{
// We will perform name lookup here so that we can find syntax
diff --git a/source/slang/slang-spirv-val.cpp b/source/slang/slang-spirv-val.cpp
index d41f0e8cc..a6bc29306 100644
--- a/source/slang/slang-spirv-val.cpp
+++ b/source/slang/slang-spirv-val.cpp
@@ -53,6 +53,8 @@ SlangResult debugValidateSPIRV(const List<uint8_t>& spirv)
// Set up our process
CommandLine commandLine;
commandLine.m_executableLocation.setName("spirv-val");
+ commandLine.addArg("--target-env");
+ commandLine.addArg("vulkan1.2");
RefPtr<Process> p;
const auto createResult = Process::create(commandLine, 0, p);
// If we failed to even start the process, then validation isn't available
diff --git a/tests/expected-failure.txt b/tests/expected-failure.txt
index 9a685b5c7..3e158e2d1 100644
--- a/tests/expected-failure.txt
+++ b/tests/expected-failure.txt
@@ -1,24 +1,14 @@
tests/autodiff/global-param-hoisting.slang.1 (vk)
-tests/bugs/atomic-coerce.slang.1 (vk)
tests/bugs/buffer-swizzle-store.slang.1 (vk)
-tests/bugs/byte-address-buffer-interlocked-add-f32.slang (vk)
tests/bugs/gh-3075.slang.2 (vk)
tests/bugs/ray-query-in-generic.slang.1 (vk)
-tests/compute/buffer-layout.slang.2 (vk)
tests/compute/half-rw-texture-convert.slang.4 (vk)
tests/compute/half-rw-texture-convert2.slang.4 (vk)
-tests/compute/loop-unroll.slang.5 (vk)
tests/compute/ray-tracing-inline.slang.1 (vk)
tests/compute/rw-texture-simple.slang.4 (vk)
tests/compute/texture-sample-grad-offset-clamp.slang (vk)
tests/compute/texture-simple.slang.4 (vk)
tests/compute/texture-simpler.slang (vk)
-tests/hlsl/glsl-matrix-layout.slang (vk)
tests/language-feature/constants/constexpr-loop.slang.1 (vk)
tests/optimization/func-resource-result/func-resource-result-complex.slang.1 (vk)
-tests/slang-extension/atomic-float-byte-address-buffer.slang.2 (vk)
-tests/slang-extension/atomic-int64-byte-address-buffer.slang.4 (vk)
-tests/slang-extension/atomic-min-max-u64-byte-address-buffer.slang.4 (vk)
-tests/slang-extension/cas-int64-byte-address-buffer.slang.4 (vk)
-tests/slang-extension/exchange-int64-byte-address-buffer.slang.4 (vk)
tests/type/texture-sampler/texture-sampler-2d.slang (vk)
diff --git a/tests/hlsl-intrinsic/byte-address-buffer-atomics.slang b/tests/hlsl-intrinsic/byte-address-buffer-atomics.slang
new file mode 100644
index 000000000..f133bb372
--- /dev/null
+++ b/tests/hlsl-intrinsic/byte-address-buffer-atomics.slang
@@ -0,0 +1,64 @@
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK): -dx12 -use-dxil -output-using-type
+//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK):-vk -output-using-type
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cuda -output-using-type
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+//TEST_INPUT:set bbuffer = ubuffer(data=[0 0 0 0])
+RWByteAddressBuffer bbuffer;
+
+// Exercises the RWByteAddressBuffer Interlocked* methods, both the form that discards the
+// previous value and the form that returns it through `originalValue`.
+// All four words of `bbuffer` start at 0. Words 0..3 of `outputBuffer` receive the final
+// contents of `bbuffer`; words 4..11 receive the previous values reported by the atomics.
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ uint originalValue;
+ // Word at offset 0: 0 -> 1.
+ bbuffer.InterlockedAdd(0, 1);
+
+ // 1 -> 2, previous value 1.
+ bbuffer.InterlockedAdd(0, 1, originalValue);
+ outputBuffer[4] = originalValue;
+
+ // max(2, 3) = 3, then max(3, 4) = 4 with previous value 3.
+ bbuffer.InterlockedMax(0, 3);
+ bbuffer.InterlockedMax(0, 4, originalValue);
+ outputBuffer[5] = originalValue;
+
+ // min(4, 2) = 2, then min(2, 2) = 2 with previous value 2.
+ bbuffer.InterlockedMin(0, 2);
+ bbuffer.InterlockedMin(0, 2, originalValue);
+ outputBuffer[6] = originalValue;
+
+ // 2 | 1 = 3, then 3 | 1 = 3 with previous value 3.
+ bbuffer.InterlockedOr(0, 1);
+ bbuffer.InterlockedOr(0, 1, originalValue);
+ outputBuffer[7] = originalValue;
+
+ // 3 ^ 4 = 7, then 7 ^ 4 = 3 with previous value 7.
+ bbuffer.InterlockedXor(0, 4);
+ bbuffer.InterlockedXor(0, 4, originalValue);
+ outputBuffer[8] = originalValue;
+
+ // 3 & 7 = 3 both times, previous value 3.
+ bbuffer.InterlockedAnd(0, 7);
+ bbuffer.InterlockedAnd(0, 7, originalValue);
+ outputBuffer[9] = originalValue;
+
+ // Word at offset 4 is 0 and matches the comparand 0, so it becomes 1; previous value 0.
+ bbuffer.InterlockedCompareExchange(4, 0, 1, originalValue);
+ outputBuffer[10] = originalValue;
+
+ // Word at offset 8: 0 -> 3, previous value 0.
+ bbuffer.InterlockedExchange(8, 3, originalValue);
+ outputBuffer[11] = originalValue;
+
+ // Word at offset 12 is 0 and matches the comparand 0, so it becomes 3.
+ bbuffer.InterlockedCompareStore(12, 0, 3);
+
+ // Expected outputBuffer[0..3]: the final words of bbuffer at offsets 0, 4, 8 and 12.
+ // CHECK: 3
+ // CHECK: 1
+ // CHECK: 3
+ // CHECK: 3
+ outputBuffer[0] = bbuffer.Load(0);
+ outputBuffer[1] = bbuffer.Load(4);
+ outputBuffer[2] = bbuffer.Load(8);
+ outputBuffer[3] = bbuffer.Load(12);
+ // Expected outputBuffer[4..11]: the previous values captured above, in the order written.
+ // CHECK: 1
+ // CHECK: 3
+ // CHECK: 2
+ // CHECK: 3
+ // CHECK: 7
+ // CHECK: 3
+ // CHECK: 0
+ // CHECK: 0
+} \ No newline at end of file