diff options
| author | Yong He <yonghe@outlook.com> | 2023-08-31 13:49:40 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-08-31 13:49:40 -0700 |
| commit | cc412af89e54b04ead508ca84825a18d001b92d0 (patch) | |
| tree | da7bb020c494cc4dc62a9c641fb88d7b0b9f89f2 | |
| parent | 1996785e1c5d76254a102c1ec0df5dd7e2e4d68a (diff) | |
Add SPIRV atomics intrinsics and fix buffer layout lowering. (#3170)
* Fix atomics intrinsics and buffer layout lowering.
* Fix.
* Add more test.
* Fix.
---------
Co-authored-by: Yong He <yhe@nvidia.com>
| -rw-r--r-- | source/compiler-core/slang-dxc-compiler.cpp | 4 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 1075 | ||||
| -rw-r--r-- | source/slang/slang-check-expr.cpp | 3 | ||||
| -rw-r--r-- | source/slang/slang-emit-spirv.cpp | 1 | ||||
| -rw-r--r-- | source/slang/slang-ir-inline.cpp | 30 | ||||
| -rw-r--r-- | source/slang/slang-ir-layout.cpp | 23 | ||||
| -rw-r--r-- | source/slang/slang-ir-layout.h | 1 | ||||
| -rw-r--r-- | source/slang/slang-ir-lower-buffer-element-type.cpp | 41 | ||||
| -rw-r--r-- | source/slang/slang-ir-specialize-function-call.cpp | 4 | ||||
| -rw-r--r-- | source/slang/slang-ir-specialize-resources.cpp | 3 | ||||
| -rw-r--r-- | source/slang/slang-ir-specialize-target-switch.cpp | 10 | ||||
| -rw-r--r-- | source/slang/slang-ir-spirv-legalize.cpp | 255 | ||||
| -rw-r--r-- | source/slang/slang-ir-use-uninitialized-out-param.cpp | 1 | ||||
| -rw-r--r-- | source/slang/slang-parser.cpp | 26 | ||||
| -rw-r--r-- | source/slang/slang-spirv-val.cpp | 2 | ||||
| -rw-r--r-- | tests/expected-failure.txt | 10 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/byte-address-buffer-atomics.slang | 64 |
17 files changed, 1264 insertions, 289 deletions
diff --git a/source/compiler-core/slang-dxc-compiler.cpp b/source/compiler-core/slang-dxc-compiler.cpp index 9428d0d26..6956a3627 100644 --- a/source/compiler-core/slang-dxc-compiler.cpp +++ b/source/compiler-core/slang-dxc-compiler.cpp @@ -26,8 +26,12 @@ // Enable DXIL by default unless told not to #ifndef SLANG_ENABLE_DXIL_SUPPORT +#if SLANG_APPLE_FAMILY +# define SLANG_ENABLE_DXIL_SUPPORT 0 +#else # define SLANG_ENABLE_DXIL_SUPPORT 1 #endif +#endif // Enable calling through to `dxc` to // generate code on Windows. diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 4c4c7cd59..d56c4ffcd 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -118,10 +118,22 @@ struct ByteAddressBuffer // We have separate int/float implementations, as the float version requires some specific extensions // https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_atomic_float.txt -__target_intrinsic(glsl, "atomicAdd($0, $1)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_float) -float __atomicAdd(__ref float value, float amount); +float __atomicAdd(__ref float value, float amount) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicAdd($0, $1)"; + case spirv: + return spirv_asm + { + OpExtension "SPV_EXT_shader_atomic_float_add"; + OpCapability AtomicFloat32AddEXT; + result:$$float = OpAtomicFAddEXT &value Device None $amount + }; + } +} // Helper for hlsl, using NVAPI __target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)") @@ -137,23 +149,66 @@ void __atomicAdd(RWByteAddressBuffer buf, uint offset, uint64_t value, out uint6 // Int versions require glsl 4.30 // https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml -__target_intrinsic(glsl, "atomicAdd($0, $1)") __glsl_version(430) -int __atomicAdd(__ref int value, int amount); +int __atomicAdd(__ref int value, int amount) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicAdd($0, $1)"; + case spirv: + return spirv_asm + { + result:$$int = OpAtomicIAdd &value Device None $amount; + }; + } +} -__target_intrinsic(glsl, "atomicAdd($0, $1)") __glsl_version(430) -uint __atomicAdd(__ref uint value, uint amount); +uint __atomicAdd(__ref uint value, uint amount) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicAdd($0, $1)"; + case spirv: + return spirv_asm + { + result:$$uint = OpAtomicIAdd &value Device None $amount; + }; + } +} -__target_intrinsic(glsl, "atomicAdd($0, $1)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) -int64_t __atomicAdd(__ref int64_t value, int64_t amount); +int64_t __atomicAdd(__ref int64_t value, int64_t amount) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicAdd($0, $1)"; + case spirv: + return spirv_asm + { + OpCapability Int64Atomics; + result:$$int64_t = OpAtomicIAdd &value Device None $amount + }; + } +} __target_intrinsic(glsl, "atomicAdd($0, $1)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) -uint64_t __atomicAdd(__ref uint64_t value, uint64_t amount); +uint64_t __atomicAdd(__ref uint64_t value, uint64_t amount) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicAdd($0, $1)"; + case spirv: + return spirv_asm + { + OpCapability Int64Atomics; + result:$$uint64_t = OpAtomicIAdd &value Device None $amount + }; + } +} // Cas - Compare and swap @@ -169,15 +224,37 @@ void __cas(RWByteAddressBuffer buf, uint offset, in int64_t compare_value, in in __target_intrinsic(hlsl, "$0.InterlockedCompareExchange64($1, $2, $3, $4)") void __cas(RWByteAddressBuffer buf, uint offset, in uint64_t compare_value, in uint64_t value, out uint64_t original_value); -__target_intrinsic(glsl, "atomicCompSwap($0, $1, $2)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) -uint64_t __cas(__ref int64_t ioValue, int64_t compareValue, int64_t newValue); +int64_t __cas(__ref int64_t ioValue, int64_t compareValue, int64_t newValue) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicCompSwap($0, $1, $2)"; + case spirv: + return spirv_asm + { + OpCapability Int64Atomics; + result:$$int64_t = OpAtomicCompareExchange &ioValue Device None None $newValue $compareValue + }; + } +} -__target_intrinsic(glsl, "atomicCompSwap($0, $1, $2)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) -uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue); +uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicCompSwap($0, $1, $2)"; + case spirv: + return spirv_asm + { + OpCapability Int64Atomics; + result:$$uint64_t = OpAtomicCompareExchange &ioValue Device None None $newValue $compareValue + }; + } +} // Max @@ -185,10 +262,21 @@ __target_intrinsic(hlsl, "NvInterlockedMaxUint64($0, $1, $2)") [__requiresNVAPI] uint2 __atomicMax(RWByteAddressBuffer buf, uint offset, uint2 value); -__target_intrinsic(glsl, "atomicMax($0, $1)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) -uint64_t __atomicMax(__ref uint64_t ioValue, uint64_t value); +uint64_t __atomicMax(__ref uint64_t ioValue, uint64_t value) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicMax($0, $1)"; + case spirv: + return spirv_asm + { + OpCapability Int64Atomics; + result:$$uint64_t = OpAtomicUMax &ioValue Device None $value + }; + } +} // Min @@ -196,10 +284,21 @@ __target_intrinsic(hlsl, "NvInterlockedMinUint64($0, $1, $2)") [__requiresNVAPI] uint2 __atomicMin(RWByteAddressBuffer buf, uint offset, uint2 value); -__target_intrinsic(glsl, "atomicMin($0, $1)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) -uint64_t __atomicMin(__ref uint64_t ioValue, uint64_t value); +uint64_t __atomicMin(__ref uint64_t ioValue, uint64_t value) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicMin($0, $1)"; + case spirv: + return spirv_asm + { + OpCapability Int64Atomics; + result:$$uint64_t = OpAtomicUMin &ioValue Device None $value + }; + } +} // And @@ -207,10 +306,21 @@ __target_intrinsic(hlsl, "NvInterlockedAndUint64($0, $1, $2)") [__requiresNVAPI] uint2 __atomicAnd(RWByteAddressBuffer buf, uint offset, uint2 value); -__target_intrinsic(glsl, "atomicAnd($0, $1)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) -uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value); +uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicAnd($0, $1)"; + case spirv: + return spirv_asm + { + OpCapability Int64Atomics; + result:$$uint64_t = OpAtomicAnd &ioValue Device None $value + }; + } +} // Or @@ -218,10 +328,21 @@ __target_intrinsic(hlsl, "NvInterlockedOrUint64($0, $1, $2)") [__requiresNVAPI] uint2 __atomicOr(RWByteAddressBuffer buf, uint offset, uint2 value); -__target_intrinsic(glsl, "atomicOr($0, $1)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) -uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value); +uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicOr($0, $1)"; + case spirv: + return spirv_asm + { + OpCapability Int64Atomics; + result:$$uint64_t = OpAtomicOr &ioValue Device None $value + }; + } +} // Xor @@ -229,10 +350,21 @@ __target_intrinsic(hlsl, "NvInterlockedXorUint64($0, $1, $2)") [__requiresNVAPI] uint2 __atomicXor(RWByteAddressBuffer buf, uint offset, uint2 value); -__target_intrinsic(glsl, "atomicXor($0, $1)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) -uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value); +uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicXor($0, $1)"; + case spirv: + return spirv_asm + { + OpCapability Int64Atomics; + result:$$uint64_t = OpAtomicXor &ioValue Device None $value + }; + } +} // Exchange @@ -240,10 +372,21 @@ __target_intrinsic(hlsl, "NvInterlockedExchangeUint64($0, $1, $2)") [__requiresNVAPI] uint2 __atomicExchange(RWByteAddressBuffer buf, uint offset, uint2 value); -__target_intrinsic(glsl, "atomicExchange($0, $1)") __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) -uint64_t __atomicExchange(__ref uint64_t ioValue, uint64_t value); +uint64_t __atomicExchange(__ref uint64_t ioValue, uint64_t value) +{ + __target_switch + { + case glsl: __intrinsic_asm "atomicExchange($0, $1)"; + case spirv: + return spirv_asm + { + OpCapability Int64Atomics; + result:$$uint64_t = OpAtomicExchange &ioValue Device None $value + }; + } +} // Conversion between uint64_t and uint2 @@ -441,50 +584,60 @@ ${{{{ // F32 Add - __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))") __cuda_sm_version(2.0) - __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt<float>($1), $2))") [__requiresNVAPI] - void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue); - - __specialized_for_target(glsl) void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue) { - RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this); - originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd); + __target_switch + { + case hlsl: __intrinsic_asm "($3 = NvInterlockedAddFp32($0, $1, $2))"; + case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<float>($1), $2))"; + case glsl: + case spirv: + { + let buf = __getEquivalentStructuredBuffer<float>(this); + originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd); + return; + } + } } // Without returning original value - __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))") [__requiresNVAPI] __cuda_sm_version(2.0) - __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<float>($1), $2)") - void InterlockedAddF32(uint byteAddress, float valueToAdd); - - __specialized_for_target(glsl) void InterlockedAddF32(uint byteAddress, float valueToAdd) { - RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this); - __atomicAdd(buf[byteAddress / 4], valueToAdd); + __target_switch + { + case hlsl: __intrinsic_asm "(NvInterlockedAddFp32($0, $1, $2))"; + case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<float>($1), $2)"; + case glsl: + case spirv: + { + let buf = __getEquivalentStructuredBuffer<float>(this); + __atomicAdd(buf[byteAddress / 4], valueToAdd); + return; + } + } } // Int64 Add __cuda_sm_version(6.0) - __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt<uint64_t>($1), $2))") - void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue); - - __specialized_for_target(hlsl) - void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue) - { - outOriginalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd))); - } - - __specialized_for_target(glsl) void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue) { - RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this); - originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd); + __target_switch + { + case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint64_t>($1), $2))"; + case hlsl: + originalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd))); + case glsl: + case spirv: + { + let buf = __getEquivalentStructuredBuffer<int64_t>(this); + originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd); + } + } } // Without returning original value @@ -499,9 +652,10 @@ ${{{{ } __specialized_for_target(glsl) + __specialized_for_target(spirv) void InterlockedAddI64(uint byteAddress, int64_t valueToAdd) { - RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this); + let buf = __getEquivalentStructuredBuffer<int64_t>(this); __atomicAdd(buf[byteAddress / 8], valueToAdd); } @@ -517,9 +671,10 @@ ${{{{ } __specialized_for_target(glsl) + __specialized_for_target(spirv) void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) { - RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + let buf = __getEquivalentStructuredBuffer<uint64_t>(this); outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); } @@ -533,9 +688,10 @@ ${{{{ uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMax(this, byteAddress, __asuint2(value))); } __specialized_for_target(glsl) + __specialized_for_target(spirv) uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) { - RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + let buf = __getEquivalentStructuredBuffer<uint64_t>(this); return __atomicMax(buf[byteAddress / 8], value); } @@ -549,9 +705,10 @@ ${{{{ uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMin(this, byteAddress, __asuint2(value))); } __specialized_for_target(glsl) + __specialized_for_target(spirv) uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) { - RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + let buf = __getEquivalentStructuredBuffer<uint64_t>(this); return __atomicMin(buf[byteAddress / 8], value); } @@ -564,9 +721,10 @@ ${{{{ uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicAnd(this, byteAddress, __asuint2(value))); } __specialized_for_target(glsl) + __specialized_for_target(spirv) uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) { - RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + let buf = __getEquivalentStructuredBuffer<uint64_t>(this); return __atomicAnd(buf[byteAddress / 8], value); } @@ -579,9 +737,10 @@ ${{{{ uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicOr(this, byteAddress, __asuint2(value))); } __specialized_for_target(glsl) + __specialized_for_target(spirv) uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) { - RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + let buf = __getEquivalentStructuredBuffer<uint64_t>(this); return __atomicOr(buf[byteAddress / 8], value); } @@ -594,9 +753,10 @@ ${{{{ uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicXor(this, byteAddress, __asuint2(value))); } __specialized_for_target(glsl) + __specialized_for_target(spirv) uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) { - RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + let buf = __getEquivalentStructuredBuffer<uint64_t>(this); return __atomicXor(buf[byteAddress / 8], value); } @@ -609,9 +769,10 @@ ${{{{ uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicExchange(this, byteAddress, __asuint2(value))); } __specialized_for_target(glsl) + __specialized_for_target(spirv) uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value) { - RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + let buf = __getEquivalentStructuredBuffer<uint64_t>(this); return __atomicExchange(buf[byteAddress / 8], value); } @@ -622,9 +783,10 @@ ${{{{ __atomicAdd(this, byteAddress, valueToAdd, outOriginalValue); } __specialized_for_target(glsl) + __specialized_for_target(spirv) void InterlockedAdd64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue) { - RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this); + let buf = __getEquivalentStructuredBuffer<int64_t>(this); originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd); } __specialized_for_target(hlsl) @@ -633,9 +795,10 @@ ${{{{ __atomicAdd(this, byteAddress, valueToAdd, outOriginalValue); } __specialized_for_target(glsl) + __specialized_for_target(spirv) void InterlockedAdd64(uint byteAddress, uint64_t valueToAdd, out uint64_t originalValue) { - RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + let buf = __getEquivalentStructuredBuffer<uint64_t>(this); originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd); } __specialized_for_target(hlsl) @@ -644,9 +807,10 @@ ${{{{ __cas(this, byteAddress, compareValue, value, outOriginalValue); } __specialized_for_target(glsl) + __specialized_for_target(spirv) void InterlockedCompareExchange64(uint byteAddress, int64_t compareValue, int64_t value, out int64_t outOriginalValue) { - RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this); + let buf = __getEquivalentStructuredBuffer<int64_t>(this); outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); } __specialized_for_target(hlsl) @@ -655,101 +819,251 @@ ${{{{ __cas(this, byteAddress, compareValue, value, outOriginalValue); } __specialized_for_target(glsl) + __specialized_for_target(spirv) void InterlockedCompareExchange64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) { - RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + let buf = __getEquivalentStructuredBuffer<uint64_t>(this); outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); } ${{{{ - } + } // endif (type == RWByteAddressBuffer) }}}} // Added operations: - - __target_intrinsic(glsl, "($3 = atomicAdd($0._data[$1/4], $2))") void InterlockedAdd( UINT dest, UINT value, - out UINT original_value); + out UINT original_value) + { + __target_switch + { + case glsl: __intrinsic_asm "($3 = atomicAdd($0._data[$1/4], $2))"; + case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint32_t>($1), $2))"; + case hlsl: __intrinsic_asm ".InterlockedAdd"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedAdd(buf[dest / 4], value, original_value); + } + } - __target_intrinsic(glsl, "atomicAdd($0._data[$1/4], $2)") void InterlockedAdd( UINT dest, - UINT value); + UINT value) + { + __target_switch + { + case glsl: __intrinsic_asm "atomicAdd($0._data[$1/4], $2)"; + case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<uint32_t>($1), $2)"; + case hlsl: __intrinsic_asm ".InterlockedAdd"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedAdd(buf[dest / 4], value); + } + } - __target_intrinsic(glsl, "($3 = atomicAnd($0._data[$1/4], $2))") void InterlockedAnd( UINT dest, UINT value, - out UINT original_value); + out UINT original_value) + { + __target_switch + { + case glsl: __intrinsic_asm "$3 = atomicAnd($0._data[$1/4], $2)"; + case cuda: __intrinsic_asm "(*$3 = atomicAnd($0._getPtrAt<uint32_t>($1), $2))"; + case hlsl: __intrinsic_asm ".InterlockedAnd"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedAnd(buf[dest / 4], value, original_value); + } + } - __target_intrinsic(glsl, "atomicAnd($0._data[$1/4], $2)") void InterlockedAnd( UINT dest, - UINT value); + UINT value) + { + __target_switch + { + case glsl: __intrinsic_asm "atomicAnd($0._data[$1/4], $2)"; + case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt<uint32_t>($1), $2)"; + case hlsl: __intrinsic_asm ".InterlockedAnd"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedAnd(buf[dest / 4], value); + } + } - __target_intrinsic(glsl, "($4 = atomicCompSwap($0._data[$1/4], $2, $3))") void InterlockedCompareExchange( UINT dest, UINT compare_value, UINT value, - out UINT original_value); + out UINT original_value) + { + __target_switch + { + case glsl: __intrinsic_asm "($4 = atomicCompSwap($0._data[$1/4], $2, $3))"; + case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3))"; + case hlsl: __intrinsic_asm ".InterlockedCompareExchange"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedCompareExchange(buf[dest / 4], compare_value, value, original_value); + } + } - __target_intrinsic(glsl, "atomicCompSwap($0._data[$1/4], $2, $3)") void InterlockedCompareStore( UINT dest, UINT compare_value, - UINT value); + UINT value) + { + __target_switch + { + case glsl: __intrinsic_asm "atomicCompSwap($0._data[$1/4], $2, $3)"; + case cuda: __intrinsic_asm "atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3)"; + case hlsl: __intrinsic_asm ".InterlockedCompareStore"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedCompareStore(buf[dest / 4], compare_value, value); + } + } - __target_intrinsic(glsl, "($3 = atomicExchange($0._data[$1/4], $2))") void InterlockedExchange( UINT dest, UINT value, - out UINT original_value); + out UINT original_value) + { + __target_switch + { + case glsl: __intrinsic_asm "($3 = atomicExchange($0._data[$1/4], $2))"; + case cuda: __intrinsic_asm "(*$3 = atomicExch($0._getPtrAt<uint32_t>($1), $2))"; + case hlsl: __intrinsic_asm ".InterlockedExchange"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedExchange(buf[dest / 4], value, original_value); + } + } - __target_intrinsic(glsl, "($3 = atomicMax($0._data[$1/4], $2))") void InterlockedMax( UINT dest, UINT value, - out UINT original_value); + out UINT original_value) + { + __target_switch + { + case glsl: __intrinsic_asm "($3 = atomicMax($0._data[$1/4], $2))"; + case cuda: __intrinsic_asm "(*$3 = atomicMax($0._getPtrAt<uint32_t>($1), $2))"; + case hlsl: __intrinsic_asm ".InterlockedMax"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedMax(buf[dest / 4], value, original_value); + } + } - __target_intrinsic(glsl, "atomicMax($0._data[$1/4], $2)") void InterlockedMax( UINT dest, - UINT value); + UINT value) + { + __target_switch + { + case glsl: __intrinsic_asm "atomicMax($0._data[$1/4], $2)"; + case cuda: __intrinsic_asm "atomicMax($0._getPtrAt<uint32_t>($1), $2)"; + case hlsl: __intrinsic_asm ".InterlockedMax"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedMax(buf[dest / 4], value); + } + } - __target_intrinsic(glsl, "($3 = atomicMin($0._data[$1/4], $2))") void InterlockedMin( UINT dest, UINT value, - out UINT original_value); + out UINT original_value) + { + __target_switch + { + case glsl: __intrinsic_asm "($3 = atomicMin($0._data[$1/4], $2))"; + case cuda: __intrinsic_asm "(*$3 = atomicMin($0._getPtrAt<uint32_t>($1), $2))"; + case hlsl: __intrinsic_asm ".InterlockedMin"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedMin(buf[dest / 4], value, original_value); + } + } - __target_intrinsic(glsl, "atomicMin($0._data[$1/4], $2)") void InterlockedMin( UINT dest, - UINT value); + UINT value) + { + __target_switch + { + case glsl: __intrinsic_asm "atomicMin($0._data[$1/4], $2)"; + case cuda: __intrinsic_asm "atomicMin($0._getPtrAt<uint32_t>($1), $2)"; + case hlsl: __intrinsic_asm ".InterlockedMin"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedMin(buf[dest / 4], value); + } + } - __target_intrinsic(glsl, "($3 = atomicOr($0._data[$1/4], $2))") void InterlockedOr( UINT dest, UINT value, - out UINT original_value); + out UINT original_value) + { + __target_switch + { + case glsl: __intrinsic_asm "($3 = atomicOr($0._data[$1/4], $2))"; + case cuda: __intrinsic_asm "(*$3 = atomicOr($0._getPtrAt<uint32_t>($1), $2))"; + case hlsl: __intrinsic_asm ".InterlockedOr"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedOr(buf[dest / 4], value, original_value); + } + } - __target_intrinsic(glsl, "atomicOr($0._data[$1/4], $2)") void InterlockedOr( UINT dest, - UINT value); + UINT value) + { + __target_switch + { + case glsl: __intrinsic_asm "atomicOr($0._data[$1/4], $2)"; + case cuda: __intrinsic_asm "atomicOr($0._getPtrAt<uint32_t>($1), $2)"; + case hlsl: __intrinsic_asm ".InterlockedOr"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedOr(buf[dest / 4], value); + } + } - __target_intrinsic(glsl, "($3 = atomicXor($0._data[$1/4], $2))") void InterlockedXor( UINT dest, UINT value, - out UINT original_value); + out UINT original_value) + { + __target_switch + { + case glsl: __intrinsic_asm "($3 = atomicXor($0._data[$1/4], $2))"; + case cuda: __intrinsic_asm "(*$3 = atomicXor($0._getPtrAt<uint32_t>($1), $2))"; + case hlsl: __intrinsic_asm ".InterlockedXor"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedXor(buf[dest / 4], value, original_value); + } + } - __target_intrinsic(glsl, "atomicXor($0._data[$1/4], $2)") void InterlockedXor( UINT dest, - UINT value); + UINT value) + { + __target_switch + { + case glsl: __intrinsic_asm "atomicXor($0._data[$1/4], $2)"; + case cuda: __intrinsic_asm "atomicXor($0._getPtrAt<uint32_t>($1), $2)"; + case hlsl: __intrinsic_asm ".InterlockedXor"; + case spirv: + let buf = __getEquivalentStructuredBuffer<uint>(this); + ::InterlockedXor(buf[dest / 4], value); + } + } __target_intrinsic(hlsl) [ForceInline] @@ -2325,7 +2639,7 @@ __target_intrinsic(spirv, "OpFConvert resultType resultId _0") [__readNone] vector<float16_t, N> f32tof16_(vector<float, N> value) { - VECTOR_MAP_UNARY(uint, N, f32tof16, value); + VECTOR_MAP_UNARY(float16_t, N, f32tof16, value); } // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! @@ -2696,132 +3010,507 @@ void GroupMemoryBarrierWithGroupSync() // Atomics -__target_intrinsic(glsl, "$atomicAdd($A, $1)") -__target_intrinsic(cuda, "atomicAdd($0, $1)") -void InterlockedAdd(__ref int dest, int value); +__glsl_version(430) +void InterlockedAdd(__ref int dest, int value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedAdd"; + case cuda: __intrinsic_asm "atomicAdd($0, $1)"; + case glsl: __intrinsic_asm "$atomicAdd($A, $1)"; + case spirv: + spirv_asm + { + result:$$int = OpAtomicIAdd &dest Device None $value + }; + } +} -__target_intrinsic(glsl, "$atomicAdd($A, $1)") -__target_intrinsic(cuda, "atomicAdd((uint*)$0, $1)") -void InterlockedAdd(__ref uint dest, uint value); +__glsl_version(430) +void InterlockedAdd(__ref uint dest, uint value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedAdd"; + case cuda: __intrinsic_asm "atomicAdd((int*)$0, $1)"; + case glsl: __intrinsic_asm "$atomicAdd($A, $1)"; + case spirv: + spirv_asm + { + result:$$uint = OpAtomicIAdd &dest Device None $value + }; + } +} -__target_intrinsic(glsl, "$atomicAdd($A, $1)") -__target_intrinsic(cuda, "atomicAdd((uint*)$0, $1)") -void InterlockedAdd(__ref uint dest, int value); +[ForceInline] +void InterlockedAdd(__ref uint dest, int value) +{ + InterlockedAdd(dest, (uint)value); +} -__target_intrinsic(glsl, "($2 = $atomicAdd($A, $1))") -__target_intrinsic(cuda, "(*$2 = atomicAdd($0, $1))") -void InterlockedAdd(__ref int dest, int value, out int original_value); +__glsl_version(430) +void InterlockedAdd(__ref int dest, int value, out int original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedAdd"; + case cuda: __intrinsic_asm "(*$2 = atomicAdd($0, $1))"; + case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))"; + case spirv: + spirv_asm + { + %original:$$int = OpAtomicIAdd &dest Device None $value; + OpStore &original_value %original + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicAdd($A, $1))") -__target_intrinsic(cuda, "(*$2 = (uint)atomicAdd((uint*)$0, $1))") -__target_intrinsic(spirv, "%old = OpAtomicIAdd _type(uint) resultId _0" - "const(int, ScopeDevice) const(int, MemorySemanticsMaskNone) _1;" - "OpStore _2 %old;") -void InterlockedAdd(__ref uint dest, uint value, out uint original_value); +__glsl_version(430) +void InterlockedAdd(__ref uint dest, uint value, out uint original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedAdd"; + case cuda: __intrinsic_asm "(*$2 = (uint)atomicAdd((int*)$0, $1))"; + case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))"; + case spirv: + spirv_asm + { + %original:$$uint = OpAtomicIAdd &dest Device None $value; + OpStore &original_value %original + }; + } +} -__target_intrinsic(glsl, "$atomicAnd($A, $1)") -__target_intrinsic(cuda, "atomicAnd($0, $1)") -void InterlockedAnd(__ref int dest, int value); +__glsl_version(430) +void InterlockedAnd(__ref int dest, int value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedAnd"; + case cuda: __intrinsic_asm "atomicAnd($0, $1)"; + case glsl: __intrinsic_asm "$atomicAnd($A, $1)"; + case spirv: + spirv_asm + { + result:$$int = OpAtomicAnd &dest Device None $value; + }; + } +} -__target_intrinsic(glsl, "$atomicAnd($A, $1)") -__target_intrinsic(cuda, "atomicAnd((int*)$0, $1)") -void InterlockedAnd(__ref uint dest, uint value); +__glsl_version(430) +void InterlockedAnd(__ref uint dest, uint value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedAnd"; + case cuda: __intrinsic_asm "atomicAnd((int*)$0, $1)"; + case glsl: __intrinsic_asm "$atomicAnd($A, $1)"; + case spirv: + spirv_asm + { + result:$$uint = OpAtomicAnd &dest Device None $value; + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicAnd($A, $1))") -__target_intrinsic(cuda, "(*$2 = atomicAnd($0, $1))") -void InterlockedAnd(__ref int dest, int value, out int original_value); +__glsl_version(430) +void InterlockedAnd(__ref int dest, int value, out int original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedAnd"; + case cuda: __intrinsic_asm "(*$2 = atomicAnd($0, $1))"; + case glsl: __intrinsic_asm "($2 = $atomicAnd($A, $1))"; + case spirv: + spirv_asm + { + %original:$$int = OpAtomicAnd &dest Device None $value; + OpStore &original_value %original + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicAnd($A, $1))") -__target_intrinsic(cuda, "(*$2 = atomicAnd((int*)$0, $1))") -void InterlockedAnd(__ref uint dest, uint value, out uint original_value); +__glsl_version(430) +void InterlockedAnd(__ref uint dest, uint value, out uint original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedAnd"; + case glsl: __intrinsic_asm "($2 = atomicAnd($0, $1))"; + case cuda: __intrinsic_asm "(*$2 = atomicAnd((int*)$0, $1))"; + case spirv: + spirv_asm + { + %original:$$uint = OpAtomicAnd &dest Device None $value; + OpStore &original_value %original + }; + } +} -__target_intrinsic(glsl, "($3 = $atomicCompSwap($A, $1, $2))") -__target_intrinsic(cuda, "(*$3 = atomicCAS($0, $1, $2))") -void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value); +__glsl_version(430) +void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedCompareExchange"; + case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))"; + case cuda: __intrinsic_asm "(*$3 = atomicCAS($0, $1, $2))"; + case spirv: + spirv_asm + { + %original:$$int = OpAtomicCompareExchange &dest Device None None $value $compare_value; + OpStore &original_value %original + }; + } +} -__target_intrinsic(glsl, "($3 = $atomicCompSwap($A, $1, $2))") -__target_intrinsic(cuda, "(*$3 = (uint)atomicCAS((int*)$0, $1, $2))") -void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value); +__glsl_version(430) +void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedCompareExchange"; + case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))"; + case cuda: __intrinsic_asm "(*$3 = (uint)atomicCAS((int*)$0, $1, $2))"; + case spirv: + spirv_asm + { + %original:$$uint = OpAtomicCompareExchange &dest Device None None $value $compare_value; + OpStore &original_value %original + }; + } +} -__target_intrinsic(glsl, "$atomicCompSwap($A, $1, $2)") -__target_intrinsic(cuda, "atomicCAS($0, $1, $2)") -void InterlockedCompareStore(__ref int dest, int compare_value, int value); +__glsl_version(430) +void InterlockedCompareStore(__ref int dest, int compare_value, int value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedCompareStore"; + case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)"; + case cuda: __intrinsic_asm "atomicCAS($0, $1, $2)"; + case spirv: + spirv_asm + { + result:$$int = OpAtomicCompareExchange &dest Device None None $value $compare_value; + }; + } +} -__target_intrinsic(glsl, "$atomicCompSwap($A, $1, $2)") -__target_intrinsic(cuda, "atomicCAS((int*)$0, $1, $2)") -void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value); +__glsl_version(430) +void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedCompareStore"; + case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)"; + case cuda: __intrinsic_asm "atomicCAS((int*)$0, $1, $2)"; + case spirv: + spirv_asm + { + result:$$uint = OpAtomicCompareExchange &dest Device None None $value $compare_value; + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicExchange($A, $1))") -__target_intrinsic(cuda, "(*$2 = atomicExch($0, $1))") -void InterlockedExchange(__ref int dest, int value, out int original_value); +__glsl_version(430) +void InterlockedExchange(__ref int dest, int value, out int original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedExchange"; + case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))"; + case cuda: __intrinsic_asm "(*$2 = atomicExch($0, $1))"; + case spirv: + spirv_asm + { + %r:$$int = OpAtomicExchange &dest Device None $value; + OpStore &original_value %r + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicExchange($A, $1))") -__target_intrinsic(cuda, "(*$2 = (uint)atomicExch((int*)$0, $1))") -void InterlockedExchange(__ref uint dest, uint value, out uint original_value); +__glsl_version(430) +void InterlockedExchange(__ref uint dest, uint value, out uint original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedExchange"; + case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))"; + case cuda: __intrinsic_asm "(*$2 = (uint)atomicExch((int*)$0, $1))"; + case spirv: + spirv_asm + { + %r:$$uint = OpAtomicExchange &dest Device None $value; + OpStore &original_value %r + }; + } +} -__target_intrinsic(glsl, "$atomicMax($A, $1)") -__target_intrinsic(cuda, "atomicMax($0, $1)") -void InterlockedMax(__ref int dest, int value); +__glsl_version(430) +void InterlockedMax(__ref int dest, int value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedMax"; + case glsl: __intrinsic_asm "$atomicMax($A, $1)"; + case cuda: __intrinsic_asm "atomicMax($0, $1)"; + case spirv: + spirv_asm + { + result:$$int = OpAtomicSMax &dest Device None $value; + }; + } +} -__target_intrinsic(glsl, "$atomicMax($A, $1)") -__target_intrinsic(cuda, "atomicMax((int*)$0, $1)") -void InterlockedMax(__ref uint dest, uint value); +__glsl_version(430) +void InterlockedMax(__ref uint dest, uint value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedMax"; + case glsl: __intrinsic_asm "$atomicMax($A, $1)"; + case cuda: __intrinsic_asm "atomicMax((int*)$0, $1)"; + case spirv: + spirv_asm + { + result:$$uint = OpAtomicUMax &dest Device None $value; + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicMax($A, $1))") -__target_intrinsic(cuda, "(*$2 = atomicMax($0, $1))") -void InterlockedMax(__ref int dest, int value, out int original_value); +__glsl_version(430) +void InterlockedMax(__ref int dest, int value, out int original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedMax"; + case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))"; + case cuda: __intrinsic_asm "(*$2 = atomicMax($0, $1))"; + case spirv: + spirv_asm + { + %v:$$int = OpAtomicSMax &dest Device None $value; + OpStore &original_value %v + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicMax($A, $1))") -__target_intrinsic(cuda, "(*$2 = (uint)atomicMax((int*)$0, $1))") -void InterlockedMax(__ref uint dest, uint value, out uint original_value); +__glsl_version(430) +void InterlockedMax(__ref uint dest, uint value, out uint original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedMax"; + case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))"; + case cuda: __intrinsic_asm "(*$2 = (uint)atomicMax((int*)$0, $1))"; + case spirv: + spirv_asm + { + %v:$$uint = OpAtomicUMax &dest Device None $value; + OpStore &original_value %v + }; + } +} -__target_intrinsic(glsl, "$atomicMin($A, $1)") -__target_intrinsic(cuda, "atomicMin($0, $1)") -void InterlockedMin(__ref int dest, int value); +__glsl_version(430) +void InterlockedMin(__ref int dest, int value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedMin"; + case glsl: __intrinsic_asm "$atomicMin($A, $1)"; + case cuda: __intrinsic_asm "atomicMin($0, $1)"; + case spirv: + spirv_asm + { + result:$$int = OpAtomicSMin &dest Device None $value; + }; + } +} -__target_intrinsic(glsl, "$atomicMin($A, $1)") -__target_intrinsic(cuda, "atomicMin((int*)$0, $1)") -void InterlockedMin(__ref uint dest, uint value); +__glsl_version(430) +void InterlockedMin(__ref uint dest, uint value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedMin"; + case glsl: __intrinsic_asm "$atomicMin($A, $1)"; + case cuda: __intrinsic_asm "atomicMin((int*)$0, $1)"; + case spirv: + spirv_asm + { + result:$$uint = OpAtomicUMin &dest Device None $value; + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicMin($A, $1))") -__target_intrinsic(cuda, "(*$2 = atomicMin($0, $1))") -void InterlockedMin(__ref int dest, int value, out int original_value); +__glsl_version(430) +void InterlockedMin(__ref int dest, int value, out int original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedMin"; + case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))"; + case cuda: __intrinsic_asm "(*$2 = atomicMin($0, $1))"; + case spirv: + spirv_asm + { + %v:$$int = OpAtomicSMin &dest Device None $value; + OpStore &original_value %v + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicMin($A, $1))") -__target_intrinsic(cuda, "(*$2 = (uint)atomicMin((int*)$0, $1))") -void InterlockedMin(__ref uint dest, uint value, out uint original_value); +__glsl_version(430) +void InterlockedMin(__ref uint dest, uint value, out uint original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedMin"; + case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))"; + case cuda: __intrinsic_asm "(*$2 = (uint)atomicMin((int*)$0, $1))"; + case spirv: + spirv_asm + { + %v:$$uint = OpAtomicUMin &dest Device None $value; + OpStore &original_value %v + }; + } +} -__target_intrinsic(glsl, "$atomicOr($A, $1)") -__target_intrinsic(cuda, "atomicOr($0, $1)") -void InterlockedOr(__ref int dest, int value); +__glsl_version(430) +void InterlockedOr(__ref int dest, int value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedOr"; + case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)"; + case glsl: __intrinsic_asm "$atomicOr($A, $1)"; + case spirv: + spirv_asm + { + result:$$int = OpAtomicOr &dest Device None $value; + }; + } +} -__target_intrinsic(glsl, "$atomicOr($A, $1)") -__target_intrinsic(cuda, "atomicOr((int*)$0, $1)") -void InterlockedOr(__ref uint dest, uint value); +__glsl_version(430) +void InterlockedOr(__ref uint dest, uint value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedOr"; + case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)"; + case glsl: __intrinsic_asm "$atomicOr($A, $1)"; + case spirv: + spirv_asm + { + result:$$uint = OpAtomicOr &dest Device None $value; + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicOr($A, $1))") -__target_intrinsic(cuda, "(*$2 = atomicOr($0, $1))") -void InterlockedOr(__ref int dest, int value, out int original_value); +__glsl_version(430) +void InterlockedOr(__ref int dest, int value, out int original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedOr"; + case glsl: __intrinsic_asm "($2 = atomicOr($0, $1))"; + case cuda: __intrinsic_asm "(*$2 = atomicOr($0, $1))"; + case spirv: + spirv_asm + { + %original:$$int = OpAtomicOr &dest Device None $value; + OpStore &original_value %original + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicOr($A, $1))") -__target_intrinsic(cuda, "(*$2 = (uint)atomicOr((int*)$0, $1))") -void InterlockedOr(__ref uint dest, uint value, out uint original_value); +__glsl_version(430) +void InterlockedOr(__ref uint dest, uint value, out uint original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedOr"; + case glsl: __intrinsic_asm "($2 = atomicOr($0, $1))"; + case cuda: __intrinsic_asm "(*$2 = atomicOr((int*)$0, $1))"; + case spirv: + spirv_asm + { + %original:$$uint = OpAtomicOr &dest Device None $value; + OpStore &original_value %original + }; + } +} -__target_intrinsic(glsl, "$atomicXor($A, $1)") -__target_intrinsic(cuda, "atomicXor($0, $1)") -void InterlockedXor(__ref int dest, int value); +__glsl_version(430) +void InterlockedXor(__ref int dest, int value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedXor"; + case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)"; + case glsl: __intrinsic_asm "$atomicXor($A, $1)"; + case spirv: + spirv_asm + { + result:$$int = OpAtomicXor &dest Device None $value; + }; + } +} -__target_intrinsic(glsl, "$atomicXor($A, $1)") -__target_intrinsic(cuda, "atomicXor((int*)$0, $1)") -void InterlockedXor(__ref uint dest, uint value); +__glsl_version(430) +void InterlockedXor(__ref uint dest, uint value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedXor"; + case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)"; + case glsl: __intrinsic_asm "$atomicXor($A, $1)"; + case spirv: + spirv_asm + { + result:$$uint = OpAtomicXor &dest Device None $value; + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicXor($A, $1))") -__target_intrinsic(cuda, "(*$2 = atomicXor($0, $1))") -void InterlockedXor(__ref int dest, int value, out int original_value); +__glsl_version(430) +void InterlockedXor(__ref int dest, int value, out int original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedXor"; + case glsl: __intrinsic_asm "($2 = atomicXor($0, $1))"; + case cuda: __intrinsic_asm "(*$2 = atomicXor($0, $1))"; + case spirv: + spirv_asm + { + %original:$$int = OpAtomicXor &dest Device None $value; + OpStore &original_value %original + }; + } +} -__target_intrinsic(glsl, "($2 = $atomicXor($A, $1))") -__target_intrinsic(cuda, "(*$2 = (uint)atomicXor((int*)$0, $1))") -void InterlockedXor(__ref uint dest, uint value, out uint original_value); +__glsl_version(430) +void InterlockedXor(__ref uint dest, uint value, out uint original_value) +{ + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedXor"; + case glsl: __intrinsic_asm "($2 = atomicXor($0, $1))"; + case cuda: __intrinsic_asm "(*$2 = (uint)atomicXor((int*)$0, $1))"; + case spirv: + spirv_asm + { + %original:$$uint = OpAtomicXor &dest Device None $value; + OpStore &original_value %original + }; + } +} // Is floating-point value finite? @@ -3315,7 +4004,7 @@ uint4 msad4(uint reference, uint2 source, uint4 accum) int4 bytesX = (source.x >> uint4(24, 16, 8, 0)) & 0xFF; int4 bytesY = (source.y >> uint4(24, 16, 8, 0)) & 0xFF; - uint4 mask = bytesRef == 0 ? 0 : 0xFFFFFFFFu; + uint4 mask = select(bytesRef == 0, 0, 0xFFFFFFFFu); uint4 result = accum; result += mask.x & abs(bytesRef - int4(bytesX.x, bytesY.y, bytesY.z, bytesY.w)); diff --git a/source/slang/slang-check-expr.cpp b/source/slang/slang-check-expr.cpp index bf7950310..a00b1bea2 100644 --- a/source/slang/slang-check-expr.cpp +++ b/source/slang/slang-check-expr.cpp @@ -2785,6 +2785,9 @@ namespace Slang Expr* SemanticsExprVisitor::visitTypeCastExpr(TypeCastExpr * expr) { + if (expr->type) + return expr; + // Check the term we are applying first auto funcExpr = expr->functionExpr; funcExpr = CheckTerm(funcExpr); diff --git a/source/slang/slang-emit-spirv.cpp b/source/slang/slang-emit-spirv.cpp index a203f4a80..846c4b5b4 100644 --- a/source/slang/slang-emit-spirv.cpp +++ b/source/slang/slang-emit-spirv.cpp @@ -1292,7 +1292,6 @@ struct SPIRVEmitContext SpvLiteralInteger::from32(stride)); return arrayType; } - case kIROp_TextureType: { const auto texTypeInst = as<IRTextureType>(inst); diff --git a/source/slang/slang-ir-inline.cpp b/source/slang/slang-ir-inline.cpp index 4308d16f0..5479a98ff 100644 --- a/source/slang/slang-ir-inline.cpp +++ b/source/slang/slang-ir-inline.cpp @@ -896,15 +896,39 @@ struct IntrinsicFunctionInliningPass : InliningPassBase auto returnInst = as<IRReturn>(func->getFirstBlock()->getTerminator()); if (!returnInst) return false; - auto firstInst = as<IRSPIRVAsm>(func->getFirstBlock()->getFirstOrdinaryInst()); - return returnInst->getVal() == firstInst; + + // If a function body has only asm blocks + trivial insts (load/store), + // this is considered as a pure asm function, and we can inline it. + bool hasSpvAsm = false; + for (auto inst = func->getFirstBlock()->getFirstOrdinaryInst(); inst != returnInst; inst = inst->getNextInst()) + { + switch (inst->getOp()) + { + case kIROp_SPIRVAsmOperandInst: + case kIROp_SPIRVAsm: + hasSpvAsm = true; + continue; + case kIROp_Load: + case kIROp_swizzle: + case kIROp_Store: + continue; + default: + return false; + } + } + return hasSpvAsm; } }; void performIntrinsicFunctionFunctionInlining(IRModule* module) { IntrinsicFunctionInliningPass pass(module); - pass.considerAllCallSites(); + bool changed = true; + + while (changed) + { + changed = pass.considerAllCallSites(); + } } struct CustomInliningPass : InliningPassBase diff --git a/source/slang/slang-ir-layout.cpp b/source/slang/slang-ir-layout.cpp index cba6894a9..39210ea1d 100644 --- a/source/slang/slang-ir-layout.cpp +++ b/source/slang/slang-ir-layout.cpp @@ -139,11 +139,12 @@ case kIROp_##TYPE##Type: \ { auto structType = cast<IRStructType>(type); IRSizeAndAlignment structLayout; + IRIntegerValue offset = 0; for (auto field : structType->getFields()) { IRSizeAndAlignment fieldTypeLayout; SLANG_RETURN_ON_FAIL(getSizeAndAlignment(rules, field->getFieldType(), &fieldTypeLayout)); - structLayout.size = align(structLayout.size, fieldTypeLayout.alignment); + structLayout.size = align(offset, fieldTypeLayout.alignment); structLayout.alignment = std::max(structLayout.alignment, fieldTypeLayout.alignment); IRIntegerValue fieldOffset = structLayout.size; @@ -165,6 +166,11 @@ case kIROp_##TYPE##Type: \ } structLayout.size += fieldTypeLayout.size; + offset = structLayout.size; + if (as<IRMatrixType>(field->getFieldType()) || as<IRArrayTypeBase>(field->getFieldType()) || as<IRStructType>(field->getFieldType())) + { + offset = rules->adjustOffsetForNextAggregateMember(offset, fieldTypeLayout.alignment); + } } *outSizeAndAlignment = rules->alignCompositeElement(structLayout); return SLANG_OK; @@ -380,7 +386,11 @@ struct NaturalLayoutRules : IRTypeLayoutRules { ruleName = IRTypeLayoutRuleName::Natural; } - + virtual IRIntegerValue adjustOffsetForNextAggregateMember(IRIntegerValue currentSize, IRIntegerValue lastElementAlignment) + { + SLANG_UNUSED(lastElementAlignment); + return currentSize; + } virtual IRSizeAndAlignment alignCompositeElement(IRSizeAndAlignment elementSize) { return elementSize; @@ -402,6 +412,11 @@ struct Std430LayoutRules : IRTypeLayoutRules { return elementSize; } + virtual IRIntegerValue adjustOffsetForNextAggregateMember(IRIntegerValue currentSize, IRIntegerValue lastElementAlignment) + { + return align(currentSize, (int)lastElementAlignment); + } + virtual IRSizeAndAlignment getVectorSizeAndAlignment(IRSizeAndAlignment element, IRIntegerValue count) { IRIntegerValue countForAlignment = count; @@ -418,6 +433,10 @@ struct Std140LayoutRules : IRTypeLayoutRules ruleName = IRTypeLayoutRuleName::Std140; } + virtual IRIntegerValue adjustOffsetForNextAggregateMember(IRIntegerValue currentSize, IRIntegerValue lastElementAlignment) + { + return align(currentSize, (int)lastElementAlignment); + } virtual IRSizeAndAlignment alignCompositeElement(IRSizeAndAlignment elementSize) { elementSize.alignment = (int)align(elementSize.alignment, 16); diff --git a/source/slang/slang-ir-layout.h b/source/slang/slang-ir-layout.h index 12da047d2..cae8d788a 100644 --- a/source/slang/slang-ir-layout.h +++ b/source/slang/slang-ir-layout.h @@ -56,6 +56,7 @@ public: IRTypeLayoutRuleName ruleName; virtual IRSizeAndAlignment alignCompositeElement(IRSizeAndAlignment elementSize) = 0; virtual IRSizeAndAlignment getVectorSizeAndAlignment(IRSizeAndAlignment element, IRIntegerValue count) = 0; + virtual IRIntegerValue adjustOffsetForNextAggregateMember(IRIntegerValue currentSize, IRIntegerValue lastElementAlignment) = 0; static IRTypeLayoutRules* getStd430(); static IRTypeLayoutRules* getStd140(); static IRTypeLayoutRules* getNatural(); diff --git a/source/slang/slang-ir-lower-buffer-element-type.cpp b/source/slang/slang-ir-lower-buffer-element-type.cpp index bf87d72fe..bc8e47e91 100644 --- a/source/slang/slang-ir-lower-buffer-element-type.cpp +++ b/source/slang/slang-ir-lower-buffer-element-type.cpp @@ -165,7 +165,9 @@ namespace Slang for (IRIntegerValue ii = 0; ii < count; ++ii) { auto packedElement = builder.emitElementExtract(packedArray, ii); - auto originalElement = builder.emitCallInst(innerTypeInfo.originalType, innerTypeInfo.convertLoweredToOriginal, 1, &packedElement); + auto originalElement = innerTypeInfo.convertLoweredToOriginal + ? builder.emitCallInst(innerTypeInfo.originalType, innerTypeInfo.convertLoweredToOriginal, 1, &packedElement) + : packedElement; args[(Index)ii] = originalElement; } auto result = builder.emitMakeArray(arrayType, (UInt)args.getCount(), args.getBuffer()); @@ -194,7 +196,9 @@ namespace Slang for (IRIntegerValue ii = 0; ii < count; ++ii) { auto originalElement = builder.emitElementExtract(originalParam, ii); - auto packedElement = builder.emitCallInst(innerTypeInfo.loweredType, innerTypeInfo.convertOriginalToLowered, 1, &originalElement); + auto packedElement = innerTypeInfo.convertOriginalToLowered + ? builder.emitCallInst(innerTypeInfo.loweredType, innerTypeInfo.convertOriginalToLowered, 1, &originalElement) + : originalElement; args[(Index)ii] = packedElement; } auto packedArray = builder.emitMakeArray(innerArrayType, (UInt)args.getCount(), args.getBuffer()); @@ -259,7 +263,7 @@ namespace Slang auto arrayType = builder.getArrayType( vectorType, isColMajor?matrixType->getColumnCount():matrixType->getRowCount(), - builder.getIntValue(builder.getIntType(), elementSizeAlignment.size)); + builder.getIntValue(builder.getIntType(), elementSizeAlignment.getStride())); builder.createStructField(loweredType, structKey, arrayType); info.loweredType = loweredType; @@ -272,10 +276,16 @@ namespace Slang else if (auto arrayType = as<IRArrayType>(type)) { auto loweredInnerTypeInfo = getLoweredTypeInfo(arrayType->getElementType(), rules); - if (!loweredInnerTypeInfo.convertLoweredToOriginal) + // For spirv backend, we always want to lower all array types, even if the element type + // comes out the same. This is because different layout rules may have different array + // stride requirements. + if (!target->shouldEmitSPIRVDirectly()) { - info.loweredType = type; - return info; + if (!loweredInnerTypeInfo.convertLoweredToOriginal) + { + info.loweredType = type; + return info; + } } auto loweredType = builder.createStructType(); info.loweredType = loweredType; @@ -287,12 +297,12 @@ namespace Slang auto structKey = builder.createStructKey(); builder.addNameHintDecoration(structKey, UnownedStringSlice("data")); IRSizeAndAlignment elementSizeAlignment; - getSizeAndAlignment(rules, loweredType, &elementSizeAlignment); + getSizeAndAlignment(rules, loweredInnerTypeInfo.loweredType, &elementSizeAlignment); elementSizeAlignment = rules->alignCompositeElement(elementSizeAlignment); auto innerArrayType = builder.getArrayType( loweredInnerTypeInfo.loweredType, arrayType->getElementCount(), - builder.getIntValue(builder.getIntType(), elementSizeAlignment.size)); + builder.getIntValue(builder.getIntType(), elementSizeAlignment.getStride())); builder.createStructField(loweredType, structKey, innerArrayType); info.loweredInnerArrayType = innerArrayType; info.loweredInnerStructKey = structKey; @@ -312,12 +322,19 @@ namespace Slang if (loweredFieldTypeInfo.convertLoweredToOriginal || rules->ruleName != IRTypeLayoutRuleName::Natural) isTrivial = false; } - if (isTrivial) + + // For spirv backend, we always want to lower all array types, even if the element type + // comes out the same. This is because different layout rules may have different array + // stride requirements. + if (!target->shouldEmitSPIRVDirectly()) { - info.loweredType = type; - return info; + // For non-spirv target, we skip lowering this type if all field types are unchanged. + if (isTrivial) + { + info.loweredType = type; + return info; + } } - auto loweredType = builder.createStructType(); StringBuilder nameSB; getTypeNameHint(nameSB, type); diff --git a/source/slang/slang-ir-specialize-function-call.cpp b/source/slang/slang-ir-specialize-function-call.cpp index 2fa93087f..f9e106920 100644 --- a/source/slang/slang-ir-specialize-function-call.cpp +++ b/source/slang/slang-ir-specialize-function-call.cpp @@ -172,7 +172,9 @@ struct FunctionParameterSpecializationContext return false; if(!func->isDefinition()) return false; - + UnownedStringSlice def; + if (findTargetIntrinsicDefinition(func, targetRequest->getTargetCaps(), def)) + return false; // With the basic checks out of the way, there are // two conditions we care about: // diff --git a/source/slang/slang-ir-specialize-resources.cpp b/source/slang/slang-ir-specialize-resources.cpp index 09c8f1f22..1a96389cb 100644 --- a/source/slang/slang-ir-specialize-resources.cpp +++ b/source/slang/slang-ir-specialize-resources.cpp @@ -288,6 +288,9 @@ struct ResourceOutputSpecializationPass // if(!func->isDefinition()) return false; + UnownedStringSlice def; + if (findTargetIntrinsicDefinition(func, targetRequest->getTargetCaps(), def)) + return false; // If any of the parameters of the function are `out` // or `inout` parameters of a resource type, then we diff --git a/source/slang/slang-ir-specialize-target-switch.cpp b/source/slang/slang-ir-specialize-target-switch.cpp index 2593389b1..2be7c8194 100644 --- a/source/slang/slang-ir-specialize-target-switch.cpp +++ b/source/slang/slang-ir-specialize-target-switch.cpp @@ -30,10 +30,16 @@ namespace Slang bestCapSet = capSet; } } - SLANG_ASSERT(targetBlock); IRBuilder builder(targetSwitch); builder.setInsertBefore(targetSwitch); - builder.emitBranch(targetBlock); + if (targetBlock) + { + builder.emitBranch(targetBlock); + } + else + { + builder.emitMissingReturn(); + } targetSwitch->removeAndDeallocate(); changed = true; } diff --git a/source/slang/slang-ir-spirv-legalize.cpp b/source/slang/slang-ir-spirv-legalize.cpp index 32386e53f..a4b33324b 100644 --- a/source/slang/slang-ir-spirv-legalize.cpp +++ b/source/slang/slang-ir-spirv-legalize.cpp @@ -25,6 +25,67 @@ struct SPIRVLegalizationContext : public SourceEmitterBase SPIRVEmitSharedContext* m_sharedContext; IRModule* m_module; + + struct LoweredStructuredBufferTypeInfo + { + IRType* structType; + IRStructKey* arrayKey; + IRArrayTypeBase* runtimeArrayType; + }; + Dictionary<IRType*, LoweredStructuredBufferTypeInfo> m_loweredStructuredBufferTypes; + + LoweredStructuredBufferTypeInfo lowerStructuredBufferType(IRHLSLStructuredBufferTypeBase* inst) + { + LoweredStructuredBufferTypeInfo result; + if (m_loweredStructuredBufferTypes.tryGetValue(inst, result)) + return result; + + auto layoutRules = getTypeLayoutRuleForBuffer(m_sharedContext->m_targetRequest, inst); + + IRBuilder builder(m_sharedContext->m_irModule); + + builder.setInsertBefore(inst); + auto elementType = inst->getElementType(); + IRSizeAndAlignment elementSize; + getSizeAndAlignment(layoutRules, elementType, &elementSize); + elementSize = layoutRules->alignCompositeElement(elementSize); + + const auto arrayType = builder.getUnsizedArrayType(inst->getElementType(), builder.getIntValue(builder.getIntType(), elementSize.getStride())); + const auto structType = builder.createStructType(); + const auto arrayKey = builder.createStructKey(); + builder.createStructField(structType, arrayKey, arrayType); + IRSizeAndAlignment structSize; + getSizeAndAlignment(layoutRules, structType, &structSize); + + StringBuilder nameSb; + switch (inst->getOp()) + { + case kIROp_HLSLRWStructuredBufferType: + nameSb << "RWStructuredBuffer"; + break; + case kIROp_HLSLAppendStructuredBufferType: + nameSb << "AppendStructuredBuffer"; + break; + case kIROp_HLSLConsumeStructuredBufferType: + nameSb << "ConsumeStructuredBuffer"; + break; + case kIROp_HLSLRasterizerOrderedStructuredBufferType: + nameSb << "RasterizerOrderedStructuredBuffer"; + break; + default: + nameSb << "StructuredBuffer"; + break; + } + builder.addNameHintDecoration(structType, nameSb.getUnownedSlice()); + builder.addDecoration(structType, kIROp_SPIRVBlockDecoration); + + result.structType = structType; + result.arrayKey = arrayKey; + result.runtimeArrayType = arrayType; + m_loweredStructuredBufferTypes[inst] = result; + return result; + } + // We will use a single work list of instructions that need // to be considered for specialization or simplification, // whether generic, existential, etc. @@ -84,13 +145,74 @@ struct SPIRVLegalizationContext : public SourceEmitterBase return structType; } + static void insertLoadAtLatestLocation(IRInst* addrInst, IRUse* inUse) + { + struct WorkItem { IRInst* addr; IRUse* use; }; + List<WorkItem> workList; + List<IRInst*> instsToRemove; + workList.add(WorkItem{ addrInst, inUse }); + for (Index i = 0; i < workList.getCount(); i++) + { + auto use = workList[i].use; + auto addr = workList[i].addr; + auto user = use->getUser(); + IRBuilder builder(user); + builder.setInsertBefore(user); + switch (user->getOp()) + { + case kIROp_GetElement: + case kIROp_FieldExtract: + { + auto basePtrType = as<IRPtrTypeBase>(addr->getDataType()); + IRType* ptrType = nullptr; + if (basePtrType->hasAddressSpace()) + ptrType = builder.getPtrType(kIROp_PtrType, user->getDataType(), basePtrType->getAddressSpace()); + else + ptrType = builder.getPtrType(kIROp_PtrType, user->getDataType()); + IRInst* subAddr = nullptr; + if (user->getOp() == kIROp_GetElement) + subAddr = builder.emitElementAddress(ptrType, addr, as<IRGetElement>(user)->getIndex()); + else + subAddr = builder.emitFieldAddress(ptrType, addr, as<IRFieldExtract>(user)->getField()); + + for (auto u = user->firstUse; u; u = u->nextUse) + { + workList.add(WorkItem{ subAddr, u }); + } + instsToRemove.add(user); + break; + } + default: + { + auto val = builder.emitLoad(addr); + builder.replaceOperand(use, val); + break; + } + } + } + + for (auto i : instsToRemove) + if (!i->hasUses()) + i->removeAndDeallocate(); + } + void processGlobalParam(IRGlobalParam* inst) { // If the global param is not a pointer type, make it so and insert explicit load insts. auto ptrType = as<IRPtrTypeBase>(inst->getDataType()); if (!ptrType) { - if (as<IRResourceTypeBase>(inst)) + auto innerType = inst->getFullType(); + + auto arrayType = as<IRArrayType>(inst->getDataType()); + IRInst* arraySize = nullptr; + if (arrayType) + { + arraySize = arrayType->getElementCount(); + innerType = arrayType->getElementType(); + } + + if (as<IRResourceTypeBase>(innerType)) return; SpvStorageClass storageClass = SpvStorageClassPrivate; @@ -112,7 +234,6 @@ struct SPIRVLegalizationContext : public SourceEmitterBase // Strip any HLSL wrappers IRBuilder builder(m_sharedContext->m_irModule); bool needLoad = true; - auto innerType = inst->getFullType(); auto cbufferType = as<IRConstantBufferType>(innerType); auto paramBlockType = as<IRParameterBlockType>(innerType); if (cbufferType || paramBlockType) @@ -165,6 +286,21 @@ struct SPIRVLegalizationContext : public SourceEmitterBase varLayoutInst->removeAndDeallocate(); } } + else + { + if (auto structuredBufferType = as<IRHLSLStructuredBufferTypeBase>(innerType)) + { + innerType = lowerStructuredBufferType(structuredBufferType).structType; + storageClass = SpvStorageClassStorageBuffer; + needLoad = false; + } + } + + auto innerElementType = innerType; + if (arraySize) + { + innerType = builder.getArrayType(innerType, arraySize); + } // Make a pointer type of storageClass. builder.setInsertBefore(inst); @@ -173,17 +309,27 @@ struct SPIRVLegalizationContext : public SourceEmitterBase if (needLoad) { // Insert an explicit load at each use site. - List<IRUse*> uses; - for (auto use = inst->firstUse; use; use = use->nextUse) - { - uses.add(use); - } - for (auto use : uses) - { - builder.setInsertBefore(use->getUser()); - auto loadedValue = builder.emitLoad(inst); - use->set(loadedValue); - } + traverseUses(inst, [&](IRUse* use) + { + insertLoadAtLatestLocation(inst, use); + }); + } + else if (arraySize) + { + traverseUses(inst, [&](IRUse* use) + { + auto user = use->getUser(); + if (auto getElement = as<IRGetElement>(user)) + { + // For array resources, getElement(r, index) ==> getElementPtr(r, index). + IRBuilder builder(getElement); + builder.setInsertBefore(user); + auto newAddr = builder.emitElementAddress(builder.getPtrType(kIROp_PtrType, innerElementType, storageClass), inst, getElement->getIndex()); + user->replaceUsesWith(newAddr); + user->removeAndDeallocate(); + return; + } + }); } } processGlobalVar(inst); @@ -535,53 +681,6 @@ struct SPIRVLegalizationContext : public SourceEmitterBase } } - void processStructuredBufferType(IRHLSLStructuredBufferTypeBase * inst) - { - auto layoutRules = getTypeLayoutRuleForBuffer(m_sharedContext->m_targetRequest, inst); - - IRBuilder builder(m_sharedContext->m_irModule); - - builder.setInsertBefore(inst); - auto elementType = inst->getElementType(); - IRSizeAndAlignment elementSize; - getSizeAndAlignment(layoutRules, elementType, &elementSize); - elementSize = layoutRules->alignCompositeElement(elementSize); - - const auto arrayType = builder.getUnsizedArrayType(inst->getElementType(), builder.getIntValue(builder.getIntType(), elementSize.getStride())); - const auto structType = builder.createStructType(); - const auto arrayKey = builder.createStructKey(); - builder.createStructField(structType, arrayKey, arrayType); - IRSizeAndAlignment structSize; - getSizeAndAlignment(layoutRules, structType, &structSize); - - const auto ptrType = builder.getPtrType(kIROp_PtrType, structType, SpvStorageClassStorageBuffer); - - StringBuilder nameSb; - switch (inst->getOp()) - { - case kIROp_HLSLRWStructuredBufferType: - nameSb << "RWStructuredBuffer"; - break; - case kIROp_HLSLAppendStructuredBufferType: - nameSb << "AppendStructuredBuffer"; - break; - case kIROp_HLSLConsumeStructuredBufferType: - nameSb << "ConsumeStructuredBuffer"; - break; - case kIROp_HLSLRasterizerOrderedStructuredBufferType: - nameSb << "RasterizerOrderedStructuredBuffer"; - break; - default: - nameSb << "StructuredBuffer"; - break; - } - builder.addNameHintDecoration(structType, nameSb.getUnownedSlice()); - builder.addDecoration(structType, kIROp_SPIRVBlockDecoration); - inst->replaceUsesWith(ptrType); - inst->removeAndDeallocate(); - addUsersToWorkList(ptrType); - } - void duplicateMergeBlockIfNeeded(IRUse* breakBlockUse) { auto breakBlock = as<IRBlock>(breakBlockUse->get()); @@ -778,7 +877,20 @@ struct SPIRVLegalizationContext : public SourceEmitterBase void processModule() { - addToWorkList(m_module->getModuleInst()); + // Process global params before anything else, so we don't generate inefficient + // array marhalling code for array-typed global params. + for (auto globalInst : m_module->getGlobalInsts()) + { + if (auto globalParam = as<IRGlobalParam>(globalInst)) + { + processGlobalParam(globalParam); + } + else + { + addToWorkList(globalInst); + } + } + while (workList.getCount() != 0) { IRInst* inst = workList.getLast(); @@ -815,10 +927,6 @@ struct SPIRVLegalizationContext : public SourceEmitterBase case kIROp_RWStructuredBufferStore: processRWStructuredBufferStore(inst); break; - case kIROp_HLSLStructuredBufferType: - case kIROp_HLSLRWStructuredBufferType: - processStructuredBufferType(as<IRHLSLStructuredBufferTypeBase>(inst)); - break; case kIROp_loop: processLoop(as<IRLoop>(inst)); break; @@ -837,6 +945,23 @@ struct SPIRVLegalizationContext : public SourceEmitterBase } } + // Translate types. + List<IRHLSLStructuredBufferTypeBase*> instsToProcess; + for (auto globalInst : m_module->getGlobalInsts()) + { + if (auto t = as<IRHLSLStructuredBufferTypeBase>(globalInst)) + { + instsToProcess.add(t); + } + } + for (auto t : instsToProcess) + { + auto lowered = lowerStructuredBufferType(t); + IRBuilder builder(t); + builder.setInsertBefore(t); + t->replaceUsesWith(builder.getPtrType(kIROp_PtrType, lowered.structType, SpvStorageClassStorageBuffer)); + } + // SPIRV requires a dominator block to appear before dominated blocks. // After legalizing the control flow, we need to sort our blocks to ensure this is true. for (auto globalInst : m_module->getGlobalInsts()) diff --git a/source/slang/slang-ir-use-uninitialized-out-param.cpp b/source/slang/slang-ir-use-uninitialized-out-param.cpp index 977876c6b..479538441 100644 --- a/source/slang/slang-ir-use-uninitialized-out-param.cpp +++ b/source/slang/slang-ir-use-uninitialized-out-param.cpp @@ -69,6 +69,7 @@ namespace Slang stores.add(StoreSite{ use->getUser(), addr }); break; case kIROp_Call: + case kIROp_SPIRVAsm: // If we see a call using this address, treat it as a store. stores.add(StoreSite{ use->getUser(), addr }); break; diff --git a/source/slang/slang-parser.cpp b/source/slang/slang-parser.cpp index 3a1b627bd..b270ba713 100644 --- a/source/slang/slang-parser.cpp +++ b/source/slang/slang-parser.cpp @@ -6071,6 +6071,32 @@ namespace Slang parser->ReadToken(TokenType::Identifier); return varExpr; } + case TokenType::Scope: + { + parser->ReadToken(TokenType::Scope); + VarExpr* varExpr = parser->astBuilder->create<VarExpr>(); + varExpr->scope = parser->currentScope; + while (varExpr->scope && !as<ModuleDecl>(varExpr->scope->containerDecl)) + varExpr->scope = varExpr->scope->parent; + parser->FillPosition(varExpr); + + auto nameToken = peekToken(parser); + auto nameAndLoc = NameLoc(nameToken); + varExpr->name = nameAndLoc.name; + if (nameToken.type == TokenType::CompletionRequest) + { + parser->hasSeenCompletionToken = true; + } + else + { + parser->ReadToken(TokenType::Identifier); + if (peekTokenType(parser) == TokenType::OpLess) + { + return maybeParseGenericApp(parser, varExpr); + } + } + return varExpr; + } case TokenType::Identifier: { // We will perform name lookup here so that we can find syntax diff --git a/source/slang/slang-spirv-val.cpp b/source/slang/slang-spirv-val.cpp index d41f0e8cc..a6bc29306 100644 --- a/source/slang/slang-spirv-val.cpp +++ b/source/slang/slang-spirv-val.cpp @@ -53,6 +53,8 @@ SlangResult debugValidateSPIRV(const List<uint8_t>& spirv) // Set up our process CommandLine commandLine; commandLine.m_executableLocation.setName("spirv-val"); + commandLine.addArg("--target-env"); + commandLine.addArg("vulkan1.2"); RefPtr<Process> p; const auto createResult = Process::create(commandLine, 0, p); // If we failed to even start the process, then validation isn't available diff --git a/tests/expected-failure.txt b/tests/expected-failure.txt index 9a685b5c7..3e158e2d1 100644 --- a/tests/expected-failure.txt +++ b/tests/expected-failure.txt @@ -1,24 +1,14 @@ tests/autodiff/global-param-hoisting.slang.1 (vk) -tests/bugs/atomic-coerce.slang.1 (vk) tests/bugs/buffer-swizzle-store.slang.1 (vk) -tests/bugs/byte-address-buffer-interlocked-add-f32.slang (vk) tests/bugs/gh-3075.slang.2 (vk) tests/bugs/ray-query-in-generic.slang.1 (vk) -tests/compute/buffer-layout.slang.2 (vk) tests/compute/half-rw-texture-convert.slang.4 (vk) tests/compute/half-rw-texture-convert2.slang.4 (vk) -tests/compute/loop-unroll.slang.5 (vk) tests/compute/ray-tracing-inline.slang.1 (vk) tests/compute/rw-texture-simple.slang.4 (vk) tests/compute/texture-sample-grad-offset-clamp.slang (vk) tests/compute/texture-simple.slang.4 (vk) tests/compute/texture-simpler.slang (vk) -tests/hlsl/glsl-matrix-layout.slang (vk) tests/language-feature/constants/constexpr-loop.slang.1 (vk) tests/optimization/func-resource-result/func-resource-result-complex.slang.1 (vk) -tests/slang-extension/atomic-float-byte-address-buffer.slang.2 (vk) -tests/slang-extension/atomic-int64-byte-address-buffer.slang.4 (vk) -tests/slang-extension/atomic-min-max-u64-byte-address-buffer.slang.4 (vk) -tests/slang-extension/cas-int64-byte-address-buffer.slang.4 (vk) -tests/slang-extension/exchange-int64-byte-address-buffer.slang.4 (vk) tests/type/texture-sampler/texture-sampler-2d.slang (vk) diff --git a/tests/hlsl-intrinsic/byte-address-buffer-atomics.slang b/tests/hlsl-intrinsic/byte-address-buffer-atomics.slang new file mode 100644 index 000000000..f133bb372 --- /dev/null +++ b/tests/hlsl-intrinsic/byte-address-buffer-atomics.slang @@ -0,0 +1,64 @@ +//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK): -dx12 -use-dxil -output-using-type +//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK):-vk -output-using-type +//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cuda -output-using-type + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<int> outputBuffer; + +//TEST_INPUT:set bbuffer = ubuffer(data=[0 0 0 0]) +RWByteAddressBuffer bbuffer; + +[numthreads(1, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + uint originalValue; + bbuffer.InterlockedAdd(0, 1); + + bbuffer.InterlockedAdd(0, 1, originalValue); + outputBuffer[4] = originalValue; + + bbuffer.InterlockedMax(0, 3); + bbuffer.InterlockedMax(0, 4, originalValue); + outputBuffer[5] = originalValue; + + bbuffer.InterlockedMin(0, 2); + bbuffer.InterlockedMin(0, 2, originalValue); + outputBuffer[6] = originalValue; + + bbuffer.InterlockedOr(0, 1); + bbuffer.InterlockedOr(0, 1, originalValue); + outputBuffer[7] = originalValue; + + bbuffer.InterlockedXor(0, 4); + bbuffer.InterlockedXor(0, 4, originalValue); + outputBuffer[8] = originalValue; + + bbuffer.InterlockedAnd(0, 7); + bbuffer.InterlockedAnd(0, 7, originalValue); + outputBuffer[9] = originalValue; + + bbuffer.InterlockedCompareExchange(4, 0, 1, originalValue); + outputBuffer[10] = originalValue; + + bbuffer.InterlockedExchange(8, 3, originalValue); + outputBuffer[11] = originalValue; + + bbuffer.InterlockedCompareStore(12, 0, 3); + + // CHECK: 3 + // CHECK: 1 + // CHECK: 3 + // CHECK: 3 + outputBuffer[0] = bbuffer.Load(0); + outputBuffer[1] = bbuffer.Load(4); + outputBuffer[2] = bbuffer.Load(8); + outputBuffer[3] = bbuffer.Load(12); + // CHECK: 1 + // CHECK: 3 + // CHECK: 2 + // CHECK: 3 + // CHECK: 7 + // CHECK: 3 + // CHECK: 0 + // CHECK: 0 +}
\ No newline at end of file |
