From 1dcd814f5038229703e52841b1b0304c22bffb73 Mon Sep 17 00:00:00 2001 From: Yong He Date: Fri, 10 May 2024 09:41:31 -0700 Subject: More Metal Intrinsics. (#4143) --- source/slang/hlsl.meta.slang | 282 +++++++++++++++++++---------- source/slang/slang-capabilities.capdef | 2 + source/slang/slang-emit-c-like.cpp | 5 +- source/slang/slang-emit-metal.cpp | 118 +++++++++++- source/slang/slang-emit-metal.h | 5 + tests/glsl-intrinsic/intrinsic-basic.slang | 2 - tests/metal/barrier.slang | 18 ++ tests/metal/discard.slang | 22 +++ 8 files changed, 347 insertions(+), 107 deletions(-) create mode 100644 tests/metal/barrier.slang create mode 100644 tests/metal/discard.slang diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index b3e323bfc..303d18771 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -4040,13 +4040,14 @@ void abort(); __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T abs(T x) { __target_switch { case hlsl: __intrinsic_asm "abs"; case glsl: __intrinsic_asm "abs"; + case metal: __intrinsic_asm "abs"; case cuda: __intrinsic_asm "$P_abs($0)"; case cpp: __intrinsic_asm "$P_abs($0)"; case spirv: return spirv_asm { @@ -4060,13 +4061,14 @@ T abs(T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector abs(vector x) { __target_switch { case hlsl: __intrinsic_asm "abs"; case glsl: __intrinsic_asm "abs"; + case metal: __intrinsic_asm "abs"; case spirv: return spirv_asm { result:$$vector = OpExtInst glsl450 SAbs $x; }; @@ -4077,7 +4079,7 @@ vector abs(vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix abs(matrix x) { __target_switch @@ -4354,7 +4356,7 @@ bool all(matrix x) // Barrier for writes to all memory spaces (HLSL SM 5.0) 
__glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_spirv, memorybarrier_compute)] +[require(cuda_glsl_hlsl_metal_spirv, memorybarrier_compute)] void AllMemoryBarrier() { __target_switch @@ -4362,6 +4364,7 @@ void AllMemoryBarrier() case hlsl: __intrinsic_asm "AllMemoryBarrier"; case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeDevice, (gl_StorageSemanticsShared|gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; case cuda: __intrinsic_asm "__threadfence()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; case spirv: spirv_asm { OpMemoryBarrier Device AcquireRelease|UniformMemory|WorkgroupMemory|ImageMemory; @@ -4371,7 +4374,7 @@ void AllMemoryBarrier() // Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0) __glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_spirv, memorybarrier_compute)] +[require(cuda_glsl_hlsl_metal_spirv, memorybarrier_compute)] void AllMemoryBarrierWithGroupSync() { __target_switch @@ -4379,6 +4382,7 @@ void AllMemoryBarrierWithGroupSync() case hlsl: __intrinsic_asm "AllMemoryBarrierWithGroupSync"; case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, (gl_StorageSemanticsShared|gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; case cuda: __intrinsic_asm "__syncthreads()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; case spirv: spirv_asm { OpControlBarrier Workgroup Device AcquireRelease|UniformMemory|WorkgroupMemory|ImageMemory; @@ -5428,13 +5432,14 @@ bool CheckAccessFullyMapped(uint status); // Clamp (HLSL SM 1.0) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, 
sm_2_0_GLSL_140)] T clamp(T x, T minBound, T maxBound) { __target_switch { case hlsl: __intrinsic_asm "clamp"; case glsl: __intrinsic_asm "clamp"; + case metal: __intrinsic_asm "clamp"; case spirv: if (__isSignedInt()) return spirv_asm { @@ -5451,13 +5456,14 @@ T clamp(T x, T minBound, T maxBound) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector clamp(vector x, vector minBound, vector maxBound) { __target_switch { case hlsl: __intrinsic_asm "clamp"; case glsl: __intrinsic_asm "clamp"; + case metal: __intrinsic_asm "clamp"; case spirv: if (__isSignedInt()) return spirv_asm { @@ -5474,7 +5480,7 @@ vector clamp(vector x, vector minBound, vector maxBound) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix clamp(matrix x, matrix minBound, matrix maxBound) { __target_switch @@ -5487,13 +5493,14 @@ matrix clamp(matrix x, matrix minBound, matrix maxBo __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T clamp(T x, T minBound, T maxBound) { __target_switch { case hlsl: __intrinsic_asm "clamp"; case glsl: __intrinsic_asm "clamp"; + case metal: __intrinsic_asm "clamp"; case spirv: return spirv_asm { result:$$T = OpExtInst glsl450 FClamp $x $minBound $maxBound }; @@ -5504,13 +5511,14 @@ T clamp(T x, T minBound, T maxBound) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector clamp(vector x, vector minBound, vector maxBound) { __target_switch { case hlsl: __intrinsic_asm "clamp"; case glsl: __intrinsic_asm "clamp"; + case metal: __intrinsic_asm "clamp"; case spirv: return spirv_asm { result:$$vector = OpExtInst glsl450 FClamp $x $minBound $maxBound }; @@ -5521,7 +5529,7 @@ vector clamp(vector x, vector minBound, vector 
maxBound) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix clamp(matrix x, matrix minBound, matrix maxBound) { __target_switch @@ -5700,7 +5708,7 @@ vector cospi(vector x) // Population count [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, shader5_sm_5_0)] uint countbits(uint value) { __target_switch @@ -5709,6 +5717,8 @@ uint countbits(uint value) __intrinsic_asm "countbits"; case glsl: __intrinsic_asm "bitCount"; + case metal: + __intrinsic_asm "popcount"; case cuda: case cpp: __intrinsic_asm "$P_countbits($0)"; @@ -5721,13 +5731,14 @@ uint countbits(uint value) // TODO: SPIRV does not support integer vectors. __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector cross(vector left, vector right) { __target_switch { case glsl: __intrinsic_asm "cross"; case hlsl: __intrinsic_asm "cross"; + case metal: __intrinsic_asm "cross"; case spirv: return spirv_asm { OpExtInst $$vector result glsl450 Cross $left $right }; @@ -5741,7 +5752,7 @@ vector cross(vector left, vector right) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector cross(vector left, vector right) { __target_switch @@ -5761,7 +5772,7 @@ vector cross(vector left, vector right) // Convert encoded color [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] int4 D3DCOLORtoUBYTE4(float4 color) { __target_switch @@ -5780,7 +5791,7 @@ for (auto xOrY : diffDimensions) { }}}} __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, fragmentprocessing)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, fragmentprocessing)] T dd$(xOrY)(T x) { __requireComputeDerivative(); @@ -5792,6 +5803,8 @@ T dd$(xOrY)(T x) 
__intrinsic_asm "dd$(xOrY)"; case glsl: __intrinsic_asm "dFd$(xOrY)"; + case metal: + __intrinsic_asm "dfd$(xOrY)"; case spirv: return spirv_asm {OpDPd$(xOrY) $$T result $x}; } @@ -5799,7 +5812,7 @@ T dd$(xOrY)(T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, fragmentprocessing)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, fragmentprocessing)] vector dd$(xOrY)(vector x) { __requireComputeDerivative(); @@ -5811,6 +5824,8 @@ vector dd$(xOrY)(vector x) __intrinsic_asm "dd$(xOrY)"; case glsl: __intrinsic_asm "dFd$(xOrY)"; + case metal: + __intrinsic_asm "dfd$(xOrY)"; case spirv: return spirv_asm {OpDPd$(xOrY) $$vector result $x}; } @@ -5818,7 +5833,7 @@ vector dd$(xOrY)(vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, fragmentprocessing)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, fragmentprocessing)] matrix dd$(xOrY)(matrix x) { __requireComputeDerivative(); @@ -5930,7 +5945,7 @@ ${{{{ __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv)] +[require(cpp_cuda_glsl_hlsl_metal_spirv)] T degrees(T x) { __target_switch @@ -5947,7 +5962,7 @@ T degrees(T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv)] +[require(cpp_cuda_glsl_hlsl_metal_spirv)] vector degrees(vector x) { __target_switch @@ -5964,7 +5979,7 @@ vector degrees(vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv)] +[require(cpp_cuda_glsl_hlsl_metal_spirv)] matrix degrees(matrix x) { __target_switch @@ -5980,13 +5995,14 @@ matrix degrees(matrix x) __generic [__readNone] [PreferCheckpoint] -[require(glsl_hlsl_spirv)] +[require(glsl_hlsl_metal_spirv)] T determinant(matrix m) { __target_switch { case glsl: __intrinsic_asm "determinant"; case hlsl: __intrinsic_asm "determinant"; + case metal: __intrinsic_asm "determinant"; case spirv: return spirv_asm { OpExtInst $$T result glsl450 Determinant $m }; @@ -5995,7 +6011,7 @@ T determinant(matrix m) // Barrier for device memory __glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_spirv, 
memorybarrier_compute)] +[require(cuda_glsl_hlsl_metal_spirv, memorybarrier_compute)] void DeviceMemoryBarrier() { __target_switch @@ -6003,6 +6019,7 @@ void DeviceMemoryBarrier() case hlsl: __intrinsic_asm "DeviceMemoryBarrier"; case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeDevice, (gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; case cuda: __intrinsic_asm "__threadfence()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; case spirv: spirv_asm { OpMemoryBarrier Device AcquireRelease|UniformMemory|ImageMemory; @@ -6011,7 +6028,7 @@ void DeviceMemoryBarrier() } __glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_spirv, memorybarrier_compute)] +[require(cuda_glsl_hlsl_metal_spirv, memorybarrier_compute)] void DeviceMemoryBarrierWithGroupSync() { __target_switch @@ -6019,6 +6036,7 @@ void DeviceMemoryBarrierWithGroupSync() case hlsl: __intrinsic_asm "DeviceMemoryBarrierWithGroupSync"; case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, (gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)"; case cuda: __intrinsic_asm "__syncthreads()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_device | mem_flags::mem_threadgroup | mem_flags::mem_texture | mem_flags::mem_threadgroup_imageblock)"; case spirv: spirv_asm { OpControlBarrier Workgroup Device AcquireRelease|UniformMemory|ImageMemory; @@ -6030,13 +6048,14 @@ void DeviceMemoryBarrierWithGroupSync() __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T distance(vector x, vector y) { __target_switch { case glsl: __intrinsic_asm "distance"; case hlsl: __intrinsic_asm "distance"; + case metal: __intrinsic_asm "distance"; case spirv: return spirv_asm { OpExtInst $$T result glsl450 Distance $x 
$y }; @@ -6047,7 +6066,7 @@ T distance(vector x, vector y) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T distance(T x, T y) { __target_switch @@ -6065,7 +6084,7 @@ T distance(T x, T y) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv)] +[require(cpp_cuda_glsl_hlsl_metal_spirv)] T fdim(T x, T y) { __target_switch @@ -6078,7 +6097,7 @@ T fdim(T x, T y) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv)] +[require(cpp_cuda_glsl_hlsl_metal_spirv)] vector fdim(vector x, vector y) { __target_switch @@ -6123,7 +6142,7 @@ vector divide(vector x, vector y) __generic [__readNone] [ForceInline] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T dot(T x, T y) { __target_switch @@ -6137,13 +6156,14 @@ T dot(T x, T y) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T dot(vector x, vector y) { __target_switch { case glsl: __intrinsic_asm "dot"; case hlsl: __intrinsic_asm "dot"; + case metal: __intrinsic_asm "dot"; case spirv: return spirv_asm { OpDot $$T result $x $y }; @@ -6157,7 +6177,7 @@ T dot(vector x, vector y) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T dot(vector x, vector y) { __target_switch @@ -7137,7 +7157,7 @@ matrix frexp(matrix x, out matrix exp) // Texture filter width __generic [__readNone] -[require(glsl_hlsl_spirv, fragmentprocessing)] +[require(glsl_hlsl_metal_spirv, fragmentprocessing)] T fwidth(T x) { __requireComputeDerivative(); @@ -7147,6 +7167,8 @@ T fwidth(T x) __intrinsic_asm "fwidth($0)"; case glsl: __intrinsic_asm "fwidth($0)"; + case metal: + __intrinsic_asm "fwidth($0)"; case spirv: return spirv_asm { @@ -7290,17 +7312,33 @@ matrix GetAttributeAtVertex(matrix attribute, uint vertexIndex) // Get 
number of samples in render target [__readNone] -[require(sm_4_0)] -uint GetRenderTargetSampleCount(); +[require(hlsl, sm_4_0)] +[require(metal)] +uint GetRenderTargetSampleCount() +{ + __target_switch + { + case hlsl: __intrinsic_asm "GetRenderTargetSampleCount"; + case metal: __intrinsic_asm "get_num_samples"; + } +} // Get position of given sample [__readNone] -[require(sm_4_0)] -float2 GetRenderTargetSamplePosition(int Index); +[require(hlsl, sm_4_0)] +[require(metal)] +float2 GetRenderTargetSamplePosition(int Index) +{ + __target_switch + { + case hlsl: __intrinsic_asm "GetRenderTargetSamplePosition"; + case metal: __intrinsic_asm "get_sample_position"; + } +} // Group memory barrier __glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_spirv, memorybarrier_compute)] +[require(cuda_glsl_hlsl_metal_spirv, memorybarrier_compute)] void GroupMemoryBarrier() { __target_switch @@ -7308,6 +7346,7 @@ void GroupMemoryBarrier() case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeWorkgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease)"; case hlsl: __intrinsic_asm "GroupMemoryBarrier"; case cuda: __intrinsic_asm "__threadfence_block"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_threadgroup)"; case spirv: spirv_asm { @@ -7316,7 +7355,7 @@ void GroupMemoryBarrier() } } -[require(cuda_glsl_hlsl_spirv, memorybarrier_compute)] +[require(cuda_glsl_hlsl_metal_spirv, memorybarrier_compute)] void __subgroupBarrier() { __target_switch @@ -7324,6 +7363,7 @@ void __subgroupBarrier() case glsl: __intrinsic_asm "subgroupBarrier"; case hlsl: __intrinsic_asm "GroupMemoryBarrierWithGroupSync"; case cuda: __intrinsic_asm "__syncthreads()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_threadgroup)"; case spirv: spirv_asm { @@ -7333,7 +7373,7 @@ void __subgroupBarrier() } __glsl_extension(GL_KHR_memory_scope_semantics) -[require(cuda_glsl_hlsl_spirv, memorybarrier_compute)] +[require(cuda_glsl_hlsl_metal_spirv, 
memorybarrier_compute)] void GroupMemoryBarrierWithGroupSync() { __target_switch @@ -7341,6 +7381,7 @@ void GroupMemoryBarrierWithGroupSync() case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease)"; case hlsl: __intrinsic_asm "GroupMemoryBarrierWithGroupSync"; case cuda: __intrinsic_asm "__syncthreads()"; + case metal: __intrinsic_asm "threadgroup_barrier(mem_flags::mem_threadgroup)"; case spirv: spirv_asm { @@ -8234,7 +8275,7 @@ void InterlockedXor(__ref uint64_t dest, uint64_t value, out uint64_t origina __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] bool isfinite(T x) { __target_switch @@ -8243,6 +8284,8 @@ bool isfinite(T x) case cuda: case cpp: __intrinsic_asm "$P_isfinite($0)"; + case metal: + __intrinsic_asm "isfinite"; default: return !(isinf(x) || isnan(x)); } @@ -8250,7 +8293,7 @@ bool isfinite(T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector isfinite(vector x) { __target_switch @@ -8259,6 +8302,8 @@ vector isfinite(vector x) case glsl: case spirv: return !(isinf(x) || isnan(x)); + case metal: + __intrinsic_asm "isfinite"; default: VECTOR_MAP_UNARY(bool, N, isfinite, x); } @@ -8266,7 +8311,7 @@ vector isfinite(vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix isfinite(matrix x) { __target_switch @@ -8280,13 +8325,14 @@ matrix isfinite(matrix x) // Is floating-point value infinite? 
__generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] bool isinf(T x) { __target_switch { case hlsl: case glsl: + case metal: __intrinsic_asm "isinf"; case cuda: case cpp: @@ -8298,13 +8344,14 @@ bool isinf(T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector isinf(vector x) { __target_switch { case hlsl: case glsl: + case metal: __intrinsic_asm "isinf"; case spirv: return spirv_asm { result:$$vector = OpIsInf $x}; @@ -8315,7 +8362,7 @@ vector isinf(vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix isinf(matrix x) { __target_switch @@ -8329,13 +8376,14 @@ matrix isinf(matrix x) // Is floating-point value not-a-number? __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] bool isnan(T x) { __target_switch { case hlsl: case glsl: + case metal: __intrinsic_asm "isnan"; case cuda: case cpp: @@ -8347,13 +8395,14 @@ bool isnan(T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector isnan(vector x) { __target_switch { case hlsl: case glsl: + case metal: __intrinsic_asm "isnan"; case spirv: return spirv_asm { result:$$vector = OpIsNan $x}; @@ -8364,7 +8413,7 @@ vector isnan(vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix isnan(matrix x) { __target_switch @@ -8460,13 +8509,14 @@ vector ldexp(vector x, vector exp) // Vector length __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T length(vector x) { __target_switch { case glsl: 
__intrinsic_asm "length"; case hlsl: __intrinsic_asm "length"; + case metal: __intrinsic_asm "length"; case spirv: return spirv_asm { OpExtInst $$T result glsl450 Length $x }; @@ -8477,12 +8527,12 @@ T length(vector x) // Scalar float length __generic -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T length(T x) { __target_switch { - case glsl: __intrinsic_asm "length"; + case glsl: __intrinsic_asm "length"; case spirv: return spirv_asm { OpExtInst $$T result glsl450 Length $x }; @@ -8494,41 +8544,43 @@ T length(T x) // Linear interpolation __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T lerp(T x, T y, T s) { __target_switch { case glsl: __intrinsic_asm "mix"; + case metal: __intrinsic_asm "mix"; case hlsl: __intrinsic_asm "lerp"; case spirv: return spirv_asm { OpExtInst $$T result glsl450 FMix $x $y $s }; default: - return x * (T(1.0f) - s) + y * s; + return x + (y - x) * s; } } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector lerp(vector x, vector y, vector s) { __target_switch { case glsl: __intrinsic_asm "mix"; + case metal: __intrinsic_asm "mix"; case hlsl: __intrinsic_asm "lerp"; case spirv: return spirv_asm { OpExtInst $$vector result glsl450 FMix $x $y $s }; default: - return x * (T(1.0f) - s) + y * s; + return x + (y - x) * s; } } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix lerp(matrix x, matrix y, matrix s) { __target_switch @@ -8541,7 +8593,7 @@ matrix lerp(matrix x, matrix y, matrix s) // Legacy lighting function (obsolete) [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] float4 lit(float n_dot_l, float n_dot_h, float m) { 
__target_switch @@ -8817,7 +8869,7 @@ matrix mad(matrix mvalue, matrix avalue, matrix [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T max(T x, T y) { // Note: a stdlib implementation of `max` (or `min`) will require splitting @@ -8829,6 +8881,7 @@ T max(T x, T y) { case hlsl: __intrinsic_asm "max"; case glsl: __intrinsic_asm "max"; + case metal: __intrinsic_asm "max"; case cuda: __intrinsic_asm "$P_max($0, $1)"; case cpp: __intrinsic_asm "$P_max($0, $1)"; case spirv: @@ -8851,13 +8904,14 @@ T max(T x, T y) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector max(vector x, vector y) { __target_switch { case hlsl: __intrinsic_asm "max"; case glsl: __intrinsic_asm "max"; + case metal: __intrinsic_asm "max"; case spirv: { if (__isSignedInt()) @@ -8880,7 +8934,7 @@ vector max(vector x, vector y) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix max(matrix x, matrix y) { __target_switch @@ -9045,22 +9099,24 @@ vector fmax3(vector x, vector y, vector z) __generic __target_intrinsic(hlsl) __target_intrinsic(glsl) +__target_intrinsic(metal) __target_intrinsic(cuda, "$P_min($0, $1)") __target_intrinsic(cpp, "$P_min($0, $1)") __target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1") [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T min(T x, T y); __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector min(vector x, vector y) { __target_switch { case hlsl: __intrinsic_asm "min"; case glsl: __intrinsic_asm "min"; + case metal: __intrinsic_asm "min"; case spirv: { if (__isSignedInt()) @@ -9079,7 +9135,7 @@ vector 
min(vector x, vector y) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix min(matrix x, matrix y) { __target_switch @@ -9412,44 +9468,45 @@ uint4 msad4(uint reference, uint2 source, uint4 accum) __generic __intrinsic_op($(kIROp_Mul)) [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T mul(T x, T y); // scalar-vector and vector-scalar __generic __intrinsic_op($(kIROp_Mul)) [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector mul(vector x, T y); __generic __intrinsic_op($(kIROp_Mul)) [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector mul(T x, vector y); // scalar-matrix and matrix-scalar __generic __intrinsic_op($(kIROp_Mul)) [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix mul(matrix x, T y); __generic __intrinsic_op($(kIROp_Mul)) [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix mul(T x, matrix y); // vector-vector (dot product) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T mul(vector x, vector y) { __target_switch { case glsl: __intrinsic_asm "dot"; + case metal: __intrinsic_asm "dot"; case hlsl: __intrinsic_asm "mul"; default: return dot(x, y); @@ -9457,7 +9514,7 @@ T mul(vector x, vector y) } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T mul(vector x, vector y) { __target_switch @@ -9471,12 +9528,13 @@ T mul(vector x, vector y) // vector-matrix __generic [__readNone] 
-[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector mul(vector left, matrix right) { __target_switch { case glsl: __intrinsic_asm "($1 * $0)"; + case metal: __intrinsic_asm "($1 * $0)"; case hlsl: __intrinsic_asm "mul"; case spirv: return spirv_asm { OpMatrixTimesVector $$vector result $right $left @@ -9497,12 +9555,13 @@ vector mul(vector left, matrix right) } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector mul(vector left, matrix right) { __target_switch { case glsl: __intrinsic_asm "($1 * $0)"; + case metal: __intrinsic_asm "($1 * $0)"; case hlsl: __intrinsic_asm "mul"; default: vector result; @@ -9520,12 +9579,13 @@ vector mul(vector left, matrix right) } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector mul(vector left, matrix right) { __target_switch { case glsl: __intrinsic_asm "($1 * $0)"; + case metal: __intrinsic_asm "($1 * $0)"; case hlsl: __intrinsic_asm "mul"; default: vector result; @@ -9545,12 +9605,13 @@ vector mul(vector left, matrix right) // matrix-vector __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector mul(matrix left, vector right) { __target_switch { case glsl: __intrinsic_asm "($1 * $0)"; + case metal: __intrinsic_asm "($1 * $0)"; case hlsl: __intrinsic_asm "mul"; case spirv: return spirv_asm { OpVectorTimesMatrix $$vector result $right $left @@ -9571,12 +9632,13 @@ vector mul(matrix left, vector right) } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector mul(matrix left, vector right) { __target_switch { case glsl: __intrinsic_asm "($1 * $0)"; + case metal: __intrinsic_asm "($1 * $0)"; case hlsl: 
__intrinsic_asm "mul"; default: vector result; @@ -9594,12 +9656,13 @@ vector mul(matrix left, vector right) } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector mul(matrix left, vector right) { __target_switch { case glsl: __intrinsic_asm "($1 * $0)"; + case metal: __intrinsic_asm "($1 * $0)"; case hlsl: __intrinsic_asm "mul"; default: vector result; @@ -9619,12 +9682,13 @@ vector mul(matrix left, vector right) // matrix-matrix __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix mul(matrix left, matrix right) { __target_switch { case glsl: __intrinsic_asm "($1 * $0)"; + case metal: __intrinsic_asm "($1 * $0)"; case hlsl: __intrinsic_asm "mul"; case spirv: return spirv_asm { OpMatrixTimesMatrix $$matrix result $right $left @@ -9646,12 +9710,13 @@ matrix mul(matrix left, matrix right) } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix mul(matrix left, matrix right) { __target_switch { case glsl: __intrinsic_asm "($1 * $0)"; + case metal: __intrinsic_asm "($1 * $0)"; case hlsl: __intrinsic_asm "mul"; default: matrix result; @@ -9670,12 +9735,13 @@ matrix mul(matrix left, matrix right) } __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix mul(matrix left, matrix right) { __target_switch { case glsl: __intrinsic_asm "($1 * $0)"; + case metal: __intrinsic_asm "($1 * $0)"; case hlsl: __intrinsic_asm "mul"; default: matrix result; @@ -10043,7 +10109,7 @@ void ProcessTriTessFactorsMin( // Degrees to radians __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T radians(T x) { __target_switch @@ -10060,7 +10126,7 @@ T 
radians(T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector radians(vector x) { __target_switch @@ -10077,7 +10143,7 @@ vector radians(vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix radians(matrix x) { __target_switch @@ -10091,7 +10157,7 @@ matrix radians(matrix x) // Approximate reciprocal __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T rcp(T x) { __target_switch @@ -10104,7 +10170,7 @@ T rcp(T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector rcp(vector x) { __target_switch @@ -10120,7 +10186,7 @@ vector rcp(vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix rcp(matrix x) { __target_switch @@ -10134,13 +10200,14 @@ matrix rcp(matrix x) // Reflect incident vector across plane with given normal __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T reflect(T i, T n) { __target_switch { case glsl: __intrinsic_asm "reflect"; case hlsl: __intrinsic_asm "reflect"; + case metal: __intrinsic_asm "reflect"; case spirv: return spirv_asm { OpExtInst $$T result glsl450 Reflect $i $n }; @@ -10151,13 +10218,14 @@ T reflect(T i, T n) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector reflect(vector i, vector n) { __target_switch { case glsl: __intrinsic_asm "reflect"; case hlsl: __intrinsic_asm "reflect"; + case metal: __intrinsic_asm "reflect"; case spirv: return spirv_asm { OpExtInst $$vector result glsl450 Reflect 
$i $n }; @@ -10169,13 +10237,14 @@ vector reflect(vector i, vector n) // Refract incident vector given surface normal and index of refraction __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector refract(vector i, vector n, T eta) { __target_switch { case glsl: __intrinsic_asm "refract"; case hlsl: __intrinsic_asm "refract"; + case metal: __intrinsic_asm "refract"; case spirv: return spirv_asm { OpExtInst $$vector result glsl450 Refract $i $n $eta }; @@ -10189,13 +10258,14 @@ vector refract(vector i, vector n, T eta) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T refract(T i, T n, T eta) { __target_switch { case glsl: __intrinsic_asm "refract"; case hlsl: __intrinsic_asm "refract"; + case metal: __intrinsic_asm "refract"; case spirv: return spirv_asm { OpExtInst $$T result glsl450 Refract $i $n $eta }; @@ -10209,7 +10279,7 @@ T refract(T i, T n, T eta) // Reverse order of bits [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, shader5_sm_5_0)] uint reversebits(uint value) { __target_switch @@ -10221,6 +10291,8 @@ uint reversebits(uint value) case cuda: case cpp: __intrinsic_asm "$P_reversebits($0)"; + case metal: + __intrinsic_asm "reverse_bits"; case spirv: return spirv_asm {OpBitReverse $$uint result $value}; } @@ -10228,7 +10300,7 @@ uint reversebits(uint value) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, shader5_sm_5_0)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, shader5_sm_5_0)] vector reversebits(vector value) { __target_switch @@ -10237,6 +10309,8 @@ vector reversebits(vector value) VECTOR_MAP_UNARY(uint, N, reversebits, value); case glsl: __intrinsic_asm "bitfieldReverse"; + case metal: + __intrinsic_asm "reverse_bits"; case spirv: return spirv_asm {OpBitReverse $$vector result $value}; } @@ -10398,12 
+10472,13 @@ matrix rsqrt(matrix x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T saturate(T x) { __target_switch { case hlsl: __intrinsic_asm "saturate"; + case metal: __intrinsic_asm "saturate"; default: return clamp(x, T(0), T(1)); } @@ -10411,12 +10486,13 @@ T saturate(T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector saturate(vector x) { __target_switch { case hlsl: __intrinsic_asm "saturate"; + case metal: __intrinsic_asm "saturate"; default: return clamp(x, vector(T(0)), @@ -10426,7 +10502,7 @@ vector saturate(vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix saturate(matrix x) { __target_switch @@ -10453,6 +10529,7 @@ int sign(T x) __target_switch { case hlsl: __intrinsic_asm "sign"; + case metal: __intrinsic_asm "int(sign($0))"; case glsl: __intrinsic_asm "int(sign($0))"; case cuda: case cpp: @@ -10477,6 +10554,7 @@ vector sign(vector x) { case hlsl: __intrinsic_asm "sign"; case glsl: __intrinsic_asm "ivec$N0(sign($0))"; + case metal: __intrinsic_asm "vec(sign($0))"; case spirv: if (__isFloat()) return spirv_asm @@ -10714,13 +10792,14 @@ vector sinpi(vector x) // Smooth step (Hermite interpolation) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T smoothstep(T min, T max, T x) { __target_switch { case glsl: __intrinsic_asm "smoothstep"; case hlsl: __intrinsic_asm "smoothstep"; + case metal: __intrinsic_asm "smoothstep"; case spirv: return spirv_asm { OpExtInst $$T result glsl450 SmoothStep $min $max $x }; @@ -10732,13 +10811,14 @@ T smoothstep(T min, T max, T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, 
sm_2_0_GLSL_140)] vector smoothstep(vector min, vector max, vector x) { __target_switch { case glsl: __intrinsic_asm "smoothstep"; case hlsl: __intrinsic_asm "smoothstep"; + case metal: __intrinsic_asm "smoothstep"; case spirv: return spirv_asm { OpExtInst $$vector result glsl450 SmoothStep $min $max $x }; @@ -10749,7 +10829,7 @@ vector smoothstep(vector min, vector max, vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix smoothstep(matrix min, matrix max, matrix x) { __target_switch @@ -10813,13 +10893,14 @@ matrix sqrt(matrix x) // Step function __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] T step(T y, T x) { __target_switch { case glsl: __intrinsic_asm "step"; case hlsl: __intrinsic_asm "step"; + case metal: __intrinsic_asm "step"; case spirv: return spirv_asm { OpExtInst $$T result glsl450 Step $y $x }; @@ -10830,13 +10911,14 @@ T step(T y, T x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] vector step(vector y, vector x) { __target_switch { case glsl: __intrinsic_asm "step"; case hlsl: __intrinsic_asm "step"; + case metal: __intrinsic_asm "step"; case spirv: return spirv_asm { OpExtInst $$vector result glsl450 Step $y $x }; @@ -10847,7 +10929,7 @@ vector step(vector y, vector x) __generic [__readNone] -[require(cpp_cuda_glsl_hlsl_spirv, sm_2_0_GLSL_140)] +[require(cpp_cuda_glsl_hlsl_metal_spirv, sm_2_0_GLSL_140)] matrix step(matrix y, matrix x) { __target_switch diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef index 1cf81da57..4d4fc7ccf 100644 --- a/source/slang/slang-capabilities.capdef +++ b/source/slang/slang-capabilities.capdef @@ -78,10 +78,12 @@ alias cpp_glsl_hlsl_spirv = cpp | glsl | hlsl | spirv_1_0; alias cpp_hlsl = cpp | hlsl; 
alias cuda_glsl_hlsl = cuda | glsl | hlsl; alias cuda_glsl_hlsl_spirv = cuda | glsl | hlsl | spirv_1_0; +alias cuda_glsl_hlsl_metal_spirv = cuda | glsl | hlsl | metal | spirv_1_0; alias cuda_glsl_spirv = cuda | glsl | spirv; alias cuda_hlsl = cuda | hlsl; alias cuda_hlsl_spirv = cuda | hlsl | spirv; alias glsl_hlsl_spirv = glsl | hlsl | spirv; +alias glsl_hlsl_metal_spirv = glsl | hlsl | metal | spirv; alias glsl_spirv = glsl | spirv; alias hlsl_spirv = hlsl | spirv; diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp index 2119d8fee..b44fa677c 100644 --- a/source/slang/slang-emit-c-like.cpp +++ b/source/slang/slang-emit-c-like.cpp @@ -541,6 +541,9 @@ void CLikeSourceEmitter::defaultEmitInstStmt(IRInst* inst) m_writer->emit(");\n"); } break; + case kIROp_discard: + m_writer->emit("discard;\n"); + break; default: diagnoseUnhandledInst(inst); } @@ -2876,7 +2879,7 @@ void CLikeSourceEmitter::_emitInst(IRInst* inst) break; case kIROp_discard: - m_writer->emit("discard;\n"); + emitInstStmt(inst); break; case kIROp_swizzleSet: diff --git a/source/slang/slang-emit-metal.cpp b/source/slang/slang-emit-metal.cpp index 4d8a207c3..1eb4b9abe 100644 --- a/source/slang/slang-emit-metal.cpp +++ b/source/slang/slang-emit-metal.cpp @@ -11,6 +11,40 @@ namespace Slang { +static const char* kMetalBuiltinPreludeMatrixCompMult = R"( +template +matrix _slang_matrixCompMult(matrix m1, matrix m2) +{ + matrix result; + for (int i = 0; i < A; i++) + result[i] = m1[i] * m2[i]; + return result; +} +)"; + +static const char* kMetalBuiltinPreludeMatrixReshape = R"( +template +matrix _slang_matrixReshape(matrix m) +{ + matrix result = T(0); + for (int i = 0; i < min(A,N); i++) + for (int j = 0; j < min(B,M); j++) + result[i][j] = m[i][j]; /* copy element-wise: assigning the scalar to result[i] would splat it across the whole vector */ + return result; +} +)"; + +static const char* kMetalBuiltinPreludeVectorReshape = R"( +template +vec _slang_vectorReshape(vec v) +{ + vec result = T(0); + for (int i = 0; i < min(A,N); i++) + result[i] = v[i]; + return result; +} 
+)"; + void MetalSourceEmitter::_emitHLSLDecorationSingleString(const char* name, IRFunc* entryPoint, IRStringLit* val) { SLANG_UNUSED(entryPoint); @@ -163,7 +197,7 @@ void MetalSourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPoi switch (stage) { - case Stage::Pixel: + case Stage::Pixel: { if (irFunc->findDecoration()) { @@ -176,12 +210,36 @@ void MetalSourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPoi } } +void MetalSourceEmitter::ensurePrelude(const char* preludeText) +{ + IRStringLit* stringLit; + if (!m_builtinPreludes.tryGetValue(preludeText, stringLit)) + { + IRBuilder builder(m_irModule); + stringLit = builder.getStringValue(UnownedStringSlice(preludeText)); + m_builtinPreludes[preludeText] = stringLit; + } + m_requiredPreludes.add(stringLit); +} + +bool MetalSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) +{ + switch (inst->getOp()) + { + case kIROp_discard: + m_writer->emit("discard_fragment();\n"); + return true; + } + return false; +} + bool MetalSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) { switch (inst->getOp()) { case kIROp_MakeVector: case kIROp_MakeMatrix: + case kIROp_MakeVectorFromScalar: { if (inst->getOperandCount() == 1) { @@ -190,19 +248,71 @@ bool MetalSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inO auto prec = getInfo(EmitOp::Prefix); needClose = maybeEmitParens(outerPrec, prec); - - // Need to emit as cast for HLSL emitType(inst->getDataType()); m_writer->emit("("); emitOperand(inst->getOperand(0), rightSide(outerPrec, prec)); m_writer->emit(") "); maybeCloseParens(needClose); - // Handled return true; } break; } + case kIROp_MatrixReshape: + { + ensurePrelude(kMetalBuiltinPreludeMatrixReshape); + m_writer->emit("_slang_matrixReshape<"); + auto matrixType = as(inst->getDataType()); + emitOperand(matrixType->getRowCount(), getInfo(EmitOp::General)); + m_writer->emit(", "); + emitOperand(matrixType->getColumnCount(), getInfo(EmitOp::General)); + 
m_writer->emit(">("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + m_writer->emit(")"); + return true; + } + case kIROp_VectorReshape: + { + ensurePrelude(kMetalBuiltinPreludeVectorReshape); + m_writer->emit("_slang_vectorReshape<"); + auto vectorType = as(inst->getDataType()); + emitOperand(vectorType->getElementCount(), getInfo(EmitOp::General)); + m_writer->emit(">("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + m_writer->emit(")"); + return true; + } + case kIROp_Mul: + { + // Component-wise multiplication needs to be special cased, + // because Metal uses infix `*` to express inner product + // when working with matrices. + + // Are both operands matrices? + if (as(inst->getOperand(0)->getDataType()) + && as(inst->getOperand(1)->getDataType())) + { + ensurePrelude(kMetalBuiltinPreludeMatrixCompMult); + m_writer->emit("_slang_matrixCompMult("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + m_writer->emit(", "); + emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); + m_writer->emit(")"); + return true; + } + break; + } + case kIROp_Select: + { + m_writer->emit("select("); + emitOperand(inst->getOperand(2), leftSide(getInfo(EmitOp::General), getInfo(EmitOp::General))); + m_writer->emit(", "); + emitOperand(inst->getOperand(1), leftSide(getInfo(EmitOp::General), getInfo(EmitOp::General))); + m_writer->emit(", "); + emitOperand(inst->getOperand(0), leftSide(getInfo(EmitOp::General), getInfo(EmitOp::General))); + m_writer->emit(")"); + return true; + } case kIROp_BitCast: { auto toType = inst->getDataType(); diff --git a/source/slang/slang-emit-metal.h b/source/slang/slang-emit-metal.h index a60d28b96..ddc1c7665 100644 --- a/source/slang/slang-emit-metal.h +++ b/source/slang/slang-emit-metal.h @@ -20,9 +20,12 @@ public: virtual RefObject* getExtensionTracker() SLANG_OVERRIDE { return m_extensionTracker; } + Dictionary m_builtinPreludes; protected: RefPtr m_extensionTracker; + void ensurePrelude(const 
char* preludeText); + virtual void emitParameterGroupImpl(IRGlobalParam* varDecl, IRUniformParameterGroupType* type) SLANG_OVERRIDE; virtual void emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPointDecoration* entryPointDecor) SLANG_OVERRIDE; @@ -43,6 +46,8 @@ protected: virtual void emitMatrixLayoutModifiersImpl(IRVarLayout* layout) SLANG_OVERRIDE; virtual bool tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) SLANG_OVERRIDE; + virtual bool tryEmitInstStmtImpl(IRInst* inst) SLANG_OVERRIDE; + virtual void emitSimpleValueImpl(IRInst* inst) SLANG_OVERRIDE; virtual void emitLoopControlDecorationImpl(IRLoopControlDecoration* decl) SLANG_OVERRIDE; virtual void emitFuncDecorationImpl(IRDecoration* decoration) SLANG_OVERRIDE; diff --git a/tests/glsl-intrinsic/intrinsic-basic.slang b/tests/glsl-intrinsic/intrinsic-basic.slang index d374f0f84..6bd4e7fe0 100644 --- a/tests/glsl-intrinsic/intrinsic-basic.slang +++ b/tests/glsl-intrinsic/intrinsic-basic.slang @@ -2,8 +2,6 @@ //TEST:SIMPLE(filecheck=CHECK_GLSL_SPIRV): -allow-glsl -stage compute -entry computeMain -target spirv //TEST:SIMPLE(filecheck=CHECK_SPIR): -allow-glsl -stage compute -entry computeMain -target spirv -emit-spirv-directly //TEST:SIMPLE(filecheck=CHECK_HLSL): -allow-glsl -stage compute -entry computeMain -target hlsl -//TEST:SIMPLE(filecheck=CHECK_CUDA): -allow-glsl -stage compute -entry computeMain -target cuda -DTARGET_CUDA -//TEST:SIMPLE(filecheck=CHECK_CPP): -allow-glsl -stage compute -entry computeMain -target cpp //TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=BUF):-vk -compute -entry computeMain -allow-glsl -output-using-type -emit-spirv-via-glsl //TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=BUF):-vk -compute -entry computeMain -allow-glsl -output-using-type -emit-spirv-directly diff --git a/tests/metal/barrier.slang b/tests/metal/barrier.slang new file mode 100644 index 000000000..86e4265be --- /dev/null +++ b/tests/metal/barrier.slang @@ -0,0 +1,18 @@ 
+//TEST:SIMPLE(filecheck=CHECK): -target metal +//TEST:SIMPLE(filecheck=CHECK-ASM): -target metallib + +// CHECK: threadgroup_barrier + +// CHECK-ASM: define void @computeMain + +RWStructuredBuffer outputBuffer; + +[numthreads(1,1,1)] +void computeMain() +{ + outputBuffer[0] = 1; + GroupMemoryBarrierWithGroupSync(); + outputBuffer[1] = 2; + AllMemoryBarrierWithGroupSync(); + outputBuffer[2] = 3; +} \ No newline at end of file diff --git a/tests/metal/discard.slang b/tests/metal/discard.slang new file mode 100644 index 000000000..7c2622561 --- /dev/null +++ b/tests/metal/discard.slang @@ -0,0 +1,22 @@ +//TEST:SIMPLE(filecheck=CHECK): -target metal +//TEST:SIMPLE(filecheck=CHECK-ASM): -target metallib + +// CHECK: discard_fragment(); + +// CHECK-ASM: define {{.*}} @main_fragment1 + +struct VOut +{ + float4 position : SV_Position; + float4 vertexColor; + float2 vertexUV; + float3 vertexNormal : NORMAL; +} + +[shader("fragment")] +float4 main_fragment1(VOut fragmentIn) +{ + if (fragmentIn.vertexColor.x == 0.0) + discard; + return fragmentIn.vertexColor; +} \ No newline at end of file -- cgit v1.2.3