From a16f712bb99e426519c9a556b17b54bcc4d1d22d Mon Sep 17 00:00:00 2001
From: Jay Kwak <82421531+jkwak-work@users.noreply.github.com>
Date: Wed, 7 Feb 2024 12:12:15 -0800
Subject: Implement basic GLSL built-in functions (#3525)

* Implement basic GLSL built-in functions

Partially resolves #3362

This change implemented GLSL built-in functions described in the
following sections of "OpenGL Spec" document.
  8.1. Angle and Trigonometry Functions
  8.2. Exponential Functions
  8.3. Common Functions
  8.5. Geometric Functions
  8.7. Vector Relational Functions
  8.8. Integer Functions

About 40 functions are newly implemented and about 150 functions
already existed in the HLSL-side implementation.

The implementation of new functions hasn't been tested yet.

* Unify some of GLSL functions into hlsl.meta.slang

Partially resolves #3362

This change moves some of the GLSL functions from glsl.meta.slang to
hlsl.meta.slang, because those functions are generic enough to be used
for HLSL. Those functions are: dot, normalize, fma, and reflect.

There was "fma" for double in hlsl.meta.slang and it is converted to use
__BuiltinFloatingPointType type, which required some modifications in
diff.meta.slang. The implementation for "fma" in diff.meta.slang is very
similar to how "mad" is implemented.

* Implement more GLSL built-in functions

Partially resolves #3362

This change implements more GLSL built-in functions mentioned in the
following sections.
  8.4. Floating-Point Pack and Unpack Functions
  8.6. Matrix Functions

This change implemented 11 new GLSL built-in functions and there were 3
already working functions.

The mistake in "normalize" is fixed.

"refract" function is moved from glsl.meta.slang to hlsl.meta.slang.

* Implement basic GLSL built-in functions

Partially resolves #3362

This change implemented GLSL built-in functions described in the
following sections of "OpenGL Spec" document.
  8.1. Angle and Trigonometry Functions
  8.2. Exponential Functions
  8.3. Common Functions
  8.5. Geometric Functions
  8.7. Vector Relational Functions
  8.8. Integer Functions

About 40 functions are newly implemented and about 150 functions
already existed in the HLSL-side implementation.

The implementation of new functions hasn't been tested yet.

* Unify some of GLSL functions into hlsl.meta.slang

Partially resolves #3362

This change moves some of the GLSL functions from glsl.meta.slang to
hlsl.meta.slang, because those functions are generic enough to be used
for HLSL. Those functions are: dot, normalize, fma, and reflect.

There was "fma" for double in hlsl.meta.slang and it is converted to use
__BuiltinFloatingPointType type, which required some modifications in
diff.meta.slang. The implementation for "fma" in diff.meta.slang is very
similar to how "mad" is implemented.

* Implement more GLSL built-in functions

Partially resolves #3362

This change implements more GLSL built-in functions mentioned in the
following sections.
  8.4. Floating-Point Pack and Unpack Functions
  8.6. Matrix Functions

This change implemented 11 new GLSL built-in functions and there were 3
already working functions.

The mistake in "normalize" is fixed.

"refract" function is moved from glsl.meta.slang to hlsl.meta.slang.

* Fix a few minor bugs on GLSL builtin functions

Partially resolves #3362

Following bugs were addressed:
  1. "bitCounts" had to have a "Capability" on its function declaration.
  2. "roundEven" is implemented. It is almost the same as "round()" but
     the behavior is slightly different when the given value is 1.5, 3.5,
     5.5 and so on.
  3. umulExtended and imulExtended are simplified.
  4. exp2 is implemented with "__target_switch" for GLSL and SPIR-V.
  5. "tests/glsl-intrinsic/intrinsic-basic.slang" checks the results
     from the GLSL functions. Currently it is mainly to test if the
     functions exist or not, but it can now also test for a simple case
     where the input value is zero and the result is most of the time
     zero or one.

* Disable GLSL exp2 double type tests

This change disables some of GLSL exp2 related tests as a workaround.
The spir-v needs to handle the double-type argument for exp2 properly.

* Fix exp2(double) problem for SPIR-V

SPIR-V can handle a double-type input for exp2 with this change.

However, slang-test is still failing to test it with an error message
saying, "abort compilation:".

With a simpler test case, I verified that SPIR-V assembly code is
properly generated for exp2(double) and I am not sure why slang-test is
still failing. We will need to revisit this issue later.

The simple testing is done with a following line:
  outputBuffer.result[0] = float(exp2(double(outputBuffer.result[0])));

And it generated following lines and it looks correct:
  ; Function main
  %main = OpFunction %void None %3
  %5 = OpLabel
  %16 = OpAccessChain %_ptr_Uniform_float %outputBuffer_0 %int_0 %uint_0
  %17 = OpLoad %float %16
  %19 = OpFConvert %double %17
  %20 = OpFConvert %float %19
  %21 = OpExtInst %float %1 Exp2 %20
  %22 = OpAccessChain %_ptr_Uniform_float %outputBuffer_0 %int_0 %uint_0
  OpStore %22 %21
  OpReturn
  OpFunctionEnd

* Add __floatCast that is safer than slang_noop_cast

Adding __floatCast that can be used for exp2 function.
---
 source/slang/diff.meta.slang |  62 +--
 source/slang/glsl.meta.slang | 891 ++++++++++++++++++++++++++++++++++++++++++-
 source/slang/hlsl.meta.slang | 100 ++++-
 3 files changed, 970 insertions(+), 83 deletions(-)

(limited to 'source')

diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang
index 6f4888a5d..8a46f7d60 100644
--- a/source/slang/diff.meta.slang
+++ b/source/slang/diff.meta.slang
@@ -1563,71 +1563,27 @@ void __d_clamp(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpMin, i
 VECTOR_MATRIX_TERNARY_DIFF_IMPL(clamp)
 
 // fma
+__generic<T : __BuiltinFloatingPointType>
 [BackwardDifferentiable]
 [ForwardDerivativeOf(fma)]
 [PreferRecompute]
-DifferentialPair<double> __d_fma(DifferentialPair<double> dpx, DifferentialPair<double> dpy, DifferentialPair<double> dpz)
+DifferentialPair<T> __d_fma(DifferentialPair<T> dpx, DifferentialPair<T> dpy, DifferentialPair<T> dpz)
 {
-    return DifferentialPair<double>(
+    return DifferentialPair<T>(
         fma(dpx.p, dpy.p, dpz.p),
-        dpy.p * dpx.d + dpx.p * dpy.d + dpz.d);
+        T.dadd(T.dadd(__mul_p_d(dpy.p, dpx.d), __mul_p_d(dpx.p, dpy.d)), dpz.d));
 }
+__generic<T : __BuiltinFloatingPointType>
 [BackwardDifferentiable]
 [BackwardDerivativeOf(fma)]
 [PreferRecompute]
-void __d_fma(inout DifferentialPair<double> dpx, inout DifferentialPair<double> dpy, inout DifferentialPair<double> dpz, double dOut)
+void __d_fma(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpy, inout DifferentialPair<T> dpz, T.Differential dOut)
 {
-    dpx = diffPair(dpx.p, dpy.p * dOut);
-    dpy = diffPair(dpy.p, dpx.p * dOut);
+    dpx = diffPair(dpx.p, __mul_p_d(dpy.p, dOut));
+    dpy = diffPair(dpy.p, __mul_p_d(dpx.p, dOut));
     dpz = diffPair(dpz.p, dOut);
 }
-__generic<let N : int>
-[BackwardDifferentiable]
-[ForwardDerivativeOf(fma)]
-[PreferRecompute]
-DifferentialPair<vector<double, N>> __d_fma_vector(
-    DifferentialPair<vector<double, N>> dpx,
-    DifferentialPair<vector<double, N>> dpy,
-    DifferentialPair<vector<double, N>> dpz)
-{
-    vector<double, N> result;
-    vector<double, N>.Differential d_result;
-    [ForceUnroll] for (int i = 0; i < N; ++i)
-    {
-        DifferentialPair<double> dp_elem = __d_fma(
-            DifferentialPair<double>(dpx.p[i], dpx.d[i]),
-            DifferentialPair<double>(dpy.p[i], dpy.d[i]),
-            DifferentialPair<double>(dpz.p[i], dpz.d[i]));
-        result[i] = dp_elem.p;
-        d_result[i] = dp_elem.d;
-    }
-    return DifferentialPair<vector<double, N>>(result, d_result);
-}
-__generic<let N : int>
-[BackwardDifferentiable]
-[BackwardDerivativeOf(fma)]
-[PreferRecompute]
-void __d_fma_vector(
-        inout DifferentialPair<vector<double, N>> dpx,
-        inout DifferentialPair<vector<double, N>> dpy,
-        inout DifferentialPair<vector<double, N>> dpz,
-        vector<double, N> dOut)
-{
-    vector<double, N>.Differential x_d_result, y_d_result, z_d_result;
-    [ForceUnroll] for (int i = 0; i < N; ++i)
-    {
-        DifferentialPair<double> x_dp = diffPair(dpx.p[i], 0.0);
-        DifferentialPair<double> y_dp = diffPair(dpy.p[i], 0.0);
-        DifferentialPair<double> z_dp = diffPair(dpz.p[i], 0.0);
-        __d_fma(x_dp, y_dp, z_dp, dOut[i]);
-        x_d_result[i] = x_dp.d;
-        y_d_result[i] = y_dp.d;
-        z_d_result[i] = z_dp.d;
-    }
-    dpx = diffPair(dpx.p, x_d_result);
-    dpy = diffPair(dpy.p, y_d_result);
-    dpz = diffPair(dpz.p, z_d_result);
-}
+VECTOR_MATRIX_TERNARY_DIFF_IMPL(fma)
 
 // mad
 __generic<T : __BuiltinFloatingPointType>
diff --git a/source/slang/glsl.meta.slang b/source/slang/glsl.meta.slang
index 4fe56acf8..8403d1391 100644
--- a/source/slang/glsl.meta.slang
+++ b/source/slang/glsl.meta.slang
@@ -1,5 +1,22 @@
+// TODO: These keywords are not recognized but they should be.
+#define highp
+#define mediump
+#define lowp
+
+#define VECTOR_MAP_UNARY(TYPE, COUNT, FUNC, VALUE) \
+    vector<TYPE,COUNT> result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(VALUE[i]); } return result
+
+#define VECTOR_MAP_TRINARY(TYPE, COUNT, FUNC, A, B, C) \
+    vector<TYPE,COUNT> result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(A[i], B[i], C[i]); } return result
+
+#define REQUIRE_KHRONOS [require(glsl)] [require(spirv)]
+
+//
+// OpenGL 4.60 spec
+//
+
 //
-// From the GLSL spec, section 4.1. 'asic Types'
+// Section 4.1. 'Basic Types'
 //
 
 public typealias vec2 = vector<float, 2>;
@@ -109,21 +126,21 @@ public in int gl_ViewportIndex : SV_ViewportArrayIndex;
 
 [OverloadRank(15)]
 [ForceInline]
-public matrix<float, N, N> operator*<let N : int>(matrix<float, N, N> m1, matrix<float, N, N> m2)
+public matrix<float, N, N> operator*<let N:int>(matrix<float, N, N> m1, matrix<float, N, N> m2)
 {
     return mul(m2, m1);
 }
 
 [OverloadRank(15)]
 [ForceInline]
-public matrix<half, N, N> operator*<let N : int>(matrix<half, N, N> m1, matrix<half, N, N> m2)
+public matrix<half, N, N> operator*<let N:int>(matrix<half, N, N> m1, matrix<half, N, N> m2)
 {
     return mul(m2, m1);
 }
 
 [OverloadRank(15)]
 [ForceInline]
-public matrix<double, N, N> operator*<let N : int>(matrix<double, N, N> m1, matrix<double, N, N> m2)
+public matrix<double, N, N> operator*<let N:int>(matrix<double, N, N> m1, matrix<double, N, N> m2)
 {
     return mul(m2, m1);
 }
@@ -150,7 +167,7 @@ public vector<T, C> operator*<T:__BuiltinFloatingPointType, let C : int, let R :
 }
 
 __intrinsic_op(mul)
-public matrix<T, N, M> matrixCompMult<T:__BuiltinFloatingPointType, let N : int, let M : int>(matrix<T,N,M> left, matrix<T,N,M> right);
+public matrix<T, N, M> matrixCompMult<T:__BuiltinFloatingPointType, let N:int, let M : int>(matrix<T,N,M> left, matrix<T,N,M> right);
 
 __intrinsic_op(cmpLE)
 public vector<bool, N> lessThanEqual<T, let N:int>(vector<T, N> x, vector<T, N> y);
@@ -180,42 +197,42 @@ public extension vector<T, 3>
 
 [ForceInline]
 [OverloadRank(15)]
-public bool operator==<T:__BuiltinArithmeticType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator==<T:__BuiltinArithmeticType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return all(equal(left, right));
 }
 
 [ForceInline]
 [OverloadRank(15)]
-public bool operator!=<T:__BuiltinArithmeticType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator!=<T:__BuiltinArithmeticType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return any(notEqual(left, right));
 }
 
 [ForceInline]
 [OverloadRank(14)]
-public bool operator==<T:__BuiltinFloatingPointType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator==<T:__BuiltinFloatingPointType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return all(equal(left, right));
 }
 
 [ForceInline]
 [OverloadRank(14)]
-public bool operator!=<T:__BuiltinFloatingPointType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator!=<T:__BuiltinFloatingPointType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return any(notEqual(left, right));
 }
 
 [ForceInline]
 [OverloadRank(14)]
-public bool operator==<T:__BuiltinLogicalType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator==<T:__BuiltinLogicalType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return all(equal(left, right));
 }
 
 [ForceInline]
 [OverloadRank(14)]
-public bool operator!=<T:__BuiltinLogicalType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator!=<T:__BuiltinLogicalType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return any(notEqual(left, right));
 }
@@ -227,14 +244,14 @@ for (auto type : kBaseTypes) {
 }}}}
 [ForceInline]
 [OverloadRank(15)]
-public bool operator==<let N : int>(vector<$(typeName), N> left, vector<$(typeName), N> right)
+public bool operator==<let N:int>(vector<$(typeName), N> left, vector<$(typeName), N> right)
 {
     return all(equal(left, right));
 }
 
 [ForceInline]
 [OverloadRank(15)]
-public bool operator!=<let N : int>(vector<$(typeName), N> left, vector<$(typeName), N> right)
+public bool operator!=<let N:int>(vector<$(typeName), N> left, vector<$(typeName), N> right)
 {
     return any(notEqual(left, right));
 }
@@ -242,17 +259,801 @@ ${{{{
 }
 }}}}
 
-[ForceInline] public int findLSB(int v) { return firstbitlow(v); }
-[ForceInline] public uint findLSB(uint v) { return firstbitlow(v); }
-[ForceInline] public vector<int,N> findLSB<let N:int>(vector<int,N> value)
+//
+// Section 8.1. Angle and Trigonometry Functions
+//
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T atan(T y, T x)
+{
+    return atan2(y, x);
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T,N> atan(vector<T,N> y, vector<T,N> x)
+{
+    return atan2(y, x);
+}
+
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(cuda, "$P_asinh($0)")
+__target_intrinsic(cpp, "$P_asinh($0)")
+[__readNone]
+[ForceInline]
+public T asinh(T x)
+{
+    return log(x + sqrt(x * x + T(1)));
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T,N> asinh(vector<T,N> x)
+{
+    VECTOR_MAP_UNARY(T, N, asinh, x);
+}
+
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(cuda, "$P_acosh($0)")
+__target_intrinsic(cpp, "$P_acosh($0)")
+[__readNone]
+[ForceInline]
+public T acosh(T x)
+{
+    return log(x + sqrt( x * x - T(1)));
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T,N> acosh(vector<T,N> x)
+{
+    VECTOR_MAP_UNARY(T, N, acosh, x);
+}
+
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(cuda, "$P_atanh($0)")
+__target_intrinsic(cpp, "$P_atanh($0)")
+[__readNone]
+[ForceInline]
+public T atanh(T x)
+{
+    return T(0.5) * log((T(1) + x) / (T(1) - x));
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T,N> atanh(vector<T,N> x)
+{
+    VECTOR_MAP_UNARY(T, N, atanh, x);
+}
+
+//
+// Section 8.2. Exponential Functions
+//
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T inversesqrt(T x)
+{
+    return rsqrt(x);
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> inversesqrt(vector<T, N> x)
+{
+    return rsqrt(x);
+}
+
+//
+// Section 8.3. Common Functions
+//
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T roundEven(T x)
+{
+    // Rounds x to the nearest integer; exact .5 ties go to the even neighbor
+    // (2.5 -> 2.0, 3.5 -> 4.0, -2.5 -> -2.0), as GLSL roundEven requires.
+    // Built on floor() so the result does not depend on how round() breaks ties.
+    T lower = floor(x);
+    T fraction = x - lower; // distance above the lower neighbor, in [0, 1)
+    if (fraction < T(0.5))
+        return lower;
+    if (fraction > T(0.5))
+        return lower + T(1);
+    return (fmod(lower, T(2)) == T(0)) ? lower : lower + T(1);
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T,N> roundEven(vector<T,N> x)
+{
+    VECTOR_MAP_UNARY(T, N, roundEven, x);
+}
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T fract(T x)
+{
+    return frac(x);
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> fract(vector<T, N> x)
+{
+    return frac(x);
+}
+
+// GLSL mod: x - y * floor(x / y). Unlike fmod, the result takes the sign of y.
+__generic<T : __BuiltinFloatingPointType>
+[__readNone] [ForceInline]
+public T mod(T x, T y)
+{
+    return x - y * floor(x / y);
+}
+
+// Component-wise GLSL mod of a vector by a scalar divisor (y is applied to every component).
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone] [ForceInline]
+public vector<T, N> mod(vector<T, N> x, T y)
+{
+    return x - vector<T, N>(y) * floor(x / vector<T, N>(y));
+}
+
+// Component-wise GLSL mod of two vectors: x - y * floor(x / y) per component.
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone] [ForceInline]
+public vector<T, N> mod(vector<T, N> x, vector<T, N> y)
+{
+    return x - y * floor(x / y);
+}
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T mix(T x, T y, T a)
+{
+    return lerp(x, y, a);
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> mix(vector<T, N> x, vector<T, N> y, T a)
+{
+    return lerp(x, y, vector<T, N>(a));
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> mix(vector<T, N> x, vector<T, N> y, vector<T, N> a)
+{
+    return lerp(x, y, a);
+}
+
+__generic<T>
+[__readNone]
+[ForceInline]
+public T mix(T x, T y, bool a)
+{
+    return (a ? y : x);
+}
+
+__generic<T, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> mix(vector<T, N> x, vector<T, N> y, vector<bool, N> a)
+{
+    vector<T, N> result;
+    for (int i = 0; i < N; i++)
+    {
+        result[i] = (a[i] ? y[i] : x[i]);
+    }
+    return result;
+}
+
+[__readNone]
+[ForceInline]
+public int floatBitsToInt(highp float x)
+{
+    return asint(x);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int, N> floatBitsToInt(highp vector<float, N> x)
+{
+    return asint(x);
+}
+
+[__readNone]
+[ForceInline]
+public uint floatBitsToUint(highp float x)
+{
+    return asuint(x);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint, N> floatBitsToUint(highp vector<float, N> x)
+{
+    return asuint(x);
+}
+
+[__readNone]
+[ForceInline]
+public float intBitsToFloat(highp int x)
+{
+    return asfloat(x);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<float, N> intBitsToFloat(highp vector<int, N> x)
+{
+    return asfloat(x);
+}
+
+[__readNone]
+[ForceInline]
+public float uintBitsToFloat(highp uint x)
+{
+    return asfloat(x);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<float, N> uintBitsToFloat(highp vector<uint, N> x)
+{
+    return asfloat(x);
+}
+
+//
+// Section 8.4. Floating-Point Pack and Unpack Functions
+//
+
+// Quantizes c, clamped to [0, 1], to an unsigned 16-bit value (adding 0.5 before truncation rounds to nearest).
+[__readNone] [ForceInline]
+uint packUnorm1x16(float c)
+{
+    return uint(clamp(c, 0.0, 1.0) * 65535.0 + 0.5);
+}
+
+// Quantizes v, clamped to [-1, 1], to a 16-bit two's-complement value held in the low 16 bits (0.0 -> 0x0000).
+[__readNone] [ForceInline]
+uint packSnorm1x16(float v)
+{
+    return uint(int(round(clamp(v, -1.0, 1.0) * 32767.0))) & uint(0xffff);
+}
+
+// Quantizes c, clamped to [0, 1], to an unsigned 8-bit value (adding 0.5 before truncation rounds to nearest).
+[__readNone] [ForceInline]
+uint packUnorm1x8(float c)
+{
+    return uint(clamp(c, 0.0, 1.0) * 255.0 + 0.5);
+}
+
+// Quantizes c, clamped to [-1, 1], to an 8-bit two's-complement value held in the low 8 bits (0.0 -> 0x00).
+[__readNone] [ForceInline]
+uint packSnorm1x8(float c)
+{
+    return uint(int(round(clamp(c, -1.0, 1.0) * 127.0))) & uint(0xff);
+}
+
+// Maps an unsigned 16-bit value p (caller passes only the low 16 bits) back to [0, 1].
+[__readNone] [ForceInline]
+float unpackUnorm1x16(uint p)
+{
+    return float(p) / 65535.0;
+}
+
+// Sign-extends the low 16 bits of p (shift up, arithmetic shift down) and maps the result to [-1, 1].
+[__readNone] [ForceInline]
+float unpackSnorm1x16(uint p)
+{
+    return clamp(float(int(p << uint(16)) >> 16) / 32767.0, -1.0, 1.0);
+}
+
+// Maps an unsigned 8-bit value p (caller passes only the low 8 bits) back to [0, 1].
+[__readNone] [ForceInline]
+float unpackUnorm1x8(uint p)
+{
+    return float(p) / 255.0;
+}
+
+// Sign-extends the low 8 bits of p (shift up, arithmetic shift down) and maps the result to [-1, 1].
+[__readNone] [ForceInline]
+float unpackSnorm1x8(uint p)
+{
+    return clamp(float(int(p << uint(24)) >> 24) / 127.0, -1.0, 1.0);
+}
+
+[__readNone]
+[ForceInline]
+uint float2half(float f)
+{
+    // Converts f to IEEE binary16 bits in the low 16 bits (mantissa truncated; values below half's normal range become signed zero).
+    uint u = floatBitsToUint(f);
+    uint s = ((u >> uint(16)) & uint(0x8000));
+    uint a = (u & uint(0x7fffffff)); // magnitude bits: exponent and mantissa without the sign
+    if (a >= uint(0x47800000)) // too large for half (>= 65536), Inf or NaN
+        return s | ((a > uint(0x7f800000)) ? uint(0x7e00) : uint(0x7c00));
+    if (a < uint(0x38800000)) // zero, or smaller than the smallest normal half (2^-14)
+        return s;
+    return s | ((a - uint(0x38000000)) >> uint(13)); // rebias exponent by 127 - 15 and drop 13 mantissa bits
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uint packUnorm2x16(vec2 v)
+{
+    return packUnorm1x16(v.x) | (packUnorm1x16(v.y) << uint(16));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uint packSnorm2x16(vec2 v)
+{
+    return packSnorm1x16(v.x) | (packSnorm1x16(v.y) << uint(16));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uint packUnorm4x8(vec4 v)
+{
+    return packUnorm1x8(v.x) | (packUnorm1x8(v.y) << uint(8)) | (packUnorm1x8(v.z) << uint(16)) | (packUnorm1x8(v.w) << uint(24));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uint packSnorm4x8(vec4 v)
+{
+    return packSnorm1x8(v.x) | (packSnorm1x8(v.y) << uint(8)) | (packSnorm1x8(v.z) << uint(16)) | (packSnorm1x8(v.w) << uint(24));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public vec2 unpackUnorm2x16(uint p)
+{
+    return vec2(unpackUnorm1x16(p & uint(0xffff)), unpackUnorm1x16(p >> uint(16)));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public vec2 unpackSnorm2x16(uint p)
+{
+    return vec2(unpackSnorm1x16(p & uint(0xffff)), unpackSnorm1x16(p >> uint(16)));
+}
+
+// Splits p into four bytes (x = least significant) and converts each one with unpackUnorm1x8.
+__target_intrinsic(glsl)
+[__readNone] [ForceInline]
+public vec4 unpackUnorm4x8(highp uint p)
+{
+    return vec4(unpackUnorm1x8(p & uint(0xff)), unpackUnorm1x8((p >> uint(8)) & uint(0xff)), unpackUnorm1x8((p >> uint(16)) & uint(0xff)), unpackUnorm1x8(p >> uint(24)));
+}
+
+// Splits p into four bytes (x = least significant) and converts each one with unpackSnorm1x8.
+__target_intrinsic(glsl)
+[__readNone] [ForceInline]
+public vec4 unpackSnorm4x8(highp uint p)
+{
+    return vec4(unpackSnorm1x8(p & uint(0xff)), unpackSnorm1x8((p >> uint(8)) & uint(0xff)), unpackSnorm1x8((p >> uint(16)) & uint(0xff)), unpackSnorm1x8(p >> uint(24)));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uint packHalf2x16(vec2 v)
+{
+    return float2half(v.x) | (float2half(v.y) << uint(16));
+}
+
+// Converts IEEE binary16 bits in the low 16 bits of h to float. Not a GLSL built-in, so it carries no __target_intrinsic(glsl).
+[__readNone] [ForceInline]
+public float half2float(uint h)
+{
+    uint s = ((h & uint(0x8000)) << uint(16));
+    uint e = (h & uint(0x7c00));
+    uint m = (h & uint(0x03ff));
+    if (e == uint(0)) // zero or subnormal: magnitude is exactly m * 2^-24
+        return uintBitsToFloat(s | floatBitsToUint(float(m) * 5.9604644775390625e-8));
+    if (e == uint(0x7c00)) // Inf or NaN: all-ones float exponent, mantissa payload kept
+        return uintBitsToFloat(s | uint(0x7f800000) | (m << uint(13)));
+    // Normal number: rebias the exponent by 127 - 15 and widen the mantissa by 13 bits.
+    return uintBitsToFloat(s | ((e + uint(0x1c000)) << uint(13)) | (m << uint(13)));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public vec2 unpackHalf2x16(uint p)
+{
+    return vec2(half2float(p & uint(0xffff)), half2float(p >> uint(16)));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public double packDouble2x32(uvec2 v)
+{
+    // TODO: there is no "asdouble()"
+    //return asdouble(uint64_t(v.x) | (uint64_t(v.y) << 32));
+    return 0.0;
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uvec2 unpackDouble2x32(double v)
+{
+    // TODO: there is no "asuint64()"
+    uint64_t u = 0; // asuint64(v);
+    return uvec2(uint(u & 0xFFFFFFFF), uint(u >> 32));
+}
+
+//
+// Section 8.5. Geometric Functions
+//
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T faceforward(T n, T i, T ng)
+{
+    return dot(ng, i) < T(0.0f) ? n : -n;
+}
+
+//
+// Section 8.6. Matrix Functions
+//
+
+__generic<T : __BuiltinFloatingPointType, let C : int, let R : int>
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+[OverloadRank(15)]
+public matrix<T, C, R> outerProduct(vector<T, C> c, vector<T, R> r)
+{
+    // Column major matrix in GLSL
+    matrix<T, C, R> result;
+    for (int i = 0; i < C; ++i)
+    {
+        for (int j = 0; j < R; ++j)
+        {
+            result[i][j] = c[i] * r[j];
+        }
+    }
+    return result;
+}
+
+__generic<T : __BuiltinFloatingPointType, let N : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+matrix<T,N,N> inverse(matrix<T,N,N> m);
+
+//
+// Section 8.8. Integer Functions
+//
+
+// Returns (x + y) modulo 2^32; carry is set to 1 when the 32-bit sum wrapped around, otherwise 0.
+[__readNone] [ForceInline]
+public uint uaddCarry(highp uint x, highp uint y, out lowp uint carry)
+{
+    let result = x + y;
+    carry = ((result < x) ? 1 : 0); // an unsigned sum wraps exactly when it ends up smaller than an operand
+    return result;
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> uaddCarry(highp vector<uint,N> x, highp vector<uint,N> y, out lowp vector<uint,N> carry)
+{
+    VECTOR_MAP_TRINARY(uint, N, uaddCarry, x, y, carry);
+}
+
+[__readNone]
+[ForceInline]
+public uint usubBorrow(highp uint x, highp uint y, out lowp uint borrow)
+{
+    borrow = (y > x) ? 1 : 0;
+    return x - y;
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> usubBorrow(highp vector<uint,N> x, highp vector<uint,N> y, out lowp vector<uint,N> borrow)
+{
+    VECTOR_MAP_TRINARY(uint, N, usubBorrow, x, y, borrow);
+}
+
+// Computes the full 64-bit product of x and y: msb receives the high 32 bits, lsb the low 32 bits.
+[__readNone] [ForceInline]
+public void umulExtended(highp uint x, highp uint y, out highp uint msb, out highp uint lsb)
+{
+    uint64_t result = uint64_t(x) * uint64_t(y); // widen before multiplying; a 32-bit multiply would drop the high half
+    msb = uint(result >> 32);
+    lsb = uint(result);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public void umulExtended(highp vector<uint,N> x, highp vector<uint,N> y, out highp vector<uint,N> msb, out highp vector<uint,N> lsb)
+{
+    for(int i = 0; i < N; ++i)
+    {
+       umulExtended(x[i], y[i], msb[i], lsb[i]);
+    }
+}
+
+// Computes the full signed 64-bit product of x and y: msb receives the high 32 bits, lsb the low 32 bits.
+[__readNone] [ForceInline]
+public void imulExtended(highp int x, highp int y, out highp int msb, out highp int lsb)
+{
+    int64_t result = int64_t(x) * int64_t(y); // widen before multiplying; a 32-bit multiply would drop the high half
+    msb = int(result >> 32);
+    lsb = int(result);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public void imulExtended(highp vector<int,N> x, highp vector<int,N> y, out highp vector<int,N> msb, out highp vector<int,N> lsb)
+{
+    for(int i = 0; i < N; ++i)
+    {
+       imulExtended(x[i], y[i], msb[i], lsb[i]);
+    }
+}
+
+// Extracts bits [offset, offset + bits) of value, sign-extended from the field's top bit; yields 0 when bits == 0.
+[__readNone] [ForceInline]
+public int bitfieldExtract(int value, int offset, int bits)
+{
+    return (bits == 0) ? 0 : ((value << (32 - bits - offset)) >> (32 - bits)); // left-align the field, then arithmetic shift down
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int,N> bitfieldExtract(vector<int,N> value, int offset, int bits)
+{
+    vector<int,N> result;
+    for (int i = 0; i < N; ++i)
+    {
+        result[i] = bitfieldExtract(value[i], offset, bits);
+    }
+    return result;
+}
+
+// Extracts bits [offset, offset + bits) of value, zero-extended; yields 0 when bits == 0 and handles bits == 32.
+[__readNone] [ForceInline]
+public uint bitfieldExtract(uint value, int offset, int bits)
+{
+    return (bits == 0) ? 0u : ((value << (32 - bits - offset)) >> (32 - bits)); // left-align the field, then logical shift down
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> bitfieldExtract(vector<uint,N> value, int offset, int bits)
+{
+    vector<uint,N> result;
+    for (int i = 0; i < N; ++i)
+    {
+        result[i] = bitfieldExtract(value[i], offset, bits);
+    }
+    return result;
+}
+
+[__readNone]
+[ForceInline]
+public uint bitfieldInsert(uint base, uint insert, int offset, int bits)
+{
+    uint clearMask = ~(((1u << bits) - 1u) << offset);
+    uint clearedBase = base & clearMask;
+    uint maskedInsert = (insert & ((1u << bits) - 1u)) << offset;
+    return clearedBase | maskedInsert;
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> bitfieldInsert(vector<uint,N> base, vector<uint,N> insert, int offset, int bits)
+{
+    vector<uint,N> result;
+    for (int i = 0; i < N; ++i)
+    {
+        result[i] = bitfieldInsert(base[i], insert[i], offset, bits);
+    }
+    return result;
+}
+
+[__readNone]
+[ForceInline]
+public int bitfieldInsert(int base, int insert, int offset, int bits)
+{
+    uint clearMask = ~(((1u << bits) - 1u) << offset);
+    uint clearedBase = base & clearMask;
+    uint maskedInsert = (insert & ((1u << bits) - 1u)) << offset;
+    return clearedBase | maskedInsert;
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int,N> bitfieldInsert(vector<int,N> base, vector<int,N> insert, int offset, int bits)
+{
+    vector<int,N> result;
+    for (int i = 0; i < N; ++i)
+    {
+        result[i] = bitfieldInsert(base[i], insert[i], offset, bits);
+    }
+    return result;
+}
+
+[__readNone]
+[ForceInline]
+public int bitfieldReverse(highp int value)
+{
+    value = ((value & 0xAAAAAAAA) >> 1) | ((value & 0x55555555) << 1);
+    value = ((value & 0xCCCCCCCC) >> 2) | ((value & 0x33333333) << 2);
+    value = ((value & 0xF0F0F0F0) >> 4) | ((value & 0x0F0F0F0F) << 4);
+    value = ((value & 0xFF00FF00) >> 8) | ((value & 0x00FF00FF) << 8);
+    value = ((value & 0xFFFF0000) >> 16) | ((value & 0x0000FFFF) << 16);
+    return value;
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int,N> bitfieldReverse(highp vector<int,N> value)
+{
+    VECTOR_MAP_UNARY(int, N, bitfieldReverse, value);
+}
+
+[__readNone]
+[ForceInline]
+public uint bitfieldReverse(highp uint value)
+{
+    value = ((value & 0xAAAAAAAA) >> 1) | ((value & 0x55555555) << 1);
+    value = ((value & 0xCCCCCCCC) >> 2) | ((value & 0x33333333) << 2);
+    value = ((value & 0xF0F0F0F0) >> 4) | ((value & 0x0F0F0F0F) << 4);
+    value = ((value & 0xFF00FF00) >> 8) | ((value & 0x00FF00FF) << 8);
+    value = ((value & 0xFFFF0000) >> 16) | ((value & 0x0000FFFF) << 16);
+    return value;
+}
+
+// Reverses the bit order of every component; the result vector is unsigned to match the declared return type.
+__generic<let N:int>
+[__readNone] [ForceInline]
+public vector<uint,N> bitfieldReverse(highp vector<uint,N> value)
+{
+    VECTOR_MAP_UNARY(uint, N, bitfieldReverse, value);
+}
+
+[__readNone] [ForceInline] REQUIRE_KHRONOS
+public uint bitCount(uint value)
+{
+    return countbits(value);
+}
+
+__generic<let N:int>
+[__readNone] [ForceInline] REQUIRE_KHRONOS
+public vector<uint,N> bitCount(vector<uint,N> value)
+{
+    VECTOR_MAP_UNARY(uint, N, countbits, value);
+}
+
+[__readNone] [ForceInline] REQUIRE_KHRONOS
+public int bitCount(int value)
+{
+    return countbits(uint(value));
+}
+    
+__generic<let N:int>
+[__readNone] [ForceInline] REQUIRE_KHRONOS
+public vector<int,N> bitCount(vector<int,N> value)
+{
+    VECTOR_MAP_UNARY(int, N, countbits, value);
+}
+
+[__readNone]
+[ForceInline]
+public int findLSB(int v)
+{
+    return firstbitlow(v);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int,N> findLSB(vector<int,N> value)
 {
     return firstbitlow(value);
 }
-[ForceInline] public vector<uint,N> findLSB<let N:int>(vector<uint,N> value)
+
+[__readNone]
+[ForceInline]
+public uint findLSB(uint v)
+{
+    return firstbitlow(v);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> findLSB(vector<uint,N> value)
 {
     return firstbitlow(value);
 }
 
+[__readNone]
+[ForceInline]
+public int findMSB(int value)
+{
+    return firstbithigh(value);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int,N> findMSB(vector<int,N> value)
+{
+    return firstbithigh(value);
+}
+
+[__readNone]
+[ForceInline]
+public uint findMSB(uint value)
+{
+    return firstbithigh(value);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> findMSB(vector<uint,N> value)
+{
+    return firstbithigh(value);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<bool,N> not(vector<bool,N> x)
+{
+    return !x;
+}
+
 //
 // Section 8.9.1. Texture Query Functions
 //
@@ -1986,3 +2787,59 @@ public vec4 shadow2DProjLod(sampler2DShadow sampler, vec4 coord, float lod)
     return textureProjLod(sampler, coord, lod);
 }
 
+//
+// Ray tracing
+//
+
+public typealias rayQueryEXT = RayQuery;
+
+__glsl_extension(GL_EXT_ray_query)
+__glsl_version(460)
+[ForceInline]
+public void rayQueryConfirmIntersectionEXT(inout rayQueryEXT q)
+{
+    q.CommitNonOpaqueTriangleHit();
+}
+
+__glsl_extension(GL_EXT_ray_query)
+__glsl_version(460)
+[ForceInline]
+public bool rayQueryProceedEXT(inout rayQueryEXT q)
+{
+    return q.Proceed();
+}
+
+// Returns the committed intersection status when committed is true, otherwise the current candidate's type.
+__glsl_extension(GL_EXT_ray_query)
+__glsl_version(460)
+[__NoSideEffect]
+public uint rayQueryGetIntersectionTypeEXT(rayQueryEXT q, bool committed)
+{
+    if (committed)
+    {
+        return uint(q.CommittedStatus());
+    }
+    else
+    {
+        return uint(q.CandidateType());
+    }
+}
+
+
+//
+// Subgroup
+//
+
+__glsl_extension(KHR_shader_subgroup)
+__glsl_version(450)
+public void subgroupBarrier()
+{
+    //__subgroupBarrier();
+}
+
+__glsl_extension(KHR_shader_subgroup)
+__glsl_version(450)
+public void subgroupMemoryBarrier()
+{
+}
+
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 2bf0c1d80..8183c2030 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2,6 +2,9 @@
 
 typedef uint UINT;
 
+__intrinsic_op($(kIROp_FloatCast)) // bound directly to the FloatCast IR opcode; there is no Slang body
+T __floatCast<T, U>(U v); // casts v from U to T; neither T nor U is constrained in this declaration
+
 [sealed]
 interface IBufferDataLayout
 {
@@ -4407,6 +4410,16 @@ T distance(T x, T y)
 
 // Vector dot product
 
+__generic<T : __BuiltinFloatingPointType> // scalar overload of dot for any built-in floating-point type
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+T dot(T x, T y) // NOTE(review): hlsl/glsl are tagged as target intrinsics; presumably the body serves the other targets - confirm
+{
+    return x * y; // with scalar operands the dot product is just the product
+}
+
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
@@ -4561,16 +4574,34 @@ matrix<T, N, M> exp(matrix<T, N, M> x)
 
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
-__target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_exp2($0)")
 __target_intrinsic(cpp, "$P_exp2($0)")
-__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp2 _0")
 [__readNone]
-T exp2(T x);
+T exp2(T x) // scalar exp2: glsl and spirv are dispatched in the body; hlsl/cuda/cpp use the attributes above
+{
+    __target_switch
+    {
+    case glsl:
+        if (__isHalf<T>())
+            __intrinsic_asm "exp2($0)"; // half: the operand is passed to exp2 as-is
+        __intrinsic_asm "exp2(float($0))"; // any other T: the operand is converted to float first
+    case spirv:
+        if (__isHalf<T>())
+        {
+            return spirv_asm { OpExtInst $$T result glsl450 Exp2 $x }; // half: Exp2 is evaluated directly in T
+        }
+        else
+        {
+            float xf = __floatCast<float>(x); // any other T: cast the operand to float ...
+            return T(spirv_asm {
+                 result:$$float = OpExtInst glsl450 Exp2 $xf
+            }); // ... evaluate Exp2 in float, then convert the result back to T
+        }
+    }
+}
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
-__target_intrinsic(glsl)
 __target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp2 _0")
 [__readNone]
 vector<T,N> exp2(vector<T,N> x)
@@ -4786,31 +4817,41 @@ matrix<T, N, M> floor(matrix<T, N, M> x)
     MATRIX_MAP_UNARY(T, N, M, floor, x);
 }
 
-// Fused multiply-add for doubles
-__target_intrinsic(hlsl)
+// Fused multiply-add of a, b and c for any built-in floating-point scalar type
+__generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_fma($0, $1, $2)")
 __target_intrinsic(cpp, "$P_fma($0, $1, $2)")
 __target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
 [__readNone]
-double fma(double a, double b, double c);
+T fma(T a, T b, T c) // only hlsl is handled in the body; the other targets use the attributes above
+{
+    __target_switch
+    {
+    case hlsl:
+        if (__isFloat<T>() || __isHalf<T>())
+            return mad(a, b, c); // float and half are routed through mad on hlsl
+        else
+            __intrinsic_asm "fma($0, $1, $2)"; // remaining T (presumably double) emits a literal fma call
+    }
+}
 
-__generic<let N : int>
+__generic<T : __BuiltinFloatingPointType, let N : int>
-__target_intrinsic(hlsl)
+// Vector fma. Deliberately not an hlsl intrinsic: HLSL's fma is double-only, so hlsl uses the per-element body (scalar fma picks mad or fma by type).
 __target_intrinsic(glsl)
 __target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
 [__readNone]
-vector<double, N> fma(vector<double, N> a, vector<double, N> b, vector<double, N> c)
+vector<T, N> fma(vector<T, N> a, vector<T, N> b, vector<T, N> c)
 {
-    VECTOR_MAP_TRINARY(double, N, fma, a, b, c);
+    VECTOR_MAP_TRINARY(T, N, fma, a, b, c); // maps the scalar fma overload over the N elements
 }
 
-__generic<let N : int, let M : int>
+__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
-__target_intrinsic(hlsl)
+// Matrix fma. Deliberately not an hlsl intrinsic: HLSL's fma is double-only, so every target expands the mapped body below.
 [__readNone]
-matrix<double, N, M> fma(matrix<double, N, M> a, matrix<double, N, M> b, matrix<double, N, M> c)
+matrix<T, N, M> fma(matrix<T, N, M> a, matrix<T, N, M> b, matrix<T, N, M> c)
 {
-    MATRIX_MAP_TRINARY(double, N, M, fma, a, b, c);
+    MATRIX_MAP_TRINARY(T, N, M, fma, a, b, c); // maps fma over the N x M matrix via the MATRIX_MAP_TRINARY helper
 }
 
 // Floating point remainder of x/y
@@ -6414,6 +6455,16 @@ vector<T,N> normalize(vector<T,N> x)
     return x / length(x);
 }
 
+__generic<T : __BuiltinFloatingPointType> // scalar overload of normalize
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Normalize _0")
+[__readNone]
+T normalize(T x) // NOTE(review): hlsl/glsl/spirv are tagged as target intrinsics; presumably the body serves the other targets - confirm
+{
+    return x / length(x); // divides x by length(x); relies on a scalar overload of length defined elsewhere
+}
+
 // Raise to a power
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
@@ -6618,6 +6669,16 @@ matrix<T, N, M> rcp(matrix<T, N, M> x)
 }
 
 // Reflect incident vector across plane with given normal
+__generic<T : __BuiltinFloatingPointType> // scalar overload of reflect
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Reflect _0 _1")
+[__readNone]
+T reflect(T i, T n) // i: incident value, n: normal (names follow the "incident vector ... normal" comment above)
+{
+    return i - T(2) * dot(n,i) * n; // same i - 2*dot(n,i)*n formula, with dot resolving to the scalar overload
+}
+
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
@@ -6642,6 +6703,19 @@ vector<T,N> refract(vector<T,N> i, vector<T,N> n, T eta)
     return eta * i - (eta * dotNI + sqrt(k)) * n;
 }
 
+__generic<T : __BuiltinFloatingPointType> // scalar overload of refract
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Refract _0 _1 _2")
+[__readNone]
+T refract(T i, T n, T eta) // i: incident value, n: normal, eta: scale factor used in the formula below
+{
+    let dotNI = dot(n,i); // scalar dot, i.e. n * i
+    let k = T(1) - eta*eta*(T(1) - dotNI * dotNI); // term placed under the square root below
+    if(k < T(0)) return T(0); // negative k: sqrt(k) would be undefined, so the result is zero
+    return eta * i - (eta * dotNI + sqrt(k)) * n;
+}
+
 // Reverse order of bits
 [__readNone]
 uint reversebits(uint value)
-- 
cgit v1.2.3