4 files changed, 1479 insertions, 83 deletions
diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang
index 6f4888a5d..8a46f7d60 100644
--- a/source/slang/diff.meta.slang
+++ b/source/slang/diff.meta.slang
@@ -1563,71 +1563,27 @@ void __d_clamp(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpMin, i
 VECTOR_MATRIX_TERNARY_DIFF_IMPL(clamp)
 
 // fma
+__generic<T : __BuiltinFloatingPointType>
 [BackwardDifferentiable]
 [ForwardDerivativeOf(fma)]
 [PreferRecompute]
-DifferentialPair<double> __d_fma(DifferentialPair<double> dpx, DifferentialPair<double> dpy, DifferentialPair<double> dpz)
+DifferentialPair<T> __d_fma(DifferentialPair<T> dpx, DifferentialPair<T> dpy, DifferentialPair<T> dpz)
 {
-    return DifferentialPair<double>(
+    return DifferentialPair<T>(
         fma(dpx.p, dpy.p, dpz.p),
-        dpy.p * dpx.d + dpx.p * dpy.d + dpz.d);
+        T.dadd(T.dadd(__mul_p_d(dpy.p, dpx.d), __mul_p_d(dpx.p, dpy.d)), dpz.d));
 }
+__generic<T : __BuiltinFloatingPointType>
 [BackwardDifferentiable]
 [BackwardDerivativeOf(fma)]
 [PreferRecompute]
-void __d_fma(inout DifferentialPair<double> dpx, inout DifferentialPair<double> dpy, inout DifferentialPair<double> dpz, double dOut)
+void __d_fma(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpy, inout DifferentialPair<T> dpz, T.Differential dOut)
 {
-    dpx = diffPair(dpx.p, dpy.p * dOut);
-    dpy = diffPair(dpy.p, dpx.p * dOut);
+    dpx = diffPair(dpx.p, __mul_p_d(dpy.p, dOut));
+    dpy = diffPair(dpy.p, __mul_p_d(dpx.p, dOut));
     dpz = diffPair(dpz.p, dOut);
 }
-__generic<let N : int>
-[BackwardDifferentiable]
-[ForwardDerivativeOf(fma)]
-[PreferRecompute]
-DifferentialPair<vector<double, N>> __d_fma_vector(
-    DifferentialPair<vector<double, N>> dpx,
-    DifferentialPair<vector<double, N>> dpy,
-    DifferentialPair<vector<double, N>> dpz)
-{
-    vector<double, N> result;
-    vector<double, N>.Differential d_result;
-    [ForceUnroll] for (int i = 0; i < N; ++i)
-    {
-        DifferentialPair<double> dp_elem = __d_fma(
-            DifferentialPair<double>(dpx.p[i], dpx.d[i]),
-            DifferentialPair<double>(dpy.p[i], dpy.d[i]),
-            DifferentialPair<double>(dpz.p[i], dpz.d[i]));
-        result[i] = dp_elem.p;
-        d_result[i] = dp_elem.d;
-    }
-    return DifferentialPair<vector<double, N>>(result, d_result);
-}
-__generic<let N : int>
-[BackwardDifferentiable]
-[BackwardDerivativeOf(fma)]
-[PreferRecompute]
-void __d_fma_vector(
-        inout DifferentialPair<vector<double, N>> dpx,
-        inout DifferentialPair<vector<double, N>> dpy,
-        inout DifferentialPair<vector<double, N>> dpz,
-        vector<double, N> dOut)
-{
-    vector<double, N>.Differential x_d_result, y_d_result, z_d_result;
-    [ForceUnroll] for (int i = 0; i < N; ++i)
-    {
-        DifferentialPair<double> x_dp = diffPair(dpx.p[i], 0.0);
-        DifferentialPair<double> y_dp = diffPair(dpy.p[i], 0.0);
-        DifferentialPair<double> z_dp = diffPair(dpz.p[i], 0.0);
-        __d_fma(x_dp, y_dp, z_dp, dOut[i]);
-        x_d_result[i] = x_dp.d;
-        y_d_result[i] = y_dp.d;
-        z_d_result[i] = z_dp.d;
-    }
-    dpx = diffPair(dpx.p, x_d_result);
-    dpy = diffPair(dpy.p, y_d_result);
-    dpz = diffPair(dpz.p, z_d_result);
-}
+VECTOR_MATRIX_TERNARY_DIFF_IMPL(fma)
 
 // mad
 __generic<T : __BuiltinFloatingPointType>
diff --git a/source/slang/glsl.meta.slang b/source/slang/glsl.meta.slang
index 4fe56acf8..8403d1391 100644
--- a/source/slang/glsl.meta.slang
+++ b/source/slang/glsl.meta.slang
@@ -1,5 +1,22 @@
+// TODO: These keywords are not recognized but they should be.
+#define highp
+#define mediump
+#define lowp
+
+#define VECTOR_MAP_UNARY(TYPE, COUNT, FUNC, VALUE) \
+    vector<TYPE,COUNT> result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(VALUE[i]); } return result
+
+#define VECTOR_MAP_TRINARY(TYPE, COUNT, FUNC, A, B, C) \
+    vector<TYPE,COUNT> result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(A[i], B[i], C[i]); } return result
+
+#define REQUIRE_KHRONOS [require(glsl)] [require(spirv)]
+
+//
+// OpenGL 4.60 spec
+//
+
 //
-// From the GLSL spec, section 4.1. 'asic Types'
+// Section 4.1. 'Basic Types'
 //
 
 public typealias vec2 = vector<float, 2>;
@@ -109,21 +126,21 @@ public in int gl_ViewportIndex : SV_ViewportArrayIndex;
 
 [OverloadRank(15)]
 [ForceInline]
-public matrix<float, N, N> operator*<let N : int>(matrix<float, N, N> m1, matrix<float, N, N> m2)
+public matrix<float, N, N> operator*<let N:int>(matrix<float, N, N> m1, matrix<float, N, N> m2)
 {
     return mul(m2, m1);
 }
 
 [OverloadRank(15)]
 [ForceInline]
-public matrix<half, N, N> operator*<let N : int>(matrix<half, N, N> m1, matrix<half, N, N> m2)
+public matrix<half, N, N> operator*<let N:int>(matrix<half, N, N> m1, matrix<half, N, N> m2)
 {
     return mul(m2, m1);
 }
 
 [OverloadRank(15)]
 [ForceInline]
-public matrix<double, N, N> operator*<let N : int>(matrix<double, N, N> m1, matrix<double, N, N> m2)
+public matrix<double, N, N> operator*<let N:int>(matrix<double, N, N> m1, matrix<double, N, N> m2)
 {
     return mul(m2, m1);
 }
@@ -150,7 +167,7 @@ public vector<T, C> operator*<T:__BuiltinFloatingPointType, let C : int, let R :
 }
 
 __intrinsic_op(mul)
-public matrix<T, N, M> matrixCompMult<T:__BuiltinFloatingPointType, let N : int, let M : int>(matrix<T,N,M> left, matrix<T,N,M> right);
+public matrix<T, N, M> matrixCompMult<T:__BuiltinFloatingPointType, let N:int, let M : int>(matrix<T,N,M> left, matrix<T,N,M> right);
 
 __intrinsic_op(cmpLE)
 public vector<bool, N> lessThanEqual<T, let N:int>(vector<T, N> x, vector<T, N> y);
@@ -180,42 +197,42 @@ public extension vector<T, 3>
 
 [ForceInline]
 [OverloadRank(15)]
-public bool operator==<T:__BuiltinArithmeticType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator==<T:__BuiltinArithmeticType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return all(equal(left, right));
 }
 
 [ForceInline]
 [OverloadRank(15)]
-public bool operator!=<T:__BuiltinArithmeticType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator!=<T:__BuiltinArithmeticType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return any(notEqual(left, right));
 }
 
 [ForceInline]
 [OverloadRank(14)]
-public bool operator==<T:__BuiltinFloatingPointType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator==<T:__BuiltinFloatingPointType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return all(equal(left, right));
 }
 
 [ForceInline]
 [OverloadRank(14)]
-public bool operator!=<T:__BuiltinFloatingPointType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator!=<T:__BuiltinFloatingPointType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return any(notEqual(left, right));
 }
 
 [ForceInline]
 [OverloadRank(14)]
-public bool operator==<T:__BuiltinLogicalType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator==<T:__BuiltinLogicalType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return all(equal(left, right));
 }
 
 [ForceInline]
 [OverloadRank(14)]
-public bool operator!=<T:__BuiltinLogicalType, let N : int>(vector<T, N> left, vector<T, N> right)
+public bool operator!=<T:__BuiltinLogicalType, let N:int>(vector<T, N> left, vector<T, N> right)
 {
     return any(notEqual(left, right));
 }
@@ -227,14 +244,14 @@ for (auto type : kBaseTypes) {
 }}}}
 [ForceInline]
 [OverloadRank(15)]
-public bool operator==<let N : int>(vector<$(typeName), N> left, vector<$(typeName), N> right)
+public bool operator==<let N:int>(vector<$(typeName), N> left, vector<$(typeName), N> right)
 {
     return all(equal(left, right));
 }
 
 [ForceInline]
 [OverloadRank(15)]
-public bool operator!=<let N : int>(vector<$(typeName), N> left, vector<$(typeName), N> right)
+public bool operator!=<let N:int>(vector<$(typeName), N> left, vector<$(typeName), N> right)
 {
     return any(notEqual(left, right));
 }
@@ -242,17 +259,801 @@ ${{{{
 }
 }}}}
 
-[ForceInline] public int findLSB(int v) { return firstbitlow(v); }
-[ForceInline] public uint findLSB(uint v) { return firstbitlow(v); }
-[ForceInline] public vector<int,N> findLSB<let N:int>(vector<int,N> value)
+//
+// Section 8.1. Angle and Trigonometry Functions
+//
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T atan(T y, T x)
+{
+    return atan2(y, x);
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T,N> atan(vector<T,N> y, vector<T,N> x)
+{
+    return atan2(y, x);
+}
+
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(cuda, "$P_asinh($0)")
+__target_intrinsic(cpp, "$P_asinh($0)")
+[__readNone]
+[ForceInline]
+public T asinh(T x)
+{
+    return log(x + sqrt(x * x + T(1)));
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T,N> asinh(vector<T,N> x)
+{
+    VECTOR_MAP_UNARY(T, N, asinh, x);
+}
+
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(cuda, "$P_acosh($0)")
+__target_intrinsic(cpp, "$P_acosh($0)")
+[__readNone]
+[ForceInline]
+public T acosh(T x)
+{
+    return log(x + sqrt( x * x - T(1)));
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T,N> acosh(vector<T,N> x)
+{
+    VECTOR_MAP_UNARY(T, N, acosh, x);
+}
+
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(cuda, "$P_atanh($0)")
+__target_intrinsic(cpp, "$P_atanh($0)")
+[__readNone]
+[ForceInline]
+public T atanh(T x)
+{
+    return T(0.5) * log((T(1) + x) / (T(1) - x));
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T,N> atanh(vector<T,N> x)
+{
+    VECTOR_MAP_UNARY(T, N, atanh, x);
+}
+
+//
+// Section 8.2. Exponential Functions
+//
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T inversesqrt(T x)
+{
+    return rsqrt(x);
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> inversesqrt(vector<T, N> x)
+{
+    return rsqrt(x);
+}
+
+//
+// Section 8.3. Common Functions
+//
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T roundEven(T x)
+{
+    // Round to the nearest integer, resolving exact .5 ties toward the even
+    // neighbour (GLSL roundEven: both 3.5 and 4.5 give 4.0).
+    // `round` is only used away from ties, so its tie direction on a given
+    // target does not matter here.
+    T lower = floor(x);
+    T fraction = x - lower;
+    if (fraction != T(0.5))
+        return round(x);
+    // Exact tie: `lower` and `lower + 1` are equally close; pick the even one.
+    return (fmod(lower, T(2)) == T(0)) ? lower : lower + T(1);
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T,N> roundEven(vector<T,N> x)
+{
+    VECTOR_MAP_UNARY(T, N, roundEven, x);
+}
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T fract(T x)
+{
+    return frac(x);
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> fract(vector<T, N> x)
+{
+    return frac(x);
+}
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T mod(T x, T y)
+{
+    return x - y * floor(x / y); // GLSL mod is floor-based (result follows the sign of y); fmod truncates instead
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> mod(vector<T, N> x, T y)
+{
+    return x - vector<T, N>(y) * floor(x / vector<T, N>(y)); // GLSL mod is floor-based, per component; fmod truncates instead
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> mod(vector<T, N> x, vector<T, N> y)
+{
+    return x - y * floor(x / y); // GLSL mod is floor-based, per component; fmod truncates instead
+}
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T mix(T x, T y, T a)
+{
+    return lerp(x, y, a);
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> mix(vector<T, N> x, vector<T, N> y, T a)
+{
+    return lerp(x, y, vector<T, N>(a));
+}
+
+__generic<T : __BuiltinFloatingPointType, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> mix(vector<T, N> x, vector<T, N> y, vector<T, N> a)
+{
+    return lerp(x, y, a);
+}
+
+__generic<T>
+[__readNone]
+[ForceInline]
+public T mix(T x, T y, bool a)
+{
+    return (a ? y : x);
+}
+
+__generic<T, let N:int>
+[__readNone]
+[ForceInline]
+public vector<T, N> mix(vector<T, N> x, vector<T, N> y, vector<bool, N> a)
+{
+    vector<T, N> result;
+    for (int i = 0; i < N; i++)
+    {
+        result[i] = (a[i] ? y[i] : x[i]);
+    }
+    return result;
+}
+
+[__readNone]
+[ForceInline]
+public int floatBitsToInt(highp float x)
+{
+    return asint(x);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int, N> floatBitsToInt(highp vector<float, N> x)
+{
+    return asint(x);
+}
+
+[__readNone]
+[ForceInline]
+public uint floatBitsToUint(highp float x)
+{
+    return asuint(x);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint, N> floatBitsToUint(highp vector<float, N> x)
+{
+    return asuint(x);
+}
+
+[__readNone]
+[ForceInline]
+public float intBitsToFloat(highp int x)
+{
+    return asfloat(x);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<float, N> intBitsToFloat(highp vector<int, N> x)
+{
+    return asfloat(x);
+}
+
+[__readNone]
+[ForceInline]
+public float uintBitsToFloat(highp uint x)
+{
+    return asfloat(x);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<float, N> uintBitsToFloat(highp vector<uint, N> x)
+{
+    return asfloat(x);
+}
+
+//
+// Section 8.4. Floating-Point Pack and Unpack Functions
+//
+
+// Encodes c, clamped to [0, 1], as a rounded unsigned 16-bit normalized integer.
+[__readNone] [ForceInline]
+uint packUnorm1x16(float c)
+{
+    return uint(clamp(c, 0.0, 1.0) * 65535.0 + 0.5);
+}
+
+// Encodes v, clamped to [-1, 1], as a two's-complement signed 16-bit normalized integer in the low 16 bits.
+[__readNone] [ForceInline]
+uint packSnorm1x16(float v)
+{
+    return uint(int(round(clamp(v, -1.0, 1.0) * 32767.0))) & uint(0xffff);
+}
+
+// Encodes c, clamped to [0, 1], as a rounded unsigned 8-bit normalized integer.
+[__readNone] [ForceInline]
+uint packUnorm1x8(float c)
+{
+    return uint(clamp(c, 0.0, 1.0) * 255.0 + 0.5);
+}
+
+// Encodes c, clamped to [-1, 1], as a two's-complement signed 8-bit normalized integer in the low 8 bits.
+[__readNone] [ForceInline]
+uint packSnorm1x8(float c)
+{
+    return uint(int(round(clamp(c, -1.0, 1.0) * 127.0))) & uint(0xff);
+}
+
+// Decodes the low 16 bits of p as an unsigned normalized value in [0, 1]; higher bits are ignored.
+[__readNone] [ForceInline]
+float unpackUnorm1x16(uint p)
+{
+    return float(p & uint(0xffff)) / 65535.0;
+}
+
+// Decodes the low 16 bits of p as a two's-complement signed normalized value in [-1, 1]; higher bits are ignored.
+[__readNone] [ForceInline]
+float unpackSnorm1x16(uint p)
+{
+    return clamp(float(int(p << uint(16)) >> 16) / 32767.0, -1.0, 1.0); // shift up, then arithmetic shift down to sign-extend
+}
+
+// Decodes the low 8 bits of p as an unsigned normalized value in [0, 1]; higher bits are ignored.
+[__readNone] [ForceInline]
+float unpackUnorm1x8(uint p)
+{
+    return float(p & uint(0xff)) / 255.0;
+}
+
+// Decodes the low 8 bits of p as a two's-complement signed normalized value in [-1, 1]; higher bits are ignored.
+[__readNone] [ForceInline]
+float unpackSnorm1x8(uint p)
+{
+    return clamp(float(int(p << uint(24)) >> 24) / 127.0, -1.0, 1.0); // shift up, then arithmetic shift down to sign-extend
+}
+
+// Converts f to IEEE binary16 bits (low 16 bits of the result). The mantissa is
+// truncated, and magnitudes too small for a normal half flush to signed zero.
+[__readNone] [ForceInline]
+uint float2half(float f)
+{
+    uint u = floatBitsToUint(f);
+    uint sign = ((u >> uint(16)) & uint(0x8000));
+    uint magnitude = (u & uint(0x7fffffff));
+    if (magnitude > uint(0x7f800000)) return (sign | uint(0x7e00)); // NaN stays NaN
+    if (magnitude >= uint(0x47800000)) return (sign | uint(0x7c00)); // infinity, or too large for a half
+    if (magnitude < uint(0x38800000)) return sign; // below the smallest normal half
+    // Rebias the exponent (127 -> 15) and keep the top 10 mantissa bits.
+    return (sign | ((magnitude - uint(0x38000000)) >> uint(13)));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uint packUnorm2x16(vec2 v)
+{
+    return packUnorm1x16(v.x) | (packUnorm1x16(v.y) << uint(16));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uint packSnorm2x16(vec2 v)
+{
+    return packSnorm1x16(v.x) | (packSnorm1x16(v.y) << uint(16));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uint packUnorm4x8(vec4 v)
+{
+    return packUnorm1x8(v.x) | (packUnorm1x8(v.y) << uint(8)) | (packUnorm1x8(v.z) << uint(16)) | (packUnorm1x8(v.w) << uint(24));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uint packSnorm4x8(vec4 v)
+{
+    return packSnorm1x8(v.x) | (packSnorm1x8(v.y) << uint(8)) | (packSnorm1x8(v.z) << uint(16)) | (packSnorm1x8(v.w) << uint(24));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public vec2 unpackUnorm2x16(uint p)
+{
+    return vec2(unpackUnorm1x16(p & uint(0xffff)), unpackUnorm1x16(p >> uint(16)));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public vec2 unpackSnorm2x16(uint p)
+{
+    return vec2(unpackSnorm1x16(p & uint(0xffff)), unpackSnorm1x16(p >> uint(16)));
+}
+
+// Splits p into four bytes (x from the least significant byte) and decodes each with unpackUnorm1x8.
+__target_intrinsic(glsl)
+[__readNone] [ForceInline]
+public vec4 unpackUnorm4x8(highp uint p)
+{
+    return vec4(unpackUnorm1x8(p & uint(0xff)), unpackUnorm1x8((p >> uint(8)) & uint(0xff)), unpackUnorm1x8((p >> uint(16)) & uint(0xff)), unpackUnorm1x8(p >> uint(24)));
+}
+
+// Splits p into four bytes (x from the least significant byte) and decodes each with unpackSnorm1x8.
+__target_intrinsic(glsl)
+[__readNone] [ForceInline]
+public vec4 unpackSnorm4x8(highp uint p)
+{
+    return vec4(unpackSnorm1x8(p & uint(0xff)), unpackSnorm1x8((p >> uint(8)) & uint(0xff)), unpackSnorm1x8((p >> uint(16)) & uint(0xff)), unpackSnorm1x8(p >> uint(24)));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uint packHalf2x16(vec2 v)
+{
+    return float2half(v.x) | (float2half(v.y) << uint(16));
+}
+
+// Expands the IEEE binary16 value in the low 16 bits of h to a float. Half denormals flush to signed zero.
+__target_intrinsic(glsl)
+[__readNone] [ForceInline]
+public float half2float(uint h)
+{
+    uint sign = ((h & uint(0x8000)) << uint(16));
+    uint magnitude = (h & uint(0x7fff));
+    // Exponent field 0: zero or denormal.
+    if (magnitude < uint(0x0400)) return uintBitsToFloat(sign);
+    // Exponent field 31: infinity or NaN; keep the payload under an all-ones float exponent.
+    if (magnitude >= uint(0x7c00)) return uintBitsToFloat(sign | uint(0x7f800000) | ((magnitude & uint(0x03ff)) << uint(13)));
+    // Normal number: rebias the exponent (15 -> 127) and widen the mantissa.
+    return uintBitsToFloat(sign | ((magnitude + uint(0x1c000)) << uint(13)));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public vec2 unpackHalf2x16(uint p)
+{
+    return vec2(half2float(p & uint(0xffff)), half2float(p >> uint(16)));
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public double packDouble2x32(uvec2 v)
+{
+    // TODO: placeholder body. It ignores v and always returns 0.0 because there is no "asdouble()" to use here.
+    // Intended implementation: return asdouble(uint64_t(v.x) | (uint64_t(v.y) << 32));
+    return 0.0;
+}
+
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+public uvec2 unpackDouble2x32(double v)
+{
+    // TODO: placeholder body. There is no "asuint64()", so u is hard-coded to 0 and the result is uvec2(0, 0) for every v.
+    uint64_t u = 0; // intended: asuint64(v)
+    return uvec2(uint(u & 0xFFFFFFFF), uint(u >> 32));
+}
+
+//
+// Section 8.5. Geometric Functions
+//
+
+__generic<T : __BuiltinFloatingPointType>
+[__readNone]
+[ForceInline]
+public T faceforward(T n, T i, T ng)
+{
+    return dot(ng, i) < T(0.0f) ? n : -n;
+}
+
+//
+// Section 8.6. Matrix Functions
+//
+
+__generic<T : __BuiltinFloatingPointType, let C : int, let R : int>
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+[OverloadRank(15)]
+public matrix<T, C, R> outerProduct(vector<T, C> c, vector<T, R> r)
+{
+    // Column major matrix in GLSL
+    matrix<T, C, R> result;
+    for (int i = 0; i < C; ++i)
+    {
+        for (int j = 0; j < R; ++j)
+        {
+            result[i][j] = c[i] * r[j];
+        }
+    }
+    return result;
+}
+
+__generic<T : __BuiltinFloatingPointType, let N : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+matrix<T,N,N> inverse(matrix<T,N,N> m);
+
+//
+// Section 8.8. Integer Functions
+//
+
+// Returns x + y modulo 2^32 and sets carry to 1 when the sum wrapped around, otherwise 0.
+[__readNone] [ForceInline]
+public uint uaddCarry(highp uint x, highp uint y, out lowp uint carry)
+{
+    let sum = x + y;
+    carry = ((sum < x) ? 1 : 0); // an unsigned sum wrapped exactly when it is smaller than an operand
+    return sum;
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> uaddCarry(highp vector<uint,N> x, highp vector<uint,N> y, out lowp vector<uint,N> carry)
+{
+    VECTOR_MAP_TRINARY(uint, N, uaddCarry, x, y, carry);
+}
+
+[__readNone]
+[ForceInline]
+public uint usubBorrow(highp uint x, highp uint y, out lowp uint borrow)
+{
+    borrow = (y > x) ? 1 : 0;
+    return x - y;
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> usubBorrow(highp vector<uint,N> x, highp vector<uint,N> y, out lowp vector<uint,N> borrow)
+{
+    VECTOR_MAP_TRINARY(uint, N, usubBorrow, x, y, borrow);
+}
+
+// Computes the full 64-bit product of x and y; msb receives the upper 32 bits and lsb the lower 32 bits.
+[__readNone] [ForceInline]
+public void umulExtended(highp uint x, highp uint y, out highp uint msb, out highp uint lsb)
+{
+    uint64_t product = uint64_t(x) * uint64_t(y); // widen first: a uint * uint multiply keeps only the low 32 bits
+    msb = uint(product >> 32);
+    lsb = uint(product);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public void umulExtended(highp vector<uint,N> x, highp vector<uint,N> y, out highp vector<uint,N> msb, out highp vector<uint,N> lsb)
+{
+    for(int i = 0; i < N; ++i)
+    {
+       umulExtended(x[i], y[i], msb[i], lsb[i]);
+    }
+}
+
+// Computes the full 64-bit signed product of x and y; msb receives the upper 32 bits and lsb the lower 32 bits.
+[__readNone] [ForceInline]
+public void imulExtended(highp int x, highp int y, out highp int msb, out highp int lsb)
+{
+    int64_t product = int64_t(x) * int64_t(y); // widen first: an int * int multiply keeps only the low 32 bits
+    msb = int(product >> 32);
+    lsb = int(product);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public void imulExtended(highp vector<int,N> x, highp vector<int,N> y, out highp vector<int,N> msb, out highp vector<int,N> lsb)
+{
+    for(int i = 0; i < N; ++i)
+    {
+       imulExtended(x[i], y[i], msb[i], lsb[i]);
+    }
+}
+
+// Extracts `bits` bits of value starting at bit `offset`, sign-extended from the field's top bit; 0 when bits == 0.
+[__readNone] [ForceInline]
+public int bitfieldExtract(int value, int offset, int bits)
+{
+    return (bits == 0) ? 0 : (int(uint(value) << (32 - offset - bits)) >> (32 - bits)); // left-align the field, then arithmetic shift back down
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int,N> bitfieldExtract(vector<int,N> value, int offset, int bits)
+{
+    vector<int,N> result;
+    for (int i = 0; i < N; ++i)
+    {
+        result[i] = bitfieldExtract(value[i], offset, bits);
+    }
+    return result;
+}
+
+// Extracts `bits` bits of value starting at bit `offset`, zero-extended; 0 when bits == 0.
+[__readNone] [ForceInline]
+public uint bitfieldExtract(uint value, int offset, int bits)
+{
+    return (bits == 0) ? 0u : ((value << (32 - offset - bits)) >> (32 - bits)); // shift-based so bits == 32 needs no `1u << 32` mask
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> bitfieldExtract(vector<uint,N> value, int offset, int bits)
+{
+    vector<uint,N> result;
+    for (int i = 0; i < N; ++i)
+    {
+        result[i] = bitfieldExtract(value[i], offset, bits);
+    }
+    return result;
+}
+
+// Returns base with the `bits`-wide field at bit `offset` replaced by the low `bits` bits of insert.
+[__readNone] [ForceInline]
+public uint bitfieldInsert(uint base, uint insert, int offset, int bits)
+{
+    // `1u << 32` is out of range, so a full-width field gets an explicit all-ones mask.
+    uint fieldMask = (bits >= 32) ? 0xFFFFFFFFu : ((1u << bits) - 1u);
+    uint placedMask = fieldMask << offset;
+    return (base & ~placedMask) | ((insert & fieldMask) << offset);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> bitfieldInsert(vector<uint,N> base, vector<uint,N> insert, int offset, int bits)
+{
+    vector<uint,N> result;
+    for (int i = 0; i < N; ++i)
+    {
+        result[i] = bitfieldInsert(base[i], insert[i], offset, bits);
+    }
+    return result;
+}
+
+// Returns base with the `bits`-wide field at bit `offset` replaced by the low `bits` bits of insert.
+[__readNone] [ForceInline]
+public int bitfieldInsert(int base, int insert, int offset, int bits)
+{
+    // Operates on the raw bit patterns; `1u << 32` is out of range, so a full-width field gets an all-ones mask.
+    uint fieldMask = (bits >= 32) ? 0xFFFFFFFFu : ((1u << bits) - 1u);
+    uint placedMask = fieldMask << offset;
+    return int((uint(base) & ~placedMask) | ((uint(insert) & fieldMask) << offset));
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int,N> bitfieldInsert(vector<int,N> base, vector<int,N> insert, int offset, int bits)
+{
+    vector<int,N> result;
+    for (int i = 0; i < N; ++i)
+    {
+        result[i] = bitfieldInsert(base[i], insert[i], offset, bits);
+    }
+    return result;
+}
+
+// Reverses the order of the 32 bits of value.
+[__readNone] [ForceInline]
+public int bitfieldReverse(highp int value)
+{
+    uint v = uint(value); // swap on an unsigned copy so every `>>` is a logical shift, however the hex masks are typed
+    v = ((v & 0xAAAAAAAA) >> 1) | ((v & 0x55555555) << 1);
+    v = ((v & 0xCCCCCCCC) >> 2) | ((v & 0x33333333) << 2);
+    v = ((v & 0xF0F0F0F0) >> 4) | ((v & 0x0F0F0F0F) << 4);
+    v = ((v & 0xFF00FF00) >> 8) | ((v & 0x00FF00FF) << 8);
+    return int((v >> 16) | (v << 16));
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int,N> bitfieldReverse(highp vector<int,N> value)
+{
+    VECTOR_MAP_UNARY(int, N, bitfieldReverse, value);
+}
+
+[__readNone]
+[ForceInline]
+public uint bitfieldReverse(highp uint value)
+{
+    value = ((value & 0xAAAAAAAA) >> 1) | ((value & 0x55555555) << 1);
+    value = ((value & 0xCCCCCCCC) >> 2) | ((value & 0x33333333) << 2);
+    value = ((value & 0xF0F0F0F0) >> 4) | ((value & 0x0F0F0F0F) << 4);
+    value = ((value & 0xFF00FF00) >> 8) | ((value & 0x00FF00FF) << 8);
+    value = ((value & 0xFFFF0000) >> 16) | ((value & 0x0000FFFF) << 16);
+    return value;
+}
+
+// Applies the scalar uint bitfieldReverse to each component of value.
+__generic<let N:int>
+[__readNone] [ForceInline]
+public vector<uint,N> bitfieldReverse(highp vector<uint,N> value)
+{
+    VECTOR_MAP_UNARY(uint, N, bitfieldReverse, value);
+}
+
+[__readNone] [ForceInline] REQUIRE_KHRONOS
+public uint bitCount(uint value)
+{
+    return countbits(value);
+}
+
+__generic<let N:int>
+[__readNone] [ForceInline] REQUIRE_KHRONOS
+public vector<uint,N> bitCount(vector<uint,N> value)
+{
+    VECTOR_MAP_UNARY(uint, N, countbits, value);
+}
+
+[__readNone] [ForceInline] REQUIRE_KHRONOS
+public int bitCount(int value)
+{
+    return countbits(uint(value));
+}
+    
+__generic<let N:int>
+[__readNone] [ForceInline] REQUIRE_KHRONOS
+public vector<int,N> bitCount(vector<int,N> value)
+{
+    VECTOR_MAP_UNARY(int, N, countbits, value);
+}
+
+[__readNone]
+[ForceInline]
+public int findLSB(int v)
+{
+    return firstbitlow(v);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int,N> findLSB(vector<int,N> value)
 {
     return firstbitlow(value);
 }
-[ForceInline] public vector<uint,N> findLSB<let N:int>(vector<uint,N> value)
+
+[__readNone]
+[ForceInline]
+public uint findLSB(uint v)
+{
+    return firstbitlow(v);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> findLSB(vector<uint,N> value)
 {
     return firstbitlow(value);
 }
 
+[__readNone]
+[ForceInline]
+public int findMSB(int value)
+{
+    return firstbithigh(value);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<int,N> findMSB(vector<int,N> value)
+{
+    return firstbithigh(value);
+}
+
+[__readNone]
+[ForceInline]
+public uint findMSB(uint value)
+{
+    return firstbithigh(value);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<uint,N> findMSB(vector<uint,N> value)
+{
+    return firstbithigh(value);
+}
+
+__generic<let N:int>
+[__readNone]
+[ForceInline]
+public vector<bool,N> not(vector<bool,N> x)
+{
+    return !x;
+}
+
 //
 // Section 8.9.1. Texture Query Functions
 //
@@ -1986,3 +2787,59 @@ public vec4 shadow2DProjLod(sampler2DShadow sampler, vec4 coord, float lod)
     return textureProjLod(sampler, coord, lod);
 }
 
+//
+// Ray tracing
+//
+
+public typealias rayQueryEXT = RayQuery;
+
+__glsl_extension(GL_EXT_ray_query)
+__glsl_version(460)
+[ForceInline]
+public void rayQueryConfirmIntersectionEXT(inout rayQueryEXT q)
+{
+    q.CommitNonOpaqueTriangleHit();
+}
+
+__glsl_extension(GL_EXT_ray_query)
+__glsl_version(460)
+[ForceInline]
+public bool rayQueryProceedEXT(inout rayQueryEXT q)
+{
+    return q.Proceed();
+}
+
+__glsl_extension(GL_EXT_ray_query)
+__glsl_version(460)
+[__NoSideEffect]
+public uint rayQueryGetIntersectionTypeEXT(rayQueryEXT q, bool committed)
+{
+    // Returns the query's committed status when `committed` is true,
+    // otherwise its candidate type, converted to uint.
+    // NOTE(review): assumes both values convert to uint with the numbering GLSL expects - confirm.
+    if (committed)
+    {
+        return uint(q.CommittedStatus());
+    }
+    // Not committed: report the current candidate.
+    return uint(q.CandidateType());
+}
+
+
+//
+// Subgroup
+//
+
+__glsl_extension(KHR_shader_subgroup)
+__glsl_version(450)
+public void subgroupBarrier()
+{
+    //__subgroupBarrier();
+}
+
+__glsl_extension(KHR_shader_subgroup)
+__glsl_version(450)
+public void subgroupMemoryBarrier()
+{
+}
+
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 2bf0c1d80..8183c2030 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2,6 +2,9 @@
 
 typedef uint UINT;
 
+__intrinsic_op($(kIROp_FloatCast))
+T __floatCast<T, U>(U v);
+
 [sealed]
 interface IBufferDataLayout
 {
@@ -4407,6 +4410,16 @@ T distance(T x, T y)
 
 // Vector dot product
 
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+[__readNone]
+[ForceInline]
+T dot(T x, T y)
+{
+    return x * y;
+}
+
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
@@ -4561,16 +4574,34 @@ matrix<T, N, M> exp(matrix<T, N, M> x)
 
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
-__target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_exp2($0)")
 __target_intrinsic(cpp, "$P_exp2($0)")
-__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp2 _0")
 [__readNone]
-T exp2(T x);
+T exp2(T x)
+{
+    __target_switch
+    {
+    case glsl:
+        if (__isHalf<T>())
+            __intrinsic_asm "exp2($0)";
+        __intrinsic_asm "exp2(float($0))";
+    case spirv:
+        if (__isHalf<T>())
+        {
+            return spirv_asm { OpExtInst $$T result glsl450 Exp2 $x };
+        }
+        else
+        {
+            float xf = __floatCast<float>(x);
+            return T(spirv_asm {
+                 result:$$float = OpExtInst glsl450 Exp2 $xf
+            });
+        }
+    }
+}
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
-__target_intrinsic(glsl)
 __target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp2 _0")
 [__readNone]
 vector<T,N> exp2(vector<T,N> x)
@@ -4786,31 +4817,41 @@ matrix<T, N, M> floor(matrix<T, N, M> x)
     MATRIX_MAP_UNARY(T, N, M, floor, x);
 }
 
-// Fused multiply-add for doubles
-__target_intrinsic(hlsl)
+// Fused multiply-add
+__generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_fma($0, $1, $2)")
 __target_intrinsic(cpp, "$P_fma($0, $1, $2)")
 __target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
 [__readNone]
-double fma(double a, double b, double c);
+T fma(T a, T b, T c)
+{
+    __target_switch
+    {
+    case hlsl:
+        if (__isFloat<T>() || __isHalf<T>())
+            return mad(a, b, c);
+        else
+            __intrinsic_asm "fma($0, $1, $2)";
+    }
+}
 
-__generic<let N : int>
+__generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
 [__readNone]
-vector<double, N> fma(vector<double, N> a, vector<double, N> b, vector<double, N> c)
+vector<T, N> fma(vector<T, N> a, vector<T, N> b, vector<T, N> c)
 {
-    VECTOR_MAP_TRINARY(double, N, fma, a, b, c);
+    VECTOR_MAP_TRINARY(T, N, fma, a, b, c);
 }
 
-__generic<let N : int, let M : int>
+__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 [__readNone]
-matrix<double, N, M> fma(matrix<double, N, M> a, matrix<double, N, M> b, matrix<double, N, M> c)
+matrix<T, N, M> fma(matrix<T, N, M> a, matrix<T, N, M> b, matrix<T, N, M> c)
 {
-    MATRIX_MAP_TRINARY(double, N, M, fma, a, b, c);
+    MATRIX_MAP_TRINARY(T, N, M, fma, a, b, c);
 }
 
 // Floating point remainder of x/y
@@ -6414,6 +6455,16 @@ vector<T,N> normalize(vector<T,N> x)
     return x / length(x);
 }
 
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Normalize _0")
+[__readNone]
+T normalize(T x)
+{
+    return x / length(x);
+}
+
 // Raise to a power
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
@@ -6618,6 +6669,16 @@ matrix<T, N, M> rcp(matrix<T, N, M> x)
 }
 
 // Reflect incident vector across plane with given normal
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Reflect _0 _1")
+[__readNone]
+T reflect(T i, T n)
+{
+    return i - T(2) * dot(n,i) * n;
+}
+
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
@@ -6642,6 +6703,19 @@ vector<T,N> refract(vector<T,N> i, vector<T,N> n, T eta)
     return eta * i - (eta * dotNI + sqrt(k)) * n;
 }
 
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Refract _0 _1 _2")
+[__readNone]
+T refract(T i, T n, T eta)
+{
+    let dotNI = dot(n,i);
+    let k = T(1) - eta*eta*(T(1) - dotNI * dotNI);
+    if(k < T(0)) return T(0);
+    return eta * i - (eta * dotNI + sqrt(k)) * n;
+}
+
 // Reverse order of bits
 [__readNone]
 uint reversebits(uint value)
diff --git a/tests/glsl-intrinsic/intrinsic-basic.slang b/tests/glsl-intrinsic/intrinsic-basic.slang
new file mode 100644
index 000000000..14d03bf6c
--- /dev/null
+++ b/tests/glsl-intrinsic/intrinsic-basic.slang
@@ -0,0 +1,509 @@
+//TEST:SIMPLE(filecheck=CHECK_GLSL): -allow-glsl -stage fragment -entry computeMain -target glsl
+//TEST:SIMPLE(filecheck=CHECK_SPV):  -allow-glsl -stage fragment -entry computeMain -target spirv
+//TEST:SIMPLE(filecheck=CHECK_HLSL): -allow-glsl -stage fragment -entry computeMain -target hlsl
+//TEST:SIMPLE(filecheck=CHECK_CUDA): -allow-glsl -stage fragment -entry computeMain -target cuda -DTARGET_CUDA
+//TEST:SIMPLE(filecheck=CHECK_CPP):  -allow-glsl -stage fragment -entry computeMain -target cpp
+
+//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=BUF):-vk -compute -entry computeMain -allow-glsl -output-using-type
+//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=BUF):-vk -compute -entry computeMain -allow-glsl -output-using-type -emit-spirv-directly
+
+// "inverse()" function is not implemented yet.
+//#define TEST_when_inverse_works
+
+// "ftransform()" function is not implemented yet.
+//#define TEST_when_fransform_works
+
+// "exp2" for double type is causing an issue with SPIRV
+//#define TEST_when_exp2_double_type_works
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name=outputBuffer
+buffer MyBlockName2 // GLSL-style buffer block; its instance name matches name=outputBuffer in the TEST_INPUT line above.
+{
+    int4 result; // computeMain stores its pass/fail flag in result[0]
+} outputBuffer;
+
+
+// Returns true when every element of lhs compares equal (exact ==) to the same element of rhs.
+__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
+bool equals(matrix<T, N, M> lhs, matrix<T, N, M> rhs)
+{
+    for (int row = 0; row < N; ++row)
+    {
+        for (int col = 0; col < M; ++col)
+        {
+            // Matching elements keep the scan going; the first mismatch ends it.
+            if (lhs[row][col] == rhs[row][col]) continue;
+            return false;
+        }
+    }
+    return true;
+}
+
+bool dim1TypeFuncs() // Scalar pass over the GLSL built-ins: true only if every comparison below holds.
+{
+    typealias genFType = float;
+    typealias genDType = double;
+    typealias genIType = int;
+    typealias genUType = uint;
+    typealias genBType = bool;
+
+    genFType outGenFType;               // receives the modf(float) out value
+    genDType outGenDType;               // receives the modf(double) out value
+    genIType outGenIType, outGenIType2; // out values of imulExtended and frexp
+    genUType outGenUType, outGenUType2; // out values of umulExtended, uaddCarry and usubBorrow
+
+    constexpr float epsilon = 0.000001; // tolerance for the normalize() length checks
+
+    bool voidResults = true; // collects the umulExtended/imulExtended checks, which run as statements
+
+    // 8.8. Integer Functions
+    umulExtended(genUType(0), genUType(0), outGenUType, outGenUType2);
+    voidResults = voidResults && genUType(0) == outGenUType;
+    voidResults = voidResults && genUType(0) == outGenUType2;
+
+    imulExtended(genIType(0), genIType(0), outGenIType, outGenIType2);
+    voidResults = voidResults && genIType(0) == outGenIType;
+    voidResults = voidResults && genIType(0) == outGenIType2;
+
+    return voidResults
+        // 8.1. Angle and Trigonometry Functions
+        && genFType(0) == radians(genFType(0))
+        && genFType(0) == degrees(genFType(0))
+        && genFType(0) == sin(genFType(0))
+        && genFType(1) == cos(genFType(0))
+        && genFType(0) == tan(genFType(0))
+        && genFType(0) == asin(genFType(0))
+        && genFType(0) == acos(genFType(1))
+        && genFType(0) == atan(genFType(0), genFType(1)) // x is non-zero: atan(y, x) is undefined when both are 0
+        && genFType(0) == atan(genFType(0))
+        && genFType(0) == sinh(genFType(0))
+        && genFType(1) == cosh(genFType(0))
+        && genFType(0) == tanh(genFType(0))
+        && genFType(0) == asinh(genFType(0))
+        && genFType(0) == acosh(genFType(1))
+        && genFType(0) == atanh(genFType(0))
+
+        // 8.2. Exponential Functions
+        && genFType(1) == pow(genFType(1), genFType(0))
+        && genFType(1) == exp(genFType(0))
+        && genFType(0) == log(genFType(1))
+        && genFType(1) == exp2(genFType(0))
+#if defined(TEST_when_exp2_double_type_works)
+        && genDType(1) == exp2(genDType(0))
+#endif // #if defined(TEST_when_exp2_double_type_works)
+        && genFType(0) == log2(genFType(1))
+        && genFType(0) == sqrt(genFType(0))
+        && genDType(0) == sqrt(genDType(0))
+        && genFType(1) == inversesqrt(genFType(1))
+        && genDType(1) == inversesqrt(genDType(1))
+
+        // 8.3. Common Functions
+        && genFType(0) == abs(genFType(0))
+        && genIType(0) == abs(genIType(0))
+        && genDType(0) == abs(genDType(0))
+        && genFType(0) == sign(genFType(0))
+#if !defined(TARGET_CUDA)
+        && genIType(0) == sign(genIType(0))
+#endif // #if !defined(TARGET_CUDA)
+        && genDType(0) == sign(genDType(0))
+        && genFType(0) == floor(genFType(0))
+        && genDType(0) == floor(genDType(0))
+        && genFType(0) == trunc(genFType(0))
+        && genDType(0) == trunc(genDType(0))
+        && genFType(0) == round(genFType(0))
+        && genDType(0) == round(genDType(0))
+        && genFType(0) == roundEven(genFType(0))
+        && genDType(0) == roundEven(genDType(0))
+        && genFType(0) == ceil(genFType(0))
+        && genDType(0) == ceil(genDType(0))
+        && genFType(0) == fract(genFType(0))
+        && genDType(0) == fract(genDType(0))
+        && genFType(0) == mod(genFType(0), float(1))
+        && genFType(0) == mod(genFType(0), genFType(1))
+        && genDType(0) == mod(genDType(0), double(1))
+        && genDType(0) == mod(genDType(0), genDType(1)) // non-zero divisor, matching the float check; mod(x, 0) divides by zero
+        && genFType(0) == modf(genFType(0), outGenFType) && 0 == outGenFType
+        && genDType(0) == modf(genDType(0), outGenDType) && 0 == outGenDType
+        && genFType(0) == min(genFType(0), genFType(0))
+        && genFType(0) == min(genFType(0), float(0))
+        && genDType(0) == min(genDType(0), genDType(0))
+        && genDType(0) == min(genDType(0), double(0))
+        && genIType(0) == min(genIType(0), genIType(0))
+        && genIType(0) == min(genIType(0), int(0))
+        && genUType(0) == min(genUType(0), genUType(0))
+        && genUType(0) == min(genUType(0), uint(0))
+        && genFType(0) == max(genFType(0), genFType(0))
+        && genFType(0) == max(genFType(0), float(0))
+        && genDType(0) == max(genDType(0), genDType(0))
+        && genDType(0) == max(genDType(0), double(0))
+        && genIType(0) == max(genIType(0), genIType(0))
+        && genIType(0) == max(genIType(0), int(0))
+        && genUType(0) == max(genUType(0), genUType(0))
+        && genUType(0) == max(genUType(0), uint(0))
+        && genFType(0) == clamp(genFType(0), genFType(0), genFType(0))
+        && genFType(0) == clamp(genFType(0), float(0), float(0))
+        && genDType(0) == clamp(genDType(0), genDType(0), genDType(0))
+        && genDType(0) == clamp(genDType(0), double(0), double(0))
+        && genIType(0) == clamp(genIType(0), genIType(0), genIType(0))
+        && genIType(0) == clamp(genIType(0), int(0), int(0))
+        && genUType(0) == clamp(genUType(0), genUType(0), genUType(0))
+        && genUType(0) == clamp(genUType(0), uint(0), uint(0))
+        && genFType(0) == mix(genFType(0), genFType(0), genFType(0))
+        && genFType(0) == mix(genFType(0), genFType(0), float(0))
+        && genDType(0) == mix(genDType(0), genDType(0), genDType(0))
+        && genDType(0) == mix(genDType(0), genDType(0), double(0))
+        && genFType(0) == mix(genFType(0), genFType(0), genBType(0))
+        && genDType(0) == mix(genDType(0), genDType(0), genBType(0))
+        && genIType(0) == mix(genIType(0), genIType(0), genBType(0))
+        && genUType(0) == mix(genUType(0), genUType(0), genBType(0))
+        && genBType(0) == mix(genBType(0), genBType(0), genBType(0))
+        && genFType(0) == step(genFType(1), genFType(0))
+        && genFType(0) == step(float(1), genFType(0))
+        && genDType(0) == step(genDType(1), genDType(0))
+        && genDType(0) == step(double(1), genDType(0))
+        && genFType(0) == smoothstep(genFType(0), genFType(1), genFType(0))
+        && genFType(0) == smoothstep(float(0), float(1), genFType(0))
+        && genDType(0) == smoothstep(genDType(0), genDType(1), genDType(0))
+        && genDType(0) == smoothstep(double(0), double(1), genDType(0))
+        && genBType(0) == isnan(genFType(0))
+        && genBType(0) == isnan(genDType(0))
+        && genBType(0) == isinf(genFType(0))
+        && genBType(0) == isinf(genDType(0))
+        && genIType(0) == floatBitsToInt(genFType(0))
+        && genUType(0) == floatBitsToUint(genFType(0))
+        && genFType(0) == intBitsToFloat(genIType(0))
+        && genFType(0) == uintBitsToFloat(genUType(0))
+        && genFType(0) == fma(genFType(0), genFType(0), genFType(0))
+        && genDType(0) == fma(genDType(0), genDType(0), genDType(0))
+        && genFType(0) == frexp(genFType(0), outGenIType) && genIType(0) == outGenIType
+        && genDType(0) == frexp(genDType(0), outGenIType) && genIType(0) == outGenIType
+        && genFType(0) == ldexp(genFType(0), genIType(0))
+#if defined(TEST_when_exp2_double_type_works)
+        && genDType(0) == ldexp(genDType(0), genIType(0))
+#endif // #if defined(TEST_when_exp2_double_type_works)
+
+        // 8.5. Geometric Functions
+        && float(0) == length(genFType(0))
+        && double(0) == length(genDType(0))
+        && float(0) == distance(genFType(0), genFType(0))
+        && double(0) == distance(genDType(0), genDType(0))
+        && float(0) == dot(genFType(0), genFType(0))
+        && double(0) == dot(genDType(0), genDType(0))
+        && (abs(float(1) - length(normalize(genFType(1)))) < epsilon)
+        && (abs(double(1) - length(normalize(genDType(1)))) < double(epsilon))
+        && genFType(1) == faceforward(genFType(1), genFType(1), genFType(-1))
+        && genDType(1) == faceforward(genDType(1), genDType(1), genDType(-1))
+        && genFType(0) == reflect(genFType(0), genFType(0))
+        && genDType(0) == reflect(genDType(0), genDType(0))
+        && genFType(0) == refract(genFType(0), genFType(0), float(0))
+        && genDType(0) == refract(genDType(0), genDType(0), double(0))
+
+        // 8.6. Matrix Functions
+        && equals(mat2x2(0), matrixCompMult(mat2x2(0), mat2x2(0)))
+        && equals(mat2x3(0), matrixCompMult(mat2x3(0), mat2x3(0)))
+        && equals(mat2x4(0), matrixCompMult(mat2x4(0), mat2x4(0)))
+        && equals(mat3x2(0), matrixCompMult(mat3x2(0), mat3x2(0)))
+        && equals(mat3x3(0), matrixCompMult(mat3x3(0), mat3x3(0)))
+        && equals(mat3x4(0), matrixCompMult(mat3x4(0), mat3x4(0)))
+        && equals(mat4x2(0), matrixCompMult(mat4x2(0), mat4x2(0)))
+        && equals(mat4x3(0), matrixCompMult(mat4x3(0), mat4x3(0)))
+        && equals(mat4x4(0), matrixCompMult(mat4x4(0), mat4x4(0)))
+        && equals(mat2(0), outerProduct(vec2(0), vec2(0)))
+        && equals(mat3(0), outerProduct(vec3(0), vec3(0)))
+        && equals(mat4(0), outerProduct(vec4(0), vec4(0)))
+        && equals(mat2x3(0), outerProduct(vec3(0), vec2(0)))
+        && equals(mat3x2(0), outerProduct(vec2(0), vec3(0)))
+        && equals(mat2x4(0), outerProduct(vec4(0), vec2(0)))
+        && equals(mat4x2(0), outerProduct(vec2(0), vec4(0)))
+        && equals(mat3x4(0), outerProduct(vec4(0), vec3(0)))
+        && equals(mat4x3(0), outerProduct(vec3(0), vec4(0)))
+        && equals(mat2(0), transpose(mat2(0)))
+        && equals(mat3(0), transpose(mat3(0)))
+        && equals(mat4(0), transpose(mat4(0)))
+        && equals(mat2x3(0), transpose(mat3x2(0)))
+        && equals(mat3x2(0), transpose(mat2x3(0)))
+        && equals(mat2x4(0), transpose(mat4x2(0)))
+        && equals(mat4x2(0), transpose(mat2x4(0)))
+        && equals(mat3x4(0), transpose(mat4x3(0)))
+        && equals(mat4x3(0), transpose(mat3x4(0)))
+        && float(0) == determinant(mat2(0))
+        && float(0) == determinant(mat3(0))
+        && float(0) == determinant(mat4(0))
+#if defined(TEST_when_inverse_works)
+        && equals(mat2(0), inverse(mat2(0)))
+        && equals(mat3(0), inverse(mat3(0)))
+        && equals(mat4(0), inverse(mat4(0)))
+#endif // #if defined(TEST_when_inverse_works)
+
+        // 8.8. Integer Functions
+        && genUType(0) == uaddCarry(genUType(0), genUType(0), outGenUType) && genUType(0) == outGenUType
+        && genUType(0) == usubBorrow(genUType(0), genUType(0), outGenUType) && genUType(0) == outGenUType
+        && genIType(0) == bitfieldExtract(genIType(0), int(0), int(0))
+        && genUType(0) == bitfieldExtract(genUType(0), int(0), int(0))
+        && genIType(0) == bitfieldInsert(genIType(0), genIType(0), int(0), int(0))
+        && genUType(0) == bitfieldInsert(genUType(0), genUType(0), int(0), int(0))
+        && genIType(0) == bitfieldReverse(genIType(0))
+        && genUType(0) == bitfieldReverse(genUType(0))
+        && genIType(0) == bitCount(genIType(0))
+        && genIType(0) == bitCount(genUType(0))
+        && genIType(-1) == findLSB(genIType(0))
+        && genIType(-1) == findLSB(genUType(0))
+        && genIType(-1) == findMSB(genIType(0))
+        && genIType(-1) == findMSB(genUType(0))
+        ;
+}
+
+__generic<let N : int> // N-component vector pass over the GLSL built-ins; computeMain runs it with N = 2, 3 and 4.
+bool dimNTypeFuncs()
+{
+    typealias genFType = vector<float, N>;
+    typealias genDType = vector<double, N>;
+    typealias genIType = vector<int, N>;
+    typealias genUType = vector<uint, N>;
+    typealias genBType = vector<bool, N>;
+    typealias vec      = vector<float, N>;
+    typealias ivec     = vector<int, N>;
+    typealias uvec     = vector<uint, N>;
+    typealias bvec     = vector<bool, N>;
+
+    genFType outGenFType;               // receives the modf(float vector) out value
+    genDType outGenDType;               // receives the modf(double vector) out value
+    genIType outGenIType, outGenIType2; // out values of imulExtended and frexp
+    genUType outGenUType, outGenUType2; // out values of umulExtended, uaddCarry and usubBorrow
+
+    constexpr float epsilon = 0.000001; // tolerance for the normalize() length checks
+
+    bool voidResults = true; // collects the umulExtended/imulExtended checks, which run as statements
+
+    // 8.8. Integer Functions
+    umulExtended(genUType(0), genUType(0), outGenUType, outGenUType2);
+    voidResults = voidResults && genUType(0) == outGenUType;
+    voidResults = voidResults && genUType(0) == outGenUType2;
+
+    imulExtended(genIType(0), genIType(0), outGenIType, outGenIType2);
+    voidResults = voidResults && genIType(0) == outGenIType;
+    voidResults = voidResults && genIType(0) == outGenIType2;
+
+    return voidResults
+        // 8.1. Angle and Trigonometry Functions
+        && genFType(0) == radians(genFType(0))
+        && genFType(0) == degrees(genFType(0))
+        && genFType(0) == sin(genFType(0))
+        && genFType(1) == cos(genFType(0))
+        && genFType(0) == tan(genFType(0))
+        && genFType(0) == asin(genFType(0))
+        && genFType(0) == acos(genFType(1))
+        && genFType(0) == atan(genFType(0), genFType(1)) // x is non-zero: atan(y, x) is undefined when both are 0
+        && genFType(0) == atan(genFType(0))
+        && genFType(0) == sinh(genFType(0))
+        && genFType(1) == cosh(genFType(0))
+        && genFType(0) == tanh(genFType(0))
+#if !defined(TARGET_CUDA)
+        && genFType(0) == asinh(genFType(0))
+        && genFType(0) == acosh(genFType(1))
+        && genFType(0) == atanh(genFType(0))
+#endif // #if !defined(TARGET_CUDA)
+
+        // 8.2. Exponential Functions
+        && genFType(1) == pow(genFType(1), genFType(0))
+        && genFType(1) == exp(genFType(0))
+        && genFType(0) == log(genFType(1))
+        && genFType(1) == exp2(genFType(0))
+#if defined(TEST_when_exp2_double_type_works)
+        && genDType(1) == exp2(genDType(0))
+#endif // #if defined(TEST_when_exp2_double_type_works)
+        && genFType(0) == log2(genFType(1))
+        && genFType(0) == sqrt(genFType(0))
+        && genDType(0) == sqrt(genDType(0))
+        && genFType(1) == inversesqrt(genFType(1))
+        && genDType(1) == inversesqrt(genDType(1))
+
+        // 8.3. Common Functions
+        && genFType(0) == abs(genFType(0))
+        && genIType(0) == abs(genIType(0))
+        && genDType(0) == abs(genDType(0))
+        && genFType(0) == sign(genFType(0))
+#if !defined(TARGET_CUDA)
+        && genIType(0) == sign(genIType(0))
+#endif // #if !defined(TARGET_CUDA)
+        && genDType(0) == sign(genDType(0))
+        && genFType(0) == floor(genFType(0))
+        && genDType(0) == floor(genDType(0))
+        && genFType(0) == trunc(genFType(0))
+        && genDType(0) == trunc(genDType(0))
+        && genFType(0) == round(genFType(0))
+        && genDType(0) == round(genDType(0))
+        && genFType(0) == roundEven(genFType(0))
+        && genDType(0) == roundEven(genDType(0))
+        && genFType(0) == ceil(genFType(0))
+        && genDType(0) == ceil(genDType(0))
+        && genFType(0) == fract(genFType(0))
+        && genDType(0) == fract(genDType(0))
+        && genFType(0) == mod(genFType(0), float(1))
+        && genFType(0) == mod(genFType(0), genFType(1))
+        && genDType(0) == mod(genDType(0), double(1))
+        && genDType(0) == mod(genDType(0), genDType(1)) // non-zero divisor, matching the float check; mod(x, 0) divides by zero
+        && genFType(0) == modf(genFType(0), outGenFType) && genFType(0) == outGenFType
+        && genDType(0) == modf(genDType(0), outGenDType) && genDType(0) == outGenDType
+        && genFType(0) == min(genFType(0), genFType(0))
+        && genFType(0) == min(genFType(0), float(0))
+        && genDType(0) == min(genDType(0), genDType(0))
+        && genDType(0) == min(genDType(0), double(0))
+        && genIType(0) == min(genIType(0), genIType(0))
+        && genIType(0) == min(genIType(0), int(0))
+        && genUType(0) == min(genUType(0), genUType(0))
+        && genUType(0) == min(genUType(0), uint(0))
+        && genFType(0) == max(genFType(0), genFType(0))
+        && genFType(0) == max(genFType(0), float(0))
+        && genDType(0) == max(genDType(0), genDType(0))
+        && genDType(0) == max(genDType(0), double(0))
+        && genIType(0) == max(genIType(0), genIType(0))
+        && genIType(0) == max(genIType(0), int(0))
+        && genUType(0) == max(genUType(0), genUType(0))
+        && genUType(0) == max(genUType(0), uint(0))
+        && genFType(0) == clamp(genFType(0), genFType(0), genFType(0))
+        && genFType(0) == clamp(genFType(0), float(0), float(0))
+        && genDType(0) == clamp(genDType(0), genDType(0), genDType(0))
+        && genDType(0) == clamp(genDType(0), double(0), double(0))
+        && genIType(0) == clamp(genIType(0), genIType(0), genIType(0))
+        && genIType(0) == clamp(genIType(0), int(0), int(0))
+        && genUType(0) == clamp(genUType(0), genUType(0), genUType(0))
+        && genUType(0) == clamp(genUType(0), uint(0), uint(0))
+        && genFType(0) == mix(genFType(0), genFType(0), genFType(0))
+        && genFType(0) == mix(genFType(0), genFType(0), float(0))
+        && genDType(0) == mix(genDType(0), genDType(0), genDType(0))
+        && genDType(0) == mix(genDType(0), genDType(0), double(0))
+#if !defined(TARGET_CUDA)
+        && genFType(0) == mix(genFType(0), genFType(0), genBType(0))
+        && genDType(0) == mix(genDType(0), genDType(0), genBType(0))
+        && genIType(0) == mix(genIType(0), genIType(0), genBType(0))
+        && genUType(0) == mix(genUType(0), genUType(0), genBType(0))
+        && genBType(0) == mix(genBType(0), genBType(0), genBType(0))
+#endif // #if !defined(TARGET_CUDA)
+        && genFType(0) == step(genFType(1), genFType(0))
+        && genFType(0) == step(float(1), genFType(0))
+        && genDType(0) == step(genDType(1), genDType(0))
+        && genDType(0) == step(double(1), genDType(0))
+        && genFType(0) == smoothstep(genFType(0), genFType(1), genFType(0))
+        && genFType(0) == smoothstep(float(0), float(1), genFType(0))
+        && genDType(0) == smoothstep(genDType(0), genDType(1), genDType(0))
+        && genDType(0) == smoothstep(double(0), double(1), genDType(0))
+#if !defined(TARGET_CUDA)
+        && genBType(0) == isnan(genFType(0))
+        && genBType(0) == isnan(genDType(0))
+        && genBType(0) == isinf(genFType(0))
+        && genBType(0) == isinf(genDType(0))
+#endif // #if !defined(TARGET_CUDA)
+        && genIType(0) == floatBitsToInt(genFType(0))
+        && genUType(0) == floatBitsToUint(genFType(0))
+        && genFType(0) == intBitsToFloat(genIType(0))
+        && genFType(0) == uintBitsToFloat(genUType(0))
+        && genFType(0) == fma(genFType(0), genFType(0), genFType(0))
+        && genDType(0) == fma(genDType(0), genDType(0), genDType(0))
+        && genFType(0) == frexp(genFType(0), outGenIType) && genIType(0) == outGenIType
+        && genDType(0) == frexp(genDType(0), outGenIType) && genIType(0) == outGenIType
+        && genFType(0) == ldexp(genFType(0), genIType(0))
+#if defined(TEST_when_exp2_double_type_works)
+        && genDType(0) == ldexp(genDType(0), genIType(0))
+#endif // #if defined(TEST_when_exp2_double_type_works)
+
+        // 8.4. Floating-Point Pack and Unpack Functions
+        && uint(0) == packUnorm2x16(vec2(0))
+        && uint(0) == packSnorm2x16(vec2(-1)) // NOTE(review): the GLSL formula round(c * 32767.0) looks non-zero for -1; confirm expected 0
+        && uint(0) == packUnorm4x8(vec4(0))
+        && uint(0) == packSnorm4x8(vec4(-1)) // NOTE(review): the GLSL formula round(c * 127.0) looks non-zero for -1; confirm expected 0
+        && vec2(0) == unpackUnorm2x16(uint(0))
+        && vec2(-1) == unpackSnorm2x16(uint(0)) // NOTE(review): the GLSL formula f / 32767.0 looks like 0 for input 0; confirm expected -1
+        && vec4(0) == unpackUnorm4x8(uint(0))
+        && vec4(-1) == unpackSnorm4x8(uint(0)) // NOTE(review): the GLSL formula f / 127.0 looks like 0 for input 0; confirm expected -1
+        && uint(0) == packHalf2x16(vec2(0))
+        && vec2(0) == unpackHalf2x16(uint(0))
+        && double(0) == packDouble2x32(uvec2(0))
+        && uvec2(0) == unpackDouble2x32(double(0))
+
+        // 8.5. Geometric Functions
+        && float(0) == length(genFType(0))
+        && double(0) == length(genDType(0))
+        && float(0) == distance(genFType(0), genFType(0))
+        && double(0) == distance(genDType(0), genDType(0))
+        && float(0) == dot(genFType(0), genFType(0))
+        && double(0) == dot(genDType(0), genDType(0))
+        && vec3(0) == cross(vec3(0), vec3(0))
+        && dvec3(0) == cross(dvec3(0), dvec3(0))
+        && (abs(float(1) - length(normalize(genFType(1)))) < epsilon)
+        && (abs(double(1) - length(normalize(genDType(1)))) < double(epsilon))
+#if defined(TEST_when_fransform_works)
+        && vec4(0) == ftransform()
+#endif // #if defined(TEST_when_fransform_works)
+        && genFType(1) == faceforward(genFType(1), genFType(1), genFType(-1))
+        && genDType(1) == faceforward(genDType(1), genDType(1), genDType(-1))
+        && genFType(0) == reflect(genFType(0), genFType(0))
+        && genDType(0) == reflect(genDType(0), genDType(0))
+        && genFType(0) == refract(genFType(0), genFType(0), float(0))
+        && genDType(0) == refract(genDType(0), genDType(0), double(0))
+
+        // 8.7. Vector Relational Functions
+#if !defined(TARGET_CUDA)
+        && bvec(1) == lessThan(vec(0), vec(1))
+        && bvec(1) == lessThan(ivec(0), ivec(1))
+        && bvec(1) == lessThan(uvec(0), uvec(1))
+        && bvec(1) == lessThanEqual(vec(0), vec(1))
+        && bvec(1) == lessThanEqual(ivec(0), ivec(1))
+        && bvec(1) == lessThanEqual(uvec(0), uvec(1))
+        && bvec(0) == greaterThan(vec(0), vec(1))
+        && bvec(0) == greaterThan(ivec(0), ivec(1))
+        && bvec(0) == greaterThan(uvec(0), uvec(1))
+        && bvec(0) == greaterThanEqual(vec(0), vec(1))
+        && bvec(0) == greaterThanEqual(ivec(0), ivec(1))
+        && bvec(0) == greaterThanEqual(uvec(0), uvec(1))
+        && bvec(1) == equal(vec(0), vec(0))
+        && bvec(1) == equal(ivec(0), ivec(0))
+        && bvec(1) == equal(uvec(0), uvec(0))
+        && bvec(1) == equal(bvec(0), bvec(0))
+        && bvec(0) == notEqual(vec(0), vec(0))
+        && bvec(0) == notEqual(ivec(0), ivec(0))
+        && bvec(0) == notEqual(uvec(0), uvec(0))
+        && bvec(0) == notEqual(bvec(0), bvec(0))
+        && bool(0) == any(bvec(0))
+        && bool(0) == all(bvec(0))
+        && bvec(1) == not(bvec(0))
+#endif // #if !defined(TARGET_CUDA)
+
+        // 8.8. Integer Functions
+        && genUType(0) == uaddCarry(genUType(0), genUType(0), outGenUType) && genUType(0) == outGenUType
+        && genUType(0) == usubBorrow(genUType(0), genUType(0), outGenUType) && genUType(0) == outGenUType
+        && genIType(0) == bitfieldExtract(genIType(0), int(0), int(0))
+        && genUType(0) == bitfieldExtract(genUType(0), int(0), int(0))
+        && genIType(0) == bitfieldInsert(genIType(0), genIType(0), int(0), int(0))
+        && genUType(0) == bitfieldInsert(genUType(0), genUType(0), int(0), int(0))
+        && genIType(0) == bitfieldReverse(genIType(0))
+        && genUType(0) == bitfieldReverse(genUType(0))
+        && genIType(0) == bitCount(genIType(0))
+        && genIType(0) == bitCount(genUType(0))
+#if !defined(TARGET_CUDA)
+        && genIType(-1) == findLSB(genIType(0))
+        && genIType(-1) == findLSB(genUType(0))
+        && genIType(-1) == findMSB(genIType(0))
+        && genIType(-1) == findMSB(genUType(0))
+#endif // #if !defined(TARGET_CUDA)
+        ;
+}
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    // CHECK_GLSL: void main(
+    // CHECK_SPV: OpEntryPoint
+    // CHECK_HLSL: void computeMain(
+    // CHECK_CUDA: void computeMain(
+    // CHECK_CPP: void _computeMain(
+    // BUF: 1
+
+    // Run the scalar suite first, then the 2-, 3- and 4-component suites; all must pass.
+    bool allPassed = dim1TypeFuncs();
+    allPassed = allPassed && dimNTypeFuncs<2>();
+    allPassed = allPassed && dimNTypeFuncs<3>();
+    allPassed = allPassed && dimNTypeFuncs<4>();
+
+    outputBuffer.result[0] = int(allPassed);
+}