Clean-ups related to expanded standard library coverage (#1269)

This change continues the work already started in moving the definitions of many built-in functions to the standard library. The main focus in this change was reducing the number of operations that had to be special-cased on the CPU and CUDA targets by making sure that the scalar cases of built-in functions map to the proper names in the prelude (e.g., `F32_sin()`) via the ordinary `__target_intrinsic` mechanism. In some cases this cleanup meant that special-case logic that was constructing definitions for those functions using C++ code could be scrapped. Additional changes made along the way: * A few scalar functions that were missing in the CPU/CUDA preludes got added: `round`, hyperbolic trigonometric functions, `frexp`, `modf`, and `fma` * The floating-point `min()` and `max()` definitions in the preludes were changed to use intrinsic operations on the target (which are likely to follow IEEE semantics, while our definitions did not) * For the CUDA target, many of the functions had their names translated during code emit from, e.g., `sin` to `sinf`. This change makes the CUDA target more closely match the C++/CPU target in using names like `F32_sin` consistently. * For the CUDA target, a few additional functions have intrinsics that don't exist (portably) on CPU: `sincos()` and `rsqrt()`. * For the Slang stdlib definitions to work, a new `$P` replacement was defined for `__target_intrinsic` that expands to a type based on the first operand of the function (e.g., `F32` for `float`). * I removed the dedicated opcodes for matrix-matrix, matrix-vector, and vector-matrix multiplication, and instead turned them into ordinary functions with definitions and `__target_intrinsic` modifiers to map them appropriately for HLSL and GLSL. This is realistically how we would have implemented these if we'd had `__target_intrinsic` from the start. 
Notes about possible follow-on work: * The `ldexp` function is still left in the Slang stdlib because it has to account for a floating-point exponent and the `math.h` version only handles integers for the exponent. It is possible that we can/should define another overload for `ldexp` (and `frexp`) that uses an integer for exponent, and then have that one be a built-in on CPU/CUDA, with the HLSL `frexp` being defined in the stdlib to delegate to the correct `frexp` for those targets. * The `firstbithigh` and related functions are missing for our CPU and CUDA targets, and will need to be added. It is worth noting that `firstbithigh` apparently has some very odd functionality around signed integer arguments (which are supported, despite MSDN being unclear on that point). General cleanup will be required for those functions. * Making the various matrix and vector products no longer be intrinsic ops might affect how we emit code for them as sub-expressions (both whether we fold them into use sites and how we parenthesize them). This doesn't seem to affect any of our existing tests, but we could consider marking these functions with `[__readNone]` to ensure they can be folded, and then also adding whatever modifier(s) we might invent to control precedence and parentheses insertion during emit.
author: Tim Foley <tfoleyNV@users.noreply.github.com> 2020-03-11 08:50:38 -0700
committer: GitHub <noreply@github.com> 2020-03-11 08:50:38 -0700
commit: 935768c6a00c258bf5122a2d04b84064a1eee67d (patch)
tree: 68dac944da274a21acb8c8bf651401c26e289f4c
parent: b380b1af6ba6f5f58e3841c2a5b14db7ee8c372d (diff)
12 files changed, 351 insertions, 482 deletions
diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h
index 95acd9335..c814365c6 100644
--- a/prelude/slang-cpp-scalar-intrinsics.h
+++ b/prelude/slang-cpp-scalar-intrinsics.h
@@ -46,12 +46,16 @@ SLANG_FORCE_INLINE float F32_calcSafeRadians(float radians)
 // Unary 
 SLANG_FORCE_INLINE float F32_ceil(float f) { return ::ceilf(f); }
 SLANG_FORCE_INLINE float F32_floor(float f) { return ::floorf(f); }
+SLANG_FORCE_INLINE float F32_round(float f) { return ::roundf(f); }
 SLANG_FORCE_INLINE float F32_sin(float f) { return ::sinf(f); }
 SLANG_FORCE_INLINE float F32_cos(float f) { return ::cosf(f); }
 SLANG_FORCE_INLINE float F32_tan(float f) { return ::tanf(f); }
 SLANG_FORCE_INLINE float F32_asin(float f) { return ::asinf(f); }
 SLANG_FORCE_INLINE float F32_acos(float f) { return ::acosf(f); }
 SLANG_FORCE_INLINE float F32_atan(float f) { return ::atanf(f); }
+SLANG_FORCE_INLINE float F32_sinh(float f) { return ::sinhf(f); }
+SLANG_FORCE_INLINE float F32_cosh(float f) { return ::coshf(f); }
+SLANG_FORCE_INLINE float F32_tanh(float f) { return ::tanhf(f); }
 SLANG_FORCE_INLINE float F32_log2(float f) { return ::log2f(f); }
 SLANG_FORCE_INLINE float F32_log(float f) { return ::logf(f); }
 SLANG_FORCE_INLINE float F32_log10(float f) { return ::log10f(f); }
@@ -61,42 +65,39 @@ SLANG_FORCE_INLINE float F32_abs(float f) { return ::fabsf(f); }
 SLANG_FORCE_INLINE float F32_trunc(float f) { return ::truncf(f); }
 SLANG_FORCE_INLINE float F32_sqrt(float f) { return ::sqrtf(f); }
 SLANG_FORCE_INLINE float F32_rsqrt(float f) { return 1.0f / F32_sqrt(f); }
-SLANG_FORCE_INLINE float F32_rcp(float f) { return 1.0f / f; }
 SLANG_FORCE_INLINE float F32_sign(float f) { return ( f == 0.0f) ? f : (( f < 0.0f) ? -1.0f : 1.0f); } 
-SLANG_FORCE_INLINE float F32_saturate(float f) { return (f < 0.0f) ? 0.0f : (f > 1.0f) ? 1.0f : f; }
 SLANG_FORCE_INLINE float F32_frac(float f) { return f - F32_floor(f); }
-SLANG_FORCE_INLINE float F32_radians(float f) { return f * 0.01745329222f; }
 
 SLANG_FORCE_INLINE bool F32_isnan(float f) { return isnan(f); }
 SLANG_FORCE_INLINE bool F32_isfinite(float f) { return isfinite(f); }
 SLANG_FORCE_INLINE bool F32_isinf(float f) { return isinf(f); }
 
 // Binary
-SLANG_FORCE_INLINE float F32_min(float a, float b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE float F32_max(float a, float b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE float F32_min(float a, float b) { return ::fminf(a, b); }
+SLANG_FORCE_INLINE float F32_max(float a, float b) { return ::fmaxf(a, b); }
 SLANG_FORCE_INLINE float F32_pow(float a, float b) { return ::powf(a, b); }
 SLANG_FORCE_INLINE float F32_fmod(float a, float b) { return ::fmodf(a, b); }
 SLANG_FORCE_INLINE float F32_remainder(float a, float b) { return ::remainderf(a, b); }
-SLANG_FORCE_INLINE float F32_step(float a, float b) { return float(b >= a); }
 SLANG_FORCE_INLINE float F32_atan2(float a, float b) { return float(::atan2(a, b)); }
 
-// TODO(JS): 
-// Note C++ has ldexp, but it takes an integer for the exponent, it seems HLSL takes both as float
-SLANG_FORCE_INLINE float F32_ldexp(float m, float e) { return m * ::powf(2.0f, e); }
-
-// Ternary 
-SLANG_FORCE_INLINE float F32_smoothstep(float min, float max, float x) 
-{ 
-    const float t = x < min ? 0.0f : ((x > max) ? 1.0f : (x - min) / (max - min)); 
-    return t * t * (3.0 - 2.0 * t);
+SLANG_FORCE_INLINE float F32_frexp(float x, float& e)
+{
+    int ei;
+    float m = ::frexpf(x, &ei);
+    e = ei;
+    return m;
+}
+SLANG_FORCE_INLINE float F32_modf(float x, float& ip)
+{
+    return ::modff(x, &ip);
 }
-SLANG_FORCE_INLINE float F32_lerp(float x, float y, float s) { return x + s * (y - x); }
-SLANG_FORCE_INLINE float F32_clamp(float x, float min, float max) { return ( x < min) ? min : ((x > max) ? max : x); }
-SLANG_FORCE_INLINE void F32_sincos(float f, float& outSin, float& outCos) { outSin = F32_sin(f); outCos = F32_cos(f); }
 
 SLANG_FORCE_INLINE uint32_t F32_asuint(float f) { Union32 u; u.f = f; return u.u; }
 SLANG_FORCE_INLINE int32_t F32_asint(float f) { Union32 u; u.f = f; return u.i; }
 
+// Ternary
+SLANG_FORCE_INLINE float F32_fma(float a, float b, float c) { return ::fmaf(a, b, c); }
+
 // ----------------------------- F64 -----------------------------------------
 
 SLANG_FORCE_INLINE double F64_calcSafeRadians(double radians)
@@ -112,12 +113,16 @@ SLANG_FORCE_INLINE double F64_calcSafeRadians(double radians)
 // Unary 
 SLANG_FORCE_INLINE double F64_ceil(double f) { return ::ceil(f); }
 SLANG_FORCE_INLINE double F64_floor(double f) { return ::floor(f); }
+SLANG_FORCE_INLINE double F64_round(double f) { return ::round(f); }
 SLANG_FORCE_INLINE double F64_sin(double f) { return ::sin(f); }
 SLANG_FORCE_INLINE double F64_cos(double f) { return ::cos(f); }
 SLANG_FORCE_INLINE double F64_tan(double f) { return ::tan(f); }
 SLANG_FORCE_INLINE double F64_asin(double f) { return ::asin(f); }
 SLANG_FORCE_INLINE double F64_acos(double f) { return ::acos(f); }
 SLANG_FORCE_INLINE double F64_atan(double f) { return ::atan(f); }
+SLANG_FORCE_INLINE double F64_sinh(double f) { return ::sinh(f); }
+SLANG_FORCE_INLINE double F64_cosh(double f) { return ::cosh(f); }
+SLANG_FORCE_INLINE double F64_tanh(double f) { return ::tanh(f); }
 SLANG_FORCE_INLINE double F64_log2(double f) { return ::log2(f); }
 SLANG_FORCE_INLINE double F64_log(double f) { return ::log(f); }
 SLANG_FORCE_INLINE double F64_log10(float f) { return ::log10(f); }
@@ -127,38 +132,32 @@ SLANG_FORCE_INLINE double F64_abs(double f) { return ::fabs(f); }
 SLANG_FORCE_INLINE double F64_trunc(double f) { return ::trunc(f); }
 SLANG_FORCE_INLINE double F64_sqrt(double f) { return ::sqrt(f); }
 SLANG_FORCE_INLINE double F64_rsqrt(double f) { return 1.0 / F64_sqrt(f); }
-SLANG_FORCE_INLINE double F64_rcp(double f) { return 1.0 / f; }
 SLANG_FORCE_INLINE double F64_sign(double f) { return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); }
-SLANG_FORCE_INLINE double F64_saturate(double f) { return (f < 0.0) ? 0.0 : (f > 1.0) ? 1.0 : f; }
 SLANG_FORCE_INLINE double F64_frac(double f) { return f - F64_floor(f); }
-SLANG_FORCE_INLINE double F64_radians(double f) { return f * 0.01745329222; }
 
 SLANG_FORCE_INLINE bool F64_isnan(double f) { return isnan(f); }
 SLANG_FORCE_INLINE bool F64_isfinite(double f) { return isfinite(f); }
 SLANG_FORCE_INLINE bool F64_isinf(double f) { return isinf(f); }
 
 // Binary
-SLANG_FORCE_INLINE double F64_min(double a, double b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE double F64_max(double a, double b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE double F64_min(double a, double b) { return ::fmin(a, b); }
+SLANG_FORCE_INLINE double F64_max(double a, double b) { return ::fmax(a, b); }
 SLANG_FORCE_INLINE double F64_pow(double a, double b) { return ::pow(a, b); }
 SLANG_FORCE_INLINE double F64_fmod(double a, double b) { return ::fmod(a, b); }
 SLANG_FORCE_INLINE double F64_remainder(double a, double b) { return ::remainder(a, b); }
-SLANG_FORCE_INLINE double F64_step(double a, double b) { return double(b >= a); }
 SLANG_FORCE_INLINE double F64_atan2(double a, double b) { return ::atan2(a, b); }
 
-// TODO(JS): 
-// Note C++ has ldexp, but it takes an integer for the exponent, it seems HLSL takes both as float
-SLANG_FORCE_INLINE double F64_ldexp(double m, double e) { return m * ::pow(2.0, e); }
-
-// Ternary 
-SLANG_FORCE_INLINE double F64_smoothstep(double min, double max, double x) 
-{ 
-    const double t = x < min ? 0.0 : ((x > max) ? 1.0 : (x - min) / (max - min)); 
-    return t * t * (3.0 - 2.0 * t);
+SLANG_FORCE_INLINE double F64_frexp(double x, double& e)
+{
+    int ei;
+    double m = ::frexp(x, &ei);
+    e = ei;
+    return m;
+}
+SLANG_FORCE_INLINE double F64_modf(double x, double& ip)
+{
+    return ::modf(x, &ip);
 }
-SLANG_FORCE_INLINE double F64_lerp(double x, double y, double s) { return x + s * (y - x); }
-SLANG_FORCE_INLINE double F64_clamp(double x, double min, double max) { return (x < min) ? min : ((x > max) ? max : x); }
-SLANG_FORCE_INLINE void F64_sincos(double f, double& outSin, double& outCos) { outSin = F64_sin(f); outCos = F64_cos(f); }
 
 SLANG_FORCE_INLINE void F64_asuint(double d, uint32_t& low, uint32_t& hi)
 {
@@ -176,6 +175,9 @@ SLANG_FORCE_INLINE void F64_asint(double d, int32_t& low, int32_t& hi)
     hi = int32_t(u.u >> 32);
 }
 
+// Ternary
+SLANG_FORCE_INLINE double F64_fma(double a, double b, double c) { return ::fma(a, b, c); }
+
 // ----------------------------- I32 -----------------------------------------
 
 SLANG_FORCE_INLINE int32_t I32_abs(int32_t f) { return (f < 0) ? -f : f; }
@@ -183,8 +185,6 @@ SLANG_FORCE_INLINE int32_t I32_abs(int32_t f) { return (f < 0) ? -f : f; }
 SLANG_FORCE_INLINE int32_t I32_min(int32_t a, int32_t b) { return a < b ? a : b; }
 SLANG_FORCE_INLINE int32_t I32_max(int32_t a, int32_t b) { return a > b ? a : b; }
 
-SLANG_FORCE_INLINE int32_t I32_clamp(int32_t x, int32_t min, int32_t max) { return ( x < min) ? min : ((x > max) ? max : x); }
-
 SLANG_FORCE_INLINE float I32_asfloat(int32_t x) { Union32 u; u.i = x; return u.f; }
 SLANG_FORCE_INLINE uint32_t I32_asuint(int32_t x) { return uint32_t(x); }
 SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi )
@@ -201,8 +201,6 @@ SLANG_FORCE_INLINE uint32_t U32_abs(uint32_t f) { return f; }
 SLANG_FORCE_INLINE uint32_t U32_min(uint32_t a, uint32_t b) { return a < b ? a : b; }
 SLANG_FORCE_INLINE uint32_t U32_max(uint32_t a, uint32_t b) { return a > b ? a : b; }
 
-SLANG_FORCE_INLINE uint32_t U32_clamp(uint32_t x, uint32_t min, uint32_t max) { return ( x < min) ? min : ((x > max) ? max : x); }
-
 SLANG_FORCE_INLINE float U32_asfloat(uint32_t x) { Union32 u; u.u = x; return u.f; }
 SLANG_FORCE_INLINE uint32_t U32_asint(int32_t x) { return uint32_t(x); } 
 
@@ -238,8 +236,6 @@ SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f) { return f; }
 SLANG_FORCE_INLINE uint64_t U64_min(uint64_t a, uint64_t b) { return a < b ? a : b; }
 SLANG_FORCE_INLINE uint64_t U64_max(uint64_t a, uint64_t b) { return a > b ? a : b; }
 
-SLANG_FORCE_INLINE uint64_t U64_clamp(uint64_t x, uint64_t min, uint64_t max) { return ( x < min) ? min : ((x > max) ? max : x); }
-
 SLANG_FORCE_INLINE uint32_t U64_countbits(uint64_t v)
 {
 #if SLANG_GCC_FAMILY    
@@ -264,8 +260,6 @@ SLANG_FORCE_INLINE int64_t I64_abs(int64_t f) { return (f < 0) ? -f : f; }
 SLANG_FORCE_INLINE int64_t I64_min(int64_t a, int64_t b) { return a < b ? a : b; }
 SLANG_FORCE_INLINE int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b; }
 
-SLANG_FORCE_INLINE int64_t I64_clamp(int64_t x, int64_t min, int64_t max) { return ( x < min) ? min : ((x > max) ? max : x); }
-
 #ifdef SLANG_PRELUDE_NAMESPACE
 } 
 #endif
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 457fb4246..0a2ec088b 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -131,67 +131,113 @@ union Union64
 // ----------------------------- F32 -----------------------------------------
 
 // Unary 
-SLANG_CUDA_CALL float F32_rcp(float f) { return 1.0f / f; }
+SLANG_CUDA_CALL float F32_ceil(float f) { return ::ceilf(f); }
+SLANG_CUDA_CALL float F32_floor(float f) { return ::floorf(f); }
+SLANG_CUDA_CALL float F32_round(float f) { return ::roundf(f); }
+SLANG_CUDA_CALL float F32_sin(float f) { return ::sinf(f); }
+SLANG_CUDA_CALL float F32_cos(float f) { return ::cosf(f); }
+SLANG_CUDA_CALL void F32_sincos(float f, float& s, float& c) { ::sincosf(f, &s, &c); }
+SLANG_CUDA_CALL float F32_tan(float f) { return ::tanf(f); }
+SLANG_CUDA_CALL float F32_asin(float f) { return ::asinf(f); }
+SLANG_CUDA_CALL float F32_acos(float f) { return ::acosf(f); }
+SLANG_CUDA_CALL float F32_atan(float f) { return ::atanf(f); }
+SLANG_CUDA_CALL float F32_sinh(float f) { return ::sinhf(f); }
+SLANG_CUDA_CALL float F32_cosh(float f) { return ::coshf(f); }
+SLANG_CUDA_CALL float F32_tanh(float f) { return ::tanhf(f); }
+SLANG_CUDA_CALL float F32_log2(float f) { return ::log2f(f); }
+SLANG_CUDA_CALL float F32_log(float f) { return ::logf(f); }
+SLANG_CUDA_CALL float F32_log10(float f) { return ::log10f(f); }
+SLANG_CUDA_CALL float F32_exp2(float f) { return ::exp2f(f); }
+SLANG_CUDA_CALL float F32_exp(float f) { return ::expf(f); }
+SLANG_CUDA_CALL float F32_abs(float f) { return ::fabsf(f); }
+SLANG_CUDA_CALL float F32_trunc(float f) { return ::truncf(f); }
+SLANG_CUDA_CALL float F32_sqrt(float f) { return ::sqrtf(f); }
+SLANG_CUDA_CALL float F32_rsqrt(float f) { return ::rsqrtf(f); }
 SLANG_CUDA_CALL float F32_sign(float f) { return ( f == 0.0f) ? f : (( f < 0.0f) ? -1.0f : 1.0f); } 
-SLANG_CUDA_CALL float F32_saturate(float f) { return (f < 0.0f) ? 0.0f : (f > 1.0f) ? 1.0f : f; }
-SLANG_CUDA_CALL float F32_frac(float f) { return f - floorf(f); }
+SLANG_CUDA_CALL float F32_frac(float f) { return f - F32_floor(f); }
 
 SLANG_CUDA_CALL bool F32_isnan(float f) { return isnan(f); }
 SLANG_CUDA_CALL bool F32_isfinite(float f) { return isfinite(f); }
 SLANG_CUDA_CALL bool F32_isinf(float f) { return isinf(f); }
 
 // Binary
-SLANG_CUDA_CALL float F32_min(float a, float b) { return a < b ? a : b; }
-SLANG_CUDA_CALL float F32_max(float a, float b) { return a > b ? a : b; }
-SLANG_CUDA_CALL float F32_step(float a, float b) { return float(b >= a); }
-
-// TODO(JS): 
-// Note CUDA has ldexp, but it takes an integer for the exponent, it seems HLSL takes both as float
-SLANG_CUDA_CALL float F32_ldexp(float m, float e) { return m * powf(2.0f, e); }
-
-// Ternary 
-SLANG_CUDA_CALL float F32_lerp(float x, float y, float s) { return x + s * (y - x); }
-SLANG_CUDA_CALL void F32_sincos(float f, float& outSin, float& outCos) { sincosf(f, &outSin, &outCos); }
-SLANG_CUDA_CALL float F32_smoothstep(float min, float max, float x) 
+SLANG_CUDA_CALL float F32_min(float a, float b) { return ::fminf(a, b); }
+SLANG_CUDA_CALL float F32_max(float a, float b) { return ::fmaxf(a, b); }
+SLANG_CUDA_CALL float F32_pow(float a, float b) { return ::powf(a, b); }
+SLANG_CUDA_CALL float F32_fmod(float a, float b) { return ::fmodf(a, b); }
+SLANG_CUDA_CALL float F32_remainder(float a, float b) { return ::remainderf(a, b); }
+SLANG_CUDA_CALL float F32_atan2(float a, float b) { return float(::atan2(a, b)); }
+
+SLANG_CUDA_CALL float F32_frexp(float x, float& e)
+{
+    int ei;
+    float m = ::frexpf(x, &ei);
+    e = ei;
+    return m;
+}
+SLANG_CUDA_CALL float F32_modf(float x, float& ip)
 {
-    const float t = x < min ? 0.0f : ((x > max) ? 1.0f : (x - min) / (max - min)); 
-    return t * t * (3.0 - 2.0 * t);
+    return ::modff(x, &ip);
 }
-SLANG_CUDA_CALL float F32_clamp(float x, float min, float max) { return ( x < min) ? min : ((x > max) ? max : x); }
 
 SLANG_CUDA_CALL uint32_t F32_asuint(float f) { Union32 u; u.f = f; return u.u; }
 SLANG_CUDA_CALL int32_t F32_asint(float f) { Union32 u; u.f = f; return u.i; }
 
+// Ternary
+SLANG_CUDA_CALL float F32_fma(float a, float b, float c) { return ::fmaf(a, b, c); }
+
+
 // ----------------------------- F64 -----------------------------------------
 
 // Unary 
-SLANG_CUDA_CALL double F64_rcp(double f) { return 1.0 / f; }
+SLANG_CUDA_CALL double F64_ceil(double f) { return ::ceil(f); }
+SLANG_CUDA_CALL double F64_floor(double f) { return ::floor(f); }
+SLANG_CUDA_CALL double F64_round(double f) { return ::round(f); }
+SLANG_CUDA_CALL double F64_sin(double f) { return ::sin(f); }
+SLANG_CUDA_CALL double F64_cos(double f) { return ::cos(f); }
+SLANG_CUDA_CALL void F64_sincos(double f, double& s, double& c) { ::sincos(f, &s, &c); }
+SLANG_CUDA_CALL double F64_tan(double f) { return ::tan(f); }
+SLANG_CUDA_CALL double F64_asin(double f) { return ::asin(f); }
+SLANG_CUDA_CALL double F64_acos(double f) { return ::acos(f); }
+SLANG_CUDA_CALL double F64_atan(double f) { return ::atan(f); }
+SLANG_CUDA_CALL double F64_sinh(double f) { return ::sinh(f); }
+SLANG_CUDA_CALL double F64_cosh(double f) { return ::cosh(f); }
+SLANG_CUDA_CALL double F64_tanh(double f) { return ::tanh(f); }
+SLANG_CUDA_CALL double F64_log2(double f) { return ::log2(f); }
+SLANG_CUDA_CALL double F64_log(double f) { return ::log(f); }
+SLANG_CUDA_CALL double F64_log10(float f) { return ::log10(f); }
+SLANG_CUDA_CALL double F64_exp2(double f) { return ::exp2(f); }
+SLANG_CUDA_CALL double F64_exp(double f) { return ::exp(f); }
+SLANG_CUDA_CALL double F64_abs(double f) { return ::fabs(f); }
+SLANG_CUDA_CALL double F64_trunc(double f) { return ::trunc(f); }
+SLANG_CUDA_CALL double F64_sqrt(double f) { return ::sqrt(f); }
+SLANG_CUDA_CALL double F64_rsqrt(double f) { return ::rsqrt(f); }
 SLANG_CUDA_CALL double F64_sign(double f) { return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); }
-SLANG_CUDA_CALL double F64_saturate(double f) { return (f < 0.0) ? 0.0 : (f > 1.0) ? 1.0 : f; }
-SLANG_CUDA_CALL double F64_frac(double f) { return f - floor(f); }
+SLANG_CUDA_CALL double F64_frac(double f) { return f - F64_floor(f); }
 
 SLANG_CUDA_CALL bool F64_isnan(double f) { return isnan(f); }
 SLANG_CUDA_CALL bool F64_isfinite(double f) { return isfinite(f); }
 SLANG_CUDA_CALL bool F64_isinf(double f) { return isinf(f); }
 
 // Binary
-SLANG_CUDA_CALL double F64_min(double a, double b) { return a < b ? a : b; }
-SLANG_CUDA_CALL double F64_max(double a, double b) { return a > b ? a : b; }
-SLANG_CUDA_CALL double F64_step(double a, double b) { return double(b >= a); }
-
-// TODO(JS): 
-// Note CUDA has ldexp, but it takes an integer for the exponent, it seems HLSL takes both as float
-SLANG_CUDA_CALL double F64_ldexp(double m, double e) { return m * pow(2.0, e); }
-
-// Ternary 
-SLANG_CUDA_CALL double F64_lerp(double x, double y, double s) { return x + s * (y - x); }
-SLANG_CUDA_CALL void F64_sincos(double f, double& outSin, double& outCos) { sincos(f, &outSin, &outCos); }
-SLANG_CUDA_CALL double F64_smoothstep(double min, double max, double x) 
-{ 
-    const double t = x < min ? 0.0 : ((x > max) ? 1.0 : (x - min) / (max - min)); 
-    return t * t * (3.0 - 2.0 * t);
+SLANG_CUDA_CALL double F64_min(double a, double b) { return ::fmin(a, b); }
+SLANG_CUDA_CALL double F64_max(double a, double b) { return ::fmax(a, b); }
+SLANG_CUDA_CALL double F64_pow(double a, double b) { return ::pow(a, b); }
+SLANG_CUDA_CALL double F64_fmod(double a, double b) { return ::fmod(a, b); }
+SLANG_CUDA_CALL double F64_remainder(double a, double b) { return ::remainder(a, b); }
+SLANG_CUDA_CALL double F64_atan2(double a, double b) { return ::atan2(a, b); }
+
+SLANG_CUDA_CALL double F64_frexp(double x, double& e)
+{
+    int ei;
+    double m = ::frexp(x, &ei);
+    e = ei;
+    return m;
+}
+SLANG_CUDA_CALL double F64_modf(double x, double& ip)
+{
+    return ::modf(x, &ip);
 }
-SLANG_CUDA_CALL double F64_clamp(double x, double min, double max) { return (x < min) ? min : ((x > max) ? max : x); }
 
 SLANG_CUDA_CALL void F64_asuint(double d, uint32_t& low, uint32_t& hi)
 {
@@ -209,6 +255,9 @@ SLANG_CUDA_CALL void F64_asint(double d, int32_t& low, int32_t& hi)
     hi = int32_t(u.u >> 32);
 }
 
+// Ternary
+SLANG_CUDA_CALL double F64_fma(double a, double b, double c) { return ::fma(a, b, c); }
+
 // ----------------------------- I32 -----------------------------------------
 
 // Unary
@@ -218,9 +267,6 @@ SLANG_CUDA_CALL int32_t I32_abs(int32_t f) { return (f < 0) ? -f : f; }
 SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b) { return a < b ? a : b; }
 SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b) { return a > b ? a : b; }
 
-// Ternary 
-SLANG_CUDA_CALL int32_t I32_clamp(int32_t x, int32_t min, int32_t max) { return ( x < min) ? min : ((x > max) ? max : x); }
-
 SLANG_CUDA_CALL float I32_asfloat(int32_t x) { Union32 u; u.i = x; return u.f; }
 SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x) { return uint32_t(x); }
 SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi )
@@ -239,9 +285,6 @@ SLANG_CUDA_CALL uint32_t U32_abs(uint32_t f) { return f; }
 SLANG_CUDA_CALL uint32_t U32_min(uint32_t a, uint32_t b) { return a < b ? a : b; }
 SLANG_CUDA_CALL uint32_t U32_max(uint32_t a, uint32_t b) { return a > b ? a : b; }
 
-// Ternary 
-SLANG_CUDA_CALL uint32_t U32_clamp(uint32_t x, uint32_t min, uint32_t max) { return ( x < min) ? min : ((x > max) ? max : x); }
-
 SLANG_CUDA_CALL float U32_asfloat(uint32_t x) { Union32 u; u.u = x; return u.f; }
 SLANG_CUDA_CALL uint32_t U32_asint(int32_t x) { return uint32_t(x); } 
 
@@ -266,8 +309,6 @@ SLANG_CUDA_CALL int64_t I64_abs(int64_t f) { return (f < 0) ? -f : f; }
 SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b) { return a < b ? a : b; }
 SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b; }
 
-SLANG_CUDA_CALL int64_t I64_clamp(int64_t x, int64_t min, int64_t max) { return ( x < min) ? min : ((x > max) ? max : x); }
-
 // ----------------------------- U64 -----------------------------------------
 
 SLANG_CUDA_CALL int64_t U64_abs(uint64_t f) { return f; }
@@ -275,8 +316,6 @@ SLANG_CUDA_CALL int64_t U64_abs(uint64_t f) { return f; }
 SLANG_CUDA_CALL int64_t U64_min(uint64_t a, uint64_t b) { return a < b ? a : b; }
 SLANG_CUDA_CALL int64_t U64_max(uint64_t a, uint64_t b) { return a > b ? a : b; }
 
-SLANG_CUDA_CALL int64_t U64_clamp(uint64_t x, uint64_t min, uint64_t max) { return ( x < min) ? min : ((x > max) ? max : x); }
-
 SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v)
 {
     // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html#group__CUDA__MATH__INTRINSIC__INT_1g43c9c7d2b9ebf202ff1ef5769989be46
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 20158c1b1..03496ccc8 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -349,8 +349,13 @@ void abort();
 // Absolute value (HLSL SM 1.0)
 
 __generic<T : __BuiltinSignedArithmeticType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_abs($0)")
+__target_intrinsic(cpp, "$P_abs($0)")
 T abs(T x);
 /*{
+    // Note: this simple definition may not be appropriate for floating-point inputs
     return x < 0 ? -x : x;
 }*/
 
@@ -372,6 +377,10 @@ matrix<T,N,M> abs(matrix<T,N,M> x)
 // Inverse cosine (HLSL SM 1.0)
 
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_acos($0)")
+__target_intrinsic(cpp, "$P_acos($0)")
 T acos(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -530,6 +539,8 @@ matrix<float,N,M> asfloat(matrix<float,N,M> x);
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_asin($0)")
+__target_intrinsic(cpp, "$P_asin($0)")
 T asin(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -659,6 +670,10 @@ matrix<uint,N,M> asuint(matrix<uint,N,M> x);
 
 // Inverse tangent (HLSL SM 1.0)
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_atan($0)")
+__target_intrinsic(cpp, "$P_atan($0)")
 T atan(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -679,6 +694,8 @@ matrix<T, N, M> atan(matrix<T, N, M> x)
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"atan($0,$1)")
+__target_intrinsic(cuda, "$P_atan2($0, $1)")
+__target_intrinsic(cpp, "$P_atan2($0, $1)")
 T atan2(T y, T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -698,6 +715,10 @@ matrix<T,N,M> atan2(matrix<T,N,M> y, matrix<T,N,M> x)
 
 // Ceiling (HLSL SM 1.0)
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_ceil($0)")
+__target_intrinsic(cpp, "$P_ceil($0)")
 T ceil(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -767,6 +788,10 @@ void clip(matrix<T,N,M> x)
 
 // Cosine
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_cos($0)")
+__target_intrinsic(cpp, "$P_cos($0)")
 T cos(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -786,6 +811,10 @@ matrix<T, N, M> cos(matrix<T, N, M> x)
 
 // Hyperbolic cosine
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_cosh($0)")
+__target_intrinsic(cpp, "$P_cosh($0)")
 T cosh(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -804,7 +833,10 @@ matrix<T, N, M> cosh(matrix<T, N, M> x)
 }
 
 // Population count
+__target_intrinsic(hlsl)
 __target_intrinsic(glsl, "bitCount")
+__target_intrinsic(cuda, "$P_countbits($0)")
+__target_intrinsic(cpp, "$P_countbits($0)")
 uint countbits(uint value);
 
 // Cross product
@@ -1070,6 +1102,10 @@ matrix<T,N,M> EvaluateAttributeSnapped(matrix<T,N,M> x, int2 offset);
 // Base-e exponent
 
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_exp($0)")
+__target_intrinsic(cpp, "$P_exp($0)")
 T exp(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1090,6 +1126,10 @@ matrix<T, N, M> exp(matrix<T, N, M> x)
 // Base-2 exponent
 
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_exp2($0)")
+__target_intrinsic(cpp, "$P_exp2($0)")
 T exp2(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1133,7 +1173,10 @@ vector<T,N> faceforward(vector<T,N> n, vector<T,N> i, vector<T,N> ng)
 }
 
 // Find first set bit starting at high bit and working down
+__target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findMSB")
+__target_intrinsic(cuda, "$P_firstbithigh($0)")
+__target_intrinsic(cpp, "$P_firstbithigh($0)")
 int firstbithigh(int value);
 
 __target_intrinsic(hlsl)
@@ -1144,7 +1187,10 @@ vector<int, N> firstbithigh(vector<int, N> value)
     VECTOR_MAP_UNARY(int, N, firstbithigh, value);
 }
 
+__target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findMSB")
+__target_intrinsic(cuda, "$P_firstbithigh($0)")
+__target_intrinsic(cpp, "$P_firstbithigh($0)")
 uint firstbithigh(uint value);
 
 __target_intrinsic(hlsl)
@@ -1156,7 +1202,10 @@ vector<uint,N> firstbithigh(vector<uint,N> value)
 }
 
 // Find first set bit starting at low bit and working up
+__target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findLSB")
+__target_intrinsic(cuda, "$P_firstbitlow($0)")
+__target_intrinsic(cpp, "$P_firstbitlow($0)")
 int firstbitlow(int value);
 
 __target_intrinsic(hlsl)
@@ -1167,7 +1216,10 @@ vector<int,N> firstbitlow(vector<int,N> value)
     VECTOR_MAP_UNARY(int, N, firstbitlow, value);
 }
 
+__target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findLSB")
+__target_intrinsic(cuda, "$P_firstbitlow($0)")
+__target_intrinsic(cpp, "$P_firstbitlow($0)")
 uint firstbitlow(uint value);
 
 __target_intrinsic(hlsl)
@@ -1181,6 +1233,10 @@ vector<uint,N> firstbitlow(vector<uint,N> value)
 // Floor (HLSL SM 1.0)
 
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_floor($0)")
+__target_intrinsic(cpp, "$P_floor($0)")
 T floor(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1201,6 +1257,8 @@ matrix<T, N, M> floor(matrix<T, N, M> x)
 // Fused multiply-add for doubles
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_fma($0, $1, $2)")
+__target_intrinsic(cpp, "$P_fma($0, $1, $2)")
 double fma(double a, double b, double c);
 
 __generic<let N : int>
@@ -1220,6 +1278,10 @@ matrix<double, N, M> fma(matrix<double, N, M> a, matrix<double, N, M> b, matrix<
 
 // Floating point remainder of x/y
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_fmod($0, $1)")
+__target_intrinsic(cpp, "$P_fmod($0, $1)")
 T fmod(T x, T y);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1239,7 +1301,10 @@ matrix<T, N, M> fmod(matrix<T, N, M> x, matrix<T, N, M> y)
 
 // Fractional part
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
 __target_intrinsic(glsl, fract)
+__target_intrinsic(cuda, "$P_frac($0)")
+__target_intrinsic(cpp, "$P_frac($0)")
 T frac(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1438,8 +1503,8 @@ void InterlockedXor(__ref uint dest, uint value, out uint original_value);
 
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
-__target_intrinsic(cpu)
-__target_intrinsic(cuda)
+__target_intrinsic(cuda, "$P_isfinite($0)")
+__target_intrinsic(cpp, "$P_isfinite($0)")
 bool isfinite(T x)
 {
     return !(isinf(x) || isnan(x));
@@ -1461,6 +1526,10 @@ matrix<bool, N, M> isfinite(matrix<T, N, M> x)
 
 // Is floating-point value infinite?
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_isinf($0)")
+__target_intrinsic(cpp, "$P_isinf($0)")
 bool isinf(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1480,6 +1549,10 @@ matrix<bool, N, M> isinf(matrix<T, N, M> x)
 
 // Is floating-point value not-a-number?
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_isnan($0)")
+__target_intrinsic(cpp, "$P_isnan($0)")
 bool isnan(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1565,6 +1638,10 @@ float4 lit(float n_dot_l, float n_dot_h, float m)
 
 // Base-e logarithm
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_log($0)")
+__target_intrinsic(cpp, "$P_log($0)")
 T log(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1586,6 +1663,8 @@ matrix<T, N, M> log(matrix<T, N, M> x)
 __generic<T : __BuiltinFloatingPointType> 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "(log( $0 ) * $S0( 0.43429448190325182765112891891661) )" )
+__target_intrinsic(cuda, "$P_log10($0)")
+__target_intrinsic(cpp, "$P_log10($0)")
 T log10(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1605,6 +1684,10 @@ matrix<T,N,M> log10(matrix<T,N,M> x)
 
 // Base-2 logarithm
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_log2($0)")
+__target_intrinsic(cpp, "$P_log2($0)")
 T log2(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1627,6 +1710,8 @@ matrix<T,N,M> log2(matrix<T,N,M> x)
 __generic<T : __BuiltinArithmeticType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, fma)
+__target_intrinsic(cuda, "$P_fma($0, $1, $2)")
+__target_intrinsic(cpp, "$P_fma($0, $1, $2)")
 T mad(T mvalue, T avalue, T bvalue);
 
 __generic<T : __BuiltinArithmeticType, let N : int>
@@ -1646,6 +1731,10 @@ matrix<T, N, M> mad(matrix<T, N, M> mvalue, matrix<T, N, M> avalue, matrix<T, N,
 
 // maximum
 __generic<T : __BuiltinArithmeticType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_max($0, $1)")
+__target_intrinsic(cpp, "$P_max($0, $1)")
 T max(T x, T y);
 // Note: a stdlib implementation of `max` (or `min`) will require splitting
 // floating-point and integer cases apart, because the floating-point
@@ -1669,6 +1758,10 @@ matrix<T, N, M> max(matrix<T, N, M> x, matrix<T, N, M> y)
 
 // minimum
 __generic<T : __BuiltinArithmeticType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_min($0, $1)")
+__target_intrinsic(cpp, "$P_min($0, $1)")
 T min(T x, T y);
 
 __generic<T : __BuiltinArithmeticType, let N : int>
@@ -1757,28 +1850,64 @@ T mul(vector<T, N> x, vector<T, N> y)
     return dot(x, y);
 }
 
-${{{{
-// TODO: The following functions could conceivably be defined
-// in the stdlib for the benefit of targets without direct
-// support for matrices, but the use of `__intrinsic_op` to
-// map them to a dedicated IR instruction interferes with
-// that choice.
-}}}}
-
 // vector-matrix
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-__intrinsic_op(mulVectorMatrix)
-vector<T,M> mul(vector<T,N> x, matrix<T,N,M> y);
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl, "($1 * $0)") // GLSL expansion lists the operands in the opposite order
+vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
+{
+    vector<T,M> result;
+    for( int j = 0; j < M; ++j ) // one result component per second index j of `right`
+    {
+        T sum = T(0);
+        for( int i = 0; i < N; ++i )
+        {
+            sum += left[i] * right[i][j]; // result[j] = sum over i of left[i] * right[i][j]
+        }
+        result[j] = sum;
+    }
+    return result;
+}
 
 // matrix-vector
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
-__intrinsic_op(mulMatrixVector)
-vector<T,N> mul(matrix<T,N,M> x, vector<T,M> y);
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl, "($1 * $0)") // GLSL expansion lists the operands in the opposite order
+vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
+{
+    vector<T,N> result;
+    for( int i = 0; i < N; ++i ) // one result component per first index i of `left`
+    {
+        T sum = T(0);
+        for( int j = 0; j < M; ++j )
+        {
+            sum += left[i][j] * right[j]; // result[i] = sum over j of left[i][j] * right[j]
+        }
+        result[i] = sum;
+    }
+    return result;
+}
+
 
 // matrix-matrix
 __generic<T : __BuiltinArithmeticType, let R : int, let N : int, let C : int>
-__intrinsic_op(mulMatrixMatrix)
-matrix<T,R,C> mul(matrix<T,R,N> x, matrix<T,N,C> y);
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl, "($1 * $0)") // GLSL expansion lists the operands in the opposite order
+matrix<T,R,C> mul(matrix<T,R,N> left, matrix<T,N,C> right)
+{
+    matrix<T,R,C> result; // product of the R x N `left` and the N x C `right`
+    for( int r = 0; r < R; ++r)
+    for( int c = 0; c < C; ++c)
+    {
+        T sum = T(0);
+        for( int i = 0; i < N; ++i )
+        {
+            sum += left[r][i] * right[i][c]; // result[r][c] = sum over i of left[r][i] * right[i][c]
+        }
+        result[r][c] = sum;
+    }
+    return result;
+}
 
 // noise (deprecated)
 
@@ -1839,6 +1968,10 @@ vector<T,N> normalize(vector<T,N> x)
 
 // Raise to a power
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_pow($0, $1)")
+__target_intrinsic(cpp, "$P_pow($0, $1)")
 T pow(T x, T y);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -1999,7 +2132,10 @@ vector<T,N> refract(vector<T,N> i, vector<T,N> n, T eta)
 }
 
 // Reverse order of bits
+__target_intrinsic(hlsl)
 __target_intrinsic(glsl, "bitfieldReverse")
+__target_intrinsic(cuda, "$P_reversebits($0)")
+__target_intrinsic(cpp, "$P_reversebits($0)")
 uint reversebits(uint value);
 
 __target_intrinsic(glsl, "bitfieldReverse")
@@ -2011,6 +2147,10 @@ vector<uint, N> reversebits(vector<uint, N> value)
 
 // Round-to-nearest
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_round($0)")
+__target_intrinsic(cpp, "$P_round($0)")
 T round(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -2032,7 +2172,12 @@ matrix<T,N,M> round(matrix<T,N,M> x)
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "inversesqrt($0)")
-T rsqrt(T x);
+__target_intrinsic(cuda, "$P_rsqrt($0)")
+__target_intrinsic(cpp, "$P_rsqrt($0)")
+T rsqrt(T x)
+{
+    return T(1.0) / sqrt(x); // reciprocal square root written out as 1 / sqrt(x)
+}
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
@@ -2076,7 +2221,10 @@ matrix<T,N,M> saturate(matrix<T,N,M> x)
 
 // Extract sign of value
 __generic<T : __BuiltinSignedArithmeticType>
+__target_intrinsic(hlsl)
 __target_intrinsic(glsl, "int(sign($0))")
+__target_intrinsic(cuda, "$P_sign($0)")
+__target_intrinsic(cpp, "$P_sign($0)")
 int sign(T x);
 
 __generic<T : __BuiltinSignedArithmeticType, let N : int>
@@ -2098,6 +2246,10 @@ matrix<int, N, M> sign(matrix<T, N, M> x)
 // Sine
 
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_sin($0)")
+__target_intrinsic(cpp, "$P_sin($0)")
 T sin(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -2118,6 +2270,7 @@ matrix<T, N, M> sin(matrix<T, N, M> x)
 // Sine and cosine
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
+__target_intrinsic(cuda, "$P_sincos($0, $1, $2)")
 void sincos(T x, out T s, out T c)
 {
     s = sin(x);
@@ -2142,6 +2295,10 @@ void sincos(matrix<T,N,M> x, out matrix<T,N,M> s, out matrix<T,N,M> c)
 
 // Hyperbolic Sine
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_sinh($0)")
+__target_intrinsic(cpp, "$P_sinh($0)")
 T sinh(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -2186,6 +2343,10 @@ matrix<T, N, M> smoothstep(matrix<T, N, M> min, matrix<T, N, M> max, matrix<T, N
 
 // Square root
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_sqrt($0)")
+__target_intrinsic(cpp, "$P_sqrt($0)")
 T sqrt(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -2229,6 +2390,10 @@ matrix<T, N, M> step(matrix<T, N, M> y, matrix<T, N, M> x)
 
 // Tangent
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_tan($0)")
+__target_intrinsic(cpp, "$P_tan($0)")
 T tan(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -2248,6 +2413,10 @@ matrix<T, N, M> tan(matrix<T, N, M> x)
 
 // Hyperbolic tangent
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_tanh($0)")
+__target_intrinsic(cpp, "$P_tanh($0)")
 T tanh(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
@@ -2280,6 +2449,10 @@ matrix<T, M, N> transpose(matrix<T, N, M> x)
 
 // Truncate to integer
 __generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(cuda, "$P_trunc($0)")
+__target_intrinsic(cpp, "$P_trunc($0)")
 T trunc(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index 55f251565..3631040b8 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -1777,6 +1777,42 @@ void CLikeSourceEmitter::emitIntrinsicCallExprImpl(
                 }
                 break;
 
+            case 'P':
+                // Type-based prefix as used for CUDA and C++ targets: emits a name such as `F32` chosen from the type of the first argument
+                {
+                    Index argIndex = 0; // the prefix is always derived from argument 0
+                    SLANG_RELEASE_ASSERT(argCount > argIndex);
+                    auto arg = args[argIndex].get();
+                    auto argType = arg->getDataType();
+
+                    const char* str = "";
+                    switch(argType->op)
+                    {
+                    #define CASE(OP, STR) \
+                    case kIROp_##OP: str = #STR; break
+
+                    CASE(Int8Type,      I8);
+                    CASE(Int16Type,     I16);
+                    CASE(IntType,       I32);
+                    CASE(Int64Type,     I64);
+                    CASE(UInt8Type,     U8);
+                    CASE(UInt16Type,    U16);
+                    CASE(UIntType,      U32);
+                    CASE(UInt64Type,    U64);
+                    CASE(HalfType,      F16);
+                    CASE(FloatType,     F32);
+                    CASE(DoubleType,    F64);
+
+                    #undef CASE
+
+                    default:
+                        SLANG_UNEXPECTED("unexpected type in intrinsic definition"); // only the scalar types listed above have a prefix
+                        break;
+                    }
+                    m_writer->emit(str); // writes just the prefix; the rest of the name comes from the intrinsic string
+                }
+                break;
+
             default:
                 SLANG_UNEXPECTED("bad format in intrinsic definition");
                 break;
@@ -2059,17 +2095,6 @@ void CLikeSourceEmitter::defaultEmitInstExpr(IRInst* inst, const EmitOpInfo& inO
         }
         break;
 
-    case kIROp_Mul_Vector_Matrix:
-    case kIROp_Mul_Matrix_Vector:
-    case kIROp_Mul_Matrix_Matrix:
-        // Default impl
-        m_writer->emit("mul(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(")");
-        break;
-
     case kIROp_swizzle:
         {
             auto prec = getInfo(EmitOp::Postfix);
diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp
index 7fb04c33b..bece6c2d0 100644
--- a/source/slang/slang-emit-cpp.cpp
+++ b/source/slang/slang-emit-cpp.cpp
@@ -901,56 +901,6 @@ void CPPSourceEmitter::_emitSignature(const UnownedStringSlice& funcName, const
     writer->emit(")");
 }
 
-void CPPSourceEmitter::_emitVecMatMulDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp)
-{
-    IRFuncType* funcType = specOp->signatureType;
-    SLANG_ASSERT(funcType->getParamCount() == 2);
-    IRType* paramType0 = funcType->getParamType(0);
-    IRType* paramType1 = funcType->getParamType(1);
-    IRType* retType = specOp->returnType;
-
-    SourceWriter* writer = getSourceWriter();
-
-    _emitSignature(funcName, specOp);
-
-    writer->emit("\n{\n");
-    writer->indent();
-
-    emitType(retType);
-    writer->emit(" r;\n");
-
-    TypeDimension dimA = _getTypeDimension(paramType0, false);
-    TypeDimension dimB = _getTypeDimension(paramType1, true);
-    TypeDimension resultDim = _getTypeDimension(retType, paramType1->op == kIROp_VectorType);
-
-    for (int i = 0; i < resultDim.rowCount; ++i)
-    {
-        for (int j = 0; j < resultDim.colCount; ++j)
-        {
-            _emitAccess(UnownedStringSlice::fromLiteral("r"), resultDim, i, j, writer);
-            writer->emit(" = ");
-
-            for (int k = 0; k < dimA.colCount; k++)
-            {
-                if (k > 0)
-                {
-                    writer->emit(" + ");
-                }
-                _emitAccess(UnownedStringSlice::fromLiteral("a"), dimA, i, k, writer);
-                writer->emit(" * ");
-                _emitAccess(UnownedStringSlice::fromLiteral("b"), dimB, k, j, writer);
-            }
-
-            writer->emit(";\n");
-        }
-    }
-
-    writer->emit("return r;\n");
-
-    writer->dedent();
-    writer->emit("}\n\n");
-}
-
 UnownedStringSlice CPPSourceEmitter::_getAndEmitSpecializedOperationDefinition(HLSLIntrinsic::Op op, IRType*const* argTypes, Int argCount, IRType* retType)
 {
     HLSLIntrinsic intrinsic;
@@ -960,38 +910,6 @@ UnownedStringSlice CPPSourceEmitter::_getAndEmitSpecializedOperationDefinition(H
     return  _getFuncName(specOp);
 }
 
-void CPPSourceEmitter::_emitLengthDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp)
-{
-    SourceWriter* writer = getSourceWriter();
-
-    IRFuncType* funcType = specOp->signatureType;
-    SLANG_ASSERT(funcType->getParamCount() == 1);
-    IRType* paramType0 = funcType->getParamType(0);
-
-    SLANG_ASSERT(paramType0->op == kIROp_VectorType);
-
-    IRBasicType* elementType = as<IRBasicType>(static_cast<IRVectorType*>(paramType0)->getElementType());
-
-    IRType* dotArgs[] = { paramType0, paramType0 };
-    UnownedStringSlice dotFuncName = _getAndEmitSpecializedOperationDefinition(HLSLIntrinsic::Op::Dot, dotArgs, SLANG_COUNT_OF(dotArgs), elementType);
-
-    UnownedStringSlice sqrtName = _getScalarFuncName(HLSLIntrinsic::Op::Sqrt, elementType);
-
-    _emitSignature(funcName, specOp);
-
-    writer->emit("\n{\n");
-    writer->indent();
-
-    writer->emit("return ");
-    writer->emit(sqrtName);
-    writer->emit("(");
-    writer->emit(dotFuncName);
-    writer->emit("(a, a));\n");
-   
-    writer->dedent();
-    writer->emit("}\n\n");
-}
-
 void CPPSourceEmitter::_emitGetAtDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp)
 {
     SourceWriter* writer = getSourceWriter();
@@ -1049,47 +967,6 @@ void CPPSourceEmitter::_emitGetAtDefinition(const UnownedStringSlice& funcName,
     }
 }
 
-void CPPSourceEmitter::_emitNormalizeDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp)
-{    
-    SourceWriter* writer = getSourceWriter();
-
-    IRFuncType* funcType = specOp->signatureType;
-    SLANG_ASSERT(funcType->getParamCount() == 1);
-    IRType* paramType0 = funcType->getParamType(0);
-
-    SLANG_ASSERT(paramType0->op == kIROp_VectorType);
-
-    IRBasicType* elementType = as<IRBasicType>(static_cast<IRVectorType*>(paramType0)->getElementType());
-
-    IRType* dotArgs[] = { paramType0, paramType0 };
-    UnownedStringSlice dotFuncName = _getAndEmitSpecializedOperationDefinition(HLSLIntrinsic::Op::Dot, dotArgs, SLANG_COUNT_OF(dotArgs), elementType);
-    UnownedStringSlice rsqrtName = _getScalarFuncName(HLSLIntrinsic::Op::RecipSqrt, elementType);
-    IRType* vecMulScalarArgs[] = { paramType0, elementType };
-    UnownedStringSlice vecMulScalarName = _getAndEmitSpecializedOperationDefinition(HLSLIntrinsic::Op::Mul, vecMulScalarArgs, SLANG_COUNT_OF(vecMulScalarArgs), paramType0);
-
-    TypeDimension dimA = _getTypeDimension(paramType0, false);
-
-    // Assumes C++
-
-    _emitSignature(funcName, specOp);
-
-    writer->emit("\n{\n");
-    writer->indent();
-
-    writer->emit("return ");
-
-    // Assumes C++ here
-    writer->emit("a * ");
-    writer->emit(rsqrtName);
-    writer->emit("(");
-    writer->emit(dotFuncName);
-    writer->emit("(a, a));\n");
-
-    writer->dedent();
-    writer->emit("}\n\n");
-}
-
-
 void CPPSourceEmitter::_emitConstructConvertDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp)
 {
     SourceWriter* writer = getSourceWriter();
@@ -1329,42 +1206,6 @@ void CPPSourceEmitter::_emitConstructFromScalarDefinition(const UnownedStringSli
     writer->emit("}\n\n");
 }
 
-void CPPSourceEmitter::_emitReflectDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp)
-{
-    SourceWriter* writer = getSourceWriter();
-
-    IRFuncType* funcType = specOp->signatureType;
-    SLANG_ASSERT(funcType->getParamCount() == 2);
-    IRType* paramType0 = funcType->getParamType(0);
-
-    SLANG_ASSERT(paramType0->op == kIROp_VectorType);
-
-    IRBasicType* elementType = as<IRBasicType>(static_cast<IRVectorType*>(paramType0)->getElementType());
-
-    // Make sure we have all these functions defined before emitting 
-    IRType* dotArgs[] = { paramType0, paramType0 };
-    UnownedStringSlice dotFuncName = _getAndEmitSpecializedOperationDefinition(HLSLIntrinsic::Op::Dot, dotArgs, SLANG_COUNT_OF(dotArgs), elementType);
-
-    IRType* subArgs[] = { paramType0, paramType0};
-    UnownedStringSlice subFuncName = _getAndEmitSpecializedOperationDefinition(HLSLIntrinsic::Op::Sub, subArgs, SLANG_COUNT_OF(subArgs), paramType0);
-
-    IRType* vecMulScalarArgs[] = { paramType0, elementType };
-    UnownedStringSlice vecMulScalarFuncName = _getAndEmitSpecializedOperationDefinition(HLSLIntrinsic::Op::Mul, vecMulScalarArgs, SLANG_COUNT_OF(vecMulScalarArgs), paramType0);
-
-    // Assumes C++
-
-    _emitSignature(funcName, specOp);
-    writer->emit("\n{\n");
-    writer->indent();
-
-    writer->emit("return a - b * 2.0 * ");
-    writer->emit(dotFuncName);
-    writer->emit("(a, b);\n");
-
-    writer->dedent();
-    writer->emit("}\n\n");
-}
-
 void CPPSourceEmitter::_maybeEmitSpecializedOperationDefinition(const HLSLIntrinsic* specOp)
 {
     // Check if it's been emitted already, if not add it.
@@ -1385,28 +1226,11 @@ void CPPSourceEmitter::emitSpecializedOperationDefinition(const HLSLIntrinsic* s
         {
             return _emitInitDefinition(_getFuncName(specOp), specOp);
         }
-        case Op::VecMatMul:
-        case Op::Dot:
-        {
-            return _emitVecMatMulDefinition(_getFuncName(specOp), specOp);
-        }
         case Op::Any:
         case Op::All:
         {
             return _emitAnyAllDefinition(_getFuncName(specOp), specOp);
         }
-        case Op::Normalize:
-        {
-            return _emitNormalizeDefinition(_getFuncName(specOp), specOp);
-        }
-        case Op::Length:
-        {
-            return _emitLengthDefinition(_getFuncName(specOp), specOp);
-        }
-        case Op::Reflect:
-        {
-            return _emitReflectDefinition(_getFuncName(specOp), specOp);
-        }
         case Op::ConstructConvert:
         {
             return _emitConstructConvertDefinition(_getFuncName(specOp), specOp);
diff --git a/source/slang/slang-emit-cpp.h b/source/slang/slang-emit-cpp.h
index 7f9046643..99f180850 100644
--- a/source/slang/slang-emit-cpp.h
+++ b/source/slang/slang-emit-cpp.h
@@ -91,15 +91,10 @@ protected:
     void _calcGlobalParams(const List<EmitAction>& actions, List<GlobalParamInfo>& outParams, IRGlobalParam** outEntryPointGlobalParams);
     void _emitUniformStateMembers(const List<EmitAction>& actions, IRGlobalParam** outEntryPointGlobalParams);
 
-    void _emitVecMatMulDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp);
-
     void _emitAryDefinition(const HLSLIntrinsic* specOp);
 
     // Really we don't want any of these defined like they are here, they should be defined in slang stdlib 
     void _emitAnyAllDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp);
-    void _emitLengthDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp);
-    void _emitNormalizeDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp);
-    void _emitReflectDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp);
     void _emitConstructConvertDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp);
     void _emitConstructFromScalarDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp);
     void _emitGetAtDefinition(const UnownedStringSlice& funcName, const HLSLIntrinsic* specOp);
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index 3531d55db..91439d5d3 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -112,26 +112,7 @@ SlangResult CUDASourceEmitter::calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicT
     
     switch (op)
     {
-        case Op::Sin:
-        case Op::Cos:
-        case Op::Tan:
-        case Op::ArcSin:
-        case Op::ArcCos:
-        case Op::ArcTan:
-        case Op::ArcTan2:
-        case Op::Floor:
-        case Op::Ceil:
-        case Op::FMod:
-        case Op::Exp2:
-        case Op::Exp:
-        case Op::Log:
-        case Op::Log2:
-        case Op::Log10:
         case Op::FRem:
-        case Op::Sqrt:
-        case Op::RecipSqrt:
-        case Op::Pow:
-        case Op::Trunc:
         {
             if (type->op == kIROp_FloatType || type->op == kIROp_DoubleType)
             {
@@ -139,25 +120,6 @@ SlangResult CUDASourceEmitter::calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicT
             }
             break;
         }
-        case Op::Max:
-        case Op::Min:
-        case Op::Abs:
-        {
-            // There are only floating point built in versions of these, prefixed with f
-            if (type->op == kIROp_FloatType || type->op == kIROp_DoubleType)
-            {
-                outBuilder << "f";
-                outBuilder << HLSLIntrinsic::getInfo(op).funcName;
-
-                if (type->op == kIROp_FloatType)
-                {
-                    outBuilder << "f";
-                }
-                return SLANG_OK;
-            }
-            break;
-        }
-
         default: break;
     }
 
@@ -171,23 +133,6 @@ SlangResult CUDASourceEmitter::calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicT
         return SLANG_OK;
     }
 
-    // Missing ones:
-    // 
-    // sincos - the built in uses pointer, so we'll just define in prelude
-    // rcp
-    // sign
-    // saturate
-    // frac
-    // smoothstep
-    // lerp
-    // clamp
-    // step
-    // 
-    // For integer types
-    // abs
-    // min
-    // max
-
     // Defer to the supers impl
     return Super::calcScalarFuncName(op, type, outBuilder);
 }
diff --git a/source/slang/slang-emit-glsl.cpp b/source/slang/slang-emit-glsl.cpp
index 155b86a9c..b433b4d94 100644
--- a/source/slang/slang-emit-glsl.cpp
+++ b/source/slang/slang-emit-glsl.cpp
@@ -1096,31 +1096,6 @@ bool GLSLSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
             }
             break;
         }
-        case kIROp_Mul_Vector_Matrix:
-        case kIROp_Mul_Matrix_Vector:
-        case kIROp_Mul_Matrix_Matrix:
-        {
-            EmitOpInfo outerPrec = inOuterPrec;
-            bool needClose = false;
-
-            // GLSL expresses inner-product multiplications
-            // with the ordinary infix `*` operator.
-            //
-            // Note that the order of the operands is reversed
-            // compared to HLSL (and Slang's internal representation)
-            // because the notion of what is a "row" vs. a "column"
-            // is reversed between HLSL/Slang and GLSL.
-            //
-            auto prec = getInfo(EmitOp::Mul);
-            needClose = maybeEmitParens(outerPrec, prec);
-
-            emitOperand(inst->getOperand(1), leftSide(outerPrec, prec));
-            m_writer->emit(" * ");
-            emitOperand(inst->getOperand(0), rightSide(prec, outerPrec));
-
-            maybeCloseParens(needClose);
-            return true;
-        }
         case kIROp_Select:
         {
             if (inst->getOperand(0)->getDataType()->op != kIROp_BoolType)
diff --git a/source/slang/slang-hlsl-intrinsic-set.cpp b/source/slang/slang-hlsl-intrinsic-set.cpp
index 82a8851e0..27871141d 100644
--- a/source/slang/slang-hlsl-intrinsic-set.cpp
+++ b/source/slang/slang-hlsl-intrinsic-set.cpp
@@ -220,42 +220,9 @@ SlangResult HLSLIntrinsicSet::makeIntrinsic(IRInst* inst, HLSLIntrinsic& out)
         {
         default: break;
 
-        case Op::Sin:
-        case Op::Cos:
-        case Op::Tan:
-        case Op::ArcSin:
-        case Op::ArcCos:
-        case Op::ArcTan:
-        case Op::ArcTan2:
-        case Op::Rcp:
-        case Op::Sign:
-        case Op::Frac:
-        case Op::Ceil:
-        case Op::Floor:
-        case Op::Trunc:
-        case Op::Sqrt:
-        case Op::RecipSqrt:
-        case Op::Exp2:
-        case Op::Exp:
-        case Op::Log:
-        case Op::Log2:
-        case Op::Log10:
-        case Op::Abs:
-        case Op::Min:
-        case Op::Max:
-        case Op::Pow:
-        case Op::FMod:
-        case Op::SmoothStep:
-        case Op::Lerp:
-        case Op::Clamp:
-        case Op::Step:
         case Op::AsFloat:
         case Op::AsInt:
         case Op::AsUInt:
-        case Op::IsInfinite:
-        case Op::IsFinite:
-        case Op::IsNan:
-        case Op::LdExp:
             // Note: the `any()`/`all()` case can't be handled via a stdlib definition
             // right now because `bool` vectors map to `int` vectors on the CUDA
             // path, so that the generated `geAt` operation is incorrect.
@@ -605,14 +572,6 @@ HLSLIntrinsic::Op HLSLIntrinsicOpLookup::getOpForIROp(IRInst* inst)
 
         case kIROp_constructVectorFromScalar: return Op::ConstructFromScalar;
 
-        case kIROp_Mul_Matrix_Matrix:
-        case kIROp_Mul_Matrix_Vector:
-        case kIROp_Mul_Vector_Matrix:
-        {
-            return Op::VecMatMul;
-        }
-        case kIROp_Dot:     return Op::Dot;
-
         default:            return Op::Invalid;
     }
 }
diff --git a/source/slang/slang-hlsl-intrinsic-set.h b/source/slang/slang-hlsl-intrinsic-set.h
index 6ab5480b3..ca3fced50 100644
--- a/source/slang/slang-hlsl-intrinsic-set.h
+++ b/source/slang/slang-hlsl-intrinsic-set.h
@@ -64,53 +64,6 @@ just constructXXXFromScalar. Would be good if there was a suitable name to encom
         \
         x(Swizzle, "", -1) \
         \
-        x(Dot, "dot", 2) \
-        x(VecMatMul, "mul", 2) \
-        \
-        x(Normalize, "normalize", 1) \
-        x(Length, "length", 1) \
-        \
-        x(Sin, "sin", 1) \
-        x(Cos, "cos", 1) \
-        x(Tan, "tan", 1) \
-        \
-        x(ArcSin, "asin", 1) \
-        x(ArcCos, "acos", 1) \
-        x(ArcTan, "atan", 1) \
-        \
-        x(ArcTan2, "atan2", 2) \
-        \
-        x(Rcp, "rcp", 1) \
-        x(Sign, "sign", 1) \
-        x(Frac, "frac", 1) \
-        \
-        x(Ceil, "ceil", 1) \
-        x(Floor, "floor", 1) \
-        x(Trunc, "trunc", 1) \
-        \
-        x(Sqrt, "sqrt", 1) \
-        x(RecipSqrt, "rsqrt", 1) \
-        \
-        x(Exp2, "exp2", 1) \
-        x(Exp, "exp", 1) \
-        \
-        x(Log, "log", 1) \
-        x(Log2, "log2", 1) \
-        x(Log10, "log10", 1) \
-        \
-        x(Abs, "abs", 1) \
-        \
-        x(Min, "min", 2) \
-        x(Max, "max", 2) \
-        x(Pow, "pow", 2) \
-        x(FMod, "fmod", 2) \
-        x(Reflect, "reflect", 2) \
-        \
-        x(SmoothStep, "smoothstep", 3) \
-        x(Lerp, "lerp", 3) \
-        x(Clamp, "clamp", 3) \
-        x(Step, "step", 2) \
-        \
         x(AsFloat, "asfloat", 1) \
         x(AsInt, "asint", -1) \
         x(AsUInt, "asuint", -1) \
@@ -120,13 +73,7 @@ just constructXXXFromScalar. Would be good if there was a suitable name to encom
         x(ConstructFromScalar, "", 1) \
         \
         x(GetAt, "", 2) \
-        \
-        x(CountBits, "countbits", 1) \
-        \
-        x(IsInfinite, "isinf", 1) \
-        x(IsFinite, "isfinite", 1) \
-        x(IsNan, "isnan", 1) \
-        x(LdExp, "ldexp", 2)
+        /* end */
 
 struct HLSLIntrinsic
 {
diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h
index 89fec618c..3fdf9f113 100644
--- a/source/slang/slang-ir-inst-defs.h
+++ b/source/slang/slang-ir-inst-defs.h
@@ -373,10 +373,6 @@ INST(Dot, dot, 2, 0)
 
 INST(GetStringHash, getStringHash, 1, 0)
 
-INST(Mul_Vector_Matrix, mulVectorMatrix, 2, 0)
-INST(Mul_Matrix_Vector, mulMatrixVector, 2, 0)
-INST(Mul_Matrix_Matrix, mulMatrixMatrix, 2, 0)
-
 // Texture sampling operation of the form `t.Sample(s,u)`
 INST(Sample, sample, 3, 0)
 
diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp
index 6e1b6fe83..f84300327 100644
--- a/source/slang/slang-ir.cpp
+++ b/source/slang/slang-ir.cpp
@@ -4951,9 +4951,6 @@ namespace Slang
         case kIROp_BitNot:
         case kIROp_Select:
         case kIROp_Dot:
-        case kIROp_Mul_Vector_Matrix:
-        case kIROp_Mul_Matrix_Vector:
-        case kIROp_Mul_Matrix_Matrix:
         case kIROp_MakeExistential:
         case kIROp_ExtractExistentialType:
         case kIROp_ExtractExistentialValue:
author	Tim Foley <tfoleyNV@users.noreply.github.com>	2020-03-11 08:50:38 -0700
committer	GitHub <noreply@github.com>	2020-03-11 08:50:38 -0700
commit	935768c6a00c258bf5122a2d04b84064a1eee67d (patch)
tree	68dac944da274a21acb8c8bf651401c26e289f4c
parent	b380b1af6ba6f5f58e3841c2a5b14db7ee8c372d (diff)