7 files changed, 799 insertions, 314 deletions
diff --git a/docs/user-guide/07-autodiff.md b/docs/user-guide/07-autodiff.md
index 5ca073983..244ebb47b 100644
--- a/docs/user-guide/07-autodiff.md
+++ b/docs/user-guide/07-autodiff.md
@@ -477,12 +477,17 @@ void back_prop(
 
 The following builtin functions are backward differentiable and both their forward-derivative and backward-propagation functions are already defined in the builtin library:
 
-- Arithmetic functions:  `abs`, `max`, `min`, `sqrt`
-- Trigonometric functions: `sin`, `cos`, `tan`
-- Exponential and logarithmic functions: `exp`, `pow`, `log`, `log2`
-- Vector: `dot`, `cross`
+- Arithmetic functions:  `abs`, `max`, `min`, `sqrt`, `rcp`, `rsqrt`,  `fma`, `mad`, `fmod`, `frac`, `radians`, `degrees`
+- Interpolation and clamping functions: `lerp`, `smoothstep`, `clamp`, `saturate`
+- Trigonometric functions: `sin`, `cos`, `sincos`, `tan`, `asin`, `acos`, `atan`, `atan2`
+- Hyperbolic functions: `sinh`, `cosh`, `tanh`
+- Exponential and logarithmic functions: `exp`, `exp2`, `pow`, `log`, `log2`, `log10`
+- Vector functions: `dot`, `cross`, `length`, `distance`, `normalize`, `reflect`, `refract`
 - Matrix transform: `mul(matrix, vector)`, `mul(vector, matrix)`, `mul(matrix, matrix)`, `transpose`
 
+Derivatives for the following legacy HLSL intrinsic functions are not implemented:
+- `dst`, `lit`, 
+
 ## Excluding Parameters From Differentiation
 
 Sometimes we do not wish a parameter to be considered differentiable despite it has a differentiable type. We can use the `no_diff` modifier on the parameter to inform the compiler to treat the parameter as non-differentiable and skip generating differentiation code for the parameter. The syntax is:
diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang
index 8931cccdd..c303b39d9 100644
--- a/source/slang/diff.meta.slang
+++ b/source/slang/diff.meta.slang
@@ -162,6 +162,23 @@ extension Array<T, N> : IDifferentiable
     }
 }
 
+// Matrix transpose
+__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
+[ForceInline]
+[ForwardDerivativeOf(transpose)]
+DifferentialPair<matrix<T, M, N>> __d_transpose(DifferentialPair<matrix<T, N, M>> m)
+{
+    return DifferentialPair<matrix<T, M, N>>(transpose(m.p), transpose(m.d));
+}
+
+__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
+[ForceInline]
+[BackwardDerivativeOf(transpose)]
+void __d_transpose(inout DifferentialPair<matrix<T, N, M>> m, matrix<T, M, N>.Differential dOut)
+{
+    m = diffPair(m.p, transpose(dOut));
+}
+
 // vector-matrix
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 [ForceInline]
@@ -174,7 +191,6 @@ DifferentialPair<vector<T, M>> mul(DifferentialPair<vector<T, N>> left, Differen
 }
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
-[ForceInline]
 [BackwardDerivativeOf(mul)]
 void __d_mul(inout DifferentialPair<vector<T, N>> left, inout DifferentialPair<matrix<T, N, M>> right, vector<T, M>.Differential dOut)
 {
@@ -206,7 +222,6 @@ DifferentialPair<vector<T,N>> mul(DifferentialPair<matrix<T,N,M>> left, Differen
 }
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
-[ForceInline]
 [BackwardDerivativeOf(mul)]
 void __d_mul(inout DifferentialPair<matrix<T, N, M>> left, inout DifferentialPair<vector<T, M>> right, vector<T, N>.Differential dOut)
 {
@@ -238,7 +253,6 @@ DifferentialPair<matrix<T,R,C>> mul(DifferentialPair<matrix<T,R,N>> left, Differ
 }
 
 __generic<T : __BuiltinFloatingPointType, let R : int, let N : int, let C : int>
-[ForceInline]
 [BackwardDerivativeOf(mul)]
 void mul(inout DifferentialPair<matrix<T, R, N>> left, inout DifferentialPair<matrix<T, N, C>> right, matrix<T, R, C>.Differential dOut)
 {
@@ -267,442 +281,750 @@ void mul(inout DifferentialPair<matrix<T, R, N>> left, inout DifferentialPair<ma
     right = diffPair(right.p, right_d_result);
 }
 
-#define VECTOR_MAP_D_UNARY(TYPE, COUNT, D_FUNC, VALUE) \
-    vector<TYPE, COUNT> result; \
-    vector<TYPE, COUNT>.Differential d_result; \
-    [ForceUnroll]\
-    for (int i = 0; i < N; ++i) \
-    { \
-        DifferentialPair<TYPE> dp_elem = D_FUNC(DifferentialPair<TYPE>(VALUE.p[i], __slang_noop_cast<TYPE.Differential>(VALUE.d[i]))); \
-        result[i] = dp_elem.p; \
-        d_result[i] = __slang_noop_cast<TYPE>(dp_elem.d); \
-    } \
-    return DifferentialPair<vector<TYPE, COUNT>>(result, d_result)
-
-#define VECTOR_MAP_D_BINARY(TYPE, COUNT, D_FUNC, LEFT, RIGHT)                                                                          \
-    vector<TYPE, COUNT> result;                                                                                                        \
-    vector<TYPE, COUNT>.Differential d_result;                                                                                         \
-    [ForceUnroll]                                                                                                                      \
-    for (int i = 0; i < N; ++i)                                                                                                        \
-    {                                                                                                                                  \
-        DifferentialPair<TYPE> dp_elem = D_FUNC(DifferentialPair<TYPE>(LEFT.p[i], __slang_noop_cast<TYPE.Differential>(LEFT.d[i])),    \
-                                                DifferentialPair<TYPE>(RIGHT.p[i], __slang_noop_cast<TYPE.Differential>(RIGHT.d[i]))); \
-        result[i] = dp_elem.p;                                                                                                         \
-        d_result[i] = __slang_noop_cast<TYPE>(dp_elem.d);                                                                              \
-    }                                                                                                                                  \
-    return DifferentialPair<vector<TYPE, COUNT>>(result, d_result)
-
-#define VECTOR_MAP_BWD_D_UNARY(TYPE, COUNT, D_FUNC, VALUE, D_OUT)            \
-    vector<TYPE, COUNT>.Differential d_result;                               \
-    [ForceUnroll]                                                            \
-    for (int i = 0; i < N; ++i)                                              \
-    {                                                                        \
-        DifferentialPair<TYPE> dp_elem = diffPair(VALUE.p[i], TYPE.dzero()); \
-        D_FUNC(dp_elem, __slang_noop_cast<TYPE.Differential>(D_OUT[i]));     \
-        d_result[i] = __slang_noop_cast<TYPE>(dp_elem.d);                    \
-    }                                                                        \
-    VALUE = diffPair(VALUE.p, d_result)
-
-#define VECTOR_MAP_BWD_D_BINARY(TYPE, COUNT, D_FUNC, LEFT, RIGHT, D_OUT)           \
-    vector<TYPE, COUNT>.Differential left_d_result, right_d_result;                \
-    [ForceUnroll]                                                                  \
-    for (int i = 0; i < N; ++i)                                                    \
-    {                                                                              \
-        DifferentialPair<TYPE> left_dp = diffPair(LEFT.p[i], TYPE.dzero());        \
-        DifferentialPair<TYPE> right_dp = diffPair(RIGHT.p[i], TYPE.dzero());      \
-        D_FUNC(left_dp, right_dp, __slang_noop_cast<TYPE.Differential>(D_OUT[i])); \
-        left_d_result[i] = __slang_noop_cast<TYPE>(left_dp.d);                     \
-        right_d_result[i] = __slang_noop_cast<TYPE>(right_dp.d);                   \
-    }                                                                              \
-    LEFT = diffPair(LEFT.p, left_d_result);                                        \
-    RIGHT = diffPair(RIGHT.p, right_d_result)
+// Vector dot product
+__generic<T : __BuiltinFloatingPointType, let N : int>
+[ForwardDerivativeOf(dot)]
+DifferentialPair<T> __d_dot(DifferentialPair<vector<T, N>> dpx, DifferentialPair<vector<T, N>> dpy)
+{
+    T result = T(0);
+    T.Differential d_result = T.dzero();
+    [ForceUnroll]
+    for (int i = 0; i < N; ++i)
+    {
+        result = result + dpx.p[i] * dpy.p[i];
+        d_result = T.dadd(d_result, T.dmul(dpx.p[i], __slang_noop_cast<T.Differential>(dpy.d[i])));
+        d_result = T.dadd(d_result, T.dmul(dpy.p[i], __slang_noop_cast<T.Differential>(dpx.d[i])));
+    }
+    return DifferentialPair<T>(result, d_result);
+}
 
-// Detach and set derivatives to zero
+__generic<T : __BuiltinFloatingPointType, let N : int>
+[BackwardDerivativeOf(dot)]
+void __d_dot(inout DifferentialPair<vector<T, N>> dpx, inout DifferentialPair<vector<T, N>> dpy, T.Differential dOut)
+{
+    vector<T, N>.Differential x_d_result, y_d_result;
+    [ForceUnroll]
+    for (int i = 0; i < N; ++i)
+    {
+        x_d_result[i] = dpy.p[i] * __slang_noop_cast<T>(dOut);
+        y_d_result[i] = dpx.p[i] * __slang_noop_cast<T>(dOut);
+    }
+    dpx = diffPair(dpx.p, x_d_result);
+    dpy = diffPair(dpy.p, y_d_result);
+}
+
+// Cross product
+__generic<T : __BuiltinFloatingPointType>
+[ForwardDerivativeOf(cross)]
+DifferentialPair<vector<T, 3>> __d_cross(DifferentialPair<vector<T, 3>> a, DifferentialPair<vector<T, 3>> b)
+{
+    /*
+    cx = ay * bz − az * by
+    cy = az * bx − ax * bz
+    cz = ax * by − ay * bx
+    */
+    T aybz = a.p.y * b.p.z;
+    T azby = a.p.z * b.p.y;
+    T px = aybz - azby;
+    T dx = (b.p.z - azby) * a.d.y + (a.p.y - azby) * b.d.z + (aybz - b.p.y) * a.d.z + (aybz - a.p.z) * b.d.y;
+
+    T azbx = a.p.z * b.p.x;
+    T axbz = a.p.x * b.p.z;
+    T py = azbx - axbz;
+    T dy = (b.p.x - axbz) * a.d.z + (a.p.z - axbz) * b.d.x + (azbx - b.p.z) * a.d.x + (azbx - a.p.x) * b.d.z;
+
+    T axby = a.p.x * b.p.y;
+    T aybx = a.p.y * b.p.x;
+    T pz = axby - aybx;
+    T dz = (b.p.y - aybx) * a.d.x + (a.p.x - aybx) * b.d.y + (axby - b.p.x) * a.d.y + (axby - a.p.y) * b.d.x;
+    
+    return DifferentialPair<vector<T, 3>>(vector<T, 3>(px, py, pz), vector<T, 3>.Differential(dx, dy, dz));
+}
+
+__generic<T : __BuiltinFloatingPointType>
+[BackwardDerivativeOf(cross)]
+void __d_cross(inout DifferentialPair<vector<T, 3>> a, inout DifferentialPair<vector<T, 3>> b, vector<T, 3>.Differential dOut)
+{
+    /*
+    cx = ay * bz − az * by
+    cy = az * bx − ax * bz
+    cz = ax * by − ay * bx
+    */
+    T dax = (-b.p.z * dOut.y) + (b.p.y * dOut.z);
+    T day = (b.p.z * dOut.x) + (-b.p.x * dOut.z);
+    T daz = (-b.p.y * dOut.x) + (b.p.x * dOut.y);
+
+    T dbx = (a.p.z * dOut.y) + (-a.p.y * dOut.z);
+    T dby = (-a.p.z * dOut.x) + (a.p.x * dOut.z);
+    T dbz = (a.p.y * dOut.x) + (-a.p.x * dOut.y);
+
+    a = diffPair(a.p, vector<T, 3>.Differential(dax, day, daz));
+    b = diffPair(b.p, vector<T, 3>.Differential(dbx, dby, dbz));
+}
+
+#define VECTOR_MATRIX_BINARY_DIFF_IMPL(NAME)                                                 \
+    __generic<T : __BuiltinFloatingPointType, let N : int>                                   \
+    [ForwardDerivativeOf(NAME)]                                                              \
+    DifferentialPair<vector<T, N>> __d_##NAME##_vector(                                      \
+        DifferentialPair<vector<T, N>> dpx, DifferentialPair<vector<T, N>> dpy)              \
+    {                                                                                        \
+        vector<T, N> result;                                                                 \
+        vector<T, N>.Differential d_result;                                                  \
+        [ForceUnroll] for (int i = 0; i < N; ++i)                                            \
+        {                                                                                    \
+            DifferentialPair<T> dp_elem = __d_##NAME(                                        \
+                DifferentialPair<T>(dpx.p[i], __slang_noop_cast<T.Differential>(dpx.d[i])),  \
+                DifferentialPair<T>(dpy.p[i], __slang_noop_cast<T.Differential>(dpy.d[i]))); \
+            result[i] = dp_elem.p;                                                           \
+            d_result[i] = __slang_noop_cast<T>(dp_elem.d);                                   \
+        }                                                                                    \
+        return DifferentialPair<vector<T, N>>(result, d_result);                             \
+    }                                                                                        \
+    __generic<T : __BuiltinFloatingPointType, let N : int>                                   \
+    [BackwardDerivativeOf(NAME)]                                                             \
+    void __d_##NAME##_vector(                                                                \
+            inout DifferentialPair<vector<T, N>> dpx,                                        \
+            inout DifferentialPair<vector<T, N>> dpy,                                        \
+            vector<T, N>.Differential dOut)                                                  \
+    {                                                                                        \
+        vector<T, N>.Differential left_d_result, right_d_result;                             \
+        [ForceUnroll] for (int i = 0; i < N; ++i)                                            \
+        {                                                                                    \
+            DifferentialPair<T> left_dp = diffPair(dpx.p[i], T.dzero());                     \
+            DifferentialPair<T> right_dp = diffPair(dpy.p[i], T.dzero());                    \
+            __d_##NAME(left_dp, right_dp, __slang_noop_cast<T.Differential>(dOut[i]));       \
+            left_d_result[i] = __slang_noop_cast<T>(left_dp.d);                              \
+            right_d_result[i] = __slang_noop_cast<T>(right_dp.d);                            \
+        }                                                                                    \
+        dpx = diffPair(dpx.p, left_d_result);                                                \
+        dpy = diffPair(dpy.p, right_d_result);                                               \
+    }
 
+#define VECTOR_MATRIX_TERNARY_DIFF_IMPL(NAME)                                                \
+    __generic<T : __BuiltinFloatingPointType, let N : int>                                   \
+    [ForwardDerivativeOf(NAME)]                                                              \
+    DifferentialPair<vector<T, N>> __d_##NAME##_vector(                                      \
+        DifferentialPair<vector<T, N>> dpx,                                                  \
+        DifferentialPair<vector<T, N>> dpy,                                                  \
+        DifferentialPair<vector<T, N>> dpz)                                                  \
+{                                                                                            \
+        vector<T, N> result;                                                                 \
+        vector<T, N>.Differential d_result;                                                  \
+        [ForceUnroll] for (int i = 0; i < N; ++i)                                            \
+        {                                                                                    \
+            DifferentialPair<T> dp_elem = __d_##NAME(                                        \
+                DifferentialPair<T>(dpx.p[i], __slang_noop_cast<T.Differential>(dpx.d[i])),  \
+                DifferentialPair<T>(dpy.p[i], __slang_noop_cast<T.Differential>(dpy.d[i])),  \
+                DifferentialPair<T>(dpz.p[i], __slang_noop_cast<T.Differential>(dpz.d[i]))); \
+            result[i] = dp_elem.p;                                                           \
+            d_result[i] = __slang_noop_cast<T>(dp_elem.d);                                   \
+        }                                                                                    \
+        return DifferentialPair<vector<T, N>>(result, d_result);                             \
+    }                                                                                        \
+    __generic<T : __BuiltinFloatingPointType, let N : int>                                   \
+    [BackwardDerivativeOf(NAME)]                                                             \
+    void __d_##NAME##_vector(                                                                \
+            inout DifferentialPair<vector<T, N>> dpx,                                        \
+            inout DifferentialPair<vector<T, N>> dpy,                                        \
+            inout DifferentialPair<vector<T, N>> dpz,                                        \
+            vector<T, N>.Differential dOut)                                                  \
+    {                                                                                        \
+        vector<T, N>.Differential left_d_result, middle_d_result, right_d_result;            \
+        [ForceUnroll] for (int i = 0; i < N; ++i)                                            \
+        {                                                                                    \
+            DifferentialPair<T> left_dp = diffPair(dpx.p[i], T.dzero());                     \
+            DifferentialPair<T> middle_dp = diffPair(dpy.p[i], T.dzero());                   \
+            DifferentialPair<T> right_dp = diffPair(dpz.p[i], T.dzero());                    \
+            __d_##NAME(left_dp, middle_dp, right_dp,                                         \
+                __slang_noop_cast<T.Differential>(dOut[i]));                                 \
+            left_d_result[i] = __slang_noop_cast<T>(left_dp.d);                              \
+            middle_d_result[i] = __slang_noop_cast<T>(middle_dp.d);                          \
+            right_d_result[i] = __slang_noop_cast<T>(right_dp.d);                            \
+        }                                                                                    \
+        dpx = diffPair(dpx.p, left_d_result);                                                \
+        dpy = diffPair(dpy.p, middle_d_result);                                              \
+        dpz = diffPair(dpz.p, right_d_result);                                               \
+    }
+
+#define UNARY_DERIVATIVE_IMPL(NAME, FWD_DIFF_FUNC, BWD_DIFF_FUNC)                            \
+    __generic<T : __BuiltinFloatingPointType>                                                \
+    [ForwardDerivativeOf(NAME)]                                                              \
+    DifferentialPair<T> __d_##NAME(DifferentialPair<T> dpx)                                  \
+    {                                                                                        \
+        return DifferentialPair<T>(NAME(dpx.p), FWD_DIFF_FUNC);                              \
+    }                                                                                        \
+    __generic<T : __BuiltinFloatingPointType, let N : int>                                   \
+    [ForwardDerivativeOf(NAME)]                                                              \
+    DifferentialPair<vector<T, N>> __d_##NAME##_vector(DifferentialPair<vector<T, N>> dpx)   \
+    {                                                                                        \
+        vector<T, N> result;                                                                 \
+        vector<T, N>.Differential d_result;                                                  \
+        [ForceUnroll] for (int i = 0; i < N; ++i)                                            \
+        {                                                                                    \
+            DifferentialPair<T> dp_elem = __d_##NAME(                                        \
+                DifferentialPair<T>(dpx.p[i], __slang_noop_cast<T.Differential>(dpx.d[i]))); \
+            result[i] = dp_elem.p;                                                           \
+            d_result[i] = __slang_noop_cast<T>(dp_elem.d);                                   \
+        }                                                                                    \
+        return DifferentialPair<vector<T, N>>(result, d_result);                             \
+    }                                                                                        \
+    __generic<T : __BuiltinFloatingPointType, let M : int, let N : int>                      \
+    [ForwardDerivativeOf(NAME)]                                                              \
+    DifferentialPair<matrix<T, M, N>> __d_##NAME##_m(DifferentialPair<matrix<T, M, N>> dpx)  \
+    {                                                                                        \
+        matrix<T, M, N> result;                                                              \
+        matrix<T, M, N>.Differential d_result;                                               \
+        [ForceUnroll] for (int i = 0; i < M; ++i)                                            \
+        [ForceUnroll] for (int j = 0; j < N; ++j)                                            \
+        {                                                                                    \
+            DifferentialPair<T> dp_elem = __d_##NAME(                                        \
+               DifferentialPair<T>(dpx.p[i][j],                                              \
+                                   __slang_noop_cast<T.Differential>(dpx.d[i][j])));         \
+            result[i][j] = dp_elem.p;                                                        \
+            d_result[i][j] = __slang_noop_cast<T>(dp_elem.d);                                \
+        }                                                                                    \
+        return DifferentialPair<matrix<T, M, N>>(result, d_result);                          \
+    }                                                                                        \
+    __generic<T : __BuiltinFloatingPointType>                                                \
+    [BackwardDerivativeOf(NAME)]                                                             \
+    void __d_##NAME(inout DifferentialPair<T> dpx, T.Differential dOut)                      \
+    {                                                                                        \
+        dpx = diffPair(dpx.p, BWD_DIFF_FUNC);                                                \
+    }                                                                                        \
+    __generic<T : __BuiltinFloatingPointType, let N : int>                                   \
+    [BackwardDerivativeOf(NAME)]                                                             \
+    void __d_##NAME##_vector(                                                                \
+            inout DifferentialPair<vector<T, N>> dpx, vector<T, N>.Differential dOut)        \
+    {                                                                                        \
+        vector<T, N>.Differential d_result;                                                  \
+        [ForceUnroll] for (int i = 0; i < N; ++i)                                            \
+        {                                                                                    \
+            DifferentialPair<T> dp_elem = diffPair(dpx.p[i], T.dzero());                     \
+            __d_##NAME(dp_elem, __slang_noop_cast<T.Differential>(dOut[i]));                 \
+            d_result[i] = __slang_noop_cast<T>(dp_elem.d);                                   \
+        }                                                                                    \
+        dpx = diffPair(dpx.p, d_result);                                                     \
+    }                                                                                        \
+    __generic<T : __BuiltinFloatingPointType, let M : int, let N : int>                      \
+    [BackwardDerivativeOf(NAME)]                                                             \
+    void __d_##NAME##_matrix(                                                                \
+            inout DifferentialPair<matrix<T, M, N>> dpx, matrix<T, M, N>.Differential dOut)  \
+    {                                                                                        \
+        matrix<T, M, N>.Differential d_result;                                               \
+        [ForceUnroll] for (int i = 0; i < M; ++i)                                            \
+        [ForceUnroll] for (int j = 0; j < N; ++j)                                            \
+        {                                                                                    \
+            DifferentialPair<T> dp_elem = diffPair(dpx.p[i][j], T.dzero());                  \
+            __d_##NAME(dp_elem, __slang_noop_cast<T.Differential>(dOut[i][j]));              \
+            d_result[i][j] = __slang_noop_cast<T>(dp_elem.d);                                \
+        }                                                                                    \
+        dpx = diffPair(dpx.p, d_result);                                                     \
+    }
+#define SIMPLE_UNARY_DERIVATIVE_IMPL(NAME, DIFF_FUNC) UNARY_DERIVATIVE_IMPL(NAME, T.dmul(DIFF_FUNC, dpx.d), T.dmul(DIFF_FUNC, dOut))
+
+// Detach and set derivatives to zero
 __generic<T : IDifferentiable>
 __intrinsic_op($(kIROp_DetachDerivative))
 T detach(T x);
 
-// Natural Exponent
+#define SLANG_SQR(x) ((x)*(x))
 
+// Absolute value
+UNARY_DERIVATIVE_IMPL(abs, (dpx.p > T(0.0) ? dpx.d : T.dmul(T(-1.0), dpx.d)), (T.dmul(__slang_noop_cast<T>(sign(dpx.p)), dOut)))
+// Saturate
+UNARY_DERIVATIVE_IMPL(saturate, (dpx.p < T(0.0) || dpx.p > T(1.0) ? T.dzero() : dpx.d), (dpx.p < T(0.0) || dpx.p > T(1.0) ? T.dzero() : dOut))
+// frac
+UNARY_DERIVATIVE_IMPL(frac, dpx.d, dOut)
+// raidans, degrees
+SIMPLE_UNARY_DERIVATIVE_IMPL(radians, T(0.01745329251994329576923690768489))
+SIMPLE_UNARY_DERIVATIVE_IMPL(degrees, T(57.295779513082320876798154814105))
+// Exponent
+SIMPLE_UNARY_DERIVATIVE_IMPL(exp, exp(dpx.p))
+SIMPLE_UNARY_DERIVATIVE_IMPL(exp2, exp2(dpx.p)* T(50.69314718055994530941723212145818))
+// sin, sinh
+SIMPLE_UNARY_DERIVATIVE_IMPL(sin, cos(dpx.p))
+SIMPLE_UNARY_DERIVATIVE_IMPL(sinh, cosh(dpx.p))
+// cos, cosh
+SIMPLE_UNARY_DERIVATIVE_IMPL(cos, -sin(dpx.p))
+SIMPLE_UNARY_DERIVATIVE_IMPL(cosh, sinh(dpx.p))
+// tan, tanh
+SIMPLE_UNARY_DERIVATIVE_IMPL(tan, T(1.0) / (cos(dpx.p) * cos(dpx.p)))
+SIMPLE_UNARY_DERIVATIVE_IMPL(tanh, T(1.0) / (cosh(dpx.p) * cosh(dpx.p)))
+// Logarithm
+SIMPLE_UNARY_DERIVATIVE_IMPL(log, T(1.0) / dpx.p)
+SIMPLE_UNARY_DERIVATIVE_IMPL(log10, T(1.0) / (dpx.p * T(52.3025850929940456840179914546844)))
+SIMPLE_UNARY_DERIVATIVE_IMPL(log2, T(1.0) / (dpx.p * T(50.69314718055994530941723212145818)))
+// Square root
+SIMPLE_UNARY_DERIVATIVE_IMPL(sqrt, (dpx.p < T(1e-7) ? T(0.0) : T(0.5) / sqrt(dpx.p)))
+// Reciprocal
+SIMPLE_UNARY_DERIVATIVE_IMPL(rcp, (dpx.p < T(1e-7) ? T(0.0) : T(-1.0) / (dpx.p * dpx.p)))
+// rsqrt
+SIMPLE_UNARY_DERIVATIVE_IMPL(rsqrt, T(-0.5) / (dpx.p * sqrt(dpx.p)))
+// Arc-sin
+SIMPLE_UNARY_DERIVATIVE_IMPL(asin, T(1.0) / sqrt(T(1.0) - dpx.p * dpx.p))
+// Arc-cos
+SIMPLE_UNARY_DERIVATIVE_IMPL(acos, T(-1.0) / sqrt(T(1.0) - dpx.p * dpx.p))
+// Arc-tan
+SIMPLE_UNARY_DERIVATIVE_IMPL(atan, T(1.0) / (T(1.0) + dpx.p * dpx.p))
+
+// Atan2
 __generic<T : __BuiltinFloatingPointType>
-[ForwardDerivativeOf(exp)]
-DifferentialPair<T> __d_exp(DifferentialPair<T> dpx)
+[ForwardDerivativeOf(atan2)]
+DifferentialPair<T> __d_atan2(DifferentialPair<T> dpy, DifferentialPair<T> dpx)
 {
+    T.Differential dx = T.dmul(-dpy.p / (dpx.p * dpx.p + dpy.p * dpy.p), dpx.d);
+    T.Differential dy = T.dmul(-dpx.p / (dpx.p * dpx.p + dpy.p * dpy.p), dpy.d);
     return DifferentialPair<T>(
-        exp(dpx.p),
-        T.dmul(exp(dpx.p), dpx.d));
+        atan2(dpy.p, dpx.p),
+        T.dadd(dx, dy));
 }
 
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[ForwardDerivativeOf(exp)]
-DifferentialPair<vector<T, N>> __d_exp_vector(DifferentialPair<vector<T, N>> dpx)
+__generic<T : __BuiltinFloatingPointType>
+[BackwardDerivativeOf(atan2)]
+void __d_atan2(inout DifferentialPair<T> dpy, inout DifferentialPair<T> dpx, T.Differential dOut)
 {
-    VECTOR_MAP_D_UNARY(T, N, __d_exp, dpx);
+    dpx = diffPair(dpx.p, T.dmul(-dpy.p / (dpx.p * dpx.p + dpy.p * dpy.p), dpx.d));
+    dpy = diffPair(dpy.p, T.dmul(-dpx.p / (dpx.p * dpx.p + dpy.p * dpy.p), dpy.d));
 }
 
+VECTOR_MATRIX_BINARY_DIFF_IMPL(atan2)
+
+// fmod
 __generic<T : __BuiltinFloatingPointType>
-[BackwardDerivativeOf(exp)]
-void __d_exp(inout DifferentialPair<T> dpx, T.Differential dOut)
+[ForwardDerivativeOf(fmod)]
+DifferentialPair<T> __d_fmod(DifferentialPair<T> x, DifferentialPair<T> y)
 {
-    dpx = diffPair(
-        dpx.p,
-        T.dmul(exp(dpx.p), dOut));
+    return DifferentialPair<T>(fmod(x.p, y.p), x.d);
 }
-
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[BackwardDerivativeOf(exp)]
-void __d_exp_vector(inout DifferentialPair<vector<T, N>> dpx, vector<T, N>.Differential dOut)
+__generic<T : __BuiltinFloatingPointType>
+[BackwardDerivativeOf(fmod)]
+void __d_fmod(inout DifferentialPair<T> x, inout DifferentialPair<T> y, T.Differential dOut)
 {
-    dpx = diffPair(
-        dpx.p,
-        vector<T, N>.dmul(exp(dpx.p), dOut));
+    x = diffPair(x.p, dOut);
+    y = diffPair(y.p);
 }
+VECTOR_MATRIX_BINARY_DIFF_IMPL(fmod)
 
-// Absolute value
-
+// Raise to a power
 __generic<T : __BuiltinFloatingPointType>
-[ForwardDerivativeOf(abs)]
-DifferentialPair<T> __d_abs(DifferentialPair<T> dpx)
+[ForwardDerivativeOf(pow)]
+DifferentialPair<T> __d_pow(DifferentialPair<T> dpx, DifferentialPair<T> dpy)
 {
+    // Special case
+    if (dpx.p < T(1e-6))
+    {
+        return DifferentialPair<T>(T(0.0), T.dzero());
+    }
+
+    T val = pow(dpx.p, dpy.p);
+    T.Differential d1 = T.dmul(val * log(dpx.p), dpy.d);
+    T.Differential d2 = T.dmul(val * dpy.p / dpx.p, dpx.d);
     return DifferentialPair<T>(
-        abs(dpx.p),
-        dpx.p > T(0.0) ? dpx.d : T.dmul(T(-1.0), dpx.d)
+        val,
+        T.dadd(d1, d2)
     );
 }
 
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[ForwardDerivativeOf(abs)]
-DifferentialPair<vector<T, N>> __d_abs_vector(DifferentialPair<vector<T, N>> dpx)
+__generic<T : __BuiltinFloatingPointType>
+[BackwardDerivativeOf(pow)]
+void __d_pow(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpy, T.Differential dOut)
 {
-    VECTOR_MAP_D_UNARY(T, N, __d_abs, dpx);
+    // Special case
+    if (dpx.p < T(1e-6))
+    {
+        dpx = diffPair(dpx.p, T.dzero());
+        dpy = diffPair(dpy.p, T.dzero());
+    }
+    else
+    {
+        T val = pow(dpx.p, dpy.p);
+        dpx = diffPair(
+            dpx.p,
+            T.dmul(val * dpy.p / dpx.p, dOut));
+        dpy = diffPair(
+            dpy.p,
+            T.dmul(val * log(dpx.p), dOut));
+    }
 }
 
+VECTOR_MATRIX_BINARY_DIFF_IMPL(pow)
+
+// Maximum
 __generic<T : __BuiltinFloatingPointType>
-[BackwardDerivativeOf(abs)]
-void __d_abs(inout DifferentialPair<T> dpx, T.Differential dOut)
+[ForwardDerivativeOf(max)]
+DifferentialPair<T> __d_max(DifferentialPair<T> dpx, DifferentialPair<T> dpy)
 {
-    dpx = diffPair(
-        dpx.p,
-        T.dmul(__slang_noop_cast<T>(sign(dpx.p)), dOut));
+    return DifferentialPair<T>(
+        max(dpx.p, dpy.p),
+        dpx.p > dpy.p ? dpx.d : dpy.d
+    );
 }
 
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[BackwardDerivativeOf(abs)]
-void __d_abs_vector(inout DifferentialPair<vector<T, N>> dpx, vector<T, N>.Differential dOut)
+__generic<T : __BuiltinFloatingPointType>
+[BackwardDerivativeOf(max)]
+void __d_max(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpy, T.Differential dOut)
 {
-    VECTOR_MAP_BWD_D_UNARY(T, N, __d_abs, dpx, dOut);
+    dpx = diffPair(dpx.p, dpx.p > dpy.p ? dOut : T.dzero());
+    dpy = diffPair(dpy.p, dpy.p > dpx.p ? dOut : T.dzero());
 }
 
-// Sine
+VECTOR_MATRIX_BINARY_DIFF_IMPL(max)
 
+// Minimum
 __generic<T : __BuiltinFloatingPointType>
-[ForwardDerivativeOf(sin)]
-DifferentialPair<T> __d_sin(DifferentialPair<T> dpx)
+[ForwardDerivativeOf(min)]
+DifferentialPair<T> __d_min(DifferentialPair<T> dpx, DifferentialPair<T> dpy)
 {
     return DifferentialPair<T>(
-        sin(dpx.p),
-        T.dmul(cos(dpx.p), dpx.d));
+        min(dpx.p, dpy.p),
+        dpx.p < dpy.p ? dpx.d : dpy.d
+    );
 }
 
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[ForwardDerivativeOf(sin)]
-DifferentialPair<vector<T, N>> __d_sin_vector(DifferentialPair<vector<T, N>> dpx)
+__generic<T : __BuiltinFloatingPointType>
+[BackwardDerivativeOf(min)]
+void __d_min(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpy, T.Differential dOut)
 {
-    VECTOR_MAP_D_UNARY(T, N, __d_sin, dpx);
+    dpx = diffPair(dpx.p, dpx.p < dpy.p ? dOut : T.dzero());
+    dpy = diffPair(dpy.p, dpy.p < dpx.p ? dOut : T.dzero());
 }
 
+VECTOR_MATRIX_BINARY_DIFF_IMPL(min)
+
+// Lerp
 __generic<T : __BuiltinFloatingPointType>
-[BackwardDerivativeOf(sin)]
-void __d_sin(inout DifferentialPair<T> dpx, T.Differential dOut)
+[ForwardDerivativeOf(lerp)]
+DifferentialPair<T> __d_lerp(DifferentialPair<T> dpx, DifferentialPair<T> dpy, DifferentialPair<T> dps)
 {
-    dpx = diffPair(
-        dpx.p,
-        T.dmul(cos(dpx.p), dOut));
+    return DifferentialPair<T>(
+        lerp(dpx.p, dpy.p, dps.p),
+        T.dadd(T.dadd(T.dmul((T(1.0) - dps.p), dpx.d), T.dmul(dps.p, dpy.d)), T.dmul(dpy.p - dpx.p, dps.d))
+    );
 }
-
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[BackwardDerivativeOf(sin)]
-void __d_sin_vector(inout DifferentialPair<vector<T, N>> dpx, vector<T, N>.Differential dOut)
+__generic<T : __BuiltinFloatingPointType>
+[BackwardDerivativeOf(lerp)]
+void __d_lerp(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpy, inout DifferentialPair<T> dps, T.Differential dOut)
 {
-    dpx = diffPair(
-        dpx.p,
-        vector<T, N>.dmul(cos(dpx.p), dOut));
+    dpx = diffPair(dpx.p, T.dmul(T(1.0) - dps.p, dOut));
+    dpy = diffPair(dpy.p, T.dmul(dps.p, dOut));
+    dps = diffPair(dpy.p, T.dmul((dpy.p - dpx.p), dOut));
 }
+VECTOR_MATRIX_TERNARY_DIFF_IMPL(lerp)
 
-// Cosine
-
+//  Clamp
 __generic<T : __BuiltinFloatingPointType>
-[ForwardDerivativeOf(cos)]
-DifferentialPair<T> __d_cos(DifferentialPair<T> dpx)
+[ForwardDerivativeOf(clamp)]
+DifferentialPair<T> __d_clamp(DifferentialPair<T> dpx, DifferentialPair<T> dpMin, DifferentialPair<T> dpMax)
 {
     return DifferentialPair<T>(
-        cos(dpx.p),
-        T.dmul(-sin(dpx.p), dpx.d));
+        clamp(dpx.p, dpMin.p, dpMax.p),
+        dpx.p < dpMin.p ? (dpx.p > dpMax.p ? dpMax.d : dpx.d) : dpMin.d);
 }
-
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[ForwardDerivativeOf(cos)]
-DifferentialPair<vector<T, N>> __d_cos_vector(DifferentialPair<vector<T, N>> dpx)
+__generic<T : __BuiltinFloatingPointType>
+[BackwardDerivativeOf(clamp)]
+void __d_clamp(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpMin, inout DifferentialPair<T> dpMax, T.Differential dOut)
 {
-    VECTOR_MAP_D_UNARY(T, N, __d_cos, dpx);
+    dpx = diffPair(dpx.p, dpx.p > dpMin.p && dpx.p < dpMax.p ? dOut : T.dzero());
+    dpMin = diffPair(dpMin.p, dpx.p <= dpMin.p ? dOut : T.dzero());
+    dpMax = diffPair(dpMin.p, dpx.p >= dpMax.p ? dOut : T.dzero());
 }
+VECTOR_MATRIX_TERNARY_DIFF_IMPL(clamp)
 
-__generic<T : __BuiltinFloatingPointType>
-[BackwardDerivativeOf(cos)]
-void __d_cos(inout DifferentialPair<T> dpx, T.Differential dOut)
+// fma
+[ForwardDerivativeOf(fma)]
+DifferentialPair<double> __d_fma(DifferentialPair<double> dpx, DifferentialPair<double> dpy, DifferentialPair<double> dpz)
 {
-    dpx = diffPair(
-        dpx.p,
-        T.dmul(-sin(dpx.p), dOut));
+    return DifferentialPair<double>(
+        fma(dpx.p, dpy.p, dpz.p),
+        dpy.p * dpx.d + dpx.p * dpy.d + dpz.d);
 }
-
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[BackwardDerivativeOf(cos)]
-void __d_cos_vector(inout DifferentialPair<vector<T, N>> dpx, vector<T, N>.Differential dOut)
+[BackwardDerivativeOf(fma)]
+void __d_fma(inout DifferentialPair<double> dpx, inout DifferentialPair<double> dpy, inout DifferentialPair<double> dpz, double dOut)
 {
-    dpx = diffPair(
-        dpx.p,
-        vector<T, N>.dmul(-sin(dpx.p), dOut));
+    dpx = diffPair(dpx.p, dpy.p * dOut);
+    dpy = diffPair(dpy.p, dpx.p * dOut);
+    dpz = diffPair(dpz.p, dOut);
+}
+__generic<let N : int>
+[ForwardDerivativeOf(fma)]
+DifferentialPair<vector<double, N>> __d_fma_vector(
+    DifferentialPair<vector<double, N>> dpx,
+    DifferentialPair<vector<double, N>> dpy,
+    DifferentialPair<vector<double, N>> dpz)
+{
+    vector<double, N> result;
+    vector<double, N>.Differential d_result;
+    [ForceUnroll] for (int i = 0; i < N; ++i)
+    {
+        DifferentialPair<double> dp_elem = __d_fma(
+            DifferentialPair<double>(dpx.p[i], dpx.d[i]),
+            DifferentialPair<double>(dpy.p[i], dpy.d[i]),
+            DifferentialPair<double>(dpz.p[i], dpz.d[i]));
+        result[i] = dp_elem.p;
+        d_result[i] = dp_elem.d;
+    }
+    return DifferentialPair<vector<double, N>>(result, d_result);
+}
+__generic<let N : int>
+[BackwardDerivativeOf(fma)]
+void __d_fma_vector(
+        inout DifferentialPair<vector<double, N>> dpx,
+        inout DifferentialPair<vector<double, N>> dpy,
+        inout DifferentialPair<vector<double, N>> dpz,
+        vector<double, N> dOut)
+{
+    vector<double, N>.Differential x_d_result, y_d_result, z_d_result;
+    [ForceUnroll] for (int i = 0; i < N; ++i)
+    {
+        DifferentialPair<double> x_dp = diffPair(dpx.p[i], 0.0);
+        DifferentialPair<double> y_dp = diffPair(dpy.p[i], 0.0);
+        DifferentialPair<double> z_dp = diffPair(dpz.p[i], 0.0);
+        __d_fma(x_dp, y_dp, z_dp, dOut[i]);
+        x_d_result[i] = x_dp.d;
+        y_d_result[i] = y_dp.d;
+        z_d_result[i] = z_dp.d;
+    }
+    dpx = diffPair(dpx.p, x_d_result);
+    dpy = diffPair(dpy.p, y_d_result);
+    dpz = diffPair(dpz.p, z_d_result);
 }
 
-// Base-e logarithm
-
+// mad
 __generic<T : __BuiltinFloatingPointType>
-[ForwardDerivativeOf(log)]
-DifferentialPair<T> __d_log(DifferentialPair<T> dpx)
+[ForwardDerivativeOf(mad)]
+DifferentialPair<T> __d_mad(DifferentialPair<T> dpx, DifferentialPair<T> dpy, DifferentialPair<T> dpz)
 {
     return DifferentialPair<T>(
-        log(dpx.p),
-        T.dmul(T(1.0) / dpx.p, dpx.d)
-    );
+        mad(dpx.p, dpy.p, dpz.p),
+        T.dadd(T.dadd(T.dmul(dpy.p, dpx.d), T.dmul(dpx.p, dpy.d)), dpz.d));
 }
-
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[ForwardDerivativeOf(log)]
-DifferentialPair<vector<T, N>> __d_log_vector(DifferentialPair<vector<T, N>> dpx)
+__generic<T : __BuiltinFloatingPointType>
+[BackwardDerivativeOf(mad)]
+void __d_mad(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpy, inout DifferentialPair<T> dpz, T.Differential dOut)
 {
-    VECTOR_MAP_D_UNARY(T, N, __d_log, dpx);
+    dpx = diffPair(dpx.p, T.dmul(dpy.p, dOut));
+    dpy = diffPair(dpy.p, T.dmul(dpx.p, dOut));
+    dpz = diffPair(dpz.p, dOut);
 }
+VECTOR_MATRIX_TERNARY_DIFF_IMPL(mad)
 
+// Smoothstep
 __generic<T : __BuiltinFloatingPointType>
-[BackwardDerivativeOf(log)]
-void __d_log(inout DifferentialPair<T> dpx, T.Differential dOut)
+[BackwardDifferentiable]
+T __smoothstep_impl(T minVal, T maxVal, T x)
 {
-    dpx = diffPair(dpx.p, T.dmul(T(1.0) / dpx.p, dOut));
+    let t = saturate((x - minVal) / (maxVal - minVal));
+    return t * t * (T(3.0) - T(2.0) * t);
 }
-
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[BackwardDerivativeOf(log)]
-void __d_log_vector(inout DifferentialPair<vector<T, N>> dpx, vector<T, N>.Differential dOut)
+__generic<T : __BuiltinFloatingPointType>
+[ForwardDerivativeOf(smoothstep)]
+DifferentialPair<T> __d_smoothstep(DifferentialPair<T> minVal, DifferentialPair<T> maxVal, DifferentialPair<T> x)
 {
-    VECTOR_MAP_BWD_D_UNARY(T, N, __d_log, dpx, dOut);
+    return __fwd_diff(__smoothstep_impl)(minVal, maxVal, x);
 }
-
-// Square root
-
 __generic<T : __BuiltinFloatingPointType>
-[ForwardDerivativeOf(sqrt)]
-DifferentialPair<T> __d_sqrt(DifferentialPair<T> dpx)
+[BackwardDerivativeOf(smoothstep)]
+void __d_smoothstep(inout DifferentialPair<T> minVal, inout DifferentialPair<T> maxVal, inout DifferentialPair<T> x, T.Differential dOut)
 {
-    // Special case
-    if (dpx.p < T(1e-6))
+    __bwd_diff(__smoothstep_impl)(minVal, maxVal, x, dOut);
+}
+VECTOR_MATRIX_TERNARY_DIFF_IMPL(smoothstep)
+
+// Vector length
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[BackwardDifferentiable]
+T __length_impl(vector<T, N> x)
+{
+    T len = T(0.0);
+    [ForceUnroll] for (int i = 0; i < N; i++)
     {
-        return DifferentialPair<T>(T(0.0), T.dzero());
+        len += x[i] * x[i];
     }
-
-    T val = sqrt(dpx.p);
-    return DifferentialPair<T>(
-        val,
-        T.dmul(T(0.5) / val, dpx.d)
-    );
+    return sqrt(len);
 }
 
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[ForwardDerivativeOf(sqrt)]
-DifferentialPair<vector<T, N>> __d_sqrt_vector(DifferentialPair<vector<T, N>> dpx)
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[ForwardDerivativeOf(length)]
+[ForceInline]
+DifferentialPair<T> __d_length(DifferentialPair<vector<T, N>> x)
 {
-    VECTOR_MAP_D_UNARY(T, N, __d_sqrt, dpx);
+    return __fwd_diff(__length_impl)(x);
 }
 
-__generic<T : __BuiltinFloatingPointType>
-[BackwardDerivativeOf(sqrt)]
-void __d_sqrt(inout DifferentialPair<T> dpx, T.Differential dOut)
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[BackwardDerivativeOf(length)]
+[ForceInline]
+void __d_length(inout DifferentialPair<vector<T, N>> x, T.Differential dOut)
 {
-    // Special case
-    if (dpx.p < T(1e-6))
-    {
-        dpx = diffPair(dpx.p, T.dzero());
-    }
-    else
-    {
-        dpx = diffPair(
-            dpx.p,
-            T.dmul(T(0.5) / sqrt(dpx.p), dOut));
-    }
+    return __bwd_diff(__length_impl)(x, dOut);
 }
 
+// Vector distance
 __generic<T : __BuiltinFloatingPointType, let N : int>
-[BackwardDerivativeOf(sqrt)]
-void __d_sqrt_vector(inout DifferentialPair<vector<T, N>> dpx, vector<T, N>.Differential dOut)
+[BackwardDifferentiable]
+T __distance_impl(vector<T, N> x, vector<T, N> y)
 {
-    VECTOR_MAP_BWD_D_UNARY(T, N, __d_sqrt, dpx, dOut);
+    return length(y - x);
+}
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[ForwardDerivativeOf(distance)]
+[ForceInline]
+DifferentialPair<T> __d_distance(DifferentialPair<vector<T, N>> x, DifferentialPair<vector<T, N>> y)
+{
+    return __fwd_diff(__distance_impl)(x, y);
 }
 
-// Maximum
-
-__generic<T : __BuiltinFloatingPointType>
-[ForwardDerivativeOf(max)]
-DifferentialPair<T> __d_max(DifferentialPair<T> dpx, DifferentialPair<T> dpy)
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[BackwardDerivativeOf(distance)]
+[ForceInline]
+void __d_distance(inout DifferentialPair<vector<T, N>> x, inout DifferentialPair<vector<T, N>> y, T.Differential dOut)
 {
-    return DifferentialPair<T>(
-        max(dpx.p, dpy.p),
-        dpx.p > dpy.p ? dpx.d : dpy.d
-    );
+    return __bwd_diff(__distance_impl)(x, y, dOut);
 }
 
+// Vector normalize
 __generic<T : __BuiltinFloatingPointType, let N : int>
-[ForwardDerivativeOf(max)]
-DifferentialPair<vector<T, N>> __d_max_vector(DifferentialPair<vector<T, N>> dpx, DifferentialPair<vector<T, N>> dpy)
+[BackwardDifferentiable]
+vector<T, N> __normalize_impl(vector<T, N> x)
 {
-    VECTOR_MAP_D_BINARY(T, N, __d_max, dpx, dpy);
+    let r = T(1.0) / length(x);
+    return x * r;
 }
-
-__generic<T : __BuiltinFloatingPointType>
-[BackwardDerivativeOf(max)]
-void __d_max(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpy, T.Differential dOut)
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[ForwardDerivativeOf(normalize)]
+[ForceInline]
+DifferentialPair<vector<T, N>> __d_normalize(DifferentialPair<vector<T, N>> x)
 {
-    dpx = diffPair(dpx.p, dpx.p > dpy.p ? dOut : T.dzero());
-    dpy = diffPair(dpy.p, dpy.p > dpx.p ? dOut : T.dzero());
+    return __fwd_diff(__normalize_impl)(x);
+}
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[BackwardDerivativeOf(normalize)]
+[ForceInline]
+void __d_distance(inout DifferentialPair<vector<T, N>> x, vector<T, N>.Differential dOut)
+{
+    return __bwd_diff(__normalize_impl)(x, dOut);
 }
 
+// Vector reflect
 __generic<T : __BuiltinFloatingPointType, let N : int>
-[BackwardDerivativeOf(max)]
-void __d_max_vector(inout DifferentialPair<vector<T, N>> dpx, inout DifferentialPair<vector<T, N>> dpy, vector<T, N>.Differential dOut)
+[BackwardDifferentiable]
+vector<T, N> __reflect_impl(vector<T, N> i, vector<T, N> n)
 {
-    VECTOR_MAP_BWD_D_BINARY(T, N, __d_max, dpx, dpy, dOut);
+    return  i - n * (T(2.0) * dot(i, n));
 }
-
-// Minimum
-
-__generic<T : __BuiltinFloatingPointType>
-[ForwardDerivativeOf(min)]
-DifferentialPair<T> __d_min(DifferentialPair<T> dpx, DifferentialPair<T> dpy)
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[ForwardDerivativeOf(reflect)]
+[ForceInline]
+DifferentialPair<vector<T, N>> __d_reflect(DifferentialPair<vector<T, N>> i, DifferentialPair<vector<T, N>> n)
 {
-    return DifferentialPair<T>(
-        min(dpx.p, dpy.p),
-        dpx.p < dpy.p ? dpx.d : dpy.d
-    );
+    return __fwd_diff(__reflect_impl)(i, n);
+}
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[BackwardDerivativeOf(reflect)]
+[ForceInline]
+void __d_reflect(inout DifferentialPair<vector<T, N>> i, inout DifferentialPair<vector<T, N>> n, vector<T, N>.Differential dOut)
+{
+    return __bwd_diff(__reflect_impl)(i, n, dOut);
 }
 
+// Vector refract
 __generic<T : __BuiltinFloatingPointType, let N : int>
-[ForwardDerivativeOf(min)]
-DifferentialPair<vector<T, N>> __d_min_vector(DifferentialPair<vector<T, N>> dpx, DifferentialPair<vector<T, N>> dpy)
+[BackwardDifferentiable]
+vector<T, N> __refract_impl(vector<T, N> i, vector<T, N> n, T eta)
+{
+    let k = T(1.0) - eta * eta * (T(1.0) - dot(n, i) * dot(n, i));
+    return (k < T(0.0)) ? vector<T, N>(T(0.0)) : eta * i - (eta * dot(n, i) + sqrt(max(T(0.0),k))) * n;
+}
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[ForwardDerivativeOf(refract)]
+[ForceInline]
+DifferentialPair<vector<T, N>> __d_refract(DifferentialPair<vector<T, N>> i, DifferentialPair<vector<T, N>> n, DifferentialPair<T> eta)
 {
-    VECTOR_MAP_D_BINARY(T, N, __d_min, dpx, dpy);
+    return __fwd_diff(__refract_impl)(i, n, eta);
+}
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[BackwardDerivativeOf(refract)]
+[ForceInline]
+void __d_refract(inout DifferentialPair<vector<T, N>> i, inout DifferentialPair<vector<T, N>> n, inout DifferentialPair<T> eta, vector<T, N>.Differential dOut)
+{
+    return __bwd_diff(__refract_impl)(i, n, eta, dOut);
 }
 
+// Sine and cosine
 __generic<T : __BuiltinFloatingPointType>
-[BackwardDerivativeOf(min)]
-void __d_min(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpy, T.Differential dOut)
+[BackwardDifferentiable]
+void __sincos_impl(T x, out T s, out T c)
 {
-    dpx = diffPair(dpx.p, dpx.p < dpy.p ? dOut : T.dzero());
-    dpy = diffPair(dpy.p, dpy.p < dpx.p ? dOut : T.dzero());
+    s = sin(x);
+    c = cos(x);
 }
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
-[BackwardDerivativeOf(min)]
-void __d_min_vector(inout DifferentialPair<vector<T, N>> dpx, inout DifferentialPair<vector<T, N>> dpy, vector<T, N>.Differential dOut)
+[BackwardDifferentiable]
+void __sincos_impl(vector<T, N> x, out vector<T, N> s, out vector<T, N> c)
 {
-    VECTOR_MAP_BWD_D_BINARY(T, N, __d_min, dpx, dpy, dOut);
+    s = sin(x);
+    c = cos(x);
 }
 
-// Raise to a power
-
-__generic<T : __BuiltinFloatingPointType>
-[ForwardDerivativeOf(pow)]
-DifferentialPair<T> __d_pow(DifferentialPair<T> dpx, DifferentialPair<T> dpy)
+__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
+[BackwardDifferentiable]
+void __sincos_impl(matrix<T, N, M> x, out matrix<T, N, M> s, out matrix<T, N, M> c)
 {
-    // Special case
-    if (dpx.p < T(1e-6))
-    {
-        return DifferentialPair<T>(T(0.0), T.dzero());
-    }
-
-    T val = pow(dpx.p, dpy.p);
-    T.Differential d1 = T.dmul(val * log(dpx.p), dpy.d);
-    T.Differential d2 = T.dmul(val * dpy.p / dpx.p, dpx.d);
-    return DifferentialPair<T>(
-        val,
-        T.dadd(d1, d2)
-    );
+    s = sin(x);
+    c = cos(x);
 }
 
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[ForwardDerivativeOf(pow)]
-DifferentialPair<vector<T, N>> __d_pow_vector(DifferentialPair<vector<T, N>> dpx, DifferentialPair<vector<T, N>> dpy)
+__generic<T: __BuiltinFloatingPointType>
+[ForwardDerivativeOf(sincos)]
+[ForceInline]
+void __d_sincos(DifferentialPair<T> x, out DifferentialPair<T> s, out DifferentialPair<T> c)
 {
-    VECTOR_MAP_D_BINARY(T, N, __d_pow, dpx, dpy);
+    __fwd_diff(__sincos_impl)(x, s, c);
 }
 
-__generic<T : __BuiltinFloatingPointType>
-[BackwardDerivativeOf(pow)]
-void __d_pow(inout DifferentialPair<T> dpx, inout DifferentialPair<T> dpy, T.Differential dOut)
+__generic<T : __BuiltinFloatingPointType, let N : int>
+[ForwardDerivativeOf(sincos)]
+[ForceInline]
+void __d_sincos(DifferentialPair<vector<T, N>> x, out DifferentialPair<vector<T, N>> s, out DifferentialPair<vector<T, N>> c)
 {
-    // Special case
-    if (dpx.p < T(1e-6))
-    {
-        dpx = diffPair(dpx.p, T.dzero());
-        dpy = diffPair(dpy.p, T.dzero());
-    }
-    else
-    {
-        T val = pow(dpx.p, dpy.p);
-        dpx = diffPair(
-            dpx.p,
-            T.dmul(val * dpy.p / dpx.p, dOut));
-        dpy = diffPair(
-            dpy.p,
-            T.dmul(val * log(dpx.p), dOut));
-    }
+    __fwd_diff(__sincos_impl)(x, s, c);
 }
 
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[BackwardDerivativeOf(pow)]
-void __d_pow_vector(inout DifferentialPair<vector<T, N>> dpx, inout DifferentialPair<vector<T, N>> dpy, vector<T, N>.Differential dOut)
+__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
+[ForwardDerivativeOf(sincos)]
+[ForceInline]
+void __d_sincos(DifferentialPair<matrix<T, N, M>> x, out DifferentialPair<matrix<T, N, M>> s, out DifferentialPair<matrix<T, N, M>> c)
 {
-    VECTOR_MAP_BWD_D_BINARY(T, N, __d_pow, dpx, dpy, dOut);
+    __fwd_diff(__sincos_impl)(x, s, c);
 }
 
-// Vector dot product
+#if 0
+// TODO: this is not working right now since our type system can't resolve
+// the overload to `sincos` in `[BackwardDerivativeOf]` attribute. We need to implement
+// a proper overload resolver for custom backward derivatives.
 
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[ForwardDerivativeOf(dot)]
-DifferentialPair<T> __d_dot(DifferentialPair<vector<T, N>> dpx, DifferentialPair<vector<T, N>> dpy)
+__generic<T: __BuiltinFloatingPointType>
+[BackwardDerivativeOf(sincos)]
+[ForceInline]
+void __d_sincos(inout DifferentialPair<T> x, T.Differential dS, T.Differential dC)
 {
-    T result = T(0);
-    T.Differential d_result = T.dzero();
-    [ForceUnroll]
-    for (int i = 0; i < N; ++i)
-    {
-        result = result + dpx.p[i] * dpy.p[i];
-        d_result = T.dadd(d_result, T.dmul(dpx.p[i], __slang_noop_cast<T.Differential>(dpy.d[i])));
-        d_result = T.dadd(d_result, T.dmul(dpy.p[i], __slang_noop_cast<T.Differential>(dpx.d[i])));
-    }
-    return DifferentialPair<T>(result, d_result);
+    __bwd_diff(__sincos_impl)(x, s, c);
+}
+__generic<T: __BuiltinFloatingPointType, let N : int>
+[BackwardDerivativeOf(sincos)]
+[ForceInline]
+void __d_sincos(inout DifferentialPair<vector<T, N>> x, vector<T, N>.Differential dS, vector<T, N>.Differential dC)
+{
+    __bwd_diff(__sincos_impl)(x, s, c);
 }
 
-__generic<T : __BuiltinFloatingPointType, let N : int>
-[BackwardDerivativeOf(dot)]
-void __d_dot(inout DifferentialPair<vector<T, N>> dpx, inout DifferentialPair<vector<T, N>> dpy, T.Differential dOut)
+__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
+[BackwardDerivativeOf(sincos)]
+[ForceInline]
+void __d_sincos(inout DifferentialPair<matrix<T, N, M>> x, matrix<T, N, M>.Differential dS, matrix<T, N, M>.Differential dC)
 {
-    vector<T, N>.Differential x_d_result, y_d_result;
-    [ForceUnroll]
-    for (int i = 0; i < N; ++i)
-    {
-        x_d_result[i] = dpy.p[i] * __slang_noop_cast<T>(dOut);
-        y_d_result[i] = dpx.p[i] * __slang_noop_cast<T>(dOut);
-    }
-    dpx = diffPair(dpx.p, x_d_result);
-    dpy = diffPair(dpy.p, y_d_result);
+    __bwd_diff(__sincos_impl)(x, s, c);
 }
+
+#endif
+\ No newline at end of file
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 0b7ca535b..5a01bc132 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1526,7 +1526,7 @@ uint countbits(uint value);
 
 // Cross product
 // TODO: SPIRV does not support integer vectors.
-__generic<T : __BuiltinArithmeticType>
+__generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cross _0 _1")
@@ -1539,6 +1539,19 @@ vector<T,3> cross(vector<T,3> left, vector<T,3> right)
         left.x * right.y - left.y * right.x);
 }
 
+__generic<T : __BuiltinIntegerType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cross _0 _1")
+[__readNone]
+vector<T, 3> cross(vector<T, 3> left, vector<T, 3> right)
+{
+    return vector<T, 3>(
+        left.y * right.z - left.z * right.y,
+        left.z * right.x - left.x * right.z,
+        left.x * right.y - left.y * right.x);
+}
+
 // Convert encoded color
 __target_intrinsic(hlsl)
 [__readNone]
@@ -2696,7 +2709,7 @@ matrix<T,N,M> log2(matrix<T,N,M> x)
 
 // multiply-add
 
-__generic<T : __BuiltinArithmeticType>
+__generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, fma)
 __target_intrinsic(cuda, "$P_fma($0, $1, $2)")
@@ -2705,7 +2718,7 @@ __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _
 [__readNone]
 T mad(T mvalue, T avalue, T bvalue);
 
-__generic<T : __BuiltinArithmeticType, let N : int>
+__generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, fma)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
@@ -2715,7 +2728,7 @@ vector<T, N> mad(vector<T, N> mvalue, vector<T, N> avalue, vector<T, N> bvalue)
     VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue);
 }
 
-__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
+__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 [__readNone]
 matrix<T, N, M> mad(matrix<T, N, M> mvalue, matrix<T, N, M> avalue, matrix<T, N, M> bvalue)
@@ -2723,6 +2736,34 @@ matrix<T, N, M> mad(matrix<T, N, M> mvalue, matrix<T, N, M> avalue, matrix<T, N,
     MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue);
 }
 
+__generic<T : __BuiltinIntegerType>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl, fma)
+__target_intrinsic(cuda, "$P_fma($0, $1, $2)")
+__target_intrinsic(cpp, "$P_fma($0, $1, $2)")
+__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
+[__readNone]
+T mad(T mvalue, T avalue, T bvalue);
+
+__generic<T : __BuiltinIntegerType, let N : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl, fma)
+__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
+[__readNone]
+vector<T, N> mad(vector<T, N> mvalue, vector<T, N> avalue, vector<T, N> bvalue)
+{
+    VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue);
+}
+
+__generic<T : __BuiltinIntegerType, let N : int, let M : int>
+__target_intrinsic(hlsl)
+[__readNone]
+matrix<T, N, M> mad(matrix<T, N, M> mvalue, matrix<T, N, M> avalue, matrix<T, N, M> bvalue)
+{
+    MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue);
+}
+
+
 // maximum
 __generic<T : __BuiltinIntegerType>
 __target_intrinsic(hlsl)
@@ -3763,7 +3804,7 @@ matrix<T,N,M> tanh(matrix<T,N,M> x)
 }
 
 // Matrix transpose
-__generic<T : __BuiltinType, let N : int, let M : int>
+__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 [__readNone]
@@ -3775,6 +3816,30 @@ matrix<T, M, N> transpose(matrix<T, N, M> x)
             result[r][c] = x[c][r];
     return result;
 }
+__generic<T : __BuiltinIntegerType, let N : int, let M : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+[__readNone]
+matrix<T, M, N> transpose(matrix<T, N, M> x)
+{
+    matrix<T, M, N> result;
+    for (int r = 0; r < M; ++r)
+        for (int c = 0; c < N; ++c)
+            result[r][c] = x[c][r];
+    return result;
+}
+__generic<T : __BuiltinLogicalType, let N : int, let M : int>
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl)
+[__readNone]
+matrix<T, M, N> transpose(matrix<T, N, M> x)
+{
+    matrix<T, M, N> result;
+    for (int r = 0; r < M; ++r)
+        for (int c = 0; c < N; ++c)
+            result[r][c] = x[c][r];
+    return result;
+}
 
 // Truncate to integer
 __generic<T : __BuiltinFloatingPointType>
diff --git a/tests/autodiff-dstdlib/vector-cross.slang b/tests/autodiff-dstdlib/vector-cross.slang
new file mode 100644
index 000000000..be08894cb
--- /dev/null
+++ b/tests/autodiff-dstdlib/vector-cross.slang
@@ -0,0 +1,40 @@
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -output-using-type
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj -output-using-type
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+[BackwardDifferentiable]
+float3 crossImpl(float3 x, float3 y)
+{
+    return float3(x.y * y.z - y.y * x.z,
+                  x.z * y.x - y.z * x.x,
+                  x.x * y.y - y.x * x.y);
+}
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    {
+        let x = float3(-0.51, 0.74, 0.86);
+        let y = float3(1.43, -0.92, 4.36);
+        let dOut = float3(3.41, 6.55, 2.39);
+        var dpx = diffPair(x);
+        var dpy = diffPair(y);
+        __bwd_diff(cross)(dpx, dpy, dOut);
+        outputBuffer[0] = dpx.d[0]; 
+        outputBuffer[1] = dpx.d[1]; 
+        outputBuffer[2] = dpx.d[2]; 
+        outputBuffer[3] = dpy.d[0]; 
+        outputBuffer[4] = dpy.d[1]; 
+        outputBuffer[5] = dpy.d[2]; 
+
+        __bwd_diff(crossImpl)(dpx, dpy, dOut);
+        outputBuffer[6] = dpx.d[0];
+        outputBuffer[7] = dpx.d[1];
+        outputBuffer[8] = dpx.d[2];
+        outputBuffer[9] = dpy.d[0];
+        outputBuffer[10] = dpy.d[1];
+        outputBuffer[11] = dpy.d[2];
+    }
+}
diff --git a/tests/autodiff-dstdlib/vector-cross.slang.expected.txt b/tests/autodiff-dstdlib/vector-cross.slang.expected.txt
new file mode 100644
index 000000000..9d472f078
--- /dev/null
+++ b/tests/autodiff-dstdlib/vector-cross.slang.expected.txt
@@ -0,0 +1,13 @@
+type: float
+-30.756802
+11.449901
+12.503700
+3.864400
+-4.151500
+5.863900
+-30.756804
+11.449901
+12.503700
+3.864400
+-4.151500
+5.863900
diff --git a/tests/autodiff-dstdlib/vector-length.slang b/tests/autodiff-dstdlib/vector-length.slang
new file mode 100644
index 000000000..c5064e54e
--- /dev/null
+++ b/tests/autodiff-dstdlib/vector-length.slang
@@ -0,0 +1,36 @@
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -output-using-type
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj -output-using-type
+
+//TEST_INPUT:ubuffer(data=[0 0 0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+float lengthDiffX(float3 x)
+{
+    return (length(float3(x.x + 0.001, x.yz)) - length(float3(x.x - 0.001, x.yz))) / 0.002;
+}
+float lengthDiffY(float3 x)
+{
+    return (length(float3(x.x, x.y + 0.001, x.z)) - length(float3(x.x, x.y - 0.001, x.z))) / 0.002;
+}
+float lengthDiffZ(float3 x)
+{
+    return (length(float3(x.xy, x.z + 0.001)) - length(float3(x.xy, x.z - 0.001))) / 0.002;
+}
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    {
+        let x = float3(12, 23, 31);
+        var dpx = diffPair(x);
+        __bwd_diff(length)(dpx, 1.0);
+        outputBuffer[0] = dpx.d[0];
+        outputBuffer[1] = dpx.d[1];
+        outputBuffer[2] = dpx.d[2];
+
+        // for reference:
+        //outputBuffer[3] = lengthDiffX(x);
+        //outputBuffer[4] = lengthDiffY(x);
+        //outputBuffer[5] = lengthDiffZ(x);
+    }
+}
diff --git a/tests/autodiff-dstdlib/vector-length.slang.expected.txt b/tests/autodiff-dstdlib/vector-length.slang.expected.txt
new file mode 100644
index 000000000..3c3f3727d
--- /dev/null
+++ b/tests/autodiff-dstdlib/vector-length.slang.expected.txt
@@ -0,0 +1,4 @@
+type: float
+0.296862
+0.568986
+0.766895