From b1317cd16ab9c827596a28ccf4258ef1bb672d92 Mon Sep 17 00:00:00 2001 From: Tim Foley Date: Mon, 9 Mar 2020 09:02:36 -0700 Subject: Yet more definitions moved into the stdlib (#1263) The only big catch that I ran into with this batch was that I found the `float.getPi()` function was being emitted to the output GLSL even when that function wasn't being used. This seems to have been a latent problem in the earlier PR, but was only surfaced in the tests once a Slang->GLSL test started using another intrinsic that led to the `float : __BuiltinFloatingPointType` witness table being live in the IR. The fix for the gotcha here was to add a late IR pass that basically empties out all witness tables in the IR, so that functions that are only referenced by witness tables can then be removed as dead code. This pass is something we should *not* apply if/when we start supporting real dynamic dispatch through witness tables, but that is a problem to be solved on another day. The remaining tricky pieces of this change were: * Needed to remember to mark functions as target intrinsics on HLSL and/or GLSL as appropriate (hopefully I caught all the cases) so they don't get emitted as source there. * The `msad4` function in HLSL is very poorly documented, so filling in its definition was tricky. I made my best effort based on how it is described on MSDN, but it is likely that if anybody wants to rely on this function they will need us to vet our results with some tests. 
--- source/slang/hlsl.meta.slang | 202 ++++++++++++++++++------- source/slang/slang-emit.cpp | 11 ++ source/slang/slang-ir-strip-witness-tables.cpp | 33 ++++ source/slang/slang-ir-strip-witness-tables.h | 10 ++ source/slang/slang.vcxproj | 2 + source/slang/slang.vcxproj.filters | 8 +- 6 files changed, 211 insertions(+), 55 deletions(-) create mode 100644 source/slang/slang-ir-strip-witness-tables.cpp create mode 100644 source/slang/slang-ir-strip-witness-tables.h (limited to 'source/slang') diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 572b64b21..d9e40dd4f 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -1199,9 +1199,24 @@ matrix floor(matrix x) } // Fused multiply-add for doubles +__target_intrinsic(hlsl) +__target_intrinsic(glsl) double fma(double a, double b, double c); -__generic vector fma(vector a, vector b, vector c); -__generic matrix fma(matrix a, matrix b, matrix c); + +__generic +__target_intrinsic(hlsl) +__target_intrinsic(glsl) +vector fma(vector a, vector b, vector c) +{ + VECTOR_MAP_TRINARY(double, N, fma, a, b, c); +} + +__generic +__target_intrinsic(hlsl) +matrix fma(matrix a, matrix b, matrix c) +{ + MATRIX_MAP_TRINARY(double, N, M, fma, a, b, c); +} // Floating point remainder of x/y __generic @@ -1425,7 +1440,6 @@ __generic __target_intrinsic(hlsl) __target_intrinsic(cpu) __target_intrinsic(cuda) -//__target_intrinsic(glsl, "(!(isinf($0) || isnan($0)))") bool isfinite(T x) { return !(isinf(x) || isnan(x)); @@ -1433,7 +1447,6 @@ bool isfinite(T x) __generic __target_intrinsic(hlsl) -//__target_intrinsic(glsl, "(!(isinf($0) || isnan($0)))") vector isfinite(vector x) { VECTOR_MAP_UNARY(bool, N, isfinite, x); @@ -1488,18 +1501,16 @@ matrix isnan(matrix x) __generic __target_intrinsic(hlsl) -__target_intrinsic(glsl, "($0 * pow(2.0f, $1))") -T ldexp(T x, T exp); -/*{ +T ldexp(T x, T exp) +{ return x * exp2(exp); -}*/ +} __generic __target_intrinsic(hlsl) -__target_intrinsic(glsl, "($0 * 
pow(2.0f, $1))") vector ldexp(vector x, vector exp) { - VECTOR_MAP_BINARY(T, N, ldexp, x, exp); + return x * exp2(exp); } __generic @@ -1522,17 +1533,17 @@ T length(vector x) __generic __target_intrinsic(hlsl) __target_intrinsic(glsl, mix) -T lerp(T x, T y, T s); -/*{ - return x * (1 - s) + y * s; -}*/ +T lerp(T x, T y, T s) +{ + return x * (T(1.0f) - s) + y * s; +} __generic __target_intrinsic(hlsl) __target_intrinsic(glsl, mix) vector lerp(vector x, vector y, vector s) { - VECTOR_MAP_TRINARY(T, N, lerp, x, y, s); + return x * (T(1.0f) - s) + y * s; } __generic @@ -1543,7 +1554,14 @@ matrix lerp(matrix x, matrix y, matrix s) } // Legacy lighting function (obsolete) -float4 lit(float n_dot_l, float n_dot_h, float m); +__target_intrinsic(hlsl) +float4 lit(float n_dot_l, float n_dot_h, float m) +{ + let ambient = 1.0f; + let diffuse = max(n_dot_l, 0.0f); + let specular = step(0.0f, n_dot_l) * max(n_dot_h * m, 0.0f); + return float4(ambient, diffuse, specular, 1.0f); +} // Base-e logarithm __generic @@ -1606,14 +1624,25 @@ matrix log2(matrix x) // multiply-add +__generic +__target_intrinsic(hlsl) __target_intrinsic(glsl, fma) -__generic T mad(T mvalue, T avalue, T bvalue); +T mad(T mvalue, T avalue, T bvalue); +__generic +__target_intrinsic(hlsl) __target_intrinsic(glsl, fma) -__generic vector mad(vector mvalue, vector avalue, vector bvalue); +vector mad(vector mvalue, vector avalue, vector bvalue) +{ + VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue); +} -__target_intrinsic(glsl, fma) -__generic matrix mad(matrix mvalue, matrix avalue, matrix bvalue); +__generic +__target_intrinsic(hlsl) +matrix mad(matrix mvalue, matrix avalue, matrix bvalue) +{ + MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue); +} // maximum __generic @@ -1677,32 +1706,79 @@ matrix modf(matrix x, out matrix ip) } // msad4 (whatever that is) -uint4 msad4(uint reference, uint2 source, uint4 accum); +__target_intrinsic(hlsl) +uint4 msad4(uint reference, uint2 source, uint4 accum) +{ + 
int4 bytesRef = (reference >> uint4(24, 16, 8, 0)) & 0xFF; + int4 bytesX = (source.x >> uint4(24, 16, 8, 0)) & 0xFF; + int4 bytesY = (source.y >> uint4(24, 16, 8, 0)) & 0xFF; + + uint4 mask = bytesRef == 0 ? 0 : 0xFFFFFFFFu; + + uint4 result = accum; + result += mask.x & abs(bytesRef - int4(bytesX.x, bytesY.y, bytesY.z, bytesY.w)); + result += mask.y & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesY.z, bytesY.w)); + result += mask.z & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesX.z, bytesY.w)); + result += mask.w & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesX.z, bytesX.w)); + return result; +} // General inner products // scalar-scalar -__generic T mul(T x, T y); +__generic +__intrinsic_op($(kIROp_Mul)) +T mul(T x, T y); // scalar-vector and vector-scalar -__generic vector mul(vector x, T y); -__generic vector mul(T x, vector y); +__generic +__intrinsic_op($(kIROp_Mul)) +vector mul(vector x, T y); + +__generic +__intrinsic_op($(kIROp_Mul)) +vector mul(T x, vector y); // scalar-matrix and matrix-scalar -__generic matrix mul(matrix x, T y); -__generic matrix mul(T x, matrix y); +__generic +__intrinsic_op($(kIROp_Mul)) +matrix mul(matrix x, T y); + +__generic +__intrinsic_op($(kIROp_Mul)) +matrix mul(T x, matrix y); // vector-vector (dot product) -__generic __intrinsic_op(dot) T mul(vector x, vector y); +__generic +__target_intrinsic(hlsl) +__target_intrinsic(glsl, "dot") +T mul(vector x, vector y) +{ + return dot(x, y); +} + +${{{{ +// TODO: The following functions could conceivably be defined +// in the stdlib for the benefit of targets without direct +// support for matrices, but the use of `__intrinsic_op` to +// map them to a dedicated IR instruction interferes with +// that choice. 
+}}}} // vector-matrix -__generic __intrinsic_op(mulVectorMatrix) vector mul(vector x, matrix y); +__generic +__intrinsic_op(mulVectorMatrix) +vector mul(vector x, matrix y); // matrix-vector -__generic __intrinsic_op(mulMatrixVector) vector mul(matrix x, vector y); +__generic +__intrinsic_op(mulMatrixVector) +vector mul(matrix x, vector y); // matrix-matrix -__generic __intrinsic_op(mulMatrixMatrix) matrix mul(matrix x, matrix y); +__generic +__intrinsic_op(mulMatrixMatrix) +matrix mul(matrix x, matrix y); // noise (deprecated) @@ -1753,10 +1829,13 @@ int NonUniformResourceIndex(int index) } // Normalize a vector -__generic vector normalize(vector x); -/*{ +__generic +__target_intrinsic(hlsl) +__target_intrinsic(glsl) +vector normalize(vector x) +{ return x / length(x); -}*/ +} // Raise to a power __generic @@ -1856,31 +1935,33 @@ void ProcessTriTessFactorsMin( __generic __target_intrinsic(hlsl) __target_intrinsic(glsl) -T radians(T x); +T radians(T x) +{ + return x * (T.getPi() / T(180.0f)); +} __generic __target_intrinsic(hlsl) __target_intrinsic(glsl) vector radians(vector x) { - VECTOR_MAP_UNARY(T, N, radians, x); + return x * (T.getPi() / T(180.0f)); } __generic __target_intrinsic(hlsl) matrix radians(matrix x) { - MATRIX_MAP_UNARY(T, N, M, radians, x); + return x * (T.getPi() / T(180.0f)); } // Approximate reciprocal __generic __target_intrinsic(hlsl) -__target_intrinsic(glsl, "1.0/($0)") -T rcp(T x); -/*{ - return T(1) / x; -}*/ +T rcp(T x) +{ + return T(1.0) / x; +} __generic __target_intrinsic(hlsl) @@ -1891,7 +1972,6 @@ vector rcp(vector x) __generic __target_intrinsic(hlsl) -// Note: GLSL doesn't define a vector `rcp`, so not intrinsic there matrix rcp(matrix x) { MATRIX_MAP_UNARY(T, N, M, rcp, x); @@ -1899,27 +1979,35 @@ matrix rcp(matrix x) // Reflect incident vector across plane with given normal __generic -vector reflect(vector i, vector n); -/*{ +__target_intrinsic(hlsl) +__target_intrinsic(glsl) +vector reflect(vector i, vector n) +{ return i - 
T(2) * dot(n,i) * n; -}*/ +} // Refract incident vector given surface normal and index of refraction __generic -vector refract(vector i, vector n, float eta); -/*{ +__target_intrinsic(hlsl) +__target_intrinsic(glsl) +vector refract(vector i, vector n, T eta) +{ let dotNI = dot(n,i); let k = T(1) - eta*eta*(T(1) - dotNI * dotNI); - if(k < 0) return vector(T(0)); + if(k < T(0)) return vector(T(0)); return eta * i - (eta * dotNI + sqrt(k)) * n; -}*/ +} // Reverse order of bits __target_intrinsic(glsl, "bitfieldReverse") uint reversebits(uint value); __target_intrinsic(glsl, "bitfieldReverse") -__generic vector reversebits(vector value); +__generic +vector reversebits(vector value) +{ + VECTOR_MAP_UNARY(uint, N, reversebits, value); +} // Round-to-nearest __generic @@ -2073,7 +2161,13 @@ matrix sinh(matrix x) // Smooth step (Hermite interpolation) __generic -T smoothstep(T min, T max, T x); +__target_intrinsic(hlsl) +__target_intrinsic(glsl) +T smoothstep(T min, T max, T x) +{ + let t = saturate((x - min) / (max - min)); + return t * t * (T(3.0f) - (t + t)); +} __generic __target_intrinsic(hlsl) @@ -2113,10 +2207,10 @@ matrix sqrt(matrix x) __generic __target_intrinsic(hlsl) __target_intrinsic(glsl) -T step(T y, T x); -/*{ +T step(T y, T x) +{ return x < y ? 
T(0.0f) : T(1.0f); -}*/ +} __generic __target_intrinsic(hlsl) diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp index dcca7d25e..e613f5462 100644 --- a/source/slang/slang-emit.cpp +++ b/source/slang/slang-emit.cpp @@ -15,6 +15,7 @@ #include "slang-ir-specialize.h" #include "slang-ir-specialize-resources.h" #include "slang-ir-ssa.h" +#include "slang-ir-strip-witness-tables.h" #include "slang-ir-union.h" #include "slang-ir-validate.h" #include "slang-ir-wrap-structured-buffers.h" @@ -441,6 +442,16 @@ Result linkAndOptimizeIR( break; } + // For all targets that don't support true dynamic dispatch through + // witness tables (that is all targets at present), we need + // to eliminate witness tables from the IR so that they + // don't keep symbols live that we don't actually need. + stripWitnessTables(irModule); +#if 0 + dumpIRIfEnabled(compileRequest, irModule, "AFTER STRIP WITNESS TABLES"); +#endif + validateIRModuleIfEnabled(compileRequest, irModule); + // The resource-based specialization pass above // may create specialized versions of functions, but // it does not try to completely eliminate the original diff --git a/source/slang/slang-ir-strip-witness-tables.cpp b/source/slang/slang-ir-strip-witness-tables.cpp new file mode 100644 index 000000000..8536508ba --- /dev/null +++ b/source/slang/slang-ir-strip-witness-tables.cpp @@ -0,0 +1,33 @@ +// slang-ir-strip-witness-tables.cpp +#include "slang-ir-strip-witness-tables.h" + +#include "slang-ir.h" +#include "slang-ir-insts.h" + +namespace Slang +{ + +void stripWitnessTables(IRModule* module) +{ + // Our goal here is to empty out any witness tables in + // the IR so that they don't keep other symbols alive + // further into compilation. 
Luckily we expect all + // witness tables to live directly at the global scope + // (or inside of a generic, which we can ignore for + // now because the emit logic also ignores generics), + // and there is a single function we can call to + // remove all of the content from the witness tables + // (since the key-value associations are stored as + // children of each table). + + for( auto inst : module->getGlobalInsts() ) + { + auto witnessTable = as(inst); + if(!witnessTable) + continue; + + witnessTable->removeAndDeallocateAllDecorationsAndChildren(); + } +} + +} \ No newline at end of file diff --git a/source/slang/slang-ir-strip-witness-tables.h b/source/slang/slang-ir-strip-witness-tables.h new file mode 100644 index 000000000..43bd0127d --- /dev/null +++ b/source/slang/slang-ir-strip-witness-tables.h @@ -0,0 +1,10 @@ +// slang-ir-strip-witness-tables.h +#pragma once + +namespace Slang +{ +struct IRModule; + + /// Strip the contents of all witness table instructions from the given IR `module` +void stripWitnessTables(IRModule* module); +} \ No newline at end of file diff --git a/source/slang/slang.vcxproj b/source/slang/slang.vcxproj index 76cffe08b..2f55fffdc 100644 --- a/source/slang/slang.vcxproj +++ b/source/slang/slang.vcxproj @@ -230,6 +230,7 @@ + @@ -313,6 +314,7 @@ + diff --git a/source/slang/slang.vcxproj.filters b/source/slang/slang.vcxproj.filters index 449f72069..7a0f465fe 100644 --- a/source/slang/slang.vcxproj.filters +++ b/source/slang/slang.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -249,6 +249,9 @@ Header Files + + Header Files + @@ -467,6 +470,9 @@ Source Files + + Source Files + -- cgit v1.2.3