From 7f567df6937b33c653c424af3abb20d32eb80561 Mon Sep 17 00:00:00 2001
From: Tim Foley <tfoleyNV@users.noreply.github.com>
Date: Wed, 2 Sep 2020 09:51:25 -0700
Subject: Add support for (undocumented) HLSL 16-bit bit-cast ops (#1528)

As of SM 6.2, the dxc compiler added support for a set of 16-bit bit-cast operations to mirror the `asuint`, `asfloat`, and `asint` operations that were provided for 32-bit scalar types. These operations are not publicly documented, so we didn't think to add them.

It should be noted that there was already a similar operation in HLSL, called `f32tof16`, that took as input a `float` and then packed a half-precision version of it into the low bits of a `uint`. The problem is that using that operation for `half`->`uint16_t` conversion required a round trip through a `float`, and downstream compilers seemingly can't optimize away that conversion.

This change adds the new operations along with a test that tries to make use of them to ensure the results are what is expected. There are enough cases to cover that I had to write the test in a way where each thread only writes out a subset of the required output.

There are two other changes here that are not directly related to the main feature:

First, it seems like the `[__unsafeForceInlineEarly]` attribute on some of these overloads interacts poorly with generics, and results in an `IRVectorType` appearing at local scope in the output code. That is semantically reasonable given our IR model, but it would ideally be something that gets eliminated as a result of deduplication of types. For now I've introduced a slight hack to make types always get inlined into their use sites during emission, which should handle the case of locally-defined types. I'm not 100% happy with that solution, but it seemed better than introducing a bunch of unrelated fixes into this PR.

Second, the way that conversion operations were being declared for matrix types seems to have been incorrect: we had a single *explicit* initializer added to matrix types via an `extension` that allowed them to be initialized from other matrix types with the same size and *any* element type. In order to support implicit conversions of matrix types, I cribbed the code we were already using to introduce implicit conversion operations for vector types.
---
 source/slang/core.meta.slang       | 34 ++++++++++++++++----
 source/slang/hlsl.meta.slang       | 65 ++++++++++++++++++++++++++++++++++++++
 source/slang/slang-emit-c-like.cpp | 12 +++++++
 3 files changed, 104 insertions(+), 7 deletions(-)

(limited to 'source')
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 0eaf5cb1a..62039992e 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -520,13 +520,6 @@ for( int C = 2; C <= 4; ++C )
     }
     sb << ");\n";
 
-
-    // initialize from another matrix of the same size
-    //
-    // TODO(tfoley): See comment about how this overlaps
-    // with implicit conversion, in the `vector` case above
-    sb << "__generic<U> __init(matrix<U," << R << ", " << C << ">);\n";
-
     // initialize from a matrix of larger size
     for(int rr = R; rr <= 4; ++rr)
     for( int cc = C; cc <= 4; ++cc )
@@ -537,6 +530,33 @@ for( int C = 2; C <= 4; ++C )
 
     sb << "}\n";
 }
+
+for (int tt = 0; tt < kBaseTypeCount; ++tt)
+{
+    if(kBaseTypes[tt].tag == BaseType::Void) continue;
+    auto toType = kBaseTypes[tt].name;
+}}}}
+__generic<let R : int, let C : int> extension matrix<$(toType),R,C>
+{
+${{{{
+    for (int ff = 0; ff < kBaseTypeCount; ++ff)
+    {
+        if(kBaseTypes[ff].tag == BaseType::Void) continue;
+        if( tt == ff ) continue;
+
+        auto cost = getBaseTypeConversionCost(
+            kBaseTypes[tt],
+            kBaseTypes[ff]);
+        auto fromType = kBaseTypes[ff].name;
+}}}}
+    __implicit_conversion($(cost))
+    __init(matrix<$(fromType),R,C> value);
+${{{{
+    }
+}}}}
+}
+${{{{
+}
 }}}}
 
 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index c89ab8ff9..aeadf8eba 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -990,6 +990,71 @@ __generic<let N : int, let M : int>
 matrix<uint,N,M> asuint(matrix<uint,N,M> x)
 { return x; }
 
+
+// 16-bit bitcast ops (HLSL SM 6.2)
+//
+// TODO: We need to map these to GLSL/SPIR-V
+// operations that don't require an intermediate
+// conversion to fp32.
+
+// Identity cases:
+
+[__unsafeForceInlineEarly] float16_t asfloat16(float16_t value) { return value; }
+[__unsafeForceInlineEarly] vector<float16_t,N> asfloat16<let N : int>(vector<float16_t,N> value) { return value; }
+[__unsafeForceInlineEarly] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return value; }
+
+[__unsafeForceInlineEarly] int16_t asint16(int16_t value) { return value; }
+[__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<int16_t,N> value) { return value; }
+[__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
+
+[__unsafeForceInlineEarly] uint16_t asuint16(uint16_t value) { return value; }
+[__unsafeForceInlineEarly] vector<uint16_t,N> asuint16<let N : int>(vector<uint16_t,N> value) { return value; }
+[__unsafeForceInlineEarly] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
+
+// Signed<->unsigned cases:
+
+[__unsafeForceInlineEarly] int16_t asint16(uint16_t value) { return value; }
+[__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<uint16_t,N> value) { return value; }
+[__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
+
+[__unsafeForceInlineEarly] uint16_t asuint16(int16_t value) { return value; }
+[__unsafeForceInlineEarly] vector<uint16_t,N> asuint16<let N : int>(vector<int16_t,N> value) { return value; }
+[__unsafeForceInlineEarly] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
+
+// Float->unsigned cases:
+
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl, "uint16_t(packHalf2x16(vec2($0, 0.0)))")
+uint16_t asuint16(float16_t value);
+
+vector<uint16_t,N> asuint16<let N : int>(vector<float16_t,N> value)
+{ VECTOR_MAP_UNARY(uint16_t, N, asuint16, value); }
+
+matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<float16_t,R,C> value)
+{ MATRIX_MAP_UNARY(uint16_t, R, C, asuint16, value); }
+
+// Unsigned->float cases:
+
+__target_intrinsic(hlsl)
+__target_intrinsic(glsl, "float16_t(unpackHalf2x16($0).x)")
+float16_t asfloat16(uint16_t value);
+
+vector<float16_t,N> asfloat16<let N : int>(vector<uint16_t,N> value)
+{ VECTOR_MAP_UNARY(float16_t, N, asfloat16, value); }
+
+matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<uint16_t,R,C> value)
+{ MATRIX_MAP_UNARY(float16_t, R, C, asfloat16, value); }
+
+// Float<->signed cases:
+
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly] int16_t asint16(float16_t value) { return asuint16(value); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<float16_t,N> value) { return asuint16(value); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return asuint16(value); }
+
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly] float16_t asfloat16(int16_t value) { return asfloat16(asuint16(value)); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly] vector<float16_t,N> asfloat16<let N : int>(vector<int16_t,N> value) { return asfloat16(asuint16(value)); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return asfloat16(asuint16(value)); }
+
 // Inverse tangent (HLSL SM 1.0)
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index d85085639..4b9c01c55 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -974,6 +974,18 @@ bool CLikeSourceEmitter::shouldFoldInstIntoUseSites(IRInst* inst)
     // for temporary variables.
     auto type = inst->getDataType();
 
+    // We treat instructions that yield a type as things we should *always* fold.
+    //
+    // TODO: In general, at the point where we emit code we do not expect to
+    // find types being constructed locally (inside function bodies), but this
+    // can end up happening because of interaction between different features.
+    // Notably, if a generic function gets force-inlined early in codegen,
+    // then any types it constructs will be inlined into the body of the caller
+    // by default.
+    //
+    if(as<IRType>(inst) || as<IRTypeKind>(type))
+        return true;
+
     // Unwrap any layers of array-ness from the type, so that
     // we can look at the underlying data type, in case we
     // should *never* expose a value of that type
-- 
cgit v1.2.3