CUDA support improvements (#1168)

* Add test result for compile-to-cuda * Add RAII for some CUDA types to simplify usage. * First pass handling of some instrinsics on CUDA (for example transcendentals) * CUDA working with built in intrinsics. * Add missing CUDA prelude intrinsics. * CUDA matches CPU output on simple-cross-compile.slang * First pass at hlsl-scalar-float-intrinsic.slang test. * Fix smoothstep impl on CUDA and CPU. * Fixed step intrinsic on CUDA/CPU. * Added operator[] to Matrix for C++, to allow row access. Needs a fix for CUDA. * Fixed warning on clang build.
author: jsmall-nvidia <jsmall@nvidia.com> 2020-01-21 09:38:10 -0500
committer: GitHub <noreply@github.com> 2020-01-21 09:38:10 -0500
commit: 47392bc72b826b4ad427b703391a77e697735a65 (patch)
tree: 7c541c4295742b765124f42bab9f713276c83580
parent: a8669ade5cb3add8b9ce08e2c3bd96e93190bca8 (diff)
19 files changed, 746 insertions, 154 deletions
diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h
index d89b20f44..e89338186 100644
--- a/prelude/slang-cpp-scalar-intrinsics.h
+++ b/prelude/slang-cpp-scalar-intrinsics.h
@@ -48,6 +48,7 @@ SLANG_FORCE_INLINE float F32_asin(float f) { return ::asinf(f); }
 SLANG_FORCE_INLINE float F32_acos(float f) { return ::acosf(f); }
 SLANG_FORCE_INLINE float F32_atan(float f) { return ::atanf(f); }
 SLANG_FORCE_INLINE float F32_log2(float f) { return ::log2f(f); }
+SLANG_FORCE_INLINE float F32_log(float f) { return ::logf(f); }
 SLANG_FORCE_INLINE float F32_exp2(float f) { return ::exp2f(f); }
 SLANG_FORCE_INLINE float F32_exp(float f) { return ::expf(f); }
 SLANG_FORCE_INLINE float F32_abs(float f) { return ::fabsf(f); }
@@ -66,11 +67,15 @@ SLANG_FORCE_INLINE float F32_max(float a, float b) { return a > b ? a : b; }
 SLANG_FORCE_INLINE float F32_pow(float a, float b) { return ::powf(a, b); }
 SLANG_FORCE_INLINE float F32_fmod(float a, float b) { return ::fmodf(a, b); }
 SLANG_FORCE_INLINE float F32_remainder(float a, float b) { return ::remainderf(a, b); }
-SLANG_FORCE_INLINE float F32_step(float a, float b) { return float(a >= b); }
+SLANG_FORCE_INLINE float F32_step(float a, float b) { return float(b >= a); }
 SLANG_FORCE_INLINE float F32_atan2(float a, float b) { return float(atan2(a, b)); }
 
 // Ternary 
-SLANG_FORCE_INLINE float F32_smoothstep(float min, float max, float x) { return x < min ? min : ((x > max) ? max : x / (max - min)); }
+SLANG_FORCE_INLINE float F32_smoothstep(float min, float max, float x) 
+{ 
+    const float t = x < min ? 0.0f : ((x > max) ? 1.0f : (x - min) / (max - min)); 
+    return t * t * (3.0 - 2.0 * t);
+}
 SLANG_FORCE_INLINE float F32_lerp(float x, float y, float s) { return x + s * (y - x); }
 SLANG_FORCE_INLINE float F32_clamp(float x, float min, float max) { return ( x < min) ? min : ((x > max) ? max : x); }
 SLANG_FORCE_INLINE void F32_sincos(float f, float& outSin, float& outCos) { outSin = F32_sin(f); outCos = F32_cos(f); }
@@ -100,6 +105,7 @@ SLANG_FORCE_INLINE double F64_asin(double f) { return ::asin(f); }
 SLANG_FORCE_INLINE double F64_acos(double f) { return ::acos(f); }
 SLANG_FORCE_INLINE double F64_atan(double f) { return ::atan(f); }
 SLANG_FORCE_INLINE double F64_log2(double f) { return ::log2(f); }
+SLANG_FORCE_INLINE double F64_log(double f) { return ::log(f); }
 SLANG_FORCE_INLINE double F64_exp2(double f) { return ::exp2(f); }
 SLANG_FORCE_INLINE double F64_exp(double f) { return ::exp(f); }
 SLANG_FORCE_INLINE double F64_abs(double f) { return ::fabs(f); }
@@ -118,15 +124,35 @@ SLANG_FORCE_INLINE double F64_max(double a, double b) { return a > b ? a : b; }
 SLANG_FORCE_INLINE double F64_pow(double a, double b) { return ::pow(a, b); }
 SLANG_FORCE_INLINE double F64_fmod(double a, double b) { return ::fmod(a, b); }
 SLANG_FORCE_INLINE double F64_remainder(double a, double b) { return ::remainder(a, b); }
-SLANG_FORCE_INLINE double F64_step(double a, double b) { return double(a >= b); }
+SLANG_FORCE_INLINE double F64_step(double a, double b) { return double(b >= a); }
 SLANG_FORCE_INLINE double F64_atan2(double a, double b) { return atan2(a, b); }
 
 // Ternary 
-SLANG_FORCE_INLINE double F64_smoothstep(double min, double max, double x) { return x < min ? min : ((x > max) ? max : x / (max - min)); }
+SLANG_FORCE_INLINE double F64_smoothstep(double min, double max, double x) 
+{ 
+    const double t = x < min ? 0.0 : ((x > max) ? 1.0 : (x - min) / (max - min)); 
+    return t * t * (3.0 - 2.0 * t);
+}
 SLANG_FORCE_INLINE double F64_lerp(double x, double y, double s) { return x + s * (y - x); }
 SLANG_FORCE_INLINE double F64_clamp(double x, double min, double max) { return (x < min) ? min : ((x > max) ? max : x); }
 SLANG_FORCE_INLINE void F64_sincos(double f, double& outSin, double& outCos) { outSin = F64_sin(f); outCos = F64_cos(f); }
 
+SLANG_FORCE_INLINE void F64_asuint(double d, uint32_t& low, uint32_t& hi)
+{
+    Union64 u;
+    u.d = d;
+    low = uint32_t(u.u);
+    hi = uint32_t(u.u >> 32);
+}
+
+SLANG_FORCE_INLINE void F64_asint(double d, int32_t& low, int32_t& hi)
+{
+    Union64 u;
+    u.d = d;
+    low = int32_t(u.u);
+    hi = int32_t(u.u >> 32);
+}
+
 // ----------------------------- I32 -----------------------------------------
 
 SLANG_FORCE_INLINE int32_t I32_abs(int32_t f) { return (f < 0) ? -f : f; }
@@ -164,23 +190,7 @@ SLANG_FORCE_INLINE double U32_asdouble(uint32_t low, uint32_t hi)
     return u.d;
 }
 
-// ----------------------------- F64 -----------------------------------------
 
-SLANG_FORCE_INLINE void F64_asuint(double d, uint32_t& low, uint32_t& hi)
-{
-    Union64 u;
-    u.d = d;
-    low = uint32_t(u.u);
-    hi = uint32_t(u.u >> 32);
-}
-
-SLANG_FORCE_INLINE void F64_asint(double d, int32_t& low, int32_t& hi)
-{
-    Union64 u;
-    u.d = d;
-    low = int32_t(u.u);
-    hi = int32_t(u.u >> 32);
-}
 
 
 #ifdef SLANG_PRELUDE_NAMESPACE
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h
index 67db607f6..a7ecf5991 100644
--- a/prelude/slang-cpp-types.h
+++ b/prelude/slang-cpp-types.h
@@ -82,6 +82,9 @@ typedef Vector<uint32_t, 4> uint4;
 template <typename T, int ROWS, int COLS>
 struct Matrix
 {
+    Vector<T, COLS>& operator[](int i) { SLANG_PRELUDE_ASSERT(i >= 0 && i < ROWS); return rows[i]; }
+    const Vector<T, COLS>& operator[](int i) const { SLANG_PRELUDE_ASSERT(i >= 0 && i < ROWS); return rows[i]; }
+    
     Vector<T, COLS> rows[ROWS];
 };
 
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 6e20d55c0..8d100b0db 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -2,17 +2,170 @@
 // For now we'll disable any asserts in this prelude
 #define SLANG_PRELUDE_ASSERT(x) 
 
+//
+#define SLANG_FORCE_INLINE inline
+
+#define SLANG_CUDA_CALL __device__ 
+
+#define SLANG_FORCE_INLINE inline
+#define SLANG_INLINE inline
+
 template <typename T, size_t SIZE>
 struct FixedArray
 {
-    __device__ const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; }
-    __device__ T& operator[](size_t index) { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; }
+    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; }
+    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; }
     
     T m_data[SIZE];
 };
 
+// Code generator will generate the specific type
+template <typename T, int ROWS, int COLS>
+struct Matrix;
+
+typedef bool bool1;
+typedef int2 bool2;
+typedef int3 bool3;
+typedef int4 bool4; 
+
+
+typedef signed char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+typedef long long int64_t;
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+union Union32 
+{
+    uint32_t u;
+    int32_t i;
+    float f;
+};
+
+union Union64
+{
+    uint64_t u;
+    int64_t i;
+    double d;
+};
+
+// ----------------------------- F32 -----------------------------------------
+
+// Unary 
+SLANG_CUDA_CALL float F32_rcp(float f) { return 1.0f / f; }
+SLANG_CUDA_CALL float F32_sign(float f) { return ( f == 0.0f) ? f : (( f < 0.0f) ? -1.0f : 1.0f); } 
+SLANG_CUDA_CALL float F32_saturate(float f) { return (f < 0.0f) ? 0.0f : (f > 1.0f) ? 1.0f : f; }
+SLANG_CUDA_CALL float F32_frac(float f) { return f - floorf(f); }
+
+// Binary
+SLANG_CUDA_CALL float F32_min(float a, float b) { return a < b ? a : b; }
+SLANG_CUDA_CALL float F32_max(float a, float b) { return a > b ? a : b; }
+SLANG_CUDA_CALL float F32_step(float a, float b) { return float(b >= a); }
+
+// Ternary 
+SLANG_CUDA_CALL float F32_lerp(float x, float y, float s) { return x + s * (y - x); }
+SLANG_CUDA_CALL void F32_sincos(float f, float& outSin, float& outCos) { sincosf(f, &outSin, &outCos); }
+SLANG_CUDA_CALL float F32_smoothstep(float min, float max, float x) 
+{
+    const float t = x < min ? 0.0f : ((x > max) ? 1.0f : (x - min) / (max - min)); 
+    return t * t * (3.0 - 2.0 * t);
+}
+SLANG_CUDA_CALL float F32_clamp(float x, float min, float max) { return ( x < min) ? min : ((x > max) ? max : x); }
+
+SLANG_CUDA_CALL uint32_t F32_asuint(float f) { Union32 u; u.f = f; return u.u; }
+SLANG_CUDA_CALL int32_t F32_asint(float f) { Union32 u; u.f = f; return u.i; }
+
+// ----------------------------- F64 -----------------------------------------
+
+// Unary 
+SLANG_CUDA_CALL double F64_rcp(double f) { return 1.0 / f; }
+SLANG_CUDA_CALL double F64_sign(double f) { return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); }
+SLANG_CUDA_CALL double F64_saturate(double f) { return (f < 0.0) ? 0.0 : (f > 1.0) ? 1.0 : f; }
+SLANG_CUDA_CALL double F64_frac(double f) { return f - floor(f); }
+
+// Binary
+SLANG_CUDA_CALL double F64_min(double a, double b) { return a < b ? a : b; }
+SLANG_CUDA_CALL double F64_max(double a, double b) { return a > b ? a : b; }
+SLANG_CUDA_CALL double F64_step(double a, double b) { return double(b >= a); }
+
+// Ternary 
+SLANG_CUDA_CALL double F64_lerp(double x, double y, double s) { return x + s * (y - x); }
+SLANG_CUDA_CALL void F64_sincos(double f, double& outSin, double& outCos) { sincos(f, &outSin, &outCos); }
+SLANG_CUDA_CALL double F64_smoothstep(double min, double max, double x) 
+{ 
+    const double t = x < min ? 0.0 : ((x > max) ? 1.0 : (x - min) / (max - min)); 
+    return t * t * (3.0 - 2.0 * t);
+}
+SLANG_CUDA_CALL double F64_clamp(double x, double min, double max) { return (x < min) ? min : ((x > max) ? max : x); }
+
+SLANG_CUDA_CALL void F64_asuint(double d, uint32_t& low, uint32_t& hi)
+{
+    Union64 u;
+    u.d = d;
+    low = uint32_t(u.u);
+    hi = uint32_t(u.u >> 32);
+}
+
+SLANG_CUDA_CALL void F64_asint(double d, int32_t& low, int32_t& hi)
+{
+    Union64 u;
+    u.d = d;
+    low = int32_t(u.u);
+    hi = int32_t(u.u >> 32);
+}
+
+// ----------------------------- I32 -----------------------------------------
+
+// Unary
+SLANG_CUDA_CALL int32_t I32_abs(int32_t f) { return (f < 0) ? -f : f; }
+
+// Binary
+SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b) { return a < b ? a : b; }
+SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b) { return a > b ? a : b; }
+
+// Ternary 
+SLANG_CUDA_CALL int32_t I32_clamp(int32_t x, int32_t min, int32_t max) { return ( x < min) ? min : ((x > max) ? max : x); }
+
+SLANG_CUDA_CALL float I32_asfloat(int32_t x) { Union32 u; u.i = x; return u.f; }
+SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x) { return uint32_t(x); }
+SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi )
+{
+    Union64 u;
+    u.u = (uint64_t(hi) << 32) | uint32_t(low);
+    return u.d;
+}
+
+// ----------------------------- U32 -----------------------------------------
+
+// Unary 
+SLANG_CUDA_CALL uint32_t U32_abs(uint32_t f) { return f; }
+
+// Binary
+SLANG_CUDA_CALL uint32_t U32_min(uint32_t a, uint32_t b) { return a < b ? a : b; }
+SLANG_CUDA_CALL uint32_t U32_max(uint32_t a, uint32_t b) { return a > b ? a : b; }
+
+// Ternary 
+SLANG_CUDA_CALL uint32_t U32_clamp(uint32_t x, uint32_t min, uint32_t max) { return ( x < min) ? min : ((x > max) ? max : x); }
+
+SLANG_CUDA_CALL float U32_asfloat(uint32_t x) { Union32 u; u.u = x; return u.f; }
+SLANG_CUDA_CALL uint32_t U32_asint(int32_t x) { return uint32_t(x); } 
+
+SLANG_CUDA_CALL double U32_asdouble(uint32_t low, uint32_t hi)
+{
+    Union64 u;
+    u.u = (uint64_t(hi) << 32) | low;
+    return u.d;
+}
+
+/* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */
+
+
 /* Type that defines the uniform entry point params. The actual content of this type is dependent on the entry point parameters, and can be
 found via reflection or defined such that it matches the shader appropriately.
 */
 struct UniformEntryPointParams;
-struct UniformState;
-\ No newline at end of file
+struct UniformState;
diff --git a/source/core/slang-nvrtc-compiler.cpp b/source/core/slang-nvrtc-compiler.cpp
index 1bb2669b8..bc7d1f4f6 100644
--- a/source/core/slang-nvrtc-compiler.cpp
+++ b/source/core/slang-nvrtc-compiler.cpp
@@ -276,7 +276,10 @@ SlangResult NVRTCDownstreamCompiler::compile(const CompileOptions& options, RefP
         cmdLine.addArg("-I");
         cmdLine.addArg(include);
     }
-    
+
+    {
+        cmdLine.addArg("-std=c++14");
+    }
 
     nvrtcProgram program = nullptr;
     nvrtcResult res = m_nvrtcCreateProgram(&program, options.sourceContents.getBuffer(), options.sourceContentsPath.getBuffer(), 0, nullptr, nullptr);
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 7c88e530f..22a846eb7 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1240,7 +1240,9 @@ __generic<T : __BuiltinFloatingPointType, let N : int> vector<T,N> sin(vector<T,
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int> matrix<T,N,M> sin(matrix<T,N,M> x);
 
 // Sine and cosine
-__generic<T : __BuiltinFloatingPointType> void sincos(T x, out T s, out T c);
+__generic<T : __BuiltinFloatingPointType>
+__target_intrinsic(glsl, "$1 = sin($0); $2 = cos($0);")
+void sincos(T x, out T s, out T c);
 __generic<T : __BuiltinFloatingPointType, let N : int> void sincos(vector<T,N> x, out vector<T,N> s, out vector<T,N> c);
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int> void sincos(matrix<T,N,M> x, out matrix<T,N,M> s, out matrix<T,N,M> c);
 
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index db0fc2285..0abae51b0 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -1316,7 +1316,9 @@ SLANG_RAW("__generic<T : __BuiltinFloatingPointType, let N : int> vector<T,N> si
 SLANG_RAW("__generic<T : __BuiltinFloatingPointType, let N : int, let M : int> matrix<T,N,M> sin(matrix<T,N,M> x);\n")
 SLANG_RAW("\n")
 SLANG_RAW("// Sine and cosine\n")
-SLANG_RAW("__generic<T : __BuiltinFloatingPointType> void sincos(T x, out T s, out T c);\n")
+SLANG_RAW("__generic<T : __BuiltinFloatingPointType>\n")
+SLANG_RAW("__target_intrinsic(glsl, \"$1 = sin($0); $2 = cos($0);\")\n")
+SLANG_RAW("void sincos(T x, out T s, out T c);\n")
 SLANG_RAW("__generic<T : __BuiltinFloatingPointType, let N : int> void sincos(vector<T,N> x, out vector<T,N> s, out vector<T,N> c);\n")
 SLANG_RAW("__generic<T : __BuiltinFloatingPointType, let N : int, let M : int> void sincos(matrix<T,N,M> x, out matrix<T,N,M> s, out matrix<T,N,M> c);\n")
 SLANG_RAW("\n")
@@ -1577,7 +1579,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
 
     sb << "};\n";
 }
-SLANG_RAW("#line 1504 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1506 \"hlsl.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp
index 848ebd6e0..db442d131 100644
--- a/source/slang/slang-emit-cpp.cpp
+++ b/source/slang/slang-emit-cpp.cpp
@@ -403,16 +403,16 @@ SlangResult CPPSourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, S
         {
             auto vecType = static_cast<IRVectorType*>(type);
             auto vecCount = int(GetIntVal(vecType->getElementCount()));
-            const IROp elemType = vecType->getElementType()->op;
+            auto elemType = vecType->getElementType();
 
-            if (target == CodeGenTarget::CPPSource)
+            if (target == CodeGenTarget::CPPSource || target == CodeGenTarget::CUDASource)
             {
-                out << "Vector<" << getBuiltinTypeName(elemType) << ", " << vecCount << ">";
+                out << "Vector<" << _getTypeName(elemType) << ", " << vecCount << ">";
             }
             else
             {             
                 out << "Vec";
-                UnownedStringSlice postFix = _getCTypeVecPostFix(elemType);
+                UnownedStringSlice postFix = _getCTypeVecPostFix(elemType->op);
 
                 out << postFix;
                 if (postFix.size() > 1)
@@ -431,9 +431,9 @@ SlangResult CPPSourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, S
             const auto rowCount = int(GetIntVal(matType->getRowCount()));
             const auto colCount = int(GetIntVal(matType->getColumnCount()));
 
-            if (target == CodeGenTarget::CPPSource)
+            if (target == CodeGenTarget::CPPSource || target == CodeGenTarget::CUDASource)
             {
-                out << "Matrix<" << getBuiltinTypeName(elementType->op) << ", " << rowCount << ", " << colCount << ">";
+                out << "Matrix<" << _getTypeName(elementType) << ", " << rowCount << ", " << colCount << ">";
             }
             else
             {
@@ -800,6 +800,8 @@ void CPPSourceEmitter::_emitSignature(const UnownedStringSlice& funcName, const
     const int paramsCount = int(funcType->getParamCount());
     IRType* retType = specOp->returnType;
 
+    emitSpecializedOperationDefinitionPreamble(specOp);
+
     SourceWriter* writer = getSourceWriter();
 
     emitType(retType);
@@ -900,9 +902,19 @@ void CPPSourceEmitter::_emitCrossDefinition(const UnownedStringSlice& funcName,
     writer->indent();
 
     writer->emit("return ");
-    emitType(specOp->returnType);
-    writer->emit("{ a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x }; \n");
+    if (m_target == CodeGenTarget::CUDASource)
+    {
+        m_writer->emit("make_");
+        emitType(specOp->returnType);
+        writer->emit("( a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x ); \n");
+    }
+    else
+    {
+        emitType(specOp->returnType);
+        writer->emit("{ a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x }; \n");
+    }
 
+    
     writer->dedent();
     writer->emit("}\n\n");
 }
@@ -912,7 +924,7 @@ UnownedStringSlice CPPSourceEmitter::_getAndEmitSpecializedOperationDefinition(H
     HLSLIntrinsic intrinsic;
     m_intrinsicSet.calcIntrinsic(op, retType, argTypes, argCount, intrinsic);
     auto specOp = m_intrinsicSet.add(intrinsic);
-    emitSpecializedOperationDefinition(specOp);
+    _maybeEmitSpecializedOperationDefinition(specOp);
     return  _getFuncName(specOp);
 }
 
@@ -1184,15 +1196,19 @@ void CPPSourceEmitter::_emitReflectDefinition(const UnownedStringSlice& funcName
     writer->emit("}\n\n");
 }
 
-void CPPSourceEmitter::emitSpecializedOperationDefinition(const HLSLIntrinsic* specOp)
+void CPPSourceEmitter::_maybeEmitSpecializedOperationDefinition(const HLSLIntrinsic* specOp)
 {
-    typedef HLSLIntrinsic::Op Op;
-
     // Check if it's been emitted already, if not add it.
     if (!m_intrinsicEmitted.Add(specOp))
     {
         return;
     }
+    emitSpecializedOperationDefinition(specOp);
+}
+
+void CPPSourceEmitter::emitSpecializedOperationDefinition(const HLSLIntrinsic* specOp)
+{
+    typedef HLSLIntrinsic::Op Op;
 
     switch (specOp->op)
     {
@@ -1318,8 +1334,8 @@ void CPPSourceEmitter::emitCall(const HLSLIntrinsic* specOp, IRInst* inst, const
                     if (IRBasicType::isaImpl(retType->op))
                     {
                         SLANG_ASSERT(numOperands == 1);
-
-                        writer->emit(getBuiltinTypeName(retType->op));
+                        
+                        writer->emit(_getTypeName(retType));
                         writer->emitChar('(');
 
                         emitOperand(operands[0].get(), getInfo(EmitOp::General));
@@ -1432,16 +1448,29 @@ HLSLIntrinsic* CPPSourceEmitter::_addIntrinsic(HLSLIntrinsic::Op op, IRType* ret
     return addedIntrinsic;
 }
 
-StringSlicePool::Handle CPPSourceEmitter::_calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicType* type)
+SlangResult CPPSourceEmitter::calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicType* type, StringBuilder& outBuilder)
 {
-    StringBuilder builder;
-    builder << _getTypePrefix(type->op) << "_" << HLSLIntrinsic::getInfo(op).funcName;
-    return m_slicePool.add(builder);
+    outBuilder << _getTypePrefix(type->op) << "_" << HLSLIntrinsic::getInfo(op).funcName;
+    return SLANG_OK;   
 }
 
 UnownedStringSlice CPPSourceEmitter::_getScalarFuncName(HLSLIntrinsic::Op op, IRBasicType* type)
 {
-    return m_slicePool.getSlice(_calcScalarFuncName(op, type));
+    /* TODO(JS): This is kind of fast and loose. That we don't know all the parameters that are taken or
+    what the return type is, so we can't add to the HLSLIntrinsic map - we just generate the scalar
+    function name and use it (whilst also adding to the slice pool, so that we can return an
+    unowned slice). */
+
+    StringBuilder builder;
+    if (SLANG_FAILED(calcScalarFuncName(op, type, builder)))
+    {
+        SLANG_ASSERT(!"Unable to create scalar function name");
+        return UnownedStringSlice();
+    }
+
+    // Add to the pool. 
+    auto handle = m_slicePool.add(builder);
+    return m_slicePool.getSlice(handle);
 }
 
 UnownedStringSlice CPPSourceEmitter::_getFuncName(const HLSLIntrinsic* specOp)
@@ -1452,14 +1481,22 @@ UnownedStringSlice CPPSourceEmitter::_getFuncName(const HLSLIntrinsic* specOp)
         return m_slicePool.getSlice(handle);
     }
 
-    handle = _calcFuncName(specOp);
+    StringBuilder builder;
+    if (SLANG_FAILED(calcFuncName(specOp, builder)))
+    {
+        SLANG_ASSERT(!"Unable to create function name");
+        // Return an empty slice, as an error...
+        return UnownedStringSlice();
+    }
+
+    handle = m_slicePool.add(builder);
     m_intrinsicNameMap.Add(specOp, handle);
 
     SLANG_ASSERT(handle != StringSlicePool::kNullHandle);
     return m_slicePool.getSlice(handle);
 }
 
-StringSlicePool::Handle CPPSourceEmitter::_calcFuncName(const HLSLIntrinsic* specOp)
+SlangResult CPPSourceEmitter::calcFuncName(const HLSLIntrinsic* specOp, StringBuilder& outBuilder)
 {
     typedef HLSLIntrinsic::Op Op;
 
@@ -1468,7 +1505,7 @@ StringSlicePool::Handle CPPSourceEmitter::_calcFuncName(const HLSLIntrinsic* spe
         IRType* paramType = specOp->signatureType->getParamType(0);
         IRBasicType* basicType = as<IRBasicType>(paramType);
         SLANG_ASSERT(basicType);
-        return _calcScalarFuncName(specOp->op, basicType);
+        return calcScalarFuncName(specOp->op, basicType, outBuilder);
     }
     else
     {
@@ -1483,14 +1520,10 @@ StringSlicePool::Handle CPPSourceEmitter::_calcFuncName(const HLSLIntrinsic* spe
                 IRType* dstType = signatureType->getParamType(0);
                 //IRType* srcType = signatureType->getParamType(1);
 
-                StringBuilder builder;
-                builder << "convert_";
+                outBuilder << "convert_";
                 // I need a function that is called that will construct this
-                if (SLANG_FAILED(calcTypeName(dstType, CodeGenTarget::CSource, builder)))
-                {
-                    return StringSlicePool::kNullHandle;
-                }
-                return m_slicePool.add(builder);
+                SLANG_RETURN_ON_FAIL(calcTypeName(dstType, CodeGenTarget::CSource, outBuilder));
+                return SLANG_OK;
             }
             case Op::ConstructFromScalar:
             {
@@ -1500,22 +1533,20 @@ StringSlicePool::Handle CPPSourceEmitter::_calcFuncName(const HLSLIntrinsic* spe
 
                 IRType* dstType = signatureType->getParamType(0);
                 
-                StringBuilder builder;
-                builder << "constructFromScalar_";
+                outBuilder << "constructFromScalar_";
                 // I need a function that is called that will construct this
-                if (SLANG_FAILED(calcTypeName(dstType, CodeGenTarget::CSource, builder)))
-                {
-                    return StringSlicePool::kNullHandle;
-                }
-                return m_slicePool.add(builder);
+                SLANG_RETURN_ON_FAIL(calcTypeName(dstType, CodeGenTarget::CSource, outBuilder));
+                return SLANG_OK;
             }
             case Op::GetAt:
             {
-                return m_slicePool.add(UnownedStringSlice::fromLiteral("getAt"));
+                outBuilder << "getAt";
+                return SLANG_OK;
             }
             case Op::SetAt:
             {
-                return m_slicePool.add(UnownedStringSlice::fromLiteral("setAt"));
+                outBuilder << "setAt";
+                return SLANG_OK;
             }
             default: break;
         }
@@ -1525,10 +1556,15 @@ StringSlicePool::Handle CPPSourceEmitter::_calcFuncName(const HLSLIntrinsic* spe
         {
             if (!_isOperator(info.funcName))
             {
-                return m_slicePool.add(info.funcName);
+                // If there is a standard default name, just use that
+                outBuilder << info.funcName;
+                return SLANG_OK;
             }
         }
-        return m_slicePool.add(info.name);
+
+        // Just use the name of the Op. This is probably wrong, but gives a pretty good idea of what the desired (presumably missing) op is.
+        outBuilder << info.name;
+        return SLANG_OK;
     }
 }
 
@@ -1993,7 +2029,7 @@ void CPPSourceEmitter::emitPreprocessorDirectivesImpl()
     // Emit all the intrinsics that were used
     for (const auto& keyValue : m_intrinsicNameMap)
     {
-        emitSpecializedOperationDefinition(keyValue.Key);
+        _maybeEmitSpecializedOperationDefinition(keyValue.Key);
     }
 }
 
diff --git a/source/slang/slang-emit-cpp.h b/source/slang/slang-emit-cpp.h
index df1dec380..12bc0939e 100644
--- a/source/slang/slang-emit-cpp.h
+++ b/source/slang/slang-emit-cpp.h
@@ -80,8 +80,12 @@ protected:
 
     // Replaceable for classes derived from CPPSourceEmitter
     virtual SlangResult calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out);
+    virtual SlangResult calcFuncName(const HLSLIntrinsic* specOp, StringBuilder& out);
+    virtual SlangResult calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicType* type, StringBuilder& outBuilder);
+    virtual void emitSpecializedOperationDefinitionPreamble(const HLSLIntrinsic* specOp) { SLANG_UNUSED(specOp); }
 
-    
+
+    void _maybeEmitSpecializedOperationDefinition(const HLSLIntrinsic* specOp);
 
     void emitIntrinsicCallExpr(
         IRCall*                         inst,
@@ -115,11 +119,9 @@ protected:
     static TypeDimension _getTypeDimension(IRType* type, bool vecSwap);
     static void _emitAccess(const UnownedStringSlice& name, const TypeDimension& dimension, int row, int col, SourceWriter* writer);
 
-    StringSlicePool::Handle _calcScalarFuncName(HLSLIntrinsic::Op, IRBasicType* type);
     UnownedStringSlice _getScalarFuncName(HLSLIntrinsic::Op operation, IRBasicType* scalarType);
 
     UnownedStringSlice _getFuncName(const HLSLIntrinsic* specOp);
-    StringSlicePool::Handle _calcFuncName(const HLSLIntrinsic* specOp);
 
     UnownedStringSlice _getTypeName(IRType* type);
     
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index 39a25aafa..c72b9125a 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -10,6 +10,23 @@
 
 namespace Slang {
 
+static bool _isSingleNameBasicType(IROp op)
+{
+    switch (op)
+    {
+        case kIROp_Int64Type:   
+        case kIROp_UInt8Type: 
+        case kIROp_UInt16Type:
+        case kIROp_UIntType: 
+        case kIROp_UInt64Type:
+        {
+            return false;
+        }
+        default: return true;
+
+    }
+}
+
 /* static */ UnownedStringSlice CUDASourceEmitter::getBuiltinTypeName(IROp op)
 {
     switch (op)
@@ -110,10 +127,93 @@ SlangResult CUDASourceEmitter::_calcCUDATextureTypeName(IRTextureTypeBase* texTy
     return SLANG_OK;
 }
 
-void CUDASourceEmitter::emitSpecializedOperationDefinition(const HLSLIntrinsic* specOp)
+
+SlangResult CUDASourceEmitter::calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicType* type, StringBuilder& outBuilder)
 {
-    m_writer->emit("__device__ ");
-    Super::emitSpecializedOperationDefinition(specOp);
+    typedef HLSLIntrinsic::Op Op;
+
+    UnownedStringSlice funcName;
+    
+    switch (op)
+    {
+        case Op::Sin:
+        case Op::Cos:
+        case Op::Tan:
+        case Op::ArcSin:
+        case Op::ArcCos:
+        case Op::ArcTan:
+        case Op::ArcTan2:
+        case Op::Floor:
+        case Op::Ceil:
+        case Op::FMod:
+        case Op::Exp2:
+        case Op::Exp:
+        case Op::Log:
+        case Op::Log2:
+        case Op::Log10:
+        case Op::FRem:
+        case Op::Sqrt:
+        case Op::RecipSqrt:
+        case Op::Pow:
+        case Op::Trunc:
+        {
+            if (type->op == kIROp_FloatType || type->op == kIROp_DoubleType)
+            {
+                funcName = HLSLIntrinsic::getInfo(op).funcName;
+            }
+            break;
+        }
+        case Op::Max:
+        case Op::Min:
+        case Op::Abs:
+        {
+            // There are only floating point built in versions of these, prefixed with f
+            if (type->op == kIROp_FloatType || type->op == kIROp_DoubleType)
+            {
+                outBuilder << "f";
+                outBuilder << HLSLIntrinsic::getInfo(op).funcName;
+
+                if (type->op == kIROp_FloatType)
+                {
+                    outBuilder << "f";
+                }
+                return SLANG_OK;
+            }
+            break;
+        }
+
+        default: break;
+    }
+
+    if (funcName.size())
+    {
+        outBuilder << funcName;
+        if (type->op == kIROp_FloatType)
+        {
+            outBuilder << "f";
+        }
+        return SLANG_OK;
+    }
+
+    // Missing ones:
+    // 
+    // sincos - the built in uses pointer, so we'll just define in prelude
+    // rcp
+    // sign
+    // saturate
+    // frac
+    // smoothstep
+    // lerp
+    // clamp
+    // step
+    // 
+    // For integer types
+    // abs
+    // min
+    // max
+
+    // Defer to the supers impl
+    return Super::calcScalarFuncName(op, type, outBuilder);
 }
 
 SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out)
@@ -278,73 +378,73 @@ void CUDASourceEmitter::emitOperandImpl(IRInst* inst, EmitOpInfo const& outerPre
     Super::emitOperandImpl(inst, outerPrec);
 }
 
-bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec)
+void CUDASourceEmitter::emitCall(const HLSLIntrinsic* specOp, IRInst* inst, const IRUse* operands, int numOperands, const EmitOpInfo& inOuterPrec)
 {
-    switch (inst->op)
+    switch (specOp->op)
     {
-        case kIROp_Construct:
-        case kIROp_makeVector:
+        case HLSLIntrinsic::Op::Init:
         {
-            if (inst->getOperandCount() == 1)
-            {
-                EmitOpInfo outerPrec = inOuterPrec;
-                bool needClose = false;
+            // For CUDA vector types we construct with make_
 
-                auto prec = getInfo(EmitOp::Prefix);
-                needClose = maybeEmitParens(outerPrec, prec);
+            auto writer = m_writer;
 
-                // Need to emit as cast for HLSL
-                m_writer->emit("(");
-                emitType(inst->getDataType());
-                m_writer->emit(") ");
-                emitOperand(inst->getOperand(0), rightSide(outerPrec, prec));
+            IRType* retType = specOp->returnType;
 
-                maybeCloseParens(needClose);
-                // Handled
-                return true;
-            }
-            else
+            switch (retType->op)
             {
-                m_writer->emit("make_");
-                m_writer->emit(_getTypeName(inst->getDataType()));
-                emitArgs(inst);
-                return true;
+                case kIROp_VectorType:
+                {
+                    // Get the type name
+                    writer->emit("make_");
+                    emitType(retType);
+                    writer->emitChar('(');
+
+                    for (int i = 0; i < numOperands; ++i)
+                    {
+                        if (i > 0)
+                        {
+                            writer->emit(", ");
+                        }
+                        emitOperand(operands[i].get(), getInfo(EmitOp::General));
+                    }
+
+                    writer->emitChar(')');
+                    return;
+                }
+                default: break; 
             }
             break;
         }
-        case kIROp_MakeMatrix:
-        {
-            return false;
-        }
-        case kIROp_BitCast:
+        default: break;
+    }
+
+    return Super::emitCall(specOp, inst, operands, numOperands, inOuterPrec);
+}
+
+bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec)
+{
+    switch(inst->op)
+    {
+        case kIROp_Construct:
         {
-            auto toType = extractBaseType(inst->getDataType());
-            switch (toType)
+            // Simple constructor call
+            // On CUDA some of the built in types can't be used as constructors directly
+
+            IRType* type = inst->getDataType();
+            if (auto basicType = as<IRBasicType>(type) && !_isSingleNameBasicType(type->op))
             {
-                default:
-                    m_writer->emit("/* unhandled */");
-                    break;
-                case BaseType::UInt:
-                    break;
-                case BaseType::Int:
-                    m_writer->emit("(");
-                    emitType(inst->getDataType());
-                    m_writer->emit(")");
-                    break;
-                case BaseType::Float:
-                    m_writer->emit("asfloat");
-                    break;
+                m_writer->emit("(");
+                emitType(inst->getDataType());
+                m_writer->emit(")");
+                emitArgs(inst);
+                return true;
             }
-
-            m_writer->emit("(");
-            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-            m_writer->emit(")");
-            return true;
+            break;
         }
         default: break;
     }
-    // Not handled
-    return false;
+
+    return Super::tryEmitInstExprImpl(inst, inOuterPrec);
 }
 
 void CUDASourceEmitter::emitLayoutDirectivesImpl(TargetRequest* targetReq)
@@ -398,7 +498,7 @@ void CUDASourceEmitter::emitSimpleFuncParamsImpl(IRFunc* func)
 
 void CUDASourceEmitter::emitSimpleFuncImpl(IRFunc* func)
 {
-    // Mark as run on device. Don't need to worry about entry point, as that is output separtely to call the __device_ implementation
+    // Mark as run on device. Don't need to worry about entry point, as that is output separately to call the __device_ implementation
     m_writer->emit("__device__ ");    
 
     CLikeSourceEmitter::emitSimpleFuncImpl(func);
@@ -444,7 +544,7 @@ void CUDASourceEmitter::emitPreprocessorDirectivesImpl()
     // Emit all the intrinsics that were used
     for (const auto& keyValue : m_intrinsicNameMap)
     {
-        emitSpecializedOperationDefinition(keyValue.Key);
+        _maybeEmitSpecializedOperationDefinition(keyValue.Key);
     }
 }
 
diff --git a/source/slang/slang-emit-cuda.h b/source/slang/slang-emit-cuda.h
index c3c88e156..e75eb4e88 100644
--- a/source/slang/slang-emit-cuda.h
+++ b/source/slang/slang-emit-cuda.h
@@ -46,6 +46,7 @@ protected:
     virtual void emitVarDecorationsImpl(IRInst* varDecl) SLANG_OVERRIDE;
     virtual void emitMatrixLayoutModifiersImpl(IRVarLayout* layout) SLANG_OVERRIDE;
     virtual void emitOperandImpl(IRInst* inst, EmitOpInfo const&  outerPrec) SLANG_OVERRIDE;
+    virtual void emitCall(const HLSLIntrinsic* specOp, IRInst* inst, const IRUse* operands, int numOperands, const EmitOpInfo& inOuterPrec) SLANG_OVERRIDE;
 
     //virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) SLANG_OVERRIDE;
     virtual bool tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) SLANG_OVERRIDE;
@@ -56,7 +57,8 @@ protected:
 
     // CPPSourceEmitter overrides 
     virtual SlangResult calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out) SLANG_OVERRIDE;
-    virtual void emitSpecializedOperationDefinition(const HLSLIntrinsic* specOp) SLANG_OVERRIDE;
+    virtual SlangResult calcScalarFuncName(HLSLIntrinsic::Op op, IRBasicType* type, StringBuilder& outBuilder) SLANG_OVERRIDE;
+    virtual void emitSpecializedOperationDefinitionPreamble(const HLSLIntrinsic* specOp) SLANG_OVERRIDE { SLANG_UNUSED(specOp); m_writer->emit("__device__ "); }
 
     SlangResult _calcCUDATextureTypeName(IRTextureTypeBase* texType, StringBuilder& outName);
 };
diff --git a/source/slang/slang-hlsl-intrinsic-set.h b/source/slang/slang-hlsl-intrinsic-set.h
index 5e01c0599..ee17dd571 100644
--- a/source/slang/slang-hlsl-intrinsic-set.h
+++ b/source/slang/slang-hlsl-intrinsic-set.h
@@ -96,6 +96,10 @@ just constructXXXFromScalar. Would be good if there was a suitable name to encom
         x(Exp2, "exp2", 1) \
         x(Exp, "exp", 1) \
         \
+        x(Log, "log", 1) \
+        x(Log2, "log2", 1) \
+        x(Log10, "log10", 1) \
+        \
         x(Abs, "abs", 1) \
         \
         x(Min, "min", 2) \
diff --git a/tests/compute/hlsl-scalar-float-intrinsic.slang b/tests/compute/hlsl-scalar-float-intrinsic.slang
new file mode 100644
index 000000000..213db4b23
--- /dev/null
+++ b/tests/compute/hlsl-scalar-float-intrinsic.slang
@@ -0,0 +1,88 @@
+//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    int idx = int(dispatchThreadID.x);
+    
+    float f = idx * (1.0f / (4.0f - 1));
+
+    int it = 0;
+    float ft = 0.0f;
+    
+    // fmod
+    // ft += f % 0.5f;
+    
+    ft += sin(f);
+    ft += cos(f);
+    ft += tan(f);
+    
+    ft += asin(f);
+    ft += acos(f);
+    ft += atan(f);
+    
+    ft += atan2(f, 2.0);
+    
+    {
+        float sf, cf;
+        sincos(f, sf, cf);
+        
+        ft += sf;
+        ft += cf;
+    }
+    
+    ft += rcp(1.0 + f);
+    ft += sign(f - 0.5);
+    
+    ft += saturate(f * 4 - 2.0);
+    
+    ft += sqrt(f);
+    ft += rsqrt(1.0f + f);
+    
+    ft += exp2(f);
+    ft += exp(f);
+        
+        
+    ft += frac(f * 3);
+//    ft += ceil(f * 5 - 3);
+    
+    ft += floor(f * 10 - 7);
+    ft += trunc(f * 7);
+     
+   
+    ft += log(f + 10.0);
+    ft += log2(f * 3 + 2);
+    
+    // ft += log10(f * 10 + 4);
+    
+    ft += abs(f * 4 - 2.0f);
+    
+    ft += min(0.5, f);
+    ft += max(f, 0.75);
+
+    ft += pow(0.5, f);
+
+    ft += smoothstep(0.2, 0.7, f);
+    ft += lerp(-100, 100, f);
+
+
+    ft += clamp(f, 0.1, 0.3);
+
+    ft += step(f, 0.5);
+
+    int vi = asint(f - f) + idx;
+    
+    ft += float(vi);
+    
+    uint vu = asuint(f);
+    ft += asfloat(vu);
+    
+    outputBuffer[idx] = int(ft * 16);
+}
+\ No newline at end of file
diff --git a/tests/compute/hlsl-scalar-float-intrinsic.slang.expected.txt b/tests/compute/hlsl-scalar-float-intrinsic.slang.expected.txt
new file mode 100644
index 000000000..04d17659c
--- /dev/null
+++ b/tests/compute/hlsl-scalar-float-intrinsic.slang.expected.txt
@@ -0,0 +1,4 @@
+FFFFFA3C
+FFFFFEEC
+3CA
+8C1
diff --git a/tests/compute/transcendental.slang b/tests/compute/transcendental.slang
index aa40da752..bde43ee38 100644
--- a/tests/compute/transcendental.slang
+++ b/tests/compute/transcendental.slang
@@ -1,3 +1,4 @@
+//TEST(compute):COMPARE_COMPUTE:-cuda
 //TEST(compute):COMPARE_COMPUTE:-cpu
 //TEST(compute):COMPARE_COMPUTE:
 //TEST(compute,vulcan):COMPARE_COMPUTE:-vk
diff --git a/tests/cross-compile/simple-cross-compile.slang b/tests/cross-compile/simple-cross-compile.slang
new file mode 100644
index 000000000..e5fe9d3cc
--- /dev/null
+++ b/tests/cross-compile/simple-cross-compile.slang
@@ -0,0 +1,109 @@
+//TEST(compute):COMPARE_COMPUTE:-cpu
+//TEST(compute):COMPARE_COMPUTE:-cuda
+
+enum Color
+{
+    Red,
+    Green = 2,
+    Blue,
+}
+
+int test(int val)
+{
+    Color c = Color.Red;
+
+    if(val > 1)
+    {
+        c = Color.Green;
+    }
+
+    if(c == Color.Red)
+    {
+        if(val & 1)
+        {
+            c = Color.Blue;
+        }
+    }
+
+    switch(c)
+    {
+    case Color.Red:
+        val = 1;
+        break;
+
+    case Color.Green:
+        val = 2;
+        break;
+
+    case Color.Blue:
+        val = 3;
+        break;
+
+    default:
+        val = -1;
+        break;
+    }
+
+    return (val << 4) + int(c);
+}
+
+float sum(float a[3])
+{
+    float total = a[0];
+    for (int i = 1; i < 3; ++i)
+    {
+        total += a[i];
+    }
+    return total;
+}
+
+struct Thing
+{
+    int a;
+    float b;
+};
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    uint tid = dispatchThreadID.x;
+
+    Thing thing = { 10, -1.0 };
+
+    float array[3] = { thing.a, 2, 3};
+
+    float anotherArray[] = { 1, 2, 5 };
+
+    array[0] += anotherArray[1];
+
+    matrix<float, 2, 3> mat = { { sum(array), 1, 2 }, { 3, 4, 5} };
+    vector<float, 2> vec = { float(tid + 1), float(tid + 2) };
+
+    vec += float2(7, 11);
+
+    vector<float, 3> vec2 = max(sin(mul(vec, mat)), float3(1, 2, -1));
+    vector<float, 3> vec3 = mul(vec, mat);
+    
+    float3 vec4 = lerp(vec2, vec3, float3(tid * (1.0f / 4), 1, 1));
+    
+    float3 crossVec = normalize(cross(vec4, vec4) + float3(2, 3, 1));
+    
+    vec2.x = fmod(crossVec.y, crossVec.x);
+    
+    vec2 = fmod(vec2, crossVec);
+    
+    vec2 += (-vec2.zyx) * 2 + crossVec * length(crossVec) + reflect(vec4, normalize(crossVec));
+    
+    vector<bool, 3> z = vec2 > 0;
+
+    int val = (int(tid) + (any(z) ? 1 : 0) + (all(z) ? 2 : 0)) % 100;
+    
+    val = asint(asfloat(asuint(asfloat(val))));
+    
+    val = test(val);
+
+    outputBuffer[tid] = val + int(dot(vec2, vec4));
+}
+\ No newline at end of file
diff --git a/tests/cross-compile/simple-cross-compile.slang.expected.txt b/tests/cross-compile/simple-cross-compile.slang.expected.txt
new file mode 100644
index 000000000..c99b80180
--- /dev/null
+++ b/tests/cross-compile/simple-cross-compile.slang.expected.txt
@@ -0,0 +1,4 @@
+147
+FFFFE732
+FFFFCCE6
+FFFFB6C7
diff --git a/tests/cuda/compile-to-cuda.slang b/tests/cuda/compile-to-cuda.slang
index be7d775bd..d7399d469 100644
--- a/tests/cuda/compile-to-cuda.slang
+++ b/tests/cuda/compile-to-cuda.slang
@@ -8,12 +8,10 @@ RWStructuredBuffer<int> outputBuffer : register(u0);
 [numthreads(4, 1, 1)]
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
-
 	int tid = int(dispatchThreadID.x);
     
     outputBuffer[tid * 4] = tid;
     outputBuffer[tid * 4 + 1] = tid + 1;
     outputBuffer[tid * 4 + 2] = tid + 2; 
     outputBuffer[tid * 4 + 3] = tid + 3;
- 
 }
diff --git a/tests/cuda/compile-to-cuda.slang.expected.txt b/tests/cuda/compile-to-cuda.slang.expected.txt
new file mode 100644
index 000000000..27a9fcd89
--- /dev/null
+++ b/tests/cuda/compile-to-cuda.slang.expected.txt
@@ -0,0 +1,16 @@
+0
+1
+2
+3
+1
+2
+3
+4
+2
+3
+4
+5
+3
+4
+5
+6
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index 74810e675..e42a0a53e 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -58,6 +58,76 @@ public:
     void* m_cudaMemory;
 };
 
+class ScopeCUDAModule
+{
+public:
+
+    operator CUmodule () const { return m_module; }
+
+    ScopeCUDAModule(): m_module(nullptr) {}
+    SlangResult load(const void* image)
+    {
+        release(); 
+        SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&m_module, image));
+        return SLANG_OK;
+    }
+    void release()
+    {
+        if (m_module)
+        {
+            cuModuleUnload(m_module);
+            m_module = nullptr;
+        }
+    }
+
+    ~ScopeCUDAModule() { release(); }
+
+    CUmodule m_module;
+};
+
+class ScopeCUDAStream
+{
+public:
+
+    SlangResult init(unsigned int flags)
+    {
+        release();
+        SLANG_ASSERT(m_stream == nullptr);
+        SLANG_CUDA_RETURN_ON_FAIL(cudaStreamCreateWithFlags(&m_stream, flags));
+        return SLANG_OK;
+    }
+
+    SlangResult sync()
+    {
+        if (m_stream)
+        {
+            SLANG_CUDA_RETURN_ON_FAIL(cudaStreamSynchronize(m_stream));
+        }
+        else
+        {
+            SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceSynchronize());
+        }
+        return SLANG_OK;
+    }
+
+    void release()
+    {
+        if (m_stream)
+        {
+            sync();
+            SLANG_CUDA_ASSERT_ON_FAIL(cudaStreamDestroy(m_stream));
+            m_stream = nullptr;
+        }
+    }
+
+    ScopeCUDAStream():m_stream(nullptr) {}
+
+    ~ScopeCUDAStream() { release(); }
+
+    operator cudaStream_t () const { return m_stream; }
+
+    cudaStream_t m_stream;
+};
 
 
 
@@ -250,9 +320,9 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
     CUfunction kernel;
     SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&kernel, module, entryPointName));
 
-    // A stream of 0 means no stream
-    cudaStream_t stream = 0;
-    //SLANG_CUDA_RETURN_ON_FAIL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    // A default stream, will act as a global stream. Calling sync will globally sync
+    ScopeCUDAStream cudaStream;
+    //SLANG_CUDA_RETURN_ON_FAIL(cudaStream.init(cudaStreamNonBlocking));
 
     {
         // Okay now we need to set up binding
@@ -464,21 +534,14 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                 1, 1, 1,                                                                                // Blocks
                 int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]),        // Threads per block
                 0,                                                                                      // Shared memory size
-                stream,                                                                                 // Stream. 0 is no stream.
+                cudaStream,                                                                                 // Stream. 0 is no stream.
                 args,                                                                                   // Args
                 nullptr);                                                                               // extra
 
             SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
 
-            if (stream)
-            {
-                SLANG_CUDA_RETURN_ON_FAIL(cudaStreamSynchronize(stream));
-            }
-            else
-            {
-                // Do a sync here. Makes sure any issues are detected early and not on some implicit sync
-                SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceSynchronize()); 
-            }
+            // Do a sync here. Makes sure any issues are detected early and not on some implicit sync
+            SLANG_RETURN_ON_FAIL(cudaStream.sync());
         }
 
         // Finally we need to copy the data back
@@ -503,11 +566,6 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                 }
             }
         }
-
-        if (stream)
-        {
-            SLANG_CUDA_RETURN_ON_FAIL(cudaStreamDestroy(stream));
-        }
     }
 
     // Release all othe CUDA resource/allocations
@@ -529,12 +587,9 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
 
     const auto& kernel = outputAndLayout.output.kernelDescs[index];
 
-    CUmodule module = 0;
-    SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&module, kernel.codeBegin));
-
-    SLANG_RETURN_ON_FAIL(_compute(cudaContext, module, outputAndLayout, outContext));
-
-    SLANG_CUDA_RETURN_ON_FAIL(cuModuleUnload(module));
+    ScopeCUDAModule cudaModule;
+    SLANG_RETURN_ON_FAIL(cudaModule.load(kernel.codeBegin));
+    SLANG_RETURN_ON_FAIL(_compute(cudaContext, cudaModule, outputAndLayout, outContext));
 
     return SLANG_OK;
 }
author	jsmall-nvidia <jsmall@nvidia.com>	2020-01-21 09:38:10 -0500
committer	GitHub <noreply@github.com>	2020-01-21 09:38:10 -0500
commit	47392bc72b826b4ad427b703391a77e697735a65 (patch)
tree	7c541c4295742b765124f42bab9f713276c83580
parent	a8669ade5cb3add8b9ce08e2c3bd96e93190bca8 (diff)