8 files changed, 180 insertions, 19 deletions
diff --git a/docs/64bit-type-support.md b/docs/64bit-type-support.md
new file mode 100644
index 000000000..6b026d3b2
--- /dev/null
+++ b/docs/64bit-type-support.md
@@ -0,0 +1,78 @@
+Slang 64-bit Type Support
+=========================
+
+The Slang language supports 64-bit built-in types, such as:
+
+* double
+* uint64_t
+* int64_t
+
+This also applies to vector and matrix versions of these types. 
+
+Unfortunately, whether a specific target supports these types, or the typical HLSL intrinsic functions on them (such as sin/cos/max/min etc), depends very much on the target.
+
+Note that this initial testing only covered scalar usage, not vector or matrix intrinsics.
+
+Double support
+==============
+
+Target   | Compiler/Binary  |  Double Type   |   Intrinsics          |  Notes
+---------|------------------|----------------|-----------------------|-----------
+CPU      |                  |      Yes       |          Yes          |  1
+CUDA     | NVRTC/PTX        |      Yes       |          Yes          |  1
+D3D12    | DXC/DXIL         |      Yes       |          No           |  2 
+Vulkan   | GlSlang/Spir-V   |      Yes       |          No           |  3
+D3D11    | FXC/DXBC         |      No        |          No           |
+D3D12    | FXC/DXBC         |      No        |          No           | 
+
+1) CUDA and CPU support most intrinsics, with the notable exception currently of matrix invert
+2) Requires SM 6.0 and above  https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
+3) Restriction is described in  https://www.khronos.org/registry/spir-v/specs/1.0/GLSL.std.450.html
+Note that GlSlang does produce Spir-V that contains double intrinsic calls; the failure happens when validating the Spir-V:
+
+```
+Validation: error 0:  [ UNASSIGNED-CoreValidation-Shader-InconsistentSpirv ] Object: VK_NULL_HANDLE (Type = 0) | SPIR-V module not valid: GLSL.std.450 Sin: expected Result Type to be a 16 or 32-bit scalar or vector float type
+  %57 = OpExtInst %double %1 Sin %56
+```
+
+D3D12 and VK may have some very limited intrinsic support such as sqrt, rsqrt
+
+uint64_t Support
+=================
+
+Target   | Compiler/Binary  |  uint64_t Type |  Intrinsic support | Notes
+---------|------------------|----------------|--------------------|--------
+CPU      |                  |      Yes       |          Yes       |   
+CUDA     | NVRTC/PTX        |      Yes       |          Yes       |   
+D3D12    | DXC/DXIL         |      Yes       |          Yes       |   
+Vulkan   | GlSlang/Spir-V   |      Yes       |          Yes       |   
+D3D11    | FXC/DXBC         |      No        |          No        |   1
+D3D12    | FXC/DXBC         |      No        |          No        |   1
+
+1) uint64_t support requires SM 6.0 (https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12), so DXBC is not a target.
+
+The intrinsics available on uint64_t type are `abs`, `min`, `max`, `clamp` and `countbits`.
+
+int64_t Support
+================
+
+Target   | Compiler/Binary  |  int64_t Type |  Intrinsic support | Notes
+---------|------------------|----------------|--------------------|--------
+CPU      |                  |      Yes       |          Yes       |   
+CUDA     | NVRTC/PTX        |      Yes       |          Yes       |   
+Vulkan   | GlSlang/Spir-V   |      Yes       |          Yes       |   
+D3D12    | DXC/DXIL         |      Yes       |          Yes       | 1
+D3D11    | FXC/DXBC         |      No        |          No        | 2 
+D3D12    | FXC/DXBC         |      No        |          No        | 2
+
+1) The SM 6.0 docs (https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12) describe support for uint64_t only, but the dxc compiler page says int64_t is supported in HLSL 2016 (https://github.com/Microsoft/DirectXShaderCompiler/wiki/Language-Versions). Tests show that this is indeed the case.
+
+2) int64_t support requires SM 6.0 (https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12), so DXBC is not a target.
+
+The intrinsics available on int64_t type are `abs`, `min`, `max` and `clamp`.
+
+GLSL
+====
+
+GLSL/Spir-v based targets do not support 'generated' intrinsics on matrix types. For example 'sin(mat)' will not work on GLSL/Spir-v.
+
diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h
index 6c577733d..9fc387b6e 100644
--- a/prelude/slang-cpp-scalar-intrinsics.h
+++ b/prelude/slang-cpp-scalar-intrinsics.h
@@ -197,6 +197,7 @@ SLANG_FORCE_INLINE double U32_asdouble(uint32_t low, uint32_t hi)
     return u.d;
 }
 
+
 SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v)
 {
 #if SLANG_GCC_FAMILY    
@@ -214,6 +215,40 @@ SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v)
 #endif
 }
 
+// ----------------------------- U64 -----------------------------------------
+
+SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f) { return f; } // Unsigned input is never negative, so abs is the identity.
+
+SLANG_FORCE_INLINE uint64_t U64_min(uint64_t a, uint64_t b) { return (b > a) ? a : b; } // Smaller of a and b.
+SLANG_FORCE_INLINE uint64_t U64_max(uint64_t a, uint64_t b) { return (b < a) ? a : b; } // Larger of a and b.
+
+SLANG_FORCE_INLINE uint64_t U64_clamp(uint64_t x, uint64_t min, uint64_t max) { return (min > x) ? min : ((max < x) ? max : x); } // Restricts x to [min, max]; the lower bound is tested first.
+
+SLANG_FORCE_INLINE uint32_t U64_countbits(uint64_t v) // Returns the number of set bits in v.
+{
+#if SLANG_GCC_FAMILY
+    return uint32_t(__builtin_popcountll(v)); // `ll` variant: `long` is only 32 bits on LLP64 targets, so popcountl would drop the high half.
+#elif SLANG_PROCESSOR_X86_64 && SLANG_VC
+    return uint32_t(__popcnt64(v));
+#else
+    uint32_t c = 0; // Portable fallback: each loop iteration clears the lowest set bit.
+    while (v)
+    {
+        c++;
+        v &= v - 1;
+    }
+    return c;
+#endif
+}
+
+// ----------------------------- I64 -----------------------------------------
+
+SLANG_FORCE_INLINE int64_t I64_abs(int64_t f) { return (f < 0) ? int64_t(0 - uint64_t(f)) : f; } // Negates in unsigned arithmetic: plain -f is signed-overflow UB for the most negative value.
+
+SLANG_FORCE_INLINE int64_t I64_min(int64_t a, int64_t b) { return a < b ? a : b; } // Smaller of a and b.
+SLANG_FORCE_INLINE int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b; } // Larger of a and b.
+
+SLANG_FORCE_INLINE int64_t I64_clamp(int64_t x, int64_t min, int64_t max) { return (x < min) ? min : ((x > max) ? max : x); } // Restricts x to [min, max]; the lower bound is tested first.
 
 #ifdef SLANG_PRELUDE_NAMESPACE
 } 
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 7e6e5957d..233903134 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -214,6 +214,32 @@ SLANG_CUDA_CALL uint32_t U32_countbits(uint32_t v)
     return __popc(v);
 }
 
+
+// ----------------------------- I64 -----------------------------------------
+
+SLANG_CUDA_CALL int64_t I64_abs(int64_t f) { return (f < 0) ? int64_t(0 - uint64_t(f)) : f; } // Negates in unsigned arithmetic: plain -f is signed-overflow UB for the most negative value.
+
+SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b) { return a < b ? a : b; } // Smaller of a and b.
+SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b; } // Larger of a and b.
+
+SLANG_CUDA_CALL int64_t I64_clamp(int64_t x, int64_t min, int64_t max) { return (x < min) ? min : ((x > max) ? max : x); } // Restricts x to [min, max]; the lower bound is tested first.
+
+// ----------------------------- U64 -----------------------------------------
+
+SLANG_CUDA_CALL uint64_t U64_abs(uint64_t f) { return f; } // Unsigned input is never negative, so abs is the identity.
+
+SLANG_CUDA_CALL uint64_t U64_min(uint64_t a, uint64_t b) { return a < b ? a : b; } // Smaller of a and b; returns uint64_t so values >= 2^63 stay unsigned.
+SLANG_CUDA_CALL uint64_t U64_max(uint64_t a, uint64_t b) { return a > b ? a : b; } // Larger of a and b; returns uint64_t so values >= 2^63 stay unsigned.
+
+SLANG_CUDA_CALL uint64_t U64_clamp(uint64_t x, uint64_t min, uint64_t max) { return (x < min) ? min : ((x > max) ? max : x); } // Restricts x to [min, max]; the lower bound is tested first.
+
+SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v) // Returns the number of set bits in v.
+{
+    const uint32_t bitCount = __popcll(v); // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html#group__CUDA__MATH__INTRINSIC__INT_1g43c9c7d2b9ebf202ff1ef5769989be46
+    return bitCount;
+}
+
+
 // ----------------------------- ResourceType -----------------------------------------
 
 
diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp
index 99cc2f61c..93d1cf2dd 100644
--- a/source/slang/slang-emit-cpp.cpp
+++ b/source/slang/slang-emit-cpp.cpp
@@ -72,6 +72,7 @@ static UnownedStringSlice _getTypePrefix(IROp op)
         case kIROp_UIntType:        return UnownedStringSlice::fromLiteral("U32");
         case kIROp_FloatType:       return UnownedStringSlice::fromLiteral("F32");
         case kIROp_Int64Type:       return UnownedStringSlice::fromLiteral("I64");
+        case kIROp_UInt64Type:      return UnownedStringSlice::fromLiteral("U64");
         case kIROp_DoubleType:      return UnownedStringSlice::fromLiteral("F64");
         default:                    return UnownedStringSlice::fromLiteral("?");
     }
diff --git a/tests/hlsl-intrinsic/scalar-int64.slang b/tests/hlsl-intrinsic/scalar-int64.slang
index 4da2a553e..4ad805081 100644
--- a/tests/hlsl-intrinsic/scalar-int64.slang
+++ b/tests/hlsl-intrinsic/scalar-int64.slang
@@ -1,9 +1,9 @@
 //TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
-// No support for int64_t on dx11
+// No support for int64_t on dx11 (no sm 6.0)
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
-// No support for int64_t on HLSL
+// No support on dx12 with dxbc. Needs SM6.0 + dxil
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -profile cs_6_0 -dx12 -use-dxil
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-cuda -compute
 
@@ -13,9 +13,18 @@ RWStructuredBuffer<int> outputBuffer;
 [numthreads(4, 1, 1)]
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
-    uint idx = dispatchThreadID.x;
+    int64_t idx = int64_t(dispatchThreadID.x);
     
-    int64_t v = int64_t(idx) * 0x400010035435435ll; 
+    int64_t ti =0;
+    
+    ti += max(2, idx);
+    ti += min(idx, 1);
+    ti += abs(idx - 2);
+    ti += (idx * 3) % 5;
+    
+    ti += clamp(idx * 10, 11, 23);
+
+    int64_t v = (ti * 0x400010035435435ll) / 3ll + 7ll - 9ll; 
     
     outputBuffer[idx] = int(v) ^ int(((v >> 32) & 0xffffffff)); 
 }
 \ No newline at end of file
diff --git a/tests/hlsl-intrinsic/scalar-int64.slang.expected.txt b/tests/hlsl-intrinsic/scalar-int64.slang.expected.txt
index c0bb016cd..6ca5a87e0 100644
--- a/tests/hlsl-intrinsic/scalar-int64.slang.expected.txt
+++ b/tests/hlsl-intrinsic/scalar-int64.slang.expected.txt
@@ -1,4 +1,4 @@
-0
-31435535
-6286AA6A
-93C9FF9F
+1E50A006
+2793FF3D
+8A1AA9A7
+ED76E236
diff --git a/tests/hlsl-intrinsic/scalar-uint64.slang b/tests/hlsl-intrinsic/scalar-uint64.slang
index a990ccc22..dd165d8b8 100644
--- a/tests/hlsl-intrinsic/scalar-uint64.slang
+++ b/tests/hlsl-intrinsic/scalar-uint64.slang
@@ -4,7 +4,7 @@
 // No support for uint64_t on fxc - we need SM6.0 and dxil
 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_3 -use-dxil 
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil 
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-cuda -compute
 
@@ -13,12 +13,24 @@ RWStructuredBuffer<int> outputBuffer;
 
 [numthreads(4, 1, 1)]
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
-{
-    uint idx = dispatchThreadID.x;
+{    
+    uint64_t idx = dispatchThreadID.x;
     
-    uint64_t v = uint64_t(idx) * 0x8000100354354354ull; 
+    uint64_t ti = 0;
+   
+    ti += max(2, idx);
+    ti += min(idx, 1ull);
+    ti += (idx * 3) % 5;
+    
+    ti += clamp(idx * 10, 11, 23);
+    
+    ti += countbits(idx * 13);
+   
+    uint64_t v = uint64_t(ti) * 0x8000100354354354ull; 
     // Let's check all the bits make it
-    v |= 0x8000000000000000ull;
+    uint64_t u = v | 0x8000000000000000ull;
     
-    outputBuffer[idx] = int(v) ^ int(v >> 32); 
+    v = max(u, v);
+      
+    outputBuffer[dispatchThreadID.x] = int(v) ^ int(v >> 32); 
 }
 \ No newline at end of file
diff --git a/tests/hlsl-intrinsic/scalar-uint64.slang.expected.txt b/tests/hlsl-intrinsic/scalar-uint64.slang.expected.txt
index b8be0469a..d3843ea52 100644
--- a/tests/hlsl-intrinsic/scalar-uint64.slang.expected.txt
+++ b/tests/hlsl-intrinsic/scalar-uint64.slang.expected.txt
@@ -1,4 +1,4 @@
-80000000
-D4355357
-286AA6AE
-7C9FF9F5
+C6B4BB6F
+142802D2
+619FA985
+34A0408