From 4db6bd3cd6da1871fdac520c280bd9f933e48489 Mon Sep 17 00:00:00 2001
From: jsmall-nvidia <jsmall@nvidia.com>
Date: Wed, 8 Jun 2022 19:51:49 -0400
Subject: Improved bounds checking for C++/CUDA (#2263)

* #include of an absolute path didn't work, because paths were always taken to be relative.

* Use TerminatedUnownedStringSlice for literals in output C++.

* Remove Escape/Unescape functions used in slang-token-reader.cpp
Add target type of 'host-cpp' etc to map to the target types.

* Fix some corner cases around string encoding.

* Added unit test for string escaping.
Fixed some assorted escaping bugs.

* Updated test output.

* Added decode test.

* Stop using hex output, to get around 'greedy' aspect. Use octal instead.

* Added HostHostCallable
Small changes to use ArtifactDesc/Info instead of large switches.

* Fix C++ emit to handle arbitrary function export.

* Add options handling for callable without an output being specified.

* Can compile with COM interface. Added example using com interface.

* Use the IR Ptr type instead of hack in C++ emit for interfaces.

* Fix issue with outputting the COM call when ptr is used.

* Fix crash issue on compilation failure.

* Add support for __global.

* Added `ActualGlobalRate`
Added special handling around globals and COM interfaces.
Tested out in cpu-com-example.

* Fix typo in NodeBase.

* Support for accessing globals by name working.

* Bounds checking for C++
Improved bounds checks for CUDA.

* Check that actual global initialization is working.

* Fix typo.

* Refactor the com replacement such that it doesn't need a cache or do anything special with GlobalVar.

* Fix typo in CUDA prelude.

* Remove context.
Only create replacement if needed.

* Split out COM host-callable into a unit-test.

* host-callable com testing on C++ and llvm.

* Comment around the COM ptr replacement.

* WIP Zero bound test.

* Disable com test on vs 32 bit.
Fix C++ prelude

* Disable 32 bit targets testing com host-callable.

* For now disable zero index test.

* Enable bounds checking for CPU/CUDA.

* Small fixes.
Disable CUDA zero index bound fix.

* Add test result for bound check.

* Workaround for index wrapping issue.

* Added Fixed array test.

* Only enable prelude asserts via SLANG_PRELUDE_ENABLE_ASSERT (unless defined by the user)
---
 source/slang/hlsl.meta.slang       | 22 +++++++++++-----------
 source/slang/slang-compiler.cpp    | 25 +++++++++++++++++--------
 source/slang/slang-lower-to-ir.cpp |  3 +++
 3 files changed, 31 insertions(+), 19 deletions(-)

(limited to 'source/slang')
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 7d107888a..b2f6fa06b 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -331,7 +331,7 @@ ${{{{
 
     __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))")
     __cuda_sm_version(2.0)
-    __target_intrinsic(cuda, "(*$3 = atomicAdd((float*)$0._getPtrAt($1), $2))")
+    __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt<float>($1), $2))")
     [__requiresNVAPI]
     void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue);
 
@@ -347,7 +347,7 @@ ${{{{
     __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))")
     [__requiresNVAPI]
     __cuda_sm_version(2.0)
-    __target_intrinsic(cuda, "atomicAdd((float*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<float>($1), $2)")
     void InterlockedAddF32(uint byteAddress, float valueToAdd);
 
     __specialized_for_target(glsl)
@@ -359,7 +359,7 @@ ${{{{
 
     // Int64 Add
     __cuda_sm_version(6.0)
-    __target_intrinsic(cuda, "(*$3 = atomicAdd((uint64_t*)$0._getPtrAt($1), $2))")
+    __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt<uint64_t>($1), $2))")
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue);
 
     __specialized_for_target(hlsl)
@@ -377,7 +377,7 @@ ${{{{
 
     // Without returning original value
     __cuda_sm_version(6.0)
-    __target_intrinsic(cuda, "atomicAdd((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<uint64_t>($1), $2)")
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd);
 
     __specialized_for_target(hlsl)
@@ -395,7 +395,7 @@ ${{{{
 
     // Cas uint64_t
 
-    __target_intrinsic(cuda, "(*$4 = atomicCAS((uint64_t*)$0._getPtrAt($1), $2, $3))")
+    __target_intrinsic(cuda, "(*$4 = atomicCAS($0._getPtrAt<uint64_t>($1), $2, $3))")
     void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue);
 
     __specialized_for_target(hlsl)
@@ -414,7 +414,7 @@ ${{{{
     // Max
 
     __cuda_sm_version(3.5)
-    __target_intrinsic(cuda, "atomicMax((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicMax($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
@@ -430,7 +430,7 @@ ${{{{
     // Min
     
     __cuda_sm_version(3.5)
-    __target_intrinsic(cuda, "atomicMin((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicMin($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedMinU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
@@ -445,7 +445,7 @@ ${{{{
 
     // And
 
-    __target_intrinsic(cuda, "atomicAnd((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicAnd($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedAndU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
@@ -460,7 +460,7 @@ ${{{{
 
     // Or
 
-    __target_intrinsic(cuda, "atomicOr((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicOr($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedOrU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
@@ -475,7 +475,7 @@ ${{{{
 
     // Xor
 
-    __target_intrinsic(cuda, "atomicXor((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicXor($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedXorU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
@@ -490,7 +490,7 @@ ${{{{
 
     // Exchange
 
-    __target_intrinsic(cuda, "atomicExch((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicExch($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
diff --git a/source/slang/slang-compiler.cpp b/source/slang/slang-compiler.cpp
index bcf857fc8..d4c5812b7 100644
--- a/source/slang/slang-compiler.cpp
+++ b/source/slang/slang-compiler.cpp
@@ -1103,14 +1103,7 @@ void printDiagnosticArg(StringBuilder& sb, CodeGenTarget val)
             {
                 preprocessorDefinitions.Add(define.Key, define.Value);
             }
-            {
-                auto linkage = getLinkage();
-                for (auto& define : linkage->preprocessorDefinitions)
-                {
-                    preprocessorDefinitions.Add(define.Key, define.Value);
-                }
-            }
-
+            
             {
                 /* TODO(JS): Not totally clear what options should be set here. If we are using the pass through - then using say the defines/includes
                 all makes total sense. If we are generating C++ code from slang, then should we really be using these values -> aren't they what is
@@ -1168,6 +1161,22 @@ void printDiagnosticArg(StringBuilder& sb, CodeGenTarget val)
             sourceLanguage = (SourceLanguage)TypeConvertUtil::getSourceLanguageFromTarget((SlangCompileTarget)sourceTarget);
         }
 
+        // Add any preprocessor definitions associated with the linkage
+        {
+            // TODO(JS): This is somewhat arguable - should defines passed to Slang really be
+            // passed to downstream compilers? It does appear consistent with the behavior if 
+            // there is an endToEndReq.
+            // 
+            // That said it's very convenient and provides way to control aspects 
+            // of downstream compilation. 
+            
+            auto linkage = getLinkage();
+            for (auto& define : linkage->preprocessorDefinitions)
+            {
+                preprocessorDefinitions.Add(define.Key, define.Value);
+            }
+        }
+
         // If we have an extension tracker, we may need to set options such as SPIR-V version
         // and CUDA Shader Model.
         if (extensionTracker)
diff --git a/source/slang/slang-lower-to-ir.cpp b/source/slang/slang-lower-to-ir.cpp
index d175b69dd..86edf9282 100644
--- a/source/slang/slang-lower-to-ir.cpp
+++ b/source/slang/slang-lower-to-ir.cpp
@@ -6128,6 +6128,9 @@ struct DeclLoweringVisitor : DeclVisitor<DeclLoweringVisitor, LoweredValInfo>
 
         auto builder = getBuilder();
 
+        // TODO(JS): Do we create something derived from IRGlobalVar? Or do we use 
+        // a decoration to identify an *actual* global?
+
         IRGlobalValueWithCode* irGlobal = builder->createGlobalVar(varType);
         LoweredValInfo globalVal = LoweredValInfo::ptr(irGlobal);
 
-- 
cgit v1.2.3