13 files changed, 215 insertions, 65 deletions
diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang
index 51cf1cdb7..252b6f5e9 100644
--- a/source/slang/diff.meta.slang
+++ b/source/slang/diff.meta.slang
@@ -34,6 +34,15 @@ struct TensorView
     [__readNone]
     Ptr<T> data_ptr();
 
+    __target_intrinsic(cuda, "$0.data_ptr_at<$G0>($1)")
+    [__readNone]
+    Ptr<T> data_ptr_at(uint index);
+
+    __generic<let N: int>
+    __target_intrinsic(cuda, "$0.data_ptr_at<$G0>($1)")
+    [__readNone]
+    Ptr<T> data_ptr_at(vector<uint, N> index);
+
     __implicit_conversion($(kConversionCost_ImplicitDereference))
     __intrinsic_op($(kIROp_TorchTensorGetView))
     __init(TorchTensor<T> t);
@@ -65,6 +74,13 @@ struct TensorView
     __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3, $4, $5, $6)")
     void store(uint i0, uint i1, uint i2, uint i3, uint i4, T val);
 
+    __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)")
+    T InterlockedAdd(uint index, T val);
+    
+    __generic<let N:int>
+    __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)")
+    T InterlockedAdd(vector<uint, N> index, T val);
+
     __target_intrinsic(cuda, "$0.dimensionCount")
     [__readNone]
     uint dims();
@@ -81,44 +97,139 @@ struct TensorView
     {
         [ForceInline] [__readNone] get { return load(index); }
         [ForceInline] set { store(index, newValue); }
+        
+        __target_intrinsic(cuda, "$0.load<$G0>($1)")
+        ref;
     }
     __subscript(uint i1, uint i2) -> T
     {
         [ForceInline] [__readNone] get { return load(i1, i2); }
         [ForceInline] set { store(i1, i2, newValue); }
+        __target_intrinsic(cuda, "$0.load<$G0>($1, $2)")
+        ref;
     }
     __subscript(uint2 i) -> T
     {
         [ForceInline] [__readNone] get { return load(i.x, i.y); }
         [ForceInline] set { store(i.x, i.y, newValue); }
+        __target_intrinsic(cuda, "$0.load<$G0>($1.x, $1.y)")
+        ref;
     }
     __subscript(uint i1, uint i2, uint i3) -> T
     {
         [ForceInline] [__readNone] get { return load(i1, i2, i3); }
         [ForceInline] set { store(i1, i2, i3, newValue); }
+        __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3)")
+        ref;
     }
     __subscript(uint3 i) -> T
     {
         [ForceInline] [__readNone] get { return load(i.x, i.y, i.z); }
         [ForceInline] set { store(i.x, i.y, i.z, newValue); }
+        __target_intrinsic(cuda, "$0.load<$G0>($1.x, $1.y, $1.z)")
+        ref;
     }
     __subscript(uint i1, uint i2, uint i3, uint i4) -> T
     {
         [ForceInline] [__readNone] get { return load(i1, i2, i3, i4); }
         [ForceInline] set { store(i1, i2, i3, i4, newValue); }
+        __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3, $4)")
+        ref;
     }
     __subscript(uint4 i) -> T
     {
         [__readNone][ForceInline] get { return load(i.x, i.y, i.z, i.w); }
         [ForceInline] set { store(i.x, i.y, i.z, i.w, newValue); }
+        __target_intrinsic(cuda, "$0.load<$G0>($1.x, $1.y, $1.z, $1.w)")
+        ref;
     }
     __subscript(uint i1, uint i2, uint i3, uint i4, uint i5) -> T
     {
         [ForceInline] [__readNone] get { return load(i1, i2, i3, i4, i5); }
         [ForceInline] set { store(i1, i2, i3, i4, i5, newValue); }
+        __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3, $4, $5)")
+        ref;
     }
 }
 
+${{{{
+for (auto atomicIntegerTypeName : kCudaAtomicIntegerTypes)
+{
+}}}}
+extension TensorView<$(atomicIntegerTypeName)>
+{
+    typealias __Element = $(atomicIntegerTypeName);
+    __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedIncrement(uint index, __Element val);
+
+    __generic<let N:int>
+    __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedIncrement(vector<uint, N> index, __Element val);
+
+    __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedMin(uint index, __Element val);
+    
+    __generic<let N:int>
+    __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedMin(vector<uint, N> index, __Element val);
+
+    __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedMax<T>(uint index, __Element val);
+    
+    __generic<let N:int>
+    __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedMax(vector<uint, N> index, __Element val);
+
+    __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedAnd<T>(uint index, __Element val);
+    
+    __generic<let N:int>
+    __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedAnd(vector<uint, N> index, __Element val);
+
+    __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedOr<T>(uint index, __Element val);
+    
+    __generic<let N:int>
+    __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedOr(vector<uint, N> index, __Element val);
+
+    __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedXor<T>(uint index, __Element val);
+    
+    __generic<let N:int>
+    __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedXor(vector<uint, N> index, __Element val);
+    
+    __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedExchange(uint index, __Element val);
+    
+    __generic<let N:int>
+    __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)")
+    __Element InterlockedExchange(vector<uint, N> index, __Element val);
+
+    __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)")
+    __Element InterlockedCompareExchange(uint index, __Element compare, __Element val);
+    
+    __generic<let N:int>
+    __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)")
+    __Element InterlockedCompareExchange(vector<uint, N> index, __Element compare, __Element val);
+}
+
+${{{{
+} // end for atomicIntegerTypeName
+}}}}
+
+extension TensorView<float>
+{
+    __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)")
+    float InterlockedExchange(uint index, float val);
+    
+    __generic<let N:int>
+    __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)")
+    float InterlockedExchange(vector<uint, N> index, float val);
+}
+
 __generic<T>
 __intrinsic_type($(kIROp_TorchTensorType))
 struct TorchTensor
@@ -162,9 +273,26 @@ struct TorchTensor
     static TorchTensor<T> alloc(uint i0, uint i1, uint i2, uint i3, uint i4);
 
     __intrinsic_op($(kIROp_AllocateTorchTensor))
-    static TorchTensor<T> zerosLike(TorchTensor<T> other);
+    static TorchTensor<T> emptyLike(TorchTensor<T> other);
+
+    __target_intrinsic(cpp, "$0.zero_()")
+    void fillZero();
+
+    __target_intrinsic(cpp, "$0.fill_($1)")
+    void fillValue(T val);
+
+    static TorchTensor<T> zerosLike(TorchTensor<T> other)
+    {
+        var result = emptyLike(other);
+        result.fillZero();
+        return result;
+    }
+
 }
 
+__target_intrinsic(cpp, "AT_CUDA_CHECK(cudaStreamSynchronize(at::cuda::getCurrentCUDAStream()))")
+void syncTorchCudaStream();
+
 __generic<T: IDifferentiable>
 __intrinsic_op($(kIROp_MakeDifferentialPairUserCode))
 DifferentialPair<T> diffPair(T primal, T.Differential diff);
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index 08ba050db..166c131d5 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -2405,6 +2405,11 @@ void CLikeSourceEmitter::_emitInst(IRInst* inst)
         m_writer->emit(";\n");
         break;
 
+        // Insts that needs to be emitted as code blocks.
+    case kIROp_CudaKernelLaunch:
+        emitInstStmtImpl(inst);
+        break;
+
     case kIROp_LiveRangeStart:
     case kIROp_LiveRangeEnd:
         emitLiveness(inst);
diff --git a/source/slang/slang-emit-c-like.h b/source/slang/slang-emit-c-like.h
index 8046fa633..bfe3dcc4e 100644
--- a/source/slang/slang-emit-c-like.h
+++ b/source/slang/slang-emit-c-like.h
@@ -530,6 +530,7 @@ public:
 
     virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) { SLANG_UNUSED(varDecl); SLANG_UNUSED(varType); return false; }
     virtual bool tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) { SLANG_UNUSED(inst); SLANG_UNUSED(inOuterPrec); return false; }
+    virtual void emitInstStmtImpl(IRInst* inst) { SLANG_UNUSED(inst); }
 
     virtual void emitPostKeywordTypeAttributesImpl(IRInst* inst) { SLANG_UNUSED(inst); }
 
diff --git a/source/slang/slang-emit-torch.cpp b/source/slang/slang-emit-torch.cpp
index 276164ed5..c198b011d 100644
--- a/source/slang/slang-emit-torch.cpp
+++ b/source/slang/slang-emit-torch.cpp
@@ -65,6 +65,49 @@ void emitTorchScalarTypeName(SourceWriter* m_writer, IRInst* type)
     }
 }
 
+void TorchCppSourceEmitter::emitInstStmtImpl(IRInst* inst)
+{
+    switch (inst->getOp())
+    {
+    default:
+        return;
+    case kIROp_CudaKernelLaunch:
+        {
+            m_writer->emit("AT_CUDA_CHECK(cudaLaunchKernel(");
+            // func
+            m_writer->emit("(const void*)(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+            m_writer->emit("), ");
+
+            // gridDim
+            m_writer->emit("slang_bit_cast<dim3>(");
+            emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
+            m_writer->emit("), ");
+
+            // blockDim
+            m_writer->emit("slang_bit_cast<dim3>(");
+            emitOperand(inst->getOperand(2), getInfo(EmitOp::General));
+            m_writer->emit("), ");
+
+            // args
+            emitOperand(inst->getOperand(3), getInfo(EmitOp::General));
+            m_writer->emit(", ");
+
+            // shared mem
+            m_writer->emit("slangGetCudaKernelSharedMemSize((const void*)(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+            m_writer->emit(")), ");
+
+            // stream
+            m_writer->emit("((cudaStream_t)");
+            emitOperand(inst->getOperand(4), getInfo(EmitOp::General));
+            m_writer->emit(")));\n");
+        
+            break;
+        }
+    }
+}
+
 bool TorchCppSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec)
 {
     switch (inst->getOp())
@@ -78,47 +121,12 @@ bool TorchCppSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo&
         m_writer->emit("make_tensor_view(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitStringLiteral(getUnmangledName(inst->getOperand(1)));
+        emitStringLiteral(getUnmangledName(inst->getOperand(0)));
         m_writer->emit(", ");
-        emitTorchScalarTypeName(m_writer, inst->getOperand(1)->getDataType());
+        emitTorchScalarTypeName(m_writer, inst->getOperand(0)->getDataType());
         m_writer->emit(")");
         return true;
     }
-    case kIROp_CudaKernelLaunch:
-    {
-        m_writer->emit("cudaLaunchKernel(");
-        // func
-        m_writer->emit("(const void*)(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit("), ");
-
-        // gridDim
-        m_writer->emit("slang_bit_cast<dim3>(");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit("), ");
-
-        // blockDim
-        m_writer->emit("slang_bit_cast<dim3>(");
-        emitOperand(inst->getOperand(2), getInfo(EmitOp::General));
-        m_writer->emit("), ");
-
-        // args
-        emitOperand(inst->getOperand(3), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-
-        // shared mem
-        m_writer->emit("slangGetCudaKernelSharedMemSize((const void*)(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(")), ");
-
-        // stream
-        m_writer->emit("((cudaStream_t)");
-        emitOperand(inst->getOperand(4), getInfo(EmitOp::General));
-        m_writer->emit("))");
-        return true;
-    }
     case kIROp_TorchGetCudaStream:
     {
         m_writer->emit("at::cuda::getCurrentCUDAStream()");
@@ -131,12 +139,14 @@ bool TorchCppSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo&
             /*
             Emit something like:
                 ```
-                torch::Tensor out = torch::zeros_like(other);
+                torch::Tensor out = torch::empty_like(other);
                 ```
             */
-            m_writer->emit("torch::zeros_like(");
+            m_writer->emit("torch::empty_like(");
             emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-            m_writer->emit(")");
+            m_writer->emit(", torch::TensorOptions().device(torch::kCUDA).dtype(");
+            emitTorchScalarTypeName(m_writer, inst->getDataType());
+            m_writer->emit("))");
         }
         else
         {
@@ -180,11 +190,6 @@ SlangResult TorchCppSourceEmitter::calcTypeName(IRType* type, CodeGenTarget targ
         out << "torch::Tensor";
         return SLANG_OK;
     }
-    case kIROp_TorchKernelMemoryAllocatorType:
-    {
-        out << "CudaTaskMemoryAllocator";
-        return SLANG_OK;
-    }
     }
 }
 
diff --git a/source/slang/slang-emit-torch.h b/source/slang/slang-emit-torch.h
index 84ce42331..aeb9058a4 100644
--- a/source/slang/slang-emit-torch.h
+++ b/source/slang/slang-emit-torch.h
@@ -19,6 +19,8 @@ public:
 
 protected:
     // CPPSourceEmitter overrides 
+    virtual void emitInstStmtImpl(IRInst* inst) override;
+
     virtual bool tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) override;
     virtual SlangResult calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out) override;
     virtual void emitModuleImpl(IRModule* module, DiagnosticSink* sink) override;
diff --git a/source/slang/slang-intrinsic-expand.cpp b/source/slang/slang-intrinsic-expand.cpp
index de3396efb..7a4744d59 100644
--- a/source/slang/slang-intrinsic-expand.cpp
+++ b/source/slang/slang-intrinsic-expand.cpp
@@ -268,17 +268,28 @@ const char* IntrinsicExpandContext::_emitSpecial(const char* cursor)
         break;
 
         case 'T':
-            // Get the the 'element' type for the type of the param at the index
+            // Get the 'element' or `return` type for the type of the param at the index
         {
-            SLANG_RELEASE_ASSERT(*cursor >= '0' && *cursor <= '9');
-            Index argIndex = (*cursor++) - '0' + m_argIndexOffset;
-            SLANG_RELEASE_ASSERT(m_argCount > argIndex);
-
-            IRType* type = m_args[argIndex].get()->getDataType();
-            if (auto baseTextureType = as<IRTextureType>(type))
+            IRType* type = nullptr;
+            if (*cursor == 'R')
             {
-                type = baseTextureType->getElementType();
+                // Get the return type of the call
+                cursor++;
+                type = m_callInst->getDataType();
+            }
+            else
+            {
+                SLANG_RELEASE_ASSERT(*cursor >= '0' && *cursor <= '9');
+                Index argIndex = (*cursor++) - '0' + m_argIndexOffset;
+                SLANG_RELEASE_ASSERT(m_argCount > argIndex);
+
+                type = m_args[argIndex].get()->getDataType();
+                if (auto baseTextureType = as<IRTextureType>(type))
+                {
+                    type = baseTextureType->getElementType();
+                }
             }
+            SLANG_RELEASE_ASSERT(type);
             m_emitter->emitType(type);
         }
         break;
diff --git a/source/slang/slang-ir-check-differentiability.cpp b/source/slang/slang-ir-check-differentiability.cpp
index c4b09d9e8..355381559 100644
--- a/source/slang/slang-ir-check-differentiability.cpp
+++ b/source/slang/slang-ir-check-differentiability.cpp
@@ -332,8 +332,6 @@ public:
             {
                 if (expectDiffInstWorkListSet.Add(inst))
                 {
-                    if (inst->getFullType() && inst->getFullType()->getOp() == kIROp_IntType)
-                        printf("break");
                     expectDiffInstWorkList.add(inst);
                 }
             }
diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h
index e58094b15..8d06c6970 100644
--- a/source/slang/slang-ir-inst-defs.h
+++ b/source/slang/slang-ir-inst-defs.h
@@ -69,7 +69,6 @@ INST(Nop, nop, 0, 0)
 
     INST(TensorViewType, TensorView, 1, HOISTABLE)
     INST(TorchTensorType, TorchTensor, 0, HOISTABLE)
-    INST(TorchKernelMemoryAllocatorType, TorchMemAllocatorType, 0, HOISTABLE)
     INST(ArrayListType, ArrayListVector, 1, HOISTABLE)
     
     /* BindExistentialsTypeBase */
@@ -614,7 +613,7 @@ INST(GetOptiXSbtDataPtr, getOptiXSbtDataPointer, 0, 0)
 
 INST(MakeArrayList, makeArrayList, 0, 0)
 INST(MakeTensorView, makeTensorView, 0, 0)
-INST(AllocateTorchTensor, allocTorchTensor , 0, 0)
+INST(AllocateTorchTensor, allocTorchTensor, 0, 0)
 INST(TorchGetCudaStream, TorchGetCudaStream, 0, 0)
 INST(TorchTensorGetView, TorchTensorGetView, 0, 0)
 
diff --git a/source/slang/slang-ir-insts.h b/source/slang/slang-ir-insts.h
index f5b03eb45..26d00d4df 100644
--- a/source/slang/slang-ir-insts.h
+++ b/source/slang/slang-ir-insts.h
@@ -3129,7 +3129,7 @@ public:
         return emitMakeStruct(type, args.getCount(), args.getBuffer());
     }
 
-    IRInst* emitMakeTensorView(IRType* type, IRInst* allocator, IRInst* val);
+    IRInst* emitMakeTensorView(IRType* type, IRInst* val);
 
     IRInst* emitMakeExistential(
         IRType* type,
diff --git a/source/slang/slang-ir-pytorch-cpp-binding.cpp b/source/slang/slang-ir-pytorch-cpp-binding.cpp
index 971e87a6f..d59d57474 100644
--- a/source/slang/slang-ir-pytorch-cpp-binding.cpp
+++ b/source/slang/slang-ir-pytorch-cpp-binding.cpp
@@ -263,8 +263,6 @@ static void generateCppBindingForFunc(IRFunc* func, DiagnosticSink* sink)
         oldParam->removeAndDeallocate();
     }
 
-    auto allocator = builder.emitVar(builder.getType(kIROp_TorchKernelMemoryAllocatorType));
-
     for (auto block : func->getBlocks())
     {
         for (auto inst : block->getChildren())
@@ -297,7 +295,7 @@ static void generateCppBindingForFunc(IRFunc* func, DiagnosticSink* sink)
             else if (auto getView = as<IRTorchTensorGetView>(inst))
             {
                 builder.setInsertBefore(getView);
-                auto makeView = builder.emitMakeTensorView(getView->getFullType(), allocator, inst->getOperand(0));
+                auto makeView = builder.emitMakeTensorView(getView->getFullType(), inst->getOperand(0));
                 getView->replaceUsesWith(makeView);
                 instsToRemove.add(getView);
             }
diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp
index d03096483..6ca05d2d6 100644
--- a/source/slang/slang-ir.cpp
+++ b/source/slang/slang-ir.cpp
@@ -3951,10 +3951,9 @@ namespace Slang
         return emitIntrinsicInst(type, kIROp_MakeStruct, argCount, args);
     }
 
-    IRInst* IRBuilder::emitMakeTensorView(IRType* type, IRInst* allocator, IRInst* val)
+    IRInst* IRBuilder::emitMakeTensorView(IRType* type, IRInst* val)
     {
-        IRInst* args[2] = { allocator, val };
-        return emitIntrinsicInst(type, kIROp_MakeTensorView, 2, args);
+        return emitIntrinsicInst(type, kIROp_MakeTensorView, 1, &val);
     }
 
     IRInst* IRBuilder::emitMakeExistential(
diff --git a/source/slang/slang-stdlib.cpp b/source/slang/slang-stdlib.cpp
index 17fb3d28b..9d4a079c8 100644
--- a/source/slang/slang-stdlib.cpp
+++ b/source/slang/slang-stdlib.cpp
@@ -244,6 +244,9 @@ namespace Slang
         {kIROp_Leq, "leq", "<=", "__BuiltinArithmeticType", ARITHMETIC_MASK | BOOL_RESULT},
     };
 
+    // Integer types that can be used in atomic operations in CUDA.
+    static const char* kCudaAtomicIntegerTypes[] = { "int", "uint", "uint64_t", "int64_t" };
+
     // Both the following functions use these macros.
     // NOTE! They require a variable named path to emit the #line correctly if in source file.
 #define SLANG_RAW(TEXT) sb << TEXT;
diff --git a/source/slang/slang.cpp b/source/slang/slang.cpp
index 79f9e56ba..b831859a5 100644
--- a/source/slang/slang.cpp
+++ b/source/slang/slang.cpp
@@ -1493,6 +1493,7 @@ CapabilitySet TargetRequest::getTargetCaps()
         break;
 
     case CodeGenTarget::CPPSource:
+    case CodeGenTarget::PyTorchCppBinding:
     case CodeGenTarget::HostExecutable:
     case CodeGenTarget::ShaderSharedLibrary:
     case CodeGenTarget::HostHostCallable: