8 files changed, 410 insertions, 208 deletions
diff --git a/docs/user-guide/a1-02-slangpy.md b/docs/user-guide/a1-02-slangpy.md
index 0337b4d86..99476c5b2 100644
--- a/docs/user-guide/a1-02-slangpy.md
+++ b/docs/user-guide/a1-02-slangpy.md
@@ -22,7 +22,6 @@ pip install slangpy
 
 Note that `slangpy` requires `torch` with CUDA support. See the [pytorch](https://pytorch.org/) installation page to find the right version for your platform.
 
-
 You can check that you have the right installation by running: 
 ```sh
 python -c "import torch; print(f'cuda: {torch.cuda.is_available()}')"
@@ -34,28 +33,32 @@ From **v2023.4.0**, Slang supports auto-binding features that make it easier tha
 
 Here's a barebones example of a simple squaring kernel written in Slang (`square.slang`):
 
-``` csharp
+```csharp
 [AutoPyBindCUDA]
 [CUDAKernel]
 void square(TensorView<float> input, TensorView<float> output)
 {
     // Get the 'global' index of this thread.
-    uint3 launchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
+    uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
 
     // If the thread index is beyond the input size, exit early.
-    if (launchIdx.x < input.size(0))
+    if (dispatchIdx.x < input.size(0))
         return;
 
-    output[launchIdx.x] = input[launchIdx.x] * input[launchIdx.x];
+    output[dispatchIdx.x] = input[dispatchIdx.x] * input[dispatchIdx.x];
 }
 
 ```
 
-`square` performs **element-wise** squaring on `input` and writes them to `output`
+This code follows the standard pattern of a typical CUDA kernel function. It takes as input
+two tensors, `input` and `output`. 
+It first obtains the global dispatch index of the current thread and performs range check to make sure we don't read or write out
+of the bounds of input and output tensors, and then calls `square()` to compute the per-element result, and
+store it at the corresponding location in `output` tensor.
 
 
 `slangpy` works by compiling kernels to CUDA and it identifies the functions to compile by checking for the `[CUDAKernel]` attribute.
-The second attribute `[AutoPyBindCUDA]` allows us to call `multiply` directly from python without having to write any host code. If you would like to write the host code yourself for finer control, see the other version of this example [here](#manually-binding-kernels).
+The second attribute `[AutoPyBindCUDA]` allows us to call `square` directly from python without having to write any host code. If you would like to write the host code yourself for finer control, see the other version of this example [here](#manually-binding-kernels).
 
 You can now simply invoke this kernel from python:
 
@@ -63,14 +66,14 @@ You can now simply invoke this kernel from python:
 import torch
 import slangpy
 
-m = slangpy.loadModule('multiply.slang')
+m = slangpy.loadModule('square.slang')
 
 A = torch.randn((1024,), dtype=torch.float).cuda()
 
 output = torch.zeros_like(A).cuda()
 
 # Number of threads launched = blockSize * gridSize
-m.multiply(input=A, output=output).launchRaw(blockSize=(32, 1, 1), gridSize=(64, 1, 1))
+m.square(input=A, output=output).launchRaw(blockSize=(32, 1, 1), gridSize=(64, 1, 1))
 
 print(output)
 ```
@@ -89,27 +92,22 @@ The `[AutoPyBindCUDA]` attribute can also be used on differentiable functions de
 
 One key point is that the basic `TensorView<T>` objects are not differentiable. They can be used as buffers for data that does not require derivatives, or even as buffers for the manual accumulation of derivatives.
 
-Instead, use the `DiffTensorView` type for when you need differentiable tensors. Currently, `DiffTensorView` only supports the `float` dtype variety, and requires the use of `.load(offset)` and `.store(offset, val)` instead of `[]`, although
-`offset` can be a scalar `uint` or vector `uint2`, `uint3`, etc.. for multi-dimensional indexing.
+Instead, use the `DiffTensorView` type for when you need differentiable tensors. Currently, `DiffTensorView` only supports the `float` dtype variety.
 
-Here's a barebones example of a differentiable `sqr` that computes `x*x`
+Here's a barebones example of a differentiable version of `square`:
 
-``` C
+```csharp
 [AutoPyBindCUDA]
 [CUDAKernel]
 [Differentiable]
 void square(DiffTensorView input, DiffTensorView output)
 {
-    uint3 launchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
+    uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
 
-    if (launchIdx.x < inputA.size(0))
+    if (dispatchIdx.x < input.size(0))
         return;
     
-    float val = input.load(launchIdx.x);
-
-    float result = x * x;
-
-    output.store(launchIdx.x, result);
+    output[dispatchIdx.x] = input[dispatchIdx.x] * input[dispatchIdx.x];
 }
 ```
 
@@ -117,13 +115,14 @@ Now, `slangpy.loadModule("square.slang")` returns a scope with three callable ha
 
 You can invoke `square()` normally to get the same effect as the previous example, or invoke `square.fwd()` / `square.bwd()` by binding pairs of tensors to compute the derivatives.
 
+
 ``` Python
 import torch
 import slangpy
 
 m = slangpy.loadModule('square.slang')
 
-input = torch.tensor((0, 1, 3, 4, 5), dtype=torch.float).cuda()
+input = torch.tensor((0, 1, 2, 3, 4, 5), dtype=torch.float).cuda()
 output = torch.zeros_like(input).cuda()
 
 # Invoke normally
@@ -132,14 +131,14 @@ m.square(input=input, output=output).launchRaw(blockSize=(6, 1, 1), gridSize=(1,
 print(output)
 
 # Invoke reverse-mode autodiff by first allocating tensors to hold the gradients
-input = torch.tensor((0, 1, 3, 4, 5), dtype=torch.float).cuda()
+input = torch.tensor((0, 1, 2, 3, 4, 5), dtype=torch.float).cuda()
 input_grad = torch.zeros_like(input).cuda()
 
 output = torch.zeros_like(input)
 # Pass in all 1s as the output derivative for our example
 output_grad = torch.ones_like(output) 
 
-m.sqr.bwd(
+m.square.bwd(
     input=(input, input_grad), output=(output, output_grad)
 ).launchRaw(
     blockSize=(6, 1, 1), gridSize=(1, 1, 1))
@@ -152,7 +151,7 @@ print(input_grad)
 print(output_grad)
 ```
 
-`slangpy` also binds the forward-mode version of your kernel (propagate derivatives of inputs to the output) which can be invoked the same way using `module.sqr.fwd()`
+`slangpy` also binds the forward-mode version of your kernel (propagate derivatives of inputs to the output) which can be invoked the same way using `module.square.fwd()`
 
 You can refer to [this documentation](07-autodiff.md) for a detailed reference of Slang's automatic differentiation feature.
 
@@ -160,7 +159,7 @@ You can refer to [this documentation](07-autodiff.md) for a detailed reference o
 
 `pytorch` offers an easy way to define a custom operation using `torch.autograd.Function`, and defining the `.forward()` and `.backward()` members.
 
-This can be very helpful to wrap your Slang kernels. Here's an example of the `square` kernel as a differentiable pytorch function.
+This can be a very helpful way to wrap your Slang kernels as pytorch-compatible operations. Here's an example of the `square` kernel as a differentiable pytorch function.
 
 ```python
 m = slangpy.loadModule("square.slang")
@@ -201,7 +200,7 @@ Now we can use the autograd function `MySquareFunc` in our python script:
 ```python
 x = torch.tensor([[3.0, 4.0],[0.0, 1.0]], requires_grad=True, device='cuda')
 print(f"X = {x}")
-y_pred = MySquareFuncInSlang.apply(x)
+y_pred = MySquareFunc.apply(x)
 loss = y_pred.sum()
 loss.backward()
 print(f"dX = {x.grad.cpu()}")
@@ -214,6 +213,14 @@ X = tensor([[3., 4.],
 dX = tensor([[6., 8.],
         [0., 2.]])
 ```
+
+And that's it! `slangpy.loadModule` uses JIT compilation to compile your Slang source into CUDA binary.
+It may take a little longer the first time you execute the script, but the compiled binaries will be cached and as long as the kernel code is not changed, future runs will not rebuild the CUDA kernel.
+
+Because the PyTorch JIT system requires `ninja`, you need to make sure `ninja` is installed on your system
+and is discoverable from the current environment, you also need to have a C++ compiler available on the system.
+On Windows, this means that Visual Studio need to be installed.
+
 ## Specializing shaders using slangpy
 
 `slangpy.loadModule` allows specialization parameters to be specified since it might be easier to write shaders with placeholder definitions that can be substituted at load-time.
@@ -447,67 +454,53 @@ Again, to understand all the details of the automatic differentiation system, pl
 ## Manually binding kernels
 `[AutoPyBindCUDA]` works for most use cases, but in certain situations, it may be necessary to write the *host* function by hand. The host function can also be written in Slang, and `slangpy` handles its compilation to C++.
 
-Here's the same `square` example from before, but with a hand-written host function:
+Here's the same `square` example from before:
 
 ```csharp
 // square.slang
-float square(float x)
+float compute_square(float x)
 {
     return x * x;
 }
-```
 
-This function is self-explanatory. To use it in PyTorch, we need to write a GPU kernel function (that maps to a 
-`__global__` CUDA function) that defines how to compute each element of the input tensor. So we continue to write
-the following Slang function:
-
-```csharp
 [CudaKernel]
-void square_fwd_kernel(TensorView<float> input, TensorView<float> output)
+void square_kernel(TensorView<float> input, TensorView<float> output)
 {
     uint3 globalIdx = cudaBlockIdx() * cudaBlockDim() + cudaThreadIdx();
 
-    if (globalIdx.x > input.size(0) || globalIdx.y > input.size(1))
+    if (globalIdx.x > input.size(0))
         return;
-    float result = square(input[globalIdx.xy]);
-    output[globalIdx.xy] = result;
+
+    float result = compute_square(input[globalIdx.x]);
+
+    output[globalIdx.x] = result;
 }
 ```
 
-This code follows the standard pattern of a typical CUDA kernel function. It takes as input
-two tensors, `input` and `output`. 
-It first obtains the global dispatch index of the current thread and performs range check to make sure we don't read or write out
-of the bounds of input and output tensors, and then calls `square()` to compute the per-element result, and
-store it at the corresponding location in `output` tensor.
+To manually invoke this kernel, we then need to write a CPU(host) function that defines how this kernel is dispatched. This can be defined in the same Slang file:
 
-With a kernel function defined, we then need to expose a CPU(host) function that defines how this kernel is dispatched:
 ```csharp
 [TorchEntryPoint]
-TorchTensor<float> square_fwd(TorchTensor<float> input)
+TorchTensor<float> square(TorchTensor<float> input)
 {
     var result = TorchTensor<float>.zerosLike(input);
     let blockCount = uint3(1);
     let groupSize = uint3(result.size(0), result.size(1), 1);
-    __dispatch_kernel(square_fwd_kernel, blockCount, groupSize)(input, result);
+    __dispatch_kernel(square_kernel, blockCount, groupSize)(input, result);
     return result;
 }
 ```
-Here, we mark the function with the `[TorchEntryPoint]` attribute, so it will be exported to Python. In the function body, we call `TorchTensor<float>.zerosLike` to allocate a 2D-tensor that has the same size as the input.
-`zerosLike` returns a `TorchTensor<float>` object that represents a CPU handle of a PyTorch tensor.
-Then we launch `square_fwd_kernel` with the `__dispatch_kernel` syntax. Note that we can directly pass
-`TorchTensor<float>` arguments to a `TensorView<float>` parameter and the compiler will automatically convert
-the type and obtain a view into the tensor that can be accessed by the GPU kernel function.
 
-### Calling Slang module from Python
+Here, we mark the function with the `[TorchEntryPoint]` attribute, so it will be compiled to C++ and exported as a python callable. 
+Since this is a host function, we can perform tensor allocations. For instnace, `square()` calls `TorchTensor<float>.zerosLike` to allocate a 2D-tensor that has the same size as the input.
+`zerosLike` returns a `TorchTensor<float>` object that represents a CPU handle of a PyTorch tensor.
 
-Next, let's see how we can call the `square_fwd` function we defined in the Slang module.
-To do so, we use a python package called `slangpy`. You can obtain it with
+Then we launch `square_kernel` with the `__dispatch_kernel` syntax. Note that we can directly pass
+`TorchTensor<float>` arguments to a `TensorView<float>` parameter and the compiler will automatically convert the type and obtain a view into the tensor that can be accessed by the GPU kernel function.
 
-```bash
-pip install slangpy
-```
+### Calling a `[TorchEntryPoint]` function from Python
 
-With that, you can use the following code to call `square_fwd` from Python:
+You can use the following code to call `square` from Python:
 
 ```python
 import torch
@@ -517,7 +510,7 @@ m = slangpy.loadModule("square.slang")
 
 x = torch.randn(2,2)
 print(f"X = {x}")
-y = m.square_fwd(x)
+y = m.square(x)
 print(f"Y = {y.cpu()}")
 ```
 
@@ -529,14 +522,6 @@ Y = tensor([[0.0198, 0.4349],
         [0.8060, 2.9688]])
 ```
 
-And that's it! `slangpy.loadModule` uses JIT compilation to compile your Slang source into CUDA binary.
-It may take a little longer the first time you execute the script, but the compiled binaries will be cached and as
-long as the kernel code is not changed, future runs will not rebuild the CUDA kernel.
-
-Because the PyTorch JIT system requires `ninja`, you need to make sure `ninja` is installed on your system
-and is discoverable from the current environment, you also need to have a C++ compiler available on the system.
-On Windows, this means that Visual Studio need to be installed.
-
 ### Manual binding for kernel derivatives
 
 The above example demonstrates how to write a simple kernel function in Slang and call it from Python.
@@ -554,7 +539,7 @@ float square(float x)
     return x * x;
 }
 ```
-This is done by simply adding a `[Differentiable]` attribute to our `square`function.
+This is done by simply adding a `[Differentiable]` attribute to our `square` function.
 
 With that, we can now define `square_bwd_kernel` that performs backward propagation as:
 
@@ -689,14 +674,32 @@ Atomically swaps `val` into the element at `index` if the element equals to `com
 
 ### `DiffTensorView` methods
 
+#### `DiffTensorView.operator[uint x, uint y, ...]`
+Provide an accessor to data content in a tensor. This method is **differentiable**, and has the same semantics as using a `.load()` to get data, and `.store()` to set data.
+
+#### `DiffTensorView.operator[vector<uint, N> index]`
+Provide an accessor to data content in a tensor, indexed by a uint vector.`tensor[uint3(1,2,3)]` is equivalent to `tensor[1,2,3]`. This method is **differentiable**, and has the same semantics as using a `.load()` to get data, and `.store()` to set data.
+
 #### `float DiffTensorView.load(vector<uint, N> index)`
 Loads the 32-bit floating point data at the specified multi-dimensional `index`. This method is **differentiable**, and in reverse-mode will perform an atomic-add.
 
 #### `void DiffTensorView.store(vector<uint, N> index, float val)`
 Stores the 32-bit floating point value `val` at the specified multi-dimensional `index`. This method is **differentiable**, and in reverse-mode will perform an *atomic exchange* to retrieve the derivative and replace with 0.
 
+#### `float DiffTensorView.loadOnce(vector<uint, N> index)`
+Loads the 32-bit floating point data at the specified multi-dimensional `index`. This method is **differentiable**, and uses a simple `store` for the reverse-mode for faster gradient aggregation, but `loadOnce` **must** be used at most once per index. `loadOnce` is ideal for situations where each thread loads data from a unique index, but will cause incorrect gradients when an index may be accessed multiple times.
+
+#### `void DiffTensorView.storeOnce(vector<uint, N> index, float val)`
+Stores the 32-bit floating point value `val` at the specified multi-dimensional `index`. This method is **differentiable**, and uses a simple `load` for the reverse-mode for faster gradient loading, but `storeOnce` **must** be used at most once per index. `loadOnce` is ideal for situations where each thread stores data to a unique index, but will cause incorrect gradient propagation when an index may be accessed multiple times.
+
 #### `uint DiffTensorView.size(int dim)`
-Returns the tensor's size (in number of elements) at `dim`.
+Returns the underlying primal tensor's size (in number of elements) at `dim`.
+
+#### `uint DiffTensorView.dims()`
+Returns the underlying primal tensor's dimension count.
+
+#### `uint DiffTensorView.stride(uint dim)`
+Returns the stride of the underlying primal tensor's `dim` dimension
 
 ### CUDA Support Functions
 
@@ -799,4 +802,4 @@ Calling `myFunc` from python will result in a python tuple in the form of
 [[tensor, tensor, tensor], float]
 ```
 
-The same transform rules apply to parameter types.
+The same transform rules apply to parameter types.
+\ No newline at end of file
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 77ed2d51f..9075ed3d3 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -2204,6 +2204,17 @@ struct TensorView
         return reinterpret_cast<T*>(data + offset);
     }
 
+    template<typename T, unsigned int N>
+    __device__ T* data_ptr_at(uint index[N])
+    {
+        uint64_t offset = 0;
+        for (unsigned int i = 0; i < N; ++i)
+        {
+            offset += strides[i] * index[i];
+        }
+        return reinterpret_cast<T*>(data + offset);
+    }
+
     template<typename T>
     __device__ T& load(uint32_t x)
     {
@@ -2215,20 +2226,48 @@ struct TensorView
         return *reinterpret_cast<T*>(data + strides[0] * x + strides[1] * y);
     }
     template<typename T>
+    __device__ T& load(uint2 index)
+    {
+        return *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y);
+    }
+    template<typename T>
     __device__ T& load(uint32_t x, uint32_t y, uint32_t z)
     {
         return *reinterpret_cast<T*>(data + strides[0] * x + strides[1] * y + strides[2] * z);
     }
     template<typename T>
+    __device__ T& load(uint3 index)
+    {
+        return *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z);
+    }
+    template<typename T>
     __device__ T& load(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
     {
         return *reinterpret_cast<T*>(data + strides[0] * x + strides[1] * y + strides[2] * z + strides[3] * w);
     }
     template<typename T>
+    __device__ T& load(uint4 index)
+    {
+        return *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + strides[3] * index.w);
+    }
+    template<typename T>
     __device__ T& load(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4)
     {
         return *reinterpret_cast<T*>(data + strides[0] * i0 + strides[1] * i1 + strides[2] * i2 + strides[3] * i3 + strides[4] * i4);
     }
+
+    // Generic version of load
+    template<typename T, unsigned int N>
+    __device__ T& load(uint index[N])
+    {
+        uint64_t offset = 0;
+        for (unsigned int i = 0; i < N; ++i)
+        {
+            offset += strides[i] * index[i];
+        }
+        return *reinterpret_cast<T*>(data + offset);
+    }
+
     template<typename T>
     __device__ void store(uint32_t x, T val)
     {
@@ -2240,19 +2279,46 @@ struct TensorView
         *reinterpret_cast<T*>(data + strides[0] * x + strides[1] * y) = val;
     }
     template<typename T>
+    __device__ void store(uint2 index, T val)
+    {
+        *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y) = val;
+    }
+    template<typename T>
     __device__ void store(uint32_t x, uint32_t y, uint32_t z, T val)
     {
         *reinterpret_cast<T*>(data + strides[0] * x + strides[1] * y + strides[2] * z) = val;
     }
     template<typename T>
+    __device__ void store(uint3 index, T val)
+    {
+        *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z) = val;
+    }
+    template<typename T>
     __device__ void store(uint32_t x, uint32_t y, uint32_t z, uint32_t w, T val)
     {
         *reinterpret_cast<T*>(
             data + strides[0] * x + strides[1] * y + strides[2] * z + strides[3] * w) = val;
     }
     template<typename T>
+    __device__ void store(uint4 index, T val)
+    {
+        *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + strides[3] * index.w) = val;
+    }
+    template<typename T>
     __device__ void store(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, T val)
     {
         *reinterpret_cast<T*>(data + strides[0] * i0 + strides[1] * i1 + strides[2] * i2 + strides[3] * i3 + strides[4] * i4) = val;
     }
+
+    // Generic version
+    template<typename T, unsigned int N>
+    __device__ void store(uint index[N], T val)
+    {
+        uint64_t offset = 0;
+        for (unsigned int i = 0; i < N; ++i)
+        {
+            offset += strides[i] * index[i];
+        }
+        *reinterpret_cast<T*>(data + offset) = val;
+    }
 };
diff --git a/prelude/slang-torch-prelude.h b/prelude/slang-torch-prelude.h
index 8d978642d..a2e4a1980 100644
--- a/prelude/slang-torch-prelude.h
+++ b/prelude/slang-torch-prelude.h
@@ -72,7 +72,7 @@ struct TensorView
 };
 
 
-TensorView make_tensor_view(torch::Tensor val, const char* name, torch::ScalarType targetScalarType)
+TensorView make_tensor_view(torch::Tensor val, const char* name, torch::ScalarType targetScalarType, bool requireContiguous)
 {
     // We're currently not trying to implicitly cast or transfer to device for two reasons:
     // 1. There appears to be a bug with .to() where successive calls after the first one fail.
@@ -88,7 +88,7 @@ TensorView make_tensor_view(torch::Tensor val, const char* name, torch::ScalarTy
         throw std::runtime_error(std::string(name).append(": tensor is not of the expected type.").c_str());
 
     // Check that the tensor is contiguous
-    if (!val.is_contiguous())
+    if (requireContiguous && !val.is_contiguous())
         throw std::runtime_error(std::string(name).append(": tensor is not contiguous.").c_str());
 
     TensorView res = {};
diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang
index 75c57018c..5fe1440e6 100644
--- a/source/slang/diff.meta.slang
+++ b/source/slang/diff.meta.slang
@@ -88,6 +88,11 @@ struct TensorView
     [__NoSideEffect]
     T load(uint i0, uint i1, uint i2, uint i3, uint i4);
 
+    [__NoSideEffect]
+    __generic<let N : int>
+    __target_intrinsic(cuda, "$0.load<$TR>($1)")
+    T load(vector<uint, N> index);
+
     __target_intrinsic(cuda, "$0.store<$G0>($1, $2)")
     void store(uint x, T val);
     __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3)")
@@ -99,6 +104,11 @@ struct TensorView
     __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3, $4, $5, $6)")
     void store(uint i0, uint i1, uint i2, uint i3, uint i4, T val);
 
+    [__NoSideEffect]
+    __generic<let N : int>
+    __target_intrinsic(cuda, "$0.store<$T1>($1)")
+    void store(vector<uint, N> index, T val);
+
     __target_intrinsic(cuda, "*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)")
     void InterlockedAdd(uint index, T val, out T oldVal);
     
@@ -266,165 +276,173 @@ extension TensorView<float>
 
 interface IDiffTensorWrapper
 {
-    __generic<T : __BuiltinFloatingPointType>
-    T load_forward(uint offset);
+    // Derivatives for universal load/store operations.
 
     __generic<T : __BuiltinFloatingPointType>
-    T load_forward_2(uint2 offset);
+    T load_forward(uint i);
 
-    __generic<T : __BuiltinFloatingPointType>
-    T load_forward_3(uint3 offset);
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    T load_forward(vector<uint, N> i);
 
     __generic<T : __BuiltinFloatingPointType>
-    T load_forward_4(uint4 offset);
+    void load_backward(uint i, T dOut);
 
-    __generic<T : __BuiltinFloatingPointType>
-    void load_backward(uint offset, T dOut);
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    void load_backward(vector<uint, N> i, T dOut);
 
     __generic<T : __BuiltinFloatingPointType>
-    void load_backward_2(uint2 offset, T dOut);
+    void store_forward(uint i, T dx);
 
-    __generic<T : __BuiltinFloatingPointType>
-    void load_backward_3(uint3 offset, T dOut);
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    void store_forward(vector<uint, N> i, T dx);
 
     __generic<T : __BuiltinFloatingPointType>
-    void load_backward_4(uint4 offset, T dOut);
+    T store_backward(uint i);
 
-    __generic<T : __BuiltinFloatingPointType>
-    void store_forward(uint offset, T dx);
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    T store_backward(vector<uint, N> i);
 
-    __generic<T : __BuiltinFloatingPointType>
-    void store_forward_2(uint2 offset, T dx);
+    // Derivatives for loadOnce/storeOnce operations. These operations
+    // are designed to only run once per-address and don't need atomic
+    // gradient handling.
+    //
 
     __generic<T : __BuiltinFloatingPointType>
-    void store_forward_3(uint3 offset, T dx);
+    T loadOnce_forward(uint i);
 
-    __generic<T : __BuiltinFloatingPointType>
-    void store_forward_4(uint4 offset, T dx);
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    T loadOnce_forward(vector<uint, N> i);
 
     __generic<T : __BuiltinFloatingPointType>
-    T store_backward(uint offset);
+    void loadOnce_backward(uint i, T dOut);
 
-    __generic<T : __BuiltinFloatingPointType>
-    T store_backward_2(uint2 offset);
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    void loadOnce_backward(vector<uint, N> i, T dOut);
 
     __generic<T : __BuiltinFloatingPointType>
-    T store_backward_3(uint3 offset);
+    void storeOnce_forward(uint i, T dx);
+
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    void storeOnce_forward(vector<uint, N> i, T dx);
 
     __generic<T : __BuiltinFloatingPointType>
-    T store_backward_4(uint4 offset);
+    T storeOnce_backward(uint i);
+
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    T storeOnce_backward(vector<uint, N> i);
 };
 
 struct AtomicAdd : IDiffTensorWrapper
 {
     TensorView<float> diff;
 
+    // Derivatives for universal load/store operations.
+
     __generic<T : __BuiltinFloatingPointType>
     T load_forward(uint i)
     {
         return __realCast<T, float>(diff.load(i));
     }
 
-    __generic<T : __BuiltinFloatingPointType>
-    T load_forward_2(uint2 i)
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    T load_forward(vector<uint, N> i)
     {
-        return __realCast<T, float>(diff.load(i.x, i.y));
+        return __realCast<T, float>(diff.load(i));
     }
 
     __generic<T : __BuiltinFloatingPointType>
-    T load_forward_3(uint3 i)
+    void load_backward(uint i, T dOut)
     {
-        return __realCast<T, float>(diff.load(i.x, i.y, i.z));
+        float oldVal;
+        diff.InterlockedAdd(i, __realCast<float, T>(dOut), oldVal);
     }
 
-    __generic<T : __BuiltinFloatingPointType>
-    T load_forward_4(uint4 i)
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    void load_backward(vector<uint, N> i, T dOut)
     {
-        return __realCast<T, float>(diff.load(i.x, i.y, i.z, i.w));
+        float oldVal;
+        diff.InterlockedAdd(i, __realCast<float, T>(dOut), oldVal);
     }
 
     __generic<T : __BuiltinFloatingPointType>
-    void load_backward(uint i, T dOut)
+    void store_forward(uint i, T dx)
     {
-        float oldVal;
-        diff.InterlockedAdd(i, __realCast<float, T>(dOut), oldVal);
+        diff.store(i, __realCast<float, T>(dx));
     }
 
-    __generic<T : __BuiltinFloatingPointType>
-    void load_backward_2(uint2 i, T dOut)
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    void store_forward(vector<uint, N> i, T dx)
     {
-        float oldVal;
-        diff.InterlockedAdd(i, __realCast<float, T>(dOut), oldVal);
+        diff.store(i, __realCast<float, T>(dx));
     }
 
     __generic<T : __BuiltinFloatingPointType>
-    void load_backward_3(uint3 i, T dOut)
+    T store_backward(uint i)
     {
         float oldVal;
-        diff.InterlockedAdd(i, __realCast<float, T>(dOut), oldVal);
+        diff.InterlockedExchange(i, (float)0, oldVal);
+        return __realCast<T, float>(oldVal);
     }
 
-    __generic<T : __BuiltinFloatingPointType>
-    void load_backward_4(uint4 i, T dOut)
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    T store_backward(vector<uint, N> i)
     {
         float oldVal;
-        diff.InterlockedAdd(i, __realCast<float, T>(dOut), oldVal);
+        diff.InterlockedExchange(i, (float)0, oldVal);
+        return __realCast<T, float>(oldVal);
     }
 
+    // Derivatives for loadOnce/storeOnce operations. These operations
+    // are designed to only run once per-address and don't need atomic
+    // gradient handling.
+    //
+
     __generic<T : __BuiltinFloatingPointType>
-    void store_forward(uint i, T dx)
+    T loadOnce_forward(uint i)
     {
-        diff.store(i, __realCast<float, T>(dx));
+        return __realCast<T, float>(diff.load(i));
     }
 
-    __generic<T : __BuiltinFloatingPointType>
-    void store_forward_2(uint2 i, T dx)
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    T loadOnce_forward(vector<uint, N> i)
     {
-        diff.store(i.x, i.y, __realCast<float, T>(dx));
+        return __realCast<T, float>(diff.load(i));
     }
 
     __generic<T : __BuiltinFloatingPointType>
-    void store_forward_3(uint3 i, T dx)
+    void loadOnce_backward(uint i, T dOut)
     {
-        diff.store(i.x, i.y, i.z, __realCast<float, T>(dx));
+        diff.store(i, __realCast<float, T>(dOut));
     }
 
-    __generic<T : __BuiltinFloatingPointType>
-    void store_forward_4(uint4 i, T dx)
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    void loadOnce_backward(vector<uint, N> i, T dOut)
     {
-        diff.store(i.x, i.y, i.z, i.w, __realCast<float, T>(dx));
+        diff.store(i, __realCast<float, T>(dOut));
     }
 
     __generic<T : __BuiltinFloatingPointType>
-    T store_backward(uint i)
+    void storeOnce_forward(uint i, T dx)
     {
-        float oldVal;
-        diff.InterlockedExchange(i, (float)0, oldVal);
-        return __realCast<T, float>(oldVal);
+        diff.store(i, __realCast<float, T>(dx));
     }
 
-    __generic<T : __BuiltinFloatingPointType>
-    T store_backward_2(uint2 i)
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    void storeOnce_forward(vector<uint, N> i, T dx)
     {
-        float oldVal;
-        diff.InterlockedExchange(i, (float)0, oldVal);
-        return __realCast<T, float>(oldVal);
+        diff.store(i, __realCast<float, T>(dx));
     }
 
     __generic<T : __BuiltinFloatingPointType>
-    T store_backward_3(uint3 i)
+    T storeOnce_backward(uint i)
     {
-        float oldVal;
-        diff.InterlockedExchange(i, (float)0, oldVal);
-        return __realCast<T, float>(oldVal);
+        return __realCast<T, float>(diff.load(i));
     }
 
-    __generic<T : __BuiltinFloatingPointType>
-    T store_backward_4(uint4 i)
+    __generic<T : __BuiltinFloatingPointType, let N : int>
+    T storeOnce_backward(vector<uint, N> i)
     {
-        float oldVal;
-        diff.InterlockedExchange(i, (float)0, oldVal);
-        return __realCast<T, float>(oldVal);
+        return __realCast<T, float>(diff.load(i));
     }
 };
 
@@ -439,120 +457,223 @@ struct DiffTensorView
         return primal.size(i);
     }
 
-    [BackwardDerivative(load_backward)]
-    [ForwardDerivative(load_forward)]
-    T load(uint i) { return primal.load(i); }
+    uint dims()
+    {
+        return primal.dims();
+    }
 
-    [BackwardDerivative(load_backward)]
-    [ForwardDerivative(load_forward)]
-    T load(uint2 i) { return primal.load(i.x, i.y); }
+    uint stride(uint i)
+    {
+        return primal.stride(i);
+    }
 
-    [BackwardDerivative(load_backward)]
-    [ForwardDerivative(load_forward)]
-    T load(uint3 i) { return primal.load(i.x, i.y, i.z); }
+    // Constructors
+    __init(TensorView<T> primal, A diff)
+    {
+        this.primal = primal;
+        this.diff = diff;
+    }
 
-    [BackwardDerivative(load_backward)]
-    [ForwardDerivative(load_forward)]
-    T load(uint4 i) { return primal.load(i.x, i.y, i.z, i.w); }
+    __init(TensorView<T> primal)
+    {
+        this.primal = primal;
+    }
+
+    // Universal load/store operations.
+
+    [BackwardDerivative(__load_backward)]
+    [ForwardDerivative(__load_forward)]
+    T load(uint i) { return primal.load(i); }
+
+    [BackwardDerivative(__load_backward)]
+    [ForwardDerivative(__load_forward)]
+    __generic<let N : int>
+    T load(vector<uint, N> i) { return primal.load(i); }
 
-    DifferentialPair<T> load_forward(uint x)
+    DifferentialPair<T> __load_forward(uint x)
     {
         return diffPair(primal.load(x), reinterpret<T.Differential, T>(diff.load_forward<T>(x)));
     }
 
-    DifferentialPair<T> load_forward(uint2 x)
+    __generic<let N : int>
+    DifferentialPair<T> __load_forward(vector<uint, N> x)
     {
-        return diffPair(primal.load(x.x, x.y), reinterpret<T.Differential, T>(diff.load_forward_2<T>(x)));
+        return diffPair(primal.load(x), reinterpret<T.Differential, T>(diff.load_forward<T, N>(x)));
     }
 
-    DifferentialPair<T> load_forward(uint3 x)
+    void __load_backward(uint x, T.Differential dOut)
     {
-        return diffPair(primal.load(x.x, x.y, x.z), reinterpret<T.Differential, T>(diff.load_forward_3<T>(x)));
+        diff.load_backward<T>(x, reinterpret<T, T.Differential>(dOut));
     }
 
-    DifferentialPair<T> load_forward(uint4 x)
+    __generic<let N : int>
+    void __load_backward(vector<uint, N> x, T.Differential dOut)
     {
-        return diffPair(primal.load(x.x, x.y, x.z, x.w), reinterpret<T.Differential, T>(diff.load_forward_4<T>(x)));
+        diff.load_backward<T, N>(x, reinterpret<T, T.Differential>(dOut));
     }
 
-    void load_backward(uint x, T.Differential dOut)
+    [BackwardDerivative(__store_backward)]
+    [ForwardDerivative(__store_forward)]
+    void store(uint x, T val) { primal.store(x, val); }
+
+    [BackwardDerivative(__store_backward)]
+    [ForwardDerivative(__store_forward)]
+    __generic<let N : int>
+    void store(vector<uint, N> x, T val) { primal.store(x, val); }
+
+    void __store_forward(uint x, DifferentialPair<T> dpval)
     {
-        diff.load_backward<T>(x, reinterpret<T, T.Differential>(dOut));
+        primal.store(x, dpval.p);
+        diff.store_forward<T>(x, reinterpret<T, T.Differential>(dpval.d));
     }
 
-    void load_backward(uint2 x, T.Differential dOut)
+    __generic<let N : int>
+    void __store_forward(vector<uint, N> x, DifferentialPair<T> dpval)
     {
-        diff.load_backward_2<T>(x, reinterpret<T, T.Differential>(dOut));
+        primal.store(x, dpval.p);
+        diff.store_forward<T, N>(x, reinterpret<T, T.Differential>(dpval.d));
     }
 
-    void load_backward(uint3 x, T.Differential dOut)
+    void __store_backward(uint x, inout DifferentialPair<T> dpval)
     {
-        diff.load_backward_3<T>(x, reinterpret<T, T.Differential>(dOut));
+        dpval = diffPair(dpval.p, reinterpret<T.Differential, T>(diff.store_backward<T>(x)));
     }
 
-    void load_backward(uint4 x, T.Differential dOut)
+    __generic<let N : int>
+    void __store_backward(vector<uint, N> x, inout DifferentialPair<T> dpval)
     {
-        diff.load_backward_4<T>(x, reinterpret<T, T.Differential>(dOut));
+        dpval = diffPair(dpval.p, reinterpret<T.Differential, T>(diff.store_backward<T, N>(x)));
     }
 
-    [BackwardDerivative(store_backward)]
-    [ForwardDerivative(store_forward)]
-    void store(uint x, T val) { primal.store(x, val); }
+    __subscript(uint index)->T
+    {
+        [__unsafeForceInlineEarly] [Differentiable] [__NoSideEffect] get { return load(index); }
+        [__unsafeForceInlineEarly] [Differentiable] set { store(index, newValue); }
 
-    [BackwardDerivative(store_backward)]
-    [ForwardDerivative(store_forward)]
-    void store(uint2 x, T val) { primal.store(x.x, x.y, val); }
+        [__NoSideEffect]
+        ref;
+    }
 
-    [BackwardDerivative(store_backward)]
-    [ForwardDerivative(store_forward)]
-    void store(uint3 x, T val) { primal.store(x.x, x.y, x.z, val); }
+    __subscript(uint2 index)->T
+    {
+        [__unsafeForceInlineEarly] [Differentiable] [__NoSideEffect] get { return load(index); }
+        [__unsafeForceInlineEarly] [Differentiable] set { store(index, newValue); }
 
-    [BackwardDerivative(store_backward)]
-    [ForwardDerivative(store_forward)]
-    void store(uint4 x, T val) { primal.store(x.x, x.y, x.z, x.w, val); }
+        [__NoSideEffect]
+        ref;
+    }
 
-    void store_forward(uint x, DifferentialPair<T> dpval)
+    __subscript(uint x, uint y)->T
     {
-        primal.store(x, dpval.p);
-        diff.store_forward<T>(x, reinterpret<T, T.Differential>(dpval.d));
+        [__unsafeForceInlineEarly] [Differentiable] [__NoSideEffect] get { return load(uint2(x, y)); }
+        [__unsafeForceInlineEarly] [Differentiable] set { store(uint2(x, y), newValue); }
+
+        [__NoSideEffect]
+        ref;
     }
 
-    void store_forward(uint2 x, DifferentialPair<T> dpval)
+    __subscript(uint3 index)->T
     {
-        primal.store(x.x, x.y, dpval.p);
-        diff.store_forward_2<T>(x, reinterpret<T, T.Differential>(dpval.d));
+        [__unsafeForceInlineEarly] [Differentiable] [__NoSideEffect] get { return load(index); }
+        [__unsafeForceInlineEarly] [Differentiable] set { store(index, newValue); }
+
+        [__NoSideEffect]
+        ref;
     }
 
-    void store_forward(uint3 x, DifferentialPair<T> dpval)
+    __subscript(uint x, uint y, uint z)->T
     {
-        primal.store(x.x, x.y, x.z, dpval.p);
-        diff.store_forward_3<T>(x, reinterpret<T, T.Differential>(dpval.d));
+        [__unsafeForceInlineEarly] [Differentiable] [__NoSideEffect] get { return load(uint3(x, y, z)); }
+        [__unsafeForceInlineEarly] [Differentiable] set { store(uint3(x, y, z), newValue); }
+
+        [__NoSideEffect]
+        ref;
     }
 
-    void store_forward(uint4 x, DifferentialPair<T> dpval)
+    __subscript(uint4 index)->T
     {
-        primal.store(x.x, x.y, x.z, x.w, dpval.p);
-        diff.store_forward_4<T>(x, reinterpret<T, T.Differential>(dpval.d));
+        [__unsafeForceInlineEarly] [Differentiable] [__NoSideEffect] get { return load(index); }
+        [__unsafeForceInlineEarly] [Differentiable] set { store(index, newValue); }
+
+        [__NoSideEffect]
+        ref;
     }
 
-    void store_backward(uint x, inout DifferentialPair<T> dpval)
+    __subscript(uint x, uint y, uint z, uint w)->T
     {
-        dpval = diffPair(dpval.p, reinterpret<T.Differential, T>(diff.store_backward<T>(x)));
+        [__unsafeForceInlineEarly] [Differentiable] [__NoSideEffect] get { return load(uint4(x, y, z, w)); }
+        [__unsafeForceInlineEarly] [Differentiable] set { store(uint4(x, y, z, w), newValue); }
+
+        [__NoSideEffect]
+        ref;
     }
 
-    void store_backward(uint2 x, inout DifferentialPair<T> dpval)
+    // loadOnce/storeOnce operations. These operations are designed to only run once per-address and 
+    // don't need atomic gradient handling.
+    //
+
+    [BackwardDerivative(__loadOnce_backward)]
+    [ForwardDerivative(__loadOnce_forward)]
+    T loadOnce(uint i) { return primal.load(i); }
+
+    [BackwardDerivative(__loadOnce_backward)]
+    [ForwardDerivative(__loadOnce_forward)]
+    __generic<let N : int>
+    T loadOnce(vector<uint, N> i) { return primal.load(i); }
+
+    DifferentialPair<T> __loadOnce_forward(uint x)
     {
-        dpval = diffPair(dpval.p, reinterpret<T.Differential, T>(diff.store_backward_2<T>(x)));
+        return diffPair(primal.load(x), reinterpret<T.Differential, T>(diff.loadOnce_forward<T>(x)));
     }
 
-    void store_backward(uint3 x, inout DifferentialPair<T> dpval)
+    __generic<let N : int>
+    DifferentialPair<T> __loadOnce_forward(vector<uint, N> x)
     {
-        dpval = diffPair(dpval.p, reinterpret<T.Differential, T>(diff.store_backward_3<T>(x)));
+        return diffPair(primal.load(x), reinterpret<T.Differential, T>(diff.loadOnce_forward<T, N>(x)));
     }
 
-    void store_backward(uint4 x, inout DifferentialPair<T> dpval)
+    void __loadOnce_backward(uint x, T.Differential dOut)
+    {
+        diff.loadOnce_backward<T>(x, reinterpret<T, T.Differential>(dOut));
+    }
+
+    __generic<let N : int>
+    void __loadOnce_backward(vector<uint, N> x, T.Differential dOut)
+    {
+        diff.loadOnce_backward<T, N>(x, reinterpret<T, T.Differential>(dOut));
+    }
+
+    [BackwardDerivative(__storeOnce_backward)]
+    [ForwardDerivative(__storeOnce_forward)]
+    void storeOnce(uint x, T val) { primal.store(x, val); }
+
+    [BackwardDerivative(__storeOnce_backward)]
+    [ForwardDerivative(__storeOnce_forward)]
+    __generic<let N : int>
+    void storeOnce(vector<uint, N> x, T val) { primal.store(x, val); }
+
+    void __storeOnce_forward(uint x, DifferentialPair<T> dpval)
+    {
+        primal.store(x, dpval.p);
+        diff.storeOnce_forward<T>(x, reinterpret<T, T.Differential>(dpval.d));
+    }
+
+    __generic<let N : int>
+    void __storeOnce_forward(vector<uint, N> x, DifferentialPair<T> dpval)
+    {
+        primal.store(x, dpval.p);
+        diff.storeOnce_forward<T, N>(x, reinterpret<T, T.Differential>(dpval.d));
+    }
+
+    void __storeOnce_backward(uint x, inout DifferentialPair<T> dpval)
+    {
+        dpval = diffPair(dpval.p, reinterpret<T.Differential, T>(diff.storeOnce_backward<T>(x)));
+    }
+
+    __generic<let N : int>
+    void __storeOnce_backward(vector<uint, N> x, inout DifferentialPair<T> dpval)
     {
-        dpval = diffPair(dpval.p, reinterpret<T.Differential, T>(diff.store_backward_4<T>(x)));
+        dpval = diffPair(dpval.p, reinterpret<T.Differential, T>(diff.storeOnce_backward<T, N>(x)));
     }
 };
 
diff --git a/source/slang/slang-check-overload.cpp b/source/slang/slang-check-overload.cpp
index d7ed5975d..c668155df 100644
--- a/source/slang/slang-check-overload.cpp
+++ b/source/slang/slang-check-overload.cpp
@@ -964,10 +964,14 @@ namespace Slang
         {
             auto leftType = DeclRefType::create(m_astBuilder, left.declRef.getParent());
             auto rightType = DeclRefType::create(m_astBuilder, right.declRef.getParent());
-            if (isSubtype(leftType, rightType))
-                return -1;
-            if (isSubtype(rightType, leftType))
-                return 1;
+
+            if (!leftType->equals(rightType))
+            {
+                if (isSubtype(leftType, rightType))
+                    return -1;
+                if (isSubtype(rightType, leftType))
+                    return 1;
+            }
         }
 
         // TODO: We should generalize above rules such that in a tie a declaration
diff --git a/source/slang/slang-emit-torch.cpp b/source/slang/slang-emit-torch.cpp
index 7cd793ec1..54408aa80 100644
--- a/source/slang/slang-emit-torch.cpp
+++ b/source/slang/slang-emit-torch.cpp
@@ -118,6 +118,13 @@ bool TorchCppSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo&
         emitStringLiteral(getUnmangledName(inst->getOperand(0)));
         m_writer->emit(", ");
         emitTorchScalarTypeName(m_writer, inst->getOperand(0)->getDataType());
+        m_writer->emit(", ");
+
+        if (as<IRVectorType>(inst->getOperand(0)->getDataType()))
+            m_writer->emit("true");
+        else
+            m_writer->emit("false");
+            
         m_writer->emit(")");
         return true;
     }
diff --git a/source/slang/slang-ir-autodiff-cfg-norm.cpp b/source/slang/slang-ir-autodiff-cfg-norm.cpp
index a9db3aecc..e7c269756 100644
--- a/source/slang/slang-ir-autodiff-cfg-norm.cpp
+++ b/source/slang/slang-ir-autodiff-cfg-norm.cpp
@@ -619,6 +619,7 @@ struct CFGNormalizationPass
                 // SLANG_UNEXPECTED("Switch-case normalization not implemented yet.");
                 BreakableRegionInfo info;
                 info.breakBlock = as<IRSwitch>(branchInst)->getBreakLabel();
+                info.headerBlock = as<IRBlock>(branchInst->getParent());
 
                 // Emit var into parent block.
                 builder.setInsertBefore(as<IRBlock>(branchInst->getParent())->getTerminator());
diff --git a/tests/autodiff/autopybind-differentiable.slang b/tests/autodiff/autopybind-differentiable.slang
index 9595a09d2..27df19b8a 100644
--- a/tests/autodiff/autopybind-differentiable.slang
+++ b/tests/autodiff/autopybind-differentiable.slang
@@ -12,7 +12,7 @@ void myKernel(DiffTensorView inValues, DiffTensorView outValues)
 {
     if (cudaThreadIdx().x > 0)
         return;
-    outValues.store(cudaThreadIdx().x, sin(inValues.load(cudaThreadIdx().x)));
+    outValues[cudaThreadIdx().x] = sin(inValues[cudaThreadIdx().x]);
 }
 
 // TORCH:      {{^SLANG_PRELUDE_EXPORT$}}