summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--docs/user-guide/a1-02-slangpy.md79
-rw-r--r--source/slang/diff.meta.slang105
2 files changed, 116 insertions, 68 deletions
diff --git a/docs/user-guide/a1-02-slangpy.md b/docs/user-guide/a1-02-slangpy.md
index 6a9b8baa3..8ee5233ba 100644
--- a/docs/user-guide/a1-02-slangpy.md
+++ b/docs/user-guide/a1-02-slangpy.md
@@ -226,53 +226,100 @@ The `TensorView<T>` represents the GPU view of a tensor and provides accesors to
Following is a list of builtin methods and attributes for PyTorch interop.
-### `static TorchTensor<T> TorchTensor<T>.alloc(uint x, uint y, ...)`
+### `TorchTensor` methods
+
+#### `static TorchTensor<T> TorchTensor<T>.alloc(uint x, uint y, ...)`
Allocates a new PyTorch tensor with the given dimensions.
-### `static TorchTensor<T> TorchTensor<T>.zerosLike(TorchTensor<T> other)`
+#### `static TorchTensor<T> TorchTensor<T>.emptyLike(TorchTensor<T> other)`
+Allocates a new PyTorch tensor that has the same dimensions as `other` without initializing it.
+
+#### `static TorchTensor<T> TorchTensor<T>.zerosLike(TorchTensor<T> other)`
Allocates a new PyTorch tensor that has the same dimensions as `other` and initialize it to zero.
-### `uint TorchTensor<T>.dims()`
+#### `uint TorchTensor<T>.dims()`
Returns the tensor's dimension count.
-### `uint TorchTensor<T>.size(int dim)`
+#### `uint TorchTensor<T>.size(int dim)`
Returns the tensor's size (in number of elements) at `dim`.
-### `uint TorchTensor<T>.stride(int dim)`
+#### `uint TorchTensor<T>.stride(int dim)`
Returns the tensor's stride (in bytes) at `dim`.
-### `TensorView<T>.operator[uint x, uint y, ...]`
+### `TensorView` methods
+
+#### `TensorView<T>.operator[uint x, uint y, ...]`
Provide an accessor to data content in a tensor.
-### `TensorView<T>.operator[vector<uint, N> index]`
+#### `TensorView<T>.operator[vector<uint, N> index]`
Provide an accessor to data content in a tensor, indexed by a uint vector.
`tensor[uint3(1,2,3)]` is equivalent to `tensor[1,2,3]`.
-### `uint TensorView<T>.dims()`
+#### `uint TensorView<T>.dims()`
Returns the tensor's dimension count.
-### `uint TensorView<T>.size(int dim)`
+#### `uint TensorView<T>.size(int dim)`
Returns the tensor's size (in number of elements) at `dim`.
-### `uint TensorView<T>.stride(int dim)`
+#### `uint TensorView<T>.stride(int dim)`
Returns the tensor's stride (in bytes) at `dim`.
-### `cudaThreadIdx()`
+#### `void TensorView<T>.fillZero()`
+Fills the tensor with zeros. Modifies the tensor in-place.
+
+#### `void TensorView<T>.fillValue(T value)`
+Fills the tensor with the specified value. Modifies the tensor in-place.
+
+#### `T* TensorView<T>.data_ptr_at(vector<uint, N> index)`
+Returns a pointer to the element at `index`.
+
+#### `void TensorView<T>.InterlockedAdd(vector<uint, N> index, T val, out T oldVal)`
+Atomically adds `val` to the element at `index`.
+
+#### `void TensorView<T>.InterlockedMin(vector<uint, N> index, T val, out T oldVal)`
+Atomically computes the min of `val` and the element at `index`. Available for 32 and 64 bit integer types only.
+
+#### `void TensorView<T>.InterlockedMax(vector<uint, N> index, T val, out T oldVal)`
+Atomically computes the max of `val` and the element at `index`. Available for 32 and 64 bit integer types only.
+
+#### `void TensorView<T>.InterlockedAnd(vector<uint, N> index, T val, out T oldVal)`
+Atomically computes the bitwise and of `val` and the element at `index`. Available for 32 and 64 bit integer types only.
+
+#### `void TensorView<T>.InterlockedOr(vector<uint, N> index, T val, out T oldVal)`
+Atomically computes the bitwise or of `val` and the element at `index`. Available for 32 and 64 bit integer types only.
+
+#### `void TensorView<T>.InterlockedXor(vector<uint, N> index, T val, out T oldVal)`
+Atomically computes the bitwise xor of `val` and the element at `index`. Available for 32 and 64 bit integer types only.
+
+#### `void TensorView<T>.InterlockedExchange(vector<uint, N> index, T val, out T oldVal)`
+Atomically swaps `val` into the element at `index`. Available for `float` and 32/64 bit integer types only.
+
+#### `void TensorView<T>.InterlockedCompareExchange(vector<uint, N> index, T compare, T val)`
+Atomically swaps `val` into the element at `index` if the element equals `compare`. Available for `float` and 32/64 bit integer types only.
+
+### CUDA Support Functions
+
+#### `cudaThreadIdx()`
Returns the `threadIdx` variable in CUDA.
-### `cudaBlockIdx()`
+#### `cudaBlockIdx()`
Returns the `blockIdx` variable in CUDA.
-### `cudaBlockDim()`
+#### `cudaBlockDim()`
Returns the `blockDim` variable in CUDA.
-### `[CudaKernel]` attribute
+#### `syncTorchCudaStream()`
+Waits for all pending CUDA kernel executions to complete on host.
+
+### Attributes for PyTorch Interop
+
+#### `[CudaKernel]` attribute
Marks a function as a CUDA kernel (maps to a `__global__` function)
-### `[TorchEntryPoint]` attribute
+#### `[TorchEntryPoint]` attribute
Marks a function for export to Python. Functions marked with `[TorchEntryPoint]` will be accessible from a loaded module returned by `slangpy.loadModule`.
-### `[CudaDeviceExport]` attribute
+#### `[CudaDeviceExport]` attribute
Marks a function as a cuda device function, and ensures the compiler to include it in the generated cuda source.
## Type Marshalling Between Slang and Python
diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang
index 252b6f5e9..2bdaccee3 100644
--- a/source/slang/diff.meta.slang
+++ b/source/slang/diff.meta.slang
@@ -74,12 +74,12 @@ struct TensorView
__target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3, $4, $5, $6)")
void store(uint i0, uint i1, uint i2, uint i3, uint i4, T val);
- __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)")
- T InterlockedAdd(uint index, T val);
+ __target_intrinsic(cuda, "*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedAdd(uint index, T val, out T oldVal);
__generic<let N:int>
- __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)")
- T InterlockedAdd(vector<uint, N> index, T val);
+ __target_intrinsic(cuda, "*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedAdd(vector<uint, N> index, T val, out T oldVal);
__target_intrinsic(cuda, "$0.dimensionCount")
[__readNone]
@@ -159,61 +159,55 @@ for (auto atomicIntegerTypeName : kCudaAtomicIntegerTypes)
extension TensorView<$(atomicIntegerTypeName)>
{
typealias __Element = $(atomicIntegerTypeName);
- __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedIncrement(uint index, __Element val);
- __generic<let N:int>
- __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedIncrement(vector<uint, N> index, __Element val);
+ __target_intrinsic(cuda, "*($3) = atomicMin($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedMin(uint index, __Element val, out __Element oldVal);
- __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedMin(uint index, __Element val);
-
- __generic<let N:int>
- __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedMin(vector<uint, N> index, __Element val);
+ __generic<let N : int>
+ __target_intrinsic(cuda, "*($3) = atomicMin($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedMin(vector<uint, N> index, __Element val, out __Element oldVal);
- __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedMax<T>(uint index, __Element val);
-
- __generic<let N:int>
- __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedMax(vector<uint, N> index, __Element val);
+ __target_intrinsic(cuda, "*($3) = atomicMax($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedMax<T>(uint index, __Element val, out __Element oldVal);
- __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedAnd<T>(uint index, __Element val);
-
- __generic<let N:int>
- __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedAnd(vector<uint, N> index, __Element val);
+ __generic<let N : int>
+ __target_intrinsic(cuda, "*($3) = atomicMax($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedMax(vector<uint, N> index, __Element val, out __Element oldVal);
- __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedOr<T>(uint index, __Element val);
-
- __generic<let N:int>
- __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedOr(vector<uint, N> index, __Element val);
+ __target_intrinsic(cuda, "*($3) = atomicAnd($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedAnd<T>(uint index, __Element val, out __Element oldVal);
- __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedXor<T>(uint index, __Element val);
-
- __generic<let N:int>
- __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedXor(vector<uint, N> index, __Element val);
-
- __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedExchange(uint index, __Element val);
+ __generic<let N : int>
+ __target_intrinsic(cuda, "*($3) = atomicAnd($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedAnd(vector<uint, N> index, __Element val, out __Element oldVal);
+
+ __target_intrinsic(cuda, "*($3) = atomicOr($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedOr<T>(uint index, __Element val, out __Element oldVal);
+
+ __generic<let N : int>
+ __target_intrinsic(cuda, "*($3) = atomicOr($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedOr(vector<uint, N> index, __Element val, out __Element oldVal);
+
+ __target_intrinsic(cuda, "*($3) = atomicXor($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedXor<T>(uint index, __Element val, out __Element oldVal);
+
+ __generic<let N : int>
+ __target_intrinsic(cuda, "*($3) = atomicXor($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedXor(vector<uint, N> index, __Element val, out __Element oldVal);
+
+ __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedExchange(uint index, __Element va, out __Element oldVall);
__generic<let N:int>
- __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)")
- __Element InterlockedExchange(vector<uint, N> index, __Element val);
+ __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<$T2>($1), $2)")
+ void InterlockedExchange(vector<uint, N> index, __Element val, out __Element oldVal);
- __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)")
- __Element InterlockedCompareExchange(uint index, __Element compare, __Element val);
+ __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$T2>($1), $2, $3)")
+ void InterlockedCompareExchange(uint index, __Element compare, __Element val);
__generic<let N:int>
- __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)")
- __Element InterlockedCompareExchange(vector<uint, N> index, __Element compare, __Element val);
+ __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$T2>($1), $2, $3)")
+ void InterlockedCompareExchange(vector<uint, N> index, __Element compare, __Element val);
}
${{{{
@@ -222,12 +216,19 @@ ${{{{
extension TensorView<float>
{
- __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)")
- float InterlockedExchange(uint index, float val);
+ __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<float>($1), $2)")
+ float InterlockedExchange(uint index, float val, out float oldVal);
__generic<let N:int>
- __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)")
- float InterlockedExchange(vector<uint, N> index, float val);
+ __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<float>($1), $2)")
+ float InterlockedExchange(vector<uint, N> index, float val, out float oldVal);
+
+ __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<uint32_t>($1), slang_bit_cast<uint32_t>($2), slang_bit_cast<uint32_t>($3))")
+ void InterlockedCompareExchange(uint index, float compare, float val);
+
+ __generic<let N : int>
+ __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<uint32_t>($1), slang_bit_cast<uint32_t>($2), slang_bit_cast<uint32_t>($3))")
+ void InterlockedCompareExchange(vector<uint, N> index, float compare, float val);
}
__generic<T>