diff options
| -rw-r--r-- | docs/user-guide/a1-02-slangpy.md | 79 | ||||
| -rw-r--r-- | source/slang/diff.meta.slang | 105 |
2 files changed, 116 insertions, 68 deletions
diff --git a/docs/user-guide/a1-02-slangpy.md b/docs/user-guide/a1-02-slangpy.md index 6a9b8baa3..8ee5233ba 100644 --- a/docs/user-guide/a1-02-slangpy.md +++ b/docs/user-guide/a1-02-slangpy.md @@ -226,53 +226,100 @@ The `TensorView<T>` represents the GPU view of a tensor and provides accesors to Following is a list of builtin methods and attributes for PyTorch interop. -### `static TorchTensor<T> TorchTensor<T>.alloc(uint x, uint y, ...)` +### `TorchTensor` methods + +#### `static TorchTensor<T> TorchTensor<T>.alloc(uint x, uint y, ...)` Allocates a new PyTorch tensor with the given dimensions. -### `static TorchTensor<T> TorchTensor<T>.zerosLike(TorchTensor<T> other)` +#### `static TorchTensor<T> TorchTensor<T>.emptyLike(TorchTensor<T> other)` +Allocates a new PyTorch tensor that has the same dimensions as `other` without initializing it. + +#### `static TorchTensor<T> TorchTensor<T>.zerosLike(TorchTensor<T> other)` Allocates a new PyTorch tensor that has the same dimensions as `other` and initialize it to zero. -### `uint TorchTensor<T>.dims()` +#### `uint TorchTensor<T>.dims()` Returns the tensor's dimension count. -### `uint TorchTensor<T>.size(int dim)` +#### `uint TorchTensor<T>.size(int dim)` Returns the tensor's size (in number of elements) at `dim`. -### `uint TorchTensor<T>.stride(int dim)` +#### `uint TorchTensor<T>.stride(int dim)` Returns the tensor's stride (in bytes) at `dim`. -### `TensorView<T>.operator[uint x, uint y, ...]` +### `TensorView` methods + +#### `TensorView<T>.operator[uint x, uint y, ...]` Provide an accessor to data content in a tensor. -### `TensorView<T>.operator[vector<uint, N> index]` +#### `TensorView<T>.operator[vector<uint, N> index]` Provide an accessor to data content in a tensor, indexed by a uint vector. `tensor[uint3(1,2,3)]` is equivalent to `tensor[1,2,3]`. -### `uint TensorView<T>.dims()` +#### `uint TensorView<T>.dims()` Returns the tensor's dimension count. 
-### `uint TensorView<T>.size(int dim)` +#### `uint TensorView<T>.size(int dim)` Returns the tensor's size (in number of elements) at `dim`. -### `uint TensorView<T>.stride(int dim)` +#### `uint TensorView<T>.stride(int dim)` Returns the tensor's stride (in bytes) at `dim`. -### `cudaThreadIdx()` +#### `void TensorView<T>.fillZero()` +Fills the tensor with zeros. Modifies the tensor in-place. + +#### `void TensorView<T>.fillValue(T value)` +Fills the tensor with the specified value. Modifies the tensor in-place. + +#### `T* TensorView<T>.data_ptr_at(vector<uint, N> index)` +Returns a pointer to the element at `index`. + +#### `void TensorView<T>.InterlockedAdd(vector<uint, N> index, T val, out T oldVal)` +Atomically adds `val` to the element at `index`. + +#### `void TensorView<T>.InterlockedMin(vector<uint, N> index, T val, out T oldVal)` +Atomically computes the min of `val` and the element at `index`. Available for 32 and 64 bit integer types only. + +#### `void TensorView<T>.InterlockedMax(vector<uint, N> index, T val, out T oldVal)` +Atomically computes the max of `val` and the element at `index`. Available for 32 and 64 bit integer types only. + +#### `void TensorView<T>.InterlockedAnd(vector<uint, N> index, T val, out T oldVal)` +Atomically computes the bitwise and of `val` and the element at `index`. Available for 32 and 64 bit integer types only. + +#### `void TensorView<T>.InterlockedOr(vector<uint, N> index, T val, out T oldVal)` +Atomically computes the bitwise or of `val` and the element at `index`. Available for 32 and 64 bit integer types only. + +#### `void TensorView<T>.InterlockedXor(vector<uint, N> index, T val, out T oldVal)` +Atomically computes the bitwise xor of `val` and the element at `index`. Available for 32 and 64 bit integer types only. + +#### `void TensorView<T>.InterlockedExchange(vector<uint, N> index, T val, out T oldVal)` +Atomically swaps `val` into the element at `index`. Available for `float` and 32/64 bit integer types only. 
+ +#### `void TensorView<T>.InterlockedCompareExchange(vector<uint, N> index, T compare, T val)` +Atomically swaps `val` into the element at `index` if the element equals `compare`. Available for `float` and 32/64 bit integer types only. + +### CUDA Support Functions + +#### `cudaThreadIdx()` Returns the `threadIdx` variable in CUDA. -### `cudaBlockIdx()` +#### `cudaBlockIdx()` Returns the `blockIdx` variable in CUDA. -### `cudaBlockDim()` +#### `cudaBlockDim()` Returns the `blockDim` variable in CUDA. -### `[CudaKernel]` attribute +#### `syncTorchCudaStream()` +Waits for all pending CUDA kernel executions to complete on host. + +### Attributes for PyTorch Interop + +#### `[CudaKernel]` attribute Marks a function as a CUDA kernel (maps to a `__global__` function) -### `[TorchEntryPoint]` attribute +#### `[TorchEntryPoint]` attribute Marks a function for export to Python. Functions marked with `[TorchEntryPoint]` will be accessible from a loaded module returned by `slangpy.loadModule`. -### `[CudaDeviceExport]` attribute +#### `[CudaDeviceExport]` attribute Marks a function as a cuda device function, and ensures the compiler to include it in the generated cuda source. 
## Type Marshalling Between Slang and Python diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang index 252b6f5e9..2bdaccee3 100644 --- a/source/slang/diff.meta.slang +++ b/source/slang/diff.meta.slang @@ -74,12 +74,12 @@ struct TensorView __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3, $4, $5, $6)") void store(uint i0, uint i1, uint i2, uint i3, uint i4, T val); - __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)") - T InterlockedAdd(uint index, T val); + __target_intrinsic(cuda, "*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)") + void InterlockedAdd(uint index, T val, out T oldVal); __generic<let N:int> - __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)") - T InterlockedAdd(vector<uint, N> index, T val); + __target_intrinsic(cuda, "*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)") + void InterlockedAdd(vector<uint, N> index, T val, out T oldVal); __target_intrinsic(cuda, "$0.dimensionCount") [__readNone] @@ -159,61 +159,55 @@ for (auto atomicIntegerTypeName : kCudaAtomicIntegerTypes) extension TensorView<$(atomicIntegerTypeName)> { typealias __Element = $(atomicIntegerTypeName); - __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedIncrement(uint index, __Element val); - __generic<let N:int> - __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedIncrement(vector<uint, N> index, __Element val); + __target_intrinsic(cuda, "*($3) = atomicMin($0.data_ptr_at<$T2>($1), $2)") + void InterlockedMin(uint index, __Element val, out __Element oldVal); - __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedMin(uint index, __Element val); - - __generic<let N:int> - __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedMin(vector<uint, N> index, __Element val); + __generic<let N : int> + __target_intrinsic(cuda, "*($3) = atomicMin($0.data_ptr_at<$T2>($1), $2)") + void 
InterlockedMin(vector<uint, N> index, __Element val, out __Element oldVal); - __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedMax<T>(uint index, __Element val); - - __generic<let N:int> - __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedMax(vector<uint, N> index, __Element val); + __target_intrinsic(cuda, "*($3) = atomicMax($0.data_ptr_at<$T2>($1), $2)") + void InterlockedMax<T>(uint index, __Element val, out __Element oldVal); - __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedAnd<T>(uint index, __Element val); - - __generic<let N:int> - __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedAnd(vector<uint, N> index, __Element val); + __generic<let N : int> + __target_intrinsic(cuda, "*($3) = atomicMax($0.data_ptr_at<$T2>($1), $2)") + void InterlockedMax(vector<uint, N> index, __Element val, out __Element oldVal); - __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedOr<T>(uint index, __Element val); - - __generic<let N:int> - __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedOr(vector<uint, N> index, __Element val); + __target_intrinsic(cuda, "*($3) = atomicAnd($0.data_ptr_at<$T2>($1), $2)") + void InterlockedAnd<T>(uint index, __Element val, out __Element oldVal); - __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedXor<T>(uint index, __Element val); - - __generic<let N:int> - __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedXor(vector<uint, N> index, __Element val); - - __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedExchange(uint index, __Element val); + __generic<let N : int> + __target_intrinsic(cuda, "*($3) = atomicAnd($0.data_ptr_at<$T2>($1), $2)") + void InterlockedAnd(vector<uint, N> index, __Element 
val, out __Element oldVal); + + __target_intrinsic(cuda, "*($3) = atomicOr($0.data_ptr_at<$T2>($1), $2)") + void InterlockedOr<T>(uint index, __Element val, out __Element oldVal); + + __generic<let N : int> + __target_intrinsic(cuda, "*($3) = atomicOr($0.data_ptr_at<$T2>($1), $2)") + void InterlockedOr(vector<uint, N> index, __Element val, out __Element oldVal); + + __target_intrinsic(cuda, "*($3) = atomicXor($0.data_ptr_at<$T2>($1), $2)") + void InterlockedXor<T>(uint index, __Element val, out __Element oldVal); + + __generic<let N : int> + __target_intrinsic(cuda, "*($3) = atomicXor($0.data_ptr_at<$T2>($1), $2)") + void InterlockedXor(vector<uint, N> index, __Element val, out __Element oldVal); + + __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<$T2>($1), $2)") + void InterlockedExchange(uint index, __Element va, out __Element oldVall); __generic<let N:int> - __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)") - __Element InterlockedExchange(vector<uint, N> index, __Element val); + __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<$T2>($1), $2)") + void InterlockedExchange(vector<uint, N> index, __Element val, out __Element oldVal); - __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)") - __Element InterlockedCompareExchange(uint index, __Element compare, __Element val); + __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$T2>($1), $2, $3)") + void InterlockedCompareExchange(uint index, __Element compare, __Element val); __generic<let N:int> - __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)") - __Element InterlockedCompareExchange(vector<uint, N> index, __Element compare, __Element val); + __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$T2>($1), $2, $3)") + void InterlockedCompareExchange(vector<uint, N> index, __Element compare, __Element val); } ${{{{ @@ -222,12 +216,19 @@ ${{{{ extension TensorView<float> { - __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)") - 
float InterlockedExchange(uint index, float val); + __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<float>($1), $2)") + float InterlockedExchange(uint index, float val, out float oldVal); __generic<let N:int> - __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)") - float InterlockedExchange(vector<uint, N> index, float val); + __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<float>($1), $2)") + float InterlockedExchange(vector<uint, N> index, float val, out float oldVal); + + __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<uint32_t>($1), slang_bit_cast<uint32_t>($2), slang_bit_cast<uint32_t>($3))") + void InterlockedCompareExchange(uint index, float compare, float val); + + __generic<let N : int> + __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<uint32_t>($1), slang_bit_cast<uint32_t>($2), slang_bit_cast<uint32_t>($3))") + void InterlockedCompareExchange(vector<uint, N> index, float compare, float val); } __generic<T> |
