diff options
| author | Sai Praveen Bangaru <31557731+saipraveenb25@users.noreply.github.com> | 2023-09-25 18:29:35 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-09-25 15:29:35 -0700 |
| commit | 56c4a8cba30b463fdcab21d33680f8ba70b452e0 (patch) | |
| tree | 8d2fb5dbb951a88c24fd3cc6dc337671773989bf /docs/user-guide | |
| parent | 2e761512add35fc719b5e5f5ef3315577777124c (diff) | |
Update a1-02-slangpy.md (#3237)
Diffstat (limited to 'docs/user-guide')
| -rw-r--r-- | docs/user-guide/a1-02-slangpy.md | 17 |
1 files changed, 10 insertions, 7 deletions
diff --git a/docs/user-guide/a1-02-slangpy.md b/docs/user-guide/a1-02-slangpy.md index 99476c5b2..8def87cea 100644 --- a/docs/user-guide/a1-02-slangpy.md +++ b/docs/user-guide/a1-02-slangpy.md @@ -42,7 +42,7 @@ void square(TensorView<float> input, TensorView<float> output) uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim(); // If the thread index is beyond the input size, exit early. - if (dispatchIdx.x < input.size(0)) + if (dispatchIdx.x >= input.size(0)) return; output[dispatchIdx.x] = input[dispatchIdx.x] * input[dispatchIdx.x]; @@ -104,7 +104,7 @@ void square(DiffTensorView input, DiffTensorView output) { uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim(); - if (dispatchIdx.x < input.size(0)) + if (dispatchIdx.x >= input.size(0)) return; output[dispatchIdx.x] = input[dispatchIdx.x] * input[dispatchIdx.x]; @@ -116,7 +116,7 @@ Now, `slangpy.loadModule("square.slang")` returns a scope with three callable ha You can invoke `square()` normally to get the same effect as the previous example, or invoke `square.fwd()` / `square.bwd()` by binding pairs of tensors to compute the derivatives. -``` Python +```python import torch import slangpy @@ -162,6 +162,9 @@ You can refer to [this documentation](07-autodiff.md) for a detailed reference o This can be a very helpful way to wrap your Slang kernels as pytorch-compatible operations. Here's an example of the `square` kernel as a differentiable pytorch function. ```python +import torch +import slangpy + m = slangpy.loadModule("square.slang") class MySquareFunc(torch.autograd.Function): @@ -172,7 +175,7 @@ class MySquareFunc(torch.autograd.Function): kernel_with_args = m.square(input=input, output=output) kernel_with_args.launchRaw( blockSize=(32, 32, 1), - gridSize=((input.shape[0] + 31) / 32, (input.shape[1] + 31) / 32, 1)) + gridSize=((input.shape[0] + 31) // 32, (input.shape[1] + 31) // 32, 1)) ctx.save_for_backward(input, output) @@ -190,7 +193,7 @@ class MySquareFunc(torch.autograd.Function): kernel_with_args = m.square.bwd(input=(input, input_grad), output=(output, grad_output)) kernel_with_args.launchRaw( blockSize=(32, 32, 1), - gridSize=((input.shape[0] + 31) / 32, (input.shape[1] + 31) / 32, 1)) + gridSize=((input.shape[0] + 31) // 32, (input.shape[1] + 31) // 32, 1)) return input_grad ``` @@ -468,7 +471,7 @@ void square_kernel(TensorView<float> input, TensorView<float> output) { uint3 globalIdx = cudaBlockIdx() * cudaBlockDim() + cudaThreadIdx(); - if (globalIdx.x > input.size(0)) + if (globalIdx.x >= input.size(0)) return; float result = compute_square(input[globalIdx.x]); @@ -549,7 +552,7 @@ void square_bwd_kernel(TensorView<float> input, TensorView<float> grad_out, Tens { uint3 globalIdx = cudaBlockIdx() * cudaBlockDim() + cudaThreadIdx(); - if (globalIdx.x > input.size(0) || globalIdx.y > input.size(1)) + if (globalIdx.x >= input.size(0) || globalIdx.y >= input.size(1)) return; DifferentialPair<float> dpInput = diffPair(input[globalIdx.xy]); |
