Update a1-02-slangpy.md (#3237)

author: Sai Praveen Bangaru <31557731+saipraveenb25@users.noreply.github.com> 2023-09-25 18:29:35 -0400
committer: GitHub <noreply@github.com> 2023-09-25 15:29:35 -0700
commit: 56c4a8cba30b463fdcab21d33680f8ba70b452e0 (patch)
tree: 8d2fb5dbb951a88c24fd3cc6dc337671773989bf /docs/user-guide
parent: 2e761512add35fc719b5e5f5ef3315577777124c (diff)
1 file changed, 10 insertions, 7 deletions
diff --git a/docs/user-guide/a1-02-slangpy.md b/docs/user-guide/a1-02-slangpy.md
index 99476c5b2..8def87cea 100644
--- a/docs/user-guide/a1-02-slangpy.md
+++ b/docs/user-guide/a1-02-slangpy.md
@@ -42,7 +42,7 @@ void square(TensorView<float> input, TensorView<float> output)
     uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
 
     // If the thread index is beyond the input size, exit early.
-    if (dispatchIdx.x < input.size(0))
+    if (dispatchIdx.x >= input.size(0))
         return;
 
     output[dispatchIdx.x] = input[dispatchIdx.x] * input[dispatchIdx.x];
@@ -104,7 +104,7 @@ void square(DiffTensorView input, DiffTensorView output)
 {
     uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
 
-    if (dispatchIdx.x < input.size(0))
+    if (dispatchIdx.x >= input.size(0))
         return;
     
     output[dispatchIdx.x] = input[dispatchIdx.x] * input[dispatchIdx.x];
@@ -116,7 +116,7 @@ Now, `slangpy.loadModule("square.slang")` returns a scope with three callable ha
 You can invoke `square()` normally to get the same effect as the previous example, or invoke `square.fwd()` / `square.bwd()` by binding pairs of tensors to compute the derivatives.
 
 
-``` Python
+```python
 import torch
 import slangpy
 
@@ -162,6 +162,9 @@ You can refer to [this documentation](07-autodiff.md) for a detailed reference o
 This can be a very helpful way to wrap your Slang kernels as pytorch-compatible operations. Here's an example of the `square` kernel as a differentiable pytorch function.
 
 ```python
+import torch
+import slangpy
+
 m = slangpy.loadModule("square.slang")
 
 class MySquareFunc(torch.autograd.Function):
@@ -172,7 +175,7 @@ class MySquareFunc(torch.autograd.Function):
         kernel_with_args = m.square(input=input, output=output)
         kernel_with_args.launchRaw(
             blockSize=(32, 32, 1),
-            gridSize=((input.shape[0] + 31) / 32, (input.shape[1] + 31) / 32, 1))
+            gridSize=((input.shape[0] + 31) // 32, (input.shape[1] + 31) // 32, 1))
 
         ctx.save_for_backward(input, output)
 
@@ -190,7 +193,7 @@ class MySquareFunc(torch.autograd.Function):
         kernel_with_args = m.square.bwd(input=(input, input_grad), output=(output, grad_output))
         kernel_with_args.launchRaw(
             blockSize=(32, 32, 1),
-            gridSize=((input.shape[0] + 31) / 32, (input.shape[1] + 31) / 32, 1))
+            gridSize=((input.shape[0] + 31) // 32, (input.shape[1] + 31) // 32, 1))
         
         return input_grad
 ```
@@ -468,7 +471,7 @@ void square_kernel(TensorView<float> input, TensorView<float> output)
 {
     uint3 globalIdx = cudaBlockIdx() * cudaBlockDim() + cudaThreadIdx();
 
-    if (globalIdx.x > input.size(0))
+    if (globalIdx.x >= input.size(0))
         return;
 
     float result = compute_square(input[globalIdx.x]);
@@ -549,7 +552,7 @@ void square_bwd_kernel(TensorView<float> input, TensorView<float> grad_out, Tens
 {
     uint3 globalIdx = cudaBlockIdx() * cudaBlockDim() + cudaThreadIdx();
 
-    if (globalIdx.x > input.size(0) || globalIdx.y > input.size(1))
+    if (globalIdx.x >= input.size(0) || globalIdx.y >= input.size(1))
         return;
 
     DifferentialPair<float> dpInput = diffPair(input[globalIdx.xy]);
author	Sai Praveen Bangaru <31557731+saipraveenb25@users.noreply.github.com>	2023-09-25 18:29:35 -0400
committer	GitHub <noreply@github.com>	2023-09-25 15:29:35 -0700
commit	56c4a8cba30b463fdcab21d33680f8ba70b452e0 (patch)
tree	8d2fb5dbb951a88c24fd3cc6dc337671773989bf /docs/user-guide
parent	2e761512add35fc719b5e5f5ef3315577777124c (diff)