Added diagnostics & built-in type lowering for `[CUDAKernel]` functions (#4042)

* Added diagnostics & built-in type lowering for `[CUDAKernel]` functions This PR adds - Diagnostics for non-void return from a cuda kernel entry point - Diagnostics for using differentiable types in a differentiable cuda kernel entry point - Logic for converting built-in types (float3, float3x3, etc..) to portable struct types and unpacks the parameter back into a built-in type on the CUDA side. This is because built-in types have different implementations in CUDA & CPP targets, which causes signature mis-match when linking. * Fix error codes * Add ability to lower structs and arrays that contain built-in types. + Added tests + Fix issue where the host-side was not marshalling data to lowered types. * Update slang-ir-pytorch-cpp-binding.cpp --------- Co-authored-by: Yong He <yonghe@outlook.com>
author: Sai Praveen Bangaru <31557731+saipraveenb25@users.noreply.github.com> 2024-04-30 16:05:33 -0400
committer: GitHub <noreply@github.com> 2024-04-30 16:05:33 -0400
commit: 52b91231cdadc048f93b224f5035759cf1a96eaa (patch)
tree: 23d3263bc662eb96d6284266282695a9b0f1e2db /tests
parent: 70111daf43c87e182695666c34345e061e114a68 (diff)
6 files changed, 131 insertions, 0 deletions
diff --git a/tests/autodiff/autobind-plain-matrix-input.slang b/tests/autodiff/autobind-plain-matrix-input.slang
new file mode 100644
index 000000000..d09e77fc1
--- /dev/null
+++ b/tests/autodiff/autobind-plain-matrix-input.slang
@@ -0,0 +1,21 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -line-directive-mode none
+//TEST:SIMPLE(filecheck=TORCH): -target torch -line-directive-mode none
+
+[AutoPyBindCUDA]
+[CUDAKernel]
+void plain_copy(float3x3 input, TensorView<float> output)
+{
+    // CUDA: __global__ void __kernel__plain_copy(_MatrixStorage_float3x3_ColMajor_0 input_0, TensorView output_0)
+    // TORCH:    void __kernel__plain_copy(_MatrixStorage_float3x3_ColMajor_0 _0, TensorView _1);
+
+    // Get the 'global' index of this thread.
+    uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
+
+    // If the thread index is beyond the input size, exit early.
+    if (dispatchIdx.x >= 1)
+        return;
+
+    output[0] = input[0][0];
+    output[1] = input[1][1];
+    output[2] = input[2][2];
+}
diff --git a/tests/autodiff/autobind-plain-vector-input.slang b/tests/autodiff/autobind-plain-vector-input.slang
new file mode 100644
index 000000000..216585093
--- /dev/null
+++ b/tests/autodiff/autobind-plain-vector-input.slang
@@ -0,0 +1,21 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -line-directive-mode none
+//TEST:SIMPLE(filecheck=TORCH): -target torch -line-directive-mode none
+
+[AutoPyBindCUDA]
+[CUDAKernel]
+void plain_copy(float3 input, TensorView<float> output)
+{
+    // CUDA: __global__ void __kernel__plain_copy(_VectorStorage_float3_0 input_0, TensorView output_0)
+    // TORCH: void __kernel__plain_copy(_VectorStorage_float3_0 _0, TensorView _1);
+
+    // Get the 'global' index of this thread.
+    uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
+
+    // If the thread index is beyond the input size, exit early.
+    if (dispatchIdx.x >= 1)
+        return;
+
+    output[0] = input.x;
+    output[1] = input.y;
+    output[2] = input.z;
+}
diff --git a/tests/autodiff/autobind-struct-with-array-of-builtins.slang b/tests/autodiff/autobind-struct-with-array-of-builtins.slang
new file mode 100644
index 000000000..69904fadd
--- /dev/null
+++ b/tests/autodiff/autobind-struct-with-array-of-builtins.slang
@@ -0,0 +1,22 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -line-directive-mode none
+//TEST:SIMPLE(filecheck=TORCH): -target torch -line-directive-mode none
+
+
+[AutoPyBindCUDA]
+[CUDAKernel]
+void plain_copy(float3[4] input, TensorView<float> output)
+{
+    // CUDA: __global__ void __kernel__plain_copy(FixedArray<_VectorStorage_float3_0, 4>  input_0, TensorView output_0)
+    // TORCH:    void __kernel__plain_copy(FixedArray<_VectorStorage_float3_0, 4>  _0, TensorView _1);
+
+    // Get the 'global' index of this thread.
+    uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
+
+    // If the thread index is beyond the input size, exit early.
+    if (dispatchIdx.x >= 1)
+        return;
+
+    output[0] = input[0].x;
+    output[1] = input[2].y;
+    output[2] = input[3].z;
+}
diff --git a/tests/autodiff/autobind-struct-with-builtin-types.slang b/tests/autodiff/autobind-struct-with-builtin-types.slang
new file mode 100644
index 000000000..70832cc40
--- /dev/null
+++ b/tests/autodiff/autobind-struct-with-builtin-types.slang
@@ -0,0 +1,32 @@
+//TEST:SIMPLE(filecheck=CUDA): -target cuda -line-directive-mode none
+//TEST:SIMPLE(filecheck=TORCH): -target torch -line-directive-mode none
+
+struct MyStruct
+{
+    float3x3 data;
+    float3 vec;
+};
+
+struct MyStruct2
+{
+    float data;
+};
+
+[AutoPyBindCUDA]
+[CUDAKernel]
+void plain_copy(MyStruct input, MyStruct2 input2, TensorView<float> output)
+{
+    // CUDA: __global__ void __kernel__plain_copy(U_StructStorage_MyStruct_0 input_0, MyStruct2_0 input2_0, TensorView output_0)
+    // TORCH:    void __kernel__plain_copy(U_StructStorage_MyStruct_0 _0, MyStruct2_0 _1, TensorView _2);
+
+    // Get the 'global' index of this thread.
+    uint3 dispatchIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
+
+    // If the thread index is beyond the input size, exit early.
+    if (dispatchIdx.x >= 1)
+        return;
+
+    output[0] = input.data[0][0];
+    output[1] = input.vec[1];
+    output[2] = input.data[2][2];
+}
diff --git a/tests/diagnostics/cuda-kernel-differentiable-params.slang b/tests/diagnostics/cuda-kernel-differentiable-params.slang
new file mode 100644
index 000000000..0e7604b3d
--- /dev/null
+++ b/tests/diagnostics/cuda-kernel-differentiable-params.slang
@@ -0,0 +1,18 @@
+//DIAGNOSTIC_TEST:SIMPLE(filecheck=CHECK):
+
+// Simple check to see if the compiler throws an error if a CUDA kernel is declared with non-void return type.
+
+[CudaKernel]
+[Differentiable]
+void myBadKernel(float x, TensorView<float> t1, TensorView<float> t2)
+{
+    // CHECK:      tests/diagnostics/cuda-kernel-differentiable-params.slang([[@LINE-2]]): error 31214: differentiable kernel entry point cannot have differentiable parameters. Consider using DiffTensorView to pass differentiable data, or marking this parameter with 'no_diff'
+    // CHECK-NEXT:     void myBadKernel(float x, TensorView<float> t1, TensorView<float> t2)
+    // CHECK-NEXT:                            ^
+}
+
+[CudaKernel]
+void myGoodKernel(float x, TensorView<float> t1, TensorView<float> t2)
+{
+
+}
+\ No newline at end of file
diff --git a/tests/diagnostics/cuda-kernel-non-void-return.slang b/tests/diagnostics/cuda-kernel-non-void-return.slang
new file mode 100644
index 000000000..75c8bc6d4
--- /dev/null
+++ b/tests/diagnostics/cuda-kernel-non-void-return.slang
@@ -0,0 +1,17 @@
+//DIAGNOSTIC_TEST:SIMPLE(filecheck=CHECK):
+
+// Simple check to see if the compiler throws an error if a CUDA kernel is declared with non-void return type.
+
+[CudaKernel]
+float myBadKernel(TensorView<float> t1, TensorView<float> t2)
+{
+    // CHECK:      tests/diagnostics/cuda-kernel-non-void-return.slang([[@LINE-2]]): error 31213: return type of a CUDA kernel function cannot be non-void.
+    // CHECK-NEXT:     float myBadKernel(TensorView<float> t1, TensorView<float> t2)
+    // CHECK-NEXT:           ^~~~~~~~~~~
+}
+
+[CudaKernel]
+void myGoodKernel(TensorView<float> t1, TensorView<float> t2)
+{
+
+}
+\ No newline at end of file
author	Sai Praveen Bangaru <31557731+saipraveenb25@users.noreply.github.com>	2024-04-30 16:05:33 -0400
committer	GitHub <noreply@github.com>	2024-04-30 16:05:33 -0400
commit	52b91231cdadc048f93b224f5035759cf1a96eaa (patch)
tree	23d3263bc662eb96d6284266282695a9b0f1e2db /tests
parent	70111daf43c87e182695666c34345e061e114a68 (diff)