Binary for Heterogeneous Example (#1467)

* Binary Heterogeneous Example

  This PR introduces the ability to insert the binary of a non-CPU target by using the -heterogeneous flag. Specifically, this PR updates the emitting logic to produce a variable named `__[name_of_entryPoint]` when the heterogeneous flag is present.

* Prelude path fix

Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
author: Dietrich Geisler <dag368@cornell.edu> 2020-07-31 17:51:52 -0400
committer: GitHub <noreply@github.com> 2020-07-31 14:51:52 -0700
commit: 011a743668e7cd0b7cf97d27e3bed7d519794aeb (patch)
tree: 49cf484df958aa705ff910631e8f732a6f0a57b9 /examples
parent: 4549597709e29b85b5f95503f4f2258c16db12be (diff)
2 files changed, 125 insertions, 110 deletions
diff --git a/examples/heterogeneous-hello-world/main.cpp b/examples/heterogeneous-hello-world/main.cpp
index a590f8c4b..47df20dc5 100644
--- a/examples/heterogeneous-hello-world/main.cpp
+++ b/examples/heterogeneous-hello-world/main.cpp
@@ -63,103 +63,15 @@ struct gfx_DescriptorSet_0;
 struct gfx_PipelineState_0;
 
 bool executeComputation_0();
+extern unsigned char __computeMain[];
+extern size_t __computeMainSize;
 
 gfx::ShaderProgram* loadShaderProgram(gfx::Renderer* renderer)
 {
-    // First, we need to create a "session" for interacting with the Slang
-    // compiler. This scopes all of our application's interactions
-    // with the Slang library. At the moment, creating a session causes
-    // Slang to load and validate its standard library, so this is a
-    // somewhat heavy-weight operation. When possible, an application
-    // should try to re-use the same session across multiple compiles.
+    // We take the begin/end pointers of the shader code directly from the embedded `__computeMain` array
     //
-    SlangSession* slangSession = spCreateSession(NULL);
-
-    // A compile request represents a single invocation of the compiler,
-    // to process some inputs and produce outputs (or errors).
-    //
-    SlangCompileRequest* slangRequest = spCreateCompileRequest(slangSession);
-
-    // We would like to request a single target (output) format: DirectX shader bytecode (DXBC)
-    int targetIndex = spAddCodeGenTarget(slangRequest, SLANG_DXBC);
-
-    // We will specify the desired "profile" for this one target in terms of the
-    // DirectX "shader model" that should be supported.
-    //
-    spSetTargetProfile(slangRequest, targetIndex, spFindProfile(slangSession, "sm_4_0"));
-
-    // A compile request can include one or more "translation units," which more or
-    // less amount to individual source files (think `.c` files, not the `.h` files they
-    // might include).
-    //
-    // For this example, our code will all be in the Slang language. The user may
-    // also specify HLSL input here, but that currently doesn't affect the compiler's
-    // behavior much.
-    //
-    int translationUnitIndex = spAddTranslationUnit(slangRequest, SLANG_SOURCE_LANGUAGE_SLANG, nullptr);
-
-    // We will load source code for our translation unit from the file `shaders.slang`.
-    // There are also variations of this API for adding source code from application-provided buffers.
-    //
-    spAddTranslationUnitSourceFile(slangRequest, translationUnitIndex, "shader.slang");
-
-    // Next we will specify the entry points we'd like to compile.
-    // It is often convenient to put more than one entry point in the same file,
-    // and the Slang API makes it convenient to use a single run of the compiler
-    // to compile all entry points.
-    //
-    // For each entry point, we need to specify the name of a function, the
-    // translation unit in which that function can be found, and the stage
-    // that we need to compile for (e.g., vertex, fragment, geometry, ...).
-    //
-    char const* computeEntryPointName = "computeMain";
-    int computeIndex = spAddEntryPoint(slangRequest, translationUnitIndex, computeEntryPointName,   SLANG_STAGE_COMPUTE);
-
-    // Once all of the input options for the compiler have been specified,
-    // we can invoke `spCompile` to run the compiler and see if any errors
-    // were detected.
-    //
-    const SlangResult compileRes = spCompile(slangRequest);
-
-    // Even if there were no errors that forced compilation to fail, the
-    // compiler may have produced "diagnostic" output such as warnings.
-    // We will go ahead and print that output here.
-    //
-    if(auto diagnostics = spGetDiagnosticOutput(slangRequest))
-    {
-        reportError("%s", diagnostics);
-    }
-
-    // If compilation failed, there is no point in continuing any further.
-    if(SLANG_FAILED(compileRes))
-    {
-        spDestroyCompileRequest(slangRequest);
-        spDestroySession(slangSession);
-        return nullptr;
-    }
-
-    // If compilation was successful, then we will extract the code for
-    // our two entry points as "blobs".
-    //
-    // If you are using a D3D API, then your application may want to
-    // take advantage of the fact taht these blobs are binary compatible
-    // with the `ID3DBlob`, `ID3D10Blob`, etc. interfaces.
-    //
-
-    ISlangBlob* computeShaderBlob = nullptr;
-    spGetEntryPointCodeBlob(slangRequest, computeIndex, 0, &computeShaderBlob);
-
-    // We extract the begin/end pointers to the output code buffers
-    // using operations on the `ISlangBlob` interface.
-    //
-    char const* computeCode = (char const*) computeShaderBlob->getBufferPointer();
-    char const* computeCodeEnd = computeCode + computeShaderBlob->getBufferSize();
-
-    // Once we have extracted the output blobs, it is safe to destroy
-    // the compile request and even the session.
-    //
-    spDestroyCompileRequest(slangRequest);
-    spDestroySession(slangSession);
+    char unsigned const* computeCode = __computeMain;
+    char unsigned const* computeCodeEnd = computeCode + __computeMainSize;
 
     // Now we use the operations of the example graphics API abstraction
     // layer to load shader code into the underlying API.
@@ -179,11 +91,6 @@ gfx::ShaderProgram* loadShaderProgram(gfx::Renderer* renderer)
 
     gShaderProgram = renderer->createProgram(programDesc);
 
-    // Once we've used the output blobs from the Slang compiler to initialize
-    // the API-specific shader program, we can release their memory.
-    //
-    computeShaderBlob->release();
-
     return gShaderProgram;
 }
 
diff --git a/examples/heterogeneous-hello-world/shader.cpp b/examples/heterogeneous-hello-world/shader.cpp
index e8656bed7..d489f7136 100644
--- a/examples/heterogeneous-hello-world/shader.cpp
+++ b/examples/heterogeneous-hello-world/shader.cpp
@@ -1,4 +1,4 @@
-#include "../../prelude/slang-cpp-prelude.h"
+#include "../../slang/prelude/slang-cpp-prelude.h"
 
 
 //namespace { // anonymous 
@@ -7,8 +7,43 @@
 using namespace SLANG_PRELUDE_NAMESPACE;
 #endif
 
+Vector<uint32_t, 3> operator+(Vector<uint32_t, 3> a, Vector<uint32_t, 3> b)
+{
+    Vector<uint32_t, 3> r;
+    r.x = a.x + b.x;
+    r.y = a.y + b.y;
+    r.z = a.z + b.z;
+    return r;
+}
+
+Vector<uint32_t, 3> operator*(Vector<uint32_t, 3> a, Vector<uint32_t, 3> b)
+{
+    Vector<uint32_t, 3> r;
+    r.x = a.x * b.x;
+    r.y = a.y * b.y;
+    r.z = a.z * b.z;
+    return r;
+}
+
+Vector<uint32_t, 3> make_VecU3(uint32_t a, uint32_t b, uint32_t c)
+{
+    return Vector<uint32_t, 3>{ a, b, c};
+}
+
+size_t __computeMainSize = 652;
+unsigned char __computeMain[] = {68, 88, 66, 67, 85, 217, 21, 44, 5, 208, 4, 46, 7, 254, 139, 84, 132, 65, 108, 79, 1, 0, 0, 0, 140, 2, 0, 0, 5, 0, 0, 0, 52, 0, 0, 0, 248, 0, 0, 0, 8, 1, 0, 0, 24, 1, 0, 0, 16, 2, 0, 0, 82, 68, 69, 70, 188, 0, 0, 0, 1, 0, 0, 0, 72, 0, 0, 0, 1, 0, 0, 0, 28, 0, 0, 0, 0, 4, 83, 67, 0, 9, 16, 0, 148, 0, 0, 0, 60, 0, 0, 0, 6, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 105, 111, 66, 117, 102, 102, 101, 114, 95, 48, 0, 171, 60, 0, 0, 0, 1, 0, 0, 0, 96, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 120, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 2, 0, 0, 0, 132, 0, 0, 0, 0, 0, 0, 0, 36, 69, 108, 101, 109, 101, 110, 116, 0, 171, 171, 171, 0, 0, 3, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 77, 105, 99, 114, 111, 115, 111, 102, 116, 32, 40, 82, 41, 32, 72, 76, 83, 76, 32, 83, 104, 97, 100, 101, 114, 32, 67, 111, 109, 112, 105, 108, 101, 114, 32, 49, 48, 46, 49, 0, 73, 83, 71, 78, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 79, 83, 71, 78, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 83, 72, 69, 88, 240, 0, 0, 0, 64, 0, 5, 0, 60, 0, 0, 0, 106, 8, 0, 1, 158, 0, 0, 4, 0, 224, 17, 0, 0, 0, 0, 0, 4, 0, 0, 0, 95, 0, 0, 2, 18, 0, 2, 0, 104, 0, 0, 2, 1, 0, 0, 0, 155, 0, 0, 4, 4, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 167, 0, 0, 8, 18, 0, 16, 0, 0, 0, 0, 0, 10, 0, 2, 0, 1, 64, 0, 0, 0, 0, 0, 0, 6, 224, 17, 0, 0, 0, 0, 0, 49, 0, 0, 7, 34, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 1, 64, 0, 0, 0, 0, 0, 63, 0, 0, 0, 7, 66, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 75, 0, 0, 5, 18, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 55, 0, 0, 9, 18, 0, 16, 0, 0, 0, 0, 0, 26, 0, 16, 0, 0, 0, 0, 0, 42, 0, 16, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 168, 0, 0, 8, 18, 224, 17, 0, 0, 0, 0, 0, 10, 0, 2, 0, 1, 64, 0, 0, 0, 0, 0, 0, 10, 0, 16, 0, 0, 0, 0, 0, 62, 0, 0, 1, 83, 84, 65, 84, 116, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+#line 11 "shader.slang"
+struct GlobalParams_0
+{
+    RWStructuredBuffer<float> ioBuffer_0;
+};
+
+struct KernelContext_0
+{
+    GlobalParams_0* globalParams_0;
+};
 
-#line 21 "../../examples/heterogeneous-hello-world/shader.slang"
 struct gfx_Window_0
 {
 };
@@ -55,6 +90,41 @@ struct gfx_PipelineState_0
 };
 
 
+#line 7
+void _computeMain(void* _S1, void* entryPointParams_0, void* _S2)
+{
+    ComputeThreadVaryingInput* _S3 = ((ComputeThreadVaryingInput*)(_S1));
+    KernelContext_0 kernelContext_0;
+    *(&(&kernelContext_0)->globalParams_0) = ((GlobalParams_0*)(_S2));
+
+#line 9
+    uint32_t tid_0 = (*(&_S3->groupID) * make_VecU3(4U, 1U, 1U) + *(&_S3->groupThreadID)).x;
+
+    float* _S4 = &(*(&(*(&(&kernelContext_0)->globalParams_0))->ioBuffer_0))[tid_0];
+
+#line 11
+    float i_0 = *_S4;
+    bool _S5 = i_0 < 0.50000000000000000000f;
+
+#line 12
+    float _S6 = i_0 + i_0;
+
+#line 12
+    float _S7 = (F32_sqrt((i_0)));
+
+#line 12
+    float o_0 = _S5 ? _S6 : _S7;
+
+    float* _S8 = &(*(&(*(&(&kernelContext_0)->globalParams_0))->ioBuffer_0))[tid_0];
+
+#line 14
+    *_S8 = o_0;
+
+#line 7
+    return;
+}
+
+
 #line 34
 gfx_Window_0* createWindow_0(int32_t _0, int32_t _1);
 
@@ -110,17 +180,17 @@ bool executeComputation_0()
     FixedArray<float, 4> initialArray_0 = { 3.00000000000000000000f, -20.00000000000000000000f, -6.00000000000000000000f, 8.00000000000000000000f };
 
 
-    gfx_Window_0* _S1 = createWindow_0(int(1024), int(768));
-    gfx_Renderer_0* _S2 = createRenderer_0(int(1024), int(768), _S1);
-    gfx_BufferResource_0* _S3 = createStructuredBuffer_0(_S2, initialArray_0);
-    gfx_ShaderProgram_0* _S4 = loadShaderProgram_0(_S2);
-    gfx_DescriptorSetLayout_0* _S5 = buildDescriptorSetLayout_0(_S2);
-    gfx_PipelineLayout_0* _S6 = buildPipeline_0(_S2, _S5);
-    gfx_DescriptorSet_0* _S7 = buildDescriptorSet_0(_S2, _S5, _S3);
-    gfx_PipelineState_0* _S8 = buildPipelineState_0(_S4, _S2, _S6);
+    gfx_Window_0* _S9 = createWindow_0(int(1024), int(768));
+    gfx_Renderer_0* _S10 = createRenderer_0(int(1024), int(768), _S9);
+    gfx_BufferResource_0* _S11 = createStructuredBuffer_0(_S10, initialArray_0);
+    gfx_ShaderProgram_0* _S12 = loadShaderProgram_0(_S10);
+    gfx_DescriptorSetLayout_0* _S13 = buildDescriptorSetLayout_0(_S10);
+    gfx_PipelineLayout_0* _S14 = buildPipeline_0(_S10, _S13);
+    gfx_DescriptorSet_0* _S15 = buildDescriptorSet_0(_S10, _S13, _S11);
+    gfx_PipelineState_0* _S16 = buildPipelineState_0(_S12, _S10, _S14);
     printInitialValues_0(initialArray_0, int(4));
-    dispatchComputation_0(_S2, _S8, _S6, _S7);
-    print_output_0(_S2, _S3, int(4));
+    dispatchComputation_0(_S10, _S16, _S14, _S15);
+    print_output_0(_S10, _S11, int(4));
 
 
     return true;
@@ -128,3 +198,41 @@ bool executeComputation_0()
 
 //} // anonymous
 
+// [numthreads(4, 1, 1)]
+SLANG_PRELUDE_EXPORT
+void computeMain_Thread(ComputeThreadVaryingInput* varyingInput, void* entryPointParams, void* globalParams)
+{
+    _computeMain(varyingInput, entryPointParams, globalParams);
+}
+// [numthreads(4, 1, 1)]
+SLANG_PRELUDE_EXPORT
+void computeMain_Group(ComputeVaryingInput* varyingInput, void* entryPointParams, void* globalParams)
+{
+    ComputeThreadVaryingInput threadInput = {};
+    threadInput.groupID = varyingInput->startGroupID;
+    for (uint32_t x = 0; x < 4; ++x)
+    {
+        threadInput.groupThreadID.x = x;
+        _computeMain(&threadInput, entryPointParams, globalParams);
+    }
+}
+// [numthreads(4, 1, 1)]
+SLANG_PRELUDE_EXPORT
+void computeMain(ComputeVaryingInput* varyingInput, void* entryPointParams, void* globalParams)
+{
+    ComputeVaryingInput vi = *varyingInput;
+    ComputeVaryingInput groupVaryingInput = {};
+    for (uint32_t z = vi.startGroupID.z; z < vi.endGroupID.z; ++z)
+    {
+        groupVaryingInput.startGroupID.z = z;
+        for (uint32_t y = vi.startGroupID.y; y < vi.endGroupID.y; ++y)
+        {
+            groupVaryingInput.startGroupID.y = y;
+            for (uint32_t x = vi.startGroupID.x; x < vi.endGroupID.x; ++x)
+            {
+                groupVaryingInput.startGroupID.x = x;
+                computeMain_Group(&groupVaryingInput, entryPointParams, globalParams);
+            }
+        }
+    }
+}
author	Dietrich Geisler <dag368@cornell.edu>	2020-07-31 17:51:52 -0400
committer	GitHub <noreply@github.com>	2020-07-31 14:51:52 -0700
commit	011a743668e7cd0b7cf97d27e3bed7d519794aeb (patch)
tree	49cf484df958aa705ff910631e8f732a6f0a57b9 /examples
parent	4549597709e29b85b5f95503f4f2258c16db12be (diff)