From cae5ddd4a2c9343ec7367c9049c5cc0c8628a9c4 Mon Sep 17 00:00:00 2001
From: jsmall-nvidia <jsmall@nvidia.com>
Date: Wed, 8 Jan 2020 11:09:20 -0500
Subject: Setup of runtime cuda device (#1162)

* CUDA generated first test compiles.

* WIP on enabling CUDA in render-test.

* Detect CUDA_PATH environmental variable to build build cuda support into render-test.
Added WIP cuda-compute-util.cpp/h
Added  CUDA as a renderer type.

* Fix libraries needed for cuda in premake.

* Added -enable-cuda premake option. Defaults to false.

* Creates CUDA device, loads PTX and finds entry point.

* Fix some erroneous cruft from slang-cuda-prelude.h
---
 prelude/slang-cuda-prelude.h                 |   2 +-
 premake5.lua                                 |  31 ++++
 source/core/slang-render-api-util.cpp        |   6 +
 source/core/slang-render-api-util.h          |   2 +
 source/slang/slang-emit-cuda.cpp             |   2 +-
 tools/gfx/render.cpp                         |   2 +
 tools/gfx/render.h                           |   4 +-
 tools/render-test/cpu-compute-util.h         |   2 +-
 tools/render-test/cuda/cuda-compute-util.cpp | 206 +++++++++++++++++++++++++++
 tools/render-test/cuda/cuda-compute-util.h   |  21 +++
 tools/render-test/options.cpp                |   5 +-
 tools/render-test/render-test-main.cpp       |  54 +++++--
 tools/render-test/slang-support.h            |  13 ++
 tools/slang-test/slang-test-main.cpp         |   5 +
 14 files changed, 341 insertions(+), 14 deletions(-)
 create mode 100644 tools/render-test/cuda/cuda-compute-util.cpp
 create mode 100644 tools/render-test/cuda/cuda-compute-util.h
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index ac299eac0..4d4681baf 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -1,5 +1,5 @@
-#line 1 "slang-cuda-prelude.h"
 
+// For now we'll disable any asserts in this prelude
 #define SLANG_PRELUDE_ASSERT(x) 
 
 template <typename T, size_t SIZE>
diff --git a/premake5.lua b/premake5.lua
index 1905bd2f4..4de256321 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -84,10 +84,26 @@ newoption {
    allowed     = { { "true", "True"}, { "false", "False" } }
 }
 
+newoption {
+   trigger     = "enable-cuda",
+   description = "(Optional) If true will enable cuda tests, if CUDA is found via CUDA_PATH",
+   value       = "bool",
+   default     = "false",
+   allowed     = { { "true", "True"}, { "false", "False" } }
+}
+
 buildLocation = _OPTIONS["build-location"]
 executeBinary = (_OPTIONS["execute-binary"] == "true")
 targetDetail = _OPTIONS["target-detail"]
 buildGlslang = (_OPTIONS["build-glslang"] == "true")
+enableCuda = (_OPTIONS["enable-cuda"] == "true")
+
+-- cudaPath is only set if cuda is enabled, and CUDA_PATH enviromental variable is set
+cudaPath = nil
+if enableCuda then
+    -- Get the CUDA path from the environment variable. If set, CUDA will be assumed installed
+    cudaPath = os.getenv("CUDA_PATH")
+end    
 
 -- Is true when the target is really windows (ie not something on top of windows like cygwin)
 local isTargetWindows = (os.target() == "windows") and not (targetDetail == "mingw" or targetDetail == "cygwin")
@@ -529,6 +545,21 @@ toolSharedLibrary "render-test"
         -- d3dcompiler_47.dll is copied from the external/slang-binaries submodule.
         postbuildcommands { '"$(SolutionDir)tools\\copy-hlsl-libs.bat" "$(WindowsSdkDir)Redist/D3D/%{cfg.platform:lower()}/" "%{cfg.targetdir}/" "windows-%{cfg.platform:lower()}"'}    
     end
+   
+    if type(cudaPath) == "string" and isTargetWindows then
+        addSourceDir "tools/render-test/cuda"
+        defines { "RENDER_TEST_CUDA" }
+        includedirs { cudaPath .. "/include" }
+        includedirs { cudaPath .. "/include", cudaPath .. "/common/inc" }
+        
+        filter { "platforms:x86" }
+            libdirs { cudaPath .. "/lib/Win32/" }
+           
+        filter { "platforms:x64" }
+            libdirs { cudaPath .. "/lib/x64/" }   
+           
+        links { "cuda", "cudart" }   
+    end
   
 --
 -- `gfx` is a utility library for doing GPU rendering
diff --git a/source/core/slang-render-api-util.cpp b/source/core/slang-render-api-util.cpp
index a9339c14e..960537a0b 100644
--- a/source/core/slang-render-api-util.cpp
+++ b/source/core/slang-render-api-util.cpp
@@ -18,6 +18,7 @@ namespace Slang {
     { RenderApiType::D3D12,  "dx12,d3d12",      ""},
     { RenderApiType::D3D11,  "dx11,d3d11",      "hlsl,hlsl-rewrite,slang"},
     { RenderApiType::CPU,    "cpu",             ""},
+    { RenderApiType::CUDA,   "cuda",            "cuda,ptx"},
 };
 
 static int _calcAvailableApis()
@@ -268,6 +269,11 @@ static bool _canLoadSharedLibrary(const char* libName)
         case RenderApiType::D3D11:     return _canLoadSharedLibrary("d3d11"); 
         case RenderApiType::D3D12:     return _canLoadSharedLibrary("d3d12");
         case RenderApiType::CPU:       return true;
+        case RenderApiType::CUDA:
+        {
+            // We'll assume it's available, and if not trying to create it will detect it
+            return true;
+        }
         default: break; 
     }
 #elif SLANG_UNIX_FAMILY
diff --git a/source/core/slang-render-api-util.h b/source/core/slang-render-api-util.h
index 48b599653..b028d3996 100644
--- a/source/core/slang-render-api-util.h
+++ b/source/core/slang-render-api-util.h
@@ -16,6 +16,7 @@ enum class RenderApiType
     D3D12,
     D3D11,
     CPU,
+    CUDA,
     CountOf,
 };
 
@@ -29,6 +30,7 @@ struct RenderApiFlag
         D3D12 = 1 << int(RenderApiType::D3D12),
         D3D11 = 1 << int(RenderApiType::D3D11),
         CPU   = 1 << int(RenderApiType::CPU),
+        CUDA  = 1 << int(RenderApiType::CUDA),
         AllOf = (1 << int(RenderApiType::CountOf)) - 1                   ///< All bits set
     };
 };
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index f2c9a1e80..980e94a29 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -270,7 +270,7 @@ void CUDASourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPoin
             m_writer->emit(")]\n");
 #endif
 
-            m_writer->emit("__global__ ");
+            m_writer->emit("extern \"C\" __global__ ");
             break;
         }
         
diff --git a/tools/gfx/render.cpp b/tools/gfx/render.cpp
index 785343927..43a255817 100644
--- a/tools/gfx/render.cpp
+++ b/tools/gfx/render.cpp
@@ -75,6 +75,7 @@ const Resource::DescBase& Resource::getDescBase() const
     BindingStyle::OpenGl,       // OpenGl,
     BindingStyle::Vulkan,       // Vulkan
     BindingStyle::CPU,          // CPU
+    BindingStyle::CUDA,         // CUDA
 };
 
 /* static */void RendererUtil::compileTimeAsserts()
@@ -405,6 +406,7 @@ ProjectionStyle RendererUtil::getProjectionStyle(RendererType type)
         case RendererType::Vulkan:          return UnownedStringSlice::fromLiteral("Vulkan");
         case RendererType::Unknown:         return UnownedStringSlice::fromLiteral("Unknown");
         case RendererType::CPU:             return UnownedStringSlice::fromLiteral("CPU");
+        case RendererType::CUDA:            return UnownedStringSlice::fromLiteral("CUDA");
         default:                            return UnownedStringSlice::fromLiteral("?!?");
     }
 }
diff --git a/tools/gfx/render.h b/tools/gfx/render.h
index 65f3c00c0..a4d042a9a 100644
--- a/tools/gfx/render.h
+++ b/tools/gfx/render.h
@@ -68,13 +68,14 @@ enum class RendererType
     OpenGl,
     Vulkan,
     CPU,
+    CUDA,
     CountOf,
 };
 
 enum class ProjectionStyle
 {
     Unknown,
-    OpenGl,
+    OpenGl, 
     DirectX,
     Vulkan,
     CountOf,
@@ -88,6 +89,7 @@ enum class BindingStyle
     OpenGl,
     Vulkan,
     CPU,
+    CUDA,
     CountOf,
 };
 
diff --git a/tools/render-test/cpu-compute-util.h b/tools/render-test/cpu-compute-util.h
index 9430eb841..179985f6f 100644
--- a/tools/render-test/cpu-compute-util.h
+++ b/tools/render-test/cpu-compute-util.h
@@ -64,4 +64,4 @@ struct CPUComputeUtil
 
 } // renderer_test
 
-#endif //CPU_MEMORY_BINDING_H
+#endif //CPU_COMPUTE_UTIL_H
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
new file mode 100644
index 000000000..138f842b4
--- /dev/null
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -0,0 +1,206 @@
+
+#include "cuda-compute-util.h"
+
+#include "../../slang-com-helper.h"
+
+#include "../../source/core/slang-std-writers.h"
+#include "../../source/core/slang-token-reader.h"
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+namespace renderer_test {
+using namespace Slang;
+
+#define SLANG_CUDA_RETURN_ON_FAIL(x) { int _res = (int)(x); if (_res != 0) return SLANG_FAIL; }
+
+static int _calcSMCountPerMultiProcessor(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine
+    // the # of cores per SM
+    struct SMInfo
+    {
+        int sm;  // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+        int coreCount;
+    };
+
+    static const SMInfo infos[] =
+    {
+        {0x30, 192},
+        {0x32, 192},
+        {0x35, 192},
+        {0x37, 192},
+        {0x50, 128},
+        {0x52, 128},
+        {0x53, 128},
+        {0x60,  64},
+        {0x61, 128},
+        {0x62, 128},
+        {0x70,  64},
+        {0x72,  64},
+        {0x75,  64}
+    };
+
+    const int sm = ((major << 4) + minor);
+    for (Index i = 0; i < SLANG_COUNT_OF(infos); ++i)
+    {
+        if (infos[i].sm == sm)
+        {
+            return infos[i].coreCount;
+        }
+    }
+
+    const auto& last = infos[SLANG_COUNT_OF(infos) - 1];
+
+    // It must be newer presumably
+    SLANG_ASSERT(sm > last.coreCount );
+
+    // Default to the last entry
+    return last.coreCount;
+}
+
+static SlangResult _findMaxFlopsDeviceId(int* outDevice)
+{
+    int smPerMultiproc = 0;
+    int maxPerfDevice = -1;
+    int deviceCount = 0;
+    int devicesProhibited = 0;
+
+    uint64_t maxComputePerf = 0;
+    SLANG_CUDA_RETURN_ON_FAIL(cudaGetDeviceCount(&deviceCount));
+
+    // Find the best CUDA capable GPU device
+    for (int currentDevice = 0; currentDevice < deviceCount; ++currentDevice)
+    {
+        int computeMode = -1, major = 0, minor = 0;
+        SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, currentDevice));
+        SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, currentDevice));
+        SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, currentDevice));
+
+        // If this GPU is not running on Compute Mode prohibited,
+        // then we can add it to the list
+        if (computeMode != cudaComputeModeProhibited)
+        {
+            if (major == 9999 && minor == 9999)
+            {
+                smPerMultiproc = 1;
+            }
+            else
+            {
+                smPerMultiproc = _calcSMCountPerMultiProcessor(major, minor);
+            }
+
+            int multiProcessorCount = 0, clockRate = 0;
+            SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, currentDevice));
+            SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, currentDevice));
+            uint64_t compute_perf = uint64_t(multiProcessorCount) * smPerMultiproc * clockRate;
+
+            if (compute_perf > maxComputePerf)
+            {
+                maxComputePerf = compute_perf;
+                maxPerfDevice = currentDevice;
+            }
+        }
+        else
+        {
+            devicesProhibited++;
+        }
+    }
+
+    if (maxPerfDevice < 0)
+    {
+        return SLANG_FAIL;
+    }
+
+    *outDevice = maxPerfDevice;
+    return SLANG_OK;
+}
+
+static SlangResult _initCuda()
+{
+    static CUresult res = cuInit(0);
+    SLANG_CUDA_RETURN_ON_FAIL(res);
+
+    return SLANG_OK;
+}
+
+
+
+/* static */SlangResult _createDevice(CUcontext* outContext)
+{
+    SLANG_RETURN_ON_FAIL(_initCuda());
+
+    int deviceId;
+    SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceId(&deviceId));
+    SLANG_CUDA_RETURN_ON_FAIL(cudaSetDevice(deviceId));
+
+    CUcontext context;
+
+    // Create context
+    SLANG_CUDA_RETURN_ON_FAIL(cuCtxCreate(&context, 0, deviceId));
+
+    *outContext = context;
+    return SLANG_OK;
+}
+
+/* static */bool CUDAComputeUtil::canCreateDevice()
+{
+    CUcontext context;
+    if (SLANG_SUCCEEDED(_createDevice(&context)))
+    {
+        cuCtxDestroy(context);
+        return true;
+    }
+
+    return false;
+}
+
+static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout)
+{
+    auto request = outputAndLayout.output.request;
+    auto reflection = (slang::ShaderReflection*) spGetReflection(request);
+
+    slang::EntryPointReflection* entryPoint = nullptr;
+    auto entryPointCount = reflection->getEntryPointCount();
+    SLANG_ASSERT(entryPointCount == 1);
+
+    entryPoint = reflection->getEntryPointByIndex(0);
+
+    const char* entryPointName = entryPoint->getName();
+
+    // Get the entry point
+    CUfunction kernel;
+
+    SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&kernel, module, entryPointName));
+
+
+    return SLANG_OK;
+}
+
+/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout)
+{
+    CUcontext context;
+    SLANG_RETURN_ON_FAIL(_createDevice(&context));
+
+    const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute);
+    if (index < 0)
+    {
+        return SLANG_FAIL;
+    }
+
+    const auto& kernel = outputAndLayout.output.kernelDescs[index];
+
+    CUmodule module = 0;
+    SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&module, kernel.codeBegin));
+
+    SLANG_RETURN_ON_FAIL(_compute(context, module, outputAndLayout));
+
+    SLANG_CUDA_RETURN_ON_FAIL(cuModuleUnload(module));
+
+    cuCtxDestroy(context);
+
+    return SLANG_OK;
+}
+
+
+} // renderer_test
diff --git a/tools/render-test/cuda/cuda-compute-util.h b/tools/render-test/cuda/cuda-compute-util.h
new file mode 100644
index 000000000..9c7d83b1f
--- /dev/null
+++ b/tools/render-test/cuda/cuda-compute-util.h
@@ -0,0 +1,21 @@
+#ifndef CUDA_COMPUTE_UTIL_H
+#define CUDA_COMPUTE_UTIL_H
+
+#include "../slang-support.h"
+#include "../options.h"
+
+#include "../../source/core/slang-smart-pointer.h"
+
+namespace renderer_test {
+
+struct CUDAComputeUtil
+{
+    static SlangResult execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout);
+
+    static bool canCreateDevice();
+};
+
+
+} // renderer_test
+
+#endif //CPU_MEMORY_BINDING_H
diff --git a/tools/render-test/options.cpp b/tools/render-test/options.cpp
index a614336e7..4d5d83ce5 100644
--- a/tools/render-test/options.cpp
+++ b/tools/render-test/options.cpp
@@ -25,11 +25,12 @@ static gfx::RendererType _toRenderType(Slang::RenderApiType apiType)
     using namespace Slang;
     switch (apiType)
     {
-    case RenderApiType::D3D11: return gfx::RendererType::DirectX11;
-    case RenderApiType::D3D12: return gfx::RendererType::DirectX12;
+    case RenderApiType::D3D11:  return gfx::RendererType::DirectX11;
+    case RenderApiType::D3D12:  return gfx::RendererType::DirectX12;
     case RenderApiType::OpenGl: return gfx::RendererType::OpenGl;
     case RenderApiType::Vulkan: return gfx::RendererType::Vulkan;
     case RenderApiType::CPU:    return gfx::RendererType::CPU;
+    case RenderApiType::CUDA:   return gfx::RendererType::CUDA;
     default: return gfx::RendererType::Unknown;
     }
 }
diff --git a/tools/render-test/render-test-main.cpp b/tools/render-test/render-test-main.cpp
index 1355402cf..a110b6ca9 100644
--- a/tools/render-test/render-test-main.cpp
+++ b/tools/render-test/render-test-main.cpp
@@ -25,6 +25,10 @@
 
 #include "cpu-compute-util.h"
 
+#if RENDER_TEST_CUDA
+#   include "cuda/cuda-compute-util.h"
+#endif
+
 namespace renderer_test {
 
 using Slang::Result;
@@ -461,6 +465,13 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe
             nativeLanguage = SLANG_SOURCE_LANGUAGE_CPP;
             slangPassThrough = SLANG_PASS_THROUGH_GENERIC_C_CPP;
             break;
+        case RendererType::CUDA:
+            input.target = SLANG_PTX;
+            input.profile = "";
+            nativeLanguage = SLANG_SOURCE_LANGUAGE_CUDA;
+            slangPassThrough = SLANG_PASS_THROUGH_NVRTC;
+            break;
+
 		default:
 			fprintf(stderr, "error: unexpected\n");
 			return SLANG_FAIL;
@@ -502,20 +513,30 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe
         rendererName << "'" << gOptions.adapter << "'";
     }
 
-    // If it's CPU testing we don't need a window or a renderer
-    if (gOptions.rendererType == RendererType::CPU)
+    if (gOptions.onlyStartup)
     {
-        if (gOptions.onlyStartup)
+        switch (gOptions.rendererType)
         {
-            // Need generic C/C++
-            if (SLANG_FAILED(spSessionCheckPassThroughSupport(session, SLANG_PASS_THROUGH_GENERIC_C_CPP)))
+            case RendererType::CUDA:
             {
+#if RENDER_TEST_CUDA
+                return SLANG_SUCCEEDED(spSessionCheckPassThroughSupport(session, SLANG_PASS_THROUGH_NVRTC)) && CUDAComputeUtil::canCreateDevice() ? SLANG_OK : SLANG_FAIL;
+#else
                 return SLANG_FAIL;
+#endif
+            }
+            case RendererType::CPU:
+            {
+                // As long as we have CPU, then this should work
+                return spSessionCheckPassThroughSupport(session, SLANG_PASS_THROUGH_GENERIC_C_CPP);
             }
-            // Should work ... 
-            return SLANG_OK;
+            default: break;
         }
+    }
 
+    // If it's CPU testing we don't need a window or a renderer
+    if (gOptions.rendererType == RendererType::CPU)
+    {
         ShaderCompilerUtil::OutputAndLayout compilationAndLayout;
         SLANG_RETURN_ON_FAIL(ShaderCompilerUtil::compileWithLayout(session, gOptions.sourcePath, gOptions.compileArgs, gOptions.shaderType, input, compilationAndLayout));
 
@@ -564,10 +585,27 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe
             }
         }
 
-        
         return SLANG_OK;
     }
 
+    if (gOptions.rendererType == RendererType::CUDA)
+    {
+        ShaderCompilerUtil::OutputAndLayout compilationAndLayout;
+        SLANG_RETURN_ON_FAIL(ShaderCompilerUtil::compileWithLayout(session, gOptions.sourcePath, gOptions.compileArgs, gOptions.shaderType, input, compilationAndLayout));
+
+#if RENDER_TEST_CUDA
+
+        // TODO(JS):
+        // We don't know how to execute it yet..
+
+        SLANG_RETURN_ON_FAIL(CUDAComputeUtil::execute(compilationAndLayout));
+
+        return SLANG_OK;
+#else
+        return SLANG_FAIL;
+#endif
+    }
+
     Slang::RefPtr<Renderer> renderer;
     {
         RendererUtil::CreateFunc createFunc = RendererUtil::getCreateFunc(gOptions.rendererType);
diff --git a/tools/render-test/slang-support.h b/tools/render-test/slang-support.h
index cd3223c87..97b85ff8f 100644
--- a/tools/render-test/slang-support.h
+++ b/tools/render-test/slang-support.h
@@ -55,6 +55,19 @@ struct ShaderCompilerUtil
                  spDestroyCompileRequest(request);
             }
         }
+
+        Slang::Index findKernelDescIndex(gfx::StageType stage) const
+        {
+            for (Slang::Index i = 0; i < kernelDescs.getCount(); ++i)
+            {
+                if (kernelDescs[i].stage == stage)
+                {
+                    return i;
+                }
+            }
+            return -1;
+        }
+
         List<ShaderProgram::KernelDesc> kernelDescs;
         ShaderProgram::Desc desc;
         SlangCompileRequest* request = nullptr;
diff --git a/tools/slang-test/slang-test-main.cpp b/tools/slang-test/slang-test-main.cpp
index 6401ac852..9ccf6840f 100644
--- a/tools/slang-test/slang-test-main.cpp
+++ b/tools/slang-test/slang-test-main.cpp
@@ -670,6 +670,11 @@ static SlangResult _extractRenderTestRequirements(const CommandLine& cmdLine, Te
             nativeLanguage = SLANG_SOURCE_LANGUAGE_CPP;
             passThru = SLANG_PASS_THROUGH_GENERIC_C_CPP;
             break;
+        case RenderApiType::CUDA:
+            target = SLANG_PTX;
+            nativeLanguage = SLANG_SOURCE_LANGUAGE_CUDA;
+            passThru = SLANG_PASS_THROUGH_NVRTC;
+            break;
     }
 
     SlangSourceLanguage sourceLanguage = nativeLanguage;
-- 
cgit v1.2.3