diff options
| -rw-r--r-- | source/slang/slang-emit-cuda.cpp | 2 | ||||
| -rw-r--r-- | source/slang/slang-ir-entry-point-uniforms.cpp | 2 | ||||
| -rw-r--r-- | source/slang/slang-type-layout.cpp | 54 | ||||
| -rw-r--r-- | tests/cuda/compile-to-cuda.slang | 24 | ||||
| -rw-r--r-- | tools/render-test/bind-location.cpp | 8 | ||||
| -rw-r--r-- | tools/render-test/bind-location.h | 3 | ||||
| -rw-r--r-- | tools/render-test/cpu-compute-util.cpp | 53 | ||||
| -rw-r--r-- | tools/render-test/cpu-compute-util.h | 2 | ||||
| -rw-r--r-- | tools/render-test/cuda/cuda-compute-util.cpp | 395 | ||||
| -rw-r--r-- | tools/render-test/cuda/cuda-compute-util.h | 11 | ||||
| -rw-r--r-- | tools/render-test/render-test-main.cpp | 12 | ||||
| -rw-r--r-- | tools/render-test/shader-input-layout.cpp | 58 | ||||
| -rw-r--r-- | tools/render-test/shader-input-layout.h | 6 |
13 files changed, 520 insertions, 110 deletions
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp index 12807e9e2..39a25aafa 100644 --- a/source/slang/slang-emit-cuda.cpp +++ b/source/slang/slang-emit-cuda.cpp @@ -509,7 +509,7 @@ void CUDASourceEmitter::emitModuleImpl(IRModule* module) // Output all the thread locals for (auto action : actions) { - if (action.level == EmitAction::Level::Definition && _isVariable(action.inst->op)) + if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalVar) { emitGlobalInst(action.inst); } diff --git a/source/slang/slang-ir-entry-point-uniforms.cpp b/source/slang/slang-ir-entry-point-uniforms.cpp index ad535b747..388a7004d 100644 --- a/source/slang/slang-ir-entry-point-uniforms.cpp +++ b/source/slang/slang-ir-entry-point-uniforms.cpp @@ -452,6 +452,8 @@ void moveEntryPointUniformParamsToGlobalScope( case CodeGenTarget::Executable: case CodeGenTarget::SharedLibrary: case CodeGenTarget::HostCallable: + case CodeGenTarget::CUDASource: + case CodeGenTarget::PTX: { context.targetNeedsConstantBuffer = false; break; diff --git a/source/slang/slang-type-layout.cpp b/source/slang/slang-type-layout.cpp index 772686163..644f54a95 100644 --- a/source/slang/slang-type-layout.cpp +++ b/source/slang/slang-type-layout.cpp @@ -730,11 +730,55 @@ struct CPUObjectLayoutRulesImpl : ObjectLayoutRulesImpl }; -// TODO(JS): Most likely wrong! Use CPU layout for CUDA for now +// TODO(JS): Most likely wrong! Assumes largely CPU layout which is probably not right struct CUDAObjectLayoutRulesImpl : CPUObjectLayoutRulesImpl { typedef CPUObjectLayoutRulesImpl Super; + virtual SimpleLayoutInfo GetObjectLayout(ShaderParameterKind kind) override + { + switch (kind) + { + case ShaderParameterKind::ConstantBuffer: + // It's a pointer to the actual uniform data + return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*)); + + case ShaderParameterKind::MutableTexture: + case ShaderParameterKind::TextureUniformBuffer: + case ShaderParameterKind::Texture: + // It's a pointer to a texture interface + return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*)); + + case ShaderParameterKind::StructuredBuffer: + case ShaderParameterKind::MutableStructuredBuffer: + // TODO(JS): We are just storing as a pointer for now + // It's a ptr and a size of the amount of elements + return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*)); + + case ShaderParameterKind::RawBuffer: + case ShaderParameterKind::Buffer: + case ShaderParameterKind::MutableRawBuffer: + case ShaderParameterKind::MutableBuffer: + + // TODO(JS): We are storing as a pointer for now + + // It's a pointer and a size in bytes + return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*)); + + case ShaderParameterKind::SamplerState: + // It's a pointer + return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*)); + + case ShaderParameterKind::TextureSampler: + case ShaderParameterKind::MutableTextureSampler: + case ShaderParameterKind::InputRenderTarget: + // TODO: how to handle these? + default: + SLANG_UNEXPECTED("unhandled shader parameter kind"); + UNREACHABLE_RETURN(SimpleLayoutInfo()); + } + } + }; static CPUObjectLayoutRulesImpl kCPUObjectLayoutRulesImpl; @@ -747,10 +791,10 @@ LayoutRulesImpl kCPULayoutRulesImpl_ = { // CUDA static CUDAObjectLayoutRulesImpl kCUDAObjectLayoutRulesImpl; -static CUDALayoutRulesImpl kCUALayoutRulesImpl; +static CUDALayoutRulesImpl kCUDALayoutRulesImpl; LayoutRulesImpl kCUDALayoutRulesImpl_ = { - &kCPULayoutRulesFamilyImpl, &kCUALayoutRulesImpl, &kCUDAObjectLayoutRulesImpl, + &kCUDALayoutRulesFamilyImpl, &kCUDALayoutRulesImpl, &kCUDAObjectLayoutRulesImpl, }; @@ -1033,12 +1077,12 @@ LayoutRulesImpl* CPULayoutRulesFamilyImpl::getStructuredBufferRules() LayoutRulesImpl* CUDALayoutRulesFamilyImpl::getConstantBufferRules() { - return &kCPULayoutRulesImpl_; + return &kCUDALayoutRulesImpl_; } LayoutRulesImpl* CUDALayoutRulesFamilyImpl::getPushConstantBufferRules() { - return &kCPULayoutRulesImpl_; + return &kCUDALayoutRulesImpl_; } LayoutRulesImpl* CUDALayoutRulesFamilyImpl::getTextureBufferRules() diff --git a/tests/cuda/compile-to-cuda.slang b/tests/cuda/compile-to-cuda.slang index 6166aaf0b..be7d775bd 100644 --- a/tests/cuda/compile-to-cuda.slang +++ b/tests/cuda/compile-to-cuda.slang @@ -1,29 +1,19 @@ //DISABLE_TEST(smoke):SIMPLE: -target ptx -entry computeMain -stage compute +//DISABLE_TEST(compute):COMPARE_COMPUTE:-cpu -compute +//TEST(compute):COMPARE_COMPUTE:-cuda -compute //TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name=outputBuffer RWStructuredBuffer<int> outputBuffer : register(u0); -int quantize(double value) -{ - return int(value * 256); -} - -int quantize(float value) -{ - return int(value * 256); -} - [numthreads(4, 1, 1)] void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) { - float values[] = { -9, 9, -3, 3 }; int tid = int(dispatchThreadID.x); - float value = values[tid]; - - outputBuffer[tid * 4] = quantize(sin(value)); - outputBuffer[tid * 4 + 1] = quantize(cos(value)); - outputBuffer[tid * 4 + 2] = quantize(sin(double(value))); - outputBuffer[tid * 4 + 3] = quantize(cos(double(value))); + outputBuffer[tid * 4] = tid; + outputBuffer[tid * 4 + 1] = tid + 1; + outputBuffer[tid * 4 + 2] = tid + 2; + outputBuffer[tid * 4 + 3] = tid + 3; + } diff --git a/tools/render-test/bind-location.cpp b/tools/render-test/bind-location.cpp index 6548e861c..30b9de0f8 100644 --- a/tools/render-test/bind-location.cpp +++ b/tools/render-test/bind-location.cpp @@ -551,6 +551,14 @@ void BindSet::getBindings(List<BindLocation>& outLocations, List<Value*>& outRes } } +void BindSet::releaseValueTargets() +{ + for (Value* value : m_values) + { + value->m_target.setNull(); + } +} + // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! BindLocation !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! BindLocation::BindLocation(slang::TypeLayoutReflection* typeLayout, const BindPoints& points, BindSet_Value* value) : diff --git a/tools/render-test/bind-location.h b/tools/render-test/bind-location.h index 0ce99731d..e4119a103 100644 --- a/tools/render-test/bind-location.h +++ b/tools/render-test/bind-location.h @@ -336,6 +336,9 @@ public: /// Get all of the bindings void getBindings(Slang::List<BindLocation>& outLocations, Slang::List<Value*>& outValues) const; + /// + void releaseValueTargets(); + /// Ctor BindSet(); diff --git a/tools/render-test/cpu-compute-util.cpp b/tools/render-test/cpu-compute-util.cpp index e94a6d6e1..2bb0baf88 100644 --- a/tools/render-test/cpu-compute-util.cpp +++ b/tools/render-test/cpu-compute-util.cpp @@ -15,42 +15,6 @@ namespace renderer_test { using namespace Slang; -/* static */SlangResult CPUComputeUtil::writeBindings(const ShaderInputLayout& layout, const List<BindSet::Value*>& buffers, const String& fileName) -{ - FILE * f = fopen(fileName.getBuffer(), "wb"); - if (!f) - { - return SLANG_FAIL; - } - - const auto& entries = layout.entries; - - for (int i = 0; i < entries.getCount(); ++i) - { - const auto& entry = entries[i]; - if (entry.isOutput) - { - BindSet::Value* buffer = buffers[i]; - - unsigned int* ptr = (unsigned int*)buffer->m_data; - - const int size = int(entry.bufferData.getCount()); - // Must be the same size or less than allocated buffer - SLANG_ASSERT(size * sizeof(unsigned int) <= buffer->m_sizeInBytes); - - for (int i = 0; i < size; ++i) - { - unsigned int v = ptr[i]; - - fprintf(f, "%X\n", v); - } - } - } - fclose(f); - return SLANG_OK; -} - - template <int COUNT> struct OneTexture2D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture2D { @@ -109,21 +73,8 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount) // Okay lets iterate adding buffers auto outStream = StdWriters::getOut(); SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(compilationAndLayout.layout.entries, compilationAndLayout.sourcePath, outStream, outContext.m_bindRoot)); - - { - const auto& entries = compilationAndLayout.layout.entries; - outContext.m_buffers.setCount(entries.getCount()); - - const auto& values = outContext.m_bindSet.getValues(); - for (BindSet::Value* value : values) - { - if (value->m_userIndex >= 0) - { - outContext.m_buffers[value->m_userIndex] = value; - } - } - } - + ShaderInputLayout::getValueBuffers(compilationAndLayout.layout.entries, outContext.m_bindSet, outContext.m_buffers); + // Okay we need to find all of the bindings and match up to those in the layout const ShaderInputLayout& layout = compilationAndLayout.layout; diff --git a/tools/render-test/cpu-compute-util.h b/tools/render-test/cpu-compute-util.h index 21c40ba43..e6e896b6a 100644 --- a/tools/render-test/cpu-compute-util.h +++ b/tools/render-test/cpu-compute-util.h @@ -58,8 +58,6 @@ struct CPUComputeUtil static SlangResult calcExecuteInfo(ExecuteStyle style, ISlangSharedLibrary* sharedLib, const uint32_t dispatchSize[3], const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& context, ExecuteInfo& out); static SlangResult execute(const ExecuteInfo& info); - - static SlangResult writeBindings(const ShaderInputLayout& layout, const List<BindSet::Value*>& buffers, const Slang::String& fileName); }; diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp index 138f842b4..74810e675 100644 --- a/tools/render-test/cuda/cuda-compute-util.cpp +++ b/tools/render-test/cuda/cuda-compute-util.cpp @@ -6,13 +6,60 @@ #include "../../source/core/slang-std-writers.h" #include "../../source/core/slang-token-reader.h" +#include "../bind-location.h" + #include <cuda.h> #include <cuda_runtime_api.h> namespace renderer_test { using namespace Slang; -#define SLANG_CUDA_RETURN_ON_FAIL(x) { int _res = (int)(x); if (_res != 0) return SLANG_FAIL; } +SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; } +SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; } + +#if 0 +#define SLANG_CUDA_RETURN_ON_FAIL(x) { auto _res = x; if (_isError(_res)) return SLANG_FAIL; } +#else + +#define SLANG_CUDA_RETURN_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); return SLANG_FAIL; } } + +#endif + +#define SLANG_CUDA_ASSERT_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); }; } + +class CUDAResource : public RefObject +{ +public: + typedef RefObject Super; + + /// Dtor + CUDAResource(): m_cudaMemory(nullptr) {} + CUDAResource(void* cudaMemory): m_cudaMemory(cudaMemory) {} + + ~CUDAResource() + { + if (m_cudaMemory) + { + SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory)); + } + } + + /// Helper function to get the cuda memory pointer when given a value + static void* getCUDAData(BindSet::Value* value) + { + if (value) + { + auto resource = dynamic_cast<CUDAResource*>(value->m_target.Ptr()); + return resource ? resource->m_cudaMemory : nullptr; + } + return nullptr; + } + + void* m_cudaMemory; +}; + + + static int _calcSMCountPerMultiProcessor(int major, int minor) { @@ -124,39 +171,70 @@ static SlangResult _initCuda() return SLANG_OK; } - - -/* static */SlangResult _createDevice(CUcontext* outContext) +class ScopeCUDAContext { - SLANG_RETURN_ON_FAIL(_initCuda()); +public: + ScopeCUDAContext() : m_context(nullptr) {} + + SlangResult init(unsigned int flags, CUdevice device) + { + SLANG_RETURN_ON_FAIL(_initCuda()); - int deviceId; - SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceId(&deviceId)); - SLANG_CUDA_RETURN_ON_FAIL(cudaSetDevice(deviceId)); + if (m_context) + { + cuCtxDestroy(m_context); + m_context = nullptr; + } + if (_isError(cuCtxCreate(&m_context, flags, device))) + { + return SLANG_FAIL; + } + return SLANG_OK; + } - CUcontext context; + SlangResult init(unsigned int flags) + { + SLANG_RETURN_ON_FAIL(_initCuda()); - // Create context - SLANG_CUDA_RETURN_ON_FAIL(cuCtxCreate(&context, 0, deviceId)); + int deviceId; + SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceId(&deviceId)); + SLANG_CUDA_RETURN_ON_FAIL(cudaSetDevice(deviceId)); - *outContext = context; - return SLANG_OK; -} + if (m_context) + { + cuCtxDestroy(m_context); + m_context = nullptr; + } + if (_isError(cuCtxCreate(&m_context, flags, deviceId))) + { + return SLANG_FAIL; + } + return SLANG_OK; + } -/* static */bool CUDAComputeUtil::canCreateDevice() -{ - CUcontext context; - if (SLANG_SUCCEEDED(_createDevice(&context))) + ~ScopeCUDAContext() { - cuCtxDestroy(context); - return true; + if (m_context) + { + cuCtxDestroy(m_context); + } } + SLANG_FORCE_INLINE operator CUcontext () const { return m_context; } - return false; + CUcontext m_context; +}; + +/* static */bool CUDAComputeUtil::canCreateDevice() +{ + ScopeCUDAContext context; + return SLANG_SUCCEEDED(context.init(0)); } -static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout) +static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, CUDAComputeUtil::Context& outContext) { + auto& bindSet = outContext.m_bindSet; + auto& bindRoot = outContext.m_bindRoot; + auto request = outputAndLayout.output.request; auto reflection = (slang::ShaderReflection*) spGetReflection(request); @@ -170,17 +248,278 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp // Get the entry point CUfunction kernel; - SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&kernel, module, entryPointName)); + // A stream of 0 means no stream + cudaStream_t stream = 0; + //SLANG_CUDA_RETURN_ON_FAIL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + { + // Okay now we need to set up binding + bindRoot.init(&bindSet, reflection, 0); + + // Will set up any root buffers + bindRoot.addDefaultValues(); + + // Now set up the Values from the test + + auto outStream = StdWriters::getOut(); + SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(outputAndLayout.layout.entries, outputAndLayout.sourcePath, outStream, bindRoot)); + + ShaderInputLayout::getValueBuffers(outputAndLayout.layout.entries, bindSet, outContext.m_buffers); + + // First create all of the resources for the values + + { + const auto& values = bindSet.getValues(); + const auto& entries = outputAndLayout.layout.entries; + + for (BindSet::Value* value : values) + { + auto typeLayout = value->m_type; + + // Get the type kind, if typeLayout is not set we'll assume a 'constant buffer' will do + slang::TypeReflection::Kind kind = typeLayout ? typeLayout->getKind() : slang::TypeReflection::Kind::ConstantBuffer; + + // TODO(JS): + // Here we should be using information about what textures hold to create appropriate + // textures. For now we only support 2d textures that always return 1. + + switch (kind) + { + case slang::TypeReflection::Kind::ConstantBuffer: + case slang::TypeReflection::Kind::ParameterBlock: + { + // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first + + void* cudaMem = nullptr; + SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes)); + value->m_target = new CUDAResource(cudaMem); + break; + } + case slang::TypeReflection::Kind::Resource: + { + auto type = typeLayout->getType(); + auto shape = type->getResourceShape(); + + //auto access = type->getResourceAccess(); + + switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK) + { + case SLANG_TEXTURE_2D: + { + SLANG_ASSERT(value->m_userIndex >= 0); + auto& srcEntry = entries[value->m_userIndex]; + + // TODO(JS): + // We should use the srcEntry to determine what data to store in the texture, + // it's dimensions etc. For now we just support it being 1. + + slang::TypeReflection* typeReflection = typeLayout->getResourceResultType(); + + int count = 1; + if (typeReflection->getKind() == slang::TypeReflection::Kind::Vector) + { + count = int(typeReflection->getElementCount()); + } + + // TODO(JS): Should use the input setup to work how to create this texture + // Store the target specific value + //value->m_target = _newOneTexture2D(count); + break; + } + case SLANG_TEXTURE_1D: + case SLANG_TEXTURE_3D: + case SLANG_TEXTURE_CUBE: + case SLANG_TEXTURE_BUFFER: + { + // Need a CPU impl for these... + // For now we can just leave as target will just be nullptr + break; + } + + case SLANG_BYTE_ADDRESS_BUFFER: + case SLANG_STRUCTURED_BUFFER: + { + // On CPU we just use the memory in the BindSet buffer, so don't need to create anything + + void* cudaMem = nullptr; + SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes)); + value->m_target = new CUDAResource(cudaMem); + + break; + } + } + } + default: break; + } + } + } + + // Now we need to go through all of the bindings and set the appropriate data + + { + List<BindLocation> locations; + List<BindSet::Value*> values; + bindSet.getBindings(locations, values); + + for (Index i = 0; i < locations.getCount(); ++i) + { + const auto& location = locations[i]; + BindSet::Value* value = values[i]; + + // Okay now we need to set up the actual handles that CPU will follow. + auto typeLayout = location.getTypeLayout(); + + const auto kind = typeLayout->getKind(); + switch (kind) + { + case slang::TypeReflection::Kind::Array: + { + auto elementCount = int(typeLayout->getElementCount()); + if (elementCount == 0) + { + void** array = location.getUniform<void*>(); + // If set, we setup the data needed for array on CPU side + if (value && array) + { + // TODO(JS): For now we'll just assume a pointer... + *array = CUDAResource::getCUDAData(value); + } + } + break; + } + case slang::TypeReflection::Kind::ConstantBuffer: + case slang::TypeReflection::Kind::ParameterBlock: + { + // These map down to just pointers + *location.getUniform<void*>() = CUDAResource::getCUDAData(value); + break; + } + case slang::TypeReflection::Kind::Resource: + { + auto type = typeLayout->getType(); + auto shape = type->getResourceShape(); + + //auto access = type->getResourceAccess(); + + switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK) + { + case SLANG_BYTE_ADDRESS_BUFFER: + case SLANG_STRUCTURED_BUFFER: + { + // TODO(JS): These will need bounds ... + // For the moment these are just pointers + *location.getUniform<void*>() = CUDAResource::getCUDAData(value); + break; + } + } + break; + } + default: break; + } + } + } + + // Okay now the memory is all set up, we can copy everything over + { + const auto& values = bindSet.getValues(); + for (BindSet::Value* value : values) + { + void* cudaMem = CUDAResource::getCUDAData(value); + if (value && value->m_data && cudaMem) + { + // Okay copy the data over... + SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(cudaMem, value->m_data, value->m_sizeInBytes, cudaMemcpyHostToDevice)); + } + } + } + + // Now we can execute the kernel + + { + // Get the max threads per block for this function + + int maxTheadsPerBlock; + SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&maxTheadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel)); + + int sharedSizeInBytes; + SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel)); + + // Work out the args + void* uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue()); + void* entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue()); + + // NOTE! These are pointers to the cuda memory pointers + void* args[] = { &entryPointCUDAData , &uniformCUDAData }; + + SlangUInt numThreadsPerAxis[3]; + entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis); + + // Launch + // TODO(JS): We probably want to do something a little more clever here using the maxThreadsPerBlock, + // but for now just launch a single block, and hope it all fits. + + auto cudaLaunchResult = cuLaunchKernel(kernel, + 1, 1, 1, // Blocks + int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block + 0, // Shared memory size + stream, // Stream. 0 is no stream. + args, // Args + nullptr); // extra + + SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult); + + if (stream) + { + SLANG_CUDA_RETURN_ON_FAIL(cudaStreamSynchronize(stream)); + } + else + { + // Do a sync here. Makes sure any issues are detected early and not on some implicit sync + SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceSynchronize()); + } + } + + // Finally we need to copy the data back + + { + const auto& entries = outputAndLayout.layout.entries; + + for (Index i = 0; i < entries.getCount(); ++i) + { + const auto& entry = entries[i]; + BindSet::Value* value = outContext.m_buffers[i]; + + if (entry.isOutput) + { + // Copy back to CPU memory + void* cudaMem = CUDAResource::getCUDAData(value); + if (value && value->m_data && cudaMem) + { + // Okay copy the data back... + SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(value->m_data, cudaMem, value->m_sizeInBytes, cudaMemcpyDeviceToHost)); + } + } + } + } + + if (stream) + { + SLANG_CUDA_RETURN_ON_FAIL(cudaStreamDestroy(stream)); + } + } + + // Release all othe CUDA resource/allocations + bindSet.releaseValueTargets(); return SLANG_OK; } -/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout) +/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, Context& outContext) { - CUcontext context; - SLANG_RETURN_ON_FAIL(_createDevice(&context)); + ScopeCUDAContext cudaContext; + SLANG_RETURN_ON_FAIL(cudaContext.init(0)); const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute); if (index < 0) @@ -193,12 +532,10 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp CUmodule module = 0; SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&module, kernel.codeBegin)); - SLANG_RETURN_ON_FAIL(_compute(context, module, outputAndLayout)); + SLANG_RETURN_ON_FAIL(_compute(cudaContext, module, outputAndLayout, outContext)); SLANG_CUDA_RETURN_ON_FAIL(cuModuleUnload(module)); - cuCtxDestroy(context); - return SLANG_OK; } diff --git a/tools/render-test/cuda/cuda-compute-util.h b/tools/render-test/cuda/cuda-compute-util.h index 9c7d83b1f..58ca21716 100644 --- a/tools/render-test/cuda/cuda-compute-util.h +++ b/tools/render-test/cuda/cuda-compute-util.h @@ -10,7 +10,16 @@ namespace renderer_test { struct CUDAComputeUtil { - static SlangResult execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout); + struct Context + { + /// Holds the binding information + BindSet m_bindSet; + CPULikeBindRoot m_bindRoot; + /// Buffers are held in same order as entries in layout (useful for dumping out bindings) + List<BindSet::Value*> m_buffers; + }; + + static SlangResult execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, Context& outContext); static bool canCreateDevice(); }; diff --git a/tools/render-test/render-test-main.cpp b/tools/render-test/render-test-main.cpp index d91592ccf..050a6d2c8 100644 --- a/tools/render-test/render-test-main.cpp +++ b/tools/render-test/render-test-main.cpp @@ -583,7 +583,7 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe if (gOptions.outputPath) { // Dump everything out that was written - SLANG_RETURN_ON_FAIL(CPUComputeUtil::writeBindings(compilationAndLayout.layout, context.m_buffers, gOptions.outputPath)); + SLANG_RETURN_ON_FAIL(ShaderInputLayout::writeBindings(compilationAndLayout.layout, context.m_buffers, gOptions.outputPath)); // Check all execution styles produce the same result SLANG_RETURN_ON_FAIL(CPUComputeUtil::checkStyleConsistency(sharedLibrary, gOptions.computeDispatchSize, compilationAndLayout)); @@ -600,10 +600,14 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe #if RENDER_TEST_CUDA - // TODO(JS): - // We don't know how to execute it yet.. + CUDAComputeUtil::Context context; + SLANG_RETURN_ON_FAIL(CUDAComputeUtil::execute(compilationAndLayout, context)); - SLANG_RETURN_ON_FAIL(CUDAComputeUtil::execute(compilationAndLayout)); + if (gOptions.outputPath) + { + // Dump everything out that was written + SLANG_RETURN_ON_FAIL(ShaderInputLayout::writeBindings(compilationAndLayout.layout, context.m_buffers, gOptions.outputPath)); + } return SLANG_OK; #else diff --git a/tools/render-test/shader-input-layout.cpp b/tools/render-test/shader-input-layout.cpp index 5ae35b90d..ee4f5fc2c 100644 --- a/tools/render-test/shader-input-layout.cpp +++ b/tools/render-test/shader-input-layout.cpp @@ -1,3 +1,6 @@ +// Stop warnings from Visual Studio +#define _CRT_SECURE_NO_WARNINGS 1 + #include "shader-input-layout.h" #include "core/slang-token-reader.h" @@ -676,6 +679,61 @@ namespace renderer_test return SLANG_OK; } + /* static */void ShaderInputLayout::getValueBuffers(const Slang::List<ShaderInputLayoutEntry>& entries, const BindSet& bindSet, List<BindSet::Value*>& outBuffers) + { + outBuffers.setCount(entries.getCount()); + + for (Index i = 0; i< outBuffers.getCount(); ++i) + { + outBuffers[i] = nullptr; + } + + const auto& values = bindSet.getValues(); + for (BindSet::Value* value : values) + { + if (value->m_userIndex >= 0) + { + outBuffers[value->m_userIndex] = value; + } + } + } + + + /* static */SlangResult ShaderInputLayout::writeBindings(const ShaderInputLayout& layout, const List<BindSet::Value*>& buffers, const String& fileName) + { + FILE * f = fopen(fileName.getBuffer(), "wb"); + if (!f) + { + return SLANG_FAIL; + } + + const auto& entries = layout.entries; + + for (int i = 0; i < entries.getCount(); ++i) + { + const auto& entry = entries[i]; + if (entry.isOutput) + { + BindSet::Value* buffer = buffers[i]; + + unsigned int* ptr = (unsigned int*)buffer->m_data; + + const int size = int(entry.bufferData.getCount()); + // Must be the same size or less than allocated buffer + SLANG_ASSERT(size * sizeof(unsigned int) <= buffer->m_sizeInBytes); + + for (int i = 0; i < size; ++i) + { + unsigned int v = ptr[i]; + + fprintf(f, "%X\n", v); + } + } + } + fclose(f); + return SLANG_OK; + } + void generateTextureData(TextureData& output, const InputTextureDesc& desc) { switch (desc.format) diff --git a/tools/render-test/shader-input-layout.h b/tools/render-test/shader-input-layout.h index 3e33f876e..3399df848 100644 --- a/tools/render-test/shader-input-layout.h +++ b/tools/render-test/shader-input-layout.h @@ -102,6 +102,12 @@ public: /// For buffers, the Resources will be setup with the contents of the entry. /// That if a resource is created that maps to an entry, the m_userData member of Resource will be set to it's index static SlangResult addBindSetValues(const Slang::List<ShaderInputLayoutEntry>& entries, const Slang::String& sourcePath, Slang::WriterHelper outError, BindRoot& bindRoot); + + /// Put into outBuffer the value buffers that were set via addbindSetValues (which will set m_userIndex to be the entries index) + static void getValueBuffers(const Slang::List<ShaderInputLayoutEntry>& entries, const BindSet& bindSet, Slang::List<BindSet::Value*>& outBuffers); + + /// Write bindings from values in memory from buffers + static SlangResult writeBindings(const ShaderInputLayout& layout, const Slang::List<BindSet::Value*>& buffers, const Slang::String& fileName); }; void generateTextureDataRGB8(TextureData& output, const InputTextureDesc& desc); |
