diff options
Diffstat (limited to 'tools/render-test/cuda/cuda-compute-util.cpp')
| -rw-r--r-- | tools/render-test/cuda/cuda-compute-util.cpp | 395 |
1 files changed, 366 insertions, 29 deletions
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp index 138f842b4..74810e675 100644 --- a/tools/render-test/cuda/cuda-compute-util.cpp +++ b/tools/render-test/cuda/cuda-compute-util.cpp @@ -6,13 +6,60 @@ #include "../../source/core/slang-std-writers.h" #include "../../source/core/slang-token-reader.h" +#include "../bind-location.h" + #include <cuda.h> #include <cuda_runtime_api.h> namespace renderer_test { using namespace Slang; -#define SLANG_CUDA_RETURN_ON_FAIL(x) { int _res = (int)(x); if (_res != 0) return SLANG_FAIL; } +SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; } +SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; } + +#if 0 +#define SLANG_CUDA_RETURN_ON_FAIL(x) { auto _res = x; if (_isError(_res)) return SLANG_FAIL; } +#else + +#define SLANG_CUDA_RETURN_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); return SLANG_FAIL; } } + +#endif + +#define SLANG_CUDA_ASSERT_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); }; } + +class CUDAResource : public RefObject +{ +public: + typedef RefObject Super; + + /// Dtor + CUDAResource(): m_cudaMemory(nullptr) {} + CUDAResource(void* cudaMemory): m_cudaMemory(cudaMemory) {} + + ~CUDAResource() + { + if (m_cudaMemory) + { + SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory)); + } + } + + /// Helper function to get the cuda memory pointer when given a value + static void* getCUDAData(BindSet::Value* value) + { + if (value) + { + auto resource = dynamic_cast<CUDAResource*>(value->m_target.Ptr()); + return resource ? resource->m_cudaMemory : nullptr; + } + return nullptr; + } + + void* m_cudaMemory; +}; + + + static int _calcSMCountPerMultiProcessor(int major, int minor) { @@ -124,39 +171,70 @@ static SlangResult _initCuda() return SLANG_OK; } - - -/* static */SlangResult _createDevice(CUcontext* outContext) +class ScopeCUDAContext { - SLANG_RETURN_ON_FAIL(_initCuda()); +public: + ScopeCUDAContext() : m_context(nullptr) {} + + SlangResult init(unsigned int flags, CUdevice device) + { + SLANG_RETURN_ON_FAIL(_initCuda()); - int deviceId; - SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceId(&deviceId)); - SLANG_CUDA_RETURN_ON_FAIL(cudaSetDevice(deviceId)); + if (m_context) + { + cuCtxDestroy(m_context); + m_context = nullptr; + } + if (_isError(cuCtxCreate(&m_context, flags, device))) + { + return SLANG_FAIL; + } + return SLANG_OK; + } - CUcontext context; + SlangResult init(unsigned int flags) + { + SLANG_RETURN_ON_FAIL(_initCuda()); - // Create context - SLANG_CUDA_RETURN_ON_FAIL(cuCtxCreate(&context, 0, deviceId)); + int deviceId; + SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceId(&deviceId)); + SLANG_CUDA_RETURN_ON_FAIL(cudaSetDevice(deviceId)); - *outContext = context; - return SLANG_OK; -} + if (m_context) + { + cuCtxDestroy(m_context); + m_context = nullptr; + } + if (_isError(cuCtxCreate(&m_context, flags, deviceId))) + { + return SLANG_FAIL; + } + return SLANG_OK; + } -/* static */bool CUDAComputeUtil::canCreateDevice() -{ - CUcontext context; - if (SLANG_SUCCEEDED(_createDevice(&context))) + ~ScopeCUDAContext() { - cuCtxDestroy(context); - return true; + if (m_context) + { + cuCtxDestroy(m_context); + } } + SLANG_FORCE_INLINE operator CUcontext () const { return m_context; } - return false; + CUcontext m_context; +}; + +/* static */bool CUDAComputeUtil::canCreateDevice() +{ + ScopeCUDAContext context; + return SLANG_SUCCEEDED(context.init(0)); } -static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout) +static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, CUDAComputeUtil::Context& outContext) { + auto& bindSet = outContext.m_bindSet; + auto& bindRoot = outContext.m_bindRoot; + auto request = outputAndLayout.output.request; auto reflection = (slang::ShaderReflection*) spGetReflection(request); @@ -170,17 +248,278 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp // Get the entry point CUfunction kernel; - SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&kernel, module, entryPointName)); + // A stream of 0 means no stream + cudaStream_t stream = 0; + //SLANG_CUDA_RETURN_ON_FAIL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + { + // Okay now we need to set up binding + bindRoot.init(&bindSet, reflection, 0); + + // Will set up any root buffers + bindRoot.addDefaultValues(); + + // Now set up the Values from the test + + auto outStream = StdWriters::getOut(); + SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(outputAndLayout.layout.entries, outputAndLayout.sourcePath, outStream, bindRoot)); + + ShaderInputLayout::getValueBuffers(outputAndLayout.layout.entries, bindSet, outContext.m_buffers); + + // First create all of the resources for the values + + { + const auto& values = bindSet.getValues(); + const auto& entries = outputAndLayout.layout.entries; + + for (BindSet::Value* value : values) + { + auto typeLayout = value->m_type; + + // Get the type kind, if typeLayout is not set we'll assume a 'constant buffer' will do + slang::TypeReflection::Kind kind = typeLayout ? typeLayout->getKind() : slang::TypeReflection::Kind::ConstantBuffer; + + // TODO(JS): + // Here we should be using information about what textures hold to create appropriate + // textures. For now we only support 2d textures that always return 1. + + switch (kind) + { + case slang::TypeReflection::Kind::ConstantBuffer: + case slang::TypeReflection::Kind::ParameterBlock: + { + // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first + + void* cudaMem = nullptr; + SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes)); + value->m_target = new CUDAResource(cudaMem); + break; + } + case slang::TypeReflection::Kind::Resource: + { + auto type = typeLayout->getType(); + auto shape = type->getResourceShape(); + + //auto access = type->getResourceAccess(); + + switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK) + { + case SLANG_TEXTURE_2D: + { + SLANG_ASSERT(value->m_userIndex >= 0); + auto& srcEntry = entries[value->m_userIndex]; + + // TODO(JS): + // We should use the srcEntry to determine what data to store in the texture, + // it's dimensions etc. For now we just support it being 1. + + slang::TypeReflection* typeReflection = typeLayout->getResourceResultType(); + + int count = 1; + if (typeReflection->getKind() == slang::TypeReflection::Kind::Vector) + { + count = int(typeReflection->getElementCount()); + } + + // TODO(JS): Should use the input setup to work how to create this texture + // Store the target specific value + //value->m_target = _newOneTexture2D(count); + break; + } + case SLANG_TEXTURE_1D: + case SLANG_TEXTURE_3D: + case SLANG_TEXTURE_CUBE: + case SLANG_TEXTURE_BUFFER: + { + // Need a CPU impl for these... + // For now we can just leave as target will just be nullptr + break; + } + + case SLANG_BYTE_ADDRESS_BUFFER: + case SLANG_STRUCTURED_BUFFER: + { + // On CPU we just use the memory in the BindSet buffer, so don't need to create anything + + void* cudaMem = nullptr; + SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes)); + value->m_target = new CUDAResource(cudaMem); + + break; + } + } + } + default: break; + } + } + } + + // Now we need to go through all of the bindings and set the appropriate data + + { + List<BindLocation> locations; + List<BindSet::Value*> values; + bindSet.getBindings(locations, values); + + for (Index i = 0; i < locations.getCount(); ++i) + { + const auto& location = locations[i]; + BindSet::Value* value = values[i]; + + // Okay now we need to set up the actual handles that CPU will follow. + auto typeLayout = location.getTypeLayout(); + + const auto kind = typeLayout->getKind(); + switch (kind) + { + case slang::TypeReflection::Kind::Array: + { + auto elementCount = int(typeLayout->getElementCount()); + if (elementCount == 0) + { + void** array = location.getUniform<void*>(); + // If set, we setup the data needed for array on CPU side + if (value && array) + { + // TODO(JS): For now we'll just assume a pointer... + *array = CUDAResource::getCUDAData(value); + } + } + break; + } + case slang::TypeReflection::Kind::ConstantBuffer: + case slang::TypeReflection::Kind::ParameterBlock: + { + // These map down to just pointers + *location.getUniform<void*>() = CUDAResource::getCUDAData(value); + break; + } + case slang::TypeReflection::Kind::Resource: + { + auto type = typeLayout->getType(); + auto shape = type->getResourceShape(); + + //auto access = type->getResourceAccess(); + + switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK) + { + case SLANG_BYTE_ADDRESS_BUFFER: + case SLANG_STRUCTURED_BUFFER: + { + // TODO(JS): These will need bounds ... + // For the moment these are just pointers + *location.getUniform<void*>() = CUDAResource::getCUDAData(value); + break; + } + } + break; + } + default: break; + } + } + } + + // Okay now the memory is all set up, we can copy everything over + { + const auto& values = bindSet.getValues(); + for (BindSet::Value* value : values) + { + void* cudaMem = CUDAResource::getCUDAData(value); + if (value && value->m_data && cudaMem) + { + // Okay copy the data over... + SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(cudaMem, value->m_data, value->m_sizeInBytes, cudaMemcpyHostToDevice)); + } + } + } + + // Now we can execute the kernel + + { + // Get the max threads per block for this function + + int maxTheadsPerBlock; + SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&maxTheadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel)); + + int sharedSizeInBytes; + SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel)); + + // Work out the args + void* uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue()); + void* entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue()); + + // NOTE! These are pointers to the cuda memory pointers + void* args[] = { &entryPointCUDAData , &uniformCUDAData }; + + SlangUInt numThreadsPerAxis[3]; + entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis); + + // Launch + // TODO(JS): We probably want to do something a little more clever here using the maxThreadsPerBlock, + // but for now just launch a single block, and hope it all fits. + + auto cudaLaunchResult = cuLaunchKernel(kernel, + 1, 1, 1, // Blocks + int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block + 0, // Shared memory size + stream, // Stream. 0 is no stream. + args, // Args + nullptr); // extra + + SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult); + + if (stream) + { + SLANG_CUDA_RETURN_ON_FAIL(cudaStreamSynchronize(stream)); + } + else + { + // Do a sync here. Makes sure any issues are detected early and not on some implicit sync + SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceSynchronize()); + } + } + + // Finally we need to copy the data back + + { + const auto& entries = outputAndLayout.layout.entries; + + for (Index i = 0; i < entries.getCount(); ++i) + { + const auto& entry = entries[i]; + BindSet::Value* value = outContext.m_buffers[i]; + + if (entry.isOutput) + { + // Copy back to CPU memory + void* cudaMem = CUDAResource::getCUDAData(value); + if (value && value->m_data && cudaMem) + { + // Okay copy the data back... + SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(value->m_data, cudaMem, value->m_sizeInBytes, cudaMemcpyDeviceToHost)); + } + } + } + } + + if (stream) + { + SLANG_CUDA_RETURN_ON_FAIL(cudaStreamDestroy(stream)); + } + } + + // Release all othe CUDA resource/allocations + bindSet.releaseValueTargets(); return SLANG_OK; } -/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout) +/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, Context& outContext) { - CUcontext context; - SLANG_RETURN_ON_FAIL(_createDevice(&context)); + ScopeCUDAContext cudaContext; + SLANG_RETURN_ON_FAIL(cudaContext.init(0)); const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute); if (index < 0) @@ -193,12 +532,10 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp CUmodule module = 0; SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&module, kernel.codeBegin)); - SLANG_RETURN_ON_FAIL(_compute(context, module, outputAndLayout)); + SLANG_RETURN_ON_FAIL(_compute(cudaContext, module, outputAndLayout, outContext)); SLANG_CUDA_RETURN_ON_FAIL(cuModuleUnload(module)); - cuCtxDestroy(context); - return SLANG_OK; } |
