diff options
| author | Ellie Hermaszewska <ellieh@nvidia.com> | 2024-10-29 14:49:26 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-10-29 14:49:26 +0800 |
| commit | f65d756bff8d4c5cbc15bd0322a2ae8e6b896a21 (patch) | |
| tree | ea1d61342cd29368e19135000ec2948813096205 /prelude | |
| parent | a729c15e9dce9f5116a38afc66329ab2ca4cea54 (diff) | |
format
* format
* Minor test fixes
* enable checking cpp format in ci
Diffstat (limited to 'prelude')
| -rw-r--r-- | prelude/slang-cpp-host-prelude.h | 57 | ||||
| -rw-r--r-- | prelude/slang-cpp-prelude.h | 362 | ||||
| -rw-r--r-- | prelude/slang-cpp-scalar-intrinsics.h | 591 | ||||
| -rw-r--r-- | prelude/slang-cpp-types-core.h | 569 | ||||
| -rw-r--r-- | prelude/slang-cpp-types.h | 1062 | ||||
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 2635 | ||||
| -rw-r--r-- | prelude/slang-hlsl-prelude.h | 4 | ||||
| -rw-r--r-- | prelude/slang-llvm.h | 322 | ||||
| -rw-r--r-- | prelude/slang-torch-prelude.h | 93 |
9 files changed, 3741 insertions, 1954 deletions
diff --git a/prelude/slang-cpp-host-prelude.h b/prelude/slang-cpp-host-prelude.h index 48056169d..8bc0f5cad 100644 --- a/prelude/slang-cpp-host-prelude.h +++ b/prelude/slang-cpp-host-prelude.h @@ -1,8 +1,8 @@ #ifndef SLANG_CPP_HOST_PRELUDE_H #define SLANG_CPP_HOST_PRELUDE_H -#include <cstdio> #include <cmath> +#include <cstdio> #include <cstring> #define SLANG_COM_PTR_ENABLE_REF_OPERATOR 1 @@ -14,42 +14,45 @@ #ifdef SLANG_LLVM #include "slang-llvm.h" #else // SLANG_LLVM -# if SLANG_GCC_FAMILY && __GNUC__ < 6 -# include <cmath> -# define SLANG_PRELUDE_STD std:: -# else -# include <math.h> -# define SLANG_PRELUDE_STD -# endif - -# include <assert.h> -# include <stdlib.h> -# include <string.h> -# include <stdint.h> +#if SLANG_GCC_FAMILY && __GNUC__ < 6 +#include <cmath> +#define SLANG_PRELUDE_STD std:: +#else +#include <math.h> +#define SLANG_PRELUDE_STD +#endif + +#include <assert.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> #endif // SLANG_LLVM #if defined(_MSC_VER) -# define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport) +#define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport) #else -# define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default"))) -//# define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport)) __attribute__((__visibility__("default"))) -#endif - -#ifdef __cplusplus -# define SLANG_PRELUDE_EXTERN_C extern "C" -# define SLANG_PRELUDE_EXTERN_C_START extern "C" { -# define SLANG_PRELUDE_EXTERN_C_END } +#define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default"))) +// # define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport)) +// __attribute__((__visibility__("default"))) +#endif + +#ifdef __cplusplus +#define SLANG_PRELUDE_EXTERN_C extern "C" +#define SLANG_PRELUDE_EXTERN_C_START \ + extern "C" \ + { +#define SLANG_PRELUDE_EXTERN_C_END } #else -# define SLANG_PRELUDE_EXTERN_C -# define SLANG_PRELUDE_EXTERN_C_START -# define SLANG_PRELUDE_EXTERN_C_END -#endif +#define SLANG_PRELUDE_EXTERN_C +#define SLANG_PRELUDE_EXTERN_C_START +#define SLANG_PRELUDE_EXTERN_C_END +#endif #include "slang-cpp-scalar-intrinsics.h" using namespace Slang; template<typename TResult, typename... Args> -using Slang_FuncType = TResult(SLANG_MCALL *)(Args...); +using Slang_FuncType = TResult(SLANG_MCALL*)(Args...); #endif diff --git a/prelude/slang-cpp-prelude.h b/prelude/slang-cpp-prelude.h index 2b848dc3b..4dacac9c5 100644 --- a/prelude/slang-cpp-prelude.h +++ b/prelude/slang-cpp-prelude.h @@ -2,42 +2,45 @@ #define SLANG_CPP_PRELUDE_H // Because the signiture of isnan, isfinite, and is isinf changed in C++, we use the macro -// to use the version in the std namespace. +// to use the version in the std namespace. // https://stackoverflow.com/questions/39130040/cmath-hides-isnan-in-math-h-in-c14-c11 - + #ifdef SLANG_LLVM #include "slang-llvm.h" #else // SLANG_LLVM -# if SLANG_GCC_FAMILY && __GNUC__ < 6 -# include <cmath> -# define SLANG_PRELUDE_STD std:: -# else -# include <math.h> -# define SLANG_PRELUDE_STD -# endif - -# include <assert.h> -# include <stdlib.h> -# include <string.h> -# include <stdint.h> +#if SLANG_GCC_FAMILY && __GNUC__ < 6 +#include <cmath> +#define SLANG_PRELUDE_STD std:: +#else +#include <math.h> +#define SLANG_PRELUDE_STD +#endif + +#include <assert.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> #endif // SLANG_LLVM #if defined(_MSC_VER) -# define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport) +#define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport) #else -# define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default"))) -//# define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport)) __attribute__((__visibility__("default"))) -#endif - -#ifdef __cplusplus -# define SLANG_PRELUDE_EXTERN_C extern "C" -# define SLANG_PRELUDE_EXTERN_C_START extern "C" { -# define SLANG_PRELUDE_EXTERN_C_END } +#define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default"))) +// # define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport)) +// __attribute__((__visibility__("default"))) +#endif + +#ifdef __cplusplus +#define SLANG_PRELUDE_EXTERN_C extern "C" +#define SLANG_PRELUDE_EXTERN_C_START \ + extern "C" \ + { +#define SLANG_PRELUDE_EXTERN_C_END } #else -# define SLANG_PRELUDE_EXTERN_C -# define SLANG_PRELUDE_EXTERN_C_START -# define SLANG_PRELUDE_EXTERN_C_END -#endif +#define SLANG_PRELUDE_EXTERN_C +#define SLANG_PRELUDE_EXTERN_C_START +#define SLANG_PRELUDE_EXTERN_C_END +#endif #define SLANG_PRELUDE_EXPORT SLANG_PRELUDE_EXTERN_C SLANG_PRELUDE_SHARED_LIB_EXPORT #define SLANG_PRELUDE_EXPORT_START SLANG_PRELUDE_EXTERN_C_START SLANG_PRELUDE_SHARED_LIB_EXPORT @@ -45,65 +48,65 @@ #ifndef INFINITY // Must overflow for double -# define INFINITY float(1e+300 * 1e+300) +#define INFINITY float(1e+300 * 1e+300) #endif #ifndef SLANG_INFINITY -# define SLANG_INFINITY INFINITY +#define SLANG_INFINITY INFINITY #endif // Detect the compiler type #ifndef SLANG_COMPILER -# define SLANG_COMPILER +#define SLANG_COMPILER /* Compiler defines, see http://sourceforge.net/p/predef/wiki/Compilers/ NOTE that SLANG_VC holds the compiler version - not just 1 or 0 */ -# if defined(_MSC_VER) -# if _MSC_VER >= 1900 -# define SLANG_VC 14 -# elif _MSC_VER >= 1800 -# define SLANG_VC 12 -# elif _MSC_VER >= 1700 -# define SLANG_VC 11 -# elif _MSC_VER >= 1600 -# define SLANG_VC 10 -# elif _MSC_VER >= 1500 -# define SLANG_VC 9 -# else -# error "unknown version of Visual C++ compiler" -# endif -# elif defined(__clang__) -# define SLANG_CLANG 1 -# elif defined(__SNC__) -# define SLANG_SNC 1 -# elif defined(__ghs__) -# define SLANG_GHS 1 -# elif defined(__GNUC__) /* note: __clang__, __SNC__, or __ghs__ imply __GNUC__ */ -# define SLANG_GCC 1 -# else -# error "unknown compiler" -# endif +#if defined(_MSC_VER) +#if _MSC_VER >= 1900 +#define SLANG_VC 14 +#elif _MSC_VER >= 1800 +#define SLANG_VC 12 +#elif _MSC_VER >= 1700 +#define SLANG_VC 11 +#elif _MSC_VER >= 1600 +#define SLANG_VC 10 +#elif _MSC_VER >= 1500 +#define SLANG_VC 9 +#else +#error "unknown version of Visual C++ compiler" +#endif +#elif defined(__clang__) +#define SLANG_CLANG 1 +#elif defined(__SNC__) +#define SLANG_SNC 1 +#elif defined(__ghs__) +#define SLANG_GHS 1 +#elif defined(__GNUC__) /* note: __clang__, __SNC__, or __ghs__ imply __GNUC__ */ +#define SLANG_GCC 1 +#else +#error "unknown compiler" +#endif /* Any compilers not detected by the above logic are now now explicitly zeroed out. */ -# ifndef SLANG_VC -# define SLANG_VC 0 -# endif -# ifndef SLANG_CLANG -# define SLANG_CLANG 0 -# endif -# ifndef SLANG_SNC -# define SLANG_SNC 0 -# endif -# ifndef SLANG_GHS -# define SLANG_GHS 0 -# endif -# ifndef SLANG_GCC -# define SLANG_GCC 0 -# endif +#ifndef SLANG_VC +#define SLANG_VC 0 +#endif +#ifndef SLANG_CLANG +#define SLANG_CLANG 0 +#endif +#ifndef SLANG_SNC +#define SLANG_SNC 0 +#endif +#ifndef SLANG_GHS +#define SLANG_GHS 0 +#endif +#ifndef SLANG_GCC +#define SLANG_GCC 0 +#endif #endif /* SLANG_COMPILER */ /* @@ -116,89 +119,90 @@ used later in the file. Most applications should not need to touch this section. */ #ifndef SLANG_PLATFORM -# define SLANG_PLATFORM +#define SLANG_PLATFORM /** Operating system defines, see http://sourceforge.net/p/predef/wiki/OperatingSystems/ */ -# if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_PARTITION_APP -# define SLANG_WINRT 1 /* Windows Runtime, either on Windows RT or Windows 8 */ -# elif defined(XBOXONE) -# define SLANG_XBOXONE 1 -# elif defined(_WIN64) /* note: XBOXONE implies _WIN64 */ -# define SLANG_WIN64 1 -# elif defined(_M_PPC) -# define SLANG_X360 1 -# elif defined(_WIN32) /* note: _M_PPC implies _WIN32 */ -# define SLANG_WIN32 1 -# elif defined(__ANDROID__) -# define SLANG_ANDROID 1 -# elif defined(__linux__) || defined(__CYGWIN__) /* note: __ANDROID__ implies __linux__ */ -# define SLANG_LINUX 1 -# elif defined(__APPLE__) && !defined(SLANG_LLVM) -# include "TargetConditionals.h" -# if TARGET_OS_MAC -# define SLANG_OSX 1 -# else -# define SLANG_IOS 1 -# endif -# elif defined(__APPLE__) -// On `slang-llvm` we can't inclue "TargetConditionals.h" in general, so for now assume its OSX. -# define SLANG_OSX 1 -# elif defined(__CELLOS_LV2__) -# define SLANG_PS3 1 -# elif defined(__ORBIS__) -# define SLANG_PS4 1 -# elif defined(__SNC__) && defined(__arm__) -# define SLANG_PSP2 1 -# elif defined(__ghs__) -# define SLANG_WIIU 1 -# else -# error "unknown target platform" -# endif +#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_PARTITION_APP +#define SLANG_WINRT 1 /* Windows Runtime, either on Windows RT or Windows 8 */ +#elif defined(XBOXONE) +#define SLANG_XBOXONE 1 +#elif defined(_WIN64) /* note: XBOXONE implies _WIN64 */ +#define SLANG_WIN64 1 +#elif defined(_M_PPC) +#define SLANG_X360 1 +#elif defined(_WIN32) /* note: _M_PPC implies _WIN32 */ +#define SLANG_WIN32 1 +#elif defined(__ANDROID__) +#define SLANG_ANDROID 1 +#elif defined(__linux__) || defined(__CYGWIN__) /* note: __ANDROID__ implies __linux__ */ +#define SLANG_LINUX 1 +#elif defined(__APPLE__) && !defined(SLANG_LLVM) +#include "TargetConditionals.h" +#if TARGET_OS_MAC +#define SLANG_OSX 1 +#else +#define SLANG_IOS 1 +#endif +#elif defined(__APPLE__) +// On `slang-llvm` we can't inclue "TargetConditionals.h" in general, so for now assume its +// OSX. +#define SLANG_OSX 1 +#elif defined(__CELLOS_LV2__) +#define SLANG_PS3 1 +#elif defined(__ORBIS__) +#define SLANG_PS4 1 +#elif defined(__SNC__) && defined(__arm__) +#define SLANG_PSP2 1 +#elif defined(__ghs__) +#define SLANG_WIIU 1 +#else +#error "unknown target platform" +#endif /* Any platforms not detected by the above logic are now now explicitly zeroed out. */ -# ifndef SLANG_WINRT -# define SLANG_WINRT 0 -# endif -# ifndef SLANG_XBOXONE -# define SLANG_XBOXONE 0 -# endif -# ifndef SLANG_WIN64 -# define SLANG_WIN64 0 -# endif -# ifndef SLANG_X360 -# define SLANG_X360 0 -# endif -# ifndef SLANG_WIN32 -# define SLANG_WIN32 0 -# endif -# ifndef SLANG_ANDROID -# define SLANG_ANDROID 0 -# endif -# ifndef SLANG_LINUX -# define SLANG_LINUX 0 -# endif -# ifndef SLANG_IOS -# define SLANG_IOS 0 -# endif -# ifndef SLANG_OSX -# define SLANG_OSX 0 -# endif -# ifndef SLANG_PS3 -# define SLANG_PS3 0 -# endif -# ifndef SLANG_PS4 -# define SLANG_PS4 0 -# endif -# ifndef SLANG_PSP2 -# define SLANG_PSP2 0 -# endif -# ifndef SLANG_WIIU -# define SLANG_WIIU 0 -# endif +#ifndef SLANG_WINRT +#define SLANG_WINRT 0 +#endif +#ifndef SLANG_XBOXONE +#define SLANG_XBOXONE 0 +#endif +#ifndef SLANG_WIN64 +#define SLANG_WIN64 0 +#endif +#ifndef SLANG_X360 +#define SLANG_X360 0 +#endif +#ifndef SLANG_WIN32 +#define SLANG_WIN32 0 +#endif +#ifndef SLANG_ANDROID +#define SLANG_ANDROID 0 +#endif +#ifndef SLANG_LINUX +#define SLANG_LINUX 0 +#endif +#ifndef SLANG_IOS +#define SLANG_IOS 0 +#endif +#ifndef SLANG_OSX +#define SLANG_OSX 0 +#endif +#ifndef SLANG_PS3 +#define SLANG_PS3 0 +#endif +#ifndef SLANG_PS4 +#define SLANG_PS4 0 +#endif +#ifndef SLANG_PSP2 +#define SLANG_PSP2 0 +#endif +#ifndef SLANG_WIIU +#define SLANG_WIIU 0 +#endif #endif /* SLANG_PLATFORM */ /* Shorthands for "families" of compilers/platforms */ @@ -206,37 +210,38 @@ Any platforms not detected by the above logic are now now explicitly zeroed out. #define SLANG_WINDOWS_FAMILY (SLANG_WINRT || SLANG_WIN32 || SLANG_WIN64) #define SLANG_MICROSOFT_FAMILY (SLANG_XBOXONE || SLANG_X360 || SLANG_WINDOWS_FAMILY) #define SLANG_LINUX_FAMILY (SLANG_LINUX || SLANG_ANDROID) -#define SLANG_APPLE_FAMILY (SLANG_IOS || SLANG_OSX) /* equivalent to #if __APPLE__ */ -#define SLANG_UNIX_FAMILY (SLANG_LINUX_FAMILY || SLANG_APPLE_FAMILY) /* shortcut for unix/posix platforms */ +#define SLANG_APPLE_FAMILY (SLANG_IOS || SLANG_OSX) /* equivalent to #if __APPLE__ */ +#define SLANG_UNIX_FAMILY \ + (SLANG_LINUX_FAMILY || SLANG_APPLE_FAMILY) /* shortcut for unix/posix platforms */ // GCC Specific #if SLANG_GCC_FAMILY -# define SLANG_ALIGN_OF(T) __alignof__(T) +#define SLANG_ALIGN_OF(T) __alignof__(T) -# define SLANG_BREAKPOINT(id) __builtin_trap() +#define SLANG_BREAKPOINT(id) __builtin_trap() -// Use this macro instead of offsetof, because gcc produces warning if offsetof is used on a +// Use this macro instead of offsetof, because gcc produces warning if offsetof is used on a // non POD type, even though it produces the correct result -# define SLANG_OFFSET_OF(T, ELEMENT) (size_t(&((T*)1)->ELEMENT) - 1) +#define SLANG_OFFSET_OF(T, ELEMENT) (size_t(&((T*)1)->ELEMENT) - 1) #endif // SLANG_GCC_FAMILY // Microsoft VC specific #if SLANG_VC -# define SLANG_ALIGN_OF(T) __alignof(T) +#define SLANG_ALIGN_OF(T) __alignof(T) -# define SLANG_BREAKPOINT(id) __debugbreak(); +#define SLANG_BREAKPOINT(id) __debugbreak(); #endif // SLANG_VC // Default impls #ifndef SLANG_OFFSET_OF -# define SLANG_OFFSET_OF(X, Y) offsetof(X, Y) +#define SLANG_OFFSET_OF(X, Y) offsetof(X, Y) #endif #ifndef SLANG_BREAKPOINT // Make it crash with a write to 0! -# define SLANG_BREAKPOINT(id) (*((int*)0) = int(id)); +#define SLANG_BREAKPOINT(id) (*((int*)0) = int(id)); #endif // If slang.h has been included we don't need any of these definitions @@ -244,33 +249,33 @@ Any platforms not detected by the above logic are now now explicitly zeroed out. /* Macro for declaring if a method is no throw. Should be set before the return parameter. */ #ifndef SLANG_NO_THROW -# if SLANG_WINDOWS_FAMILY && !defined(SLANG_DISABLE_EXCEPTIONS) -# define SLANG_NO_THROW __declspec(nothrow) -# endif +#if SLANG_WINDOWS_FAMILY && !defined(SLANG_DISABLE_EXCEPTIONS) +#define SLANG_NO_THROW __declspec(nothrow) +#endif #endif #ifndef SLANG_NO_THROW -# define SLANG_NO_THROW +#define SLANG_NO_THROW #endif /* The `SLANG_STDCALL` and `SLANG_MCALL` defines are used to set the calling convention for interface methods. */ #ifndef SLANG_STDCALL -# if SLANG_MICROSOFT_FAMILY -# define SLANG_STDCALL __stdcall -# else -# define SLANG_STDCALL -# endif +#if SLANG_MICROSOFT_FAMILY +#define SLANG_STDCALL __stdcall +#else +#define SLANG_STDCALL +#endif #endif #ifndef SLANG_MCALL -# define SLANG_MCALL SLANG_STDCALL +#define SLANG_MCALL SLANG_STDCALL #endif #ifndef SLANG_FORCE_INLINE -# define SLANG_FORCE_INLINE inline +#define SLANG_FORCE_INLINE inline #endif -// TODO(JS): Should these be in slang-cpp-types.h? +// TODO(JS): Should these be in slang-cpp-types.h? // They are more likely to clash with slang.h struct SlangUUID @@ -278,24 +283,25 @@ struct SlangUUID uint32_t data1; uint16_t data2; uint16_t data3; - uint8_t data4[8]; + uint8_t data4[8]; }; typedef int32_t SlangResult; struct ISlangUnknown { - virtual SLANG_NO_THROW SlangResult SLANG_MCALL queryInterface(SlangUUID const& uuid, void** outObject) = 0; + virtual SLANG_NO_THROW SlangResult SLANG_MCALL + queryInterface(SlangUUID const& uuid, void** outObject) = 0; virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() = 0; virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() = 0; }; -#define SLANG_COM_INTERFACE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ - public: \ - SLANG_FORCE_INLINE static const SlangUUID& getTypeGuid() \ - { \ - static const SlangUUID guid = { a, b, c, d0, d1, d2, d3, d4, d5, d6, d7 }; \ - return guid; \ +#define SLANG_COM_INTERFACE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +public: \ + SLANG_FORCE_INLINE static const SlangUUID& getTypeGuid() \ + { \ + static const SlangUUID guid = {a, b, c, d0, d1, d2, d3, d4, d5, d6, d7}; \ + return guid; \ } #endif // SLANG_H @@ -304,13 +310,13 @@ struct ISlangUnknown #include "slang-cpp-scalar-intrinsics.h" #include "slang-cpp-types.h" -// TODO(JS): Hack! Output C++ code from slang can copy uninitialized variables. +// TODO(JS): Hack! Output C++ code from slang can copy uninitialized variables. #if defined(_MSC_VER) -# pragma warning(disable : 4700) +#pragma warning(disable : 4700) #endif #ifndef SLANG_UNROLL -# define SLANG_UNROLL +#define SLANG_UNROLL #endif #endif diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h index 1ade8614f..6aa72df4f 100644 --- a/prelude/slang-cpp-scalar-intrinsics.h +++ b/prelude/slang-cpp-scalar-intrinsics.h @@ -2,24 +2,26 @@ #define SLANG_PRELUDE_SCALAR_INTRINSICS_H #if !defined(SLANG_LLVM) && SLANG_PROCESSOR_X86_64 && SLANG_VC -// If we have visual studio and 64 bit processor, we can assume we have popcnt, and can include x86 intrinsics -# include <intrin.h> +// If we have visual studio and 64 bit processor, we can assume we have popcnt, and can include +// x86 intrinsics +#include <intrin.h> #endif #ifndef SLANG_FORCE_INLINE -# define SLANG_FORCE_INLINE inline +#define SLANG_FORCE_INLINE inline #endif #ifdef SLANG_PRELUDE_NAMESPACE -namespace SLANG_PRELUDE_NAMESPACE { +namespace SLANG_PRELUDE_NAMESPACE +{ #endif #ifndef SLANG_PRELUDE_PI -# define SLANG_PRELUDE_PI 3.14159265358979323846 +#define SLANG_PRELUDE_PI 3.14159265358979323846 #endif -union Union32 +union Union32 { uint32_t u; int32_t i; @@ -34,10 +36,30 @@ union Union64 }; // 32 bit cast conversions -SLANG_FORCE_INLINE int32_t _bitCastFloatToInt(float f) { Union32 u; u.f = f; return u.i; } -SLANG_FORCE_INLINE float _bitCastIntToFloat(int32_t i) { Union32 u; u.i = i; return u.f; } -SLANG_FORCE_INLINE uint32_t _bitCastFloatToUInt(float f) { Union32 u; u.f = f; return u.u; } -SLANG_FORCE_INLINE float _bitCastUIntToFloat(uint32_t ui) { Union32 u; u.u = ui; return u.f; } +SLANG_FORCE_INLINE int32_t _bitCastFloatToInt(float f) +{ + Union32 u; + u.f = f; + return u.i; +} +SLANG_FORCE_INLINE float _bitCastIntToFloat(int32_t i) +{ + Union32 u; + u.i = i; + return u.f; +} +SLANG_FORCE_INLINE uint32_t _bitCastFloatToUInt(float f) +{ + Union32 u; + u.f = f; + return u.u; +} +SLANG_FORCE_INLINE float _bitCastUIntToFloat(uint32_t ui) +{ + Union32 u; + u.u = ui; + return u.f; +} // ----------------------------- F16 ----------------------------------------- @@ -61,27 +83,27 @@ SLANG_FORCE_INLINE uint32_t f32tof16(const float value) if (e == 0xff) { // Could be a NAN or INF. Is INF if *input* mantissa is 0. - + // Remove last bit for rounding to make output mantissa. m >>= 1; - + // We *assume* float16/float32 signaling bit and remaining bits // semantics are the same. (The signalling bit convention is target specific!). // Non signal bit's usage within mantissa for a NAN are also target specific. - - // If the m is 0, it could be because the result is INF, but it could also be because all the - // bits that made NAN were dropped as we have less mantissa bits in f16. - + + // If the m is 0, it could be because the result is INF, but it could also be because all + // the bits that made NAN were dropped as we have less mantissa bits in f16. + // To fix for this we make non zero if m is 0 and the input mantissa was not. // This will (typically) produce a signalling NAN. m += uint32_t(m == 0 && (inBits & 0x007fffffu)); - + // Combine for output return (bits | 0x7c00u | m); } if (e > 142) { - // INF. + // INF. return bits | 0x7c00u; } if (e < 113) @@ -105,7 +127,7 @@ SLANG_FORCE_INLINE float f16tof32(const uint32_t value) if (exponent == 0) { - // If mantissa is 0 we are done, as output is 0. + // If mantissa is 0 we are done, as output is 0. // If it's not zero we must have a denormal. if (mantissa) { @@ -113,16 +135,17 @@ SLANG_FORCE_INLINE float f16tof32(const uint32_t value) return _bitCastIntToFloat(sign | ((value & 0x7fff) << 13)) * g_f16tof32Magic; } } - else + else { - // If the exponent is NAN or INF exponent is 0x1f on input. + // If the exponent is NAN or INF exponent is 0x1f on input. // If that's the case, we just need to set the exponent to 0xff on output - // and the mantissa can just stay the same. If its 0 it's INF, else it is NAN and we just copy the bits + // and the mantissa can just stay the same. If its 0 it's INF, else it is NAN and we just + // copy the bits // // Else we need to correct the exponent in the normalized case. exponent = (exponent == 0x1F) ? 0xff : (exponent + (-15 + 127)); } - + return _bitCastUIntToFloat(sign | (exponent << 23) | (mantissa << 13)); } @@ -135,7 +158,7 @@ SLANG_FORCE_INLINE float F32_calcSafeRadians(float radians); SLANG_PRELUDE_EXTERN_C_START -// Unary +// Unary float F32_ceil(float f); float F32_floor(float f); float F32_round(float f); @@ -158,12 +181,18 @@ float F32_trunc(float f); float F32_sqrt(float f); bool F32_isnan(float f); -bool F32_isfinite(float f); +bool F32_isfinite(float f); bool F32_isinf(float f); // Binary -SLANG_FORCE_INLINE float F32_min(float a, float b) { return a < b ? a : b; } -SLANG_FORCE_INLINE float F32_max(float a, float b) { return a > b ? a : b; } +SLANG_FORCE_INLINE float F32_min(float a, float b) +{ + return a < b ? a : b; +} +SLANG_FORCE_INLINE float F32_max(float a, float b) +{ + return a > b ? a : b; +} float F32_pow(float a, float b); float F32_fmod(float a, float b); float F32_remainder(float a, float b); @@ -174,47 +203,140 @@ float F32_frexp(float x, int* e); float F32_modf(float x, float* ip); // Ternary -SLANG_FORCE_INLINE float F32_fma(float a, float b, float c) { return a * b + c; } +SLANG_FORCE_INLINE float F32_fma(float a, float b, float c) +{ + return a * b + c; +} SLANG_PRELUDE_EXTERN_C_END #else -// Unary -SLANG_FORCE_INLINE float F32_ceil(float f) { return ::ceilf(f); } -SLANG_FORCE_INLINE float F32_floor(float f) { return ::floorf(f); } -SLANG_FORCE_INLINE float F32_round(float f) { return ::roundf(f); } -SLANG_FORCE_INLINE float F32_sin(float f) { return ::sinf(f); } -SLANG_FORCE_INLINE float F32_cos(float f) { return ::cosf(f); } -SLANG_FORCE_INLINE float F32_tan(float f) { return ::tanf(f); } -SLANG_FORCE_INLINE float F32_asin(float f) { return ::asinf(f); } -SLANG_FORCE_INLINE float F32_acos(float f) { return ::acosf(f); } -SLANG_FORCE_INLINE float F32_atan(float f) { return ::atanf(f); } -SLANG_FORCE_INLINE float F32_sinh(float f) { return ::sinhf(f); } -SLANG_FORCE_INLINE float F32_cosh(float f) { return ::coshf(f); } -SLANG_FORCE_INLINE float F32_tanh(float f) { return ::tanhf(f); } -SLANG_FORCE_INLINE float F32_log2(float f) { return ::log2f(f); } -SLANG_FORCE_INLINE float F32_log(float f) { return ::logf(f); } -SLANG_FORCE_INLINE float F32_log10(float f) { return ::log10f(f); } -SLANG_FORCE_INLINE float F32_exp2(float f) { return ::exp2f(f); } -SLANG_FORCE_INLINE float F32_exp(float f) { return ::expf(f); } -SLANG_FORCE_INLINE float F32_abs(float f) { return ::fabsf(f); } -SLANG_FORCE_INLINE float F32_trunc(float f) { return ::truncf(f); } -SLANG_FORCE_INLINE float F32_sqrt(float f) { return ::sqrtf(f); } - -SLANG_FORCE_INLINE bool F32_isnan(float f) { return SLANG_PRELUDE_STD isnan(f); } -SLANG_FORCE_INLINE bool F32_isfinite(float f) { return SLANG_PRELUDE_STD isfinite(f); } -SLANG_FORCE_INLINE bool F32_isinf(float f) { return SLANG_PRELUDE_STD isinf(f); } +// Unary +SLANG_FORCE_INLINE float F32_ceil(float f) +{ + return ::ceilf(f); +} +SLANG_FORCE_INLINE float F32_floor(float f) +{ + return ::floorf(f); +} +SLANG_FORCE_INLINE float F32_round(float f) +{ + return ::roundf(f); +} +SLANG_FORCE_INLINE float F32_sin(float f) +{ + return ::sinf(f); +} +SLANG_FORCE_INLINE float F32_cos(float f) +{ + return ::cosf(f); +} +SLANG_FORCE_INLINE float F32_tan(float f) +{ + return ::tanf(f); +} +SLANG_FORCE_INLINE float F32_asin(float f) +{ + return ::asinf(f); +} +SLANG_FORCE_INLINE float F32_acos(float f) +{ + return ::acosf(f); +} +SLANG_FORCE_INLINE float F32_atan(float f) +{ + return ::atanf(f); +} +SLANG_FORCE_INLINE float F32_sinh(float f) +{ + return ::sinhf(f); +} +SLANG_FORCE_INLINE float F32_cosh(float f) +{ + return ::coshf(f); +} +SLANG_FORCE_INLINE float F32_tanh(float f) +{ + return ::tanhf(f); +} +SLANG_FORCE_INLINE float F32_log2(float f) +{ + return ::log2f(f); +} +SLANG_FORCE_INLINE float F32_log(float f) +{ + return ::logf(f); +} +SLANG_FORCE_INLINE float F32_log10(float f) +{ + return ::log10f(f); +} +SLANG_FORCE_INLINE float F32_exp2(float f) +{ + return ::exp2f(f); +} +SLANG_FORCE_INLINE float F32_exp(float f) +{ + return ::expf(f); +} +SLANG_FORCE_INLINE float F32_abs(float f) +{ + return ::fabsf(f); +} +SLANG_FORCE_INLINE float F32_trunc(float f) +{ + return ::truncf(f); +} +SLANG_FORCE_INLINE float F32_sqrt(float f) +{ + return ::sqrtf(f); +} + +SLANG_FORCE_INLINE bool F32_isnan(float f) +{ + return SLANG_PRELUDE_STD isnan(f); +} +SLANG_FORCE_INLINE bool F32_isfinite(float f) +{ + return SLANG_PRELUDE_STD isfinite(f); +} +SLANG_FORCE_INLINE bool F32_isinf(float f) +{ + return SLANG_PRELUDE_STD isinf(f); +} // Binary -SLANG_FORCE_INLINE float F32_min(float a, float b) { return ::fminf(a, b); } -SLANG_FORCE_INLINE float F32_max(float a, float b) { return ::fmaxf(a, b); } -SLANG_FORCE_INLINE float F32_pow(float a, float b) { return ::powf(a, b); } -SLANG_FORCE_INLINE float F32_fmod(float a, float b) { return ::fmodf(a, b); } -SLANG_FORCE_INLINE float F32_remainder(float a, float b) { return ::remainderf(a, b); } -SLANG_FORCE_INLINE float F32_atan2(float a, float b) { return float(::atan2(a, b)); } +SLANG_FORCE_INLINE float F32_min(float a, float b) +{ + return ::fminf(a, b); +} +SLANG_FORCE_INLINE float F32_max(float a, float b) +{ + return ::fmaxf(a, b); +} +SLANG_FORCE_INLINE float F32_pow(float a, float b) +{ + return ::powf(a, b); +} +SLANG_FORCE_INLINE float F32_fmod(float a, float b) +{ + return ::fmodf(a, b); +} +SLANG_FORCE_INLINE float F32_remainder(float a, float b) +{ + return ::remainderf(a, b); +} +SLANG_FORCE_INLINE float F32_atan2(float a, float b) +{ + return float(::atan2(a, b)); +} -SLANG_FORCE_INLINE float F32_frexp(float x, int* e) { return ::frexpf(x, e); } +SLANG_FORCE_INLINE float F32_frexp(float x, int* e) +{ + return ::frexpf(x, e); +} SLANG_FORCE_INLINE float F32_modf(float x, float* ip) { @@ -222,26 +344,48 @@ SLANG_FORCE_INLINE float F32_modf(float x, float* ip) } // Ternary -SLANG_FORCE_INLINE float F32_fma(float a, float b, float c) { return ::fmaf(a, b, c); } +SLANG_FORCE_INLINE float F32_fma(float a, float b, float c) +{ + return ::fmaf(a, b, c); +} #endif SLANG_FORCE_INLINE float F32_calcSafeRadians(float radians) { - // Put 0 to 2pi cycles to cycle around 0 to 1 - float a = radians * (1.0f / float(SLANG_PRELUDE_PI * 2)); + // Put 0 to 2pi cycles to cycle around 0 to 1 + float a = radians * (1.0f / float(SLANG_PRELUDE_PI * 2)); // Get truncated fraction, as value in 0 - 1 range a = a - F32_floor(a); // Convert back to 0 - 2pi range - return (a * float(SLANG_PRELUDE_PI * 2)); + return (a * float(SLANG_PRELUDE_PI * 2)); } -SLANG_FORCE_INLINE float F32_rsqrt(float f) { return 1.0f / F32_sqrt(f); } -SLANG_FORCE_INLINE float F32_sign(float f) { return ( f == 0.0f) ? f : (( f < 0.0f) ? -1.0f : 1.0f); } -SLANG_FORCE_INLINE float F32_frac(float f) { return f - F32_floor(f); } +SLANG_FORCE_INLINE float F32_rsqrt(float f) +{ + return 1.0f / F32_sqrt(f); +} +SLANG_FORCE_INLINE float F32_sign(float f) +{ + return (f == 0.0f) ? f : ((f < 0.0f) ? -1.0f : 1.0f); +} +SLANG_FORCE_INLINE float F32_frac(float f) +{ + return f - F32_floor(f); +} -SLANG_FORCE_INLINE uint32_t F32_asuint(float f) { Union32 u; u.f = f; return u.u; } -SLANG_FORCE_INLINE int32_t F32_asint(float f) { Union32 u; u.f = f; return u.i; } +SLANG_FORCE_INLINE uint32_t F32_asuint(float f) +{ + Union32 u; + u.f = f; + return u.u; +} +SLANG_FORCE_INLINE int32_t F32_asint(float f) +{ + Union32 u; + u.f = f; + return u.i; +} // ----------------------------- F64 ----------------------------------------- @@ -251,7 +395,7 @@ SLANG_FORCE_INLINE double F64_calcSafeRadians(double radians); SLANG_PRELUDE_EXTERN_C_START -// Unary +// Unary double F64_ceil(double f); double F64_floor(double f); double F64_round(double f); @@ -278,8 +422,14 @@ bool F64_isfinite(double f); bool F64_isinf(double f); // Binary -SLANG_FORCE_INLINE double F64_min(double a, double b) { return a < b ? a : b; } -SLANG_FORCE_INLINE double F64_max(double a, double b) { return a > b ? a : b; } +SLANG_FORCE_INLINE double F64_min(double a, double b) +{ + return a < b ? a : b; +} +SLANG_FORCE_INLINE double F64_max(double a, double b) +{ + return a > b ? a : b; +} double F64_pow(double a, double b); double F64_fmod(double a, double b); double F64_remainder(double a, double b); @@ -290,48 +440,141 @@ double F64_frexp(double x, int* e); double F64_modf(double x, double* ip); // Ternary -SLANG_FORCE_INLINE double F64_fma(double a, double b, double c) { return a * b + c; } +SLANG_FORCE_INLINE double F64_fma(double a, double b, double c) +{ + return a * b + c; +} SLANG_PRELUDE_EXTERN_C_END #else // SLANG_LLVM -// Unary -SLANG_FORCE_INLINE double F64_ceil(double f) { return ::ceil(f); } -SLANG_FORCE_INLINE double F64_floor(double f) { return ::floor(f); } -SLANG_FORCE_INLINE double F64_round(double f) { return ::round(f); } -SLANG_FORCE_INLINE double F64_sin(double f) { return ::sin(f); } -SLANG_FORCE_INLINE double F64_cos(double f) { return ::cos(f); } -SLANG_FORCE_INLINE double F64_tan(double f) { return ::tan(f); } -SLANG_FORCE_INLINE double F64_asin(double f) { return ::asin(f); } -SLANG_FORCE_INLINE double F64_acos(double f) { return ::acos(f); } -SLANG_FORCE_INLINE double F64_atan(double f) { return ::atan(f); } -SLANG_FORCE_INLINE double F64_sinh(double f) { return ::sinh(f); } -SLANG_FORCE_INLINE double F64_cosh(double f) { return ::cosh(f); } -SLANG_FORCE_INLINE double F64_tanh(double f) { return ::tanh(f); } -SLANG_FORCE_INLINE double F64_log2(double f) { return ::log2(f); } -SLANG_FORCE_INLINE double F64_log(double f) { return ::log(f); } -SLANG_FORCE_INLINE double F64_log10(float f) { return ::log10(f); } -SLANG_FORCE_INLINE double F64_exp2(double f) { return ::exp2(f); } -SLANG_FORCE_INLINE double F64_exp(double f) { return ::exp(f); } -SLANG_FORCE_INLINE double F64_abs(double f) { return ::fabs(f); } -SLANG_FORCE_INLINE double F64_trunc(double f) { return ::trunc(f); } -SLANG_FORCE_INLINE double F64_sqrt(double f) { return ::sqrt(f); } - - -SLANG_FORCE_INLINE bool F64_isnan(double f) { return SLANG_PRELUDE_STD isnan(f); } -SLANG_FORCE_INLINE bool F64_isfinite(double f) { return SLANG_PRELUDE_STD isfinite(f); } -SLANG_FORCE_INLINE bool F64_isinf(double f) { return SLANG_PRELUDE_STD isinf(f); } +// Unary +SLANG_FORCE_INLINE double F64_ceil(double f) +{ + return ::ceil(f); +} +SLANG_FORCE_INLINE double F64_floor(double f) +{ + return ::floor(f); +} +SLANG_FORCE_INLINE double F64_round(double f) +{ + return ::round(f); +} +SLANG_FORCE_INLINE double F64_sin(double f) +{ + return ::sin(f); +} +SLANG_FORCE_INLINE double F64_cos(double f) +{ + return ::cos(f); +} +SLANG_FORCE_INLINE double F64_tan(double f) +{ + return ::tan(f); +} +SLANG_FORCE_INLINE double F64_asin(double f) +{ + return ::asin(f); +} +SLANG_FORCE_INLINE double F64_acos(double f) +{ + return ::acos(f); +} +SLANG_FORCE_INLINE double F64_atan(double f) +{ + return ::atan(f); +} +SLANG_FORCE_INLINE double F64_sinh(double f) +{ + return ::sinh(f); +} +SLANG_FORCE_INLINE double F64_cosh(double f) +{ + return ::cosh(f); +} +SLANG_FORCE_INLINE double F64_tanh(double f) +{ + return ::tanh(f); +} +SLANG_FORCE_INLINE double F64_log2(double f) +{ + return ::log2(f); +} +SLANG_FORCE_INLINE double F64_log(double f) +{ + return ::log(f); +} +SLANG_FORCE_INLINE double F64_log10(float f) +{ + return ::log10(f); +} +SLANG_FORCE_INLINE double F64_exp2(double f) +{ + return ::exp2(f); +} +SLANG_FORCE_INLINE double F64_exp(double f) +{ + return ::exp(f); +} +SLANG_FORCE_INLINE double F64_abs(double f) +{ + return ::fabs(f); +} +SLANG_FORCE_INLINE double F64_trunc(double f) +{ + return ::trunc(f); +} +SLANG_FORCE_INLINE double F64_sqrt(double f) +{ + return ::sqrt(f); +} + + +SLANG_FORCE_INLINE bool F64_isnan(double f) +{ + return SLANG_PRELUDE_STD isnan(f); +} +SLANG_FORCE_INLINE bool F64_isfinite(double f) +{ + return SLANG_PRELUDE_STD isfinite(f); +} +SLANG_FORCE_INLINE bool F64_isinf(double f) +{ + return SLANG_PRELUDE_STD isinf(f); +} // Binary -SLANG_FORCE_INLINE double F64_min(double a, double b) { return ::fmin(a, b); } -SLANG_FORCE_INLINE double F64_max(double a, double b) { return ::fmax(a, b); } -SLANG_FORCE_INLINE double F64_pow(double a, double b) { return ::pow(a, b); } -SLANG_FORCE_INLINE double F64_fmod(double a, double b) { return ::fmod(a, b); } -SLANG_FORCE_INLINE double F64_remainder(double a, double b) { return ::remainder(a, b); } -SLANG_FORCE_INLINE double F64_atan2(double a, double b) { return ::atan2(a, b); } +SLANG_FORCE_INLINE double F64_min(double a, double b) +{ + return ::fmin(a, b); +} +SLANG_FORCE_INLINE double F64_max(double a, double b) +{ + return ::fmax(a, b); +} +SLANG_FORCE_INLINE double F64_pow(double a, double b) +{ + return ::pow(a, b); +} +SLANG_FORCE_INLINE double F64_fmod(double a, double b) +{ + return ::fmod(a, b); +} +SLANG_FORCE_INLINE double F64_remainder(double a, double b) +{ + return ::remainder(a, b); +} +SLANG_FORCE_INLINE double F64_atan2(double a, double b) +{ + return ::atan2(a, b); +} -SLANG_FORCE_INLINE double F64_frexp(double x, int* e) { return ::frexp(x, e); } +SLANG_FORCE_INLINE double F64_frexp(double x, int* e) +{ + return ::frexp(x, e); +} SLANG_FORCE_INLINE double F64_modf(double x, double* ip) { @@ -339,13 +582,25 @@ SLANG_FORCE_INLINE double F64_modf(double x, double* ip) } // Ternary -SLANG_FORCE_INLINE double F64_fma(double a, double b, double c) { return ::fma(a, b, c); } +SLANG_FORCE_INLINE double F64_fma(double a, double b, double c) +{ + return ::fma(a, b, c); +} #endif // SLANG_LLVM -SLANG_FORCE_INLINE double F64_rsqrt(double f) { return 1.0 / F64_sqrt(f); } -SLANG_FORCE_INLINE double F64_sign(double f) { return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); } -SLANG_FORCE_INLINE double F64_frac(double f) { return f - F64_floor(f); } +SLANG_FORCE_INLINE double F64_rsqrt(double f) +{ + return 1.0 / F64_sqrt(f); +} +SLANG_FORCE_INLINE double F64_sign(double f) +{ + return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); +} +SLANG_FORCE_INLINE double F64_frac(double f) +{ + return f - F64_floor(f); +} SLANG_FORCE_INLINE void F64_asuint(double d, uint32_t* low, uint32_t* hi) { @@ -365,24 +620,41 @@ SLANG_FORCE_INLINE void F64_asint(double d, int32_t* low, int32_t* hi) SLANG_FORCE_INLINE double F64_calcSafeRadians(double radians) { - // Put 0 to 2pi cycles to cycle around 0 to 1 - double a = radians * (1.0f / (SLANG_PRELUDE_PI * 2)); + // Put 0 to 2pi cycles to cycle around 0 to 1 + double a = radians * (1.0f / (SLANG_PRELUDE_PI * 2)); // Get truncated fraction, as value in 0 - 1 range a = a - F64_floor(a); // Convert back to 0 - 2pi range - return (a * (SLANG_PRELUDE_PI * 2)); + return (a * (SLANG_PRELUDE_PI * 2)); } // ----------------------------- I32 ----------------------------------------- -SLANG_FORCE_INLINE int32_t I32_abs(int32_t f) { return (f < 0) ? -f : f; } +SLANG_FORCE_INLINE int32_t I32_abs(int32_t f) +{ + return (f < 0) ? -f : f; +} -SLANG_FORCE_INLINE int32_t I32_min(int32_t a, int32_t b) { return a < b ? a : b; } -SLANG_FORCE_INLINE int32_t I32_max(int32_t a, int32_t b) { return a > b ? a : b; } +SLANG_FORCE_INLINE int32_t I32_min(int32_t a, int32_t b) +{ + return a < b ? a : b; +} +SLANG_FORCE_INLINE int32_t I32_max(int32_t a, int32_t b) +{ + return a > b ? a : b; +} -SLANG_FORCE_INLINE float I32_asfloat(int32_t x) { Union32 u; u.i = x; return u.f; } -SLANG_FORCE_INLINE uint32_t I32_asuint(int32_t x) { return uint32_t(x); } -SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi ) +SLANG_FORCE_INLINE float I32_asfloat(int32_t x) +{ + Union32 u; + u.i = x; + return u.f; +} +SLANG_FORCE_INLINE uint32_t I32_asuint(int32_t x) +{ + return uint32_t(x); +} +SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi) { Union64 u; u.u = (uint64_t(hi) << 32) | uint32_t(low); @@ -391,13 +663,30 @@ SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi ) // ----------------------------- U32 ----------------------------------------- -SLANG_FORCE_INLINE uint32_t U32_abs(uint32_t f) { return f; } +SLANG_FORCE_INLINE uint32_t U32_abs(uint32_t f) +{ + return f; +} -SLANG_FORCE_INLINE uint32_t U32_min(uint32_t a, uint32_t b) { return a < b ? a : b; } -SLANG_FORCE_INLINE uint32_t U32_max(uint32_t a, uint32_t b) { return a > b ? a : b; } +SLANG_FORCE_INLINE uint32_t U32_min(uint32_t a, uint32_t b) +{ + return a < b ? a : b; +} +SLANG_FORCE_INLINE uint32_t U32_max(uint32_t a, uint32_t b) +{ + return a > b ? a : b; +} -SLANG_FORCE_INLINE float U32_asfloat(uint32_t x) { Union32 u; u.u = x; return u.f; } -SLANG_FORCE_INLINE uint32_t U32_asint(int32_t x) { return uint32_t(x); } +SLANG_FORCE_INLINE float U32_asfloat(uint32_t x) +{ + Union32 u; + u.u = x; + return u.f; +} +SLANG_FORCE_INLINE uint32_t U32_asint(int32_t x) +{ + return uint32_t(x); +} SLANG_FORCE_INLINE double U32_asdouble(uint32_t low, uint32_t hi) { @@ -413,7 +702,7 @@ SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v) return __builtin_popcount(v); #elif SLANG_PROCESSOR_X86_64 && SLANG_VC return __popcnt(v); -#else +#else uint32_t c = 0; while (v) { @@ -426,21 +715,30 @@ SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v) // ----------------------------- U64 ----------------------------------------- -SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f) { return f; } +SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f) +{ + return f; +} -SLANG_FORCE_INLINE uint64_t U64_min(uint64_t a, uint64_t b) { return a < b ? a : b; } -SLANG_FORCE_INLINE uint64_t U64_max(uint64_t a, uint64_t b) { return a > b ? a : b; } +SLANG_FORCE_INLINE uint64_t U64_min(uint64_t a, uint64_t b) +{ + return a < b ? a : b; +} +SLANG_FORCE_INLINE uint64_t U64_max(uint64_t a, uint64_t b) +{ + return a > b ? a : b; +} // TODO(JS): We don't define countbits for 64bit in the core module currently. -// It's not clear from documentation if it should return 32 or 64 bits, if it exists. -// 32 bits can always hold the result, and will be implicitly promoted. +// It's not clear from documentation if it should return 32 or 64 bits, if it exists. +// 32 bits can always hold the result, and will be implicitly promoted. SLANG_FORCE_INLINE uint32_t U64_countbits(uint64_t v) { -#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM) +#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM) return uint32_t(__builtin_popcountl(v)); #elif SLANG_PROCESSOR_X86_64 && SLANG_VC return uint32_t(__popcnt64(v)); -#else +#else uint32_t c = 0; while (v) { @@ -453,10 +751,19 @@ SLANG_FORCE_INLINE uint32_t U64_countbits(uint64_t v) // ----------------------------- I64 ----------------------------------------- -SLANG_FORCE_INLINE int64_t I64_abs(int64_t f) { return (f < 0) ? -f : f; } +SLANG_FORCE_INLINE int64_t I64_abs(int64_t f) +{ + return (f < 0) ? -f : f; +} -SLANG_FORCE_INLINE int64_t I64_min(int64_t a, int64_t b) { return a < b ? a : b; } -SLANG_FORCE_INLINE int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b; } +SLANG_FORCE_INLINE int64_t I64_min(int64_t a, int64_t b) +{ + return a < b ? a : b; +} +SLANG_FORCE_INLINE int64_t I64_max(int64_t a, int64_t b) +{ + return a > b ? a : b; +} // ----------------------------- Interlocked --------------------------------- @@ -465,17 +772,17 @@ SLANG_FORCE_INLINE int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b; #else // SLANG_LLVM -# ifdef _WIN32 -# include <intrin.h> -# endif +#ifdef _WIN32 +#include <intrin.h> +#endif SLANG_FORCE_INLINE void InterlockedAdd(uint32_t* dest, uint32_t value, uint32_t* oldValue) { -# ifdef _WIN32 +#ifdef _WIN32 *oldValue = _InterlockedExchangeAdd((long*)dest, (long)value); -# else +#else *oldValue = __sync_fetch_and_add(dest, value); -# endif +#endif } #endif // SLANG_LLVM @@ -492,7 +799,7 @@ SLANG_FORCE_INLINE double _slang_fmod(double x, double y) } #ifdef SLANG_PRELUDE_NAMESPACE -} +} #endif #endif diff --git a/prelude/slang-cpp-types-core.h b/prelude/slang-cpp-types-core.h index 25fe47202..6c0bb7544 100644 --- a/prelude/slang-cpp-types-core.h +++ b/prelude/slang-cpp-types-core.h @@ -2,11 +2,11 @@ #define SLANG_PRELUDE_CPP_TYPES_CORE_H #ifndef SLANG_PRELUDE_ASSERT -# ifdef SLANG_PRELUDE_ENABLE_ASSERT -# define SLANG_PRELUDE_ASSERT(VALUE) assert(VALUE) -# else -# define SLANG_PRELUDE_ASSERT(VALUE) -# endif +#ifdef SLANG_PRELUDE_ENABLE_ASSERT +#define SLANG_PRELUDE_ASSERT(VALUE) assert(VALUE) +#else +#define SLANG_PRELUDE_ASSERT(VALUE) +#endif #endif // Since we are using unsigned arithmatic care is need in this comparison. @@ -15,35 +15,42 @@ // Asserts for bounds checking. // It is assumed index/count are unsigned types. -#define SLANG_BOUND_ASSERT(index, count) SLANG_PRELUDE_ASSERT(index < count); -#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0); +#define SLANG_BOUND_ASSERT(index, count) SLANG_PRELUDE_ASSERT(index < count); +#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) \ + SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0); // Macros to zero index if an access is out of range -#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; -#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) index = (index <= (sizeInBytes - elemSize)) ? index : 0; - -// The 'FIX' macro define how the index is fixed. The default is to do nothing. If SLANG_ENABLE_BOUND_ZERO_INDEX -// the fix macro will zero the index, if out of range -#ifdef SLANG_ENABLE_BOUND_ZERO_INDEX -# define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count) -# define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) -# define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count) +#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; +#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) \ + index = (index <= (sizeInBytes - elemSize)) ? index : 0; + +// The 'FIX' macro define how the index is fixed. The default is to do nothing. If +// SLANG_ENABLE_BOUND_ZERO_INDEX the fix macro will zero the index, if out of range +#ifdef SLANG_ENABLE_BOUND_ZERO_INDEX +#define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count) +#define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) \ + SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +#define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count) #else -# define SLANG_BOUND_FIX(index, count) -# define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) -# define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) +#define SLANG_BOUND_FIX(index, count) +#define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +#define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) #endif #ifndef SLANG_BOUND_CHECK -# define SLANG_BOUND_CHECK(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count) +#define SLANG_BOUND_CHECK(index, count) \ + SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count) #endif #ifndef SLANG_BOUND_CHECK_BYTE_ADDRESS -# define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +#define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) \ + SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) \ + SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) #endif #ifndef SLANG_BOUND_CHECK_FIXED_ARRAY -# define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count) +#define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) \ + SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count) #endif struct TypeInfo @@ -51,34 +58,51 @@ struct TypeInfo size_t typeSize; }; -template <typename T, size_t SIZE> +template<typename T, size_t SIZE> struct FixedArray { - const T& operator[](size_t index) const { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; } - T& operator[](size_t index) { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; } + const T& operator[](size_t index) const + { + SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); + return m_data[index]; + } + T& operator[](size_t index) + { + SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); + return m_data[index]; + } T m_data[SIZE]; }; -// An array that has no specified size, becomes a 'Array'. This stores the size so it can potentially -// do bounds checking. -template <typename T> +// An array that has no specified size, becomes a 'Array'. This stores the size so it can +// potentially do bounds checking. +template<typename T> struct Array { - const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } - T& operator[](size_t index) { SLANG_BOUND_CHECK(index, count); return data[index]; } + const T& operator[](size_t index) const + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } + T& operator[](size_t index) + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } T* data; size_t count; }; -/* Constant buffers become a pointer to the contained type, so ConstantBuffer<T> becomes T* in C++ code. -*/ +/* Constant buffers become a pointer to the contained type, so ConstantBuffer<T> becomes T* in C++ + * code. + */ -template <typename T, int COUNT> +template<typename T, int COUNT> struct Vector; -template <typename T> +template<typename T> struct Vector<T, 1> { T x; @@ -86,58 +110,54 @@ struct Vector<T, 1> T& operator[](size_t /*index*/) { return x; } operator T() const { return x; } Vector() = default; - Vector(T scalar) - { - x = scalar; - } - template <typename U> + Vector(T scalar) { x = scalar; } + template<typename U> Vector(Vector<U, 1> other) { x = (T)other.x; } - template <typename U, int otherSize> + template<typename U, int otherSize> Vector(Vector<U, otherSize> other) { int minSize = 1; - if (otherSize < minSize) minSize = otherSize; + if (otherSize < minSize) + minSize = otherSize; for (int i = 0; i < minSize; i++) (*this)[i] = (T)other[i]; } }; -template <typename T> +template<typename T> struct Vector<T, 2> { T x, y; const T& operator[](size_t index) const { return index == 0 ? x : y; } T& operator[](size_t index) { return index == 0 ? x : y; } Vector() = default; - Vector(T scalar) - { - x = y = scalar; - } + Vector(T scalar) { x = y = scalar; } Vector(T _x, T _y) { x = _x; y = _y; } - template <typename U> + template<typename U> Vector(Vector<U, 2> other) { x = (T)other.x; y = (T)other.y; } - template <typename U, int otherSize> + template<typename U, int otherSize> Vector(Vector<U, otherSize> other) { int minSize = 2; - if (otherSize < minSize) minSize = otherSize; + if (otherSize < minSize) + minSize = otherSize; for (int i = 0; i < minSize; i++) (*this)[i] = (T)other[i]; } }; -template <typename T> +template<typename T> struct Vector<T, 3> { T x, y, z; @@ -145,34 +165,32 @@ struct Vector<T, 3> T& operator[](size_t index) { return *((T*)(this) + index); } Vector() = default; - Vector(T scalar) - { - x = y = z = scalar; - } + Vector(T scalar) { x = y = z = scalar; } Vector(T _x, T _y, T _z) { x = _x; y = _y; z = _z; } - template <typename U> + template<typename U> Vector(Vector<U, 3> other) { x = (T)other.x; y = (T)other.y; z = (T)other.z; } - template <typename U, int otherSize> + template<typename U, int otherSize> Vector(Vector<U, otherSize> other) { int minSize = 3; - if (otherSize < minSize) minSize = otherSize; + if (otherSize < minSize) + minSize = otherSize; for (int i = 0; i < minSize; i++) (*this)[i] = (T)other[i]; } }; -template <typename T> +template<typename T> struct Vector<T, 4> { T x, y, z, w; @@ -180,10 +198,7 @@ struct Vector<T, 4> const T& operator[](size_t index) const { return *((T*)(this) + index); } T& operator[](size_t index) { return *((T*)(this) + index); } Vector() = default; - Vector(T scalar) - { - x = y = z = w = scalar; - } + Vector(T scalar) { x = y = z = w = scalar; } Vector(T _x, T _y, T _z, T _w) { x = _x; @@ -191,19 +206,22 @@ struct Vector<T, 4> z = _z; w = _w; } - template <typename U, int otherSize> + template<typename U, int otherSize> Vector(Vector<U, otherSize> other) { int minSize = 4; - if (otherSize < minSize) minSize = otherSize; + if (otherSize < minSize) + minSize = otherSize; for (int i = 0; i < minSize; i++) (*this)[i] = (T)other[i]; } - }; template<typename T, int N> -SLANG_FORCE_INLINE Vector<T, N> _slang_select(Vector<bool, N> condition, Vector<T, N> v0, Vector<T, N> v1) +SLANG_FORCE_INLINE Vector<T, N> _slang_select( + Vector<bool, N> condition, + Vector<T, N> v0, + Vector<T, N> v1) { Vector<T, N> result; for (int i = 0; i < N; i++) @@ -228,7 +246,7 @@ SLANG_FORCE_INLINE T _slang_vector_get_element(Vector<T, N> x, int index) template<typename T, int N> SLANG_FORCE_INLINE const T* _slang_vector_get_element_ptr(const Vector<T, N>* x, int index) { - return &((*const_cast<Vector<T,N>*>(x))[index]); + return &((*const_cast<Vector<T, N>*>(x))[index]); } template<typename T, int N> @@ -253,66 +271,70 @@ SLANG_FORCE_INLINE Vector<T, n> _slang_vector_reshape(const Vector<OtherT, m> ot typedef uint32_t uint; -#define SLANG_VECTOR_BINARY_OP(T, op) \ - template<int n> \ - SLANG_FORCE_INLINE Vector<T, n> operator op(const Vector<T, n>& thisVal, const Vector<T, n>& other) \ - { \ - Vector<T, n> result;\ - for (int i = 0; i < n; i++) \ - result[i] = thisVal[i] op other[i]; \ - return result;\ - } -#define SLANG_VECTOR_BINARY_COMPARE_OP(T, op) \ - template<int n> \ - SLANG_FORCE_INLINE Vector<bool, n> operator op(const Vector<T, n>& thisVal, const Vector<T, n>& other) \ - { \ - Vector<bool, n> result;\ - for (int i = 0; i < n; i++) \ - result[i] = thisVal[i] op other[i]; \ - return result;\ - } - -#define SLANG_VECTOR_UNARY_OP(T, op) \ - template<int n> \ +#define SLANG_VECTOR_BINARY_OP(T, op) \ + template<int n> \ + SLANG_FORCE_INLINE Vector<T, n> operator op( \ + const Vector<T, n>& thisVal, \ + const Vector<T, n>& other) \ + { \ + Vector<T, n> result; \ + for (int i = 0; i < n; i++) \ + result[i] = thisVal[i] op other[i]; \ + return result; \ + } +#define SLANG_VECTOR_BINARY_COMPARE_OP(T, op) \ + template<int n> \ + SLANG_FORCE_INLINE Vector<bool, n> operator op( \ + const Vector<T, n>& thisVal, \ + const Vector<T, n>& other) \ + { \ + Vector<bool, n> result; \ + for (int i = 0; i < n; i++) \ + result[i] = thisVal[i] op other[i]; \ + return result; \ + } + +#define SLANG_VECTOR_UNARY_OP(T, op) \ + template<int n> \ SLANG_FORCE_INLINE Vector<T, n> operator op(const Vector<T, n>& thisVal) \ - { \ - Vector<T, n> result;\ - for (int i = 0; i < n; i++) \ - result[i] = op thisVal[i]; \ - return result;\ - } -#define SLANG_INT_VECTOR_OPS(T) \ - SLANG_VECTOR_BINARY_OP(T, +)\ - SLANG_VECTOR_BINARY_OP(T, -)\ - SLANG_VECTOR_BINARY_OP(T, *)\ - SLANG_VECTOR_BINARY_OP(T, / )\ - SLANG_VECTOR_BINARY_OP(T, &)\ - SLANG_VECTOR_BINARY_OP(T, |)\ - SLANG_VECTOR_BINARY_OP(T, &&)\ - SLANG_VECTOR_BINARY_OP(T, ||)\ - SLANG_VECTOR_BINARY_OP(T, ^)\ - SLANG_VECTOR_BINARY_OP(T, %)\ - SLANG_VECTOR_BINARY_OP(T, >>)\ - SLANG_VECTOR_BINARY_OP(T, <<)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, >)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, <)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, >=)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, <=)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, ==)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, !=)\ - SLANG_VECTOR_UNARY_OP(T, !)\ + { \ + Vector<T, n> result; \ + for (int i = 0; i < n; i++) \ + result[i] = op thisVal[i]; \ + return result; \ + } +#define SLANG_INT_VECTOR_OPS(T) \ + SLANG_VECTOR_BINARY_OP(T, +) \ + SLANG_VECTOR_BINARY_OP(T, -) \ + SLANG_VECTOR_BINARY_OP(T, *) \ + SLANG_VECTOR_BINARY_OP(T, /) \ + SLANG_VECTOR_BINARY_OP(T, &) \ + SLANG_VECTOR_BINARY_OP(T, |) \ + SLANG_VECTOR_BINARY_OP(T, &&) \ + SLANG_VECTOR_BINARY_OP(T, ||) \ + SLANG_VECTOR_BINARY_OP(T, ^) \ + SLANG_VECTOR_BINARY_OP(T, %) \ + SLANG_VECTOR_BINARY_OP(T, >>) \ + SLANG_VECTOR_BINARY_OP(T, <<) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, >) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, <) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, >=) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, <=) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, ==) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, !=) \ + SLANG_VECTOR_UNARY_OP(T, !) \ SLANG_VECTOR_UNARY_OP(T, ~) -#define SLANG_FLOAT_VECTOR_OPS(T) \ - SLANG_VECTOR_BINARY_OP(T, +)\ - SLANG_VECTOR_BINARY_OP(T, -)\ - SLANG_VECTOR_BINARY_OP(T, *)\ - SLANG_VECTOR_BINARY_OP(T, /)\ - SLANG_VECTOR_UNARY_OP(T, -)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, >)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, <)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, >=)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, <=)\ - SLANG_VECTOR_BINARY_COMPARE_OP(T, ==)\ +#define SLANG_FLOAT_VECTOR_OPS(T) \ + SLANG_VECTOR_BINARY_OP(T, +) \ + SLANG_VECTOR_BINARY_OP(T, -) \ + SLANG_VECTOR_BINARY_OP(T, *) \ + SLANG_VECTOR_BINARY_OP(T, /) \ + SLANG_VECTOR_UNARY_OP(T, -) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, >) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, <) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, >=) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, <=) \ + SLANG_VECTOR_BINARY_COMPARE_OP(T, ==) \ SLANG_VECTOR_BINARY_COMPARE_OP(T, !=) SLANG_INT_VECTOR_OPS(bool) @@ -328,14 +350,14 @@ SLANG_INT_VECTOR_OPS(uint64_t) SLANG_FLOAT_VECTOR_OPS(float) SLANG_FLOAT_VECTOR_OPS(double) -#define SLANG_VECTOR_INT_NEG_OP(T) \ - template<int N>\ +#define SLANG_VECTOR_INT_NEG_OP(T) \ + template<int N> \ Vector<T, N> operator-(const Vector<T, N>& thisVal) \ - { \ - Vector<T, N> result;\ - for (int i = 0; i < N; i++) \ - result[i] = 0 - thisVal[i]; \ - return result;\ + { \ + Vector<T, N> result; \ + for (int i = 0; i < N; i++) \ + result[i] = 0 - thisVal[i]; \ + return result; \ } SLANG_VECTOR_INT_NEG_OP(int) SLANG_VECTOR_INT_NEG_OP(int8_t) @@ -346,14 +368,14 @@ SLANG_VECTOR_INT_NEG_OP(uint8_t) SLANG_VECTOR_INT_NEG_OP(uint16_t) SLANG_VECTOR_INT_NEG_OP(uint64_t) -#define SLANG_FLOAT_VECTOR_MOD(T)\ - template<int N> \ +#define SLANG_FLOAT_VECTOR_MOD(T) \ + template<int N> \ Vector<T, N> operator%(const Vector<T, N>& left, const Vector<T, N>& right) \ - {\ - Vector<T, N> result;\ - for (int i = 0; i < N; i++) \ - result[i] = _slang_fmod(left[i], right[i]); \ - return result;\ + { \ + Vector<T, N> result; \ + for (int i = 0; i < N; i++) \ + result[i] = _slang_fmod(left[i], right[i]); \ + return result; \ } SLANG_FLOAT_VECTOR_MOD(float) @@ -366,7 +388,7 @@ SLANG_FLOAT_VECTOR_MOD(double) #undef SLANG_VECTOR_INT_NEG_OP #undef SLANG_FLOAT_VECTOR_MOD -template <typename T, int ROWS, int COLS> +template<typename T, int ROWS, int COLS> struct Matrix { Vector<T, COLS> rows[ROWS]; @@ -377,10 +399,7 @@ struct Matrix for (int i = 0; i < ROWS; i++) rows[i] = Vector<T, COLS>(scalar); } - Matrix(const Vector<T, COLS>& row0) - { - rows[0] = row0; - } + Matrix(const Vector<T, COLS>& row0) { rows[0] = row0; } Matrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1) { rows[0] = row0; @@ -392,7 +411,11 @@ struct Matrix rows[1] = row1; rows[2] = row2; } - Matrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2, const Vector<T, COLS>& row3) + Matrix( + const Vector<T, COLS>& row0, + const Vector<T, COLS>& row1, + const Vector<T, COLS>& row2, + const Vector<T, COLS>& row3) { rows[0] = row0; rows[1] = row1; @@ -404,116 +427,188 @@ struct Matrix { int minRow = ROWS; int minCol = COLS; - if (minRow > otherRow) minRow = otherRow; - if (minCol > otherCol) minCol = otherCol; + if (minRow > otherRow) + minRow = otherRow; + if (minCol > otherCol) + minCol = otherCol; for (int i = 0; i < minRow; i++) for (int j = 0; j < minCol; j++) rows[i][j] = (T)other.rows[i][j]; } Matrix(T v0, T v1, T v2, T v3) { - rows[0][0] = v0; rows[0][1] = v1; - rows[1][0] = v2; rows[1][1] = v3; + rows[0][0] = v0; + rows[0][1] = v1; + rows[1][0] = v2; + rows[1][1] = v3; } Matrix(T v0, T v1, T v2, T v3, T v4, T v5) { if (COLS == 3) { - rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; - rows[1][0] = v3; rows[1][1] = v4; rows[1][2] = v5; + rows[0][0] = v0; + rows[0][1] = v1; + rows[0][2] = v2; + rows[1][0] = v3; + rows[1][1] = v4; + rows[1][2] = v5; } else { - rows[0][0] = v0; rows[0][1] = v1; - rows[1][0] = v2; rows[1][1] = v3; - rows[2][0] = v4; rows[2][1] = v5; + rows[0][0] = v0; + rows[0][1] = v1; + rows[1][0] = v2; + rows[1][1] = v3; + rows[2][0] = v4; + rows[2][1] = v5; } } Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { if (COLS == 4) { - rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; rows[0][3] = v3; - rows[1][0] = v4; rows[1][1] = v5; rows[1][2] = v6; rows[1][3] = v7; + rows[0][0] = v0; + rows[0][1] = v1; + rows[0][2] = v2; + rows[0][3] = v3; + rows[1][0] = v4; + rows[1][1] = v5; + rows[1][2] = v6; + rows[1][3] = v7; } else { - rows[0][0] = v0; rows[0][1] = v1; - rows[1][0] = v2; rows[1][1] = v3; - rows[2][0] = v4; rows[2][1] = v5; - rows[3][0] = v6; rows[3][1] = v7; + rows[0][0] = v0; + rows[0][1] = v1; + rows[1][0] = v2; + rows[1][1] = v3; + rows[2][0] = v4; + rows[2][1] = v5; + rows[3][0] = v6; + rows[3][1] = v7; } } Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8) { - rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; - rows[1][0] = v3; rows[1][1] = v4; rows[1][2] = v5; - rows[2][0] = v6; rows[2][1] = v7; rows[2][2] = v8; + rows[0][0] = v0; + rows[0][1] = v1; + rows[0][2] = v2; + rows[1][0] = v3; + rows[1][1] = v4; + rows[1][2] = v5; + rows[2][0] = v6; + rows[2][1] = v7; + rows[2][2] = v8; } Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11) { if (COLS == 4) { - rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; rows[0][3] = v3; - rows[1][0] = v4; rows[1][1] = v5; rows[1][2] = v6; rows[1][3] = v7; - rows[2][0] = v8; rows[2][1] = v9; rows[2][2] = v10; rows[2][3] = v11; + rows[0][0] = v0; + rows[0][1] = v1; + rows[0][2] = v2; + rows[0][3] = v3; + rows[1][0] = v4; + rows[1][1] = v5; + rows[1][2] = v6; + rows[1][3] = v7; + rows[2][0] = v8; + rows[2][1] = v9; + rows[2][2] = v10; + rows[2][3] = v11; } else { - rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; - rows[1][0] = v3; rows[1][1] = v4; rows[1][2] = v5; - rows[2][0] = v6; rows[2][1] = v7; rows[2][2] = v8; - rows[3][0] = v9; rows[3][1] = v10; rows[3][2] = v11; + rows[0][0] = v0; + rows[0][1] = v1; + rows[0][2] = v2; + rows[1][0] = v3; + rows[1][1] = v4; + rows[1][2] = v5; + rows[2][0] = v6; + rows[2][1] = v7; + rows[2][2] = v8; + rows[3][0] = v9; + rows[3][1] = v10; + rows[3][2] = v11; } } - Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) + Matrix( + T v0, + T v1, + T v2, + T v3, + T v4, + T v5, + T v6, + T v7, + T v8, + T v9, + T v10, + T v11, + T v12, + T v13, + T v14, + T v15) { - rows[0][0] = v0; rows[0][1] = v1; rows[0][2] = v2; rows[0][3] = v3; - rows[1][0] = v4; rows[1][1] = v5; rows[1][2] = v6; rows[1][3] = v7; - rows[2][0] = v8; rows[2][1] = v9; rows[2][2] = v10; rows[2][3] = v11; - rows[3][0] = v12; rows[3][1] = v13; rows[3][2] = v14; rows[3][3] = v15; + rows[0][0] = v0; + rows[0][1] = v1; + rows[0][2] = v2; + rows[0][3] = v3; + rows[1][0] = v4; + rows[1][1] = v5; + rows[1][2] = v6; + rows[1][3] = v7; + rows[2][0] = v8; + rows[2][1] = v9; + rows[2][2] = v10; + rows[2][3] = v11; + rows[3][0] = v12; + rows[3][1] = v13; + rows[3][2] = v14; + rows[3][3] = v15; } }; -#define SLANG_MATRIX_BINARY_OP(T, op) \ - template<int R, int C> \ +#define SLANG_MATRIX_BINARY_OP(T, op) \ + template<int R, int C> \ Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal, const Matrix<T, R, C>& other) \ - { \ - Matrix<T, R, C> result;\ - for (int i = 0; i < R; i++) \ - for (int j = 0; j < C; j++) \ - result.rows[i][j] = thisVal.rows[i][j] op other.rows[i][j]; \ - return result;\ + { \ + Matrix<T, R, C> result; \ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + result.rows[i][j] = thisVal.rows[i][j] op other.rows[i][j]; \ + return result; \ } -#define SLANG_MATRIX_UNARY_OP(T, op) \ - template<int R, int C> \ +#define SLANG_MATRIX_UNARY_OP(T, op) \ + template<int R, int C> \ Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal) \ - { \ - Matrix<T, R, C> result;\ - for (int i = 0; i < R; i++) \ - for (int j = 0; j < C; j++) \ - result[i].rows[i][j] = op thisVal.rows[i][j]; \ - return result;\ - } -#define SLANG_INT_MATRIX_OPS(T) \ - SLANG_MATRIX_BINARY_OP(T, +)\ - SLANG_MATRIX_BINARY_OP(T, -)\ - SLANG_MATRIX_BINARY_OP(T, *)\ - SLANG_MATRIX_BINARY_OP(T, / )\ - SLANG_MATRIX_BINARY_OP(T, &)\ - SLANG_MATRIX_BINARY_OP(T, |)\ - SLANG_MATRIX_BINARY_OP(T, &&)\ - SLANG_MATRIX_BINARY_OP(T, ||)\ - SLANG_MATRIX_BINARY_OP(T, ^)\ - SLANG_MATRIX_BINARY_OP(T, %)\ - SLANG_MATRIX_UNARY_OP(T, !)\ + { \ + Matrix<T, R, C> result; \ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + result[i].rows[i][j] = op thisVal.rows[i][j]; \ + return result; \ + } +#define SLANG_INT_MATRIX_OPS(T) \ + SLANG_MATRIX_BINARY_OP(T, +) \ + SLANG_MATRIX_BINARY_OP(T, -) \ + SLANG_MATRIX_BINARY_OP(T, *) \ + SLANG_MATRIX_BINARY_OP(T, /) \ + SLANG_MATRIX_BINARY_OP(T, &) \ + SLANG_MATRIX_BINARY_OP(T, |) \ + SLANG_MATRIX_BINARY_OP(T, &&) \ + SLANG_MATRIX_BINARY_OP(T, ||) \ + SLANG_MATRIX_BINARY_OP(T, ^) \ + SLANG_MATRIX_BINARY_OP(T, %) \ + SLANG_MATRIX_UNARY_OP(T, !) \ SLANG_MATRIX_UNARY_OP(T, ~) #define SLANG_FLOAT_MATRIX_OPS(T) \ - SLANG_MATRIX_BINARY_OP(T, +)\ - SLANG_MATRIX_BINARY_OP(T, -)\ - SLANG_MATRIX_BINARY_OP(T, *)\ - SLANG_MATRIX_BINARY_OP(T, /)\ + SLANG_MATRIX_BINARY_OP(T, +) \ + SLANG_MATRIX_BINARY_OP(T, -) \ + SLANG_MATRIX_BINARY_OP(T, *) \ + SLANG_MATRIX_BINARY_OP(T, /) \ SLANG_MATRIX_UNARY_OP(T, -) SLANG_INT_MATRIX_OPS(int) SLANG_INT_MATRIX_OPS(int8_t) @@ -527,38 +622,38 @@ SLANG_INT_MATRIX_OPS(uint64_t) SLANG_FLOAT_MATRIX_OPS(float) SLANG_FLOAT_MATRIX_OPS(double) -#define SLANG_MATRIX_INT_NEG_OP(T) \ - template<int R, int C>\ +#define SLANG_MATRIX_INT_NEG_OP(T) \ + template<int R, int C> \ SLANG_FORCE_INLINE Matrix<T, R, C> operator-(Matrix<T, R, C> thisVal) \ - { \ - Matrix<T, R, C> result;\ - for (int i = 0; i < R; i++) \ - for (int j = 0; j < C; j++) \ - result.rows[i][j] = 0 - thisVal.rows[i][j]; \ - return result;\ - } - SLANG_MATRIX_INT_NEG_OP(int) - SLANG_MATRIX_INT_NEG_OP(int8_t) - SLANG_MATRIX_INT_NEG_OP(int16_t) - SLANG_MATRIX_INT_NEG_OP(int64_t) - SLANG_MATRIX_INT_NEG_OP(uint) - SLANG_MATRIX_INT_NEG_OP(uint8_t) - SLANG_MATRIX_INT_NEG_OP(uint16_t) - SLANG_MATRIX_INT_NEG_OP(uint64_t) - -#define SLANG_FLOAT_MATRIX_MOD(T)\ - template<int R, int C> \ + { \ + Matrix<T, R, C> result; \ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + result.rows[i][j] = 0 - thisVal.rows[i][j]; \ + return result; \ + } +SLANG_MATRIX_INT_NEG_OP(int) +SLANG_MATRIX_INT_NEG_OP(int8_t) +SLANG_MATRIX_INT_NEG_OP(int16_t) +SLANG_MATRIX_INT_NEG_OP(int64_t) +SLANG_MATRIX_INT_NEG_OP(uint) +SLANG_MATRIX_INT_NEG_OP(uint8_t) +SLANG_MATRIX_INT_NEG_OP(uint16_t) +SLANG_MATRIX_INT_NEG_OP(uint64_t) + +#define SLANG_FLOAT_MATRIX_MOD(T) \ + template<int R, int C> \ SLANG_FORCE_INLINE Matrix<T, R, C> operator%(Matrix<T, R, C> left, Matrix<T, R, C> right) \ - {\ - Matrix<T, R, C> result;\ - for (int i = 0; i < R; i++) \ - for (int j = 0; j < C; j++) \ - result.rows[i][j] = _slang_fmod(left.rows[i][j], right.rows[i][j]); \ - return result;\ + { \ + Matrix<T, R, C> result; \ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + result.rows[i][j] = _slang_fmod(left.rows[i][j], right.rows[i][j]); \ + return result; \ } - SLANG_FLOAT_MATRIX_MOD(float) - SLANG_FLOAT_MATRIX_MOD(double) +SLANG_FLOAT_MATRIX_MOD(float) +SLANG_FLOAT_MATRIX_MOD(double) #undef SLANG_FLOAT_MATRIX_MOD #undef SLANG_MATRIX_BINARY_OP #undef SLANG_MATRIX_UNARY_OP @@ -574,5 +669,3 @@ TResult slang_bit_cast(TInput val) } #endif - - diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h index 3f805a8b7..010ab8d6c 100644 --- a/prelude/slang-cpp-types.h +++ b/prelude/slang-cpp-types.h @@ -2,11 +2,12 @@ #define SLANG_PRELUDE_CPP_TYPES_H #ifdef SLANG_PRELUDE_NAMESPACE -namespace SLANG_PRELUDE_NAMESPACE { +namespace SLANG_PRELUDE_NAMESPACE +{ #endif #ifndef SLANG_FORCE_INLINE -# define SLANG_FORCE_INLINE inline +#define SLANG_FORCE_INLINE inline #endif #include "slang-cpp-types-core.h" @@ -23,8 +24,8 @@ typedef Vector<uint32_t, 2> uint2; typedef Vector<uint32_t, 3> uint3; typedef Vector<uint32_t, 4> uint4; -// We can just map `NonUniformResourceIndex` type directly to the index type on CPU, as CPU does not require -// any special handling around such accesses. +// We can just map `NonUniformResourceIndex` type directly to the index type on CPU, as CPU does not +// require any special handling around such accesses. typedef size_t NonUniformResourceIndex; // ----------------------------- ResourceType ----------------------------------------- @@ -32,47 +33,87 @@ typedef size_t NonUniformResourceIndex; // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-structuredbuffer-getdimensions // Missing Load(_In_ int Location, _Out_ uint Status); -template <typename T> +template<typename T> struct RWStructuredBuffer { - SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } - const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } - void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); } - + SLANG_FORCE_INLINE T& operator[](size_t index) const + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } + const T& Load(size_t index) const + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } + void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) + { + *outNumStructs = uint32_t(count); + *outStride = uint32_t(sizeof(T)); + } + T* data; size_t count; }; -template <typename T> +template<typename T> struct StructuredBuffer { - SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } - const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } - void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); } - + SLANG_FORCE_INLINE const T& operator[](size_t index) const + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } + const T& Load(size_t index) const + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } + void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) + { + *outNumStructs = uint32_t(count); + *outStride = uint32_t(sizeof(T)); + } + T* data; size_t count; }; -template <typename T> +template<typename T> struct RWBuffer { - SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } - const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } + SLANG_FORCE_INLINE T& operator[](size_t index) const + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } + const T& Load(size_t index) const + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } void GetDimensions(uint32_t* outCount) { *outCount = uint32_t(count); } - + T* data; size_t count; }; -template <typename T> +template<typename T> struct Buffer { - SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } - const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } + SLANG_FORCE_INLINE const T& operator[](size_t index) const + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } + const T& Load(size_t index) const + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } void GetDimensions(uint32_t* outCount) { *outCount = uint32_t(count); } - + T* data; size_t count; }; @@ -81,28 +122,28 @@ struct Buffer struct ByteAddressBuffer { void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); } - uint32_t Load(size_t index) const - { + uint32_t Load(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); - return data[index >> 2]; + return data[index >> 2]; } - uint2 Load2(size_t index) const - { + uint2 Load2(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint2{data[dataIdx], data[dataIdx + 1]}; + const size_t dataIdx = index >> 2; + return uint2{data[dataIdx], data[dataIdx + 1]}; } - uint3 Load3(size_t index) const - { + uint3 Load3(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; + const size_t dataIdx = index >> 2; + return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; } - uint4 Load4(size_t index) const - { + uint4 Load4(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; + const size_t dataIdx = index >> 2; + return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; } template<typename T> T Load(size_t index) const @@ -110,40 +151,40 @@ struct ByteAddressBuffer SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes); return *(const T*)(((const char*)data) + index); } - + const uint32_t* data; - size_t sizeInBytes; //< Must be multiple of 4 + size_t sizeInBytes; //< Must be multiple of 4 }; // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-rwbyteaddressbuffer -// Missing support for Atomic operations +// Missing support for Atomic operations // Missing support for Load with status struct RWByteAddressBuffer { void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); } - - uint32_t Load(size_t index) const - { + + uint32_t Load(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); - return data[index >> 2]; + return data[index >> 2]; } - uint2 Load2(size_t index) const - { + uint2 Load2(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint2{data[dataIdx], data[dataIdx + 1]}; + const size_t dataIdx = index >> 2; + return uint2{data[dataIdx], data[dataIdx + 1]}; } - uint3 Load3(size_t index) const - { + uint3 Load3(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; + const size_t dataIdx = index >> 2; + return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; } - uint4 Load4(size_t index) const - { + uint4 Load4(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; + const size_t dataIdx = index >> 2; + return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; } template<typename T> T Load(size_t index) const @@ -152,30 +193,30 @@ struct RWByteAddressBuffer return *(const T*)(((const char*)data) + index); } - void Store(size_t index, uint32_t v) const - { + void Store(size_t index, uint32_t v) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); - data[index >> 2] = v; + data[index >> 2] = v; } - void Store2(size_t index, uint2 v) const - { + void Store2(size_t index, uint2 v) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); - const size_t dataIdx = index >> 2; + const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; } - void Store3(size_t index, uint3 v) const - { + void Store3(size_t index, uint3 v) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); - const size_t dataIdx = index >> 2; + const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; data[dataIdx + 2] = v.z; } - void Store4(size_t index, uint4 v) const - { + void Store4(size_t index, uint4 v) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); - const size_t dataIdx = index >> 2; + const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; data[dataIdx + 2] = v.z; @@ -189,7 +230,7 @@ struct RWByteAddressBuffer } uint32_t* data; - size_t sizeInBytes; //< Must be multiple of 4 + size_t sizeInBytes; //< Must be multiple of 4 }; struct ISamplerState; @@ -206,7 +247,7 @@ struct SamplerComparisonState }; #ifndef SLANG_RESOURCE_SHAPE -# define SLANG_RESOURCE_SHAPE +#define SLANG_RESOURCE_SHAPE typedef unsigned int SlangResourceShape; enum { @@ -243,7 +284,7 @@ enum }; #endif -// +// struct TextureDimensions { void reset() @@ -259,25 +300,25 @@ struct TextureDimensions int count = 0; switch (baseShape) { - case SLANG_TEXTURE_1D: + case SLANG_TEXTURE_1D: { outDims[count++] = width; break; } - case SLANG_TEXTURE_2D: + case SLANG_TEXTURE_2D: { outDims[count++] = width; outDims[count++] = height; break; } - case SLANG_TEXTURE_3D: + case SLANG_TEXTURE_3D: { outDims[count++] = width; outDims[count++] = height; outDims[count++] = depth; break; } - case SLANG_TEXTURE_CUBE: + case SLANG_TEXTURE_CUBE: { outDims[count++] = width; outDims[count++] = height; @@ -298,19 +339,19 @@ struct TextureDimensions int count = 0; switch (baseShape) { - case SLANG_TEXTURE_1D: + case SLANG_TEXTURE_1D: { outDims[count++] = width; break; } - case SLANG_TEXTURE_CUBE: - case SLANG_TEXTURE_2D: + case SLANG_TEXTURE_CUBE: + case SLANG_TEXTURE_2D: { outDims[count++] = width; outDims[count++] = height; break; } - case SLANG_TEXTURE_3D: + case SLANG_TEXTURE_3D: { outDims[count++] = width; outDims[count++] = height; @@ -345,97 +386,146 @@ struct TextureDimensions uint32_t shape; uint32_t width, height, depth; uint32_t numberOfLevels; - uint32_t arrayElementCount; ///< For array types, 0 otherwise + uint32_t arrayElementCount; ///< For array types, 0 otherwise }; - - - // Texture struct ITexture { virtual TextureDimensions GetDimensions(int mipLevel = -1) = 0; virtual void Load(const int32_t* v, void* outData, size_t dataSize) = 0; - virtual void Sample(SamplerState samplerState, const float* loc, void* outData, size_t dataSize) = 0; - virtual void SampleLevel(SamplerState samplerState, const float* loc, float level, void* outData, size_t dataSize) = 0; + virtual void Sample( + SamplerState samplerState, + const float* loc, + void* outData, + size_t dataSize) = 0; + virtual void SampleLevel( + SamplerState samplerState, + const float* loc, + float level, + void* outData, + size_t dataSize) = 0; }; -template <typename T> +template<typename T> struct Texture1D { void GetDimensions(uint32_t* outWidth) { *outWidth = texture->GetDimensions().width; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outNumberOfLevels) - { - auto dims = texture->GetDimensions(mipLevel); - *outWidth = dims.width; - *outNumberOfLevels = dims.numberOfLevels; + void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outNumberOfLevels) + { + auto dims = texture->GetDimensions(mipLevel); + *outWidth = dims.width; + *outNumberOfLevels = dims.numberOfLevels; } - + void GetDimensions(float* outWidth) { *outWidth = texture->GetDimensions().width; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outNumberOfLevels) - { - auto dims = texture->GetDimensions(mipLevel); - *outWidth = dims.width; - *outNumberOfLevels = dims.numberOfLevels; + void GetDimensions(uint32_t mipLevel, float* outWidth, float* outNumberOfLevels) + { + auto dims = texture->GetDimensions(mipLevel); + *outWidth = dims.width; + *outNumberOfLevels = dims.numberOfLevels; + } + + T Load(const int2& loc) const + { + T out; + texture->Load(&loc.x, &out, sizeof(out)); + return out; } - - T Load(const int2& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; } - T Sample(SamplerState samplerState, float loc) const { T out; texture->Sample(samplerState, &loc, &out, sizeof(out)); return out; } - T SampleLevel(SamplerState samplerState, float loc, float level) { T out; texture->SampleLevel(samplerState, &loc, level, &out, sizeof(out)); return out; } - - ITexture* texture; + T Sample(SamplerState samplerState, float loc) const + { + T out; + texture->Sample(samplerState, &loc, &out, sizeof(out)); + return out; + } + T SampleLevel(SamplerState samplerState, float loc, float level) + { + T out; + texture->SampleLevel(samplerState, &loc, level, &out, sizeof(out)); + return out; + } + + ITexture* texture; }; -template <typename T> +template<typename T> struct Texture2D { - void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) - { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) + { + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outHeight, + uint32_t* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outHeight = dims.height; *outNumberOfLevels = dims.numberOfLevels; } - void GetDimensions(float* outWidth, float* outHeight) - { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + void GetDimensions(float* outWidth, float* outHeight) + { + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outHeight, + float* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outHeight = dims.height; *outNumberOfLevels = dims.numberOfLevels; } - - T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; } - T Sample(SamplerState samplerState, const float2& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; } - T SampleLevel(SamplerState samplerState, const float2& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; } - - ITexture* texture; + + T Load(const int3& loc) const + { + T out; + texture->Load(&loc.x, &out, sizeof(out)); + return out; + } + T Sample(SamplerState samplerState, const float2& loc) const + { + T out; + texture->Sample(samplerState, &loc.x, &out, sizeof(out)); + return out; + } + T SampleLevel(SamplerState samplerState, const float2& loc, float level) + { + T out; + texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); + return out; + } + + ITexture* texture; }; -template <typename T> +template<typename T> struct Texture3D { void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outDepth) { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; *outDepth = dims.depth; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outDepth, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outHeight, + uint32_t* outDepth, + uint32_t* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -445,12 +535,17 @@ struct Texture3D } void GetDimensions(float* outWidth, float* outHeight, float* outDepth) { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; *outDepth = dims.depth; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outDepth, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outHeight, + float* outDepth, + float* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -458,78 +553,144 @@ struct Texture3D *outDepth = dims.depth; *outNumberOfLevels = dims.numberOfLevels; } - - T Load(const int4& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; } - T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; } - T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; } - - ITexture* texture; + + T Load(const int4& loc) const + { + T out; + texture->Load(&loc.x, &out, sizeof(out)); + return out; + } + T Sample(SamplerState samplerState, const float3& loc) const + { + T out; + texture->Sample(samplerState, &loc.x, &out, sizeof(out)); + return out; + } + T SampleLevel(SamplerState samplerState, const float3& loc, float level) + { + T out; + texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); + return out; + } + + ITexture* texture; }; -template <typename T> +template<typename T> struct TextureCube { - void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) - { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) + { + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outHeight, + uint32_t* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outHeight = dims.height; *outNumberOfLevels = dims.numberOfLevels; } - void GetDimensions(float* outWidth, float* outHeight) - { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + void GetDimensions(float* outWidth, float* outHeight) + { + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outHeight, + float* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outHeight = dims.height; *outNumberOfLevels = dims.numberOfLevels; } - - T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; } - T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; } - - ITexture* texture; + + T Sample(SamplerState samplerState, const float3& loc) const + { + T out; + texture->Sample(samplerState, &loc.x, &out, sizeof(out)); + return out; + } + T SampleLevel(SamplerState samplerState, const float3& loc, float level) + { + T out; + texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); + return out; + } + + ITexture* texture; }; -template <typename T> +template<typename T> struct Texture1DArray { - void GetDimensions(uint32_t* outWidth, uint32_t* outElements) { auto dims = texture->GetDimensions(); *outWidth = dims.width; *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outElements, uint32_t* outNumberOfLevels) + void GetDimensions(uint32_t* outWidth, uint32_t* outElements) { - auto dims = texture->GetDimensions(mipLevel); - *outWidth = dims.width; + auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outElements = dims.arrayElementCount; + } + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outElements, + uint32_t* outNumberOfLevels) + { + auto dims = texture->GetDimensions(mipLevel); + *outWidth = dims.width; *outNumberOfLevels = dims.numberOfLevels; - *outElements = dims.arrayElementCount; - } - void GetDimensions(float* outWidth, float* outElements) { auto dims = texture->GetDimensions(); *outWidth = dims.width; *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outElements, float* outNumberOfLevels) + *outElements = dims.arrayElementCount; + } + void GetDimensions(float* outWidth, float* outElements) { - auto dims = texture->GetDimensions(mipLevel); - *outWidth = dims.width; + auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outElements = dims.arrayElementCount; + } + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outElements, + float* outNumberOfLevels) + { + auto dims = texture->GetDimensions(mipLevel); + *outWidth = dims.width; *outNumberOfLevels = dims.numberOfLevels; - *outElements = dims.arrayElementCount; + *outElements = dims.arrayElementCount; + } + + T Load(const int3& loc) const + { + T out; + texture->Load(&loc.x, &out, sizeof(out)); + return out; + } + T Sample(SamplerState samplerState, const float2& loc) const + { + T out; + texture->Sample(samplerState, &loc.x, &out, sizeof(out)); + return out; + } + T SampleLevel(SamplerState samplerState, const float2& loc, float level) + { + T out; + texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); + return out; } - - T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; } - T Sample(SamplerState samplerState, const float2& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; } - T SampleLevel(SamplerState samplerState, const float2& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; } - - ITexture* texture; + + ITexture* texture; }; -template <typename T> +template<typename T> struct Texture2DArray { void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements) @@ -539,7 +700,12 @@ struct Texture2DArray *outHeight = dims.height; *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outHeight, + uint32_t* outElements, + uint32_t* outNumberOfLevels) { auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -547,7 +713,7 @@ struct Texture2DArray *outElements = dims.arrayElementCount; *outNumberOfLevels = dims.numberOfLevels; } - + void GetDimensions(uint32_t* outWidth, float* outHeight, float* outElements) { auto dims = texture->GetDimensions(); @@ -555,7 +721,12 @@ struct Texture2DArray *outHeight = dims.height; *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outElements, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outHeight, + float* outElements, + float* outNumberOfLevels) { auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -563,15 +734,30 @@ struct Texture2DArray *outElements = dims.arrayElementCount; *outNumberOfLevels = dims.numberOfLevels; } - - T Load(const int4& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; } - T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; } - T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; } - - ITexture* texture; + + T Load(const int4& loc) const + { + T out; + texture->Load(&loc.x, &out, sizeof(out)); + return out; + } + T Sample(SamplerState samplerState, const float3& loc) const + { + T out; + texture->Sample(samplerState, &loc.x, &out, sizeof(out)); + return out; + } + T SampleLevel(SamplerState samplerState, const float3& loc, float level) + { + T out; + texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); + return out; + } + + ITexture* texture; }; -template <typename T> +template<typename T> struct TextureCubeArray { void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements) @@ -581,7 +767,12 @@ struct TextureCubeArray *outHeight = dims.height; *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outHeight, + uint32_t* outElements, + uint32_t* outNumberOfLevels) { auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -589,7 +780,7 @@ struct TextureCubeArray *outElements = dims.arrayElementCount; *outNumberOfLevels = dims.numberOfLevels; } - + void GetDimensions(uint32_t* outWidth, float* outHeight, float* outElements) { auto dims = texture->GetDimensions(); @@ -597,7 +788,12 @@ struct TextureCubeArray *outHeight = dims.height; *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outElements, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outHeight, + float* outElements, + float* outNumberOfLevels) { auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -605,81 +801,124 @@ struct TextureCubeArray *outElements = dims.arrayElementCount; *outNumberOfLevels = dims.numberOfLevels; } - - T Sample(SamplerState samplerState, const float4& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; } - T SampleLevel(SamplerState samplerState, const float4& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; } - - ITexture* texture; + + T Sample(SamplerState samplerState, const float4& loc) const + { + T out; + texture->Sample(samplerState, &loc.x, &out, sizeof(out)); + return out; + } + T SampleLevel(SamplerState samplerState, const float4& loc, float level) + { + T out; + texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); + return out; + } + + ITexture* texture; }; /* !!!!!!!!!!!!!!!!!!!!!!!!!!! RWTexture !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ struct IRWTexture : ITexture { - /// Get the reference to the element at loc. + /// Get the reference to the element at loc. virtual void* refAt(const uint32_t* loc) = 0; }; -template <typename T> +template<typename T> struct RWTexture1D { void GetDimensions(uint32_t* outWidth) { *outWidth = texture->GetDimensions().width; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outNumberOfLevels) { auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outNumberOfLevels = dims.numberOfLevels; } - + void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outNumberOfLevels) + { + auto dims = texture->GetDimensions(mipLevel); + *outWidth = dims.width; + *outNumberOfLevels = dims.numberOfLevels; + } + void GetDimensions(float* outWidth) { *outWidth = texture->GetDimensions().width; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outNumberOfLevels) { auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outNumberOfLevels = dims.numberOfLevels; } - - T Load(int32_t loc) const { T out; texture->Load(&loc, &out, sizeof(out)); return out; } + void GetDimensions(uint32_t mipLevel, float* outWidth, float* outNumberOfLevels) + { + auto dims = texture->GetDimensions(mipLevel); + *outWidth = dims.width; + *outNumberOfLevels = dims.numberOfLevels; + } + + T Load(int32_t loc) const + { + T out; + texture->Load(&loc, &out, sizeof(out)); + return out; + } T& operator[](uint32_t loc) { return *(T*)texture->refAt(&loc); } - IRWTexture* texture; + IRWTexture* texture; }; -template <typename T> +template<typename T> struct RWTexture2D { - void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) - { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) + { + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outHeight, + uint32_t* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outHeight = dims.height; *outNumberOfLevels = dims.numberOfLevels; } - void GetDimensions(float* outWidth, float* outHeight) - { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + void GetDimensions(float* outWidth, float* outHeight) + { + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outHeight, + float* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outHeight = dims.height; *outNumberOfLevels = dims.numberOfLevels; } - - T Load(const int2& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; } + + T Load(const int2& loc) const + { + T out; + texture->Load(&loc.x, &out, sizeof(out)); + return out; + } T& operator[](const uint2& loc) { return *(T*)texture->refAt(&loc.x); } IRWTexture* texture; }; -template <typename T> +template<typename T> struct RWTexture3D { void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outDepth) { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; *outDepth = dims.depth; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outDepth, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outHeight, + uint32_t* outDepth, + uint32_t* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -689,12 +928,17 @@ struct RWTexture3D } void GetDimensions(float* outWidth, float* outHeight, float* outDepth) { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; *outDepth = dims.depth; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outDepth, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outHeight, + float* outDepth, + float* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -702,60 +946,83 @@ struct RWTexture3D *outDepth = dims.depth; *outNumberOfLevels = dims.numberOfLevels; } - - T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; } + + T Load(const int3& loc) const + { + T out; + texture->Load(&loc.x, &out, sizeof(out)); + return out; + } T& operator[](const uint3& loc) { return *(T*)texture->refAt(&loc.x); } IRWTexture* texture; }; -template <typename T> +template<typename T> struct RWTexture1DArray { - void GetDimensions(uint32_t* outWidth, uint32_t* outElements) - { - auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outElements = dims.arrayElementCount; + void GetDimensions(uint32_t* outWidth, uint32_t* outElements) + { + auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outElements, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outElements, + uint32_t* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outElements = dims.arrayElementCount; *outNumberOfLevels = dims.numberOfLevels; } - void GetDimensions(float* outWidth, float* outElements) - { - auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outElements = dims.arrayElementCount; + void GetDimensions(float* outWidth, float* outElements) + { + auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outElements, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outElements, + float* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outElements = dims.arrayElementCount; *outNumberOfLevels = dims.numberOfLevels; } - - T Load(int2 loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; } + + T Load(int2 loc) const + { + T out; + texture->Load(&loc.x, &out, sizeof(out)); + return out; + } T& operator[](uint2 loc) { return *(T*)texture->refAt(&loc.x); } IRWTexture* texture; }; -template <typename T> +template<typename T> struct RWTexture2DArray { void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements) { - auto dims = texture->GetDimensions(); - *outWidth = dims.width; + auto dims = texture->GetDimensions(); + *outWidth = dims.width; *outHeight = dims.height; - *outElements = dims.arrayElementCount; + *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outHeight, + uint32_t* outElements, + uint32_t* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -765,12 +1032,17 @@ struct RWTexture2DArray } void GetDimensions(float* outWidth, float* outHeight, float* outElements) { - auto dims = texture->GetDimensions(); - *outWidth = dims.width; + auto dims = texture->GetDimensions(); + *outWidth = dims.width; *outHeight = dims.height; - *outElements = dims.arrayElementCount; + *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outElements, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outHeight, + float* outElements, + float* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -778,8 +1050,13 @@ struct RWTexture2DArray *outElements = dims.arrayElementCount; *outNumberOfLevels = dims.numberOfLevels; } - - T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; } + + T Load(const int3& loc) const + { + T out; + texture->Load(&loc.x, &out, sizeof(out)); + return out; + } T& operator[](const uint3& loc) { return *(T*)texture->refAt(&loc.x); } IRWTexture* texture; @@ -787,91 +1064,167 @@ struct RWTexture2DArray // FeedbackTexture -struct FeedbackType {}; -struct SAMPLER_FEEDBACK_MIN_MIP : FeedbackType {}; -struct SAMPLER_FEEDBACK_MIP_REGION_USED : FeedbackType {}; +struct FeedbackType +{ +}; +struct SAMPLER_FEEDBACK_MIN_MIP : FeedbackType +{ +}; +struct SAMPLER_FEEDBACK_MIP_REGION_USED : FeedbackType +{ +}; struct IFeedbackTexture { virtual TextureDimensions GetDimensions(int mipLevel = -1) = 0; - // Note here we pass the optional clamp parameter as a pointer. Passing nullptr means no clamp. - // This was preferred over having two function definitions, and having to differentiate their names - virtual void WriteSamplerFeedback(ITexture* tex, SamplerState samp, const float* location, const float* clamp = nullptr) = 0; - virtual void WriteSamplerFeedbackBias(ITexture* tex, SamplerState samp, const float* location, float bias, const float* clamp = nullptr) = 0; - virtual void WriteSamplerFeedbackGrad(ITexture* tex, SamplerState samp, const float* location, const float* ddx, const float* ddy, const float* clamp = nullptr) = 0; - - virtual void WriteSamplerFeedbackLevel(ITexture* tex, SamplerState samp, const float* location, float lod) = 0; + // Note here we pass the optional clamp parameter as a pointer. Passing nullptr means no clamp. + // This was preferred over having two function definitions, and having to differentiate their + // names + virtual void WriteSamplerFeedback( + ITexture* tex, + SamplerState samp, + const float* location, + const float* clamp = nullptr) = 0; + virtual void WriteSamplerFeedbackBias( + ITexture* tex, + SamplerState samp, + const float* location, + float bias, + const float* clamp = nullptr) = 0; + virtual void WriteSamplerFeedbackGrad( + ITexture* tex, + SamplerState samp, + const float* location, + const float* ddx, + const float* ddy, + const float* clamp = nullptr) = 0; + + virtual void WriteSamplerFeedbackLevel( + ITexture* tex, + SamplerState samp, + const float* location, + float lod) = 0; }; -template <typename T> +template<typename T> struct FeedbackTexture2D { - void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) - { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) + { + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outHeight, + uint32_t* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outHeight = dims.height; *outNumberOfLevels = dims.numberOfLevels; } - void GetDimensions(float* outWidth, float* outHeight) - { - const auto dims = texture->GetDimensions(); - *outWidth = dims.width; - *outHeight = dims.height; + void GetDimensions(float* outWidth, float* outHeight) + { + const auto dims = texture->GetDimensions(); + *outWidth = dims.width; + *outHeight = dims.height; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outHeight, + float* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outHeight = dims.height; *outNumberOfLevels = dims.numberOfLevels; } - - template <typename S> - void WriteSamplerFeedback(Texture2D<S> tex, SamplerState samp, float2 location, float clamp) { texture->WriteSamplerFeedback(tex.texture, samp, &location.x, &clamp); } - template <typename S> - void WriteSamplerFeedbackBias(Texture2D<S> tex, SamplerState samp, float2 location, float bias, float clamp) { texture->WriteSamplerFeedbackBias(tex.texture, samp, &location.x, bias, &clamp); } + template<typename S> + void WriteSamplerFeedback(Texture2D<S> tex, SamplerState samp, float2 location, float clamp) + { + texture->WriteSamplerFeedback(tex.texture, samp, &location.x, &clamp); + } - template <typename S> - void WriteSamplerFeedbackGrad(Texture2D<S> tex, SamplerState samp, float2 location, float2 ddx, float2 ddy, float clamp) { texture->WriteSamplerFeedbackGrad(tex.texture, samp, &location.x, &ddx.x, &ddy.x, &clamp); } + template<typename S> + void WriteSamplerFeedbackBias( + Texture2D<S> tex, + SamplerState samp, + float2 location, + float bias, + float clamp) + { + texture->WriteSamplerFeedbackBias(tex.texture, samp, &location.x, bias, &clamp); + } + + template<typename S> + void WriteSamplerFeedbackGrad( + Texture2D<S> tex, + SamplerState samp, + float2 location, + float2 ddx, + float2 ddy, + float clamp) + { + texture->WriteSamplerFeedbackGrad(tex.texture, samp, &location.x, &ddx.x, &ddy.x, &clamp); + } // Level - template <typename S> - void WriteSamplerFeedbackLevel(Texture2D<S> tex, SamplerState samp, float2 location, float lod) { texture->WriteSamplerFeedbackLevel(tex.texture, samp, &location.x, lod); } - + template<typename S> + void WriteSamplerFeedbackLevel(Texture2D<S> tex, SamplerState samp, float2 location, float lod) + { + texture->WriteSamplerFeedbackLevel(tex.texture, samp, &location.x, lod); + } + // Without Clamp - template <typename S> - void WriteSamplerFeedback(Texture2D<S> tex, SamplerState samp, float2 location) { texture->WriteSamplerFeedback(tex.texture, samp, &location.x); } + template<typename S> + void WriteSamplerFeedback(Texture2D<S> tex, SamplerState samp, float2 location) + { + texture->WriteSamplerFeedback(tex.texture, samp, &location.x); + } + + template<typename S> + void WriteSamplerFeedbackBias(Texture2D<S> tex, SamplerState samp, float2 location, float bias) + { + texture->WriteSamplerFeedbackBias(tex.texture, samp, &location.x, bias); + } - template <typename S> - void WriteSamplerFeedbackBias(Texture2D<S> tex, SamplerState samp, float2 location, float bias) { texture->WriteSamplerFeedbackBias(tex.texture, samp, &location.x, bias); } + template<typename S> + void WriteSamplerFeedbackGrad( + Texture2D<S> tex, + SamplerState samp, + float2 location, + float2 ddx, + float2 ddy) + { + texture->WriteSamplerFeedbackGrad(tex.texture, samp, &location.x, &ddx.x, &ddy.x); + } - template <typename S> - void WriteSamplerFeedbackGrad(Texture2D<S> tex, SamplerState samp, float2 location, float2 ddx, float2 ddy) { texture->WriteSamplerFeedbackGrad(tex.texture, samp, &location.x, &ddx.x, &ddy.x); } - IFeedbackTexture* texture; }; -template <typename T> +template<typename T> struct FeedbackTexture2DArray { void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements) { - auto dims = texture->GetDimensions(); - *outWidth = dims.width; + auto dims = texture->GetDimensions(); + *outWidth = dims.width; *outHeight = dims.height; - *outElements = dims.arrayElementCount; + *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements, uint32_t* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + uint32_t* outWidth, + uint32_t* outHeight, + uint32_t* outElements, + uint32_t* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -881,12 +1234,17 @@ struct FeedbackTexture2DArray } void GetDimensions(float* outWidth, float* outHeight, float* outElements) { - auto dims = texture->GetDimensions(); - *outWidth = dims.width; + auto dims = texture->GetDimensions(); + *outWidth = dims.width; *outHeight = dims.height; - *outElements = dims.arrayElementCount; + *outElements = dims.arrayElementCount; } - void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outElements, float* outNumberOfLevels) + void GetDimensions( + uint32_t mipLevel, + float* outWidth, + float* outHeight, + float* outElements, + float* outNumberOfLevels) { const auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; @@ -894,31 +1252,81 @@ struct FeedbackTexture2DArray *outElements = dims.arrayElementCount; *outNumberOfLevels = dims.numberOfLevels; } - - template <typename S> - void WriteSamplerFeedback(Texture2DArray<S> texArray, SamplerState samp, float3 location, float clamp) { texture->WriteSamplerFeedback(texArray.texture, samp, &location.x, &clamp); } - template <typename S> - void WriteSamplerFeedbackBias(Texture2DArray<S> texArray, SamplerState samp, float3 location, float bias, float clamp) { texture->WriteSamplerFeedbackBias(texArray.texture, samp, &location.x, bias, &clamp); } + template<typename S> + void WriteSamplerFeedback( + Texture2DArray<S> texArray, + SamplerState samp, + float3 location, + float clamp) + { + texture->WriteSamplerFeedback(texArray.texture, samp, &location.x, &clamp); + } + + template<typename S> + void WriteSamplerFeedbackBias( + Texture2DArray<S> texArray, + SamplerState samp, + float3 location, + float bias, + float clamp) + { + texture->WriteSamplerFeedbackBias(texArray.texture, samp, &location.x, bias, &clamp); + } - template <typename S> - void WriteSamplerFeedbackGrad(Texture2DArray<S> texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy, float clamp) { texture->WriteSamplerFeedbackGrad(texArray.texture, samp, &location.x, &ddx.x, &ddy.x, &clamp); } + template<typename S> + void WriteSamplerFeedbackGrad( + Texture2DArray<S> texArray, + SamplerState samp, + float3 location, + float3 ddx, + float3 ddy, + float clamp) + { + texture + ->WriteSamplerFeedbackGrad(texArray.texture, samp, &location.x, &ddx.x, &ddy.x, &clamp); + } // Level - template <typename S> - void WriteSamplerFeedbackLevel(Texture2DArray<S> texArray, SamplerState samp, float3 location, float lod) { texture->WriteSamplerFeedbackLevel(texArray.texture, samp, &location.x, lod); } + template<typename S> + void WriteSamplerFeedbackLevel( + Texture2DArray<S> texArray, + SamplerState samp, + float3 location, + float lod) + { + texture->WriteSamplerFeedbackLevel(texArray.texture, samp, &location.x, lod); + } // Without Clamp - template <typename S> - void WriteSamplerFeedback(Texture2DArray<S> texArray, SamplerState samp, float3 location) { texture->WriteSamplerFeedback(texArray.texture, samp, &location.x); } + template<typename S> + void WriteSamplerFeedback(Texture2DArray<S> texArray, SamplerState samp, float3 location) + { + texture->WriteSamplerFeedback(texArray.texture, samp, &location.x); + } - template <typename S> - void WriteSamplerFeedbackBias(Texture2DArray<S> texArray, SamplerState samp, float3 location, float bias) { texture->WriteSamplerFeedbackBias(texArray.texture, samp, &location.x, bias); } + template<typename S> + void WriteSamplerFeedbackBias( + Texture2DArray<S> texArray, + SamplerState samp, + float3 location, + float bias) + { + texture->WriteSamplerFeedbackBias(texArray.texture, samp, &location.x, bias); + } + + template<typename S> + void WriteSamplerFeedbackGrad( + Texture2DArray<S> texArray, + SamplerState samp, + float3 location, + float3 ddx, + float3 ddy) + { + texture->WriteSamplerFeedbackGrad(texArray.texture, samp, &location.x, &ddx.x, &ddy.x); + } - template <typename S> - void WriteSamplerFeedbackGrad(Texture2DArray<S> texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy) { texture->WriteSamplerFeedbackGrad(texArray.texture, samp, &location.x, &ddx.x, &ddy.x); } - IFeedbackTexture* texture; }; @@ -933,20 +1341,24 @@ struct ComputeThreadVaryingInput struct ComputeVaryingInput { - uint3 startGroupID; ///< start groupID - uint3 endGroupID; ///< Non inclusive end groupID + uint3 startGroupID; ///< start groupID + uint3 endGroupID; ///< Non inclusive end groupID }; -// The uniformEntryPointParams and uniformState must be set to structures that match layout that the kernel expects. -// This can be determined via reflection for example. +// The uniformEntryPointParams and uniformState must be set to structures that match layout that the +// kernel expects. This can be determined via reflection for example. -typedef void(*ComputeThreadFunc)(ComputeThreadVaryingInput* varyingInput, void* uniformEntryPointParams, void* uniformState); -typedef void(*ComputeFunc)(ComputeVaryingInput* varyingInput, void* uniformEntryPointParams, void* uniformState); +typedef void (*ComputeThreadFunc)( + ComputeThreadVaryingInput* varyingInput, + void* uniformEntryPointParams, + void* uniformState); +typedef void (*ComputeFunc)( + ComputeVaryingInput* varyingInput, + void* uniformEntryPointParams, + void* uniformState); #ifdef SLANG_PRELUDE_NAMESPACE } #endif #endif - - diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index e0335f08a..9ac903955 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -15,51 +15,53 @@ #endif -// Define SLANG_CUDA_ENABLE_HALF to use the cuda_fp16 include to add half support. +// Define SLANG_CUDA_ENABLE_HALF to use the cuda_fp16 include to add half support. // For this to work NVRTC needs to have the path to the CUDA SDK. // -// As it stands the includes paths defined for Slang are passed down to NVRTC. Similarly defines defined for the Slang compile -// are passed down. +// As it stands the includes paths defined for Slang are passed down to NVRTC. Similarly defines +// defined for the Slang compile are passed down. #ifdef SLANG_CUDA_ENABLE_HALF -// We don't want half2 operators, because it will implement comparison operators that return a bool(!). We want to generate -// those functions. Doing so means that we will have to define all the other half2 operators. -# define __CUDA_NO_HALF2_OPERATORS__ -# include <cuda_fp16.h> +// We don't want half2 operators, because it will implement comparison operators that return a +// bool(!). We want to generate those functions. Doing so means that we will have to define all +// the other half2 operators. +#define __CUDA_NO_HALF2_OPERATORS__ +#include <cuda_fp16.h> #endif #ifdef SLANG_CUDA_ENABLE_OPTIX #include <optix.h> #endif -// Define slang offsetof implementation +// Define slang offsetof implementation #ifndef SLANG_OFFSET_OF -# define SLANG_OFFSET_OF(type, member) (size_t)((char*)&(((type *)0)->member) - (char*)0) +#define SLANG_OFFSET_OF(type, member) (size_t)((char*)&(((type*)0)->member) - (char*)0) #endif #ifndef SLANG_ALIGN_OF -# define SLANG_ALIGN_OF(type) __alignof__(type) +#define SLANG_ALIGN_OF(type) __alignof__(type) #endif // Must be large enough to cause overflow and therefore infinity #ifndef SLANG_INFINITY -# define SLANG_INFINITY ((float)(1e+300 * 1e+300)) +#define SLANG_INFINITY ((float)(1e+300 * 1e+300)) #endif // For now we'll disable any asserts in this prelude -#define SLANG_PRELUDE_ASSERT(x) +#define SLANG_PRELUDE_ASSERT(x) -#ifndef SLANG_CUDA_WARP_SIZE -# define SLANG_CUDA_WARP_SIZE 32 +#ifndef SLANG_CUDA_WARP_SIZE +#define SLANG_CUDA_WARP_SIZE 32 #endif -#define SLANG_CUDA_WARP_MASK (SLANG_CUDA_WARP_SIZE - 1) // Used for masking threadIdx.x to the warp lane index +#define SLANG_CUDA_WARP_MASK \ + (SLANG_CUDA_WARP_SIZE - 1) // Used for masking threadIdx.x to the warp lane index #define SLANG_CUDA_WARP_BITMASK (~int(0)) // #define SLANG_FORCE_INLINE inline -#define SLANG_CUDA_CALL __device__ +#define SLANG_CUDA_CALL __device__ #define SLANG_FORCE_INLINE inline #define SLANG_INLINE inline @@ -71,54 +73,63 @@ // Asserts for bounds checking. // It is assumed index/count are unsigned types. -#define SLANG_BOUND_ASSERT(index, count) SLANG_PRELUDE_ASSERT(index < count); -#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0); +#define SLANG_BOUND_ASSERT(index, count) SLANG_PRELUDE_ASSERT(index < count); +#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) \ + SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0); // Macros to zero index if an access is out of range -#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; -#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) index = (index <= (sizeInBytes - elemSize)) ? index : 0; - -// The 'FIX' macro define how the index is fixed. The default is to do nothing. If SLANG_ENABLE_BOUND_ZERO_INDEX -// the fix macro will zero the index, if out of range -#ifdef SLANG_ENABLE_BOUND_ZERO_INDEX -# define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count) -# define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) -# define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count) SLANG_BOUND_ZERO_INDEX(index, count) +#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; +#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) \ + index = (index <= (sizeInBytes - elemSize)) ? index : 0; + +// The 'FIX' macro define how the index is fixed. The default is to do nothing. If +// SLANG_ENABLE_BOUND_ZERO_INDEX the fix macro will zero the index, if out of range +#ifdef SLANG_ENABLE_BOUND_ZERO_INDEX +#define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count) +#define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) \ + SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +#define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) \ + SLANG_BOUND_ZERO_INDEX(index, count) SLANG_BOUND_ZERO_INDEX(index, count) #else -# define SLANG_BOUND_FIX(index, count) -# define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) -# define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) +#define SLANG_BOUND_FIX(index, count) +#define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +#define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) #endif #ifndef SLANG_BOUND_CHECK -# define SLANG_BOUND_CHECK(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count) +#define SLANG_BOUND_CHECK(index, count) \ + SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count) #endif #ifndef SLANG_BOUND_CHECK_BYTE_ADDRESS -# define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +#define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) \ + SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) \ + SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) #endif #ifndef SLANG_BOUND_CHECK_FIXED_ARRAY -# define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count) +#define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) \ + SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count) #endif - // This macro handles how out-of-range surface coordinates are handled; - // I can equal - // cudaBoundaryModeClamp, in which case out-of-range coordinates are clamped to the valid range - // cudaBoundaryModeZero, in which case out-of-range reads return zero and out-of-range writes are ignored - // cudaBoundaryModeTrap, in which case out-of-range accesses cause the kernel execution to fail. - +// This macro handles how out-of-range surface coordinates are handled; +// I can equal +// cudaBoundaryModeClamp, in which case out-of-range coordinates are clamped to the valid range +// cudaBoundaryModeZero, in which case out-of-range reads return zero and out-of-range writes are +// ignored cudaBoundaryModeTrap, in which case out-of-range accesses cause the kernel execution to +// fail. + #ifndef SLANG_CUDA_BOUNDARY_MODE -# define SLANG_CUDA_BOUNDARY_MODE cudaBoundaryModeZero +#define SLANG_CUDA_BOUNDARY_MODE cudaBoundaryModeZero // Can be one of SLANG_CUDA_PTX_BOUNDARY_MODE. Only applies *PTX* emitted CUDA operations // which currently is just RWTextureRW format writes -// +// // .trap causes an execution trap on out-of-bounds addresses // .clamp stores data at the nearest surface location (sized appropriately) -// .zero drops stores to out-of-bounds addresses +// .zero drops stores to out-of-bounds addresses -# define SLANG_PTX_BOUNDARY_MODE "zero" +#define SLANG_PTX_BOUNDARY_MODE "zero" #endif struct TypeInfo @@ -126,51 +137,67 @@ struct TypeInfo size_t typeSize; }; -template <typename T, size_t SIZE> +template<typename T, size_t SIZE> struct FixedArray { - SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; } - SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; } - + SLANG_CUDA_CALL const T& operator[](size_t index) const + { + SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); + return m_data[index]; + } + SLANG_CUDA_CALL T& operator[](size_t index) + { + SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); + return m_data[index]; + } + T m_data[SIZE]; }; -// An array that has no specified size, becomes a 'Array'. This stores the size so it can potentially -// do bounds checking. -template <typename T> +// An array that has no specified size, becomes a 'Array'. This stores the size so it can +// potentially do bounds checking. +template<typename T> struct Array { - SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } - SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_BOUND_CHECK(index, count); return data[index]; } - + SLANG_CUDA_CALL const T& operator[](size_t index) const + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } + SLANG_CUDA_CALL T& operator[](size_t index) + { + SLANG_BOUND_CHECK(index, count); + return data[index]; + } + T* data; size_t count; }; // Typically defined in cuda.h, but we can't ship/rely on that, so just define here -typedef unsigned long long CUtexObject; -typedef unsigned long long CUsurfObject; +typedef unsigned long long CUtexObject; +typedef unsigned long long CUsurfObject; -// On CUDA sampler state is actually bound up with the texture object. We have a SamplerState type, -// backed as a pointer, to simplify code generation, with the downside that such a binding will take up -// uniform space, even though it will have no effect. +// On CUDA sampler state is actually bound up with the texture object. We have a SamplerState type, +// backed as a pointer, to simplify code generation, with the downside that such a binding will take +// up uniform space, even though it will have no effect. // TODO(JS): Consider ways to strip use of variables of this type so have no binding, struct SamplerStateUnused; typedef SamplerStateUnused* SamplerState; // TODO(JS): Not clear yet if this can be handled on CUDA, by just ignoring. -// For now, just map to the index type. +// For now, just map to the index type. typedef size_t NonUniformResourceIndex; // Code generator will generate the specific type -template <typename T, int ROWS, int COLS> +template<typename T, int ROWS, int COLS> struct Matrix; typedef int1 bool1; typedef int2 bool2; typedef int3 bool3; -typedef int4 bool4; +typedef int4 bool4; #if SLANG_CUDA_RTC @@ -193,7 +220,7 @@ typedef unsigned char uchar; typedef unsigned short ushort; typedef unsigned int uint; -union Union32 +union Union32 { uint32_t u; int32_t i; @@ -225,16 +252,37 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL double _slang_fmod(double x, double y) #if SLANG_CUDA_ENABLE_HALF // Add the other vector half types -struct __half1 { __half x; }; -struct __align__(4) __half3 { __half x, y, z; }; -struct __align__(4) __half4 { __half x, y, z, w; }; +struct __half1 +{ + __half x; +}; +struct __align__(4) __half3 +{ + __half x, y, z; +}; +struct __align__(4) __half4 +{ + __half x, y, z, w; +}; #endif -#define SLANG_VECTOR_GET_ELEMENT(T) \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##1 x, int index) { return ((T*)(&x))[index]; }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##2 x, int index) { return ((T*)(&x))[index]; }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##3 x, int index) { return ((T*)(&x))[index]; }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##4 x, int index) { return ((T*)(&x))[index]; } +#define SLANG_VECTOR_GET_ELEMENT(T) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##1 x, int index) \ + { \ + return ((T*)(&x))[index]; \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##2 x, int index) \ + { \ + return ((T*)(&x))[index]; \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##3 x, int index) \ + { \ + return ((T*)(&x))[index]; \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##4 x, int index) \ + { \ + return ((T*)(&x))[index]; \ + } SLANG_VECTOR_GET_ELEMENT(int) SLANG_VECTOR_GET_ELEMENT(uint) SLANG_VECTOR_GET_ELEMENT(short) @@ -246,11 +294,23 @@ SLANG_VECTOR_GET_ELEMENT(ulonglong) SLANG_VECTOR_GET_ELEMENT(float) SLANG_VECTOR_GET_ELEMENT(double) -#define SLANG_VECTOR_GET_ELEMENT_PTR(T) \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##1* x, int index) { return ((T*)(x)) + index; }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##2* x, int index) { return ((T*)(x)) + index; }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##3* x, int index) { return ((T*)(x)) + index; }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##4* x, int index) { return ((T*)(x)) + index; } +#define SLANG_VECTOR_GET_ELEMENT_PTR(T) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##1 * x, int index) \ + { \ + return ((T*)(x)) + index; \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##2 * x, int index) \ + { \ + return ((T*)(x)) + index; \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##3 * x, int index) \ + { \ + return ((T*)(x)) + index; \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##4 * x, int index) \ + { \ + return ((T*)(x)) + index; \ + } SLANG_VECTOR_GET_ELEMENT_PTR(int) SLANG_VECTOR_GET_ELEMENT_PTR(uint) SLANG_VECTOR_GET_ELEMENT_PTR(short) @@ -267,57 +327,60 @@ SLANG_VECTOR_GET_ELEMENT(__half) SLANG_VECTOR_GET_ELEMENT_PTR(__half) #endif -#define SLANG_CUDA_VECTOR_BINARY_OP(T, n, op) \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal, T##n other) \ - { \ - T##n result;\ - for (int i = 0; i < n; i++) \ - *_slang_vector_get_element_ptr(&result, i) = _slang_vector_get_element(thisVal,i) op _slang_vector_get_element(other,i); \ - return result;\ +#define SLANG_CUDA_VECTOR_BINARY_OP(T, n, op) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal, T##n other) \ + { \ + T##n result; \ + for (int i = 0; i < n; i++) \ + *_slang_vector_get_element_ptr(&result, i) = \ + _slang_vector_get_element(thisVal, i) op _slang_vector_get_element(other, i); \ + return result; \ } -#define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op) \ +#define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op) \ SLANG_FORCE_INLINE SLANG_CUDA_CALL bool##n operator op(T##n thisVal, T##n other) \ - { \ - bool##n result;\ - for (int i = 0; i < n; i++) \ - *_slang_vector_get_element_ptr(&result, i) = (int)(_slang_vector_get_element(thisVal,i) op _slang_vector_get_element(other,i)); \ - return result;\ - } -#define SLANG_CUDA_VECTOR_UNARY_OP(T, n, op) \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal) \ - { \ - T##n result;\ - for (int i = 0; i < n; i++) \ - *_slang_vector_get_element_ptr(&result, i) = op _slang_vector_get_element(thisVal,i); \ - return result;\ - } - -#define SLANG_CUDA_VECTOR_INT_OP(T, n) \ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, %)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, ^)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, &)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, |)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, >>)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, <<)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=)\ - SLANG_CUDA_VECTOR_UNARY_OP(T, n, !)\ - SLANG_CUDA_VECTOR_UNARY_OP(T, n, -)\ + { \ + bool##n result; \ + for (int i = 0; i < n; i++) \ + *_slang_vector_get_element_ptr(&result, i) = \ + (int)(_slang_vector_get_element(thisVal, i) \ + op _slang_vector_get_element(other, i)); \ + return result; \ + } +#define SLANG_CUDA_VECTOR_UNARY_OP(T, n, op) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal) \ + { \ + T##n result; \ + for (int i = 0; i < n; i++) \ + *_slang_vector_get_element_ptr(&result, i) = op _slang_vector_get_element(thisVal, i); \ + return result; \ + } + +#define SLANG_CUDA_VECTOR_INT_OP(T, n) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, +) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, -) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, *) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, /) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, %) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, ^) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, &) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, |) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, >>) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, <<) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=) \ + SLANG_CUDA_VECTOR_UNARY_OP(T, n, !) \ + SLANG_CUDA_VECTOR_UNARY_OP(T, n, -) \ SLANG_CUDA_VECTOR_UNARY_OP(T, n, ~) #define SLANG_CUDA_VECTOR_INT_OPS(T) \ - SLANG_CUDA_VECTOR_INT_OP(T, 2) \ - SLANG_CUDA_VECTOR_INT_OP(T, 3) \ + SLANG_CUDA_VECTOR_INT_OP(T, 2) \ + SLANG_CUDA_VECTOR_INT_OP(T, 3) \ SLANG_CUDA_VECTOR_INT_OP(T, 4) SLANG_CUDA_VECTOR_INT_OPS(int) @@ -329,23 +392,23 @@ SLANG_CUDA_VECTOR_INT_OPS(uchar) SLANG_CUDA_VECTOR_INT_OPS(longlong) SLANG_CUDA_VECTOR_INT_OPS(ulonglong) -#define SLANG_CUDA_VECTOR_FLOAT_OP(T, n) \ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)\ - SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==)\ - SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=)\ +#define SLANG_CUDA_VECTOR_FLOAT_OP(T, n) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, +) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, -) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, *) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, /) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&) \ + SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==) \ + SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=) \ SLANG_CUDA_VECTOR_UNARY_OP(T, n, -) #define SLANG_CUDA_VECTOR_FLOAT_OPS(T) \ - SLANG_CUDA_VECTOR_FLOAT_OP(T, 2) \ - SLANG_CUDA_VECTOR_FLOAT_OP(T, 3) \ + SLANG_CUDA_VECTOR_FLOAT_OP(T, 2) \ + SLANG_CUDA_VECTOR_FLOAT_OP(T, 3) \ SLANG_CUDA_VECTOR_FLOAT_OP(T, 4) SLANG_CUDA_VECTOR_FLOAT_OPS(float) @@ -353,27 +416,38 @@ SLANG_CUDA_VECTOR_FLOAT_OPS(double) #if SLANG_CUDA_ENABLE_HALF SLANG_CUDA_VECTOR_FLOAT_OPS(__half) #endif -#define SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, n)\ +#define SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, n) \ SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator%(const T##n& left, const T##n& right) \ - {\ - T##n result;\ - for (int i = 0; i < n; i++) \ - *_slang_vector_get_element_ptr(&result, i) = _slang_fmod(_slang_vector_get_element(left,i), _slang_vector_get_element(right,i)); \ - return result;\ - } -#define SLANG_CUDA_FLOAT_VECTOR_MOD(T) \ - SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 2)\ - SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 3)\ + { \ + T##n result; \ + for (int i = 0; i < n; i++) \ + *_slang_vector_get_element_ptr(&result, i) = _slang_fmod( \ + _slang_vector_get_element(left, i), \ + _slang_vector_get_element(right, i)); \ + return result; \ + } +#define SLANG_CUDA_FLOAT_VECTOR_MOD(T) \ + SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 2) \ + SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 3) \ SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 4) SLANG_CUDA_FLOAT_VECTOR_MOD(float) SLANG_CUDA_FLOAT_VECTOR_MOD(double) #if SLANG_CUDA_RTC || SLANG_CUDA_ENABLE_HALF -#define SLANG_MAKE_VECTOR(T) \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x, T y) { return T##2{x, y}; }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x, T y, T z) { return T##3{ x, y, z }; }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x, T y, T z, T w) { return T##4{ x, y, z, w }; } +#define SLANG_MAKE_VECTOR(T) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x, T y) \ + { \ + return T##2 {x, y}; \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x, T y, T z) \ + { \ + return T##3 {x, y, z}; \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x, T y, T z, T w) \ + { \ + return T##4 {x, y, z, w}; \ + } #endif #if SLANG_CUDA_RTC @@ -393,25 +467,67 @@ SLANG_MAKE_VECTOR(ulonglong) SLANG_MAKE_VECTOR(__half) #endif -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool1 make_bool1(bool x) { return bool1{ x }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 make_bool2(bool x, bool y) { return bool2{ x, y }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool3 make_bool3(bool x, bool y, bool z) { return bool3{ x, y, z }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 make_bool4(bool x, bool y, bool z, bool w) { return bool4{ x, y, z, w }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 make_bool2(bool x) { return bool2{ x, x }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool3 make_bool3(bool x) { return bool3{ x, x, x }; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 make_bool4(bool x) { return bool4{ x, x, x, x }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool1 make_bool1(bool x) +{ + return bool1{x}; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 make_bool2(bool x, bool y) +{ + return bool2{x, y}; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool3 make_bool3(bool x, bool y, bool z) +{ + return bool3{x, y, z}; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 make_bool4(bool x, bool y, bool z, bool w) +{ + return bool4{x, y, z, w}; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 make_bool2(bool x) +{ + return bool2{x, x}; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool3 make_bool3(bool x) +{ + return bool3{x, x, x}; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 make_bool4(bool x) +{ + return bool4{x, x, x, x}; +} #if SLANG_CUDA_RTC -#define SLANG_MAKE_VECTOR_FROM_SCALAR(T) \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##1 make_##T##1(T x) { return T##1{x}; }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) { return make_##T##2(x, x); }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) { return make_##T##3(x, x, x); }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) { return make_##T##4(x, x, x, x); } +#define SLANG_MAKE_VECTOR_FROM_SCALAR(T) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##1 make_##T##1(T x) \ + { \ + return T##1 {x}; \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) \ + { \ + return make_##T##2(x, x); \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) \ + { \ + return make_##T##3(x, x, x); \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) \ + { \ + return make_##T##4(x, x, x, x); \ + } #else -#define SLANG_MAKE_VECTOR_FROM_SCALAR(T) \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) { return make_##T##2(x, x); }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) { return make_##T##3(x, x, x); }\ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) { return make_##T##4(x, x, x, x); } +#define SLANG_MAKE_VECTOR_FROM_SCALAR(T) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) \ + { \ + return make_##T##2(x, x); \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) \ + { \ + return make_##T##3(x, x, x); \ + } \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) \ + { \ + return make_##T##4(x, x, x, x); \ + } #endif SLANG_MAKE_VECTOR_FROM_SCALAR(int) SLANG_MAKE_VECTOR_FROM_SCALAR(uint) @@ -426,18 +542,22 @@ SLANG_MAKE_VECTOR_FROM_SCALAR(double) #if SLANG_CUDA_ENABLE_HALF SLANG_MAKE_VECTOR_FROM_SCALAR(__half) #if !SLANG_CUDA_RTC -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half1 make___half1(__half x) { return __half1{x}; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half1 make___half1(__half x) +{ + return __half1{x}; +} #endif #endif -#define SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(Fn,T,N) \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL T##N Fn(T##N* address, T##N val) \ - {\ - T##N result; \ - for (int i = 0; i < N; i++) \ - *_slang_vector_get_element_ptr(&result, i) = Fn(_slang_vector_get_element_ptr(address, i), _slang_vector_get_element(val, i)); \ - return result; \ - }\ +#define SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(Fn, T, N) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T##N Fn(T##N* address, T##N val) \ + { \ + T##N result; \ + for (int i = 0; i < N; i++) \ + *_slang_vector_get_element_ptr(&result, i) = \ + Fn(_slang_vector_get_element_ptr(address, i), _slang_vector_get_element(val, i)); \ + return result; \ + } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900 SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, float, 2) @@ -455,19 +575,24 @@ SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, ulonglong, 3) SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, ulonglong, 4) template<typename T, int n> -struct GetVectorTypeImpl {}; - -#define GET_VECTOR_TYPE_IMPL(T, n)\ -template<>\ -struct GetVectorTypeImpl<T,n>\ -{\ - typedef T##n type;\ - static SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n fromScalar(T v) { return make_##T##n(v); } \ +struct GetVectorTypeImpl +{ }; -#define GET_VECTOR_TYPE_IMPL_N(T)\ - GET_VECTOR_TYPE_IMPL(T, 1)\ - GET_VECTOR_TYPE_IMPL(T, 2)\ - GET_VECTOR_TYPE_IMPL(T, 3)\ + +#define GET_VECTOR_TYPE_IMPL(T, n) \ + template<> \ + struct GetVectorTypeImpl<T, n> \ + { \ + typedef T##n type; \ + static SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n fromScalar(T v) \ + { \ + return make_##T##n(v); \ + } \ + }; +#define GET_VECTOR_TYPE_IMPL_N(T) \ + GET_VECTOR_TYPE_IMPL(T, 1) \ + GET_VECTOR_TYPE_IMPL(T, 2) \ + GET_VECTOR_TYPE_IMPL(T, 3) \ GET_VECTOR_TYPE_IMPL(T, 4) GET_VECTOR_TYPE_IMPL_N(int) @@ -500,11 +625,14 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, n> _slang_vector_reshape(const Vect return result; } -template <typename T, int ROWS, int COLS> +template<typename T, int ROWS, int COLS> struct Matrix { Vector<T, COLS> rows[ROWS]; - SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, COLS>& operator[](size_t index) { return rows[index]; } + SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, COLS>& operator[](size_t index) + { + return rows[index]; + } }; @@ -515,7 +643,6 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T scalar) for (int i = 0; i < ROWS; i++) result.rows[i] = GetVectorTypeImpl<T, COLS>::fromScalar(scalar); return result; - } template<typename T, int ROWS, int COLS> @@ -527,7 +654,9 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector } template<typename T, int ROWS, int COLS> -SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1) +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix( + const Vector<T, COLS>& row0, + const Vector<T, COLS>& row1) { Matrix<T, ROWS, COLS> result; result.rows[0] = row0; @@ -536,7 +665,10 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector } template<typename T, int ROWS, int COLS> -SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2) +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix( + const Vector<T, COLS>& row0, + const Vector<T, COLS>& row1, + const Vector<T, COLS>& row2) { Matrix<T, ROWS, COLS> result; result.rows[0] = row0; @@ -546,7 +678,11 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector } template<typename T, int ROWS, int COLS> -SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2, const Vector<T, COLS>& row3) +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix( + const Vector<T, COLS>& row0, + const Vector<T, COLS>& row1, + const Vector<T, COLS>& row2, + const Vector<T, COLS>& row3) { Matrix<T, ROWS, COLS> result; result.rows[0] = row0; @@ -557,16 +693,20 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector } template<typename T, int ROWS, int COLS, typename U, int otherRow, int otherCol> -SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Matrix<U, otherRow, otherCol>& other) +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix( + const Matrix<U, otherRow, otherCol>& other) { Matrix<T, ROWS, COLS> result; int minRow = ROWS; int minCol = COLS; - if (minRow > otherRow) minRow = otherRow; - if (minCol > otherCol) minCol = otherCol; + if (minRow > otherRow) + minRow = otherRow; + if (minCol > otherCol) + minCol = otherCol; for (int i = 0; i < minRow; i++) for (int j = 0; j < minCol; j++) - *_slang_vector_get_element_ptr(result.rows + i, j) = (T)_slang_vector_get_element(other.rows[i], j); + *_slang_vector_get_element_ptr(result.rows + i, j) = + (T)_slang_vector_get_element(other.rows[i], j); return result; } @@ -574,129 +714,238 @@ template<typename T, int ROWS, int COLS> SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3) { Matrix<T, ROWS, COLS> rs; - rs.rows[0].x = v0; rs.rows[0].y = v1; - rs.rows[1].x = v2; rs.rows[1].y = v3; + rs.rows[0].x = v0; + rs.rows[0].y = v1; + rs.rows[1].x = v2; + rs.rows[1].y = v3; return rs; } template<typename T, int ROWS, int COLS> -SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5) +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix( + T v0, + T v1, + T v2, + T v3, + T v4, + T v5) { Matrix<T, ROWS, COLS> rs; if (COLS == 3) { - rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; - rs.rows[1].x = v3; rs.rows[1].y = v4; rs.rows[1].z = v5; + rs.rows[0].x = v0; + rs.rows[0].y = v1; + rs.rows[0].z = v2; + rs.rows[1].x = v3; + rs.rows[1].y = v4; + rs.rows[1].z = v5; } else { - rs.rows[0].x = v0; rs.rows[0].y = v1; - rs.rows[1].x = v2; rs.rows[1].y = v3; - rs.rows[2].x = v4; rs.rows[2].y = v5; + rs.rows[0].x = v0; + rs.rows[0].y = v1; + rs.rows[1].x = v2; + rs.rows[1].y = v3; + rs.rows[2].x = v4; + rs.rows[2].y = v5; } return rs; - } template<typename T, int ROWS, int COLS> -SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix( + T v0, + T v1, + T v2, + T v3, + T v4, + T v5, + T v6, + T v7) { Matrix<T, ROWS, COLS> rs; if (COLS == 4) { - rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; rs.rows[0].w = v3; - rs.rows[1].x = v4; rs.rows[1].y = v5; rs.rows[1].z = v6; rs.rows[1].w = v7; + rs.rows[0].x = v0; + rs.rows[0].y = v1; + rs.rows[0].z = v2; + rs.rows[0].w = v3; + rs.rows[1].x = v4; + rs.rows[1].y = v5; + rs.rows[1].z = v6; + rs.rows[1].w = v7; } else { - rs.rows[0].x = v0; rs.rows[0].y = v1; - rs.rows[1].x = v2; rs.rows[1].y = v3; - rs.rows[2].x = v4; rs.rows[2].y = v5; - rs.rows[3].x = v6; rs.rows[3].y = v7; + rs.rows[0].x = v0; + rs.rows[0].y = v1; + rs.rows[1].x = v2; + rs.rows[1].y = v3; + rs.rows[2].x = v4; + rs.rows[2].y = v5; + rs.rows[3].x = v6; + rs.rows[3].y = v7; } return rs; } template<typename T, int ROWS, int COLS> -SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8) +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix( + T v0, + T v1, + T v2, + T v3, + T v4, + T v5, + T v6, + T v7, + T v8) { Matrix<T, ROWS, COLS> rs; - rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; - rs.rows[1].x = v3; rs.rows[1].y = v4; rs.rows[1].z = v5; - rs.rows[2].x = v6; rs.rows[2].y = v7; rs.rows[2].z = v8; + rs.rows[0].x = v0; + rs.rows[0].y = v1; + rs.rows[0].z = v2; + rs.rows[1].x = v3; + rs.rows[1].y = v4; + rs.rows[1].z = v5; + rs.rows[2].x = v6; + rs.rows[2].y = v7; + rs.rows[2].z = v8; return rs; } template<typename T, int ROWS, int COLS> -SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11) +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix( + T v0, + T v1, + T v2, + T v3, + T v4, + T v5, + T v6, + T v7, + T v8, + T v9, + T v10, + T v11) { Matrix<T, ROWS, COLS> rs; if (COLS == 4) { - rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; rs.rows[0].w = v3; - rs.rows[1].x = v4; rs.rows[1].y = v5; rs.rows[1].z = v6; rs.rows[1].w = v7; - rs.rows[2].x = v8; rs.rows[2].y = v9; rs.rows[2].z = v10; rs.rows[2].w = v11; + rs.rows[0].x = v0; + rs.rows[0].y = v1; + rs.rows[0].z = v2; + rs.rows[0].w = v3; + rs.rows[1].x = v4; + rs.rows[1].y = v5; + rs.rows[1].z = v6; + rs.rows[1].w = v7; + rs.rows[2].x = v8; + rs.rows[2].y = v9; + rs.rows[2].z = v10; + rs.rows[2].w = v11; } else { - rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; - rs.rows[1].x = v3; rs.rows[1].y = v4; rs.rows[1].z = v5; - rs.rows[2].x = v6; rs.rows[2].y = v7; rs.rows[2].z = v8; - rs.rows[3].x = v9; rs.rows[3].y = v10; rs.rows[3].z = v11; + rs.rows[0].x = v0; + rs.rows[0].y = v1; + rs.rows[0].z = v2; + rs.rows[1].x = v3; + rs.rows[1].y = v4; + rs.rows[1].z = v5; + rs.rows[2].x = v6; + rs.rows[2].y = v7; + rs.rows[2].z = v8; + rs.rows[3].x = v9; + rs.rows[3].y = v10; + rs.rows[3].z = v11; } return rs; } template<typename T, int ROWS, int COLS> -SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix( + T v0, + T v1, + T v2, + T v3, + T v4, + T v5, + T v6, + T v7, + T v8, + T v9, + T v10, + T v11, + T v12, + T v13, + T v14, + T v15) { Matrix<T, ROWS, COLS> rs; - rs.rows[0].x = v0; rs.rows[0].y = v1; rs.rows[0].z = v2; rs.rows[0].w = v3; - rs.rows[1].x = v4; rs.rows[1].y = v5; rs.rows[1].z = v6; rs.rows[1].w = v7; - rs.rows[2].x = v8; rs.rows[2].y = v9; rs.rows[2].z = v10; rs.rows[2].w = v11; - rs.rows[3].x = v12; rs.rows[3].y = v13; rs.rows[3].z = v14; rs.rows[3].w = v15; + rs.rows[0].x = v0; + rs.rows[0].y = v1; + rs.rows[0].z = v2; + rs.rows[0].w = v3; + rs.rows[1].x = v4; + rs.rows[1].y = v5; + rs.rows[1].z = v6; + rs.rows[1].w = v7; + rs.rows[2].x = v8; + rs.rows[2].y = v9; + rs.rows[2].z = v10; + rs.rows[2].w = v11; + rs.rows[3].x = v12; + rs.rows[3].y = v13; + rs.rows[3].z = v14; + rs.rows[3].w = v15; return rs; } -#define SLANG_MATRIX_BINARY_OP(T, op) \ - template<int R, int C> \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal, const Matrix<T, R, C>& other) \ - { \ - Matrix<T, R, C> result;\ - for (int i = 0; i < R; i++) \ - for (int j = 0; j < C; j++) \ - *_slang_vector_get_element_ptr(result.rows+i,j) = _slang_vector_get_element(thisVal.rows[i], j) op _slang_vector_get_element(other.rows[i], j); \ - return result;\ +#define SLANG_MATRIX_BINARY_OP(T, op) \ + template<int R, int C> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op( \ + const Matrix<T, R, C>& thisVal, \ + const Matrix<T, R, C>& other) \ + { \ + Matrix<T, R, C> result; \ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + *_slang_vector_get_element_ptr(result.rows + i, j) = \ + _slang_vector_get_element(thisVal.rows[i], j) \ + op _slang_vector_get_element(other.rows[i], j); \ + return result; \ } -#define SLANG_MATRIX_UNARY_OP(T, op) \ - template<int R, int C> \ +#define SLANG_MATRIX_UNARY_OP(T, op) \ + template<int R, int C> \ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal) \ - { \ - Matrix<T, R, C> result;\ - for (int i = 0; i < R; i++) \ - for (int j = 0; j < C; j++) \ - *_slang_vector_get_element_ptr(result.rows+i,j) = op _slang_vector_get_element(thisVal.rows[i], j); \ - return result;\ - } -#define SLANG_INT_MATRIX_OPS(T) \ - SLANG_MATRIX_BINARY_OP(T, +)\ - SLANG_MATRIX_BINARY_OP(T, -)\ - SLANG_MATRIX_BINARY_OP(T, *)\ - SLANG_MATRIX_BINARY_OP(T, / )\ - SLANG_MATRIX_BINARY_OP(T, &)\ - SLANG_MATRIX_BINARY_OP(T, |)\ - SLANG_MATRIX_BINARY_OP(T, &&)\ - SLANG_MATRIX_BINARY_OP(T, ||)\ - SLANG_MATRIX_BINARY_OP(T, ^)\ - SLANG_MATRIX_BINARY_OP(T, %)\ - SLANG_MATRIX_UNARY_OP(T, !)\ + { \ + Matrix<T, R, C> result; \ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + *_slang_vector_get_element_ptr(result.rows + i, j) = \ + op _slang_vector_get_element(thisVal.rows[i], j); \ + return result; \ + } +#define SLANG_INT_MATRIX_OPS(T) \ + SLANG_MATRIX_BINARY_OP(T, +) \ + SLANG_MATRIX_BINARY_OP(T, -) \ + SLANG_MATRIX_BINARY_OP(T, *) \ + SLANG_MATRIX_BINARY_OP(T, /) \ + SLANG_MATRIX_BINARY_OP(T, &) \ + SLANG_MATRIX_BINARY_OP(T, |) \ + SLANG_MATRIX_BINARY_OP(T, &&) \ + SLANG_MATRIX_BINARY_OP(T, ||) \ + SLANG_MATRIX_BINARY_OP(T, ^) \ + SLANG_MATRIX_BINARY_OP(T, %) \ + SLANG_MATRIX_UNARY_OP(T, !) \ SLANG_MATRIX_UNARY_OP(T, ~) #define SLANG_FLOAT_MATRIX_OPS(T) \ - SLANG_MATRIX_BINARY_OP(T, +)\ - SLANG_MATRIX_BINARY_OP(T, -)\ - SLANG_MATRIX_BINARY_OP(T, *)\ - SLANG_MATRIX_BINARY_OP(T, /)\ + SLANG_MATRIX_BINARY_OP(T, +) \ + SLANG_MATRIX_BINARY_OP(T, -) \ + SLANG_MATRIX_BINARY_OP(T, *) \ + SLANG_MATRIX_BINARY_OP(T, /) \ SLANG_MATRIX_UNARY_OP(T, -) SLANG_INT_MATRIX_OPS(int) SLANG_INT_MATRIX_OPS(uint) @@ -711,48 +960,57 @@ SLANG_FLOAT_MATRIX_OPS(double) #if SLANG_CUDA_ENABLE_HALF SLANG_FLOAT_MATRIX_OPS(__half) #endif -#define SLANG_MATRIX_INT_NEG_OP(T) \ - template<int R, int C>\ +#define SLANG_MATRIX_INT_NEG_OP(T) \ + template<int R, int C> \ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator-(Matrix<T, R, C> thisVal) \ - { \ - Matrix<T, R, C> result;\ - for (int i = 0; i < R; i++) \ - for (int j = 0; j < C; j++) \ - *_slang_vector_get_element_ptr(result.rows+i,j) = 0 - _slang_vector_get_element(thisVal.rows[i], j); \ - return result;\ - } - SLANG_MATRIX_INT_NEG_OP(int) - SLANG_MATRIX_INT_NEG_OP(uint) - SLANG_MATRIX_INT_NEG_OP(short) - SLANG_MATRIX_INT_NEG_OP(ushort) - SLANG_MATRIX_INT_NEG_OP(char) - SLANG_MATRIX_INT_NEG_OP(uchar) - SLANG_MATRIX_INT_NEG_OP(longlong) - SLANG_MATRIX_INT_NEG_OP(ulonglong) - -#define SLANG_FLOAT_MATRIX_MOD(T)\ - template<int R, int C> \ - SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator%(Matrix<T, R, C> left, Matrix<T, R, C> right) \ - {\ - Matrix<T, R, C> result;\ - for (int i = 0; i < R; i++) \ - for (int j = 0; j < C; j++) \ - *_slang_vector_get_element_ptr(result.rows+i,j) = _slang_fmod(_slang_vector_get_element(left.rows[i], j), _slang_vector_get_element(right.rows[i], j)); \ - return result;\ - } - - SLANG_FLOAT_MATRIX_MOD(float) - SLANG_FLOAT_MATRIX_MOD(double) + { \ + Matrix<T, R, C> result; \ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + *_slang_vector_get_element_ptr(result.rows + i, j) = \ + 0 - _slang_vector_get_element(thisVal.rows[i], j); \ + return result; \ + } +SLANG_MATRIX_INT_NEG_OP(int) +SLANG_MATRIX_INT_NEG_OP(uint) +SLANG_MATRIX_INT_NEG_OP(short) +SLANG_MATRIX_INT_NEG_OP(ushort) +SLANG_MATRIX_INT_NEG_OP(char) +SLANG_MATRIX_INT_NEG_OP(uchar) +SLANG_MATRIX_INT_NEG_OP(longlong) +SLANG_MATRIX_INT_NEG_OP(ulonglong) + +#define SLANG_FLOAT_MATRIX_MOD(T) \ + template<int R, int C> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator%( \ + Matrix<T, R, C> left, \ + Matrix<T, R, C> right) \ + { \ + Matrix<T, R, C> result; \ + for (int i = 0; i < R; i++) \ + for (int j = 0; j < C; j++) \ + *_slang_vector_get_element_ptr(result.rows + i, j) = _slang_fmod( \ + _slang_vector_get_element(left.rows[i], j), \ + _slang_vector_get_element(right.rows[i], j)); \ + return result; \ + } + +SLANG_FLOAT_MATRIX_MOD(float) +SLANG_FLOAT_MATRIX_MOD(double) #if SLANG_CUDA_ENABLE_HALF - template<int R, int C> - SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<__half, R, C> operator%(Matrix<__half, R, C> left, Matrix<__half, R, C> right) - { - Matrix<__half, R, C> result; - for (int i = 0; i < R; i++) - for (int j = 0; j < C; j++) - * _slang_vector_get_element_ptr(result.rows + i, j) = __float2half(_slang_fmod(__half2float(_slang_vector_get_element(left.rows[i], j)), __half2float(_slang_vector_get_element(right.rows[i], j)))); - return result; - } +template<int R, int C> +SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<__half, R, C> operator%( + Matrix<__half, R, C> left, + Matrix<__half, R, C> right) +{ + Matrix<__half, R, C> result; + for (int i = 0; i < R; i++) + for (int j = 0; j < C; j++) + *_slang_vector_get_element_ptr(result.rows + i, j) = __float2half(_slang_fmod( + __half2float(_slang_vector_get_element(left.rows[i], j)), + __half2float(_slang_vector_get_element(right.rows[i], j)))); + return result; +} #endif #undef SLANG_FLOAT_MATRIX_MOD #undef SLANG_MATRIX_BINARY_OP @@ -762,19 +1020,24 @@ SLANG_FLOAT_MATRIX_OPS(__half) #undef SLANG_MATRIX_INT_NEG_OP #undef SLANG_FLOAT_MATRIX_MOD -#define SLANG_SELECT_IMPL(T, N)\ -SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, N> _slang_select(bool##N condition, Vector<T, N> v0, Vector<T, N> v1) \ -{ \ - Vector<T, N> result; \ - for (int i = 0; i < N; i++) \ - { \ - *_slang_vector_get_element_ptr(&result, i) = _slang_vector_get_element(condition, i) ? _slang_vector_get_element(v0, i) : _slang_vector_get_element(v1, i); \ - } \ - return result; \ -} -#define SLANG_SELECT_T(T)\ - SLANG_SELECT_IMPL(T, 2)\ - SLANG_SELECT_IMPL(T, 3)\ +#define SLANG_SELECT_IMPL(T, N) \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, N> _slang_select( \ + bool##N condition, \ + Vector<T, N> v0, \ + Vector<T, N> v1) \ + { \ + Vector<T, N> result; \ + for (int i = 0; i < N; i++) \ + { \ + *_slang_vector_get_element_ptr(&result, i) = _slang_vector_get_element(condition, i) \ + ? _slang_vector_get_element(v0, i) \ + : _slang_vector_get_element(v1, i); \ + } \ + return result; \ + } +#define SLANG_SELECT_T(T) \ + SLANG_SELECT_IMPL(T, 2) \ + SLANG_SELECT_IMPL(T, 3) \ SLANG_SELECT_IMPL(T, 4) SLANG_SELECT_T(int) @@ -794,53 +1057,103 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_select(bool condition, T v0, T v1) // // Half support -// +// #if SLANG_CUDA_ENABLE_HALF SLANG_SELECT_T(__half) // Convenience functions ushort -> half -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __ushort_as_half(const ushort2& i) { return __halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y)); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __ushort_as_half(const ushort3& i) { return __half3{__ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z)}; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __ushort_as_half(const ushort4& i) { return __half4{ __ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z), __ushort_as_half(i.w) }; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __ushort_as_half(const ushort2& i) +{ + return __halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y)); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __ushort_as_half(const ushort3& i) +{ + return __half3{__ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z)}; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __ushort_as_half(const ushort4& i) +{ + return __half4{ + __ushort_as_half(i.x), + __ushort_as_half(i.y), + __ushort_as_half(i.z), + __ushort_as_half(i.w)}; +} // Convenience functions half -> ushort -SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort2 __half_as_ushort(const __half2& i) { return make_ushort2(__half_as_ushort(i.x), __half_as_ushort(i.y)); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort3 __half_as_ushort(const __half3& i) { return make_ushort3(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z)); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort4 __half_as_ushort(const __half4& i) { return make_ushort4(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z), __half_as_ushort(i.w)); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort2 __half_as_ushort(const __half2& i) +{ + return make_ushort2(__half_as_ushort(i.x), __half_as_ushort(i.y)); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort3 __half_as_ushort(const __half3& i) +{ + return make_ushort3(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z)); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort4 __half_as_ushort(const __half4& i) +{ + return make_ushort4( + __half_as_ushort(i.x), + __half_as_ushort(i.y), + __half_as_ushort(i.z), + __half_as_ushort(i.w)); +} -// This is a little bit of a hack. Fortunately CUDA has the definitions of the templated types in +// This is a little bit of a hack. Fortunately CUDA has the definitions of the templated types in // include/surface_indirect_functions.h -// Here we find the template definition requires a specialization of __nv_isurf_trait to allow -// a specialization of the surface write functions. -// This *isn't* a problem on the read functions as they don't have a return type that uses this mechanism +// Here we find the template definition requires a specialization of __nv_isurf_trait to allow +// a specialization of the surface write functions. +// This *isn't* a problem on the read functions as they don't have a return type that uses this +// mechanism -template<> struct __nv_isurf_trait<__half> { typedef void type; }; -template<> struct __nv_isurf_trait<__half2> { typedef void type; }; -template<> struct __nv_isurf_trait<__half4> { typedef void type; }; +template<> +struct __nv_isurf_trait<__half> +{ + typedef void type; +}; +template<> +struct __nv_isurf_trait<__half2> +{ + typedef void type; +}; +template<> +struct __nv_isurf_trait<__half4> +{ + typedef void type; +}; #define SLANG_DROP_PARENS(...) __VA_ARGS__ -#define SLANG_SURFACE_READ(FUNC_NAME, TYPE_ARGS, ARGS) \ -template <> \ -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half FUNC_NAME<__half>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ -{ \ - return __ushort_as_half(FUNC_NAME<ushort>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ -} \ -\ -template <> \ -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 FUNC_NAME<__half2>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ -{ \ - return __ushort_as_half(FUNC_NAME<ushort2>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ -} \ -\ -template <> \ -SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 FUNC_NAME<__half4>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ -{ \ - return __ushort_as_half(FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ -} +#define SLANG_SURFACE_READ(FUNC_NAME, TYPE_ARGS, ARGS) \ + template<> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL __half FUNC_NAME<__half>( \ + cudaSurfaceObject_t surfObj, \ + SLANG_DROP_PARENS TYPE_ARGS, \ + cudaSurfaceBoundaryMode boundaryMode) \ + { \ + return __ushort_as_half(FUNC_NAME<ushort>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ + } \ + \ + template<> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 FUNC_NAME<__half2>( \ + cudaSurfaceObject_t surfObj, \ + SLANG_DROP_PARENS TYPE_ARGS, \ + cudaSurfaceBoundaryMode boundaryMode) \ + { \ + return __ushort_as_half( \ + FUNC_NAME<ushort2>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ + } \ + \ + template<> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 FUNC_NAME<__half4>( \ + cudaSurfaceObject_t surfObj, \ + SLANG_DROP_PARENS TYPE_ARGS, \ + cudaSurfaceBoundaryMode boundaryMode) \ + { \ + return __ushort_as_half( \ + FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ + } SLANG_SURFACE_READ(surf1Dread, (int x), (x)) SLANG_SURFACE_READ(surf2Dread, (int x, int y), (x, y)) @@ -850,24 +1163,36 @@ SLANG_SURFACE_READ(surf2DLayeredread, (int x, int y, int layer), (x, y, layer)) SLANG_SURFACE_READ(surfCubemapread, (int x, int y, int face), (x, y, face)) SLANG_SURFACE_READ(surfCubemapLayeredread, (int x, int y, int layerFace), (x, y, layerFace)) -#define SLANG_SURFACE_WRITE(FUNC_NAME, TYPE_ARGS, ARGS) \ -template <> \ -SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half>(__half data, cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ -{ \ - FUNC_NAME<ushort>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \ -} \ -\ -template <> \ -SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half2>(__half2 data, cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ -{ \ - FUNC_NAME<ushort2>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \ -} \ -\ -template <> \ -SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half4>(__half4 data, cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ -{ \ - FUNC_NAME<ushort4>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \ -} +#define SLANG_SURFACE_WRITE(FUNC_NAME, TYPE_ARGS, ARGS) \ + template<> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half>( \ + __half data, \ + cudaSurfaceObject_t surfObj, \ + SLANG_DROP_PARENS TYPE_ARGS, \ + cudaSurfaceBoundaryMode boundaryMode) \ + { \ + FUNC_NAME<ushort>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \ + } \ + \ + template<> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half2>( \ + __half2 data, \ + cudaSurfaceObject_t surfObj, \ + SLANG_DROP_PARENS TYPE_ARGS, \ + cudaSurfaceBoundaryMode boundaryMode) \ + { \ + FUNC_NAME<ushort2>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \ + } \ + \ + template<> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half4>( \ + __half4 data, \ + cudaSurfaceObject_t surfObj, \ + SLANG_DROP_PARENS TYPE_ARGS, \ + cudaSurfaceBoundaryMode boundaryMode) \ + { \ + FUNC_NAME<ushort4>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \ + } SLANG_SURFACE_WRITE(surf1Dwrite, (int x), (x)) SLANG_SURFACE_WRITE(surf2Dwrite, (int x, int y), (x, y)) @@ -878,38 +1203,54 @@ SLANG_SURFACE_WRITE(surfCubemapwrite, (int x, int y, int face), (x, y, face)) SLANG_SURFACE_WRITE(surfCubemapLayeredwrite, (int x, int y, int layerFace), (x, y, layerFace)) // ! Hack to test out reading !!! -// Only works converting *from* half - -//template <typename T> -//SLANG_FORCE_INLINE SLANG_CUDA_CALL T surf2Dread_convert(cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode); - -#define SLANG_SURFACE_READ_HALF_CONVERT(FUNC_NAME, TYPE_ARGS, ARGS) \ -\ -template <typename T> \ -SLANG_FORCE_INLINE SLANG_CUDA_CALL T FUNC_NAME##_convert(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode); \ -\ -template <> \ -SLANG_FORCE_INLINE SLANG_CUDA_CALL float FUNC_NAME##_convert<float>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ -{ \ - return __ushort_as_half(FUNC_NAME<uint16_t>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ -} \ -\ -template <> \ -SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 FUNC_NAME##_convert<float2>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ -{ \ - const __half2 v = __ushort_as_half(FUNC_NAME<ushort2>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ - return float2{v.x, v.y}; \ -} \ -\ -template <> \ -SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 FUNC_NAME##_convert<float4>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ -{ \ - const __half4 v = __ushort_as_half(FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ - return float4{v.x, v.y, v.z, v.w}; \ -} - -SLANG_SURFACE_READ_HALF_CONVERT(surf1Dread, (int x), (x)) -SLANG_SURFACE_READ_HALF_CONVERT(surf2Dread, (int x, int y), (x, y)) +// Only works converting *from* half + +// template <typename T> +// SLANG_FORCE_INLINE SLANG_CUDA_CALL T surf2Dread_convert(cudaSurfaceObject_t surfObj, int x, int +// y, cudaSurfaceBoundaryMode boundaryMode); + +#define SLANG_SURFACE_READ_HALF_CONVERT(FUNC_NAME, TYPE_ARGS, ARGS) \ + \ + template<typename T> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL T FUNC_NAME##_convert( \ + cudaSurfaceObject_t surfObj, \ + SLANG_DROP_PARENS TYPE_ARGS, \ + cudaSurfaceBoundaryMode boundaryMode); \ + \ + template<> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL float FUNC_NAME##_convert<float>( \ + cudaSurfaceObject_t surfObj, \ + SLANG_DROP_PARENS TYPE_ARGS, \ + cudaSurfaceBoundaryMode boundaryMode) \ + { \ + return __ushort_as_half( \ + FUNC_NAME<uint16_t>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ + } \ + \ + template<> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 FUNC_NAME##_convert<float2>( \ + cudaSurfaceObject_t surfObj, \ + SLANG_DROP_PARENS TYPE_ARGS, \ + cudaSurfaceBoundaryMode boundaryMode) \ + { \ + const __half2 v = \ + __ushort_as_half(FUNC_NAME<ushort2>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ + return float2{v.x, v.y}; \ + } \ + \ + template<> \ + SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 FUNC_NAME##_convert<float4>( \ + cudaSurfaceObject_t surfObj, \ + SLANG_DROP_PARENS TYPE_ARGS, \ + cudaSurfaceBoundaryMode boundaryMode) \ + { \ + const __half4 v = \ + __ushort_as_half(FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ + return float4{v.x, v.y, v.z, v.w}; \ + } + +SLANG_SURFACE_READ_HALF_CONVERT(surf1Dread, (int x), (x)) +SLANG_SURFACE_READ_HALF_CONVERT(surf2Dread, (int x, int y), (x, y)) SLANG_SURFACE_READ_HALF_CONVERT(surf3Dread, (int x, int y, int z), (x, y, z)) #endif @@ -917,178 +1258,506 @@ SLANG_SURFACE_READ_HALF_CONVERT(surf3Dread, (int x, int y, int z), (x, y, z)) // Support for doing format conversion when writing to a surface/RWTexture // NOTE! For normal surface access x values are *byte* addressed. -// For the _convert versions they are *not*. They don't need to be because sust.p does not require it. +// For the _convert versions they are *not*. They don't need to be because sust.p does not require +// it. -template <typename T> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode); -template <typename T> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode); -template <typename T> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode); +template<typename T> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert( + T, + cudaSurfaceObject_t surfObj, + int x, + cudaSurfaceBoundaryMode boundaryMode); +template<typename T> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert( + T, + cudaSurfaceObject_t surfObj, + int x, + int y, + cudaSurfaceBoundaryMode boundaryMode); +template<typename T> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert( + T, + cudaSurfaceObject_t surfObj, + int x, + int y, + int z, + cudaSurfaceBoundaryMode boundaryMode); // https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust // Float -template <> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float>(float v, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode) +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float>( + float v, + cudaSurfaceObject_t surfObj, + int x, + cudaSurfaceBoundaryMode boundaryMode) { - asm volatile ( "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};}\n\t" :: "l"(surfObj),"r"(x),"f"(v)); + asm volatile( + "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};}\n\t" ::"l"(surfObj), + "r"(x), + "f"(v)); } - -template <> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float>(float v, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode) + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float>( + float v, + cudaSurfaceObject_t surfObj, + int x, + int y, + cudaSurfaceBoundaryMode boundaryMode) { - asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"f"(v)); + asm volatile( + "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj), + "r"(x), + "r"(y), + "f"(v)); } -template <> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float>(float v, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode) -{ - asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3}], {%4};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"r"(z),"f"(v)); +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float>( + float v, + cudaSurfaceObject_t surfObj, + int x, + int y, + int z, + cudaSurfaceBoundaryMode boundaryMode) +{ + asm volatile( + "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3}], {%4};}\n\t" ::"l"(surfObj), + "r"(x), + "r"(y), + "r"(z), + "f"(v)); } // Float2 -template <> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float2>(float2 v, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode) +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float2>( + float2 v, + cudaSurfaceObject_t surfObj, + int x, + cudaSurfaceBoundaryMode boundaryMode) { const float vx = v.x, vy = v.y; - asm volatile ( "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3};}\n\t" :: "l"(surfObj),"r"(x),"f"(vx),"f"(vy)); + asm volatile( + "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3};}\n\t" ::"l"(surfObj), + "r"(x), + "f"(vx), + "f"(vy)); } - -template <> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float2>(float2 v, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode) + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float2>( + float2 v, + cudaSurfaceObject_t surfObj, + int x, + int y, + cudaSurfaceBoundaryMode boundaryMode) { const float vx = v.x, vy = v.y; - asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"f"(vx),"f"(vy)); + asm volatile( + "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj), + "r"(x), + "r"(y), + "f"(vx), + "f"(vy)); } -template <> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float2>(float2 v, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode) +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float2>( + float2 v, + cudaSurfaceObject_t surfObj, + int x, + int y, + int z, + cudaSurfaceBoundaryMode boundaryMode) { const float vx = v.x, vy = v.y; - asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3}], {%4,%5};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"r"(z),"f"(vx),"f"(vy)); + asm volatile( + "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3}], {%4,%5};}\n\t" ::"l"(surfObj), + "r"(x), + "r"(y), + "r"(z), + "f"(vx), + "f"(vy)); } // Float4 -template <> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float4>(float4 v, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode) +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float4>( + float4 v, + cudaSurfaceObject_t surfObj, + int x, + cudaSurfaceBoundaryMode boundaryMode) { const float vx = v.x, vy = v.y, vz = v.z, vw = v.w; - asm volatile ( "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3,%4,%5};}\n\t" :: "l"(surfObj),"r"(x),"f"(vx),"f"(vy),"f"(vz),"f"(vw)); + asm volatile( + "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3,%4,%5};}\n\t" ::"l"(surfObj), + "r"(x), + "f"(vx), + "f"(vy), + "f"(vz), + "f"(vw)); } - -template <> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float4>(float4 v, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode) + +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float4>( + float4 v, + cudaSurfaceObject_t surfObj, + int x, + int y, + cudaSurfaceBoundaryMode boundaryMode) { const float vx = v.x, vy = v.y, vz = v.z, vw = v.w; - asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"f"(vx),"f"(vy),"f"(vz),"f"(vw)); + asm volatile( + "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE + " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj), + "r"(x), + "r"(y), + "f"(vx), + "f"(vy), + "f"(vz), + "f"(vw)); } -template <> -SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float4>(float4 v, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode) +template<> +SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float4>( + float4 v, + cudaSurfaceObject_t surfObj, + int x, + int y, + int z, + cudaSurfaceBoundaryMode boundaryMode) { const float vx = v.x, vy = v.y, vz = v.z, vw = v.w; - asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3}], {%4,%5,%6,%7};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"r"(z),"f"(vx),"f"(vy),"f"(vz),"f"(vw)); + asm volatile( + "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE + " [%0, {%1,%2,%3}], {%4,%5,%6,%7};}\n\t" ::"l"(surfObj), + "r"(x), + "r"(y), + "r"(z), + "f"(vx), + "f"(vy), + "f"(vz), + "f"(vw)); } // ----------------------------- F32 ----------------------------------------- -// Unary -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_ceil(float f) { return ::ceilf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_floor(float f) { return ::floorf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_round(float f) { return ::roundf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sin(float f) { return ::sinf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_cos(float f) { return ::cosf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL void F32_sincos(float f, float* s, float* c) { ::sincosf(f, s, c); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_tan(float f) { return ::tanf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_asin(float f) { return ::asinf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_acos(float f) { return ::acosf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_atan(float f) { return ::atanf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sinh(float f) { return ::sinhf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_cosh(float f) { return ::coshf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_tanh(float f) { return ::tanhf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log2(float f) { return ::log2f(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log(float f) { return ::logf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log10(float f) { return ::log10f(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_exp2(float f) { return ::exp2f(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_exp(float f) { return ::expf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_abs(float f) { return ::fabsf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_trunc(float f) { return ::truncf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sqrt(float f) { return ::sqrtf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_rsqrt(float f) { return ::rsqrtf(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sign(float f) { return ( f == 0.0f) ? f : (( f < 0.0f) ? -1.0f : 1.0f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_frac(float f) { return f - F32_floor(f); } - -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isnan(float f) { return isnan(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isfinite(float f) { return isfinite(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isinf(float f) { return isinf(f); } +// Unary +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_ceil(float f) +{ + return ::ceilf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_floor(float f) +{ + return ::floorf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_round(float f) +{ + return ::roundf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sin(float f) +{ + return ::sinf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_cos(float f) +{ + return ::cosf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL void F32_sincos(float f, float* s, float* c) +{ + ::sincosf(f, s, c); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_tan(float f) +{ + return ::tanf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_asin(float f) +{ + return ::asinf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_acos(float f) +{ + return ::acosf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_atan(float f) +{ + return ::atanf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sinh(float f) +{ + return ::sinhf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_cosh(float f) +{ + return ::coshf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_tanh(float f) +{ + return ::tanhf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log2(float f) +{ + return ::log2f(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log(float f) +{ + return ::logf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log10(float f) +{ + return ::log10f(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_exp2(float f) +{ + return ::exp2f(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_exp(float f) +{ + return ::expf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_abs(float f) +{ + return ::fabsf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_trunc(float f) +{ + return ::truncf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sqrt(float f) +{ + return ::sqrtf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_rsqrt(float f) +{ + return ::rsqrtf(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sign(float f) +{ + return (f == 0.0f) ? f : ((f < 0.0f) ? -1.0f : 1.0f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_frac(float f) +{ + return f - F32_floor(f); +} + +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isnan(float f) +{ + return isnan(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isfinite(float f) +{ + return isfinite(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isinf(float f) +{ + return isinf(f); +} // Binary -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_min(float a, float b) { return ::fminf(a, b); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_max(float a, float b) { return ::fmaxf(a, b); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_pow(float a, float b) { return ::powf(a, b); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_fmod(float a, float b) { return ::fmodf(a, b); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_remainder(float a, float b) { return ::remainderf(a, b); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_atan2(float a, float b) { return float(::atan2(a, b)); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_min(float a, float b) +{ + return ::fminf(a, b); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_max(float a, float b) +{ + return ::fmaxf(a, b); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_pow(float a, float b) +{ + return ::powf(a, b); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_fmod(float a, float b) +{ + return ::fmodf(a, b); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_remainder(float a, float b) +{ + return ::remainderf(a, b); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_atan2(float a, float b) +{ + return float(::atan2(a, b)); +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_frexp(float x, int* e) { return frexpf(x, e); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_frexp(float x, int* e) +{ + return frexpf(x, e); +} SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_modf(float x, float* ip) { return ::modff(x, ip); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t F32_asuint(float f) { Union32 u; u.f = f; return u.u; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t F32_asint(float f) { Union32 u; u.f = f; return u.i; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t F32_asuint(float f) +{ + Union32 u; + u.f = f; + return u.u; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t F32_asint(float f) +{ + Union32 u; + u.f = f; + return u.i; +} // Ternary -SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_fma(float a, float b, float c) { return ::fmaf(a, b, c); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_fma(float a, float b, float c) +{ + return ::fmaf(a, b, c); +} // ----------------------------- F64 ----------------------------------------- -// Unary -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_ceil(double f) { return ::ceil(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_floor(double f) { return ::floor(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_round(double f) { return ::round(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sin(double f) { return ::sin(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_cos(double f) { return ::cos(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL void F64_sincos(double f, double* s, double* c) { ::sincos(f, s, c); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_tan(double f) { return ::tan(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_asin(double f) { return ::asin(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_acos(double f) { return ::acos(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_atan(double f) { return ::atan(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sinh(double f) { return ::sinh(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_cosh(double f) { return ::cosh(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_tanh(double f) { return ::tanh(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log2(double f) { return ::log2(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log(double f) { return ::log(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log10(float f) { return ::log10(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_exp2(double f) { return ::exp2(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_exp(double f) { return ::exp(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_abs(double f) { return ::fabs(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_trunc(double f) { return ::trunc(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sqrt(double f) { return ::sqrt(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_rsqrt(double f) { return ::rsqrt(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sign(double f) { return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_frac(double f) { return f - F64_floor(f); } - -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isnan(double f) { return isnan(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isfinite(double f) { return isfinite(f); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isinf(double f) { return isinf(f); } +// Unary +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_ceil(double f) +{ + return ::ceil(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_floor(double f) +{ + return ::floor(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_round(double f) +{ + return ::round(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sin(double f) +{ + return ::sin(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_cos(double f) +{ + return ::cos(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL void F64_sincos(double f, double* s, double* c) +{ + ::sincos(f, s, c); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_tan(double f) +{ + return ::tan(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_asin(double f) +{ + return ::asin(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_acos(double f) +{ + return ::acos(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_atan(double f) +{ + return ::atan(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sinh(double f) +{ + return ::sinh(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_cosh(double f) +{ + return ::cosh(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_tanh(double f) +{ + return ::tanh(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log2(double f) +{ + return ::log2(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log(double f) +{ + return ::log(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log10(float f) +{ + return ::log10(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_exp2(double f) +{ + return ::exp2(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_exp(double f) +{ + return ::exp(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_abs(double f) +{ + return ::fabs(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_trunc(double f) +{ + return ::trunc(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sqrt(double f) +{ + return ::sqrt(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_rsqrt(double f) +{ + return ::rsqrt(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sign(double f) +{ + return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_frac(double f) +{ + return f - F64_floor(f); +} + +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isnan(double f) +{ + return isnan(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isfinite(double f) +{ + return isfinite(f); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isinf(double f) +{ + return isinf(f); +} // Binary -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_min(double a, double b) { return ::fmin(a, b); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_max(double a, double b) { return ::fmax(a, b); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_pow(double a, double b) { return ::pow(a, b); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fmod(double a, double b) { return ::fmod(a, b); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_remainder(double a, double b) { return ::remainder(a, b); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_atan2(double a, double b) { return ::atan2(a, b); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_min(double a, double b) +{ + return ::fmin(a, b); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_max(double a, double b) +{ + return ::fmax(a, b); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_pow(double a, double b) +{ + return ::pow(a, b); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fmod(double a, double b) +{ + return ::fmod(a, b); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_remainder(double a, double b) +{ + return ::remainder(a, b); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_atan2(double a, double b) +{ + return ::atan2(a, b); +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_frexp(double x, int* e) { return ::frexp(x, e); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_frexp(double x, int* e) +{ + return ::frexp(x, e); +} SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_modf(double x, double* ip) { @@ -1112,20 +1781,40 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL void F64_asint(double d, int32_t* low, int32_ } // Ternary -SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fma(double a, double b, double c) { return ::fma(a, b, c); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fma(double a, double b, double c) +{ + return ::fma(a, b, c); +} // ----------------------------- I32 ----------------------------------------- // Unary -SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_abs(int32_t f) { return (f < 0) ? -f : f; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_abs(int32_t f) +{ + return (f < 0) ? -f : f; +} // Binary -SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b) { return a < b ? a : b; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b) { return a > b ? a : b; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b) +{ + return a < b ? a : b; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b) +{ + return a > b ? a : b; +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL float I32_asfloat(int32_t x) { Union32 u; u.i = x; return u.f; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x) { return uint32_t(x); } -SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi ) +SLANG_FORCE_INLINE SLANG_CUDA_CALL float I32_asfloat(int32_t x) +{ + Union32 u; + u.i = x; + return u.f; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x) +{ + return uint32_t(x); +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi) { Union64 u; u.u = (uint64_t(hi) << 32) | uint32_t(low); @@ -1134,15 +1823,32 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi ) // ----------------------------- U32 ----------------------------------------- -// Unary -SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_abs(uint32_t f) { return f; } +// Unary +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_abs(uint32_t f) +{ + return f; +} // Binary -SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_min(uint32_t a, uint32_t b) { return a < b ? a : b; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_max(uint32_t a, uint32_t b) { return a > b ? a : b; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_min(uint32_t a, uint32_t b) +{ + return a < b ? a : b; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_max(uint32_t a, uint32_t b) +{ + return a > b ? a : b; +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL float U32_asfloat(uint32_t x) { Union32 u; u.u = x; return u.f; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_asint(int32_t x) { return uint32_t(x); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL float U32_asfloat(uint32_t x) +{ + Union32 u; + u.u = x; + return u.f; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_asint(int32_t x) +{ + return uint32_t(x); +} SLANG_FORCE_INLINE SLANG_CUDA_CALL double U32_asdouble(uint32_t low, uint32_t hi) { @@ -1160,17 +1866,35 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_countbits(uint32_t v) // ----------------------------- I64 ----------------------------------------- -SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_abs(int64_t f) { return (f < 0) ? -f : f; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_abs(int64_t f) +{ + return (f < 0) ? -f : f; +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b) { return a < b ? a : b; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b) +{ + return a < b ? a : b; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b) +{ + return a > b ? a : b; +} // ----------------------------- U64 ----------------------------------------- -SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_abs(uint64_t f) { return f; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_abs(uint64_t f) +{ + return f; +} -SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_min(uint64_t a, uint64_t b) { return a < b ? a : b; } -SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_max(uint64_t a, uint64_t b) { return a > b ? a : b; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_min(uint64_t a, uint64_t b) +{ + return a < b ? a : b; +} +SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_max(uint64_t a, uint64_t b) +{ + return a > b ? a : b; +} SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v) { @@ -1185,7 +1909,7 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v) // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-structuredbuffer-getdimensions // Missing Load(_In_ int Location, _Out_ uint Status); -template <typename T> +template<typename T> struct StructuredBuffer { SLANG_CUDA_CALL const T& operator[](size_t index) const @@ -1205,7 +1929,11 @@ struct StructuredBuffer } #ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT - SLANG_CUDA_CALL void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); } + SLANG_CUDA_CALL void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) + { + *outNumStructs = uint32_t(count); + *outStride = uint32_t(sizeof(T)); + } #endif T* data; @@ -1214,7 +1942,7 @@ struct StructuredBuffer #endif }; -template <typename T> +template<typename T> struct RWStructuredBuffer : StructuredBuffer<T> { SLANG_CUDA_CALL T& operator[](size_t index) const @@ -1230,28 +1958,28 @@ struct RWStructuredBuffer : StructuredBuffer<T> struct ByteAddressBuffer { SLANG_CUDA_CALL void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); } - SLANG_CUDA_CALL uint32_t Load(size_t index) const - { + SLANG_CUDA_CALL uint32_t Load(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); - return data[index >> 2]; + return data[index >> 2]; } - SLANG_CUDA_CALL uint2 Load2(size_t index) const - { - SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint2{data[dataIdx], data[dataIdx + 1]}; + SLANG_CUDA_CALL uint2 Load2(size_t index) const + { + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); + const size_t dataIdx = index >> 2; + return uint2{data[dataIdx], data[dataIdx + 1]}; } - SLANG_CUDA_CALL uint3 Load3(size_t index) const - { + SLANG_CUDA_CALL uint3 Load3(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; + const size_t dataIdx = index >> 2; + return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; } - SLANG_CUDA_CALL uint4 Load4(size_t index) const - { + SLANG_CUDA_CALL uint4 Load4(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; + const size_t dataIdx = index >> 2; + return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; } template<typename T> SLANG_CUDA_CALL T Load(size_t index) const @@ -1270,38 +1998,38 @@ struct ByteAddressBuffer return rs; } const uint32_t* data; - size_t sizeInBytes; //< Must be multiple of 4 + size_t sizeInBytes; //< Must be multiple of 4 }; // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-rwbyteaddressbuffer -// Missing support for Atomic operations +// Missing support for Atomic operations // Missing support for Load with status struct RWByteAddressBuffer { SLANG_CUDA_CALL void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); } - - SLANG_CUDA_CALL uint32_t Load(size_t index) const - { + + SLANG_CUDA_CALL uint32_t Load(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); - return data[index >> 2]; + return data[index >> 2]; } - SLANG_CUDA_CALL uint2 Load2(size_t index) const - { + SLANG_CUDA_CALL uint2 Load2(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint2{data[dataIdx], data[dataIdx + 1]}; + const size_t dataIdx = index >> 2; + return uint2{data[dataIdx], data[dataIdx + 1]}; } - SLANG_CUDA_CALL uint3 Load3(size_t index) const - { + SLANG_CUDA_CALL uint3 Load3(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; + const size_t dataIdx = index >> 2; + return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; } - SLANG_CUDA_CALL uint4 Load4(size_t index) const - { + SLANG_CUDA_CALL uint4 Load4(size_t index) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); - const size_t dataIdx = index >> 2; - return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; + const size_t dataIdx = index >> 2; + return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; } template<typename T> SLANG_CUDA_CALL T Load(size_t index) const @@ -1311,31 +2039,31 @@ struct RWByteAddressBuffer memcpy(&data, ((const char*)this->data) + index, sizeof(T)); return data; } - - SLANG_CUDA_CALL void Store(size_t index, uint32_t v) const - { + + SLANG_CUDA_CALL void Store(size_t index, uint32_t v) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); - data[index >> 2] = v; + data[index >> 2] = v; } - SLANG_CUDA_CALL void Store2(size_t index, uint2 v) const - { + SLANG_CUDA_CALL void Store2(size_t index, uint2 v) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); - const size_t dataIdx = index >> 2; + const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; } - SLANG_CUDA_CALL void Store3(size_t index, uint3 v) const - { + SLANG_CUDA_CALL void Store3(size_t index, uint3 v) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); - const size_t dataIdx = index >> 2; + const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; data[dataIdx + 2] = v.z; } - SLANG_CUDA_CALL void Store4(size_t index, uint4 v) const - { + SLANG_CUDA_CALL void Store4(size_t index, uint4 v) const + { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); - const size_t dataIdx = index >> 2; + const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; data[dataIdx + 2] = v.z; @@ -1347,9 +2075,9 @@ struct RWByteAddressBuffer SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes); memcpy((char*)data + index, &value, sizeof(T)); } - - /// Can be used in the core module to gain access - template <typename T> + + /// Can be used in the core module to gain access + template<typename T> SLANG_CUDA_CALL T* _getPtrAt(size_t index) { SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes); @@ -1364,69 +2092,71 @@ struct RWByteAddressBuffer return rs; } uint32_t* data; - size_t sizeInBytes; //< Must be multiple of 4 + size_t sizeInBytes; //< Must be multiple of 4 }; // ---------------------- Wave -------------------------------------- -// TODO(JS): It appears that cuda does not have a simple way to get a lane index. -// -// Another approach could be... -// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK -// If that is really true another way to do this, would be for code generator to add this function -// with the [numthreads] baked in. -// -// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex() +// TODO(JS): It appears that cuda does not have a simple way to get a lane index. +// +// Another approach could be... +// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & +// SLANG_CUDA_WARP_MASK If that is really true another way to do this, would be for code generator +// to add this function with the [numthreads] baked in. +// +// For now I'll just assume you have a launch that makes the following correct if the kernel uses +// WaveGetLaneIndex() #ifndef SLANG_USE_ASM_LANE_ID - __forceinline__ __device__ uint32_t _getLaneId() +__forceinline__ __device__ uint32_t _getLaneId() { - // If the launch is (or I guess some multiple of the warp size) - // we try this mechanism, which is apparently faster. + // If the launch is (or I guess some multiple of the warp size) + // we try this mechanism, which is apparently faster. return threadIdx.x & SLANG_CUDA_WARP_MASK; } #else __forceinline__ __device__ uint32_t _getLaneId() { // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid# - // This mechanism is not the fastest way to do it, and that is why the other mechanism - // is the default. But the other mechanism relies on a launch that makes the assumption + // This mechanism is not the fastest way to do it, and that is why the other mechanism + // is the default. But the other mechanism relies on a launch that makes the assumption // true. - unsigned ret; - asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); + unsigned ret; + asm volatile("mov.u32 %0, %laneid;" : "=r"(ret)); return ret; } #endif typedef int WarpMask; -// It appears that the __activemask() cannot always be used because -// threads need to be converged. -// +// It appears that the __activemask() cannot always be used because +// threads need to be converged. +// // For CUDA the article claims mask has to be used carefully // https://devblogs.nvidia.com/using-cuda-warp-level-primitives/ -// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. +// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. // __activemask() though does not require there is convergence, so that doesn't work. -// -// '__ballot_sync' produces a convergance. -// +// +// '__ballot_sync' produces a convergance. +// // From the CUDA docs: -// ```For __all_sync, __any_sync, and __ballot_sync, a mask must be passed that specifies the threads -// participating in the call. A bit, representing the thread's lane ID, must be set for each participating thread -// to ensure they are properly converged before the intrinsic is executed by the hardware. All active threads named -// in mask must execute the same intrinsic with the same mask, or the result is undefined.``` +// ```For __all_sync, __any_sync, and __ballot_sync, a mask must be passed that specifies the +// threads participating in the call. A bit, representing the thread's lane ID, must be set for each +// participating thread to ensure they are properly converged before the intrinsic is executed by +// the hardware. All active threads named in mask must execute the same intrinsic with the same +// mask, or the result is undefined.``` // // Currently there isn't a mechanism to correctly get the mask without it being passed through. -// Doing so will most likely require some changes to slang code generation to track masks, for now then we use -// _getActiveMask. +// Doing so will most likely require some changes to slang code generation to track masks, for now +// then we use _getActiveMask. // Return mask of all the lanes less than the current lane __forceinline__ __device__ WarpMask _getLaneLtMask() { return (int(1) << _getLaneId()) - 1; -} +} -// TODO(JS): +// TODO(JS): // THIS IS NOT CORRECT! That determining the appropriate active mask requires appropriate // mask tracking. __forceinline__ __device__ WarpMask _getActiveMask() @@ -1478,30 +2208,30 @@ __inline__ __device__ int _waveCalcPow2Offset(WarpMask mask) __inline__ __device__ bool _waveIsFirstLane() { const WarpMask mask = __activemask(); - // We special case bit 0, as that most warps are expected to be fully active. - + // We special case bit 0, as that most warps are expected to be fully active. + // mask & -mask, isolates the lowest set bit. - //return (mask & 1 ) || ((mask & -mask) == (1 << _getLaneId())); - - // This mechanism is most similar to what was in an nVidia post, so assume it is prefered. - return (mask & 1 ) || ((__ffs(mask) - 1) == _getLaneId()); + // return (mask & 1 ) || ((mask & -mask) == (1 << _getLaneId())); + + // This mechanism is most similar to what was in an nVidia post, so assume it is prefered. + return (mask & 1) || ((__ffs(mask) - 1) == _getLaneId()); } -template <typename T> +template<typename T> struct WaveOpOr { __inline__ __device__ static T getInitial(T a) { return 0; } __inline__ __device__ static T doOp(T a, T b) { return a | b; } }; -template <typename T> +template<typename T> struct WaveOpAnd { __inline__ __device__ static T getInitial(T a) { return ~T(0); } __inline__ __device__ static T doOp(T a, T b) { return a & b; } }; -template <typename T> +template<typename T> struct WaveOpXor { __inline__ __device__ static T getInitial(T a) { return 0; } @@ -1509,7 +2239,7 @@ struct WaveOpXor __inline__ __device__ static T doInverse(T a, T b) { return a ^ b; } }; -template <typename T> +template<typename T> struct WaveOpAdd { __inline__ __device__ static T getInitial(T a) { return 0; } @@ -1517,77 +2247,166 @@ struct WaveOpAdd __inline__ __device__ static T doInverse(T a, T b) { return a - b; } }; -template <typename T> +template<typename T> struct WaveOpMul { __inline__ __device__ static T getInitial(T a) { return T(1); } __inline__ __device__ static T doOp(T a, T b) { return a * b; } - // Using this inverse for int is probably undesirable - because in general it requires T to have more precision - // There is also a performance aspect to it, where divides are generally significantly slower + // Using this inverse for int is probably undesirable - because in general it requires T to have + // more precision There is also a performance aspect to it, where divides are generally + // significantly slower __inline__ __device__ static T doInverse(T a, T b) { return a / b; } }; -template <typename T> +template<typename T> struct WaveOpMax { __inline__ __device__ static T getInitial(T a) { return a; } __inline__ __device__ static T doOp(T a, T b) { return a > b ? a : b; } }; -template <typename T> +template<typename T> struct WaveOpMin { - __inline__ __device__ static T getInitial(T a) { return a; } + __inline__ __device__ static T getInitial(T a) { return a; } __inline__ __device__ static T doOp(T a, T b) { return a < b ? a : b; } }; -template <typename T> +template<typename T> struct ElementTypeTrait; // Scalar -template <> struct ElementTypeTrait<int> { typedef int Type; }; -template <> struct ElementTypeTrait<uint> { typedef uint Type; }; -template <> struct ElementTypeTrait<float> { typedef float Type; }; -template <> struct ElementTypeTrait<double> { typedef double Type; }; -template <> struct ElementTypeTrait<uint64_t> { typedef uint64_t Type; }; -template <> struct ElementTypeTrait<int64_t> { typedef int64_t Type; }; +template<> +struct ElementTypeTrait<int> +{ + typedef int Type; +}; +template<> +struct ElementTypeTrait<uint> +{ + typedef uint Type; +}; +template<> +struct ElementTypeTrait<float> +{ + typedef float Type; +}; +template<> +struct ElementTypeTrait<double> +{ + typedef double Type; +}; +template<> +struct ElementTypeTrait<uint64_t> +{ + typedef uint64_t Type; +}; +template<> +struct ElementTypeTrait<int64_t> +{ + typedef int64_t Type; +}; // Vector -template <> struct ElementTypeTrait<int1> { typedef int Type; }; -template <> struct ElementTypeTrait<int2> { typedef int Type; }; -template <> struct ElementTypeTrait<int3> { typedef int Type; }; -template <> struct ElementTypeTrait<int4> { typedef int Type; }; - -template <> struct ElementTypeTrait<uint1> { typedef uint Type; }; -template <> struct ElementTypeTrait<uint2> { typedef uint Type; }; -template <> struct ElementTypeTrait<uint3> { typedef uint Type; }; -template <> struct ElementTypeTrait<uint4> { typedef uint Type; }; - -template <> struct ElementTypeTrait<float1> { typedef float Type; }; -template <> struct ElementTypeTrait<float2> { typedef float Type; }; -template <> struct ElementTypeTrait<float3> { typedef float Type; }; -template <> struct ElementTypeTrait<float4> { typedef float Type; }; - -template <> struct ElementTypeTrait<double1> { typedef double Type; }; -template <> struct ElementTypeTrait<double2> { typedef double Type; }; -template <> struct ElementTypeTrait<double3> { typedef double Type; }; -template <> struct ElementTypeTrait<double4> { typedef double Type; }; +template<> +struct ElementTypeTrait<int1> +{ + typedef int Type; +}; +template<> +struct ElementTypeTrait<int2> +{ + typedef int Type; +}; +template<> +struct ElementTypeTrait<int3> +{ + typedef int Type; +}; +template<> +struct ElementTypeTrait<int4> +{ + typedef int Type; +}; + +template<> +struct ElementTypeTrait<uint1> +{ + typedef uint Type; +}; +template<> +struct ElementTypeTrait<uint2> +{ + typedef uint Type; +}; +template<> +struct ElementTypeTrait<uint3> +{ + typedef uint Type; +}; +template<> +struct ElementTypeTrait<uint4> +{ + typedef uint Type; +}; + +template<> +struct ElementTypeTrait<float1> +{ + typedef float Type; +}; +template<> +struct ElementTypeTrait<float2> +{ + typedef float Type; +}; +template<> +struct ElementTypeTrait<float3> +{ + typedef float Type; +}; +template<> +struct ElementTypeTrait<float4> +{ + typedef float Type; +}; + +template<> +struct ElementTypeTrait<double1> +{ + typedef double Type; +}; +template<> +struct ElementTypeTrait<double2> +{ + typedef double Type; +}; +template<> +struct ElementTypeTrait<double3> +{ + typedef double Type; +}; +template<> +struct ElementTypeTrait<double4> +{ + typedef double Type; +}; // Matrix -template <typename T, int ROWS, int COLS> -struct ElementTypeTrait<Matrix<T, ROWS, COLS> > -{ - typedef T Type; +template<typename T, int ROWS, int COLS> +struct ElementTypeTrait<Matrix<T, ROWS, COLS>> +{ + typedef T Type; }; -// Scalar -template <typename INTF, typename T> +// Scalar +template<typename INTF, typename T> __device__ T _waveReduceScalar(WarpMask mask, T val) { const int offsetSize = _waveCalcPow2Offset(mask); if (offsetSize > 0) { - // Fast path O(log2(activeLanes)) + // Fast path O(log2(activeLanes)) for (int offset = offsetSize >> 1; offset > 0; offset >>= 1) { val = INTF::doOp(val, __shfl_xor_sync(mask, val, offset)); @@ -1600,9 +2419,9 @@ __device__ T _waveReduceScalar(WarpMask mask, T val) while (remaining) { const int laneBit = remaining & -remaining; - // Get the sourceLane + // Get the sourceLane const int srcLane = __ffs(laneBit) - 1; - // Broadcast (can also broadcast to self) + // Broadcast (can also broadcast to self) result = INTF::doOp(result, __shfl_sync(mask, val, srcLane)); remaining &= ~laneBit; } @@ -1613,13 +2432,13 @@ __device__ T _waveReduceScalar(WarpMask mask, T val) // Multiple values -template <typename INTF, typename T, size_t COUNT> +template<typename INTF, typename T, size_t COUNT> __device__ void _waveReduceMultiple(WarpMask mask, T* val) { const int offsetSize = _waveCalcPow2Offset(mask); if (offsetSize > 0) { - // Fast path O(log2(activeLanes)) + // Fast path O(log2(activeLanes)) for (int offset = offsetSize >> 1; offset > 0; offset >>= 1) { for (size_t i = 0; i < COUNT; ++i) @@ -1638,14 +2457,14 @@ __device__ void _waveReduceMultiple(WarpMask mask, T* val) originalVal[i] = v; val[i] = INTF::getInitial(v); } - + int remaining = mask; while (remaining) { const int laneBit = remaining & -remaining; - // Get the sourceLane + // Get the sourceLane const int srcLane = __ffs(laneBit) - 1; - // Broadcast (can also broadcast to self) + // Broadcast (can also broadcast to self) for (size_t i = 0; i < COUNT; ++i) { val[i] = INTF::doOp(val[i], __shfl_sync(mask, originalVal[i], srcLane)); @@ -1655,99 +2474,182 @@ __device__ void _waveReduceMultiple(WarpMask mask, T* val) } } -template <typename INTF, typename T> +template<typename INTF, typename T> __device__ void _waveReduceMultiple(WarpMask mask, T* val) { - typedef typename ElementTypeTrait<T>::Type ElemType; + typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<INTF, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)val); } -template <typename T> -__inline__ __device__ T _waveOr(WarpMask mask, T val) { return _waveReduceScalar<WaveOpOr<T>, T>(mask, val); } +template<typename T> +__inline__ __device__ T _waveOr(WarpMask mask, T val) +{ + return _waveReduceScalar<WaveOpOr<T>, T>(mask, val); +} -template <typename T> -__inline__ __device__ T _waveAnd(WarpMask mask, T val) { return _waveReduceScalar<WaveOpAnd<T>, T>(mask, val); } +template<typename T> +__inline__ __device__ T _waveAnd(WarpMask mask, T val) +{ + return _waveReduceScalar<WaveOpAnd<T>, T>(mask, val); +} -template <typename T> -__inline__ __device__ T _waveXor(WarpMask mask, T val) { return _waveReduceScalar<WaveOpXor<T>, T>(mask, val); } +template<typename T> +__inline__ __device__ T _waveXor(WarpMask mask, T val) +{ + return _waveReduceScalar<WaveOpXor<T>, T>(mask, val); +} -template <typename T> -__inline__ __device__ T _waveProduct(WarpMask mask, T val) { return _waveReduceScalar<WaveOpMul<T>, T>(mask, val); } +template<typename T> +__inline__ __device__ T _waveProduct(WarpMask mask, T val) +{ + return _waveReduceScalar<WaveOpMul<T>, T>(mask, val); +} -template <typename T> -__inline__ __device__ T _waveSum(WarpMask mask, T val) { return _waveReduceScalar<WaveOpAdd<T>, T>(mask, val); } +template<typename T> +__inline__ __device__ T _waveSum(WarpMask mask, T val) +{ + return _waveReduceScalar<WaveOpAdd<T>, T>(mask, val); +} -template <typename T> -__inline__ __device__ T _waveMin(WarpMask mask, T val) { return _waveReduceScalar<WaveOpMin<T>, T>(mask, val); } +template<typename T> +__inline__ __device__ T _waveMin(WarpMask mask, T val) +{ + return _waveReduceScalar<WaveOpMin<T>, T>(mask, val); +} -template <typename T> -__inline__ __device__ T _waveMax(WarpMask mask, T val) { return _waveReduceScalar<WaveOpMax<T>, T>(mask, val); } +template<typename T> +__inline__ __device__ T _waveMax(WarpMask mask, T val) +{ + return _waveReduceScalar<WaveOpMax<T>, T>(mask, val); +} // Fast-path specializations when CUDA warp reduce operators are available #if __CUDA_ARCH__ >= 800 // 8.x or higher template<> -__inline__ __device__ unsigned _waveOr<unsigned>(WarpMask mask, unsigned val) { return __reduce_or_sync(mask, val); } +__inline__ __device__ unsigned _waveOr<unsigned>(WarpMask mask, unsigned val) +{ + return __reduce_or_sync(mask, val); +} template<> -__inline__ __device__ unsigned _waveAnd<unsigned>(WarpMask mask, unsigned val) { return __reduce_and_sync(mask, val); } +__inline__ __device__ unsigned _waveAnd<unsigned>(WarpMask mask, unsigned val) +{ + return __reduce_and_sync(mask, val); +} template<> -__inline__ __device__ unsigned _waveXor<unsigned>(WarpMask mask, unsigned val) { return __reduce_xor_sync(mask, val); } +__inline__ __device__ unsigned _waveXor<unsigned>(WarpMask mask, unsigned val) +{ + return __reduce_xor_sync(mask, val); +} template<> -__inline__ __device__ unsigned _waveSum<unsigned>(WarpMask mask, unsigned val) { return __reduce_add_sync(mask, val); } +__inline__ __device__ unsigned _waveSum<unsigned>(WarpMask mask, unsigned val) +{ + return __reduce_add_sync(mask, val); +} template<> -__inline__ __device__ int _waveSum<int>(WarpMask mask, int val) { return __reduce_add_sync(mask, val); } +__inline__ __device__ int _waveSum<int>(WarpMask mask, int val) +{ + return __reduce_add_sync(mask, val); +} template<> -__inline__ __device__ unsigned _waveMin<unsigned>(WarpMask mask, unsigned val) { return __reduce_min_sync(mask, val); } +__inline__ __device__ unsigned _waveMin<unsigned>(WarpMask mask, unsigned val) +{ + return __reduce_min_sync(mask, val); +} template<> -__inline__ __device__ int _waveMin<int>(WarpMask mask, int val) { return __reduce_min_sync(mask, val); } +__inline__ __device__ int _waveMin<int>(WarpMask mask, int val) +{ + return __reduce_min_sync(mask, val); +} template<> -__inline__ __device__ unsigned _waveMax<unsigned>(WarpMask mask, unsigned val) { return __reduce_max_sync(mask, val); } +__inline__ __device__ unsigned _waveMax<unsigned>(WarpMask mask, unsigned val) +{ + return __reduce_max_sync(mask, val); +} template<> -__inline__ __device__ int _waveMax<int>(WarpMask mask, int val) { return __reduce_max_sync(mask, val); } +__inline__ __device__ int _waveMax<int>(WarpMask mask, int val) +{ + return __reduce_max_sync(mask, val); +} #endif // Multiple -template <typename T> -__inline__ __device__ T _waveOrMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpOr<ElemType> >(mask, &val); return val; } +template<typename T> +__inline__ __device__ T _waveOrMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _waveReduceMultiple<WaveOpOr<ElemType>>(mask, &val); + return val; +} -template <typename T> -__inline__ __device__ T _waveAndMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAnd<ElemType> >(mask, &val); return val; } +template<typename T> +__inline__ __device__ T _waveAndMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _waveReduceMultiple<WaveOpAnd<ElemType>>(mask, &val); + return val; +} -template <typename T> -__inline__ __device__ T _waveXorMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpXor<ElemType> >(mask, &val); return val; } +template<typename T> +__inline__ __device__ T _waveXorMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _waveReduceMultiple<WaveOpXor<ElemType>>(mask, &val); + return val; +} -template <typename T> -__inline__ __device__ T _waveProductMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMul<ElemType> >(mask, &val); return val; } +template<typename T> +__inline__ __device__ T _waveProductMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _waveReduceMultiple<WaveOpMul<ElemType>>(mask, &val); + return val; +} -template <typename T> -__inline__ __device__ T _waveSumMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAdd<ElemType> >(mask, &val); return val; } +template<typename T> +__inline__ __device__ T _waveSumMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _waveReduceMultiple<WaveOpAdd<ElemType>>(mask, &val); + return val; +} -template <typename T> -__inline__ __device__ T _waveMinMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMin<ElemType> >(mask, &val); return val; } +template<typename T> +__inline__ __device__ T _waveMinMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _waveReduceMultiple<WaveOpMin<ElemType>>(mask, &val); + return val; +} -template <typename T> -__inline__ __device__ T _waveMaxMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMax<ElemType> >(mask, &val); return val; } +template<typename T> +__inline__ __device__ T _waveMaxMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _waveReduceMultiple<WaveOpMax<ElemType>>(mask, &val); + return val; +} -template <typename T> -__inline__ __device__ bool _waveAllEqual(WarpMask mask, T val) +template<typename T> +__inline__ __device__ bool _waveAllEqual(WarpMask mask, T val) { int pred; __match_all_sync(mask, val, &pred); return pred != 0; } -template <typename T> -__inline__ __device__ bool _waveAllEqualMultiple(WarpMask mask, T inVal) +template<typename T> +__inline__ __device__ bool _waveAllEqualMultiple(WarpMask mask, T inVal) { typedef typename ElementTypeTrait<T>::Type ElemType; const size_t count = sizeof(T) / sizeof(ElemType); @@ -1764,15 +2666,15 @@ __inline__ __device__ bool _waveAllEqualMultiple(WarpMask mask, T inVal) return true; } -template <typename T> -__inline__ __device__ T _waveReadFirst(WarpMask mask, T val) +template<typename T> +__inline__ __device__ T _waveReadFirst(WarpMask mask, T val) { const int lowestLaneId = __ffs(mask) - 1; - return __shfl_sync(mask, val, lowestLaneId); + return __shfl_sync(mask, val, lowestLaneId); } -template <typename T> -__inline__ __device__ T _waveReadFirstMultiple(WarpMask mask, T inVal) +template<typename T> +__inline__ __device__ T _waveReadFirstMultiple(WarpMask mask, T inVal) { typedef typename ElementTypeTrait<T>::Type ElemType; const size_t count = sizeof(T) / sizeof(ElemType); @@ -1782,12 +2684,12 @@ __inline__ __device__ T _waveReadFirstMultiple(WarpMask mask, T inVal) const int lowestLaneId = __ffs(mask) - 1; for (size_t i = 0; i < count; ++i) { - dst[i] = __shfl_sync(mask, src[i], lowestLaneId); + dst[i] = __shfl_sync(mask, src[i], lowestLaneId); } return outVal; } -template <typename T> +template<typename T> __inline__ __device__ T _waveShuffleMultiple(WarpMask mask, T inVal, int lane) { typedef typename ElementTypeTrait<T>::Type ElemType; @@ -1797,27 +2699,27 @@ __inline__ __device__ T _waveShuffleMultiple(WarpMask mask, T inVal, int lane) ElemType* dst = (ElemType*)&outVal; for (size_t i = 0; i < count; ++i) { - dst[i] = __shfl_sync(mask, src[i], lane); + dst[i] = __shfl_sync(mask, src[i], lane); } return outVal; } -// Scalar +// Scalar -// Invertable means that when we get to the end of the reduce, we can remove val (to make exclusive), using -// the inverse of the op. -template <typename INTF, typename T> +// Invertable means that when we get to the end of the reduce, we can remove val (to make +// exclusive), using the inverse of the op. +template<typename INTF, typename T> __device__ T _wavePrefixInvertableScalar(WarpMask mask, T val) { const int offsetSize = _waveCalcPow2Offset(mask); - + const int laneId = _getLaneId(); T result; if (offsetSize > 0) - { + { // Sum is calculated inclusive of this lanes value result = val; - for (int i = 1; i < offsetSize; i += i) + for (int i = 1; i < offsetSize; i += i) { const T readVal = __shfl_up_sync(mask, result, i, offsetSize); if (laneId >= i) @@ -1828,7 +2730,7 @@ __device__ T _wavePrefixInvertableScalar(WarpMask mask, T val) // Remove val from the result, by applyin inverse result = INTF::doInverse(result, val); } - else + else { result = INTF::getInitial(val); if (!_waveIsSingleLane(mask)) @@ -1837,9 +2739,9 @@ __device__ T _wavePrefixInvertableScalar(WarpMask mask, T val) while (remaining) { const int laneBit = remaining & -remaining; - // Get the sourceLane + // Get the sourceLane const int srcLane = __ffs(laneBit) - 1; - // Broadcast (can also broadcast to self) + // Broadcast (can also broadcast to self) const T readValue = __shfl_sync(mask, val, srcLane); // Only accumulate if srcLane is less than this lane if (srcLane < laneId) @@ -1848,27 +2750,28 @@ __device__ T _wavePrefixInvertableScalar(WarpMask mask, T val) } remaining &= ~laneBit; } - } + } } return result; } - + // This implementation separately tracks the value to be propogated, and the value -// that is the final result -template <typename INTF, typename T> +// that is the final result +template<typename INTF, typename T> __device__ T _wavePrefixScalar(WarpMask mask, T val) { const int offsetSize = _waveCalcPow2Offset(mask); - + const int laneId = _getLaneId(); - T result = INTF::getInitial(val); + T result = INTF::getInitial(val); if (offsetSize > 0) - { + { // For transmitted value we will do it inclusively with this lanes value - // For the result we do not include the lanes value. This means an extra multiply for each iteration - // but means we don't need to have a divide at the end and also removes overflow issues in that scenario. - for (int i = 1; i < offsetSize; i += i) + // For the result we do not include the lanes value. This means an extra multiply for each + // iteration but means we don't need to have a divide at the end and also removes overflow + // issues in that scenario. + for (int i = 1; i < offsetSize; i += i) { const T readVal = __shfl_up_sync(mask, val, i, offsetSize); if (laneId >= i) @@ -1878,7 +2781,7 @@ __device__ T _wavePrefixScalar(WarpMask mask, T val) } } } - else + else { if (!_waveIsSingleLane(mask)) { @@ -1886,9 +2789,9 @@ __device__ T _wavePrefixScalar(WarpMask mask, T val) while (remaining) { const int laneBit = remaining & -remaining; - // Get the sourceLane + // Get the sourceLane const int srcLane = __ffs(laneBit) - 1; - // Broadcast (can also broadcast to self) + // Broadcast (can also broadcast to self) const T readValue = __shfl_sync(mask, val, srcLane); // Only accumulate if srcLane is less than this lane if (srcLane < laneId) @@ -1903,51 +2806,51 @@ __device__ T _wavePrefixScalar(WarpMask mask, T val) } -template <typename INTF, typename T, size_t COUNT> +template<typename INTF, typename T, size_t COUNT> __device__ T _waveOpCopy(T* dst, const T* src) { for (size_t j = 0; j < COUNT; ++j) { dst[j] = src[j]; } -} +} -template <typename INTF, typename T, size_t COUNT> +template<typename INTF, typename T, size_t COUNT> __device__ T _waveOpDoInverse(T* inOut, const T* val) { for (size_t j = 0; j < COUNT; ++j) { inOut[j] = INTF::doInverse(inOut[j], val[j]); } -} +} -template <typename INTF, typename T, size_t COUNT> +template<typename INTF, typename T, size_t COUNT> __device__ T _waveOpSetInitial(T* out, const T* val) { for (size_t j = 0; j < COUNT; ++j) { out[j] = INTF::getInitial(val[j]); } -} +} -template <typename INTF, typename T, size_t COUNT> +template<typename INTF, typename T, size_t COUNT> __device__ T _wavePrefixInvertableMultiple(WarpMask mask, T* val) { const int offsetSize = _waveCalcPow2Offset(mask); - + const int laneId = _getLaneId(); T originalVal[COUNT]; _waveOpCopy<INTF, T, COUNT>(originalVal, val); - + if (offsetSize > 0) - { + { // Sum is calculated inclusive of this lanes value - for (int i = 1; i < offsetSize; i += i) + for (int i = 1; i < offsetSize; i += i) { // TODO(JS): Note that here I don't split the laneId outside so it's only tested once. - // This may be better but it would also mean that there would be shfl between lanes - // that are on different (albeit identical) instructions. So this seems more likely to + // This may be better but it would also mean that there would be shfl between lanes + // that are on different (albeit identical) instructions. So this seems more likely to // work as expected with everything in lock step. for (size_t j = 0; j < COUNT; ++j) { @@ -1961,7 +2864,7 @@ __device__ T _wavePrefixInvertableMultiple(WarpMask mask, T* val) // Remove originalVal from the result, by applyin inverse _waveOpDoInverse<INTF, T, COUNT>(val, originalVal); } - else + else { _waveOpSetInitial<INTF, T, COUNT>(val, val); if (!_waveIsSingleLane(mask)) @@ -1970,12 +2873,12 @@ __device__ T _wavePrefixInvertableMultiple(WarpMask mask, T* val) while (remaining) { const int laneBit = remaining & -remaining; - // Get the sourceLane + // Get the sourceLane const int srcLane = __ffs(laneBit) - 1; - + for (size_t j = 0; j < COUNT; ++j) { - // Broadcast (can also broadcast to self) + // Broadcast (can also broadcast to self) const T readValue = __shfl_sync(mask, originalVal[j], srcLane); // Only accumulate if srcLane is less than this lane if (srcLane < laneId) @@ -1985,27 +2888,28 @@ __device__ T _wavePrefixInvertableMultiple(WarpMask mask, T* val) remaining &= ~laneBit; } } - } + } } } - -template <typename INTF, typename T, size_t COUNT> + +template<typename INTF, typename T, size_t COUNT> __device__ T _wavePrefixMultiple(WarpMask mask, T* val) { const int offsetSize = _waveCalcPow2Offset(mask); - + const int laneId = _getLaneId(); - + T work[COUNT]; _waveOpCopy<INTF, T, COUNT>(work, val); _waveOpSetInitial<INTF, T, COUNT>(val, val); - + if (offsetSize > 0) - { + { // For transmitted value we will do it inclusively with this lanes value - // For the result we do not include the lanes value. This means an extra op for each iteration - // but means we don't need to have a divide at the end and also removes overflow issues in that scenario. - for (int i = 1; i < offsetSize; i += i) + // For the result we do not include the lanes value. This means an extra op for each + // iteration but means we don't need to have a divide at the end and also removes overflow + // issues in that scenario. + for (int i = 1; i < offsetSize; i += i) { for (size_t j = 0; j < COUNT; ++j) { @@ -2013,12 +2917,12 @@ __device__ T _wavePrefixMultiple(WarpMask mask, T* val) if (laneId >= i) { work[j] = INTF::doOp(work[j], readVal); - val[j] = INTF::doOp(val[j], readVal); + val[j] = INTF::doOp(val[j], readVal); } } } } - else + else { if (!_waveIsSingleLane(mask)) { @@ -2026,12 +2930,12 @@ __device__ T _wavePrefixMultiple(WarpMask mask, T* val) while (remaining) { const int laneBit = remaining & -remaining; - // Get the sourceLane + // Get the sourceLane const int srcLane = __ffs(laneBit) - 1; - + for (size_t j = 0; j < COUNT; ++j) { - // Broadcast (can also broadcast to self) + // Broadcast (can also broadcast to self) const T readValue = __shfl_sync(mask, work[j], srcLane); // Only accumulate if srcLane is less than this lane if (srcLane < laneId) @@ -2045,71 +2949,96 @@ __device__ T _wavePrefixMultiple(WarpMask mask, T* val) } } -template <typename T> -__inline__ __device__ T _wavePrefixProduct(WarpMask mask, T val) { return _wavePrefixScalar<WaveOpMul<T>, T>(mask, val); } - -template <typename T> -__inline__ __device__ T _wavePrefixSum(WarpMask mask, T val) { return _wavePrefixInvertableScalar<WaveOpAdd<T>, T>(mask, val); } - -template <typename T> -__inline__ __device__ T _wavePrefixXor(WarpMask mask, T val) { return _wavePrefixInvertableScalar<WaveOpXor<T>, T>(mask, val); } - -template <typename T> -__inline__ __device__ T _wavePrefixOr(WarpMask mask, T val) { return _wavePrefixScalar<WaveOpOr<T>, T>(mask, val); } - -template <typename T> -__inline__ __device__ T _wavePrefixAnd(WarpMask mask, T val) { return _wavePrefixScalar<WaveOpAnd<T>, T>(mask, val); } - - -template <typename T> -__inline__ __device__ T _wavePrefixProductMultiple(WarpMask mask, T val) -{ - typedef typename ElementTypeTrait<T>::Type ElemType; - _wavePrefixInvertableMultiple<WaveOpMul<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)&val); +template<typename T> +__inline__ __device__ T _wavePrefixProduct(WarpMask mask, T val) +{ + return _wavePrefixScalar<WaveOpMul<T>, T>(mask, val); +} + +template<typename T> +__inline__ __device__ T _wavePrefixSum(WarpMask mask, T val) +{ + return _wavePrefixInvertableScalar<WaveOpAdd<T>, T>(mask, val); +} + +template<typename T> +__inline__ __device__ T _wavePrefixXor(WarpMask mask, T val) +{ + return _wavePrefixInvertableScalar<WaveOpXor<T>, T>(mask, val); +} + +template<typename T> +__inline__ __device__ T _wavePrefixOr(WarpMask mask, T val) +{ + return _wavePrefixScalar<WaveOpOr<T>, T>(mask, val); +} + +template<typename T> +__inline__ __device__ T _wavePrefixAnd(WarpMask mask, T val) +{ + return _wavePrefixScalar<WaveOpAnd<T>, T>(mask, val); +} + + +template<typename T> +__inline__ __device__ T _wavePrefixProductMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _wavePrefixInvertableMultiple<WaveOpMul<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>( + mask, + (ElemType*)&val); return val; } -template <typename T> -__inline__ __device__ T _wavePrefixSumMultiple(WarpMask mask, T val) -{ - typedef typename ElementTypeTrait<T>::Type ElemType; - _wavePrefixInvertableMultiple<WaveOpAdd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)&val); +template<typename T> +__inline__ __device__ T _wavePrefixSumMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _wavePrefixInvertableMultiple<WaveOpAdd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>( + mask, + (ElemType*)&val); return val; } -template <typename T> -__inline__ __device__ T _wavePrefixXorMultiple(WarpMask mask, T val) -{ - typedef typename ElementTypeTrait<T>::Type ElemType; - _wavePrefixInvertableMultiple<WaveOpXor<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)&val); +template<typename T> +__inline__ __device__ T _wavePrefixXorMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _wavePrefixInvertableMultiple<WaveOpXor<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>( + mask, + (ElemType*)&val); return val; } -template <typename T> -__inline__ __device__ T _wavePrefixOrMultiple(WarpMask mask, T val) -{ - typedef typename ElementTypeTrait<T>::Type ElemType; - _wavePrefixMultiple<WaveOpOr<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)&val); +template<typename T> +__inline__ __device__ T _wavePrefixOrMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _wavePrefixMultiple<WaveOpOr<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>( + mask, + (ElemType*)&val); return val; } -template <typename T> -__inline__ __device__ T _wavePrefixAndMultiple(WarpMask mask, T val) -{ - typedef typename ElementTypeTrait<T>::Type ElemType; - _wavePrefixMultiple<WaveOpAnd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)&val); +template<typename T> +__inline__ __device__ T _wavePrefixAndMultiple(WarpMask mask, T val) +{ + typedef typename ElementTypeTrait<T>::Type ElemType; + _wavePrefixMultiple<WaveOpAnd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>( + mask, + (ElemType*)&val); return val; } -template <typename T> -__inline__ __device__ uint4 _waveMatchScalar(WarpMask mask, T val) +template<typename T> +__inline__ __device__ uint4 _waveMatchScalar(WarpMask mask, T val) { int pred; return make_uint4(__match_all_sync(mask, val, &pred), 0, 0, 0); } -template <typename T> -__inline__ __device__ uint4 _waveMatchMultiple(WarpMask mask, const T& inVal) +template<typename T> +__inline__ __device__ uint4 _waveMatchMultiple(WarpMask mask, const T& inVal) { typedef typename ElementTypeTrait<T>::Type ElemType; const size_t count = sizeof(T) / sizeof(ElemType); @@ -2123,7 +3052,7 @@ __inline__ __device__ uint4 _waveMatchMultiple(WarpMask mask, const T& inVal) return make_uint4(matchBits, 0, 0, 0); } -__device__ uint getAt(dim3 a, int b) +__device__ uint getAt(dim3 a, int b) { SLANG_PRELUDE_ASSERT(b >= 0 && b < 3); return (&a.x)[b]; @@ -2146,8 +3075,9 @@ __inline__ __device__ TResult slang_bit_cast(TInput val) /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ -/* Type that defines the uniform entry point params. The actual content of this type is dependent on the entry point parameters, and can be -found via reflection or defined such that it matches the shader appropriately. +/* Type that defines the uniform entry point params. The actual content of this type is dependent on +the entry point parameters, and can be found via reflection or defined such that it matches the +shader appropriately. */ struct UniformEntryPointParams; struct UniformState; @@ -2157,28 +3087,29 @@ struct UniformState; struct RayDesc { float3 Origin; - float TMin; + float TMin; float3 Direction; - float TMax; + float TMax; }; -static __forceinline__ __device__ -void *unpackOptiXRayPayloadPointer(uint32_t i0, uint32_t i1) +static __forceinline__ __device__ void* unpackOptiXRayPayloadPointer(uint32_t i0, uint32_t i1) { const uint64_t uptr = static_cast<uint64_t>(i0) << 32 | i1; - void* ptr = reinterpret_cast<void*>(uptr); + void* ptr = reinterpret_cast<void*>(uptr); return ptr; } -static __forceinline__ __device__ -void packOptiXRayPayloadPointer(void* ptr, uint32_t& i0, uint32_t& i1) +static __forceinline__ __device__ void packOptiXRayPayloadPointer( + void* ptr, + uint32_t& i0, + uint32_t& i1) { const uint64_t uptr = reinterpret_cast<uint64_t>(ptr); i0 = uptr >> 32; i1 = uptr & 0x00000000ffffffff; } -static __forceinline__ __device__ void *getOptiXRayPayloadPtr() +static __forceinline__ __device__ void* getOptiXRayPayloadPtr() { const uint32_t u0 = optixGetPayload_0(); const uint32_t u1 = optixGetPayload_1(); @@ -2186,7 +3117,7 @@ static __forceinline__ __device__ void *getOptiXRayPayloadPtr() } template<typename T> -__forceinline__ __device__ void *traceOptiXRay( +__forceinline__ __device__ void* traceOptiXRay( OptixTraversableHandle AccelerationStructure, uint32_t RayFlags, uint32_t InstanceInclusionMask, @@ -2194,8 +3125,8 @@ __forceinline__ __device__ void *traceOptiXRay( uint32_t MultiplierForGeometryContributionToHitGroupIndex, uint32_t MissShaderIndex, RayDesc Ray, - T *Payload -) { + T* Payload) +{ uint32_t r0, r1; packOptiXRayPayloadPointer((void*)Payload, r0, r1); optixTrace( @@ -2210,8 +3141,8 @@ __forceinline__ __device__ void *traceOptiXRay( RayContributionToHitGroupIndex, MultiplierForGeometryContributionToHitGroupIndex, MissShaderIndex, - r0, r1 - ); + r0, + r1); } #endif @@ -2256,7 +3187,8 @@ struct TensorView template<typename T> __device__ T* data_ptr_at(uint4 index) { - uint64_t offset = strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + strides[3] * index.w; + uint64_t offset = strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + + strides[3] * index.w; return reinterpret_cast<T*>(data + offset); } @@ -2294,22 +3226,28 @@ struct TensorView template<typename T> __device__ T& load(uint3 index) { - return *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z); + return *reinterpret_cast<T*>( + data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z); } template<typename T> __device__ T& load(uint32_t x, uint32_t y, uint32_t z, uint32_t w) { - return *reinterpret_cast<T*>(data + strides[0] * x + strides[1] * y + strides[2] * z + strides[3] * w); + return *reinterpret_cast<T*>( + data + strides[0] * x + strides[1] * y + strides[2] * z + strides[3] * w); } template<typename T> __device__ T& load(uint4 index) { - return *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + strides[3] * index.w); + return *reinterpret_cast<T*>( + data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + + strides[3] * index.w); } template<typename T> __device__ T& load(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4) { - return *reinterpret_cast<T*>(data + strides[0] * i0 + strides[1] * i1 + strides[2] * i2 + strides[3] * i3 + strides[4] * i4); + return *reinterpret_cast<T*>( + data + strides[0] * i0 + strides[1] * i1 + strides[2] * i2 + strides[3] * i3 + + strides[4] * i4); } // Generic version of load @@ -2347,7 +3285,8 @@ struct TensorView template<typename T> __device__ void store(uint3 index, T val) { - *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z) = val; + *reinterpret_cast<T*>( + data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z) = val; } template<typename T> __device__ void store(uint32_t x, uint32_t y, uint32_t z, uint32_t w, T val) @@ -2358,12 +3297,16 @@ struct TensorView template<typename T> __device__ void store(uint4 index, T val) { - *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + strides[3] * index.w) = val; + *reinterpret_cast<T*>( + data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + + strides[3] * index.w) = val; } template<typename T> __device__ void store(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, T val) { - *reinterpret_cast<T*>(data + strides[0] * i0 + strides[1] * i1 + strides[2] * i2 + strides[3] * i3 + strides[4] * i4) = val; + *reinterpret_cast<T*>( + data + strides[0] * i0 + strides[1] * i1 + strides[2] * i2 + strides[3] * i3 + + strides[4] * i4) = val; } // Generic version diff --git a/prelude/slang-hlsl-prelude.h b/prelude/slang-hlsl-prelude.h index d892f228c..8e77201f9 100644 --- a/prelude/slang-hlsl-prelude.h +++ b/prelude/slang-hlsl-prelude.h @@ -3,6 +3,6 @@ #endif #ifndef __DXC_VERSION_MAJOR - // warning X3557: loop doesn't seem to do anything, forcing loop to unroll - #pragma warning(disable: 3557) +// warning X3557: loop doesn't seem to do anything, forcing loop to unroll +#pragma warning(disable : 3557) #endif diff --git a/prelude/slang-llvm.h b/prelude/slang-llvm.h index b41380581..e0bbbd14a 100644 --- a/prelude/slang-llvm.h +++ b/prelude/slang-llvm.h @@ -1,46 +1,54 @@ #ifndef SLANG_LLVM_H #define SLANG_LLVM_H -// TODO(JS): +// TODO(JS): // Disable exception declspecs, as not supported on LLVM without some extra options. // We could enable with `-fms-extensions` #define SLANG_DISABLE_EXCEPTIONS 1 #ifndef SLANG_PRELUDE_ASSERT -# ifdef SLANG_PRELUDE_ENABLE_ASSERT +#ifdef SLANG_PRELUDE_ENABLE_ASSERT extern "C" void assertFailure(const char* msg); -# define SLANG_PRELUDE_EXPECT(VALUE, MSG) if(VALUE) {} else assertFailure("assertion failed: '" MSG "'") -# define SLANG_PRELUDE_ASSERT(VALUE) SLANG_PRELUDE_EXPECT(VALUE, #VALUE) -# else // SLANG_PRELUDE_ENABLE_ASSERT -# define SLANG_PRELUDE_EXPECT(VALUE, MSG) -# define SLANG_PRELUDE_ASSERT(x) -# endif // SLANG_PRELUDE_ENABLE_ASSERT +#define SLANG_PRELUDE_EXPECT(VALUE, MSG) \ + if (VALUE) \ + { \ + } \ + else \ + assertFailure("assertion failed: '" MSG "'") +#define SLANG_PRELUDE_ASSERT(VALUE) SLANG_PRELUDE_EXPECT(VALUE, #VALUE) +#else // SLANG_PRELUDE_ENABLE_ASSERT +#define SLANG_PRELUDE_EXPECT(VALUE, MSG) +#define SLANG_PRELUDE_ASSERT(x) +#endif // SLANG_PRELUDE_ENABLE_ASSERT #endif /* -Taken from stddef.h +Taken from stddef.h */ typedef __PTRDIFF_TYPE__ ptrdiff_t; typedef __SIZE_TYPE__ size_t; typedef __SIZE_TYPE__ rsize_t; -//typedef __WCHAR_TYPE__ wchar_t; +// typedef __WCHAR_TYPE__ wchar_t; #if defined(__need_NULL) #undef NULL #ifdef __cplusplus -# if !defined(__MINGW32__) && !defined(_MSC_VER) -# define NULL __null -# else -# define NULL 0 -# endif +#if !defined(__MINGW32__) && !defined(_MSC_VER) +#define NULL __null #else -# define NULL ((void*)0) +#define NULL 0 +#endif +#else +#define NULL ((void*)0) #endif #ifdef __cplusplus #if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED) -namespace std { typedef decltype(nullptr) nullptr_t; } +namespace std +{ +typedef decltype(nullptr) nullptr_t; +} using ::std::nullptr_t; #endif #endif @@ -49,18 +57,18 @@ using ::std::nullptr_t; /* -The following are taken verbatim from stdint.h from Clang in LLVM. Only 8/16/32/64 types are needed. +The following are taken verbatim from stdint.h from Clang in LLVM. Only 8/16/32/64 types are needed. */ // LLVM/Clang types such that we can use LLVM/Clang without headers for C++ output from Slang #ifdef __INT64_TYPE__ -# ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/ +#ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/ typedef __INT64_TYPE__ int64_t; -# endif /* __int8_t_defined */ +#endif /* __int8_t_defined */ typedef __UINT64_TYPE__ uint64_t; -# define __int_least64_t int64_t -# define __uint_least64_t uint64_t +#define __int_least64_t int64_t +#define __uint_least64_t uint64_t #endif /* __INT64_TYPE__ */ #ifdef __int_least64_t @@ -72,17 +80,17 @@ typedef __uint_least64_t uint_fast64_t; #ifdef __INT32_TYPE__ -# ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/ +#ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/ typedef __INT32_TYPE__ int32_t; -# endif /* __int8_t_defined */ +#endif /* __int8_t_defined */ -# ifndef __uint32_t_defined /* more glibc compatibility */ -# define __uint32_t_defined +#ifndef __uint32_t_defined /* more glibc compatibility */ +#define __uint32_t_defined typedef __UINT32_TYPE__ uint32_t; -# endif /* __uint32_t_defined */ +#endif /* __uint32_t_defined */ -# define __int_least32_t int32_t -# define __uint_least32_t uint32_t +#define __int_least32_t int32_t +#define __uint_least32_t uint32_t #endif /* __INT32_TYPE__ */ #ifdef __int_least32_t @@ -97,8 +105,8 @@ typedef __uint_least32_t uint_fast32_t; typedef __INT16_TYPE__ int16_t; #endif /* __int8_t_defined */ typedef __UINT16_TYPE__ uint16_t; -# define __int_least16_t int16_t -# define __uint_least16_t uint16_t +#define __int_least16_t int16_t +#define __uint_least16_t uint16_t #endif /* __INT16_TYPE__ */ #ifdef __int_least16_t @@ -109,12 +117,12 @@ typedef __uint_least16_t uint_fast16_t; #endif /* __int_least16_t */ #ifdef __INT8_TYPE__ -#ifndef __int8_t_defined /* glibc sys/types.h also defines int8_t*/ +#ifndef __int8_t_defined /* glibc sys/types.h also defines int8_t*/ typedef __INT8_TYPE__ int8_t; #endif /* __int8_t_defined */ typedef __UINT8_TYPE__ uint8_t; -# define __int_least8_t int8_t -# define __uint_least8_t uint8_t +#define __int_least8_t int8_t +#define __uint_least8_t uint8_t #endif /* __INT8_TYPE__ */ #ifdef __int_least8_t @@ -126,12 +134,12 @@ typedef __uint_least8_t uint_fast8_t; /* prevent glibc sys/types.h from defining conflicting types */ #ifndef __int8_t_defined -# define __int8_t_defined +#define __int8_t_defined #endif /* __int8_t_defined */ /* C99 7.18.1.4 Integer types capable of holding object pointers. */ -#define __stdint_join3(a,b,c) a ## b ## c +#define __stdint_join3(a, b, c) a##b##c #ifndef _INTPTR_T #ifndef __intptr_t_defined @@ -148,7 +156,7 @@ typedef __UINTPTR_TYPE__ uintptr_t; /* C99 7.18.1.5 Greatest-width integer types. */ -typedef __INTMAX_TYPE__ intmax_t; +typedef __INTMAX_TYPE__ intmax_t; typedef __UINTMAX_TYPE__ uintmax_t; /* C99 7.18.4 Macros for minimum-width integer constants. @@ -168,82 +176,82 @@ typedef __UINTMAX_TYPE__ uintmax_t; * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]). */ -#define __int_c_join(a, b) a ## b +#define __int_c_join(a, b) a##b #define __int_c(v, suffix) __int_c_join(v, suffix) #define __uint_c(v, suffix) __int_c_join(v##U, suffix) #ifdef __INT64_TYPE__ -# ifdef __INT64_C_SUFFIX__ -# define __int64_c_suffix __INT64_C_SUFFIX__ -# else -# undef __int64_c_suffix -# endif /* __INT64_C_SUFFIX__ */ +#ifdef __INT64_C_SUFFIX__ +#define __int64_c_suffix __INT64_C_SUFFIX__ +#else +#undef __int64_c_suffix +#endif /* __INT64_C_SUFFIX__ */ #endif /* __INT64_TYPE__ */ #ifdef __int_least64_t -# ifdef __int64_c_suffix -# define INT64_C(v) __int_c(v, __int64_c_suffix) -# define UINT64_C(v) __uint_c(v, __int64_c_suffix) -# else -# define INT64_C(v) v -# define UINT64_C(v) v ## U -# endif /* __int64_c_suffix */ +#ifdef __int64_c_suffix +#define INT64_C(v) __int_c(v, __int64_c_suffix) +#define UINT64_C(v) __uint_c(v, __int64_c_suffix) +#else +#define INT64_C(v) v +#define UINT64_C(v) v##U +#endif /* __int64_c_suffix */ #endif /* __int_least64_t */ #ifdef __INT32_TYPE__ -# ifdef __INT32_C_SUFFIX__ -# define __int32_c_suffix __INT32_C_SUFFIX__ +#ifdef __INT32_C_SUFFIX__ +#define __int32_c_suffix __INT32_C_SUFFIX__ #else -# undef __int32_c_suffix -# endif /* __INT32_C_SUFFIX__ */ +#undef __int32_c_suffix +#endif /* __INT32_C_SUFFIX__ */ #endif /* __INT32_TYPE__ */ #ifdef __int_least32_t -# ifdef __int32_c_suffix -# define INT32_C(v) __int_c(v, __int32_c_suffix) -# define UINT32_C(v) __uint_c(v, __int32_c_suffix) -# else -# define INT32_C(v) v -# define UINT32_C(v) v ## U -# endif /* __int32_c_suffix */ +#ifdef __int32_c_suffix +#define INT32_C(v) __int_c(v, __int32_c_suffix) +#define UINT32_C(v) __uint_c(v, __int32_c_suffix) +#else +#define INT32_C(v) v +#define UINT32_C(v) v##U +#endif /* __int32_c_suffix */ #endif /* __int_least32_t */ #ifdef __INT16_TYPE__ -# ifdef __INT16_C_SUFFIX__ -# define __int16_c_suffix __INT16_C_SUFFIX__ +#ifdef __INT16_C_SUFFIX__ +#define __int16_c_suffix __INT16_C_SUFFIX__ #else -# undef __int16_c_suffix -# endif /* __INT16_C_SUFFIX__ */ +#undef __int16_c_suffix +#endif /* __INT16_C_SUFFIX__ */ #endif /* __INT16_TYPE__ */ #ifdef __int_least16_t -# ifdef __int16_c_suffix -# define INT16_C(v) __int_c(v, __int16_c_suffix) -# define UINT16_C(v) __uint_c(v, __int16_c_suffix) -# else -# define INT16_C(v) v -# define UINT16_C(v) v ## U -# endif /* __int16_c_suffix */ +#ifdef __int16_c_suffix +#define INT16_C(v) __int_c(v, __int16_c_suffix) +#define UINT16_C(v) __uint_c(v, __int16_c_suffix) +#else +#define INT16_C(v) v +#define UINT16_C(v) v##U +#endif /* __int16_c_suffix */ #endif /* __int_least16_t */ #ifdef __INT8_TYPE__ -# ifdef __INT8_C_SUFFIX__ -# define __int8_c_suffix __INT8_C_SUFFIX__ +#ifdef __INT8_C_SUFFIX__ +#define __int8_c_suffix __INT8_C_SUFFIX__ #else -# undef __int8_c_suffix -# endif /* __INT8_C_SUFFIX__ */ +#undef __int8_c_suffix +#endif /* __INT8_C_SUFFIX__ */ #endif /* __INT8_TYPE__ */ #ifdef __int_least8_t -# ifdef __int8_c_suffix -# define INT8_C(v) __int_c(v, __int8_c_suffix) -# define UINT8_C(v) __uint_c(v, __int8_c_suffix) -# else -# define INT8_C(v) v -# define UINT8_C(v) v ## U -# endif /* __int8_c_suffix */ +#ifdef __int8_c_suffix +#define INT8_C(v) __int_c(v, __int8_c_suffix) +#define UINT8_C(v) __uint_c(v, __int8_c_suffix) +#else +#define INT8_C(v) v +#define UINT8_C(v) v##U +#endif /* __int8_c_suffix */ #endif /* __int_least8_t */ /* C99 7.18.2.1 Limits of exact-width integer types. @@ -266,133 +274,131 @@ typedef __UINTMAX_TYPE__ uintmax_t; */ #ifdef __INT64_TYPE__ -# define INT64_MAX INT64_C( 9223372036854775807) -# define INT64_MIN (-INT64_C( 9223372036854775807)-1) -# define UINT64_MAX UINT64_C(18446744073709551615) -# define __INT_LEAST64_MIN INT64_MIN -# define __INT_LEAST64_MAX INT64_MAX -# define __UINT_LEAST64_MAX UINT64_MAX +#define INT64_MAX INT64_C(9223372036854775807) +#define INT64_MIN (-INT64_C(9223372036854775807) - 1) +#define UINT64_MAX UINT64_C(18446744073709551615) +#define __INT_LEAST64_MIN INT64_MIN +#define __INT_LEAST64_MAX INT64_MAX +#define __UINT_LEAST64_MAX UINT64_MAX #endif /* __INT64_TYPE__ */ #ifdef __INT_LEAST64_MIN -# define INT_LEAST64_MIN __INT_LEAST64_MIN -# define INT_LEAST64_MAX __INT_LEAST64_MAX -# define UINT_LEAST64_MAX __UINT_LEAST64_MAX -# define INT_FAST64_MIN __INT_LEAST64_MIN -# define INT_FAST64_MAX __INT_LEAST64_MAX -# define UINT_FAST64_MAX __UINT_LEAST64_MAX +#define INT_LEAST64_MIN __INT_LEAST64_MIN +#define INT_LEAST64_MAX __INT_LEAST64_MAX +#define UINT_LEAST64_MAX __UINT_LEAST64_MAX +#define INT_FAST64_MIN __INT_LEAST64_MIN +#define INT_FAST64_MAX __INT_LEAST64_MAX +#define UINT_FAST64_MAX __UINT_LEAST64_MAX #endif /* __INT_LEAST64_MIN */ #ifdef __INT32_TYPE__ -# define INT32_MAX INT32_C(2147483647) -# define INT32_MIN (-INT32_C(2147483647)-1) -# define UINT32_MAX UINT32_C(4294967295) -# define __INT_LEAST32_MIN INT32_MIN -# define __INT_LEAST32_MAX INT32_MAX -# define __UINT_LEAST32_MAX UINT32_MAX +#define INT32_MAX INT32_C(2147483647) +#define INT32_MIN (-INT32_C(2147483647) - 1) +#define UINT32_MAX UINT32_C(4294967295) +#define __INT_LEAST32_MIN INT32_MIN +#define __INT_LEAST32_MAX INT32_MAX +#define __UINT_LEAST32_MAX UINT32_MAX #endif /* __INT32_TYPE__ */ #ifdef __INT_LEAST32_MIN -# define INT_LEAST32_MIN __INT_LEAST32_MIN -# define INT_LEAST32_MAX __INT_LEAST32_MAX -# define UINT_LEAST32_MAX __UINT_LEAST32_MAX -# define INT_FAST32_MIN __INT_LEAST32_MIN -# define INT_FAST32_MAX __INT_LEAST32_MAX -# define UINT_FAST32_MAX __UINT_LEAST32_MAX +#define INT_LEAST32_MIN __INT_LEAST32_MIN +#define INT_LEAST32_MAX __INT_LEAST32_MAX +#define UINT_LEAST32_MAX __UINT_LEAST32_MAX +#define INT_FAST32_MIN __INT_LEAST32_MIN +#define INT_FAST32_MAX __INT_LEAST32_MAX +#define UINT_FAST32_MAX __UINT_LEAST32_MAX #endif /* __INT_LEAST32_MIN */ #ifdef __INT16_TYPE__ -#define INT16_MAX INT16_C(32767) -#define INT16_MIN (-INT16_C(32767)-1) -#define UINT16_MAX UINT16_C(65535) -# define __INT_LEAST16_MIN INT16_MIN -# define __INT_LEAST16_MAX INT16_MAX -# define __UINT_LEAST16_MAX UINT16_MAX +#define INT16_MAX INT16_C(32767) +#define INT16_MIN (-INT16_C(32767) - 1) +#define UINT16_MAX UINT16_C(65535) +#define __INT_LEAST16_MIN INT16_MIN +#define __INT_LEAST16_MAX INT16_MAX +#define __UINT_LEAST16_MAX UINT16_MAX #endif /* __INT16_TYPE__ */ #ifdef __INT_LEAST16_MIN -# define INT_LEAST16_MIN __INT_LEAST16_MIN -# define INT_LEAST16_MAX __INT_LEAST16_MAX -# define UINT_LEAST16_MAX __UINT_LEAST16_MAX -# define INT_FAST16_MIN __INT_LEAST16_MIN -# define INT_FAST16_MAX __INT_LEAST16_MAX -# define UINT_FAST16_MAX __UINT_LEAST16_MAX +#define INT_LEAST16_MIN __INT_LEAST16_MIN +#define INT_LEAST16_MAX __INT_LEAST16_MAX +#define UINT_LEAST16_MAX __UINT_LEAST16_MAX +#define INT_FAST16_MIN __INT_LEAST16_MIN +#define INT_FAST16_MAX __INT_LEAST16_MAX +#define UINT_FAST16_MAX __UINT_LEAST16_MAX #endif /* __INT_LEAST16_MIN */ #ifdef __INT8_TYPE__ -# define INT8_MAX INT8_C(127) -# define INT8_MIN (-INT8_C(127)-1) -# define UINT8_MAX UINT8_C(255) -# define __INT_LEAST8_MIN INT8_MIN -# define __INT_LEAST8_MAX INT8_MAX -# define __UINT_LEAST8_MAX UINT8_MAX +#define INT8_MAX INT8_C(127) +#define INT8_MIN (-INT8_C(127) - 1) +#define UINT8_MAX UINT8_C(255) +#define __INT_LEAST8_MIN INT8_MIN +#define __INT_LEAST8_MAX INT8_MAX +#define __UINT_LEAST8_MAX UINT8_MAX #endif /* __INT8_TYPE__ */ #ifdef __INT_LEAST8_MIN -# define INT_LEAST8_MIN __INT_LEAST8_MIN -# define INT_LEAST8_MAX __INT_LEAST8_MAX -# define UINT_LEAST8_MAX __UINT_LEAST8_MAX -# define INT_FAST8_MIN __INT_LEAST8_MIN -# define INT_FAST8_MAX __INT_LEAST8_MAX -# define UINT_FAST8_MAX __UINT_LEAST8_MAX +#define INT_LEAST8_MIN __INT_LEAST8_MIN +#define INT_LEAST8_MAX __INT_LEAST8_MAX +#define UINT_LEAST8_MAX __UINT_LEAST8_MAX +#define INT_FAST8_MIN __INT_LEAST8_MIN +#define INT_FAST8_MAX __INT_LEAST8_MAX +#define UINT_FAST8_MAX __UINT_LEAST8_MAX #endif /* __INT_LEAST8_MIN */ /* Some utility macros */ -#define __INTN_MIN(n) __stdint_join3( INT, n, _MIN) -#define __INTN_MAX(n) __stdint_join3( INT, n, _MAX) -#define __UINTN_MAX(n) __stdint_join3(UINT, n, _MAX) -#define __INTN_C(n, v) __stdint_join3( INT, n, _C(v)) +#define __INTN_MIN(n) __stdint_join3(INT, n, _MIN) +#define __INTN_MAX(n) __stdint_join3(INT, n, _MAX) +#define __UINTN_MAX(n) __stdint_join3(UINT, n, _MAX) +#define __INTN_C(n, v) __stdint_join3(INT, n, _C(v)) #define __UINTN_C(n, v) __stdint_join3(UINT, n, _C(v)) /* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */ /* C99 7.18.3 Limits of other integer types. */ -#define INTPTR_MIN (-__INTPTR_MAX__-1) -#define INTPTR_MAX __INTPTR_MAX__ -#define UINTPTR_MAX __UINTPTR_MAX__ -#define PTRDIFF_MIN (-__PTRDIFF_MAX__-1) -#define PTRDIFF_MAX __PTRDIFF_MAX__ -#define SIZE_MAX __SIZE_MAX__ +#define INTPTR_MIN (-__INTPTR_MAX__ - 1) +#define INTPTR_MAX __INTPTR_MAX__ +#define UINTPTR_MAX __UINTPTR_MAX__ +#define PTRDIFF_MIN (-__PTRDIFF_MAX__ - 1) +#define PTRDIFF_MAX __PTRDIFF_MAX__ +#define SIZE_MAX __SIZE_MAX__ /* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__ * is enabled. */ #if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1 -#define RSIZE_MAX (SIZE_MAX >> 1) +#define RSIZE_MAX (SIZE_MAX >> 1) #endif /* C99 7.18.2.5 Limits of greatest-width integer types. */ -#define INTMAX_MIN (-__INTMAX_MAX__-1) -#define INTMAX_MAX __INTMAX_MAX__ -#define UINTMAX_MAX __UINTMAX_MAX__ +#define INTMAX_MIN (-__INTMAX_MAX__ - 1) +#define INTMAX_MAX __INTMAX_MAX__ +#define UINTMAX_MAX __UINTMAX_MAX__ /* C99 7.18.3 Limits of other integer types. */ #define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__) #define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__) #ifdef __WINT_UNSIGNED__ -# define WINT_MIN __UINTN_C(__WINT_WIDTH__, 0) -# define WINT_MAX __UINTN_MAX(__WINT_WIDTH__) +#define WINT_MIN __UINTN_C(__WINT_WIDTH__, 0) +#define WINT_MAX __UINTN_MAX(__WINT_WIDTH__) #else -# define WINT_MIN __INTN_MIN(__WINT_WIDTH__) -# define WINT_MAX __INTN_MAX(__WINT_WIDTH__) +#define WINT_MIN __INTN_MIN(__WINT_WIDTH__) +#define WINT_MAX __INTN_MAX(__WINT_WIDTH__) #endif #ifndef WCHAR_MAX -# define WCHAR_MAX __WCHAR_MAX__ +#define WCHAR_MAX __WCHAR_MAX__ #endif #ifndef WCHAR_MIN -# if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__) -# define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__) -# else -# define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0) -# endif +#if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__) +#define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__) +#else +#define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0) +#endif #endif /* 7.18.4.2 Macros for greatest-width integer constants. */ -#define INTMAX_C(v) __int_c(v, __INTMAX_C_SUFFIX__) +#define INTMAX_C(v) __int_c(v, __INTMAX_C_SUFFIX__) #define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__) #endif // SLANG_LLVM_H - - diff --git a/prelude/slang-torch-prelude.h b/prelude/slang-torch-prelude.h index 11ffe3b66..d303c1045 100644 --- a/prelude/slang-torch-prelude.h +++ b/prelude/slang-torch-prelude.h @@ -1,64 +1,67 @@ // Prelude for PyTorch cpp binding. -#include <torch/extension.h> #include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAUtils.h> -#include <vector> #include <stdexcept> #include <string> +#include <torch/extension.h> +#include <vector> #ifdef SLANG_LLVM #include "slang-llvm.h" #else // SLANG_LLVM -# if SLANG_GCC_FAMILY && __GNUC__ < 6 -# include <cmath> -# define SLANG_PRELUDE_STD std:: -# else -# include <math.h> -# define SLANG_PRELUDE_STD -# endif - -# include <assert.h> -# include <stdlib.h> -# include <string.h> -# include <stdint.h> +#if SLANG_GCC_FAMILY && __GNUC__ < 6 +#include <cmath> +#define SLANG_PRELUDE_STD std:: +#else +#include <math.h> +#define SLANG_PRELUDE_STD +#endif + +#include <assert.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> #endif // SLANG_LLVM #include "../source/core/slang-string.h" #if defined(_MSC_VER) -# define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport) +#define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport) #else -# define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default"))) -//# define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport)) __attribute__((__visibility__("default"))) -#endif - -#ifdef __cplusplus -# define SLANG_PRELUDE_EXTERN_C extern "C" -# define SLANG_PRELUDE_EXTERN_C_START extern "C" { -# define SLANG_PRELUDE_EXTERN_C_END } +#define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default"))) +// # define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport)) +// __attribute__((__visibility__("default"))) +#endif + +#ifdef __cplusplus +#define SLANG_PRELUDE_EXTERN_C extern "C" +#define SLANG_PRELUDE_EXTERN_C_START \ + extern "C" \ + { +#define SLANG_PRELUDE_EXTERN_C_END } #else -# define SLANG_PRELUDE_EXTERN_C -# define SLANG_PRELUDE_EXTERN_C_START -# define SLANG_PRELUDE_EXTERN_C_END -#endif +#define SLANG_PRELUDE_EXTERN_C +#define SLANG_PRELUDE_EXTERN_C_START +#define SLANG_PRELUDE_EXTERN_C_END +#endif #define SLANG_PRELUDE_NAMESPACE #ifndef SLANG_NO_THROW -# define SLANG_NO_THROW +#define SLANG_NO_THROW #endif #ifndef SLANG_STDCALL -# define SLANG_STDCALL +#define SLANG_STDCALL #endif #ifndef SLANG_MCALL -# define SLANG_MCALL SLANG_STDCALL +#define SLANG_MCALL SLANG_STDCALL #endif #ifndef SLANG_FORCE_INLINE -# define SLANG_FORCE_INLINE inline +#define SLANG_FORCE_INLINE inline #endif -#include "slang-cpp-types-core.h" #include "slang-cpp-scalar-intrinsics.h" +#include "slang-cpp-types-core.h" static const int kSlangTorchTensorMaxDim = 5; @@ -72,20 +75,26 @@ struct TensorView }; -TensorView make_tensor_view(torch::Tensor val, const char* name, torch::ScalarType targetScalarType, bool requireContiguous) +TensorView make_tensor_view( + torch::Tensor val, + const char* name, + torch::ScalarType targetScalarType, + bool requireContiguous) { // We're currently not trying to implicitly cast or transfer to device for two reasons: // 1. There appears to be a bug with .to() where successive calls after the first one fail. - // 2. Silent casts like this can cause large memory allocations & unexpected overheads. + // 2. Silent casts like this can cause large memory allocations & unexpected overheads. // It's better to be explicit. // Expect tensors to be on CUDA device if (!val.device().is_cuda()) - throw std::runtime_error(std::string(name).append(": tensor is not on CUDA device.").c_str()); + throw std::runtime_error( + std::string(name).append(": tensor is not on CUDA device.").c_str()); // Expect tensors to be the right type. if (val.dtype() != targetScalarType) - throw std::runtime_error(std::string(name).append(": tensor is not of the expected type.").c_str()); + throw std::runtime_error( + std::string(name).append(": tensor is not of the expected type.").c_str()); // Check that the tensor is contiguous if (requireContiguous && !val.is_contiguous()) @@ -138,14 +147,22 @@ TensorView make_tensor_view(torch::Tensor val, const char* name, torch::ScalarTy } if (val.dim() > kSlangTorchTensorMaxDim) - throw std::runtime_error(std::string(name).append(": number of dimensions exceeds limit (").append(std::to_string(kSlangTorchTensorMaxDim)).append(")").c_str()); + throw std::runtime_error(std::string(name) + .append(": number of dimensions exceeds limit (") + .append(std::to_string(kSlangTorchTensorMaxDim)) + .append(")") + .c_str()); bool isEmpty = true; for (int i = 0; i < val.dim(); ++i) { res.strides[i] = val.stride(i) * elementSize; if (res.strides[i] == 0) - throw std::runtime_error(std::string(name).append(": tensors with broadcasted dimensions are not supported (use tensor.contiguous() to make tensor whole)").c_str()); + throw std::runtime_error( + std::string(name) + .append(": tensors with broadcasted dimensions are not supported (use " + "tensor.contiguous() to make tensor whole)") + .c_str()); res.sizes[i] = val.size(i); if (res.sizes[i] > 0) |
