format

* format
* Minor test fixes
* enable checking cpp format in ci
author: Ellie Hermaszewska <ellieh@nvidia.com> 2024-10-29 14:49:26 +0800
committer: GitHub <noreply@github.com> 2024-10-29 14:49:26 +0800
commit: f65d756bff8d4c5cbc15bd0322a2ae8e6b896a21 (patch)
tree: ea1d61342cd29368e19135000ec2948813096205 /prelude
parent: a729c15e9dce9f5116a38afc66329ab2ca4cea54 (diff)
9 files changed, 3741 insertions, 1954 deletions
diff --git a/prelude/slang-cpp-host-prelude.h b/prelude/slang-cpp-host-prelude.h
index 48056169d..8bc0f5cad 100644
--- a/prelude/slang-cpp-host-prelude.h
+++ b/prelude/slang-cpp-host-prelude.h
@@ -1,8 +1,8 @@
 #ifndef SLANG_CPP_HOST_PRELUDE_H
 #define SLANG_CPP_HOST_PRELUDE_H
 
-#include <cstdio>
 #include <cmath>
+#include <cstdio>
 #include <cstring>
 
 #define SLANG_COM_PTR_ENABLE_REF_OPERATOR 1
@@ -14,42 +14,45 @@
 #ifdef SLANG_LLVM
 #include "slang-llvm.h"
 #else // SLANG_LLVM
-#   if SLANG_GCC_FAMILY && __GNUC__ < 6
-#       include <cmath>
-#       define SLANG_PRELUDE_STD std::
-#   else
-#       include <math.h>
-#       define SLANG_PRELUDE_STD
-#   endif
-
-#   include <assert.h>
-#   include <stdlib.h>
-#   include <string.h>
-#   include <stdint.h>
+#if SLANG_GCC_FAMILY && __GNUC__ < 6
+#include <cmath>
+#define SLANG_PRELUDE_STD std::
+#else
+#include <math.h>
+#define SLANG_PRELUDE_STD
+#endif
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
 #endif // SLANG_LLVM
 
 #if defined(_MSC_VER)
-#   define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport)
+#define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport)
 #else
-#   define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default")))
-//#   define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport)) __attribute__((__visibility__("default")))
-#endif    
-
-#ifdef __cplusplus    
-#   define SLANG_PRELUDE_EXTERN_C extern "C"
-#   define SLANG_PRELUDE_EXTERN_C_START extern "C" {
-#   define SLANG_PRELUDE_EXTERN_C_END }
+#define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default")))
+// #   define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport))
+// __attribute__((__visibility__("default")))
+#endif
+
+#ifdef __cplusplus
+#define SLANG_PRELUDE_EXTERN_C extern "C"
+#define SLANG_PRELUDE_EXTERN_C_START \
+    extern "C"                       \
+    {
+#define SLANG_PRELUDE_EXTERN_C_END }
 #else
-#   define SLANG_PRELUDE_EXTERN_C 
-#   define SLANG_PRELUDE_EXTERN_C_START
-#   define SLANG_PRELUDE_EXTERN_C_END 
-#endif    
+#define SLANG_PRELUDE_EXTERN_C
+#define SLANG_PRELUDE_EXTERN_C_START
+#define SLANG_PRELUDE_EXTERN_C_END
+#endif
 
 #include "slang-cpp-scalar-intrinsics.h"
 
 using namespace Slang;
 
 template<typename TResult, typename... Args>
-using Slang_FuncType = TResult(SLANG_MCALL *)(Args...);
+using Slang_FuncType = TResult(SLANG_MCALL*)(Args...);
 
 #endif
diff --git a/prelude/slang-cpp-prelude.h b/prelude/slang-cpp-prelude.h
index 2b848dc3b..4dacac9c5 100644
--- a/prelude/slang-cpp-prelude.h
+++ b/prelude/slang-cpp-prelude.h
@@ -2,42 +2,45 @@
 #define SLANG_CPP_PRELUDE_H
 
 // Because the signiture of isnan, isfinite, and is isinf changed in C++, we use the macro
-// to use the version in the std namespace. 
+// to use the version in the std namespace.
 // https://stackoverflow.com/questions/39130040/cmath-hides-isnan-in-math-h-in-c14-c11
- 
+
 #ifdef SLANG_LLVM
 #include "slang-llvm.h"
 #else // SLANG_LLVM
-#   if SLANG_GCC_FAMILY && __GNUC__ < 6
-#       include <cmath>
-#       define SLANG_PRELUDE_STD std::
-#   else
-#       include <math.h>
-#       define SLANG_PRELUDE_STD
-#   endif
-
-#   include <assert.h>
-#   include <stdlib.h>
-#   include <string.h>
-#   include <stdint.h>
+#if SLANG_GCC_FAMILY && __GNUC__ < 6
+#include <cmath>
+#define SLANG_PRELUDE_STD std::
+#else
+#include <math.h>
+#define SLANG_PRELUDE_STD
+#endif
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
 #endif // SLANG_LLVM
 
 #if defined(_MSC_VER)
-#   define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport)
+#define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport)
 #else
-#   define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default")))
-//#   define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport)) __attribute__((__visibility__("default")))
-#endif    
-
-#ifdef __cplusplus    
-#   define SLANG_PRELUDE_EXTERN_C extern "C"
-#   define SLANG_PRELUDE_EXTERN_C_START extern "C" {
-#   define SLANG_PRELUDE_EXTERN_C_END }
+#define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default")))
+// #   define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport))
+// __attribute__((__visibility__("default")))
+#endif
+
+#ifdef __cplusplus
+#define SLANG_PRELUDE_EXTERN_C extern "C"
+#define SLANG_PRELUDE_EXTERN_C_START \
+    extern "C"                       \
+    {
+#define SLANG_PRELUDE_EXTERN_C_END }
 #else
-#   define SLANG_PRELUDE_EXTERN_C 
-#   define SLANG_PRELUDE_EXTERN_C_START
-#   define SLANG_PRELUDE_EXTERN_C_END 
-#endif    
+#define SLANG_PRELUDE_EXTERN_C
+#define SLANG_PRELUDE_EXTERN_C_START
+#define SLANG_PRELUDE_EXTERN_C_END
+#endif
 
 #define SLANG_PRELUDE_EXPORT SLANG_PRELUDE_EXTERN_C SLANG_PRELUDE_SHARED_LIB_EXPORT
 #define SLANG_PRELUDE_EXPORT_START SLANG_PRELUDE_EXTERN_C_START SLANG_PRELUDE_SHARED_LIB_EXPORT
@@ -45,65 +48,65 @@
 
 #ifndef INFINITY
 // Must overflow for double
-#   define INFINITY float(1e+300 * 1e+300)
+#define INFINITY float(1e+300 * 1e+300)
 #endif
 
 #ifndef SLANG_INFINITY
-#   define SLANG_INFINITY   INFINITY
+#define SLANG_INFINITY INFINITY
 #endif
 
 // Detect the compiler type
 
 #ifndef SLANG_COMPILER
-#    define SLANG_COMPILER
+#define SLANG_COMPILER
 
 /*
 Compiler defines, see http://sourceforge.net/p/predef/wiki/Compilers/
 NOTE that SLANG_VC holds the compiler version - not just 1 or 0
 */
-#    if defined(_MSC_VER)
-#        if _MSC_VER >= 1900
-#            define SLANG_VC 14
-#        elif _MSC_VER >= 1800
-#            define SLANG_VC 12
-#        elif _MSC_VER >= 1700
-#            define SLANG_VC 11
-#        elif _MSC_VER >= 1600
-#            define SLANG_VC 10
-#        elif _MSC_VER >= 1500
-#            define SLANG_VC 9
-#        else
-#            error "unknown version of Visual C++ compiler"
-#        endif
-#    elif defined(__clang__)
-#        define SLANG_CLANG 1
-#    elif defined(__SNC__)
-#        define SLANG_SNC 1
-#    elif defined(__ghs__)
-#        define SLANG_GHS 1
-#    elif defined(__GNUC__) /* note: __clang__, __SNC__, or __ghs__ imply __GNUC__ */
-#        define SLANG_GCC 1
-#    else
-#        error "unknown compiler"
-#    endif
+#if defined(_MSC_VER)
+#if _MSC_VER >= 1900
+#define SLANG_VC 14
+#elif _MSC_VER >= 1800
+#define SLANG_VC 12
+#elif _MSC_VER >= 1700
+#define SLANG_VC 11
+#elif _MSC_VER >= 1600
+#define SLANG_VC 10
+#elif _MSC_VER >= 1500
+#define SLANG_VC 9
+#else
+#error "unknown version of Visual C++ compiler"
+#endif
+#elif defined(__clang__)
+#define SLANG_CLANG 1
+#elif defined(__SNC__)
+#define SLANG_SNC 1
+#elif defined(__ghs__)
+#define SLANG_GHS 1
+#elif defined(__GNUC__) /* note: __clang__, __SNC__, or __ghs__ imply __GNUC__ */
+#define SLANG_GCC 1
+#else
+#error "unknown compiler"
+#endif
 /*
 Any compilers not detected by the above logic are now now explicitly zeroed out.
 */
-#    ifndef SLANG_VC
-#        define SLANG_VC 0
-#    endif
-#    ifndef SLANG_CLANG
-#        define SLANG_CLANG 0
-#    endif
-#    ifndef SLANG_SNC
-#        define SLANG_SNC 0
-#    endif
-#    ifndef SLANG_GHS
-#        define SLANG_GHS 0
-#    endif
-#    ifndef SLANG_GCC
-#        define SLANG_GCC 0
-#    endif
+#ifndef SLANG_VC
+#define SLANG_VC 0
+#endif
+#ifndef SLANG_CLANG
+#define SLANG_CLANG 0
+#endif
+#ifndef SLANG_SNC
+#define SLANG_SNC 0
+#endif
+#ifndef SLANG_GHS
+#define SLANG_GHS 0
+#endif
+#ifndef SLANG_GCC
+#define SLANG_GCC 0
+#endif
 #endif /* SLANG_COMPILER */
 
 /*
@@ -116,89 +119,90 @@ used later in the file.
 Most applications should not need to touch this section.
 */
 #ifndef SLANG_PLATFORM
-#    define SLANG_PLATFORM
+#define SLANG_PLATFORM
 /**
 Operating system defines, see http://sourceforge.net/p/predef/wiki/OperatingSystems/
 */
-#    if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_PARTITION_APP
-#        define SLANG_WINRT 1 /* Windows Runtime, either on Windows RT or Windows 8 */
-#    elif defined(XBOXONE)
-#        define SLANG_XBOXONE 1
-#    elif defined(_WIN64) /* note: XBOXONE implies _WIN64 */
-#        define SLANG_WIN64 1
-#    elif defined(_M_PPC)
-#        define SLANG_X360 1
-#    elif defined(_WIN32) /* note: _M_PPC implies _WIN32 */
-#        define SLANG_WIN32 1
-#    elif defined(__ANDROID__)
-#        define SLANG_ANDROID 1
-#    elif defined(__linux__) || defined(__CYGWIN__) /* note: __ANDROID__ implies __linux__ */
-#        define SLANG_LINUX 1
-#    elif defined(__APPLE__) && !defined(SLANG_LLVM)
-#        include "TargetConditionals.h"
-#        if TARGET_OS_MAC
-#            define SLANG_OSX 1
-#        else
-#            define SLANG_IOS 1
-#        endif
-#    elif defined(__APPLE__)
-// On `slang-llvm` we can't inclue "TargetConditionals.h" in general, so for now assume its OSX.
-#       define SLANG_OSX 1
-#    elif defined(__CELLOS_LV2__)
-#        define SLANG_PS3 1
-#    elif defined(__ORBIS__)
-#        define SLANG_PS4 1
-#    elif defined(__SNC__) && defined(__arm__)
-#        define SLANG_PSP2 1
-#    elif defined(__ghs__)
-#        define SLANG_WIIU 1
-#    else
-#        error "unknown target platform"
-#    endif
+#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_PARTITION_APP
+#define SLANG_WINRT 1 /* Windows Runtime, either on Windows RT or Windows 8 */
+#elif defined(XBOXONE)
+#define SLANG_XBOXONE 1
+#elif defined(_WIN64) /* note: XBOXONE implies _WIN64 */
+#define SLANG_WIN64 1
+#elif defined(_M_PPC)
+#define SLANG_X360 1
+#elif defined(_WIN32) /* note: _M_PPC implies _WIN32 */
+#define SLANG_WIN32 1
+#elif defined(__ANDROID__)
+#define SLANG_ANDROID 1
+#elif defined(__linux__) || defined(__CYGWIN__) /* note: __ANDROID__ implies __linux__ */
+#define SLANG_LINUX 1
+#elif defined(__APPLE__) && !defined(SLANG_LLVM)
+#include "TargetConditionals.h"
+#if TARGET_OS_MAC
+#define SLANG_OSX 1
+#else
+#define SLANG_IOS 1
+#endif
+#elif defined(__APPLE__)
+// On `slang-llvm` we can't include "TargetConditionals.h" in general, so for now assume it's
+// OSX.
+#define SLANG_OSX 1
+#elif defined(__CELLOS_LV2__)
+#define SLANG_PS3 1
+#elif defined(__ORBIS__)
+#define SLANG_PS4 1
+#elif defined(__SNC__) && defined(__arm__)
+#define SLANG_PSP2 1
+#elif defined(__ghs__)
+#define SLANG_WIIU 1
+#else
+#error "unknown target platform"
+#endif
 
 
 /*
 Any platforms not detected by the above logic are now now explicitly zeroed out.
 */
-#    ifndef SLANG_WINRT
-#        define SLANG_WINRT 0
-#    endif
-#    ifndef SLANG_XBOXONE
-#        define SLANG_XBOXONE 0
-#    endif
-#    ifndef SLANG_WIN64
-#        define SLANG_WIN64 0
-#    endif
-#    ifndef SLANG_X360
-#        define SLANG_X360 0
-#    endif
-#    ifndef SLANG_WIN32
-#        define SLANG_WIN32 0
-#    endif
-#    ifndef SLANG_ANDROID
-#        define SLANG_ANDROID 0
-#    endif
-#    ifndef SLANG_LINUX
-#        define SLANG_LINUX 0
-#    endif
-#    ifndef SLANG_IOS
-#        define SLANG_IOS 0
-#    endif
-#    ifndef SLANG_OSX
-#        define SLANG_OSX 0
-#    endif
-#    ifndef SLANG_PS3
-#        define SLANG_PS3 0
-#    endif
-#    ifndef SLANG_PS4
-#        define SLANG_PS4 0
-#    endif
-#    ifndef SLANG_PSP2
-#        define SLANG_PSP2 0
-#    endif
-#    ifndef SLANG_WIIU
-#        define SLANG_WIIU 0
-#    endif
+#ifndef SLANG_WINRT
+#define SLANG_WINRT 0
+#endif
+#ifndef SLANG_XBOXONE
+#define SLANG_XBOXONE 0
+#endif
+#ifndef SLANG_WIN64
+#define SLANG_WIN64 0
+#endif
+#ifndef SLANG_X360
+#define SLANG_X360 0
+#endif
+#ifndef SLANG_WIN32
+#define SLANG_WIN32 0
+#endif
+#ifndef SLANG_ANDROID
+#define SLANG_ANDROID 0
+#endif
+#ifndef SLANG_LINUX
+#define SLANG_LINUX 0
+#endif
+#ifndef SLANG_IOS
+#define SLANG_IOS 0
+#endif
+#ifndef SLANG_OSX
+#define SLANG_OSX 0
+#endif
+#ifndef SLANG_PS3
+#define SLANG_PS3 0
+#endif
+#ifndef SLANG_PS4
+#define SLANG_PS4 0
+#endif
+#ifndef SLANG_PSP2
+#define SLANG_PSP2 0
+#endif
+#ifndef SLANG_WIIU
+#define SLANG_WIIU 0
+#endif
 #endif /* SLANG_PLATFORM */
 
 /* Shorthands for "families" of compilers/platforms */
@@ -206,37 +210,38 @@ Any platforms not detected by the above logic are now now explicitly zeroed out.
 #define SLANG_WINDOWS_FAMILY (SLANG_WINRT || SLANG_WIN32 || SLANG_WIN64)
 #define SLANG_MICROSOFT_FAMILY (SLANG_XBOXONE || SLANG_X360 || SLANG_WINDOWS_FAMILY)
 #define SLANG_LINUX_FAMILY (SLANG_LINUX || SLANG_ANDROID)
-#define SLANG_APPLE_FAMILY (SLANG_IOS || SLANG_OSX)                  /* equivalent to #if __APPLE__ */
-#define SLANG_UNIX_FAMILY (SLANG_LINUX_FAMILY || SLANG_APPLE_FAMILY) /* shortcut for unix/posix platforms */
+#define SLANG_APPLE_FAMILY (SLANG_IOS || SLANG_OSX) /* equivalent to #if __APPLE__ */
+#define SLANG_UNIX_FAMILY \
+    (SLANG_LINUX_FAMILY || SLANG_APPLE_FAMILY) /* shortcut for unix/posix platforms */
 
 // GCC Specific
 #if SLANG_GCC_FAMILY
-#	define SLANG_ALIGN_OF(T)	__alignof__(T)
+#define SLANG_ALIGN_OF(T) __alignof__(T)
 
-#   define SLANG_BREAKPOINT(id) __builtin_trap()
+#define SLANG_BREAKPOINT(id) __builtin_trap()
 
-// Use this macro instead of offsetof, because gcc produces warning if offsetof is used on a 
+// Use this macro instead of offsetof, because gcc produces warning if offsetof is used on a
 // non POD type, even though it produces the correct result
-#   define SLANG_OFFSET_OF(T, ELEMENT) (size_t(&((T*)1)->ELEMENT) - 1)
+#define SLANG_OFFSET_OF(T, ELEMENT) (size_t(&((T*)1)->ELEMENT) - 1)
 #endif // SLANG_GCC_FAMILY
 
 // Microsoft VC specific
 #if SLANG_VC
-#   define SLANG_ALIGN_OF(T) __alignof(T)
+#define SLANG_ALIGN_OF(T) __alignof(T)
 
-#	define SLANG_BREAKPOINT(id) __debugbreak();
+#define SLANG_BREAKPOINT(id) __debugbreak();
 
 #endif // SLANG_VC
 
 // Default impls
 
 #ifndef SLANG_OFFSET_OF
-#   define SLANG_OFFSET_OF(X, Y) offsetof(X, Y)
+#define SLANG_OFFSET_OF(X, Y) offsetof(X, Y)
 #endif
 
 #ifndef SLANG_BREAKPOINT
 // Make it crash with a write to 0!
-#   define SLANG_BREAKPOINT(id) (*((int*)0) = int(id));
+#define SLANG_BREAKPOINT(id) (*((int*)0) = int(id));
 #endif
 
 // If slang.h has been included we don't need any of these definitions
@@ -244,33 +249,33 @@ Any platforms not detected by the above logic are now now explicitly zeroed out.
 
 /* Macro for declaring if a method is no throw. Should be set before the return parameter. */
 #ifndef SLANG_NO_THROW
-#   if SLANG_WINDOWS_FAMILY && !defined(SLANG_DISABLE_EXCEPTIONS)
-#       define SLANG_NO_THROW __declspec(nothrow)
-#   endif
+#if SLANG_WINDOWS_FAMILY && !defined(SLANG_DISABLE_EXCEPTIONS)
+#define SLANG_NO_THROW __declspec(nothrow)
+#endif
 #endif
 #ifndef SLANG_NO_THROW
-#   define SLANG_NO_THROW
+#define SLANG_NO_THROW
 #endif
 
 /* The `SLANG_STDCALL` and `SLANG_MCALL` defines are used to set the calling
 convention for interface methods.
 */
 #ifndef SLANG_STDCALL
-#   if SLANG_MICROSOFT_FAMILY
-#       define SLANG_STDCALL __stdcall
-#   else
-#       define SLANG_STDCALL
-#   endif
+#if SLANG_MICROSOFT_FAMILY
+#define SLANG_STDCALL __stdcall
+#else
+#define SLANG_STDCALL
+#endif
 #endif
 #ifndef SLANG_MCALL
-#   define SLANG_MCALL SLANG_STDCALL
+#define SLANG_MCALL SLANG_STDCALL
 #endif
 
 #ifndef SLANG_FORCE_INLINE
-#    define SLANG_FORCE_INLINE inline
+#define SLANG_FORCE_INLINE inline
 #endif
 
-// TODO(JS): Should these be in slang-cpp-types.h? 
+// TODO(JS): Should these be in slang-cpp-types.h?
 // They are more likely to clash with slang.h
 
 struct SlangUUID
@@ -278,24 +283,25 @@ struct SlangUUID
     uint32_t data1;
     uint16_t data2;
     uint16_t data3;
-    uint8_t  data4[8];
+    uint8_t data4[8];
 };
 
 typedef int32_t SlangResult;
 
 struct ISlangUnknown
 {
-    virtual SLANG_NO_THROW SlangResult SLANG_MCALL queryInterface(SlangUUID const& uuid, void** outObject) = 0;
+    virtual SLANG_NO_THROW SlangResult SLANG_MCALL
+    queryInterface(SlangUUID const& uuid, void** outObject) = 0;
     virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() = 0;
     virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() = 0;
 };
 
-#define SLANG_COM_INTERFACE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
-    public: \
-    SLANG_FORCE_INLINE static const SlangUUID& getTypeGuid() \
-    { \
-        static const SlangUUID guid = { a, b, c, d0, d1, d2, d3, d4, d5, d6, d7 }; \
-        return guid; \
+#define SLANG_COM_INTERFACE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)             \
+public:                                                                          \
+    SLANG_FORCE_INLINE static const SlangUUID& getTypeGuid()                     \
+    {                                                                            \
+        static const SlangUUID guid = {a, b, c, d0, d1, d2, d3, d4, d5, d6, d7}; \
+        return guid;                                                             \
     }
 #endif // SLANG_H
 
@@ -304,13 +310,13 @@ struct ISlangUnknown
 #include "slang-cpp-scalar-intrinsics.h"
 #include "slang-cpp-types.h"
 
-// TODO(JS): Hack! Output C++ code from slang can copy uninitialized variables. 
+// TODO(JS): Hack! Output C++ code from slang can copy uninitialized variables.
 #if defined(_MSC_VER)
-#   pragma warning(disable : 4700)
+#pragma warning(disable : 4700)
 #endif
 
 #ifndef SLANG_UNROLL
-#   define SLANG_UNROLL
+#define SLANG_UNROLL
 #endif
 
 #endif
diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h
index 1ade8614f..6aa72df4f 100644
--- a/prelude/slang-cpp-scalar-intrinsics.h
+++ b/prelude/slang-cpp-scalar-intrinsics.h
@@ -2,24 +2,26 @@
 #define SLANG_PRELUDE_SCALAR_INTRINSICS_H
 
 #if !defined(SLANG_LLVM) && SLANG_PROCESSOR_X86_64 && SLANG_VC
-//  If we have visual studio and 64 bit processor, we can assume we have popcnt, and can include x86 intrinsics
-#   include <intrin.h>
+//  If we have visual studio and 64 bit processor, we can assume we have popcnt, and can include
+//  x86 intrinsics
+#include <intrin.h>
 #endif
 
 #ifndef SLANG_FORCE_INLINE
-#    define SLANG_FORCE_INLINE inline
+#define SLANG_FORCE_INLINE inline
 #endif
 
 #ifdef SLANG_PRELUDE_NAMESPACE
-namespace SLANG_PRELUDE_NAMESPACE {
+namespace SLANG_PRELUDE_NAMESPACE
+{
 #endif
 
 #ifndef SLANG_PRELUDE_PI
-#   define SLANG_PRELUDE_PI           3.14159265358979323846
+#define SLANG_PRELUDE_PI 3.14159265358979323846
 #endif
 
 
-union Union32 
+union Union32
 {
     uint32_t u;
     int32_t i;
@@ -34,10 +36,30 @@ union Union64
 };
 
 // 32 bit cast conversions
-SLANG_FORCE_INLINE int32_t _bitCastFloatToInt(float f) { Union32 u; u.f = f; return u.i; }
-SLANG_FORCE_INLINE float _bitCastIntToFloat(int32_t i) { Union32 u; u.i = i; return u.f; }
-SLANG_FORCE_INLINE uint32_t _bitCastFloatToUInt(float f) { Union32 u; u.f = f; return u.u; }
-SLANG_FORCE_INLINE float _bitCastUIntToFloat(uint32_t ui) { Union32 u; u.u = ui; return u.f; }
+SLANG_FORCE_INLINE int32_t _bitCastFloatToInt(float f)
+{
+    Union32 u;
+    u.f = f;
+    return u.i;
+}
+SLANG_FORCE_INLINE float _bitCastIntToFloat(int32_t i)
+{
+    Union32 u;
+    u.i = i;
+    return u.f;
+}
+SLANG_FORCE_INLINE uint32_t _bitCastFloatToUInt(float f)
+{
+    Union32 u;
+    u.f = f;
+    return u.u;
+}
+SLANG_FORCE_INLINE float _bitCastUIntToFloat(uint32_t ui)
+{
+    Union32 u;
+    u.u = ui;
+    return u.f;
+}
 
 // ----------------------------- F16 -----------------------------------------
 
@@ -61,27 +83,27 @@ SLANG_FORCE_INLINE uint32_t f32tof16(const float value)
     if (e == 0xff)
     {
         // Could be a NAN or INF. Is INF if *input* mantissa is 0.
-        
+
         // Remove last bit for rounding to make output mantissa.
         m >>= 1;
-       
+
         // We *assume* float16/float32 signaling bit and remaining bits
         // semantics are the same. (The signalling bit convention is target specific!).
         // Non signal bit's usage within mantissa for a NAN are also target specific.
-      
-        // If the m is 0, it could be because the result is INF, but it could also be because all the 
-        // bits that made NAN were dropped as we have less mantissa bits in f16. 
-           
+
+        // If the m is 0, it could be because the result is INF, but it could also be because all
+        // the bits that made NAN were dropped as we have less mantissa bits in f16.
+
         // To fix for this we make non zero if m is 0 and the input mantissa was not.
         // This will (typically) produce a signalling NAN.
         m += uint32_t(m == 0 && (inBits & 0x007fffffu));
-       
+
         // Combine for output
         return (bits | 0x7c00u | m);
     }
     if (e > 142)
     {
-        // INF. 
+        // INF.
         return bits | 0x7c00u;
     }
     if (e < 113)
@@ -105,7 +127,7 @@ SLANG_FORCE_INLINE float f16tof32(const uint32_t value)
 
     if (exponent == 0)
     {
-        // If mantissa is 0 we are done, as output is 0. 
+        // If mantissa is 0 we are done, as output is 0.
         // If it's not zero we must have a denormal.
         if (mantissa)
         {
@@ -113,16 +135,17 @@ SLANG_FORCE_INLINE float f16tof32(const uint32_t value)
             return _bitCastIntToFloat(sign | ((value & 0x7fff) << 13)) * g_f16tof32Magic;
         }
     }
-    else 
+    else
     {
-        // If the exponent is NAN or INF exponent is 0x1f on input. 
+        // If the exponent is NAN or INF exponent is 0x1f on input.
         // If that's the case, we just need to set the exponent to 0xff on output
-        // and the mantissa can just stay the same. If its 0 it's INF, else it is NAN and we just copy the bits
+        // and the mantissa can just stay the same. If it's 0 it's INF, else it is NAN and we just
+        // copy the bits
         //
         // Else we need to correct the exponent in the normalized case.
         exponent = (exponent == 0x1F) ? 0xff : (exponent + (-15 + 127));
     }
-    
+
     return _bitCastUIntToFloat(sign | (exponent << 23) | (mantissa << 13));
 }
 
@@ -135,7 +158,7 @@ SLANG_FORCE_INLINE float F32_calcSafeRadians(float radians);
 
 SLANG_PRELUDE_EXTERN_C_START
 
-// Unary 
+// Unary
 float F32_ceil(float f);
 float F32_floor(float f);
 float F32_round(float f);
@@ -158,12 +181,18 @@ float F32_trunc(float f);
 float F32_sqrt(float f);
 
 bool F32_isnan(float f);
-bool F32_isfinite(float f); 
+bool F32_isfinite(float f);
 bool F32_isinf(float f);
 
 // Binary
-SLANG_FORCE_INLINE float F32_min(float a, float b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE float F32_max(float a, float b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE float F32_min(float a, float b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE float F32_max(float a, float b)
+{
+    return a > b ? a : b;
+}
 float F32_pow(float a, float b);
 float F32_fmod(float a, float b);
 float F32_remainder(float a, float b);
@@ -174,47 +203,140 @@ float F32_frexp(float x, int* e);
 float F32_modf(float x, float* ip);
 
 // Ternary
-SLANG_FORCE_INLINE float F32_fma(float a, float b, float c) { return a * b + c; }
+SLANG_FORCE_INLINE float F32_fma(float a, float b, float c)
+{
+    return a * b + c;
+}
 
 SLANG_PRELUDE_EXTERN_C_END
 
 #else
 
-// Unary 
-SLANG_FORCE_INLINE float F32_ceil(float f) { return ::ceilf(f); }
-SLANG_FORCE_INLINE float F32_floor(float f) { return ::floorf(f); }
-SLANG_FORCE_INLINE float F32_round(float f) { return ::roundf(f); }
-SLANG_FORCE_INLINE float F32_sin(float f) { return ::sinf(f); }
-SLANG_FORCE_INLINE float F32_cos(float f) { return ::cosf(f); }
-SLANG_FORCE_INLINE float F32_tan(float f) { return ::tanf(f); }
-SLANG_FORCE_INLINE float F32_asin(float f) { return ::asinf(f); }
-SLANG_FORCE_INLINE float F32_acos(float f) { return ::acosf(f); }
-SLANG_FORCE_INLINE float F32_atan(float f) { return ::atanf(f); }
-SLANG_FORCE_INLINE float F32_sinh(float f) { return ::sinhf(f); }
-SLANG_FORCE_INLINE float F32_cosh(float f) { return ::coshf(f); }
-SLANG_FORCE_INLINE float F32_tanh(float f) { return ::tanhf(f); }
-SLANG_FORCE_INLINE float F32_log2(float f) { return ::log2f(f); }
-SLANG_FORCE_INLINE float F32_log(float f) { return ::logf(f); }
-SLANG_FORCE_INLINE float F32_log10(float f) { return ::log10f(f); }
-SLANG_FORCE_INLINE float F32_exp2(float f) { return ::exp2f(f); }
-SLANG_FORCE_INLINE float F32_exp(float f) { return ::expf(f); }
-SLANG_FORCE_INLINE float F32_abs(float f) { return ::fabsf(f); }
-SLANG_FORCE_INLINE float F32_trunc(float f) { return ::truncf(f); }
-SLANG_FORCE_INLINE float F32_sqrt(float f) { return ::sqrtf(f); }
-
-SLANG_FORCE_INLINE bool F32_isnan(float f) { return SLANG_PRELUDE_STD isnan(f); }
-SLANG_FORCE_INLINE bool F32_isfinite(float f) { return SLANG_PRELUDE_STD isfinite(f); }
-SLANG_FORCE_INLINE bool F32_isinf(float f) { return SLANG_PRELUDE_STD isinf(f); }
+// Unary
+SLANG_FORCE_INLINE float F32_ceil(float f)
+{
+    return ::ceilf(f);
+}
+SLANG_FORCE_INLINE float F32_floor(float f)
+{
+    return ::floorf(f);
+}
+SLANG_FORCE_INLINE float F32_round(float f)
+{
+    return ::roundf(f);
+}
+SLANG_FORCE_INLINE float F32_sin(float f)
+{
+    return ::sinf(f);
+}
+SLANG_FORCE_INLINE float F32_cos(float f)
+{
+    return ::cosf(f);
+}
+SLANG_FORCE_INLINE float F32_tan(float f)
+{
+    return ::tanf(f);
+}
+SLANG_FORCE_INLINE float F32_asin(float f)
+{
+    return ::asinf(f);
+}
+SLANG_FORCE_INLINE float F32_acos(float f)
+{
+    return ::acosf(f);
+}
+SLANG_FORCE_INLINE float F32_atan(float f)
+{
+    return ::atanf(f);
+}
+SLANG_FORCE_INLINE float F32_sinh(float f)
+{
+    return ::sinhf(f);
+}
+SLANG_FORCE_INLINE float F32_cosh(float f)
+{
+    return ::coshf(f);
+}
+SLANG_FORCE_INLINE float F32_tanh(float f)
+{
+    return ::tanhf(f);
+}
+SLANG_FORCE_INLINE float F32_log2(float f)
+{
+    return ::log2f(f);
+}
+SLANG_FORCE_INLINE float F32_log(float f)
+{
+    return ::logf(f);
+}
+SLANG_FORCE_INLINE float F32_log10(float f)
+{
+    return ::log10f(f);
+}
+SLANG_FORCE_INLINE float F32_exp2(float f)
+{
+    return ::exp2f(f);
+}
+SLANG_FORCE_INLINE float F32_exp(float f)
+{
+    return ::expf(f);
+}
+SLANG_FORCE_INLINE float F32_abs(float f)
+{
+    return ::fabsf(f);
+}
+SLANG_FORCE_INLINE float F32_trunc(float f)
+{
+    return ::truncf(f);
+}
+SLANG_FORCE_INLINE float F32_sqrt(float f)
+{
+    return ::sqrtf(f);
+}
+
+SLANG_FORCE_INLINE bool F32_isnan(float f)
+{
+    return SLANG_PRELUDE_STD isnan(f);
+}
+SLANG_FORCE_INLINE bool F32_isfinite(float f)
+{
+    return SLANG_PRELUDE_STD isfinite(f);
+}
+SLANG_FORCE_INLINE bool F32_isinf(float f)
+{
+    return SLANG_PRELUDE_STD isinf(f);
+}
 
 // Binary
-SLANG_FORCE_INLINE float F32_min(float a, float b) { return ::fminf(a, b); }
-SLANG_FORCE_INLINE float F32_max(float a, float b) { return ::fmaxf(a, b); }
-SLANG_FORCE_INLINE float F32_pow(float a, float b) { return ::powf(a, b); }
-SLANG_FORCE_INLINE float F32_fmod(float a, float b) { return ::fmodf(a, b); }
-SLANG_FORCE_INLINE float F32_remainder(float a, float b) { return ::remainderf(a, b); }
-SLANG_FORCE_INLINE float F32_atan2(float a, float b) { return float(::atan2(a, b)); }
+SLANG_FORCE_INLINE float F32_min(float a, float b)
+{
+    return ::fminf(a, b);
+}
+SLANG_FORCE_INLINE float F32_max(float a, float b)
+{
+    return ::fmaxf(a, b);
+}
+SLANG_FORCE_INLINE float F32_pow(float a, float b)
+{
+    return ::powf(a, b);
+}
+SLANG_FORCE_INLINE float F32_fmod(float a, float b)
+{
+    return ::fmodf(a, b);
+}
+SLANG_FORCE_INLINE float F32_remainder(float a, float b)
+{
+    return ::remainderf(a, b);
+}
+SLANG_FORCE_INLINE float F32_atan2(float a, float b)
+{
+    return float(::atan2(a, b));
+}
 
-SLANG_FORCE_INLINE float F32_frexp(float x, int* e) { return ::frexpf(x, e); }
+SLANG_FORCE_INLINE float F32_frexp(float x, int* e)
+{
+    return ::frexpf(x, e);
+}
 
 SLANG_FORCE_INLINE float F32_modf(float x, float* ip)
 {
@@ -222,26 +344,48 @@ SLANG_FORCE_INLINE float F32_modf(float x, float* ip)
 }
 
 // Ternary
-SLANG_FORCE_INLINE float F32_fma(float a, float b, float c) { return ::fmaf(a, b, c); }
+SLANG_FORCE_INLINE float F32_fma(float a, float b, float c)
+{
+    return ::fmaf(a, b, c);
+}
 
 #endif
 
 SLANG_FORCE_INLINE float F32_calcSafeRadians(float radians)
 {
-    // Put 0 to 2pi cycles to cycle around 0 to 1 
-	float a = radians * (1.0f /  float(SLANG_PRELUDE_PI * 2));
+    // Put 0 to 2pi cycles to cycle around 0 to 1
+    float a = radians * (1.0f / float(SLANG_PRELUDE_PI * 2));
     // Get truncated fraction, as value in  0 - 1 range
     a = a - F32_floor(a);
     // Convert back to 0 - 2pi range
-	return (a * float(SLANG_PRELUDE_PI * 2));
+    return (a * float(SLANG_PRELUDE_PI * 2));
 }
 
-SLANG_FORCE_INLINE float F32_rsqrt(float f) { return 1.0f / F32_sqrt(f); }
-SLANG_FORCE_INLINE float F32_sign(float f) { return ( f == 0.0f) ? f : (( f < 0.0f) ? -1.0f : 1.0f); } 
-SLANG_FORCE_INLINE float F32_frac(float f) { return f - F32_floor(f); }
+SLANG_FORCE_INLINE float F32_rsqrt(float f)
+{
+    return 1.0f / F32_sqrt(f);
+}
+SLANG_FORCE_INLINE float F32_sign(float f)
+{
+    return (f == 0.0f) ? f : ((f < 0.0f) ? -1.0f : 1.0f);
+}
+SLANG_FORCE_INLINE float F32_frac(float f)
+{
+    return f - F32_floor(f);
+}
 
-SLANG_FORCE_INLINE uint32_t F32_asuint(float f) { Union32 u; u.f = f; return u.u; }
-SLANG_FORCE_INLINE int32_t F32_asint(float f) { Union32 u; u.f = f; return u.i; }
+SLANG_FORCE_INLINE uint32_t F32_asuint(float f)
+{
+    Union32 u;
+    u.f = f;
+    return u.u;
+}
+SLANG_FORCE_INLINE int32_t F32_asint(float f)
+{
+    Union32 u;
+    u.f = f;
+    return u.i;
+}
 
 // ----------------------------- F64 -----------------------------------------
 
@@ -251,7 +395,7 @@ SLANG_FORCE_INLINE double F64_calcSafeRadians(double radians);
 
 SLANG_PRELUDE_EXTERN_C_START
 
-// Unary 
+// Unary
 double F64_ceil(double f);
 double F64_floor(double f);
 double F64_round(double f);
@@ -278,8 +422,14 @@ bool F64_isfinite(double f);
 bool F64_isinf(double f);
 
 // Binary
-SLANG_FORCE_INLINE double F64_min(double a, double b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE double F64_max(double a, double b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE double F64_min(double a, double b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE double F64_max(double a, double b)
+{
+    return a > b ? a : b;
+}
 double F64_pow(double a, double b);
 double F64_fmod(double a, double b);
 double F64_remainder(double a, double b);
@@ -290,48 +440,141 @@ double F64_frexp(double x, int* e);
 double F64_modf(double x, double* ip);
 
 // Ternary
-SLANG_FORCE_INLINE double F64_fma(double a, double b, double c) { return a * b + c; }
+SLANG_FORCE_INLINE double F64_fma(double a, double b, double c) // SLANG_LLVM path
+{
+    return a * b + c; // written as multiply then add; the non-LLVM variant calls ::fma instead
+}
 
 SLANG_PRELUDE_EXTERN_C_END
 
 #else // SLANG_LLVM
 
-// Unary 
-SLANG_FORCE_INLINE double F64_ceil(double f) { return ::ceil(f); }
-SLANG_FORCE_INLINE double F64_floor(double f) { return ::floor(f); }
-SLANG_FORCE_INLINE double F64_round(double f) { return ::round(f); }
-SLANG_FORCE_INLINE double F64_sin(double f) { return ::sin(f); }
-SLANG_FORCE_INLINE double F64_cos(double f) { return ::cos(f); }
-SLANG_FORCE_INLINE double F64_tan(double f) { return ::tan(f); }
-SLANG_FORCE_INLINE double F64_asin(double f) { return ::asin(f); }
-SLANG_FORCE_INLINE double F64_acos(double f) { return ::acos(f); }
-SLANG_FORCE_INLINE double F64_atan(double f) { return ::atan(f); }
-SLANG_FORCE_INLINE double F64_sinh(double f) { return ::sinh(f); }
-SLANG_FORCE_INLINE double F64_cosh(double f) { return ::cosh(f); }
-SLANG_FORCE_INLINE double F64_tanh(double f) { return ::tanh(f); }
-SLANG_FORCE_INLINE double F64_log2(double f) { return ::log2(f); }
-SLANG_FORCE_INLINE double F64_log(double f) { return ::log(f); }
-SLANG_FORCE_INLINE double F64_log10(float f) { return ::log10(f); }
-SLANG_FORCE_INLINE double F64_exp2(double f) { return ::exp2(f); }
-SLANG_FORCE_INLINE double F64_exp(double f) { return ::exp(f); }
-SLANG_FORCE_INLINE double F64_abs(double f) { return ::fabs(f); }
-SLANG_FORCE_INLINE double F64_trunc(double f) { return ::trunc(f); }
-SLANG_FORCE_INLINE double F64_sqrt(double f) { return ::sqrt(f); }
-
-
-SLANG_FORCE_INLINE bool F64_isnan(double f) { return SLANG_PRELUDE_STD isnan(f); }
-SLANG_FORCE_INLINE bool F64_isfinite(double f) { return SLANG_PRELUDE_STD isfinite(f); }
-SLANG_FORCE_INLINE bool F64_isinf(double f) { return SLANG_PRELUDE_STD isinf(f); }
+// Unary
+SLANG_FORCE_INLINE double F64_ceil(double f)
+{
+    return ::ceil(f);
+}
+SLANG_FORCE_INLINE double F64_floor(double f)
+{
+    return ::floor(f);
+}
+SLANG_FORCE_INLINE double F64_round(double f)
+{
+    return ::round(f);
+}
+SLANG_FORCE_INLINE double F64_sin(double f)
+{
+    return ::sin(f);
+}
+SLANG_FORCE_INLINE double F64_cos(double f)
+{
+    return ::cos(f);
+}
+SLANG_FORCE_INLINE double F64_tan(double f)
+{
+    return ::tan(f);
+}
+SLANG_FORCE_INLINE double F64_asin(double f)
+{
+    return ::asin(f);
+}
+SLANG_FORCE_INLINE double F64_acos(double f)
+{
+    return ::acos(f);
+}
+SLANG_FORCE_INLINE double F64_atan(double f)
+{
+    return ::atan(f);
+}
+SLANG_FORCE_INLINE double F64_sinh(double f)
+{
+    return ::sinh(f);
+}
+SLANG_FORCE_INLINE double F64_cosh(double f)
+{
+    return ::cosh(f);
+}
+SLANG_FORCE_INLINE double F64_tanh(double f)
+{
+    return ::tanh(f);
+}
+SLANG_FORCE_INLINE double F64_log2(double f)
+{
+    return ::log2(f);
+}
+SLANG_FORCE_INLINE double F64_log(double f)
+{
+    return ::log(f);
+}
+SLANG_FORCE_INLINE double F64_log10(double f) // base-10 log; takes double like its F64 siblings
+{
+    return ::log10(f); // a float parameter here would truncate double arguments before the call
+}
+SLANG_FORCE_INLINE double F64_exp2(double f)
+{
+    return ::exp2(f);
+}
+SLANG_FORCE_INLINE double F64_exp(double f)
+{
+    return ::exp(f);
+}
+SLANG_FORCE_INLINE double F64_abs(double f)
+{
+    return ::fabs(f);
+}
+SLANG_FORCE_INLINE double F64_trunc(double f)
+{
+    return ::trunc(f);
+}
+SLANG_FORCE_INLINE double F64_sqrt(double f)
+{
+    return ::sqrt(f);
+}
+
+
+SLANG_FORCE_INLINE bool F64_isnan(double f)
+{
+    return SLANG_PRELUDE_STD isnan(f);
+}
+SLANG_FORCE_INLINE bool F64_isfinite(double f)
+{
+    return SLANG_PRELUDE_STD isfinite(f);
+}
+SLANG_FORCE_INLINE bool F64_isinf(double f)
+{
+    return SLANG_PRELUDE_STD isinf(f);
+}
 
 // Binary
-SLANG_FORCE_INLINE double F64_min(double a, double b) { return ::fmin(a, b); }
-SLANG_FORCE_INLINE double F64_max(double a, double b) { return ::fmax(a, b); }
-SLANG_FORCE_INLINE double F64_pow(double a, double b) { return ::pow(a, b); }
-SLANG_FORCE_INLINE double F64_fmod(double a, double b) { return ::fmod(a, b); }
-SLANG_FORCE_INLINE double F64_remainder(double a, double b) { return ::remainder(a, b); }
-SLANG_FORCE_INLINE double F64_atan2(double a, double b) { return ::atan2(a, b); }
+SLANG_FORCE_INLINE double F64_min(double a, double b)
+{
+    return ::fmin(a, b);
+}
+SLANG_FORCE_INLINE double F64_max(double a, double b)
+{
+    return ::fmax(a, b);
+}
+SLANG_FORCE_INLINE double F64_pow(double a, double b)
+{
+    return ::pow(a, b);
+}
+SLANG_FORCE_INLINE double F64_fmod(double a, double b)
+{
+    return ::fmod(a, b);
+}
+SLANG_FORCE_INLINE double F64_remainder(double a, double b)
+{
+    return ::remainder(a, b);
+}
+SLANG_FORCE_INLINE double F64_atan2(double a, double b)
+{
+    return ::atan2(a, b);
+}
 
-SLANG_FORCE_INLINE double F64_frexp(double x, int* e) { return ::frexp(x, e); }
+SLANG_FORCE_INLINE double F64_frexp(double x, int* e)
+{
+    return ::frexp(x, e);
+}
 
 SLANG_FORCE_INLINE double F64_modf(double x, double* ip)
 {
@@ -339,13 +582,25 @@ SLANG_FORCE_INLINE double F64_modf(double x, double* ip)
 }
 
 // Ternary
-SLANG_FORCE_INLINE double F64_fma(double a, double b, double c) { return ::fma(a, b, c); }
+SLANG_FORCE_INLINE double F64_fma(double a, double b, double c)
+{
+    return ::fma(a, b, c);
+}
 
 #endif // SLANG_LLVM
 
-SLANG_FORCE_INLINE double F64_rsqrt(double f) { return 1.0 / F64_sqrt(f); }
-SLANG_FORCE_INLINE double F64_sign(double f) { return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); }
-SLANG_FORCE_INLINE double F64_frac(double f) { return f - F64_floor(f); }
+SLANG_FORCE_INLINE double F64_rsqrt(double f)
+{
+    return 1.0 / F64_sqrt(f);
+}
+SLANG_FORCE_INLINE double F64_sign(double f) // -1.0, 1.0, or f itself when f compares equal to 0
+{
+    return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); // NaN fails both tests -> 1.0
+}
+SLANG_FORCE_INLINE double F64_frac(double f)
+{
+    return f - F64_floor(f);
+}
 
 SLANG_FORCE_INLINE void F64_asuint(double d, uint32_t* low, uint32_t* hi)
 {
@@ -365,24 +620,41 @@ SLANG_FORCE_INLINE void F64_asint(double d, int32_t* low, int32_t* hi)
 
 SLANG_FORCE_INLINE double F64_calcSafeRadians(double radians)
 {
-    // Put 0 to 2pi cycles to cycle around 0 to 1 
-	double a = radians * (1.0f /  (SLANG_PRELUDE_PI * 2));
+    // Put 0 to 2pi cycles to cycle around 0 to 1
+    double a = radians * (1.0f / (SLANG_PRELUDE_PI * 2));
     // Get truncated fraction, as value in  0 - 1 range
     a = a - F64_floor(a);
     // Convert back to 0 - 2pi range
-	return (a * (SLANG_PRELUDE_PI * 2));
+    return (a * (SLANG_PRELUDE_PI * 2));
 }
 
 // ----------------------------- I32 -----------------------------------------
 
-SLANG_FORCE_INLINE int32_t I32_abs(int32_t f) { return (f < 0) ? -f : f; }
+SLANG_FORCE_INLINE int32_t I32_abs(int32_t f) // |f|; INT32_MIN wraps back to INT32_MIN
+{
+    return (f < 0) ? int32_t(0u - uint32_t(f)) : f; // unsigned negate: no signed-overflow UB
+}
 
-SLANG_FORCE_INLINE int32_t I32_min(int32_t a, int32_t b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE int32_t I32_max(int32_t a, int32_t b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE int32_t I32_min(int32_t a, int32_t b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE int32_t I32_max(int32_t a, int32_t b)
+{
+    return a > b ? a : b;
+}
 
-SLANG_FORCE_INLINE float I32_asfloat(int32_t x) { Union32 u; u.i = x; return u.f; }
-SLANG_FORCE_INLINE uint32_t I32_asuint(int32_t x) { return uint32_t(x); }
-SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi )
+SLANG_FORCE_INLINE float I32_asfloat(int32_t x) // value read back through Union32's f member
+{
+    Union32 u; // declared outside this hunk; presumably a union overlaying f/u/i - confirm
+    u.i = x;
+    return u.f; // if Union32 is a union this reinterprets the bits rather than converting x
+}
+SLANG_FORCE_INLINE uint32_t I32_asuint(int32_t x)
+{
+    return uint32_t(x);
+}
+SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi)
 {
     Union64 u;
     u.u = (uint64_t(hi) << 32) | uint32_t(low);
@@ -391,13 +663,30 @@ SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi )
 
 // ----------------------------- U32 -----------------------------------------
 
-SLANG_FORCE_INLINE uint32_t U32_abs(uint32_t f) { return f; }
+SLANG_FORCE_INLINE uint32_t U32_abs(uint32_t f)
+{
+    return f;
+}
 
-SLANG_FORCE_INLINE uint32_t U32_min(uint32_t a, uint32_t b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE uint32_t U32_max(uint32_t a, uint32_t b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE uint32_t U32_min(uint32_t a, uint32_t b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE uint32_t U32_max(uint32_t a, uint32_t b)
+{
+    return a > b ? a : b;
+}
 
-SLANG_FORCE_INLINE float U32_asfloat(uint32_t x) { Union32 u; u.u = x; return u.f; }
-SLANG_FORCE_INLINE uint32_t U32_asint(int32_t x) { return uint32_t(x); } 
+SLANG_FORCE_INLINE float U32_asfloat(uint32_t x) // value read back through Union32's f member
+{
+    Union32 u; // declared outside this hunk; presumably a union overlaying f/u/i - confirm
+    u.u = x;
+    return u.f; // if Union32 is a union this reinterprets the bits rather than converting x
+}
+SLANG_FORCE_INLINE uint32_t U32_asint(int32_t x) // NOTE(review): same types as I32_asuint
+{
+    return uint32_t(x); // TODO confirm: the name suggests uint32_t -> int32_t was intended
+}
 
 SLANG_FORCE_INLINE double U32_asdouble(uint32_t low, uint32_t hi)
 {
@@ -413,7 +702,7 @@ SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v)
     return __builtin_popcount(v);
 #elif SLANG_PROCESSOR_X86_64 && SLANG_VC
     return __popcnt(v);
-#else     
+#else
     uint32_t c = 0;
     while (v)
     {
@@ -426,21 +715,30 @@ SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v)
 
 // ----------------------------- U64 -----------------------------------------
 
-SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f) { return f; }
+SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f)
+{
+    return f;
+}
 
-SLANG_FORCE_INLINE uint64_t U64_min(uint64_t a, uint64_t b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE uint64_t U64_max(uint64_t a, uint64_t b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE uint64_t U64_min(uint64_t a, uint64_t b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE uint64_t U64_max(uint64_t a, uint64_t b)
+{
+    return a > b ? a : b;
+}
 
 // TODO(JS): We don't define countbits for 64bit in the core module currently.
-// It's not clear from documentation if it should return 32 or 64 bits, if it exists. 
-// 32 bits can always hold the result, and will be implicitly promoted. 
+// It's not clear from documentation if it should return 32 or 64 bits, if it exists.
+// 32 bits can always hold the result, and will be implicitly promoted.
 SLANG_FORCE_INLINE uint32_t U64_countbits(uint64_t v)
 {
-#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM)   
+#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM)
     return uint32_t(__builtin_popcountl(v));
 #elif SLANG_PROCESSOR_X86_64 && SLANG_VC
     return uint32_t(__popcnt64(v));
-#else     
+#else
     uint32_t c = 0;
     while (v)
     {
@@ -453,10 +751,19 @@ SLANG_FORCE_INLINE uint32_t U64_countbits(uint64_t v)
 
 // ----------------------------- I64 -----------------------------------------
 
-SLANG_FORCE_INLINE int64_t I64_abs(int64_t f) { return (f < 0) ? -f : f; }
+SLANG_FORCE_INLINE int64_t I64_abs(int64_t f) // |f|; INT64_MIN wraps back to INT64_MIN
+{
+    return (f < 0) ? int64_t(uint64_t(0) - uint64_t(f)) : f; // unsigned negate: no overflow UB
+}
 
-SLANG_FORCE_INLINE int64_t I64_min(int64_t a, int64_t b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE int64_t I64_min(int64_t a, int64_t b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE int64_t I64_max(int64_t a, int64_t b)
+{
+    return a > b ? a : b;
+}
 
 
 // ----------------------------- Interlocked ---------------------------------
@@ -465,17 +772,17 @@ SLANG_FORCE_INLINE int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b;
 
 #else // SLANG_LLVM
 
-#   ifdef _WIN32
-#       include <intrin.h>
-#   endif
+#ifdef _WIN32
+#include <intrin.h>
+#endif
 
 SLANG_FORCE_INLINE void InterlockedAdd(uint32_t* dest, uint32_t value, uint32_t* oldValue)
 {
-#   ifdef _WIN32
+#ifdef _WIN32
     *oldValue = _InterlockedExchangeAdd((long*)dest, (long)value);
-#   else
+#else
     *oldValue = __sync_fetch_and_add(dest, value);
-#   endif
+#endif
 }
 
 #endif // SLANG_LLVM
@@ -492,7 +799,7 @@ SLANG_FORCE_INLINE double _slang_fmod(double x, double y)
 }
 
 #ifdef SLANG_PRELUDE_NAMESPACE
-} 
+}
 #endif
 
 #endif
diff --git a/prelude/slang-cpp-types-core.h b/prelude/slang-cpp-types-core.h
index 25fe47202..6c0bb7544 100644
--- a/prelude/slang-cpp-types-core.h
+++ b/prelude/slang-cpp-types-core.h
@@ -2,11 +2,11 @@
 #define SLANG_PRELUDE_CPP_TYPES_CORE_H
 
 #ifndef SLANG_PRELUDE_ASSERT
-#   ifdef SLANG_PRELUDE_ENABLE_ASSERT
-#       define SLANG_PRELUDE_ASSERT(VALUE) assert(VALUE)
-#   else
-#       define SLANG_PRELUDE_ASSERT(VALUE) 
-#   endif
+#ifdef SLANG_PRELUDE_ENABLE_ASSERT
+#define SLANG_PRELUDE_ASSERT(VALUE) assert(VALUE)
+#else
+#define SLANG_PRELUDE_ASSERT(VALUE)
+#endif
 #endif
 
 // Since we are using unsigned arithmatic care is need in this comparison.
@@ -15,35 +15,42 @@
 
 // Asserts for bounds checking.
 // It is assumed index/count are unsigned types.
-#define SLANG_BOUND_ASSERT(index, count)  SLANG_PRELUDE_ASSERT(index < count); 
-#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0);
+#define SLANG_BOUND_ASSERT(index, count) SLANG_PRELUDE_ASSERT(index < count);
+#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) \
+    SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0);
 
 // Macros to zero index if an access is out of range
-#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; 
-#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) index = (index <= (sizeInBytes - elemSize)) ? index : 0; 
-
-// The 'FIX' macro define how the index is fixed. The default is to do nothing. If SLANG_ENABLE_BOUND_ZERO_INDEX
-// the fix macro will zero the index, if out of range
-#ifdef  SLANG_ENABLE_BOUND_ZERO_INDEX
-#   define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
-#   define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
-#   define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
+#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0;
+#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) \
+    index = (index <= (sizeInBytes - elemSize)) ? index : 0;
+
+// The 'FIX' macros define how the index is fixed. The default is to do nothing. If
+// SLANG_ENABLE_BOUND_ZERO_INDEX is defined, the fix macros zero the index when it is out of range.
+#ifdef SLANG_ENABLE_BOUND_ZERO_INDEX
+#define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
+#define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) \
+    SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
+#define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
 #else
-#   define SLANG_BOUND_FIX(index, count) 
-#   define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) 
-#   define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) 
+#define SLANG_BOUND_FIX(index, count)
+#define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
+#define SLANG_BOUND_FIX_FIXED_ARRAY(index, count)
 #endif
 
 #ifndef SLANG_BOUND_CHECK
-#   define SLANG_BOUND_CHECK(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count)
+#define SLANG_BOUND_CHECK(index, count) \
+    SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count)
 #endif
 
 #ifndef SLANG_BOUND_CHECK_BYTE_ADDRESS
-#   define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
+#define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) \
+    SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes)    \
+    SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
 #endif
 
 #ifndef SLANG_BOUND_CHECK_FIXED_ARRAY
-#   define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count)
+#define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) \
+    SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count)
 #endif
 
 struct TypeInfo
@@ -51,34 +58,51 @@ struct TypeInfo
     size_t typeSize;
 };
 
-template <typename T, size_t SIZE>
+template<typename T, size_t SIZE>
 struct FixedArray
 {
-    const T& operator[](size_t index) const { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; }
-    T& operator[](size_t index) { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; }
+    const T& operator[](size_t index) const
+    {
+        SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE);
+        return m_data[index];
+    }
+    T& operator[](size_t index)
+    {
+        SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE);
+        return m_data[index];
+    }
 
     T m_data[SIZE];
 };
 
-// An array that has no specified size, becomes a 'Array'. This stores the size so it can potentially 
-// do bounds checking.  
-template <typename T>
+// An array that has no specified size becomes an 'Array'. This stores the size so it can
+// potentially do bounds checking.
+template<typename T>
 struct Array
 {
-    const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
-    T& operator[](size_t index) { SLANG_BOUND_CHECK(index, count); return data[index]; }
+    const T& operator[](size_t index) const
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
+    T& operator[](size_t index)
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
 
     T* data;
     size_t count;
 };
 
-/* Constant buffers become a pointer to the contained type, so ConstantBuffer<T> becomes T* in C++ code.
-*/
+/* Constant buffers become a pointer to the contained type, so ConstantBuffer<T> becomes T* in C++
+ * code.
+ */
 
-template <typename T, int COUNT>
+template<typename T, int COUNT>
 struct Vector;
 
-template <typename T>
+template<typename T>
 struct Vector<T, 1>
 {
     T x;
@@ -86,58 +110,54 @@ struct Vector<T, 1>
     T& operator[](size_t /*index*/) { return x; }
     operator T() const { return x; }
     Vector() = default;
-    Vector(T scalar)
-    {
-        x = scalar;
-    }
-    template <typename U>
+    Vector(T scalar) { x = scalar; }
+    template<typename U>
     Vector(Vector<U, 1> other)
     {
         x = (T)other.x;
     }
-    template <typename U, int otherSize>
+    template<typename U, int otherSize>
     Vector(Vector<U, otherSize> other)
     {
         int minSize = 1;
-        if (otherSize < minSize) minSize = otherSize;
+        if (otherSize < minSize)
+            minSize = otherSize;
         for (int i = 0; i < minSize; i++)
             (*this)[i] = (T)other[i];
     }
 };
 
-template <typename T>
+template<typename T>
 struct Vector<T, 2>
 {
     T x, y;
     const T& operator[](size_t index) const { return index == 0 ? x : y; }
     T& operator[](size_t index) { return index == 0 ? x : y; }
     Vector() = default;
-    Vector(T scalar)
-    {
-        x = y = scalar;
-    }
+    Vector(T scalar) { x = y = scalar; }
     Vector(T _x, T _y)
     {
         x = _x;
         y = _y;
     }
-    template <typename U>
+    template<typename U>
     Vector(Vector<U, 2> other)
     {
         x = (T)other.x;
         y = (T)other.y;
     }
-    template <typename U, int otherSize>
+    template<typename U, int otherSize>
     Vector(Vector<U, otherSize> other)
     {
         int minSize = 2;
-        if (otherSize < minSize) minSize = otherSize;
+        if (otherSize < minSize)
+            minSize = otherSize;
         for (int i = 0; i < minSize; i++)
             (*this)[i] = (T)other[i];
     }
 };
 
-template <typename T>
+template<typename T>
 struct Vector<T, 3>
 {
     T x, y, z;
@@ -145,34 +165,32 @@ struct Vector<T, 3>
     T& operator[](size_t index) { return *((T*)(this) + index); }
 
     Vector() = default;
-    Vector(T scalar)
-    {
-        x = y = z = scalar;
-    }
+    Vector(T scalar) { x = y = z = scalar; }
     Vector(T _x, T _y, T _z)
     {
         x = _x;
         y = _y;
         z = _z;
     }
-    template <typename U>
+    template<typename U>
     Vector(Vector<U, 3> other)
     {
         x = (T)other.x;
         y = (T)other.y;
         z = (T)other.z;
     }
-    template <typename U, int otherSize>
+    template<typename U, int otherSize>
     Vector(Vector<U, otherSize> other)
     {
         int minSize = 3;
-        if (otherSize < minSize) minSize = otherSize;
+        if (otherSize < minSize)
+            minSize = otherSize;
         for (int i = 0; i < minSize; i++)
             (*this)[i] = (T)other[i];
     }
 };
 
-template <typename T>
+template<typename T>
 struct Vector<T, 4>
 {
     T x, y, z, w;
@@ -180,10 +198,7 @@ struct Vector<T, 4>
     const T& operator[](size_t index) const { return *((T*)(this) + index); }
     T& operator[](size_t index) { return *((T*)(this) + index); }
     Vector() = default;
-    Vector(T scalar)
-    {
-        x = y = z = w = scalar;
-    }
+    Vector(T scalar) { x = y = z = w = scalar; }
     Vector(T _x, T _y, T _z, T _w)
     {
         x = _x;
@@ -191,19 +206,22 @@ struct Vector<T, 4>
         z = _z;
         w = _w;
     }
-    template <typename U, int otherSize>
+    template<typename U, int otherSize>
     Vector(Vector<U, otherSize> other)
     {
         int minSize = 4;
-        if (otherSize < minSize) minSize = otherSize;
+        if (otherSize < minSize)
+            minSize = otherSize;
         for (int i = 0; i < minSize; i++)
             (*this)[i] = (T)other[i];
     }
- 
 };
 
 template<typename T, int N>
-SLANG_FORCE_INLINE Vector<T, N> _slang_select(Vector<bool, N> condition, Vector<T, N> v0, Vector<T, N> v1)
+SLANG_FORCE_INLINE Vector<T, N> _slang_select(
+    Vector<bool, N> condition,
+    Vector<T, N> v0,
+    Vector<T, N> v1)
 {
     Vector<T, N> result;
     for (int i = 0; i < N; i++)
@@ -228,7 +246,7 @@ SLANG_FORCE_INLINE T _slang_vector_get_element(Vector<T, N> x, int index)
 template<typename T, int N>
 SLANG_FORCE_INLINE const T* _slang_vector_get_element_ptr(const Vector<T, N>* x, int index)
 {
-    return &((*const_cast<Vector<T,N>*>(x))[index]);
+    return &((*const_cast<Vector<T, N>*>(x))[index]);
 }
 
 template<typename T, int N>
@@ -253,66 +271,70 @@ SLANG_FORCE_INLINE Vector<T, n> _slang_vector_reshape(const Vector<OtherT, m> ot
 
 typedef uint32_t uint;
 
-#define SLANG_VECTOR_BINARY_OP(T, op) \
-    template<int n> \
-    SLANG_FORCE_INLINE Vector<T, n> operator op(const Vector<T, n>& thisVal, const Vector<T, n>& other) \
-    { \
-        Vector<T, n> result;\
-        for (int i = 0; i < n; i++) \
-            result[i] = thisVal[i] op other[i]; \
-        return result;\
-    }
-#define SLANG_VECTOR_BINARY_COMPARE_OP(T, op) \
-    template<int n> \
-    SLANG_FORCE_INLINE Vector<bool, n> operator op(const Vector<T, n>& thisVal, const Vector<T, n>& other) \
-    { \
-        Vector<bool, n> result;\
-        for (int i = 0; i < n; i++) \
-            result[i] = thisVal[i] op other[i]; \
-        return result;\
-    }
-
-#define SLANG_VECTOR_UNARY_OP(T, op) \
-    template<int n> \
+#define SLANG_VECTOR_BINARY_OP(T, op)            \
+    template<int n>                              \
+    SLANG_FORCE_INLINE Vector<T, n> operator op( \
+        const Vector<T, n>& thisVal,             \
+        const Vector<T, n>& other)               \
+    {                                            \
+        Vector<T, n> result;                     \
+        for (int i = 0; i < n; i++)              \
+            result[i] = thisVal[i] op other[i];  \
+        return result;                           \
+    }
+#define SLANG_VECTOR_BINARY_COMPARE_OP(T, op)       \
+    template<int n>                                 \
+    SLANG_FORCE_INLINE Vector<bool, n> operator op( \
+        const Vector<T, n>& thisVal,                \
+        const Vector<T, n>& other)                  \
+    {                                               \
+        Vector<bool, n> result;                     \
+        for (int i = 0; i < n; i++)                 \
+            result[i] = thisVal[i] op other[i];     \
+        return result;                              \
+    }
+
+#define SLANG_VECTOR_UNARY_OP(T, op)                                         \
+    template<int n>                                                          \
     SLANG_FORCE_INLINE Vector<T, n> operator op(const Vector<T, n>& thisVal) \
-    { \
-        Vector<T, n> result;\
-        for (int i = 0; i < n; i++) \
-            result[i] = op thisVal[i]; \
-        return result;\
-    }
-#define SLANG_INT_VECTOR_OPS(T) \
-    SLANG_VECTOR_BINARY_OP(T, +)\
-    SLANG_VECTOR_BINARY_OP(T, -)\
-    SLANG_VECTOR_BINARY_OP(T, *)\
-    SLANG_VECTOR_BINARY_OP(T, / )\
-    SLANG_VECTOR_BINARY_OP(T, &)\
-    SLANG_VECTOR_BINARY_OP(T, |)\
-    SLANG_VECTOR_BINARY_OP(T, &&)\
-    SLANG_VECTOR_BINARY_OP(T, ||)\
-    SLANG_VECTOR_BINARY_OP(T, ^)\
-    SLANG_VECTOR_BINARY_OP(T, %)\
-    SLANG_VECTOR_BINARY_OP(T, >>)\
-    SLANG_VECTOR_BINARY_OP(T, <<)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, >)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, <)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, >=)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, <=)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, ==)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, !=)\
-    SLANG_VECTOR_UNARY_OP(T, !)\
+    {                                                                        \
+        Vector<T, n> result;                                                 \
+        for (int i = 0; i < n; i++)                                          \
+            result[i] = op thisVal[i];                                       \
+        return result;                                                       \
+    }
+#define SLANG_INT_VECTOR_OPS(T)           \
+    SLANG_VECTOR_BINARY_OP(T, +)          \
+    SLANG_VECTOR_BINARY_OP(T, -)          \
+    SLANG_VECTOR_BINARY_OP(T, *)          \
+    SLANG_VECTOR_BINARY_OP(T, /)          \
+    SLANG_VECTOR_BINARY_OP(T, &)          \
+    SLANG_VECTOR_BINARY_OP(T, |)          \
+    SLANG_VECTOR_BINARY_OP(T, &&)         \
+    SLANG_VECTOR_BINARY_OP(T, ||)         \
+    SLANG_VECTOR_BINARY_OP(T, ^)          \
+    SLANG_VECTOR_BINARY_OP(T, %)          \
+    SLANG_VECTOR_BINARY_OP(T, >>)         \
+    SLANG_VECTOR_BINARY_OP(T, <<)         \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, >)  \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, <)  \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, >=) \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, <=) \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, ==) \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, !=) \
+    SLANG_VECTOR_UNARY_OP(T, !)           \
     SLANG_VECTOR_UNARY_OP(T, ~)
-#define SLANG_FLOAT_VECTOR_OPS(T) \
-    SLANG_VECTOR_BINARY_OP(T, +)\
-    SLANG_VECTOR_BINARY_OP(T, -)\
-    SLANG_VECTOR_BINARY_OP(T, *)\
-    SLANG_VECTOR_BINARY_OP(T, /)\
-    SLANG_VECTOR_UNARY_OP(T, -)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, >)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, <)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, >=)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, <=)\
-    SLANG_VECTOR_BINARY_COMPARE_OP(T, ==)\
+#define SLANG_FLOAT_VECTOR_OPS(T)         \
+    SLANG_VECTOR_BINARY_OP(T, +)          \
+    SLANG_VECTOR_BINARY_OP(T, -)          \
+    SLANG_VECTOR_BINARY_OP(T, *)          \
+    SLANG_VECTOR_BINARY_OP(T, /)          \
+    SLANG_VECTOR_UNARY_OP(T, -)           \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, >)  \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, <)  \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, >=) \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, <=) \
+    SLANG_VECTOR_BINARY_COMPARE_OP(T, ==) \
     SLANG_VECTOR_BINARY_COMPARE_OP(T, !=)
 
 SLANG_INT_VECTOR_OPS(bool)
@@ -328,14 +350,14 @@ SLANG_INT_VECTOR_OPS(uint64_t)
 SLANG_FLOAT_VECTOR_OPS(float)
 SLANG_FLOAT_VECTOR_OPS(double)
 
-#define SLANG_VECTOR_INT_NEG_OP(T) \
-    template<int N>\
+#define SLANG_VECTOR_INT_NEG_OP(T)                      \
+    template<int N>                                     \
     Vector<T, N> operator-(const Vector<T, N>& thisVal) \
-    { \
-        Vector<T, N> result;\
-        for (int i = 0; i < N; i++) \
-            result[i] = 0 - thisVal[i]; \
-        return result;\
+    {                                                   \
+        Vector<T, N> result;                            \
+        for (int i = 0; i < N; i++)                     \
+            result[i] = 0 - thisVal[i];                 \
+        return result;                                  \
     }
 SLANG_VECTOR_INT_NEG_OP(int)
 SLANG_VECTOR_INT_NEG_OP(int8_t)
@@ -346,14 +368,14 @@ SLANG_VECTOR_INT_NEG_OP(uint8_t)
 SLANG_VECTOR_INT_NEG_OP(uint16_t)
 SLANG_VECTOR_INT_NEG_OP(uint64_t)
 
-#define SLANG_FLOAT_VECTOR_MOD(T)\
-    template<int N> \
+#define SLANG_FLOAT_VECTOR_MOD(T)                                               \
+    template<int N>                                                             \
     Vector<T, N> operator%(const Vector<T, N>& left, const Vector<T, N>& right) \
-    {\
-        Vector<T, N> result;\
-        for (int i = 0; i < N; i++) \
-            result[i] = _slang_fmod(left[i], right[i]); \
-        return result;\
+    {                                                                           \
+        Vector<T, N> result;                                                    \
+        for (int i = 0; i < N; i++)                                             \
+            result[i] = _slang_fmod(left[i], right[i]);                         \
+        return result;                                                          \
     }
 
 SLANG_FLOAT_VECTOR_MOD(float)
@@ -366,7 +388,7 @@ SLANG_FLOAT_VECTOR_MOD(double)
 #undef SLANG_VECTOR_INT_NEG_OP
 #undef SLANG_FLOAT_VECTOR_MOD
 
-template <typename T, int ROWS, int COLS>
+template<typename T, int ROWS, int COLS>
 struct Matrix
 {
     Vector<T, COLS> rows[ROWS];
@@ -377,10 +399,7 @@ struct Matrix
         for (int i = 0; i < ROWS; i++)
             rows[i] = Vector<T, COLS>(scalar);
     }
-    Matrix(const Vector<T, COLS>& row0)
-    {
-        rows[0] = row0;
-    }
+    Matrix(const Vector<T, COLS>& row0) { rows[0] = row0; }
     Matrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1)
     {
         rows[0] = row0;
@@ -392,7 +411,11 @@ struct Matrix
         rows[1] = row1;
         rows[2] = row2;
     }
-    Matrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2, const Vector<T, COLS>& row3)
+    Matrix(
+        const Vector<T, COLS>& row0,
+        const Vector<T, COLS>& row1,
+        const Vector<T, COLS>& row2,
+        const Vector<T, COLS>& row3)
     {
         rows[0] = row0;
         rows[1] = row1;
@@ -404,116 +427,188 @@ struct Matrix
     {
         int minRow = ROWS;
         int minCol = COLS;
-        if (minRow > otherRow) minRow = otherRow;
-        if (minCol > otherCol) minCol = otherCol;
+        if (minRow > otherRow)
+            minRow = otherRow;
+        if (minCol > otherCol)
+            minCol = otherCol;
         for (int i = 0; i < minRow; i++)
             for (int j = 0; j < minCol; j++)
                 rows[i][j] = (T)other.rows[i][j];
     }
     Matrix(T v0, T v1, T v2, T v3)
     {
-        rows[0][0] = v0;  rows[0][1] = v1;
-        rows[1][0] = v2;  rows[1][1] = v3;
+        rows[0][0] = v0;
+        rows[0][1] = v1;
+        rows[1][0] = v2;
+        rows[1][1] = v3;
     }
     Matrix(T v0, T v1, T v2, T v3, T v4, T v5)
     {
         if (COLS == 3)
         {
-            rows[0][0] = v0;  rows[0][1] = v1; rows[0][2] = v2;
-            rows[1][0] = v3;  rows[1][1] = v4; rows[1][2] = v5;
+            rows[0][0] = v0;
+            rows[0][1] = v1;
+            rows[0][2] = v2;
+            rows[1][0] = v3;
+            rows[1][1] = v4;
+            rows[1][2] = v5;
         }
         else
         {
-            rows[0][0] = v0;  rows[0][1] = v1;
-            rows[1][0] = v2;  rows[1][1] = v3;
-            rows[2][0] = v4;  rows[2][1] = v5;
+            rows[0][0] = v0;
+            rows[0][1] = v1;
+            rows[1][0] = v2;
+            rows[1][1] = v3;
+            rows[2][0] = v4;
+            rows[2][1] = v5;
         }
     }
     Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7)
     {
         if (COLS == 4)
         {
-            rows[0][0] = v0;  rows[0][1] = v1; rows[0][2] = v2; rows[0][3] = v3;
-            rows[1][0] = v4;  rows[1][1] = v5; rows[1][2] = v6; rows[1][3] = v7;
+            rows[0][0] = v0;
+            rows[0][1] = v1;
+            rows[0][2] = v2;
+            rows[0][3] = v3;
+            rows[1][0] = v4;
+            rows[1][1] = v5;
+            rows[1][2] = v6;
+            rows[1][3] = v7;
         }
         else
         {
-            rows[0][0] = v0;  rows[0][1] = v1;
-            rows[1][0] = v2;  rows[1][1] = v3;
-            rows[2][0] = v4;  rows[2][1] = v5;
-            rows[3][0] = v6;  rows[3][1] = v7;
+            rows[0][0] = v0;
+            rows[0][1] = v1;
+            rows[1][0] = v2;
+            rows[1][1] = v3;
+            rows[2][0] = v4;
+            rows[2][1] = v5;
+            rows[3][0] = v6;
+            rows[3][1] = v7;
         }
     }
     Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8)
     {
-        rows[0][0] = v0;  rows[0][1] = v1;  rows[0][2] = v2;
-        rows[1][0] = v3;  rows[1][1] = v4;  rows[1][2] = v5;
-        rows[2][0] = v6;  rows[2][1] = v7;  rows[2][2] = v8;
+        rows[0][0] = v0;
+        rows[0][1] = v1;
+        rows[0][2] = v2;
+        rows[1][0] = v3;
+        rows[1][1] = v4;
+        rows[1][2] = v5;
+        rows[2][0] = v6;
+        rows[2][1] = v7;
+        rows[2][2] = v8;
     }
     Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11)
     {
         if (COLS == 4)
         {
-            rows[0][0] = v0;  rows[0][1] = v1;  rows[0][2] = v2;  rows[0][3] = v3;
-            rows[1][0] = v4;  rows[1][1] = v5;  rows[1][2] = v6;  rows[1][3] = v7;
-            rows[2][0] = v8;  rows[2][1] = v9;  rows[2][2] = v10; rows[2][3] = v11;
+            rows[0][0] = v0;
+            rows[0][1] = v1;
+            rows[0][2] = v2;
+            rows[0][3] = v3;
+            rows[1][0] = v4;
+            rows[1][1] = v5;
+            rows[1][2] = v6;
+            rows[1][3] = v7;
+            rows[2][0] = v8;
+            rows[2][1] = v9;
+            rows[2][2] = v10;
+            rows[2][3] = v11;
         }
         else
         {
-            rows[0][0] = v0;  rows[0][1] = v1;  rows[0][2] = v2;
-            rows[1][0] = v3;  rows[1][1] = v4;  rows[1][2] = v5;
-            rows[2][0] = v6;  rows[2][1] = v7;  rows[2][2] = v8;
-            rows[3][0] = v9;  rows[3][1] = v10; rows[3][2] = v11;
+            rows[0][0] = v0;
+            rows[0][1] = v1;
+            rows[0][2] = v2;
+            rows[1][0] = v3;
+            rows[1][1] = v4;
+            rows[1][2] = v5;
+            rows[2][0] = v6;
+            rows[2][1] = v7;
+            rows[2][2] = v8;
+            rows[3][0] = v9;
+            rows[3][1] = v10;
+            rows[3][2] = v11;
         }
     }
-    Matrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
+    Matrix(
+        T v0,
+        T v1,
+        T v2,
+        T v3,
+        T v4,
+        T v5,
+        T v6,
+        T v7,
+        T v8,
+        T v9,
+        T v10,
+        T v11,
+        T v12,
+        T v13,
+        T v14,
+        T v15)
     {
-        rows[0][0] = v0;  rows[0][1] = v1;  rows[0][2] = v2;  rows[0][3] = v3;
-        rows[1][0] = v4;  rows[1][1] = v5;  rows[1][2] = v6;  rows[1][3] = v7;
-        rows[2][0] = v8;  rows[2][1] = v9;  rows[2][2] = v10; rows[2][3] = v11;
-        rows[3][0] = v12; rows[3][1] = v13; rows[3][2] = v14; rows[3][3] = v15;
+        rows[0][0] = v0;
+        rows[0][1] = v1;
+        rows[0][2] = v2;
+        rows[0][3] = v3;
+        rows[1][0] = v4;
+        rows[1][1] = v5;
+        rows[1][2] = v6;
+        rows[1][3] = v7;
+        rows[2][0] = v8;
+        rows[2][1] = v9;
+        rows[2][2] = v10;
+        rows[2][3] = v11;
+        rows[3][0] = v12;
+        rows[3][1] = v13;
+        rows[3][2] = v14;
+        rows[3][3] = v15;
     }
 };
 
-#define SLANG_MATRIX_BINARY_OP(T, op) \
-    template<int R, int C> \
+#define SLANG_MATRIX_BINARY_OP(T, op)                                                         \
+    template<int R, int C>                                                                    \
     Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal, const Matrix<T, R, C>& other) \
-    { \
-        Matrix<T, R, C> result;\
-        for (int i = 0; i < R; i++) \
-            for (int j = 0; j < C; j++) \
-                result.rows[i][j] = thisVal.rows[i][j] op other.rows[i][j]; \
-        return result;\
+    {                                                                                         \
+        Matrix<T, R, C> result;                                                               \
+        for (int i = 0; i < R; i++)                                                           \
+            for (int j = 0; j < C; j++)                                                       \
+                result.rows[i][j] = thisVal.rows[i][j] op other.rows[i][j];                   \
+        return result;                                                                        \
     }
 
-#define SLANG_MATRIX_UNARY_OP(T, op) \
-    template<int R, int C> \
+#define SLANG_MATRIX_UNARY_OP(T, op)                            \
+    template<int R, int C>                                      \
     Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal) \
-    { \
-        Matrix<T, R, C> result;\
-        for (int i = 0; i < R; i++) \
-            for (int j = 0; j < C; j++) \
-                result[i].rows[i][j] = op thisVal.rows[i][j]; \
-        return result;\
-    }
-#define SLANG_INT_MATRIX_OPS(T) \
-    SLANG_MATRIX_BINARY_OP(T, +)\
-    SLANG_MATRIX_BINARY_OP(T, -)\
-    SLANG_MATRIX_BINARY_OP(T, *)\
-    SLANG_MATRIX_BINARY_OP(T, / )\
-    SLANG_MATRIX_BINARY_OP(T, &)\
-    SLANG_MATRIX_BINARY_OP(T, |)\
-    SLANG_MATRIX_BINARY_OP(T, &&)\
-    SLANG_MATRIX_BINARY_OP(T, ||)\
-    SLANG_MATRIX_BINARY_OP(T, ^)\
-    SLANG_MATRIX_BINARY_OP(T, %)\
-    SLANG_MATRIX_UNARY_OP(T, !)\
+    {                                                           \
+        Matrix<T, R, C> result;                                 \
+        for (int i = 0; i < R; i++)                             \
+            for (int j = 0; j < C; j++)                         \
+                result[i].rows[i][j] = op thisVal.rows[i][j];   \
+        return result;                                          \
+    }
+#define SLANG_INT_MATRIX_OPS(T)   \
+    SLANG_MATRIX_BINARY_OP(T, +)  \
+    SLANG_MATRIX_BINARY_OP(T, -)  \
+    SLANG_MATRIX_BINARY_OP(T, *)  \
+    SLANG_MATRIX_BINARY_OP(T, /)  \
+    SLANG_MATRIX_BINARY_OP(T, &)  \
+    SLANG_MATRIX_BINARY_OP(T, |)  \
+    SLANG_MATRIX_BINARY_OP(T, &&) \
+    SLANG_MATRIX_BINARY_OP(T, ||) \
+    SLANG_MATRIX_BINARY_OP(T, ^)  \
+    SLANG_MATRIX_BINARY_OP(T, %)  \
+    SLANG_MATRIX_UNARY_OP(T, !)   \
     SLANG_MATRIX_UNARY_OP(T, ~)
 #define SLANG_FLOAT_MATRIX_OPS(T) \
-    SLANG_MATRIX_BINARY_OP(T, +)\
-    SLANG_MATRIX_BINARY_OP(T, -)\
-    SLANG_MATRIX_BINARY_OP(T, *)\
-    SLANG_MATRIX_BINARY_OP(T, /)\
+    SLANG_MATRIX_BINARY_OP(T, +)  \
+    SLANG_MATRIX_BINARY_OP(T, -)  \
+    SLANG_MATRIX_BINARY_OP(T, *)  \
+    SLANG_MATRIX_BINARY_OP(T, /)  \
     SLANG_MATRIX_UNARY_OP(T, -)
 SLANG_INT_MATRIX_OPS(int)
 SLANG_INT_MATRIX_OPS(int8_t)
@@ -527,38 +622,38 @@ SLANG_INT_MATRIX_OPS(uint64_t)
 SLANG_FLOAT_MATRIX_OPS(float)
 SLANG_FLOAT_MATRIX_OPS(double)
 
-#define SLANG_MATRIX_INT_NEG_OP(T) \
-    template<int R, int C>\
+#define SLANG_MATRIX_INT_NEG_OP(T)                                        \
+    template<int R, int C>                                                \
     SLANG_FORCE_INLINE Matrix<T, R, C> operator-(Matrix<T, R, C> thisVal) \
-    { \
-        Matrix<T, R, C> result;\
-        for (int i = 0; i < R; i++) \
-            for (int j = 0; j < C; j++) \
-            result.rows[i][j] = 0 - thisVal.rows[i][j]; \
-        return result;\
-    }
-    SLANG_MATRIX_INT_NEG_OP(int)
-    SLANG_MATRIX_INT_NEG_OP(int8_t)
-    SLANG_MATRIX_INT_NEG_OP(int16_t)
-    SLANG_MATRIX_INT_NEG_OP(int64_t)
-    SLANG_MATRIX_INT_NEG_OP(uint)
-    SLANG_MATRIX_INT_NEG_OP(uint8_t)
-    SLANG_MATRIX_INT_NEG_OP(uint16_t)
-    SLANG_MATRIX_INT_NEG_OP(uint64_t)
-
-#define SLANG_FLOAT_MATRIX_MOD(T)\
-    template<int R, int C> \
+    {                                                                     \
+        Matrix<T, R, C> result;                                           \
+        for (int i = 0; i < R; i++)                                       \
+            for (int j = 0; j < C; j++)                                   \
+                result.rows[i][j] = 0 - thisVal.rows[i][j];               \
+        return result;                                                    \
+    }
+SLANG_MATRIX_INT_NEG_OP(int)
+SLANG_MATRIX_INT_NEG_OP(int8_t)
+SLANG_MATRIX_INT_NEG_OP(int16_t)
+SLANG_MATRIX_INT_NEG_OP(int64_t)
+SLANG_MATRIX_INT_NEG_OP(uint)
+SLANG_MATRIX_INT_NEG_OP(uint8_t)
+SLANG_MATRIX_INT_NEG_OP(uint16_t)
+SLANG_MATRIX_INT_NEG_OP(uint64_t)
+
+#define SLANG_FLOAT_MATRIX_MOD(T)                                                             \
+    template<int R, int C>                                                                    \
     SLANG_FORCE_INLINE Matrix<T, R, C> operator%(Matrix<T, R, C> left, Matrix<T, R, C> right) \
-    {\
-        Matrix<T, R, C> result;\
-        for (int i = 0; i < R; i++) \
-            for (int j = 0; j < C; j++) \
-                result.rows[i][j] = _slang_fmod(left.rows[i][j], right.rows[i][j]); \
-        return result;\
+    {                                                                                         \
+        Matrix<T, R, C> result;                                                               \
+        for (int i = 0; i < R; i++)                                                           \
+            for (int j = 0; j < C; j++)                                                       \
+                result.rows[i][j] = _slang_fmod(left.rows[i][j], right.rows[i][j]);           \
+        return result;                                                                        \
     }
 
-    SLANG_FLOAT_MATRIX_MOD(float)
-    SLANG_FLOAT_MATRIX_MOD(double)
+SLANG_FLOAT_MATRIX_MOD(float)
+SLANG_FLOAT_MATRIX_MOD(double)
 #undef SLANG_FLOAT_MATRIX_MOD
 #undef SLANG_MATRIX_BINARY_OP
 #undef SLANG_MATRIX_UNARY_OP
@@ -574,5 +669,3 @@ TResult slang_bit_cast(TInput val)
 }
 
 #endif
-
-
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h
index 3f805a8b7..010ab8d6c 100644
--- a/prelude/slang-cpp-types.h
+++ b/prelude/slang-cpp-types.h
@@ -2,11 +2,12 @@
 #define SLANG_PRELUDE_CPP_TYPES_H
 
 #ifdef SLANG_PRELUDE_NAMESPACE
-namespace SLANG_PRELUDE_NAMESPACE {
+namespace SLANG_PRELUDE_NAMESPACE
+{
 #endif
 
 #ifndef SLANG_FORCE_INLINE
-#    define SLANG_FORCE_INLINE inline
+#define SLANG_FORCE_INLINE inline
 #endif
 
 #include "slang-cpp-types-core.h"
@@ -23,8 +24,8 @@ typedef Vector<uint32_t, 2> uint2;
 typedef Vector<uint32_t, 3> uint3;
 typedef Vector<uint32_t, 4> uint4;
 
-// We can just map `NonUniformResourceIndex` type directly to the index type on CPU, as CPU does not require
-// any special handling around such accesses.
+// We can just map `NonUniformResourceIndex` type directly to the index type on CPU, as CPU does not
+// require any special handling around such accesses.
 typedef size_t NonUniformResourceIndex;
 
 // ----------------------------- ResourceType -----------------------------------------
@@ -32,47 +33,87 @@ typedef size_t NonUniformResourceIndex;
 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-structuredbuffer-getdimensions
 // Missing  Load(_In_  int  Location, _Out_ uint Status);
 
-template <typename T>
+template<typename T>
 struct RWStructuredBuffer
 {
-    SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
-    const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }  
-    void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); }
-  
+    SLANG_FORCE_INLINE T& operator[](size_t index) const
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
+    const T& Load(size_t index) const
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
+    void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride)
+    {
+        *outNumStructs = uint32_t(count);
+        *outStride = uint32_t(sizeof(T));
+    }
+
     T* data;
     size_t count;
 };
 
-template <typename T>
+template<typename T>
 struct StructuredBuffer
 {
-    SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
-    const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
-    void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); }
-    
+    SLANG_FORCE_INLINE const T& operator[](size_t index) const
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
+    const T& Load(size_t index) const
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
+    void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride)
+    {
+        *outNumStructs = uint32_t(count);
+        *outStride = uint32_t(sizeof(T));
+    }
+
     T* data;
     size_t count;
 };
 
 
-template <typename T>
+template<typename T>
 struct RWBuffer
 {
-    SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
-    const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
+    SLANG_FORCE_INLINE T& operator[](size_t index) const
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
+    const T& Load(size_t index) const
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
     void GetDimensions(uint32_t* outCount) { *outCount = uint32_t(count); }
-    
+
     T* data;
     size_t count;
 };
 
-template <typename T>
+template<typename T>
 struct Buffer
 {
-    SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
-    const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
+    SLANG_FORCE_INLINE const T& operator[](size_t index) const
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
+    const T& Load(size_t index) const
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
     void GetDimensions(uint32_t* outCount) { *outCount = uint32_t(count); }
-    
+
     T* data;
     size_t count;
 };
@@ -81,28 +122,28 @@ struct Buffer
 struct ByteAddressBuffer
 {
     void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); }
-    uint32_t Load(size_t index) const 
-    { 
+    uint32_t Load(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
-        return data[index >> 2]; 
+        return data[index >> 2];
     }
-    uint2 Load2(size_t index) const 
-    { 
+    uint2 Load2(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint2{data[dataIdx], data[dataIdx + 1]}; 
+        const size_t dataIdx = index >> 2;
+        return uint2{data[dataIdx], data[dataIdx + 1]};
     }
-    uint3 Load3(size_t index) const 
-    { 
+    uint3 Load3(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; 
+        const size_t dataIdx = index >> 2;
+        return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]};
     }
-    uint4 Load4(size_t index) const 
-    { 
+    uint4 Load4(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; 
+        const size_t dataIdx = index >> 2;
+        return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]};
     }
     template<typename T>
     T Load(size_t index) const
@@ -110,40 +151,40 @@ struct ByteAddressBuffer
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
         return *(const T*)(((const char*)data) + index);
     }
-    
+
     const uint32_t* data;
-    size_t sizeInBytes;  //< Must be multiple of 4
+    size_t sizeInBytes; //< Must be multiple of 4
 };
 
 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-rwbyteaddressbuffer
-// Missing support for Atomic operations 
+// Missing support for Atomic operations
 // Missing support for Load with status
 struct RWByteAddressBuffer
 {
     void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); }
-    
-    uint32_t Load(size_t index) const 
-    { 
+
+    uint32_t Load(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
-        return data[index >> 2]; 
+        return data[index >> 2];
     }
-    uint2 Load2(size_t index) const 
-    { 
+    uint2 Load2(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint2{data[dataIdx], data[dataIdx + 1]}; 
+        const size_t dataIdx = index >> 2;
+        return uint2{data[dataIdx], data[dataIdx + 1]};
     }
-    uint3 Load3(size_t index) const 
-    { 
+    uint3 Load3(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; 
+        const size_t dataIdx = index >> 2;
+        return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]};
     }
-    uint4 Load4(size_t index) const 
-    { 
+    uint4 Load4(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; 
+        const size_t dataIdx = index >> 2;
+        return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]};
     }
     template<typename T>
     T Load(size_t index) const
@@ -152,30 +193,30 @@ struct RWByteAddressBuffer
         return *(const T*)(((const char*)data) + index);
     }
 
-    void Store(size_t index, uint32_t v) const 
-    { 
+    void Store(size_t index, uint32_t v) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
-        data[index >> 2] = v; 
+        data[index >> 2] = v;
     }
-    void Store2(size_t index, uint2 v) const 
-    { 
+    void Store2(size_t index, uint2 v) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
+        const size_t dataIdx = index >> 2;
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
     }
-    void Store3(size_t index, uint3 v) const 
-    {  
+    void Store3(size_t index, uint3 v) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
+        const size_t dataIdx = index >> 2;
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
         data[dataIdx + 2] = v.z;
     }
-    void Store4(size_t index, uint4 v) const 
-    { 
+    void Store4(size_t index, uint4 v) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
+        const size_t dataIdx = index >> 2;
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
         data[dataIdx + 2] = v.z;
@@ -189,7 +230,7 @@ struct RWByteAddressBuffer
     }
 
     uint32_t* data;
-    size_t sizeInBytes; //< Must be multiple of 4 
+    size_t sizeInBytes; //< Must be multiple of 4
 };
 
 struct ISamplerState;
@@ -206,7 +247,7 @@ struct SamplerComparisonState
 };
 
 #ifndef SLANG_RESOURCE_SHAPE
-#    define SLANG_RESOURCE_SHAPE
+#define SLANG_RESOURCE_SHAPE
 typedef unsigned int SlangResourceShape;
 enum
 {
@@ -243,7 +284,7 @@ enum
 };
 #endif
 
-// 
+//
 struct TextureDimensions
 {
     void reset()
@@ -259,25 +300,25 @@ struct TextureDimensions
         int count = 0;
         switch (baseShape)
         {
-            case SLANG_TEXTURE_1D:
+        case SLANG_TEXTURE_1D:
             {
                 outDims[count++] = width;
                 break;
             }
-            case SLANG_TEXTURE_2D:
+        case SLANG_TEXTURE_2D:
             {
                 outDims[count++] = width;
                 outDims[count++] = height;
                 break;
             }
-            case SLANG_TEXTURE_3D:
+        case SLANG_TEXTURE_3D:
             {
                 outDims[count++] = width;
                 outDims[count++] = height;
                 outDims[count++] = depth;
                 break;
             }
-            case SLANG_TEXTURE_CUBE:
+        case SLANG_TEXTURE_CUBE:
             {
                 outDims[count++] = width;
                 outDims[count++] = height;
@@ -298,19 +339,19 @@ struct TextureDimensions
         int count = 0;
         switch (baseShape)
         {
-            case SLANG_TEXTURE_1D:
+        case SLANG_TEXTURE_1D:
             {
                 outDims[count++] = width;
                 break;
             }
-            case SLANG_TEXTURE_CUBE:
-            case SLANG_TEXTURE_2D:
+        case SLANG_TEXTURE_CUBE:
+        case SLANG_TEXTURE_2D:
             {
                 outDims[count++] = width;
                 outDims[count++] = height;
                 break;
             }
-            case SLANG_TEXTURE_3D:
+        case SLANG_TEXTURE_3D:
             {
                 outDims[count++] = width;
                 outDims[count++] = height;
@@ -345,97 +386,146 @@ struct TextureDimensions
     uint32_t shape;
     uint32_t width, height, depth;
     uint32_t numberOfLevels;
-    uint32_t arrayElementCount;                  ///< For array types, 0 otherwise
+    uint32_t arrayElementCount; ///< For array types, 0 otherwise
 };
 
 
-
-
-
 // Texture
 
 struct ITexture
 {
     virtual TextureDimensions GetDimensions(int mipLevel = -1) = 0;
     virtual void Load(const int32_t* v, void* outData, size_t dataSize) = 0;
-    virtual void Sample(SamplerState samplerState, const float* loc, void* outData, size_t dataSize) = 0;
-    virtual void SampleLevel(SamplerState samplerState, const float* loc, float level, void* outData, size_t dataSize) = 0;
+    virtual void Sample(
+        SamplerState samplerState,
+        const float* loc,
+        void* outData,
+        size_t dataSize) = 0;
+    virtual void SampleLevel(
+        SamplerState samplerState,
+        const float* loc,
+        float level,
+        void* outData,
+        size_t dataSize) = 0;
 };
 
-template <typename T>
+template<typename T>
 struct Texture1D
 {
     void GetDimensions(uint32_t* outWidth) { *outWidth = texture->GetDimensions().width; }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outNumberOfLevels) 
-    { 
-        auto dims = texture->GetDimensions(mipLevel); 
-        *outWidth = dims.width; 
-        *outNumberOfLevels = dims.numberOfLevels; 
+    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outNumberOfLevels)
+    {
+        auto dims = texture->GetDimensions(mipLevel);
+        *outWidth = dims.width;
+        *outNumberOfLevels = dims.numberOfLevels;
     }
-    
+
     void GetDimensions(float* outWidth) { *outWidth = texture->GetDimensions().width; }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outNumberOfLevels) 
-    { 
-        auto dims = texture->GetDimensions(mipLevel); 
-        *outWidth = dims.width; 
-        *outNumberOfLevels = dims.numberOfLevels; 
+    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outNumberOfLevels)
+    {
+        auto dims = texture->GetDimensions(mipLevel);
+        *outWidth = dims.width;
+        *outNumberOfLevels = dims.numberOfLevels;
+    }
+
+    T Load(const int2& loc) const
+    {
+        T out;
+        texture->Load(&loc.x, &out, sizeof(out));
+        return out;
     }
-    
-    T Load(const int2& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
-    T Sample(SamplerState samplerState, float loc) const { T out; texture->Sample(samplerState, &loc, &out, sizeof(out)); return out; }
-    T SampleLevel(SamplerState samplerState, float loc, float level) { T out; texture->SampleLevel(samplerState, &loc, level, &out, sizeof(out)); return out; }
-    
-    ITexture* texture;              
+    T Sample(SamplerState samplerState, float loc) const
+    {
+        T out;
+        texture->Sample(samplerState, &loc, &out, sizeof(out));
+        return out;
+    }
+    T SampleLevel(SamplerState samplerState, float loc, float level)
+    {
+        T out;
+        texture->SampleLevel(samplerState, &loc, level, &out, sizeof(out));
+        return out;
+    }
+
+    ITexture* texture;
 };
 
-template <typename T>
+template<typename T>
 struct Texture2D
 {
-    void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) 
-    { 
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+    void GetDimensions(uint32_t* outWidth, uint32_t* outHeight)
+    {
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outHeight,
+        uint32_t* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
         *outHeight = dims.height;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    void GetDimensions(float* outWidth, float* outHeight) 
-    { 
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+    void GetDimensions(float* outWidth, float* outHeight)
+    {
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outHeight,
+        float* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
         *outHeight = dims.height;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
-    T Sample(SamplerState samplerState, const float2& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
-    T SampleLevel(SamplerState samplerState, const float2& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
-    
-    ITexture* texture;              
+
+    T Load(const int3& loc) const
+    {
+        T out;
+        texture->Load(&loc.x, &out, sizeof(out));
+        return out;
+    }
+    T Sample(SamplerState samplerState, const float2& loc) const
+    {
+        T out;
+        texture->Sample(samplerState, &loc.x, &out, sizeof(out));
+        return out;
+    }
+    T SampleLevel(SamplerState samplerState, const float2& loc, float level)
+    {
+        T out;
+        texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out));
+        return out;
+    }
+
+    ITexture* texture;
 };
 
-template <typename T>
+template<typename T>
 struct Texture3D
 {
     void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outDepth)
     {
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
         *outDepth = dims.depth;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outDepth, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outHeight,
+        uint32_t* outDepth,
+        uint32_t* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -445,12 +535,17 @@ struct Texture3D
     }
     void GetDimensions(float* outWidth, float* outHeight, float* outDepth)
     {
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
         *outDepth = dims.depth;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outDepth, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outHeight,
+        float* outDepth,
+        float* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -458,78 +553,144 @@ struct Texture3D
         *outDepth = dims.depth;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    T Load(const int4& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
-    T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
-    T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
-    
-    ITexture* texture;              
+
+    T Load(const int4& loc) const
+    {
+        T out;
+        texture->Load(&loc.x, &out, sizeof(out));
+        return out;
+    }
+    T Sample(SamplerState samplerState, const float3& loc) const
+    {
+        T out;
+        texture->Sample(samplerState, &loc.x, &out, sizeof(out));
+        return out;
+    }
+    T SampleLevel(SamplerState samplerState, const float3& loc, float level)
+    {
+        T out;
+        texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out));
+        return out;
+    }
+
+    ITexture* texture;
 };
 
-template <typename T>
+template<typename T>
 struct TextureCube
 {
-    void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) 
-    { 
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+    void GetDimensions(uint32_t* outWidth, uint32_t* outHeight)
+    {
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outHeight,
+        uint32_t* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
         *outHeight = dims.height;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    void GetDimensions(float* outWidth, float* outHeight) 
-    { 
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+    void GetDimensions(float* outWidth, float* outHeight)
+    {
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outHeight,
+        float* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
         *outHeight = dims.height;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
-    T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
-    
-    ITexture* texture;              
+
+    T Sample(SamplerState samplerState, const float3& loc) const
+    {
+        T out;
+        texture->Sample(samplerState, &loc.x, &out, sizeof(out));
+        return out;
+    }
+    T SampleLevel(SamplerState samplerState, const float3& loc, float level)
+    {
+        T out;
+        texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out));
+        return out;
+    }
+
+    ITexture* texture;
 };
 
-template <typename T>
+template<typename T>
 struct Texture1DArray
 {
-    void GetDimensions(uint32_t* outWidth, uint32_t* outElements) { auto dims = texture->GetDimensions(); *outWidth = dims.width; *outElements = dims.arrayElementCount; }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outElements, uint32_t* outNumberOfLevels) 
+    void GetDimensions(uint32_t* outWidth, uint32_t* outElements)
     {
-        auto dims = texture->GetDimensions(mipLevel); 
-        *outWidth = dims.width; 
+        auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outElements = dims.arrayElementCount;
+    }
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outElements,
+        uint32_t* outNumberOfLevels)
+    {
+        auto dims = texture->GetDimensions(mipLevel);
+        *outWidth = dims.width;
         *outNumberOfLevels = dims.numberOfLevels;
-        *outElements = dims.arrayElementCount; 
-    }        
-    void GetDimensions(float* outWidth, float* outElements) { auto dims = texture->GetDimensions(); *outWidth = dims.width; *outElements = dims.arrayElementCount; }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outElements, float* outNumberOfLevels) 
+        *outElements = dims.arrayElementCount;
+    }
+    void GetDimensions(float* outWidth, float* outElements)
     {
-        auto dims = texture->GetDimensions(mipLevel); 
-        *outWidth = dims.width; 
+        auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outElements = dims.arrayElementCount;
+    }
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outElements,
+        float* outNumberOfLevels)
+    {
+        auto dims = texture->GetDimensions(mipLevel);
+        *outWidth = dims.width;
         *outNumberOfLevels = dims.numberOfLevels;
-        *outElements = dims.arrayElementCount; 
+        *outElements = dims.arrayElementCount;
+    }
+
+    T Load(const int3& loc) const
+    {
+        T out;
+        texture->Load(&loc.x, &out, sizeof(out));
+        return out;
+    }
+    T Sample(SamplerState samplerState, const float2& loc) const
+    {
+        T out;
+        texture->Sample(samplerState, &loc.x, &out, sizeof(out));
+        return out;
+    }
+    T SampleLevel(SamplerState samplerState, const float2& loc, float level)
+    {
+        T out;
+        texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out));
+        return out;
     }
-    
-    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
-    T Sample(SamplerState samplerState, const float2& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
-    T SampleLevel(SamplerState samplerState, const float2& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
-    
-    ITexture* texture;              
+
+    ITexture* texture;
 };
 
-template <typename T>
+template<typename T>
 struct Texture2DArray
 {
     void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements)
@@ -539,7 +700,12 @@ struct Texture2DArray
         *outHeight = dims.height;
         *outElements = dims.arrayElementCount;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outHeight,
+        uint32_t* outElements,
+        uint32_t* outNumberOfLevels)
     {
         auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -547,7 +713,7 @@ struct Texture2DArray
         *outElements = dims.arrayElementCount;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
+
     void GetDimensions(uint32_t* outWidth, float* outHeight, float* outElements)
     {
         auto dims = texture->GetDimensions();
@@ -555,7 +721,12 @@ struct Texture2DArray
         *outHeight = dims.height;
         *outElements = dims.arrayElementCount;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outElements, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outHeight,
+        float* outElements,
+        float* outNumberOfLevels)
     {
         auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -563,15 +734,30 @@ struct Texture2DArray
         *outElements = dims.arrayElementCount;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    T Load(const int4& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
-    T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
-    T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
-    
-    ITexture* texture;              
+
+    T Load(const int4& loc) const
+    {
+        T out;
+        texture->Load(&loc.x, &out, sizeof(out));
+        return out;
+    }
+    T Sample(SamplerState samplerState, const float3& loc) const
+    {
+        T out;
+        texture->Sample(samplerState, &loc.x, &out, sizeof(out));
+        return out;
+    }
+    T SampleLevel(SamplerState samplerState, const float3& loc, float level)
+    {
+        T out;
+        texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out));
+        return out;
+    }
+
+    ITexture* texture;
 };
 
-template <typename T>
+template<typename T>
 struct TextureCubeArray
 {
     void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements)
@@ -581,7 +767,12 @@ struct TextureCubeArray
         *outHeight = dims.height;
         *outElements = dims.arrayElementCount;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outHeight,
+        uint32_t* outElements,
+        uint32_t* outNumberOfLevels)
     {
         auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -589,7 +780,7 @@ struct TextureCubeArray
         *outElements = dims.arrayElementCount;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
+
     void GetDimensions(uint32_t* outWidth, float* outHeight, float* outElements)
     {
         auto dims = texture->GetDimensions();
@@ -597,7 +788,12 @@ struct TextureCubeArray
         *outHeight = dims.height;
         *outElements = dims.arrayElementCount;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outElements, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outHeight,
+        float* outElements,
+        float* outNumberOfLevels)
     {
         auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -605,81 +801,124 @@ struct TextureCubeArray
         *outElements = dims.arrayElementCount;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    T Sample(SamplerState samplerState, const float4& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
-    T SampleLevel(SamplerState samplerState, const float4& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
-    
-    ITexture* texture;              
+
+    T Sample(SamplerState samplerState, const float4& loc) const
+    {
+        T out;
+        texture->Sample(samplerState, &loc.x, &out, sizeof(out));
+        return out;
+    }
+    T SampleLevel(SamplerState samplerState, const float4& loc, float level)
+    {
+        T out;
+        texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out));
+        return out;
+    }
+
+    ITexture* texture;
 };
 
 /* !!!!!!!!!!!!!!!!!!!!!!!!!!! RWTexture !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */
 
 struct IRWTexture : ITexture
 {
-        /// Get the reference to the element at loc. 
+    /// Get the reference to the element at loc.
     virtual void* refAt(const uint32_t* loc) = 0;
 };
 
-template <typename T>
+template<typename T>
 struct RWTexture1D
 {
     void GetDimensions(uint32_t* outWidth) { *outWidth = texture->GetDimensions().width; }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outNumberOfLevels) { auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outNumberOfLevels = dims.numberOfLevels; }
-    
+    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outNumberOfLevels)
+    {
+        auto dims = texture->GetDimensions(mipLevel);
+        *outWidth = dims.width;
+        *outNumberOfLevels = dims.numberOfLevels;
+    }
+
     void GetDimensions(float* outWidth) { *outWidth = texture->GetDimensions().width; }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outNumberOfLevels) { auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outNumberOfLevels = dims.numberOfLevels; }
-    
-    T Load(int32_t loc) const { T out; texture->Load(&loc, &out, sizeof(out)); return out; }
+    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outNumberOfLevels)
+    {
+        auto dims = texture->GetDimensions(mipLevel);
+        *outWidth = dims.width;
+        *outNumberOfLevels = dims.numberOfLevels;
+    }
+
+    T Load(int32_t loc) const
+    {
+        T out;
+        texture->Load(&loc, &out, sizeof(out));
+        return out;
+    }
     T& operator[](uint32_t loc) { return *(T*)texture->refAt(&loc); }
-    IRWTexture* texture;              
+    IRWTexture* texture;
 };
 
-template <typename T>
+template<typename T>
 struct RWTexture2D
 {
-    void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) 
-    { 
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+    void GetDimensions(uint32_t* outWidth, uint32_t* outHeight)
+    {
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outHeight,
+        uint32_t* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
         *outHeight = dims.height;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    void GetDimensions(float* outWidth, float* outHeight) 
-    { 
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+    void GetDimensions(float* outWidth, float* outHeight)
+    {
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outHeight,
+        float* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
         *outHeight = dims.height;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    T Load(const int2& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
+
+    T Load(const int2& loc) const
+    {
+        T out;
+        texture->Load(&loc.x, &out, sizeof(out));
+        return out;
+    }
     T& operator[](const uint2& loc) { return *(T*)texture->refAt(&loc.x); }
     IRWTexture* texture;
 };
 
-template <typename T>
+template<typename T>
 struct RWTexture3D
 {
     void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outDepth)
     {
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
         *outDepth = dims.depth;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outDepth, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outHeight,
+        uint32_t* outDepth,
+        uint32_t* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -689,12 +928,17 @@ struct RWTexture3D
     }
     void GetDimensions(float* outWidth, float* outHeight, float* outDepth)
     {
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
         *outDepth = dims.depth;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outDepth, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outHeight,
+        float* outDepth,
+        float* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -702,60 +946,83 @@ struct RWTexture3D
         *outDepth = dims.depth;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
+
+    T Load(const int3& loc) const
+    {
+        T out;
+        texture->Load(&loc.x, &out, sizeof(out));
+        return out;
+    }
     T& operator[](const uint3& loc) { return *(T*)texture->refAt(&loc.x); }
     IRWTexture* texture;
 };
 
 
-template <typename T>
+template<typename T>
 struct RWTexture1DArray
 {
-    void GetDimensions(uint32_t* outWidth, uint32_t* outElements) 
-    { 
-        auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outElements = dims.arrayElementCount; 
+    void GetDimensions(uint32_t* outWidth, uint32_t* outElements)
+    {
+        auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outElements = dims.arrayElementCount;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outElements, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outElements,
+        uint32_t* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
         *outElements = dims.arrayElementCount;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    void GetDimensions(float* outWidth, float* outElements) 
-    { 
-        auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outElements = dims.arrayElementCount; 
+    void GetDimensions(float* outWidth, float* outElements)
+    {
+        auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outElements = dims.arrayElementCount;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outElements, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outElements,
+        float* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
         *outElements = dims.arrayElementCount;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    T Load(int2 loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
+
+    T Load(int2 loc) const
+    {
+        T out;
+        texture->Load(&loc.x, &out, sizeof(out));
+        return out;
+    }
     T& operator[](uint2 loc) { return *(T*)texture->refAt(&loc.x); }
 
     IRWTexture* texture;
 };
 
-template <typename T>
+template<typename T>
 struct RWTexture2DArray
 {
     void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements)
     {
-        auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
+        auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
         *outHeight = dims.height;
-        *outElements = dims.arrayElementCount; 
+        *outElements = dims.arrayElementCount;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outHeight,
+        uint32_t* outElements,
+        uint32_t* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -765,12 +1032,17 @@ struct RWTexture2DArray
     }
     void GetDimensions(float* outWidth, float* outHeight, float* outElements)
     {
-        auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
+        auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
         *outHeight = dims.height;
-        *outElements = dims.arrayElementCount; 
+        *outElements = dims.arrayElementCount;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outElements, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outHeight,
+        float* outElements,
+        float* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -778,8 +1050,13 @@ struct RWTexture2DArray
         *outElements = dims.arrayElementCount;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
+
+    T Load(const int3& loc) const
+    {
+        T out;
+        texture->Load(&loc.x, &out, sizeof(out));
+        return out;
+    }
     T& operator[](const uint3& loc) { return *(T*)texture->refAt(&loc.x); }
 
     IRWTexture* texture;
@@ -787,91 +1064,167 @@ struct RWTexture2DArray
 
 // FeedbackTexture
 
-struct FeedbackType {};
-struct SAMPLER_FEEDBACK_MIN_MIP : FeedbackType {};
-struct SAMPLER_FEEDBACK_MIP_REGION_USED : FeedbackType {};
+struct FeedbackType
+{
+};
+struct SAMPLER_FEEDBACK_MIN_MIP : FeedbackType
+{
+};
+struct SAMPLER_FEEDBACK_MIP_REGION_USED : FeedbackType
+{
+};
 
 struct IFeedbackTexture
 {
     virtual TextureDimensions GetDimensions(int mipLevel = -1) = 0;
 
-    // Note here we pass the optional clamp parameter as a pointer. Passing nullptr means no clamp. 
-    // This was preferred over having two function definitions, and having to differentiate their names
-    virtual void WriteSamplerFeedback(ITexture* tex, SamplerState samp, const float* location, const float* clamp = nullptr) = 0;
-    virtual void WriteSamplerFeedbackBias(ITexture* tex, SamplerState samp, const float* location, float bias, const float* clamp = nullptr) = 0;
-    virtual void WriteSamplerFeedbackGrad(ITexture* tex, SamplerState samp, const float* location, const float* ddx, const float* ddy, const float* clamp = nullptr) = 0;
-    
-    virtual void WriteSamplerFeedbackLevel(ITexture* tex, SamplerState samp, const float* location, float lod) = 0;
+    // Note here we pass the optional clamp parameter as a pointer. Passing nullptr means no clamp.
+    // This was preferred over having two function definitions, and having to differentiate their
+    // names
+    virtual void WriteSamplerFeedback(
+        ITexture* tex,
+        SamplerState samp,
+        const float* location,
+        const float* clamp = nullptr) = 0;
+    virtual void WriteSamplerFeedbackBias(
+        ITexture* tex,
+        SamplerState samp,
+        const float* location,
+        float bias,
+        const float* clamp = nullptr) = 0;
+    virtual void WriteSamplerFeedbackGrad(
+        ITexture* tex,
+        SamplerState samp,
+        const float* location,
+        const float* ddx,
+        const float* ddy,
+        const float* clamp = nullptr) = 0;
+
+    virtual void WriteSamplerFeedbackLevel(
+        ITexture* tex,
+        SamplerState samp,
+        const float* location,
+        float lod) = 0;
 };
 
-template <typename T>
+template<typename T>
 struct FeedbackTexture2D
 {
-    void GetDimensions(uint32_t* outWidth, uint32_t* outHeight) 
-    { 
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+    void GetDimensions(uint32_t* outWidth, uint32_t* outHeight)
+    {
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outHeight,
+        uint32_t* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
         *outHeight = dims.height;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    void GetDimensions(float* outWidth, float* outHeight) 
-    { 
-        const auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
-        *outHeight = dims.height; 
+    void GetDimensions(float* outWidth, float* outHeight)
+    {
+        const auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
+        *outHeight = dims.height;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outHeight,
+        float* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
         *outHeight = dims.height;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    template <typename S>
-    void WriteSamplerFeedback(Texture2D<S> tex, SamplerState samp, float2 location, float clamp) { texture->WriteSamplerFeedback(tex.texture, samp, &location.x, &clamp); } 
 
-    template <typename S>
-    void WriteSamplerFeedbackBias(Texture2D<S> tex, SamplerState samp, float2 location, float bias, float clamp) { texture->WriteSamplerFeedbackBias(tex.texture, samp, &location.x, bias, &clamp); }
+    template<typename S>
+    void WriteSamplerFeedback(Texture2D<S> tex, SamplerState samp, float2 location, float clamp)
+    {
+        texture->WriteSamplerFeedback(tex.texture, samp, &location.x, &clamp);
+    }
 
-    template <typename S>
-    void WriteSamplerFeedbackGrad(Texture2D<S> tex, SamplerState samp, float2 location, float2 ddx, float2 ddy, float clamp) { texture->WriteSamplerFeedbackGrad(tex.texture, samp, &location.x, &ddx.x, &ddy.x, &clamp); }
+    template<typename S>
+    void WriteSamplerFeedbackBias(
+        Texture2D<S> tex,
+        SamplerState samp,
+        float2 location,
+        float bias,
+        float clamp)
+    {
+        texture->WriteSamplerFeedbackBias(tex.texture, samp, &location.x, bias, &clamp);
+    }
+
+    template<typename S>
+    void WriteSamplerFeedbackGrad(
+        Texture2D<S> tex,
+        SamplerState samp,
+        float2 location,
+        float2 ddx,
+        float2 ddy,
+        float clamp)
+    {
+        texture->WriteSamplerFeedbackGrad(tex.texture, samp, &location.x, &ddx.x, &ddy.x, &clamp);
+    }
 
     // Level
 
-    template <typename S> 
-    void WriteSamplerFeedbackLevel(Texture2D<S> tex, SamplerState samp, float2 location, float lod) { texture->WriteSamplerFeedbackLevel(tex.texture, samp, &location.x, lod); }
-    
+    template<typename S>
+    void WriteSamplerFeedbackLevel(Texture2D<S> tex, SamplerState samp, float2 location, float lod)
+    {
+        texture->WriteSamplerFeedbackLevel(tex.texture, samp, &location.x, lod);
+    }
+
     // Without Clamp
-    template <typename S> 
-    void WriteSamplerFeedback(Texture2D<S> tex, SamplerState samp, float2 location) { texture->WriteSamplerFeedback(tex.texture, samp, &location.x); }
+    template<typename S>
+    void WriteSamplerFeedback(Texture2D<S> tex, SamplerState samp, float2 location)
+    {
+        texture->WriteSamplerFeedback(tex.texture, samp, &location.x);
+    }
+
+    template<typename S>
+    void WriteSamplerFeedbackBias(Texture2D<S> tex, SamplerState samp, float2 location, float bias)
+    {
+        texture->WriteSamplerFeedbackBias(tex.texture, samp, &location.x, bias);
+    }
 
-    template <typename S> 
-    void WriteSamplerFeedbackBias(Texture2D<S> tex, SamplerState samp, float2 location, float bias) { texture->WriteSamplerFeedbackBias(tex.texture, samp, &location.x, bias); }
+    template<typename S>
+    void WriteSamplerFeedbackGrad(
+        Texture2D<S> tex,
+        SamplerState samp,
+        float2 location,
+        float2 ddx,
+        float2 ddy)
+    {
+        texture->WriteSamplerFeedbackGrad(tex.texture, samp, &location.x, &ddx.x, &ddy.x);
+    }
 
-    template <typename S> 
-    void WriteSamplerFeedbackGrad(Texture2D<S> tex, SamplerState samp, float2 location, float2 ddx, float2 ddy) { texture->WriteSamplerFeedbackGrad(tex.texture, samp, &location.x, &ddx.x, &ddy.x); }
-    
     IFeedbackTexture* texture;
 };
 
-template <typename T>
+template<typename T>
 struct FeedbackTexture2DArray
 {
     void GetDimensions(uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements)
     {
-        auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
+        auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
         *outHeight = dims.height;
-        *outElements = dims.arrayElementCount; 
+        *outElements = dims.arrayElementCount;
     }
-    void GetDimensions(uint32_t mipLevel, uint32_t* outWidth, uint32_t* outHeight, uint32_t* outElements, uint32_t* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        uint32_t* outWidth,
+        uint32_t* outHeight,
+        uint32_t* outElements,
+        uint32_t* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -881,12 +1234,17 @@ struct FeedbackTexture2DArray
     }
     void GetDimensions(float* outWidth, float* outHeight, float* outElements)
     {
-        auto dims = texture->GetDimensions(); 
-        *outWidth = dims.width; 
+        auto dims = texture->GetDimensions();
+        *outWidth = dims.width;
         *outHeight = dims.height;
-        *outElements = dims.arrayElementCount; 
+        *outElements = dims.arrayElementCount;
     }
-    void GetDimensions(uint32_t mipLevel, float* outWidth, float* outHeight, float* outElements, float* outNumberOfLevels)
+    void GetDimensions(
+        uint32_t mipLevel,
+        float* outWidth,
+        float* outHeight,
+        float* outElements,
+        float* outNumberOfLevels)
     {
         const auto dims = texture->GetDimensions(mipLevel);
         *outWidth = dims.width;
@@ -894,31 +1252,81 @@ struct FeedbackTexture2DArray
         *outElements = dims.arrayElementCount;
         *outNumberOfLevels = dims.numberOfLevels;
     }
-    
-    template <typename S>
-    void WriteSamplerFeedback(Texture2DArray<S> texArray, SamplerState samp, float3 location, float clamp) { texture->WriteSamplerFeedback(texArray.texture, samp, &location.x, &clamp); }
 
-    template <typename S>
-    void WriteSamplerFeedbackBias(Texture2DArray<S> texArray, SamplerState samp, float3 location, float bias, float clamp) { texture->WriteSamplerFeedbackBias(texArray.texture, samp, &location.x, bias, &clamp); }
+    template<typename S>
+    void WriteSamplerFeedback(
+        Texture2DArray<S> texArray,
+        SamplerState samp,
+        float3 location,
+        float clamp)
+    {
+        texture->WriteSamplerFeedback(texArray.texture, samp, &location.x, &clamp);
+    }
+
+    template<typename S>
+    void WriteSamplerFeedbackBias(
+        Texture2DArray<S> texArray,
+        SamplerState samp,
+        float3 location,
+        float bias,
+        float clamp)
+    {
+        texture->WriteSamplerFeedbackBias(texArray.texture, samp, &location.x, bias, &clamp);
+    }
 
-    template <typename S>
-    void WriteSamplerFeedbackGrad(Texture2DArray<S> texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy, float clamp) { texture->WriteSamplerFeedbackGrad(texArray.texture, samp, &location.x, &ddx.x, &ddy.x, &clamp); }
+    template<typename S>
+    void WriteSamplerFeedbackGrad(
+        Texture2DArray<S> texArray,
+        SamplerState samp,
+        float3 location,
+        float3 ddx,
+        float3 ddy,
+        float clamp)
+    {
+        texture
+            ->WriteSamplerFeedbackGrad(texArray.texture, samp, &location.x, &ddx.x, &ddy.x, &clamp);
+    }
 
     // Level
-    template <typename S>
-    void WriteSamplerFeedbackLevel(Texture2DArray<S> texArray, SamplerState samp, float3 location, float lod) { texture->WriteSamplerFeedbackLevel(texArray.texture, samp, &location.x, lod); }
+    template<typename S>
+    void WriteSamplerFeedbackLevel(
+        Texture2DArray<S> texArray,
+        SamplerState samp,
+        float3 location,
+        float lod)
+    {
+        texture->WriteSamplerFeedbackLevel(texArray.texture, samp, &location.x, lod);
+    }
 
     // Without Clamp
 
-    template <typename S>
-    void WriteSamplerFeedback(Texture2DArray<S> texArray, SamplerState samp, float3 location) { texture->WriteSamplerFeedback(texArray.texture, samp, &location.x); }
+    template<typename S>
+    void WriteSamplerFeedback(Texture2DArray<S> texArray, SamplerState samp, float3 location)
+    {
+        texture->WriteSamplerFeedback(texArray.texture, samp, &location.x);
+    }
 
-    template <typename S>
-    void WriteSamplerFeedbackBias(Texture2DArray<S> texArray, SamplerState samp, float3 location, float bias) { texture->WriteSamplerFeedbackBias(texArray.texture, samp, &location.x, bias); }
+    template<typename S>
+    void WriteSamplerFeedbackBias(
+        Texture2DArray<S> texArray,
+        SamplerState samp,
+        float3 location,
+        float bias)
+    {
+        texture->WriteSamplerFeedbackBias(texArray.texture, samp, &location.x, bias);
+    }
+
+    template<typename S>
+    void WriteSamplerFeedbackGrad(
+        Texture2DArray<S> texArray,
+        SamplerState samp,
+        float3 location,
+        float3 ddx,
+        float3 ddy)
+    {
+        texture->WriteSamplerFeedbackGrad(texArray.texture, samp, &location.x, &ddx.x, &ddy.x);
+    }
 
-    template <typename S>
-    void WriteSamplerFeedbackGrad(Texture2DArray<S> texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy) { texture->WriteSamplerFeedbackGrad(texArray.texture, samp, &location.x, &ddx.x, &ddy.x); }
-    
     IFeedbackTexture* texture;
 };
 
@@ -933,20 +1341,24 @@ struct ComputeThreadVaryingInput
 
 struct ComputeVaryingInput
 {
-    uint3 startGroupID;     ///< start groupID
-    uint3 endGroupID;       ///< Non inclusive end groupID
+    uint3 startGroupID; ///< start groupID
+    uint3 endGroupID;   ///< Non inclusive end groupID
 };
 
-// The uniformEntryPointParams and uniformState must be set to structures that match layout that the kernel expects.
-// This can be determined via reflection for example.
+// The uniformEntryPointParams and uniformState must be set to structures that match the layout
+// that the kernel expects. This can be determined via reflection, for example.
 
-typedef void(*ComputeThreadFunc)(ComputeThreadVaryingInput* varyingInput, void* uniformEntryPointParams, void* uniformState);
-typedef void(*ComputeFunc)(ComputeVaryingInput* varyingInput, void* uniformEntryPointParams, void* uniformState);
+typedef void (*ComputeThreadFunc)(
+    ComputeThreadVaryingInput* varyingInput,
+    void* uniformEntryPointParams,
+    void* uniformState);
+typedef void (*ComputeFunc)(
+    ComputeVaryingInput* varyingInput,
+    void* uniformEntryPointParams,
+    void* uniformState);
 
 #ifdef SLANG_PRELUDE_NAMESPACE
 }
 #endif
 
 #endif
-
-
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index e0335f08a..9ac903955 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -15,51 +15,53 @@
 
 #endif
 
-// Define SLANG_CUDA_ENABLE_HALF to use the cuda_fp16 include to add half support. 
+// Define SLANG_CUDA_ENABLE_HALF to use the cuda_fp16 include to add half support.
 // For this to work NVRTC needs to have the path to the CUDA SDK.
 //
-// As it stands the includes paths defined for Slang are passed down to NVRTC. Similarly defines defined for the Slang compile
-// are passed down. 
+// As it stands the include paths defined for Slang are passed down to NVRTC. Similarly, defines
+// set for the Slang compile are passed down.
 
 #ifdef SLANG_CUDA_ENABLE_HALF
-// We don't want half2 operators, because it will implement comparison operators that return a bool(!). We want to generate
-// those functions. Doing so means that we will have to define all the other half2 operators.
-#   define __CUDA_NO_HALF2_OPERATORS__
-#   include <cuda_fp16.h>
+// We don't want half2 operators, because it will implement comparison operators that return a
+// bool(!). We want to generate those functions. Doing so means that we will have to define all
+// the other half2 operators.
+#define __CUDA_NO_HALF2_OPERATORS__
+#include <cuda_fp16.h>
 #endif
 
 #ifdef SLANG_CUDA_ENABLE_OPTIX
 #include <optix.h>
 #endif
 
-// Define slang offsetof implementation 
+// Define slang offsetof implementation
 #ifndef SLANG_OFFSET_OF
-#   define SLANG_OFFSET_OF(type, member) (size_t)((char*)&(((type *)0)->member) - (char*)0)
+#define SLANG_OFFSET_OF(type, member) (size_t)((char*)&(((type*)0)->member) - (char*)0)
 #endif
 
 #ifndef SLANG_ALIGN_OF
-#   define SLANG_ALIGN_OF(type) __alignof__(type)
+#define SLANG_ALIGN_OF(type) __alignof__(type)
 #endif
 
 // Must be large enough to cause overflow and therefore infinity
 #ifndef SLANG_INFINITY
-#   define SLANG_INFINITY   ((float)(1e+300 * 1e+300))
+#define SLANG_INFINITY ((float)(1e+300 * 1e+300))
 #endif
 
 // For now we'll disable any asserts in this prelude
-#define SLANG_PRELUDE_ASSERT(x) 
+#define SLANG_PRELUDE_ASSERT(x)
 
-#ifndef SLANG_CUDA_WARP_SIZE 
-#   define SLANG_CUDA_WARP_SIZE 32
+#ifndef SLANG_CUDA_WARP_SIZE
+#define SLANG_CUDA_WARP_SIZE 32
 #endif
 
-#define SLANG_CUDA_WARP_MASK (SLANG_CUDA_WARP_SIZE - 1) // Used for masking threadIdx.x to the warp lane index
+#define SLANG_CUDA_WARP_MASK \
+    (SLANG_CUDA_WARP_SIZE - 1) // Used for masking threadIdx.x to the warp lane index
 #define SLANG_CUDA_WARP_BITMASK (~int(0))
 
 //
 #define SLANG_FORCE_INLINE inline
 
-#define SLANG_CUDA_CALL __device__ 
+#define SLANG_CUDA_CALL __device__
 
 #define SLANG_FORCE_INLINE inline
 #define SLANG_INLINE inline
@@ -71,54 +73,63 @@
 
 // Asserts for bounds checking.
 // It is assumed index/count are unsigned types.
-#define SLANG_BOUND_ASSERT(index, count)  SLANG_PRELUDE_ASSERT(index < count); 
-#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0);
+#define SLANG_BOUND_ASSERT(index, count) SLANG_PRELUDE_ASSERT(index < count);
+#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) \
+    SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0);
 
 // Macros to zero index if an access is out of range
-#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; 
-#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) index = (index <= (sizeInBytes - elemSize)) ? index : 0; 
-
-// The 'FIX' macro define how the index is fixed. The default is to do nothing. If SLANG_ENABLE_BOUND_ZERO_INDEX
-// the fix macro will zero the index, if out of range
-#ifdef  SLANG_ENABLE_BOUND_ZERO_INDEX
-#   define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
-#   define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
-#   define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
+#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0;
+#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) \
+    index = (index <= (sizeInBytes - elemSize)) ? index : 0;
+
+// The 'FIX' macros define how the index is fixed. The default is to do nothing. If
+// SLANG_ENABLE_BOUND_ZERO_INDEX is defined, the fix macros zero the index if it is out of range.
+#ifdef SLANG_ENABLE_BOUND_ZERO_INDEX
+#define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
+#define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) \
+    SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
+#define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) \
+    SLANG_BOUND_ZERO_INDEX(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
 #else
-#   define SLANG_BOUND_FIX(index, count) 
-#   define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) 
-#   define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) 
+#define SLANG_BOUND_FIX(index, count)
+#define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
+#define SLANG_BOUND_FIX_FIXED_ARRAY(index, count)
 #endif
 
 #ifndef SLANG_BOUND_CHECK
-#   define SLANG_BOUND_CHECK(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count)
+#define SLANG_BOUND_CHECK(index, count) \
+    SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count)
 #endif
 
 #ifndef SLANG_BOUND_CHECK_BYTE_ADDRESS
-#   define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
+#define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) \
+    SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes)    \
+    SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
 #endif
 
 #ifndef SLANG_BOUND_CHECK_FIXED_ARRAY
-#   define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count)
+#define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) \
+    SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count)
 #endif
 
- // This macro handles how out-of-range surface coordinates are handled; 
- // I can equal
- // cudaBoundaryModeClamp, in which case out-of-range coordinates are clamped to the valid range
- // cudaBoundaryModeZero, in which case out-of-range reads return zero and out-of-range writes are ignored
- // cudaBoundaryModeTrap, in which case out-of-range accesses cause the kernel execution to fail. 
- 
+// This macro controls how out-of-range surface coordinates are handled.
+// It can equal
+// cudaBoundaryModeClamp, in which case out-of-range coordinates are clamped to the valid range
+// cudaBoundaryModeZero, in which case out-of-range reads return zero and out-of-range writes are
+// ignored
+// cudaBoundaryModeTrap, in which case out-of-range accesses cause the kernel execution to fail.
+
 #ifndef SLANG_CUDA_BOUNDARY_MODE
-#   define SLANG_CUDA_BOUNDARY_MODE cudaBoundaryModeZero
+#define SLANG_CUDA_BOUNDARY_MODE cudaBoundaryModeZero
 
 // Can be one of SLANG_CUDA_PTX_BOUNDARY_MODE. Only applies *PTX* emitted CUDA operations
 // which currently is just RWTextureRW format writes
-// 
+//
 // .trap         causes an execution trap on out-of-bounds addresses
 // .clamp        stores data at the nearest surface location (sized appropriately)
-// .zero         drops stores to out-of-bounds addresses 
+// .zero         drops stores to out-of-bounds addresses
 
-#   define SLANG_PTX_BOUNDARY_MODE "zero"
+#define SLANG_PTX_BOUNDARY_MODE "zero"
 #endif
 
 struct TypeInfo
@@ -126,51 +137,67 @@ struct TypeInfo
     size_t typeSize;
 };
 
-template <typename T, size_t SIZE>
+template<typename T, size_t SIZE>
 struct FixedArray
 {
-    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; }
-    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; }
-    
+    SLANG_CUDA_CALL const T& operator[](size_t index) const
+    {
+        SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE);
+        return m_data[index];
+    }
+    SLANG_CUDA_CALL T& operator[](size_t index)
+    {
+        SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE);
+        return m_data[index];
+    }
+
     T m_data[SIZE];
 };
 
-// An array that has no specified size, becomes a 'Array'. This stores the size so it can potentially 
-// do bounds checking.  
-template <typename T>
+// An array that has no specified size becomes an 'Array'. This stores the size so it can
+// potentially do bounds checking.
+template<typename T>
 struct Array
 {
-    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
-    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_BOUND_CHECK(index, count); return data[index]; }
-    
+    SLANG_CUDA_CALL const T& operator[](size_t index) const
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
+    SLANG_CUDA_CALL T& operator[](size_t index)
+    {
+        SLANG_BOUND_CHECK(index, count);
+        return data[index];
+    }
+
     T* data;
     size_t count;
 };
 
 // Typically defined in cuda.h, but we can't ship/rely on that, so just define here
-typedef unsigned long long CUtexObject;                   
-typedef unsigned long long CUsurfObject;                  
+typedef unsigned long long CUtexObject;
+typedef unsigned long long CUsurfObject;
 
-// On CUDA sampler state is actually bound up with the texture object. We have a SamplerState type, 
-// backed as a pointer, to simplify code generation, with the downside that such a binding will take up 
-// uniform space, even though it will have no effect. 
+// On CUDA sampler state is actually bound up with the texture object. We have a SamplerState type,
+// backed as a pointer, to simplify code generation, with the downside that such a binding will take
+// up uniform space, even though it will have no effect.
 // TODO(JS): Consider ways to strip use of variables of this type so have no binding,
 struct SamplerStateUnused;
 typedef SamplerStateUnused* SamplerState;
 
 
 // TODO(JS): Not clear yet if this can be handled on CUDA, by just ignoring.
-// For now, just map to the index type. 
+// For now, just map to the index type.
 typedef size_t NonUniformResourceIndex;
 
 // Code generator will generate the specific type
-template <typename T, int ROWS, int COLS>
+template<typename T, int ROWS, int COLS>
 struct Matrix;
 
 typedef int1 bool1;
 typedef int2 bool2;
 typedef int3 bool3;
-typedef int4 bool4; 
+typedef int4 bool4;
 
 #if SLANG_CUDA_RTC
 
@@ -193,7 +220,7 @@ typedef unsigned char uchar;
 typedef unsigned short ushort;
 typedef unsigned int uint;
 
-union Union32 
+union Union32
 {
     uint32_t u;
     int32_t i;
@@ -225,16 +252,37 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL double _slang_fmod(double x, double y)
 #if SLANG_CUDA_ENABLE_HALF
 
 // Add the other vector half types
-struct __half1 { __half x; };
-struct __align__(4) __half3 { __half x, y, z; };
-struct __align__(4) __half4 { __half x, y, z, w; };
+struct __half1
+{
+    __half x;
+};
+struct __align__(4) __half3
+{
+    __half x, y, z;
+};
+struct __align__(4) __half4
+{
+    __half x, y, z, w;
+};
 #endif
 
-#define SLANG_VECTOR_GET_ELEMENT(T) \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##1 x, int index) { return ((T*)(&x))[index]; }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##2 x, int index) { return ((T*)(&x))[index]; }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##3 x, int index) { return ((T*)(&x))[index]; }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##4 x, int index) { return ((T*)(&x))[index]; }
+#define SLANG_VECTOR_GET_ELEMENT(T)                                                   \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##1 x, int index) \
+    {                                                                                 \
+        return ((T*)(&x))[index];                                                     \
+    }                                                                                 \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##2 x, int index) \
+    {                                                                                 \
+        return ((T*)(&x))[index];                                                     \
+    }                                                                                 \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##3 x, int index) \
+    {                                                                                 \
+        return ((T*)(&x))[index];                                                     \
+    }                                                                                 \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##4 x, int index) \
+    {                                                                                 \
+        return ((T*)(&x))[index];                                                     \
+    }
 SLANG_VECTOR_GET_ELEMENT(int)
 SLANG_VECTOR_GET_ELEMENT(uint)
 SLANG_VECTOR_GET_ELEMENT(short)
@@ -246,11 +294,23 @@ SLANG_VECTOR_GET_ELEMENT(ulonglong)
 SLANG_VECTOR_GET_ELEMENT(float)
 SLANG_VECTOR_GET_ELEMENT(double)
 
-#define SLANG_VECTOR_GET_ELEMENT_PTR(T) \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##1* x, int index) { return ((T*)(x)) + index; }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##2* x, int index) { return ((T*)(x)) + index; }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##3* x, int index) { return ((T*)(x)) + index; }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##4* x, int index) { return ((T*)(x)) + index; }
+#define SLANG_VECTOR_GET_ELEMENT_PTR(T)                                                      \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##1 * x, int index) \
+    {                                                                                        \
+        return ((T*)(x)) + index;                                                            \
+    }                                                                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##2 * x, int index) \
+    {                                                                                        \
+        return ((T*)(x)) + index;                                                            \
+    }                                                                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##3 * x, int index) \
+    {                                                                                        \
+        return ((T*)(x)) + index;                                                            \
+    }                                                                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##4 * x, int index) \
+    {                                                                                        \
+        return ((T*)(x)) + index;                                                            \
+    }
 SLANG_VECTOR_GET_ELEMENT_PTR(int)
 SLANG_VECTOR_GET_ELEMENT_PTR(uint)
 SLANG_VECTOR_GET_ELEMENT_PTR(short)
@@ -267,57 +327,60 @@ SLANG_VECTOR_GET_ELEMENT(__half)
 SLANG_VECTOR_GET_ELEMENT_PTR(__half)
 #endif
 
-#define SLANG_CUDA_VECTOR_BINARY_OP(T, n, op) \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal, T##n other) \
-    { \
-        T##n result;\
-        for (int i = 0; i < n; i++) \
-            *_slang_vector_get_element_ptr(&result, i) = _slang_vector_get_element(thisVal,i) op _slang_vector_get_element(other,i); \
-        return result;\
+#define SLANG_CUDA_VECTOR_BINARY_OP(T, n, op)                                                 \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal, T##n other)             \
+    {                                                                                         \
+        T##n result;                                                                          \
+        for (int i = 0; i < n; i++)                                                           \
+            *_slang_vector_get_element_ptr(&result, i) =                                      \
+                _slang_vector_get_element(thisVal, i) op _slang_vector_get_element(other, i); \
+        return result;                                                                        \
     }
-#define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op) \
+#define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op)                                \
     SLANG_FORCE_INLINE SLANG_CUDA_CALL bool##n operator op(T##n thisVal, T##n other) \
-    { \
-        bool##n result;\
-        for (int i = 0; i < n; i++) \
-            *_slang_vector_get_element_ptr(&result, i) = (int)(_slang_vector_get_element(thisVal,i) op _slang_vector_get_element(other,i)); \
-        return result;\
-    }
-#define SLANG_CUDA_VECTOR_UNARY_OP(T, n, op) \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal) \
-    { \
-        T##n result;\
-        for (int i = 0; i < n; i++) \
-            *_slang_vector_get_element_ptr(&result, i) = op _slang_vector_get_element(thisVal,i); \
-        return result;\
-    }
-
-#define SLANG_CUDA_VECTOR_INT_OP(T, n) \
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, %)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, ^)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, &)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, |)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, >>)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, <<)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=)\
-    SLANG_CUDA_VECTOR_UNARY_OP(T, n, !)\
-    SLANG_CUDA_VECTOR_UNARY_OP(T, n, -)\
+    {                                                                                \
+        bool##n result;                                                              \
+        for (int i = 0; i < n; i++)                                                  \
+            *_slang_vector_get_element_ptr(&result, i) =                             \
+                (int)(_slang_vector_get_element(thisVal, i)                          \
+                          op _slang_vector_get_element(other, i));                   \
+        return result;                                                               \
+    }
+#define SLANG_CUDA_VECTOR_UNARY_OP(T, n, op)                                                       \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal)                              \
+    {                                                                                              \
+        T##n result;                                                                               \
+        for (int i = 0; i < n; i++)                                                                \
+            *_slang_vector_get_element_ptr(&result, i) = op _slang_vector_get_element(thisVal, i); \
+        return result;                                                                             \
+    }
+
+#define SLANG_CUDA_VECTOR_INT_OP(T, n)            \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, %)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, ^)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, &)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, |)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)         \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)         \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, >>)         \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, <<)         \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)  \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)  \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=) \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=) \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==) \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=) \
+    SLANG_CUDA_VECTOR_UNARY_OP(T, n, !)           \
+    SLANG_CUDA_VECTOR_UNARY_OP(T, n, -)           \
     SLANG_CUDA_VECTOR_UNARY_OP(T, n, ~)
 
 #define SLANG_CUDA_VECTOR_INT_OPS(T) \
-    SLANG_CUDA_VECTOR_INT_OP(T, 2) \
-    SLANG_CUDA_VECTOR_INT_OP(T, 3) \
+    SLANG_CUDA_VECTOR_INT_OP(T, 2)   \
+    SLANG_CUDA_VECTOR_INT_OP(T, 3)   \
     SLANG_CUDA_VECTOR_INT_OP(T, 4)
 
 SLANG_CUDA_VECTOR_INT_OPS(int)
@@ -329,23 +392,23 @@ SLANG_CUDA_VECTOR_INT_OPS(uchar)
 SLANG_CUDA_VECTOR_INT_OPS(longlong)
 SLANG_CUDA_VECTOR_INT_OPS(ulonglong)
 
-#define SLANG_CUDA_VECTOR_FLOAT_OP(T, n) \
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)\
-    SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==)\
-    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=)\
+#define SLANG_CUDA_VECTOR_FLOAT_OP(T, n)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)          \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)         \
+    SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)         \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)  \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)  \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=) \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=) \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==) \
+    SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=) \
     SLANG_CUDA_VECTOR_UNARY_OP(T, n, -)
 #define SLANG_CUDA_VECTOR_FLOAT_OPS(T) \
-    SLANG_CUDA_VECTOR_FLOAT_OP(T, 2) \
-    SLANG_CUDA_VECTOR_FLOAT_OP(T, 3) \
+    SLANG_CUDA_VECTOR_FLOAT_OP(T, 2)   \
+    SLANG_CUDA_VECTOR_FLOAT_OP(T, 3)   \
     SLANG_CUDA_VECTOR_FLOAT_OP(T, 4)
 
 SLANG_CUDA_VECTOR_FLOAT_OPS(float)
@@ -353,27 +416,38 @@ SLANG_CUDA_VECTOR_FLOAT_OPS(double)
 #if SLANG_CUDA_ENABLE_HALF
 SLANG_CUDA_VECTOR_FLOAT_OPS(__half)
 #endif
-#define SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, n)\
+#define SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, n)                                             \
     SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator%(const T##n& left, const T##n& right) \
-    {\
-        T##n result;\
-        for (int i = 0; i < n; i++) \
-            *_slang_vector_get_element_ptr(&result, i) = _slang_fmod(_slang_vector_get_element(left,i), _slang_vector_get_element(right,i)); \
-        return result;\
-    }
-#define SLANG_CUDA_FLOAT_VECTOR_MOD(T) \
-    SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 2)\
-    SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 3)\
+    {                                                                                      \
+        T##n result;                                                                       \
+        for (int i = 0; i < n; i++)                                                        \
+            *_slang_vector_get_element_ptr(&result, i) = _slang_fmod(                      \
+                _slang_vector_get_element(left, i),                                        \
+                _slang_vector_get_element(right, i));                                      \
+        return result;                                                                     \
+    }
+#define SLANG_CUDA_FLOAT_VECTOR_MOD(T)     \
+    SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 2) \
+    SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 3) \
     SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 4)
 
 SLANG_CUDA_FLOAT_VECTOR_MOD(float)
 SLANG_CUDA_FLOAT_VECTOR_MOD(double)
 
 #if SLANG_CUDA_RTC || SLANG_CUDA_ENABLE_HALF
-#define SLANG_MAKE_VECTOR(T) \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x, T y) { return T##2{x, y}; }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x, T y, T z) { return T##3{ x, y, z }; }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x, T y, T z, T w) { return T##4{ x, y, z, w }; }
+#define SLANG_MAKE_VECTOR(T)                                                \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x, T y)           \
+    {                                                                       \
+        return T##2 {x, y};                                                 \
+    }                                                                       \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x, T y, T z)      \
+    {                                                                       \
+        return T##3 {x, y, z};                                              \
+    }                                                                       \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x, T y, T z, T w) \
+    {                                                                       \
+        return T##4 {x, y, z, w};                                           \
+    }
 #endif
 
 #if SLANG_CUDA_RTC
@@ -393,25 +467,67 @@ SLANG_MAKE_VECTOR(ulonglong)
 SLANG_MAKE_VECTOR(__half)
 #endif
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool1 make_bool1(bool x) { return bool1{ x }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 make_bool2(bool x, bool y) { return bool2{ x, y }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool3 make_bool3(bool x, bool y, bool z) { return bool3{ x, y, z }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 make_bool4(bool x, bool y, bool z, bool w) { return bool4{ x, y, z, w }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 make_bool2(bool x) { return bool2{ x, x }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool3 make_bool3(bool x) { return bool3{ x, x, x }; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 make_bool4(bool x) { return bool4{ x, x, x, x }; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool1 make_bool1(bool x)
+{
+    return bool1{x};
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 make_bool2(bool x, bool y)
+{
+    return bool2{x, y};
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool3 make_bool3(bool x, bool y, bool z)
+{
+    return bool3{x, y, z};
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 make_bool4(bool x, bool y, bool z, bool w)
+{
+    return bool4{x, y, z, w};
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 make_bool2(bool x)
+{
+    return bool2{x, x};
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool3 make_bool3(bool x)
+{
+    return bool3{x, x, x};
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 make_bool4(bool x)
+{
+    return bool4{x, x, x, x};
+}
 
 #if SLANG_CUDA_RTC
-#define SLANG_MAKE_VECTOR_FROM_SCALAR(T) \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##1 make_##T##1(T x) { return T##1{x}; }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) { return make_##T##2(x, x); }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) { return make_##T##3(x, x, x); }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) { return make_##T##4(x, x, x, x); }
+#define SLANG_MAKE_VECTOR_FROM_SCALAR(T)                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##1 make_##T##1(T x) \
+    {                                                        \
+        return T##1 {x};                                     \
+    }                                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) \
+    {                                                        \
+        return make_##T##2(x, x);                            \
+    }                                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) \
+    {                                                        \
+        return make_##T##3(x, x, x);                         \
+    }                                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) \
+    {                                                        \
+        return make_##T##4(x, x, x, x);                      \
+    }
 #else
-#define SLANG_MAKE_VECTOR_FROM_SCALAR(T) \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) { return make_##T##2(x, x); }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) { return make_##T##3(x, x, x); }\
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) { return make_##T##4(x, x, x, x); }
+#define SLANG_MAKE_VECTOR_FROM_SCALAR(T)                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) \
+    {                                                        \
+        return make_##T##2(x, x);                            \
+    }                                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) \
+    {                                                        \
+        return make_##T##3(x, x, x);                         \
+    }                                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) \
+    {                                                        \
+        return make_##T##4(x, x, x, x);                      \
+    }
 #endif
 SLANG_MAKE_VECTOR_FROM_SCALAR(int)
 SLANG_MAKE_VECTOR_FROM_SCALAR(uint)
@@ -426,18 +542,22 @@ SLANG_MAKE_VECTOR_FROM_SCALAR(double)
 #if SLANG_CUDA_ENABLE_HALF
 SLANG_MAKE_VECTOR_FROM_SCALAR(__half)
 #if !SLANG_CUDA_RTC
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half1 make___half1(__half x) { return __half1{x}; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half1 make___half1(__half x)
+{
+    return __half1{x};
+}
 #endif
 #endif
 
-#define SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(Fn,T,N) \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##N Fn(T##N* address, T##N val) \
-    {\
-        T##N result; \
-        for (int i = 0; i < N; i++) \
-            *_slang_vector_get_element_ptr(&result, i) = Fn(_slang_vector_get_element_ptr(address, i), _slang_vector_get_element(val, i)); \
-        return result; \
-    }\
+#define SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(Fn, T, N)                                            \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T##N Fn(T##N* address, T##N val)                           \
+    {                                                                                             \
+        T##N result;                                                                              \
+        for (int i = 0; i < N; i++)                                                               \
+            *_slang_vector_get_element_ptr(&result, i) =                                          \
+                Fn(_slang_vector_get_element_ptr(address, i), _slang_vector_get_element(val, i)); \
+        return result;                                                                            \
+    }
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900
 SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, float, 2)
@@ -455,19 +575,24 @@ SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, ulonglong, 3)
 SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, ulonglong, 4)
 
 template<typename T, int n>
-struct GetVectorTypeImpl {};
-
-#define GET_VECTOR_TYPE_IMPL(T, n)\
-template<>\
-struct GetVectorTypeImpl<T,n>\
-{\
-    typedef T##n type;\
-    static SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n fromScalar(T v) { return make_##T##n(v); } \
+struct GetVectorTypeImpl
+{
 };
-#define GET_VECTOR_TYPE_IMPL_N(T)\
-    GET_VECTOR_TYPE_IMPL(T, 1)\
-    GET_VECTOR_TYPE_IMPL(T, 2)\
-    GET_VECTOR_TYPE_IMPL(T, 3)\
+
+#define GET_VECTOR_TYPE_IMPL(T, n)                                     \
+    template<>                                                         \
+    struct GetVectorTypeImpl<T, n>                                     \
+    {                                                                  \
+        typedef T##n type;                                             \
+        static SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n fromScalar(T v) \
+        {                                                              \
+            return make_##T##n(v);                                     \
+        }                                                              \
+    };
+#define GET_VECTOR_TYPE_IMPL_N(T) \
+    GET_VECTOR_TYPE_IMPL(T, 1)    \
+    GET_VECTOR_TYPE_IMPL(T, 2)    \
+    GET_VECTOR_TYPE_IMPL(T, 3)    \
     GET_VECTOR_TYPE_IMPL(T, 4)
 
 GET_VECTOR_TYPE_IMPL_N(int)
@@ -500,11 +625,14 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, n> _slang_vector_reshape(const Vect
     return result;
 }
 
-template <typename T, int ROWS, int COLS>
+template<typename T, int ROWS, int COLS>
 struct Matrix
 {
     Vector<T, COLS> rows[ROWS];
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, COLS>& operator[](size_t index) { return rows[index]; }
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, COLS>& operator[](size_t index)
+    {
+        return rows[index];
+    }
 };
 
 
@@ -515,7 +643,6 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T scalar)
     for (int i = 0; i < ROWS; i++)
         result.rows[i] = GetVectorTypeImpl<T, COLS>::fromScalar(scalar);
     return result;
-
 }
 
 template<typename T, int ROWS, int COLS>
@@ -527,7 +654,9 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector
 }
 
 template<typename T, int ROWS, int COLS>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(
+    const Vector<T, COLS>& row0,
+    const Vector<T, COLS>& row1)
 {
     Matrix<T, ROWS, COLS> result;
     result.rows[0] = row0;
@@ -536,7 +665,10 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector
 }
 
 template<typename T, int ROWS, int COLS>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(
+    const Vector<T, COLS>& row0,
+    const Vector<T, COLS>& row1,
+    const Vector<T, COLS>& row2)
 {
     Matrix<T, ROWS, COLS> result;
     result.rows[0] = row0;
@@ -546,7 +678,11 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector
 }
 
 template<typename T, int ROWS, int COLS>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2, const Vector<T, COLS>& row3)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(
+    const Vector<T, COLS>& row0,
+    const Vector<T, COLS>& row1,
+    const Vector<T, COLS>& row2,
+    const Vector<T, COLS>& row3)
 {
     Matrix<T, ROWS, COLS> result;
     result.rows[0] = row0;
@@ -557,16 +693,20 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector
 }
 
 template<typename T, int ROWS, int COLS, typename U, int otherRow, int otherCol>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Matrix<U, otherRow, otherCol>& other)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(
+    const Matrix<U, otherRow, otherCol>& other)
 {
     Matrix<T, ROWS, COLS> result;
     int minRow = ROWS;
     int minCol = COLS;
-    if (minRow > otherRow) minRow = otherRow;
-    if (minCol > otherCol) minCol = otherCol;
+    if (minRow > otherRow)
+        minRow = otherRow;
+    if (minCol > otherCol)
+        minCol = otherCol;
     for (int i = 0; i < minRow; i++)
         for (int j = 0; j < minCol; j++)
-            *_slang_vector_get_element_ptr(result.rows + i, j) = (T)_slang_vector_get_element(other.rows[i], j);
+            *_slang_vector_get_element_ptr(result.rows + i, j) =
+                (T)_slang_vector_get_element(other.rows[i], j);
     return result;
 }
 
@@ -574,129 +714,238 @@ template<typename T, int ROWS, int COLS>
 SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3)
 {
     Matrix<T, ROWS, COLS> rs;
-    rs.rows[0].x = v0;  rs.rows[0].y = v1;
-    rs.rows[1].x = v2;  rs.rows[1].y = v3;
+    rs.rows[0].x = v0;
+    rs.rows[0].y = v1;
+    rs.rows[1].x = v2;
+    rs.rows[1].y = v3;
     return rs;
 }
 
 template<typename T, int ROWS, int COLS>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(
+    T v0,
+    T v1,
+    T v2,
+    T v3,
+    T v4,
+    T v5)
 {
     Matrix<T, ROWS, COLS> rs;
     if (COLS == 3)
     {
-        rs.rows[0].x = v0;  rs.rows[0].y = v1; rs.rows[0].z = v2;
-        rs.rows[1].x = v3;  rs.rows[1].y = v4; rs.rows[1].z = v5;
+        rs.rows[0].x = v0;
+        rs.rows[0].y = v1;
+        rs.rows[0].z = v2;
+        rs.rows[1].x = v3;
+        rs.rows[1].y = v4;
+        rs.rows[1].z = v5;
     }
     else
     {
-        rs.rows[0].x = v0;  rs.rows[0].y = v1;
-        rs.rows[1].x = v2;  rs.rows[1].y = v3;
-        rs.rows[2].x = v4;  rs.rows[2].y = v5;
+        rs.rows[0].x = v0;
+        rs.rows[0].y = v1;
+        rs.rows[1].x = v2;
+        rs.rows[1].y = v3;
+        rs.rows[2].x = v4;
+        rs.rows[2].y = v5;
     }
     return rs;
-
 }
 
 template<typename T, int ROWS, int COLS>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(
+    T v0,
+    T v1,
+    T v2,
+    T v3,
+    T v4,
+    T v5,
+    T v6,
+    T v7)
 {
     Matrix<T, ROWS, COLS> rs;
     if (COLS == 4)
     {
-        rs.rows[0].x = v0;  rs.rows[0].y = v1; rs.rows[0].z = v2; rs.rows[0].w = v3;
-        rs.rows[1].x = v4;  rs.rows[1].y = v5; rs.rows[1].z = v6; rs.rows[1].w = v7;
+        rs.rows[0].x = v0;
+        rs.rows[0].y = v1;
+        rs.rows[0].z = v2;
+        rs.rows[0].w = v3;
+        rs.rows[1].x = v4;
+        rs.rows[1].y = v5;
+        rs.rows[1].z = v6;
+        rs.rows[1].w = v7;
     }
     else
     {
-        rs.rows[0].x = v0;  rs.rows[0].y = v1;
-        rs.rows[1].x = v2;  rs.rows[1].y = v3;
-        rs.rows[2].x = v4;  rs.rows[2].y = v5;
-        rs.rows[3].x = v6;  rs.rows[3].y = v7;
+        rs.rows[0].x = v0;
+        rs.rows[0].y = v1;
+        rs.rows[1].x = v2;
+        rs.rows[1].y = v3;
+        rs.rows[2].x = v4;
+        rs.rows[2].y = v5;
+        rs.rows[3].x = v6;
+        rs.rows[3].y = v7;
     }
     return rs;
 }
 
 template<typename T, int ROWS, int COLS>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(
+    T v0,
+    T v1,
+    T v2,
+    T v3,
+    T v4,
+    T v5,
+    T v6,
+    T v7,
+    T v8)
 {
     Matrix<T, ROWS, COLS> rs;
-    rs.rows[0].x = v0;  rs.rows[0].y = v1;  rs.rows[0].z = v2;
-    rs.rows[1].x = v3;  rs.rows[1].y = v4;  rs.rows[1].z = v5;
-    rs.rows[2].x = v6;  rs.rows[2].y = v7;  rs.rows[2].z = v8;
+    rs.rows[0].x = v0;
+    rs.rows[0].y = v1;
+    rs.rows[0].z = v2;
+    rs.rows[1].x = v3;
+    rs.rows[1].y = v4;
+    rs.rows[1].z = v5;
+    rs.rows[2].x = v6;
+    rs.rows[2].y = v7;
+    rs.rows[2].z = v8;
     return rs;
 }
 
 template<typename T, int ROWS, int COLS>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(
+    T v0,
+    T v1,
+    T v2,
+    T v3,
+    T v4,
+    T v5,
+    T v6,
+    T v7,
+    T v8,
+    T v9,
+    T v10,
+    T v11)
 {
     Matrix<T, ROWS, COLS> rs;
     if (COLS == 4)
     {
-        rs.rows[0].x = v0;  rs.rows[0].y = v1;  rs.rows[0].z = v2;  rs.rows[0].w = v3;
-        rs.rows[1].x = v4;  rs.rows[1].y = v5;  rs.rows[1].z = v6;  rs.rows[1].w = v7;
-        rs.rows[2].x = v8;  rs.rows[2].y = v9;  rs.rows[2].z = v10; rs.rows[2].w = v11;
+        rs.rows[0].x = v0;
+        rs.rows[0].y = v1;
+        rs.rows[0].z = v2;
+        rs.rows[0].w = v3;
+        rs.rows[1].x = v4;
+        rs.rows[1].y = v5;
+        rs.rows[1].z = v6;
+        rs.rows[1].w = v7;
+        rs.rows[2].x = v8;
+        rs.rows[2].y = v9;
+        rs.rows[2].z = v10;
+        rs.rows[2].w = v11;
     }
     else
     {
-        rs.rows[0].x = v0;  rs.rows[0].y = v1;  rs.rows[0].z = v2;
-        rs.rows[1].x = v3;  rs.rows[1].y = v4;  rs.rows[1].z = v5;
-        rs.rows[2].x = v6;  rs.rows[2].y = v7;  rs.rows[2].z = v8;
-        rs.rows[3].x = v9;  rs.rows[3].y = v10; rs.rows[3].z = v11;
+        rs.rows[0].x = v0;
+        rs.rows[0].y = v1;
+        rs.rows[0].z = v2;
+        rs.rows[1].x = v3;
+        rs.rows[1].y = v4;
+        rs.rows[1].z = v5;
+        rs.rows[2].x = v6;
+        rs.rows[2].y = v7;
+        rs.rows[2].z = v8;
+        rs.rows[3].x = v9;
+        rs.rows[3].y = v10;
+        rs.rows[3].z = v11;
     }
     return rs;
 }
 
 template<typename T, int ROWS, int COLS>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(
+    T v0,
+    T v1,
+    T v2,
+    T v3,
+    T v4,
+    T v5,
+    T v6,
+    T v7,
+    T v8,
+    T v9,
+    T v10,
+    T v11,
+    T v12,
+    T v13,
+    T v14,
+    T v15)
 {
     Matrix<T, ROWS, COLS> rs;
-    rs.rows[0].x = v0;  rs.rows[0].y = v1;  rs.rows[0].z = v2;  rs.rows[0].w = v3;
-    rs.rows[1].x = v4;  rs.rows[1].y = v5;  rs.rows[1].z = v6;  rs.rows[1].w = v7;
-    rs.rows[2].x = v8;  rs.rows[2].y = v9;  rs.rows[2].z = v10; rs.rows[2].w = v11;
-    rs.rows[3].x = v12; rs.rows[3].y = v13; rs.rows[3].z = v14; rs.rows[3].w = v15;
+    rs.rows[0].x = v0;
+    rs.rows[0].y = v1;
+    rs.rows[0].z = v2;
+    rs.rows[0].w = v3;
+    rs.rows[1].x = v4;
+    rs.rows[1].y = v5;
+    rs.rows[1].z = v6;
+    rs.rows[1].w = v7;
+    rs.rows[2].x = v8;
+    rs.rows[2].y = v9;
+    rs.rows[2].z = v10;
+    rs.rows[2].w = v11;
+    rs.rows[3].x = v12;
+    rs.rows[3].y = v13;
+    rs.rows[3].z = v14;
+    rs.rows[3].w = v15;
     return rs;
 }
 
-#define SLANG_MATRIX_BINARY_OP(T, op) \
-    template<int R, int C> \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal, const Matrix<T, R, C>& other) \
-    { \
-        Matrix<T, R, C> result;\
-        for (int i = 0; i < R; i++) \
-            for (int j = 0; j < C; j++) \
-                *_slang_vector_get_element_ptr(result.rows+i,j) = _slang_vector_get_element(thisVal.rows[i], j) op _slang_vector_get_element(other.rows[i], j); \
-        return result;\
+#define SLANG_MATRIX_BINARY_OP(T, op)                                   \
+    template<int R, int C>                                              \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(     \
+        const Matrix<T, R, C>& thisVal,                                 \
+        const Matrix<T, R, C>& other)                                   \
+    {                                                                   \
+        Matrix<T, R, C> result;                                         \
+        for (int i = 0; i < R; i++)                                     \
+            for (int j = 0; j < C; j++)                                 \
+                *_slang_vector_get_element_ptr(result.rows + i, j) =    \
+                    _slang_vector_get_element(thisVal.rows[i], j)       \
+                        op _slang_vector_get_element(other.rows[i], j); \
+        return result;                                                  \
     }
 
-#define SLANG_MATRIX_UNARY_OP(T, op) \
-    template<int R, int C> \
+#define SLANG_MATRIX_UNARY_OP(T, op)                                                               \
+    template<int R, int C>                                                                         \
     SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal) \
-    { \
-        Matrix<T, R, C> result;\
-        for (int i = 0; i < R; i++) \
-            for (int j = 0; j < C; j++) \
-                *_slang_vector_get_element_ptr(result.rows+i,j) = op _slang_vector_get_element(thisVal.rows[i], j); \
-        return result;\
-    }
-#define SLANG_INT_MATRIX_OPS(T) \
-    SLANG_MATRIX_BINARY_OP(T, +)\
-    SLANG_MATRIX_BINARY_OP(T, -)\
-    SLANG_MATRIX_BINARY_OP(T, *)\
-    SLANG_MATRIX_BINARY_OP(T, / )\
-    SLANG_MATRIX_BINARY_OP(T, &)\
-    SLANG_MATRIX_BINARY_OP(T, |)\
-    SLANG_MATRIX_BINARY_OP(T, &&)\
-    SLANG_MATRIX_BINARY_OP(T, ||)\
-    SLANG_MATRIX_BINARY_OP(T, ^)\
-    SLANG_MATRIX_BINARY_OP(T, %)\
-    SLANG_MATRIX_UNARY_OP(T, !)\
+    {                                                                                              \
+        Matrix<T, R, C> result;                                                                    \
+        for (int i = 0; i < R; i++)                                                                \
+            for (int j = 0; j < C; j++)                                                            \
+                *_slang_vector_get_element_ptr(result.rows + i, j) =                               \
+                    op _slang_vector_get_element(thisVal.rows[i], j);                              \
+        return result;                                                                             \
+    }
+#define SLANG_INT_MATRIX_OPS(T)   \
+    SLANG_MATRIX_BINARY_OP(T, +)  \
+    SLANG_MATRIX_BINARY_OP(T, -)  \
+    SLANG_MATRIX_BINARY_OP(T, *)  \
+    SLANG_MATRIX_BINARY_OP(T, /)  \
+    SLANG_MATRIX_BINARY_OP(T, &)  \
+    SLANG_MATRIX_BINARY_OP(T, |)  \
+    SLANG_MATRIX_BINARY_OP(T, &&) \
+    SLANG_MATRIX_BINARY_OP(T, ||) \
+    SLANG_MATRIX_BINARY_OP(T, ^)  \
+    SLANG_MATRIX_BINARY_OP(T, %)  \
+    SLANG_MATRIX_UNARY_OP(T, !)   \
     SLANG_MATRIX_UNARY_OP(T, ~)
 #define SLANG_FLOAT_MATRIX_OPS(T) \
-    SLANG_MATRIX_BINARY_OP(T, +)\
-    SLANG_MATRIX_BINARY_OP(T, -)\
-    SLANG_MATRIX_BINARY_OP(T, *)\
-    SLANG_MATRIX_BINARY_OP(T, /)\
+    SLANG_MATRIX_BINARY_OP(T, +)  \
+    SLANG_MATRIX_BINARY_OP(T, -)  \
+    SLANG_MATRIX_BINARY_OP(T, *)  \
+    SLANG_MATRIX_BINARY_OP(T, /)  \
     SLANG_MATRIX_UNARY_OP(T, -)
 SLANG_INT_MATRIX_OPS(int)
 SLANG_INT_MATRIX_OPS(uint)
@@ -711,48 +960,57 @@ SLANG_FLOAT_MATRIX_OPS(double)
 #if SLANG_CUDA_ENABLE_HALF
 SLANG_FLOAT_MATRIX_OPS(__half)
 #endif
-#define SLANG_MATRIX_INT_NEG_OP(T) \
-    template<int R, int C>\
+#define SLANG_MATRIX_INT_NEG_OP(T)                                                        \
+    template<int R, int C>                                                                \
     SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator-(Matrix<T, R, C> thisVal) \
-    { \
-        Matrix<T, R, C> result;\
-        for (int i = 0; i < R; i++) \
-            for (int j = 0; j < C; j++) \
-                *_slang_vector_get_element_ptr(result.rows+i,j) = 0 - _slang_vector_get_element(thisVal.rows[i], j); \
-        return result;\
-    }
-    SLANG_MATRIX_INT_NEG_OP(int)
-    SLANG_MATRIX_INT_NEG_OP(uint)
-    SLANG_MATRIX_INT_NEG_OP(short)
-    SLANG_MATRIX_INT_NEG_OP(ushort)
-    SLANG_MATRIX_INT_NEG_OP(char)
-    SLANG_MATRIX_INT_NEG_OP(uchar)
-    SLANG_MATRIX_INT_NEG_OP(longlong)
-    SLANG_MATRIX_INT_NEG_OP(ulonglong)
-
-#define SLANG_FLOAT_MATRIX_MOD(T)\
-    template<int R, int C> \
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator%(Matrix<T, R, C> left, Matrix<T, R, C> right) \
-    {\
-        Matrix<T, R, C> result;\
-        for (int i = 0; i < R; i++) \
-            for (int j = 0; j < C; j++) \
-                *_slang_vector_get_element_ptr(result.rows+i,j) = _slang_fmod(_slang_vector_get_element(left.rows[i], j), _slang_vector_get_element(right.rows[i], j)); \
-        return result;\
-    }
-
-    SLANG_FLOAT_MATRIX_MOD(float)
-    SLANG_FLOAT_MATRIX_MOD(double)
+    {                                                                                     \
+        Matrix<T, R, C> result;                                                           \
+        for (int i = 0; i < R; i++)                                                       \
+            for (int j = 0; j < C; j++)                                                   \
+                *_slang_vector_get_element_ptr(result.rows + i, j) =                      \
+                    0 - _slang_vector_get_element(thisVal.rows[i], j);                    \
+        return result;                                                                    \
+    }
+SLANG_MATRIX_INT_NEG_OP(int)
+SLANG_MATRIX_INT_NEG_OP(uint)
+SLANG_MATRIX_INT_NEG_OP(short)
+SLANG_MATRIX_INT_NEG_OP(ushort)
+SLANG_MATRIX_INT_NEG_OP(char)
+SLANG_MATRIX_INT_NEG_OP(uchar)
+SLANG_MATRIX_INT_NEG_OP(longlong)
+SLANG_MATRIX_INT_NEG_OP(ulonglong)
+
+#define SLANG_FLOAT_MATRIX_MOD(T)                                                 \
+    template<int R, int C>                                                        \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator%(                 \
+        Matrix<T, R, C> left,                                                     \
+        Matrix<T, R, C> right)                                                    \
+    {                                                                             \
+        Matrix<T, R, C> result;                                                   \
+        for (int i = 0; i < R; i++)                                               \
+            for (int j = 0; j < C; j++)                                           \
+                *_slang_vector_get_element_ptr(result.rows + i, j) = _slang_fmod( \
+                    _slang_vector_get_element(left.rows[i], j),                   \
+                    _slang_vector_get_element(right.rows[i], j));                 \
+        return result;                                                            \
+    }
+
+SLANG_FLOAT_MATRIX_MOD(float)
+SLANG_FLOAT_MATRIX_MOD(double)
 #if SLANG_CUDA_ENABLE_HALF
-    template<int R, int C> 
-    SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<__half, R, C> operator%(Matrix<__half, R, C> left, Matrix<__half, R, C> right)
-    {
-        Matrix<__half, R, C> result;
-        for (int i = 0; i < R; i++) 
-            for (int j = 0; j < C; j++) 
-                * _slang_vector_get_element_ptr(result.rows + i, j) = __float2half(_slang_fmod(__half2float(_slang_vector_get_element(left.rows[i], j)), __half2float(_slang_vector_get_element(right.rows[i], j))));
-        return result;
-    }
+template<int R, int C>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<__half, R, C> operator%(
+    Matrix<__half, R, C> left,
+    Matrix<__half, R, C> right)
+{
+    Matrix<__half, R, C> result;
+    for (int i = 0; i < R; i++)
+        for (int j = 0; j < C; j++)
+            *_slang_vector_get_element_ptr(result.rows + i, j) = __float2half(_slang_fmod(
+                __half2float(_slang_vector_get_element(left.rows[i], j)),
+                __half2float(_slang_vector_get_element(right.rows[i], j))));
+    return result;
+}
 #endif
 #undef SLANG_FLOAT_MATRIX_MOD
 #undef SLANG_MATRIX_BINARY_OP
@@ -762,19 +1020,24 @@ SLANG_FLOAT_MATRIX_OPS(__half)
 #undef SLANG_MATRIX_INT_NEG_OP
 #undef SLANG_FLOAT_MATRIX_MOD
 
-#define SLANG_SELECT_IMPL(T, N)\
-SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, N> _slang_select(bool##N condition, Vector<T, N> v0, Vector<T, N> v1) \
-{ \
-    Vector<T, N> result; \
-    for (int i = 0; i < N; i++) \
-    { \
-        *_slang_vector_get_element_ptr(&result, i) = _slang_vector_get_element(condition, i) ? _slang_vector_get_element(v0, i) : _slang_vector_get_element(v1, i); \
-    } \
-    return result; \
-}
-#define SLANG_SELECT_T(T)\
-    SLANG_SELECT_IMPL(T, 2)\
-    SLANG_SELECT_IMPL(T, 3)\
+#define SLANG_SELECT_IMPL(T, N)                                                                  \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, N> _slang_select(                               \
+        bool##N condition,                                                                       \
+        Vector<T, N> v0,                                                                         \
+        Vector<T, N> v1)                                                                         \
+    {                                                                                            \
+        Vector<T, N> result;                                                                     \
+        for (int i = 0; i < N; i++)                                                              \
+        {                                                                                        \
+            *_slang_vector_get_element_ptr(&result, i) = _slang_vector_get_element(condition, i) \
+                                                             ? _slang_vector_get_element(v0, i)  \
+                                                             : _slang_vector_get_element(v1, i); \
+        }                                                                                        \
+        return result;                                                                           \
+    }
+#define SLANG_SELECT_T(T)   \
+    SLANG_SELECT_IMPL(T, 2) \
+    SLANG_SELECT_IMPL(T, 3) \
     SLANG_SELECT_IMPL(T, 4)
 
 SLANG_SELECT_T(int)
@@ -794,53 +1057,103 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_select(bool condition, T v0, T v1)
 
 //
 // Half support
-// 
+//
 
 #if SLANG_CUDA_ENABLE_HALF
 SLANG_SELECT_T(__half)
 
 // Convenience functions ushort -> half
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __ushort_as_half(const ushort2& i) { return __halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y)); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __ushort_as_half(const ushort3& i) { return __half3{__ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z)}; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __ushort_as_half(const ushort4& i) { return __half4{ __ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z), __ushort_as_half(i.w) }; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __ushort_as_half(const ushort2& i)
+{
+    return __halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y));
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __ushort_as_half(const ushort3& i)
+{
+    return __half3{__ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z)};
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __ushort_as_half(const ushort4& i)
+{
+    return __half4{
+        __ushort_as_half(i.x),
+        __ushort_as_half(i.y),
+        __ushort_as_half(i.z),
+        __ushort_as_half(i.w)};
+}
 
 // Convenience functions half -> ushort
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort2 __half_as_ushort(const __half2& i) { return make_ushort2(__half_as_ushort(i.x), __half_as_ushort(i.y)); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort3 __half_as_ushort(const __half3& i) { return make_ushort3(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z)); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort4 __half_as_ushort(const __half4& i) { return make_ushort4(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z), __half_as_ushort(i.w)); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort2 __half_as_ushort(const __half2& i)
+{
+    return make_ushort2(__half_as_ushort(i.x), __half_as_ushort(i.y));
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort3 __half_as_ushort(const __half3& i)
+{
+    return make_ushort3(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z));
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort4 __half_as_ushort(const __half4& i)
+{
+    return make_ushort4(
+        __half_as_ushort(i.x),
+        __half_as_ushort(i.y),
+        __half_as_ushort(i.z),
+        __half_as_ushort(i.w));
+}
 
-// This is a little bit of a hack. Fortunately CUDA has the definitions of the templated types in 
+// This is a little bit of a hack. Fortunately CUDA has the definitions of the templated types in
 // include/surface_indirect_functions.h
-// Here we find the template definition requires a specialization of __nv_isurf_trait to allow 
-// a specialization of the surface write functions. 
-// This *isn't* a problem on the read functions as they don't have a return type that uses this mechanism 
+// Here we find the template definition requires a specialization of __nv_isurf_trait to allow
+// a specialization of the surface write functions.
+// This *isn't* a problem on the read functions as they don't have a return type that uses this
+// mechanism
 
-template<> struct __nv_isurf_trait<__half> { typedef void type; };
-template<> struct __nv_isurf_trait<__half2> { typedef void type; };
-template<> struct __nv_isurf_trait<__half4> { typedef void type; };
+template<>
+struct __nv_isurf_trait<__half>
+{
+    typedef void type;
+};
+template<>
+struct __nv_isurf_trait<__half2>
+{
+    typedef void type;
+};
+template<>
+struct __nv_isurf_trait<__half4>
+{
+    typedef void type;
+};
 
 #define SLANG_DROP_PARENS(...) __VA_ARGS__
 
-#define SLANG_SURFACE_READ(FUNC_NAME, TYPE_ARGS, ARGS) \
-template <> \
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half FUNC_NAME<__half>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
-{ \
-    return __ushort_as_half(FUNC_NAME<ushort>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
-} \
-\
-template <> \
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 FUNC_NAME<__half2>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
-{ \
-    return __ushort_as_half(FUNC_NAME<ushort2>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
-} \
-\
-template <> \
-SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 FUNC_NAME<__half4>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
-{ \
-    return __ushort_as_half(FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
-}
+#define SLANG_SURFACE_READ(FUNC_NAME, TYPE_ARGS, ARGS)                                             \
+    template<>                                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL __half FUNC_NAME<__half>(                                   \
+        cudaSurfaceObject_t surfObj,                                                               \
+        SLANG_DROP_PARENS TYPE_ARGS,                                                               \
+        cudaSurfaceBoundaryMode boundaryMode)                                                      \
+    {                                                                                              \
+        return __ushort_as_half(FUNC_NAME<ushort>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
+    }                                                                                              \
+                                                                                                   \
+    template<>                                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 FUNC_NAME<__half2>(                                 \
+        cudaSurfaceObject_t surfObj,                                                               \
+        SLANG_DROP_PARENS TYPE_ARGS,                                                               \
+        cudaSurfaceBoundaryMode boundaryMode)                                                      \
+    {                                                                                              \
+        return __ushort_as_half(                                                                   \
+            FUNC_NAME<ushort2>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode));                    \
+    }                                                                                              \
+                                                                                                   \
+    template<>                                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 FUNC_NAME<__half4>(                                 \
+        cudaSurfaceObject_t surfObj,                                                               \
+        SLANG_DROP_PARENS TYPE_ARGS,                                                               \
+        cudaSurfaceBoundaryMode boundaryMode)                                                      \
+    {                                                                                              \
+        return __ushort_as_half(                                                                   \
+            FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode));                    \
+    }
 
 SLANG_SURFACE_READ(surf1Dread, (int x), (x))
 SLANG_SURFACE_READ(surf2Dread, (int x, int y), (x, y))
@@ -850,24 +1163,36 @@ SLANG_SURFACE_READ(surf2DLayeredread, (int x, int y, int layer), (x, y, layer))
 SLANG_SURFACE_READ(surfCubemapread, (int x, int y, int face), (x, y, face))
 SLANG_SURFACE_READ(surfCubemapLayeredread, (int x, int y, int layerFace), (x, y, layerFace))
 
-#define SLANG_SURFACE_WRITE(FUNC_NAME, TYPE_ARGS, ARGS) \
-template <> \
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half>(__half data, cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
-{ \
-    FUNC_NAME<ushort>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode);  \
-} \
-\
-template <> \
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half2>(__half2 data, cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
-{ \
-    FUNC_NAME<ushort2>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode);  \
-} \
-\
-template <> \
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half4>(__half4 data, cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
-{ \
-    FUNC_NAME<ushort4>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \
-}
+#define SLANG_SURFACE_WRITE(FUNC_NAME, TYPE_ARGS, ARGS) /* FUNC_NAME for half types */             \
+    template<> /* __half: forward __half_as_ushort(data) to FUNC_NAME<ushort> */                  \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half>(                                     \
+        __half data,                                                                               \
+        cudaSurfaceObject_t surfObj,                                                               \
+        SLANG_DROP_PARENS TYPE_ARGS,                                                               \
+        cudaSurfaceBoundaryMode boundaryMode)                                                      \
+    {                                                                                              \
+        FUNC_NAME<ushort>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode);  \
+    }                                                                                              \
+    /* NOTE(review): below, __half_as_ushort receives a __half2 but feeds ushort2 -- confirm */    \
+    template<>                                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half2>(                                    \
+        __half2 data,                                                                              \
+        cudaSurfaceObject_t surfObj,                                                               \
+        SLANG_DROP_PARENS TYPE_ARGS,                                                               \
+        cudaSurfaceBoundaryMode boundaryMode)                                                      \
+    {                                                                                              \
+        FUNC_NAME<ushort2>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \
+    }                                                                                              \
+    /* NOTE(review): likewise a __half4 is passed to __half_as_ushort for ushort4 -- confirm */    \
+    template<>                                                                                     \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half4>(                                    \
+        __half4 data,                                                                              \
+        cudaSurfaceObject_t surfObj,                                                               \
+        SLANG_DROP_PARENS TYPE_ARGS,                                                               \
+        cudaSurfaceBoundaryMode boundaryMode)                                                      \
+    {                                                                                              \
+        FUNC_NAME<ushort4>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \
+    }
 
 SLANG_SURFACE_WRITE(surf1Dwrite, (int x), (x))
 SLANG_SURFACE_WRITE(surf2Dwrite, (int x, int y), (x, y))
@@ -878,38 +1203,54 @@ SLANG_SURFACE_WRITE(surfCubemapwrite, (int x, int y, int face), (x, y, face))
 SLANG_SURFACE_WRITE(surfCubemapLayeredwrite, (int x, int y, int layerFace), (x, y, layerFace))
 
 // ! Hack to test out reading !!!
-// Only works converting *from* half 
- 
-//template <typename T> 
-//SLANG_FORCE_INLINE SLANG_CUDA_CALL T surf2Dread_convert(cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode);
-
-#define SLANG_SURFACE_READ_HALF_CONVERT(FUNC_NAME, TYPE_ARGS, ARGS) \
-\
-template <typename T>  \
-SLANG_FORCE_INLINE SLANG_CUDA_CALL T FUNC_NAME##_convert(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode); \
-\
-template <> \
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float FUNC_NAME##_convert<float>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode)  \
-{ \
-    return __ushort_as_half(FUNC_NAME<uint16_t>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
-} \
-\
-template <> \
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 FUNC_NAME##_convert<float2>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
-{ \
-    const __half2 v = __ushort_as_half(FUNC_NAME<ushort2>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
-    return float2{v.x, v.y}; \
-} \
-\
-template <> \
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 FUNC_NAME##_convert<float4>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
-{ \
-    const __half4 v = __ushort_as_half(FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
-    return float4{v.x, v.y, v.z, v.w}; \
-}
-
-SLANG_SURFACE_READ_HALF_CONVERT(surf1Dread, (int x), (x)) 
-SLANG_SURFACE_READ_HALF_CONVERT(surf2Dread, (int x, int y), (x, y)) 
+// Only works converting *from* half
+
+// template <typename T>
+// SLANG_FORCE_INLINE SLANG_CUDA_CALL T surf2Dread_convert(
+//     cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode);
+
+#define SLANG_SURFACE_READ_HALF_CONVERT(FUNC_NAME, TYPE_ARGS, ARGS) /* adds FUNC_NAME_convert */ \
+    /* Primary template is declared only; float/float2/float4 are specialized below. */          \
+    template<typename T>                                                                         \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL T FUNC_NAME##_convert(                                    \
+        cudaSurfaceObject_t surfObj,                                                             \
+        SLANG_DROP_PARENS TYPE_ARGS,                                                             \
+        cudaSurfaceBoundaryMode boundaryMode);                                                   \
+    /* float: read the value as uint16_t and pass it through __ushort_as_half. */                \
+    template<>                                                                                   \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL float FUNC_NAME##_convert<float>(                         \
+        cudaSurfaceObject_t surfObj,                                                             \
+        SLANG_DROP_PARENS TYPE_ARGS,                                                             \
+        cudaSurfaceBoundaryMode boundaryMode)                                                    \
+    {                                                                                            \
+        return __ushort_as_half(                                                                 \
+            FUNC_NAME<uint16_t>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode));                 \
+    }                                                                                            \
+    /* NOTE(review): __ushort_as_half is handed a ushort2 below -- confirm it compiles. */       \
+    template<>                                                                                   \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 FUNC_NAME##_convert<float2>(                       \
+        cudaSurfaceObject_t surfObj,                                                             \
+        SLANG_DROP_PARENS TYPE_ARGS,                                                             \
+        cudaSurfaceBoundaryMode boundaryMode)                                                    \
+    {                                                                                            \
+        const __half2 v =                                                                        \
+            __ushort_as_half(FUNC_NAME<ushort2>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
+        return float2{v.x, v.y};                                                                 \
+    }                                                                                            \
+    /* NOTE(review): same for ushort4; also relies on __half4 having x/y/z/w -- confirm. */      \
+    template<>                                                                                   \
+    SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 FUNC_NAME##_convert<float4>(                       \
+        cudaSurfaceObject_t surfObj,                                                             \
+        SLANG_DROP_PARENS TYPE_ARGS,                                                             \
+        cudaSurfaceBoundaryMode boundaryMode)                                                    \
+    {                                                                                            \
+        const __half4 v =                                                                        \
+            __ushort_as_half(FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
+        return float4{v.x, v.y, v.z, v.w};                                                       \
+    }
+
+SLANG_SURFACE_READ_HALF_CONVERT(surf1Dread, (int x), (x))
+SLANG_SURFACE_READ_HALF_CONVERT(surf2Dread, (int x, int y), (x, y))
 SLANG_SURFACE_READ_HALF_CONVERT(surf3Dread, (int x, int y, int z), (x, y, z))
 
 #endif
@@ -917,178 +1258,506 @@ SLANG_SURFACE_READ_HALF_CONVERT(surf3Dread, (int x, int y, int z), (x, y, z))
 // Support for doing format conversion when writing to a surface/RWTexture
 
 // NOTE! For normal surface access x values are *byte* addressed.
-// For the _convert versions they are *not*. They don't need to be because sust.p does not require it.
+// For the _convert versions they are *not*. They don't need to be because sust.p does not require
+// it.
 
-template <typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode);
-template <typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode);
-template <typename T>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode);
+template<typename T> // primary template: declared only here, specialized per value type below
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(
+    T, // value to write (left unnamed in the declaration)
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    cudaSurfaceBoundaryMode boundaryMode);
+template<typename T> // 2D counterpart of surf1Dwrite_convert; also declaration only
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(
+    T, // value to write
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    int y,
+    cudaSurfaceBoundaryMode boundaryMode);
+template<typename T> // 3D counterpart of surf1Dwrite_convert; also declaration only
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert(
+    T, // value to write
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    int y,
+    int z,
+    cudaSurfaceBoundaryMode boundaryMode);
 
 // https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust
 
 // Float
 
-template <>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float>(float v, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode)
+template<> // float specialization: 1D store issued with the PTX sust.p instruction
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float>(
+    float v,
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    cudaSurfaceBoundaryMode boundaryMode) // not read in the body; SLANG_PTX_BOUNDARY_MODE is used
 {
-    asm volatile ( "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};}\n\t" :: "l"(surfObj),"r"(x),"f"(v));     
+    asm volatile(
+        "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};}\n\t" ::"l"(surfObj),
+        "r"(x), // %1: coordinate
+        "f"(v)); // %2: value to store
 }
- 
-template <>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float>(float v, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode)
+
+template<> // float specialization: 2D store issued with the PTX sust.p instruction
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float>(
+    float v,
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    int y,
+    cudaSurfaceBoundaryMode boundaryMode) // not read in the body; SLANG_PTX_BOUNDARY_MODE is used
 {
-    asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"f"(v));
+    asm volatile(
+        "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" ::"l"(surfObj),
+        "r"(x), // %1: first coordinate
+        "r"(y), // %2: second coordinate
+        "f"(v)); // %3: value to store
 }
 
-template <>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float>(float v, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode)
-{
-    asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3}], {%4};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"r"(z),"f"(v));
+template<> // float specialization: 3D store issued with the PTX sust.p instruction
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float>(
+    float v,
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    int y,
+    int z,
+    cudaSurfaceBoundaryMode boundaryMode) // not read in the body; SLANG_PTX_BOUNDARY_MODE is used
+{
+    asm volatile( // .3d geometry takes a four-element coordinate; the last slot is ignored
+        "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%3}], {%4};}\n\t" ::"l"(surfObj),
+        "r"(x), // %1
+        "r"(y), // %2
+        "r"(z), // %3: also reused as the ignored fourth coordinate
+        "f"(v)); // %4: value to store
 }
 
 // Float2
 
-template <>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float2>(float2 v, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode)
+template<> // float2 specialization: 1D two-component store issued with PTX sust.p
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float2>(
+    float2 v,
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    cudaSurfaceBoundaryMode boundaryMode) // not read in the body; SLANG_PTX_BOUNDARY_MODE is used
 {
     const float vx = v.x, vy = v.y;
-    asm volatile ( "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3};}\n\t" :: "l"(surfObj),"r"(x),"f"(vx),"f"(vy));     
+    asm volatile( // .v2 is needed so the instruction accepts the two-element data operand
+        "{sust.p.1d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3};}\n\t" ::"l"(surfObj),
+        "r"(x), // %1: coordinate
+        "f"(vx), // %2
+        "f"(vy)); // %3
 }
- 
-template <>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float2>(float2 v, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode)
+
+template<> // float2 specialization: 2D two-component store issued with PTX sust.p
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float2>(
+    float2 v,
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    int y,
+    cudaSurfaceBoundaryMode boundaryMode) // not read in the body; SLANG_PTX_BOUNDARY_MODE is used
 {
     const float vx = v.x, vy = v.y;
-    asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"f"(vx),"f"(vy));
+    asm volatile( // .v2 is needed so the instruction accepts the two-element data operand
+        "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" ::"l"(surfObj),
+        "r"(x), // %1
+        "r"(y), // %2
+        "f"(vx), // %3
+        "f"(vy)); // %4
 }
 
-template <>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float2>(float2 v, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode)
+template<> // float2 specialization: 3D two-component store issued with PTX sust.p
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float2>(
+    float2 v,
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    int y,
+    int z,
+    cudaSurfaceBoundaryMode boundaryMode) // not read in the body; SLANG_PTX_BOUNDARY_MODE is used
 {
     const float vx = v.x, vy = v.y;
-    asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3}], {%4,%5};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"r"(z),"f"(vx),"f"(vy));
+    asm volatile( // .3d wants a four-element coordinate (last ignored); .v2 matches the two values
+        "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%3}], {%4,%5};}\n\t" ::"l"(surfObj),
+        "r"(x), // %1
+        "r"(y), // %2
+        "r"(z), // %3: also reused as the ignored fourth coordinate
+        "f"(vx), // %4
+        "f"(vy)); // %5
 }
 
 // Float4
-template <>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float4>(float4 v, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode)
+template<> // float4 specialization: 1D four-component store issued with PTX sust.p
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float4>(
+    float4 v,
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    cudaSurfaceBoundaryMode boundaryMode) // not read in the body; SLANG_PTX_BOUNDARY_MODE is used
 {
     const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile ( "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3,%4,%5};}\n\t" :: "l"(surfObj),"r"(x),"f"(vx),"f"(vy),"f"(vz),"f"(vw));     
+    asm volatile( // .v4 is needed so the instruction accepts the four-element data operand
+        "{sust.p.1d.v4.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3,%4,%5};}\n\t" ::"l"(surfObj),
+        "r"(x), // %1: coordinate
+        "f"(vx), // %2
+        "f"(vy), // %3
+        "f"(vz), // %4
+        "f"(vw)); // %5
 }
- 
-template <>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float4>(float4 v, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode)
+
+template<> // float4 specialization: 2D four-component store issued with PTX sust.p
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float4>(
+    float4 v,
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    int y,
+    cudaSurfaceBoundaryMode boundaryMode) // not read in the body; SLANG_PTX_BOUNDARY_MODE is used
 {
     const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"f"(vx),"f"(vy),"f"(vz),"f"(vw));
+    asm volatile( // .v4 is needed so the instruction accepts the four-element data operand
+        "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE
+        " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" ::"l"(surfObj),
+        "r"(x), // %1
+        "r"(y), // %2
+        "f"(vx), // %3
+        "f"(vy), // %4
+        "f"(vz), // %5
+        "f"(vw)); // %6
 }
 
-template <>
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float4>(float4 v, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode)
+template<> // float4 specialization: 3D four-component store issued with PTX sust.p
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float4>(
+    float4 v,
+    cudaSurfaceObject_t surfObj,
+    int x, // not byte addressed for the _convert variants
+    int y,
+    int z,
+    cudaSurfaceBoundaryMode boundaryMode) // not read in the body; SLANG_PTX_BOUNDARY_MODE is used
 {
     const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
-    asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3}], {%4,%5,%6,%7};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"r"(z),"f"(vx),"f"(vy),"f"(vz),"f"(vw));
+    asm volatile( // .3d wants a four-element coordinate (last ignored); .v4 matches the four values
+        "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE
+        " [%0, {%1,%2,%3,%3}], {%4,%5,%6,%7};}\n\t" ::"l"(surfObj),
+        "r"(x), // %1
+        "r"(y), // %2
+        "r"(z), // %3: also reused as the ignored fourth coordinate
+        "f"(vx), // %4
+        "f"(vy), // %5
+        "f"(vz), // %6
+        "f"(vw)); // %7
 }
 
 // ----------------------------- F32 -----------------------------------------
 
-// Unary 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_ceil(float f) { return ::ceilf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_floor(float f) { return ::floorf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_round(float f) { return ::roundf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sin(float f) { return ::sinf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_cos(float f) { return ::cosf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void F32_sincos(float f, float* s, float* c) { ::sincosf(f, s, c); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_tan(float f) { return ::tanf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_asin(float f) { return ::asinf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_acos(float f) { return ::acosf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_atan(float f) { return ::atanf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sinh(float f) { return ::sinhf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_cosh(float f) { return ::coshf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_tanh(float f) { return ::tanhf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log2(float f) { return ::log2f(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log(float f) { return ::logf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log10(float f) { return ::log10f(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_exp2(float f) { return ::exp2f(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_exp(float f) { return ::expf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_abs(float f) { return ::fabsf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_trunc(float f) { return ::truncf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sqrt(float f) { return ::sqrtf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_rsqrt(float f) { return ::rsqrtf(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sign(float f) { return ( f == 0.0f) ? f : (( f < 0.0f) ? -1.0f : 1.0f); } 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_frac(float f) { return f - F32_floor(f); }
-
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isnan(float f) { return isnan(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isfinite(float f) { return isfinite(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isinf(float f) { return isinf(f); }
+// Unary
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_ceil(float f) // F32 unary helpers: thin forwarders
+{
+    return ::ceilf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_floor(float f)
+{
+    return ::floorf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_round(float f)
+{
+    return ::roundf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sin(float f)
+{
+    return ::sinf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_cos(float f)
+{
+    return ::cosf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void F32_sincos(float f, float* s, float* c) // outputs via s, c
+{
+    ::sincosf(f, s, c);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_tan(float f)
+{
+    return ::tanf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_asin(float f)
+{
+    return ::asinf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_acos(float f)
+{
+    return ::acosf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_atan(float f)
+{
+    return ::atanf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sinh(float f)
+{
+    return ::sinhf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_cosh(float f)
+{
+    return ::coshf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_tanh(float f)
+{
+    return ::tanhf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log2(float f)
+{
+    return ::log2f(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log(float f)
+{
+    return ::logf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log10(float f)
+{
+    return ::log10f(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_exp2(float f)
+{
+    return ::exp2f(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_exp(float f)
+{
+    return ::expf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_abs(float f)
+{
+    return ::fabsf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_trunc(float f)
+{
+    return ::truncf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sqrt(float f)
+{
+    return ::sqrtf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_rsqrt(float f)
+{
+    return ::rsqrtf(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sign(float f) // -1, 1, or f itself when f == 0
+{
+    return (f == 0.0f) ? f : ((f < 0.0f) ? -1.0f : 1.0f); // NaN fails both tests, giving 1.0f
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_frac(float f) // f minus its floor
+{
+    return f - F32_floor(f);
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isnan(float f) // classification tests
+{
+    return isnan(f); // unqualified call, unlike the ::-qualified math calls above
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isfinite(float f)
+{
+    return isfinite(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isinf(float f)
+{
+    return isinf(f);
+}
 
 // Binary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_min(float a, float b) { return ::fminf(a, b); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_max(float a, float b) { return ::fmaxf(a, b); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_pow(float a, float b) { return ::powf(a, b); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_fmod(float a, float b) { return ::fmodf(a, b); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_remainder(float a, float b) { return ::remainderf(a, b); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_atan2(float a, float b) { return float(::atan2(a, b)); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_min(float a, float b) // F32 binary forwarders
+{
+    return ::fminf(a, b);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_max(float a, float b)
+{
+    return ::fmaxf(a, b);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_pow(float a, float b)
+{
+    return ::powf(a, b);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_fmod(float a, float b)
+{
+    return ::fmodf(a, b);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_remainder(float a, float b)
+{
+    return ::remainderf(a, b);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_atan2(float a, float b) // forwards to ::atan2f(a, b)
+{
+    return ::atan2f(a, b); // float routine like the other F32 helpers; was ::atan2 plus a cast
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_frexp(float x, int* e) { return frexpf(x, e); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_frexp(float x, int* e) // second result goes to *e
+{
+    return frexpf(x, e); // unqualified call, whereas F64_frexp calls ::frexp
+}
 
 SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_modf(float x, float* ip)
 {
     return ::modff(x, ip);
 }
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t F32_asuint(float f) { Union32 u; u.f = f; return u.u; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t F32_asint(float f) { Union32 u; u.f = f; return u.i; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t F32_asuint(float f) // float viewed through Union32::u
+{
+    Union32 u;
+    u.f = f; // store through the float member...
+    return u.u; // ...and read the same storage back through the u member
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t F32_asint(float f) // float viewed through Union32::i
+{
+    Union32 u;
+    u.f = f; // store through the float member...
+    return u.i; // ...and read the same storage back through the i member
+}
 
 // Ternary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_fma(float a, float b, float c) { return ::fmaf(a, b, c); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_fma(float a, float b, float c) // forwards to ::fmaf
+{
+    return ::fmaf(a, b, c);
+}
 
 
 // ----------------------------- F64 -----------------------------------------
 
-// Unary 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_ceil(double f) { return ::ceil(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_floor(double f) { return ::floor(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_round(double f) { return ::round(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sin(double f) { return ::sin(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_cos(double f) { return ::cos(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL void F64_sincos(double f, double* s, double* c) { ::sincos(f, s, c); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_tan(double f) { return ::tan(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_asin(double f) { return ::asin(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_acos(double f) { return ::acos(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_atan(double f) { return ::atan(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sinh(double f) { return ::sinh(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_cosh(double f) { return ::cosh(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_tanh(double f) { return ::tanh(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log2(double f) { return ::log2(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log(double f) { return ::log(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log10(float f) { return ::log10(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_exp2(double f) { return ::exp2(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_exp(double f) { return ::exp(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_abs(double f) { return ::fabs(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_trunc(double f) { return ::trunc(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sqrt(double f) { return ::sqrt(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_rsqrt(double f) { return ::rsqrt(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sign(double f) { return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_frac(double f) { return f - F64_floor(f); }
-
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isnan(double f) { return isnan(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isfinite(double f) { return isfinite(f); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isinf(double f) { return isinf(f); }
+// Unary
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_ceil(double f) // F64 unary helpers: thin forwarders
+{
+    return ::ceil(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_floor(double f)
+{
+    return ::floor(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_round(double f)
+{
+    return ::round(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sin(double f)
+{
+    return ::sin(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_cos(double f)
+{
+    return ::cos(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL void F64_sincos(double f, double* s, double* c) // outputs: s, c
+{
+    ::sincos(f, s, c);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_tan(double f)
+{
+    return ::tan(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_asin(double f)
+{
+    return ::asin(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_acos(double f)
+{
+    return ::acos(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_atan(double f)
+{
+    return ::atan(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sinh(double f)
+{
+    return ::sinh(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_cosh(double f)
+{
+    return ::cosh(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_tanh(double f)
+{
+    return ::tanh(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log2(double f)
+{
+    return ::log2(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log(double f)
+{
+    return ::log(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log10(double f) // was float: narrowed the argument
+{
+    return ::log10(f); // forwards to the double routine, like the other F64 helpers
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_exp2(double f) // more F64 unary forwarders
+{
+    return ::exp2(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_exp(double f)
+{
+    return ::exp(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_abs(double f)
+{
+    return ::fabs(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_trunc(double f)
+{
+    return ::trunc(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sqrt(double f)
+{
+    return ::sqrt(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_rsqrt(double f)
+{
+    return ::rsqrt(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sign(double f) // -1, 1, or f itself when f == 0
+{
+    return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); // NaN fails both tests, giving 1.0
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_frac(double f) // f minus its floor
+{
+    return f - F64_floor(f);
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isnan(double f) // classification tests
+{
+    return isnan(f); // unqualified call, unlike the ::-qualified math calls above
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isfinite(double f)
+{
+    return isfinite(f);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isinf(double f)
+{
+    return isinf(f);
+}
 
 // Binary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_min(double a, double b) { return ::fmin(a, b); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_max(double a, double b) { return ::fmax(a, b); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_pow(double a, double b) { return ::pow(a, b); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fmod(double a, double b) { return ::fmod(a, b); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_remainder(double a, double b) { return ::remainder(a, b); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_atan2(double a, double b) { return ::atan2(a, b); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_min(double a, double b) // F64 binary forwarders
+{
+    return ::fmin(a, b);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_max(double a, double b)
+{
+    return ::fmax(a, b);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_pow(double a, double b)
+{
+    return ::pow(a, b);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fmod(double a, double b)
+{
+    return ::fmod(a, b);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_remainder(double a, double b)
+{
+    return ::remainder(a, b);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_atan2(double a, double b)
+{
+    return ::atan2(a, b);
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_frexp(double x, int* e) { return ::frexp(x, e); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_frexp(double x, int* e) // second result goes to *e
+{
+    return ::frexp(x, e);
+}
 
 SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_modf(double x, double* ip)
 {
@@ -1112,20 +1781,40 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL void F64_asint(double d, int32_t* low, int32_
 }
 
 // Ternary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fma(double a, double b, double c) { return ::fma(a, b, c); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fma(double a, double b, double c)
+{
+    return ::fma(a, b, c);
+}
 
 // ----------------------------- I32 -----------------------------------------
 
 // Unary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_abs(int32_t f) { return (f < 0) ? -f : f; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_abs(int32_t f)
+{ // Negation is done in uint32_t so INT32_MIN wraps to itself instead of overflowing (UB).
+    return (f < 0) ? int32_t(0u - uint32_t(f)) : f;
+}
 
 // Binary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b)
+{
+    return a > b ? a : b;
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float I32_asfloat(int32_t x) { Union32 u; u.i = x; return u.f; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x) { return uint32_t(x); }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi )
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float I32_asfloat(int32_t x)
+{
+    Union32 u;
+    u.i = x;
+    return u.f;
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x)
+{
+    return uint32_t(x);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi)
 {
     Union64 u;
     u.u = (uint64_t(hi) << 32) | uint32_t(low);
@@ -1134,15 +1823,32 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi )
 
 // ----------------------------- U32 -----------------------------------------
 
-// Unary 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_abs(uint32_t f) { return f; }
+// Unary
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_abs(uint32_t f)
+{
+    return f;
+}
 
 // Binary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_min(uint32_t a, uint32_t b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_max(uint32_t a, uint32_t b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_min(uint32_t a, uint32_t b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_max(uint32_t a, uint32_t b)
+{
+    return a > b ? a : b;
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float U32_asfloat(uint32_t x) { Union32 u; u.u = x; return u.f; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_asint(int32_t x) { return uint32_t(x); }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float U32_asfloat(uint32_t x)
+{
+    Union32 u;
+    u.u = x;
+    return u.f;
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_asint(int32_t x)
+{ // NOTE(review): despite the name, this takes int32_t and returns uint32_t - confirm intent.
+    return uint32_t(x);
+}
 
 SLANG_FORCE_INLINE SLANG_CUDA_CALL double U32_asdouble(uint32_t low, uint32_t hi)
 {
@@ -1160,17 +1866,35 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_countbits(uint32_t v)
 
 // ----------------------------- I64 -----------------------------------------
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_abs(int64_t f) { return (f < 0) ? -f : f; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_abs(int64_t f)
+{ // Negation is done in uint64_t so INT64_MIN wraps to itself instead of overflowing (UB).
+    return (f < 0) ? int64_t(uint64_t(0) - uint64_t(f)) : f;
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b)
+{
+    return a > b ? a : b;
+}
 
 // ----------------------------- U64 -----------------------------------------
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_abs(uint64_t f) { return f; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_abs(uint64_t f)
+{ // NOTE(review): f is returned through int64_t, so inputs above INT64_MAX come back negative.
+    return f;
+}
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_min(uint64_t a, uint64_t b) { return a < b ? a : b; }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_max(uint64_t a, uint64_t b) { return a > b ? a : b; }
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_min(uint64_t a, uint64_t b)
+{ // Compared as unsigned; NOTE(review): the result is then converted to int64_t - confirm intent.
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_max(uint64_t a, uint64_t b)
+{ // Compared as unsigned; NOTE(review): the result is then converted to int64_t - confirm intent.
+    return a > b ? a : b;
+}
 
 SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v)
 {
@@ -1185,7 +1909,7 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v)
 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-structuredbuffer-getdimensions
 // Missing  Load(_In_  int  Location, _Out_ uint Status);
 
-template <typename T>
+template<typename T>
 struct StructuredBuffer
 {
     SLANG_CUDA_CALL const T& operator[](size_t index) const
@@ -1205,7 +1929,11 @@ struct StructuredBuffer
     }
 
 #ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
-    SLANG_CUDA_CALL void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); }
+    SLANG_CUDA_CALL void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride)
+    {
+        *outNumStructs = uint32_t(count);
+        *outStride = uint32_t(sizeof(T));
+    }
 #endif
 
     T* data;
@@ -1214,7 +1942,7 @@ struct StructuredBuffer
 #endif
 };
 
-template <typename T>
+template<typename T>
 struct RWStructuredBuffer : StructuredBuffer<T>
 {
     SLANG_CUDA_CALL T& operator[](size_t index) const
@@ -1230,28 +1958,28 @@ struct RWStructuredBuffer : StructuredBuffer<T>
 struct ByteAddressBuffer
 {
     SLANG_CUDA_CALL void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); }
-    SLANG_CUDA_CALL uint32_t Load(size_t index) const 
-    { 
+    SLANG_CUDA_CALL uint32_t Load(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
-        return data[index >> 2]; 
+        return data[index >> 2];
     }
-    SLANG_CUDA_CALL uint2 Load2(size_t index) const 
-    { 
-        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); 
-        const size_t dataIdx = index >> 2; 
-        return uint2{data[dataIdx], data[dataIdx + 1]}; 
+    SLANG_CUDA_CALL uint2 Load2(size_t index) const
+    {
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
+        const size_t dataIdx = index >> 2;
+        return uint2{data[dataIdx], data[dataIdx + 1]};
     }
-    SLANG_CUDA_CALL uint3 Load3(size_t index) const 
-    { 
+    SLANG_CUDA_CALL uint3 Load3(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; 
+        const size_t dataIdx = index >> 2;
+        return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]};
     }
-    SLANG_CUDA_CALL uint4 Load4(size_t index) const 
-    { 
+    SLANG_CUDA_CALL uint4 Load4(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; 
+        const size_t dataIdx = index >> 2;
+        return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]};
     }
     template<typename T>
     SLANG_CUDA_CALL T Load(size_t index) const
@@ -1270,38 +1998,38 @@ struct ByteAddressBuffer
         return rs;
     }
     const uint32_t* data;
-    size_t sizeInBytes;  //< Must be multiple of 4
+    size_t sizeInBytes; //< Must be multiple of 4
 };
 
 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-rwbyteaddressbuffer
-// Missing support for Atomic operations 
+// Missing support for Atomic operations
 // Missing support for Load with status
 struct RWByteAddressBuffer
 {
     SLANG_CUDA_CALL void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); }
-    
-    SLANG_CUDA_CALL uint32_t Load(size_t index) const 
-    { 
+
+    SLANG_CUDA_CALL uint32_t Load(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
-        return data[index >> 2]; 
+        return data[index >> 2];
     }
-    SLANG_CUDA_CALL uint2 Load2(size_t index) const 
-    { 
+    SLANG_CUDA_CALL uint2 Load2(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint2{data[dataIdx], data[dataIdx + 1]}; 
+        const size_t dataIdx = index >> 2;
+        return uint2{data[dataIdx], data[dataIdx + 1]};
     }
-    SLANG_CUDA_CALL uint3 Load3(size_t index) const 
-    { 
+    SLANG_CUDA_CALL uint3 Load3(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; 
+        const size_t dataIdx = index >> 2;
+        return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]};
     }
-    SLANG_CUDA_CALL uint4 Load4(size_t index) const 
-    { 
+    SLANG_CUDA_CALL uint4 Load4(size_t index) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
-        return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; 
+        const size_t dataIdx = index >> 2;
+        return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]};
     }
     template<typename T>
     SLANG_CUDA_CALL T Load(size_t index) const
@@ -1311,31 +2039,31 @@ struct RWByteAddressBuffer
         memcpy(&data, ((const char*)this->data) + index, sizeof(T));
         return data;
     }
-    
-    SLANG_CUDA_CALL void Store(size_t index, uint32_t v) const 
-    { 
+
+    SLANG_CUDA_CALL void Store(size_t index, uint32_t v) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
-        data[index >> 2] = v; 
+        data[index >> 2] = v;
     }
-    SLANG_CUDA_CALL void Store2(size_t index, uint2 v) const 
-    { 
+    SLANG_CUDA_CALL void Store2(size_t index, uint2 v) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
+        const size_t dataIdx = index >> 2;
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
     }
-    SLANG_CUDA_CALL void Store3(size_t index, uint3 v) const 
-    { 
+    SLANG_CUDA_CALL void Store3(size_t index, uint3 v) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
+        const size_t dataIdx = index >> 2;
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
         data[dataIdx + 2] = v.z;
     }
-    SLANG_CUDA_CALL void Store4(size_t index, uint4 v) const 
-    { 
+    SLANG_CUDA_CALL void Store4(size_t index, uint4 v) const
+    {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
-        const size_t dataIdx = index >> 2; 
+        const size_t dataIdx = index >> 2;
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
         data[dataIdx + 2] = v.z;
@@ -1347,9 +2075,9 @@ struct RWByteAddressBuffer
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
         memcpy((char*)data + index, &value, sizeof(T));
     }
-    
-        /// Can be used in the core module to gain access
-    template <typename T>
+
+    /// Can be used in the core module to gain access
+    template<typename T>
     SLANG_CUDA_CALL T* _getPtrAt(size_t index)
     {
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
@@ -1364,69 +2092,71 @@ struct RWByteAddressBuffer
         return rs;
     }
     uint32_t* data;
-    size_t sizeInBytes; //< Must be multiple of 4 
+    size_t sizeInBytes; //< Must be multiple of 4
 };
 
 
 // ---------------------- Wave --------------------------------------
 
-// TODO(JS): It appears that cuda does not have a simple way to get a lane index. 
-// 
-// Another approach could be... 
-// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK
-// If that is really true another way to do this, would be for code generator to add this function 
-// with the [numthreads] baked in. 
-// 
-// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex()
+// TODO(JS): It appears that cuda does not have a simple way to get a lane index.
+//
+// Another approach could be...
+// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) &
+// SLANG_CUDA_WARP_MASK. If that is really true, another way to do this would be for the code
+// generator to add this function with the [numthreads] baked in.
+//
+// For now I'll just assume you have a launch that makes the following correct if the kernel uses
+// WaveGetLaneIndex()
 #ifndef SLANG_USE_ASM_LANE_ID
- __forceinline__ __device__ uint32_t _getLaneId()
+__forceinline__ __device__ uint32_t _getLaneId()
 {
-    // If the launch is (or I guess some multiple of the warp size) 
-    // we try this mechanism, which is apparently faster. 
+    // If the launch is (or I guess some multiple of the warp size)
+    // we try this mechanism, which is apparently faster.
     return threadIdx.x & SLANG_CUDA_WARP_MASK;
 }
 #else
 __forceinline__ __device__ uint32_t _getLaneId()
 {
     // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid#
-    // This mechanism is not the fastest way to do it, and that is why the other mechanism 
-    // is the default. But the other mechanism relies on a launch that makes the assumption 
+    // This mechanism is not the fastest way to do it, and that is why the other mechanism
+    // is the default. But the other mechanism relies on a launch that makes the assumption
     // true.
-    unsigned ret; 
-    asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret));
+    unsigned ret;
+    asm volatile("mov.u32 %0, %laneid;" : "=r"(ret));
     return ret;
 }
 #endif
 
 typedef int WarpMask;
 
-// It appears that the __activemask() cannot always be used because 
-// threads need to be converged. 
-// 
+// It appears that the __activemask() cannot always be used because
+// threads need to be converged.
+//
 // For CUDA the article claims mask has to be used carefully
 // https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
-// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. 
+// With the Warp intrinsics there is no mask, and it's just the 'active lanes'.
 // __activemask() though does not require there is convergence, so that doesn't work.
-// 
-// '__ballot_sync' produces a convergance. 
-// 
+//
+// '__ballot_sync' produces a convergence.
+//
 // From the CUDA docs:
-// ```For __all_sync, __any_sync, and __ballot_sync, a mask must be passed that specifies the threads 
-// participating in the call. A bit, representing the thread's lane ID, must be set for each participating thread 
-// to ensure they are properly converged before the intrinsic is executed by the hardware. All active threads named 
-// in mask must execute the same intrinsic with the same mask, or the result is undefined.```
+// ```For __all_sync, __any_sync, and __ballot_sync, a mask must be passed that specifies the
+// threads participating in the call. A bit, representing the thread's lane ID, must be set for each
+// participating thread to ensure they are properly converged before the intrinsic is executed by
+// the hardware. All active threads named in mask must execute the same intrinsic with the same
+// mask, or the result is undefined.```
 //
 // Currently there isn't a mechanism to correctly get the mask without it being passed through.
-// Doing so will most likely require some changes to slang code generation to track masks, for now then we use
-// _getActiveMask. 
+// Doing so will most likely require some changes to slang code generation to track masks, for now
+// then we use _getActiveMask.
 
 // Return mask of all the lanes less than the current lane
 __forceinline__ __device__ WarpMask _getLaneLtMask()
 {
     return (int(1) << _getLaneId()) - 1;
-}    
+}
 
-// TODO(JS): 
+// TODO(JS):
 // THIS IS NOT CORRECT! That determining the appropriate active mask requires appropriate
 // mask tracking.
 __forceinline__ __device__ WarpMask _getActiveMask()
@@ -1478,30 +2208,30 @@ __inline__ __device__ int _waveCalcPow2Offset(WarpMask mask)
 __inline__ __device__ bool _waveIsFirstLane()
 {
     const WarpMask mask = __activemask();
-    // We special case bit 0, as that most warps are expected to be fully active. 
-    
+    // We special case bit 0, as most warps are expected to be fully active.
+
     // mask & -mask, isolates the lowest set bit.
-    //return (mask & 1 ) || ((mask & -mask) == (1 << _getLaneId()));
-    
-    // This mechanism is most similar to what was in an nVidia post, so assume it is prefered. 
-    return (mask & 1 ) || ((__ffs(mask) - 1) == _getLaneId());
+    // Alternative: return (mask & -mask) == (1 << _getLaneId());
+
+    // Only the lowest active lane is first: lane 0 if bit 0 is set, else lane __ffs(mask) - 1.
+    return (mask & 1) ? (_getLaneId() == 0) : ((__ffs(mask) - 1) == _getLaneId());
 }
 
-template <typename T>
+template<typename T>
 struct WaveOpOr
 {
     __inline__ __device__ static T getInitial(T a) { return 0; }
     __inline__ __device__ static T doOp(T a, T b) { return a | b; }
 };
 
-template <typename T>
+template<typename T>
 struct WaveOpAnd
 {
     __inline__ __device__ static T getInitial(T a) { return ~T(0); }
     __inline__ __device__ static T doOp(T a, T b) { return a & b; }
 };
 
-template <typename T>
+template<typename T>
 struct WaveOpXor
 {
     __inline__ __device__ static T getInitial(T a) { return 0; }
@@ -1509,7 +2239,7 @@ struct WaveOpXor
     __inline__ __device__ static T doInverse(T a, T b) { return a ^ b; }
 };
 
-template <typename T>
+template<typename T>
 struct WaveOpAdd
 {
     __inline__ __device__ static T getInitial(T a) { return 0; }
@@ -1517,77 +2247,166 @@ struct WaveOpAdd
     __inline__ __device__ static T doInverse(T a, T b) { return a - b; }
 };
 
-template <typename T>
+template<typename T>
 struct WaveOpMul
 {
     __inline__ __device__ static T getInitial(T a) { return T(1); }
     __inline__ __device__ static T doOp(T a, T b) { return a * b; }
-    // Using this inverse for int is probably undesirable - because in general it requires T to have more precision
-    // There is also a performance aspect to it, where divides are generally significantly slower
+    // Using this inverse for int is probably undesirable - because in general it requires T to have
+    // more precision. There is also a performance aspect to it, where divides are generally
+    // significantly slower
     __inline__ __device__ static T doInverse(T a, T b) { return a / b; }
 };
 
-template <typename T>
+template<typename T>
 struct WaveOpMax
 {
     __inline__ __device__ static T getInitial(T a) { return a; }
     __inline__ __device__ static T doOp(T a, T b) { return a > b ? a : b; }
 };
 
-template <typename T>
+template<typename T>
 struct WaveOpMin
 {
-    __inline__  __device__ static T getInitial(T a) { return a; }
+    __inline__ __device__ static T getInitial(T a) { return a; }
     __inline__ __device__ static T doOp(T a, T b) { return a < b ? a : b; }
 };
 
-template <typename T>
+template<typename T>
 struct ElementTypeTrait;
 
 // Scalar
-template <> struct ElementTypeTrait<int> { typedef int Type; };
-template <> struct ElementTypeTrait<uint> { typedef uint Type; };
-template <> struct ElementTypeTrait<float> { typedef float Type; };
-template <> struct ElementTypeTrait<double> { typedef double Type; };
-template <> struct ElementTypeTrait<uint64_t> { typedef uint64_t Type; };
-template <> struct ElementTypeTrait<int64_t> { typedef int64_t Type; };
+template<>
+struct ElementTypeTrait<int>
+{
+    typedef int Type;
+};
+template<>
+struct ElementTypeTrait<uint>
+{
+    typedef uint Type;
+};
+template<>
+struct ElementTypeTrait<float>
+{
+    typedef float Type;
+};
+template<>
+struct ElementTypeTrait<double>
+{
+    typedef double Type;
+};
+template<>
+struct ElementTypeTrait<uint64_t>
+{
+    typedef uint64_t Type;
+};
+template<>
+struct ElementTypeTrait<int64_t>
+{
+    typedef int64_t Type;
+};
 
 // Vector
-template <> struct ElementTypeTrait<int1> { typedef int Type; };
-template <> struct ElementTypeTrait<int2> { typedef int Type; };
-template <> struct ElementTypeTrait<int3> { typedef int Type; };
-template <> struct ElementTypeTrait<int4> { typedef int Type; };
-
-template <> struct ElementTypeTrait<uint1> { typedef uint Type; };
-template <> struct ElementTypeTrait<uint2> { typedef uint Type; };
-template <> struct ElementTypeTrait<uint3> { typedef uint Type; };
-template <> struct ElementTypeTrait<uint4> { typedef uint Type; };
-
-template <> struct ElementTypeTrait<float1> { typedef float Type; };
-template <> struct ElementTypeTrait<float2> { typedef float Type; };
-template <> struct ElementTypeTrait<float3> { typedef float Type; };
-template <> struct ElementTypeTrait<float4> { typedef float Type; };
-
-template <> struct ElementTypeTrait<double1> { typedef double Type; };
-template <> struct ElementTypeTrait<double2> { typedef double Type; };
-template <> struct ElementTypeTrait<double3> { typedef double Type; };
-template <> struct ElementTypeTrait<double4> { typedef double Type; };
+template<>
+struct ElementTypeTrait<int1>
+{
+    typedef int Type;
+};
+template<>
+struct ElementTypeTrait<int2>
+{
+    typedef int Type;
+};
+template<>
+struct ElementTypeTrait<int3>
+{
+    typedef int Type;
+};
+template<>
+struct ElementTypeTrait<int4>
+{
+    typedef int Type;
+};
+
+template<>
+struct ElementTypeTrait<uint1>
+{
+    typedef uint Type;
+};
+template<>
+struct ElementTypeTrait<uint2>
+{
+    typedef uint Type;
+};
+template<>
+struct ElementTypeTrait<uint3>
+{
+    typedef uint Type;
+};
+template<>
+struct ElementTypeTrait<uint4>
+{
+    typedef uint Type;
+};
+
+template<>
+struct ElementTypeTrait<float1>
+{
+    typedef float Type;
+};
+template<>
+struct ElementTypeTrait<float2>
+{
+    typedef float Type;
+};
+template<>
+struct ElementTypeTrait<float3>
+{
+    typedef float Type;
+};
+template<>
+struct ElementTypeTrait<float4>
+{
+    typedef float Type;
+};
+
+template<>
+struct ElementTypeTrait<double1>
+{
+    typedef double Type;
+};
+template<>
+struct ElementTypeTrait<double2>
+{
+    typedef double Type;
+};
+template<>
+struct ElementTypeTrait<double3>
+{
+    typedef double Type;
+};
+template<>
+struct ElementTypeTrait<double4>
+{
+    typedef double Type;
+};
 
 // Matrix
-template <typename T, int ROWS, int COLS> 
-struct ElementTypeTrait<Matrix<T, ROWS, COLS> >  
-{ 
-    typedef T Type; 
+template<typename T, int ROWS, int COLS>
+struct ElementTypeTrait<Matrix<T, ROWS, COLS>>
+{
+    typedef T Type;
 };
 
-// Scalar 
-template <typename INTF, typename T>
+// Scalar
+template<typename INTF, typename T>
 __device__ T _waveReduceScalar(WarpMask mask, T val)
 {
     const int offsetSize = _waveCalcPow2Offset(mask);
     if (offsetSize > 0)
     {
-        // Fast path O(log2(activeLanes)) 
+        // Fast path O(log2(activeLanes))
         for (int offset = offsetSize >> 1; offset > 0; offset >>= 1)
         {
             val = INTF::doOp(val, __shfl_xor_sync(mask, val, offset));
@@ -1600,9 +2419,9 @@ __device__ T _waveReduceScalar(WarpMask mask, T val)
         while (remaining)
         {
             const int laneBit = remaining & -remaining;
-            // Get the sourceLane 
+            // Get the sourceLane
             const int srcLane = __ffs(laneBit) - 1;
-            // Broadcast (can also broadcast to self) 
+            // Broadcast (can also broadcast to self)
             result = INTF::doOp(result, __shfl_sync(mask, val, srcLane));
             remaining &= ~laneBit;
         }
@@ -1613,13 +2432,13 @@ __device__ T _waveReduceScalar(WarpMask mask, T val)
 
 
 // Multiple values
-template <typename INTF, typename T, size_t COUNT>
+template<typename INTF, typename T, size_t COUNT>
 __device__ void _waveReduceMultiple(WarpMask mask, T* val)
 {
     const int offsetSize = _waveCalcPow2Offset(mask);
     if (offsetSize > 0)
     {
-        // Fast path O(log2(activeLanes)) 
+        // Fast path O(log2(activeLanes))
         for (int offset = offsetSize >> 1; offset > 0; offset >>= 1)
         {
             for (size_t i = 0; i < COUNT; ++i)
@@ -1638,14 +2457,14 @@ __device__ void _waveReduceMultiple(WarpMask mask, T* val)
             originalVal[i] = v;
             val[i] = INTF::getInitial(v);
         }
-        
+
         int remaining = mask;
         while (remaining)
         {
             const int laneBit = remaining & -remaining;
-            // Get the sourceLane 
+            // Get the sourceLane
             const int srcLane = __ffs(laneBit) - 1;
-            // Broadcast (can also broadcast to self) 
+            // Broadcast (can also broadcast to self)
             for (size_t i = 0; i < COUNT; ++i)
             {
                 val[i] = INTF::doOp(val[i], __shfl_sync(mask, originalVal[i], srcLane));
@@ -1655,99 +2474,182 @@ __device__ void _waveReduceMultiple(WarpMask mask, T* val)
     }
 }
 
-template <typename INTF, typename T>
+template<typename INTF, typename T>
 __device__ void _waveReduceMultiple(WarpMask mask, T* val)
 {
-    typedef typename ElementTypeTrait<T>::Type ElemType;    
+    typedef typename ElementTypeTrait<T>::Type ElemType;
     _waveReduceMultiple<INTF, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)val);
 }
 
-template <typename T>
-__inline__ __device__  T _waveOr(WarpMask mask, T val) { return _waveReduceScalar<WaveOpOr<T>, T>(mask, val); }
+template<typename T>
+__inline__ __device__ T _waveOr(WarpMask mask, T val)
+{
+    return _waveReduceScalar<WaveOpOr<T>, T>(mask, val);
+}
 
-template <typename T>
-__inline__ __device__ T _waveAnd(WarpMask mask, T val) { return _waveReduceScalar<WaveOpAnd<T>, T>(mask, val); }
+template<typename T>
+__inline__ __device__ T _waveAnd(WarpMask mask, T val)
+{
+    return _waveReduceScalar<WaveOpAnd<T>, T>(mask, val);
+}
 
-template <typename T>
-__inline__ __device__ T _waveXor(WarpMask mask, T val) { return _waveReduceScalar<WaveOpXor<T>, T>(mask, val); }
+template<typename T>
+__inline__ __device__ T _waveXor(WarpMask mask, T val)
+{
+    return _waveReduceScalar<WaveOpXor<T>, T>(mask, val);
+}
 
-template <typename T>
-__inline__ __device__ T _waveProduct(WarpMask mask, T val) { return _waveReduceScalar<WaveOpMul<T>, T>(mask, val); }
+template<typename T>
+__inline__ __device__ T _waveProduct(WarpMask mask, T val)
+{
+    return _waveReduceScalar<WaveOpMul<T>, T>(mask, val);
+}
 
-template <typename T>
-__inline__ __device__ T _waveSum(WarpMask mask, T val) { return _waveReduceScalar<WaveOpAdd<T>, T>(mask, val); }
+template<typename T>
+__inline__ __device__ T _waveSum(WarpMask mask, T val)
+{
+    return _waveReduceScalar<WaveOpAdd<T>, T>(mask, val);
+}
 
-template <typename T>
-__inline__ __device__ T _waveMin(WarpMask mask, T val) { return _waveReduceScalar<WaveOpMin<T>, T>(mask, val); }
+template<typename T>
+__inline__ __device__ T _waveMin(WarpMask mask, T val)
+{
+    return _waveReduceScalar<WaveOpMin<T>, T>(mask, val);
+}
 
-template <typename T>
-__inline__ __device__ T _waveMax(WarpMask mask, T val) { return _waveReduceScalar<WaveOpMax<T>, T>(mask, val); }
+template<typename T>
+__inline__ __device__ T _waveMax(WarpMask mask, T val)
+{
+    return _waveReduceScalar<WaveOpMax<T>, T>(mask, val);
+}
 
 // Fast-path specializations when CUDA warp reduce operators are available
 #if __CUDA_ARCH__ >= 800 // 8.x or higher
 template<>
-__inline__ __device__ unsigned _waveOr<unsigned>(WarpMask mask, unsigned val) { return __reduce_or_sync(mask, val); }
+__inline__ __device__ unsigned _waveOr<unsigned>(WarpMask mask, unsigned val)
+{
+    return __reduce_or_sync(mask, val);
+}
 
 template<>
-__inline__ __device__ unsigned _waveAnd<unsigned>(WarpMask mask, unsigned val) { return __reduce_and_sync(mask, val); }
+__inline__ __device__ unsigned _waveAnd<unsigned>(WarpMask mask, unsigned val)
+{
+    return __reduce_and_sync(mask, val);
+}
 
 template<>
-__inline__ __device__ unsigned _waveXor<unsigned>(WarpMask mask, unsigned val) { return __reduce_xor_sync(mask, val); }
+__inline__ __device__ unsigned _waveXor<unsigned>(WarpMask mask, unsigned val)
+{
+    return __reduce_xor_sync(mask, val);
+}
 
 template<>
-__inline__ __device__ unsigned _waveSum<unsigned>(WarpMask mask, unsigned val) { return __reduce_add_sync(mask, val); }
+__inline__ __device__ unsigned _waveSum<unsigned>(WarpMask mask, unsigned val)
+{
+    return __reduce_add_sync(mask, val);
+}
 
 template<>
-__inline__ __device__ int _waveSum<int>(WarpMask mask, int val) { return __reduce_add_sync(mask, val); }
+__inline__ __device__ int _waveSum<int>(WarpMask mask, int val)
+{
+    return __reduce_add_sync(mask, val);
+}
 
 template<>
-__inline__ __device__ unsigned _waveMin<unsigned>(WarpMask mask, unsigned val) { return __reduce_min_sync(mask, val); }
+__inline__ __device__ unsigned _waveMin<unsigned>(WarpMask mask, unsigned val)
+{
+    return __reduce_min_sync(mask, val);
+}
 
 template<>
-__inline__ __device__ int _waveMin<int>(WarpMask mask, int val) { return __reduce_min_sync(mask, val); }
+__inline__ __device__ int _waveMin<int>(WarpMask mask, int val)
+{
+    return __reduce_min_sync(mask, val);
+}
 
 template<>
-__inline__ __device__ unsigned _waveMax<unsigned>(WarpMask mask, unsigned val) { return __reduce_max_sync(mask, val); }
+__inline__ __device__ unsigned _waveMax<unsigned>(WarpMask mask, unsigned val)
+{
+    return __reduce_max_sync(mask, val);
+}
 
 template<>
-__inline__ __device__ int _waveMax<int>(WarpMask mask, int val) { return __reduce_max_sync(mask, val); }
+__inline__ __device__ int _waveMax<int>(WarpMask mask, int val)
+{
+    return __reduce_max_sync(mask, val);
+}
 #endif
 
 
 // Multiple
 
-template <typename T>
-__inline__ __device__  T _waveOrMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpOr<ElemType> >(mask, &val); return val; }
+template<typename T>
+__inline__ __device__ T _waveOrMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _waveReduceMultiple<WaveOpOr<ElemType>>(mask, &val);
+    return val;
+}
 
-template <typename T>
-__inline__ __device__  T _waveAndMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAnd<ElemType> >(mask, &val); return val; }
+template<typename T>
+__inline__ __device__ T _waveAndMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _waveReduceMultiple<WaveOpAnd<ElemType>>(mask, &val);
+    return val;
+}
 
-template <typename T>
-__inline__ __device__  T _waveXorMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpXor<ElemType> >(mask, &val); return val; }
+template<typename T>
+__inline__ __device__ T _waveXorMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _waveReduceMultiple<WaveOpXor<ElemType>>(mask, &val);
+    return val;
+}
 
-template <typename T>
-__inline__ __device__  T _waveProductMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMul<ElemType> >(mask, &val); return val; }
+template<typename T>
+__inline__ __device__ T _waveProductMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _waveReduceMultiple<WaveOpMul<ElemType>>(mask, &val);
+    return val;
+}
 
-template <typename T>
-__inline__ __device__  T _waveSumMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAdd<ElemType> >(mask, &val); return val; }
+template<typename T>
+__inline__ __device__ T _waveSumMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _waveReduceMultiple<WaveOpAdd<ElemType>>(mask, &val);
+    return val;
+}
 
-template <typename T>
-__inline__ __device__  T _waveMinMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMin<ElemType> >(mask, &val); return val; }
+template<typename T>
+__inline__ __device__ T _waveMinMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _waveReduceMultiple<WaveOpMin<ElemType>>(mask, &val);
+    return val;
+}
 
-template <typename T>
-__inline__ __device__  T _waveMaxMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMax<ElemType> >(mask, &val); return val; }
+template<typename T>
+__inline__ __device__ T _waveMaxMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _waveReduceMultiple<WaveOpMax<ElemType>>(mask, &val);
+    return val;
+}
 
 
-template <typename T>
-__inline__ __device__ bool _waveAllEqual(WarpMask mask, T val) 
+template<typename T>
+__inline__ __device__ bool _waveAllEqual(WarpMask mask, T val)
 {
     int pred;
     __match_all_sync(mask, val, &pred);
     return pred != 0;
 }
 
-template <typename T>
-__inline__ __device__ bool _waveAllEqualMultiple(WarpMask mask, T inVal) 
+template<typename T>
+__inline__ __device__ bool _waveAllEqualMultiple(WarpMask mask, T inVal)
 {
     typedef typename ElementTypeTrait<T>::Type ElemType;
     const size_t count = sizeof(T) / sizeof(ElemType);
@@ -1764,15 +2666,15 @@ __inline__ __device__ bool _waveAllEqualMultiple(WarpMask mask, T inVal)
     return true;
 }
 
-template <typename T>
-__inline__ __device__ T _waveReadFirst(WarpMask mask, T val) 
+template<typename T>
+__inline__ __device__ T _waveReadFirst(WarpMask mask, T val)
 {
     const int lowestLaneId = __ffs(mask) - 1;
-    return __shfl_sync(mask, val, lowestLaneId);   
+    return __shfl_sync(mask, val, lowestLaneId);
 }
 
-template <typename T>
-__inline__ __device__ T _waveReadFirstMultiple(WarpMask mask, T inVal) 
+template<typename T>
+__inline__ __device__ T _waveReadFirstMultiple(WarpMask mask, T inVal)
 {
     typedef typename ElementTypeTrait<T>::Type ElemType;
     const size_t count = sizeof(T) / sizeof(ElemType);
@@ -1782,12 +2684,12 @@ __inline__ __device__ T _waveReadFirstMultiple(WarpMask mask, T inVal)
     const int lowestLaneId = __ffs(mask) - 1;
     for (size_t i = 0; i < count; ++i)
     {
-        dst[i] = __shfl_sync(mask, src[i], lowestLaneId);   
+        dst[i] = __shfl_sync(mask, src[i], lowestLaneId);
     }
     return outVal;
 }
 
-template <typename T>
+template<typename T>
 __inline__ __device__ T _waveShuffleMultiple(WarpMask mask, T inVal, int lane)
 {
     typedef typename ElementTypeTrait<T>::Type ElemType;
@@ -1797,27 +2699,27 @@ __inline__ __device__ T _waveShuffleMultiple(WarpMask mask, T inVal, int lane)
     ElemType* dst = (ElemType*)&outVal;
     for (size_t i = 0; i < count; ++i)
     {
-        dst[i] = __shfl_sync(mask, src[i], lane);   
+        dst[i] = __shfl_sync(mask, src[i], lane);
     }
     return outVal;
 }
 
-// Scalar 
+// Scalar
 
-// Invertable means that when we get to the end of the reduce, we can remove val (to make exclusive), using 
-// the inverse of the op.
-template <typename INTF, typename T>
+// Invertable means that when we get to the end of the reduce, we can remove val (to make
+// exclusive), using the inverse of the op.
+template<typename INTF, typename T>
 __device__ T _wavePrefixInvertableScalar(WarpMask mask, T val)
 {
     const int offsetSize = _waveCalcPow2Offset(mask);
-    
+
     const int laneId = _getLaneId();
     T result;
     if (offsetSize > 0)
-    {    
+    {
         // Sum is calculated inclusive of this lanes value
         result = val;
-        for (int i = 1; i < offsetSize; i += i) 
+        for (int i = 1; i < offsetSize; i += i)
         {
             const T readVal = __shfl_up_sync(mask, result, i, offsetSize);
             if (laneId >= i)
@@ -1828,7 +2730,7 @@ __device__ T _wavePrefixInvertableScalar(WarpMask mask, T val)
         // Remove val from the result, by applyin inverse
         result = INTF::doInverse(result, val);
     }
-    else 
+    else
     {
         result = INTF::getInitial(val);
         if (!_waveIsSingleLane(mask))
@@ -1837,9 +2739,9 @@ __device__ T _wavePrefixInvertableScalar(WarpMask mask, T val)
             while (remaining)
             {
                 const int laneBit = remaining & -remaining;
-                // Get the sourceLane 
+                // Get the sourceLane
                 const int srcLane = __ffs(laneBit) - 1;
-                // Broadcast (can also broadcast to self) 
+                // Broadcast (can also broadcast to self)
                 const T readValue = __shfl_sync(mask, val, srcLane);
                 // Only accumulate if srcLane is less than this lane
                 if (srcLane < laneId)
@@ -1848,27 +2750,28 @@ __device__ T _wavePrefixInvertableScalar(WarpMask mask, T val)
                 }
                 remaining &= ~laneBit;
             }
-        }   
+        }
     }
     return result;
 }
- 
+
 
 // This implementation separately tracks the value to be propogated, and the value
-// that is the final result 
-template <typename INTF, typename T>
+// that is the final result
+template<typename INTF, typename T>
 __device__ T _wavePrefixScalar(WarpMask mask, T val)
 {
     const int offsetSize = _waveCalcPow2Offset(mask);
-    
+
     const int laneId = _getLaneId();
-    T result = INTF::getInitial(val);           
+    T result = INTF::getInitial(val);
     if (offsetSize > 0)
-    {    
+    {
         // For transmitted value we will do it inclusively with this lanes value
-        // For the result we do not include the lanes value. This means an extra multiply for each iteration
-        // but means we don't need to have a divide at the end and also removes overflow issues in that scenario.
-        for (int i = 1; i < offsetSize; i += i) 
+        // For the result we do not include the lanes value. This means an extra multiply for each
+        // iteration but means we don't need to have a divide at the end and also removes overflow
+        // issues in that scenario.
+        for (int i = 1; i < offsetSize; i += i)
         {
             const T readVal = __shfl_up_sync(mask, val, i, offsetSize);
             if (laneId >= i)
@@ -1878,7 +2781,7 @@ __device__ T _wavePrefixScalar(WarpMask mask, T val)
             }
         }
     }
-    else 
+    else
     {
         if (!_waveIsSingleLane(mask))
         {
@@ -1886,9 +2789,9 @@ __device__ T _wavePrefixScalar(WarpMask mask, T val)
             while (remaining)
             {
                 const int laneBit = remaining & -remaining;
-                // Get the sourceLane 
+                // Get the sourceLane
                 const int srcLane = __ffs(laneBit) - 1;
-                // Broadcast (can also broadcast to self) 
+                // Broadcast (can also broadcast to self)
                 const T readValue = __shfl_sync(mask, val, srcLane);
                 // Only accumulate if srcLane is less than this lane
                 if (srcLane < laneId)
@@ -1903,51 +2806,51 @@ __device__ T _wavePrefixScalar(WarpMask mask, T val)
 }
 
 
-template <typename INTF, typename T, size_t COUNT>
+template<typename INTF, typename T, size_t COUNT>
 __device__ T _waveOpCopy(T* dst, const T* src)
 {
     for (size_t j = 0; j < COUNT; ++j)
     {
         dst[j] = src[j];
     }
-}    
+}
 
 
-template <typename INTF, typename T, size_t COUNT>
+template<typename INTF, typename T, size_t COUNT>
 __device__ T _waveOpDoInverse(T* inOut, const T* val)
 {
     for (size_t j = 0; j < COUNT; ++j)
     {
         inOut[j] = INTF::doInverse(inOut[j], val[j]);
     }
-}    
+}
 
-template <typename INTF, typename T, size_t COUNT>
+template<typename INTF, typename T, size_t COUNT>
 __device__ T _waveOpSetInitial(T* out, const T* val)
 {
     for (size_t j = 0; j < COUNT; ++j)
     {
         out[j] = INTF::getInitial(val[j]);
     }
-} 
+}
 
-template <typename INTF, typename T, size_t COUNT>
+template<typename INTF, typename T, size_t COUNT>
 __device__ T _wavePrefixInvertableMultiple(WarpMask mask, T* val)
 {
     const int offsetSize = _waveCalcPow2Offset(mask);
-    
+
     const int laneId = _getLaneId();
     T originalVal[COUNT];
     _waveOpCopy<INTF, T, COUNT>(originalVal, val);
-    
+
     if (offsetSize > 0)
-    {    
+    {
         // Sum is calculated inclusive of this lanes value
-        for (int i = 1; i < offsetSize; i += i) 
+        for (int i = 1; i < offsetSize; i += i)
         {
             // TODO(JS): Note that here I don't split the laneId outside so it's only tested once.
-            // This may be better but it would also mean that there would be shfl between lanes 
-            // that are on different (albeit identical) instructions. So this seems more likely to 
+            // This may be better but it would also mean that there would be shfl between lanes
+            // that are on different (albeit identical) instructions. So this seems more likely to
             // work as expected with everything in lock step.
             for (size_t j = 0; j < COUNT; ++j)
             {
@@ -1961,7 +2864,7 @@ __device__ T _wavePrefixInvertableMultiple(WarpMask mask, T* val)
         // Remove originalVal from the result, by applyin inverse
         _waveOpDoInverse<INTF, T, COUNT>(val, originalVal);
     }
-    else 
+    else
     {
         _waveOpSetInitial<INTF, T, COUNT>(val, val);
         if (!_waveIsSingleLane(mask))
@@ -1970,12 +2873,12 @@ __device__ T _wavePrefixInvertableMultiple(WarpMask mask, T* val)
             while (remaining)
             {
                 const int laneBit = remaining & -remaining;
-                // Get the sourceLane 
+                // Get the sourceLane
                 const int srcLane = __ffs(laneBit) - 1;
-                
+
                 for (size_t j = 0; j < COUNT; ++j)
                 {
-                    // Broadcast (can also broadcast to self) 
+                    // Broadcast (can also broadcast to self)
                     const T readValue = __shfl_sync(mask, originalVal[j], srcLane);
                     // Only accumulate if srcLane is less than this lane
                     if (srcLane < laneId)
@@ -1985,27 +2888,28 @@ __device__ T _wavePrefixInvertableMultiple(WarpMask mask, T* val)
                     remaining &= ~laneBit;
                 }
             }
-        }   
+        }
     }
 }
- 
-template <typename INTF, typename T, size_t COUNT>
+
+template<typename INTF, typename T, size_t COUNT>
 __device__ T _wavePrefixMultiple(WarpMask mask, T* val)
 {
     const int offsetSize = _waveCalcPow2Offset(mask);
-    
+
     const int laneId = _getLaneId();
-    
+
     T work[COUNT];
     _waveOpCopy<INTF, T, COUNT>(work, val);
     _waveOpSetInitial<INTF, T, COUNT>(val, val);
-    
+
     if (offsetSize > 0)
-    {    
+    {
         // For transmitted value we will do it inclusively with this lanes value
-        // For the result we do not include the lanes value. This means an extra op for each iteration
-        // but means we don't need to have a divide at the end and also removes overflow issues in that scenario.
-        for (int i = 1; i < offsetSize; i += i) 
+        // For the result we do not include the lanes value. This means an extra op for each
+        // iteration but means we don't need to have a divide at the end and also removes overflow
+        // issues in that scenario.
+        for (int i = 1; i < offsetSize; i += i)
         {
             for (size_t j = 0; j < COUNT; ++j)
             {
@@ -2013,12 +2917,12 @@ __device__ T _wavePrefixMultiple(WarpMask mask, T* val)
                 if (laneId >= i)
                 {
                     work[j] = INTF::doOp(work[j], readVal);
-                    val[j] = INTF::doOp(val[j], readVal);     
+                    val[j] = INTF::doOp(val[j], readVal);
                 }
             }
         }
     }
-    else 
+    else
     {
         if (!_waveIsSingleLane(mask))
         {
@@ -2026,12 +2930,12 @@ __device__ T _wavePrefixMultiple(WarpMask mask, T* val)
             while (remaining)
             {
                 const int laneBit = remaining & -remaining;
-                // Get the sourceLane 
+                // Get the sourceLane
                 const int srcLane = __ffs(laneBit) - 1;
-                
+
                 for (size_t j = 0; j < COUNT; ++j)
                 {
-                    // Broadcast (can also broadcast to self) 
+                    // Broadcast (can also broadcast to self)
                     const T readValue = __shfl_sync(mask, work[j], srcLane);
                     // Only accumulate if srcLane is less than this lane
                     if (srcLane < laneId)
@@ -2045,71 +2949,96 @@ __device__ T _wavePrefixMultiple(WarpMask mask, T* val)
     }
 }
 
-template <typename T>
-__inline__ __device__ T _wavePrefixProduct(WarpMask mask, T val) { return _wavePrefixScalar<WaveOpMul<T>, T>(mask, val); }
-
-template <typename T>
-__inline__ __device__ T _wavePrefixSum(WarpMask mask, T val) { return _wavePrefixInvertableScalar<WaveOpAdd<T>, T>(mask, val); }    
-
-template <typename T>
-__inline__ __device__ T _wavePrefixXor(WarpMask mask, T val) { return _wavePrefixInvertableScalar<WaveOpXor<T>, T>(mask, val); }    
-    
-template <typename T>
-__inline__ __device__ T _wavePrefixOr(WarpMask mask, T val) { return _wavePrefixScalar<WaveOpOr<T>, T>(mask, val); }      
-    
-template <typename T>
-__inline__ __device__ T _wavePrefixAnd(WarpMask mask, T val) { return _wavePrefixScalar<WaveOpAnd<T>, T>(mask, val); }      
-    
-    
-template <typename T>
-__inline__ __device__ T _wavePrefixProductMultiple(WarpMask mask, T val)  
-{ 
-    typedef typename ElementTypeTrait<T>::Type ElemType;    
-    _wavePrefixInvertableMultiple<WaveOpMul<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)&val);    
+template<typename T>
+__inline__ __device__ T _wavePrefixProduct(WarpMask mask, T val)
+{
+    return _wavePrefixScalar<WaveOpMul<T>, T>(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixSum(WarpMask mask, T val)
+{
+    return _wavePrefixInvertableScalar<WaveOpAdd<T>, T>(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixXor(WarpMask mask, T val)
+{
+    return _wavePrefixInvertableScalar<WaveOpXor<T>, T>(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixOr(WarpMask mask, T val)
+{
+    return _wavePrefixScalar<WaveOpOr<T>, T>(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixAnd(WarpMask mask, T val)
+{
+    return _wavePrefixScalar<WaveOpAnd<T>, T>(mask, val);
+}
+
+
+template<typename T>
+__inline__ __device__ T _wavePrefixProductMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _wavePrefixInvertableMultiple<WaveOpMul<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(
+        mask,
+        (ElemType*)&val);
     return val;
 }
 
-template <typename T>
-__inline__ __device__ T _wavePrefixSumMultiple(WarpMask mask, T val) 
-{ 
-    typedef typename ElementTypeTrait<T>::Type ElemType;    
-    _wavePrefixInvertableMultiple<WaveOpAdd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)&val);    
+template<typename T>
+__inline__ __device__ T _wavePrefixSumMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _wavePrefixInvertableMultiple<WaveOpAdd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(
+        mask,
+        (ElemType*)&val);
     return val;
 }
 
-template <typename T>
-__inline__ __device__ T _wavePrefixXorMultiple(WarpMask mask, T val)  
-{ 
-    typedef typename ElementTypeTrait<T>::Type ElemType;    
-    _wavePrefixInvertableMultiple<WaveOpXor<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)&val);    
+template<typename T>
+__inline__ __device__ T _wavePrefixXorMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _wavePrefixInvertableMultiple<WaveOpXor<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(
+        mask,
+        (ElemType*)&val);
     return val;
 }
 
-template <typename T>
-__inline__ __device__ T _wavePrefixOrMultiple(WarpMask mask, T val) 
-{ 
-    typedef typename ElementTypeTrait<T>::Type ElemType;    
-    _wavePrefixMultiple<WaveOpOr<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)&val);    
+template<typename T>
+__inline__ __device__ T _wavePrefixOrMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _wavePrefixMultiple<WaveOpOr<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(
+        mask,
+        (ElemType*)&val);
     return val;
 }
 
-template <typename T>
-__inline__ __device__ T _wavePrefixAndMultiple(WarpMask mask, T val)  
-{ 
-    typedef typename ElementTypeTrait<T>::Type ElemType;    
-    _wavePrefixMultiple<WaveOpAnd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)&val);    
+template<typename T>
+__inline__ __device__ T _wavePrefixAndMultiple(WarpMask mask, T val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;
+    _wavePrefixMultiple<WaveOpAnd<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(
+        mask,
+        (ElemType*)&val);
     return val;
 }
 
-template <typename T>
-__inline__ __device__ uint4 _waveMatchScalar(WarpMask mask, T val) 
+template<typename T>
+__inline__ __device__ uint4 _waveMatchScalar(WarpMask mask, T val)
 {
     int pred;
     return make_uint4(__match_all_sync(mask, val, &pred), 0, 0, 0);
 }
 
-template <typename T>
-__inline__ __device__ uint4 _waveMatchMultiple(WarpMask mask, const T& inVal) 
+template<typename T>
+__inline__ __device__ uint4 _waveMatchMultiple(WarpMask mask, const T& inVal)
 {
     typedef typename ElementTypeTrait<T>::Type ElemType;
     const size_t count = sizeof(T) / sizeof(ElemType);
@@ -2123,7 +3052,7 @@ __inline__ __device__ uint4 _waveMatchMultiple(WarpMask mask, const T& inVal)
     return make_uint4(matchBits, 0, 0, 0);
 }
 
-__device__ uint getAt(dim3 a,  int b)
+__device__ uint getAt(dim3 a, int b)
 {
     SLANG_PRELUDE_ASSERT(b >= 0 && b < 3);
     return (&a.x)[b];
@@ -2146,8 +3075,9 @@ __inline__ __device__ TResult slang_bit_cast(TInput val)
 /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */
 
 
-/* Type that defines the uniform entry point params. The actual content of this type is dependent on the entry point parameters, and can be
-found via reflection or defined such that it matches the shader appropriately.
+/* Type that defines the uniform entry point params. The actual content of this type is dependent on
+the entry point parameters, and can be found via reflection or defined such that it matches the
+shader appropriately.
 */
 struct UniformEntryPointParams;
 struct UniformState;
@@ -2157,28 +3087,29 @@ struct UniformState;
 struct RayDesc
 {
     float3 Origin;
-    float  TMin;
+    float TMin;
     float3 Direction;
-    float  TMax;
+    float TMax;
 };
 
-static __forceinline__ __device__
-void *unpackOptiXRayPayloadPointer(uint32_t i0, uint32_t i1)
+static __forceinline__ __device__ void* unpackOptiXRayPayloadPointer(uint32_t i0, uint32_t i1)
 {
     const uint64_t uptr = static_cast<uint64_t>(i0) << 32 | i1;
-    void*           ptr = reinterpret_cast<void*>(uptr);
+    void* ptr = reinterpret_cast<void*>(uptr);
     return ptr;
 }
 
-static __forceinline__ __device__
-void  packOptiXRayPayloadPointer(void* ptr, uint32_t& i0, uint32_t& i1)
+static __forceinline__ __device__ void packOptiXRayPayloadPointer(
+    void* ptr,
+    uint32_t& i0,
+    uint32_t& i1)
 {
     const uint64_t uptr = reinterpret_cast<uint64_t>(ptr);
     i0 = uptr >> 32;
     i1 = uptr & 0x00000000ffffffff;
 }
 
-static __forceinline__ __device__ void *getOptiXRayPayloadPtr()
+static __forceinline__ __device__ void* getOptiXRayPayloadPtr()
 {
     const uint32_t u0 = optixGetPayload_0();
     const uint32_t u1 = optixGetPayload_1();
@@ -2186,7 +3117,7 @@ static __forceinline__ __device__ void *getOptiXRayPayloadPtr()
 }
 
 template<typename T>
-__forceinline__ __device__ void *traceOptiXRay(
+__forceinline__ __device__ void* traceOptiXRay(
     OptixTraversableHandle AccelerationStructure,
     uint32_t RayFlags,
     uint32_t InstanceInclusionMask,
@@ -2194,8 +3125,8 @@ __forceinline__ __device__ void *traceOptiXRay(
     uint32_t MultiplierForGeometryContributionToHitGroupIndex,
     uint32_t MissShaderIndex,
     RayDesc Ray,
-    T *Payload
-) {
+    T* Payload)
+{
     uint32_t r0, r1;
     packOptiXRayPayloadPointer((void*)Payload, r0, r1);
     optixTrace(
@@ -2210,8 +3141,8 @@ __forceinline__ __device__ void *traceOptiXRay(
         RayContributionToHitGroupIndex,
         MultiplierForGeometryContributionToHitGroupIndex,
         MissShaderIndex,
-        r0, r1
-    );
+        r0,
+        r1);
 }
 
 #endif
@@ -2256,7 +3187,8 @@ struct TensorView
     template<typename T>
     __device__ T* data_ptr_at(uint4 index)
     {
-        uint64_t offset = strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + strides[3] * index.w;
+        uint64_t offset = strides[0] * index.x + strides[1] * index.y + strides[2] * index.z +
+                          strides[3] * index.w;
         return reinterpret_cast<T*>(data + offset);
     }
 
@@ -2294,22 +3226,28 @@ struct TensorView
     template<typename T>
     __device__ T& load(uint3 index)
     {
-        return *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z);
+        return *reinterpret_cast<T*>(
+            data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z);
     }
     template<typename T>
     __device__ T& load(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
     {
-        return *reinterpret_cast<T*>(data + strides[0] * x + strides[1] * y + strides[2] * z + strides[3] * w);
+        return *reinterpret_cast<T*>(
+            data + strides[0] * x + strides[1] * y + strides[2] * z + strides[3] * w);
     }
     template<typename T>
     __device__ T& load(uint4 index)
     {
-        return *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + strides[3] * index.w);
+        return *reinterpret_cast<T*>(
+            data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z +
+            strides[3] * index.w);
     }
     template<typename T>
     __device__ T& load(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4)
     {
-        return *reinterpret_cast<T*>(data + strides[0] * i0 + strides[1] * i1 + strides[2] * i2 + strides[3] * i3 + strides[4] * i4);
+        return *reinterpret_cast<T*>(
+            data + strides[0] * i0 + strides[1] * i1 + strides[2] * i2 + strides[3] * i3 +
+            strides[4] * i4);
     }
 
     // Generic version of load
@@ -2347,7 +3285,8 @@ struct TensorView
     template<typename T>
     __device__ void store(uint3 index, T val)
     {
-        *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z) = val;
+        *reinterpret_cast<T*>(
+            data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z) = val;
     }
     template<typename T>
     __device__ void store(uint32_t x, uint32_t y, uint32_t z, uint32_t w, T val)
@@ -2358,12 +3297,16 @@ struct TensorView
     template<typename T>
     __device__ void store(uint4 index, T val)
     {
-        *reinterpret_cast<T*>(data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z + strides[3] * index.w) = val;
+        *reinterpret_cast<T*>(
+            data + strides[0] * index.x + strides[1] * index.y + strides[2] * index.z +
+            strides[3] * index.w) = val;
     }
     template<typename T>
     __device__ void store(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, T val)
     {
-        *reinterpret_cast<T*>(data + strides[0] * i0 + strides[1] * i1 + strides[2] * i2 + strides[3] * i3 + strides[4] * i4) = val;
+        *reinterpret_cast<T*>(
+            data + strides[0] * i0 + strides[1] * i1 + strides[2] * i2 + strides[3] * i3 +
+            strides[4] * i4) = val;
     }
 
     // Generic version
diff --git a/prelude/slang-hlsl-prelude.h b/prelude/slang-hlsl-prelude.h
index d892f228c..8e77201f9 100644
--- a/prelude/slang-hlsl-prelude.h
+++ b/prelude/slang-hlsl-prelude.h
@@ -3,6 +3,6 @@
 #endif
 
 #ifndef __DXC_VERSION_MAJOR
-    // warning X3557: loop doesn't seem to do anything, forcing loop to unroll
-    #pragma warning(disable: 3557)
+// warning X3557: loop doesn't seem to do anything, forcing loop to unroll
+#pragma warning(disable : 3557)
 #endif
diff --git a/prelude/slang-llvm.h b/prelude/slang-llvm.h
index b41380581..e0bbbd14a 100644
--- a/prelude/slang-llvm.h
+++ b/prelude/slang-llvm.h
@@ -1,46 +1,54 @@
 #ifndef SLANG_LLVM_H
 #define SLANG_LLVM_H
 
-// TODO(JS): 
+// TODO(JS):
 // Disable exception declspecs, as not supported on LLVM without some extra options.
 // We could enable with `-fms-extensions`
 #define SLANG_DISABLE_EXCEPTIONS 1
 
 #ifndef SLANG_PRELUDE_ASSERT
-#   ifdef SLANG_PRELUDE_ENABLE_ASSERT
+#ifdef SLANG_PRELUDE_ENABLE_ASSERT
 extern "C" void assertFailure(const char* msg);
-#       define SLANG_PRELUDE_EXPECT(VALUE, MSG) if(VALUE) {} else assertFailure("assertion failed: '" MSG "'")
-#       define SLANG_PRELUDE_ASSERT(VALUE) SLANG_PRELUDE_EXPECT(VALUE, #VALUE)
-#   else // SLANG_PRELUDE_ENABLE_ASSERT
-#       define SLANG_PRELUDE_EXPECT(VALUE, MSG)
-#       define SLANG_PRELUDE_ASSERT(x) 
-#   endif // SLANG_PRELUDE_ENABLE_ASSERT
+#define SLANG_PRELUDE_EXPECT(VALUE, MSG) \
+    if (VALUE)                           \
+    {                                    \
+    }                                    \
+    else                                 \
+        assertFailure("assertion failed: '" MSG "'")
+#define SLANG_PRELUDE_ASSERT(VALUE) SLANG_PRELUDE_EXPECT(VALUE, #VALUE)
+#else // SLANG_PRELUDE_ENABLE_ASSERT
+#define SLANG_PRELUDE_EXPECT(VALUE, MSG)
+#define SLANG_PRELUDE_ASSERT(x)
+#endif // SLANG_PRELUDE_ENABLE_ASSERT
 #endif
 
 /*
-Taken from stddef.h 
+Taken from stddef.h
 */
 
 typedef __PTRDIFF_TYPE__ ptrdiff_t;
 typedef __SIZE_TYPE__ size_t;
 typedef __SIZE_TYPE__ rsize_t;
 
-//typedef __WCHAR_TYPE__ wchar_t;
+// typedef __WCHAR_TYPE__ wchar_t;
 
 #if defined(__need_NULL)
 #undef NULL
 #ifdef __cplusplus
-#  if !defined(__MINGW32__) && !defined(_MSC_VER)
-#    define NULL __null
-#  else
-#    define NULL 0
-#  endif
+#if !defined(__MINGW32__) && !defined(_MSC_VER)
+#define NULL __null
 #else
-#  define NULL ((void*)0)
+#define NULL 0
+#endif
+#else
+#define NULL ((void*)0)
 #endif
 #ifdef __cplusplus
 #if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
-namespace std { typedef decltype(nullptr) nullptr_t; }
+namespace std
+{
+typedef decltype(nullptr) nullptr_t;
+}
 using ::std::nullptr_t;
 #endif
 #endif
@@ -49,18 +57,18 @@ using ::std::nullptr_t;
 
 
 /*
-The following are taken verbatim from stdint.h from Clang in LLVM. Only 8/16/32/64 types are needed. 
+The following are taken verbatim from stdint.h from Clang in LLVM. Only 8/16/32/64 types are needed.
 */
 
 // LLVM/Clang types such that we can use LLVM/Clang without headers for C++ output from Slang
 
 #ifdef __INT64_TYPE__
-# ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/
+#ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/
 typedef __INT64_TYPE__ int64_t;
-# endif /* __int8_t_defined */
+#endif /* __int8_t_defined */
 typedef __UINT64_TYPE__ uint64_t;
-# define __int_least64_t int64_t
-# define __uint_least64_t uint64_t
+#define __int_least64_t int64_t
+#define __uint_least64_t uint64_t
 #endif /* __INT64_TYPE__ */
 
 #ifdef __int_least64_t
@@ -72,17 +80,17 @@ typedef __uint_least64_t uint_fast64_t;
 
 #ifdef __INT32_TYPE__
 
-# ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/
+#ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/
 typedef __INT32_TYPE__ int32_t;
-# endif /* __int8_t_defined */
+#endif /* __int8_t_defined */
 
-# ifndef __uint32_t_defined  /* more glibc compatibility */
-# define __uint32_t_defined
+#ifndef __uint32_t_defined /* more glibc compatibility */
+#define __uint32_t_defined
 typedef __UINT32_TYPE__ uint32_t;
-# endif /* __uint32_t_defined */
+#endif /* __uint32_t_defined */
 
-# define __int_least32_t int32_t
-# define __uint_least32_t uint32_t
+#define __int_least32_t int32_t
+#define __uint_least32_t uint32_t
 #endif /* __INT32_TYPE__ */
 
 #ifdef __int_least32_t
@@ -97,8 +105,8 @@ typedef __uint_least32_t uint_fast32_t;
 typedef __INT16_TYPE__ int16_t;
 #endif /* __int8_t_defined */
 typedef __UINT16_TYPE__ uint16_t;
-# define __int_least16_t int16_t
-# define __uint_least16_t uint16_t
+#define __int_least16_t int16_t
+#define __uint_least16_t uint16_t
 #endif /* __INT16_TYPE__ */
 
 #ifdef __int_least16_t
@@ -109,12 +117,12 @@ typedef __uint_least16_t uint_fast16_t;
 #endif /* __int_least16_t */
 
 #ifdef __INT8_TYPE__
-#ifndef __int8_t_defined  /* glibc sys/types.h also defines int8_t*/
+#ifndef __int8_t_defined /* glibc sys/types.h also defines int8_t*/
 typedef __INT8_TYPE__ int8_t;
 #endif /* __int8_t_defined */
 typedef __UINT8_TYPE__ uint8_t;
-# define __int_least8_t int8_t
-# define __uint_least8_t uint8_t
+#define __int_least8_t int8_t
+#define __uint_least8_t uint8_t
 #endif /* __INT8_TYPE__ */
 
 #ifdef __int_least8_t
@@ -126,12 +134,12 @@ typedef __uint_least8_t uint_fast8_t;
 
 /* prevent glibc sys/types.h from defining conflicting types */
 #ifndef __int8_t_defined
-# define __int8_t_defined
+#define __int8_t_defined
 #endif /* __int8_t_defined */
 
 /* C99 7.18.1.4 Integer types capable of holding object pointers.
  */
-#define __stdint_join3(a,b,c) a ## b ## c
+#define __stdint_join3(a, b, c) a##b##c
 
 #ifndef _INTPTR_T
 #ifndef __intptr_t_defined
@@ -148,7 +156,7 @@ typedef __UINTPTR_TYPE__ uintptr_t;
 
 /* C99 7.18.1.5 Greatest-width integer types.
  */
-typedef __INTMAX_TYPE__  intmax_t;
+typedef __INTMAX_TYPE__ intmax_t;
 typedef __UINTMAX_TYPE__ uintmax_t;
 
 /* C99 7.18.4 Macros for minimum-width integer constants.
@@ -168,82 +176,82 @@ typedef __UINTMAX_TYPE__ uintmax_t;
  * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
  */
 
-#define __int_c_join(a, b) a ## b
+#define __int_c_join(a, b) a##b
 #define __int_c(v, suffix) __int_c_join(v, suffix)
 #define __uint_c(v, suffix) __int_c_join(v##U, suffix)
 
 #ifdef __INT64_TYPE__
-# ifdef __INT64_C_SUFFIX__
-#  define __int64_c_suffix __INT64_C_SUFFIX__
-# else
-#  undef __int64_c_suffix
-# endif /* __INT64_C_SUFFIX__ */
+#ifdef __INT64_C_SUFFIX__
+#define __int64_c_suffix __INT64_C_SUFFIX__
+#else
+#undef __int64_c_suffix
+#endif /* __INT64_C_SUFFIX__ */
 #endif /* __INT64_TYPE__ */
 
 #ifdef __int_least64_t
-# ifdef __int64_c_suffix
-#  define INT64_C(v) __int_c(v, __int64_c_suffix)
-#  define UINT64_C(v) __uint_c(v, __int64_c_suffix)
-# else
-#  define INT64_C(v) v
-#  define UINT64_C(v) v ## U
-# endif /* __int64_c_suffix */
+#ifdef __int64_c_suffix
+#define INT64_C(v) __int_c(v, __int64_c_suffix)
+#define UINT64_C(v) __uint_c(v, __int64_c_suffix)
+#else
+#define INT64_C(v) v
+#define UINT64_C(v) v##U
+#endif /* __int64_c_suffix */
 #endif /* __int_least64_t */
 
 
 #ifdef __INT32_TYPE__
-# ifdef __INT32_C_SUFFIX__
-#  define __int32_c_suffix __INT32_C_SUFFIX__
+#ifdef __INT32_C_SUFFIX__
+#define __int32_c_suffix __INT32_C_SUFFIX__
 #else
-#  undef __int32_c_suffix
-# endif /* __INT32_C_SUFFIX__ */
+#undef __int32_c_suffix
+#endif /* __INT32_C_SUFFIX__ */
 #endif /* __INT32_TYPE__ */
 
 #ifdef __int_least32_t
-# ifdef __int32_c_suffix
-#  define INT32_C(v) __int_c(v, __int32_c_suffix)
-#  define UINT32_C(v) __uint_c(v, __int32_c_suffix)
-# else
-#  define INT32_C(v) v
-#  define UINT32_C(v) v ## U
-# endif /* __int32_c_suffix */
+#ifdef __int32_c_suffix
+#define INT32_C(v) __int_c(v, __int32_c_suffix)
+#define UINT32_C(v) __uint_c(v, __int32_c_suffix)
+#else
+#define INT32_C(v) v
+#define UINT32_C(v) v##U
+#endif /* __int32_c_suffix */
 #endif /* __int_least32_t */
 
 #ifdef __INT16_TYPE__
-# ifdef __INT16_C_SUFFIX__
-#  define __int16_c_suffix __INT16_C_SUFFIX__
+#ifdef __INT16_C_SUFFIX__
+#define __int16_c_suffix __INT16_C_SUFFIX__
 #else
-#  undef __int16_c_suffix
-# endif /* __INT16_C_SUFFIX__ */
+#undef __int16_c_suffix
+#endif /* __INT16_C_SUFFIX__ */
 #endif /* __INT16_TYPE__ */
 
 #ifdef __int_least16_t
-# ifdef __int16_c_suffix
-#  define INT16_C(v) __int_c(v, __int16_c_suffix)
-#  define UINT16_C(v) __uint_c(v, __int16_c_suffix)
-# else
-#  define INT16_C(v) v
-#  define UINT16_C(v) v ## U
-# endif /* __int16_c_suffix */
+#ifdef __int16_c_suffix
+#define INT16_C(v) __int_c(v, __int16_c_suffix)
+#define UINT16_C(v) __uint_c(v, __int16_c_suffix)
+#else
+#define INT16_C(v) v
+#define UINT16_C(v) v##U
+#endif /* __int16_c_suffix */
 #endif /* __int_least16_t */
 
 
 #ifdef __INT8_TYPE__
-# ifdef __INT8_C_SUFFIX__
-#  define __int8_c_suffix __INT8_C_SUFFIX__
+#ifdef __INT8_C_SUFFIX__
+#define __int8_c_suffix __INT8_C_SUFFIX__
 #else
-#  undef  __int8_c_suffix
-# endif /* __INT8_C_SUFFIX__ */
+#undef __int8_c_suffix
+#endif /* __INT8_C_SUFFIX__ */
 #endif /* __INT8_TYPE__ */
 
 #ifdef __int_least8_t
-# ifdef __int8_c_suffix
-#  define INT8_C(v) __int_c(v, __int8_c_suffix)
-#  define UINT8_C(v) __uint_c(v, __int8_c_suffix)
-# else
-#  define INT8_C(v) v
-#  define UINT8_C(v) v ## U
-# endif /* __int8_c_suffix */
+#ifdef __int8_c_suffix
+#define INT8_C(v) __int_c(v, __int8_c_suffix)
+#define UINT8_C(v) __uint_c(v, __int8_c_suffix)
+#else
+#define INT8_C(v) v
+#define UINT8_C(v) v##U
+#endif /* __int8_c_suffix */
 #endif /* __int_least8_t */
 
 /* C99 7.18.2.1 Limits of exact-width integer types.
@@ -266,133 +274,131 @@ typedef __UINTMAX_TYPE__ uintmax_t;
  */
 
 #ifdef __INT64_TYPE__
-# define INT64_MAX           INT64_C( 9223372036854775807)
-# define INT64_MIN         (-INT64_C( 9223372036854775807)-1)
-# define UINT64_MAX         UINT64_C(18446744073709551615)
-# define __INT_LEAST64_MIN   INT64_MIN
-# define __INT_LEAST64_MAX   INT64_MAX
-# define __UINT_LEAST64_MAX UINT64_MAX
+#define INT64_MAX INT64_C(9223372036854775807)
+#define INT64_MIN (-INT64_C(9223372036854775807) - 1)
+#define UINT64_MAX UINT64_C(18446744073709551615)
+#define __INT_LEAST64_MIN INT64_MIN
+#define __INT_LEAST64_MAX INT64_MAX
+#define __UINT_LEAST64_MAX UINT64_MAX
 #endif /* __INT64_TYPE__ */
 
 #ifdef __INT_LEAST64_MIN
-# define INT_LEAST64_MIN   __INT_LEAST64_MIN
-# define INT_LEAST64_MAX   __INT_LEAST64_MAX
-# define UINT_LEAST64_MAX __UINT_LEAST64_MAX
-# define INT_FAST64_MIN    __INT_LEAST64_MIN
-# define INT_FAST64_MAX    __INT_LEAST64_MAX
-# define UINT_FAST64_MAX  __UINT_LEAST64_MAX
+#define INT_LEAST64_MIN __INT_LEAST64_MIN
+#define INT_LEAST64_MAX __INT_LEAST64_MAX
+#define UINT_LEAST64_MAX __UINT_LEAST64_MAX
+#define INT_FAST64_MIN __INT_LEAST64_MIN
+#define INT_FAST64_MAX __INT_LEAST64_MAX
+#define UINT_FAST64_MAX __UINT_LEAST64_MAX
 #endif /* __INT_LEAST64_MIN */
 
 #ifdef __INT32_TYPE__
-# define INT32_MAX           INT32_C(2147483647)
-# define INT32_MIN         (-INT32_C(2147483647)-1)
-# define UINT32_MAX         UINT32_C(4294967295)
-# define __INT_LEAST32_MIN   INT32_MIN
-# define __INT_LEAST32_MAX   INT32_MAX
-# define __UINT_LEAST32_MAX UINT32_MAX
+#define INT32_MAX INT32_C(2147483647)
+#define INT32_MIN (-INT32_C(2147483647) - 1)
+#define UINT32_MAX UINT32_C(4294967295)
+#define __INT_LEAST32_MIN INT32_MIN
+#define __INT_LEAST32_MAX INT32_MAX
+#define __UINT_LEAST32_MAX UINT32_MAX
 #endif /* __INT32_TYPE__ */
 
 #ifdef __INT_LEAST32_MIN
-# define INT_LEAST32_MIN   __INT_LEAST32_MIN
-# define INT_LEAST32_MAX   __INT_LEAST32_MAX
-# define UINT_LEAST32_MAX __UINT_LEAST32_MAX
-# define INT_FAST32_MIN    __INT_LEAST32_MIN
-# define INT_FAST32_MAX    __INT_LEAST32_MAX
-# define UINT_FAST32_MAX  __UINT_LEAST32_MAX
+#define INT_LEAST32_MIN __INT_LEAST32_MIN
+#define INT_LEAST32_MAX __INT_LEAST32_MAX
+#define UINT_LEAST32_MAX __UINT_LEAST32_MAX
+#define INT_FAST32_MIN __INT_LEAST32_MIN
+#define INT_FAST32_MAX __INT_LEAST32_MAX
+#define UINT_FAST32_MAX __UINT_LEAST32_MAX
 #endif /* __INT_LEAST32_MIN */
 
 #ifdef __INT16_TYPE__
-#define INT16_MAX            INT16_C(32767)
-#define INT16_MIN          (-INT16_C(32767)-1)
-#define UINT16_MAX          UINT16_C(65535)
-# define __INT_LEAST16_MIN   INT16_MIN
-# define __INT_LEAST16_MAX   INT16_MAX
-# define __UINT_LEAST16_MAX UINT16_MAX
+#define INT16_MAX INT16_C(32767)
+#define INT16_MIN (-INT16_C(32767) - 1)
+#define UINT16_MAX UINT16_C(65535)
+#define __INT_LEAST16_MIN INT16_MIN
+#define __INT_LEAST16_MAX INT16_MAX
+#define __UINT_LEAST16_MAX UINT16_MAX
 #endif /* __INT16_TYPE__ */
 
 #ifdef __INT_LEAST16_MIN
-# define INT_LEAST16_MIN   __INT_LEAST16_MIN
-# define INT_LEAST16_MAX   __INT_LEAST16_MAX
-# define UINT_LEAST16_MAX __UINT_LEAST16_MAX
-# define INT_FAST16_MIN    __INT_LEAST16_MIN
-# define INT_FAST16_MAX    __INT_LEAST16_MAX
-# define UINT_FAST16_MAX  __UINT_LEAST16_MAX
+#define INT_LEAST16_MIN __INT_LEAST16_MIN
+#define INT_LEAST16_MAX __INT_LEAST16_MAX
+#define UINT_LEAST16_MAX __UINT_LEAST16_MAX
+#define INT_FAST16_MIN __INT_LEAST16_MIN
+#define INT_FAST16_MAX __INT_LEAST16_MAX
+#define UINT_FAST16_MAX __UINT_LEAST16_MAX
 #endif /* __INT_LEAST16_MIN */
 
 
 #ifdef __INT8_TYPE__
-# define INT8_MAX            INT8_C(127)
-# define INT8_MIN          (-INT8_C(127)-1)
-# define UINT8_MAX          UINT8_C(255)
-# define __INT_LEAST8_MIN    INT8_MIN
-# define __INT_LEAST8_MAX    INT8_MAX
-# define __UINT_LEAST8_MAX  UINT8_MAX
+#define INT8_MAX INT8_C(127)
+#define INT8_MIN (-INT8_C(127) - 1)
+#define UINT8_MAX UINT8_C(255)
+#define __INT_LEAST8_MIN INT8_MIN
+#define __INT_LEAST8_MAX INT8_MAX
+#define __UINT_LEAST8_MAX UINT8_MAX
 #endif /* __INT8_TYPE__ */
 
 #ifdef __INT_LEAST8_MIN
-# define INT_LEAST8_MIN   __INT_LEAST8_MIN
-# define INT_LEAST8_MAX   __INT_LEAST8_MAX
-# define UINT_LEAST8_MAX __UINT_LEAST8_MAX
-# define INT_FAST8_MIN    __INT_LEAST8_MIN
-# define INT_FAST8_MAX    __INT_LEAST8_MAX
-# define UINT_FAST8_MAX  __UINT_LEAST8_MAX
+#define INT_LEAST8_MIN __INT_LEAST8_MIN
+#define INT_LEAST8_MAX __INT_LEAST8_MAX
+#define UINT_LEAST8_MAX __UINT_LEAST8_MAX
+#define INT_FAST8_MIN __INT_LEAST8_MIN
+#define INT_FAST8_MAX __INT_LEAST8_MAX
+#define UINT_FAST8_MAX __UINT_LEAST8_MAX
 #endif /* __INT_LEAST8_MIN */
 
 /* Some utility macros */
-#define  __INTN_MIN(n)  __stdint_join3( INT, n, _MIN)
-#define  __INTN_MAX(n)  __stdint_join3( INT, n, _MAX)
-#define __UINTN_MAX(n)  __stdint_join3(UINT, n, _MAX)
-#define  __INTN_C(n, v) __stdint_join3( INT, n, _C(v))
+#define __INTN_MIN(n) __stdint_join3(INT, n, _MIN)
+#define __INTN_MAX(n) __stdint_join3(INT, n, _MAX)
+#define __UINTN_MAX(n) __stdint_join3(UINT, n, _MAX)
+#define __INTN_C(n, v) __stdint_join3(INT, n, _C(v))
 #define __UINTN_C(n, v) __stdint_join3(UINT, n, _C(v))
 
 /* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
 /* C99 7.18.3 Limits of other integer types. */
 
-#define  INTPTR_MIN  (-__INTPTR_MAX__-1)
-#define  INTPTR_MAX    __INTPTR_MAX__
-#define UINTPTR_MAX   __UINTPTR_MAX__
-#define PTRDIFF_MIN (-__PTRDIFF_MAX__-1)
-#define PTRDIFF_MAX   __PTRDIFF_MAX__
-#define    SIZE_MAX      __SIZE_MAX__
+#define INTPTR_MIN (-__INTPTR_MAX__ - 1)
+#define INTPTR_MAX __INTPTR_MAX__
+#define UINTPTR_MAX __UINTPTR_MAX__
+#define PTRDIFF_MIN (-__PTRDIFF_MAX__ - 1)
+#define PTRDIFF_MAX __PTRDIFF_MAX__
+#define SIZE_MAX __SIZE_MAX__
 
 /* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
  * is enabled. */
 #if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
-#define   RSIZE_MAX            (SIZE_MAX >> 1)
+#define RSIZE_MAX (SIZE_MAX >> 1)
 #endif
 
 /* C99 7.18.2.5 Limits of greatest-width integer types. */
-#define  INTMAX_MIN (-__INTMAX_MAX__-1)
-#define  INTMAX_MAX   __INTMAX_MAX__
-#define UINTMAX_MAX  __UINTMAX_MAX__
+#define INTMAX_MIN (-__INTMAX_MAX__ - 1)
+#define INTMAX_MAX __INTMAX_MAX__
+#define UINTMAX_MAX __UINTMAX_MAX__
 
 /* C99 7.18.3 Limits of other integer types. */
 #define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
 #define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
 #ifdef __WINT_UNSIGNED__
-# define WINT_MIN       __UINTN_C(__WINT_WIDTH__, 0)
-# define WINT_MAX       __UINTN_MAX(__WINT_WIDTH__)
+#define WINT_MIN __UINTN_C(__WINT_WIDTH__, 0)
+#define WINT_MAX __UINTN_MAX(__WINT_WIDTH__)
 #else
-# define WINT_MIN       __INTN_MIN(__WINT_WIDTH__)
-# define WINT_MAX       __INTN_MAX(__WINT_WIDTH__)
+#define WINT_MIN __INTN_MIN(__WINT_WIDTH__)
+#define WINT_MAX __INTN_MAX(__WINT_WIDTH__)
 #endif
 
 #ifndef WCHAR_MAX
-# define WCHAR_MAX __WCHAR_MAX__
+#define WCHAR_MAX __WCHAR_MAX__
 #endif
 #ifndef WCHAR_MIN
-# if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__)
-#  define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__)
-# else
-#  define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0)
-# endif
+#if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__)
+#define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__)
+#else
+#define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0)
+#endif
 #endif
 
 /* 7.18.4.2 Macros for greatest-width integer constants. */
-#define  INTMAX_C(v) __int_c(v,  __INTMAX_C_SUFFIX__)
+#define INTMAX_C(v) __int_c(v, __INTMAX_C_SUFFIX__)
 #define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__)
 
 
 #endif // SLANG_LLVM_H
-
-
diff --git a/prelude/slang-torch-prelude.h b/prelude/slang-torch-prelude.h
index 11ffe3b66..d303c1045 100644
--- a/prelude/slang-torch-prelude.h
+++ b/prelude/slang-torch-prelude.h
@@ -1,64 +1,67 @@
 // Prelude for PyTorch cpp binding.
 
-#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAUtils.h>
-#include <vector>
 #include <stdexcept>
 #include <string>
+#include <torch/extension.h>
+#include <vector>
 
 #ifdef SLANG_LLVM
 #include "slang-llvm.h"
 #else // SLANG_LLVM
-#   if SLANG_GCC_FAMILY && __GNUC__ < 6
-#       include <cmath>
-#       define SLANG_PRELUDE_STD std::
-#   else
-#       include <math.h>
-#       define SLANG_PRELUDE_STD
-#   endif
-
-#   include <assert.h>
-#   include <stdlib.h>
-#   include <string.h>
-#   include <stdint.h>
+#if SLANG_GCC_FAMILY && __GNUC__ < 6
+#include <cmath>
+#define SLANG_PRELUDE_STD std::
+#else
+#include <math.h>
+#define SLANG_PRELUDE_STD
+#endif
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
 #endif // SLANG_LLVM
 
 #include "../source/core/slang-string.h"
 
 #if defined(_MSC_VER)
-#   define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport)
+#define SLANG_PRELUDE_SHARED_LIB_EXPORT __declspec(dllexport)
 #else
-#   define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default")))
-//#   define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport)) __attribute__((__visibility__("default")))
-#endif    
-
-#ifdef __cplusplus    
-#   define SLANG_PRELUDE_EXTERN_C extern "C"
-#   define SLANG_PRELUDE_EXTERN_C_START extern "C" {
-#   define SLANG_PRELUDE_EXTERN_C_END }
+#define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__((__visibility__("default")))
+// #   define SLANG_PRELUDE_SHARED_LIB_EXPORT __attribute__ ((dllexport))
+// __attribute__((__visibility__("default")))
+#endif
+
+#ifdef __cplusplus
+#define SLANG_PRELUDE_EXTERN_C extern "C"
+#define SLANG_PRELUDE_EXTERN_C_START \
+    extern "C"                       \
+    {
+#define SLANG_PRELUDE_EXTERN_C_END }
 #else
-#   define SLANG_PRELUDE_EXTERN_C 
-#   define SLANG_PRELUDE_EXTERN_C_START
-#   define SLANG_PRELUDE_EXTERN_C_END 
-#endif    
+#define SLANG_PRELUDE_EXTERN_C
+#define SLANG_PRELUDE_EXTERN_C_START
+#define SLANG_PRELUDE_EXTERN_C_END
+#endif
 
 #define SLANG_PRELUDE_NAMESPACE
 
 #ifndef SLANG_NO_THROW
-#   define SLANG_NO_THROW
+#define SLANG_NO_THROW
 #endif
 #ifndef SLANG_STDCALL
-#   define SLANG_STDCALL
+#define SLANG_STDCALL
 #endif
 #ifndef SLANG_MCALL
-#   define SLANG_MCALL SLANG_STDCALL
+#define SLANG_MCALL SLANG_STDCALL
 #endif
 #ifndef SLANG_FORCE_INLINE
-#    define SLANG_FORCE_INLINE inline
+#define SLANG_FORCE_INLINE inline
 #endif
-#include "slang-cpp-types-core.h"
 #include "slang-cpp-scalar-intrinsics.h"
+#include "slang-cpp-types-core.h"
 
 
 static const int kSlangTorchTensorMaxDim = 5;
@@ -72,20 +75,26 @@ struct TensorView
 };
 
 
-TensorView make_tensor_view(torch::Tensor val, const char* name, torch::ScalarType targetScalarType, bool requireContiguous)
+TensorView make_tensor_view(
+    torch::Tensor val,
+    const char* name,
+    torch::ScalarType targetScalarType,
+    bool requireContiguous)
 {
     // We're currently not trying to implicitly cast or transfer to device for two reasons:
     // 1. There appears to be a bug with .to() where successive calls after the first one fail.
-    // 2. Silent casts like this can cause large memory allocations & unexpected overheads. 
+    // 2. Silent casts like this can cause large memory allocations & unexpected overheads.
     //    It's better to be explicit.
 
     // Expect tensors to be on CUDA device
     if (!val.device().is_cuda())
-        throw std::runtime_error(std::string(name).append(": tensor is not on CUDA device.").c_str());
+        throw std::runtime_error(
+            std::string(name).append(": tensor is not on CUDA device.").c_str());
 
     // Expect tensors to be the right type.
     if (val.dtype() != targetScalarType)
-        throw std::runtime_error(std::string(name).append(": tensor is not of the expected type.").c_str());
+        throw std::runtime_error(
+            std::string(name).append(": tensor is not of the expected type.").c_str());
 
     // Check that the tensor is contiguous
     if (requireContiguous && !val.is_contiguous())
@@ -138,14 +147,22 @@ TensorView make_tensor_view(torch::Tensor val, const char* name, torch::ScalarTy
     }
 
     if (val.dim() > kSlangTorchTensorMaxDim)
-        throw std::runtime_error(std::string(name).append(": number of dimensions exceeds limit (").append(std::to_string(kSlangTorchTensorMaxDim)).append(")").c_str());
+        throw std::runtime_error(std::string(name)
+                                     .append(": number of dimensions exceeds limit (")
+                                     .append(std::to_string(kSlangTorchTensorMaxDim))
+                                     .append(")")
+                                     .c_str());
 
     bool isEmpty = true;
     for (int i = 0; i < val.dim(); ++i)
     {
         res.strides[i] = val.stride(i) * elementSize;
         if (res.strides[i] == 0)
-            throw std::runtime_error(std::string(name).append(": tensors with broadcasted dimensions are not supported (use tensor.contiguous() to make tensor whole)").c_str());
+            throw std::runtime_error(
+                std::string(name)
+                    .append(": tensors with broadcasted dimensions are not supported (use "
+                            "tensor.contiguous() to make tensor whole)")
+                    .c_str());
 
         res.sizes[i] = val.size(i);
         if (res.sizes[i] > 0)
author	Ellie Hermaszewska <ellieh@nvidia.com>	2024-10-29 14:49:26 +0800
committer	GitHub <noreply@github.com>	2024-10-29 14:49:26 +0800
commit	f65d756bff8d4c5cbc15bd0322a2ae8e6b896a21 (patch)
tree	ea1d61342cd29368e19135000ec2948813096205 /prelude
parent	a729c15e9dce9f5116a38afc66329ab2ca4cea54 (diff)