diff options
Diffstat (limited to 'ComputeShaders')
57 files changed, 4086 insertions, 0 deletions
diff --git a/ComputeShaders/ComputeShaders.cpp b/ComputeShaders/ComputeShaders.cpp new file mode 100644 index 0000000..9c03f27 --- /dev/null +++ b/ComputeShaders/ComputeShaders.cpp @@ -0,0 +1,3 @@ +void fnComputeShaders() +{ +}
\ No newline at end of file diff --git a/ComputeShaders/ComputeShaders.vcxproj b/ComputeShaders/ComputeShaders.vcxproj new file mode 100644 index 0000000..350d266 --- /dev/null +++ b/ComputeShaders/ComputeShaders.vcxproj @@ -0,0 +1,221 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <VCProjectVersion>16.0</VCProjectVersion> + <Keyword>Win32Proj</Keyword> + <ProjectGuid>{1c39d386-96d0-47a1-bbfa-68bbdb24439c}</ProjectGuid> + <RootNamespace>ComputeShaders</RootNamespace> + <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>StaticLibrary</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v143</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>StaticLibrary</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v143</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>StaticLibrary</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v143</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>StaticLibrary</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v143</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="Shared"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <MultiProcFXC>true</MultiProcFXC> + <OutDir>$(Platform)\$(Configuration)\</OutDir> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <OutDir>$(Platform)\$(Configuration)\</OutDir> + <MultiProcFXC>true</MultiProcFXC> + </PropertyGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + </ClCompile> + <Link> + <SubSystem> + </SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + </ClCompile> + <Link> + <SubSystem> + </SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + </ClCompile> + <Link> + <SubSystem> + </SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + <FxCompile> + <ShaderModel>5.0</ShaderModel> + <ShaderType>Compute</ShaderType> + </FxCompile> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + </ClCompile> + <Link> + <SubSystem> + </SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + <FxCompile> + <ShaderModel>5.0</ShaderModel> + <ShaderType>Compute</ShaderType> + <DisableOptimizations>true</DisableOptimizations> + <EnableDebuggingInformation>true</EnableDebuggingInformation> + </FxCompile> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="ComputeShaders.cpp" /> + </ItemGroup> + <ItemGroup> + <FxCompile Include="add.hlsl" /> + <FxCompile Include="addInPlace.hlsl" /> + <FxCompile Include="addRepeat.hlsl" /> + <FxCompile Include="addRepeat64.hlsl" /> + <FxCompile Include="addRepeatGelu.hlsl" /> + <FxCompile Include="addRepeatGelu64.hlsl" /> + <FxCompile Include="addRepeatScale.hlsl" /> + <FxCompile Include="addRows.hlsl" /> + <FxCompile Include="convolutionMain.hlsl" /> + <FxCompile Include="convolutionMain2.hlsl" /> + <FxCompile Include="convolutionMain2Fixed.hlsl" /> + <FxCompile Include="convolutionPrep1.hlsl" /> + <FxCompile Include="convolutionPrep2.hlsl" /> + <FxCompile Include="copyConvert.hlsl" /> + <FxCompile Include="copyTranspose.hlsl" /> + <FxCompile Include="diagMaskInf.hlsl" /> + <FxCompile Include="flashAttention.hlsl" /> + <FxCompile Include="flashAttentionCompat1.hlsl" /> + <FxCompile Include="flashAttentionCompat2.hlsl" /> + <FxCompile Include="flashAttentionCompat3.hlsl" /> + <FxCompile Include="fmaRepeat1.hlsl" /> + <FxCompile Include="fmaRepeat164.hlsl" /> + <FxCompile Include="fmaRepeat2.hlsl" /> + <FxCompile Include="matReshapePanels.hlsl" /> + <FxCompile Include="mulMatByRow.hlsl" /> + <FxCompile Include="mulMatByRow64.hlsl" /> + <FxCompile Include="mulMatByRowTiled.hlsl" /> + <FxCompile Include="mulMatByRowTiled64.hlsl" /> + <FxCompile Include="mulMatByRowTiledEx.hlsl" /> + <FxCompile Include="mulMatByScalar.hlsl" /> + <FxCompile Include="mulMatDotMain.hlsl" /> + <FxCompile Include="mulMatDotReshape.hlsl" /> + <FxCompile Include="mulMatMadMain.hlsl" /> + <FxCompile Include="mulMatTiled.hlsl" /> + <FxCompile Include="mulMatTiled64.hlsl" /> + <FxCompile Include="mulMatTiledEx.hlsl" /> + <FxCompile Include="norm.hlsl" /> + <FxCompile Include="normCompat.hlsl" /> + <FxCompile Include="normFixed.hlsl" /> + <FxCompile Include="normFixed64.hlsl" /> + <FxCompile Include="scaleInPlace.hlsl" /> + <FxCompile Include="softMax.hlsl" /> + <FxCompile Include="softMax64.hlsl" /> + <FxCompile Include="softMaxCompat.hlsl" /> + <FxCompile Include="softMaxFixed.hlsl" /> + <FxCompile Include="zeroMemory.hlsl" /> + </ItemGroup> + <ItemGroup> + <None Include="componentwiseBinaryOp.hlsli" /> + <None Include="flashAttentionCommon.hlsli" /> + <None Include="fp64Utils.hlsli" /> + <None Include="groupReduce.hlsli" /> + <None Include="groupReduce64.hlsli" /> + <None Include="miscUtils.hlsli" /> + <None Include="repeatUtils.hlsli" /> + </ItemGroup> + <ItemGroup> + <Text Include="Readme.txt" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/ComputeShaders/ComputeShaders.vcxproj.filters b/ComputeShaders/ComputeShaders.vcxproj.filters new file mode 100644 index 0000000..12f1559 --- /dev/null +++ b/ComputeShaders/ComputeShaders.vcxproj.filters @@ -0,0 +1,66 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <ClCompile Include="ComputeShaders.cpp" /> + </ItemGroup> + <ItemGroup> + <FxCompile Include="mulMatDotMain.hlsl" /> + <FxCompile Include="mulMatDotReshape.hlsl" /> + <FxCompile Include="convolutionMain.hlsl" /> + <FxCompile Include="convolutionPrep1.hlsl" /> + <FxCompile Include="convolutionPrep2.hlsl" /> + <FxCompile Include="add.hlsl" /> + <FxCompile Include="flashAttention.hlsl" /> + <FxCompile Include="convolutionMain2.hlsl" /> + <FxCompile Include="norm.hlsl" /> + <FxCompile Include="copyConvert.hlsl" /> + <FxCompile Include="copyTranspose.hlsl" /> + <FxCompile Include="normCompat.hlsl" /> + <FxCompile Include="flashAttentionCompat1.hlsl" /> + <FxCompile Include="flashAttentionCompat3.hlsl" /> + <FxCompile Include="flashAttentionCompat2.hlsl" /> + <FxCompile Include="scaleInPlace.hlsl" /> + <FxCompile Include="diagMaskInf.hlsl" /> + <FxCompile Include="softMaxCompat.hlsl" /> + <FxCompile Include="mulMatMadMain.hlsl" /> + <FxCompile Include="addRepeat.hlsl" /> + <FxCompile Include="fmaRepeat1.hlsl" /> + <FxCompile Include="fmaRepeat2.hlsl" /> + <FxCompile Include="addInPlace.hlsl" /> + <FxCompile Include="softMax.hlsl" /> + <FxCompile Include="addRepeatScale.hlsl" /> + <FxCompile Include="mulMatByRow.hlsl" /> + <FxCompile Include="mulMatByScalar.hlsl" /> + <FxCompile Include="mulMatTiled.hlsl" /> + <FxCompile Include="mulMatByRow64.hlsl" /> + <FxCompile Include="softMax64.hlsl" /> + <FxCompile Include="softMaxFixed.hlsl" /> + <FxCompile Include="addRepeat64.hlsl" /> + <FxCompile Include="fmaRepeat164.hlsl" /> + <FxCompile Include="addRepeatGelu.hlsl" /> + <FxCompile Include="addRepeatGelu64.hlsl" /> + <FxCompile Include="normFixed.hlsl" /> + <FxCompile Include="normFixed64.hlsl" /> + <FxCompile Include="mulMatByRowTiled.hlsl" /> + <FxCompile Include="convolutionMain2Fixed.hlsl" /> + <FxCompile Include="mulMatByRowTiled64.hlsl" /> + <FxCompile Include="addRows.hlsl" /> + <FxCompile Include="mulMatTiled64.hlsl" /> + <FxCompile Include="zeroMemory.hlsl" /> + <FxCompile Include="mulMatTiledEx.hlsl" /> + <FxCompile Include="matReshapePanels.hlsl" /> + <FxCompile Include="mulMatByRowTiledEx.hlsl" /> + </ItemGroup> + <ItemGroup> + <None Include="componentwiseBinaryOp.hlsli" /> + <None Include="miscUtils.hlsli" /> + <None Include="groupReduce.hlsli" /> + <None Include="fp64Utils.hlsli" /> + <None Include="flashAttentionCommon.hlsli" /> + <None Include="repeatUtils.hlsli" /> + <None Include="groupReduce64.hlsli" /> + </ItemGroup> + <ItemGroup> + <Text Include="Readme.txt" /> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/ComputeShaders/Readme.txt b/ComputeShaders/Readme.txt new file mode 100644 index 0000000..18d089e --- /dev/null +++ b/ComputeShaders/Readme.txt @@ -0,0 +1,11 @@ +This project compiles all the compute shaders which implement the model. + +Many shaders come in 2 versions, something.hlsl and something64.hlsl + +The version with the `64` suffix is used on AMD GPUs, the version without suffix is used on nVidia and Intel GPUs. + +Not all of these shaders are actually used for anything. +Some of them are implementing binary compatibility for the reference CPU version, and not used unless messing with the `constexpr` flags in MlContext C++ class. +Such shaders often require FP64 support, which is an optional feature in D3D11. +CompressShaders tool detects such shaders by looking at the SFI0 chunk in the binary, and outputs a bitmap of the FP64 shaders. +This way, missing FP64 hardware support shouldn’t break the library.
\ No newline at end of file diff --git a/ComputeShaders/add.hlsl b/ComputeShaders/add.hlsl new file mode 100644 index 0000000..819443e --- /dev/null +++ b/ComputeShaders/add.hlsl @@ -0,0 +1,6 @@ +inline float compute( float a, float b ) +{ + return a + b; +} + +#include "componentwiseBinaryOp.hlsli"
\ No newline at end of file diff --git a/ComputeShaders/addInPlace.hlsl b/ComputeShaders/addInPlace.hlsl new file mode 100644 index 0000000..a83c221 --- /dev/null +++ b/ComputeShaders/addInPlace.hlsl @@ -0,0 +1,39 @@ +#ifndef THREADS +#define THREADS 512 +#endif + +Buffer<float> arg0: register( t0 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 size: packoffset( c0 ); + uint4 strides: packoffset( c1 ); + uint4 argStrides: packoffset( c3 ); +} + +inline uint rowOffset( uint3 idx, uint4 strides ) +{ + return idx[ 0 ] * strides[ 1 ] + idx[ 1 ] * strides[ 2 ] + idx[ 2 ] * strides[ 3 ]; +} + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + uint rdi = rowOffset( group, strides ); + uint rsi = rowOffset( group, argStrides ); + + const uint rdiEnd = rdi + size[ 0 ] * strides[ 0 ]; + rdi += thread * strides[ 0 ]; + rsi += thread * argStrides[ 0 ]; + + const uint rdiInc = THREADS * strides[ 0 ]; + const uint rsiInc = THREADS * argStrides[ 0 ]; + + for( ; rdi < rdiEnd; rdi += rdiInc, rsi += rsiInc ) + { + float f = result[ rdi ]; + f += arg0[ rsi ]; + result[ rdi ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/addRepeat.hlsl b/ComputeShaders/addRepeat.hlsl new file mode 100644 index 0000000..e5cdaa3 --- /dev/null +++ b/ComputeShaders/addRepeat.hlsl @@ -0,0 +1,70 @@ +// Compute tensor = tensor + repeat( pattern, tensor ) in 1 shot, without VRAM allocations +// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor +RWBuffer<float> tensor: register( u0 ); +Buffer<float> pattern: register( t0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 tensorSize: packoffset( c0 ); + uint4 tensorStrides: packoffset( c1 ); + uint4 patternSize: packoffset( c2 ); + uint4 patternStrides: packoffset( c3 ); +} + +#ifndef THREADS +#define THREADS 256 +#endif + +#include "repeatUtils.hlsli" + +inline void computeSimple( uint idx, float add ) +{ + float f = tensor[ idx ]; + f += add; + tensor[ idx ] = f; +} + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides ); + uint rsi = rowOffset( group % patternSize.yzw, patternStrides ); + + if( patternSize[ 0 ] == 1 ) + { + // The pattern only has 1 column - broadcasting over the row + const float p = pattern[ rsi ]; + ROW_LOOP( it ) + computeSimple( it.x, p ); + } + else if( patternSize[ 0 ] <= THREADS ) + { + // pattern size doesn't exceed thread group size: load pattern value outside of the loop + const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] ); + if( thread >= threadsPerGroup ) + return; + + const float p = pattern[ rsi + ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ] ]; + ROW_LOOP_EX( it, threadsPerGroup, tensorStrides ) + computeSimple( it.x, p ); + } + else + { + // Pattern rows are larger than the thread group, need to stream from both buffers + const uint rsiInc = THREADS * patternStrides[ 0 ]; + const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ]; + const uint rsiEnd = rsi + rsiDec; + rsi += thread * patternStrides[ 0 ]; + + ROW_LOOP( it ) + { + float f = tensor[ it.x ]; + float p = pattern[ rsi ]; + rsi += rsiInc; + if( rsi >= rsiEnd ) + rsi -= rsiDec; + f += p; + tensor[ it.x ] = f; + } + } +}
\ No newline at end of file diff --git a/ComputeShaders/addRepeat64.hlsl b/ComputeShaders/addRepeat64.hlsl new file mode 100644 index 0000000..b6c8c19 --- /dev/null +++ b/ComputeShaders/addRepeat64.hlsl @@ -0,0 +1,2 @@ +#define THREADS 64 +#include "addRepeat.hlsl"
\ No newline at end of file diff --git a/ComputeShaders/addRepeatGelu.hlsl b/ComputeShaders/addRepeatGelu.hlsl new file mode 100644 index 0000000..7f63653 --- /dev/null +++ b/ComputeShaders/addRepeatGelu.hlsl @@ -0,0 +1,88 @@ +// Compute tensor = GELU( tensor + repeat( pattern, tensor ) ) in 1 shot, without VRAM allocations +// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor +RWBuffer<float> tensor: register( u0 ); +Buffer<float> pattern: register( t0 ); +Buffer<uint> lookupTable: register( t1 ); + +cbuffer Constants: register( b0 ) +{ + uint4 tensorSize: packoffset( c0 ); + uint4 tensorStrides: packoffset( c1 ); + uint4 patternSize: packoffset( c2 ); + uint4 patternStrides: packoffset( c3 ); +} + +#ifndef THREADS +#define THREADS 1024 +#endif + +#include "repeatUtils.hlsli" +#include "miscUtils.hlsli" + +inline float gelu( float x ) +{ +#if 1 + const uint index = fp16Rounded( x ); + const uint res16 = lookupTable[ index ]; + return f16tof32( res16 ); +#else + // This version is much slower, at least on AMD, despite saving these VRAM loads. + const float GELU_COEF_A = 0.044715; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876; + return 0.5 * x * ( 1.0 + tanh( SQRT_2_OVER_PI * x * ( 1.0 + GELU_COEF_A * x * x ) ) ); +#endif +} + +inline void computeSimple( uint idx, float add ) +{ + float f = tensor[ idx ]; + f += add; + f = gelu( f ); + tensor[ idx ] = f; +} + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides ); + uint rsi = rowOffset( group % patternSize.yzw, patternStrides ); + + if( patternSize[ 0 ] == 1 ) + { + // The pattern only has 1 column - broadcasting over the row + const float p = pattern[ rsi ]; + ROW_LOOP( it ) + computeSimple( it.x, p ); + } + else if( patternSize[ 0 ] <= THREADS ) + { + // pattern size doesn't exceed thread group size: load pattern value outside of the loop + const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] ); + if( thread >= threadsPerGroup ) + return; + + const float p = pattern[ rsi + ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ] ]; + ROW_LOOP_EX( it, threadsPerGroup, tensorStrides ) + computeSimple( it.x, p ); + } + else + { + // Pattern rows are larger than the thread group, need to stream from both buffers + const uint rsiInc = THREADS * patternStrides[ 0 ]; + const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ]; + const uint rsiEnd = rsi + rsiDec; + rsi += thread * patternStrides[ 0 ]; + + ROW_LOOP( it ) + { + float f = tensor[ it.x ]; + float p = pattern[ rsi ]; + rsi += rsiInc; + if( rsi >= rsiEnd ) + rsi -= rsiDec; + f += p; + f = gelu( f ); + tensor[ it.x ] = f; + } + } +}
\ No newline at end of file diff --git a/ComputeShaders/addRepeatGelu64.hlsl b/ComputeShaders/addRepeatGelu64.hlsl new file mode 100644 index 0000000..3d9b2e8 --- /dev/null +++ b/ComputeShaders/addRepeatGelu64.hlsl @@ -0,0 +1,2 @@ +#define THREADS 64 +#include "addRepeatGelu.hlsl"
\ No newline at end of file diff --git a/ComputeShaders/addRepeatScale.hlsl b/ComputeShaders/addRepeatScale.hlsl new file mode 100644 index 0000000..8c24088 --- /dev/null +++ b/ComputeShaders/addRepeatScale.hlsl @@ -0,0 +1,73 @@ +// Compute tensor = ( tensor + repeat( pattern, tensor ) ) * scale in 1 shot, without VRAM allocations +// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor +RWBuffer<float> tensor: register( u0 ); +Buffer<float> pattern: register( t0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 tensorSize: packoffset( c0 ); + uint4 tensorStrides: packoffset( c1 ); + uint4 patternSize: packoffset( c2 ); + uint4 patternStrides: packoffset( c3 ); + float scalingMul : packoffset( c4.x ); +} + +#ifndef THREADS +#define THREADS 512 +#endif + +#include "repeatUtils.hlsli" + +inline void computeSimple( uint idx, float add ) +{ + float f = tensor[ idx ]; + f += add; + f *= scalingMul; + tensor[ idx ] = f; +} + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides ); + uint rsi = rowOffset( group % patternSize.yzw, patternStrides ); + + if( patternSize[ 0 ] == 1 ) + { + // The pattern only has 1 column - broadcasting over the row + const float p = pattern[ rsi ]; + ROW_LOOP( it ) + computeSimple( it.x, p ); + } + else if( patternSize[ 0 ] <= THREADS ) + { + // pattern size doesn't exceed thread group size: load pattern value outside of the loop + const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] ); + if( thread >= threadsPerGroup ) + return; + + const float p = pattern[ rsi + ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ] ]; + ROW_LOOP_EX( it, threadsPerGroup, tensorStrides ) + computeSimple( it.x, p ); + } + else + { + // Pattern rows are larger than the thread group, need to stream from both buffers + const uint rsiInc = THREADS * patternStrides[ 0 ]; + const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ]; + const uint rsiEnd = rsi + rsiDec; + rsi += thread * patternStrides[ 0 ]; + + ROW_LOOP( it ) + { + float f = tensor[ it.x ]; + float p = pattern[ rsi ]; + rsi += rsiInc; + if( rsi >= rsiEnd ) + rsi -= rsiDec; + f += p; + f *= scalingMul; + tensor[ it.x ] = f; + } + } +}
\ No newline at end of file diff --git a/ComputeShaders/addRows.hlsl b/ComputeShaders/addRows.hlsl new file mode 100644 index 0000000..21e5e73 --- /dev/null +++ b/ComputeShaders/addRows.hlsl @@ -0,0 +1,46 @@ +#ifndef THREADS +#define THREADS 256 +#endif + +// dec.tokenEmbedding tensor +Buffer<float> tokenEmbedding: register( t0 ); +// dec.positionalEmbedding tensor +Buffer<float> positionalEmbedding: register( t1 ); +// R32_UINT buffer with the input tokens +Buffer<uint> embd: register( t2 ); +// Output tensor +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint rowLength: packoffset( c0.x ); + uint pastTokensCount: packoffset( c0.y ); + uint outputRowStride: packoffset( c0.z ); + uint2 embStrides: packoffset( c1.x ); + uint2 posStrides: packoffset( c1.z ); +} + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint row = group.x; + const uint rowTok = embd[ row ]; + const uint rowPos = row + pastTokensCount; + + uint rdi = row * outputRowStride; + const uint rdiEnd = rdi + rowLength; + rdi += thread; + + uint rsiTok = rowTok * embStrides.y; + rsiTok += thread * embStrides.x; + + uint rsiPos = rowPos * posStrides.y; + rsiPos += thread * posStrides.x; + + for( ; rdi < rdiEnd; rdi += THREADS, rsiTok += THREADS * embStrides.x, rsiPos += THREADS * posStrides.x ) + { + float a = tokenEmbedding[ rsiTok ]; + float b = positionalEmbedding[ rsiPos ]; + result[ rdi ] = a + b; + } +}
\ No newline at end of file diff --git a/ComputeShaders/componentwiseBinaryOp.hlsli b/ComputeShaders/componentwiseBinaryOp.hlsli new file mode 100644 index 0000000..0a523ca --- /dev/null +++ b/ComputeShaders/componentwiseBinaryOp.hlsli @@ -0,0 +1,43 @@ +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + uint4 src1_elements: packoffset( c2 ); + uint4 src1_strides: packoffset( c3 ); + uint4 result_elements: packoffset( c4 ); + uint4 result_strides: packoffset( c5 ); +} + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint j = group.x; + const uint nb1 = result_strides[ 1 ]; + const uint nb01 = src0_strides[ 1 ]; + + const uint nb10 = src1_strides[ 0 ]; + const uint nb11 = src1_strides[ 1 ]; + const uint nc = src0_elements[ 0 ]; + + uint rsi0 = j * nb01; + uint rsi1 = j * nb11; + uint rdi = j * nb1; + const uint rsi0End = rsi0 + nc; + + rsi0 += thread; + rsi1 += thread * nb10; + rdi += thread; + + const uint rsi1Inc = 32 * nb10; + for( ; rsi0 < rsi0End; rsi0 += 32, rsi1 += rsi1Inc, rdi += 32 ) + { + const float a = arg0[ rsi0 ]; + const float b = arg1[ rsi1 ]; + const float res = compute( a, b ); + result[ rdi ] = res; + } +}
\ No newline at end of file diff --git a/ComputeShaders/convolutionMain.hlsl b/ComputeShaders/convolutionMain.hlsl new file mode 100644 index 0000000..eee3ebb --- /dev/null +++ b/ComputeShaders/convolutionMain.hlsl @@ -0,0 +1,76 @@ +// ggml_compute_forward_conv_1d_1s_f16_f32, GGML_TASK_COMPUTE implementation +// Dispatch [ ne10, ne02, 1 ] thread groups +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + uint4 src1_elements: packoffset( c2 ); + uint4 result_elements: packoffset( c4 ); + uint4 result_strides: packoffset( c5 ); +} + +#include "groupReduce.hlsli" + +inline void computeDotProduct( uint s0, uint s1, uint len, uint thread, inout float acc ) +{ + float curr = 0; + const uint completeVectors = len / 32; + uint i; + for( i = 0; i < completeVectors; i++, s0 += 32, s1 += 32 ) + curr = mad( arg0[ s0 + thread ], arg1[ s1 + thread ], curr ); + + horizontalSumCompatNew( thread, curr ); + + if( 0 == thread ) + { + const uint rem = len % 32; + if( 0 != rem ) + { + double f64 = curr; + for( i = 0; i < rem; i++ ) + { + precise float a = arg0[ s0 + i ]; + precise float b = arg1[ s1 + i ]; + precise float prod = a * b; + f64 += prod; + } + curr = (float)f64; + } + acc += curr; + } +} + +#include "miscUtils.hlsli" + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint i1 = group.y; + const uint i0 = group.x; + + const uint ne00 = src0_elements[ 0 ]; + const uint nk = ne00; + const int nh = (int)( nk / 2 ); + + const uint ne01 = src0_elements[ 1 ]; + const int ew0 = roundUp32( ne01 ); + + float res = 0; + for( int k = -nh; k <= nh; k++ ) + { + const uint source0 = i1 * ew0 * ne00 + uint( nh + k ) * ew0; + const uint source1 = uint( i0 + nh + k ) * ew0; + computeDotProduct( source0, source1, ew0, thread, res ); + } + + if( 0 != thread ) + return; + + const uint nb1 = result_strides[ 1 ]; + const uint rdi = i1 * nb1 + i0; + result[ rdi ] = res; +}
\ No newline at end of file diff --git a/ComputeShaders/convolutionMain2.hlsl b/ComputeShaders/convolutionMain2.hlsl new file mode 100644 index 0000000..73c9da1 --- /dev/null +++ b/ComputeShaders/convolutionMain2.hlsl @@ -0,0 +1,60 @@ +// ggml_compute_forward_conv_1d_2s_f16_f32, GGML_TASK_COMPUTE implementation +// Dispatch [ ne10 / 2, ne02, 1 ] thread groups +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + uint4 src1_elements: packoffset( c2 ); + uint4 result_elements: packoffset( c4 ); + uint4 result_strides: packoffset( c5 ); +} + +#include "groupReduce.hlsli" + +inline void computeDotProduct( uint s0, uint s1, uint len, uint thread, inout float acc ) +{ + float curr = 0; + const uint s0End = s0 + len; + s0 += thread; + s1 += thread; + for( ; s0 < s0End; s0 += 32, s1 += 32 ) + curr = mad( arg0[ s0 ], arg1[ s1 ], curr ); + + horizontalSumCompatNew( thread, curr ); + if( 0 == thread ) + acc += curr; +} + +#include "miscUtils.hlsli" + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint ne00 = src0_elements[ 0 ]; + const uint ne01 = src0_elements[ 1 ]; + const int ew0 = roundUp32( ne01 ); + + float res = 0; + uint s0 = group.y * ew0 * ne00; + uint s1 = group.x * 2 * ew0; + // The original implementation did following: + // int nh = (int)( nk / 2 ); + // for( int k = -nh; k <= nh; k++ ) + // What we doing instead: + // for( uint len = ( nk / 2 ) * 2 + 1, i = 0; i < len; i++ ) + // len = ( nk / 2 ) * 2 + 1 is equal to ( nk | 1 ) + const uint s0End = s0 + ( ne00 | 1u ) * ew0; + for( ; s0 < s0End; s0 += ew0, s1 += ew0 ) + computeDotProduct( s0, s1, ew0, thread, res ); + + if( 0 != thread ) + return; + + const uint nb1 = result_strides[ 1 ]; + const uint rdi = group.y * nb1 + group.x; + result[ rdi ] = res; +}
\ No newline at end of file diff --git a/ComputeShaders/convolutionMain2Fixed.hlsl b/ComputeShaders/convolutionMain2Fixed.hlsl new file mode 100644 index 0000000..d21dcbb --- /dev/null +++ b/ComputeShaders/convolutionMain2Fixed.hlsl @@ -0,0 +1,119 @@ +// Optimized version of convolutionMain2.hlsl for kernel size = 3 +// Dispatch [ ( ( ne10 / 2 ) + TILE_Y - 1 ) / TILE_Y, ne02, 1 ] thread groups of this shader +#ifndef TILE_Y +static const uint TILE_Y = 8; +#endif +#ifndef THREADS +static const uint THREADS = 64; +#endif + +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + uint4 src1_elements: packoffset( c2 ); + uint4 result_elements: packoffset( c4 ); + uint4 result_strides: packoffset( c5 ); +} + +// The accumulators we're after +groupshared float resTemp[ TILE_Y ][ THREADS ]; + +// Multiply + accumulate the specified row +inline void accumulate( float a0, float a1, const uint resultRow, const uint thread ) +{ + float acc = resTemp[ resultRow ][ thread ]; + acc = mad( a0, a1, acc ); + resTemp[ resultRow ][ thread ] = acc; +} + +inline void convolutionTile( const uint s0, uint s1, const uint thread, const uint stride, const uint height ) +{ + // Load 3 rows from arg0 + const float3 a0 = float3( arg0[ s0 ], arg0[ s0 + stride ], arg0[ s0 + stride * 2 ] ); + + // Row 0 + float a1 = arg1[ s1 ]; + accumulate( a0[ 0 ], a1, 0, thread ); + s1 += stride; + + for( uint i = 1; i < height; i++ ) + { + // Row i*2-1 + // Even-indexed rows only contribute to a single output rows, after muiltiplied by kernel row #1 + a1 = arg1[ s1 ]; + accumulate( a0[ 1 ], a1, i - 1, thread ); + s1 += stride; + + // Row i*2, contributes to 2 output rows corresponding to kernel rows #0 and #2 + a1 = arg1[ s1 ]; + accumulate( a0[ 2 ], a1, i - 1, thread ); + accumulate( a0[ 0 ], a1, i, thread ); + s1 += stride; + } + + // Row height*2 - 1 + a1 = arg1[ s1 ]; + accumulate( a0[ 1 ], a1, height - 1, thread ); + s1 += stride; + + // Row height*2 + a1 = arg1[ s1 ]; + accumulate( a0[ 2 ], a1, height - 1, thread ); +} + +#include "miscUtils.hlsli" + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + uint i; + // Zero out the accumulators + for( i = 0; i < TILE_Y; i++ ) + resTemp[ i ][ thread ] = 0.0; + GroupMemoryBarrierWithGroupSync(); + + const uint i1 = group.y; + const uint i0 = group.x * TILE_Y * 2; + const uint height = min( TILE_Y, ( src1_elements.x / 2 ) - group.x * TILE_Y ); + + const uint ne00 = src0_elements[ 0 ]; + const uint ne01 = src0_elements[ 1 ]; + const int ew0 = roundUp32( ne01 ); + + uint s0 = i1 * ew0 * ne00; + const uint s0End = s0 + ew0; + uint s1 = i0 * ew0; + s0 += thread; + s1 += thread; + for( ; s0 < s0End; s0 += THREADS, s1 += THREADS ) + convolutionTile( s0, s1, thread, ew0, height ); + + GroupMemoryBarrierWithGroupSync(); + + // Now we need horizontal sums of these shared accumulators, i.e. reduce [height][THREADS] shared array into [height][1] column + for( i = THREADS / 2; i > 0; i /= 2 ) + { + if( thread < i ) + { + for( uint j = 0; j < height; j++ ) + { + float sum = resTemp[ j ][ thread ]; + sum += resTemp[ j ][ thread + i ]; + resTemp[ j ][ thread ] = sum; + } + } + GroupMemoryBarrierWithGroupSync(); + } + + // And finally, store that column to global memory + if( thread >= height ) + return; + const uint nb1 = result_strides[ 1 ]; + const uint rdi = i1 * nb1 + group.x * TILE_Y + thread; + result[ rdi ] = resTemp[ thread ][ 0 ]; +}
\ No newline at end of file diff --git a/ComputeShaders/convolutionPrep1.hlsl b/ComputeShaders/convolutionPrep1.hlsl new file mode 100644 index 0000000..528ff16 --- /dev/null +++ b/ComputeShaders/convolutionPrep1.hlsl @@ -0,0 +1,39 @@ +// ggml_compute_forward_conv_1d_1s_f16_f32, prepare kernel data (src0) +// Dispatch [ ne01, ne02, 1 ] thread groups +Buffer<float> arg0: register( t0 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); +} + +inline uint roundUp32( uint x ) +{ + return ( x + 31 ) & ( ~31u ); +} + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint nb01 = src0_strides[ 1 ]; + const uint nb02 = src0_strides[ 2 ]; + + const uint ne00 = src0_elements[ 0 ]; + const uint ne01 = src0_elements[ 1 ]; + const uint ew0 = roundUp32( ne01 ); + + const uint i02 = group.y; + const uint i01 = group.x; + + uint rsi = i02 * nb02 + i01 * nb01; + const uint rsiEnd = rsi + ne00; + uint rdi = i02 * ew0 * ne00 + i01; + rsi += thread; + rdi += thread * ew0; + const uint rdiInc = 32 * ew0; + + for( ; rsi < rsiEnd; rsi += 32, rdi += rdiInc ) + result[ rdi ] = arg0[ rsi ]; +}
\ No newline at end of file diff --git a/ComputeShaders/convolutionPrep2.hlsl b/ComputeShaders/convolutionPrep2.hlsl new file mode 100644 index 0000000..a7e7172 --- /dev/null +++ b/ComputeShaders/convolutionPrep2.hlsl @@ -0,0 +1,43 @@ +// ggml_compute_forward_conv_1d_1s_f16_f32, prepare source data (src1) +// Dispatch [ ne11, 1, 1 ] thread groups +Buffer<float> arg1: register( t0 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src1_elements: packoffset( c2 ); + uint4 src1_strides: packoffset( c3 ); +} + +#include "miscUtils.hlsli" + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint i11 = group.x; + + const uint ne00 = src0_elements[ 0 ]; + const uint ne01 = src0_elements[ 1 ]; + const uint ne10 = src1_elements[ 0 ]; + const uint nb11 = src1_strides[ 1 ]; + + const uint nk = ne00; + const uint nh = nk / 2; + const int ew0 = roundUp32( ne01 ); + + uint rsi = i11 * nb11; + uint rdi = nh * ew0 + i11; + const uint rdiInc = ew0 * 32; + const uint rsiEnd = rsi + ne10; + + rsi += thread; + rdi += thread * ew0; + + for( ; rsi < rsiEnd; rsi += 32, rdi += rdiInc ) + { + float f = arg1[ rsi ]; + f = adjustFp16( f ); + result[ rdi ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/copyConvert.hlsl b/ComputeShaders/copyConvert.hlsl new file mode 100644 index 0000000..399147f --- /dev/null +++ b/ComputeShaders/copyConvert.hlsl @@ -0,0 +1,50 @@ +// ggml_compute_forward_dup_f32 when we only need to convert types, but not reshape the tensor +// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader +Buffer<float> arg0: register( t0 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + bool downcastFp32 : packoffset( c2.x ); +} + +#include "miscUtils.hlsli" + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint nb00 = src0_strides[ 0 ]; + const uint nb01 = src0_strides[ 1 ]; + const uint nb02 = src0_strides[ 2 ]; + const uint nb03 = src0_strides[ 3 ]; + + const uint ne00 = src0_elements[ 0 ]; + const uint ne01 = src0_elements[ 1 ]; + const uint ne02 = src0_elements[ 2 ]; + const uint ne03 = src0_elements[ 3 ]; + + const uint i01 = group.x; + const uint i02 = group.y; + const uint i03 = group.z; + + const uint rs = ne00 * nb00; + //const uint id = i01 + i02 * ne02 + i03 * ne01 * ne02; + const uint id = ( i03 * ne01 + i02 ) * ne02 + i01; + + uint rsi = i01 * nb01 + i02 * nb02 + i03 * nb03; + uint rdi = id * rs; + + const uint rsiEnd = rsi + rs; + rsi += thread; + rdi += thread; + for( ; rsi < rsiEnd; rsi += 32, rdi += 32 ) + { + float f = arg0[ rsi ]; + [branch] + if( downcastFp32 ) + f = adjustFp16( f ); + result[ rdi ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/copyTranspose.hlsl b/ComputeShaders/copyTranspose.hlsl new file mode 100644 index 0000000..fc3e82f --- /dev/null +++ b/ComputeShaders/copyTranspose.hlsl @@ -0,0 +1,56 @@ +// ggml_compute_forward_dup_f32 when we actually need to reshape the tensor +// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader +Buffer<float> arg0: register( t0 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + bool downcastFp32 : packoffset( c2.x ); +} + +#include "miscUtils.hlsli" + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint nb00 = src0_strides[ 0 ]; + const uint nb01 = src0_strides[ 1 ]; + const uint nb02 = src0_strides[ 2 ]; + const uint nb03 = src0_strides[ 3 ]; + + const uint ne00 = src0_elements[ 0 ]; + const uint ne01 = src0_elements[ 1 ]; + const uint ne02 = src0_elements[ 2 ]; + const uint ne03 = src0_elements[ 3 ]; + + const uint i01 = group.x; + const uint i02 = group.y; + const uint i03 = group.z; + + // We need following integer: i01*ne00 + i02*ne00*ne01 + i03*ne00*ne01*ne02 + // We want to minimize count of integer multiplications + // Also, DXBC assembly features `imad` instruction which computes a*b+c for integers, the actual hardware hopefully has an equivalent + // i03*ne00*ne01*ne02 + i02*ne00*ne01 + i01*ne00 + // ( i03*ne01*ne02 + i02*ne01 + i01 ) * ne00 + // ( ( i03*ne02 + i02) * ne01 + i01 ) * ne00 + uint rdi = ( ( i03 * ne02 + i02 ) * ne01 + i01 ) * ne00; + + const uint rdiEnd = rdi + ne00; + + uint rsi = i01 * nb01 + i02 * nb02 + i03 * nb03; + const uint rsiInc = 32 * nb00; + + rdi += thread; + rsi += thread * nb00; + + for( ; rdi < rdiEnd; rdi += 32, rsi += rsiInc ) + { + float f = arg0[ rsi ]; + [branch] + if( downcastFp32 ) + f = adjustFp16( f ); + result[ rdi ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/diagMaskInf.hlsl b/ComputeShaders/diagMaskInf.hlsl new file mode 100644 index 0000000..18e3938 --- /dev/null +++ b/ComputeShaders/diagMaskInf.hlsl @@ -0,0 +1,30 @@ +// ggml_compute_forward_diag_mask_inf_f32 +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 elements: packoffset( c0 ); + uint4 strides: packoffset( c1 ); + uint n_past : packoffset( c2.x ); +} + +static const float negativeInfinity = asfloat( 0xff800000 ); + +[numthreads( 32, 1, 1 )] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint k = group.y; + const uint j = group.x; + + // Start of the row + uint rdi = k * strides[ 2 ] + j * strides[ 1 ]; + // End of the row + const uint rdiEnd = rdi + elements[ 0 ] * strides[ 0 ]; + // First index to write in this thread + rdi += ( n_past + j + thread + 1 ) * strides[ 0 ]; + // Index increment + const uint rdiInc = 32 * strides[ 0 ]; + + for( ; rdi < rdiEnd; rdi += rdiInc ) + result[ rdi ] = negativeInfinity; +}
\ No newline at end of file diff --git a/ComputeShaders/flashAttention.hlsl b/ComputeShaders/flashAttention.hlsl new file mode 100644 index 0000000..65212d0 --- /dev/null +++ b/ComputeShaders/flashAttention.hlsl @@ -0,0 +1,170 @@ +// Ported from ggml_compute_forward_flash_attn_f16 +// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups + +#include "flashAttentionCommon.hlsli" +Buffer<uint> lookupTable: register( t3 ); +#include "groupReduce.hlsli" + +inline void computeDotProduct( Buffer<float> buff0, Buffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc ) +{ + acc = 0; + const uint s0End = s0 + len; + s0 += thread; + s1 += thread; + for( ; s0 < s0End; s0 += 32, s1 += 32 ) + acc = mad( buff0[ s0 ], buff1[ s1 ], acc ); + + horizontalSum( thread, acc ); +} + +inline void computeDotProduct( Buffer<float> buff0, RWBuffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc ) +{ + acc = 0; + const uint s0End = s0 + len; + s0 += thread; + s1 += thread; + for( ; s0 < s0End; s0 += 32, s1 += 32 ) + acc = mad( buff0[ s0 ], buff1[ s1 ], acc ); + + horizontalSum( thread, acc ); +} + +void scaleTempVector( uint i, const uint length, const uint thread, const float multiplier, bool round ) +{ + const uint end = i + length; + for( i += thread; i < end; i += 32 ) + { + float f = temp[ i ]; + f *= multiplier; + if( round ) + f = roundToFp16( f ); + temp[ i ] = f; + } +} + +#include "miscUtils.hlsli" + +// Transform temp[ i ] = exp( temp[ i ] - tempMax ), and return the sum of these values +inline float applySoftMax( uint i, const uint length, const uint thread, const float tempMax ) +{ + // Transform the values, and compute per-thread sum + const uint end = i + length; + float sum = 0; + for( i += thread; i < end; i += 32 ) + { + float f = temp[ i ]; + [branch] + if( f != negativeInfinity ) + { + f -= tempMax; + const uint index = fp16Rounded( f ); + const uint res16 = lookupTable[ index ]; + f = f16tof32( res16 ); + } + else + f = 0; + + temp[ i ] = f; + sum += f; + } + + // Reduce per-thread sum to the global one, over all threads of the group + horizontalSumBroadcast( thread, sum ); + return sum; +} + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint neq0 = q_elements[ 0 ]; + const uint neq1 = q_elements[ 1 ]; + const uint neq2 = q_elements[ 2 ]; + const uint neq3 = q_elements[ 3 ]; + + const uint nek0 = k_elements[ 0 ]; + const uint nek1 = k_elements[ 1 ]; + + const uint nev1 = v_elements[ 1 ]; + + const uint ne0 = res_elements[ 0 ]; + const uint ne1 = res_elements[ 1 ]; + + const uint nbk0 = k_strides[ 0 ]; + const uint nbk1 = k_strides[ 1 ]; + const uint nbk2 = k_strides[ 2 ]; + const uint nbk3 = k_strides[ 3 ]; + + const uint nbq0 = q_strides[ 0 ]; + const uint nbq1 = q_strides[ 1 ]; + const uint nbq2 = q_strides[ 2 ]; + const uint nbq3 = q_strides[ 3 ]; + + const uint nbv0 = v_strides[ 0 ]; + const uint nbv1 = v_strides[ 1 ]; + const uint nbv2 = v_strides[ 2 ]; + const uint nbv3 = v_strides[ 3 ]; + + const uint nb0 = res_strides[ 0 ]; + const uint nb1 = res_strides[ 1 ]; + const uint nb2 = res_strides[ 2 ]; + const uint nb3 = res_strides[ 3 ]; + + const uint D = neq0; + const uint N = neq1; + const uint P = nek1 - N; + const uint M = nek1; + + const uint ir = group.x; + const uint iq3 = ir / ( neq2 * neq1 ); + const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1; + const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 ); + + const uint tempIndex = ir * tempBufferStride; + + uint ic; + float tvm = negativeInfinity; + const uint s1 = iq1 * nbq1 + iq2 * nbq2 + iq3 * nbq3; + uint s0 = iq2 * nbk2 + iq3 * nbk3; + for( ic = 0; ic < nek1; ic++, s0 += nbk1 ) + { + if( masked ) + { + if( ic > P + iq1 ) + { + if( 0 == thread ) + temp[ tempIndex + ic ] = negativeInfinity; + continue; + } + } + + float dp; + computeDotProduct( k, q, s0, s1, neq0, thread, dp ); + if( 0 == thread ) + { + dp *= scale; + temp[ tempIndex + ic ] = dp; + tvm = max( tvm, dp ); + } + } + + if( 0 == thread ) + sharedAccumulators[ 0 ] = tvm; + GroupMemoryBarrierWithGroupSync(); + tvm = sharedAccumulators[ 0 ]; + + // Softmax + { + float sum = applySoftMax( tempIndex, M, thread, tvm ); + scaleTempVector( tempIndex, M, thread, 1.0 / sum, true ); + } + + s0 = iq2 * nbv2 + iq3 * nbv3; + uint rdi = iq1 * nb1 + iq2 * nb2 + iq3 * nb3; + for( ic = 0; ic < nev1; ic++, s0 += nbv1, rdi += nb0 ) + { + float dp; + computeDotProduct( v, temp, s0, tempIndex, nek1, thread, dp ); + if( 0 == thread ) + result[ rdi ] = dp; + } +}
\ No newline at end of file diff --git a/ComputeShaders/flashAttentionCommon.hlsli b/ComputeShaders/flashAttentionCommon.hlsli new file mode 100644 index 0000000..68ed30b --- /dev/null +++ b/ComputeShaders/flashAttentionCommon.hlsli @@ -0,0 +1,67 @@ +// Ported from ggml_compute_forward_flash_attn_f16 +// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups +Buffer<float> q: register( t0 ); +Buffer<float> k: register( t1 ); +Buffer<float> v: register( t2 ); + +RWBuffer<float> result: register( u0 ); +// This temporary buffer should fit tempBufferStride * neq1 * neq2 * neq3 elements, FP32 precision +RWBuffer<float> temp: register( u1 ); + +cbuffer Constants: register( b0 ) +{ + uint4 q_elements: packoffset( c0 ); + uint4 q_strides: packoffset( c1 ); + uint4 k_elements: packoffset( c2 ); + uint4 k_strides: packoffset( c3 ); + uint4 v_elements: packoffset( c4 ); + uint4 v_strides: packoffset( c5 ); + uint4 res_elements: packoffset( c6 ); + uint4 res_strides: packoffset( c7 ); + + bool masked : packoffset( c8.x ); + // 1.0 / sqrt( (double) D ) + float scale : packoffset( c8.y ); + // This number is required to be >= nek1, and ideally rounded up to either 32 (L2 line) or 128 (L1 line) bytes + uint tempBufferStride: packoffset( c8.z ); +} + +static const float negativeInfinity = asfloat( 0xff800000 ); + +// Convert FP32 number to FP16 using rounding to nearest, then upcast back to FP32 +inline float roundToFp16( const float src ) +{ + const uint trunc16 = f32tof16( src ); + const float trunc32 = f16tof32( trunc16 ); + + const uint truncExp = ( trunc16 >> 10 ) & 0x1F; + if( truncExp != 0x1F ) + { + const uint next16 = trunc16 + 1; + const float next32 = f16tof32( next16 ); + + const float errTrunc = abs( src - trunc32 ); + const float errNext = abs( src - next32 ); + + if( errTrunc < errNext ) + { + // Truncated was closer to the source + return trunc32; + } + else if( errTrunc > errNext ) + { + // Truncated + 1 was closer to the source + return next32; + } + else + { + // Exactly half, doing banker's rounding to nearest even + return ( 0 == ( trunc16 & 1 ) ) ? trunc32 : next32; + } + } + else + { + // INF or NAN + return trunc32; + } +}
\ No newline at end of file diff --git a/ComputeShaders/flashAttentionCompat1.hlsl b/ComputeShaders/flashAttentionCompat1.hlsl new file mode 100644 index 0000000..f1f2ddb --- /dev/null +++ b/ComputeShaders/flashAttentionCompat1.hlsl @@ -0,0 +1,125 @@ +// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups +#include "flashAttentionCommon.hlsli" +#include "groupReduce.hlsli" + +inline void computeDotProduct( Buffer<float> buff0, Buffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc ) +{ + acc = 0; + /* + const uint s0End = s0 + len; + s0 += thread; + s1 += thread; + for( ; s0 < s0End; s0 += 32, s1 += 32 ) + acc = mad( buff0[ s0 ], buff1[ s1 ], acc ); + horizontalSumCompatNew( thread, acc ); + */ + + const uint completeVectors = len / 32; + uint i; + for( i = 0; i < completeVectors; i++, s0 += 32, s1 += 32 ) + acc = mad( buff0[ s0 + thread ], buff1[ s1 + thread ], acc ); + + horizontalSumCompatNew( thread, acc ); + + if( 0 == thread ) + { + const uint rem = len % 32; + for( i = 0; i < rem; i++ ) + { + precise float a = buff0[ s0 + i ]; + precise float b = buff1[ s1 + i ]; + precise float prod = a * b; + acc += prod; + } + } +} + +void scaleTempVector( uint i, const uint length, const uint thread, const float multiplier ) +{ + const uint end = i + length; + for( i += thread; i < end; i += 32 ) + { + float f = temp[ i ]; + f *= multiplier; + temp[ i ] = f; + } +} + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint neq0 = q_elements[ 0 ]; + const uint neq1 = q_elements[ 1 ]; + const uint neq2 = q_elements[ 2 ]; + const uint neq3 = q_elements[ 3 ]; + + const uint nek0 = k_elements[ 0 ]; + const uint nek1 = k_elements[ 1 ]; + + const uint nev1 = v_elements[ 1 ]; + + const uint ne0 = res_elements[ 0 ]; + const uint ne1 = res_elements[ 1 ]; + + const uint nbk0 = k_strides[ 0 ]; + const uint nbk1 = k_strides[ 1 ]; + const uint nbk2 = k_strides[ 2 ]; + const uint nbk3 = k_strides[ 3 ]; + + const uint nbq0 = q_strides[ 0 ]; + const uint nbq1 = q_strides[ 1 ]; + const uint nbq2 = q_strides[ 2 ]; + const uint nbq3 = q_strides[ 3 ]; + + const uint nbv0 = v_strides[ 0 ]; + const uint nbv1 = v_strides[ 1 ]; + const uint nbv2 = v_strides[ 2 ]; + const uint nbv3 = v_strides[ 3 ]; + + const uint nb0 = res_strides[ 0 ]; + const uint nb1 = res_strides[ 1 ]; + const uint nb2 = res_strides[ 2 ]; + const uint nb3 = res_strides[ 3 ]; + + const uint D = neq0; + const uint N = neq1; + const uint P = nek1 - N; + // const uint M = P + N; + const uint M = nek1; + + const uint ir = group.x; + const uint iq3 = ir / ( neq2 * neq1 ); + const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1; + const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 ); + + const uint tempIndex = ir * tempBufferStride; + + uint ic; + for( ic = 0; ic < nek1; ic++ ) + { + // k indices + const uint ik3 = iq3; + const uint ik2 = iq2; + const uint ik1 = ic; + + // S indices + const uint i1 = ik1; + + if( masked ) + { + if( ic > P + iq1 ) + { + if( 0 == thread ) + temp[ tempIndex + ic ] = negativeInfinity; + continue; + } + } + + const uint s0 = ik1 * nbk1 + ik2 * nbk2 + ik3 * nbk3; + const uint s1 = iq1 * nbq1 + iq2 * nbq2 + iq3 * nbq3; + float dp; + computeDotProduct( k, q, s0, s1, neq0, thread, dp ); + if( 0 == thread ) + temp[ tempIndex + ic ] = dp * scale; + } +}
\ No newline at end of file diff --git a/ComputeShaders/flashAttentionCompat2.hlsl b/ComputeShaders/flashAttentionCompat2.hlsl new file mode 100644 index 0000000..73f5fce --- /dev/null +++ b/ComputeShaders/flashAttentionCompat2.hlsl @@ -0,0 +1,114 @@ +// Dispatch with [ ( neq1*neq2*neq3 + 31 ) / 32, 1, 1 ] thread groups +#include "flashAttentionCommon.hlsli" +Buffer<uint> lookupTable: register( t3 ); + +void scaleTempVector( uint i, const uint length, const float multiplier ) +{ + const uint end = i + length; + for( ; i < end; i++ ) + { + float f = temp[ i ]; + f *= multiplier; + // Rounding in this shader causes numerical errors on my GeForce 1080 Ti GPU, driver 527.56 + // f = roundToFp16( f ); + temp[ i ] = f; + } +} + +inline float computeTempVectorMax( uint i, const uint length ) +{ + // Compute per-thread maximum + const uint end = i + length; + float ax = negativeInfinity; + for( ; i < end; i++ ) + ax = max( ax, temp[ i ] ); + return ax; +} + +#include "miscUtils.hlsli" +#include "fp64Utils.hlsli" + +// Transform temp[ i ] = exp( temp[ i ] - tempMax ), and return the sum of these values +inline double applySoftMax( uint i, const uint length, const float tempMax ) +{ + // Transform the values, and compute per-thread sum + const uint end = i + length; + double sum = 0; + for( ; i < end; i++ ) + { + float f = temp[ i ]; + [branch] + if( f != negativeInfinity ) + { + f -= tempMax; + const uint index = fp16Rounded( f ); + const uint res16 = lookupTable[ index ]; + f = f16tof32( res16 ); + sum += f; + } + else + f = 0; + + temp[ i ] = f; + } + return sum; +} + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 dtid: SV_DispatchThreadID ) +{ + const uint neq0 = q_elements[ 0 ]; + const uint neq1 = q_elements[ 1 ]; + const uint neq2 = q_elements[ 2 ]; + const uint neq3 = q_elements[ 3 ]; + + const uint nek0 = k_elements[ 0 ]; + const uint nek1 = k_elements[ 1 ]; + + const uint nev1 = v_elements[ 1 ]; + + const uint ne0 = res_elements[ 0 ]; + const uint ne1 = res_elements[ 1 ]; + + const uint nbk0 = k_strides[ 0 ]; + const uint nbk1 = k_strides[ 1 ]; + const uint nbk2 = k_strides[ 2 ]; + const uint nbk3 = k_strides[ 3 ]; + + const uint nbq0 = q_strides[ 0 ]; + const uint nbq1 = q_strides[ 1 ]; + const uint nbq2 = q_strides[ 2 ]; + const uint nbq3 = q_strides[ 3 ]; + + const uint nbv0 = v_strides[ 0 ]; + const uint nbv1 = v_strides[ 1 ]; + const uint nbv2 = v_strides[ 2 ]; + const uint nbv3 = v_strides[ 3 ]; + + const uint nb0 = res_strides[ 0 ]; + const uint nb1 = res_strides[ 1 ]; + const uint nb2 = res_strides[ 2 ]; + const uint nb3 = res_strides[ 3 ]; + + const uint D = neq0; + const uint N = neq1; + const uint P = nek1 - N; + // const uint M = P + N; + const uint M = nek1; + + const uint ir = dtid.x; + if( ir >= neq1 * neq2 * neq3 ) + return; + + const uint iq3 = ir / ( neq2 * neq1 ); + const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1; + const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 ); + + const uint tempIndex = ir * tempBufferStride; + + // Softmax + float tvm = computeTempVectorMax( tempIndex, M ); + double sum = applySoftMax( tempIndex, M, tvm ); + + scaleTempVector( tempIndex, M, (float)( 1.0 / sum ) ); +}
\ No newline at end of file diff --git a/ComputeShaders/flashAttentionCompat3.hlsl b/ComputeShaders/flashAttentionCompat3.hlsl new file mode 100644 index 0000000..e0a4061 --- /dev/null +++ b/ComputeShaders/flashAttentionCompat3.hlsl @@ -0,0 +1,118 @@ +// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups +#include "flashAttentionCommon.hlsli" +#include "groupReduce.hlsli" +#include "miscUtils.hlsli" + +inline void roundTempVector( uint i, const uint len, const uint thread ) +{ + const uint iEnd = i + len; + for( i += thread; i < iEnd; i += 32 ) + { + float f = temp[ i ]; + f = roundToFp16( f ); + temp[ i ] = f; + } +} + +inline void computeDotProduct( Buffer<float> buff0, RWBuffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc ) +{ + acc = 0; +/* const uint s0End = s0 + len; + s0 += thread; + s1 += thread; + for( ; s0 < s0End; s0 += 32, s1 += 32 ) + acc = mad( buff0[ s0 ], buff1[ s1 ], acc ); + + horizontalSumCompatNew( thread, acc ); */ + const uint completeVectors = len / 32; + uint i; + for( i = 0; i < completeVectors; i++, s0 += 32, s1 += 32 ) + acc = mad( buff0[ s0 + thread ], buff1[ s1 + thread ], acc ); + + horizontalSumCompatNew( thread, acc ); + + if( 0 == thread ) + { + const uint rem = len % 32; + if( 0 != rem ) + { + double f64 = acc; + for( i = 0; i < rem; i++ ) + { + precise float a = buff0[ s0 + i ]; + precise float b = buff1[ s1 + i ]; + precise float prod = a * b; + f64 += prod; + } + acc = (float)f64; + } + } +} + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint neq0 = q_elements[ 0 ]; + const uint neq1 = q_elements[ 1 ]; + const uint neq2 = q_elements[ 2 ]; + const uint neq3 = q_elements[ 3 ]; + + const uint nek0 = k_elements[ 0 ]; + const uint nek1 = k_elements[ 1 ]; + + const uint nev1 = v_elements[ 1 ]; + + const uint ne0 = res_elements[ 0 ]; + const uint ne1 = res_elements[ 1 ]; + + const uint nbk0 = k_strides[ 0 ]; + const uint nbk1 = k_strides[ 1 ]; + const uint nbk2 = k_strides[ 2 ]; + const uint nbk3 = k_strides[ 3 ]; + + const uint nbq0 = q_strides[ 0 ]; + const uint nbq1 = q_strides[ 1 ]; + const uint nbq2 = q_strides[ 2 ]; + const uint nbq3 = q_strides[ 3 ]; + + const uint nbv0 = v_strides[ 0 ]; + const uint nbv1 = v_strides[ 1 ]; + const uint nbv2 = v_strides[ 2 ]; + const uint nbv3 = v_strides[ 3 ]; + + const uint nb0 = res_strides[ 0 ]; + const uint nb1 = res_strides[ 1 ]; + const uint nb2 = res_strides[ 2 ]; + const uint nb3 = res_strides[ 3 ]; + + const uint D = neq0; + const uint N = neq1; + const uint P = nek1 - N; + // const uint M = P + N; + const uint M = nek1; + + const uint ir = group.x; + const uint iq3 = ir / ( neq2 * neq1 ); + const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1; + const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 ); + + const uint tempIndex = ir * tempBufferStride; + + roundTempVector( tempIndex, nek1, thread ); + AllMemoryBarrierWithGroupSync(); + + uint rdi = iq1 * nb1 + iq2 * nb2 + iq3 * nb3; + for( uint ic = 0; ic < nev1; ic++, rdi += nb0 ) + { + // dst indices + const uint i1 = iq1; + const uint i2 = iq2; + const uint i3 = iq3; + + const uint s0 = ic * nbv1 + i2 * nbv2 + i3 * nbv3; + float dp; + computeDotProduct( v, temp, s0, tempIndex, nek1, thread, dp ); + if( 0 == thread ) + result[ rdi ] = dp; + } +}
\ No newline at end of file diff --git a/ComputeShaders/fmaRepeat1.hlsl b/ComputeShaders/fmaRepeat1.hlsl new file mode 100644 index 0000000..3db3827 --- /dev/null +++ b/ComputeShaders/fmaRepeat1.hlsl @@ -0,0 +1,77 @@ +// Implementation of fmaRepeat() when both source arguments have same size and strides +// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor +RWBuffer<float> tensor: register( u0 ); +Buffer<float> patternMul: register( t0 ); +Buffer<float> patternAdd: register( t1 ); + +cbuffer Constants: register( b0 ) +{ + uint4 tensorSize: packoffset( c0 ); + uint4 tensorStrides: packoffset( c1 ); + uint4 patternSize: packoffset( c2 ); + uint4 patternStrides: packoffset( c3 ); +} + +#ifndef THREADS +#define THREADS 512 +#endif + +#include "repeatUtils.hlsli" + +inline void computeSimple( uint idx, float mul, float add ) +{ + precise float f = tensor[ idx ]; + f *= mul; + f += add; + tensor[ idx ] = f; +} + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides ); + uint rsi = rowOffset( group % patternSize.yzw, patternStrides ); + + if( patternSize[ 0 ] == 1 ) + { + // The pattern only has 1 column - broadcasting over the row + const float pMul = patternMul[ rsi ]; + const float pAdd = patternAdd[ rsi ]; + ROW_LOOP( it ) + computeSimple( it.x, pMul, pAdd ); + } + else if( patternSize[ 0 ] <= THREADS ) + { + // pattern size doesn't exceed thread group size: load pattern value outside of the loop + const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] ); + if( thread >= threadsPerGroup ) + return; + + rsi += ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ]; + const float pMul = patternMul[ rsi ]; + const float pAdd = patternAdd[ rsi ]; + ROW_LOOP_EX( it, threadsPerGroup, tensorStrides ) + computeSimple( it.x, pMul, pAdd ); + } + else + { + // Pattern rows are larger than the thread group, need to stream from both buffers + const uint rsiInc = THREADS * patternStrides[ 0 ]; + const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ]; + const uint rsiEnd = rsi + rsiDec; + rsi += thread * patternStrides[ 0 ]; + + ROW_LOOP( it ) + { + precise float f = tensor[ it.x ]; + float mul = patternMul[ rsi ]; + float add = patternAdd[ rsi ]; + rsi += rsiInc; + if( rsi >= rsiEnd ) + rsi -= rsiDec; + f *= mul; + f += add; + tensor[ it.x ] = f; + } + } +}
\ No newline at end of file diff --git a/ComputeShaders/fmaRepeat164.hlsl b/ComputeShaders/fmaRepeat164.hlsl new file mode 100644 index 0000000..99813f6 --- /dev/null +++ b/ComputeShaders/fmaRepeat164.hlsl @@ -0,0 +1,2 @@ +#define THREADS 64 +#include "fmaRepeat1.hlsl"
\ No newline at end of file diff --git a/ComputeShaders/fmaRepeat2.hlsl b/ComputeShaders/fmaRepeat2.hlsl new file mode 100644 index 0000000..edadf0a --- /dev/null +++ b/ComputeShaders/fmaRepeat2.hlsl @@ -0,0 +1,45 @@ +// Implementation of fmaRepeat() when source arguments have different shape or VRAM layout +// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor +RWBuffer<float> tensor: register( u0 ); +Buffer<float> patternMul: register( t0 ); +Buffer<float> patternAdd: register( t1 ); + +cbuffer Constants: register( b0 ) +{ + uint4 tensorSize: packoffset( c0 ); + uint4 tensorStrides: packoffset( c1 ); + uint4 patternSizeMul: packoffset( c2 ); + uint4 patternStridesMul: packoffset( c3 ); + uint4 patternSizeAdd: packoffset( c4 ); + uint4 patternStridesAdd: packoffset( c5 ); +} + +#ifndef THREADS +#define THREADS 32 +#endif + +#include "repeatUtils.hlsli" + +inline float loadPattern( Buffer<float> buffer, uint rowStart, uint i, uint4 size, uint4 stride ) +{ + i %= size.x; + return buffer[ i * stride.x + rowStart ]; +} + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides ); + const uint rsiMul = rowOffset( group % patternSizeMul.yzw, patternStridesMul ); + const uint rsiAdd = rowOffset( group % patternSizeAdd.yzw, patternStridesAdd ); + + for( uint i = thread; it.x < it.z; it.x += it.y, i++ ) + { + precise float f = tensor[ it.x ]; + float mul = loadPattern( patternMul, rsiMul, i, patternSizeMul, patternStridesMul ); + float add = loadPattern( patternAdd, rsiAdd, i, patternSizeAdd, patternStridesAdd ); + f *= mul; + f += add; + tensor[ it.x ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/fp64Utils.hlsli b/ComputeShaders/fp64Utils.hlsli new file mode 100644 index 0000000..9782718 --- /dev/null +++ b/ComputeShaders/fp64Utils.hlsli @@ -0,0 +1,28 @@ +// TODO: compile another version of these shader, and use it on GPUs with ExtendedDoublesShaderInstructions flag, will become slightly faster +// https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ns-d3d11-d3d11_feature_data_d3d11_options +#ifndef ExtendedDoublesShaderInstructions +#define ExtendedDoublesShaderInstructions 0 +#endif + +// Compute num/den in FP64 precision +inline double div64( double num, double den ) +{ +#if ExtendedDoublesShaderInstructions + return num / den; +#else + // https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division + double x = 1.0f / (float)den; + x += x * ( 1.0 - den * x ); + x += x * ( 1.0 - den * x ); + return num * x; +#endif +} + +// Compute sqrt(x) in FP64 precision +inline double sqrt64( double x ) +{ + double root = sqrt( (float)x ); + root = 0.5 * ( root + div64( x, root ) ); + root = 0.5 * ( root + div64( x, root ) ); + return root; +}
\ No newline at end of file diff --git a/ComputeShaders/groupReduce.hlsli b/ComputeShaders/groupReduce.hlsli new file mode 100644 index 0000000..1ffe1d8 --- /dev/null +++ b/ComputeShaders/groupReduce.hlsli @@ -0,0 +1,139 @@ +groupshared float sharedAccumulators[ 32 ]; + +// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group. +void horizontalSum( const uint thread, inout float sum ) +{ + sharedAccumulators[ thread ] = sum; + for( uint i = 16; i > 1; i /= 2 ) + { + GroupMemoryBarrierWithGroupSync(); + if( thread < i ) + { + sum += sharedAccumulators[ thread + i ]; + sharedAccumulators[ thread ] = sum; + } + } + GroupMemoryBarrierWithGroupSync(); + if( 0 == thread ) + sum += sharedAccumulators[ 1 ]; +} + +// Compute horisontal sum of the numbers, and broadcast to all threads of the group. +void horizontalSumBroadcast( const uint thread, inout float sum ) +{ + horizontalSum( thread, sum ); + if( 0 == thread ) + sharedAccumulators[ 0 ] = sum; + GroupMemoryBarrierWithGroupSync(); + sum = sharedAccumulators[ 0 ]; +} + +// Compute horisontal sum of the numbers, in the order equal to the CPU-running dot product implementation. +// The result is only correct on the thread #0 of the group. +void horizontalSumCompat( const uint thread, inout float sum ) +{ + sharedAccumulators[ thread ] = sum; + GroupMemoryBarrierWithGroupSync(); + + if( 0 == ( thread & 8 ) ) + { + // This runs on threads [ 0 .. 7 ] and [ 16 .. 23 ] + // sum01 = _mm256_add_ps( sum0, sum1 ); + // sum23 = _mm256_add_ps( sum2, sum3 ); + sum += sharedAccumulators[ thread + 8 ]; + sharedAccumulators[ thread ] = sum; + } + + GroupMemoryBarrierWithGroupSync(); + if( thread < 8 ) + { + // This runs on threads [ 0 .. 7 ] + // sum0123 = _mm256_add_ps( sum01, sum23 ); + sum += sharedAccumulators[ thread + 16 ]; + sharedAccumulators[ thread ] = sum; + } + + GroupMemoryBarrierWithGroupSync(); + if( thread < 4 ) + { + // const __m128 r4 = _mm_add_ps( _mm256_castps256_ps128( sum0123 ), _mm256_extractf128_ps( sum0123, 1 ) ); + sum += sharedAccumulators[ thread + 4 ]; + sharedAccumulators[ thread ] = sum; + } + + GroupMemoryBarrierWithGroupSync(); + if( thread < 2 ) + { + // const __m128 r2 = _mm_add_ps( r4, _mm_movehl_ps( r4, r4 ) ); + sum += sharedAccumulators[ thread + 2 ]; + sharedAccumulators[ thread ] = sum; + } + + GroupMemoryBarrierWithGroupSync(); + if( 0 == thread ) + { + // const __m128 r1 = _mm_add_ss( r2, _mm_movehdup_ps( r2 ) ); + sum += sharedAccumulators[ 1 ]; + } +} + +// Compute horisontal sum of the numbers, in yet another creative summation order recently implemented in the upstream +void horizontalSumCompatNew( const uint thread, inout float sum ) +{ + // GGML_F32x8_REDUCE + sharedAccumulators[ thread ] = sum; + GroupMemoryBarrierWithGroupSync(); + + if( 0 == ( thread & 8 ) ) + { + // Runs on threads [ 0 .. 7 ] and [ 16 .. 23 ] + sum += sharedAccumulators[ thread | 8 ]; + sharedAccumulators[ thread ] = sum; + } + GroupMemoryBarrierWithGroupSync(); + + if( thread < 8 ) + { + // Runs on threads [ 0 .. 7 ] + sum += sharedAccumulators[ thread | 0x10 ]; + sharedAccumulators[ thread ] = sum; + } + GroupMemoryBarrierWithGroupSync(); + + if( thread < 4 ) + { + // Runs on threads [ 0 .. 3 ] + sum += sharedAccumulators[ thread | 4 ]; + sharedAccumulators[ thread ] = sum; + } + GroupMemoryBarrierWithGroupSync(); + + if( thread < 4 && 0 == ( thread & 1 ) ) + { + // Runs on threads [ 0, 2 ] + sum += sharedAccumulators[ thread | 1 ]; + sharedAccumulators[ thread ] = sum; + } + GroupMemoryBarrierWithGroupSync(); + + if( 0 == thread ) + sum += sharedAccumulators[ 2 ]; +} + + +// Compute horizontal maximum of the numbers, and broadcast to all threads of the group. +void horizontalMaxBroadcast( const uint thread, inout float ax ) +{ + sharedAccumulators[ thread ] = ax; + for( uint i = 16; i > 0; i /= 2 ) + { + GroupMemoryBarrierWithGroupSync(); + if( thread < i ) + { + ax = max( ax, sharedAccumulators[ thread + i ] ); + sharedAccumulators[ thread ] = ax; + } + } + GroupMemoryBarrierWithGroupSync(); + ax = sharedAccumulators[ 0 ]; +}
\ No newline at end of file diff --git a/ComputeShaders/groupReduce64.hlsli b/ComputeShaders/groupReduce64.hlsli new file mode 100644 index 0000000..7094d03 --- /dev/null +++ b/ComputeShaders/groupReduce64.hlsli @@ -0,0 +1,46 @@ +groupshared float sharedAccumulators[ 64 ]; + +// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group. +void horizontalSum( const uint thread, inout float sum ) +{ + sharedAccumulators[ thread ] = sum; + for( uint i = 32; i > 1; i /= 2 ) + { + GroupMemoryBarrierWithGroupSync(); + if( thread < i ) + { + sum += sharedAccumulators[ thread + i ]; + sharedAccumulators[ thread ] = sum; + } + } + GroupMemoryBarrierWithGroupSync(); + if( 0 == thread ) + sum += sharedAccumulators[ 1 ]; +} + +// Compute horisontal sum of the numbers, and broadcast to all threads of the group. +void horizontalSumBroadcast( const uint thread, inout float sum ) +{ + horizontalSum( thread, sum ); + if( 0 == thread ) + sharedAccumulators[ 0 ] = sum; + GroupMemoryBarrierWithGroupSync(); + sum = sharedAccumulators[ 0 ]; +} + +// Compute horizontal maximum of the numbers, and broadcast to all threads of the group. +void horizontalMaxBroadcast( const uint thread, inout float ax ) +{ + sharedAccumulators[ thread ] = ax; + for( uint i = 32; i > 0; i /= 2 ) + { + GroupMemoryBarrierWithGroupSync(); + if( thread < i ) + { + ax = max( ax, sharedAccumulators[ thread + i ] ); + sharedAccumulators[ thread ] = ax; + } + } + GroupMemoryBarrierWithGroupSync(); + ax = sharedAccumulators[ 0 ]; +}
\ No newline at end of file diff --git a/ComputeShaders/matReshapePanels.hlsl b/ComputeShaders/matReshapePanels.hlsl new file mode 100644 index 0000000..f26f246 --- /dev/null +++ b/ComputeShaders/matReshapePanels.hlsl @@ -0,0 +1,105 @@ +// This shader reshapes a matrix into the shape expected by mulMatTiledEx.hlsl and mulMatByRowTiledEx.hlsl compute shaders +// It's called in runtime, also while loading models from disk. +// So far, it's only used when running on AMD GPUs. +#ifndef TILE_SIZE +static const uint TILE_SIZE = 32; +#endif + +// Input tensor +Buffer<float> source: register( t0 ); +// Output tensor +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 arg0Size: packoffset( c0 ); + uint4 arg0Strides: packoffset( c1 ); + // Count of elements per panel + uint panelSize : packoffset( c2.y ); + // Layer strides of the output matrix + uint2 layerStrides: packoffset( c2.z ); +} + +inline uint hadd( uint2 v2 ) { return v2.x + v2.y; } + +groupshared float tileBuffer[ TILE_SIZE ][ TILE_SIZE ]; + +[ numthreads( TILE_SIZE, 1, 1 ) ] +void main( const uint3 group: SV_GroupID, const uint thread : SV_GroupIndex ) +{ + uint rdi = hadd( group.yz * layerStrides ); + rdi += group.x * panelSize; + rdi += thread; + + uint rsi = hadd( group.yz * arg0Strides.zw ); + const uint baseY = group.x * TILE_SIZE; + const uint dispatchThread = baseY + thread; + // Reshaping into a column major horizontal panel, height = TILE_SIZE, width = width of the source matrix + uint width = arg0Size.x; + // Usually TILE_SIZE; can be less for the last panel on the matrix when we need to generate zeros instead of loading these numbers + const uint height = min( TILE_SIZE, arg0Size.y - baseY ); + + if( arg0Strides.x == 1 ) + { + // The input matrix is row major, can improve performance with coalesced loads and group shared buffer. + rsi += baseY * arg0Strides.y; + + const uint widthCompleteTiles = width / TILE_SIZE; + + if( height < TILE_SIZE ) + { + // This thread group was dispatched for the last panel of the matrix, it doesn't have enough rows + // Write zeros to the corresponding elements of the groupshared buffer + for( uint j = height; j < TILE_SIZE; j++ ) + tileBuffer[ thread ][ j ] = 0.0; + } + + for( uint i = 0; i < widthCompleteTiles; i++, rsi += TILE_SIZE ) + { + // Load [ TILE_SIZE ] * [ TILE_SIZE ] block with fully coalesced loads, store to group shared buffer in transposed order + uint rsiTile = rsi + thread; + uint j; + for( j = 0; j < height; j++, rsiTile += arg0Strides.y ) + { + // Each iteration of the loop loads a row of [ TILE_SIZE ] elements from the corresponding row of the source tensor + // Fully coalesced load + float f = source[ rsiTile ]; + // Random store but the local memory's fast, this works rather well in practice + tileBuffer[ thread ][ j ] = f; + } + + GroupMemoryBarrierWithGroupSync(); + + // Copy from group shared buffer to output tensor + for( j = 0; j < TILE_SIZE; j++, rdi += TILE_SIZE ) + { + // Fully coalesced loads and stores + float f = tileBuffer[ j ][ thread ]; + result[ rdi ] = f; + } + + GroupMemoryBarrierWithGroupSync(); + } + + width %= TILE_SIZE; + if( 0 == width ) + return; + rsi += thread * arg0Strides.y; + } + else + rsi += dispatchThread * arg0Strides.y; + + for( uint i = 0; i < width; i++ ) + { + float f; + [branch] + if( thread < height ) + f = source[ rsi ]; + else + f = 0.0; + rsi += arg0Strides.x; + + result[ rdi ] = f; + rdi += TILE_SIZE; + } +}
\ No newline at end of file diff --git a/ComputeShaders/miscUtils.hlsli b/ComputeShaders/miscUtils.hlsli new file mode 100644 index 0000000..b957a06 --- /dev/null +++ b/ComputeShaders/miscUtils.hlsli @@ -0,0 +1,84 @@ +// When GPUs are converting FP32 to FP16, they always truncate towards 0, documented there: +// https://learn.microsoft.com/en-us/windows/win32/direct3d10/d3d10-graphics-programming-guide-resources-data-conversion#conververting-from-a-higher-range-representation-to-a-lower-range-representation +// Whisper code uses _mm_cvtps_ph( x, 0 ), the 0 stands for "Round to nearest even": https://www.felixcloutier.com/x86/vcvtps2ph +// This function adjusts FP32 value making it so that truncation towards 0 results in the value equal to what CPU is doing +inline float adjustFp16( const float src ) +{ + const uint trunc16 = f32tof16( src ); + const float trunc32 = f16tof32( trunc16 ); + + const uint truncExp = ( trunc16 >> 10 ) & 0x1F; + if( truncExp != 0x1F ) + { + const uint next16 = trunc16 + 1; + const float next32 = f16tof32( next16 ); + + const float errTrunc = abs( src - trunc32 ); + const float errNext = abs( src - next32 ); + + if( errTrunc < errNext ) + { + // Truncated was closer to the source + return src; + } + else if( errTrunc > errNext ) + { + // Truncated + 1 was closer to the source + return next32; + } + else + { + // Exactly half, doing banker's rounding to nearest even + return ( 0 == ( trunc16 & 1 ) ) ? src : next32; + } + } + else + { + // INF or NAN + return src; + } +} + +// Convert FP32 number to FP16, using rounding to nearest +inline uint fp16Rounded( const float src ) +{ + const uint trunc16 = f32tof16( src ); + const float trunc32 = f16tof32( trunc16 ); + + const uint truncExp = ( trunc16 >> 10 ) & 0x1F; + if( truncExp != 0x1F ) + { + const uint next16 = trunc16 + 1; + const float next32 = f16tof32( next16 ); + + const float errTrunc = abs( src - trunc32 ); + const float errNext = abs( src - next32 ); + + if( errTrunc < errNext ) + { + // Truncated was closer to the source + return trunc16; + } + else if( errTrunc > errNext ) + { + // Truncated + 1 was closer to the source + return next16; + } + else + { + // Exactly half, doing banker's rounding to nearest even + return ( 0 == ( trunc16 & 1 ) ) ? trunc16 : next16; + } + } + else + { + // INF or NAN + return trunc16; + } +} + +// Round up the number to be a multiple of 32 +inline uint roundUp32( uint x ) +{ + return ( x + 31 ) & ( ~31u ); +}
\ No newline at end of file diff --git a/ComputeShaders/mulMatByRow.hlsl b/ComputeShaders/mulMatByRow.hlsl new file mode 100644 index 0000000..565cfdc --- /dev/null +++ b/ComputeShaders/mulMatByRow.hlsl @@ -0,0 +1,49 @@ +// Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ] +// Dispatch [ E1, E2, E3 ] groups of this shader +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 arg0Size: packoffset( c0 ); + uint4 arg0Strides: packoffset( c1 ); + uint4 arg1Size: packoffset( c2 ); + uint4 arg1Strides: packoffset( c3 ); + uint4 resultSize: packoffset( c4 ); + uint4 resultStrides: packoffset( c5 ); +} + +#include "groupReduce.hlsli" + +inline uint hadd( uint3 vec ) +{ + return vec.x + vec.y + vec.z; +} +inline uint hadd( uint2 vec ) +{ + return vec.x + vec.y; +} + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + uint s0 = hadd( group * arg0Strides.yzw ); + uint s1 = hadd( group.yz * arg1Strides.zw ); + const uint s0End = s0 + arg0Size.x * arg0Strides.x; + const uint s0Inc = 32 * arg0Strides.x; + const uint s1Inc = 32 * arg1Strides.x; + + s0 += thread * arg0Strides.x; + s1 += thread * arg1Strides.x; + float dp = 0; + for( ; s0 < s0End; s0 += s0Inc, s1 += s1Inc ) + dp = mad( arg0[ s0 ], arg1[ s1 ], dp ); + + horizontalSum( thread, dp ); + if( 0 != thread ) + return; + + const uint rdi = group.x + hadd( group.yz * resultStrides.zw ); + result[ rdi ] = dp; +}
\ No newline at end of file diff --git a/ComputeShaders/mulMatByRow64.hlsl b/ComputeShaders/mulMatByRow64.hlsl new file mode 100644 index 0000000..db5f801 --- /dev/null +++ b/ComputeShaders/mulMatByRow64.hlsl @@ -0,0 +1,90 @@ +// Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ] +// Dispatch [ E1, E2, E3 ] groups of this shader +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 arg0Size: packoffset( c0 ); + uint4 arg0Strides: packoffset( c1 ); + uint4 arg1Size: packoffset( c2 ); + uint4 arg1Strides: packoffset( c3 ); + uint4 resultSize: packoffset( c4 ); + uint4 resultStrides: packoffset( c5 ); +} + +inline uint hadd( uint3 vec ) +{ + return vec.x + vec.y + vec.z; +} +inline uint hadd( uint2 vec ) +{ + return vec.x + vec.y; +} + +// No idea why, but that particular configuration appears to be the fastest one on Ryzen 7 5700G iGPU +// Not by much, though: when trying a few numbers I saw 1.30 - 1.42 seconds for this compute shader +static const uint THREADS = 64; +static const uint REDUCTION_BUFFER = 32; +groupshared float sharedAccumulators[ REDUCTION_BUFFER ]; + +// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group. +void horizontalSum( const uint thread, inout float sum ) +{ + if( THREADS > REDUCTION_BUFFER ) + { + for( uint t = REDUCTION_BUFFER; t < THREADS; t += REDUCTION_BUFFER ) + { + // Threads [ t .. t + REDUCTION_BUFFER ] store into the buffer + if( thread >= t && thread < t + REDUCTION_BUFFER ) + sharedAccumulators[ thread - t ] = sum; + + GroupMemoryBarrierWithGroupSync(); + + // Threads [ 0 .. REDUCTION_BUFFER ] increment their local sum with the value loaded from the buffer + if( thread < REDUCTION_BUFFER ) + sum += sharedAccumulators[ thread ]; + } + } + + if( thread < REDUCTION_BUFFER ) + sharedAccumulators[ thread ] = sum; + + for( uint i = REDUCTION_BUFFER / 2; i > 1; i /= 2 ) + { + GroupMemoryBarrierWithGroupSync(); + if( thread < i ) + { + sum += sharedAccumulators[ thread + i ]; + sharedAccumulators[ thread ] = sum; + } + } + + GroupMemoryBarrierWithGroupSync(); + if( 0 == thread ) + sum += sharedAccumulators[ 1 ]; +} + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + uint s0 = hadd( group * arg0Strides.yzw ); + uint s1 = hadd( group.yz * arg1Strides.zw ); + const uint s0End = s0 + arg0Size.x * arg0Strides.x; + const uint s0Inc = THREADS * arg0Strides.x; + const uint s1Inc = THREADS * arg1Strides.x; + + s0 += thread * arg0Strides.x; + s1 += thread * arg1Strides.x; + float dp = 0; + for( ; s0 < s0End; s0 += s0Inc, s1 += s1Inc ) + dp = mad( arg0[ s0 ], arg1[ s1 ], dp ); + + horizontalSum( thread, dp ); + if( 0 != thread ) + return; + + const uint rdi = group.x + hadd( group.yz * resultStrides.zw ); + result[ rdi ] = dp; +}
\ No newline at end of file diff --git a/ComputeShaders/mulMatByRowTiled.hlsl b/ComputeShaders/mulMatByRowTiled.hlsl new file mode 100644 index 0000000..fea2fcb --- /dev/null +++ b/ComputeShaders/mulMatByRowTiled.hlsl @@ -0,0 +1,120 @@ +// Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ] +// Dispatch [ ( E1 + TILE_Y - 1 ) / TILE_Y, E2, E3 ] thread groups of this shader + +#ifndef TILE_Y +static const uint TILE_Y = 64; +#endif +#ifndef THREADS_X +static const uint THREADS_X = 32; +#endif +#ifndef THREADS_Y +static const uint THREADS_Y = 16; +#endif + +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 arg0Size: packoffset( c0 ); + uint4 arg0Strides: packoffset( c1 ); + uint4 arg1Size: packoffset( c2 ); + uint4 arg1Strides: packoffset( c3 ); + uint4 resultSize: packoffset( c4 ); + uint4 resultStrides: packoffset( c5 ); +} + +groupshared float resTemp[ TILE_Y ][ THREADS_X ]; + +inline uint hadd( uint2 vec ) +{ + return vec.x + vec.y; +} + +[ numthreads( THREADS_X, THREADS_Y, 1 ) ] +void main( uint3 group: SV_GroupID, uint3 thread : SV_GroupThreadID, uint threadFlattenned : SV_GroupIndex ) +{ + uint i; + // Zero out the shared buffer + for( i = thread.y; i < TILE_Y; i += THREADS_Y ) + resTemp[ i ][ thread.x ] = 0.0; + GroupMemoryBarrierWithGroupSync(); + + // Count of rows to compute in this thread group + const uint height = min( TILE_Y, arg0Size.y - group.x * TILE_Y ); + + uint s0 = hadd( group.yz * arg0Strides.zw ); //< arg0 layer for the thread group + s0 += group.x * TILE_Y * arg0Strides.y; //< arg0 first row for the thread group + s0 += hadd( arg0Strides.xy * thread.xy ); //< arg0 load index for the thread + + uint s1 = hadd( group.yz * arg1Strides.zw ); //< arg1 layer for the thread group + s1 += thread.x * arg1Strides.x; //< arg1 load index for the thread + + const uint completeTiles = arg0Size.x / THREADS_X; + // Each iteration of that loop loads THREADS_X elements from arg1, + // a block of [ THREADS_X, height ] elements from arg0, + // and accumulates these dot products in the shared buffer + for( uint t = 0; t < completeTiles; t++, s0 += THREADS_X * arg0Strides.x, s1 += THREADS_X * arg1Strides.x ) + { + // Load THREADS_X elements from arg1 + const float v1 = arg1[ s1 ]; + + uint rsi = s0; + for( i = thread.y; i < height; i += THREADS_Y, rsi += arg0Strides.y * THREADS_Y ) + { + // Load THREADS_X elements from arg0 + const float v0 = arg0[ rsi ]; + // Multiply and accumulate in the shared buffer + float acc = resTemp[ i ][ thread.x ]; + acc = mad( v0, v1, acc ); + resTemp[ i ][ thread.x ] = acc; + } + GroupMemoryBarrierWithGroupSync(); + } + + const uint rem = arg0Size.x % THREADS_X; + if( rem != 0 ) + { + // E0 ain't a multiple of THREADS_X, we have a remainder + float v1; + if( thread.x < rem ) + v1 = arg1[ s1 ]; + else + v1 = 0.0; + + for( i = thread.y; i < height; i += THREADS_Y, s0 += arg0Strides.y * THREADS_Y ) + { + if( thread.x >= rem ) + continue; + const float v0 = arg0[ s0 ]; + float acc = resTemp[ i ][ thread.x ]; + acc = mad( v0, v1, acc ); + resTemp[ i ][ thread.x ] = acc; + } + GroupMemoryBarrierWithGroupSync(); + } + + // Now we need horizontal sums of these shared accumulators, i.e. reduce [height][THREADS_X] shared array into [height][1] column + for( i = THREADS_X / 2; i > 0; i /= 2 ) + { + if( thread.x < i ) + { + for( uint j = thread.y; j < height; j += THREADS_Y ) + { + float sum = resTemp[ j ][ thread.x ]; + sum += resTemp[ j ][ thread.x + i ]; + resTemp[ j ][ thread.x ] = sum; + } + } + GroupMemoryBarrierWithGroupSync(); + } + + // And finally, store that column to global memory + if( threadFlattenned >= height ) + return; + + uint rdi = hadd( group.yz * resultStrides.zw ) + group.x * TILE_Y * resultStrides.x; + rdi += threadFlattenned * resultStrides.x; + result[ rdi ] = resTemp[ threadFlattenned ][ 0 ]; +}
\ No newline at end of file diff --git a/ComputeShaders/mulMatByRowTiled64.hlsl b/ComputeShaders/mulMatByRowTiled64.hlsl new file mode 100644 index 0000000..6c63f2d --- /dev/null +++ b/ComputeShaders/mulMatByRowTiled64.hlsl @@ -0,0 +1,4 @@ +#define THREADS_Y 32 +#define THREADS_X 32 +#define TILE_Y 128 +#include "mulMatByRowTiled.hlsl"
\ No newline at end of file diff --git a/ComputeShaders/mulMatByRowTiledEx.hlsl b/ComputeShaders/mulMatByRowTiledEx.hlsl new file mode 100644 index 0000000..d377b8c --- /dev/null +++ b/ComputeShaders/mulMatByRowTiledEx.hlsl @@ -0,0 +1,156 @@ +// matrix*row vector product, needs first argument reshaped into a sequence of horizontal column major panels +#ifndef TILE_SIZE +static const uint TILE_SIZE = 32; +#endif +#ifndef TILE_HEIGHT +static const uint TILE_HEIGHT = 32; +#endif +#ifndef THREADS_Y +static const uint THREADS_Y = 16; +#endif + +// First tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ] +Buffer<float> arg0: register( t0 ); +// Second tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ] +Buffer<float> arg1: register( t1 ); +// FP32 output tensor, row major and continuous +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 arg0Size: packoffset( c0 ); + uint arg0panel: packoffset( c1.y ); + uint2 arg0LayerStrides: packoffset( c1.z ); + // uint4 arg1Size: packoffset( c2 ); + uint4 arg1Strides: packoffset( c3 ); + uint4 resultSize: packoffset( c4 ); + uint4 resultStrides: packoffset( c5 ); +} + +groupshared float tileOutput[ THREADS_Y ][ TILE_SIZE ]; +groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ]; +groupshared float tile1[ TILE_HEIGHT ]; + +void multiplyTiles( const uint3 thread ) +{ + float r = 0.0; + for( uint i = thread.y; i < TILE_HEIGHT; i += THREADS_Y ) + { + float a = tile0[ i ][ thread.x ]; + float b = tile1[ i ]; + r = mad( a, b, r ); + } + tileOutput[ thread.y ][ thread.x ] += r; +} + +void reduceOutput( const uint3 thread ) +{ + float curr = 0.0; + [branch] + if( thread.y < THREADS_Y / 2 ) + curr = tileOutput[ thread.y ][ thread.x ]; + + for( uint i = THREADS_Y / 2; i > 0; i /= 2 ) + { + [branch] + if( thread.y < i ) + { + curr += tileOutput[ thread.y + i ][ thread.x ]; + tileOutput[ thread.y ][ thread.x ] = curr; + } + GroupMemoryBarrierWithGroupSync(); + } +} + +void storeTile( const uint threadFlat, const uint4 pos, const uint size ) +{ + if( threadFlat >= size ) + return; + const uint4 prod4 = pos * resultStrides; + const uint2 prod2 = prod4.xy + prod4.zw; + uint rdi = prod2.x + prod2.y; + result[ rdi + threadFlat ] = tileOutput[ 0 ][ threadFlat ]; +} + +[ numthreads( TILE_SIZE, THREADS_Y, 1 ) ] +void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID, uint threadFlat : SV_GroupIndex ) +{ + uint i; + // Zero all 3 shared buffers + tileOutput[ thread.y ][ thread.x ] = 0.0; + for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y ) + tile0[ i ][ thread.x ] = 0.0; + if( threadFlat < THREADS_Y ) + tile1[ threadFlat ] = 0.0; + + const uint2 layer = group.yz; + uint rsi0 = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y; + uint rsi1 = layer.x * arg1Strides.z + layer.y * arg1Strides.w; + + const uint threadOffset = thread.y * TILE_SIZE + thread.x; + rsi0 += threadOffset; + rsi1 += threadFlat * arg1Strides.x; + + const uint completeTiles = arg0Size.x / TILE_HEIGHT; + for( i = 0; i < completeTiles; i++ ) + { + // Load [ TILE_SIZE, TILE_HEIGHT ] block from the first source tensor into the groupshared buffer + for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y ) + { + tile0[ j ][ thread.x ] = arg0[ rsi0 ]; + rsi0 += THREADS_Y * TILE_SIZE; + } + // Load [ TILE_HEIGHT ] row from the second source into another groupshared buffer + [ branch ] + if( threadFlat < TILE_HEIGHT ) + tile1[ threadFlat ] = arg1[ rsi1 ]; + rsi1 += TILE_HEIGHT * arg1Strides.x; + + GroupMemoryBarrierWithGroupSync(); + + multiplyTiles( thread ); + + GroupMemoryBarrierWithGroupSync(); + } + + const uint rem = arg0Size.x % TILE_HEIGHT; + if( rem != 0 ) + { + for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y ) + { + float a; + [branch] + if( j < rem ) + { + a = arg0[ rsi0 ]; + rsi0 += THREADS_Y * TILE_SIZE; + } + else + a = 0.0; + tile0[ j ][ thread.x ] = a; + } + + if( threadFlat < TILE_HEIGHT ) + { + float b; + [branch] + if( threadFlat < rem ) + b = arg1[ rsi1 ]; + else + b = 0.0; + tile1[ threadFlat ] = b; + } + + GroupMemoryBarrierWithGroupSync(); + + multiplyTiles( thread ); + + GroupMemoryBarrierWithGroupSync(); + } + + reduceOutput( thread ); + + const uint resultPos = group.x * TILE_SIZE; + const uint outputSize = min( TILE_SIZE, resultSize.x - resultPos ); + storeTile( threadFlat, uint4( resultPos, 0, layer ), outputSize ); +}
\ No newline at end of file diff --git a/ComputeShaders/mulMatByScalar.hlsl b/ComputeShaders/mulMatByScalar.hlsl new file mode 100644 index 0000000..82df5d4 --- /dev/null +++ b/ComputeShaders/mulMatByScalar.hlsl @@ -0,0 +1,41 @@ +// Matrix * scalar product, like [ 1, E1, E2, E3 ] * [ 1, 1, E2, E3 ] = [ E1, 1, E2, E3 ] +// Dispatch [ E2, E3, 1 ] thread groups of this shader +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 arg0Size: packoffset( c0 ); + uint4 arg0Strides: packoffset( c1 ); + uint4 arg1Size: packoffset( c2 ); + uint4 arg1Strides: packoffset( c3 ); + uint4 resultSize: packoffset( c4 ); + uint4 resultStrides: packoffset( c5 ); +} + +inline uint hadd( uint2 vec ) +{ + return vec.x + vec.y; +} + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const float scalarValue = arg1[ hadd( group.xy * arg1Strides.zw ) ]; + + uint s0 = hadd( group.xy * arg0Strides.zw ); + const uint s0Inc = 32 * arg0Strides.y; + s0 += thread * arg0Strides.y; + + uint rdi = hadd( group.xy * resultStrides.zw ); + const uint rdiEnd = rdi + arg0Size.y; + rdi += thread; + + for( ; rdi < rdiEnd; rdi += 32, s0 += s0Inc ) + { + float f = arg0[ s0 ]; + f *= scalarValue; + result[ rdi ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/mulMatDotMain.hlsl b/ComputeShaders/mulMatDotMain.hlsl new file mode 100644 index 0000000..47c6d3e --- /dev/null +++ b/ComputeShaders/mulMatDotMain.hlsl @@ -0,0 +1,95 @@ +// GGML_TASK_COMPUTE step for matrix*matrix product, where nb01 >= nb00; +// Dispatch with [ ne11, ne01*ne02*ne03 ] thread groups +// Each thread group computes a single dot product +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + uint4 src1_elements: packoffset( c2 ); + uint4 result_elements: packoffset( c4 ); + uint4 result_strides: packoffset( c5 ); +} + +inline uint product( uint3 vec ) +{ + return vec.x * vec.y * vec.z; +} + +inline uint product( uint4 vec ) +{ + uint2 tmp = vec.xy * vec.zw; + return tmp.x * tmp.y; +} + +inline float dotProductInner( uint i0, uint i1, uint length, uint thread ) +{ + float res = 0; + for( uint i = thread; i < length; i += 32 ) + res = mad( arg0[ i0 + i ], arg1[ i1 + i ], res ); + return res; +} + +#include "groupReduce.hlsli" + +[numthreads( 32, 1, 1 )] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint ne00 = src0_elements.x; + const uint ne01 = src0_elements.y; + const uint ne02 = src0_elements.z; + const uint ne03 = src0_elements.w; + + const uint ne10 = src1_elements.x; + const uint ne11 = src1_elements.y; + const uint ne12 = src1_elements.z; + const uint ne13 = src1_elements.w; + + const int nb00 = src0_strides.x; + const int nb01 = src0_strides.y; + const int nb02 = src0_strides.z; + const int nb03 = src0_strides.w; + + // total rows in src0 + // const int nr = ne01*ne02*ne03; + const uint nr = product( src0_elements.yzw ); + + const uint ir = group.y; + + // src0 indices + const uint i03 = ir / ( ne02 * ne01 ); + const uint i02 = ( ir - i03 * ne02 * ne01 ) / ne01; + const uint i01 = ( ir - i03 * ne02 * ne01 - i02 * ne01 ); + + const uint i13 = i03; + const uint i12 = i02; + + const uint i0 = i01; + const uint i2 = i02; + const uint i3 = i03; + + // src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + // src1_col = wdata + ( i13 * ne12 * ne11 + i12 * ne11 + 0 ) * ne00; + const uint src0_row = i01 * nb01 + i02 * nb02 + i03 * nb03; + const uint src1_col = ( i13 * ne12 * ne11 + i12 * ne11 ) * ne00; + + const uint ic = group.x; + float curr = dotProductInner( src0_row, src1_col + ic * ne00, ne00, thread ); + horizontalSumCompatNew( thread, curr ); + + if( 0 != thread ) + return; + + const uint nb0 = result_strides.x; + const uint nb1 = result_strides.y; + const uint nb2 = result_strides.z; + const uint nb3 = result_strides.w; + + const uint ne0 = result_elements.x; + // float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + const uint dst_col = i0 * nb0 + i2 * nb2 + i3 * nb3; + result[ dst_col + ic * ne0 ] = curr; +}
\ No newline at end of file diff --git a/ComputeShaders/mulMatDotReshape.hlsl b/ComputeShaders/mulMatDotReshape.hlsl new file mode 100644 index 0000000..ffb6f83 --- /dev/null +++ b/ComputeShaders/mulMatDotReshape.hlsl @@ -0,0 +1,33 @@ +// GGML_TASK_INIT step for matrix*matrix product, where nb01 >= nb00; +// Dispatch with [ ne11, ne12 ] groups +Buffer<float> arg0: register( t0 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); +} + +#include "miscUtils.hlsli" + +// Each thread group of this shader copies a single rows of the matrix +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint i12 = group.y; + const uint i11 = group.x; + const uint ne10 = src0_elements.x; + const uint ne11 = src0_elements.y; + const uint nb12 = src0_strides.z; + const uint nb11 = src0_strides.y; + + uint rdi = i11 * ne10 + i12 * ne10 * ne11; + const uint rdiEnd = rdi + ne10; + uint rsi = i12 * nb12 + i11 * nb11; + rdi += thread; + rsi += thread; + + for( ; rdi < rdiEnd; rdi += 32, rsi += 32 ) + result[ rdi ] = adjustFp16( arg0[ rsi ] ); +}
\ No newline at end of file diff --git a/ComputeShaders/mulMatMadMain.hlsl b/ComputeShaders/mulMatMadMain.hlsl new file mode 100644 index 0000000..0bd5753 --- /dev/null +++ b/ComputeShaders/mulMatMadMain.hlsl @@ -0,0 +1,154 @@ +// GGML_TASK_COMPUTE step for matrix*matrix product, where nb01 < nb00 +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> resultTensor: register( u0 ); +RWBuffer<float> tempBuffer: register( u1 ); + +cbuffer Constants: register( b0 ) +{ + uint4 aSize: packoffset( c0 ); + uint4 aStride: packoffset( c1 ); + uint4 bSize: packoffset( c2 ); + uint4 bStride: packoffset( c3 ); + uint4 resSize: packoffset( c4 ); + bool resultFp16 : packoffset( c5.x ); + uint ne: packoffset( c5.y ); +} + +#include "miscUtils.hlsli" + +// tempBuffer[ rdi .. ] = 0.0 +inline void writeTempZeros( uint rdi, const uint len, const uint thread ) +{ + const uint rdiEnd = rdi + len; + for( rdi += thread; rdi < rdiEnd; rdi += 32 ) + tempBuffer[ rdi ] = 0.0; +} + +// tempBuffer[ rdi .. ] += mul * arg0[ rsi .. ] +inline void vectorMad( uint rsi, uint rdi, const uint len, const float mul, const uint thread ) +{ + const uint rsiEnd = rsi + len; + rsi += thread; + rdi += thread; + for( ; rsi < rsiEnd; rsi += 32, rdi += 32 ) + { + float f = tempBuffer[ rdi ]; + f = mad( mul, arg0[ rsi ], f ); + [branch] + if( resultFp16 ) + f = adjustFp16( f ); + tempBuffer[ rdi ] = f; + } +} + +// resultTensor[ rdi .. ] = tempBuffer[ rsi .. ] +inline void copyRow( uint rsi, uint rdi, const uint len, const uint thread ) +{ + const uint rsiEnd = rsi + len; + rsi += thread; + rdi += thread; + for( ; rsi < rsiEnd; rsi += 32, rdi += 32 ) + { + float f = tempBuffer[ rsi ]; + resultTensor[ rdi ] = f; + } +} + +// resultTensor[ rdi .. ] += tempBuffer[ rsi .. ] +inline void addRow( uint rsi, uint rdi, const uint len, const uint thread ) +{ + const uint rsiEnd = rsi + len; + rsi += thread; + rdi += thread; + for( ; rsi < rsiEnd; rsi += 32, rdi += 32 ) + { + float f = resultTensor[ rdi ]; + f += tempBuffer[ rsi ]; + resultTensor[ rdi ] = f; + } +} + +[numthreads( 32, 1, 1 )] +void main( const uint3 group: SV_GroupID, const uint thread : SV_GroupIndex ) +{ + const uint i1 = group[ 0 ]; + const uint i2 = group[ 1 ]; + const uint i3 = group[ 2 ]; + + const uint ne00 = aSize[ 0 ]; + const uint ne01 = aSize[ 1 ]; + const uint ne02 = aSize[ 2 ]; + const uint ne03 = aSize[ 3 ]; + + const uint ne10 = bSize[ 0 ]; + const uint ne11 = bSize[ 1 ]; + const uint ne12 = bSize[ 2 ]; + const uint ne13 = bSize[ 3 ]; + + const uint ne0 = resSize[ 0 ]; + const uint ne1 = resSize[ 1 ]; + const uint ne2 = resSize[ 2 ]; + const uint ne3 = resSize[ 3 ]; + + const uint nb00 = aStride[ 0 ]; + const uint nb01 = aStride[ 1 ]; + const uint nb02 = aStride[ 2 ]; + const uint nb03 = aStride[ 3 ]; + + const uint nb10 = bStride[ 0 ]; + const uint nb11 = bStride[ 1 ]; + const uint nb12 = bStride[ 2 ]; + const uint nb13 = bStride[ 3 ]; + + // dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; + const uint tempRowThread0 = i3 * ne2 * ne1 * ne0 + i2 * ne1 * ne0 + i1 * ne0; + + // Faking 4 CPU threads trying to achieve bitwise compatibility with the CPU version + const uint nth = 4; + + // GGML_TASK_COMPUTE + { + // src0_col = src0->data + ( i00 * nb00 + i02 * nb02 + i03 * nb03 ); + const uint aBase = i2 * nb02 + i3 * nb03; + // src1_val = * (float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + const uint bBase = i1 * nb11 + i2 * nb12 + i3 * nb13; + + // total columns in src1 + const uint nc = ne10; + // columns per thread + const uint dc = ( nc + nth - 1 ) / nth; + + uint tempRow = tempRowThread0; + for( uint ith = 0; ith < nth; ith++, tempRow += ne ) + { + writeTempZeros( tempRow, ne01, thread ); + + // column range for this thread + const uint ic0 = dc * ith; + const uint ic1 = min( ic0 + dc, nc ); + + for( uint ic = ic0; ic < ic1; ic++ ) + { + const uint idxA = aBase + ic * aStride[ 0 ]; + const uint idxB = bBase + ic * bStride[ 0 ]; + const float bValue = arg1[ idxB ]; + vectorMad( idxA, tempRow, ne01, bValue, thread ); + } + } + } + + // GGML_TASK_FINALIZE + { + const uint rdi = tempRowThread0; + // const uint rdi = i1 * resSize[ 0 ] + i2 * resSize[ 0 ] * resSize[ 1 ] + i3 * resSize[ 0 ] * resSize[ 1 ] * resSize[ 2 ]; + // const uint rdi = ( ( i3 * resSize[ 2 ] + i2 ) * resSize[ 1 ] + i1 ) * resSize[ 0 ]; + + uint tempRow = tempRowThread0; + copyRow( tempRow, rdi, ne01, thread ); + + tempRow += ne; + for( uint ith = 1; ith < nth; ith++, tempRow += ne ) + addRow( tempRow, rdi, ne01, thread ); + } +}
\ No newline at end of file diff --git a/ComputeShaders/mulMatTiled.hlsl b/ComputeShaders/mulMatTiled.hlsl new file mode 100644 index 0000000..7e4d7d8 --- /dev/null +++ b/ComputeShaders/mulMatTiled.hlsl @@ -0,0 +1,236 @@ +// This compute shader implements matrix*matrix product, using tiling and other tricks to improve the performance +#ifndef TILE_SIZE +static const uint TILE_SIZE = 32; +#endif + +#ifndef THREADS_Y +// Performance measures on Ryzen 7 5700G iGPU, the time is just for this shader: +// 1 (32 threads per group) - 17.1 seconds, 2 - 9.02424 seconds, 4 - 6.95762 seconds, 6 - 6.79011 seconds, 8 - 6.67279 seconds, 10 - 6.9456 seconds, 16 - 7.20502 seconds +// On nVidia, 8 is also the fastest option. +static const uint THREADS_Y = 8; +#endif + +#ifndef STREAM_SECOND_MATRIX +#define STREAM_SECOND_MATRIX 0 +#endif + +#ifndef LOAD_ORDER + +// Load with coalesced loads from global memory whenever possible, store into groupshared buffer with random stores +// #define LOAD_ORDER bool2( ( 1 == arg0Strides[ 0 ] ) || ( 1 != arg0Strides[ 1 ] ), ( 1 == arg1Strides[ 0 ] ) || ( 1 != arg1Strides[ 1 ] ) ) + +// Load with random loads from global memory, store into groupshared buffer with coalesced stores +// On my AMD iGPU inside Ryzen 7 5700G, there's whopping 15% performance win with that tactics, from 6.67 to 5.66 seconds for this shader. +// My nVidia GPU does about the same +#define LOAD_ORDER bool2( false, true ) + +#endif + +Buffer<float> arg0: register( t0 ); +Buffer<float> arg1: register( t1 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 arg0Size: packoffset( c0 ); + uint4 arg0Strides: packoffset( c1 ); + uint4 arg1Strides: packoffset( c3 ); + uint4 resultSize: packoffset( c4 ); + uint4 resultStrides: packoffset( c5 ); +} + +groupshared float tile0[ TILE_SIZE ][ TILE_SIZE ]; +#if !STREAM_SECOND_MATRIX +groupshared float tile1[ TILE_SIZE ][ TILE_SIZE ]; +#endif +groupshared float resTemp[ TILE_SIZE ][ TILE_SIZE ]; + +#if STREAM_SECOND_MATRIX +void multiplyTiles( uint rsi, const uint3 thread, const uint w, const uint h ) +{ + for( uint i = thread.y; i < h; i += THREADS_Y, rsi += THREADS_Y * arg1Strides.y ) + { + float r = 0; + uint rsiRow = rsi; + for( uint j = 0; j < w; j++, rsiRow += arg1Strides.x ) + { + // One TILE_SIZE * 4 bytes coalesced load, broadcasted into THREADS_Y copies + const float s0 = tile0[ j ][ thread.x ]; + // THREADS_Y broadcasts from global memory, each one is 4 bytes broadcasted into TILE_SIZE copies + const float s1 = arg1[ rsiRow ]; + // Multiply and accumulate + r = mad( s0, s1, r ); + } + // Accumulate into the output tile + // THREADS_Y * 128 bytes coalesced loads and stores + resTemp[ i ][ thread.x ] += r; + } +} +#else +// Compute resTemp += tile0 * tile1, for TILE_SIZE^2 square matrices +// The group size is TILE_SIZE*THREADS_Y threads in this shader +void multiplyTiles( const uint3 thread ) +{ + for( uint i = thread.y; i < TILE_SIZE; i += THREADS_Y ) + { + float r = 0; + for( uint j = 0; j < TILE_SIZE; j++ ) + { + // One TILE_SIZE * 4 bytes coalesced load, broadcasted into THREADS_Y copies + const float s0 = tile0[ j ][ thread.x ]; + // THREADS_Y broadcasts, each one is 4 bytes broadcasted into TILE_SIZE copies + const float s1 = tile1[ i ][ j ]; + // Multiply and accumulate + r = mad( s0, s1, r ); + } + // Accumulate into the output tile + // THREADS_Y * 128 bytes coalesced loads and stores + resTemp[ i ][ thread.x ] += r; + } +} +#endif + +// Note we transposed these tiles while loading +void loadTile0( uint rsi, const uint3 thread, const uint w, const uint h, const bool rowMajor ) +{ + uint i; + if( rowMajor ) + { + rsi += arg0Strides.y * thread.y; + for( i = thread.y; i < h; i += THREADS_Y, rsi += arg0Strides.y * THREADS_Y ) + { + if( thread.x < w ) + tile0[ thread.x ][ i ] = arg0[ rsi + thread.x * arg0Strides.x ]; + else + tile0[ thread.x ][ i ] = 0.0; + } + } + else + { + // Unlike width which is smaller for the last tile, the height is always the same, and all these tiles are zero-initialized + if( thread.x >= h ) + return; + + rsi += arg0Strides.x * thread.y; + for( i = thread.y; i < w; i += THREADS_Y, rsi += arg0Strides.x * THREADS_Y ) + tile0[ i ][ thread.x ] = arg0[ rsi + thread.x * arg0Strides.y ]; + + if( i >= TILE_SIZE ) + return; + for( ; i < TILE_SIZE; i += THREADS_Y ) + tile0[ i ][ thread.x ] = 0.0; + } +} + +#if !STREAM_SECOND_MATRIX +void loadTile1( uint rsi, const uint3 thread, const uint w, const uint h, const bool rowMajor ) +{ + uint i; + if( rowMajor ) + { + rsi += thread.y * arg1Strides.y; + + for( i = thread.y; i < h; i += THREADS_Y, rsi += arg1Strides.y * THREADS_Y ) + { + if( thread.x < w ) + tile1[ i ][ thread.x ] = arg1[ rsi + thread.x * arg1Strides.x ]; + else + tile1[ i ][ thread.x ] = 0.0; + } + } + else + { + // Unlike width which is smaller for the last tile, the height is always the same, and all these tiles are zero-initialized + if( thread.x >= h ) + return; + + rsi += thread.y * arg1Strides.x; + for( i = thread.y; i < w; i += THREADS_Y, rsi += arg1Strides.x * THREADS_Y ) + tile1[ thread.x ][ i ] = arg1[ rsi + thread.x * arg0Strides.y ]; + if( i >= TILE_SIZE ) + return; + for( ; i < TILE_SIZE; i += THREADS_Y ) + tile1[ thread.x ][ i ] = 0.0; + } +} +#endif + +void storeTile( const uint3 thread, const uint4 pos, const uint2 size ) +{ + if( thread.x >= size.x ) + return; + const uint4 prod4 = pos * resultStrides; + const uint2 prod2 = prod4.xy + prod4.zw; + uint rdi = prod2.x + prod2.y; + rdi += resultStrides.y * thread.y; + for( uint i = thread.y; i < size.y; i += THREADS_Y, rdi += resultStrides.y * THREADS_Y ) + result[ rdi + thread.x * resultStrides.x ] = resTemp[ i ][ thread.x ]; +} + +[ numthreads( TILE_SIZE, THREADS_Y, 1 ) ] +void main( uint3 group: SV_GroupID, uint3 thread : SV_GroupThreadID ) +{ + // Zero out these shared buffers + for( uint i = 0; i < TILE_SIZE; i += THREADS_Y ) + { + tile0[ i + thread.y ][ thread.x ] = 0.0; +#if !STREAM_SECOND_MATRIX + tile1[ i + thread.y ][ thread.x ] = 0.0; +#endif + resTemp[ i + thread.y ][ thread.x ] = 0.0; + } + + const uint2 resultPos = group.xy * TILE_SIZE; + const uint2 layer = uint2( group.z % resultSize.z, group.z / resultSize.z ); + uint rsi0 = resultPos.x * arg0Strides.y + layer.x * arg0Strides.z + layer.y * arg0Strides.w; + uint rsi1 = resultPos.y * arg1Strides.y + layer.x * arg1Strides.z + layer.y * arg1Strides.w; + + const uint rsi0Inc = TILE_SIZE * arg0Strides.x; + const uint rsi1Inc = TILE_SIZE * arg1Strides.x; + + const uint completeTiles = arg0Size.x / TILE_SIZE; + const uint rsi0AndAligned = rsi0 + rsi0Inc * completeTiles; + // Output tile size + // Normally TILE_SIZE^2, less than that for the tiles at the right and bottom edges of the output matrix + const uint2 outputSize = min( TILE_SIZE, resultSize.xy - resultPos ); + + const bool2 loadOrder = LOAD_ORDER; + +#if STREAM_SECOND_MATRIX + rsi1 += thread.y * arg1Strides.y; +#endif + for( ; rsi0 < rsi0AndAligned; rsi0 += rsi0Inc, rsi1 += rsi1Inc ) + { + loadTile0( rsi0, thread, TILE_SIZE, outputSize.x, loadOrder.x ); +#if STREAM_SECOND_MATRIX + GroupMemoryBarrierWithGroupSync(); + multiplyTiles( rsi1, thread, TILE_SIZE, outputSize.y ); +#else + loadTile1( rsi1, thread, TILE_SIZE, outputSize.y, loadOrder.y ); + GroupMemoryBarrierWithGroupSync(); + multiplyTiles( thread ); +#endif + // Need one moar barrier here. + // Otherwise, some threads of the group are loading the next tile into tile0/tile1 groupshared buffers on the next iteration of the loop, + // while other threads of the same group are still computing the matrix product, and getting incorrect values from that groupshared buffer. + // The missing barrier only caused a bug on AMD, and only with "ggml-large.bin" model; no idea why that is. + GroupMemoryBarrierWithGroupSync(); + } + + const uint rem = arg0Size.x % TILE_SIZE; + if( 0 != rem ) + { + loadTile0( rsi0, thread, rem, outputSize.x, loadOrder.x ); +#if STREAM_SECOND_MATRIX + GroupMemoryBarrierWithGroupSync(); + multiplyTiles( rsi1, thread, rem, outputSize.y ); +#else + loadTile1( rsi1, thread, rem, outputSize.y, loadOrder.y ); + GroupMemoryBarrierWithGroupSync(); + multiplyTiles( thread ); +#endif + } + + GroupMemoryBarrierWithGroupSync(); + storeTile( thread, uint4( resultPos, layer ), outputSize ); +}
\ No newline at end of file diff --git a/ComputeShaders/mulMatTiled64.hlsl b/ComputeShaders/mulMatTiled64.hlsl new file mode 100644 index 0000000..45d77b1 --- /dev/null +++ b/ComputeShaders/mulMatTiled64.hlsl @@ -0,0 +1,3 @@ +#define TILE_SIZE 64 +#define STREAM_SECOND_MATRIX 1 +#include "mulMatTiled.hlsl"
\ No newline at end of file diff --git a/ComputeShaders/mulMatTiledEx.hlsl b/ComputeShaders/mulMatTiledEx.hlsl new file mode 100644 index 0000000..0f23da2 --- /dev/null +++ b/ComputeShaders/mulMatTiledEx.hlsl @@ -0,0 +1,194 @@ +// This compute shader implements yet another version of matrix*matrix product +// For optimal VRAM access pattern, it requires both arguments to be reshaped into a sequence of horizontal column major panels. +// The panel height is TILE_SIZE, and the last panel of the matrix needs to be padded with zeros; see matReshapePanels.hlsl shader for the reshaping. +// So far, it's only used when running on AMD GPUs. +#ifndef TILE_SIZE +static const uint TILE_SIZE = 32; +#endif +#ifndef TILE_HEIGHT +static const uint TILE_HEIGHT = 32; +#endif +#ifndef THREADS_Y +static const uint THREADS_Y = 16; +#endif + +#ifndef STREAM_SECOND_MATRIX +#define STREAM_SECOND_MATRIX 1 +#endif + +// First tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ] +Buffer<float> arg0: register( t0 ); +// Second tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ] +Buffer<float> arg1: register( t1 ); +// FP32 output tensor, row major and continuous +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 arg0Size: packoffset( c0 ); + uint arg0panel: packoffset( c1.y ); + uint2 arg0LayerStrides: packoffset( c1.z ); + + // uint4 arg1Size: packoffset( c2 ); + uint arg1panel: packoffset( c3.y ); + uint2 arg1LayerStrides: packoffset( c3.z ); + + uint4 resultSize: packoffset( c4 ); + uint4 resultStrides: packoffset( c5 ); +} + +// Accumulator for the output tile +// That last `+1` helps a bit, I'm not sure why exactly but probebly because memory bank conflicts. +groupshared float tileOutput[ TILE_SIZE ][ TILE_SIZE + 1 ]; +// A smaller tile loaded from the first source matrix +groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ]; +#if !STREAM_SECOND_MATRIX +// A smaller tile loaded from the second source matrix +groupshared float tile1[ TILE_HEIGHT ][ TILE_SIZE ]; +#endif + +#if STREAM_SECOND_MATRIX +void multiplyTiles( const uint3 thread, uint rsi, const uint h ) +{ + uint2 i = uint2( thread.y, rsi ); + const uint2 iInc = uint2( THREADS_Y, THREADS_Y ); + for( ; i.x < TILE_SIZE; i += iInc ) + { + float r = 0.0; + uint2 j = uint2( 0, i.y ); + const uint2 jInc = uint2( 1, TILE_SIZE ); + for( ; j.x < h; j += jInc ) + { + float a = tile0[ j.x ][ thread.x ]; + float b = arg1[ j.y ]; + r = mad( a, b, r ); + } + tileOutput[ i.x ][ thread.x ] += r; + } +} +#else +void multiplyTiles( const uint3 thread ) +{ + for( uint row = thread.y; row < TILE_SIZE; row += THREADS_Y ) + { + float r = 0.0; + for( uint j = 0; j < TILE_HEIGHT; j++ ) + { + float a = tile0[ j ][ thread.x ]; + float b = tile1[ j ][ row ]; + r = mad( a, b, r ); + } + tileOutput[ row ][ thread.x ] += r; + } +} +#endif + +void storeTile( const uint3 thread, const uint4 pos, const uint2 size ) +{ + if( thread.x >= size.x ) + return; + const uint4 prod4 = pos * resultStrides; + const uint2 prod2 = prod4.xy + prod4.zw; + uint rdi = prod2.x + prod2.y; + rdi += resultStrides.y * thread.y; + rdi += thread.x; + for( uint i = thread.y; i < size.y; i += THREADS_Y, rdi += resultStrides.y * THREADS_Y ) + result[ rdi ] = tileOutput[ i ][ thread.x ]; +} + +[numthreads( TILE_SIZE, THREADS_Y, 1 )] +void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID ) +{ + uint i; + // Zero all 3 shared buffers + for( i = thread.y; i < TILE_SIZE; i += THREADS_Y ) + tileOutput[ i ][ thread.x ] = 0.0; + for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y ) + { + tile0[ i ][ thread.x ] = 0.0; +#if !STREAM_SECOND_MATRIX + tile1[ i ][ thread.x ] = 0.0; +#endif + } + + const uint2 layer = uint2( group.z % resultSize.z, group.z / resultSize.z ); + + uint rsi0 = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y; + uint rsi1 = group.y * arg1panel + layer.x * arg1LayerStrides.x + layer.y * arg1LayerStrides.y; + + const uint threadOffset = thread.y * TILE_SIZE + thread.x; + rsi0 += threadOffset; +#if STREAM_SECOND_MATRIX + rsi1 += thread.y; +#else + rsi1 += threadOffset; +#endif + + const uint completeTiles = arg0Size.x / TILE_HEIGHT; + for( i = 0; i < completeTiles; i++ ) + { + // Load [ TILE_SIZE, TILE_HEIGHT ] block from both source tensors into these groupshared buffers + for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y ) + { + tile0[ j ][ thread.x ] = arg0[ rsi0 ]; + rsi0 += THREADS_Y * TILE_SIZE; +#if !STREAM_SECOND_MATRIX + tile1[ j ][ thread.x ] = arg1[ rsi1 ]; + rsi1 += THREADS_Y * TILE_SIZE; +#endif + } + + // Wait for all threads in the group to complete these loads + GroupMemoryBarrierWithGroupSync(); + +#if STREAM_SECOND_MATRIX + multiplyTiles( thread, rsi1, TILE_HEIGHT ); + rsi1 += TILE_HEIGHT * TILE_SIZE; +#else + // Multiply + accumulate the elements collected in the groupshared buffers + multiplyTiles( thread ); +#endif + GroupMemoryBarrierWithGroupSync(); + } + + const uint rem = arg0Size.x % TILE_HEIGHT; + if( rem != 0 ) + { + // Load [ TILE_SIZE, rem ] block from both source tensors, and zero out the padding elements + for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y ) + { + [branch] + if( j < rem ) + { + tile0[ j ][ thread.x ] = arg0[ rsi0 ]; + rsi0 += THREADS_Y * TILE_SIZE; +#if !STREAM_SECOND_MATRIX + tile1[ j ][ thread.x ] = arg1[ rsi1 ]; + rsi1 += THREADS_Y * TILE_SIZE; +#endif + } + else + { + tile0[ j ][ thread.x ] = 0.0; +#if !STREAM_SECOND_MATRIX + tile1[ j ][ thread.x ] = 0.0; +#endif + } + } + + // Wait for all threads in the group to complete these loads + GroupMemoryBarrierWithGroupSync(); + + // Multiply + accumulate the elements collected in the groupshared buffers +#if STREAM_SECOND_MATRIX + multiplyTiles( thread, rsi1, rem ); +#else + multiplyTiles( thread ); +#endif + GroupMemoryBarrierWithGroupSync(); + } + + const uint2 resultPos = group.xy * TILE_SIZE; + const uint2 outputSize = min( TILE_SIZE, resultSize.xy - resultPos ); + storeTile( thread, uint4( resultPos, layer ), outputSize ); +}
\ No newline at end of file diff --git a/ComputeShaders/norm.hlsl b/ComputeShaders/norm.hlsl new file mode 100644 index 0000000..eeb82b7 --- /dev/null +++ b/ComputeShaders/norm.hlsl @@ -0,0 +1,86 @@ +// Ported from ggml_compute_forward_norm_f32 +// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader +Buffer<float> arg0: register( t0 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + uint4 result_strides: packoffset( c3 ); +} + +static const float eps = 1e-5f; // TODO: make this a parameter + +#include "groupReduce.hlsli" + +float computeVectorSum( uint i, const uint length, const uint thread ) +{ + float res = 0.0; + + const uint iEnd = i + length; + i += thread; + for( ; i < iEnd; i += 32 ) + res += arg0[ i ]; + + horizontalSumBroadcast( thread, res ); + return res; +} + +float offsetAndComputeSumSquares( uint rsi, uint rdi, const float mean, const uint length, const uint thread ) +{ + float sum2 = 0.0; + + const uint rsiEnd = rsi + length; + rsi += thread; + rdi += thread; + for( ; rsi < rsiEnd; rsi += 32, rdi += 32 ) + { + float v = arg0[ rsi ] - mean; + result[ rdi ] = v; + sum2 = mad( v, v, sum2 ); + } + + horizontalSumBroadcast( thread, sum2 ); + return sum2; +} + +void scaleVector( uint rdi, const float scale, const uint length, const uint thread ) +{ + const uint rdiEnd = rdi + length; + for( rdi += thread; rdi < rdiEnd; rdi += 32 ) + { + float f = result[ rdi ]; + f *= scale; + result[ rdi ] = f; + } +} + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint i03 = group.z; + const uint i02 = group.y; + const uint i01 = group.x; + + const uint nb01 = src0_strides[ 1 ]; + const uint nb02 = src0_strides[ 2 ]; + const uint nb03 = src0_strides[ 3 ]; + + const uint p = i01 * nb01 + i02 * nb02 + i03 * nb03; + + const uint ne00 = src0_elements[ 0 ]; + + float mean = computeVectorSum( p, ne00, thread ); + mean /= (float)(int)ne00; + + const uint nb1 = result_strides[ 1 ]; + const uint nb2 = result_strides[ 2 ]; + const uint nb3 = result_strides[ 3 ]; + const uint y = i01 * nb1 + i02 * nb2 + i03 * nb3; + + float sum2 = offsetAndComputeSumSquares( p, y, mean, ne00, thread ); + const float scale = 1.0 / sqrt( sum2 / (float)(int)ne00 + eps ); + + scaleVector( y, scale, ne00, thread ); +}
\ No newline at end of file diff --git a/ComputeShaders/normCompat.hlsl b/ComputeShaders/normCompat.hlsl new file mode 100644 index 0000000..23e1228 --- /dev/null +++ b/ComputeShaders/normCompat.hlsl @@ -0,0 +1,82 @@ +// Ported from ggml_compute_forward_norm_f32 +// Dispatch [ ( ne01 + 31 ) / 32, ne02, ne03 ] thread groups of this shader +Buffer<float> arg0: register( t0 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + uint4 result_strides: packoffset( c3 ); +} + +static const double eps = 1e-5; // TODO: make this a parameter + +#include "groupReduce.hlsli" + +double computeVectorSum( uint i, const uint length ) +{ + double res = 0.0; + const uint iEnd = i + length; + for( ; i < iEnd; i++ ) + res += arg0[ i ]; + return res; +} + +double offsetAndComputeSumSquares( uint rsi, uint rdi, const double mean, const uint length ) +{ + precise double sum2 = 0.0; + const uint rsiEnd = rsi + length; + for( ; rsi < rsiEnd; rsi++, rdi++ ) + { + double v = arg0[ rsi ]; + v -= mean; + result[ rdi ] = (float)v; + double prod = v * v; + sum2 += prod; + } + return sum2; +} + +void scaleVector( uint rdi, const float scale, const uint length ) +{ + const uint rdiEnd = rdi + length; + for( ; rdi < rdiEnd; rdi++ ) + { + float f = result[ rdi ]; + f *= scale; + result[ rdi ] = f; + } +} + +#include "fp64Utils.hlsli" + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 dtid: SV_DispatchThreadID ) +{ + const uint i03 = dtid.z; + const uint i02 = dtid.y; + const uint i01 = dtid.x; + if( i01 >= src0_elements[ 1 ] ) + return; + + const uint nb01 = src0_strides[ 1 ]; + const uint nb02 = src0_strides[ 2 ]; + const uint nb03 = src0_strides[ 3 ]; + + const uint p = i01 * nb01 + i02 * nb02 + i03 * nb03; + const uint ne00 = src0_elements[ 0 ]; + + double mean = computeVectorSum( p, ne00 ); + mean = div64( mean, (double)(int)ne00 ); + + const uint nb1 = result_strides[ 1 ]; + const uint nb2 = result_strides[ 2 ]; + const uint nb3 = result_strides[ 3 ]; + const uint y = i01 * nb1 + i02 * nb2 + i03 * nb3; + + const double sum2 = offsetAndComputeSumSquares( p, y, mean, ne00 ); + const float scale = (float)div64( 1.0, sqrt64( sum2 / (float)(int)ne00 + eps ) ); + + scaleVector( y, scale, ne00 ); +}
\ No newline at end of file diff --git a/ComputeShaders/normFixed.hlsl b/ComputeShaders/normFixed.hlsl new file mode 100644 index 0000000..8f2267f --- /dev/null +++ b/ComputeShaders/normFixed.hlsl @@ -0,0 +1,124 @@ +// Ported from ggml_compute_forward_norm_f32 +// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader +Buffer<float> arg0: register( t0 ); +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + uint4 result_strides: packoffset( c3 ); +} + +static const float eps = 1e-5f; // TODO: make this a parameter + +// #include "groupReduce.hlsli" + +#ifndef THREADS +static const uint THREADS = 32; +#endif +static const uint ROW_LENGTH = 1024; +groupshared float rowBuffer[ ROW_LENGTH ]; + +static const uint REDUCTION_BUFFER = 32; +groupshared float sharedAccumulators[ REDUCTION_BUFFER ]; + +// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group. +void horizontalSum( const uint thread, inout float sum ) +{ + if( THREADS > REDUCTION_BUFFER ) + { + for( uint t = REDUCTION_BUFFER; t < THREADS; t += REDUCTION_BUFFER ) + { + // Threads [ t .. t + REDUCTION_BUFFER ] store into the buffer + if( thread >= t && thread < t + REDUCTION_BUFFER ) + sharedAccumulators[ thread - t ] = sum; + + GroupMemoryBarrierWithGroupSync(); + + // Threads [ 0 .. REDUCTION_BUFFER ] increment their local sum with the value loaded from the buffer + if( thread < REDUCTION_BUFFER ) + sum += sharedAccumulators[ thread ]; + } + } + + if( thread < REDUCTION_BUFFER ) + sharedAccumulators[ thread ] = sum; + + for( uint i = REDUCTION_BUFFER / 2; i > 1; i /= 2 ) + { + GroupMemoryBarrierWithGroupSync(); + if( thread < i ) + { + sum += sharedAccumulators[ thread + i ]; + sharedAccumulators[ thread ] = sum; + } + } + + GroupMemoryBarrierWithGroupSync(); + if( 0 == thread ) + sum += sharedAccumulators[ 1 ]; +} + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint i03 = group.z; + const uint i02 = group.y; + const uint i01 = group.x; + const uint ne00 = ROW_LENGTH; + + // First pass: copy the data to local buffer, and compute sum + { + const uint nb01 = src0_strides[ 1 ]; + const uint nb02 = src0_strides[ 2 ]; + const uint nb03 = src0_strides[ 3 ]; + const uint p = i01 * nb01 + i02 * nb02 + i03 * nb03; + + float sum = 0; + for( uint i = thread; i < ne00; i += THREADS ) + { + float f = arg0[ p + i ]; + rowBuffer[ i ] = f; + sum += f; + } + horizontalSum( thread, sum ); + if( 0 == thread ) + sharedAccumulators[ 0 ] = sum / (float)(int)ne00; + GroupMemoryBarrierWithGroupSync(); + } + + // Second pass: offset and compute sum of squares + { + const float mean = sharedAccumulators[ 0 ]; + float sum2 = 0; + for( uint i = thread; i < ne00; i += THREADS ) + { + float v = rowBuffer[ i ]; + v -= mean; + rowBuffer[ i ] = v; + sum2 = mad( v, v, sum2 ); + } + horizontalSum( thread, sum2 ); + if( 0 == thread ) + sharedAccumulators[ 0 ] = 1.0 / sqrt( sum2 / (float)(int)ne00 + eps ); + GroupMemoryBarrierWithGroupSync(); + } + + // Final pass: apply the scale, and copy from group shared buffer to the destination + { + const float scale = sharedAccumulators[ 0 ]; + + const uint nb1 = result_strides[ 1 ]; + const uint nb2 = result_strides[ 2 ]; + const uint nb3 = result_strides[ 3 ]; + const uint y = i01 * nb1 + i02 * nb2 + i03 * nb3; + + for( uint i = thread; i < ne00; i += THREADS ) + { + float v = rowBuffer[ i ]; + v *= scale; + result[ y + i ] = v; + } + } +}
\ No newline at end of file diff --git a/ComputeShaders/normFixed64.hlsl b/ComputeShaders/normFixed64.hlsl new file mode 100644 index 0000000..14aab3d --- /dev/null +++ b/ComputeShaders/normFixed64.hlsl @@ -0,0 +1,2 @@ +#define THREADS 64 +#include "normFixed.hlsl"
\ No newline at end of file diff --git a/ComputeShaders/repeatUtils.hlsli b/ComputeShaders/repeatUtils.hlsli new file mode 100644 index 0000000..1181501 --- /dev/null +++ b/ComputeShaders/repeatUtils.hlsli @@ -0,0 +1,21 @@ +inline uint rowOffset( uint3 idx, uint4 strides ) +{ + return idx[ 0 ] * strides[ 1 ] + idx[ 1 ] * strides[ 2 ] + idx[ 2 ] * strides[ 3 ]; +} + +// Initial iterator state for a row of the output tensor +// x = current index, y = index increment, z = end of the index +inline uint3 tensorIteratorState( uint3 group, uint thread, uint4 size, uint4 stride ) +{ + uint3 res; + res.x = rowOffset( group, stride ); + res.y = THREADS * stride[ 0 ]; + res.z = res.x + size[ 0 ] * stride[ 0 ]; + res.x += thread * stride[ 0 ]; + return res; +} + +// Handle a complete row of output tensor, using the iterator made by tensorIteratorState() function +#define ROW_LOOP( ts ) for( ; ts.x < ts.z; ts.x += ts.y ) +// Same as above, using different row length +#define ROW_LOOP_EX( ts, len, stride ) for( ; ts.x < ts.z; ts.x += len * stride[ 0 ] )
\ No newline at end of file diff --git a/ComputeShaders/scaleInPlace.hlsl b/ComputeShaders/scaleInPlace.hlsl new file mode 100644 index 0000000..d0320b4 --- /dev/null +++ b/ComputeShaders/scaleInPlace.hlsl @@ -0,0 +1,23 @@ +RWBuffer<float> buffer: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 src0_elements: packoffset( c0 ); + uint4 src0_strides: packoffset( c1 ); + float multiplier: packoffset( c2.x ); +} + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint nc0 = src0_elements[ 0 ]; + uint i = group.x * src0_strides[ 1 ]; + const uint iEnd = i + nc0; + const float mul = multiplier; + for( i += thread; i < iEnd; i += 32 ) + { + float f = buffer[ i ]; + f *= mul; + buffer[ i ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/softMax.hlsl b/ComputeShaders/softMax.hlsl new file mode 100644 index 0000000..6ebd0f2 --- /dev/null +++ b/ComputeShaders/softMax.hlsl @@ -0,0 +1,71 @@ +// Dispatch [ nr, 1, 1 ] thread groups of this shader +RWBuffer<float> result: register( u0 ); + +// table_exp_f16 +Buffer<uint> lookupTable: register( t0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 elements: packoffset( c0 ); + uint4 strides: packoffset( c1 ); + uint nr: packoffset( c2.x ); + float inputScale: packoffset( c2.y ); +} + +#include "miscUtils.hlsli" +#include "groupReduce.hlsli" + +static const float negativeInfinity = asfloat( 0xff800000 ); + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint p = group.x * strides[ 1 ]; + const uint nc = elements[ 0 ]; + const uint pEnd = p + nc; + uint i; + + float m = negativeInfinity; + for( i = p + thread; i < pEnd; i += 32 ) + m = max( m, result[ i ] ); + horizontalMaxBroadcast( thread, m ); + + float sum = 0; + for( i = p + thread; i < pEnd; i += 32 ) + { + float f = result[ i ]; + + [branch] + if( f != negativeInfinity ) + { + f = ( f - m ) * inputScale; +#if 1 + // Similar to Radeon Graphics, computing the exponent on nVidia 1080Ti is also slightly faster than loading from the lookup table + f = exp( f ); +#else + uint s = fp16Rounded( f ); + s = lookupTable[ s ]; + f = f16tof32( s ); +#endif + sum += f; + } + else + f = 0; + + result[ i ] = f; + } + + horizontalSum( thread, sum ); + if( 0 == thread ) + sharedAccumulators[ 0 ] = 1.0 / sum; + GroupMemoryBarrierWithGroupSync(); + const float scale = sharedAccumulators[ 0 ]; + + // ggml_vec_scale_f32 + for( i = p + thread; i < pEnd; i += 32 ) + { + float f = result[ i ]; + f *= scale; + result[ i ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/softMax64.hlsl b/ComputeShaders/softMax64.hlsl new file mode 100644 index 0000000..7ecd2ef --- /dev/null +++ b/ComputeShaders/softMax64.hlsl @@ -0,0 +1,71 @@ +// Dispatch [ nr, 1, 1 ] thread groups of this shader +RWBuffer<float> result: register( u0 ); + +// table_exp_f16 +Buffer<uint> lookupTable: register( t0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 elements: packoffset( c0 ); + uint4 strides: packoffset( c1 ); + uint nr: packoffset( c2.x ); + float inputScale: packoffset( c2.y ); +} + +#include "miscUtils.hlsli" +#include "groupReduce64.hlsli" + +static const float negativeInfinity = asfloat( 0xff800000 ); + +[ numthreads( 64, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint p = group.x * strides[ 1 ]; + const uint nc = elements[ 0 ]; + const uint pEnd = p + nc; + uint i; + + float m = negativeInfinity; + for( i = p + thread; i < pEnd; i += 64 ) + m = max( m, result[ i ] ); + horizontalMaxBroadcast( thread, m ); + + float sum = 0; + for( i = p + thread; i < pEnd; i += 64 ) + { + float f = result[ i ]; + + [branch] + if( f != negativeInfinity ) + { + f = ( f - m ) * inputScale; +#if 1 + // At least on Radeon Graphics GPU inside Ryzen 7 5700G, computing exponent instead of loading from the buffer improves the performance + f = exp( f ); +#else + uint s = fp16Rounded( f ); + s = lookupTable[ s ]; + f = f16tof32( s ); +#endif + sum += f; + } + else + f = 0; + + result[ i ] = f; + } + + horizontalSum( thread, sum ); + if( 0 == thread ) + sharedAccumulators[ 0 ] = 1.0 / sum; + GroupMemoryBarrierWithGroupSync(); + const float scale = sharedAccumulators[ 0 ]; + + // ggml_vec_scale_f32 + for( i = p + thread; i < pEnd; i += 64 ) + { + float f = result[ i ]; + f *= scale; + result[ i ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/softMaxCompat.hlsl b/ComputeShaders/softMaxCompat.hlsl new file mode 100644 index 0000000..2215ebd --- /dev/null +++ b/ComputeShaders/softMaxCompat.hlsl @@ -0,0 +1,62 @@ +// ggml_compute_forward_soft_max_f32 +// Dispatch [ ( nr + 31 ) / 32, 1, 1 ] thread groups of this shader +RWBuffer<float> result: register( u0 ); + +// table_exp_f16 +Buffer<uint> lookupTable: register( t0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 elements: packoffset( c0 ); + uint4 strides: packoffset( c1 ); + uint nr: packoffset( c2.x ); +} + +#include "miscUtils.hlsli" +#include "fp64Utils.hlsli" + +static const float negativeInfinity = asfloat( 0xff800000 ); + +[ numthreads( 32, 1, 1 ) ] +void main( uint3 dtid: SV_DispatchThreadID ) +{ + if( dtid.x >= nr ) + return; + + const uint p = dtid.x * strides[ 1 ]; + const uint nc = elements[ 0 ]; + const uint pEnd = p + nc; + uint i; + + float m = negativeInfinity; + for( i = p; i < pEnd; i++ ) + m = max( m, result[ i ] ); + + double sum = 0; + for( i = p; i < pEnd; i++ ) + { + float f = result[ i ]; + + [branch] + if( f != negativeInfinity ) + { + uint s = fp16Rounded( f - m ); + s = lookupTable[ s ]; + f = f16tof32( s ); + sum += f; + } + else + f = 0; + + result[ i ] = f; + } + + const float scale = (float)div64( 1.0, sum ); + // ggml_vec_scale_f32 + for( i = p; i < pEnd; i++ ) + { + float f = result[ i ]; + f *= scale; + result[ i ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/softMaxFixed.hlsl b/ComputeShaders/softMaxFixed.hlsl new file mode 100644 index 0000000..7b4add2 --- /dev/null +++ b/ComputeShaders/softMaxFixed.hlsl @@ -0,0 +1,79 @@ +// Special softMax shader for matrices with rows of 1500 elements. +// Uses group shared buffer of that length to save global memory bandwidth, more than 2x faster than the original. +// Dispatch [ nr, 1, 1 ] thread groups of this shader +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint4 elements: packoffset( c0 ); + uint4 strides: packoffset( c1 ); + uint nr: packoffset( c2.x ); + float inputScale: packoffset( c2.y ); +} + +#include "miscUtils.hlsli" +#include "groupReduce64.hlsli" + +static const uint THREADS = 64; +static const uint ROW_LENGTH = 1500; +groupshared float rowBuffer[ ROW_LENGTH ]; + +static const float negativeInfinity = asfloat( 0xff800000 ); + +[ numthreads( THREADS, 1, 1 ) ] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + const uint p = group.x * strides[ 1 ]; + const uint nc = ROW_LENGTH; + uint i; + + float m = negativeInfinity; + // First pass: compute maximum, and copy the row into the group shared buffer + for( i = thread; i < nc; i += THREADS ) + { + float f = result[ p + i ]; + m = max( m, f ); + rowBuffer[ i ] = f; + } + horizontalMaxBroadcast( thread, m ); + + // Second pass: apply initial scale, compute the exponent, and compute total sum over the row + float sum = 0; + for( i = thread; i < nc; i += THREADS ) + { + float f = rowBuffer[ i ]; + + [branch] + if( f != negativeInfinity ) + { + f = ( f - m ) * inputScale; +#if 1 + // At least on Radeon Graphics GPU inside Ryzen 7 5700G, computing exponent instead of loading from the buffer improves the performance + f = exp( f ); +#else + uint s = fp16Rounded( f ); + s = lookupTable[ s ]; + f = f16tof32( s ); +#endif + sum += f; + } + else + f = 0; + + rowBuffer[ i ] = f; + } + + horizontalSum( thread, sum ); + if( 0 == thread ) + sharedAccumulators[ 0 ] = 1.0 / sum; + GroupMemoryBarrierWithGroupSync(); + const float scale = sharedAccumulators[ 0 ]; + + // Final pass: apply the final scale, and copy the row from the group shared buffer back into the global memory + for( i = thread; i < nc; i += THREADS ) + { + float f = rowBuffer[ i ]; + f *= scale; + result[ p + i ] = f; + } +}
\ No newline at end of file diff --git a/ComputeShaders/zeroMemory.hlsl b/ComputeShaders/zeroMemory.hlsl new file mode 100644 index 0000000..c486636 --- /dev/null +++ b/ComputeShaders/zeroMemory.hlsl @@ -0,0 +1,27 @@ +RWBuffer<float> result: register( u0 ); + +cbuffer Constants: register( b0 ) +{ + uint elements: packoffset( c0.x ); +} + +// Thread group index is 16 bits per coordinate: +// https://learn.microsoft.com/en-us/windows/win32/api/d3d11/nf-d3d11-id3d11devicecontext-dispatch +// We want this shader to support buffers up to 2 GB. +#ifndef THREADS +static const uint THREADS = 512; +#endif +#ifndef ITERATIONS +static const uint ITERATIONS = 128; +#endif + +static const uint itemsPerGroup = THREADS * ITERATIONS; + +[numthreads( THREADS, 1, 1 )] +void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) +{ + uint rdi = group.x * itemsPerGroup; + const uint rdiEnd = min( rdi + itemsPerGroup, elements ); + for( rdi += thread; rdi < rdiEnd; rdi += THREADS ) + result[ rdi ] = 0.0; +}
\ No newline at end of file |
