summaryrefslogtreecommitdiffstats
path: root/ComputeShaders
diff options
context:
space:
mode:
Diffstat (limited to 'ComputeShaders')
-rw-r--r--ComputeShaders/ComputeShaders.cpp3
-rw-r--r--ComputeShaders/ComputeShaders.vcxproj221
-rw-r--r--ComputeShaders/ComputeShaders.vcxproj.filters66
-rw-r--r--ComputeShaders/Readme.txt11
-rw-r--r--ComputeShaders/add.hlsl6
-rw-r--r--ComputeShaders/addInPlace.hlsl39
-rw-r--r--ComputeShaders/addRepeat.hlsl70
-rw-r--r--ComputeShaders/addRepeat64.hlsl2
-rw-r--r--ComputeShaders/addRepeatGelu.hlsl88
-rw-r--r--ComputeShaders/addRepeatGelu64.hlsl2
-rw-r--r--ComputeShaders/addRepeatScale.hlsl73
-rw-r--r--ComputeShaders/addRows.hlsl46
-rw-r--r--ComputeShaders/componentwiseBinaryOp.hlsli43
-rw-r--r--ComputeShaders/convolutionMain.hlsl76
-rw-r--r--ComputeShaders/convolutionMain2.hlsl60
-rw-r--r--ComputeShaders/convolutionMain2Fixed.hlsl119
-rw-r--r--ComputeShaders/convolutionPrep1.hlsl39
-rw-r--r--ComputeShaders/convolutionPrep2.hlsl43
-rw-r--r--ComputeShaders/copyConvert.hlsl50
-rw-r--r--ComputeShaders/copyTranspose.hlsl56
-rw-r--r--ComputeShaders/diagMaskInf.hlsl30
-rw-r--r--ComputeShaders/flashAttention.hlsl170
-rw-r--r--ComputeShaders/flashAttentionCommon.hlsli67
-rw-r--r--ComputeShaders/flashAttentionCompat1.hlsl125
-rw-r--r--ComputeShaders/flashAttentionCompat2.hlsl114
-rw-r--r--ComputeShaders/flashAttentionCompat3.hlsl118
-rw-r--r--ComputeShaders/fmaRepeat1.hlsl77
-rw-r--r--ComputeShaders/fmaRepeat164.hlsl2
-rw-r--r--ComputeShaders/fmaRepeat2.hlsl45
-rw-r--r--ComputeShaders/fp64Utils.hlsli28
-rw-r--r--ComputeShaders/groupReduce.hlsli139
-rw-r--r--ComputeShaders/groupReduce64.hlsli46
-rw-r--r--ComputeShaders/matReshapePanels.hlsl105
-rw-r--r--ComputeShaders/miscUtils.hlsli84
-rw-r--r--ComputeShaders/mulMatByRow.hlsl49
-rw-r--r--ComputeShaders/mulMatByRow64.hlsl90
-rw-r--r--ComputeShaders/mulMatByRowTiled.hlsl120
-rw-r--r--ComputeShaders/mulMatByRowTiled64.hlsl4
-rw-r--r--ComputeShaders/mulMatByRowTiledEx.hlsl156
-rw-r--r--ComputeShaders/mulMatByScalar.hlsl41
-rw-r--r--ComputeShaders/mulMatDotMain.hlsl95
-rw-r--r--ComputeShaders/mulMatDotReshape.hlsl33
-rw-r--r--ComputeShaders/mulMatMadMain.hlsl154
-rw-r--r--ComputeShaders/mulMatTiled.hlsl236
-rw-r--r--ComputeShaders/mulMatTiled64.hlsl3
-rw-r--r--ComputeShaders/mulMatTiledEx.hlsl194
-rw-r--r--ComputeShaders/norm.hlsl86
-rw-r--r--ComputeShaders/normCompat.hlsl82
-rw-r--r--ComputeShaders/normFixed.hlsl124
-rw-r--r--ComputeShaders/normFixed64.hlsl2
-rw-r--r--ComputeShaders/repeatUtils.hlsli21
-rw-r--r--ComputeShaders/scaleInPlace.hlsl23
-rw-r--r--ComputeShaders/softMax.hlsl71
-rw-r--r--ComputeShaders/softMax64.hlsl71
-rw-r--r--ComputeShaders/softMaxCompat.hlsl62
-rw-r--r--ComputeShaders/softMaxFixed.hlsl79
-rw-r--r--ComputeShaders/zeroMemory.hlsl27
57 files changed, 4086 insertions, 0 deletions
diff --git a/ComputeShaders/ComputeShaders.cpp b/ComputeShaders/ComputeShaders.cpp
new file mode 100644
index 0000000..9c03f27
--- /dev/null
+++ b/ComputeShaders/ComputeShaders.cpp
@@ -0,0 +1,3 @@
+void fnComputeShaders()
+{
+} \ No newline at end of file
diff --git a/ComputeShaders/ComputeShaders.vcxproj b/ComputeShaders/ComputeShaders.vcxproj
new file mode 100644
index 0000000..350d266
--- /dev/null
+++ b/ComputeShaders/ComputeShaders.vcxproj
@@ -0,0 +1,221 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <VCProjectVersion>16.0</VCProjectVersion>
+ <Keyword>Win32Proj</Keyword>
+ <ProjectGuid>{1c39d386-96d0-47a1-bbfa-68bbdb24439c}</ProjectGuid>
+ <RootNamespace>ComputeShaders</RootNamespace>
+ <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v143</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v143</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v143</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v143</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="Shared">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <MultiProcFXC>true</MultiProcFXC>
+ <OutDir>$(Platform)\$(Configuration)\</OutDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <OutDir>$(Platform)\$(Configuration)\</OutDir>
+ <MultiProcFXC>true</MultiProcFXC>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <SDLCheck>true</SDLCheck>
+ <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <ConformanceMode>true</ConformanceMode>
+ </ClCompile>
+ <Link>
+ <SubSystem>
+ </SubSystem>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <SDLCheck>true</SDLCheck>
+ <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <ConformanceMode>true</ConformanceMode>
+ </ClCompile>
+ <Link>
+ <SubSystem>
+ </SubSystem>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <SDLCheck>true</SDLCheck>
+ <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <ConformanceMode>true</ConformanceMode>
+ </ClCompile>
+ <Link>
+ <SubSystem>
+ </SubSystem>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ <FxCompile>
+ <ShaderModel>5.0</ShaderModel>
+ <ShaderType>Compute</ShaderType>
+ </FxCompile>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <SDLCheck>true</SDLCheck>
+ <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <ConformanceMode>true</ConformanceMode>
+ </ClCompile>
+ <Link>
+ <SubSystem>
+ </SubSystem>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ <FxCompile>
+ <ShaderModel>5.0</ShaderModel>
+ <ShaderType>Compute</ShaderType>
+ <DisableOptimizations>true</DisableOptimizations>
+ <EnableDebuggingInformation>true</EnableDebuggingInformation>
+ </FxCompile>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="ComputeShaders.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <FxCompile Include="add.hlsl" />
+ <FxCompile Include="addInPlace.hlsl" />
+ <FxCompile Include="addRepeat.hlsl" />
+ <FxCompile Include="addRepeat64.hlsl" />
+ <FxCompile Include="addRepeatGelu.hlsl" />
+ <FxCompile Include="addRepeatGelu64.hlsl" />
+ <FxCompile Include="addRepeatScale.hlsl" />
+ <FxCompile Include="addRows.hlsl" />
+ <FxCompile Include="convolutionMain.hlsl" />
+ <FxCompile Include="convolutionMain2.hlsl" />
+ <FxCompile Include="convolutionMain2Fixed.hlsl" />
+ <FxCompile Include="convolutionPrep1.hlsl" />
+ <FxCompile Include="convolutionPrep2.hlsl" />
+ <FxCompile Include="copyConvert.hlsl" />
+ <FxCompile Include="copyTranspose.hlsl" />
+ <FxCompile Include="diagMaskInf.hlsl" />
+ <FxCompile Include="flashAttention.hlsl" />
+ <FxCompile Include="flashAttentionCompat1.hlsl" />
+ <FxCompile Include="flashAttentionCompat2.hlsl" />
+ <FxCompile Include="flashAttentionCompat3.hlsl" />
+ <FxCompile Include="fmaRepeat1.hlsl" />
+ <FxCompile Include="fmaRepeat164.hlsl" />
+ <FxCompile Include="fmaRepeat2.hlsl" />
+ <FxCompile Include="matReshapePanels.hlsl" />
+ <FxCompile Include="mulMatByRow.hlsl" />
+ <FxCompile Include="mulMatByRow64.hlsl" />
+ <FxCompile Include="mulMatByRowTiled.hlsl" />
+ <FxCompile Include="mulMatByRowTiled64.hlsl" />
+ <FxCompile Include="mulMatByRowTiledEx.hlsl" />
+ <FxCompile Include="mulMatByScalar.hlsl" />
+ <FxCompile Include="mulMatDotMain.hlsl" />
+ <FxCompile Include="mulMatDotReshape.hlsl" />
+ <FxCompile Include="mulMatMadMain.hlsl" />
+ <FxCompile Include="mulMatTiled.hlsl" />
+ <FxCompile Include="mulMatTiled64.hlsl" />
+ <FxCompile Include="mulMatTiledEx.hlsl" />
+ <FxCompile Include="norm.hlsl" />
+ <FxCompile Include="normCompat.hlsl" />
+ <FxCompile Include="normFixed.hlsl" />
+ <FxCompile Include="normFixed64.hlsl" />
+ <FxCompile Include="scaleInPlace.hlsl" />
+ <FxCompile Include="softMax.hlsl" />
+ <FxCompile Include="softMax64.hlsl" />
+ <FxCompile Include="softMaxCompat.hlsl" />
+ <FxCompile Include="softMaxFixed.hlsl" />
+ <FxCompile Include="zeroMemory.hlsl" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="componentwiseBinaryOp.hlsli" />
+ <None Include="flashAttentionCommon.hlsli" />
+ <None Include="fp64Utils.hlsli" />
+ <None Include="groupReduce.hlsli" />
+ <None Include="groupReduce64.hlsli" />
+ <None Include="miscUtils.hlsli" />
+ <None Include="repeatUtils.hlsli" />
+ </ItemGroup>
+ <ItemGroup>
+ <Text Include="Readme.txt" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/ComputeShaders/ComputeShaders.vcxproj.filters b/ComputeShaders/ComputeShaders.vcxproj.filters
new file mode 100644
index 0000000..12f1559
--- /dev/null
+++ b/ComputeShaders/ComputeShaders.vcxproj.filters
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="ComputeShaders.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <FxCompile Include="mulMatDotMain.hlsl" />
+ <FxCompile Include="mulMatDotReshape.hlsl" />
+ <FxCompile Include="convolutionMain.hlsl" />
+ <FxCompile Include="convolutionPrep1.hlsl" />
+ <FxCompile Include="convolutionPrep2.hlsl" />
+ <FxCompile Include="add.hlsl" />
+ <FxCompile Include="flashAttention.hlsl" />
+ <FxCompile Include="convolutionMain2.hlsl" />
+ <FxCompile Include="norm.hlsl" />
+ <FxCompile Include="copyConvert.hlsl" />
+ <FxCompile Include="copyTranspose.hlsl" />
+ <FxCompile Include="normCompat.hlsl" />
+ <FxCompile Include="flashAttentionCompat1.hlsl" />
+ <FxCompile Include="flashAttentionCompat3.hlsl" />
+ <FxCompile Include="flashAttentionCompat2.hlsl" />
+ <FxCompile Include="scaleInPlace.hlsl" />
+ <FxCompile Include="diagMaskInf.hlsl" />
+ <FxCompile Include="softMaxCompat.hlsl" />
+ <FxCompile Include="mulMatMadMain.hlsl" />
+ <FxCompile Include="addRepeat.hlsl" />
+ <FxCompile Include="fmaRepeat1.hlsl" />
+ <FxCompile Include="fmaRepeat2.hlsl" />
+ <FxCompile Include="addInPlace.hlsl" />
+ <FxCompile Include="softMax.hlsl" />
+ <FxCompile Include="addRepeatScale.hlsl" />
+ <FxCompile Include="mulMatByRow.hlsl" />
+ <FxCompile Include="mulMatByScalar.hlsl" />
+ <FxCompile Include="mulMatTiled.hlsl" />
+ <FxCompile Include="mulMatByRow64.hlsl" />
+ <FxCompile Include="softMax64.hlsl" />
+ <FxCompile Include="softMaxFixed.hlsl" />
+ <FxCompile Include="addRepeat64.hlsl" />
+ <FxCompile Include="fmaRepeat164.hlsl" />
+ <FxCompile Include="addRepeatGelu.hlsl" />
+ <FxCompile Include="addRepeatGelu64.hlsl" />
+ <FxCompile Include="normFixed.hlsl" />
+ <FxCompile Include="normFixed64.hlsl" />
+ <FxCompile Include="mulMatByRowTiled.hlsl" />
+ <FxCompile Include="convolutionMain2Fixed.hlsl" />
+ <FxCompile Include="mulMatByRowTiled64.hlsl" />
+ <FxCompile Include="addRows.hlsl" />
+ <FxCompile Include="mulMatTiled64.hlsl" />
+ <FxCompile Include="zeroMemory.hlsl" />
+ <FxCompile Include="mulMatTiledEx.hlsl" />
+ <FxCompile Include="matReshapePanels.hlsl" />
+ <FxCompile Include="mulMatByRowTiledEx.hlsl" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="componentwiseBinaryOp.hlsli" />
+ <None Include="miscUtils.hlsli" />
+ <None Include="groupReduce.hlsli" />
+ <None Include="fp64Utils.hlsli" />
+ <None Include="flashAttentionCommon.hlsli" />
+ <None Include="repeatUtils.hlsli" />
+ <None Include="groupReduce64.hlsli" />
+ </ItemGroup>
+ <ItemGroup>
+ <Text Include="Readme.txt" />
+ </ItemGroup>
+</Project> \ No newline at end of file
diff --git a/ComputeShaders/Readme.txt b/ComputeShaders/Readme.txt
new file mode 100644
index 0000000..18d089e
--- /dev/null
+++ b/ComputeShaders/Readme.txt
@@ -0,0 +1,11 @@
+This project compiles all the compute shaders which implement the model.
+
+Many shaders come in 2 versions, something.hlsl and something64.hlsl
+
+The version with the `64` suffix is used on AMD GPUs, the version without suffix is used on nVidia and Intel GPUs.
+
+Not all of these shaders are actually used for anything.
+Some of them are implementing binary compatibility for the reference CPU version, and not used unless messing with the `constexpr` flags in MlContext C++ class.
+Such shaders often require FP64 support, which is an optional feature in D3D11.
+CompressShaders tool detects such shaders by looking at the SFI0 chunk in the binary, and outputs a bitmap of the FP64 shaders.
+This way, missing FP64 hardware support shouldn’t break the library. \ No newline at end of file
diff --git a/ComputeShaders/add.hlsl b/ComputeShaders/add.hlsl
new file mode 100644
index 0000000..819443e
--- /dev/null
+++ b/ComputeShaders/add.hlsl
@@ -0,0 +1,6 @@
+inline float compute( float a, float b )
+{
+ return a + b;
+}
+
+#include "componentwiseBinaryOp.hlsli" \ No newline at end of file
diff --git a/ComputeShaders/addInPlace.hlsl b/ComputeShaders/addInPlace.hlsl
new file mode 100644
index 0000000..a83c221
--- /dev/null
+++ b/ComputeShaders/addInPlace.hlsl
@@ -0,0 +1,39 @@
+#ifndef THREADS
+#define THREADS 512
+#endif
+
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 size: packoffset( c0 );
+ uint4 strides: packoffset( c1 );
+ uint4 argStrides: packoffset( c3 );
+}
+
+inline uint rowOffset( uint3 idx, uint4 strides )
+{
+ return idx[ 0 ] * strides[ 1 ] + idx[ 1 ] * strides[ 2 ] + idx[ 2 ] * strides[ 3 ];
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ uint rdi = rowOffset( group, strides );
+ uint rsi = rowOffset( group, argStrides );
+
+ const uint rdiEnd = rdi + size[ 0 ] * strides[ 0 ];
+ rdi += thread * strides[ 0 ];
+ rsi += thread * argStrides[ 0 ];
+
+ const uint rdiInc = THREADS * strides[ 0 ];
+ const uint rsiInc = THREADS * argStrides[ 0 ];
+
+ for( ; rdi < rdiEnd; rdi += rdiInc, rsi += rsiInc )
+ {
+ float f = result[ rdi ];
+ f += arg0[ rsi ];
+ result[ rdi ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/addRepeat.hlsl b/ComputeShaders/addRepeat.hlsl
new file mode 100644
index 0000000..e5cdaa3
--- /dev/null
+++ b/ComputeShaders/addRepeat.hlsl
@@ -0,0 +1,70 @@
+// Compute tensor = tensor + repeat( pattern, tensor ) in 1 shot, without VRAM allocations
+// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor
+RWBuffer<float> tensor: register( u0 );
+Buffer<float> pattern: register( t0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 tensorSize: packoffset( c0 );
+ uint4 tensorStrides: packoffset( c1 );
+ uint4 patternSize: packoffset( c2 );
+ uint4 patternStrides: packoffset( c3 );
+}
+
+#ifndef THREADS
+#define THREADS 256
+#endif
+
+#include "repeatUtils.hlsli"
+
+inline void computeSimple( uint idx, float add )
+{
+ float f = tensor[ idx ];
+ f += add;
+ tensor[ idx ] = f;
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides );
+ uint rsi = rowOffset( group % patternSize.yzw, patternStrides );
+
+ if( patternSize[ 0 ] == 1 )
+ {
+ // The pattern only has 1 column - broadcasting over the row
+ const float p = pattern[ rsi ];
+ ROW_LOOP( it )
+ computeSimple( it.x, p );
+ }
+ else if( patternSize[ 0 ] <= THREADS )
+ {
+ // pattern size doesn't exceed thread group size: load pattern value outside of the loop
+ const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] );
+ if( thread >= threadsPerGroup )
+ return;
+
+ const float p = pattern[ rsi + ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ] ];
+ ROW_LOOP_EX( it, threadsPerGroup, tensorStrides )
+ computeSimple( it.x, p );
+ }
+ else
+ {
+ // Pattern rows are larger than the thread group, need to stream from both buffers
+ const uint rsiInc = THREADS * patternStrides[ 0 ];
+ const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ];
+ const uint rsiEnd = rsi + rsiDec;
+ rsi += thread * patternStrides[ 0 ];
+
+ ROW_LOOP( it )
+ {
+ float f = tensor[ it.x ];
+ float p = pattern[ rsi ];
+ rsi += rsiInc;
+ if( rsi >= rsiEnd )
+ rsi -= rsiDec;
+ f += p;
+ tensor[ it.x ] = f;
+ }
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/addRepeat64.hlsl b/ComputeShaders/addRepeat64.hlsl
new file mode 100644
index 0000000..b6c8c19
--- /dev/null
+++ b/ComputeShaders/addRepeat64.hlsl
@@ -0,0 +1,2 @@
+#define THREADS 64
+#include "addRepeat.hlsl" \ No newline at end of file
diff --git a/ComputeShaders/addRepeatGelu.hlsl b/ComputeShaders/addRepeatGelu.hlsl
new file mode 100644
index 0000000..7f63653
--- /dev/null
+++ b/ComputeShaders/addRepeatGelu.hlsl
@@ -0,0 +1,88 @@
+// Compute tensor = GELU( tensor + repeat( pattern, tensor ) ) in 1 shot, without VRAM allocations
+// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor
+RWBuffer<float> tensor: register( u0 );
+Buffer<float> pattern: register( t0 );
+Buffer<uint> lookupTable: register( t1 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 tensorSize: packoffset( c0 );
+ uint4 tensorStrides: packoffset( c1 );
+ uint4 patternSize: packoffset( c2 );
+ uint4 patternStrides: packoffset( c3 );
+}
+
+#ifndef THREADS
+#define THREADS 1024
+#endif
+
+#include "repeatUtils.hlsli"
+#include "miscUtils.hlsli"
+
+inline float gelu( float x )
+{
+#if 1
+ const uint index = fp16Rounded( x );
+ const uint res16 = lookupTable[ index ];
+ return f16tof32( res16 );
+#else
+ // This version is much slower, at least on AMD, despite saving these VRAM loads.
+ const float GELU_COEF_A = 0.044715;
+ const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876;
+ return 0.5 * x * ( 1.0 + tanh( SQRT_2_OVER_PI * x * ( 1.0 + GELU_COEF_A * x * x ) ) );
+#endif
+}
+
+inline void computeSimple( uint idx, float add )
+{
+ float f = tensor[ idx ];
+ f += add;
+ f = gelu( f );
+ tensor[ idx ] = f;
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides );
+ uint rsi = rowOffset( group % patternSize.yzw, patternStrides );
+
+ if( patternSize[ 0 ] == 1 )
+ {
+ // The pattern only has 1 column - broadcasting over the row
+ const float p = pattern[ rsi ];
+ ROW_LOOP( it )
+ computeSimple( it.x, p );
+ }
+ else if( patternSize[ 0 ] <= THREADS )
+ {
+ // pattern size doesn't exceed thread group size: load pattern value outside of the loop
+ const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] );
+ if( thread >= threadsPerGroup )
+ return;
+
+ const float p = pattern[ rsi + ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ] ];
+ ROW_LOOP_EX( it, threadsPerGroup, tensorStrides )
+ computeSimple( it.x, p );
+ }
+ else
+ {
+ // Pattern rows are larger than the thread group, need to stream from both buffers
+ const uint rsiInc = THREADS * patternStrides[ 0 ];
+ const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ];
+ const uint rsiEnd = rsi + rsiDec;
+ rsi += thread * patternStrides[ 0 ];
+
+ ROW_LOOP( it )
+ {
+ float f = tensor[ it.x ];
+ float p = pattern[ rsi ];
+ rsi += rsiInc;
+ if( rsi >= rsiEnd )
+ rsi -= rsiDec;
+ f += p;
+ f = gelu( f );
+ tensor[ it.x ] = f;
+ }
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/addRepeatGelu64.hlsl b/ComputeShaders/addRepeatGelu64.hlsl
new file mode 100644
index 0000000..3d9b2e8
--- /dev/null
+++ b/ComputeShaders/addRepeatGelu64.hlsl
@@ -0,0 +1,2 @@
+#define THREADS 64
+#include "addRepeatGelu.hlsl" \ No newline at end of file
diff --git a/ComputeShaders/addRepeatScale.hlsl b/ComputeShaders/addRepeatScale.hlsl
new file mode 100644
index 0000000..8c24088
--- /dev/null
+++ b/ComputeShaders/addRepeatScale.hlsl
@@ -0,0 +1,73 @@
+// Compute tensor = ( tensor + repeat( pattern, tensor ) ) * scale in 1 shot, without VRAM allocations
+// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor
+RWBuffer<float> tensor: register( u0 );
+Buffer<float> pattern: register( t0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 tensorSize: packoffset( c0 );
+ uint4 tensorStrides: packoffset( c1 );
+ uint4 patternSize: packoffset( c2 );
+ uint4 patternStrides: packoffset( c3 );
+ float scalingMul : packoffset( c4.x );
+}
+
+#ifndef THREADS
+#define THREADS 512
+#endif
+
+#include "repeatUtils.hlsli"
+
+inline void computeSimple( uint idx, float add )
+{
+ float f = tensor[ idx ];
+ f += add;
+ f *= scalingMul;
+ tensor[ idx ] = f;
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides );
+ uint rsi = rowOffset( group % patternSize.yzw, patternStrides );
+
+ if( patternSize[ 0 ] == 1 )
+ {
+ // The pattern only has 1 column - broadcasting over the row
+ const float p = pattern[ rsi ];
+ ROW_LOOP( it )
+ computeSimple( it.x, p );
+ }
+ else if( patternSize[ 0 ] <= THREADS )
+ {
+ // pattern size doesn't exceed thread group size: load pattern value outside of the loop
+ const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] );
+ if( thread >= threadsPerGroup )
+ return;
+
+ const float p = pattern[ rsi + ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ] ];
+ ROW_LOOP_EX( it, threadsPerGroup, tensorStrides )
+ computeSimple( it.x, p );
+ }
+ else
+ {
+ // Pattern rows are larger than the thread group, need to stream from both buffers
+ const uint rsiInc = THREADS * patternStrides[ 0 ];
+ const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ];
+ const uint rsiEnd = rsi + rsiDec;
+ rsi += thread * patternStrides[ 0 ];
+
+ ROW_LOOP( it )
+ {
+ float f = tensor[ it.x ];
+ float p = pattern[ rsi ];
+ rsi += rsiInc;
+ if( rsi >= rsiEnd )
+ rsi -= rsiDec;
+ f += p;
+ f *= scalingMul;
+ tensor[ it.x ] = f;
+ }
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/addRows.hlsl b/ComputeShaders/addRows.hlsl
new file mode 100644
index 0000000..21e5e73
--- /dev/null
+++ b/ComputeShaders/addRows.hlsl
@@ -0,0 +1,46 @@
+#ifndef THREADS
+#define THREADS 256
+#endif
+
+// dec.tokenEmbedding tensor
+Buffer<float> tokenEmbedding: register( t0 );
+// dec.positionalEmbedding tensor
+Buffer<float> positionalEmbedding: register( t1 );
+// R32_UINT buffer with the input tokens
+Buffer<uint> embd: register( t2 );
+// Output tensor
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint rowLength: packoffset( c0.x );
+ uint pastTokensCount: packoffset( c0.y );
+ uint outputRowStride: packoffset( c0.z );
+ uint2 embStrides: packoffset( c1.x );
+ uint2 posStrides: packoffset( c1.z );
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint row = group.x;
+ const uint rowTok = embd[ row ];
+ const uint rowPos = row + pastTokensCount;
+
+ uint rdi = row * outputRowStride;
+ const uint rdiEnd = rdi + rowLength;
+ rdi += thread;
+
+ uint rsiTok = rowTok * embStrides.y;
+ rsiTok += thread * embStrides.x;
+
+ uint rsiPos = rowPos * posStrides.y;
+ rsiPos += thread * posStrides.x;
+
+ for( ; rdi < rdiEnd; rdi += THREADS, rsiTok += THREADS * embStrides.x, rsiPos += THREADS * posStrides.x )
+ {
+ float a = tokenEmbedding[ rsiTok ];
+ float b = positionalEmbedding[ rsiPos ];
+ result[ rdi ] = a + b;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/componentwiseBinaryOp.hlsli b/ComputeShaders/componentwiseBinaryOp.hlsli
new file mode 100644
index 0000000..0a523ca
--- /dev/null
+++ b/ComputeShaders/componentwiseBinaryOp.hlsli
@@ -0,0 +1,43 @@
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ uint4 src1_elements: packoffset( c2 );
+ uint4 src1_strides: packoffset( c3 );
+ uint4 result_elements: packoffset( c4 );
+ uint4 result_strides: packoffset( c5 );
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint j = group.x;
+ const uint nb1 = result_strides[ 1 ];
+ const uint nb01 = src0_strides[ 1 ];
+
+ const uint nb10 = src1_strides[ 0 ];
+ const uint nb11 = src1_strides[ 1 ];
+ const uint nc = src0_elements[ 0 ];
+
+ uint rsi0 = j * nb01;
+ uint rsi1 = j * nb11;
+ uint rdi = j * nb1;
+ const uint rsi0End = rsi0 + nc;
+
+ rsi0 += thread;
+ rsi1 += thread * nb10;
+ rdi += thread;
+
+ const uint rsi1Inc = 32 * nb10;
+ for( ; rsi0 < rsi0End; rsi0 += 32, rsi1 += rsi1Inc, rdi += 32 )
+ {
+ const float a = arg0[ rsi0 ];
+ const float b = arg1[ rsi1 ];
+ const float res = compute( a, b );
+ result[ rdi ] = res;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/convolutionMain.hlsl b/ComputeShaders/convolutionMain.hlsl
new file mode 100644
index 0000000..eee3ebb
--- /dev/null
+++ b/ComputeShaders/convolutionMain.hlsl
@@ -0,0 +1,76 @@
+// ggml_compute_forward_conv_1d_1s_f16_f32, GGML_TASK_COMPUTE implementation
+// Dispatch [ ne10, ne02, 1 ] thread groups
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ uint4 src1_elements: packoffset( c2 );
+ uint4 result_elements: packoffset( c4 );
+ uint4 result_strides: packoffset( c5 );
+}
+
+#include "groupReduce.hlsli"
+
+inline void computeDotProduct( uint s0, uint s1, uint len, uint thread, inout float acc )
+{
+ float curr = 0;
+ const uint completeVectors = len / 32;
+ uint i;
+ for( i = 0; i < completeVectors; i++, s0 += 32, s1 += 32 )
+ curr = mad( arg0[ s0 + thread ], arg1[ s1 + thread ], curr );
+
+ horizontalSumCompatNew( thread, curr );
+
+ if( 0 == thread )
+ {
+ const uint rem = len % 32;
+ if( 0 != rem )
+ {
+ double f64 = curr;
+ for( i = 0; i < rem; i++ )
+ {
+ precise float a = arg0[ s0 + i ];
+ precise float b = arg1[ s1 + i ];
+ precise float prod = a * b;
+ f64 += prod;
+ }
+ curr = (float)f64;
+ }
+ acc += curr;
+ }
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint i1 = group.y;
+ const uint i0 = group.x;
+
+ const uint ne00 = src0_elements[ 0 ];
+ const uint nk = ne00;
+ const int nh = (int)( nk / 2 );
+
+ const uint ne01 = src0_elements[ 1 ];
+ const int ew0 = roundUp32( ne01 );
+
+ float res = 0;
+ for( int k = -nh; k <= nh; k++ )
+ {
+ const uint source0 = i1 * ew0 * ne00 + uint( nh + k ) * ew0;
+ const uint source1 = uint( i0 + nh + k ) * ew0;
+ computeDotProduct( source0, source1, ew0, thread, res );
+ }
+
+ if( 0 != thread )
+ return;
+
+ const uint nb1 = result_strides[ 1 ];
+ const uint rdi = i1 * nb1 + i0;
+ result[ rdi ] = res;
+} \ No newline at end of file
diff --git a/ComputeShaders/convolutionMain2.hlsl b/ComputeShaders/convolutionMain2.hlsl
new file mode 100644
index 0000000..73c9da1
--- /dev/null
+++ b/ComputeShaders/convolutionMain2.hlsl
@@ -0,0 +1,60 @@
+// ggml_compute_forward_conv_1d_2s_f16_f32, GGML_TASK_COMPUTE implementation
+// Dispatch [ ne10 / 2, ne02, 1 ] thread groups
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ uint4 src1_elements: packoffset( c2 );
+ uint4 result_elements: packoffset( c4 );
+ uint4 result_strides: packoffset( c5 );
+}
+
+#include "groupReduce.hlsli"
+
+inline void computeDotProduct( uint s0, uint s1, uint len, uint thread, inout float acc )
+{
+ float curr = 0;
+ const uint s0End = s0 + len;
+ s0 += thread;
+ s1 += thread;
+ for( ; s0 < s0End; s0 += 32, s1 += 32 )
+ curr = mad( arg0[ s0 ], arg1[ s1 ], curr );
+
+ horizontalSumCompatNew( thread, curr );
+ if( 0 == thread )
+ acc += curr;
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint ne00 = src0_elements[ 0 ];
+ const uint ne01 = src0_elements[ 1 ];
+ const int ew0 = roundUp32( ne01 );
+
+ float res = 0;
+ uint s0 = group.y * ew0 * ne00;
+ uint s1 = group.x * 2 * ew0;
+ // The original implementation did following:
+ // int nh = (int)( nk / 2 );
+ // for( int k = -nh; k <= nh; k++ )
+ // What we doing instead:
+ // for( uint len = ( nk / 2 ) * 2 + 1, i = 0; i < len; i++ )
+ // len = ( nk / 2 ) * 2 + 1 is equal to ( nk | 1 )
+ const uint s0End = s0 + ( ne00 | 1u ) * ew0;
+ for( ; s0 < s0End; s0 += ew0, s1 += ew0 )
+ computeDotProduct( s0, s1, ew0, thread, res );
+
+ if( 0 != thread )
+ return;
+
+ const uint nb1 = result_strides[ 1 ];
+ const uint rdi = group.y * nb1 + group.x;
+ result[ rdi ] = res;
+} \ No newline at end of file
diff --git a/ComputeShaders/convolutionMain2Fixed.hlsl b/ComputeShaders/convolutionMain2Fixed.hlsl
new file mode 100644
index 0000000..d21dcbb
--- /dev/null
+++ b/ComputeShaders/convolutionMain2Fixed.hlsl
@@ -0,0 +1,119 @@
+// Optimized version of convolutionMain2.hlsl for kernel size = 3
+// Dispatch [ ( ( ne10 / 2 ) + TILE_Y - 1 ) / TILE_Y, ne02, 1 ] thread groups of this shader
+#ifndef TILE_Y
+static const uint TILE_Y = 8;
+#endif
+#ifndef THREADS
+static const uint THREADS = 64;
+#endif
+
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ uint4 src1_elements: packoffset( c2 );
+ uint4 result_elements: packoffset( c4 );
+ uint4 result_strides: packoffset( c5 );
+}
+
+// The accumulators we're after
+groupshared float resTemp[ TILE_Y ][ THREADS ];
+
+// Multiply + accumulate the specified row
+inline void accumulate( float a0, float a1, const uint resultRow, const uint thread )
+{
+ float acc = resTemp[ resultRow ][ thread ];
+ acc = mad( a0, a1, acc );
+ resTemp[ resultRow ][ thread ] = acc;
+}
+
+inline void convolutionTile( const uint s0, uint s1, const uint thread, const uint stride, const uint height )
+{
+ // Load 3 rows from arg0
+ const float3 a0 = float3( arg0[ s0 ], arg0[ s0 + stride ], arg0[ s0 + stride * 2 ] );
+
+ // Row 0
+ float a1 = arg1[ s1 ];
+ accumulate( a0[ 0 ], a1, 0, thread );
+ s1 += stride;
+
+ for( uint i = 1; i < height; i++ )
+ {
+ // Row i*2-1
+ // Even-indexed rows only contribute to a single output rows, after muiltiplied by kernel row #1
+ a1 = arg1[ s1 ];
+ accumulate( a0[ 1 ], a1, i - 1, thread );
+ s1 += stride;
+
+ // Row i*2, contributes to 2 output rows corresponding to kernel rows #0 and #2
+ a1 = arg1[ s1 ];
+ accumulate( a0[ 2 ], a1, i - 1, thread );
+ accumulate( a0[ 0 ], a1, i, thread );
+ s1 += stride;
+ }
+
+ // Row height*2 - 1
+ a1 = arg1[ s1 ];
+ accumulate( a0[ 1 ], a1, height - 1, thread );
+ s1 += stride;
+
+ // Row height*2
+ a1 = arg1[ s1 ];
+ accumulate( a0[ 2 ], a1, height - 1, thread );
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ uint i;
+ // Zero out the accumulators
+ for( i = 0; i < TILE_Y; i++ )
+ resTemp[ i ][ thread ] = 0.0;
+ GroupMemoryBarrierWithGroupSync();
+
+ const uint i1 = group.y;
+ const uint i0 = group.x * TILE_Y * 2;
+ const uint height = min( TILE_Y, ( src1_elements.x / 2 ) - group.x * TILE_Y );
+
+ const uint ne00 = src0_elements[ 0 ];
+ const uint ne01 = src0_elements[ 1 ];
+ const int ew0 = roundUp32( ne01 );
+
+ uint s0 = i1 * ew0 * ne00;
+ const uint s0End = s0 + ew0;
+ uint s1 = i0 * ew0;
+ s0 += thread;
+ s1 += thread;
+ for( ; s0 < s0End; s0 += THREADS, s1 += THREADS )
+ convolutionTile( s0, s1, thread, ew0, height );
+
+ GroupMemoryBarrierWithGroupSync();
+
+ // Now we need horizontal sums of these shared accumulators, i.e. reduce [height][THREADS] shared array into [height][1] column
+ for( i = THREADS / 2; i > 0; i /= 2 )
+ {
+ if( thread < i )
+ {
+ for( uint j = 0; j < height; j++ )
+ {
+ float sum = resTemp[ j ][ thread ];
+ sum += resTemp[ j ][ thread + i ];
+ resTemp[ j ][ thread ] = sum;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ // And finally, store that column to global memory
+ if( thread >= height )
+ return;
+ const uint nb1 = result_strides[ 1 ];
+ const uint rdi = i1 * nb1 + group.x * TILE_Y + thread;
+ result[ rdi ] = resTemp[ thread ][ 0 ];
+} \ No newline at end of file
diff --git a/ComputeShaders/convolutionPrep1.hlsl b/ComputeShaders/convolutionPrep1.hlsl
new file mode 100644
index 0000000..528ff16
--- /dev/null
+++ b/ComputeShaders/convolutionPrep1.hlsl
@@ -0,0 +1,39 @@
+// ggml_compute_forward_conv_1d_1s_f16_f32, prepare kernel data (src0)
+// Dispatch [ ne01, ne02, 1 ] thread groups
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+}
+
+inline uint roundUp32( uint x )
+{
+ return ( x + 31 ) & ( ~31u );
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint nb01 = src0_strides[ 1 ];
+ const uint nb02 = src0_strides[ 2 ];
+
+ const uint ne00 = src0_elements[ 0 ];
+ const uint ne01 = src0_elements[ 1 ];
+ const uint ew0 = roundUp32( ne01 );
+
+ const uint i02 = group.y;
+ const uint i01 = group.x;
+
+ uint rsi = i02 * nb02 + i01 * nb01;
+ const uint rsiEnd = rsi + ne00;
+ uint rdi = i02 * ew0 * ne00 + i01;
+ rsi += thread;
+ rdi += thread * ew0;
+ const uint rdiInc = 32 * ew0;
+
+ for( ; rsi < rsiEnd; rsi += 32, rdi += rdiInc )
+ result[ rdi ] = arg0[ rsi ];
+} \ No newline at end of file
diff --git a/ComputeShaders/convolutionPrep2.hlsl b/ComputeShaders/convolutionPrep2.hlsl
new file mode 100644
index 0000000..a7e7172
--- /dev/null
+++ b/ComputeShaders/convolutionPrep2.hlsl
@@ -0,0 +1,43 @@
+// ggml_compute_forward_conv_1d_1s_f16_f32, prepare source data (src1)
+// Dispatch [ ne11, 1, 1 ] thread groups
+Buffer<float> arg1: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src1_elements: packoffset( c2 );
+ uint4 src1_strides: packoffset( c3 );
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint i11 = group.x;
+
+ const uint ne00 = src0_elements[ 0 ];
+ const uint ne01 = src0_elements[ 1 ];
+ const uint ne10 = src1_elements[ 0 ];
+ const uint nb11 = src1_strides[ 1 ];
+
+ const uint nk = ne00;
+ const uint nh = nk / 2;
+ const int ew0 = roundUp32( ne01 );
+
+ uint rsi = i11 * nb11;
+ uint rdi = nh * ew0 + i11;
+ const uint rdiInc = ew0 * 32;
+ const uint rsiEnd = rsi + ne10;
+
+ rsi += thread;
+ rdi += thread * ew0;
+
+ for( ; rsi < rsiEnd; rsi += 32, rdi += rdiInc )
+ {
+ float f = arg1[ rsi ];
+ f = adjustFp16( f );
+ result[ rdi ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/copyConvert.hlsl b/ComputeShaders/copyConvert.hlsl
new file mode 100644
index 0000000..399147f
--- /dev/null
+++ b/ComputeShaders/copyConvert.hlsl
@@ -0,0 +1,50 @@
+// ggml_compute_forward_dup_f32 when we only need to convert types, but not reshape the tensor
+// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ bool downcastFp32 : packoffset( c2.x );
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint nb00 = src0_strides[ 0 ];
+ const uint nb01 = src0_strides[ 1 ];
+ const uint nb02 = src0_strides[ 2 ];
+ const uint nb03 = src0_strides[ 3 ];
+
+ const uint ne00 = src0_elements[ 0 ];
+ const uint ne01 = src0_elements[ 1 ];
+ const uint ne02 = src0_elements[ 2 ];
+ const uint ne03 = src0_elements[ 3 ];
+
+ const uint i01 = group.x;
+ const uint i02 = group.y;
+ const uint i03 = group.z;
+
+ const uint rs = ne00 * nb00;
+ //const uint id = i01 + i02 * ne02 + i03 * ne01 * ne02;
+ const uint id = ( i03 * ne01 + i02 ) * ne02 + i01;
+
+ uint rsi = i01 * nb01 + i02 * nb02 + i03 * nb03;
+ uint rdi = id * rs;
+
+ const uint rsiEnd = rsi + rs;
+ rsi += thread;
+ rdi += thread;
+ for( ; rsi < rsiEnd; rsi += 32, rdi += 32 )
+ {
+ float f = arg0[ rsi ];
+ [branch]
+ if( downcastFp32 )
+ f = adjustFp16( f );
+ result[ rdi ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/copyTranspose.hlsl b/ComputeShaders/copyTranspose.hlsl
new file mode 100644
index 0000000..fc3e82f
--- /dev/null
+++ b/ComputeShaders/copyTranspose.hlsl
@@ -0,0 +1,56 @@
+// ggml_compute_forward_dup_f32 when we actually need to reshape the tensor
+// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ bool downcastFp32 : packoffset( c2.x );
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint nb00 = src0_strides[ 0 ];
+ const uint nb01 = src0_strides[ 1 ];
+ const uint nb02 = src0_strides[ 2 ];
+ const uint nb03 = src0_strides[ 3 ];
+
+ const uint ne00 = src0_elements[ 0 ];
+ const uint ne01 = src0_elements[ 1 ];
+ const uint ne02 = src0_elements[ 2 ];
+ const uint ne03 = src0_elements[ 3 ];
+
+ const uint i01 = group.x;
+ const uint i02 = group.y;
+ const uint i03 = group.z;
+
+ // We need following integer: i01*ne00 + i02*ne00*ne01 + i03*ne00*ne01*ne02
+ // We want to minimize count of integer multiplications
+ // Also, DXBC assembly features `imad` instruction which computes a*b+c for integers, the actual hardware hopefully has an equivalent
+ // i03*ne00*ne01*ne02 + i02*ne00*ne01 + i01*ne00
+ // ( i03*ne01*ne02 + i02*ne01 + i01 ) * ne00
+ // ( ( i03*ne02 + i02) * ne01 + i01 ) * ne00
+ uint rdi = ( ( i03 * ne02 + i02 ) * ne01 + i01 ) * ne00;
+
+ const uint rdiEnd = rdi + ne00;
+
+ uint rsi = i01 * nb01 + i02 * nb02 + i03 * nb03;
+ const uint rsiInc = 32 * nb00;
+
+ rdi += thread;
+ rsi += thread * nb00;
+
+ for( ; rdi < rdiEnd; rdi += 32, rsi += rsiInc )
+ {
+ float f = arg0[ rsi ];
+ [branch]
+ if( downcastFp32 )
+ f = adjustFp16( f );
+ result[ rdi ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/diagMaskInf.hlsl b/ComputeShaders/diagMaskInf.hlsl
new file mode 100644
index 0000000..18e3938
--- /dev/null
+++ b/ComputeShaders/diagMaskInf.hlsl
@@ -0,0 +1,30 @@
+// ggml_compute_forward_diag_mask_inf_f32
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 elements: packoffset( c0 );
+ uint4 strides: packoffset( c1 );
+ uint n_past : packoffset( c2.x );
+}
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+[numthreads( 32, 1, 1 )]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint k = group.y;
+ const uint j = group.x;
+
+ // Start of the row
+ uint rdi = k * strides[ 2 ] + j * strides[ 1 ];
+ // End of the row
+ const uint rdiEnd = rdi + elements[ 0 ] * strides[ 0 ];
+ // First index to write in this thread
+ rdi += ( n_past + j + thread + 1 ) * strides[ 0 ];
+ // Index increment
+ const uint rdiInc = 32 * strides[ 0 ];
+
+ for( ; rdi < rdiEnd; rdi += rdiInc )
+ result[ rdi ] = negativeInfinity;
+} \ No newline at end of file
diff --git a/ComputeShaders/flashAttention.hlsl b/ComputeShaders/flashAttention.hlsl
new file mode 100644
index 0000000..65212d0
--- /dev/null
+++ b/ComputeShaders/flashAttention.hlsl
@@ -0,0 +1,170 @@
+// Ported from ggml_compute_forward_flash_attn_f16
+// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups
+
+#include "flashAttentionCommon.hlsli"
+Buffer<uint> lookupTable: register( t3 );
+#include "groupReduce.hlsli"
+
+inline void computeDotProduct( Buffer<float> buff0, Buffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc )
+{
+ acc = 0;
+ const uint s0End = s0 + len;
+ s0 += thread;
+ s1 += thread;
+ for( ; s0 < s0End; s0 += 32, s1 += 32 )
+ acc = mad( buff0[ s0 ], buff1[ s1 ], acc );
+
+ horizontalSum( thread, acc );
+}
+
+inline void computeDotProduct( Buffer<float> buff0, RWBuffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc )
+{
+ acc = 0;
+ const uint s0End = s0 + len;
+ s0 += thread;
+ s1 += thread;
+ for( ; s0 < s0End; s0 += 32, s1 += 32 )
+ acc = mad( buff0[ s0 ], buff1[ s1 ], acc );
+
+ horizontalSum( thread, acc );
+}
+
+void scaleTempVector( uint i, const uint length, const uint thread, const float multiplier, bool round )
+{
+ const uint end = i + length;
+ for( i += thread; i < end; i += 32 )
+ {
+ float f = temp[ i ];
+ f *= multiplier;
+ if( round )
+ f = roundToFp16( f );
+ temp[ i ] = f;
+ }
+}
+
+#include "miscUtils.hlsli"
+
+// Transform temp[ i ] = exp( temp[ i ] - tempMax ), and return the sum of these values
+inline float applySoftMax( uint i, const uint length, const uint thread, const float tempMax )
+{
+ // Transform the values, and compute per-thread sum
+ const uint end = i + length;
+ float sum = 0;
+ for( i += thread; i < end; i += 32 )
+ {
+ float f = temp[ i ];
+ [branch]
+ if( f != negativeInfinity )
+ {
+ f -= tempMax;
+ const uint index = fp16Rounded( f );
+ const uint res16 = lookupTable[ index ];
+ f = f16tof32( res16 );
+ }
+ else
+ f = 0;
+
+ temp[ i ] = f;
+ sum += f;
+ }
+
+ // Reduce per-thread sum to the global one, over all threads of the group
+ horizontalSumBroadcast( thread, sum );
+ return sum;
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint neq0 = q_elements[ 0 ];
+ const uint neq1 = q_elements[ 1 ];
+ const uint neq2 = q_elements[ 2 ];
+ const uint neq3 = q_elements[ 3 ];
+
+ const uint nek0 = k_elements[ 0 ];
+ const uint nek1 = k_elements[ 1 ];
+
+ const uint nev1 = v_elements[ 1 ];
+
+ const uint ne0 = res_elements[ 0 ];
+ const uint ne1 = res_elements[ 1 ];
+
+ const uint nbk0 = k_strides[ 0 ];
+ const uint nbk1 = k_strides[ 1 ];
+ const uint nbk2 = k_strides[ 2 ];
+ const uint nbk3 = k_strides[ 3 ];
+
+ const uint nbq0 = q_strides[ 0 ];
+ const uint nbq1 = q_strides[ 1 ];
+ const uint nbq2 = q_strides[ 2 ];
+ const uint nbq3 = q_strides[ 3 ];
+
+ const uint nbv0 = v_strides[ 0 ];
+ const uint nbv1 = v_strides[ 1 ];
+ const uint nbv2 = v_strides[ 2 ];
+ const uint nbv3 = v_strides[ 3 ];
+
+ const uint nb0 = res_strides[ 0 ];
+ const uint nb1 = res_strides[ 1 ];
+ const uint nb2 = res_strides[ 2 ];
+ const uint nb3 = res_strides[ 3 ];
+
+ const uint D = neq0;
+ const uint N = neq1;
+ const uint P = nek1 - N;
+ const uint M = nek1;
+
+ const uint ir = group.x;
+ const uint iq3 = ir / ( neq2 * neq1 );
+ const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1;
+ const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 );
+
+ const uint tempIndex = ir * tempBufferStride;
+
+ uint ic;
+ float tvm = negativeInfinity;
+ const uint s1 = iq1 * nbq1 + iq2 * nbq2 + iq3 * nbq3;
+ uint s0 = iq2 * nbk2 + iq3 * nbk3;
+ for( ic = 0; ic < nek1; ic++, s0 += nbk1 )
+ {
+ if( masked )
+ {
+ if( ic > P + iq1 )
+ {
+ if( 0 == thread )
+ temp[ tempIndex + ic ] = negativeInfinity;
+ continue;
+ }
+ }
+
+ float dp;
+ computeDotProduct( k, q, s0, s1, neq0, thread, dp );
+ if( 0 == thread )
+ {
+ dp *= scale;
+ temp[ tempIndex + ic ] = dp;
+ tvm = max( tvm, dp );
+ }
+ }
+
+ if( 0 == thread )
+ sharedAccumulators[ 0 ] = tvm;
+ GroupMemoryBarrierWithGroupSync();
+ tvm = sharedAccumulators[ 0 ];
+
+ // Softmax
+ {
+ float sum = applySoftMax( tempIndex, M, thread, tvm );
+ scaleTempVector( tempIndex, M, thread, 1.0 / sum, true );
+ }
+
+ s0 = iq2 * nbv2 + iq3 * nbv3;
+ uint rdi = iq1 * nb1 + iq2 * nb2 + iq3 * nb3;
+ for( ic = 0; ic < nev1; ic++, s0 += nbv1, rdi += nb0 )
+ {
+ float dp;
+ computeDotProduct( v, temp, s0, tempIndex, nek1, thread, dp );
+ if( 0 == thread )
+ result[ rdi ] = dp;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/flashAttentionCommon.hlsli b/ComputeShaders/flashAttentionCommon.hlsli
new file mode 100644
index 0000000..68ed30b
--- /dev/null
+++ b/ComputeShaders/flashAttentionCommon.hlsli
@@ -0,0 +1,67 @@
+// Ported from ggml_compute_forward_flash_attn_f16
+// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups
+Buffer<float> q: register( t0 );
+Buffer<float> k: register( t1 );
+Buffer<float> v: register( t2 );
+
+RWBuffer<float> result: register( u0 );
+// This temporary buffer should fit tempBufferStride * neq1 * neq2 * neq3 elements, FP32 precision
+RWBuffer<float> temp: register( u1 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 q_elements: packoffset( c0 );
+ uint4 q_strides: packoffset( c1 );
+ uint4 k_elements: packoffset( c2 );
+ uint4 k_strides: packoffset( c3 );
+ uint4 v_elements: packoffset( c4 );
+ uint4 v_strides: packoffset( c5 );
+ uint4 res_elements: packoffset( c6 );
+ uint4 res_strides: packoffset( c7 );
+
+ bool masked : packoffset( c8.x );
+ // 1.0 / sqrt( (double) D )
+ float scale : packoffset( c8.y );
+ // This number is required to be >= nek1, and ideally rounded up to either 32 (L2 line) or 128 (L1 line) bytes
+ uint tempBufferStride: packoffset( c8.z );
+}
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+// Convert FP32 number to FP16 using rounding to nearest, then upcast back to FP32
+inline float roundToFp16( const float src )
+{
+ const uint trunc16 = f32tof16( src );
+ const float trunc32 = f16tof32( trunc16 );
+
+ const uint truncExp = ( trunc16 >> 10 ) & 0x1F;
+ if( truncExp != 0x1F )
+ {
+ const uint next16 = trunc16 + 1;
+ const float next32 = f16tof32( next16 );
+
+ const float errTrunc = abs( src - trunc32 );
+ const float errNext = abs( src - next32 );
+
+ if( errTrunc < errNext )
+ {
+ // Truncated was closer to the source
+ return trunc32;
+ }
+ else if( errTrunc > errNext )
+ {
+ // Truncated + 1 was closer to the source
+ return next32;
+ }
+ else
+ {
+ // Exactly half, doing banker's rounding to nearest even
+ return ( 0 == ( trunc16 & 1 ) ) ? trunc32 : next32;
+ }
+ }
+ else
+ {
+ // INF or NAN
+ return trunc32;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/flashAttentionCompat1.hlsl b/ComputeShaders/flashAttentionCompat1.hlsl
new file mode 100644
index 0000000..f1f2ddb
--- /dev/null
+++ b/ComputeShaders/flashAttentionCompat1.hlsl
@@ -0,0 +1,125 @@
+// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups
+#include "flashAttentionCommon.hlsli"
+#include "groupReduce.hlsli"
+
+inline void computeDotProduct( Buffer<float> buff0, Buffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc )
+{
+ acc = 0;
+ /*
+ const uint s0End = s0 + len;
+ s0 += thread;
+ s1 += thread;
+ for( ; s0 < s0End; s0 += 32, s1 += 32 )
+ acc = mad( buff0[ s0 ], buff1[ s1 ], acc );
+ horizontalSumCompatNew( thread, acc );
+ */
+
+ const uint completeVectors = len / 32;
+ uint i;
+ for( i = 0; i < completeVectors; i++, s0 += 32, s1 += 32 )
+ acc = mad( buff0[ s0 + thread ], buff1[ s1 + thread ], acc );
+
+ horizontalSumCompatNew( thread, acc );
+
+ if( 0 == thread )
+ {
+ const uint rem = len % 32;
+ for( i = 0; i < rem; i++ )
+ {
+ precise float a = buff0[ s0 + i ];
+ precise float b = buff1[ s1 + i ];
+ precise float prod = a * b;
+ acc += prod;
+ }
+ }
+}
+
+void scaleTempVector( uint i, const uint length, const uint thread, const float multiplier )
+{
+ const uint end = i + length;
+ for( i += thread; i < end; i += 32 )
+ {
+ float f = temp[ i ];
+ f *= multiplier;
+ temp[ i ] = f;
+ }
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint neq0 = q_elements[ 0 ];
+ const uint neq1 = q_elements[ 1 ];
+ const uint neq2 = q_elements[ 2 ];
+ const uint neq3 = q_elements[ 3 ];
+
+ const uint nek0 = k_elements[ 0 ];
+ const uint nek1 = k_elements[ 1 ];
+
+ const uint nev1 = v_elements[ 1 ];
+
+ const uint ne0 = res_elements[ 0 ];
+ const uint ne1 = res_elements[ 1 ];
+
+ const uint nbk0 = k_strides[ 0 ];
+ const uint nbk1 = k_strides[ 1 ];
+ const uint nbk2 = k_strides[ 2 ];
+ const uint nbk3 = k_strides[ 3 ];
+
+ const uint nbq0 = q_strides[ 0 ];
+ const uint nbq1 = q_strides[ 1 ];
+ const uint nbq2 = q_strides[ 2 ];
+ const uint nbq3 = q_strides[ 3 ];
+
+ const uint nbv0 = v_strides[ 0 ];
+ const uint nbv1 = v_strides[ 1 ];
+ const uint nbv2 = v_strides[ 2 ];
+ const uint nbv3 = v_strides[ 3 ];
+
+ const uint nb0 = res_strides[ 0 ];
+ const uint nb1 = res_strides[ 1 ];
+ const uint nb2 = res_strides[ 2 ];
+ const uint nb3 = res_strides[ 3 ];
+
+ const uint D = neq0;
+ const uint N = neq1;
+ const uint P = nek1 - N;
+ // const uint M = P + N;
+ const uint M = nek1;
+
+ const uint ir = group.x;
+ const uint iq3 = ir / ( neq2 * neq1 );
+ const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1;
+ const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 );
+
+ const uint tempIndex = ir * tempBufferStride;
+
+ uint ic;
+ for( ic = 0; ic < nek1; ic++ )
+ {
+ // k indices
+ const uint ik3 = iq3;
+ const uint ik2 = iq2;
+ const uint ik1 = ic;
+
+ // S indices
+ const uint i1 = ik1;
+
+ if( masked )
+ {
+ if( ic > P + iq1 )
+ {
+ if( 0 == thread )
+ temp[ tempIndex + ic ] = negativeInfinity;
+ continue;
+ }
+ }
+
+ const uint s0 = ik1 * nbk1 + ik2 * nbk2 + ik3 * nbk3;
+ const uint s1 = iq1 * nbq1 + iq2 * nbq2 + iq3 * nbq3;
+ float dp;
+ computeDotProduct( k, q, s0, s1, neq0, thread, dp );
+ if( 0 == thread )
+ temp[ tempIndex + ic ] = dp * scale;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/flashAttentionCompat2.hlsl b/ComputeShaders/flashAttentionCompat2.hlsl
new file mode 100644
index 0000000..73f5fce
--- /dev/null
+++ b/ComputeShaders/flashAttentionCompat2.hlsl
@@ -0,0 +1,114 @@
+// Dispatch with [ ( neq1*neq2*neq3 + 31 ) / 32, 1, 1 ] thread groups
+#include "flashAttentionCommon.hlsli"
+Buffer<uint> lookupTable: register( t3 );
+
+void scaleTempVector( uint i, const uint length, const float multiplier )
+{
+ const uint end = i + length;
+ for( ; i < end; i++ )
+ {
+ float f = temp[ i ];
+ f *= multiplier;
+ // Rounding in this shader causes numerical errors on my GeForce 1080 Ti GPU, driver 527.56
+ // f = roundToFp16( f );
+ temp[ i ] = f;
+ }
+}
+
+inline float computeTempVectorMax( uint i, const uint length )
+{
+ // Compute per-thread maximum
+ const uint end = i + length;
+ float ax = negativeInfinity;
+ for( ; i < end; i++ )
+ ax = max( ax, temp[ i ] );
+ return ax;
+}
+
+#include "miscUtils.hlsli"
+#include "fp64Utils.hlsli"
+
+// Transform temp[ i ] = exp( temp[ i ] - tempMax ), and return the sum of these values
+inline double applySoftMax( uint i, const uint length, const float tempMax )
+{
+ // Transform the values, and compute per-thread sum
+ const uint end = i + length;
+ double sum = 0;
+ for( ; i < end; i++ )
+ {
+ float f = temp[ i ];
+ [branch]
+ if( f != negativeInfinity )
+ {
+ f -= tempMax;
+ const uint index = fp16Rounded( f );
+ const uint res16 = lookupTable[ index ];
+ f = f16tof32( res16 );
+ sum += f;
+ }
+ else
+ f = 0;
+
+ temp[ i ] = f;
+ }
+ return sum;
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 dtid: SV_DispatchThreadID )
+{
+ const uint neq0 = q_elements[ 0 ];
+ const uint neq1 = q_elements[ 1 ];
+ const uint neq2 = q_elements[ 2 ];
+ const uint neq3 = q_elements[ 3 ];
+
+ const uint nek0 = k_elements[ 0 ];
+ const uint nek1 = k_elements[ 1 ];
+
+ const uint nev1 = v_elements[ 1 ];
+
+ const uint ne0 = res_elements[ 0 ];
+ const uint ne1 = res_elements[ 1 ];
+
+ const uint nbk0 = k_strides[ 0 ];
+ const uint nbk1 = k_strides[ 1 ];
+ const uint nbk2 = k_strides[ 2 ];
+ const uint nbk3 = k_strides[ 3 ];
+
+ const uint nbq0 = q_strides[ 0 ];
+ const uint nbq1 = q_strides[ 1 ];
+ const uint nbq2 = q_strides[ 2 ];
+ const uint nbq3 = q_strides[ 3 ];
+
+ const uint nbv0 = v_strides[ 0 ];
+ const uint nbv1 = v_strides[ 1 ];
+ const uint nbv2 = v_strides[ 2 ];
+ const uint nbv3 = v_strides[ 3 ];
+
+ const uint nb0 = res_strides[ 0 ];
+ const uint nb1 = res_strides[ 1 ];
+ const uint nb2 = res_strides[ 2 ];
+ const uint nb3 = res_strides[ 3 ];
+
+ const uint D = neq0;
+ const uint N = neq1;
+ const uint P = nek1 - N;
+ // const uint M = P + N;
+ const uint M = nek1;
+
+ const uint ir = dtid.x;
+ if( ir >= neq1 * neq2 * neq3 )
+ return;
+
+ const uint iq3 = ir / ( neq2 * neq1 );
+ const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1;
+ const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 );
+
+ const uint tempIndex = ir * tempBufferStride;
+
+ // Softmax
+ float tvm = computeTempVectorMax( tempIndex, M );
+ double sum = applySoftMax( tempIndex, M, tvm );
+
+ scaleTempVector( tempIndex, M, (float)( 1.0 / sum ) );
+} \ No newline at end of file
diff --git a/ComputeShaders/flashAttentionCompat3.hlsl b/ComputeShaders/flashAttentionCompat3.hlsl
new file mode 100644
index 0000000..e0a4061
--- /dev/null
+++ b/ComputeShaders/flashAttentionCompat3.hlsl
@@ -0,0 +1,118 @@
+// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups
+#include "flashAttentionCommon.hlsli"
+#include "groupReduce.hlsli"
+#include "miscUtils.hlsli"
+
+inline void roundTempVector( uint i, const uint len, const uint thread )
+{
+ const uint iEnd = i + len;
+ for( i += thread; i < iEnd; i += 32 )
+ {
+ float f = temp[ i ];
+ f = roundToFp16( f );
+ temp[ i ] = f;
+ }
+}
+
+inline void computeDotProduct( Buffer<float> buff0, RWBuffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc )
+{
+ acc = 0;
+/* const uint s0End = s0 + len;
+ s0 += thread;
+ s1 += thread;
+ for( ; s0 < s0End; s0 += 32, s1 += 32 )
+ acc = mad( buff0[ s0 ], buff1[ s1 ], acc );
+
+ horizontalSumCompatNew( thread, acc ); */
+ const uint completeVectors = len / 32;
+ uint i;
+ for( i = 0; i < completeVectors; i++, s0 += 32, s1 += 32 )
+ acc = mad( buff0[ s0 + thread ], buff1[ s1 + thread ], acc );
+
+ horizontalSumCompatNew( thread, acc );
+
+ if( 0 == thread )
+ {
+ const uint rem = len % 32;
+ if( 0 != rem )
+ {
+ double f64 = acc;
+ for( i = 0; i < rem; i++ )
+ {
+ precise float a = buff0[ s0 + i ];
+ precise float b = buff1[ s1 + i ];
+ precise float prod = a * b;
+ f64 += prod;
+ }
+ acc = (float)f64;
+ }
+ }
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint neq0 = q_elements[ 0 ];
+ const uint neq1 = q_elements[ 1 ];
+ const uint neq2 = q_elements[ 2 ];
+ const uint neq3 = q_elements[ 3 ];
+
+ const uint nek0 = k_elements[ 0 ];
+ const uint nek1 = k_elements[ 1 ];
+
+ const uint nev1 = v_elements[ 1 ];
+
+ const uint ne0 = res_elements[ 0 ];
+ const uint ne1 = res_elements[ 1 ];
+
+ const uint nbk0 = k_strides[ 0 ];
+ const uint nbk1 = k_strides[ 1 ];
+ const uint nbk2 = k_strides[ 2 ];
+ const uint nbk3 = k_strides[ 3 ];
+
+ const uint nbq0 = q_strides[ 0 ];
+ const uint nbq1 = q_strides[ 1 ];
+ const uint nbq2 = q_strides[ 2 ];
+ const uint nbq3 = q_strides[ 3 ];
+
+ const uint nbv0 = v_strides[ 0 ];
+ const uint nbv1 = v_strides[ 1 ];
+ const uint nbv2 = v_strides[ 2 ];
+ const uint nbv3 = v_strides[ 3 ];
+
+ const uint nb0 = res_strides[ 0 ];
+ const uint nb1 = res_strides[ 1 ];
+ const uint nb2 = res_strides[ 2 ];
+ const uint nb3 = res_strides[ 3 ];
+
+ const uint D = neq0;
+ const uint N = neq1;
+ const uint P = nek1 - N;
+ // const uint M = P + N;
+ const uint M = nek1;
+
+ const uint ir = group.x;
+ const uint iq3 = ir / ( neq2 * neq1 );
+ const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1;
+ const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 );
+
+ const uint tempIndex = ir * tempBufferStride;
+
+ roundTempVector( tempIndex, nek1, thread );
+ AllMemoryBarrierWithGroupSync();
+
+ uint rdi = iq1 * nb1 + iq2 * nb2 + iq3 * nb3;
+ for( uint ic = 0; ic < nev1; ic++, rdi += nb0 )
+ {
+ // dst indices
+ const uint i1 = iq1;
+ const uint i2 = iq2;
+ const uint i3 = iq3;
+
+ const uint s0 = ic * nbv1 + i2 * nbv2 + i3 * nbv3;
+ float dp;
+ computeDotProduct( v, temp, s0, tempIndex, nek1, thread, dp );
+ if( 0 == thread )
+ result[ rdi ] = dp;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/fmaRepeat1.hlsl b/ComputeShaders/fmaRepeat1.hlsl
new file mode 100644
index 0000000..3db3827
--- /dev/null
+++ b/ComputeShaders/fmaRepeat1.hlsl
@@ -0,0 +1,77 @@
+// Implementation of fmaRepeat() when both source arguments have same size and strides
+// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor
+RWBuffer<float> tensor: register( u0 );
+Buffer<float> patternMul: register( t0 );
+Buffer<float> patternAdd: register( t1 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 tensorSize: packoffset( c0 );
+ uint4 tensorStrides: packoffset( c1 );
+ uint4 patternSize: packoffset( c2 );
+ uint4 patternStrides: packoffset( c3 );
+}
+
+#ifndef THREADS
+#define THREADS 512
+#endif
+
+#include "repeatUtils.hlsli"
+
+inline void computeSimple( uint idx, float mul, float add )
+{
+ precise float f = tensor[ idx ];
+ f *= mul;
+ f += add;
+ tensor[ idx ] = f;
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides );
+ uint rsi = rowOffset( group % patternSize.yzw, patternStrides );
+
+ if( patternSize[ 0 ] == 1 )
+ {
+ // The pattern only has 1 column - broadcasting over the row
+ const float pMul = patternMul[ rsi ];
+ const float pAdd = patternAdd[ rsi ];
+ ROW_LOOP( it )
+ computeSimple( it.x, pMul, pAdd );
+ }
+ else if( patternSize[ 0 ] <= THREADS )
+ {
+ // pattern size doesn't exceed thread group size: load pattern value outside of the loop
+ const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] );
+ if( thread >= threadsPerGroup )
+ return;
+
+ rsi += ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ];
+ const float pMul = patternMul[ rsi ];
+ const float pAdd = patternAdd[ rsi ];
+ ROW_LOOP_EX( it, threadsPerGroup, tensorStrides )
+ computeSimple( it.x, pMul, pAdd );
+ }
+ else
+ {
+ // Pattern rows are larger than the thread group, need to stream from both buffers
+ const uint rsiInc = THREADS * patternStrides[ 0 ];
+ const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ];
+ const uint rsiEnd = rsi + rsiDec;
+ rsi += thread * patternStrides[ 0 ];
+
+ ROW_LOOP( it )
+ {
+ precise float f = tensor[ it.x ];
+ float mul = patternMul[ rsi ];
+ float add = patternAdd[ rsi ];
+ rsi += rsiInc;
+ if( rsi >= rsiEnd )
+ rsi -= rsiDec;
+ f *= mul;
+ f += add;
+ tensor[ it.x ] = f;
+ }
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/fmaRepeat164.hlsl b/ComputeShaders/fmaRepeat164.hlsl
new file mode 100644
index 0000000..99813f6
--- /dev/null
+++ b/ComputeShaders/fmaRepeat164.hlsl
@@ -0,0 +1,2 @@
+#define THREADS 64
+#include "fmaRepeat1.hlsl" \ No newline at end of file
diff --git a/ComputeShaders/fmaRepeat2.hlsl b/ComputeShaders/fmaRepeat2.hlsl
new file mode 100644
index 0000000..edadf0a
--- /dev/null
+++ b/ComputeShaders/fmaRepeat2.hlsl
@@ -0,0 +1,45 @@
+// Implementation of fmaRepeat() when source arguments have different shape or VRAM layout
+// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor
+RWBuffer<float> tensor: register( u0 );
+Buffer<float> patternMul: register( t0 );
+Buffer<float> patternAdd: register( t1 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 tensorSize: packoffset( c0 );
+ uint4 tensorStrides: packoffset( c1 );
+ uint4 patternSizeMul: packoffset( c2 );
+ uint4 patternStridesMul: packoffset( c3 );
+ uint4 patternSizeAdd: packoffset( c4 );
+ uint4 patternStridesAdd: packoffset( c5 );
+}
+
+#ifndef THREADS
+#define THREADS 32
+#endif
+
+#include "repeatUtils.hlsli"
+
+inline float loadPattern( Buffer<float> buffer, uint rowStart, uint i, uint4 size, uint4 stride )
+{
+ i %= size.x;
+ return buffer[ i * stride.x + rowStart ];
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides );
+ const uint rsiMul = rowOffset( group % patternSizeMul.yzw, patternStridesMul );
+ const uint rsiAdd = rowOffset( group % patternSizeAdd.yzw, patternStridesAdd );
+
+ for( uint i = thread; it.x < it.z; it.x += it.y, i++ )
+ {
+ precise float f = tensor[ it.x ];
+ float mul = loadPattern( patternMul, rsiMul, i, patternSizeMul, patternStridesMul );
+ float add = loadPattern( patternAdd, rsiAdd, i, patternSizeAdd, patternStridesAdd );
+ f *= mul;
+ f += add;
+ tensor[ it.x ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/fp64Utils.hlsli b/ComputeShaders/fp64Utils.hlsli
new file mode 100644
index 0000000..9782718
--- /dev/null
+++ b/ComputeShaders/fp64Utils.hlsli
@@ -0,0 +1,28 @@
+// TODO: compile another version of these shader, and use it on GPUs with ExtendedDoublesShaderInstructions flag, will become slightly faster
+// https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ns-d3d11-d3d11_feature_data_d3d11_options
+#ifndef ExtendedDoublesShaderInstructions
+#define ExtendedDoublesShaderInstructions 0
+#endif
+
+// Compute num/den in FP64 precision
+inline double div64( double num, double den )
+{
+#if ExtendedDoublesShaderInstructions
+ return num / den;
+#else
+ // https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
+ double x = 1.0f / (float)den;
+ x += x * ( 1.0 - den * x );
+ x += x * ( 1.0 - den * x );
+ return num * x;
+#endif
+}
+
+// Compute sqrt(x) in FP64 precision
+inline double sqrt64( double x )
+{
+ double root = sqrt( (float)x );
+ root = 0.5 * ( root + div64( x, root ) );
+ root = 0.5 * ( root + div64( x, root ) );
+ return root;
+} \ No newline at end of file
diff --git a/ComputeShaders/groupReduce.hlsli b/ComputeShaders/groupReduce.hlsli
new file mode 100644
index 0000000..1ffe1d8
--- /dev/null
+++ b/ComputeShaders/groupReduce.hlsli
@@ -0,0 +1,139 @@
+groupshared float sharedAccumulators[ 32 ];
+
+// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group.
+void horizontalSum( const uint thread, inout float sum )
+{
+ sharedAccumulators[ thread ] = sum;
+ for( uint i = 16; i > 1; i /= 2 )
+ {
+ GroupMemoryBarrierWithGroupSync();
+ if( thread < i )
+ {
+ sum += sharedAccumulators[ thread + i ];
+ sharedAccumulators[ thread ] = sum;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ if( 0 == thread )
+ sum += sharedAccumulators[ 1 ];
+}
+
+// Compute horisontal sum of the numbers, and broadcast to all threads of the group.
+void horizontalSumBroadcast( const uint thread, inout float sum )
+{
+ horizontalSum( thread, sum );
+ if( 0 == thread )
+ sharedAccumulators[ 0 ] = sum;
+ GroupMemoryBarrierWithGroupSync();
+ sum = sharedAccumulators[ 0 ];
+}
+
+// Compute horisontal sum of the numbers, in the order equal to the CPU-running dot product implementation.
+// The result is only correct on the thread #0 of the group.
+void horizontalSumCompat( const uint thread, inout float sum )
+{
+ sharedAccumulators[ thread ] = sum;
+ GroupMemoryBarrierWithGroupSync();
+
+ if( 0 == ( thread & 8 ) )
+ {
+ // This runs on threads [ 0 .. 7 ] and [ 16 .. 23 ]
+ // sum01 = _mm256_add_ps( sum0, sum1 );
+ // sum23 = _mm256_add_ps( sum2, sum3 );
+ sum += sharedAccumulators[ thread + 8 ];
+ sharedAccumulators[ thread ] = sum;
+ }
+
+ GroupMemoryBarrierWithGroupSync();
+ if( thread < 8 )
+ {
+ // This runs on threads [ 0 .. 7 ]
+ // sum0123 = _mm256_add_ps( sum01, sum23 );
+ sum += sharedAccumulators[ thread + 16 ];
+ sharedAccumulators[ thread ] = sum;
+ }
+
+ GroupMemoryBarrierWithGroupSync();
+ if( thread < 4 )
+ {
+ // const __m128 r4 = _mm_add_ps( _mm256_castps256_ps128( sum0123 ), _mm256_extractf128_ps( sum0123, 1 ) );
+ sum += sharedAccumulators[ thread + 4 ];
+ sharedAccumulators[ thread ] = sum;
+ }
+
+ GroupMemoryBarrierWithGroupSync();
+ if( thread < 2 )
+ {
+ // const __m128 r2 = _mm_add_ps( r4, _mm_movehl_ps( r4, r4 ) );
+ sum += sharedAccumulators[ thread + 2 ];
+ sharedAccumulators[ thread ] = sum;
+ }
+
+ GroupMemoryBarrierWithGroupSync();
+ if( 0 == thread )
+ {
+ // const __m128 r1 = _mm_add_ss( r2, _mm_movehdup_ps( r2 ) );
+ sum += sharedAccumulators[ 1 ];
+ }
+}
+
+// Compute horisontal sum of the numbers, in yet another creative summation order recently implemented in the upstream
+void horizontalSumCompatNew( const uint thread, inout float sum )
+{
+ // GGML_F32x8_REDUCE
+ sharedAccumulators[ thread ] = sum;
+ GroupMemoryBarrierWithGroupSync();
+
+ if( 0 == ( thread & 8 ) )
+ {
+ // Runs on threads [ 0 .. 7 ] and [ 16 .. 23 ]
+ sum += sharedAccumulators[ thread | 8 ];
+ sharedAccumulators[ thread ] = sum;
+ }
+ GroupMemoryBarrierWithGroupSync();
+
+ if( thread < 8 )
+ {
+ // Runs on threads [ 0 .. 7 ]
+ sum += sharedAccumulators[ thread | 0x10 ];
+ sharedAccumulators[ thread ] = sum;
+ }
+ GroupMemoryBarrierWithGroupSync();
+
+ if( thread < 4 )
+ {
+ // Runs on threads [ 0 .. 3 ]
+ sum += sharedAccumulators[ thread | 4 ];
+ sharedAccumulators[ thread ] = sum;
+ }
+ GroupMemoryBarrierWithGroupSync();
+
+ if( thread < 4 && 0 == ( thread & 1 ) )
+ {
+ // Runs on threads [ 0, 2 ]
+ sum += sharedAccumulators[ thread | 1 ];
+ sharedAccumulators[ thread ] = sum;
+ }
+ GroupMemoryBarrierWithGroupSync();
+
+ if( 0 == thread )
+ sum += sharedAccumulators[ 2 ];
+}
+
+
+// Compute horizontal maximum of the numbers, and broadcast to all threads of the group.
+void horizontalMaxBroadcast( const uint thread, inout float ax )
+{
+ sharedAccumulators[ thread ] = ax;
+ for( uint i = 16; i > 0; i /= 2 )
+ {
+ GroupMemoryBarrierWithGroupSync();
+ if( thread < i )
+ {
+ ax = max( ax, sharedAccumulators[ thread + i ] );
+ sharedAccumulators[ thread ] = ax;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ ax = sharedAccumulators[ 0 ];
+} \ No newline at end of file
diff --git a/ComputeShaders/groupReduce64.hlsli b/ComputeShaders/groupReduce64.hlsli
new file mode 100644
index 0000000..7094d03
--- /dev/null
+++ b/ComputeShaders/groupReduce64.hlsli
@@ -0,0 +1,46 @@
+groupshared float sharedAccumulators[ 64 ];
+
+// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group.
+void horizontalSum( const uint thread, inout float sum )
+{
+ sharedAccumulators[ thread ] = sum;
+ for( uint i = 32; i > 1; i /= 2 )
+ {
+ GroupMemoryBarrierWithGroupSync();
+ if( thread < i )
+ {
+ sum += sharedAccumulators[ thread + i ];
+ sharedAccumulators[ thread ] = sum;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ if( 0 == thread )
+ sum += sharedAccumulators[ 1 ];
+}
+
+// Compute horisontal sum of the numbers, and broadcast to all threads of the group.
+void horizontalSumBroadcast( const uint thread, inout float sum )
+{
+ horizontalSum( thread, sum );
+ if( 0 == thread )
+ sharedAccumulators[ 0 ] = sum;
+ GroupMemoryBarrierWithGroupSync();
+ sum = sharedAccumulators[ 0 ];
+}
+
+// Compute horizontal maximum of the numbers, and broadcast to all threads of the group.
+void horizontalMaxBroadcast( const uint thread, inout float ax )
+{
+ sharedAccumulators[ thread ] = ax;
+ for( uint i = 32; i > 0; i /= 2 )
+ {
+ GroupMemoryBarrierWithGroupSync();
+ if( thread < i )
+ {
+ ax = max( ax, sharedAccumulators[ thread + i ] );
+ sharedAccumulators[ thread ] = ax;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ ax = sharedAccumulators[ 0 ];
+} \ No newline at end of file
diff --git a/ComputeShaders/matReshapePanels.hlsl b/ComputeShaders/matReshapePanels.hlsl
new file mode 100644
index 0000000..f26f246
--- /dev/null
+++ b/ComputeShaders/matReshapePanels.hlsl
@@ -0,0 +1,105 @@
+// This shader reshapes a matrix into the shape expected by mulMatTiledEx.hlsl and mulMatByRowTiledEx.hlsl compute shaders
+// It's called in runtime, also while loading models from disk.
+// So far, it's only used when running on AMD GPUs.
+#ifndef TILE_SIZE
+static const uint TILE_SIZE = 32;
+#endif
+
+// Input tensor
+Buffer<float> source: register( t0 );
+// Output tensor
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 arg0Size: packoffset( c0 );
+ uint4 arg0Strides: packoffset( c1 );
+ // Count of elements per panel
+ uint panelSize : packoffset( c2.y );
+ // Layer strides of the output matrix
+ uint2 layerStrides: packoffset( c2.z );
+}
+
+inline uint hadd( uint2 v2 ) { return v2.x + v2.y; }
+
+groupshared float tileBuffer[ TILE_SIZE ][ TILE_SIZE ];
+
+[ numthreads( TILE_SIZE, 1, 1 ) ]
+void main( const uint3 group: SV_GroupID, const uint thread : SV_GroupIndex )
+{
+ uint rdi = hadd( group.yz * layerStrides );
+ rdi += group.x * panelSize;
+ rdi += thread;
+
+ uint rsi = hadd( group.yz * arg0Strides.zw );
+ const uint baseY = group.x * TILE_SIZE;
+ const uint dispatchThread = baseY + thread;
+ // Reshaping into a column major horizontal panel, height = TILE_SIZE, width = width of the source matrix
+ uint width = arg0Size.x;
+ // Usually TILE_SIZE; can be less for the last panel on the matrix when we need to generate zeros instead of loading these numbers
+ const uint height = min( TILE_SIZE, arg0Size.y - baseY );
+
+ if( arg0Strides.x == 1 )
+ {
+ // The input matrix is row major, can improve performance with coalesced loads and group shared buffer.
+ rsi += baseY * arg0Strides.y;
+
+ const uint widthCompleteTiles = width / TILE_SIZE;
+
+ if( height < TILE_SIZE )
+ {
+ // This thread group was dispatched for the last panel of the matrix, it doesn't have enough rows
+ // Write zeros to the corresponding elements of the groupshared buffer
+ for( uint j = height; j < TILE_SIZE; j++ )
+ tileBuffer[ thread ][ j ] = 0.0;
+ }
+
+ for( uint i = 0; i < widthCompleteTiles; i++, rsi += TILE_SIZE )
+ {
+ // Load [ TILE_SIZE ] * [ TILE_SIZE ] block with fully coalesced loads, store to group shared buffer in transposed order
+ uint rsiTile = rsi + thread;
+ uint j;
+ for( j = 0; j < height; j++, rsiTile += arg0Strides.y )
+ {
+ // Each iteration of the loop loads a row of [ TILE_SIZE ] elements from the corresponding row of the source tensor
+ // Fully coalesced load
+ float f = source[ rsiTile ];
+ // Random store but the local memory's fast, this works rather well in practice
+ tileBuffer[ thread ][ j ] = f;
+ }
+
+ GroupMemoryBarrierWithGroupSync();
+
+ // Copy from group shared buffer to output tensor
+ for( j = 0; j < TILE_SIZE; j++, rdi += TILE_SIZE )
+ {
+ // Fully coalesced loads and stores
+ float f = tileBuffer[ j ][ thread ];
+ result[ rdi ] = f;
+ }
+
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ width %= TILE_SIZE;
+ if( 0 == width )
+ return;
+ rsi += thread * arg0Strides.y;
+ }
+ else
+ rsi += dispatchThread * arg0Strides.y;
+
+ for( uint i = 0; i < width; i++ )
+ {
+ float f;
+ [branch]
+ if( thread < height )
+ f = source[ rsi ];
+ else
+ f = 0.0;
+ rsi += arg0Strides.x;
+
+ result[ rdi ] = f;
+ rdi += TILE_SIZE;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/miscUtils.hlsli b/ComputeShaders/miscUtils.hlsli
new file mode 100644
index 0000000..b957a06
--- /dev/null
+++ b/ComputeShaders/miscUtils.hlsli
@@ -0,0 +1,84 @@
+// When GPUs are converting FP32 to FP16, they always truncate towards 0, documented there:
+// https://learn.microsoft.com/en-us/windows/win32/direct3d10/d3d10-graphics-programming-guide-resources-data-conversion#conververting-from-a-higher-range-representation-to-a-lower-range-representation
+// Whisper code uses _mm_cvtps_ph( x, 0 ), the 0 stands for "Round to nearest even": https://www.felixcloutier.com/x86/vcvtps2ph
+// This function adjusts FP32 value making it so that truncation towards 0 results in the value equal to what CPU is doing
+inline float adjustFp16( const float src )
+{
+ const uint trunc16 = f32tof16( src );
+ const float trunc32 = f16tof32( trunc16 );
+
+ const uint truncExp = ( trunc16 >> 10 ) & 0x1F;
+ if( truncExp != 0x1F )
+ {
+ const uint next16 = trunc16 + 1;
+ const float next32 = f16tof32( next16 );
+
+ const float errTrunc = abs( src - trunc32 );
+ const float errNext = abs( src - next32 );
+
+ if( errTrunc < errNext )
+ {
+ // Truncated was closer to the source
+ return src;
+ }
+ else if( errTrunc > errNext )
+ {
+ // Truncated + 1 was closer to the source
+ return next32;
+ }
+ else
+ {
+ // Exactly half, doing banker's rounding to nearest even
+ return ( 0 == ( trunc16 & 1 ) ) ? src : next32;
+ }
+ }
+ else
+ {
+ // INF or NAN
+ return src;
+ }
+}
+
+// Convert FP32 number to FP16, using rounding to nearest
+inline uint fp16Rounded( const float src )
+{
+ const uint trunc16 = f32tof16( src );
+ const float trunc32 = f16tof32( trunc16 );
+
+ const uint truncExp = ( trunc16 >> 10 ) & 0x1F;
+ if( truncExp != 0x1F )
+ {
+ const uint next16 = trunc16 + 1;
+ const float next32 = f16tof32( next16 );
+
+ const float errTrunc = abs( src - trunc32 );
+ const float errNext = abs( src - next32 );
+
+ if( errTrunc < errNext )
+ {
+ // Truncated was closer to the source
+ return trunc16;
+ }
+ else if( errTrunc > errNext )
+ {
+ // Truncated + 1 was closer to the source
+ return next16;
+ }
+ else
+ {
+ // Exactly half, doing banker's rounding to nearest even
+ return ( 0 == ( trunc16 & 1 ) ) ? trunc16 : next16;
+ }
+ }
+ else
+ {
+ // INF or NAN
+ return trunc16;
+ }
+}
+
+// Round up the number to be a multiple of 32
+inline uint roundUp32( uint x )
+{
+ return ( x + 31 ) & ( ~31u );
+} \ No newline at end of file
diff --git a/ComputeShaders/mulMatByRow.hlsl b/ComputeShaders/mulMatByRow.hlsl
new file mode 100644
index 0000000..565cfdc
--- /dev/null
+++ b/ComputeShaders/mulMatByRow.hlsl
@@ -0,0 +1,49 @@
+// Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ]
+// Dispatch [ E1, E2, E3 ] groups of this shader
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 arg0Size: packoffset( c0 );
+ uint4 arg0Strides: packoffset( c1 );
+ uint4 arg1Size: packoffset( c2 );
+ uint4 arg1Strides: packoffset( c3 );
+ uint4 resultSize: packoffset( c4 );
+ uint4 resultStrides: packoffset( c5 );
+}
+
+#include "groupReduce.hlsli"
+
+inline uint hadd( uint3 vec )
+{
+ return vec.x + vec.y + vec.z;
+}
+inline uint hadd( uint2 vec )
+{
+ return vec.x + vec.y;
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ uint s0 = hadd( group * arg0Strides.yzw );
+ uint s1 = hadd( group.yz * arg1Strides.zw );
+ const uint s0End = s0 + arg0Size.x * arg0Strides.x;
+ const uint s0Inc = 32 * arg0Strides.x;
+ const uint s1Inc = 32 * arg1Strides.x;
+
+ s0 += thread * arg0Strides.x;
+ s1 += thread * arg1Strides.x;
+ float dp = 0;
+ for( ; s0 < s0End; s0 += s0Inc, s1 += s1Inc )
+ dp = mad( arg0[ s0 ], arg1[ s1 ], dp );
+
+ horizontalSum( thread, dp );
+ if( 0 != thread )
+ return;
+
+ const uint rdi = group.x + hadd( group.yz * resultStrides.zw );
+ result[ rdi ] = dp;
+} \ No newline at end of file
diff --git a/ComputeShaders/mulMatByRow64.hlsl b/ComputeShaders/mulMatByRow64.hlsl
new file mode 100644
index 0000000..db5f801
--- /dev/null
+++ b/ComputeShaders/mulMatByRow64.hlsl
@@ -0,0 +1,90 @@
+// Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ]
+// Dispatch [ E1, E2, E3 ] groups of this shader
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 arg0Size: packoffset( c0 );
+ uint4 arg0Strides: packoffset( c1 );
+ uint4 arg1Size: packoffset( c2 );
+ uint4 arg1Strides: packoffset( c3 );
+ uint4 resultSize: packoffset( c4 );
+ uint4 resultStrides: packoffset( c5 );
+}
+
+inline uint hadd( uint3 vec )
+{
+ return vec.x + vec.y + vec.z;
+}
+inline uint hadd( uint2 vec )
+{
+ return vec.x + vec.y;
+}
+
+// No idea why, but that particular configuration appears to be the fastest one on Ryzen 7 5700G iGPU
+// Not by much, though: when trying a few numbers I saw 1.30 - 1.42 seconds for this compute shader
+static const uint THREADS = 64;
+static const uint REDUCTION_BUFFER = 32;
+groupshared float sharedAccumulators[ REDUCTION_BUFFER ];
+
+// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group.
+void horizontalSum( const uint thread, inout float sum )
+{
+ if( THREADS > REDUCTION_BUFFER )
+ {
+ for( uint t = REDUCTION_BUFFER; t < THREADS; t += REDUCTION_BUFFER )
+ {
+ // Threads [ t .. t + REDUCTION_BUFFER ] store into the buffer
+ if( thread >= t && thread < t + REDUCTION_BUFFER )
+ sharedAccumulators[ thread - t ] = sum;
+
+ GroupMemoryBarrierWithGroupSync();
+
+ // Threads [ 0 .. REDUCTION_BUFFER ] increment their local sum with the value loaded from the buffer
+ if( thread < REDUCTION_BUFFER )
+ sum += sharedAccumulators[ thread ];
+ }
+ }
+
+ if( thread < REDUCTION_BUFFER )
+ sharedAccumulators[ thread ] = sum;
+
+ for( uint i = REDUCTION_BUFFER / 2; i > 1; i /= 2 )
+ {
+ GroupMemoryBarrierWithGroupSync();
+ if( thread < i )
+ {
+ sum += sharedAccumulators[ thread + i ];
+ sharedAccumulators[ thread ] = sum;
+ }
+ }
+
+ GroupMemoryBarrierWithGroupSync();
+ if( 0 == thread )
+ sum += sharedAccumulators[ 1 ];
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ uint s0 = hadd( group * arg0Strides.yzw );
+ uint s1 = hadd( group.yz * arg1Strides.zw );
+ const uint s0End = s0 + arg0Size.x * arg0Strides.x;
+ const uint s0Inc = THREADS * arg0Strides.x;
+ const uint s1Inc = THREADS * arg1Strides.x;
+
+ s0 += thread * arg0Strides.x;
+ s1 += thread * arg1Strides.x;
+ float dp = 0;
+ for( ; s0 < s0End; s0 += s0Inc, s1 += s1Inc )
+ dp = mad( arg0[ s0 ], arg1[ s1 ], dp );
+
+ horizontalSum( thread, dp );
+ if( 0 != thread )
+ return;
+
+ const uint rdi = group.x + hadd( group.yz * resultStrides.zw );
+ result[ rdi ] = dp;
+} \ No newline at end of file
diff --git a/ComputeShaders/mulMatByRowTiled.hlsl b/ComputeShaders/mulMatByRowTiled.hlsl
new file mode 100644
index 0000000..fea2fcb
--- /dev/null
+++ b/ComputeShaders/mulMatByRowTiled.hlsl
@@ -0,0 +1,120 @@
+// Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ]
+// Dispatch [ ( E1 + TILE_Y - 1 ) / TILE_Y, E2, E3 ] thread groups of this shader
+
+#ifndef TILE_Y
+static const uint TILE_Y = 64;
+#endif
+#ifndef THREADS_X
+static const uint THREADS_X = 32;
+#endif
+#ifndef THREADS_Y
+static const uint THREADS_Y = 16;
+#endif
+
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 arg0Size: packoffset( c0 );
+ uint4 arg0Strides: packoffset( c1 );
+ uint4 arg1Size: packoffset( c2 );
+ uint4 arg1Strides: packoffset( c3 );
+ uint4 resultSize: packoffset( c4 );
+ uint4 resultStrides: packoffset( c5 );
+}
+
+groupshared float resTemp[ TILE_Y ][ THREADS_X ];
+
+inline uint hadd( uint2 vec )
+{
+ return vec.x + vec.y;
+}
+
+[ numthreads( THREADS_X, THREADS_Y, 1 ) ]
+void main( uint3 group: SV_GroupID, uint3 thread : SV_GroupThreadID, uint threadFlattenned : SV_GroupIndex )
+{
+ uint i;
+ // Zero out the shared buffer
+ for( i = thread.y; i < TILE_Y; i += THREADS_Y )
+ resTemp[ i ][ thread.x ] = 0.0;
+ GroupMemoryBarrierWithGroupSync();
+
+ // Count of rows to compute in this thread group
+ const uint height = min( TILE_Y, arg0Size.y - group.x * TILE_Y );
+
+ uint s0 = hadd( group.yz * arg0Strides.zw ); //< arg0 layer for the thread group
+ s0 += group.x * TILE_Y * arg0Strides.y; //< arg0 first row for the thread group
+ s0 += hadd( arg0Strides.xy * thread.xy ); //< arg0 load index for the thread
+
+ uint s1 = hadd( group.yz * arg1Strides.zw ); //< arg1 layer for the thread group
+ s1 += thread.x * arg1Strides.x; //< arg1 load index for the thread
+
+ const uint completeTiles = arg0Size.x / THREADS_X;
+ // Each iteration of that loop loads THREADS_X elements from arg1,
+ // a block of [ THREADS_X, height ] elements from arg0,
+ // and accumulates these dot products in the shared buffer
+ for( uint t = 0; t < completeTiles; t++, s0 += THREADS_X * arg0Strides.x, s1 += THREADS_X * arg1Strides.x )
+ {
+ // Load THREADS_X elements from arg1
+ const float v1 = arg1[ s1 ];
+
+ uint rsi = s0;
+ for( i = thread.y; i < height; i += THREADS_Y, rsi += arg0Strides.y * THREADS_Y )
+ {
+ // Load THREADS_X elements from arg0
+ const float v0 = arg0[ rsi ];
+ // Multiply and accumulate in the shared buffer
+ float acc = resTemp[ i ][ thread.x ];
+ acc = mad( v0, v1, acc );
+ resTemp[ i ][ thread.x ] = acc;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ const uint rem = arg0Size.x % THREADS_X;
+ if( rem != 0 )
+ {
+ // E0 ain't a multiple of THREADS_X, we have a remainder
+ float v1;
+ if( thread.x < rem )
+ v1 = arg1[ s1 ];
+ else
+ v1 = 0.0;
+
+ for( i = thread.y; i < height; i += THREADS_Y, s0 += arg0Strides.y * THREADS_Y )
+ {
+ if( thread.x >= rem )
+ continue;
+ const float v0 = arg0[ s0 ];
+ float acc = resTemp[ i ][ thread.x ];
+ acc = mad( v0, v1, acc );
+ resTemp[ i ][ thread.x ] = acc;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ // Now we need horizontal sums of these shared accumulators, i.e. reduce [height][THREADS_X] shared array into [height][1] column
+ for( i = THREADS_X / 2; i > 0; i /= 2 )
+ {
+ if( thread.x < i )
+ {
+ for( uint j = thread.y; j < height; j += THREADS_Y )
+ {
+ float sum = resTemp[ j ][ thread.x ];
+ sum += resTemp[ j ][ thread.x + i ];
+ resTemp[ j ][ thread.x ] = sum;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ // And finally, store that column to global memory
+ if( threadFlattenned >= height )
+ return;
+
+ uint rdi = hadd( group.yz * resultStrides.zw ) + group.x * TILE_Y * resultStrides.x;
+ rdi += threadFlattenned * resultStrides.x;
+ result[ rdi ] = resTemp[ threadFlattenned ][ 0 ];
+} \ No newline at end of file
diff --git a/ComputeShaders/mulMatByRowTiled64.hlsl b/ComputeShaders/mulMatByRowTiled64.hlsl
new file mode 100644
index 0000000..6c63f2d
--- /dev/null
+++ b/ComputeShaders/mulMatByRowTiled64.hlsl
@@ -0,0 +1,4 @@
+#define THREADS_Y 32
+#define THREADS_X 32
+#define TILE_Y 128
+#include "mulMatByRowTiled.hlsl" \ No newline at end of file
diff --git a/ComputeShaders/mulMatByRowTiledEx.hlsl b/ComputeShaders/mulMatByRowTiledEx.hlsl
new file mode 100644
index 0000000..d377b8c
--- /dev/null
+++ b/ComputeShaders/mulMatByRowTiledEx.hlsl
@@ -0,0 +1,156 @@
+// matrix*row vector product, needs first argument reshaped into a sequence of horizontal column major panels
+#ifndef TILE_SIZE
+static const uint TILE_SIZE = 32;
+#endif
+#ifndef TILE_HEIGHT
+static const uint TILE_HEIGHT = 32;
+#endif
+#ifndef THREADS_Y
+static const uint THREADS_Y = 16;
+#endif
+
+// First tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ]
+Buffer<float> arg0: register( t0 );
+// Second tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ]
+Buffer<float> arg1: register( t1 );
+// FP32 output tensor, row major and continuous
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 arg0Size: packoffset( c0 );
+ uint arg0panel: packoffset( c1.y );
+ uint2 arg0LayerStrides: packoffset( c1.z );
+ // uint4 arg1Size: packoffset( c2 );
+ uint4 arg1Strides: packoffset( c3 );
+ uint4 resultSize: packoffset( c4 );
+ uint4 resultStrides: packoffset( c5 );
+}
+
+groupshared float tileOutput[ THREADS_Y ][ TILE_SIZE ];
+groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ];
+groupshared float tile1[ TILE_HEIGHT ];
+
+void multiplyTiles( const uint3 thread )
+{
+ float r = 0.0;
+ for( uint i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
+ {
+ float a = tile0[ i ][ thread.x ];
+ float b = tile1[ i ];
+ r = mad( a, b, r );
+ }
+ tileOutput[ thread.y ][ thread.x ] += r;
+}
+
+void reduceOutput( const uint3 thread )
+{
+ float curr = 0.0;
+ [branch]
+ if( thread.y < THREADS_Y / 2 )
+ curr = tileOutput[ thread.y ][ thread.x ];
+
+ for( uint i = THREADS_Y / 2; i > 0; i /= 2 )
+ {
+ [branch]
+ if( thread.y < i )
+ {
+ curr += tileOutput[ thread.y + i ][ thread.x ];
+ tileOutput[ thread.y ][ thread.x ] = curr;
+ }
+ GroupMemoryBarrierWithGroupSync();
+ }
+}
+
+void storeTile( const uint threadFlat, const uint4 pos, const uint size )
+{
+ if( threadFlat >= size )
+ return;
+ const uint4 prod4 = pos * resultStrides;
+ const uint2 prod2 = prod4.xy + prod4.zw;
+ uint rdi = prod2.x + prod2.y;
+ result[ rdi + threadFlat ] = tileOutput[ 0 ][ threadFlat ];
+}
+
+[ numthreads( TILE_SIZE, THREADS_Y, 1 ) ]
+void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID, uint threadFlat : SV_GroupIndex )
+{
+ uint i;
+ // Zero all 3 shared buffers
+ tileOutput[ thread.y ][ thread.x ] = 0.0;
+ for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
+ tile0[ i ][ thread.x ] = 0.0;
+ if( threadFlat < THREADS_Y )
+ tile1[ threadFlat ] = 0.0;
+
+ const uint2 layer = group.yz;
+ uint rsi0 = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y;
+ uint rsi1 = layer.x * arg1Strides.z + layer.y * arg1Strides.w;
+
+ const uint threadOffset = thread.y * TILE_SIZE + thread.x;
+ rsi0 += threadOffset;
+ rsi1 += threadFlat * arg1Strides.x;
+
+ const uint completeTiles = arg0Size.x / TILE_HEIGHT;
+ for( i = 0; i < completeTiles; i++ )
+ {
+ // Load [ TILE_SIZE, TILE_HEIGHT ] block from the first source tensor into the groupshared buffer
+ for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+ {
+ tile0[ j ][ thread.x ] = arg0[ rsi0 ];
+ rsi0 += THREADS_Y * TILE_SIZE;
+ }
+ // Load [ TILE_HEIGHT ] row from the second source into another groupshared buffer
+ [ branch ]
+ if( threadFlat < TILE_HEIGHT )
+ tile1[ threadFlat ] = arg1[ rsi1 ];
+ rsi1 += TILE_HEIGHT * arg1Strides.x;
+
+ GroupMemoryBarrierWithGroupSync();
+
+ multiplyTiles( thread );
+
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ const uint rem = arg0Size.x % TILE_HEIGHT;
+ if( rem != 0 )
+ {
+ for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+ {
+ float a;
+ [branch]
+ if( j < rem )
+ {
+ a = arg0[ rsi0 ];
+ rsi0 += THREADS_Y * TILE_SIZE;
+ }
+ else
+ a = 0.0;
+ tile0[ j ][ thread.x ] = a;
+ }
+
+ if( threadFlat < TILE_HEIGHT )
+ {
+ float b;
+ [branch]
+ if( threadFlat < rem )
+ b = arg1[ rsi1 ];
+ else
+ b = 0.0;
+ tile1[ threadFlat ] = b;
+ }
+
+ GroupMemoryBarrierWithGroupSync();
+
+ multiplyTiles( thread );
+
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ reduceOutput( thread );
+
+ const uint resultPos = group.x * TILE_SIZE;
+ const uint outputSize = min( TILE_SIZE, resultSize.x - resultPos );
+ storeTile( threadFlat, uint4( resultPos, 0, layer ), outputSize );
+} \ No newline at end of file
diff --git a/ComputeShaders/mulMatByScalar.hlsl b/ComputeShaders/mulMatByScalar.hlsl
new file mode 100644
index 0000000..82df5d4
--- /dev/null
+++ b/ComputeShaders/mulMatByScalar.hlsl
@@ -0,0 +1,41 @@
+// Matrix * scalar product, like [ 1, E1, E2, E3 ] * [ 1, 1, E2, E3 ] = [ E1, 1, E2, E3 ]
+// Dispatch [ E2, E3, 1 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 arg0Size: packoffset( c0 );
+ uint4 arg0Strides: packoffset( c1 );
+ uint4 arg1Size: packoffset( c2 );
+ uint4 arg1Strides: packoffset( c3 );
+ uint4 resultSize: packoffset( c4 );
+ uint4 resultStrides: packoffset( c5 );
+}
+
+inline uint hadd( uint2 vec )
+{
+ return vec.x + vec.y;
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const float scalarValue = arg1[ hadd( group.xy * arg1Strides.zw ) ];
+
+ uint s0 = hadd( group.xy * arg0Strides.zw );
+ const uint s0Inc = 32 * arg0Strides.y;
+ s0 += thread * arg0Strides.y;
+
+ uint rdi = hadd( group.xy * resultStrides.zw );
+ const uint rdiEnd = rdi + arg0Size.y;
+ rdi += thread;
+
+ for( ; rdi < rdiEnd; rdi += 32, s0 += s0Inc )
+ {
+ float f = arg0[ s0 ];
+ f *= scalarValue;
+ result[ rdi ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/mulMatDotMain.hlsl b/ComputeShaders/mulMatDotMain.hlsl
new file mode 100644
index 0000000..47c6d3e
--- /dev/null
+++ b/ComputeShaders/mulMatDotMain.hlsl
@@ -0,0 +1,95 @@
+// GGML_TASK_COMPUTE step for matrix*matrix product, where nb01 >= nb00;
+// Dispatch with [ ne11, ne01*ne02*ne03 ] thread groups
+// Each thread group computes a single dot product
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ uint4 src1_elements: packoffset( c2 );
+ uint4 result_elements: packoffset( c4 );
+ uint4 result_strides: packoffset( c5 );
+}
+
+inline uint product( uint3 vec )
+{
+ return vec.x * vec.y * vec.z;
+}
+
+inline uint product( uint4 vec )
+{
+ uint2 tmp = vec.xy * vec.zw;
+ return tmp.x * tmp.y;
+}
+
+inline float dotProductInner( uint i0, uint i1, uint length, uint thread )
+{
+ float res = 0;
+ for( uint i = thread; i < length; i += 32 )
+ res = mad( arg0[ i0 + i ], arg1[ i1 + i ], res );
+ return res;
+}
+
+#include "groupReduce.hlsli"
+
+[numthreads( 32, 1, 1 )]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint ne00 = src0_elements.x;
+ const uint ne01 = src0_elements.y;
+ const uint ne02 = src0_elements.z;
+ const uint ne03 = src0_elements.w;
+
+ const uint ne10 = src1_elements.x;
+ const uint ne11 = src1_elements.y;
+ const uint ne12 = src1_elements.z;
+ const uint ne13 = src1_elements.w;
+
+ const int nb00 = src0_strides.x;
+ const int nb01 = src0_strides.y;
+ const int nb02 = src0_strides.z;
+ const int nb03 = src0_strides.w;
+
+ // total rows in src0
+ // const int nr = ne01*ne02*ne03;
+ const uint nr = product( src0_elements.yzw );
+
+ const uint ir = group.y;
+
+ // src0 indices
+ const uint i03 = ir / ( ne02 * ne01 );
+ const uint i02 = ( ir - i03 * ne02 * ne01 ) / ne01;
+ const uint i01 = ( ir - i03 * ne02 * ne01 - i02 * ne01 );
+
+ const uint i13 = i03;
+ const uint i12 = i02;
+
+ const uint i0 = i01;
+ const uint i2 = i02;
+ const uint i3 = i03;
+
+ // src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
+ // src1_col = wdata + ( i13 * ne12 * ne11 + i12 * ne11 + 0 ) * ne00;
+ const uint src0_row = i01 * nb01 + i02 * nb02 + i03 * nb03;
+ const uint src1_col = ( i13 * ne12 * ne11 + i12 * ne11 ) * ne00;
+
+ const uint ic = group.x;
+ float curr = dotProductInner( src0_row, src1_col + ic * ne00, ne00, thread );
+ horizontalSumCompatNew( thread, curr );
+
+ if( 0 != thread )
+ return;
+
+ const uint nb0 = result_strides.x;
+ const uint nb1 = result_strides.y;
+ const uint nb2 = result_strides.z;
+ const uint nb3 = result_strides.w;
+
+ const uint ne0 = result_elements.x;
+ // float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
+ const uint dst_col = i0 * nb0 + i2 * nb2 + i3 * nb3;
+ result[ dst_col + ic * ne0 ] = curr;
+} \ No newline at end of file
diff --git a/ComputeShaders/mulMatDotReshape.hlsl b/ComputeShaders/mulMatDotReshape.hlsl
new file mode 100644
index 0000000..ffb6f83
--- /dev/null
+++ b/ComputeShaders/mulMatDotReshape.hlsl
@@ -0,0 +1,33 @@
+// GGML_TASK_INIT step for matrix*matrix product, where nb01 >= nb00;
+// Dispatch with [ ne11, ne12 ] groups
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+}
+
+#include "miscUtils.hlsli"
+
+// Each thread group of this shader copies a single rows of the matrix
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint i12 = group.y;
+ const uint i11 = group.x;
+ const uint ne10 = src0_elements.x;
+ const uint ne11 = src0_elements.y;
+ const uint nb12 = src0_strides.z;
+ const uint nb11 = src0_strides.y;
+
+ uint rdi = i11 * ne10 + i12 * ne10 * ne11;
+ const uint rdiEnd = rdi + ne10;
+ uint rsi = i12 * nb12 + i11 * nb11;
+ rdi += thread;
+ rsi += thread;
+
+ for( ; rdi < rdiEnd; rdi += 32, rsi += 32 )
+ result[ rdi ] = adjustFp16( arg0[ rsi ] );
+} \ No newline at end of file
diff --git a/ComputeShaders/mulMatMadMain.hlsl b/ComputeShaders/mulMatMadMain.hlsl
new file mode 100644
index 0000000..0bd5753
--- /dev/null
+++ b/ComputeShaders/mulMatMadMain.hlsl
@@ -0,0 +1,154 @@
+// GGML_TASK_COMPUTE step for matrix*matrix product, where nb01 < nb00
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> resultTensor: register( u0 );
+RWBuffer<float> tempBuffer: register( u1 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 aSize: packoffset( c0 );
+ uint4 aStride: packoffset( c1 );
+ uint4 bSize: packoffset( c2 );
+ uint4 bStride: packoffset( c3 );
+ uint4 resSize: packoffset( c4 );
+ bool resultFp16 : packoffset( c5.x );
+ uint ne: packoffset( c5.y );
+}
+
+#include "miscUtils.hlsli"
+
+// tempBuffer[ rdi .. ] = 0.0
+inline void writeTempZeros( uint rdi, const uint len, const uint thread )
+{
+ const uint rdiEnd = rdi + len;
+ for( rdi += thread; rdi < rdiEnd; rdi += 32 )
+ tempBuffer[ rdi ] = 0.0;
+}
+
+// tempBuffer[ rdi .. ] += mul * arg0[ rsi .. ]
+inline void vectorMad( uint rsi, uint rdi, const uint len, const float mul, const uint thread )
+{
+ const uint rsiEnd = rsi + len;
+ rsi += thread;
+ rdi += thread;
+ for( ; rsi < rsiEnd; rsi += 32, rdi += 32 )
+ {
+ float f = tempBuffer[ rdi ];
+ f = mad( mul, arg0[ rsi ], f );
+ [branch]
+ if( resultFp16 )
+ f = adjustFp16( f );
+ tempBuffer[ rdi ] = f;
+ }
+}
+
+// resultTensor[ rdi .. ] = tempBuffer[ rsi .. ]
+inline void copyRow( uint rsi, uint rdi, const uint len, const uint thread )
+{
+ const uint rsiEnd = rsi + len;
+ rsi += thread;
+ rdi += thread;
+ for( ; rsi < rsiEnd; rsi += 32, rdi += 32 )
+ {
+ float f = tempBuffer[ rsi ];
+ resultTensor[ rdi ] = f;
+ }
+}
+
+// resultTensor[ rdi .. ] += tempBuffer[ rsi .. ]
+inline void addRow( uint rsi, uint rdi, const uint len, const uint thread )
+{
+ const uint rsiEnd = rsi + len;
+ rsi += thread;
+ rdi += thread;
+ for( ; rsi < rsiEnd; rsi += 32, rdi += 32 )
+ {
+ float f = resultTensor[ rdi ];
+ f += tempBuffer[ rsi ];
+ resultTensor[ rdi ] = f;
+ }
+}
+
+[numthreads( 32, 1, 1 )]
+void main( const uint3 group: SV_GroupID, const uint thread : SV_GroupIndex )
+{
+ const uint i1 = group[ 0 ];
+ const uint i2 = group[ 1 ];
+ const uint i3 = group[ 2 ];
+
+ const uint ne00 = aSize[ 0 ];
+ const uint ne01 = aSize[ 1 ];
+ const uint ne02 = aSize[ 2 ];
+ const uint ne03 = aSize[ 3 ];
+
+ const uint ne10 = bSize[ 0 ];
+ const uint ne11 = bSize[ 1 ];
+ const uint ne12 = bSize[ 2 ];
+ const uint ne13 = bSize[ 3 ];
+
+ const uint ne0 = resSize[ 0 ];
+ const uint ne1 = resSize[ 1 ];
+ const uint ne2 = resSize[ 2 ];
+ const uint ne3 = resSize[ 3 ];
+
+ const uint nb00 = aStride[ 0 ];
+ const uint nb01 = aStride[ 1 ];
+ const uint nb02 = aStride[ 2 ];
+ const uint nb03 = aStride[ 3 ];
+
+ const uint nb10 = bStride[ 0 ];
+ const uint nb11 = bStride[ 1 ];
+ const uint nb12 = bStride[ 2 ];
+ const uint nb13 = bStride[ 3 ];
+
+ // dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;
+ const uint tempRowThread0 = i3 * ne2 * ne1 * ne0 + i2 * ne1 * ne0 + i1 * ne0;
+
+ // Faking 4 CPU threads trying to achieve bitwise compatibility with the CPU version
+ const uint nth = 4;
+
+ // GGML_TASK_COMPUTE
+ {
+ // src0_col = src0->data + ( i00 * nb00 + i02 * nb02 + i03 * nb03 );
+ const uint aBase = i2 * nb02 + i3 * nb03;
+ // src1_val = * (float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
+ const uint bBase = i1 * nb11 + i2 * nb12 + i3 * nb13;
+
+ // total columns in src1
+ const uint nc = ne10;
+ // columns per thread
+ const uint dc = ( nc + nth - 1 ) / nth;
+
+ uint tempRow = tempRowThread0;
+ for( uint ith = 0; ith < nth; ith++, tempRow += ne )
+ {
+ writeTempZeros( tempRow, ne01, thread );
+
+ // column range for this thread
+ const uint ic0 = dc * ith;
+ const uint ic1 = min( ic0 + dc, nc );
+
+ for( uint ic = ic0; ic < ic1; ic++ )
+ {
+ const uint idxA = aBase + ic * aStride[ 0 ];
+ const uint idxB = bBase + ic * bStride[ 0 ];
+ const float bValue = arg1[ idxB ];
+ vectorMad( idxA, tempRow, ne01, bValue, thread );
+ }
+ }
+ }
+
+ // GGML_TASK_FINALIZE
+ {
+ const uint rdi = tempRowThread0;
+ // const uint rdi = i1 * resSize[ 0 ] + i2 * resSize[ 0 ] * resSize[ 1 ] + i3 * resSize[ 0 ] * resSize[ 1 ] * resSize[ 2 ];
+ // const uint rdi = ( ( i3 * resSize[ 2 ] + i2 ) * resSize[ 1 ] + i1 ) * resSize[ 0 ];
+
+ uint tempRow = tempRowThread0;
+ copyRow( tempRow, rdi, ne01, thread );
+
+ tempRow += ne;
+ for( uint ith = 1; ith < nth; ith++, tempRow += ne )
+ addRow( tempRow, rdi, ne01, thread );
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/mulMatTiled.hlsl b/ComputeShaders/mulMatTiled.hlsl
new file mode 100644
index 0000000..7e4d7d8
--- /dev/null
+++ b/ComputeShaders/mulMatTiled.hlsl
@@ -0,0 +1,236 @@
+// This compute shader implements matrix*matrix product, using tiling and other tricks to improve the performance
+#ifndef TILE_SIZE
+static const uint TILE_SIZE = 32;
+#endif
+
+#ifndef THREADS_Y
+// Performance measures on Ryzen 7 5700G iGPU, the time is just for this shader:
+// 1 (32 threads per group) - 17.1 seconds, 2 - 9.02424 seconds, 4 - 6.95762 seconds, 6 - 6.79011 seconds, 8 - 6.67279 seconds, 10 - 6.9456 seconds, 16 - 7.20502 seconds
+// On nVidia, 8 is also the fastest option.
+static const uint THREADS_Y = 8;
+#endif
+
+#ifndef STREAM_SECOND_MATRIX
+#define STREAM_SECOND_MATRIX 0
+#endif
+
+#ifndef LOAD_ORDER
+
+// Load with coalesced loads from global memory whenever possible, store into groupshared buffer with random stores
+// #define LOAD_ORDER bool2( ( 1 == arg0Strides[ 0 ] ) || ( 1 != arg0Strides[ 1 ] ), ( 1 == arg1Strides[ 0 ] ) || ( 1 != arg1Strides[ 1 ] ) )
+
+// Load with random loads from global memory, store into groupshared buffer with coalesced stores
+// On my AMD iGPU inside Ryzen 7 5700G, there's whopping 15% performance win with that tactics, from 6.67 to 5.66 seconds for this shader.
+// My nVidia GPU does about the same
+#define LOAD_ORDER bool2( false, true )
+
+#endif
+
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 arg0Size: packoffset( c0 );
+ uint4 arg0Strides: packoffset( c1 );
+ uint4 arg1Strides: packoffset( c3 );
+ uint4 resultSize: packoffset( c4 );
+ uint4 resultStrides: packoffset( c5 );
+}
+
+groupshared float tile0[ TILE_SIZE ][ TILE_SIZE ];
+#if !STREAM_SECOND_MATRIX
+groupshared float tile1[ TILE_SIZE ][ TILE_SIZE ];
+#endif
+groupshared float resTemp[ TILE_SIZE ][ TILE_SIZE ];
+
+#if STREAM_SECOND_MATRIX
+void multiplyTiles( uint rsi, const uint3 thread, const uint w, const uint h )
+{
+ for( uint i = thread.y; i < h; i += THREADS_Y, rsi += THREADS_Y * arg1Strides.y )
+ {
+ float r = 0;
+ uint rsiRow = rsi;
+ for( uint j = 0; j < w; j++, rsiRow += arg1Strides.x )
+ {
+ // One TILE_SIZE * 4 bytes coalesced load, broadcasted into THREADS_Y copies
+ const float s0 = tile0[ j ][ thread.x ];
+ // THREADS_Y broadcasts from global memory, each one is 4 bytes broadcasted into TILE_SIZE copies
+ const float s1 = arg1[ rsiRow ];
+ // Multiply and accumulate
+ r = mad( s0, s1, r );
+ }
+ // Accumulate into the output tile
+ // THREADS_Y * 128 bytes coalesced loads and stores
+ resTemp[ i ][ thread.x ] += r;
+ }
+}
+#else
+// Compute resTemp += tile0 * tile1, for TILE_SIZE^2 square matrices
+// The group size is TILE_SIZE*THREADS_Y threads in this shader
+void multiplyTiles( const uint3 thread )
+{
+ for( uint i = thread.y; i < TILE_SIZE; i += THREADS_Y )
+ {
+ float r = 0;
+ for( uint j = 0; j < TILE_SIZE; j++ )
+ {
+ // One TILE_SIZE * 4 bytes coalesced load, broadcasted into THREADS_Y copies
+ const float s0 = tile0[ j ][ thread.x ];
+ // THREADS_Y broadcasts, each one is 4 bytes broadcasted into TILE_SIZE copies
+ const float s1 = tile1[ i ][ j ];
+ // Multiply and accumulate
+ r = mad( s0, s1, r );
+ }
+ // Accumulate into the output tile
+ // THREADS_Y * 128 bytes coalesced loads and stores
+ resTemp[ i ][ thread.x ] += r;
+ }
+}
+#endif
+
+// Note we transposed these tiles while loading
+void loadTile0( uint rsi, const uint3 thread, const uint w, const uint h, const bool rowMajor )
+{
+ uint i;
+ if( rowMajor )
+ {
+ rsi += arg0Strides.y * thread.y;
+ for( i = thread.y; i < h; i += THREADS_Y, rsi += arg0Strides.y * THREADS_Y )
+ {
+ if( thread.x < w )
+ tile0[ thread.x ][ i ] = arg0[ rsi + thread.x * arg0Strides.x ];
+ else
+ tile0[ thread.x ][ i ] = 0.0;
+ }
+ }
+ else
+ {
+ // Unlike width which is smaller for the last tile, the height is always the same, and all these tiles are zero-initialized
+ if( thread.x >= h )
+ return;
+
+ rsi += arg0Strides.x * thread.y;
+ for( i = thread.y; i < w; i += THREADS_Y, rsi += arg0Strides.x * THREADS_Y )
+ tile0[ i ][ thread.x ] = arg0[ rsi + thread.x * arg0Strides.y ];
+
+ if( i >= TILE_SIZE )
+ return;
+ for( ; i < TILE_SIZE; i += THREADS_Y )
+ tile0[ i ][ thread.x ] = 0.0;
+ }
+}
+
+#if !STREAM_SECOND_MATRIX
+void loadTile1( uint rsi, const uint3 thread, const uint w, const uint h, const bool rowMajor )
+{
+ uint i;
+ if( rowMajor )
+ {
+ rsi += thread.y * arg1Strides.y;
+
+ for( i = thread.y; i < h; i += THREADS_Y, rsi += arg1Strides.y * THREADS_Y )
+ {
+ if( thread.x < w )
+ tile1[ i ][ thread.x ] = arg1[ rsi + thread.x * arg1Strides.x ];
+ else
+ tile1[ i ][ thread.x ] = 0.0;
+ }
+ }
+ else
+ {
+ // Unlike width which is smaller for the last tile, the height is always the same, and all these tiles are zero-initialized
+ if( thread.x >= h )
+ return;
+
+ rsi += thread.y * arg1Strides.x;
+ for( i = thread.y; i < w; i += THREADS_Y, rsi += arg1Strides.x * THREADS_Y )
+ tile1[ thread.x ][ i ] = arg1[ rsi + thread.x * arg0Strides.y ];
+ if( i >= TILE_SIZE )
+ return;
+ for( ; i < TILE_SIZE; i += THREADS_Y )
+ tile1[ thread.x ][ i ] = 0.0;
+ }
+}
+#endif
+
+void storeTile( const uint3 thread, const uint4 pos, const uint2 size )
+{
+ if( thread.x >= size.x )
+ return;
+ const uint4 prod4 = pos * resultStrides;
+ const uint2 prod2 = prod4.xy + prod4.zw;
+ uint rdi = prod2.x + prod2.y;
+ rdi += resultStrides.y * thread.y;
+ for( uint i = thread.y; i < size.y; i += THREADS_Y, rdi += resultStrides.y * THREADS_Y )
+ result[ rdi + thread.x * resultStrides.x ] = resTemp[ i ][ thread.x ];
+}
+
+[ numthreads( TILE_SIZE, THREADS_Y, 1 ) ]
+void main( uint3 group: SV_GroupID, uint3 thread : SV_GroupThreadID )
+{
+ // Zero out these shared buffers
+ for( uint i = 0; i < TILE_SIZE; i += THREADS_Y )
+ {
+ tile0[ i + thread.y ][ thread.x ] = 0.0;
+#if !STREAM_SECOND_MATRIX
+ tile1[ i + thread.y ][ thread.x ] = 0.0;
+#endif
+ resTemp[ i + thread.y ][ thread.x ] = 0.0;
+ }
+
+ const uint2 resultPos = group.xy * TILE_SIZE;
+ const uint2 layer = uint2( group.z % resultSize.z, group.z / resultSize.z );
+ uint rsi0 = resultPos.x * arg0Strides.y + layer.x * arg0Strides.z + layer.y * arg0Strides.w;
+ uint rsi1 = resultPos.y * arg1Strides.y + layer.x * arg1Strides.z + layer.y * arg1Strides.w;
+
+ const uint rsi0Inc = TILE_SIZE * arg0Strides.x;
+ const uint rsi1Inc = TILE_SIZE * arg1Strides.x;
+
+ const uint completeTiles = arg0Size.x / TILE_SIZE;
+ const uint rsi0AndAligned = rsi0 + rsi0Inc * completeTiles;
+ // Output tile size
+ // Normally TILE_SIZE^2, less than that for the tiles at the right and bottom edges of the output matrix
+ const uint2 outputSize = min( TILE_SIZE, resultSize.xy - resultPos );
+
+ const bool2 loadOrder = LOAD_ORDER;
+
+#if STREAM_SECOND_MATRIX
+ rsi1 += thread.y * arg1Strides.y;
+#endif
+ for( ; rsi0 < rsi0AndAligned; rsi0 += rsi0Inc, rsi1 += rsi1Inc )
+ {
+ loadTile0( rsi0, thread, TILE_SIZE, outputSize.x, loadOrder.x );
+#if STREAM_SECOND_MATRIX
+ GroupMemoryBarrierWithGroupSync();
+ multiplyTiles( rsi1, thread, TILE_SIZE, outputSize.y );
+#else
+ loadTile1( rsi1, thread, TILE_SIZE, outputSize.y, loadOrder.y );
+ GroupMemoryBarrierWithGroupSync();
+ multiplyTiles( thread );
+#endif
+ // Need one moar barrier here.
+ // Otherwise, some threads of the group are loading the next tile into tile0/tile1 groupshared buffers on the next iteration of the loop,
+ // while other threads of the same group are still computing the matrix product, and getting incorrect values from that groupshared buffer.
+ // The missing barrier only caused a bug on AMD, and only with "ggml-large.bin" model; no idea why that is.
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ const uint rem = arg0Size.x % TILE_SIZE;
+ if( 0 != rem )
+ {
+ loadTile0( rsi0, thread, rem, outputSize.x, loadOrder.x );
+#if STREAM_SECOND_MATRIX
+ GroupMemoryBarrierWithGroupSync();
+ multiplyTiles( rsi1, thread, rem, outputSize.y );
+#else
+ loadTile1( rsi1, thread, rem, outputSize.y, loadOrder.y );
+ GroupMemoryBarrierWithGroupSync();
+ multiplyTiles( thread );
+#endif
+ }
+
+ GroupMemoryBarrierWithGroupSync();
+ storeTile( thread, uint4( resultPos, layer ), outputSize );
+} \ No newline at end of file
diff --git a/ComputeShaders/mulMatTiled64.hlsl b/ComputeShaders/mulMatTiled64.hlsl
new file mode 100644
index 0000000..45d77b1
--- /dev/null
+++ b/ComputeShaders/mulMatTiled64.hlsl
@@ -0,0 +1,3 @@
+#define TILE_SIZE 64
+#define STREAM_SECOND_MATRIX 1
+#include "mulMatTiled.hlsl" \ No newline at end of file
diff --git a/ComputeShaders/mulMatTiledEx.hlsl b/ComputeShaders/mulMatTiledEx.hlsl
new file mode 100644
index 0000000..0f23da2
--- /dev/null
+++ b/ComputeShaders/mulMatTiledEx.hlsl
@@ -0,0 +1,194 @@
+// This compute shader implements yet another version of matrix*matrix product
+// For optimal VRAM access pattern, it requires both arguments to be reshaped into a sequence of horizontal column major panels.
+// The panel height is TILE_SIZE, and the last panel of the matrix needs to be padded with zeros; see matReshapePanels.hlsl shader for the reshaping.
+// So far, it's only used when running on AMD GPUs.
+#ifndef TILE_SIZE
+static const uint TILE_SIZE = 32;
+#endif
+#ifndef TILE_HEIGHT
+static const uint TILE_HEIGHT = 32;
+#endif
+#ifndef THREADS_Y
+static const uint THREADS_Y = 16;
+#endif
+
+#ifndef STREAM_SECOND_MATRIX
+#define STREAM_SECOND_MATRIX 1
+#endif
+
+// First tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ]
+Buffer<float> arg0: register( t0 );
+// Second tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ]
+Buffer<float> arg1: register( t1 );
+// FP32 output tensor, row major and continuous
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 arg0Size: packoffset( c0 );
+ uint arg0panel: packoffset( c1.y );
+ uint2 arg0LayerStrides: packoffset( c1.z );
+
+ // uint4 arg1Size: packoffset( c2 );
+ uint arg1panel: packoffset( c3.y );
+ uint2 arg1LayerStrides: packoffset( c3.z );
+
+ uint4 resultSize: packoffset( c4 );
+ uint4 resultStrides: packoffset( c5 );
+}
+
+// Accumulator for the output tile
+// That last `+1` helps a bit, I'm not sure why exactly but probebly because memory bank conflicts.
+groupshared float tileOutput[ TILE_SIZE ][ TILE_SIZE + 1 ];
+// A smaller tile loaded from the first source matrix
+groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ];
+#if !STREAM_SECOND_MATRIX
+// A smaller tile loaded from the second source matrix
+groupshared float tile1[ TILE_HEIGHT ][ TILE_SIZE ];
+#endif
+
+#if STREAM_SECOND_MATRIX
+void multiplyTiles( const uint3 thread, uint rsi, const uint h )
+{
+ uint2 i = uint2( thread.y, rsi );
+ const uint2 iInc = uint2( THREADS_Y, THREADS_Y );
+ for( ; i.x < TILE_SIZE; i += iInc )
+ {
+ float r = 0.0;
+ uint2 j = uint2( 0, i.y );
+ const uint2 jInc = uint2( 1, TILE_SIZE );
+ for( ; j.x < h; j += jInc )
+ {
+ float a = tile0[ j.x ][ thread.x ];
+ float b = arg1[ j.y ];
+ r = mad( a, b, r );
+ }
+ tileOutput[ i.x ][ thread.x ] += r;
+ }
+}
+#else
+void multiplyTiles( const uint3 thread )
+{
+ for( uint row = thread.y; row < TILE_SIZE; row += THREADS_Y )
+ {
+ float r = 0.0;
+ for( uint j = 0; j < TILE_HEIGHT; j++ )
+ {
+ float a = tile0[ j ][ thread.x ];
+ float b = tile1[ j ][ row ];
+ r = mad( a, b, r );
+ }
+ tileOutput[ row ][ thread.x ] += r;
+ }
+}
+#endif
+
+void storeTile( const uint3 thread, const uint4 pos, const uint2 size )
+{
+ if( thread.x >= size.x )
+ return;
+ const uint4 prod4 = pos * resultStrides;
+ const uint2 prod2 = prod4.xy + prod4.zw;
+ uint rdi = prod2.x + prod2.y;
+ rdi += resultStrides.y * thread.y;
+ rdi += thread.x;
+ for( uint i = thread.y; i < size.y; i += THREADS_Y, rdi += resultStrides.y * THREADS_Y )
+ result[ rdi ] = tileOutput[ i ][ thread.x ];
+}
+
+[numthreads( TILE_SIZE, THREADS_Y, 1 )]
+void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID )
+{
+ uint i;
+ // Zero all 3 shared buffers
+ for( i = thread.y; i < TILE_SIZE; i += THREADS_Y )
+ tileOutput[ i ][ thread.x ] = 0.0;
+ for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
+ {
+ tile0[ i ][ thread.x ] = 0.0;
+#if !STREAM_SECOND_MATRIX
+ tile1[ i ][ thread.x ] = 0.0;
+#endif
+ }
+
+ const uint2 layer = uint2( group.z % resultSize.z, group.z / resultSize.z );
+
+ uint rsi0 = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y;
+ uint rsi1 = group.y * arg1panel + layer.x * arg1LayerStrides.x + layer.y * arg1LayerStrides.y;
+
+ const uint threadOffset = thread.y * TILE_SIZE + thread.x;
+ rsi0 += threadOffset;
+#if STREAM_SECOND_MATRIX
+ rsi1 += thread.y;
+#else
+ rsi1 += threadOffset;
+#endif
+
+ const uint completeTiles = arg0Size.x / TILE_HEIGHT;
+ for( i = 0; i < completeTiles; i++ )
+ {
+ // Load [ TILE_SIZE, TILE_HEIGHT ] block from both source tensors into these groupshared buffers
+ for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+ {
+ tile0[ j ][ thread.x ] = arg0[ rsi0 ];
+ rsi0 += THREADS_Y * TILE_SIZE;
+#if !STREAM_SECOND_MATRIX
+ tile1[ j ][ thread.x ] = arg1[ rsi1 ];
+ rsi1 += THREADS_Y * TILE_SIZE;
+#endif
+ }
+
+ // Wait for all threads in the group to complete these loads
+ GroupMemoryBarrierWithGroupSync();
+
+#if STREAM_SECOND_MATRIX
+ multiplyTiles( thread, rsi1, TILE_HEIGHT );
+ rsi1 += TILE_HEIGHT * TILE_SIZE;
+#else
+ // Multiply + accumulate the elements collected in the groupshared buffers
+ multiplyTiles( thread );
+#endif
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ const uint rem = arg0Size.x % TILE_HEIGHT;
+ if( rem != 0 )
+ {
+ // Load [ TILE_SIZE, rem ] block from both source tensors, and zero out the padding elements
+ for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+ {
+ [branch]
+ if( j < rem )
+ {
+ tile0[ j ][ thread.x ] = arg0[ rsi0 ];
+ rsi0 += THREADS_Y * TILE_SIZE;
+#if !STREAM_SECOND_MATRIX
+ tile1[ j ][ thread.x ] = arg1[ rsi1 ];
+ rsi1 += THREADS_Y * TILE_SIZE;
+#endif
+ }
+ else
+ {
+ tile0[ j ][ thread.x ] = 0.0;
+#if !STREAM_SECOND_MATRIX
+ tile1[ j ][ thread.x ] = 0.0;
+#endif
+ }
+ }
+
+ // Wait for all threads in the group to complete these loads
+ GroupMemoryBarrierWithGroupSync();
+
+ // Multiply + accumulate the elements collected in the groupshared buffers
+#if STREAM_SECOND_MATRIX
+ multiplyTiles( thread, rsi1, rem );
+#else
+ multiplyTiles( thread );
+#endif
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ const uint2 resultPos = group.xy * TILE_SIZE;
+ const uint2 outputSize = min( TILE_SIZE, resultSize.xy - resultPos );
+ storeTile( thread, uint4( resultPos, layer ), outputSize );
+} \ No newline at end of file
diff --git a/ComputeShaders/norm.hlsl b/ComputeShaders/norm.hlsl
new file mode 100644
index 0000000..eeb82b7
--- /dev/null
+++ b/ComputeShaders/norm.hlsl
@@ -0,0 +1,86 @@
+// Ported from ggml_compute_forward_norm_f32
+// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ uint4 result_strides: packoffset( c3 );
+}
+
+static const float eps = 1e-5f; // TODO: make this a parameter
+
+#include "groupReduce.hlsli"
+
+float computeVectorSum( uint i, const uint length, const uint thread )
+{
+ float res = 0.0;
+
+ const uint iEnd = i + length;
+ i += thread;
+ for( ; i < iEnd; i += 32 )
+ res += arg0[ i ];
+
+ horizontalSumBroadcast( thread, res );
+ return res;
+}
+
+float offsetAndComputeSumSquares( uint rsi, uint rdi, const float mean, const uint length, const uint thread )
+{
+ float sum2 = 0.0;
+
+ const uint rsiEnd = rsi + length;
+ rsi += thread;
+ rdi += thread;
+ for( ; rsi < rsiEnd; rsi += 32, rdi += 32 )
+ {
+ float v = arg0[ rsi ] - mean;
+ result[ rdi ] = v;
+ sum2 = mad( v, v, sum2 );
+ }
+
+ horizontalSumBroadcast( thread, sum2 );
+ return sum2;
+}
+
+void scaleVector( uint rdi, const float scale, const uint length, const uint thread )
+{
+ const uint rdiEnd = rdi + length;
+ for( rdi += thread; rdi < rdiEnd; rdi += 32 )
+ {
+ float f = result[ rdi ];
+ f *= scale;
+ result[ rdi ] = f;
+ }
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint i03 = group.z;
+ const uint i02 = group.y;
+ const uint i01 = group.x;
+
+ const uint nb01 = src0_strides[ 1 ];
+ const uint nb02 = src0_strides[ 2 ];
+ const uint nb03 = src0_strides[ 3 ];
+
+ const uint p = i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+ const uint ne00 = src0_elements[ 0 ];
+
+ float mean = computeVectorSum( p, ne00, thread );
+ mean /= (float)(int)ne00;
+
+ const uint nb1 = result_strides[ 1 ];
+ const uint nb2 = result_strides[ 2 ];
+ const uint nb3 = result_strides[ 3 ];
+ const uint y = i01 * nb1 + i02 * nb2 + i03 * nb3;
+
+ float sum2 = offsetAndComputeSumSquares( p, y, mean, ne00, thread );
+ const float scale = 1.0 / sqrt( sum2 / (float)(int)ne00 + eps );
+
+ scaleVector( y, scale, ne00, thread );
+} \ No newline at end of file
diff --git a/ComputeShaders/normCompat.hlsl b/ComputeShaders/normCompat.hlsl
new file mode 100644
index 0000000..23e1228
--- /dev/null
+++ b/ComputeShaders/normCompat.hlsl
@@ -0,0 +1,82 @@
+// Ported from ggml_compute_forward_norm_f32
+// Dispatch [ ( ne01 + 31 ) / 32, ne02, ne03 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ uint4 result_strides: packoffset( c3 );
+}
+
+static const double eps = 1e-5; // TODO: make this a parameter
+
+#include "groupReduce.hlsli"
+
+double computeVectorSum( uint i, const uint length )
+{
+ double res = 0.0;
+ const uint iEnd = i + length;
+ for( ; i < iEnd; i++ )
+ res += arg0[ i ];
+ return res;
+}
+
+double offsetAndComputeSumSquares( uint rsi, uint rdi, const double mean, const uint length )
+{
+ precise double sum2 = 0.0;
+ const uint rsiEnd = rsi + length;
+ for( ; rsi < rsiEnd; rsi++, rdi++ )
+ {
+ double v = arg0[ rsi ];
+ v -= mean;
+ result[ rdi ] = (float)v;
+ double prod = v * v;
+ sum2 += prod;
+ }
+ return sum2;
+}
+
+void scaleVector( uint rdi, const float scale, const uint length )
+{
+ const uint rdiEnd = rdi + length;
+ for( ; rdi < rdiEnd; rdi++ )
+ {
+ float f = result[ rdi ];
+ f *= scale;
+ result[ rdi ] = f;
+ }
+}
+
+#include "fp64Utils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 dtid: SV_DispatchThreadID )
+{
+ const uint i03 = dtid.z;
+ const uint i02 = dtid.y;
+ const uint i01 = dtid.x;
+ if( i01 >= src0_elements[ 1 ] )
+ return;
+
+ const uint nb01 = src0_strides[ 1 ];
+ const uint nb02 = src0_strides[ 2 ];
+ const uint nb03 = src0_strides[ 3 ];
+
+ const uint p = i01 * nb01 + i02 * nb02 + i03 * nb03;
+ const uint ne00 = src0_elements[ 0 ];
+
+ double mean = computeVectorSum( p, ne00 );
+ mean = div64( mean, (double)(int)ne00 );
+
+ const uint nb1 = result_strides[ 1 ];
+ const uint nb2 = result_strides[ 2 ];
+ const uint nb3 = result_strides[ 3 ];
+ const uint y = i01 * nb1 + i02 * nb2 + i03 * nb3;
+
+ const double sum2 = offsetAndComputeSumSquares( p, y, mean, ne00 );
+ const float scale = (float)div64( 1.0, sqrt64( sum2 / (float)(int)ne00 + eps ) );
+
+ scaleVector( y, scale, ne00 );
+} \ No newline at end of file
diff --git a/ComputeShaders/normFixed.hlsl b/ComputeShaders/normFixed.hlsl
new file mode 100644
index 0000000..8f2267f
--- /dev/null
+++ b/ComputeShaders/normFixed.hlsl
@@ -0,0 +1,124 @@
+// Ported from ggml_compute_forward_norm_f32
+// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ uint4 result_strides: packoffset( c3 );
+}
+
+static const float eps = 1e-5f; // TODO: make this a parameter
+
+// #include "groupReduce.hlsli"
+
+#ifndef THREADS
+static const uint THREADS = 32;
+#endif
+static const uint ROW_LENGTH = 1024;
+groupshared float rowBuffer[ ROW_LENGTH ];
+
+static const uint REDUCTION_BUFFER = 32;
+groupshared float sharedAccumulators[ REDUCTION_BUFFER ];
+
+// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group.
+void horizontalSum( const uint thread, inout float sum )
+{
+ if( THREADS > REDUCTION_BUFFER )
+ {
+ for( uint t = REDUCTION_BUFFER; t < THREADS; t += REDUCTION_BUFFER )
+ {
+ // Threads [ t .. t + REDUCTION_BUFFER ] store into the buffer
+ if( thread >= t && thread < t + REDUCTION_BUFFER )
+ sharedAccumulators[ thread - t ] = sum;
+
+ GroupMemoryBarrierWithGroupSync();
+
+ // Threads [ 0 .. REDUCTION_BUFFER ] increment their local sum with the value loaded from the buffer
+ if( thread < REDUCTION_BUFFER )
+ sum += sharedAccumulators[ thread ];
+ }
+ }
+
+ if( thread < REDUCTION_BUFFER )
+ sharedAccumulators[ thread ] = sum;
+
+ for( uint i = REDUCTION_BUFFER / 2; i > 1; i /= 2 )
+ {
+ GroupMemoryBarrierWithGroupSync();
+ if( thread < i )
+ {
+ sum += sharedAccumulators[ thread + i ];
+ sharedAccumulators[ thread ] = sum;
+ }
+ }
+
+ GroupMemoryBarrierWithGroupSync();
+ if( 0 == thread )
+ sum += sharedAccumulators[ 1 ];
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint i03 = group.z;
+ const uint i02 = group.y;
+ const uint i01 = group.x;
+ const uint ne00 = ROW_LENGTH;
+
+ // First pass: copy the data to local buffer, and compute sum
+ {
+ const uint nb01 = src0_strides[ 1 ];
+ const uint nb02 = src0_strides[ 2 ];
+ const uint nb03 = src0_strides[ 3 ];
+ const uint p = i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+ float sum = 0;
+ for( uint i = thread; i < ne00; i += THREADS )
+ {
+ float f = arg0[ p + i ];
+ rowBuffer[ i ] = f;
+ sum += f;
+ }
+ horizontalSum( thread, sum );
+ if( 0 == thread )
+ sharedAccumulators[ 0 ] = sum / (float)(int)ne00;
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ // Second pass: offset and compute sum of squares
+ {
+ const float mean = sharedAccumulators[ 0 ];
+ float sum2 = 0;
+ for( uint i = thread; i < ne00; i += THREADS )
+ {
+ float v = rowBuffer[ i ];
+ v -= mean;
+ rowBuffer[ i ] = v;
+ sum2 = mad( v, v, sum2 );
+ }
+ horizontalSum( thread, sum2 );
+ if( 0 == thread )
+ sharedAccumulators[ 0 ] = 1.0 / sqrt( sum2 / (float)(int)ne00 + eps );
+ GroupMemoryBarrierWithGroupSync();
+ }
+
+ // Final pass: apply the scale, and copy from group shared buffer to the destination
+ {
+ const float scale = sharedAccumulators[ 0 ];
+
+ const uint nb1 = result_strides[ 1 ];
+ const uint nb2 = result_strides[ 2 ];
+ const uint nb3 = result_strides[ 3 ];
+ const uint y = i01 * nb1 + i02 * nb2 + i03 * nb3;
+
+ for( uint i = thread; i < ne00; i += THREADS )
+ {
+ float v = rowBuffer[ i ];
+ v *= scale;
+ result[ y + i ] = v;
+ }
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/normFixed64.hlsl b/ComputeShaders/normFixed64.hlsl
new file mode 100644
index 0000000..14aab3d
--- /dev/null
+++ b/ComputeShaders/normFixed64.hlsl
@@ -0,0 +1,2 @@
+#define THREADS 64
+#include "normFixed.hlsl" \ No newline at end of file
diff --git a/ComputeShaders/repeatUtils.hlsli b/ComputeShaders/repeatUtils.hlsli
new file mode 100644
index 0000000..1181501
--- /dev/null
+++ b/ComputeShaders/repeatUtils.hlsli
@@ -0,0 +1,21 @@
+inline uint rowOffset( uint3 idx, uint4 strides )
+{
+ return idx[ 0 ] * strides[ 1 ] + idx[ 1 ] * strides[ 2 ] + idx[ 2 ] * strides[ 3 ];
+}
+
+// Initial iterator state for a row of the output tensor
+// x = current index, y = index increment, z = end of the index
+inline uint3 tensorIteratorState( uint3 group, uint thread, uint4 size, uint4 stride )
+{
+ uint3 res;
+ res.x = rowOffset( group, stride );
+ res.y = THREADS * stride[ 0 ];
+ res.z = res.x + size[ 0 ] * stride[ 0 ];
+ res.x += thread * stride[ 0 ];
+ return res;
+}
+
+// Handle a complete row of output tensor, using the iterator made by tensorIteratorState() function
+#define ROW_LOOP( ts ) for( ; ts.x < ts.z; ts.x += ts.y )
+// Same as above, using different row length
+#define ROW_LOOP_EX( ts, len, stride ) for( ; ts.x < ts.z; ts.x += len * stride[ 0 ] ) \ No newline at end of file
diff --git a/ComputeShaders/scaleInPlace.hlsl b/ComputeShaders/scaleInPlace.hlsl
new file mode 100644
index 0000000..d0320b4
--- /dev/null
+++ b/ComputeShaders/scaleInPlace.hlsl
@@ -0,0 +1,23 @@
+RWBuffer<float> buffer: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 src0_elements: packoffset( c0 );
+ uint4 src0_strides: packoffset( c1 );
+ float multiplier: packoffset( c2.x );
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint nc0 = src0_elements[ 0 ];
+ uint i = group.x * src0_strides[ 1 ];
+ const uint iEnd = i + nc0;
+ const float mul = multiplier;
+ for( i += thread; i < iEnd; i += 32 )
+ {
+ float f = buffer[ i ];
+ f *= mul;
+ buffer[ i ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/softMax.hlsl b/ComputeShaders/softMax.hlsl
new file mode 100644
index 0000000..6ebd0f2
--- /dev/null
+++ b/ComputeShaders/softMax.hlsl
@@ -0,0 +1,71 @@
+// Dispatch [ nr, 1, 1 ] thread groups of this shader
+RWBuffer<float> result: register( u0 );
+
+// table_exp_f16
+Buffer<uint> lookupTable: register( t0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 elements: packoffset( c0 );
+ uint4 strides: packoffset( c1 );
+ uint nr: packoffset( c2.x );
+ float inputScale: packoffset( c2.y );
+}
+
+#include "miscUtils.hlsli"
+#include "groupReduce.hlsli"
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint p = group.x * strides[ 1 ];
+ const uint nc = elements[ 0 ];
+ const uint pEnd = p + nc;
+ uint i;
+
+ float m = negativeInfinity;
+ for( i = p + thread; i < pEnd; i += 32 )
+ m = max( m, result[ i ] );
+ horizontalMaxBroadcast( thread, m );
+
+ float sum = 0;
+ for( i = p + thread; i < pEnd; i += 32 )
+ {
+ float f = result[ i ];
+
+ [branch]
+ if( f != negativeInfinity )
+ {
+ f = ( f - m ) * inputScale;
+#if 1
+ // Similar to Radeon Graphics, computing the exponent on nVidia 1080Ti is also slightly faster than loading from the lookup table
+ f = exp( f );
+#else
+ uint s = fp16Rounded( f );
+ s = lookupTable[ s ];
+ f = f16tof32( s );
+#endif
+ sum += f;
+ }
+ else
+ f = 0;
+
+ result[ i ] = f;
+ }
+
+ horizontalSum( thread, sum );
+ if( 0 == thread )
+ sharedAccumulators[ 0 ] = 1.0 / sum;
+ GroupMemoryBarrierWithGroupSync();
+ const float scale = sharedAccumulators[ 0 ];
+
+ // ggml_vec_scale_f32
+ for( i = p + thread; i < pEnd; i += 32 )
+ {
+ float f = result[ i ];
+ f *= scale;
+ result[ i ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/softMax64.hlsl b/ComputeShaders/softMax64.hlsl
new file mode 100644
index 0000000..7ecd2ef
--- /dev/null
+++ b/ComputeShaders/softMax64.hlsl
@@ -0,0 +1,71 @@
+// Dispatch [ nr, 1, 1 ] thread groups of this shader
+RWBuffer<float> result: register( u0 );
+
+// table_exp_f16
+Buffer<uint> lookupTable: register( t0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 elements: packoffset( c0 );
+ uint4 strides: packoffset( c1 );
+ uint nr: packoffset( c2.x );
+ float inputScale: packoffset( c2.y );
+}
+
+#include "miscUtils.hlsli"
+#include "groupReduce64.hlsli"
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+[ numthreads( 64, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint p = group.x * strides[ 1 ];
+ const uint nc = elements[ 0 ];
+ const uint pEnd = p + nc;
+ uint i;
+
+ float m = negativeInfinity;
+ for( i = p + thread; i < pEnd; i += 64 )
+ m = max( m, result[ i ] );
+ horizontalMaxBroadcast( thread, m );
+
+ float sum = 0;
+ for( i = p + thread; i < pEnd; i += 64 )
+ {
+ float f = result[ i ];
+
+ [branch]
+ if( f != negativeInfinity )
+ {
+ f = ( f - m ) * inputScale;
+#if 1
+ // At least on Radeon Graphics GPU inside Ryzen 7 5700G, computing exponent instead of loading from the buffer improves the performance
+ f = exp( f );
+#else
+ uint s = fp16Rounded( f );
+ s = lookupTable[ s ];
+ f = f16tof32( s );
+#endif
+ sum += f;
+ }
+ else
+ f = 0;
+
+ result[ i ] = f;
+ }
+
+ horizontalSum( thread, sum );
+ if( 0 == thread )
+ sharedAccumulators[ 0 ] = 1.0 / sum;
+ GroupMemoryBarrierWithGroupSync();
+ const float scale = sharedAccumulators[ 0 ];
+
+ // ggml_vec_scale_f32
+ for( i = p + thread; i < pEnd; i += 64 )
+ {
+ float f = result[ i ];
+ f *= scale;
+ result[ i ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/softMaxCompat.hlsl b/ComputeShaders/softMaxCompat.hlsl
new file mode 100644
index 0000000..2215ebd
--- /dev/null
+++ b/ComputeShaders/softMaxCompat.hlsl
@@ -0,0 +1,62 @@
+// ggml_compute_forward_soft_max_f32
+// Dispatch [ ( nr + 31 ) / 32, 1, 1 ] thread groups of this shader
+RWBuffer<float> result: register( u0 );
+
+// table_exp_f16
+Buffer<uint> lookupTable: register( t0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 elements: packoffset( c0 );
+ uint4 strides: packoffset( c1 );
+ uint nr: packoffset( c2.x );
+}
+
+#include "miscUtils.hlsli"
+#include "fp64Utils.hlsli"
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 dtid: SV_DispatchThreadID )
+{
+ if( dtid.x >= nr )
+ return;
+
+ const uint p = dtid.x * strides[ 1 ];
+ const uint nc = elements[ 0 ];
+ const uint pEnd = p + nc;
+ uint i;
+
+ float m = negativeInfinity;
+ for( i = p; i < pEnd; i++ )
+ m = max( m, result[ i ] );
+
+ double sum = 0;
+ for( i = p; i < pEnd; i++ )
+ {
+ float f = result[ i ];
+
+ [branch]
+ if( f != negativeInfinity )
+ {
+ uint s = fp16Rounded( f - m );
+ s = lookupTable[ s ];
+ f = f16tof32( s );
+ sum += f;
+ }
+ else
+ f = 0;
+
+ result[ i ] = f;
+ }
+
+ const float scale = (float)div64( 1.0, sum );
+ // ggml_vec_scale_f32
+ for( i = p; i < pEnd; i++ )
+ {
+ float f = result[ i ];
+ f *= scale;
+ result[ i ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/softMaxFixed.hlsl b/ComputeShaders/softMaxFixed.hlsl
new file mode 100644
index 0000000..7b4add2
--- /dev/null
+++ b/ComputeShaders/softMaxFixed.hlsl
@@ -0,0 +1,79 @@
+// Special softMax shader for matrices with rows of 1500 elements.
+// Uses group shared buffer of that length to save global memory bandwidth, more than 2x faster than the original.
+// Dispatch [ nr, 1, 1 ] thread groups of this shader
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint4 elements: packoffset( c0 );
+ uint4 strides: packoffset( c1 );
+ uint nr: packoffset( c2.x );
+ float inputScale: packoffset( c2.y );
+}
+
+#include "miscUtils.hlsli"
+#include "groupReduce64.hlsli"
+
+static const uint THREADS = 64;
+static const uint ROW_LENGTH = 1500;
+groupshared float rowBuffer[ ROW_LENGTH ];
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ const uint p = group.x * strides[ 1 ];
+ const uint nc = ROW_LENGTH;
+ uint i;
+
+ float m = negativeInfinity;
+ // First pass: compute maximum, and copy the row into the group shared buffer
+ for( i = thread; i < nc; i += THREADS )
+ {
+ float f = result[ p + i ];
+ m = max( m, f );
+ rowBuffer[ i ] = f;
+ }
+ horizontalMaxBroadcast( thread, m );
+
+ // Second pass: apply initial scale, compute the exponent, and compute total sum over the row
+ float sum = 0;
+ for( i = thread; i < nc; i += THREADS )
+ {
+ float f = rowBuffer[ i ];
+
+ [branch]
+ if( f != negativeInfinity )
+ {
+ f = ( f - m ) * inputScale;
+#if 1
+ // At least on Radeon Graphics GPU inside Ryzen 7 5700G, computing exponent instead of loading from the buffer improves the performance
+ f = exp( f );
+#else
+ uint s = fp16Rounded( f );
+ s = lookupTable[ s ];
+ f = f16tof32( s );
+#endif
+ sum += f;
+ }
+ else
+ f = 0;
+
+ rowBuffer[ i ] = f;
+ }
+
+ horizontalSum( thread, sum );
+ if( 0 == thread )
+ sharedAccumulators[ 0 ] = 1.0 / sum;
+ GroupMemoryBarrierWithGroupSync();
+ const float scale = sharedAccumulators[ 0 ];
+
+ // Final pass: apply the final scale, and copy the row from the group shared buffer back into the global memory
+ for( i = thread; i < nc; i += THREADS )
+ {
+ float f = rowBuffer[ i ];
+ f *= scale;
+ result[ p + i ] = f;
+ }
+} \ No newline at end of file
diff --git a/ComputeShaders/zeroMemory.hlsl b/ComputeShaders/zeroMemory.hlsl
new file mode 100644
index 0000000..c486636
--- /dev/null
+++ b/ComputeShaders/zeroMemory.hlsl
@@ -0,0 +1,27 @@
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+ uint elements: packoffset( c0.x );
+}
+
+// Thread group index is 16 bits per coordinate:
+// https://learn.microsoft.com/en-us/windows/win32/api/d3d11/nf-d3d11-id3d11devicecontext-dispatch
+// We want this shader to support buffers up to 2 GB.
+#ifndef THREADS
+static const uint THREADS = 512;
+#endif
+#ifndef ITERATIONS
+static const uint ITERATIONS = 128;
+#endif
+
+static const uint itemsPerGroup = THREADS * ITERATIONS;
+
+[numthreads( THREADS, 1, 1 )]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+ uint rdi = group.x * itemsPerGroup;
+ const uint rdiEnd = min( rdi + itemsPerGroup, elements );
+ for( rdi += thread; rdi < rdiEnd; rdi += THREADS )
+ result[ rdi ] = 0.0;
+} \ No newline at end of file