57 files changed, 4086 insertions, 0 deletions
diff --git a/ComputeShaders/ComputeShaders.cpp b/ComputeShaders/ComputeShaders.cpp
new file mode 100644
index 0000000..9c03f27
--- /dev/null
+++ b/ComputeShaders/ComputeShaders.cpp
@@ -0,0 +1,3 @@
+void fnComputeShaders()
+{
+}
+\ No newline at end of file
diff --git a/ComputeShaders/ComputeShaders.vcxproj b/ComputeShaders/ComputeShaders.vcxproj
new file mode 100644
index 0000000..350d266
--- /dev/null
+++ b/ComputeShaders/ComputeShaders.vcxproj
@@ -0,0 +1,221 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>16.0</VCProjectVersion>
+    <Keyword>Win32Proj</Keyword>
+    <ProjectGuid>{1c39d386-96d0-47a1-bbfa-68bbdb24439c}</ProjectGuid>
+    <RootNamespace>ComputeShaders</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <MultiProcFXC>true</MultiProcFXC>
+    <OutDir>$(Platform)\$(Configuration)\</OutDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <OutDir>$(Platform)\$(Configuration)\</OutDir>
+    <MultiProcFXC>true</MultiProcFXC>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>
+      </SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>
+      </SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>
+      </SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+    <FxCompile>
+      <ShaderModel>5.0</ShaderModel>
+      <ShaderType>Compute</ShaderType>
+    </FxCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>
+      </SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+    <FxCompile>
+      <ShaderModel>5.0</ShaderModel>
+      <ShaderType>Compute</ShaderType>
+      <DisableOptimizations>true</DisableOptimizations>
+      <EnableDebuggingInformation>true</EnableDebuggingInformation>
+    </FxCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="ComputeShaders.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <FxCompile Include="add.hlsl" />
+    <FxCompile Include="addInPlace.hlsl" />
+    <FxCompile Include="addRepeat.hlsl" />
+    <FxCompile Include="addRepeat64.hlsl" />
+    <FxCompile Include="addRepeatGelu.hlsl" />
+    <FxCompile Include="addRepeatGelu64.hlsl" />
+    <FxCompile Include="addRepeatScale.hlsl" />
+    <FxCompile Include="addRows.hlsl" />
+    <FxCompile Include="convolutionMain.hlsl" />
+    <FxCompile Include="convolutionMain2.hlsl" />
+    <FxCompile Include="convolutionMain2Fixed.hlsl" />
+    <FxCompile Include="convolutionPrep1.hlsl" />
+    <FxCompile Include="convolutionPrep2.hlsl" />
+    <FxCompile Include="copyConvert.hlsl" />
+    <FxCompile Include="copyTranspose.hlsl" />
+    <FxCompile Include="diagMaskInf.hlsl" />
+    <FxCompile Include="flashAttention.hlsl" />
+    <FxCompile Include="flashAttentionCompat1.hlsl" />
+    <FxCompile Include="flashAttentionCompat2.hlsl" />
+    <FxCompile Include="flashAttentionCompat3.hlsl" />
+    <FxCompile Include="fmaRepeat1.hlsl" />
+    <FxCompile Include="fmaRepeat164.hlsl" />
+    <FxCompile Include="fmaRepeat2.hlsl" />
+    <FxCompile Include="matReshapePanels.hlsl" />
+    <FxCompile Include="mulMatByRow.hlsl" />
+    <FxCompile Include="mulMatByRow64.hlsl" />
+    <FxCompile Include="mulMatByRowTiled.hlsl" />
+    <FxCompile Include="mulMatByRowTiled64.hlsl" />
+    <FxCompile Include="mulMatByRowTiledEx.hlsl" />
+    <FxCompile Include="mulMatByScalar.hlsl" />
+    <FxCompile Include="mulMatDotMain.hlsl" />
+    <FxCompile Include="mulMatDotReshape.hlsl" />
+    <FxCompile Include="mulMatMadMain.hlsl" />
+    <FxCompile Include="mulMatTiled.hlsl" />
+    <FxCompile Include="mulMatTiled64.hlsl" />
+    <FxCompile Include="mulMatTiledEx.hlsl" />
+    <FxCompile Include="norm.hlsl" />
+    <FxCompile Include="normCompat.hlsl" />
+    <FxCompile Include="normFixed.hlsl" />
+    <FxCompile Include="normFixed64.hlsl" />
+    <FxCompile Include="scaleInPlace.hlsl" />
+    <FxCompile Include="softMax.hlsl" />
+    <FxCompile Include="softMax64.hlsl" />
+    <FxCompile Include="softMaxCompat.hlsl" />
+    <FxCompile Include="softMaxFixed.hlsl" />
+    <FxCompile Include="zeroMemory.hlsl" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="componentwiseBinaryOp.hlsli" />
+    <None Include="flashAttentionCommon.hlsli" />
+    <None Include="fp64Utils.hlsli" />
+    <None Include="groupReduce.hlsli" />
+    <None Include="groupReduce64.hlsli" />
+    <None Include="miscUtils.hlsli" />
+    <None Include="repeatUtils.hlsli" />
+  </ItemGroup>
+  <ItemGroup>
+    <Text Include="Readme.txt" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
+\ No newline at end of file
diff --git a/ComputeShaders/ComputeShaders.vcxproj.filters b/ComputeShaders/ComputeShaders.vcxproj.filters
new file mode 100644
index 0000000..12f1559
--- /dev/null
+++ b/ComputeShaders/ComputeShaders.vcxproj.filters
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <ClCompile Include="ComputeShaders.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <FxCompile Include="mulMatDotMain.hlsl" />
+    <FxCompile Include="mulMatDotReshape.hlsl" />
+    <FxCompile Include="convolutionMain.hlsl" />
+    <FxCompile Include="convolutionPrep1.hlsl" />
+    <FxCompile Include="convolutionPrep2.hlsl" />
+    <FxCompile Include="add.hlsl" />
+    <FxCompile Include="flashAttention.hlsl" />
+    <FxCompile Include="convolutionMain2.hlsl" />
+    <FxCompile Include="norm.hlsl" />
+    <FxCompile Include="copyConvert.hlsl" />
+    <FxCompile Include="copyTranspose.hlsl" />
+    <FxCompile Include="normCompat.hlsl" />
+    <FxCompile Include="flashAttentionCompat1.hlsl" />
+    <FxCompile Include="flashAttentionCompat3.hlsl" />
+    <FxCompile Include="flashAttentionCompat2.hlsl" />
+    <FxCompile Include="scaleInPlace.hlsl" />
+    <FxCompile Include="diagMaskInf.hlsl" />
+    <FxCompile Include="softMaxCompat.hlsl" />
+    <FxCompile Include="mulMatMadMain.hlsl" />
+    <FxCompile Include="addRepeat.hlsl" />
+    <FxCompile Include="fmaRepeat1.hlsl" />
+    <FxCompile Include="fmaRepeat2.hlsl" />
+    <FxCompile Include="addInPlace.hlsl" />
+    <FxCompile Include="softMax.hlsl" />
+    <FxCompile Include="addRepeatScale.hlsl" />
+    <FxCompile Include="mulMatByRow.hlsl" />
+    <FxCompile Include="mulMatByScalar.hlsl" />
+    <FxCompile Include="mulMatTiled.hlsl" />
+    <FxCompile Include="mulMatByRow64.hlsl" />
+    <FxCompile Include="softMax64.hlsl" />
+    <FxCompile Include="softMaxFixed.hlsl" />
+    <FxCompile Include="addRepeat64.hlsl" />
+    <FxCompile Include="fmaRepeat164.hlsl" />
+    <FxCompile Include="addRepeatGelu.hlsl" />
+    <FxCompile Include="addRepeatGelu64.hlsl" />
+    <FxCompile Include="normFixed.hlsl" />
+    <FxCompile Include="normFixed64.hlsl" />
+    <FxCompile Include="mulMatByRowTiled.hlsl" />
+    <FxCompile Include="convolutionMain2Fixed.hlsl" />
+    <FxCompile Include="mulMatByRowTiled64.hlsl" />
+    <FxCompile Include="addRows.hlsl" />
+    <FxCompile Include="mulMatTiled64.hlsl" />
+    <FxCompile Include="zeroMemory.hlsl" />
+    <FxCompile Include="mulMatTiledEx.hlsl" />
+    <FxCompile Include="matReshapePanels.hlsl" />
+    <FxCompile Include="mulMatByRowTiledEx.hlsl" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="componentwiseBinaryOp.hlsli" />
+    <None Include="miscUtils.hlsli" />
+    <None Include="groupReduce.hlsli" />
+    <None Include="fp64Utils.hlsli" />
+    <None Include="flashAttentionCommon.hlsli" />
+    <None Include="repeatUtils.hlsli" />
+    <None Include="groupReduce64.hlsli" />
+  </ItemGroup>
+  <ItemGroup>
+    <Text Include="Readme.txt" />
+  </ItemGroup>
+</Project>
+\ No newline at end of file
diff --git a/ComputeShaders/Readme.txt b/ComputeShaders/Readme.txt
new file mode 100644
index 0000000..18d089e
--- /dev/null
+++ b/ComputeShaders/Readme.txt
@@ -0,0 +1,11 @@
+This project compiles all the compute shaders which implement the model.
+
+Many shaders come in 2 versions, something.hlsl and something64.hlsl
+
+The version with the `64` suffix is used on AMD GPUs, the version without suffix is used on nVidia and Intel GPUs.
+
+Not all of these shaders are actually used for anything.
+Some of them are implementing binary compatibility for the reference CPU version, and not used unless messing with the `constexpr` flags in MlContext C++ class.
+Such shaders often require FP64 support, which is an optional feature in D3D11.
+CompressShaders tool detects such shaders by looking at the SFI0 chunk in the binary, and outputs a bitmap of the FP64 shaders.
+This way, missing FP64 hardware support shouldn’t break the library.
+\ No newline at end of file
diff --git a/ComputeShaders/add.hlsl b/ComputeShaders/add.hlsl
new file mode 100644
index 0000000..819443e
--- /dev/null
+++ b/ComputeShaders/add.hlsl
@@ -0,0 +1,6 @@
+inline float compute( float a, float b )
+{
+	return a + b;
+}
+
+#include "componentwiseBinaryOp.hlsli"
+\ No newline at end of file
diff --git a/ComputeShaders/addInPlace.hlsl b/ComputeShaders/addInPlace.hlsl
new file mode 100644
index 0000000..a83c221
--- /dev/null
+++ b/ComputeShaders/addInPlace.hlsl
@@ -0,0 +1,39 @@
+#ifndef THREADS
+#define THREADS 512
+#endif
+
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 size: packoffset( c0 );
+	uint4 strides: packoffset( c1 );
+	uint4 argStrides: packoffset( c3 );
+}
+
+inline uint rowOffset( uint3 idx, uint4 strides )
+{
+	return idx[ 0 ] * strides[ 1 ] + idx[ 1 ] * strides[ 2 ] + idx[ 2 ] * strides[ 3 ];
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	uint rdi = rowOffset( group, strides );
+	uint rsi = rowOffset( group, argStrides );
+
+	const uint rdiEnd = rdi + size[ 0 ] * strides[ 0 ];
+	rdi += thread * strides[ 0 ];
+	rsi += thread * argStrides[ 0 ];
+
+	const uint rdiInc = THREADS * strides[ 0 ];
+	const uint rsiInc = THREADS * argStrides[ 0 ];
+
+	for( ; rdi < rdiEnd; rdi += rdiInc, rsi += rsiInc )
+	{
+		float f = result[ rdi ];
+		f += arg0[ rsi ];
+		result[ rdi ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/addRepeat.hlsl b/ComputeShaders/addRepeat.hlsl
new file mode 100644
index 0000000..e5cdaa3
--- /dev/null
+++ b/ComputeShaders/addRepeat.hlsl
@@ -0,0 +1,70 @@
+// Compute tensor = tensor + repeat( pattern, tensor ) in 1 shot, without VRAM allocations
+// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor
+RWBuffer<float> tensor: register( u0 );
+Buffer<float> pattern: register( t0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 tensorSize: packoffset( c0 );
+	uint4 tensorStrides: packoffset( c1 );
+	uint4 patternSize: packoffset( c2 );
+	uint4 patternStrides: packoffset( c3 );
+}
+
+#ifndef THREADS
+#define THREADS 256
+#endif
+
+#include "repeatUtils.hlsli"
+
+inline void computeSimple( uint idx, float add )
+{
+	float f = tensor[ idx ];
+	f += add;
+	tensor[ idx ] = f;
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides );
+	uint rsi = rowOffset( group % patternSize.yzw, patternStrides );
+
+	if( patternSize[ 0 ] == 1 )
+	{
+		// The pattern only has 1 column - broadcasting over the row
+		const float p = pattern[ rsi ];
+		ROW_LOOP( it )
+			computeSimple( it.x, p );
+	}
+	else if( patternSize[ 0 ] <= THREADS )
+	{
+		// pattern size doesn't exceed thread group size: load pattern value outside of the loop
+		const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] );
+		if( thread >= threadsPerGroup )
+			return;
+
+		const float p = pattern[ rsi + ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ] ];
+		ROW_LOOP_EX( it, threadsPerGroup, tensorStrides )
+			computeSimple( it.x, p );
+	}
+	else
+	{
+		// Pattern rows are larger than the thread group, need to stream from both buffers
+		const uint rsiInc = THREADS * patternStrides[ 0 ];
+		const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ];
+		const uint rsiEnd = rsi + rsiDec;
+		rsi += thread * patternStrides[ 0 ];
+
+		ROW_LOOP( it )
+		{
+			float f = tensor[ it.x ];
+			float p = pattern[ rsi ];
+			rsi += rsiInc;
+			if( rsi >= rsiEnd )
+				rsi -= rsiDec;
+			f += p;
+			tensor[ it.x ] = f;
+		}
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/addRepeat64.hlsl b/ComputeShaders/addRepeat64.hlsl
new file mode 100644
index 0000000..b6c8c19
--- /dev/null
+++ b/ComputeShaders/addRepeat64.hlsl
@@ -0,0 +1,2 @@
+#define THREADS 64
+#include "addRepeat.hlsl"
+\ No newline at end of file
diff --git a/ComputeShaders/addRepeatGelu.hlsl b/ComputeShaders/addRepeatGelu.hlsl
new file mode 100644
index 0000000..7f63653
--- /dev/null
+++ b/ComputeShaders/addRepeatGelu.hlsl
@@ -0,0 +1,88 @@
+// Compute tensor = GELU( tensor + repeat( pattern, tensor ) ) in 1 shot, without VRAM allocations
+// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor
+RWBuffer<float> tensor: register( u0 );
+Buffer<float> pattern: register( t0 );
+Buffer<uint> lookupTable: register( t1 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 tensorSize: packoffset( c0 );
+	uint4 tensorStrides: packoffset( c1 );
+	uint4 patternSize: packoffset( c2 );
+	uint4 patternStrides: packoffset( c3 );
+}
+
+#ifndef THREADS
+#define THREADS 1024
+#endif
+
+#include "repeatUtils.hlsli"
+#include "miscUtils.hlsli"
+
+inline float gelu( float x )
+{
+#if 1
+	const uint index = fp16Rounded( x );
+	const uint res16 = lookupTable[ index ];
+	return f16tof32( res16 );
+#else
+	// This version is much slower, at least on AMD, despite saving these VRAM loads.
+	const float GELU_COEF_A = 0.044715;
+	const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876;
+	return 0.5 * x * ( 1.0 + tanh( SQRT_2_OVER_PI * x * ( 1.0 + GELU_COEF_A * x * x ) ) );
+#endif
+}
+
+inline void computeSimple( uint idx, float add )
+{
+	float f = tensor[ idx ];
+	f += add;
+	f = gelu( f );
+	tensor[ idx ] = f;
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides );
+	uint rsi = rowOffset( group % patternSize.yzw, patternStrides );
+
+	if( patternSize[ 0 ] == 1 )
+	{
+		// The pattern only has 1 column - broadcasting over the row
+		const float p = pattern[ rsi ];
+		ROW_LOOP( it )
+			computeSimple( it.x, p );
+	}
+	else if( patternSize[ 0 ] <= THREADS )
+	{
+		// pattern size doesn't exceed thread group size: load pattern value outside of the loop
+		const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] );
+		if( thread >= threadsPerGroup )
+			return;
+
+		const float p = pattern[ rsi + ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ] ];
+		ROW_LOOP_EX( it, threadsPerGroup, tensorStrides )
+			computeSimple( it.x, p );
+	}
+	else
+	{
+		// Pattern rows are larger than the thread group, need to stream from both buffers
+		const uint rsiInc = THREADS * patternStrides[ 0 ];
+		const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ];
+		const uint rsiEnd = rsi + rsiDec;
+		rsi += thread * patternStrides[ 0 ];
+
+		ROW_LOOP( it )
+		{
+			float f = tensor[ it.x ];
+			float p = pattern[ rsi ];
+			rsi += rsiInc;
+			if( rsi >= rsiEnd )
+				rsi -= rsiDec;
+			f += p;
+			f = gelu( f );
+			tensor[ it.x ] = f;
+		}
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/addRepeatGelu64.hlsl b/ComputeShaders/addRepeatGelu64.hlsl
new file mode 100644
index 0000000..3d9b2e8
--- /dev/null
+++ b/ComputeShaders/addRepeatGelu64.hlsl
@@ -0,0 +1,2 @@
+#define THREADS 64
+#include "addRepeatGelu.hlsl"
+\ No newline at end of file
diff --git a/ComputeShaders/addRepeatScale.hlsl b/ComputeShaders/addRepeatScale.hlsl
new file mode 100644
index 0000000..8c24088
--- /dev/null
+++ b/ComputeShaders/addRepeatScale.hlsl
@@ -0,0 +1,73 @@
+// Compute tensor = ( tensor + repeat( pattern, tensor ) ) * scale in 1 shot, without VRAM allocations
+// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor
+RWBuffer<float> tensor: register( u0 );
+Buffer<float> pattern: register( t0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 tensorSize: packoffset( c0 );
+	uint4 tensorStrides: packoffset( c1 );
+	uint4 patternSize: packoffset( c2 );
+	uint4 patternStrides: packoffset( c3 );
+	float scalingMul : packoffset( c4.x );
+}
+
+#ifndef THREADS
+#define THREADS 512
+#endif
+
+#include "repeatUtils.hlsli"
+
+inline void computeSimple( uint idx, float add )
+{
+	float f = tensor[ idx ];
+	f += add;
+	f *= scalingMul;
+	tensor[ idx ] = f;
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides );
+	uint rsi = rowOffset( group % patternSize.yzw, patternStrides );
+
+	if( patternSize[ 0 ] == 1 )
+	{
+		// The pattern only has 1 column - broadcasting over the row
+		const float p = pattern[ rsi ];
+		ROW_LOOP( it )
+			computeSimple( it.x, p );
+	}
+	else if( patternSize[ 0 ] <= THREADS )
+	{
+		// pattern size doesn't exceed thread group size: load pattern value outside of the loop
+		const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] );
+		if( thread >= threadsPerGroup )
+			return;
+
+		const float p = pattern[ rsi + ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ] ];
+		ROW_LOOP_EX( it, threadsPerGroup, tensorStrides )
+			computeSimple( it.x, p );
+	}
+	else
+	{
+		// Pattern rows are larger than the thread group, need to stream from both buffers
+		const uint rsiInc = THREADS * patternStrides[ 0 ];
+		const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ];
+		const uint rsiEnd = rsi + rsiDec;
+		rsi += thread * patternStrides[ 0 ];
+
+		ROW_LOOP( it )
+		{
+			float f = tensor[ it.x ];
+			float p = pattern[ rsi ];
+			rsi += rsiInc;
+			if( rsi >= rsiEnd )
+				rsi -= rsiDec;
+			f += p;
+			f *= scalingMul;
+			tensor[ it.x ] = f;
+		}
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/addRows.hlsl b/ComputeShaders/addRows.hlsl
new file mode 100644
index 0000000..21e5e73
--- /dev/null
+++ b/ComputeShaders/addRows.hlsl
@@ -0,0 +1,46 @@
+#ifndef THREADS
+#define THREADS 256
+#endif
+
+// dec.tokenEmbedding tensor
+Buffer<float> tokenEmbedding: register( t0 );
+// dec.positionalEmbedding tensor
+Buffer<float> positionalEmbedding: register( t1 );
+// R32_UINT buffer with the input tokens
+Buffer<uint> embd: register( t2 );
+// Output tensor
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint rowLength: packoffset( c0.x );
+	uint pastTokensCount: packoffset( c0.y );
+	uint outputRowStride: packoffset( c0.z );
+	uint2 embStrides: packoffset( c1.x );
+	uint2 posStrides: packoffset( c1.z );
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint row = group.x;
+	const uint rowTok = embd[ row ];
+	const uint rowPos = row + pastTokensCount;
+
+	uint rdi = row * outputRowStride;
+	const uint rdiEnd = rdi + rowLength;
+	rdi += thread;
+
+	uint rsiTok = rowTok * embStrides.y;
+	rsiTok += thread * embStrides.x;
+
+	uint rsiPos = rowPos * posStrides.y;
+	rsiPos += thread * posStrides.x;
+
+	for( ; rdi < rdiEnd; rdi += THREADS, rsiTok += THREADS * embStrides.x, rsiPos += THREADS * posStrides.x )
+	{
+		float a = tokenEmbedding[ rsiTok ];
+		float b = positionalEmbedding[ rsiPos ];
+		result[ rdi ] = a + b;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/componentwiseBinaryOp.hlsli b/ComputeShaders/componentwiseBinaryOp.hlsli
new file mode 100644
index 0000000..0a523ca
--- /dev/null
+++ b/ComputeShaders/componentwiseBinaryOp.hlsli
@@ -0,0 +1,43 @@
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	uint4 src1_elements: packoffset( c2 );
+	uint4 src1_strides: packoffset( c3 );
+	uint4 result_elements: packoffset( c4 );
+	uint4 result_strides: packoffset( c5 );
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint j = group.x;
+	const uint nb1 = result_strides[ 1 ];
+	const uint nb01 = src0_strides[ 1 ];
+
+	const uint nb10 = src1_strides[ 0 ];
+	const uint nb11 = src1_strides[ 1 ];
+	const uint nc = src0_elements[ 0 ];
+
+	uint rsi0 = j * nb01;
+	uint rsi1 = j * nb11;
+	uint rdi = j * nb1;
+	const uint rsi0End = rsi0 + nc;
+
+	rsi0 += thread;
+	rsi1 += thread * nb10;
+	rdi += thread;
+
+	const uint rsi1Inc = 32 * nb10;
+	for( ; rsi0 < rsi0End; rsi0 += 32, rsi1 += rsi1Inc, rdi += 32 )
+	{
+		const float a = arg0[ rsi0 ];
+		const float b = arg1[ rsi1 ];
+		const float res = compute( a, b );
+		result[ rdi ] = res;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/convolutionMain.hlsl b/ComputeShaders/convolutionMain.hlsl
new file mode 100644
index 0000000..eee3ebb
--- /dev/null
+++ b/ComputeShaders/convolutionMain.hlsl
@@ -0,0 +1,76 @@
+// ggml_compute_forward_conv_1d_1s_f16_f32, GGML_TASK_COMPUTE implementation
+// Dispatch [ ne10, ne02, 1 ] thread groups
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	uint4 src1_elements: packoffset( c2 );
+	uint4 result_elements: packoffset( c4 );
+	uint4 result_strides: packoffset( c5 );
+}
+
+#include "groupReduce.hlsli"
+
+inline void computeDotProduct( uint s0, uint s1, uint len, uint thread, inout float acc )
+{
+	float curr = 0;
+	const uint completeVectors = len / 32;
+	uint i;
+	for( i = 0; i < completeVectors; i++, s0 += 32, s1 += 32 )
+		curr = mad( arg0[ s0 + thread ], arg1[ s1 + thread ], curr );
+
+	horizontalSumCompatNew( thread, curr );
+
+	if( 0 == thread )
+	{
+		const uint rem = len % 32;
+		if( 0 != rem )
+		{
+			double f64 = curr;
+			for( i = 0; i < rem; i++ )
+			{
+				precise float a = arg0[ s0 + i ];
+				precise float b = arg1[ s1 + i ];
+				precise float prod = a * b;
+				f64 += prod;
+			}
+			curr = (float)f64;
+		}
+		acc += curr;
+	}
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint i1 = group.y;
+	const uint i0 = group.x;
+
+	const uint ne00 = src0_elements[ 0 ];
+	const uint nk = ne00;
+	const int nh = (int)( nk / 2 );
+
+	const uint ne01 = src0_elements[ 1 ];
+	const int ew0 = roundUp32( ne01 );
+
+	float res = 0;
+	for( int k = -nh; k <= nh; k++ )
+	{
+		const uint source0 = i1 * ew0 * ne00 + uint( nh + k ) * ew0;
+		const uint source1 = uint( i0 + nh + k ) * ew0;
+		computeDotProduct( source0, source1, ew0, thread, res );
+	}
+
+	if( 0 != thread )
+		return;
+
+	const uint nb1 = result_strides[ 1 ];
+	const uint rdi = i1 * nb1 + i0;
+	result[ rdi ] = res;
+}
+\ No newline at end of file
diff --git a/ComputeShaders/convolutionMain2.hlsl b/ComputeShaders/convolutionMain2.hlsl
new file mode 100644
index 0000000..73c9da1
--- /dev/null
+++ b/ComputeShaders/convolutionMain2.hlsl
@@ -0,0 +1,60 @@
+// ggml_compute_forward_conv_1d_2s_f16_f32, GGML_TASK_COMPUTE implementation
+// Dispatch [ ne10 / 2, ne02, 1 ] thread groups
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	uint4 src1_elements: packoffset( c2 );
+	uint4 result_elements: packoffset( c4 );
+	uint4 result_strides: packoffset( c5 );
+}
+
+#include "groupReduce.hlsli"
+
+inline void computeDotProduct( uint s0, uint s1, uint len, uint thread, inout float acc )
+{
+	float curr = 0;
+	const uint s0End = s0 + len;
+	s0 += thread;
+	s1 += thread;
+	for( ; s0 < s0End; s0 += 32, s1 += 32 )
+		curr = mad( arg0[ s0 ], arg1[ s1 ], curr );
+
+	horizontalSumCompatNew( thread, curr );
+	if( 0 == thread )
+		acc += curr;
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint ne00 = src0_elements[ 0 ];
+	const uint ne01 = src0_elements[ 1 ];
+	const int ew0 = roundUp32( ne01 );
+
+	float res = 0;
+	uint s0 = group.y * ew0 * ne00;
+	uint s1 = group.x * 2 * ew0;
+	// The original implementation did following:
+	// int nh = (int)( nk / 2 );
+	// for( int k = -nh; k <= nh; k++ )
+	// What we doing instead:
+	// for( uint len = ( nk / 2 ) * 2 + 1, i = 0; i < len; i++ )
+	// len = ( nk / 2 ) * 2 + 1 is equal to ( nk | 1 )
+	const uint s0End = s0 + ( ne00 | 1u ) * ew0;
+	for( ; s0 < s0End; s0 += ew0, s1 += ew0 )
+		computeDotProduct( s0, s1, ew0, thread, res );
+
+	if( 0 != thread )
+		return;
+
+	const uint nb1 = result_strides[ 1 ];
+	const uint rdi = group.y * nb1 + group.x;
+	result[ rdi ] = res;
+}
+\ No newline at end of file
diff --git a/ComputeShaders/convolutionMain2Fixed.hlsl b/ComputeShaders/convolutionMain2Fixed.hlsl
new file mode 100644
index 0000000..d21dcbb
--- /dev/null
+++ b/ComputeShaders/convolutionMain2Fixed.hlsl
@@ -0,0 +1,119 @@
+// Optimized version of convolutionMain2.hlsl for kernel size = 3
+// Dispatch [ ( ( ne10 / 2 ) + TILE_Y - 1 ) / TILE_Y, ne02, 1 ] thread groups of this shader
+#ifndef TILE_Y
+static const uint TILE_Y = 8;
+#endif
+#ifndef THREADS
+static const uint THREADS = 64;
+#endif
+
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	uint4 src1_elements: packoffset( c2 );
+	uint4 result_elements: packoffset( c4 );
+	uint4 result_strides: packoffset( c5 );
+}
+
+// The accumulators we're after
+groupshared float resTemp[ TILE_Y ][ THREADS ];
+
+// Multiply + accumulate the specified row
+inline void accumulate( float a0, float a1, const uint resultRow, const uint thread )
+{
+	float acc = resTemp[ resultRow ][ thread ];
+	acc = mad( a0, a1, acc );
+	resTemp[ resultRow ][ thread ] = acc;
+}
+
+inline void convolutionTile( const uint s0, uint s1, const uint thread, const uint stride, const uint height )
+{
+	// Load 3 rows from arg0
+	const float3 a0 = float3( arg0[ s0 ], arg0[ s0 + stride ], arg0[ s0 + stride * 2 ] );
+
+	// Row 0
+	float a1 = arg1[ s1 ];
+	accumulate( a0[ 0 ], a1, 0, thread );
+	s1 += stride;
+
+	for( uint i = 1; i < height; i++ )
+	{
+		// Row i*2-1
+		// Even-indexed rows only contribute to a single output rows, after muiltiplied by kernel row #1
+		a1 = arg1[ s1 ];
+		accumulate( a0[ 1 ], a1, i - 1, thread );
+		s1 += stride;
+
+		// Row i*2, contributes to 2 output rows corresponding to kernel rows #0 and #2
+		a1 = arg1[ s1 ];
+		accumulate( a0[ 2 ], a1, i - 1, thread );
+		accumulate( a0[ 0 ], a1, i, thread );
+		s1 += stride;
+	}
+
+	// Row height*2 - 1
+	a1 = arg1[ s1 ];
+	accumulate( a0[ 1 ], a1, height - 1, thread );
+	s1 += stride;
+
+	// Row height*2
+	a1 = arg1[ s1 ];
+	accumulate( a0[ 2 ], a1, height - 1, thread );
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	uint i;
+	// Zero out the accumulators
+	for( i = 0; i < TILE_Y; i++ )
+		resTemp[ i ][ thread ] = 0.0;
+	GroupMemoryBarrierWithGroupSync();
+
+	const uint i1 = group.y;
+	const uint i0 = group.x * TILE_Y * 2;
+	const uint height = min( TILE_Y, ( src1_elements.x / 2 ) - group.x * TILE_Y );
+
+	const uint ne00 = src0_elements[ 0 ];
+	const uint ne01 = src0_elements[ 1 ];
+	const int ew0 = roundUp32( ne01 );
+
+	uint s0 = i1 * ew0 * ne00;
+	const uint s0End = s0 + ew0;
+	uint s1 = i0 * ew0;
+	s0 += thread;
+	s1 += thread;
+	for( ; s0 < s0End; s0 += THREADS, s1 += THREADS )
+		convolutionTile( s0, s1, thread, ew0, height );
+
+	GroupMemoryBarrierWithGroupSync();
+
+	// Now we need horizontal sums of these shared accumulators, i.e. reduce [height][THREADS] shared array into [height][1] column
+	for( i = THREADS / 2; i > 0; i /= 2 )
+	{
+		if( thread < i )
+		{
+			for( uint j = 0; j < height; j++ )
+			{
+				float sum = resTemp[ j ][ thread ];
+				sum += resTemp[ j ][ thread + i ];
+				resTemp[ j ][ thread ] = sum;
+			}
+		}
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	// And finally, store that column to global memory
+	if( thread >= height )
+		return;
+	const uint nb1 = result_strides[ 1 ];
+	const uint rdi = i1 * nb1 + group.x * TILE_Y + thread;
+	result[ rdi ] = resTemp[ thread ][ 0 ];
+}
+\ No newline at end of file
diff --git a/ComputeShaders/convolutionPrep1.hlsl b/ComputeShaders/convolutionPrep1.hlsl
new file mode 100644
index 0000000..528ff16
--- /dev/null
+++ b/ComputeShaders/convolutionPrep1.hlsl
@@ -0,0 +1,39 @@
+// ggml_compute_forward_conv_1d_1s_f16_f32, prepare kernel data (src0)
+// Dispatch [ ne01, ne02, 1 ] thread groups
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+}
+
+inline uint roundUp32( uint x )
+{
+	return ( x + 31 ) & ( ~31u );
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint nb01 = src0_strides[ 1 ];
+	const uint nb02 = src0_strides[ 2 ];
+
+	const uint ne00 = src0_elements[ 0 ];
+	const uint ne01 = src0_elements[ 1 ];
+	const uint ew0 = roundUp32( ne01 );
+
+	const uint i02 = group.y;
+	const uint i01 = group.x;
+
+	uint rsi = i02 * nb02 + i01 * nb01;
+	const uint rsiEnd = rsi + ne00;
+	uint rdi = i02 * ew0 * ne00 + i01;
+	rsi += thread;
+	rdi += thread * ew0;
+	const uint rdiInc = 32 * ew0;
+
+	for( ; rsi < rsiEnd; rsi += 32, rdi += rdiInc )
+		result[ rdi ] = arg0[ rsi ];
+}
+\ No newline at end of file
diff --git a/ComputeShaders/convolutionPrep2.hlsl b/ComputeShaders/convolutionPrep2.hlsl
new file mode 100644
index 0000000..a7e7172
--- /dev/null
+++ b/ComputeShaders/convolutionPrep2.hlsl
@@ -0,0 +1,43 @@
+// ggml_compute_forward_conv_1d_1s_f16_f32, prepare source data (src1)
+// Dispatch [ ne11, 1, 1 ] thread groups
+Buffer<float> arg1: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src1_elements: packoffset( c2 );
+	uint4 src1_strides: packoffset( c3 );
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint i11 = group.x;
+
+	const uint ne00 = src0_elements[ 0 ];
+	const uint ne01 = src0_elements[ 1 ];
+	const uint ne10 = src1_elements[ 0 ];
+	const uint nb11 = src1_strides[ 1 ];
+
+	const uint nk = ne00;
+	const uint nh = nk / 2;
+	const int ew0 = roundUp32( ne01 );
+
+	uint rsi = i11 * nb11;
+	uint rdi = nh * ew0 + i11;
+	const uint rdiInc = ew0 * 32;
+	const uint rsiEnd = rsi + ne10;
+
+	rsi += thread;
+	rdi += thread * ew0;
+
+	for( ; rsi < rsiEnd; rsi += 32, rdi += rdiInc )
+	{
+		float f = arg1[ rsi ];
+		f = adjustFp16( f );
+		result[ rdi ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/copyConvert.hlsl b/ComputeShaders/copyConvert.hlsl
new file mode 100644
index 0000000..399147f
--- /dev/null
+++ b/ComputeShaders/copyConvert.hlsl
@@ -0,0 +1,50 @@
+// ggml_compute_forward_dup_f32 when we only need to convert types, but not reshape the tensor
+// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	bool downcastFp32 : packoffset( c2.x );
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint nb00 = src0_strides[ 0 ];
+	const uint nb01 = src0_strides[ 1 ];
+	const uint nb02 = src0_strides[ 2 ];
+	const uint nb03 = src0_strides[ 3 ];
+
+	const uint ne00 = src0_elements[ 0 ];
+	const uint ne01 = src0_elements[ 1 ];
+	const uint ne02 = src0_elements[ 2 ];
+	const uint ne03 = src0_elements[ 3 ];
+
+	const uint i01 = group.x;
+	const uint i02 = group.y;
+	const uint i03 = group.z;
+
+	const uint rs = ne00 * nb00;
+	//const uint id = i01 + i02 * ne02 + i03 * ne01 * ne02;
+	const uint id = ( i03 * ne01 + i02 ) * ne02 + i01;
+
+	uint rsi = i01 * nb01 + i02 * nb02 + i03 * nb03;
+	uint rdi = id * rs;
+
+	const uint rsiEnd = rsi + rs;
+	rsi += thread;
+	rdi += thread;
+	for( ; rsi < rsiEnd; rsi += 32, rdi += 32 )
+	{
+		float f = arg0[ rsi ];
+		[branch]
+		if( downcastFp32 )
+			f = adjustFp16( f );
+		result[ rdi ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/copyTranspose.hlsl b/ComputeShaders/copyTranspose.hlsl
new file mode 100644
index 0000000..fc3e82f
--- /dev/null
+++ b/ComputeShaders/copyTranspose.hlsl
@@ -0,0 +1,56 @@
+// ggml_compute_forward_dup_f32 when we actually need to reshape the tensor
+// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	bool downcastFp32 : packoffset( c2.x );
+}
+
+#include "miscUtils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint nb00 = src0_strides[ 0 ];
+	const uint nb01 = src0_strides[ 1 ];
+	const uint nb02 = src0_strides[ 2 ];
+	const uint nb03 = src0_strides[ 3 ];
+
+	const uint ne00 = src0_elements[ 0 ];
+	const uint ne01 = src0_elements[ 1 ];
+	const uint ne02 = src0_elements[ 2 ];
+	const uint ne03 = src0_elements[ 3 ];
+
+	const uint i01 = group.x;
+	const uint i02 = group.y;
+	const uint i03 = group.z;
+
+	// We need following integer: i01*ne00 + i02*ne00*ne01 + i03*ne00*ne01*ne02
+	// We want to minimize count of integer multiplications
+	// Also, DXBC assembly features `imad` instruction which computes a*b+c for integers, the actual hardware hopefully has an equivalent
+	// i03*ne00*ne01*ne02 + i02*ne00*ne01 + i01*ne00
+	// ( i03*ne01*ne02 + i02*ne01 + i01 ) * ne00
+	// ( ( i03*ne02 + i02) * ne01 + i01 ) * ne00
+	uint rdi = ( ( i03 * ne02 + i02 ) * ne01 + i01 ) * ne00;
+
+	const uint rdiEnd = rdi + ne00;
+
+	uint rsi = i01 * nb01 + i02 * nb02 + i03 * nb03;
+	const uint rsiInc = 32 * nb00;
+
+	rdi += thread;
+	rsi += thread * nb00;
+
+	for( ; rdi < rdiEnd; rdi += 32, rsi += rsiInc )
+	{
+		float f = arg0[ rsi ];
+		[branch]
+		if( downcastFp32 )
+			f = adjustFp16( f );
+		result[ rdi ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/diagMaskInf.hlsl b/ComputeShaders/diagMaskInf.hlsl
new file mode 100644
index 0000000..18e3938
--- /dev/null
+++ b/ComputeShaders/diagMaskInf.hlsl
@@ -0,0 +1,30 @@
+// ggml_compute_forward_diag_mask_inf_f32
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 elements: packoffset( c0 );
+	uint4 strides: packoffset( c1 );
+	uint n_past : packoffset( c2.x );
+}
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+[numthreads( 32, 1, 1 )]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint k = group.y;
+	const uint j = group.x;
+
+	// Start of the row
+	uint rdi = k * strides[ 2 ] + j * strides[ 1 ];
+	// End of the row
+	const uint rdiEnd = rdi + elements[ 0 ] * strides[ 0 ];
+	// First index to write in this thread
+	rdi += ( n_past + j + thread + 1 ) * strides[ 0 ];
+	// Index increment
+	const uint rdiInc = 32 * strides[ 0 ];
+
+	for( ; rdi < rdiEnd; rdi += rdiInc )
+		result[ rdi ] = negativeInfinity;
+}
+\ No newline at end of file
diff --git a/ComputeShaders/flashAttention.hlsl b/ComputeShaders/flashAttention.hlsl
new file mode 100644
index 0000000..65212d0
--- /dev/null
+++ b/ComputeShaders/flashAttention.hlsl
@@ -0,0 +1,170 @@
+// Ported from ggml_compute_forward_flash_attn_f16
+// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups
+
+#include "flashAttentionCommon.hlsli"
+Buffer<uint> lookupTable: register( t3 );
+#include "groupReduce.hlsli"
+
+inline void computeDotProduct( Buffer<float> buff0, Buffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc )
+{
+	acc = 0;
+	const uint s0End = s0 + len;
+	s0 += thread;
+	s1 += thread;
+	for( ; s0 < s0End; s0 += 32, s1 += 32 )
+		acc = mad( buff0[ s0 ], buff1[ s1 ], acc );
+
+	horizontalSum( thread, acc );
+}
+
+inline void computeDotProduct( Buffer<float> buff0, RWBuffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc )
+{
+	acc = 0;
+	const uint s0End = s0 + len;
+	s0 += thread;
+	s1 += thread;
+	for( ; s0 < s0End; s0 += 32, s1 += 32 )
+		acc = mad( buff0[ s0 ], buff1[ s1 ], acc );
+
+	horizontalSum( thread, acc );
+}
+
+void scaleTempVector( uint i, const uint length, const uint thread, const float multiplier, bool round )
+{
+	const uint end = i + length;
+	for( i += thread; i < end; i += 32 )
+	{
+		float f = temp[ i ];
+		f *= multiplier;
+		if( round )
+			f = roundToFp16( f );
+		temp[ i ] = f;
+	}
+}
+
+#include "miscUtils.hlsli"
+
+// Transform temp[ i ] = exp( temp[ i ] - tempMax ), and return the sum of these values
+inline float applySoftMax( uint i, const uint length, const uint thread, const float tempMax )
+{
+	// Transform the values, and compute per-thread sum
+	const uint end = i + length;
+	float sum = 0;
+	for( i += thread; i < end; i += 32 )
+	{
+		float f = temp[ i ];
+		[branch]
+		if( f != negativeInfinity )
+		{
+			f -= tempMax;
+			const uint index = fp16Rounded( f );
+			const uint res16 = lookupTable[ index ];
+			f = f16tof32( res16 );
+		}
+		else
+			f = 0;
+
+		temp[ i ] = f;
+		sum += f;
+	}
+
+	// Reduce per-thread sum to the global one, over all threads of the group
+	horizontalSumBroadcast( thread, sum );
+	return sum;
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint neq0 = q_elements[ 0 ];
+	const uint neq1 = q_elements[ 1 ];
+	const uint neq2 = q_elements[ 2 ];
+	const uint neq3 = q_elements[ 3 ];
+
+	const uint nek0 = k_elements[ 0 ];
+	const uint nek1 = k_elements[ 1 ];
+
+	const uint nev1 = v_elements[ 1 ];
+
+	const uint ne0 = res_elements[ 0 ];
+	const uint ne1 = res_elements[ 1 ];
+
+	const uint nbk0 = k_strides[ 0 ];
+	const uint nbk1 = k_strides[ 1 ];
+	const uint nbk2 = k_strides[ 2 ];
+	const uint nbk3 = k_strides[ 3 ];
+
+	const uint nbq0 = q_strides[ 0 ];
+	const uint nbq1 = q_strides[ 1 ];
+	const uint nbq2 = q_strides[ 2 ];
+	const uint nbq3 = q_strides[ 3 ];
+
+	const uint nbv0 = v_strides[ 0 ];
+	const uint nbv1 = v_strides[ 1 ];
+	const uint nbv2 = v_strides[ 2 ];
+	const uint nbv3 = v_strides[ 3 ];
+
+	const uint nb0 = res_strides[ 0 ];
+	const uint nb1 = res_strides[ 1 ];
+	const uint nb2 = res_strides[ 2 ];
+	const uint nb3 = res_strides[ 3 ];
+
+	const uint D = neq0;
+	const uint N = neq1;
+	const uint P = nek1 - N;
+	const uint M = nek1;
+
+	const uint ir = group.x;
+	const uint iq3 = ir / ( neq2 * neq1 );
+	const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1;
+	const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 );
+
+	const uint tempIndex = ir * tempBufferStride;
+
+	uint ic;
+	float tvm = negativeInfinity;
+	const uint s1 = iq1 * nbq1 + iq2 * nbq2 + iq3 * nbq3;
+	uint s0 = iq2 * nbk2 + iq3 * nbk3;
+	for( ic = 0; ic < nek1; ic++, s0 += nbk1 )
+	{
+		if( masked )
+		{
+			if( ic > P + iq1 )
+			{
+				if( 0 == thread )
+					temp[ tempIndex + ic ] = negativeInfinity;
+				continue;
+			}
+		}
+
+		float dp;
+		computeDotProduct( k, q, s0, s1, neq0, thread, dp );
+		if( 0 == thread )
+		{
+			dp *= scale;
+			temp[ tempIndex + ic ] = dp;
+			tvm = max( tvm, dp );
+		}
+	}
+
+	if( 0 == thread )
+		sharedAccumulators[ 0 ] = tvm;
+	GroupMemoryBarrierWithGroupSync();
+	tvm = sharedAccumulators[ 0 ];
+
+	// Softmax
+	{
+		float sum = applySoftMax( tempIndex, M, thread, tvm );
+		scaleTempVector( tempIndex, M, thread, 1.0 / sum, true );
+	}
+
+	s0 = iq2 * nbv2 + iq3 * nbv3;
+	uint rdi = iq1 * nb1 + iq2 * nb2 + iq3 * nb3;
+	for( ic = 0; ic < nev1; ic++, s0 += nbv1, rdi += nb0 )
+	{
+		float dp;
+		computeDotProduct( v, temp, s0, tempIndex, nek1, thread, dp );
+		if( 0 == thread )
+			result[ rdi ] = dp;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/flashAttentionCommon.hlsli b/ComputeShaders/flashAttentionCommon.hlsli
new file mode 100644
index 0000000..68ed30b
--- /dev/null
+++ b/ComputeShaders/flashAttentionCommon.hlsli
@@ -0,0 +1,67 @@
+// Ported from ggml_compute_forward_flash_attn_f16
+// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups
+Buffer<float> q: register( t0 );
+Buffer<float> k: register( t1 );
+Buffer<float> v: register( t2 );
+
+RWBuffer<float> result: register( u0 );
+// This temporary buffer should fit tempBufferStride * neq1 * neq2 * neq3 elements, FP32 precision
+RWBuffer<float> temp: register( u1 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 q_elements: packoffset( c0 );
+	uint4 q_strides: packoffset( c1 );
+	uint4 k_elements: packoffset( c2 );
+	uint4 k_strides: packoffset( c3 );
+	uint4 v_elements: packoffset( c4 );
+	uint4 v_strides: packoffset( c5 );
+	uint4 res_elements: packoffset( c6 );
+	uint4 res_strides: packoffset( c7 );
+
+	bool masked : packoffset( c8.x );
+	// 1.0 / sqrt( (double) D )
+	float scale : packoffset( c8.y );
+	// This number is required to be >= nek1, and ideally rounded up to either 32 (L2 line) or 128 (L1 line) bytes
+	uint tempBufferStride: packoffset( c8.z );
+}
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+// Convert FP32 number to FP16 using rounding to nearest, then upcast back to FP32
+inline float roundToFp16( const float src )
+{
+	const uint trunc16 = f32tof16( src );
+	const float trunc32 = f16tof32( trunc16 );
+
+	const uint truncExp = ( trunc16 >> 10 ) & 0x1F;
+	if( truncExp != 0x1F )
+	{
+		const uint next16 = trunc16 + 1;
+		const float next32 = f16tof32( next16 );
+
+		const float errTrunc = abs( src - trunc32 );
+		const float errNext = abs( src - next32 );
+
+		if( errTrunc < errNext )
+		{
+			// Truncated was closer to the source
+			return trunc32;
+		}
+		else if( errTrunc > errNext )
+		{
+			// Truncated + 1 was closer to the source
+			return next32;
+		}
+		else
+		{
+			// Exactly half, doing banker's rounding to nearest even
+			return ( 0 == ( trunc16 & 1 ) ) ? trunc32 : next32;
+		}
+	}
+	else
+	{
+		// INF or NAN
+		return trunc32;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/flashAttentionCompat1.hlsl b/ComputeShaders/flashAttentionCompat1.hlsl
new file mode 100644
index 0000000..f1f2ddb
--- /dev/null
+++ b/ComputeShaders/flashAttentionCompat1.hlsl
@@ -0,0 +1,125 @@
+// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups
+#include "flashAttentionCommon.hlsli"
+#include "groupReduce.hlsli"
+
+inline void computeDotProduct( Buffer<float> buff0, Buffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc )
+{
+	acc = 0;
+	/*
+	const uint s0End = s0 + len;
+	s0 += thread;
+	s1 += thread;
+	for( ; s0 < s0End; s0 += 32, s1 += 32 )
+		acc = mad( buff0[ s0 ], buff1[ s1 ], acc );
+	horizontalSumCompatNew( thread, acc );
+	*/
+
+	const uint completeVectors = len / 32;
+	uint i;
+	for( i = 0; i < completeVectors; i++, s0 += 32, s1 += 32 )
+		acc = mad( buff0[ s0 + thread ], buff1[ s1 + thread ], acc );
+
+	horizontalSumCompatNew( thread, acc );
+
+	if( 0 == thread )
+	{
+		const uint rem = len % 32;
+		for( i = 0; i < rem; i++ )
+		{
+			precise float a = buff0[ s0 + i ];
+			precise float b = buff1[ s1 + i ];
+			precise float prod = a * b;
+			acc += prod;
+		}
+	}
+}
+
+void scaleTempVector( uint i, const uint length, const uint thread, const float multiplier )
+{
+	const uint end = i + length;
+	for( i += thread; i < end; i += 32 )
+	{
+		float f = temp[ i ];
+		f *= multiplier;
+		temp[ i ] = f;
+	}
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint neq0 = q_elements[ 0 ];
+	const uint neq1 = q_elements[ 1 ];
+	const uint neq2 = q_elements[ 2 ];
+	const uint neq3 = q_elements[ 3 ];
+
+	const uint nek0 = k_elements[ 0 ];
+	const uint nek1 = k_elements[ 1 ];
+
+	const uint nev1 = v_elements[ 1 ];
+
+	const uint ne0 = res_elements[ 0 ];
+	const uint ne1 = res_elements[ 1 ];
+
+	const uint nbk0 = k_strides[ 0 ];
+	const uint nbk1 = k_strides[ 1 ];
+	const uint nbk2 = k_strides[ 2 ];
+	const uint nbk3 = k_strides[ 3 ];
+
+	const uint nbq0 = q_strides[ 0 ];
+	const uint nbq1 = q_strides[ 1 ];
+	const uint nbq2 = q_strides[ 2 ];
+	const uint nbq3 = q_strides[ 3 ];
+
+	const uint nbv0 = v_strides[ 0 ];
+	const uint nbv1 = v_strides[ 1 ];
+	const uint nbv2 = v_strides[ 2 ];
+	const uint nbv3 = v_strides[ 3 ];
+
+	const uint nb0 = res_strides[ 0 ];
+	const uint nb1 = res_strides[ 1 ];
+	const uint nb2 = res_strides[ 2 ];
+	const uint nb3 = res_strides[ 3 ];
+
+	const uint D = neq0;
+	const uint N = neq1;
+	const uint P = nek1 - N;
+	// const uint M = P + N;
+	const uint M = nek1;
+
+	const uint ir = group.x;
+	const uint iq3 = ir / ( neq2 * neq1 );
+	const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1;
+	const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 );
+
+	const uint tempIndex = ir * tempBufferStride;
+
+	uint ic;
+	for( ic = 0; ic < nek1; ic++ )
+	{
+		// k indices
+		const uint ik3 = iq3;
+		const uint ik2 = iq2;
+		const uint ik1 = ic;
+
+		// S indices
+		const uint i1 = ik1;
+
+		if( masked )
+		{
+			if( ic > P + iq1 )
+			{
+				if( 0 == thread )
+					temp[ tempIndex + ic ] = negativeInfinity;
+				continue;
+			}
+		}
+
+		const uint s0 = ik1 * nbk1 + ik2 * nbk2 + ik3 * nbk3;
+		const uint s1 = iq1 * nbq1 + iq2 * nbq2 + iq3 * nbq3;
+		float dp;
+		computeDotProduct( k, q, s0, s1, neq0, thread, dp );
+		if( 0 == thread )
+			temp[ tempIndex + ic ] = dp * scale;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/flashAttentionCompat2.hlsl b/ComputeShaders/flashAttentionCompat2.hlsl
new file mode 100644
index 0000000..73f5fce
--- /dev/null
+++ b/ComputeShaders/flashAttentionCompat2.hlsl
@@ -0,0 +1,114 @@
+// Dispatch with [ ( neq1*neq2*neq3 + 31 ) / 32, 1, 1 ] thread groups
+#include "flashAttentionCommon.hlsli"
+Buffer<uint> lookupTable: register( t3 );
+
+void scaleTempVector( uint i, const uint length, const float multiplier )
+{
+	const uint end = i + length;
+	for( ; i < end; i++ )
+	{
+		float f = temp[ i ];
+		f *= multiplier;
+		// Rounding in this shader causes numerical errors on my GeForce 1080 Ti GPU, driver 527.56
+		// f = roundToFp16( f );
+		temp[ i ] = f;
+	}
+}
+
+inline float computeTempVectorMax( uint i, const uint length )
+{
+	// Compute per-thread maximum
+	const uint end = i + length;
+	float ax = negativeInfinity;
+	for( ; i < end; i++ )
+		ax = max( ax, temp[ i ] );
+	return ax;
+}
+
+#include "miscUtils.hlsli"
+#include "fp64Utils.hlsli"
+
+// Transform temp[ i ] = exp( temp[ i ] - tempMax ), and return the sum of these values
+inline double applySoftMax( uint i, const uint length, const float tempMax )
+{
+	// Transform the values, and compute per-thread sum
+	const uint end = i + length;
+	double sum = 0;
+	for( ; i < end; i++ )
+	{
+		float f = temp[ i ];
+		[branch]
+		if( f != negativeInfinity )
+		{
+			f -= tempMax;
+			const uint index = fp16Rounded( f );
+			const uint res16 = lookupTable[ index ];
+			f = f16tof32( res16 );
+			sum += f;
+		}
+		else
+			f = 0;
+
+		temp[ i ] = f;
+	}
+	return sum;
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 dtid: SV_DispatchThreadID )
+{
+	const uint neq0 = q_elements[ 0 ];
+	const uint neq1 = q_elements[ 1 ];
+	const uint neq2 = q_elements[ 2 ];
+	const uint neq3 = q_elements[ 3 ];
+
+	const uint nek0 = k_elements[ 0 ];
+	const uint nek1 = k_elements[ 1 ];
+
+	const uint nev1 = v_elements[ 1 ];
+
+	const uint ne0 = res_elements[ 0 ];
+	const uint ne1 = res_elements[ 1 ];
+
+	const uint nbk0 = k_strides[ 0 ];
+	const uint nbk1 = k_strides[ 1 ];
+	const uint nbk2 = k_strides[ 2 ];
+	const uint nbk3 = k_strides[ 3 ];
+
+	const uint nbq0 = q_strides[ 0 ];
+	const uint nbq1 = q_strides[ 1 ];
+	const uint nbq2 = q_strides[ 2 ];
+	const uint nbq3 = q_strides[ 3 ];
+
+	const uint nbv0 = v_strides[ 0 ];
+	const uint nbv1 = v_strides[ 1 ];
+	const uint nbv2 = v_strides[ 2 ];
+	const uint nbv3 = v_strides[ 3 ];
+
+	const uint nb0 = res_strides[ 0 ];
+	const uint nb1 = res_strides[ 1 ];
+	const uint nb2 = res_strides[ 2 ];
+	const uint nb3 = res_strides[ 3 ];
+
+	const uint D = neq0;
+	const uint N = neq1;
+	const uint P = nek1 - N;
+	// const uint M = P + N;
+	const uint M = nek1;
+
+	const uint ir = dtid.x;
+	if( ir >= neq1 * neq2 * neq3 )
+		return;
+
+	const uint iq3 = ir / ( neq2 * neq1 );
+	const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1;
+	const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 );
+
+	const uint tempIndex = ir * tempBufferStride;
+
+	// Softmax
+	float tvm = computeTempVectorMax( tempIndex, M );
+	double sum = applySoftMax( tempIndex, M, tvm );
+
+	scaleTempVector( tempIndex, M, (float)( 1.0 / sum ) );
+}
+\ No newline at end of file
diff --git a/ComputeShaders/flashAttentionCompat3.hlsl b/ComputeShaders/flashAttentionCompat3.hlsl
new file mode 100644
index 0000000..e0a4061
--- /dev/null
+++ b/ComputeShaders/flashAttentionCompat3.hlsl
@@ -0,0 +1,118 @@
+// Dispatch with [ neq1*neq2*neq3, 1, 1 ] thread groups
+#include "flashAttentionCommon.hlsli"
+#include "groupReduce.hlsli"
+#include "miscUtils.hlsli"
+
+inline void roundTempVector( uint i, const uint len, const uint thread )
+{
+	const uint iEnd = i + len;
+	for( i += thread; i < iEnd; i += 32 )
+	{
+		float f = temp[ i ];
+		f = roundToFp16( f );
+		temp[ i ] = f;
+	}
+}
+
+inline void computeDotProduct( Buffer<float> buff0, RWBuffer<float> buff1, uint s0, uint s1, const uint len, const uint thread, inout float acc )
+{
+	acc = 0;
+/*	const uint s0End = s0 + len;
+	s0 += thread;
+	s1 += thread;
+	for( ; s0 < s0End; s0 += 32, s1 += 32 )
+		acc = mad( buff0[ s0 ], buff1[ s1 ], acc );
+
+	horizontalSumCompatNew( thread, acc ); */
+	const uint completeVectors = len / 32;
+	uint i;
+	for( i = 0; i < completeVectors; i++, s0 += 32, s1 += 32 )
+		acc = mad( buff0[ s0 + thread ], buff1[ s1 + thread ], acc );
+
+	horizontalSumCompatNew( thread, acc );
+
+	if( 0 == thread )
+	{
+		const uint rem = len % 32;
+		if( 0 != rem )
+		{
+			double f64 = acc;
+			for( i = 0; i < rem; i++ )
+			{
+				precise float a = buff0[ s0 + i ];
+				precise float b = buff1[ s1 + i ];
+				precise float prod = a * b;
+				f64 += prod;
+			}
+			acc = (float)f64;
+		}
+	}
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint neq0 = q_elements[ 0 ];
+	const uint neq1 = q_elements[ 1 ];
+	const uint neq2 = q_elements[ 2 ];
+	const uint neq3 = q_elements[ 3 ];
+
+	const uint nek0 = k_elements[ 0 ];
+	const uint nek1 = k_elements[ 1 ];
+
+	const uint nev1 = v_elements[ 1 ];
+
+	const uint ne0 = res_elements[ 0 ];
+	const uint ne1 = res_elements[ 1 ];
+
+	const uint nbk0 = k_strides[ 0 ];
+	const uint nbk1 = k_strides[ 1 ];
+	const uint nbk2 = k_strides[ 2 ];
+	const uint nbk3 = k_strides[ 3 ];
+
+	const uint nbq0 = q_strides[ 0 ];
+	const uint nbq1 = q_strides[ 1 ];
+	const uint nbq2 = q_strides[ 2 ];
+	const uint nbq3 = q_strides[ 3 ];
+
+	const uint nbv0 = v_strides[ 0 ];
+	const uint nbv1 = v_strides[ 1 ];
+	const uint nbv2 = v_strides[ 2 ];
+	const uint nbv3 = v_strides[ 3 ];
+
+	const uint nb0 = res_strides[ 0 ];
+	const uint nb1 = res_strides[ 1 ];
+	const uint nb2 = res_strides[ 2 ];
+	const uint nb3 = res_strides[ 3 ];
+
+	const uint D = neq0;
+	const uint N = neq1;
+	const uint P = nek1 - N;
+	// const uint M = P + N;
+	const uint M = nek1;
+
+	const uint ir = group.x;
+	const uint iq3 = ir / ( neq2 * neq1 );
+	const uint iq2 = ( ir - iq3 * neq2 * neq1 ) / neq1;
+	const uint iq1 = ( ir - iq3 * neq2 * neq1 - iq2 * neq1 );
+
+	const uint tempIndex = ir * tempBufferStride;
+
+	roundTempVector( tempIndex, nek1, thread );
+	AllMemoryBarrierWithGroupSync();
+
+	uint rdi = iq1 * nb1 + iq2 * nb2 + iq3 * nb3;
+	for( uint ic = 0; ic < nev1; ic++, rdi += nb0 )
+	{
+		// dst indices
+		const uint i1 = iq1;
+		const uint i2 = iq2;
+		const uint i3 = iq3;
+
+		const uint s0 = ic * nbv1 + i2 * nbv2 + i3 * nbv3;
+		float dp;
+		computeDotProduct( v, temp, s0, tempIndex, nek1, thread, dp );
+		if( 0 == thread )
+			result[ rdi ] = dp;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/fmaRepeat1.hlsl b/ComputeShaders/fmaRepeat1.hlsl
new file mode 100644
index 0000000..3db3827
--- /dev/null
+++ b/ComputeShaders/fmaRepeat1.hlsl
@@ -0,0 +1,77 @@
+// Implementation of fmaRepeat() when both source arguments have same size and strides
+// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor
+RWBuffer<float> tensor: register( u0 );
+Buffer<float> patternMul: register( t0 );
+Buffer<float> patternAdd: register( t1 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 tensorSize: packoffset( c0 );
+	uint4 tensorStrides: packoffset( c1 );
+	uint4 patternSize: packoffset( c2 );
+	uint4 patternStrides: packoffset( c3 );
+}
+
+#ifndef THREADS
+#define THREADS 512
+#endif
+
+#include "repeatUtils.hlsli"
+
+inline void computeSimple( uint idx, float mul, float add )
+{
+	precise float f = tensor[ idx ];
+	f *= mul;
+	f += add;
+	tensor[ idx ] = f;
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides );
+	uint rsi = rowOffset( group % patternSize.yzw, patternStrides );
+
+	if( patternSize[ 0 ] == 1 )
+	{
+		// The pattern only has 1 column - broadcasting over the row
+		const float pMul = patternMul[ rsi ];
+		const float pAdd = patternAdd[ rsi ];
+		ROW_LOOP( it )
+			computeSimple( it.x, pMul, pAdd );
+	}
+	else if( patternSize[ 0 ] <= THREADS )
+	{
+		// pattern size doesn't exceed thread group size: load pattern value outside of the loop
+		const uint threadsPerGroup = THREADS - ( THREADS % patternSize[ 0 ] );
+		if( thread >= threadsPerGroup )
+			return;
+
+		rsi += ( thread % patternSize[ 0 ] ) * patternStrides[ 0 ];
+		const float pMul = patternMul[ rsi ];
+		const float pAdd = patternAdd[ rsi ];
+		ROW_LOOP_EX( it, threadsPerGroup, tensorStrides )
+			computeSimple( it.x, pMul, pAdd );
+	}
+	else
+	{
+		// Pattern rows are larger than the thread group, need to stream from both buffers
+		const uint rsiInc = THREADS * patternStrides[ 0 ];
+		const uint rsiDec = patternSize[ 0 ] * patternStrides[ 0 ];
+		const uint rsiEnd = rsi + rsiDec;
+		rsi += thread * patternStrides[ 0 ];
+
+		ROW_LOOP( it )
+		{
+			precise float f = tensor[ it.x ];
+			float mul = patternMul[ rsi ];
+			float add = patternAdd[ rsi ];
+			rsi += rsiInc;
+			if( rsi >= rsiEnd )
+				rsi -= rsiDec;
+			f *= mul;
+			f += add;
+			tensor[ it.x ] = f;
+		}
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/fmaRepeat164.hlsl b/ComputeShaders/fmaRepeat164.hlsl
new file mode 100644
index 0000000..99813f6
--- /dev/null
+++ b/ComputeShaders/fmaRepeat164.hlsl
@@ -0,0 +1,2 @@
+#define THREADS 64
+#include "fmaRepeat1.hlsl"
+\ No newline at end of file
diff --git a/ComputeShaders/fmaRepeat2.hlsl b/ComputeShaders/fmaRepeat2.hlsl
new file mode 100644
index 0000000..edadf0a
--- /dev/null
+++ b/ComputeShaders/fmaRepeat2.hlsl
@@ -0,0 +1,45 @@
+// Implementation of fmaRepeat() when source arguments have different shape or VRAM layout
+// Dispatch [ nb[ 1 ], nb[ 2 ], nb[ 3 ] ] thread groups of this shader, where nb is size of the destination tensor
+RWBuffer<float> tensor: register( u0 );
+Buffer<float> patternMul: register( t0 );
+Buffer<float> patternAdd: register( t1 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 tensorSize: packoffset( c0 );
+	uint4 tensorStrides: packoffset( c1 );
+	uint4 patternSizeMul: packoffset( c2 );
+	uint4 patternStridesMul: packoffset( c3 );
+	uint4 patternSizeAdd: packoffset( c4 );
+	uint4 patternStridesAdd: packoffset( c5 );
+}
+
+#ifndef THREADS
+#define THREADS 32
+#endif
+
+#include "repeatUtils.hlsli"
+
+inline float loadPattern( Buffer<float> buffer, uint rowStart, uint i, uint4 size, uint4 stride )
+{
+	i %= size.x;
+	return buffer[ i * stride.x + rowStart ];
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	uint3 it = tensorIteratorState( group, thread, tensorSize, tensorStrides );
+	const uint rsiMul = rowOffset( group % patternSizeMul.yzw, patternStridesMul );
+	const uint rsiAdd = rowOffset( group % patternSizeAdd.yzw, patternStridesAdd );
+
+	for( uint i = thread; it.x < it.z; it.x += it.y, i++ )
+	{
+		precise float f = tensor[ it.x ];
+		float mul = loadPattern( patternMul, rsiMul, i, patternSizeMul, patternStridesMul );
+		float add = loadPattern( patternAdd, rsiAdd, i, patternSizeAdd, patternStridesAdd );
+		f *= mul;
+		f += add;
+		tensor[ it.x ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/fp64Utils.hlsli b/ComputeShaders/fp64Utils.hlsli
new file mode 100644
index 0000000..9782718
--- /dev/null
+++ b/ComputeShaders/fp64Utils.hlsli
@@ -0,0 +1,28 @@
+// TODO: compile another version of these shader, and use it on GPUs with ExtendedDoublesShaderInstructions flag, will become slightly faster
+// https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ns-d3d11-d3d11_feature_data_d3d11_options
+#ifndef ExtendedDoublesShaderInstructions
+#define ExtendedDoublesShaderInstructions 0
+#endif
+
+// Compute num/den in FP64 precision
+inline double div64( double num, double den )
+{
+#if ExtendedDoublesShaderInstructions
+	return num / den;
+#else
+	// https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
+	double x = 1.0f / (float)den;
+	x += x * ( 1.0 - den * x );
+	x += x * ( 1.0 - den * x );
+	return num * x;
+#endif
+}
+
+// Compute sqrt(x) in FP64 precision
+inline double sqrt64( double x )
+{
+	double root = sqrt( (float)x );
+	root = 0.5 * ( root + div64( x, root ) );
+	root = 0.5 * ( root + div64( x, root ) );
+	return root;
+}
+\ No newline at end of file
diff --git a/ComputeShaders/groupReduce.hlsli b/ComputeShaders/groupReduce.hlsli
new file mode 100644
index 0000000..1ffe1d8
--- /dev/null
+++ b/ComputeShaders/groupReduce.hlsli
@@ -0,0 +1,139 @@
+groupshared float sharedAccumulators[ 32 ];
+
+// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group.
+void horizontalSum( const uint thread, inout float sum )
+{
+	sharedAccumulators[ thread ] = sum;
+	for( uint i = 16; i > 1; i /= 2 )
+	{
+		GroupMemoryBarrierWithGroupSync();
+		if( thread < i )
+		{
+			sum += sharedAccumulators[ thread + i ];
+			sharedAccumulators[ thread ] = sum;
+		}
+	}
+	GroupMemoryBarrierWithGroupSync();
+	if( 0 == thread )
+		sum += sharedAccumulators[ 1 ];
+}
+
+// Compute horisontal sum of the numbers, and broadcast to all threads of the group.
+void horizontalSumBroadcast( const uint thread, inout float sum )
+{
+	horizontalSum( thread, sum );
+	if( 0 == thread )
+		sharedAccumulators[ 0 ] = sum;
+	GroupMemoryBarrierWithGroupSync();
+	sum = sharedAccumulators[ 0 ];
+}
+
+// Compute horisontal sum of the numbers, in the order equal to the CPU-running dot product implementation.
+// The result is only correct on the thread #0 of the group.
+void horizontalSumCompat( const uint thread, inout float sum )
+{
+	sharedAccumulators[ thread ] = sum;
+	GroupMemoryBarrierWithGroupSync();
+
+	if( 0 == ( thread & 8 ) )
+	{
+		// This runs on threads [ 0 .. 7 ] and [ 16 .. 23 ]
+		// sum01 = _mm256_add_ps( sum0, sum1 );
+		// sum23 = _mm256_add_ps( sum2, sum3 );
+		sum += sharedAccumulators[ thread + 8 ];
+		sharedAccumulators[ thread ] = sum;
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+	if( thread < 8 )
+	{
+		// This runs on threads [ 0 .. 7 ]
+		// sum0123 = _mm256_add_ps( sum01, sum23 );
+		sum += sharedAccumulators[ thread + 16 ];
+		sharedAccumulators[ thread ] = sum;
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+	if( thread < 4 )
+	{
+		// const __m128 r4 = _mm_add_ps( _mm256_castps256_ps128( sum0123 ), _mm256_extractf128_ps( sum0123, 1 ) );
+		sum += sharedAccumulators[ thread + 4 ];
+		sharedAccumulators[ thread ] = sum;
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+	if( thread < 2 )
+	{
+		// const __m128 r2 = _mm_add_ps( r4, _mm_movehl_ps( r4, r4 ) );
+		sum += sharedAccumulators[ thread + 2 ];
+		sharedAccumulators[ thread ] = sum;
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+	if( 0 == thread )
+	{
+		// const __m128 r1 = _mm_add_ss( r2, _mm_movehdup_ps( r2 ) );
+		sum += sharedAccumulators[ 1 ];
+	}
+}
+
+// Compute horisontal sum of the numbers, in yet another creative summation order recently implemented in the upstream
+void horizontalSumCompatNew( const uint thread, inout float sum )
+{
+	// GGML_F32x8_REDUCE
+	sharedAccumulators[ thread ] = sum;
+	GroupMemoryBarrierWithGroupSync();
+
+	if( 0 == ( thread & 8 ) )
+	{
+		// Runs on threads [ 0 .. 7 ] and [ 16 .. 23 ]
+		sum += sharedAccumulators[ thread | 8 ];
+		sharedAccumulators[ thread ] = sum;
+	}
+	GroupMemoryBarrierWithGroupSync();
+
+	if( thread < 8 )
+	{
+		// Runs on threads [ 0 .. 7 ]
+		sum += sharedAccumulators[ thread | 0x10 ];
+		sharedAccumulators[ thread ] = sum;
+	}
+	GroupMemoryBarrierWithGroupSync();
+
+	if( thread < 4 )
+	{
+		// Runs on threads [ 0 .. 3 ]
+		sum += sharedAccumulators[ thread | 4 ];
+		sharedAccumulators[ thread ] = sum;
+	}
+	GroupMemoryBarrierWithGroupSync();
+
+	if( thread < 4 && 0 == ( thread & 1 ) )
+	{
+		// Runs on threads [ 0, 2 ]
+		sum += sharedAccumulators[ thread | 1 ];
+		sharedAccumulators[ thread ] = sum;
+	}
+	GroupMemoryBarrierWithGroupSync();
+
+	if( 0 == thread )
+		sum += sharedAccumulators[ 2 ];
+}
+
+
+// Compute horizontal maximum of the numbers, and broadcast to all threads of the group.
+void horizontalMaxBroadcast( const uint thread, inout float ax )
+{
+	sharedAccumulators[ thread ] = ax;
+	for( uint i = 16; i > 0; i /= 2 )
+	{
+		GroupMemoryBarrierWithGroupSync();
+		if( thread < i )
+		{
+			ax = max( ax, sharedAccumulators[ thread + i ] );
+			sharedAccumulators[ thread ] = ax;
+		}
+	}
+	GroupMemoryBarrierWithGroupSync();
+	ax = sharedAccumulators[ 0 ];
+}
+\ No newline at end of file
diff --git a/ComputeShaders/groupReduce64.hlsli b/ComputeShaders/groupReduce64.hlsli
new file mode 100644
index 0000000..7094d03
--- /dev/null
+++ b/ComputeShaders/groupReduce64.hlsli
@@ -0,0 +1,46 @@
+groupshared float sharedAccumulators[ 64 ];
+
+// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group.
+void horizontalSum( const uint thread, inout float sum )
+{
+	sharedAccumulators[ thread ] = sum;
+	for( uint i = 32; i > 1; i /= 2 )
+	{
+		GroupMemoryBarrierWithGroupSync();
+		if( thread < i )
+		{
+			sum += sharedAccumulators[ thread + i ];
+			sharedAccumulators[ thread ] = sum;
+		}
+	}
+	GroupMemoryBarrierWithGroupSync();
+	if( 0 == thread )
+		sum += sharedAccumulators[ 1 ];
+}
+
+// Compute horisontal sum of the numbers, and broadcast to all threads of the group.
+void horizontalSumBroadcast( const uint thread, inout float sum )
+{
+	horizontalSum( thread, sum );
+	if( 0 == thread )
+		sharedAccumulators[ 0 ] = sum;
+	GroupMemoryBarrierWithGroupSync();
+	sum = sharedAccumulators[ 0 ];
+}
+
+// Compute horizontal maximum of the numbers, and broadcast to all threads of the group.
+void horizontalMaxBroadcast( const uint thread, inout float ax )
+{
+	sharedAccumulators[ thread ] = ax;
+	for( uint i = 32; i > 0; i /= 2 )
+	{
+		GroupMemoryBarrierWithGroupSync();
+		if( thread < i )
+		{
+			ax = max( ax, sharedAccumulators[ thread + i ] );
+			sharedAccumulators[ thread ] = ax;
+		}
+	}
+	GroupMemoryBarrierWithGroupSync();
+	ax = sharedAccumulators[ 0 ];
+}
+\ No newline at end of file
diff --git a/ComputeShaders/matReshapePanels.hlsl b/ComputeShaders/matReshapePanels.hlsl
new file mode 100644
index 0000000..f26f246
--- /dev/null
+++ b/ComputeShaders/matReshapePanels.hlsl
@@ -0,0 +1,105 @@
+// This shader reshapes a matrix into the shape expected by mulMatTiledEx.hlsl and mulMatByRowTiledEx.hlsl compute shaders
+// It's called in runtime, also while loading models from disk.
+// So far, it's only used when running on AMD GPUs.
+#ifndef TILE_SIZE
+static const uint TILE_SIZE = 32;
+#endif
+
+// Input tensor
+Buffer<float> source: register( t0 );
+// Output tensor
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 arg0Size: packoffset( c0 );
+	uint4 arg0Strides: packoffset( c1 );
+	// Count of elements per panel
+	uint panelSize : packoffset( c2.y );
+	// Layer strides of the output matrix
+	uint2 layerStrides: packoffset( c2.z );
+}
+
+inline uint hadd( uint2 v2 ) { return v2.x + v2.y; }
+
+groupshared float tileBuffer[ TILE_SIZE ][ TILE_SIZE ];
+
+[ numthreads( TILE_SIZE, 1, 1 ) ]
+void main( const uint3 group: SV_GroupID, const uint thread : SV_GroupIndex )
+{
+	uint rdi = hadd( group.yz * layerStrides );
+	rdi += group.x * panelSize;
+	rdi += thread;
+
+	uint rsi = hadd( group.yz * arg0Strides.zw );
+	const uint baseY = group.x * TILE_SIZE;
+	const uint dispatchThread = baseY + thread;
+	// Reshaping into a column major horizontal panel, height = TILE_SIZE, width = width of the source matrix
+	uint width = arg0Size.x;
+	// Usually TILE_SIZE; can be less for the last panel on the matrix when we need to generate zeros instead of loading these numbers
+	const uint height = min( TILE_SIZE, arg0Size.y - baseY );
+
+	if( arg0Strides.x == 1 )
+	{
+		// The input matrix is row major, can improve performance with coalesced loads and group shared buffer.
+		rsi += baseY * arg0Strides.y;
+
+		const uint widthCompleteTiles = width / TILE_SIZE;
+
+		if( height < TILE_SIZE )
+		{
+			// This thread group was dispatched for the last panel of the matrix, it doesn't have enough rows
+			// Write zeros to the corresponding elements of the groupshared buffer
+			for( uint j = height; j < TILE_SIZE; j++ )
+				tileBuffer[ thread ][ j ] = 0.0;
+		}
+
+		for( uint i = 0; i < widthCompleteTiles; i++, rsi += TILE_SIZE )
+		{
+			// Load [ TILE_SIZE ] * [ TILE_SIZE ] block with fully coalesced loads, store to group shared buffer in transposed order
+			uint rsiTile = rsi + thread;
+			uint j;
+			for( j = 0; j < height; j++, rsiTile += arg0Strides.y )
+			{
+				// Each iteration of the loop loads a row of [ TILE_SIZE ] elements from the corresponding row of the source tensor
+				// Fully coalesced load
+				float f = source[ rsiTile ];
+				// Random store but the local memory's fast, this works rather well in practice
+				tileBuffer[ thread ][ j ] = f;
+			}
+
+			GroupMemoryBarrierWithGroupSync();
+
+			// Copy from group shared buffer to output tensor
+			for( j = 0; j < TILE_SIZE; j++, rdi += TILE_SIZE )
+			{
+				// Fully coalesced loads and stores
+				float f = tileBuffer[ j ][ thread ];
+				result[ rdi ] = f;
+			}
+
+			GroupMemoryBarrierWithGroupSync();
+		}
+
+		width %= TILE_SIZE;
+		if( 0 == width )
+			return;
+		rsi += thread * arg0Strides.y;
+	}
+	else
+		rsi += dispatchThread * arg0Strides.y;
+
+	for( uint i = 0; i < width; i++ )
+	{
+		float f;
+		[branch]
+		if( thread < height )
+			f = source[ rsi ];
+		else
+			f = 0.0;
+		rsi += arg0Strides.x;
+
+		result[ rdi ] = f;
+		rdi += TILE_SIZE;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/miscUtils.hlsli b/ComputeShaders/miscUtils.hlsli
new file mode 100644
index 0000000..b957a06
--- /dev/null
+++ b/ComputeShaders/miscUtils.hlsli
@@ -0,0 +1,84 @@
+// When GPUs are converting FP32 to FP16, they always truncate towards 0, documented there:
+// https://learn.microsoft.com/en-us/windows/win32/direct3d10/d3d10-graphics-programming-guide-resources-data-conversion#conververting-from-a-higher-range-representation-to-a-lower-range-representation
+// Whisper code uses _mm_cvtps_ph( x, 0 ), the 0 stands for "Round to nearest even": https://www.felixcloutier.com/x86/vcvtps2ph
+// This function adjusts FP32 value making it so that truncation towards 0 results in the value equal to what CPU is doing
+inline float adjustFp16( const float src )
+{
+	const uint trunc16 = f32tof16( src );
+	const float trunc32 = f16tof32( trunc16 );
+
+	const uint truncExp = ( trunc16 >> 10 ) & 0x1F;
+	if( truncExp != 0x1F )
+	{
+		const uint next16 = trunc16 + 1;
+		const float next32 = f16tof32( next16 );
+
+		const float errTrunc = abs( src - trunc32 );
+		const float errNext = abs( src - next32 );
+
+		if( errTrunc < errNext )
+		{
+			// Truncated was closer to the source
+			return src;
+		}
+		else if( errTrunc > errNext )
+		{
+			// Truncated + 1 was closer to the source
+			return next32;
+		}
+		else
+		{
+			// Exactly half, doing banker's rounding to nearest even
+			return ( 0 == ( trunc16 & 1 ) ) ? src : next32;
+		}
+	}
+	else
+	{
+		// INF or NAN
+		return src;
+	}
+}
+
+// Convert FP32 number to FP16, using rounding to nearest
+inline uint fp16Rounded( const float src )
+{
+	const uint trunc16 = f32tof16( src );
+	const float trunc32 = f16tof32( trunc16 );
+
+	const uint truncExp = ( trunc16 >> 10 ) & 0x1F;
+	if( truncExp != 0x1F )
+	{
+		const uint next16 = trunc16 + 1;
+		const float next32 = f16tof32( next16 );
+
+		const float errTrunc = abs( src - trunc32 );
+		const float errNext = abs( src - next32 );
+
+		if( errTrunc < errNext )
+		{
+			// Truncated was closer to the source
+			return trunc16;
+		}
+		else if( errTrunc > errNext )
+		{
+			// Truncated + 1 was closer to the source
+			return next16;
+		}
+		else
+		{
+			// Exactly half, doing banker's rounding to nearest even
+			return ( 0 == ( trunc16 & 1 ) ) ? trunc16 : next16;
+		}
+	}
+	else
+	{
+		// INF or NAN
+		return trunc16;
+	}
+}
+
+// Round up the number to be a multiple of 32
+inline uint roundUp32( uint x )
+{
+	return ( x + 31 ) & ( ~31u );
+}
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatByRow.hlsl b/ComputeShaders/mulMatByRow.hlsl
new file mode 100644
index 0000000..565cfdc
--- /dev/null
+++ b/ComputeShaders/mulMatByRow.hlsl
@@ -0,0 +1,49 @@
+// Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ]
+// Dispatch [ E1, E2, E3 ] groups of this shader
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 arg0Size: packoffset( c0 );
+	uint4 arg0Strides: packoffset( c1 );
+	uint4 arg1Size: packoffset( c2 );
+	uint4 arg1Strides: packoffset( c3 );
+	uint4 resultSize: packoffset( c4 );
+	uint4 resultStrides: packoffset( c5 );
+}
+
+#include "groupReduce.hlsli"
+
+inline uint hadd( uint3 vec )
+{
+	return vec.x + vec.y + vec.z;
+}
+inline uint hadd( uint2 vec )
+{
+	return vec.x + vec.y;
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	uint s0 = hadd( group * arg0Strides.yzw );
+	uint s1 = hadd( group.yz * arg1Strides.zw );
+	const uint s0End = s0 + arg0Size.x * arg0Strides.x;
+	const uint s0Inc = 32 * arg0Strides.x;
+	const uint s1Inc = 32 * arg1Strides.x;
+
+	s0 += thread * arg0Strides.x;
+	s1 += thread * arg1Strides.x;
+	float dp = 0;
+	for( ; s0 < s0End; s0 += s0Inc, s1 += s1Inc )
+		dp = mad( arg0[ s0 ], arg1[ s1 ], dp );
+
+	horizontalSum( thread, dp );
+	if( 0 != thread )
+		return;
+
+	const uint rdi = group.x + hadd( group.yz * resultStrides.zw );
+	result[ rdi ] = dp;
+}
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatByRow64.hlsl b/ComputeShaders/mulMatByRow64.hlsl
new file mode 100644
index 0000000..db5f801
--- /dev/null
+++ b/ComputeShaders/mulMatByRow64.hlsl
@@ -0,0 +1,90 @@
+// Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ]
+// Dispatch [ E1, E2, E3 ] groups of this shader
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 arg0Size: packoffset( c0 );
+	uint4 arg0Strides: packoffset( c1 );
+	uint4 arg1Size: packoffset( c2 );
+	uint4 arg1Strides: packoffset( c3 );
+	uint4 resultSize: packoffset( c4 );
+	uint4 resultStrides: packoffset( c5 );
+}
+
+inline uint hadd( uint3 vec )
+{
+	return vec.x + vec.y + vec.z;
+}
+inline uint hadd( uint2 vec )
+{
+	return vec.x + vec.y;
+}
+
+// No idea why, but that particular configuration appears to be the fastest one on Ryzen 7 5700G iGPU
+// Not by much, though: when trying a few numbers I saw 1.30 - 1.42 seconds for this compute shader
+static const uint THREADS = 64;
+static const uint REDUCTION_BUFFER = 32;
+groupshared float sharedAccumulators[ REDUCTION_BUFFER ];
+
+// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group.
+void horizontalSum( const uint thread, inout float sum )
+{
+	if( THREADS > REDUCTION_BUFFER )
+	{
+		for( uint t = REDUCTION_BUFFER; t < THREADS; t += REDUCTION_BUFFER )
+		{
+			// Threads [ t .. t + REDUCTION_BUFFER ] store into the buffer
+			if( thread >= t && thread < t + REDUCTION_BUFFER )
+				sharedAccumulators[ thread - t ] = sum;
+
+			GroupMemoryBarrierWithGroupSync();
+
+			// Threads [ 0 .. REDUCTION_BUFFER ] increment their local sum with the value loaded from the buffer
+			if( thread < REDUCTION_BUFFER )
+				sum += sharedAccumulators[ thread ];
+		}
+	}
+
+	if( thread < REDUCTION_BUFFER )
+		sharedAccumulators[ thread ] = sum;
+
+	for( uint i = REDUCTION_BUFFER / 2; i > 1; i /= 2 )
+	{
+		GroupMemoryBarrierWithGroupSync();
+		if( thread < i )
+		{
+			sum += sharedAccumulators[ thread + i ];
+			sharedAccumulators[ thread ] = sum;
+		}
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+	if( 0 == thread )
+		sum += sharedAccumulators[ 1 ];
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	uint s0 = hadd( group * arg0Strides.yzw );
+	uint s1 = hadd( group.yz * arg1Strides.zw );
+	const uint s0End = s0 + arg0Size.x * arg0Strides.x;
+	const uint s0Inc = THREADS * arg0Strides.x;
+	const uint s1Inc = THREADS * arg1Strides.x;
+
+	s0 += thread * arg0Strides.x;
+	s1 += thread * arg1Strides.x;
+	float dp = 0;
+	for( ; s0 < s0End; s0 += s0Inc, s1 += s1Inc )
+		dp = mad( arg0[ s0 ], arg1[ s1 ], dp );
+
+	horizontalSum( thread, dp );
+	if( 0 != thread )
+		return;
+
+	const uint rdi = group.x + hadd( group.yz * resultStrides.zw );
+	result[ rdi ] = dp;
+}
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatByRowTiled.hlsl b/ComputeShaders/mulMatByRowTiled.hlsl
new file mode 100644
index 0000000..fea2fcb
--- /dev/null
+++ b/ComputeShaders/mulMatByRowTiled.hlsl
@@ -0,0 +1,120 @@
+// Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ]
+// Dispatch [ ( E1 + TILE_Y - 1 ) / TILE_Y, E2, E3 ] thread groups of this shader
+
+#ifndef TILE_Y
+static const uint TILE_Y = 64;
+#endif
+#ifndef THREADS_X
+static const uint THREADS_X = 32;
+#endif
+#ifndef THREADS_Y
+static const uint THREADS_Y = 16;
+#endif
+
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 arg0Size: packoffset( c0 );
+	uint4 arg0Strides: packoffset( c1 );
+	uint4 arg1Size: packoffset( c2 );
+	uint4 arg1Strides: packoffset( c3 );
+	uint4 resultSize: packoffset( c4 );
+	uint4 resultStrides: packoffset( c5 );
+}
+
+groupshared float resTemp[ TILE_Y ][ THREADS_X ];
+
+inline uint hadd( uint2 vec )
+{
+	return vec.x + vec.y;
+}
+
+[ numthreads( THREADS_X, THREADS_Y, 1 ) ]
+void main( uint3 group: SV_GroupID, uint3 thread : SV_GroupThreadID, uint threadFlattenned : SV_GroupIndex )
+{
+	uint i;
+	// Zero out the shared buffer
+	for( i = thread.y; i < TILE_Y; i += THREADS_Y )
+		resTemp[ i ][ thread.x ] = 0.0;
+	GroupMemoryBarrierWithGroupSync();
+
+	// Count of rows to compute in this thread group
+	const uint height = min( TILE_Y, arg0Size.y - group.x * TILE_Y );
+
+	uint s0 = hadd( group.yz * arg0Strides.zw );   //< arg0 layer for the thread group
+	s0 += group.x * TILE_Y * arg0Strides.y;        //< arg0 first row for the thread group
+	s0 += hadd( arg0Strides.xy * thread.xy );      //< arg0 load index for the thread
+
+	uint s1 = hadd( group.yz * arg1Strides.zw );   //< arg1 layer for the thread group
+	s1 += thread.x * arg1Strides.x;                //< arg1 load index for the thread
+
+	const uint completeTiles = arg0Size.x / THREADS_X;
+	// Each iteration of that loop loads THREADS_X elements from arg1,
+	// a block of [ THREADS_X, height ] elements from arg0,
+	// and accumulates these dot products in the shared buffer
+	for( uint t = 0; t < completeTiles; t++, s0 += THREADS_X * arg0Strides.x, s1 += THREADS_X * arg1Strides.x )
+	{
+		// Load THREADS_X elements from arg1
+		const float v1 = arg1[ s1 ];
+
+		uint rsi = s0;
+		for( i = thread.y; i < height; i += THREADS_Y, rsi += arg0Strides.y * THREADS_Y )
+		{
+			// Load THREADS_X elements from arg0
+			const float v0 = arg0[ rsi ];
+			// Multiply and accumulate in the shared buffer
+			float acc = resTemp[ i ][ thread.x ];
+			acc = mad( v0, v1, acc );
+			resTemp[ i ][ thread.x ] = acc;
+		}
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	const uint rem = arg0Size.x % THREADS_X;
+	if( rem != 0 )
+	{
+		// E0 ain't a multiple of THREADS_X, we have a remainder
+		float v1;
+		if( thread.x < rem )
+			v1 = arg1[ s1 ];
+		else
+			v1 = 0.0;
+
+		for( i = thread.y; i < height; i += THREADS_Y, s0 += arg0Strides.y * THREADS_Y )
+		{
+			if( thread.x >= rem )
+				continue;
+			const float v0 = arg0[ s0 ];
+			float acc = resTemp[ i ][ thread.x ];
+			acc = mad( v0, v1, acc );
+			resTemp[ i ][ thread.x ] = acc;
+		}
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	// Now we need horizontal sums of these shared accumulators, i.e. reduce [height][THREADS_X] shared array into [height][1] column
+	for( i = THREADS_X / 2; i > 0; i /= 2 )
+	{
+		if( thread.x < i )
+		{
+			for( uint j = thread.y; j < height; j += THREADS_Y )
+			{
+				float sum = resTemp[ j ][ thread.x ];
+				sum += resTemp[ j ][ thread.x + i ];
+				resTemp[ j ][ thread.x ] = sum;
+			}
+		}
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	// And finally, store that column to global memory
+	if( threadFlattenned >= height )
+		return;
+
+	uint rdi = hadd( group.yz * resultStrides.zw ) + group.x * TILE_Y * resultStrides.x;
+	rdi += threadFlattenned * resultStrides.x;
+	result[ rdi ] = resTemp[ threadFlattenned ][ 0 ];
+}
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatByRowTiled64.hlsl b/ComputeShaders/mulMatByRowTiled64.hlsl
new file mode 100644
index 0000000..6c63f2d
--- /dev/null
+++ b/ComputeShaders/mulMatByRowTiled64.hlsl
@@ -0,0 +1,4 @@
+#define THREADS_Y 32
+#define THREADS_X 32
+#define TILE_Y 128
+#include "mulMatByRowTiled.hlsl"
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatByRowTiledEx.hlsl b/ComputeShaders/mulMatByRowTiledEx.hlsl
new file mode 100644
index 0000000..d377b8c
--- /dev/null
+++ b/ComputeShaders/mulMatByRowTiledEx.hlsl
@@ -0,0 +1,156 @@
+// matrix*row vector product, needs first argument reshaped into a sequence of horizontal column major panels
+#ifndef TILE_SIZE
+static const uint TILE_SIZE = 32;
+#endif
+#ifndef TILE_HEIGHT
+static const uint TILE_HEIGHT = 32;
+#endif
+#ifndef THREADS_Y
+static const uint THREADS_Y = 16;
+#endif
+
+// First tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ]
+Buffer<float> arg0: register( t0 );
+// Second tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ]
+Buffer<float> arg1: register( t1 );
+// FP32 output tensor, row major and continuous
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 arg0Size: packoffset( c0 );
+	uint arg0panel: packoffset( c1.y );
+	uint2 arg0LayerStrides: packoffset( c1.z );
+	// uint4 arg1Size: packoffset( c2 );
+	uint4 arg1Strides: packoffset( c3 );
+	uint4 resultSize: packoffset( c4 );
+	uint4 resultStrides: packoffset( c5 );
+}
+
+groupshared float tileOutput[ THREADS_Y ][ TILE_SIZE ];
+groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ];
+groupshared float tile1[ TILE_HEIGHT ];
+
+void multiplyTiles( const uint3 thread )
+{
+	float r = 0.0;
+	for( uint i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
+	{
+		float a = tile0[ i ][ thread.x ];
+		float b = tile1[ i ];
+		r = mad( a, b, r );
+	}
+	tileOutput[ thread.y ][ thread.x ] += r;
+}
+
+void reduceOutput( const uint3 thread )
+{
+	float curr = 0.0;
+	[branch]
+	if( thread.y < THREADS_Y / 2 )
+		curr = tileOutput[ thread.y ][ thread.x ];
+
+	for( uint i = THREADS_Y / 2; i > 0; i /= 2 )
+	{
+		[branch]
+		if( thread.y < i )
+		{
+			curr += tileOutput[ thread.y + i ][ thread.x ];
+			tileOutput[ thread.y ][ thread.x ] = curr;
+		}
+		GroupMemoryBarrierWithGroupSync();
+	}
+}
+
+void storeTile( const uint threadFlat, const uint4 pos, const uint size )
+{
+	if( threadFlat >= size )
+		return;
+	const uint4 prod4 = pos * resultStrides;
+	const uint2 prod2 = prod4.xy + prod4.zw;
+	uint rdi = prod2.x + prod2.y;
+	result[ rdi + threadFlat ] = tileOutput[ 0 ][ threadFlat ];
+}
+
+[ numthreads( TILE_SIZE, THREADS_Y, 1 ) ]
+void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID, uint threadFlat : SV_GroupIndex )
+{
+	uint i;
+	// Zero all 3 shared buffers
+	tileOutput[ thread.y ][ thread.x ] = 0.0;
+	for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
+		tile0[ i ][ thread.x ] = 0.0;
+	if( threadFlat < THREADS_Y )
+		tile1[ threadFlat ] = 0.0;
+
+	const uint2 layer = group.yz;
+	uint rsi0 = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y;
+	uint rsi1 = layer.x * arg1Strides.z + layer.y * arg1Strides.w;
+
+	const uint threadOffset = thread.y * TILE_SIZE + thread.x;
+	rsi0 += threadOffset;
+	rsi1 += threadFlat * arg1Strides.x;
+
+	const uint completeTiles = arg0Size.x / TILE_HEIGHT;
+	for( i = 0; i < completeTiles; i++ )
+	{
+		// Load [ TILE_SIZE, TILE_HEIGHT ] block from the first source tensor into the groupshared buffer
+		for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+		{
+			tile0[ j ][ thread.x ] = arg0[ rsi0 ];
+			rsi0 += THREADS_Y * TILE_SIZE;
+		}
+		// Load [ TILE_HEIGHT ] row from the second source into another groupshared buffer
+		[ branch ]
+		if( threadFlat < TILE_HEIGHT )
+			tile1[ threadFlat ] = arg1[ rsi1 ];
+		rsi1 += TILE_HEIGHT * arg1Strides.x;
+
+		GroupMemoryBarrierWithGroupSync();
+
+		multiplyTiles( thread );
+
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	const uint rem = arg0Size.x % TILE_HEIGHT;
+	if( rem != 0 )
+	{
+		for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+		{
+			float a;
+			[branch]
+			if( j < rem )
+			{
+				a = arg0[ rsi0 ];
+				rsi0 += THREADS_Y * TILE_SIZE;
+			}
+			else
+				a = 0.0;
+			tile0[ j ][ thread.x ] = a;
+		}
+
+		if( threadFlat < TILE_HEIGHT )
+		{
+			float b;
+			[branch]
+			if( threadFlat < rem )
+				b = arg1[ rsi1 ];
+			else
+				b = 0.0;
+			tile1[ threadFlat ] = b;
+		}
+
+		GroupMemoryBarrierWithGroupSync();
+
+		multiplyTiles( thread );
+
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	reduceOutput( thread );
+
+	const uint resultPos = group.x * TILE_SIZE;
+	const uint outputSize = min( TILE_SIZE, resultSize.x - resultPos );
+	storeTile( threadFlat, uint4( resultPos, 0, layer ), outputSize );
+}
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatByScalar.hlsl b/ComputeShaders/mulMatByScalar.hlsl
new file mode 100644
index 0000000..82df5d4
--- /dev/null
+++ b/ComputeShaders/mulMatByScalar.hlsl
@@ -0,0 +1,41 @@
+// Matrix * scalar product, like [ 1, E1, E2, E3 ] * [ 1, 1, E2, E3 ] = [ E1, 1, E2, E3 ]
+// Dispatch [ E2, E3, 1 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 arg0Size: packoffset( c0 );
+	uint4 arg0Strides: packoffset( c1 );
+	uint4 arg1Size: packoffset( c2 );
+	uint4 arg1Strides: packoffset( c3 );
+	uint4 resultSize: packoffset( c4 );
+	uint4 resultStrides: packoffset( c5 );
+}
+
+inline uint hadd( uint2 vec )
+{
+	return vec.x + vec.y;
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const float scalarValue = arg1[ hadd( group.xy * arg1Strides.zw ) ];
+
+	uint s0 = hadd( group.xy * arg0Strides.zw );
+	const uint s0Inc = 32 * arg0Strides.y;
+	s0 += thread * arg0Strides.y;
+
+	uint rdi = hadd( group.xy * resultStrides.zw );
+	const uint rdiEnd = rdi + arg0Size.y;
+	rdi += thread;
+
+	for( ; rdi < rdiEnd; rdi += 32, s0 += s0Inc )
+	{
+		float f = arg0[ s0 ];
+		f *= scalarValue;
+		result[ rdi ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatDotMain.hlsl b/ComputeShaders/mulMatDotMain.hlsl
new file mode 100644
index 0000000..47c6d3e
--- /dev/null
+++ b/ComputeShaders/mulMatDotMain.hlsl
@@ -0,0 +1,95 @@
+// GGML_TASK_COMPUTE step for matrix*matrix product, where nb01 >= nb00;
+// Dispatch with [ ne11, ne01*ne02*ne03 ] thread groups
+// Each thread group computes a single dot product
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	uint4 src1_elements: packoffset( c2 );
+	uint4 result_elements: packoffset( c4 );
+	uint4 result_strides: packoffset( c5 );
+}
+
+inline uint product( uint3 vec )
+{
+	return vec.x * vec.y * vec.z;
+}
+
+inline uint product( uint4 vec )
+{
+	uint2 tmp = vec.xy * vec.zw;
+	return tmp.x * tmp.y;
+}
+
+inline float dotProductInner( uint i0, uint i1, uint length, uint thread )
+{
+	float res = 0;
+	for( uint i = thread; i < length; i += 32 )
+		res = mad( arg0[ i0 + i ], arg1[ i1 + i ], res );
+	return res;
+}
+
+#include "groupReduce.hlsli"
+
+[numthreads( 32, 1, 1 )]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint ne00 = src0_elements.x;
+	const uint ne01 = src0_elements.y;
+	const uint ne02 = src0_elements.z;
+	const uint ne03 = src0_elements.w;
+
+	const uint ne10 = src1_elements.x;
+	const uint ne11 = src1_elements.y;
+	const uint ne12 = src1_elements.z;
+	const uint ne13 = src1_elements.w;
+
+	const int nb00 = src0_strides.x;
+	const int nb01 = src0_strides.y;
+	const int nb02 = src0_strides.z;
+	const int nb03 = src0_strides.w;
+
+	// total rows in src0
+	// const int nr = ne01*ne02*ne03;
+	const uint nr = product( src0_elements.yzw );
+
+	const uint ir = group.y;
+
+	// src0 indices
+	const uint i03 = ir / ( ne02 * ne01 );
+	const uint i02 = ( ir - i03 * ne02 * ne01 ) / ne01;
+	const uint i01 = ( ir - i03 * ne02 * ne01 - i02 * ne01 );
+
+	const uint i13 = i03;
+	const uint i12 = i02;
+
+	const uint i0 = i01;
+	const uint i2 = i02;
+	const uint i3 = i03;
+
+	// src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
+	// src1_col = wdata + ( i13 * ne12 * ne11 + i12 * ne11 + 0 ) * ne00;
+	const uint src0_row = i01 * nb01 + i02 * nb02 + i03 * nb03;
+	const uint src1_col = ( i13 * ne12 * ne11 + i12 * ne11 ) * ne00;
+
+	const uint ic = group.x;
+	float curr = dotProductInner( src0_row, src1_col + ic * ne00, ne00, thread );
+	horizontalSumCompatNew( thread, curr );
+
+	if( 0 != thread )
+		return;
+
+	const uint nb0 = result_strides.x;
+	const uint nb1 = result_strides.y;
+	const uint nb2 = result_strides.z;
+	const uint nb3 = result_strides.w;
+
+	const uint ne0 = result_elements.x;
+	// float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
+	const uint dst_col = i0 * nb0 + i2 * nb2 + i3 * nb3;
+	result[ dst_col + ic * ne0 ] = curr;
+}
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatDotReshape.hlsl b/ComputeShaders/mulMatDotReshape.hlsl
new file mode 100644
index 0000000..ffb6f83
--- /dev/null
+++ b/ComputeShaders/mulMatDotReshape.hlsl
@@ -0,0 +1,33 @@
+// GGML_TASK_INIT step for matrix*matrix product, where nb01 >= nb00;
+// Dispatch with [ ne11, ne12 ] groups
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+}
+
+#include "miscUtils.hlsli"
+
+// Each thread group of this shader copies a single rows of the matrix
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint i12 = group.y;
+	const uint i11 = group.x;
+	const uint ne10 = src0_elements.x;
+	const uint ne11 = src0_elements.y;
+	const uint nb12 = src0_strides.z;
+	const uint nb11 = src0_strides.y;
+
+	uint rdi = i11 * ne10 + i12 * ne10 * ne11;
+	const uint rdiEnd = rdi + ne10;
+	uint rsi = i12 * nb12 + i11 * nb11;
+	rdi += thread;
+	rsi += thread;
+
+	for( ; rdi < rdiEnd; rdi += 32, rsi += 32 )
+		result[ rdi ] = adjustFp16( arg0[ rsi ] );
+}
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatMadMain.hlsl b/ComputeShaders/mulMatMadMain.hlsl
new file mode 100644
index 0000000..0bd5753
--- /dev/null
+++ b/ComputeShaders/mulMatMadMain.hlsl
@@ -0,0 +1,154 @@
+// GGML_TASK_COMPUTE step for matrix*matrix product, where nb01 < nb00
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> resultTensor: register( u0 );
+RWBuffer<float> tempBuffer: register( u1 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 aSize: packoffset( c0 );
+	uint4 aStride: packoffset( c1 );
+	uint4 bSize: packoffset( c2 );
+	uint4 bStride: packoffset( c3 );
+	uint4 resSize: packoffset( c4 );
+	bool resultFp16 : packoffset( c5.x );
+	uint ne: packoffset( c5.y );
+}
+
+#include "miscUtils.hlsli"
+
+// tempBuffer[ rdi .. ] = 0.0
+inline void writeTempZeros( uint rdi, const uint len, const uint thread )
+{
+	const uint rdiEnd = rdi + len;
+	for( rdi += thread; rdi < rdiEnd; rdi += 32 )
+		tempBuffer[ rdi ] = 0.0;
+}
+
+// tempBuffer[ rdi .. ] += mul * arg0[ rsi .. ]
+inline void vectorMad( uint rsi, uint rdi, const uint len, const float mul, const uint thread )
+{
+	const uint rsiEnd = rsi + len;
+	rsi += thread;
+	rdi += thread;
+	for( ; rsi < rsiEnd; rsi += 32, rdi += 32 )
+	{
+		float f = tempBuffer[ rdi ];
+		f = mad( mul, arg0[ rsi ], f );
+		[branch]
+		if( resultFp16 )
+			f = adjustFp16( f );
+		tempBuffer[ rdi ] = f;
+	}
+}
+
+// resultTensor[ rdi .. ] = tempBuffer[ rsi .. ]
+inline void copyRow( uint rsi, uint rdi, const uint len, const uint thread )
+{
+	const uint rsiEnd = rsi + len;
+	rsi += thread;
+	rdi += thread;
+	for( ; rsi < rsiEnd; rsi += 32, rdi += 32 )
+	{
+		float f = tempBuffer[ rsi ];
+		resultTensor[ rdi ] = f;
+	}
+}
+
+// resultTensor[ rdi .. ] += tempBuffer[ rsi .. ]
+inline void addRow( uint rsi, uint rdi, const uint len, const uint thread )
+{
+	const uint rsiEnd = rsi + len;
+	rsi += thread;
+	rdi += thread;
+	for( ; rsi < rsiEnd; rsi += 32, rdi += 32 )
+	{
+		float f = resultTensor[ rdi ];
+		f += tempBuffer[ rsi ];
+		resultTensor[ rdi ] = f;
+	}
+}
+
+[numthreads( 32, 1, 1 )]
+void main( const uint3 group: SV_GroupID, const uint thread : SV_GroupIndex )
+{
+	const uint i1 = group[ 0 ];
+	const uint i2 = group[ 1 ];
+	const uint i3 = group[ 2 ];
+
+	const uint ne00 = aSize[ 0 ];
+	const uint ne01 = aSize[ 1 ];
+	const uint ne02 = aSize[ 2 ];
+	const uint ne03 = aSize[ 3 ];
+
+	const uint ne10 = bSize[ 0 ];
+	const uint ne11 = bSize[ 1 ];
+	const uint ne12 = bSize[ 2 ];
+	const uint ne13 = bSize[ 3 ];
+
+	const uint ne0 = resSize[ 0 ];
+	const uint ne1 = resSize[ 1 ];
+	const uint ne2 = resSize[ 2 ];
+	const uint ne3 = resSize[ 3 ];
+
+	const uint nb00 = aStride[ 0 ];
+	const uint nb01 = aStride[ 1 ];
+	const uint nb02 = aStride[ 2 ];
+	const uint nb03 = aStride[ 3 ];
+
+	const uint nb10 = bStride[ 0 ];
+	const uint nb11 = bStride[ 1 ];
+	const uint nb12 = bStride[ 2 ];
+	const uint nb13 = bStride[ 3 ];
+
+	// dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;
+	const uint tempRowThread0 = i3 * ne2 * ne1 * ne0 + i2 * ne1 * ne0 + i1 * ne0;
+
+	// Faking 4 CPU threads trying to achieve bitwise compatibility with the CPU version
+	const uint nth = 4;
+
+	// GGML_TASK_COMPUTE
+	{
+		// src0_col = src0->data + ( i00 * nb00 + i02 * nb02 + i03 * nb03 );
+		const uint aBase = i2 * nb02 + i3 * nb03;
+		// src1_val = *      (float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
+		const uint bBase = i1 * nb11 + i2 * nb12 + i3 * nb13;
+
+		// total columns in src1
+		const uint nc = ne10;
+		// columns per thread
+		const uint dc = ( nc + nth - 1 ) / nth;
+
+		uint tempRow = tempRowThread0;
+		for( uint ith = 0; ith < nth; ith++, tempRow += ne )
+		{
+			writeTempZeros( tempRow, ne01, thread );
+
+			// column range for this thread
+			const uint ic0 = dc * ith;
+			const uint ic1 = min( ic0 + dc, nc );
+
+			for( uint ic = ic0; ic < ic1; ic++ )
+			{
+				const uint idxA = aBase + ic * aStride[ 0 ];
+				const uint idxB = bBase + ic * bStride[ 0 ];
+				const float bValue = arg1[ idxB ];
+				vectorMad( idxA, tempRow, ne01, bValue, thread );
+			}
+		}
+	}
+
+	// GGML_TASK_FINALIZE
+	{
+		const uint rdi = tempRowThread0;
+		// const uint rdi = i1 * resSize[ 0 ] + i2 * resSize[ 0 ] * resSize[ 1 ] + i3 * resSize[ 0 ] * resSize[ 1 ] * resSize[ 2 ];
+		// const uint rdi = ( ( i3 * resSize[ 2 ] + i2 ) * resSize[ 1 ] + i1 ) * resSize[ 0 ];
+
+		uint tempRow = tempRowThread0;
+		copyRow( tempRow, rdi, ne01, thread );
+
+		tempRow += ne;
+		for( uint ith = 1; ith < nth; ith++, tempRow += ne )
+			addRow( tempRow, rdi, ne01, thread );
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatTiled.hlsl b/ComputeShaders/mulMatTiled.hlsl
new file mode 100644
index 0000000..7e4d7d8
--- /dev/null
+++ b/ComputeShaders/mulMatTiled.hlsl
@@ -0,0 +1,236 @@
+// This compute shader implements matrix*matrix product, using tiling and other tricks to improve the performance
+#ifndef TILE_SIZE
+static const uint TILE_SIZE = 32;
+#endif
+
+#ifndef THREADS_Y
+// Performance measures on Ryzen 7 5700G iGPU, the time is just for this shader:
+// 1 (32 threads per group) - 17.1 seconds, 2 - 9.02424 seconds, 4 - 6.95762 seconds, 6 - 6.79011 seconds, 8 - 6.67279 seconds, 10 - 6.9456 seconds, 16 - 7.20502 seconds
+// On nVidia, 8 is also the fastest option.
+static const uint THREADS_Y = 8;
+#endif
+
+#ifndef STREAM_SECOND_MATRIX
+#define STREAM_SECOND_MATRIX 0
+#endif
+
+#ifndef LOAD_ORDER
+
+// Load with coalesced loads from global memory whenever possible, store into groupshared buffer with random stores
+// #define LOAD_ORDER bool2( ( 1 == arg0Strides[ 0 ] ) || ( 1 != arg0Strides[ 1 ] ), ( 1 == arg1Strides[ 0 ] ) || ( 1 != arg1Strides[ 1 ] ) )
+
+// Load with random loads from global memory, store into groupshared buffer with coalesced stores
+// On my AMD iGPU inside Ryzen 7 5700G, there's whopping 15% performance win with that tactics, from 6.67 to 5.66 seconds for this shader.
+// My nVidia GPU does about the same
+#define LOAD_ORDER bool2( false, true )
+
+#endif
+
+Buffer<float> arg0: register( t0 );
+Buffer<float> arg1: register( t1 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 arg0Size: packoffset( c0 );
+	uint4 arg0Strides: packoffset( c1 );
+	uint4 arg1Strides: packoffset( c3 );
+	uint4 resultSize: packoffset( c4 );
+	uint4 resultStrides: packoffset( c5 );
+}
+
+groupshared float tile0[ TILE_SIZE ][ TILE_SIZE ];
+#if !STREAM_SECOND_MATRIX
+groupshared float tile1[ TILE_SIZE ][ TILE_SIZE ];
+#endif
+groupshared float resTemp[ TILE_SIZE ][ TILE_SIZE ];
+
+#if STREAM_SECOND_MATRIX
+void multiplyTiles( uint rsi, const uint3 thread, const uint w, const uint h )
+{
+	for( uint i = thread.y; i < h; i += THREADS_Y, rsi += THREADS_Y * arg1Strides.y )
+	{
+		float r = 0;
+		uint rsiRow = rsi;
+		for( uint j = 0; j < w; j++, rsiRow += arg1Strides.x )
+		{
+			// One TILE_SIZE * 4 bytes coalesced load, broadcasted into THREADS_Y copies
+			const float s0 = tile0[ j ][ thread.x ];
+			// THREADS_Y broadcasts from global memory, each one is 4 bytes broadcasted into TILE_SIZE copies
+			const float s1 = arg1[ rsiRow ];
+			// Multiply and accumulate
+			r = mad( s0, s1, r );
+		}
+		// Accumulate into the output tile
+		// THREADS_Y * 128 bytes coalesced loads and stores
+		resTemp[ i ][ thread.x ] += r;
+	}
+}
+#else
+// Compute resTemp += tile0 * tile1, for TILE_SIZE^2 square matrices
+// The group size is TILE_SIZE*THREADS_Y threads in this shader
+void multiplyTiles( const uint3 thread )
+{
+	for( uint i = thread.y; i < TILE_SIZE; i += THREADS_Y )
+	{
+		float r = 0;
+		for( uint j = 0; j < TILE_SIZE; j++ )
+		{
+			// One TILE_SIZE * 4 bytes coalesced load, broadcasted into THREADS_Y copies
+			const float s0 = tile0[ j ][ thread.x ];
+			// THREADS_Y broadcasts, each one is 4 bytes broadcasted into TILE_SIZE copies
+			const float s1 = tile1[ i ][ j ];
+			// Multiply and accumulate
+			r = mad( s0, s1, r );
+		}
+		// Accumulate into the output tile
+		// THREADS_Y * 128 bytes coalesced loads and stores
+		resTemp[ i ][ thread.x ] += r;
+	}
+}
+#endif
+
+// Note we transposed these tiles while loading
+void loadTile0( uint rsi, const uint3 thread, const uint w, const uint h, const bool rowMajor )
+{
+	uint i;
+	if( rowMajor )
+	{
+		rsi += arg0Strides.y * thread.y;
+		for( i = thread.y; i < h; i += THREADS_Y, rsi += arg0Strides.y * THREADS_Y )
+		{
+			if( thread.x < w )
+				tile0[ thread.x ][ i ] = arg0[ rsi + thread.x * arg0Strides.x ];
+			else
+				tile0[ thread.x ][ i ] = 0.0;
+		}
+	}
+	else
+	{
+		// Unlike width which is smaller for the last tile, the height is always the same, and all these tiles are zero-initialized
+		if( thread.x >= h )
+			return;
+
+		rsi += arg0Strides.x * thread.y;
+		for( i = thread.y; i < w; i += THREADS_Y, rsi += arg0Strides.x * THREADS_Y )
+			tile0[ i ][ thread.x ] = arg0[ rsi + thread.x * arg0Strides.y ];
+
+		if( i >= TILE_SIZE )
+			return;
+		for( ; i < TILE_SIZE; i += THREADS_Y )
+			tile0[ i ][ thread.x ] = 0.0;
+	}
+}
+
+#if !STREAM_SECOND_MATRIX
+void loadTile1( uint rsi, const uint3 thread, const uint w, const uint h, const bool rowMajor )
+{
+	uint i;
+	if( rowMajor )
+	{
+		rsi += thread.y * arg1Strides.y;
+
+		for( i = thread.y; i < h; i += THREADS_Y, rsi += arg1Strides.y * THREADS_Y )
+		{
+			if( thread.x < w )
+				tile1[ i ][ thread.x ] = arg1[ rsi + thread.x * arg1Strides.x ];
+			else
+				tile1[ i ][ thread.x ] = 0.0;
+		}
+	}
+	else
+	{
+		// Unlike width which is smaller for the last tile, the height is always the same, and all these tiles are zero-initialized
+		if( thread.x >= h )
+			return;
+
+		rsi += thread.y * arg1Strides.x;
+		for( i = thread.y; i < w; i += THREADS_Y, rsi += arg1Strides.x * THREADS_Y )
+			tile1[ thread.x ][ i ] = arg1[ rsi + thread.x * arg0Strides.y ];
+		if( i >= TILE_SIZE )
+			return;
+		for( ; i < TILE_SIZE; i += THREADS_Y )
+			tile1[ thread.x ][ i ] = 0.0;
+	}
+}
+#endif
+
+void storeTile( const uint3 thread, const uint4 pos, const uint2 size )
+{
+	if( thread.x >= size.x )
+		return;
+	const uint4 prod4 = pos * resultStrides;
+	const uint2 prod2 = prod4.xy + prod4.zw;
+	uint rdi = prod2.x + prod2.y;
+	rdi += resultStrides.y * thread.y;
+	for( uint i = thread.y; i < size.y; i += THREADS_Y, rdi += resultStrides.y * THREADS_Y )
+		result[ rdi + thread.x * resultStrides.x ] = resTemp[ i ][ thread.x ];
+}
+
+[ numthreads( TILE_SIZE, THREADS_Y, 1 ) ]
+void main( uint3 group: SV_GroupID, uint3 thread : SV_GroupThreadID )
+{
+	// Zero out these shared buffers
+	for( uint i = 0; i < TILE_SIZE; i += THREADS_Y )
+	{
+		tile0[ i + thread.y ][ thread.x ] = 0.0;
+#if !STREAM_SECOND_MATRIX
+		tile1[ i + thread.y ][ thread.x ] = 0.0;
+#endif
+		resTemp[ i + thread.y ][ thread.x ] = 0.0;
+	}
+
+	const uint2 resultPos = group.xy * TILE_SIZE;
+	const uint2 layer = uint2( group.z % resultSize.z, group.z / resultSize.z );
+	uint rsi0 = resultPos.x * arg0Strides.y + layer.x * arg0Strides.z + layer.y * arg0Strides.w;
+	uint rsi1 = resultPos.y * arg1Strides.y + layer.x * arg1Strides.z + layer.y * arg1Strides.w;
+
+	const uint rsi0Inc = TILE_SIZE * arg0Strides.x;
+	const uint rsi1Inc = TILE_SIZE * arg1Strides.x;
+
+	const uint completeTiles = arg0Size.x / TILE_SIZE;
+	const uint rsi0AndAligned = rsi0 + rsi0Inc * completeTiles;
+	// Output tile size
+	// Normally TILE_SIZE^2, less than that for the tiles at the right and bottom edges of the output matrix
+	const uint2 outputSize = min( TILE_SIZE, resultSize.xy - resultPos );
+
+	const bool2 loadOrder = LOAD_ORDER;
+
+#if STREAM_SECOND_MATRIX
+	rsi1 += thread.y * arg1Strides.y;
+#endif
+	for( ; rsi0 < rsi0AndAligned; rsi0 += rsi0Inc, rsi1 += rsi1Inc )
+	{
+		loadTile0( rsi0, thread, TILE_SIZE, outputSize.x, loadOrder.x );
+#if STREAM_SECOND_MATRIX
+		GroupMemoryBarrierWithGroupSync();
+		multiplyTiles( rsi1, thread, TILE_SIZE, outputSize.y );
+#else
+		loadTile1( rsi1, thread, TILE_SIZE, outputSize.y, loadOrder.y );
+		GroupMemoryBarrierWithGroupSync();
+		multiplyTiles( thread );
+#endif
+		// Need one moar barrier here.
+		// Otherwise, some threads of the group are loading the next tile into tile0/tile1 groupshared buffers on the next iteration of the loop,
+		// while other threads of the same group are still computing the matrix product, and getting incorrect values from that groupshared buffer.
+		// The missing barrier only caused a bug on AMD, and only with "ggml-large.bin" model; no idea why that is.
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	const uint rem = arg0Size.x % TILE_SIZE;
+	if( 0 != rem )
+	{
+		loadTile0( rsi0, thread, rem, outputSize.x, loadOrder.x );
+#if STREAM_SECOND_MATRIX
+		GroupMemoryBarrierWithGroupSync();
+		multiplyTiles( rsi1, thread, rem, outputSize.y );
+#else
+		loadTile1( rsi1, thread, rem, outputSize.y, loadOrder.y );
+		GroupMemoryBarrierWithGroupSync();
+		multiplyTiles( thread );
+#endif
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+	storeTile( thread, uint4( resultPos, layer ), outputSize );
+}
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatTiled64.hlsl b/ComputeShaders/mulMatTiled64.hlsl
new file mode 100644
index 0000000..45d77b1
--- /dev/null
+++ b/ComputeShaders/mulMatTiled64.hlsl
@@ -0,0 +1,3 @@
+#define TILE_SIZE 64
+#define STREAM_SECOND_MATRIX 1
+#include "mulMatTiled.hlsl"
+\ No newline at end of file
diff --git a/ComputeShaders/mulMatTiledEx.hlsl b/ComputeShaders/mulMatTiledEx.hlsl
new file mode 100644
index 0000000..0f23da2
--- /dev/null
+++ b/ComputeShaders/mulMatTiledEx.hlsl
@@ -0,0 +1,194 @@
+// This compute shader implements yet another version of matrix*matrix product
+// For optimal VRAM access pattern, it requires both arguments to be reshaped into a sequence of horizontal column major panels.
+// The panel height is TILE_SIZE, and the last panel of the matrix needs to be padded with zeros; see matReshapePanels.hlsl shader for the reshaping.
+// So far, it's only used when running on AMD GPUs.
+#ifndef TILE_SIZE
+static const uint TILE_SIZE = 32;
+#endif
+#ifndef TILE_HEIGHT
+static const uint TILE_HEIGHT = 32;
+#endif
+#ifndef THREADS_Y
+static const uint THREADS_Y = 16;
+#endif
+
+#ifndef STREAM_SECOND_MATRIX
+#define STREAM_SECOND_MATRIX 1
+#endif
+
+// First tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ]
+Buffer<float> arg0: register( t0 );
+// Second tensor, reshaped into dense column major horizontal panels of size [ width, TILE_SIZE ]
+Buffer<float> arg1: register( t1 );
+// FP32 output tensor, row major and continuous
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 arg0Size: packoffset( c0 );
+	uint arg0panel: packoffset( c1.y );
+	uint2 arg0LayerStrides: packoffset( c1.z );
+
+	// uint4 arg1Size: packoffset( c2 );
+	uint arg1panel: packoffset( c3.y );
+	uint2 arg1LayerStrides: packoffset( c3.z );
+
+	uint4 resultSize: packoffset( c4 );
+	uint4 resultStrides: packoffset( c5 );
+}
+
+// Accumulator for the output tile
+// That last `+1` helps a bit, I'm not sure why exactly but probebly because memory bank conflicts.
+groupshared float tileOutput[ TILE_SIZE ][ TILE_SIZE + 1 ];
+// A smaller tile loaded from the first source matrix
+groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ];
+#if !STREAM_SECOND_MATRIX
+// A smaller tile loaded from the second source matrix
+groupshared float tile1[ TILE_HEIGHT ][ TILE_SIZE ];
+#endif
+
+#if STREAM_SECOND_MATRIX
+void multiplyTiles( const uint3 thread, uint rsi, const uint h )
+{
+	uint2 i = uint2( thread.y, rsi );
+	const uint2 iInc = uint2( THREADS_Y, THREADS_Y );
+	for( ; i.x < TILE_SIZE; i += iInc )
+	{
+		float r = 0.0;
+		uint2 j = uint2( 0, i.y );
+		const uint2 jInc = uint2( 1, TILE_SIZE );
+		for( ; j.x < h; j += jInc )
+		{
+			float a = tile0[ j.x ][ thread.x ];
+			float b = arg1[ j.y ];
+			r = mad( a, b, r );
+		}
+		tileOutput[ i.x ][ thread.x ] += r;
+	}
+}
+#else
+void multiplyTiles( const uint3 thread )
+{
+	for( uint row = thread.y; row < TILE_SIZE; row += THREADS_Y )
+	{
+		float r = 0.0;
+		for( uint j = 0; j < TILE_HEIGHT; j++ )
+		{
+			float a = tile0[ j ][ thread.x ];
+			float b = tile1[ j ][ row ];
+			r = mad( a, b, r );
+		}
+		tileOutput[ row ][ thread.x ] += r;
+	}
+}
+#endif
+
+void storeTile( const uint3 thread, const uint4 pos, const uint2 size )
+{
+	if( thread.x >= size.x )
+		return;
+	const uint4 prod4 = pos * resultStrides;
+	const uint2 prod2 = prod4.xy + prod4.zw;
+	uint rdi = prod2.x + prod2.y;
+	rdi += resultStrides.y * thread.y;
+	rdi += thread.x;
+	for( uint i = thread.y; i < size.y; i += THREADS_Y, rdi += resultStrides.y * THREADS_Y )
+		result[ rdi ] = tileOutput[ i ][ thread.x ];
+}
+
+[numthreads( TILE_SIZE, THREADS_Y, 1 )]
+void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID )
+{
+	uint i;
+	// Zero all 3 shared buffers
+	for( i = thread.y; i < TILE_SIZE; i += THREADS_Y )
+		tileOutput[ i ][ thread.x ] = 0.0;
+	for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
+	{
+		tile0[ i ][ thread.x ] = 0.0;
+#if !STREAM_SECOND_MATRIX
+		tile1[ i ][ thread.x ] = 0.0;
+#endif
+	}
+
+	const uint2 layer = uint2( group.z % resultSize.z, group.z / resultSize.z );
+
+	uint rsi0 = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y;
+	uint rsi1 = group.y * arg1panel + layer.x * arg1LayerStrides.x + layer.y * arg1LayerStrides.y;
+
+	const uint threadOffset = thread.y * TILE_SIZE + thread.x;
+	rsi0 += threadOffset;
+#if STREAM_SECOND_MATRIX
+	rsi1 += thread.y;
+#else
+	rsi1 += threadOffset;
+#endif
+
+	const uint completeTiles = arg0Size.x / TILE_HEIGHT;
+	for( i = 0; i < completeTiles; i++ )
+	{
+		// Load [ TILE_SIZE, TILE_HEIGHT ] block from both source tensors into these groupshared buffers
+		for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+		{
+			tile0[ j ][ thread.x ] = arg0[ rsi0 ];
+			rsi0 += THREADS_Y * TILE_SIZE;
+#if !STREAM_SECOND_MATRIX
+			tile1[ j ][ thread.x ] = arg1[ rsi1 ];
+			rsi1 += THREADS_Y * TILE_SIZE;
+#endif
+		}
+
+		// Wait for all threads in the group to complete these loads
+		GroupMemoryBarrierWithGroupSync();
+
+#if STREAM_SECOND_MATRIX
+		multiplyTiles( thread, rsi1, TILE_HEIGHT );
+		rsi1 += TILE_HEIGHT * TILE_SIZE;
+#else
+		// Multiply + accumulate the elements collected in the groupshared buffers
+		multiplyTiles( thread );
+#endif
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	const uint rem = arg0Size.x % TILE_HEIGHT;
+	if( rem != 0 )
+	{
+		// Load [ TILE_SIZE, rem ] block from both source tensors, and zero out the padding elements
+		for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+		{
+			[branch]
+			if( j < rem )
+			{
+				tile0[ j ][ thread.x ] = arg0[ rsi0 ];
+				rsi0 += THREADS_Y * TILE_SIZE;
+#if !STREAM_SECOND_MATRIX
+				tile1[ j ][ thread.x ] = arg1[ rsi1 ];
+				rsi1 += THREADS_Y * TILE_SIZE;
+#endif
+			}
+			else
+			{
+				tile0[ j ][ thread.x ] = 0.0;
+#if !STREAM_SECOND_MATRIX
+				tile1[ j ][ thread.x ] = 0.0;
+#endif
+			}
+		}
+
+		// Wait for all threads in the group to complete these loads
+		GroupMemoryBarrierWithGroupSync();
+
+		// Multiply + accumulate the elements collected in the groupshared buffers
+#if STREAM_SECOND_MATRIX
+		multiplyTiles( thread, rsi1, rem );
+#else
+		multiplyTiles( thread );
+#endif
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	const uint2 resultPos = group.xy * TILE_SIZE;
+	const uint2 outputSize = min( TILE_SIZE, resultSize.xy - resultPos );
+	storeTile( thread, uint4( resultPos, layer ), outputSize );
+}
+\ No newline at end of file
diff --git a/ComputeShaders/norm.hlsl b/ComputeShaders/norm.hlsl
new file mode 100644
index 0000000..eeb82b7
--- /dev/null
+++ b/ComputeShaders/norm.hlsl
@@ -0,0 +1,86 @@
+// Ported from ggml_compute_forward_norm_f32
+// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	uint4 result_strides: packoffset( c3 );
+}
+
+static const float eps = 1e-5f; // TODO: make this a parameter
+
+#include "groupReduce.hlsli"
+
+float computeVectorSum( uint i, const uint length, const uint thread )
+{
+	float res = 0.0;
+
+	const uint iEnd = i + length;
+	i += thread;
+	for( ; i < iEnd; i += 32 )
+		res += arg0[ i ];
+
+	horizontalSumBroadcast( thread, res );
+	return res;
+}
+
+float offsetAndComputeSumSquares( uint rsi, uint rdi, const float mean, const uint length, const uint thread )
+{
+	float sum2 = 0.0;
+
+	const uint rsiEnd = rsi + length;
+	rsi += thread;
+	rdi += thread;
+	for( ; rsi < rsiEnd; rsi += 32, rdi += 32 )
+	{
+		float v = arg0[ rsi ] - mean;
+		result[ rdi ] = v;
+		sum2 = mad( v, v, sum2 );
+	}
+
+	horizontalSumBroadcast( thread, sum2 );
+	return sum2;
+}
+
+void scaleVector( uint rdi, const float scale, const uint length, const uint thread )
+{
+	const uint rdiEnd = rdi + length;
+	for( rdi += thread; rdi < rdiEnd; rdi += 32 )
+	{
+		float f = result[ rdi ];
+		f *= scale;
+		result[ rdi ] = f;
+	}
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint i03 = group.z;
+	const uint i02 = group.y;
+	const uint i01 = group.x;
+
+	const uint nb01 = src0_strides[ 1 ];
+	const uint nb02 = src0_strides[ 2 ];
+	const uint nb03 = src0_strides[ 3 ];
+
+	const uint p = i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+	const uint ne00 = src0_elements[ 0 ];
+
+	float mean = computeVectorSum( p, ne00, thread );
+	mean /= (float)(int)ne00;
+
+	const uint nb1 = result_strides[ 1 ];
+	const uint nb2 = result_strides[ 2 ];
+	const uint nb3 = result_strides[ 3 ];
+	const uint y = i01 * nb1 + i02 * nb2 + i03 * nb3;
+
+	float sum2 = offsetAndComputeSumSquares( p, y, mean, ne00, thread );
+	const float scale = 1.0 / sqrt( sum2 / (float)(int)ne00 + eps );
+
+	scaleVector( y, scale, ne00, thread );
+}
+\ No newline at end of file
diff --git a/ComputeShaders/normCompat.hlsl b/ComputeShaders/normCompat.hlsl
new file mode 100644
index 0000000..23e1228
--- /dev/null
+++ b/ComputeShaders/normCompat.hlsl
@@ -0,0 +1,82 @@
+// Ported from ggml_compute_forward_norm_f32
+// Dispatch [ ( ne01 + 31 ) / 32, ne02, ne03 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	uint4 result_strides: packoffset( c3 );
+}
+
+static const double eps = 1e-5; // TODO: make this a parameter
+
+#include "groupReduce.hlsli"
+
+double computeVectorSum( uint i, const uint length )
+{
+	double res = 0.0;
+	const uint iEnd = i + length;
+	for( ; i < iEnd; i++ )
+		res += arg0[ i ];
+	return res;
+}
+
+double offsetAndComputeSumSquares( uint rsi, uint rdi, const double mean, const uint length )
+{
+	precise double sum2 = 0.0;
+	const uint rsiEnd = rsi + length;
+	for( ; rsi < rsiEnd; rsi++, rdi++ )
+	{
+		double v = arg0[ rsi ];
+		v -= mean;
+		result[ rdi ] = (float)v;
+		double prod = v * v;
+		sum2 += prod;
+	}
+	return sum2;
+}
+
+void scaleVector( uint rdi, const float scale, const uint length )
+{
+	const uint rdiEnd = rdi + length;
+	for( ; rdi < rdiEnd; rdi++ )
+	{
+		float f = result[ rdi ];
+		f *= scale;
+		result[ rdi ] = f;
+	}
+}
+
+#include "fp64Utils.hlsli"
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 dtid: SV_DispatchThreadID )
+{
+	const uint i03 = dtid.z;
+	const uint i02 = dtid.y;
+	const uint i01 = dtid.x;
+	if( i01 >= src0_elements[ 1 ] )
+		return;
+
+	const uint nb01 = src0_strides[ 1 ];
+	const uint nb02 = src0_strides[ 2 ];
+	const uint nb03 = src0_strides[ 3 ];
+
+	const uint p = i01 * nb01 + i02 * nb02 + i03 * nb03;
+	const uint ne00 = src0_elements[ 0 ];
+
+	double mean = computeVectorSum( p, ne00 );
+	mean = div64( mean, (double)(int)ne00 );
+
+	const uint nb1 = result_strides[ 1 ];
+	const uint nb2 = result_strides[ 2 ];
+	const uint nb3 = result_strides[ 3 ];
+	const uint y = i01 * nb1 + i02 * nb2 + i03 * nb3;
+
+	const double sum2 = offsetAndComputeSumSquares( p, y, mean, ne00 );
+	const float scale = (float)div64( 1.0, sqrt64( sum2 / (float)(int)ne00 + eps ) );
+
+	scaleVector( y, scale, ne00 );
+}
+\ No newline at end of file
diff --git a/ComputeShaders/normFixed.hlsl b/ComputeShaders/normFixed.hlsl
new file mode 100644
index 0000000..8f2267f
--- /dev/null
+++ b/ComputeShaders/normFixed.hlsl
@@ -0,0 +1,124 @@
+// Ported from ggml_compute_forward_norm_f32
+// Dispatch [ ne01, ne02, ne03 ] thread groups of this shader
+Buffer<float> arg0: register( t0 );
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	uint4 result_strides: packoffset( c3 );
+}
+
+static const float eps = 1e-5f; // TODO: make this a parameter
+
+// #include "groupReduce.hlsli"
+
+#ifndef THREADS
+static const uint THREADS = 32;
+#endif
+static const uint ROW_LENGTH = 1024;
+groupshared float rowBuffer[ ROW_LENGTH ];
+
+static const uint REDUCTION_BUFFER = 32;
+groupshared float sharedAccumulators[ REDUCTION_BUFFER ];
+
+// Compute horisontal sum of the numbers. The result is only correct on the thread #0 of the group.
+void horizontalSum( const uint thread, inout float sum )
+{
+	if( THREADS > REDUCTION_BUFFER )
+	{
+		for( uint t = REDUCTION_BUFFER; t < THREADS; t += REDUCTION_BUFFER )
+		{
+			// Threads [ t .. t + REDUCTION_BUFFER ] store into the buffer
+			if( thread >= t && thread < t + REDUCTION_BUFFER )
+				sharedAccumulators[ thread - t ] = sum;
+
+			GroupMemoryBarrierWithGroupSync();
+
+			// Threads [ 0 .. REDUCTION_BUFFER ] increment their local sum with the value loaded from the buffer
+			if( thread < REDUCTION_BUFFER )
+				sum += sharedAccumulators[ thread ];
+		}
+	}
+
+	if( thread < REDUCTION_BUFFER )
+		sharedAccumulators[ thread ] = sum;
+
+	for( uint i = REDUCTION_BUFFER / 2; i > 1; i /= 2 )
+	{
+		GroupMemoryBarrierWithGroupSync();
+		if( thread < i )
+		{
+			sum += sharedAccumulators[ thread + i ];
+			sharedAccumulators[ thread ] = sum;
+		}
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+	if( 0 == thread )
+		sum += sharedAccumulators[ 1 ];
+}
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint i03 = group.z;
+	const uint i02 = group.y;
+	const uint i01 = group.x;
+	const uint ne00 = ROW_LENGTH;
+
+	// First pass: copy the data to local buffer, and compute sum
+	{
+		const uint nb01 = src0_strides[ 1 ];
+		const uint nb02 = src0_strides[ 2 ];
+		const uint nb03 = src0_strides[ 3 ];
+		const uint p = i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+		float sum = 0;
+		for( uint i = thread; i < ne00; i += THREADS )
+		{
+			float f = arg0[ p + i ];
+			rowBuffer[ i ] = f;
+			sum += f;
+		}
+		horizontalSum( thread, sum );
+		if( 0 == thread )
+			sharedAccumulators[ 0 ] = sum / (float)(int)ne00;
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	// Second pass: offset and compute sum of squares
+	{
+		const float mean = sharedAccumulators[ 0 ];
+		float sum2 = 0;
+		for( uint i = thread; i < ne00; i += THREADS )
+		{
+			float v = rowBuffer[ i ];
+			v -= mean;
+			rowBuffer[ i ] = v;
+			sum2 = mad( v, v, sum2 );
+		}
+		horizontalSum( thread, sum2 );
+		if( 0 == thread )
+			sharedAccumulators[ 0 ] = 1.0 / sqrt( sum2 / (float)(int)ne00 + eps );
+		GroupMemoryBarrierWithGroupSync();
+	}
+
+	// Final pass: apply the scale, and copy from group shared buffer to the destination
+	{
+		const float scale = sharedAccumulators[ 0 ];
+
+		const uint nb1 = result_strides[ 1 ];
+		const uint nb2 = result_strides[ 2 ];
+		const uint nb3 = result_strides[ 3 ];
+		const uint y = i01 * nb1 + i02 * nb2 + i03 * nb3;
+
+		for( uint i = thread; i < ne00; i += THREADS )
+		{
+			float v = rowBuffer[ i ];
+			v *= scale;
+			result[ y + i ] = v;
+		}
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/normFixed64.hlsl b/ComputeShaders/normFixed64.hlsl
new file mode 100644
index 0000000..14aab3d
--- /dev/null
+++ b/ComputeShaders/normFixed64.hlsl
@@ -0,0 +1,2 @@
+#define THREADS 64
+#include "normFixed.hlsl"
+\ No newline at end of file
diff --git a/ComputeShaders/repeatUtils.hlsli b/ComputeShaders/repeatUtils.hlsli
new file mode 100644
index 0000000..1181501
--- /dev/null
+++ b/ComputeShaders/repeatUtils.hlsli
@@ -0,0 +1,21 @@
+inline uint rowOffset( uint3 idx, uint4 strides )
+{
+	return idx[ 0 ] * strides[ 1 ] + idx[ 1 ] * strides[ 2 ] + idx[ 2 ] * strides[ 3 ];
+}
+
+// Initial iterator state for a row of the output tensor
+// x = current index, y = index increment, z = end of the index
+inline uint3 tensorIteratorState( uint3 group, uint thread, uint4 size, uint4 stride )
+{
+	uint3 res;
+	res.x = rowOffset( group, stride );
+	res.y = THREADS * stride[ 0 ];
+	res.z = res.x + size[ 0 ] * stride[ 0 ];
+	res.x += thread * stride[ 0 ];
+	return res;
+}
+
+// Handle a complete row of output tensor, using the iterator made by tensorIteratorState() function
+#define ROW_LOOP( ts ) for( ; ts.x < ts.z; ts.x += ts.y )
+// Same as above, using different row length
+#define ROW_LOOP_EX( ts, len, stride ) for( ; ts.x < ts.z; ts.x += len * stride[ 0 ] )
+\ No newline at end of file
diff --git a/ComputeShaders/scaleInPlace.hlsl b/ComputeShaders/scaleInPlace.hlsl
new file mode 100644
index 0000000..d0320b4
--- /dev/null
+++ b/ComputeShaders/scaleInPlace.hlsl
@@ -0,0 +1,23 @@
+RWBuffer<float> buffer: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 src0_elements: packoffset( c0 );
+	uint4 src0_strides: packoffset( c1 );
+	float multiplier: packoffset( c2.x );
+}
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint nc0 = src0_elements[ 0 ];
+	uint i = group.x * src0_strides[ 1 ];
+	const uint iEnd = i + nc0;
+	const float mul = multiplier;
+	for( i += thread; i < iEnd; i += 32 )
+	{
+		float f = buffer[ i ];
+		f *= mul;
+		buffer[ i ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/softMax.hlsl b/ComputeShaders/softMax.hlsl
new file mode 100644
index 0000000..6ebd0f2
--- /dev/null
+++ b/ComputeShaders/softMax.hlsl
@@ -0,0 +1,71 @@
+// Dispatch [ nr, 1, 1 ] thread groups of this shader
+RWBuffer<float> result: register( u0 );
+
+// table_exp_f16
+Buffer<uint> lookupTable: register( t0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 elements: packoffset( c0 );
+	uint4 strides: packoffset( c1 );
+	uint nr: packoffset( c2.x );
+	float inputScale: packoffset( c2.y );
+}
+
+#include "miscUtils.hlsli"
+#include "groupReduce.hlsli"
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint p = group.x * strides[ 1 ];
+	const uint nc = elements[ 0 ];
+	const uint pEnd = p + nc;
+	uint i;
+
+	float m = negativeInfinity;
+	for( i = p + thread; i < pEnd; i += 32 )
+		m = max( m, result[ i ] );
+	horizontalMaxBroadcast( thread, m );
+
+	float sum = 0;
+	for( i = p + thread; i < pEnd; i += 32 )
+	{
+		float f = result[ i ];
+
+		[branch]
+		if( f != negativeInfinity )
+		{
+			f = ( f - m ) * inputScale;
+#if 1
+			// Similar to Radeon Graphics, computing the exponent on nVidia 1080Ti is also slightly faster than loading from the lookup table
+			f = exp( f );
+#else
+			uint s = fp16Rounded( f );
+			s = lookupTable[ s ];
+			f = f16tof32( s );
+#endif
+			sum += f;
+		}
+		else
+			f = 0;
+
+		result[ i ] = f;
+	}
+
+	horizontalSum( thread, sum );
+	if( 0 == thread )
+		sharedAccumulators[ 0 ] = 1.0 / sum;
+	GroupMemoryBarrierWithGroupSync();
+	const float scale = sharedAccumulators[ 0 ];
+
+	// ggml_vec_scale_f32
+	for( i = p + thread; i < pEnd; i += 32 )
+	{
+		float f = result[ i ];
+		f *= scale;
+		result[ i ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/softMax64.hlsl b/ComputeShaders/softMax64.hlsl
new file mode 100644
index 0000000..7ecd2ef
--- /dev/null
+++ b/ComputeShaders/softMax64.hlsl
@@ -0,0 +1,71 @@
+// Dispatch [ nr, 1, 1 ] thread groups of this shader
+RWBuffer<float> result: register( u0 );
+
+// table_exp_f16
+Buffer<uint> lookupTable: register( t0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 elements: packoffset( c0 );
+	uint4 strides: packoffset( c1 );
+	uint nr: packoffset( c2.x );
+	float inputScale: packoffset( c2.y );
+}
+
+#include "miscUtils.hlsli"
+#include "groupReduce64.hlsli"
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+[ numthreads( 64, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint p = group.x * strides[ 1 ];
+	const uint nc = elements[ 0 ];
+	const uint pEnd = p + nc;
+	uint i;
+
+	float m = negativeInfinity;
+	for( i = p + thread; i < pEnd; i += 64 )
+		m = max( m, result[ i ] );
+	horizontalMaxBroadcast( thread, m );
+
+	float sum = 0;
+	for( i = p + thread; i < pEnd; i += 64 )
+	{
+		float f = result[ i ];
+
+		[branch]
+		if( f != negativeInfinity )
+		{
+			f = ( f - m ) * inputScale;
+#if 1
+			// At least on Radeon Graphics GPU inside Ryzen 7 5700G, computing exponent instead of loading from the buffer improves the performance
+			f = exp( f );
+#else
+			uint s = fp16Rounded( f );
+			s = lookupTable[ s ];
+			f = f16tof32( s );
+#endif
+			sum += f;
+		}
+		else
+			f = 0;
+
+		result[ i ] = f;
+	}
+
+	horizontalSum( thread, sum );
+	if( 0 == thread )
+		sharedAccumulators[ 0 ] = 1.0 / sum;
+	GroupMemoryBarrierWithGroupSync();
+	const float scale = sharedAccumulators[ 0 ];
+
+	// ggml_vec_scale_f32
+	for( i = p + thread; i < pEnd; i += 64 )
+	{
+		float f = result[ i ];
+		f *= scale;
+		result[ i ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/softMaxCompat.hlsl b/ComputeShaders/softMaxCompat.hlsl
new file mode 100644
index 0000000..2215ebd
--- /dev/null
+++ b/ComputeShaders/softMaxCompat.hlsl
@@ -0,0 +1,62 @@
+// ggml_compute_forward_soft_max_f32
+// Dispatch [ ( nr + 31 ) / 32, 1, 1 ] thread groups of this shader
+RWBuffer<float> result: register( u0 );
+
+// table_exp_f16
+Buffer<uint> lookupTable: register( t0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 elements: packoffset( c0 );
+	uint4 strides: packoffset( c1 );
+	uint nr: packoffset( c2.x );
+}
+
+#include "miscUtils.hlsli"
+#include "fp64Utils.hlsli"
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+[ numthreads( 32, 1, 1 ) ]
+void main( uint3 dtid: SV_DispatchThreadID )
+{
+	if( dtid.x >= nr )
+		return;
+
+	const uint p = dtid.x * strides[ 1 ];
+	const uint nc = elements[ 0 ];
+	const uint pEnd = p + nc;
+	uint i;
+
+	float m = negativeInfinity;
+	for( i = p; i < pEnd; i++ )
+		m = max( m, result[ i ] );
+
+	double sum = 0;
+	for( i = p; i < pEnd; i++ )
+	{
+		float f = result[ i ];
+
+		[branch]
+		if( f != negativeInfinity )
+		{
+			uint s = fp16Rounded( f - m );
+			s = lookupTable[ s ];
+			f = f16tof32( s );
+			sum += f;
+		}
+		else
+			f = 0;
+
+		result[ i ] = f;
+	}
+
+	const float scale = (float)div64( 1.0, sum );
+	// ggml_vec_scale_f32
+	for( i = p; i < pEnd; i++ )
+	{
+		float f = result[ i ];
+		f *= scale;
+		result[ i ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/softMaxFixed.hlsl b/ComputeShaders/softMaxFixed.hlsl
new file mode 100644
index 0000000..7b4add2
--- /dev/null
+++ b/ComputeShaders/softMaxFixed.hlsl
@@ -0,0 +1,79 @@
+// Special softMax shader for matrices with rows of 1500 elements.
+// Uses group shared buffer of that length to save global memory bandwidth, more than 2x faster than the original.
+// Dispatch [ nr, 1, 1 ] thread groups of this shader
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint4 elements: packoffset( c0 );
+	uint4 strides: packoffset( c1 );
+	uint nr: packoffset( c2.x );
+	float inputScale: packoffset( c2.y );
+}
+
+#include "miscUtils.hlsli"
+#include "groupReduce64.hlsli"
+
+static const uint THREADS = 64;
+static const uint ROW_LENGTH = 1500;
+groupshared float rowBuffer[ ROW_LENGTH ];
+
+static const float negativeInfinity = asfloat( 0xff800000 );
+
+[ numthreads( THREADS, 1, 1 ) ]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	const uint p = group.x * strides[ 1 ];
+	const uint nc = ROW_LENGTH;
+	uint i;
+
+	float m = negativeInfinity;
+	// First pass: compute maximum, and copy the row into the group shared buffer
+	for( i = thread; i < nc; i += THREADS )
+	{
+		float f = result[ p + i ];
+		m = max( m, f );
+		rowBuffer[ i ] = f;
+	}
+	horizontalMaxBroadcast( thread, m );
+
+	// Second pass: apply initial scale, compute the exponent, and compute total sum over the row
+	float sum = 0;
+	for( i = thread; i < nc; i += THREADS )
+	{
+		float f = rowBuffer[ i ];
+
+		[branch]
+		if( f != negativeInfinity )
+		{
+			f = ( f - m ) * inputScale;
+#if 1
+			// At least on Radeon Graphics GPU inside Ryzen 7 5700G, computing exponent instead of loading from the buffer improves the performance
+			f = exp( f );
+#else
+			uint s = fp16Rounded( f );
+			s = lookupTable[ s ];
+			f = f16tof32( s );
+#endif
+			sum += f;
+		}
+		else
+			f = 0;
+
+		rowBuffer[ i ] = f;
+	}
+
+	horizontalSum( thread, sum );
+	if( 0 == thread )
+		sharedAccumulators[ 0 ] = 1.0 / sum;
+	GroupMemoryBarrierWithGroupSync();
+	const float scale = sharedAccumulators[ 0 ];
+
+	// Final pass: apply the final scale, and copy the row from the group shared buffer back into the global memory
+	for( i = thread; i < nc; i += THREADS )
+	{
+		float f = rowBuffer[ i ];
+		f *= scale;
+		result[ p + i ] = f;
+	}
+}
+\ No newline at end of file
diff --git a/ComputeShaders/zeroMemory.hlsl b/ComputeShaders/zeroMemory.hlsl
new file mode 100644
index 0000000..c486636
--- /dev/null
+++ b/ComputeShaders/zeroMemory.hlsl
@@ -0,0 +1,27 @@
+RWBuffer<float> result: register( u0 );
+
+cbuffer Constants: register( b0 )
+{
+	uint elements: packoffset( c0.x );
+}
+
+// Thread group index is 16 bits per coordinate:
+// https://learn.microsoft.com/en-us/windows/win32/api/d3d11/nf-d3d11-id3d11devicecontext-dispatch
+// We want this shader to support buffers up to 2 GB.
+#ifndef THREADS
+static const uint THREADS = 512;
+#endif
+#ifndef ITERATIONS
+static const uint ITERATIONS = 128;
+#endif
+
+static const uint itemsPerGroup = THREADS * ITERATIONS;
+
+[numthreads( THREADS, 1, 1 )]
+void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
+{
+	uint rdi = group.x * itemsPerGroup;
+	const uint rdiEnd = min( rdi + itemsPerGroup, elements );
+	for( rdi += thread; rdi < rdiEnd; rdi += THREADS )
+		result[ rdi ] = 0.0;
+}
+\ No newline at end of file