blob: ffb6f830203a260b46b7d298fea44e9c48ccb538 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
// GGML_TASK_INIT step for matrix*matrix product, where nb01 >= nb00;
// Dispatch with [ ne11, ne12 ] groups
// Source tensor, read-only; the shader indexes it directly with element offsets
Buffer<float> arg0: register( t0 );
// Destination buffer; every thread group writes one contiguous row into it
RWBuffer<float> result: register( u0 );
// Shape and layout of the source tensor
cbuffer Constants: register( b0 )
{
// Only the first two components are used by this shader: x = count of elements in a row, y = count of rows in a matrix
uint4 src0_elements: packoffset( c0 );
// Only y and z are used by this shader: y = distance between rows, z = distance between matrices.
// The values are added to the index into arg0 without scaling, i.e. they are expressed in elements, not bytes
uint4 src0_strides: packoffset( c1 );
}
#include "miscUtils.hlsli"
// Each thread group of this shader copies a single row of the matrix.
// group.x selects the row within a matrix, group.y selects the matrix.
// The output is written densely: rows of a matrix are adjacent, and matrices follow one another.
// Every element goes through adjustFp16() on the way; that function is not defined in this file, presumably it comes from miscUtils.hlsli
[ numthreads( 32, 1, 1 ) ]
void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
{
	// Count of threads in the group; must stay equal to the X argument of numthreads above
	const uint groupThreads = 32;

	// Shape of the source: ne10 and ne11 in GGML notation
	const uint rowLength = src0_elements.x;
	const uint rowsPerMatrix = src0_elements.y;

	// Source strides in elements: nb11 and nb12 in GGML notation
	const uint rowStride = src0_strides.y;
	const uint matrixStride = src0_strides.z;

	// First element of the destination row, and the index one past its last element
	const uint destBegin = ( group.y * rowsPerMatrix + group.x ) * rowLength;
	const uint destEnd = destBegin + rowLength;

	// First element of the source row; elements within a source row are read from consecutive indices
	const uint sourceBegin = group.x * rowStride + group.y * matrixStride;

	// Thread #i handles elements i, i + 32, i + 64, etc., until the end of the row
	uint dest = destBegin + thread;
	uint source = sourceBegin + thread;
	while( dest < destEnd )
	{
		result[ dest ] = adjustFp16( arg0[ source ] );
		dest += groupThreads;
		source += groupThreads;
	}
}
|