summary | refs | log | tree | commit | diff | stats
path: root/Whisper
diff options
context:
space:
mode:
author Konstantin <const@const.me> 2023-01-24 17:40:50 +0100
committer Konstantin <const@const.me> 2023-01-24 17:40:50 +0100
commit e78815de53005336c3746bcd85c2e789a43f1b94 (patch)
tree 5fe25b6d4024bbcca7402ebcc28910417c54758d /Whisper
parent 9d6fb00973218ebc397e15819be3964423b81f91 (diff)
Performance tuning on AMD iGPU
Diffstat (limited to 'Whisper')
-rw-r--r-- Whisper/ML/MlContext.cpp 31
1 files changed, 6 insertions, 25 deletions
diff --git a/Whisper/ML/MlContext.cpp b/Whisper/ML/MlContext.cpp
index a226999..9e625a7 100644
--- a/Whisper/ML/MlContext.cpp
+++ b/Whisper/ML/MlContext.cpp
@@ -117,17 +117,8 @@ void MlContext::mulMatTiled( const Tensor& a, const Tensor& b, Tensor& res )
else
{
bindShader( eComputeShader::mulMatByRowTiled );
- uint32_t groupsX;
- if( gpuInfo.wave64() )
- {
- constexpr uint32_t TILE_Y = 128;
- groupsX = ( a.ne[ 1 ] + TILE_Y - 1 ) / TILE_Y;
- }
- else
- {
- constexpr uint32_t TILE_Y = 64;
- groupsX = ( a.ne[ 1 ] + TILE_Y - 1 ) / TILE_Y;
- }
+ constexpr uint32_t TILE_Y = 64;
+ const uint32_t groupsX = ( a.ne[ 1 ] + TILE_Y - 1 ) / TILE_Y;
context()->Dispatch( groupsX, a.ne[ 2 ], a.ne[ 3 ] );
}
}
@@ -146,20 +137,10 @@ void MlContext::mulMatTiled( const Tensor& a, const Tensor& b, Tensor& res )
// Dispatching one thread group for each tile of the output matrix.
bindShader( eComputeShader::mulMatTiled );
- uint32_t x, y;
- // These compute shaders correctly handle partial tiles on the right and bottom edges of the output matrix, that's why rounding up.
- if( gpuInfo.wave64() )
- {
- constexpr uint32_t TILE_SIZE = 64;
- x = ( res.ne[ 0 ] + TILE_SIZE - 1 ) / TILE_SIZE;
- y = ( res.ne[ 1 ] + TILE_SIZE - 1 ) / TILE_SIZE;
- }
- else
- {
- constexpr uint32_t TILE_SIZE = 32;
- x = ( res.ne[ 0 ] + TILE_SIZE - 1 ) / TILE_SIZE;
- y = ( res.ne[ 1 ] + TILE_SIZE - 1 ) / TILE_SIZE;
- }
+ // These compute shaders correctly handle partial tiles on the right and bottom edges of the output matrix, that's why rounding up
+ constexpr uint32_t TILE_SIZE = 32;
+ const uint32_t x = ( res.ne[ 0 ] + TILE_SIZE - 1 ) / TILE_SIZE;
+ const uint32_t y = ( res.ne[ 1 ] + TILE_SIZE - 1 ) / TILE_SIZE;
const uint32_t z = res.ne[ 2 ] * res.ne[ 3 ];
context()->Dispatch( x, y, z );