1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
#pragma once
#include <atlcomcli.h>
#include <string>
namespace DirectCompute
{
// Accessor for the Direct3D 11 device used by this namespace.
// available() in this header treats a null return value as "no device".
ID3D11Device* device();
// Accessor for the Direct3D 11 device context; csSetCB() in this header calls through it without a null check.
ID3D11DeviceContext* context();
// Direct3D feature level.
// NOTE(review): presumably the level of the device returned by device() - confirm in the implementation.
D3D_FEATURE_LEVEL featureLevel();
// Sets up this module and reports the outcome as an HRESULT.
// NOTE(review): the meaning of the individual `flags` bits is not visible in this header - confirm in the implementation.
HRESULT initialize( uint32_t flags );
// Counterpart of initialize().
// NOTE(review): presumably releases the device and the context - confirm in the implementation.
void terminate();
// Known values of DXGI_ADAPTER_DESC.VendorId.
// The numbers are the vendor IDs listed in this database: https://pcisig.com/membership/member-companies
enum class eGpuVendor : uint16_t
{
	AMD    = 0x1002,
	NVidia = 0x10DE,
	Intel  = 0x8086,
	VMWare = 0x15AD,
};
// Bit flags kept in sGpuInfo::flags.
// Every enumerator occupies its own bit, so a flag is tested with a bitwise AND.
enum class eGpuEffectiveFlags : uint8_t
{
	Wave64         = 1 << 0,
	ReshapedMatMul = 1 << 1,
};
// Information about a GPU, together with the eGpuEffectiveFlags bits stored for it.
// NOTE(review): the identifier and memory fields presumably mirror DXGI_ADAPTER_DESC - confirm where the structure is filled.
struct sGpuInfo
{
	// Bitwise combination of eGpuEffectiveFlags values
	eGpuEffectiveFlags flags;
	eGpuVendor vendor;
	uint16_t device;
	uint16_t revision;
	uint32_t subsystem;
	size_t vramDedicated;
	size_t ramDedicated;
	size_t ramShared;
	std::wstring description;

	// True when the Wave64 bit is set in `flags`
	bool wave64() const
	{
		constexpr uint8_t mask = static_cast<uint8_t>( eGpuEffectiveFlags::Wave64 );
		return ( static_cast<uint8_t>( flags ) & mask ) != 0;
	}

	// True when the ReshapedMatMul bit is set in `flags`.
	// Measurements noted by the original author for the reshaped approach:
	//   * nVidia 1080Ti: much slower, by a factor of 2.4
	//   * AMD Cezanne: takes 0.69 of the time, i.e. about 30% faster
	// The reason is unknown; one guess is that on that AMD chip complete panels fit in the L3 cache.
	// The extra 30% on AMD Cezanne is wanted, so that code is only used on AMD GPUs.
	// Behavior on other GPUs is unknown and still needs testing.
	bool useReshapedMatMul() const
	{
		constexpr uint8_t mask = static_cast<uint8_t>( eGpuEffectiveFlags::ReshapedMatMul );
		return ( static_cast<uint8_t>( flags ) & mask ) != 0;
	}
};
// Read-only reference to the GPU information record; the definition is not in this header.
// NOTE(review): presumably only meaningful after a successful initialize() - confirm in the implementation.
extern const sGpuInfo& gpuInfo;
inline bool available()
{
return nullptr != device();
}
// Binds `cb` as the single compute shader constant buffer at slot 0 of the context returned by context().
inline void csSetCB( ID3D11Buffer* cb )
{
	constexpr unsigned int firstSlot = 0;
	constexpr unsigned int bufferCount = 1;
	// The API takes an array of buffer pointers; pass a one-element array
	ID3D11Buffer* const buffers[ bufferCount ] = { cb };
	context()->CSSetConstantBuffers( firstSlot, bufferCount, buffers );
}
// Memory usage of the specified buffer, returned as a 128-bit integer vector.
// NOTE(review): the layout of the lanes in the result is not visible in this header - confirm in the implementation.
__m128i bufferMemoryUsage( ID3D11Buffer* buffer );
// Memory usage for the specified shader resource view, returned as a 128-bit integer vector.
// NOTE(review): presumably measures the resource behind the view, with the same lane layout as bufferMemoryUsage() - confirm.
__m128i resourceMemoryUsage( ID3D11ShaderResourceView* srv );
}
|