diff options
| author | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
| commit | 8c4603c73675958efc960fbd4bb599a2909d106a (patch) | |
| tree | 714dc6fc9a1672d5fd7f89676b97e10959662abc /Tools | |
| parent | 990a8d0dbaefc996244097397259e92758b15cce (diff) | |
Source codes
Diffstat (limited to 'Tools')
| -rw-r--r-- | Tools/CompressShaders/Cabinet.cs | 60 | ||||
| -rw-r--r-- | Tools/CompressShaders/CompressShaders.cs | 244 | ||||
| -rw-r--r-- | Tools/CompressShaders/CompressShaders.csproj | 10 | ||||
| -rw-r--r-- | Tools/CompressShaders/DetectFp64.cs | 43 | ||||
| -rw-r--r-- | Tools/CompressShaders/LanguageCodes.cs | 103 | ||||
| -rw-r--r-- | Tools/CompressShaders/Readme.txt | 10 | ||||
| -rw-r--r-- | Tools/CompressShaders/ShaderNames.cs | 27 | ||||
| -rw-r--r-- | Tools/compareTraces/CommandLineArgs.cpp | 51 | ||||
| -rw-r--r-- | Tools/compareTraces/CommandLineArgs.h | 9 | ||||
| -rw-r--r-- | Tools/compareTraces/Readme.txt | 9 | ||||
| -rw-r--r-- | Tools/compareTraces/TraceReader.cpp | 46 | ||||
| -rw-r--r-- | Tools/compareTraces/TraceReader.h | 35 | ||||
| -rw-r--r-- | Tools/compareTraces/compare.cpp | 364 | ||||
| -rw-r--r-- | Tools/compareTraces/compare.h | 4 | ||||
| -rw-r--r-- | Tools/compareTraces/compareTraces.cpp | 16 | ||||
| -rw-r--r-- | Tools/compareTraces/compareTraces.vcxproj | 103 | ||||
| -rw-r--r-- | Tools/compareTraces/compareTraces.vcxproj.filters | 20 | ||||
| -rw-r--r-- | Tools/compareTraces/stdafx.cpp | 30 | ||||
| -rw-r--r-- | Tools/compareTraces/stdafx.h | 40 | ||||
| -rw-r--r-- | Tools/compareTraces/testUtils.cpp | 224 |
20 files changed, 1448 insertions, 0 deletions
diff --git a/Tools/CompressShaders/Cabinet.cs b/Tools/CompressShaders/Cabinet.cs new file mode 100644 index 0000000..b53fd18 --- /dev/null +++ b/Tools/CompressShaders/Cabinet.cs @@ -0,0 +1,60 @@ +using System.ComponentModel; +using System.Runtime.InteropServices; + +namespace CompressShaders +{ + /// <summary>Lossless data compressor implemented by <c>Cabinet.dll</c> Windows component</summary> + /// <remarks> + /// <para>Whisper.dll consumes that component in runtime, to decompress these shader binaries</para> + /// <para>If you wonder why not gzip — because the OS doesn’t include an API for that, at least not an API usable from C or C++.<br/> + /// .NET standard library includes gzip algorithm, but we don't want Whisper.dll to depend on .NET.</para> + /// </remarks> + static class Cabinet + { + /// <summary>Compression algorithm</summary> + /// <seealso href="https://learn.microsoft.com/en-us/windows/win32/cmpapi/using-the-compression-api#selecting-the-compression-algorithm" /> + enum eCompressionAlgorithm: uint + { + MSZIP = 2, + XPRESS = 3, + XPRESS_HUFF = 4, + LZMS = 5, + } + /// <summary>The value should match <c>constexpr DWORD compressionAlgorithm</c> constant,<br/>in <c>Whisper/D3D/shaders.cpp</c> source file</summary> + const eCompressionAlgorithm algo = eCompressionAlgorithm.MSZIP; + + [DllImport( "Cabinet.dll", SetLastError = true )] + static extern bool CreateCompressor( eCompressionAlgorithm Algorithm, IntPtr AllocationRoutines, out IntPtr CompressorHandle ); + + [DllImport( "Cabinet.dll", SetLastError = true )] + static extern bool CloseCompressor( IntPtr CompressorHandle ); + + [DllImport( "Cabinet.dll", SetLastError = true )] + static extern bool Compress( IntPtr CompressorHandle, [In] byte[] UncompressedData, IntPtr UncompressedDataSize, [Out] byte[] CompressedBuffer, IntPtr CompressedBufferSize, out IntPtr CompressedDataSize ); + + /// <summary>Compress an array of bytes into another, smaller array of bytes</summary> + /// <remarks>In practice, the compression ratio is about 7.1 for the shader binaries in Release configuration.</remarks> + public static byte[] compressBuffer( byte[] src ) + { + if( src.Length <= 0 ) + throw new ArgumentException( "The source buffer is empty" ); + IntPtr hCompressor; + if( !CreateCompressor( algo, IntPtr.Zero, out hCompressor ) ) + throw new Win32Exception( "Unable to create the compressor" ); + try + { + byte[] dest = new byte[ src.Length * 2 ]; + IntPtr srcSize = new IntPtr( src.Length ); + IntPtr destSize = new IntPtr( src.Length * 2 ); + if( !Compress( hCompressor, src, srcSize, dest, destSize, out destSize ) ) + throw new Win32Exception( "Compress failed" ); + Array.Resize( ref dest, (int)destSize ); + return dest; + } + finally + { + CloseCompressor( hCompressor ); + } + } + } +}
\ No newline at end of file diff --git a/Tools/CompressShaders/CompressShaders.cs b/Tools/CompressShaders/CompressShaders.cs new file mode 100644 index 0000000..814f966 --- /dev/null +++ b/Tools/CompressShaders/CompressShaders.cs @@ -0,0 +1,244 @@ +using System.Runtime.CompilerServices; +namespace CompressShaders; + +record struct sShaderBinary +{ + public string name; + public byte[] data; + + public sShaderBinary( string path ) + { + name = Path.GetFileNameWithoutExtension( path ); + data = File.ReadAllBytes( path ); + } + + public bool wave64 => name.EndsWith( "64" ); + public string uniqueName => wave64 ? name.Substring( 0, name.Length - 2 ) : name; +} + +sealed class FoundShaders +{ + public readonly sShaderBinary[] binaries; + public readonly string[] names; + public readonly int[] wave32, wave64; + + public FoundShaders( IEnumerable<sShaderBinary> found ) + { + binaries = found + .OrderBy( b => b.name ) + .ToArray(); + + names = binaries + .Select( b => b.uniqueName ) + .Distinct() + .ToArray(); + + wave32 = new int[ names.Length ]; + wave64 = new int[ names.Length ]; + for( int i = 0; i < names.Length; i++ ) + { + int i32 = findIndex( names[ i ], false ); + int i64 = findIndex( names[ i ], true ); + if( i32 >= 0 && i64 >= 0 ) + { + wave32[ i ] = i32; + wave64[ i ] = i64; + continue; + } + if( i32 >= 0 ) + { + wave32[ i ] = wave64[ i ] = i32; + continue; + } + throw new ApplicationException( $"Wave64 shader {names[ i ]} doesn't have the corresponding Wave32 one" ); + } + } + + int findIndex( string name, bool wave64 ) + { + for( int i = 0; i < binaries.Length; i++ ) + { + sShaderBinary sb = binaries[ i ]; + if( sb.uniqueName != name ) + continue; + if( sb.wave64 == wave64 ) + return i; + } + return -1; + } +} + +class Program +{ + static string getSolutionRoot( [CallerFilePath] string? path = null ) + { + string? dir = Path.GetDirectoryName( path ); + dir = Path.GetDirectoryName( dir ); + dir = Path.GetDirectoryName( dir ); + return dir ?? throw new ApplicationException(); + } + +#if DEBUG + const string config = "Debug"; +#else + const string config = "Release"; +#endif + + static string shadersBinDir( string root ) + { + return Path.Combine( root, "ComputeShaders", "x64", config ); + } + + static IEnumerable<sShaderBinary> readShaders( string root ) + { + string dir = shadersBinDir( root ); + foreach( string path in Directory.EnumerateFiles( dir, "*.cso" ) ) + yield return new sShaderBinary( path ); + } + + static void writeHeader( string root, IEnumerable<string> names ) + { + string path = Path.Combine( root, "Whisper", "D3D", "shaderNames.h" ); + using var stream = File.CreateText( path ); + stream.WriteLine( @"// This header is generated by a tool +#pragma once +#include <stdint.h> + +namespace DirectCompute +{ + enum struct eComputeShader: uint16_t + {" ); + + int id = 0; + foreach( string name in names ) + { + stream.WriteLine( "\t\t{0} = {1},", name, id ); + id++; + } + stream.Write( @" }; + + const char* computeShaderName( eComputeShader cs ); +}" ); + } + + static void writeCpp( string root, IEnumerable<string> names ) + { + string path = Path.Combine( root, "Whisper", "D3D", "shaderNames.cpp" ); + ShaderNames.write( path, names ); + } + + static void writePayloadIDs( StreamWriter stream, string varName, int[] ids ) + { + stream.Write( @" +static const std::array<uint8_t, {0}> {1} = {{", ids.Length, varName ); + + for( int i = 0; i < ids.Length; i++ ) + { + if( 0 == i % 16 ) + stream.Write( "\r\n\t" ); + else + stream.Write( ' ' ); + stream.Write( "{0},", ids[ i ] ); + } + stream.Write( @" +};" ); + } + + static void writePayload( string root, FoundShaders shaders, out int cbSource, out int cbCompressed ) + { + MemoryStream ms = new MemoryStream(); + List<int> offsets = new List<int>(); + foreach( var bin in shaders.binaries ) + { + offsets.Add( (int)ms.Length ); + ms.Write( bin.data ); + } + offsets.Add( (int)ms.Length ); + + byte[] dxbc = ms.ToArray(); + byte[] compressed = Cabinet.compressBuffer( dxbc ); + cbSource = dxbc.Length; + cbCompressed = compressed.Length; + + string path = Path.Combine( root, "Whisper", "D3D", $"shaderData-{config}.inl" ); + using var stream = File.CreateText( path ); + stream.Write( @"// This source file is generated by a tool + +// This array contains concatenated and compressed DXBC binaries for all compiled compute shaders +static const std::array<uint8_t, {0}> s_compressedShaders = +{{", compressed.Length ); + + for( int i = 0; i < compressed.Length; i++ ) + { + if( 0 == i % 16 ) + stream.Write( "\r\n\t" ); + else + stream.Write( ' ' ); + stream.Write( "0x{0:X02},", compressed[ i ] ); + } + + stream.Write( @" +}}; + +// This array contains start offsets of shader binaries in the decompressed DXBC blob. +// It includes one more entry for the end of the complete decompressed blob. +static const std::array<uint32_t, {0}> s_shaderOffsets = {{", offsets.Count ); + + for( int i = 0; i < offsets.Count; i++ ) + { + if( 0 == i % 16 ) + stream.Write( "\r\n\t" ); + else + stream.Write( ' ' ); + stream.Write( "{0},", offsets[ i ] ); + } + stream.Write( @" +};" ); + + stream.Write( @" +// Index = eComputeShader enum value, value = index of the shader binary to use on nVidia and Intel GPUs" ); + writePayloadIDs( stream, "s_shaderBlobs32", shaders.wave32 ); + stream.Write( @" +// Index = eComputeShader enum value, value = index of the shader binary to use on AMD GPUs" ); + writePayloadIDs( stream, "s_shaderBlobs64", shaders.wave64 ); + + ulong fp64Flags = 0; + for( int i = 0; i < shaders.binaries.Length; i++ ) + { + bool fp64 = DetectFp64.usesFp64( shaders.binaries[ i ].data ); + if( fp64 ) + fp64Flags |= (ulong)1 << i; + } + + stream.Write( @" +// Bitmap of the shader binaries which use FP64 arithmetic instructions +constexpr uint64_t fp64ShadersBitmap = 0x{0:X}ull;", fp64Flags ); + } + + static void mainImpl() + { + string root = getSolutionRoot(); + LanguageCodes.produce( root ); + + FoundShaders shaders = new FoundShaders( readShaders( root ) ); + + writeHeader( root, shaders.names ); + writeCpp( root, shaders.names ); + writePayload( root, shaders, out int cbIn, out int cbOut ); + Console.WriteLine( "Compressed {0} compute shaders, {1:F1} kb -> {2:F1} kb", shaders.binaries.Length, cbIn / 1024.0, cbOut / 1024.0 ); + } + + static int Main( string[] args ) + { + try + { + mainImpl(); + return 0; + } + catch( Exception ex ) + { + Console.WriteLine( ex.Message ); + return ex.HResult; + } + } +}
\ No newline at end of file diff --git a/Tools/CompressShaders/CompressShaders.csproj b/Tools/CompressShaders/CompressShaders.csproj new file mode 100644 index 0000000..dee1710 --- /dev/null +++ b/Tools/CompressShaders/CompressShaders.csproj @@ -0,0 +1,10 @@ +<Project Sdk="Microsoft.NET.Sdk"> + <PropertyGroup> + <OutputType>Exe</OutputType> + <TargetFramework>net6.0</TargetFramework> + <ImplicitUsings>enable</ImplicitUsings> + <Nullable>enable</Nullable> + <CheckForOverflowUnderflow>true</CheckForOverflowUnderflow> + <AppendTargetFrameworkToOutputPath>false</AppendTargetFrameworkToOutputPath> + </PropertyGroup> +</Project>
\ No newline at end of file diff --git a/Tools/CompressShaders/DetectFp64.cs b/Tools/CompressShaders/DetectFp64.cs new file mode 100644 index 0000000..1d75126 --- /dev/null +++ b/Tools/CompressShaders/DetectFp64.cs @@ -0,0 +1,43 @@ +#pragma warning disable CS0649 +using System.Runtime.InteropServices; + +namespace CompressShaders +{ + static class DetectFp64 + { + struct DXBCHeader + { + public uint FourCC; // Four character code "DXBC" + public uint Hash0; // 32-bit hash of the DXBC file + public uint Hash1; // 32-bit hash of the DXBC file + public uint Hash2; // 32-bit hash of the DXBC file + public uint Hash3; // 32-bit hash of the DXBC file + public uint unknownOne; + public uint TotalSize; // Total size of the DXBC file in bytes + public int NumChunks; // Number of chunks in the DXBC file + }; + + public static bool usesFp64( ReadOnlySpan<byte> dxbc ) + { + ReadOnlySpan<DXBCHeader> dxbcHeaderSpan = MemoryMarshal.Cast<byte, DXBCHeader>( dxbc ); + DXBCHeader dxbcHeader = dxbcHeaderSpan[ 0 ]; + + int cbHeader = Marshal.SizeOf<DXBCHeader>(); + int nChunks = dxbcHeader.NumChunks; + ReadOnlySpan<int> chunkOffsets = MemoryMarshal.Cast<byte, int>( dxbc.Slice( cbHeader, nChunks * 4 ) ); + foreach( int off in chunkOffsets ) + { + uint id = MemoryMarshal.Cast<byte, uint>( dxbc.Slice( off, 4 ) )[ 0 ]; + const uint SFI0 = 0x30494653; + if( id != SFI0 ) + continue; + int size = MemoryMarshal.Cast<byte, int>( dxbc.Slice( off + 4, 4 ) )[ 0 ]; + if( size < 4 ) + throw new ApplicationException(); + uint data = MemoryMarshal.Cast<byte, uint>( dxbc.Slice( off + 8, 4 ) )[ 0 ]; + return 0 != ( data & 1u ); + } + return false; + } + } +}
\ No newline at end of file diff --git a/Tools/CompressShaders/LanguageCodes.cs b/Tools/CompressShaders/LanguageCodes.cs new file mode 100644 index 0000000..71a9909 --- /dev/null +++ b/Tools/CompressShaders/LanguageCodes.cs @@ -0,0 +1,103 @@ +using System.Globalization; +using System.Text.RegularExpressions; + +namespace CompressShaders +{ + static class LanguageCodes + { + record struct Row + { + public string keySource; + public uint keyValue; + public int code; + public string name; + } + + static uint makeKey( string str ) + { + if( str.Length > 4 ) + throw new ArgumentException(); + uint k = 0; + int shift = 0; + foreach( char c in str ) + { + if( c >= 0x80 ) + throw new ArgumentException(); + uint u = (uint)c; + k |= ( u << shift ); + shift += 8; + } + return k; + } + + static IEnumerable<Row> load( string path ) + { + using var stm = File.OpenText( path ); + while( true ) + { + string? line = stm.ReadLine(); + if( null == line ) + break; + if( string.IsNullOrWhiteSpace( line ) ) + continue; + string[] fields = line.Split( '\t' ); + yield return new Row() + { + keySource = fields[ 0 ], + keyValue = makeKey( fields[ 0 ] ), + code = int.Parse( fields[ 1 ] ), + name = fields[ 2 ] + }; + } + } + + static void writeCpp( string inl, Row[] data ) + { + // TODO [very low]: sort them by the key here, then in C++ use binary search instead of the hash map + using var stm = File.CreateText( inl ); + stm.WriteLine( "// This file is generated by a tool, from the `languageCodez.tsv` file in this repository" ); + foreach( Row row in data ) + stm.WriteLine( "Lang{{ 0x{0:X}, {1}, \"{2}\" }},", row.keyValue, row.code, row.name ); + } + + static readonly CultureInfo ci = new CultureInfo( "en-US", false ); + static string titleCase( this string name ) => + ci.TextInfo.ToTitleCase( name.ToLower( ci ) ); + + static void writeCs( string cs, Row[] data ) + { + using var stm = File.CreateText( cs ); + stm.WriteLine( @"// This file is generated by a tool, from the `languageCodez.tsv` file in this repository +namespace Whisper +{ + /// <summary>Supported languages</summary> + public enum eLanguage: uint + {" ); + + foreach( Row row in data ) + { + string tc = row.name.titleCase(); + stm.WriteLine( " /// <summary>{0}</summary>", tc ); + tc = Regex.Replace( tc, @"\s+", string.Empty ); + stm.WriteLine( " {0} = 0x{1:X},", tc, row.keyValue ); + } + stm.Write( @" } +}" ); + } + + static void produce( string tsv, string inl, string cs ) + { + Row[] data = load( tsv ).OrderBy( r => r.name ).ToArray(); + writeCpp( inl, data ); + writeCs( cs, data ); + } + + public static void produce( string solutionRoot ) + { + string tsv = Path.Combine( solutionRoot, "Whisper\\Whisper\\languageCodez.tsv" ); + string inl = Path.Combine( solutionRoot, "Whisper\\Whisper\\languageCodez.inl" ); + string cs = Path.Combine( solutionRoot, "WhisperNet\\API\\eLanguage.cs" ); + produce( tsv, inl, cs ); + } + } +}
\ No newline at end of file diff --git a/Tools/CompressShaders/Readme.txt b/Tools/CompressShaders/Readme.txt new file mode 100644 index 0000000..69ef35a --- /dev/null +++ b/Tools/CompressShaders/Readme.txt @@ -0,0 +1,10 @@ +This project builds a C# console app which serves as a code generator for a few pieces of Whisper.dll and WhisperNet.dll. + +Specifically, it generates two things. + +1. It compresses the compiled DXBC shaders into a blob of bytes, and prints std::array with these bytes into shaderData-Release.inl and shaderData-Debug.inl C++ files. + +2. It parses the `languageCodez.tsv`, and generates both C++ and C# code with the data from that table. + +The tool uses relative paths across source files. +These paths will break if you move the source of the tool, or the source data of the tool.
\ No newline at end of file diff --git a/Tools/CompressShaders/ShaderNames.cs b/Tools/CompressShaders/ShaderNames.cs new file mode 100644 index 0000000..81ba46e --- /dev/null +++ b/Tools/CompressShaders/ShaderNames.cs @@ -0,0 +1,27 @@ +static class ShaderNames +{ + public static void write( string path, IEnumerable<string> names ) + { + string[] arr = names.ToArray(); + using var stream = File.CreateText( path ); + stream.WriteLine( @"// This source file is generated by a tool +#include ""stdafx.h"" +#include ""shaderNames.h"" +" ); + + stream.WriteLine( "static const std::array<const char*, {0}> s_shaderNames = ", arr.Length ); + stream.WriteLine( "{" ); + foreach( string name in arr ) + stream.WriteLine( @" ""{0}"",", name ); + + stream.Write( @"}; + +const char* DirectCompute::computeShaderName( eComputeShader cs ) +{ + const uint16_t i = (uint16_t)cs; + if( i < s_shaderNames.size() ) + return s_shaderNames[ i ]; + return nullptr; +}" ); + } +}
\ No newline at end of file diff --git a/Tools/compareTraces/CommandLineArgs.cpp b/Tools/compareTraces/CommandLineArgs.cpp new file mode 100644 index 0000000..5f26bdb --- /dev/null +++ b/Tools/compareTraces/CommandLineArgs.cpp @@ -0,0 +1,51 @@ +#include "stdafx.h" +#include "CommandLineArgs.h" +#include <charconv> + +static bool printUsage() +{ + fprintf( stderr, "Usage: compareTraces.exe trace1.bin trace2.bin [-diff N]\n" ); + return false; +} + +bool CommandLineArgs::parse( int argc, wchar_t* argv[] ) +{ + size_t idx = 0; + CString sw; + CStringA tmp; + for( int i = 1; i < argc; i++ ) + { + if( argv[ i ][ 0 ] != L'-' ) + { + if( idx >= 2 ) + return printUsage(); + inputs[ idx ] = argv[ i ]; + idx++; + continue; + } + sw = argv[ i ]; + if( 0 == sw.CompareNoCase( L"-diff" ) ) + { + i++; + if( i >= argc ) + return printUsage(); + tmp.Format( "%S", argv[ i ] ); + tmp.Trim(); + uint64_t v; + auto res = std::from_chars( tmp, cstr( tmp ) + tmp.GetLength(), v ); + if( res.ec != (std::errc)0 ) + { + fprintf( stderr, "Unable to parse string into number\n" ); + return false; + } + printDiff = v; + continue; + } + return printUsage(); + } + + if( idx != 2 ) + return printUsage(); + + return true; +}
\ No newline at end of file diff --git a/Tools/compareTraces/CommandLineArgs.h b/Tools/compareTraces/CommandLineArgs.h new file mode 100644 index 0000000..d434e76 --- /dev/null +++ b/Tools/compareTraces/CommandLineArgs.h @@ -0,0 +1,9 @@ +#pragma once + +struct CommandLineArgs +{ + int64_t printDiff = -1; + std::array<CString, 2> inputs; + + bool parse( int argc, wchar_t* argv[] ); +};
\ No newline at end of file diff --git a/Tools/compareTraces/Readme.txt b/Tools/compareTraces/Readme.txt new file mode 100644 index 0000000..035d658 --- /dev/null +++ b/Tools/compareTraces/Readme.txt @@ -0,0 +1,9 @@ +This project builds a C++ console tool which compares debug traces of the model. + +Tracing files easily exceed 1GB, and by default they’re disabled with a preprocessor macro in stdafx.h of the Whisper project. + +When enabled, the main GPU implementation saves a trace into C:\Temp\2remove\Whisper\gpu.bin + +The reference CPU implementation saves a trace into C:\Temp\2remove\Whisper\ref.bin + +This code in this project is optimized for development speed. For this reason it requires AVX2 CPU, uses memory-mapped IO instead of proper parsing, and checks little to no errors.
\ No newline at end of file diff --git a/Tools/compareTraces/TraceReader.cpp b/Tools/compareTraces/TraceReader.cpp new file mode 100644 index 0000000..b4b9681 --- /dev/null +++ b/Tools/compareTraces/TraceReader.cpp @@ -0,0 +1,46 @@ +#include "stdafx.h" +#include "TraceReader.h" +using namespace Tracing; + +const sTraceItem& TraceReader::operator[]( size_t idx ) const +{ + if( idx >= countItems ) + throw E_BOUNDS; + return items[ idx ]; +} + +CStringA TraceReader::getName( const sTraceItem& item ) const +{ + const size_t idx = item.stringIndex; + if( idx >= countStrings ) + throw E_BOUNDS; + const char* const source = stringData + stringIndex[ idx ]; + CStringA res; + res.Format( source, item.formatArgs[ 0 ], item.formatArgs[ 1 ], item.formatArgs[ 2 ], item.formatArgs[ 3 ] ); + return res; +} + +HRESULT TraceReader::open( LPCTSTR path ) +{ + CHECK( file.Create( path, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING ) ); + CHECK( mapping.MapFile( file ) ); + + const uint8_t* rsi = mapping; + const sFileHeader& header = *(const sFileHeader*)rsi; + if( header.magic != header.correctMagic ) + return E_INVALIDARG; + countItems = header.countItems; + countStrings = header.countStrings; + + rsi += sizeof( sFileHeader ); + payloadPointer = rsi; + + rsi += header.bytesPayload; + stringIndex = (const uint32_t*)( rsi ); + stringData = (const char*)( rsi + countStrings * 4 ); + + rsi += header.bytesStrings; + items = (const sTraceItem*)rsi; + + return S_OK; +}
\ No newline at end of file diff --git a/Tools/compareTraces/TraceReader.h b/Tools/compareTraces/TraceReader.h new file mode 100644 index 0000000..8d1e1f2 --- /dev/null +++ b/Tools/compareTraces/TraceReader.h @@ -0,0 +1,35 @@ +#pragma once +#include "../../Whisper/Utils/Trace/TraceStructures.h" +#include <atlstr.h> +#include <atlfile.h> + +namespace Tracing +{ + class TraceReader + { + const uint8_t* payloadPointer = nullptr; + const sTraceItem* items = nullptr; + size_t countItems = 0; + size_t countStrings = 0; + const uint32_t* stringIndex = nullptr; + const char* stringData = nullptr; + + CAtlFile file; + CAtlFileMapping<uint8_t> mapping; + + public: + + TraceReader() = default; + ~TraceReader() = default; + + HRESULT open( LPCTSTR path ); + size_t size() const { return countItems; } + const sTraceItem& operator[]( size_t idx ) const; + CStringA getName( const sTraceItem& item ) const; + + const void* payload( const sTraceItem& item ) const + { + return payloadPointer + item.payloadOffset; + } + }; +}
\ No newline at end of file diff --git a/Tools/compareTraces/compare.cpp b/Tools/compareTraces/compare.cpp new file mode 100644 index 0000000..ec2a6ef --- /dev/null +++ b/Tools/compareTraces/compare.cpp @@ -0,0 +1,364 @@ +#include "stdafx.h" +#include "../../Whisper/API/iContext.cl.h" +#include "TraceReader.h" +#include "../../Whisper/ML/testUtils.h" +#include "compare.h" +using namespace Tracing; +using namespace DirectCompute; + +namespace +{ + inline const char* cstr( eItemType it ) + { + switch( it ) + { + case eItemType::Buffer: return "Buffer"; + case eItemType::Tensor: return "Tensor"; + } + throw E_INVALIDARG; + } + inline const char* cstr( const CStringA& s ) { return s; } + + inline int tensorDims( __m128i vec ) + { + const __m128i one = _mm_set1_epi32( 1 ); + const uint32_t bitmapOnes = (uint32_t)_mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( vec, one ) ) ); + const uint32_t bitmapNotOnes = bitmapOnes ^ 0b1111u; + unsigned long idx; + if( !_BitScanReverse( &idx, bitmapNotOnes ) ) + return 0; + return idx + 1; + } + + int printSize( __m128i vec ) + { + const int sz = tensorDims( vec ); + switch( sz ) + { + case 0: + printf( "[ scalar ]" ); + break; + case 1: + printf( "[ %i ]", _mm_cvtsi128_si32( vec ) ); + break; + case 2: + printf( "[ %i, %i ]", _mm_cvtsi128_si32( vec ), _mm_extract_epi32( vec, 1 ) ); + break; + case 3: + printf( "[ %i, %i, %i ]", _mm_cvtsi128_si32( vec ), _mm_extract_epi32( vec, 1 ), _mm_extract_epi32( vec, 2 ) ); + break; + case 4: + printf( "[ %i, %i, %i, %i ]", _mm_cvtsi128_si32( vec ), _mm_extract_epi32( vec, 1 ), _mm_extract_epi32( vec, 2 ), _mm_extract_epi32( vec, 3 ) ); + break; + default: + throw E_UNEXPECTED; + } + return sz; + } + + class Comparer + { + TraceReader& readerA; + TraceReader& readerB; + + bool diffBuffers( size_t i, const sTraceItem& a, const sTraceItem& b, const CStringA& name ) + { + const size_t lenA = *(const uint64_t*)a.size.data(); + const size_t lenB = *(const uint64_t*)b.size.data(); + if( lenA != lenB ) + { + printf( "Buffer %zu \"%s\": different size, %zu in trace A, %zu in trace B\n", i, cstr( name ), lenA, lenB ); + return false; + } + if( a.dataType != b.dataType ) + { + printf( "Buffer %zu \"%s\": different data types\n", i, cstr( name ) ); + return false; + } + + switch( a.dataType ) + { + case eDataType::FP32: + return buffersFp32( i, name, (const float*)readerA.payload( a ), (const float*)readerB.payload( b ), lenA ); + } + throw E_NOTIMPL; + } + + bool diffTensors( size_t i, const sTraceItem& a, const sTraceItem& b, const CStringA& name ) + { + const __m128i ne1 = load( a.size ); + const __m128i ne2 = load( b.size ); + if( !vectorEqual( ne1, ne2 ) ) + { + printf( "Tensor %zu \"%s\" - different size: trace A size is ", i, cstr( name ) ); + printSize( ne1 ); + printf( ", trace B size is " ); + printSize( ne2 ); + printf( "\n" ); + return false; + } + + const __m128i stride1 = load( a.stride ); + const __m128i stride2 = load( b.stride ); + if( !vectorEqual( stride1, stride2 ) ) + { + printf( "Tensor %zu \"%s\" - different memory layout\n", i, cstr( name ) ); + return false; + } + + if( a.dataType != b.dataType ) + { + printf( "Tensor %zu \"%s\": different data types\n", i, cstr( name ) ); + return false; + } + + size_t elements = (uint32_t)_mm_cvtsi128_si32( ne1 ); + elements *= (uint32_t)_mm_extract_epi32( ne1, 1 ); + elements *= (uint32_t)_mm_extract_epi32( ne1, 2 ); + elements *= (uint32_t)_mm_extract_epi32( ne1, 3 ); + + switch( a.dataType ) + { + case eDataType::FP32: + return tensorsFp32( i, name, (const float*)readerA.payload( a ), (const float*)readerB.payload( b ), elements, ne1, stride1 ); + } + throw E_NOTIMPL; + } + + protected: + virtual bool buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length ) = 0; + virtual bool tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb ) = 0; + + public: + + Comparer( TraceReader& t1, TraceReader& t2 ) : + readerA( t1 ), readerB( t2 ) { } + + bool compare( size_t i ) + { + const sTraceItem& a = readerA[ i ]; + const sTraceItem& b = readerB[ i ]; + CStringA name1 = readerA.getName( a ); + CStringA name2 = readerB.getName( b ); + + if( a.itemType != b.itemType ) + { + printf( "Item %zu: different type, trace A %s \"%s\", trace B %s \"%s\"\n", i, + cstr( a.itemType ), cstr( name1 ), cstr( b.itemType ), cstr( name2 ) ); + return false; + } + + if( name1 != name2 ) + { + printf( "%s %zu: different names, they are \"%s\" and \"%s\"\n", cstr( a.itemType ), i, cstr( name1 ), cstr( name2 ) ); + return false; + } + + switch( a.itemType ) + { + case eItemType::Buffer: + return diffBuffers( i, a, b, name1 ); + case eItemType::Tensor: + return diffTensors( i, a, b, name1 ); + default: + throw E_INVALIDARG; + } + } + }; + + class PrintSummary : public Comparer + { + bool buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length ) override; + bool tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb ) override; + + public: + PrintSummary( TraceReader& a, TraceReader& b ) : Comparer( a, b ) { } + }; + + bool PrintSummary::buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length ) + { + sTensorDiff diff = computeDiff( a, b, length ); + printf( "%s %zu \"%s\": ", cstr( eItemType::Buffer ), idx, cstr( name ) ); + diff.print(); + return true; + } + + bool PrintSummary::tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb ) + { + printSize( ne ); + printf( " " ); + sTensorDiff diff = computeDiff( a, b, length ); + printf( "%s %zu \"%s\": ", cstr( eItemType::Tensor ), idx, cstr( name ) ); + diff.print(); + return true; + } + + class PrintDiff : public Comparer + { + bool buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length ) override; + bool tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb ) override; + public: + PrintDiff( TraceReader& a, TraceReader& b ) : Comparer( a, b ) { } + }; + + bool PrintDiff::buffersFp32( size_t idx, const CStringA& name, const float* A, const float* B, size_t length ) + { + printf( "idx\tA\tB\tA(hex)\tB(hex)\tdiff\n" ); + for( size_t i = 0; i < length; i++ ) + { + const float a = *A; + const float b = *B; + __m128 vf = _mm_setr_ps( a, b, 0, 0 ); + __m128i vi = _mm_castps_si128( vf ); + const float diff = std::abs( a - b ); + printf( "%zu\t%g\t%g\t0x%08X\t0x%08X\t%g\n", + i, a, b, _mm_cvtsi128_si32( vi ), _mm_extract_epi32( vi, 1 ), diff ); + } + return true; + } + + std::array<uint32_t, 4> storeSize( __m128i v ) + { + std::array<uint32_t, 4> a; + _mm_storeu_si128( ( __m128i* )a.data(), v ); + return a; + } + + std::array<size_t, 4> storeStrides( __m128i v ) + { + const __m128i zero = _mm_setzero_si128(); + std::array<size_t, 4> a; + _mm_storeu_si128( ( __m128i* ) & a[ 0 ], _mm_unpacklo_epi32( v, zero ) ); + _mm_storeu_si128( ( __m128i* ) & a[ 2 ], _mm_unpackhi_epi32( v, zero ) ); + return a; + } + + bool PrintDiff::tensorsFp32( size_t idx, const CStringA& name, const float* A, const float* B, size_t length, __m128i ne, __m128i nb ) + { + const int dims = tensorDims( ne ); + const std::array<uint32_t, 4> size = storeSize( ne ); + const std::array<size_t, 4> strides = storeStrides( ne ); + CStringA line; + if( dims > 4 ) + throw E_UNEXPECTED; + + for( int i = 0; i < dims; i++ ) + { + const char c = "xyzw"[ i ]; + line.AppendChar( c ); + line.AppendChar( '\t' ); + } + line += "A\tB\tA(hex)\tB(hex)\tdiff\n"; + printf( "%s", cstr( line ) ); + + if( 0 == dims ) + { + const float a = *A; + const float b = *B; + __m128 vf = _mm_setr_ps( a, b, 0, 0 ); + __m128i vi = _mm_castps_si128( vf ); + const float diff = std::abs( a - b ); + printf( "%g\t%g\t0x%08X\t0x%08X\t%g\n", + a, b, _mm_cvtsi128_si32( vi ), _mm_extract_epi32( vi, 1 ), diff ); + return true; + } + + size_t offLayer2 = 0; + for( uint32_t w = 0; w < size[ 3 ]; w++, offLayer2 += strides[ 3 ] ) + { + size_t offLayer = offLayer2; + for( uint32_t z = 0; z < size[ 2 ]; z++, offLayer += strides[ 2 ] ) + { + size_t offRow = offLayer; + for( uint32_t y = 0; y < size[ 1 ]; y++, offRow += strides[ 1 ] ) + { + size_t off = offRow; + for( uint32_t x = 0; x < size[ 0 ]; x++, off += strides[ 0 ] ) + { + line.Format( "%i\t", x ); + if( dims > 1 ) + line.AppendFormat( "%i\t", y ); + if( dims > 2 ) + line.AppendFormat( "%i\t", z ); + if( dims > 3 ) + line.AppendFormat( "%i\t", w ); + + const float a = A[ off ]; + const float b = B[ off ]; + __m128 vf = _mm_setr_ps( a, b, 0, 0 ); + __m128i vi = _mm_castps_si128( vf ); + const float diff = std::abs( a - b ); + line.AppendFormat( "%g\t%g\t0x%08X\t0x%08X\t%g\n", + a, b, _mm_cvtsi128_si32( vi ), _mm_extract_epi32( vi, 1 ), diff ); + printf( "%s", cstr( line ) ); + } + } + } + } + return true; + } +} + +HRESULT compareTraces( const CommandLineArgs& arguments ) +{ + const wchar_t* pathA = arguments.inputs[ 0 ]; + const wchar_t* pathB = arguments.inputs[ 1 ]; + + TraceReader a, b; + HRESULT hr = a.open( pathA ); + if( FAILED( hr ) ) + { + fwprintf( stderr, L"Unable to load trace A from \"%s\"", pathA ); + printError( hr ); + return hr; + } + + hr = b.open( pathB ); + if( FAILED( hr ) ) + { + fwprintf( stderr, L"Unable to load trace B from \"%s\"", pathA ); + printError( hr ); + return hr; + } + + wprintf( L"Trace A: %s\n", pathA ); + wprintf( L"Trace B: %s\n", pathB ); + const size_t sizeA = a.size(); + const size_t sizeB = b.size(); + const size_t count = std::min( sizeA, sizeB ); + + if( arguments.printDiff >= 0 ) + { + if( arguments.printDiff >= (int64_t)count ) + { + fprintf( stderr, "Trace A has %zu entries, trace B %zu entries; entry %zu ain't there\n", + sizeA, sizeB, (size_t)arguments.printDiff ); + return E_INVALIDARG; + } + try + { + PrintDiff print{ a, b }; + print.compare( arguments.printDiff ); + return S_OK; + } + catch( HRESULT hr ) + { + return hr; + } + } + + printf( "Trace A has %zu entries, trace B %zu entries, comparing first %zu\n", sizeA, sizeB, count ); + + try + { + PrintSummary print{ a, b }; + for( size_t i = 0; i < count; i++ ) + if( !print.compare( i ) ) + return S_FALSE; + return S_OK; + } + catch( HRESULT hr ) + { + return hr; + } +}
\ No newline at end of file diff --git a/Tools/compareTraces/compare.h b/Tools/compareTraces/compare.h new file mode 100644 index 0000000..2a4cd86 --- /dev/null +++ b/Tools/compareTraces/compare.h @@ -0,0 +1,4 @@ +#pragma once +#include "CommandLineArgs.h" + +HRESULT compareTraces( const CommandLineArgs& arguments );
\ No newline at end of file diff --git a/Tools/compareTraces/compareTraces.cpp b/Tools/compareTraces/compareTraces.cpp new file mode 100644 index 0000000..8813500 --- /dev/null +++ b/Tools/compareTraces/compareTraces.cpp @@ -0,0 +1,16 @@ +#include "stdafx.h" +#include <stdio.h> +#include "compare.h" +#include "CommandLineArgs.h" + +int wmain( int argc, wchar_t* argv[] ) +{ + CommandLineArgs cla; + if( !cla.parse( argc, argv ) ) + return 1; + + HRESULT hr = compareTraces( cla ); + if( SUCCEEDED( hr ) ) + return 0; + return hr; +}
\ No newline at end of file diff --git a/Tools/compareTraces/compareTraces.vcxproj b/Tools/compareTraces/compareTraces.vcxproj new file mode 100644 index 0000000..a9670b3 --- /dev/null +++ b/Tools/compareTraces/compareTraces.vcxproj @@ -0,0 +1,103 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <VCProjectVersion>16.0</VCProjectVersion> + <Keyword>Win32Proj</Keyword> + <ProjectGuid>{8478a77c-d851-4c63-9511-1770cc82d33e}</ProjectGuid> + <RootNamespace>compareTraces</RootNamespace> + <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v143</PlatformToolset> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v143</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="Shared"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <LanguageStandard>stdcpp20</LanguageStandard> + <PrecompiledHeader>Use</PrecompiledHeader> + <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <LanguageStandard>stdcpp20</LanguageStandard> + <PrecompiledHeader>Use</PrecompiledHeader> + <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="CommandLineArgs.cpp" /> + <ClCompile Include="compareTraces.cpp" /> + <ClCompile Include="compare.cpp" /> + <ClCompile Include="stdafx.cpp"> + <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader> + <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader> + </ClCompile> + <ClCompile Include="testUtils.cpp" /> + <ClCompile Include="TraceReader.cpp" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="CommandLineArgs.h" /> + <ClInclude Include="compare.h" /> + <ClInclude Include="stdafx.h" /> + <ClInclude Include="TraceReader.h" /> + </ItemGroup> + <ItemGroup> + <Text Include="Readme.txt" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/Tools/compareTraces/compareTraces.vcxproj.filters b/Tools/compareTraces/compareTraces.vcxproj.filters new file mode 100644 index 0000000..1d0d4c3 --- /dev/null +++ b/Tools/compareTraces/compareTraces.vcxproj.filters @@ -0,0 +1,20 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <ClCompile Include="compareTraces.cpp" /> + <ClCompile Include="TraceReader.cpp" /> + <ClCompile Include="compare.cpp" /> + <ClCompile Include="stdafx.cpp" /> + <ClCompile Include="testUtils.cpp" /> + <ClCompile Include="CommandLineArgs.cpp" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="TraceReader.h" /> + <ClInclude Include="stdafx.h" /> + <ClInclude Include="compare.h" /> + <ClInclude Include="CommandLineArgs.h" /> + </ItemGroup> + <ItemGroup> + <Text Include="Readme.txt" /> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/Tools/compareTraces/stdafx.cpp b/Tools/compareTraces/stdafx.cpp new file mode 100644 index 0000000..5c7f6c9 --- /dev/null +++ b/Tools/compareTraces/stdafx.cpp @@ -0,0 +1,30 @@ +#include "stdafx.h" + +namespace +{ + wchar_t* formatMessage( HRESULT hr ) + { + wchar_t* err; + if( FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, + NULL, + hr, + MAKELANGID( LANG_NEUTRAL, SUBLANG_DEFAULT ), + (LPTSTR)&err, + 0, + NULL ) ) + return err; + return nullptr; + } +} + +void printError( HRESULT hr ) +{ + const wchar_t* err = formatMessage( hr ); + if( nullptr != err ) + { + fwprintf( stderr, L"%s\n", err ); + LocalFree( (HLOCAL)err ); + } + else + fprintf( stderr, "Error code %i (0x%08X)\n", hr, hr ); +}
\ No newline at end of file diff --git a/Tools/compareTraces/stdafx.h b/Tools/compareTraces/stdafx.h new file mode 100644 index 0000000..1e496f3 --- /dev/null +++ b/Tools/compareTraces/stdafx.h @@ -0,0 +1,40 @@ +#pragma once +#include <stdint.h> +#include <assert.h> + +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include <windows.h> +#include <atlstr.h> +#include <d3d11.h> + +#include <vector> +#include <array> +#include <emmintrin.h> +#include <smmintrin.h> + +#define CHECK( hr ) { const HRESULT __hr = ( hr ); if( FAILED( __hr ) ) return __hr; } + +inline __m128i load16( const int* rsi ) +{ + return _mm_loadu_si128( ( const __m128i* )rsi ); +} +inline __m128i load16( const uint32_t* rsi ) +{ + return _mm_loadu_si128( ( const __m128i* )rsi ); +} +inline __m128i load( const std::array<uint32_t, 4>& arr ) +{ + return load16( arr.data() ); +} + +inline bool vectorEqual( __m128i a, __m128i b ) +{ + __m128i xx = _mm_xor_si128( a, b ); + return (bool)_mm_testz_si128( xx, xx ); +} + +void printError( HRESULT hr ); + +inline const char* cstr( const CStringA& s ) { return s; } +inline const wchar_t* cstr( const CString& s ) { return s; }
\ No newline at end of file diff --git a/Tools/compareTraces/testUtils.cpp b/Tools/compareTraces/testUtils.cpp new file mode 100644 index 0000000..f9fa465 --- /dev/null +++ b/Tools/compareTraces/testUtils.cpp @@ -0,0 +1,224 @@ +#include "stdafx.h" +#include "../../Whisper/ML/testUtils.h" +#include <immintrin.h> +using namespace DirectCompute; + +namespace +{ + using DirectCompute::sTensorDiff; + + __forceinline __m256 load( const float* rsi ) + { + return _mm256_loadu_ps( rsi ); + } + + __forceinline __m256 load( const uint16_t* rsi ) + { + const __m128i iv = _mm_load_si128( ( const __m128i* )rsi ); + return _mm256_cvtph_ps( iv ); + } + + __forceinline void loadPartial( const uint16_t* x, const uint16_t* y, size_t count, __m256& fx, __m256& fy ) + { + __m128i ix, iy; + switch( count ) + { + case 1: // load 2 bytes + ix = _mm_cvtsi32_si128( *x ); + iy = _mm_cvtsi32_si128( *y ); + break; + case 2: // load 4 bytes + ix = _mm_cvtsi32_si128( *(const int*)x ); + iy = _mm_cvtsi32_si128( *(const int*)y ); + break; + case 3: // load 6 bytes + ix = _mm_cvtsi32_si128( *(const int*)x ); + iy = _mm_cvtsi32_si128( *(const int*)y ); + ix = _mm_insert_epi16( ix, x[ 2 ], 2 ); + iy = _mm_insert_epi16( iy, y[ 2 ], 2 ); + break; + case 4: // load 8 bytes + ix = _mm_cvtsi64_si128( *(const int64_t*)x ); + iy = _mm_cvtsi64_si128( *(const int64_t*)y ); + break; + case 5: // load 10 bytes + ix = _mm_cvtsi64_si128( *(const int64_t*)x ); + iy = _mm_cvtsi64_si128( *(const int64_t*)y ); + ix = _mm_insert_epi16( ix, x[ 4 ], 4 ); + iy = _mm_insert_epi16( iy, y[ 4 ], 4 ); + break; + case 6: // load 12 bytes + ix = _mm_cvtsi64_si128( *(const int64_t*)x ); + iy = _mm_cvtsi64_si128( *(const int64_t*)y ); + ix = _mm_insert_epi32( ix, *(const int*)( x + 4 ), 2 ); + iy = _mm_insert_epi32( iy, *(const int*)( y + 4 ), 2 ); + break; + case 7: // load 14 bytes + ix = _mm_cvtsi64_si128( *(const int64_t*)x ); + iy = _mm_cvtsi64_si128( *(const int64_t*)y ); + ix = _mm_insert_epi32( ix, *(const int*)( x + 4 ), 2 ); + iy = _mm_insert_epi32( iy, *(const int*)( y + 4 ), 2 ); + ix = _mm_insert_epi16( ix, x[ 6 ], 6 ); + iy = _mm_insert_epi16( iy, y[ 6 ], 6 ); + break; + default: + fx = fy = _mm256_setzero_ps(); + return; + } + + fx = _mm256_cvtph_ps( ix ); + fy = _mm256_cvtph_ps( iy ); + } + + inline __m128 loadFloat2( const float* rsi ) + { + return _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) ); + } + inline __m128 loadFloat3( const float* rsi ) + { + __m128 f = loadFloat2( rsi ); + f = _mm_insert_ps( f, _mm_load_ss( rsi + 2 ), 0x20 ); + return f; + } + __forceinline void loadPartial( const float* x, const float* y, size_t count, __m256& fx, __m256& fy ) + { + __m128 low1, high1; + __m128 low2, high2; + high1 = high2 = _mm_setzero_ps(); + switch( count ) + { + case 1: + low1 = _mm_load_ss( x ); + low2 = _mm_load_ss( y ); + break; + case 2: + low1 = loadFloat2( x ); + low2 = loadFloat2( y ); + break; + case 3: + low1 = loadFloat3( x ); + low2 = loadFloat3( y ); + break; + case 4: + low1 = _mm_loadu_ps( x ); + low2 = _mm_loadu_ps( y ); + break; + case 5: + low1 = _mm_loadu_ps( x ); + low2 = _mm_loadu_ps( y ); + high1 = _mm_load_ss( x + 4 ); + high2 = _mm_load_ss( y + 4 ); + break; + case 6: + low1 = _mm_loadu_ps( x ); + low2 = _mm_loadu_ps( y ); + high1 = loadFloat2( x + 4 ); + high2 = loadFloat2( y + 4 ); + break; + case 7: // load 14 bytes + low1 = _mm_loadu_ps( x ); + low2 = _mm_loadu_ps( y ); + high1 = loadFloat3( x + 4 ); + high2 = loadFloat3( y + 4 ); + break; + default: + fx = fy = _mm256_setzero_ps(); + return; + } + + fx = _mm256_setr_m128( low1, high1 ); + fy = _mm256_setr_m128( low2, high2 ); + } + + __forceinline float horizontalMaximum( __m256 v ) + { + __m128 s = _mm256_extractf128_ps( v, 1 ); + s = _mm_max_ps( s, _mm256_castps256_ps128( v ) ); + s = _mm_max_ps( s, _mm_movehl_ps( s, s ) ); + s = _mm_max_ss( s, _mm_movehdup_ps( s ) ); + return _mm_cvtss_f32( s ); + } + + __forceinline double horizontalSum( __m256 v ) + { + __m256d d = _mm256_cvtps_pd( _mm256_extractf128_ps( v, 1 ) ); + d = _mm256_add_pd( d, _mm256_cvtps_pd( _mm256_castps256_ps128( v ) ) ); + + __m128d s = _mm256_extractf128_pd( d, 1 ); + s = _mm_add_pd( s, _mm256_castpd256_pd128( d ) ); + s = _mm_add_sd( s, _mm_unpackhi_pd( s, s ) ); + return _mm_cvtsd_f64( s ); + } + + __m256 maskInfNan( __m256 diff, __m256 a, __m256 b ) + { + __m256i ai = _mm256_castps_si256( a ); + __m256i bi = _mm256_castps_si256( b ); + __m256i eqi = _mm256_cmpeq_epi32( ai, bi ); + __m256 eq = _mm256_castsi256_ps( eqi ); + return _mm256_andnot_ps( eq, diff ); + } + + class DiffAcc + { + __m256 maxAbs = _mm256_setzero_ps(); + __m256 sumSquares = _mm256_setzero_ps(); + + public: + + __forceinline void add( __m256 a, __m256 b ) + { + const __m256 neg0 = _mm256_set1_ps( -0.0f ); + __m256 diff = _mm256_sub_ps( b, a ); + diff = maskInfNan( diff, a, b ); + sumSquares = _mm256_fmadd_ps( diff, diff, sumSquares ); + const __m256 absDiff = _mm256_andnot_ps( neg0, diff ); + maxAbs = _mm256_max_ps( maxAbs, absDiff ); + } + + __forceinline sTensorDiff reduce( size_t count ) + { + sTensorDiff res; + res.maxAbsDiff = horizontalMaximum( maxAbs ); + res.avgDiffSquared = (float)( horizontalSum( sumSquares ) / (double)(int64_t)count ); + res.length = count; + return res; + } + }; + + template<class E> + static sTensorDiff __declspec( noinline ) diffVectors( const E* a, const E* b, size_t length ) + { + // const E* const aEnd = a + length; + const E* const aEndAligned = a + ( length / 8 ) * 8; + const size_t remainder = length % 8; + + DiffAcc acc; + for( ; a < aEndAligned; a += 8, b += 8 ) + acc.add( load( a ), load( b ) ); + + if( remainder != 0 ) + { + __m256 va, vb; + loadPartial( a, b, remainder, va, vb ); + acc.add( va, vb ); + } + + return acc.reduce( length ); + } +} + +sTensorDiff DirectCompute::computeDiff( const float* a, const float* b, size_t length ) +{ + return diffVectors( a, b, length ); +} + +sTensorDiff DirectCompute::computeDiff( const uint16_t* a, const uint16_t* b, size_t length ) +{ + return diffVectors( a, b, length ); +} + +void DirectCompute::sTensorDiff::print() const +{ + printf( "%zu elements, maxAbsDiff = %g, avgDiffSquared = %g\n", length, maxAbsDiff, avgDiffSquared ); +}
\ No newline at end of file |
