summaryrefslogtreecommitdiffstats
path: root/Tools
diff options
context:
space:
mode:
authorKonstantin <const@const.me>2023-01-16 14:52:43 +0100
committerKonstantin <const@const.me>2023-01-16 14:52:43 +0100
commit8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree714dc6fc9a1672d5fd7f89676b97e10959662abc /Tools
parent990a8d0dbaefc996244097397259e92758b15cce (diff)
Source codes
Diffstat (limited to 'Tools')
-rw-r--r--Tools/CompressShaders/Cabinet.cs60
-rw-r--r--Tools/CompressShaders/CompressShaders.cs244
-rw-r--r--Tools/CompressShaders/CompressShaders.csproj10
-rw-r--r--Tools/CompressShaders/DetectFp64.cs43
-rw-r--r--Tools/CompressShaders/LanguageCodes.cs103
-rw-r--r--Tools/CompressShaders/Readme.txt10
-rw-r--r--Tools/CompressShaders/ShaderNames.cs27
-rw-r--r--Tools/compareTraces/CommandLineArgs.cpp51
-rw-r--r--Tools/compareTraces/CommandLineArgs.h9
-rw-r--r--Tools/compareTraces/Readme.txt9
-rw-r--r--Tools/compareTraces/TraceReader.cpp46
-rw-r--r--Tools/compareTraces/TraceReader.h35
-rw-r--r--Tools/compareTraces/compare.cpp364
-rw-r--r--Tools/compareTraces/compare.h4
-rw-r--r--Tools/compareTraces/compareTraces.cpp16
-rw-r--r--Tools/compareTraces/compareTraces.vcxproj103
-rw-r--r--Tools/compareTraces/compareTraces.vcxproj.filters20
-rw-r--r--Tools/compareTraces/stdafx.cpp30
-rw-r--r--Tools/compareTraces/stdafx.h40
-rw-r--r--Tools/compareTraces/testUtils.cpp224
20 files changed, 1448 insertions, 0 deletions
diff --git a/Tools/CompressShaders/Cabinet.cs b/Tools/CompressShaders/Cabinet.cs
new file mode 100644
index 0000000..b53fd18
--- /dev/null
+++ b/Tools/CompressShaders/Cabinet.cs
@@ -0,0 +1,60 @@
+using System.ComponentModel;
+using System.Runtime.InteropServices;
+
+namespace CompressShaders
+{
+ /// <summary>Lossless data compressor implemented by <c>Cabinet.dll</c> Windows component</summary>
+ /// <remarks>
+ /// <para>Whisper.dll consumes that component in runtime, to decompress these shader binaries</para>
+ /// <para>If you wonder why not gzip — because the OS doesn’t include an API for that, at least not an API usable from C or C++.<br/>
+ /// .NET standard library includes gzip algorithm, but we don't want Whisper.dll to depend on .NET.</para>
+ /// </remarks>
+ static class Cabinet
+ {
+ /// <summary>Compression algorithm</summary>
+ /// <seealso href="https://learn.microsoft.com/en-us/windows/win32/cmpapi/using-the-compression-api#selecting-the-compression-algorithm" />
+ enum eCompressionAlgorithm: uint
+ {
+ MSZIP = 2,
+ XPRESS = 3,
+ XPRESS_HUFF = 4,
+ LZMS = 5,
+ }
+ /// <summary>The value should match <c>constexpr DWORD compressionAlgorithm</c> constant,<br/>in <c>Whisper/D3D/shaders.cpp</c> source file</summary>
+ const eCompressionAlgorithm algo = eCompressionAlgorithm.MSZIP;
+
+ [DllImport( "Cabinet.dll", SetLastError = true )]
+ static extern bool CreateCompressor( eCompressionAlgorithm Algorithm, IntPtr AllocationRoutines, out IntPtr CompressorHandle );
+
+ [DllImport( "Cabinet.dll", SetLastError = true )]
+ static extern bool CloseCompressor( IntPtr CompressorHandle );
+
+ [DllImport( "Cabinet.dll", SetLastError = true )]
+ static extern bool Compress( IntPtr CompressorHandle, [In] byte[] UncompressedData, IntPtr UncompressedDataSize, [Out] byte[] CompressedBuffer, IntPtr CompressedBufferSize, out IntPtr CompressedDataSize );
+
+ /// <summary>Compress an array of bytes into another, smaller array of bytes</summary>
+ /// <remarks>In practice, the compression ratio is about 7.1 for the shader binaries in Release configuration.</remarks>
+ public static byte[] compressBuffer( byte[] src )
+ {
+ if( src.Length <= 0 )
+ throw new ArgumentException( "The source buffer is empty" );
+ IntPtr hCompressor;
+ if( !CreateCompressor( algo, IntPtr.Zero, out hCompressor ) )
+ throw new Win32Exception( "Unable to create the compressor" );
+ try
+ {
+ byte[] dest = new byte[ src.Length * 2 ];
+ IntPtr srcSize = new IntPtr( src.Length );
+ IntPtr destSize = new IntPtr( src.Length * 2 );
+ if( !Compress( hCompressor, src, srcSize, dest, destSize, out destSize ) )
+ throw new Win32Exception( "Compress failed" );
+ Array.Resize( ref dest, (int)destSize );
+ return dest;
+ }
+ finally
+ {
+ CloseCompressor( hCompressor );
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/Tools/CompressShaders/CompressShaders.cs b/Tools/CompressShaders/CompressShaders.cs
new file mode 100644
index 0000000..814f966
--- /dev/null
+++ b/Tools/CompressShaders/CompressShaders.cs
@@ -0,0 +1,244 @@
+using System.Runtime.CompilerServices;
+namespace CompressShaders;
+
+record struct sShaderBinary
+{
+ public string name;
+ public byte[] data;
+
+ public sShaderBinary( string path )
+ {
+ name = Path.GetFileNameWithoutExtension( path );
+ data = File.ReadAllBytes( path );
+ }
+
+ public bool wave64 => name.EndsWith( "64" );
+ public string uniqueName => wave64 ? name.Substring( 0, name.Length - 2 ) : name;
+}
+
+sealed class FoundShaders
+{
+ public readonly sShaderBinary[] binaries;
+ public readonly string[] names;
+ public readonly int[] wave32, wave64;
+
+ public FoundShaders( IEnumerable<sShaderBinary> found )
+ {
+ binaries = found
+ .OrderBy( b => b.name )
+ .ToArray();
+
+ names = binaries
+ .Select( b => b.uniqueName )
+ .Distinct()
+ .ToArray();
+
+ wave32 = new int[ names.Length ];
+ wave64 = new int[ names.Length ];
+ for( int i = 0; i < names.Length; i++ )
+ {
+ int i32 = findIndex( names[ i ], false );
+ int i64 = findIndex( names[ i ], true );
+ if( i32 >= 0 && i64 >= 0 )
+ {
+ wave32[ i ] = i32;
+ wave64[ i ] = i64;
+ continue;
+ }
+ if( i32 >= 0 )
+ {
+ wave32[ i ] = wave64[ i ] = i32;
+ continue;
+ }
+ throw new ApplicationException( $"Wave64 shader {names[ i ]} doesn't have the corresponding Wave32 one" );
+ }
+ }
+
+ int findIndex( string name, bool wave64 )
+ {
+ for( int i = 0; i < binaries.Length; i++ )
+ {
+ sShaderBinary sb = binaries[ i ];
+ if( sb.uniqueName != name )
+ continue;
+ if( sb.wave64 == wave64 )
+ return i;
+ }
+ return -1;
+ }
+}
+
+class Program
+{
+ static string getSolutionRoot( [CallerFilePath] string? path = null )
+ {
+ string? dir = Path.GetDirectoryName( path );
+ dir = Path.GetDirectoryName( dir );
+ dir = Path.GetDirectoryName( dir );
+ return dir ?? throw new ApplicationException();
+ }
+
+#if DEBUG
+ const string config = "Debug";
+#else
+ const string config = "Release";
+#endif
+
+ static string shadersBinDir( string root )
+ {
+ return Path.Combine( root, "ComputeShaders", "x64", config );
+ }
+
+ static IEnumerable<sShaderBinary> readShaders( string root )
+ {
+ string dir = shadersBinDir( root );
+ foreach( string path in Directory.EnumerateFiles( dir, "*.cso" ) )
+ yield return new sShaderBinary( path );
+ }
+
+ static void writeHeader( string root, IEnumerable<string> names )
+ {
+ string path = Path.Combine( root, "Whisper", "D3D", "shaderNames.h" );
+ using var stream = File.CreateText( path );
+ stream.WriteLine( @"// This header is generated by a tool
+#pragma once
+#include <stdint.h>
+
+namespace DirectCompute
+{
+ enum struct eComputeShader: uint16_t
+ {" );
+
+ int id = 0;
+ foreach( string name in names )
+ {
+ stream.WriteLine( "\t\t{0} = {1},", name, id );
+ id++;
+ }
+ stream.Write( @" };
+
+ const char* computeShaderName( eComputeShader cs );
+}" );
+ }
+
+ static void writeCpp( string root, IEnumerable<string> names )
+ {
+ string path = Path.Combine( root, "Whisper", "D3D", "shaderNames.cpp" );
+ ShaderNames.write( path, names );
+ }
+
+ static void writePayloadIDs( StreamWriter stream, string varName, int[] ids )
+ {
+ stream.Write( @"
+static const std::array<uint8_t, {0}> {1} = {{", ids.Length, varName );
+
+ for( int i = 0; i < ids.Length; i++ )
+ {
+ if( 0 == i % 16 )
+ stream.Write( "\r\n\t" );
+ else
+ stream.Write( ' ' );
+ stream.Write( "{0},", ids[ i ] );
+ }
+ stream.Write( @"
+};" );
+ }
+
+ static void writePayload( string root, FoundShaders shaders, out int cbSource, out int cbCompressed )
+ {
+ MemoryStream ms = new MemoryStream();
+ List<int> offsets = new List<int>();
+ foreach( var bin in shaders.binaries )
+ {
+ offsets.Add( (int)ms.Length );
+ ms.Write( bin.data );
+ }
+ offsets.Add( (int)ms.Length );
+
+ byte[] dxbc = ms.ToArray();
+ byte[] compressed = Cabinet.compressBuffer( dxbc );
+ cbSource = dxbc.Length;
+ cbCompressed = compressed.Length;
+
+ string path = Path.Combine( root, "Whisper", "D3D", $"shaderData-{config}.inl" );
+ using var stream = File.CreateText( path );
+ stream.Write( @"// This source file is generated by a tool
+
+// This array contains concatenated and compressed DXBC binaries for all compiled compute shaders
+static const std::array<uint8_t, {0}> s_compressedShaders =
+{{", compressed.Length );
+
+ for( int i = 0; i < compressed.Length; i++ )
+ {
+ if( 0 == i % 16 )
+ stream.Write( "\r\n\t" );
+ else
+ stream.Write( ' ' );
+ stream.Write( "0x{0:X02},", compressed[ i ] );
+ }
+
+ stream.Write( @"
+}};
+
+// This array contains start offsets of shader binaries in the decompressed DXBC blob.
+// It includes one more entry for the end of the complete decompressed blob.
+static const std::array<uint32_t, {0}> s_shaderOffsets = {{", offsets.Count );
+
+ for( int i = 0; i < offsets.Count; i++ )
+ {
+ if( 0 == i % 16 )
+ stream.Write( "\r\n\t" );
+ else
+ stream.Write( ' ' );
+ stream.Write( "{0},", offsets[ i ] );
+ }
+ stream.Write( @"
+};" );
+
+ stream.Write( @"
+// Index = eComputeShader enum value, value = index of the shader binary to use on nVidia and Intel GPUs" );
+ writePayloadIDs( stream, "s_shaderBlobs32", shaders.wave32 );
+ stream.Write( @"
+// Index = eComputeShader enum value, value = index of the shader binary to use on AMD GPUs" );
+ writePayloadIDs( stream, "s_shaderBlobs64", shaders.wave64 );
+
+ ulong fp64Flags = 0;
+ for( int i = 0; i < shaders.binaries.Length; i++ )
+ {
+ bool fp64 = DetectFp64.usesFp64( shaders.binaries[ i ].data );
+ if( fp64 )
+ fp64Flags |= (ulong)1 << i;
+ }
+
+ stream.Write( @"
+// Bitmap of the shader binaries which use FP64 arithmetic instructions
+constexpr uint64_t fp64ShadersBitmap = 0x{0:X}ull;", fp64Flags );
+ }
+
+ static void mainImpl()
+ {
+ string root = getSolutionRoot();
+ LanguageCodes.produce( root );
+
+ FoundShaders shaders = new FoundShaders( readShaders( root ) );
+
+ writeHeader( root, shaders.names );
+ writeCpp( root, shaders.names );
+ writePayload( root, shaders, out int cbIn, out int cbOut );
+ Console.WriteLine( "Compressed {0} compute shaders, {1:F1} kb -> {2:F1} kb", shaders.binaries.Length, cbIn / 1024.0, cbOut / 1024.0 );
+ }
+
+ static int Main( string[] args )
+ {
+ try
+ {
+ mainImpl();
+ return 0;
+ }
+ catch( Exception ex )
+ {
+ Console.WriteLine( ex.Message );
+ return ex.HResult;
+ }
+ }
+} \ No newline at end of file
diff --git a/Tools/CompressShaders/CompressShaders.csproj b/Tools/CompressShaders/CompressShaders.csproj
new file mode 100644
index 0000000..dee1710
--- /dev/null
+++ b/Tools/CompressShaders/CompressShaders.csproj
@@ -0,0 +1,10 @@
+<Project Sdk="Microsoft.NET.Sdk">
+ <PropertyGroup>
+ <OutputType>Exe</OutputType>
+ <TargetFramework>net6.0</TargetFramework>
+ <ImplicitUsings>enable</ImplicitUsings>
+ <Nullable>enable</Nullable>
+ <CheckForOverflowUnderflow>true</CheckForOverflowUnderflow>
+ <AppendTargetFrameworkToOutputPath>false</AppendTargetFrameworkToOutputPath>
+ </PropertyGroup>
+</Project> \ No newline at end of file
diff --git a/Tools/CompressShaders/DetectFp64.cs b/Tools/CompressShaders/DetectFp64.cs
new file mode 100644
index 0000000..1d75126
--- /dev/null
+++ b/Tools/CompressShaders/DetectFp64.cs
@@ -0,0 +1,43 @@
+#pragma warning disable CS0649
+using System.Runtime.InteropServices;
+
+namespace CompressShaders
+{
+ static class DetectFp64
+ {
+ struct DXBCHeader
+ {
+ public uint FourCC; // Four character code "DXBC"
+ public uint Hash0; // 32-bit hash of the DXBC file
+ public uint Hash1; // 32-bit hash of the DXBC file
+ public uint Hash2; // 32-bit hash of the DXBC file
+ public uint Hash3; // 32-bit hash of the DXBC file
+ public uint unknownOne;
+ public uint TotalSize; // Total size of the DXBC file in bytes
+ public int NumChunks; // Number of chunks in the DXBC file
+ };
+
+ public static bool usesFp64( ReadOnlySpan<byte> dxbc )
+ {
+ ReadOnlySpan<DXBCHeader> dxbcHeaderSpan = MemoryMarshal.Cast<byte, DXBCHeader>( dxbc );
+ DXBCHeader dxbcHeader = dxbcHeaderSpan[ 0 ];
+
+ int cbHeader = Marshal.SizeOf<DXBCHeader>();
+ int nChunks = dxbcHeader.NumChunks;
+ ReadOnlySpan<int> chunkOffsets = MemoryMarshal.Cast<byte, int>( dxbc.Slice( cbHeader, nChunks * 4 ) );
+ foreach( int off in chunkOffsets )
+ {
+ uint id = MemoryMarshal.Cast<byte, uint>( dxbc.Slice( off, 4 ) )[ 0 ];
+ const uint SFI0 = 0x30494653;
+ if( id != SFI0 )
+ continue;
+ int size = MemoryMarshal.Cast<byte, int>( dxbc.Slice( off + 4, 4 ) )[ 0 ];
+ if( size < 4 )
+ throw new ApplicationException();
+ uint data = MemoryMarshal.Cast<byte, uint>( dxbc.Slice( off + 8, 4 ) )[ 0 ];
+ return 0 != ( data & 1u );
+ }
+ return false;
+ }
+ }
+} \ No newline at end of file
diff --git a/Tools/CompressShaders/LanguageCodes.cs b/Tools/CompressShaders/LanguageCodes.cs
new file mode 100644
index 0000000..71a9909
--- /dev/null
+++ b/Tools/CompressShaders/LanguageCodes.cs
@@ -0,0 +1,103 @@
+using System.Globalization;
+using System.Text.RegularExpressions;
+
+namespace CompressShaders
+{
+ static class LanguageCodes
+ {
+ record struct Row
+ {
+ public string keySource;
+ public uint keyValue;
+ public int code;
+ public string name;
+ }
+
+ static uint makeKey( string str )
+ {
+ if( str.Length > 4 )
+ throw new ArgumentException();
+ uint k = 0;
+ int shift = 0;
+ foreach( char c in str )
+ {
+ if( c >= 0x80 )
+ throw new ArgumentException();
+ uint u = (uint)c;
+ k |= ( u << shift );
+ shift += 8;
+ }
+ return k;
+ }
+
+ static IEnumerable<Row> load( string path )
+ {
+ using var stm = File.OpenText( path );
+ while( true )
+ {
+ string? line = stm.ReadLine();
+ if( null == line )
+ break;
+ if( string.IsNullOrWhiteSpace( line ) )
+ continue;
+ string[] fields = line.Split( '\t' );
+ yield return new Row()
+ {
+ keySource = fields[ 0 ],
+ keyValue = makeKey( fields[ 0 ] ),
+ code = int.Parse( fields[ 1 ] ),
+ name = fields[ 2 ]
+ };
+ }
+ }
+
+ static void writeCpp( string inl, Row[] data )
+ {
+ // TODO [very low]: sort them by the key here, then in C++ use binary search instead of the hash map
+ using var stm = File.CreateText( inl );
+ stm.WriteLine( "// This file is generated by a tool, from the `languageCodez.tsv` file in this repository" );
+ foreach( Row row in data )
+ stm.WriteLine( "Lang{{ 0x{0:X}, {1}, \"{2}\" }},", row.keyValue, row.code, row.name );
+ }
+
+ static readonly CultureInfo ci = new CultureInfo( "en-US", false );
+ static string titleCase( this string name ) =>
+ ci.TextInfo.ToTitleCase( name.ToLower( ci ) );
+
+ static void writeCs( string cs, Row[] data )
+ {
+ using var stm = File.CreateText( cs );
+ stm.WriteLine( @"// This file is generated by a tool, from the `languageCodez.tsv` file in this repository
+namespace Whisper
+{
+ /// <summary>Supported languages</summary>
+ public enum eLanguage: uint
+ {" );
+
+ foreach( Row row in data )
+ {
+ string tc = row.name.titleCase();
+ stm.WriteLine( " /// <summary>{0}</summary>", tc );
+ tc = Regex.Replace( tc, @"\s+", string.Empty );
+ stm.WriteLine( " {0} = 0x{1:X},", tc, row.keyValue );
+ }
+ stm.Write( @" }
+}" );
+ }
+
+ static void produce( string tsv, string inl, string cs )
+ {
+ Row[] data = load( tsv ).OrderBy( r => r.name ).ToArray();
+ writeCpp( inl, data );
+ writeCs( cs, data );
+ }
+
+ public static void produce( string solutionRoot )
+ {
+ string tsv = Path.Combine( solutionRoot, "Whisper\\Whisper\\languageCodez.tsv" );
+ string inl = Path.Combine( solutionRoot, "Whisper\\Whisper\\languageCodez.inl" );
+ string cs = Path.Combine( solutionRoot, "WhisperNet\\API\\eLanguage.cs" );
+ produce( tsv, inl, cs );
+ }
+ }
+} \ No newline at end of file
diff --git a/Tools/CompressShaders/Readme.txt b/Tools/CompressShaders/Readme.txt
new file mode 100644
index 0000000..69ef35a
--- /dev/null
+++ b/Tools/CompressShaders/Readme.txt
@@ -0,0 +1,10 @@
+This project builds a C# console app which serves as a code generator for a few pieces of Whisper.dll and WhisperNet.dll.
+
+Specifically, it generates two things.
+
+1. It compresses the compiled DXBC shaders into a blob of bytes, and prints std::array with these bytes into shaderData-Release.inl and shaderData-Debug.inl C++ files.
+
+2. It parses the `languageCodez.tsv`, and generates both C++ and C# code with the data from that table.
+
+The tool uses relative paths across source files.
+These paths will break if you move the source of the tool, or the source data of the tool. \ No newline at end of file
diff --git a/Tools/CompressShaders/ShaderNames.cs b/Tools/CompressShaders/ShaderNames.cs
new file mode 100644
index 0000000..81ba46e
--- /dev/null
+++ b/Tools/CompressShaders/ShaderNames.cs
@@ -0,0 +1,27 @@
+static class ShaderNames
+{
+ public static void write( string path, IEnumerable<string> names )
+ {
+ string[] arr = names.ToArray();
+ using var stream = File.CreateText( path );
+ stream.WriteLine( @"// This source file is generated by a tool
+#include ""stdafx.h""
+#include ""shaderNames.h""
+" );
+
+ stream.WriteLine( "static const std::array<const char*, {0}> s_shaderNames = ", arr.Length );
+ stream.WriteLine( "{" );
+ foreach( string name in arr )
+ stream.WriteLine( @" ""{0}"",", name );
+
+ stream.Write( @"};
+
+const char* DirectCompute::computeShaderName( eComputeShader cs )
+{
+ const uint16_t i = (uint16_t)cs;
+ if( i < s_shaderNames.size() )
+ return s_shaderNames[ i ];
+ return nullptr;
+}" );
+ }
+} \ No newline at end of file
diff --git a/Tools/compareTraces/CommandLineArgs.cpp b/Tools/compareTraces/CommandLineArgs.cpp
new file mode 100644
index 0000000..5f26bdb
--- /dev/null
+++ b/Tools/compareTraces/CommandLineArgs.cpp
@@ -0,0 +1,51 @@
+#include "stdafx.h"
+#include "CommandLineArgs.h"
+#include <charconv>
+
+static bool printUsage()
+{
+ fprintf( stderr, "Usage: compareTraces.exe trace1.bin trace2.bin [-diff N]\n" );
+ return false;
+}
+
+bool CommandLineArgs::parse( int argc, wchar_t* argv[] )
+{
+ size_t idx = 0;
+ CString sw;
+ CStringA tmp;
+ for( int i = 1; i < argc; i++ )
+ {
+ if( argv[ i ][ 0 ] != L'-' )
+ {
+ if( idx >= 2 )
+ return printUsage();
+ inputs[ idx ] = argv[ i ];
+ idx++;
+ continue;
+ }
+ sw = argv[ i ];
+ if( 0 == sw.CompareNoCase( L"-diff" ) )
+ {
+ i++;
+ if( i >= argc )
+ return printUsage();
+ tmp.Format( "%S", argv[ i ] );
+ tmp.Trim();
+ uint64_t v;
+ auto res = std::from_chars( tmp, cstr( tmp ) + tmp.GetLength(), v );
+ if( res.ec != (std::errc)0 )
+ {
+ fprintf( stderr, "Unable to parse string into number\n" );
+ return false;
+ }
+ printDiff = v;
+ continue;
+ }
+ return printUsage();
+ }
+
+ if( idx != 2 )
+ return printUsage();
+
+ return true;
+} \ No newline at end of file
diff --git a/Tools/compareTraces/CommandLineArgs.h b/Tools/compareTraces/CommandLineArgs.h
new file mode 100644
index 0000000..d434e76
--- /dev/null
+++ b/Tools/compareTraces/CommandLineArgs.h
@@ -0,0 +1,9 @@
+#pragma once
+
+struct CommandLineArgs
+{
+ int64_t printDiff = -1;
+ std::array<CString, 2> inputs;
+
+ bool parse( int argc, wchar_t* argv[] );
+}; \ No newline at end of file
diff --git a/Tools/compareTraces/Readme.txt b/Tools/compareTraces/Readme.txt
new file mode 100644
index 0000000..035d658
--- /dev/null
+++ b/Tools/compareTraces/Readme.txt
@@ -0,0 +1,9 @@
+This project builds a C++ console tool which compares debug traces of the model.
+
+Tracing files easily exceed 1GB, and by default they’re disabled with a preprocessor macro in stdafx.h of the Whisper project.
+
+When enabled, the main GPU implementation saves a trace into C:\Temp\2remove\Whisper\gpu.bin
+
+The reference CPU implementation saves a trace into C:\Temp\2remove\Whisper\ref.bin
+
+This code in this project is optimized for development speed. For this reason it requires AVX2 CPU, uses memory-mapped IO instead of proper parsing, and checks little to no errors. \ No newline at end of file
diff --git a/Tools/compareTraces/TraceReader.cpp b/Tools/compareTraces/TraceReader.cpp
new file mode 100644
index 0000000..b4b9681
--- /dev/null
+++ b/Tools/compareTraces/TraceReader.cpp
@@ -0,0 +1,46 @@
+#include "stdafx.h"
+#include "TraceReader.h"
+using namespace Tracing;
+
+const sTraceItem& TraceReader::operator[]( size_t idx ) const
+{
+ if( idx >= countItems )
+ throw E_BOUNDS;
+ return items[ idx ];
+}
+
+CStringA TraceReader::getName( const sTraceItem& item ) const
+{
+ const size_t idx = item.stringIndex;
+ if( idx >= countStrings )
+ throw E_BOUNDS;
+ const char* const source = stringData + stringIndex[ idx ];
+ CStringA res;
+ res.Format( source, item.formatArgs[ 0 ], item.formatArgs[ 1 ], item.formatArgs[ 2 ], item.formatArgs[ 3 ] );
+ return res;
+}
+
+HRESULT TraceReader::open( LPCTSTR path )
+{
+ CHECK( file.Create( path, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING ) );
+ CHECK( mapping.MapFile( file ) );
+
+ const uint8_t* rsi = mapping;
+ const sFileHeader& header = *(const sFileHeader*)rsi;
+ if( header.magic != header.correctMagic )
+ return E_INVALIDARG;
+ countItems = header.countItems;
+ countStrings = header.countStrings;
+
+ rsi += sizeof( sFileHeader );
+ payloadPointer = rsi;
+
+ rsi += header.bytesPayload;
+ stringIndex = (const uint32_t*)( rsi );
+ stringData = (const char*)( rsi + countStrings * 4 );
+
+ rsi += header.bytesStrings;
+ items = (const sTraceItem*)rsi;
+
+ return S_OK;
+} \ No newline at end of file
diff --git a/Tools/compareTraces/TraceReader.h b/Tools/compareTraces/TraceReader.h
new file mode 100644
index 0000000..8d1e1f2
--- /dev/null
+++ b/Tools/compareTraces/TraceReader.h
@@ -0,0 +1,35 @@
+#pragma once
+#include "../../Whisper/Utils/Trace/TraceStructures.h"
+#include <atlstr.h>
+#include <atlfile.h>
+
+namespace Tracing
+{
+ class TraceReader
+ {
+ const uint8_t* payloadPointer = nullptr;
+ const sTraceItem* items = nullptr;
+ size_t countItems = 0;
+ size_t countStrings = 0;
+ const uint32_t* stringIndex = nullptr;
+ const char* stringData = nullptr;
+
+ CAtlFile file;
+ CAtlFileMapping<uint8_t> mapping;
+
+ public:
+
+ TraceReader() = default;
+ ~TraceReader() = default;
+
+ HRESULT open( LPCTSTR path );
+ size_t size() const { return countItems; }
+ const sTraceItem& operator[]( size_t idx ) const;
+ CStringA getName( const sTraceItem& item ) const;
+
+ const void* payload( const sTraceItem& item ) const
+ {
+ return payloadPointer + item.payloadOffset;
+ }
+ };
+} \ No newline at end of file
diff --git a/Tools/compareTraces/compare.cpp b/Tools/compareTraces/compare.cpp
new file mode 100644
index 0000000..ec2a6ef
--- /dev/null
+++ b/Tools/compareTraces/compare.cpp
@@ -0,0 +1,364 @@
+#include "stdafx.h"
+#include "../../Whisper/API/iContext.cl.h"
+#include "TraceReader.h"
+#include "../../Whisper/ML/testUtils.h"
+#include "compare.h"
+using namespace Tracing;
+using namespace DirectCompute;
+
+namespace
+{
+ inline const char* cstr( eItemType it )
+ {
+ switch( it )
+ {
+ case eItemType::Buffer: return "Buffer";
+ case eItemType::Tensor: return "Tensor";
+ }
+ throw E_INVALIDARG;
+ }
+ inline const char* cstr( const CStringA& s ) { return s; }
+
+ inline int tensorDims( __m128i vec )
+ {
+ const __m128i one = _mm_set1_epi32( 1 );
+ const uint32_t bitmapOnes = (uint32_t)_mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( vec, one ) ) );
+ const uint32_t bitmapNotOnes = bitmapOnes ^ 0b1111u;
+ unsigned long idx;
+ if( !_BitScanReverse( &idx, bitmapNotOnes ) )
+ return 0;
+ return idx + 1;
+ }
+
+ int printSize( __m128i vec )
+ {
+ const int sz = tensorDims( vec );
+ switch( sz )
+ {
+ case 0:
+ printf( "[ scalar ]" );
+ break;
+ case 1:
+ printf( "[ %i ]", _mm_cvtsi128_si32( vec ) );
+ break;
+ case 2:
+ printf( "[ %i, %i ]", _mm_cvtsi128_si32( vec ), _mm_extract_epi32( vec, 1 ) );
+ break;
+ case 3:
+ printf( "[ %i, %i, %i ]", _mm_cvtsi128_si32( vec ), _mm_extract_epi32( vec, 1 ), _mm_extract_epi32( vec, 2 ) );
+ break;
+ case 4:
+ printf( "[ %i, %i, %i, %i ]", _mm_cvtsi128_si32( vec ), _mm_extract_epi32( vec, 1 ), _mm_extract_epi32( vec, 2 ), _mm_extract_epi32( vec, 3 ) );
+ break;
+ default:
+ throw E_UNEXPECTED;
+ }
+ return sz;
+ }
+
+ class Comparer
+ {
+ TraceReader& readerA;
+ TraceReader& readerB;
+
+ bool diffBuffers( size_t i, const sTraceItem& a, const sTraceItem& b, const CStringA& name )
+ {
+ const size_t lenA = *(const uint64_t*)a.size.data();
+ const size_t lenB = *(const uint64_t*)b.size.data();
+ if( lenA != lenB )
+ {
+ printf( "Buffer %zu \"%s\": different size, %zu in trace A, %zu in trace B\n", i, cstr( name ), lenA, lenB );
+ return false;
+ }
+ if( a.dataType != b.dataType )
+ {
+ printf( "Buffer %zu \"%s\": different data types\n", i, cstr( name ) );
+ return false;
+ }
+
+ switch( a.dataType )
+ {
+ case eDataType::FP32:
+ return buffersFp32( i, name, (const float*)readerA.payload( a ), (const float*)readerB.payload( b ), lenA );
+ }
+ throw E_NOTIMPL;
+ }
+
+ bool diffTensors( size_t i, const sTraceItem& a, const sTraceItem& b, const CStringA& name )
+ {
+ const __m128i ne1 = load( a.size );
+ const __m128i ne2 = load( b.size );
+ if( !vectorEqual( ne1, ne2 ) )
+ {
+ printf( "Tensor %zu \"%s\" - different size: trace A size is ", i, cstr( name ) );
+ printSize( ne1 );
+ printf( ", trace B size is " );
+ printSize( ne2 );
+ printf( "\n" );
+ return false;
+ }
+
+ const __m128i stride1 = load( a.stride );
+ const __m128i stride2 = load( b.stride );
+ if( !vectorEqual( stride1, stride2 ) )
+ {
+ printf( "Tensor %zu \"%s\" - different memory layout\n", i, cstr( name ) );
+ return false;
+ }
+
+ if( a.dataType != b.dataType )
+ {
+ printf( "Tensor %zu \"%s\": different data types\n", i, cstr( name ) );
+ return false;
+ }
+
+ size_t elements = (uint32_t)_mm_cvtsi128_si32( ne1 );
+ elements *= (uint32_t)_mm_extract_epi32( ne1, 1 );
+ elements *= (uint32_t)_mm_extract_epi32( ne1, 2 );
+ elements *= (uint32_t)_mm_extract_epi32( ne1, 3 );
+
+ switch( a.dataType )
+ {
+ case eDataType::FP32:
+ return tensorsFp32( i, name, (const float*)readerA.payload( a ), (const float*)readerB.payload( b ), elements, ne1, stride1 );
+ }
+ throw E_NOTIMPL;
+ }
+
+ protected:
+ virtual bool buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length ) = 0;
+ virtual bool tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb ) = 0;
+
+ public:
+
+ Comparer( TraceReader& t1, TraceReader& t2 ) :
+ readerA( t1 ), readerB( t2 ) { }
+
+ bool compare( size_t i )
+ {
+ const sTraceItem& a = readerA[ i ];
+ const sTraceItem& b = readerB[ i ];
+ CStringA name1 = readerA.getName( a );
+ CStringA name2 = readerB.getName( b );
+
+ if( a.itemType != b.itemType )
+ {
+ printf( "Item %zu: different type, trace A %s \"%s\", trace B %s \"%s\"\n", i,
+ cstr( a.itemType ), cstr( name1 ), cstr( b.itemType ), cstr( name2 ) );
+ return false;
+ }
+
+ if( name1 != name2 )
+ {
+ printf( "%s %zu: different names, they are \"%s\" and \"%s\"\n", cstr( a.itemType ), i, cstr( name1 ), cstr( name2 ) );
+ return false;
+ }
+
+ switch( a.itemType )
+ {
+ case eItemType::Buffer:
+ return diffBuffers( i, a, b, name1 );
+ case eItemType::Tensor:
+ return diffTensors( i, a, b, name1 );
+ default:
+ throw E_INVALIDARG;
+ }
+ }
+ };
+
+ class PrintSummary : public Comparer
+ {
+ bool buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length ) override;
+ bool tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb ) override;
+
+ public:
+ PrintSummary( TraceReader& a, TraceReader& b ) : Comparer( a, b ) { }
+ };
+
+ bool PrintSummary::buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length )
+ {
+ sTensorDiff diff = computeDiff( a, b, length );
+ printf( "%s %zu \"%s\": ", cstr( eItemType::Buffer ), idx, cstr( name ) );
+ diff.print();
+ return true;
+ }
+
+ bool PrintSummary::tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb )
+ {
+ printSize( ne );
+ printf( " " );
+ sTensorDiff diff = computeDiff( a, b, length );
+ printf( "%s %zu \"%s\": ", cstr( eItemType::Tensor ), idx, cstr( name ) );
+ diff.print();
+ return true;
+ }
+
+ class PrintDiff : public Comparer
+ {
+ bool buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length ) override;
+ bool tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb ) override;
+ public:
+ PrintDiff( TraceReader& a, TraceReader& b ) : Comparer( a, b ) { }
+ };
+
+ bool PrintDiff::buffersFp32( size_t idx, const CStringA& name, const float* A, const float* B, size_t length )
+ {
+ printf( "idx\tA\tB\tA(hex)\tB(hex)\tdiff\n" );
+ for( size_t i = 0; i < length; i++ )
+ {
+ const float a = *A;
+ const float b = *B;
+ __m128 vf = _mm_setr_ps( a, b, 0, 0 );
+ __m128i vi = _mm_castps_si128( vf );
+ const float diff = std::abs( a - b );
+ printf( "%zu\t%g\t%g\t0x%08X\t0x%08X\t%g\n",
+ i, a, b, _mm_cvtsi128_si32( vi ), _mm_extract_epi32( vi, 1 ), diff );
+ }
+ return true;
+ }
+
+ std::array<uint32_t, 4> storeSize( __m128i v )
+ {
+ std::array<uint32_t, 4> a;
+ _mm_storeu_si128( ( __m128i* )a.data(), v );
+ return a;
+ }
+
+ std::array<size_t, 4> storeStrides( __m128i v )
+ {
+ const __m128i zero = _mm_setzero_si128();
+ std::array<size_t, 4> a;
+ _mm_storeu_si128( ( __m128i* ) & a[ 0 ], _mm_unpacklo_epi32( v, zero ) );
+ _mm_storeu_si128( ( __m128i* ) & a[ 2 ], _mm_unpackhi_epi32( v, zero ) );
+ return a;
+ }
+
+ bool PrintDiff::tensorsFp32( size_t idx, const CStringA& name, const float* A, const float* B, size_t length, __m128i ne, __m128i nb )
+ {
+ const int dims = tensorDims( ne );
+ const std::array<uint32_t, 4> size = storeSize( ne );
+ const std::array<size_t, 4> strides = storeStrides( ne );
+ CStringA line;
+ if( dims > 4 )
+ throw E_UNEXPECTED;
+
+ for( int i = 0; i < dims; i++ )
+ {
+ const char c = "xyzw"[ i ];
+ line.AppendChar( c );
+ line.AppendChar( '\t' );
+ }
+ line += "A\tB\tA(hex)\tB(hex)\tdiff\n";
+ printf( "%s", cstr( line ) );
+
+ if( 0 == dims )
+ {
+ const float a = *A;
+ const float b = *B;
+ __m128 vf = _mm_setr_ps( a, b, 0, 0 );
+ __m128i vi = _mm_castps_si128( vf );
+ const float diff = std::abs( a - b );
+ printf( "%g\t%g\t0x%08X\t0x%08X\t%g\n",
+ a, b, _mm_cvtsi128_si32( vi ), _mm_extract_epi32( vi, 1 ), diff );
+ return true;
+ }
+
+ size_t offLayer2 = 0;
+ for( uint32_t w = 0; w < size[ 3 ]; w++, offLayer2 += strides[ 3 ] )
+ {
+ size_t offLayer = offLayer2;
+ for( uint32_t z = 0; z < size[ 2 ]; z++, offLayer += strides[ 2 ] )
+ {
+ size_t offRow = offLayer;
+ for( uint32_t y = 0; y < size[ 1 ]; y++, offRow += strides[ 1 ] )
+ {
+ size_t off = offRow;
+ for( uint32_t x = 0; x < size[ 0 ]; x++, off += strides[ 0 ] )
+ {
+ line.Format( "%i\t", x );
+ if( dims > 1 )
+ line.AppendFormat( "%i\t", y );
+ if( dims > 2 )
+ line.AppendFormat( "%i\t", z );
+ if( dims > 3 )
+ line.AppendFormat( "%i\t", w );
+
+ const float a = A[ off ];
+ const float b = B[ off ];
+ __m128 vf = _mm_setr_ps( a, b, 0, 0 );
+ __m128i vi = _mm_castps_si128( vf );
+ const float diff = std::abs( a - b );
+ line.AppendFormat( "%g\t%g\t0x%08X\t0x%08X\t%g\n",
+ a, b, _mm_cvtsi128_si32( vi ), _mm_extract_epi32( vi, 1 ), diff );
+ printf( "%s", cstr( line ) );
+ }
+ }
+ }
+ }
+ return true;
+ }
+}
+
+HRESULT compareTraces( const CommandLineArgs& arguments )
+{
+ const wchar_t* pathA = arguments.inputs[ 0 ];
+ const wchar_t* pathB = arguments.inputs[ 1 ];
+
+ TraceReader a, b;
+ HRESULT hr = a.open( pathA );
+ if( FAILED( hr ) )
+ {
+ fwprintf( stderr, L"Unable to load trace A from \"%s\"", pathA );
+ printError( hr );
+ return hr;
+ }
+
+ hr = b.open( pathB );
+ if( FAILED( hr ) )
+ {
+ fwprintf( stderr, L"Unable to load trace B from \"%s\"", pathA );
+ printError( hr );
+ return hr;
+ }
+
+ wprintf( L"Trace A: %s\n", pathA );
+ wprintf( L"Trace B: %s\n", pathB );
+ const size_t sizeA = a.size();
+ const size_t sizeB = b.size();
+ const size_t count = std::min( sizeA, sizeB );
+
+ if( arguments.printDiff >= 0 )
+ {
+ if( arguments.printDiff >= (int64_t)count )
+ {
+ fprintf( stderr, "Trace A has %zu entries, trace B %zu entries; entry %zu ain't there\n",
+ sizeA, sizeB, (size_t)arguments.printDiff );
+ return E_INVALIDARG;
+ }
+ try
+ {
+ PrintDiff print{ a, b };
+ print.compare( arguments.printDiff );
+ return S_OK;
+ }
+ catch( HRESULT hr )
+ {
+ return hr;
+ }
+ }
+
+ printf( "Trace A has %zu entries, trace B %zu entries, comparing first %zu\n", sizeA, sizeB, count );
+
+ try
+ {
+ PrintSummary print{ a, b };
+ for( size_t i = 0; i < count; i++ )
+ if( !print.compare( i ) )
+ return S_FALSE;
+ return S_OK;
+ }
+ catch( HRESULT hr )
+ {
+ return hr;
+ }
+} \ No newline at end of file
diff --git a/Tools/compareTraces/compare.h b/Tools/compareTraces/compare.h
new file mode 100644
index 0000000..2a4cd86
--- /dev/null
+++ b/Tools/compareTraces/compare.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "CommandLineArgs.h"
+
+HRESULT compareTraces( const CommandLineArgs& arguments ); \ No newline at end of file
diff --git a/Tools/compareTraces/compareTraces.cpp b/Tools/compareTraces/compareTraces.cpp
new file mode 100644
index 0000000..8813500
--- /dev/null
+++ b/Tools/compareTraces/compareTraces.cpp
@@ -0,0 +1,16 @@
+#include "stdafx.h"
+#include <stdio.h>
+#include "compare.h"
+#include "CommandLineArgs.h"
+
+int wmain( int argc, wchar_t* argv[] )
+{
+ CommandLineArgs cla;
+ if( !cla.parse( argc, argv ) )
+ return 1;
+
+ HRESULT hr = compareTraces( cla );
+ if( SUCCEEDED( hr ) )
+ return 0;
+ return hr;
+} \ No newline at end of file
diff --git a/Tools/compareTraces/compareTraces.vcxproj b/Tools/compareTraces/compareTraces.vcxproj
new file mode 100644
index 0000000..a9670b3
--- /dev/null
+++ b/Tools/compareTraces/compareTraces.vcxproj
@@ -0,0 +1,103 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <VCProjectVersion>16.0</VCProjectVersion>
+ <Keyword>Win32Proj</Keyword>
+ <ProjectGuid>{8478a77c-d851-4c63-9511-1770cc82d33e}</ProjectGuid>
+ <RootNamespace>compareTraces</RootNamespace>
+ <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v143</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v143</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="Shared">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <SDLCheck>true</SDLCheck>
+ <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <ConformanceMode>true</ConformanceMode>
+ <LanguageStandard>stdcpp20</LanguageStandard>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <SDLCheck>true</SDLCheck>
+ <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <ConformanceMode>true</ConformanceMode>
+ <LanguageStandard>stdcpp20</LanguageStandard>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="CommandLineArgs.cpp" />
+ <ClCompile Include="compareTraces.cpp" />
+ <ClCompile Include="compare.cpp" />
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="testUtils.cpp" />
+ <ClCompile Include="TraceReader.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="CommandLineArgs.h" />
+ <ClInclude Include="compare.h" />
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="TraceReader.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Text Include="Readme.txt" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/Tools/compareTraces/compareTraces.vcxproj.filters b/Tools/compareTraces/compareTraces.vcxproj.filters
new file mode 100644
index 0000000..1d0d4c3
--- /dev/null
+++ b/Tools/compareTraces/compareTraces.vcxproj.filters
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="compareTraces.cpp" />
+ <ClCompile Include="TraceReader.cpp" />
+ <ClCompile Include="compare.cpp" />
+ <ClCompile Include="stdafx.cpp" />
+ <ClCompile Include="testUtils.cpp" />
+ <ClCompile Include="CommandLineArgs.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="TraceReader.h" />
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="compare.h" />
+ <ClInclude Include="CommandLineArgs.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Text Include="Readme.txt" />
+ </ItemGroup>
+</Project> \ No newline at end of file
diff --git a/Tools/compareTraces/stdafx.cpp b/Tools/compareTraces/stdafx.cpp
new file mode 100644
index 0000000..5c7f6c9
--- /dev/null
+++ b/Tools/compareTraces/stdafx.cpp
@@ -0,0 +1,30 @@
+#include "stdafx.h"
+
+namespace
+{
+ wchar_t* formatMessage( HRESULT hr )
+ {
+ wchar_t* err;
+ if( FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL,
+ hr,
+ MAKELANGID( LANG_NEUTRAL, SUBLANG_DEFAULT ),
+ (LPTSTR)&err,
+ 0,
+ NULL ) )
+ return err;
+ return nullptr;
+ }
+}
+
+void printError( HRESULT hr )
+{
+ const wchar_t* err = formatMessage( hr );
+ if( nullptr != err )
+ {
+ fwprintf( stderr, L"%s\n", err );
+ LocalFree( (HLOCAL)err );
+ }
+ else
+ fprintf( stderr, "Error code %i (0x%08X)\n", hr, hr );
+} \ No newline at end of file
diff --git a/Tools/compareTraces/stdafx.h b/Tools/compareTraces/stdafx.h
new file mode 100644
index 0000000..1e496f3
--- /dev/null
+++ b/Tools/compareTraces/stdafx.h
@@ -0,0 +1,40 @@
+#pragma once
+#include <stdint.h>
+#include <assert.h>
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include <atlstr.h>
+#include <d3d11.h>
+
+#include <vector>
+#include <array>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#define CHECK( hr ) { const HRESULT __hr = ( hr ); if( FAILED( __hr ) ) return __hr; }
+
+inline __m128i load16( const int* rsi )
+{
+ return _mm_loadu_si128( ( const __m128i* )rsi );
+}
+inline __m128i load16( const uint32_t* rsi )
+{
+ return _mm_loadu_si128( ( const __m128i* )rsi );
+}
+inline __m128i load( const std::array<uint32_t, 4>& arr )
+{
+ return load16( arr.data() );
+}
+
+inline bool vectorEqual( __m128i a, __m128i b )
+{
+ __m128i xx = _mm_xor_si128( a, b );
+ return (bool)_mm_testz_si128( xx, xx );
+}
+
+void printError( HRESULT hr );
+
+inline const char* cstr( const CStringA& s ) { return s; }
+inline const wchar_t* cstr( const CString& s ) { return s; } \ No newline at end of file
diff --git a/Tools/compareTraces/testUtils.cpp b/Tools/compareTraces/testUtils.cpp
new file mode 100644
index 0000000..f9fa465
--- /dev/null
+++ b/Tools/compareTraces/testUtils.cpp
@@ -0,0 +1,224 @@
+#include "stdafx.h"
+#include "../../Whisper/ML/testUtils.h"
+#include <immintrin.h>
+using namespace DirectCompute;
+
+namespace
+{
+ using DirectCompute::sTensorDiff;
+
+ __forceinline __m256 load( const float* rsi )
+ {
+ return _mm256_loadu_ps( rsi );
+ }
+
+ __forceinline __m256 load( const uint16_t* rsi )
+ {
+ const __m128i iv = _mm_load_si128( ( const __m128i* )rsi );
+ return _mm256_cvtph_ps( iv );
+ }
+
+ __forceinline void loadPartial( const uint16_t* x, const uint16_t* y, size_t count, __m256& fx, __m256& fy )
+ {
+ __m128i ix, iy;
+ switch( count )
+ {
+ case 1: // load 2 bytes
+ ix = _mm_cvtsi32_si128( *x );
+ iy = _mm_cvtsi32_si128( *y );
+ break;
+ case 2: // load 4 bytes
+ ix = _mm_cvtsi32_si128( *(const int*)x );
+ iy = _mm_cvtsi32_si128( *(const int*)y );
+ break;
+ case 3: // load 6 bytes
+ ix = _mm_cvtsi32_si128( *(const int*)x );
+ iy = _mm_cvtsi32_si128( *(const int*)y );
+ ix = _mm_insert_epi16( ix, x[ 2 ], 2 );
+ iy = _mm_insert_epi16( iy, y[ 2 ], 2 );
+ break;
+ case 4: // load 8 bytes
+ ix = _mm_cvtsi64_si128( *(const int64_t*)x );
+ iy = _mm_cvtsi64_si128( *(const int64_t*)y );
+ break;
+ case 5: // load 10 bytes
+ ix = _mm_cvtsi64_si128( *(const int64_t*)x );
+ iy = _mm_cvtsi64_si128( *(const int64_t*)y );
+ ix = _mm_insert_epi16( ix, x[ 4 ], 4 );
+ iy = _mm_insert_epi16( iy, y[ 4 ], 4 );
+ break;
+ case 6: // load 12 bytes
+ ix = _mm_cvtsi64_si128( *(const int64_t*)x );
+ iy = _mm_cvtsi64_si128( *(const int64_t*)y );
+ ix = _mm_insert_epi32( ix, *(const int*)( x + 4 ), 2 );
+ iy = _mm_insert_epi32( iy, *(const int*)( y + 4 ), 2 );
+ break;
+ case 7: // load 14 bytes
+ ix = _mm_cvtsi64_si128( *(const int64_t*)x );
+ iy = _mm_cvtsi64_si128( *(const int64_t*)y );
+ ix = _mm_insert_epi32( ix, *(const int*)( x + 4 ), 2 );
+ iy = _mm_insert_epi32( iy, *(const int*)( y + 4 ), 2 );
+ ix = _mm_insert_epi16( ix, x[ 6 ], 6 );
+ iy = _mm_insert_epi16( iy, y[ 6 ], 6 );
+ break;
+ default:
+ fx = fy = _mm256_setzero_ps();
+ return;
+ }
+
+ fx = _mm256_cvtph_ps( ix );
+ fy = _mm256_cvtph_ps( iy );
+ }
+
+ inline __m128 loadFloat2( const float* rsi )
+ {
+ return _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) );
+ }
+ inline __m128 loadFloat3( const float* rsi )
+ {
+ __m128 f = loadFloat2( rsi );
+ f = _mm_insert_ps( f, _mm_load_ss( rsi + 2 ), 0x20 );
+ return f;
+ }
+ __forceinline void loadPartial( const float* x, const float* y, size_t count, __m256& fx, __m256& fy )
+ {
+ __m128 low1, high1;
+ __m128 low2, high2;
+ high1 = high2 = _mm_setzero_ps();
+ switch( count )
+ {
+ case 1:
+ low1 = _mm_load_ss( x );
+ low2 = _mm_load_ss( y );
+ break;
+ case 2:
+ low1 = loadFloat2( x );
+ low2 = loadFloat2( y );
+ break;
+ case 3:
+ low1 = loadFloat3( x );
+ low2 = loadFloat3( y );
+ break;
+ case 4:
+ low1 = _mm_loadu_ps( x );
+ low2 = _mm_loadu_ps( y );
+ break;
+ case 5:
+ low1 = _mm_loadu_ps( x );
+ low2 = _mm_loadu_ps( y );
+ high1 = _mm_load_ss( x + 4 );
+ high2 = _mm_load_ss( y + 4 );
+ break;
+ case 6:
+ low1 = _mm_loadu_ps( x );
+ low2 = _mm_loadu_ps( y );
+ high1 = loadFloat2( x + 4 );
+ high2 = loadFloat2( y + 4 );
+ break;
+ case 7: // load 14 bytes
+ low1 = _mm_loadu_ps( x );
+ low2 = _mm_loadu_ps( y );
+ high1 = loadFloat3( x + 4 );
+ high2 = loadFloat3( y + 4 );
+ break;
+ default:
+ fx = fy = _mm256_setzero_ps();
+ return;
+ }
+
+ fx = _mm256_setr_m128( low1, high1 );
+ fy = _mm256_setr_m128( low2, high2 );
+ }
+
+ __forceinline float horizontalMaximum( __m256 v )
+ {
+ __m128 s = _mm256_extractf128_ps( v, 1 );
+ s = _mm_max_ps( s, _mm256_castps256_ps128( v ) );
+ s = _mm_max_ps( s, _mm_movehl_ps( s, s ) );
+ s = _mm_max_ss( s, _mm_movehdup_ps( s ) );
+ return _mm_cvtss_f32( s );
+ }
+
+ __forceinline double horizontalSum( __m256 v )
+ {
+ __m256d d = _mm256_cvtps_pd( _mm256_extractf128_ps( v, 1 ) );
+ d = _mm256_add_pd( d, _mm256_cvtps_pd( _mm256_castps256_ps128( v ) ) );
+
+ __m128d s = _mm256_extractf128_pd( d, 1 );
+ s = _mm_add_pd( s, _mm256_castpd256_pd128( d ) );
+ s = _mm_add_sd( s, _mm_unpackhi_pd( s, s ) );
+ return _mm_cvtsd_f64( s );
+ }
+
+ __m256 maskInfNan( __m256 diff, __m256 a, __m256 b )
+ {
+ __m256i ai = _mm256_castps_si256( a );
+ __m256i bi = _mm256_castps_si256( b );
+ __m256i eqi = _mm256_cmpeq_epi32( ai, bi );
+ __m256 eq = _mm256_castsi256_ps( eqi );
+ return _mm256_andnot_ps( eq, diff );
+ }
+
+ class DiffAcc
+ {
+ __m256 maxAbs = _mm256_setzero_ps();
+ __m256 sumSquares = _mm256_setzero_ps();
+
+ public:
+
+ __forceinline void add( __m256 a, __m256 b )
+ {
+ const __m256 neg0 = _mm256_set1_ps( -0.0f );
+ __m256 diff = _mm256_sub_ps( b, a );
+ diff = maskInfNan( diff, a, b );
+ sumSquares = _mm256_fmadd_ps( diff, diff, sumSquares );
+ const __m256 absDiff = _mm256_andnot_ps( neg0, diff );
+ maxAbs = _mm256_max_ps( maxAbs, absDiff );
+ }
+
+ __forceinline sTensorDiff reduce( size_t count )
+ {
+ sTensorDiff res;
+ res.maxAbsDiff = horizontalMaximum( maxAbs );
+ res.avgDiffSquared = (float)( horizontalSum( sumSquares ) / (double)(int64_t)count );
+ res.length = count;
+ return res;
+ }
+ };
+
+ template<class E>
+ static sTensorDiff __declspec( noinline ) diffVectors( const E* a, const E* b, size_t length )
+ {
+ // const E* const aEnd = a + length;
+ const E* const aEndAligned = a + ( length / 8 ) * 8;
+ const size_t remainder = length % 8;
+
+ DiffAcc acc;
+ for( ; a < aEndAligned; a += 8, b += 8 )
+ acc.add( load( a ), load( b ) );
+
+ if( remainder != 0 )
+ {
+ __m256 va, vb;
+ loadPartial( a, b, remainder, va, vb );
+ acc.add( va, vb );
+ }
+
+ return acc.reduce( length );
+ }
+}
+
+sTensorDiff DirectCompute::computeDiff( const float* a, const float* b, size_t length )
+{
+ return diffVectors( a, b, length );
+}
+
+sTensorDiff DirectCompute::computeDiff( const uint16_t* a, const uint16_t* b, size_t length )
+{
+ return diffVectors( a, b, length );
+}
+
+void DirectCompute::sTensorDiff::print() const
+{
+ printf( "%zu elements, maxAbsDiff = %g, avgDiffSquared = %g\n", length, maxAbsDiff, avgDiffSquared );
+} \ No newline at end of file