summaryrefslogtreecommitdiffstats
path: root/Tools/compareTraces
diff options
context:
space:
mode:
authorKonstantin <const@const.me>2023-01-16 14:52:43 +0100
committerKonstantin <const@const.me>2023-01-16 14:52:43 +0100
commit8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree714dc6fc9a1672d5fd7f89676b97e10959662abc /Tools/compareTraces
parent990a8d0dbaefc996244097397259e92758b15cce (diff)
Source codes
Diffstat (limited to 'Tools/compareTraces')
-rw-r--r--Tools/compareTraces/CommandLineArgs.cpp51
-rw-r--r--Tools/compareTraces/CommandLineArgs.h9
-rw-r--r--Tools/compareTraces/Readme.txt9
-rw-r--r--Tools/compareTraces/TraceReader.cpp46
-rw-r--r--Tools/compareTraces/TraceReader.h35
-rw-r--r--Tools/compareTraces/compare.cpp364
-rw-r--r--Tools/compareTraces/compare.h4
-rw-r--r--Tools/compareTraces/compareTraces.cpp16
-rw-r--r--Tools/compareTraces/compareTraces.vcxproj103
-rw-r--r--Tools/compareTraces/compareTraces.vcxproj.filters20
-rw-r--r--Tools/compareTraces/stdafx.cpp30
-rw-r--r--Tools/compareTraces/stdafx.h40
-rw-r--r--Tools/compareTraces/testUtils.cpp224
13 files changed, 951 insertions, 0 deletions
diff --git a/Tools/compareTraces/CommandLineArgs.cpp b/Tools/compareTraces/CommandLineArgs.cpp
new file mode 100644
index 0000000..5f26bdb
--- /dev/null
+++ b/Tools/compareTraces/CommandLineArgs.cpp
@@ -0,0 +1,51 @@
+#include "stdafx.h"
+#include "CommandLineArgs.h"
+#include <charconv>
+
+static bool printUsage()
+{
+ fprintf( stderr, "Usage: compareTraces.exe trace1.bin trace2.bin [-diff N]\n" );
+ return false;
+}
+
+bool CommandLineArgs::parse( int argc, wchar_t* argv[] )
+{
+ size_t idx = 0;
+ CString sw;
+ CStringA tmp;
+ for( int i = 1; i < argc; i++ )
+ {
+ if( argv[ i ][ 0 ] != L'-' )
+ {
+ if( idx >= 2 )
+ return printUsage();
+ inputs[ idx ] = argv[ i ];
+ idx++;
+ continue;
+ }
+ sw = argv[ i ];
+ if( 0 == sw.CompareNoCase( L"-diff" ) )
+ {
+ i++;
+ if( i >= argc )
+ return printUsage();
+ tmp.Format( "%S", argv[ i ] );
+ tmp.Trim();
+ uint64_t v;
+ auto res = std::from_chars( tmp, cstr( tmp ) + tmp.GetLength(), v );
+ if( res.ec != (std::errc)0 )
+ {
+ fprintf( stderr, "Unable to parse string into number\n" );
+ return false;
+ }
+ printDiff = v;
+ continue;
+ }
+ return printUsage();
+ }
+
+ if( idx != 2 )
+ return printUsage();
+
+ return true;
+} \ No newline at end of file
diff --git a/Tools/compareTraces/CommandLineArgs.h b/Tools/compareTraces/CommandLineArgs.h
new file mode 100644
index 0000000..d434e76
--- /dev/null
+++ b/Tools/compareTraces/CommandLineArgs.h
@@ -0,0 +1,9 @@
+#pragma once
+
+struct CommandLineArgs
+{
+ int64_t printDiff = -1;
+ std::array<CString, 2> inputs;
+
+ bool parse( int argc, wchar_t* argv[] );
+}; \ No newline at end of file
diff --git a/Tools/compareTraces/Readme.txt b/Tools/compareTraces/Readme.txt
new file mode 100644
index 0000000..035d658
--- /dev/null
+++ b/Tools/compareTraces/Readme.txt
@@ -0,0 +1,9 @@
+This project builds a C++ console tool which compares debug traces of the model.
+
+Tracing files easily exceed 1GB, and by default they’re disabled with a preprocessor macro in stdafx.h of the Whisper project.
+
+When enabled, the main GPU implementation saves a trace into C:\Temp\2remove\Whisper\gpu.bin
+
+The reference CPU implementation saves a trace into C:\Temp\2remove\Whisper\ref.bin
+
+This code in this project is optimized for development speed. For this reason it requires AVX2 CPU, uses memory-mapped IO instead of proper parsing, and checks little to no errors. \ No newline at end of file
diff --git a/Tools/compareTraces/TraceReader.cpp b/Tools/compareTraces/TraceReader.cpp
new file mode 100644
index 0000000..b4b9681
--- /dev/null
+++ b/Tools/compareTraces/TraceReader.cpp
@@ -0,0 +1,46 @@
+#include "stdafx.h"
+#include "TraceReader.h"
+using namespace Tracing;
+
+const sTraceItem& TraceReader::operator[]( size_t idx ) const
+{
+ if( idx >= countItems )
+ throw E_BOUNDS;
+ return items[ idx ];
+}
+
+CStringA TraceReader::getName( const sTraceItem& item ) const
+{
+ const size_t idx = item.stringIndex;
+ if( idx >= countStrings )
+ throw E_BOUNDS;
+ const char* const source = stringData + stringIndex[ idx ];
+ CStringA res;
+ res.Format( source, item.formatArgs[ 0 ], item.formatArgs[ 1 ], item.formatArgs[ 2 ], item.formatArgs[ 3 ] );
+ return res;
+}
+
+HRESULT TraceReader::open( LPCTSTR path )
+{
+ CHECK( file.Create( path, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING ) );
+ CHECK( mapping.MapFile( file ) );
+
+ const uint8_t* rsi = mapping;
+ const sFileHeader& header = *(const sFileHeader*)rsi;
+ if( header.magic != header.correctMagic )
+ return E_INVALIDARG;
+ countItems = header.countItems;
+ countStrings = header.countStrings;
+
+ rsi += sizeof( sFileHeader );
+ payloadPointer = rsi;
+
+ rsi += header.bytesPayload;
+ stringIndex = (const uint32_t*)( rsi );
+ stringData = (const char*)( rsi + countStrings * 4 );
+
+ rsi += header.bytesStrings;
+ items = (const sTraceItem*)rsi;
+
+ return S_OK;
+} \ No newline at end of file
diff --git a/Tools/compareTraces/TraceReader.h b/Tools/compareTraces/TraceReader.h
new file mode 100644
index 0000000..8d1e1f2
--- /dev/null
+++ b/Tools/compareTraces/TraceReader.h
@@ -0,0 +1,35 @@
+#pragma once
+#include "../../Whisper/Utils/Trace/TraceStructures.h"
+#include <atlstr.h>
+#include <atlfile.h>
+
+namespace Tracing
+{
+ class TraceReader
+ {
+ const uint8_t* payloadPointer = nullptr;
+ const sTraceItem* items = nullptr;
+ size_t countItems = 0;
+ size_t countStrings = 0;
+ const uint32_t* stringIndex = nullptr;
+ const char* stringData = nullptr;
+
+ CAtlFile file;
+ CAtlFileMapping<uint8_t> mapping;
+
+ public:
+
+ TraceReader() = default;
+ ~TraceReader() = default;
+
+ HRESULT open( LPCTSTR path );
+ size_t size() const { return countItems; }
+ const sTraceItem& operator[]( size_t idx ) const;
+ CStringA getName( const sTraceItem& item ) const;
+
+ const void* payload( const sTraceItem& item ) const
+ {
+ return payloadPointer + item.payloadOffset;
+ }
+ };
+} \ No newline at end of file
diff --git a/Tools/compareTraces/compare.cpp b/Tools/compareTraces/compare.cpp
new file mode 100644
index 0000000..ec2a6ef
--- /dev/null
+++ b/Tools/compareTraces/compare.cpp
@@ -0,0 +1,364 @@
+#include "stdafx.h"
+#include "../../Whisper/API/iContext.cl.h"
+#include "TraceReader.h"
+#include "../../Whisper/ML/testUtils.h"
+#include "compare.h"
+using namespace Tracing;
+using namespace DirectCompute;
+
+namespace
+{
+ inline const char* cstr( eItemType it )
+ {
+ switch( it )
+ {
+ case eItemType::Buffer: return "Buffer";
+ case eItemType::Tensor: return "Tensor";
+ }
+ throw E_INVALIDARG;
+ }
+ inline const char* cstr( const CStringA& s ) { return s; }
+
+ inline int tensorDims( __m128i vec )
+ {
+ const __m128i one = _mm_set1_epi32( 1 );
+ const uint32_t bitmapOnes = (uint32_t)_mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( vec, one ) ) );
+ const uint32_t bitmapNotOnes = bitmapOnes ^ 0b1111u;
+ unsigned long idx;
+ if( !_BitScanReverse( &idx, bitmapNotOnes ) )
+ return 0;
+ return idx + 1;
+ }
+
+ int printSize( __m128i vec )
+ {
+ const int sz = tensorDims( vec );
+ switch( sz )
+ {
+ case 0:
+ printf( "[ scalar ]" );
+ break;
+ case 1:
+ printf( "[ %i ]", _mm_cvtsi128_si32( vec ) );
+ break;
+ case 2:
+ printf( "[ %i, %i ]", _mm_cvtsi128_si32( vec ), _mm_extract_epi32( vec, 1 ) );
+ break;
+ case 3:
+ printf( "[ %i, %i, %i ]", _mm_cvtsi128_si32( vec ), _mm_extract_epi32( vec, 1 ), _mm_extract_epi32( vec, 2 ) );
+ break;
+ case 4:
+ printf( "[ %i, %i, %i, %i ]", _mm_cvtsi128_si32( vec ), _mm_extract_epi32( vec, 1 ), _mm_extract_epi32( vec, 2 ), _mm_extract_epi32( vec, 3 ) );
+ break;
+ default:
+ throw E_UNEXPECTED;
+ }
+ return sz;
+ }
+
+ class Comparer
+ {
+ TraceReader& readerA;
+ TraceReader& readerB;
+
+ bool diffBuffers( size_t i, const sTraceItem& a, const sTraceItem& b, const CStringA& name )
+ {
+ const size_t lenA = *(const uint64_t*)a.size.data();
+ const size_t lenB = *(const uint64_t*)b.size.data();
+ if( lenA != lenB )
+ {
+ printf( "Buffer %zu \"%s\": different size, %zu in trace A, %zu in trace B\n", i, cstr( name ), lenA, lenB );
+ return false;
+ }
+ if( a.dataType != b.dataType )
+ {
+ printf( "Buffer %zu \"%s\": different data types\n", i, cstr( name ) );
+ return false;
+ }
+
+ switch( a.dataType )
+ {
+ case eDataType::FP32:
+ return buffersFp32( i, name, (const float*)readerA.payload( a ), (const float*)readerB.payload( b ), lenA );
+ }
+ throw E_NOTIMPL;
+ }
+
+ bool diffTensors( size_t i, const sTraceItem& a, const sTraceItem& b, const CStringA& name )
+ {
+ const __m128i ne1 = load( a.size );
+ const __m128i ne2 = load( b.size );
+ if( !vectorEqual( ne1, ne2 ) )
+ {
+ printf( "Tensor %zu \"%s\" - different size: trace A size is ", i, cstr( name ) );
+ printSize( ne1 );
+ printf( ", trace B size is " );
+ printSize( ne2 );
+ printf( "\n" );
+ return false;
+ }
+
+ const __m128i stride1 = load( a.stride );
+ const __m128i stride2 = load( b.stride );
+ if( !vectorEqual( stride1, stride2 ) )
+ {
+ printf( "Tensor %zu \"%s\" - different memory layout\n", i, cstr( name ) );
+ return false;
+ }
+
+ if( a.dataType != b.dataType )
+ {
+ printf( "Tensor %zu \"%s\": different data types\n", i, cstr( name ) );
+ return false;
+ }
+
+ size_t elements = (uint32_t)_mm_cvtsi128_si32( ne1 );
+ elements *= (uint32_t)_mm_extract_epi32( ne1, 1 );
+ elements *= (uint32_t)_mm_extract_epi32( ne1, 2 );
+ elements *= (uint32_t)_mm_extract_epi32( ne1, 3 );
+
+ switch( a.dataType )
+ {
+ case eDataType::FP32:
+ return tensorsFp32( i, name, (const float*)readerA.payload( a ), (const float*)readerB.payload( b ), elements, ne1, stride1 );
+ }
+ throw E_NOTIMPL;
+ }
+
+ protected:
+ virtual bool buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length ) = 0;
+ virtual bool tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb ) = 0;
+
+ public:
+
+ Comparer( TraceReader& t1, TraceReader& t2 ) :
+ readerA( t1 ), readerB( t2 ) { }
+
+ bool compare( size_t i )
+ {
+ const sTraceItem& a = readerA[ i ];
+ const sTraceItem& b = readerB[ i ];
+ CStringA name1 = readerA.getName( a );
+ CStringA name2 = readerB.getName( b );
+
+ if( a.itemType != b.itemType )
+ {
+ printf( "Item %zu: different type, trace A %s \"%s\", trace B %s \"%s\"\n", i,
+ cstr( a.itemType ), cstr( name1 ), cstr( b.itemType ), cstr( name2 ) );
+ return false;
+ }
+
+ if( name1 != name2 )
+ {
+ printf( "%s %zu: different names, they are \"%s\" and \"%s\"\n", cstr( a.itemType ), i, cstr( name1 ), cstr( name2 ) );
+ return false;
+ }
+
+ switch( a.itemType )
+ {
+ case eItemType::Buffer:
+ return diffBuffers( i, a, b, name1 );
+ case eItemType::Tensor:
+ return diffTensors( i, a, b, name1 );
+ default:
+ throw E_INVALIDARG;
+ }
+ }
+ };
+
+ class PrintSummary : public Comparer
+ {
+ bool buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length ) override;
+ bool tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb ) override;
+
+ public:
+ PrintSummary( TraceReader& a, TraceReader& b ) : Comparer( a, b ) { }
+ };
+
+ bool PrintSummary::buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length )
+ {
+ sTensorDiff diff = computeDiff( a, b, length );
+ printf( "%s %zu \"%s\": ", cstr( eItemType::Buffer ), idx, cstr( name ) );
+ diff.print();
+ return true;
+ }
+
+ bool PrintSummary::tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb )
+ {
+ printSize( ne );
+ printf( " " );
+ sTensorDiff diff = computeDiff( a, b, length );
+ printf( "%s %zu \"%s\": ", cstr( eItemType::Tensor ), idx, cstr( name ) );
+ diff.print();
+ return true;
+ }
+
+ class PrintDiff : public Comparer
+ {
+ bool buffersFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length ) override;
+ bool tensorsFp32( size_t idx, const CStringA& name, const float* a, const float* b, size_t length, __m128i ne, __m128i nb ) override;
+ public:
+ PrintDiff( TraceReader& a, TraceReader& b ) : Comparer( a, b ) { }
+ };
+
+ bool PrintDiff::buffersFp32( size_t idx, const CStringA& name, const float* A, const float* B, size_t length )
+ {
+ printf( "idx\tA\tB\tA(hex)\tB(hex)\tdiff\n" );
+ for( size_t i = 0; i < length; i++ )
+ {
+ const float a = *A;
+ const float b = *B;
+ __m128 vf = _mm_setr_ps( a, b, 0, 0 );
+ __m128i vi = _mm_castps_si128( vf );
+ const float diff = std::abs( a - b );
+ printf( "%zu\t%g\t%g\t0x%08X\t0x%08X\t%g\n",
+ i, a, b, _mm_cvtsi128_si32( vi ), _mm_extract_epi32( vi, 1 ), diff );
+ }
+ return true;
+ }
+
+ std::array<uint32_t, 4> storeSize( __m128i v )
+ {
+ std::array<uint32_t, 4> a;
+ _mm_storeu_si128( ( __m128i* )a.data(), v );
+ return a;
+ }
+
+ std::array<size_t, 4> storeStrides( __m128i v )
+ {
+ const __m128i zero = _mm_setzero_si128();
+ std::array<size_t, 4> a;
+ _mm_storeu_si128( ( __m128i* ) & a[ 0 ], _mm_unpacklo_epi32( v, zero ) );
+ _mm_storeu_si128( ( __m128i* ) & a[ 2 ], _mm_unpackhi_epi32( v, zero ) );
+ return a;
+ }
+
+ bool PrintDiff::tensorsFp32( size_t idx, const CStringA& name, const float* A, const float* B, size_t length, __m128i ne, __m128i nb )
+ {
+ const int dims = tensorDims( ne );
+ const std::array<uint32_t, 4> size = storeSize( ne );
+ const std::array<size_t, 4> strides = storeStrides( ne );
+ CStringA line;
+ if( dims > 4 )
+ throw E_UNEXPECTED;
+
+ for( int i = 0; i < dims; i++ )
+ {
+ const char c = "xyzw"[ i ];
+ line.AppendChar( c );
+ line.AppendChar( '\t' );
+ }
+ line += "A\tB\tA(hex)\tB(hex)\tdiff\n";
+ printf( "%s", cstr( line ) );
+
+ if( 0 == dims )
+ {
+ const float a = *A;
+ const float b = *B;
+ __m128 vf = _mm_setr_ps( a, b, 0, 0 );
+ __m128i vi = _mm_castps_si128( vf );
+ const float diff = std::abs( a - b );
+ printf( "%g\t%g\t0x%08X\t0x%08X\t%g\n",
+ a, b, _mm_cvtsi128_si32( vi ), _mm_extract_epi32( vi, 1 ), diff );
+ return true;
+ }
+
+ size_t offLayer2 = 0;
+ for( uint32_t w = 0; w < size[ 3 ]; w++, offLayer2 += strides[ 3 ] )
+ {
+ size_t offLayer = offLayer2;
+ for( uint32_t z = 0; z < size[ 2 ]; z++, offLayer += strides[ 2 ] )
+ {
+ size_t offRow = offLayer;
+ for( uint32_t y = 0; y < size[ 1 ]; y++, offRow += strides[ 1 ] )
+ {
+ size_t off = offRow;
+ for( uint32_t x = 0; x < size[ 0 ]; x++, off += strides[ 0 ] )
+ {
+ line.Format( "%i\t", x );
+ if( dims > 1 )
+ line.AppendFormat( "%i\t", y );
+ if( dims > 2 )
+ line.AppendFormat( "%i\t", z );
+ if( dims > 3 )
+ line.AppendFormat( "%i\t", w );
+
+ const float a = A[ off ];
+ const float b = B[ off ];
+ __m128 vf = _mm_setr_ps( a, b, 0, 0 );
+ __m128i vi = _mm_castps_si128( vf );
+ const float diff = std::abs( a - b );
+ line.AppendFormat( "%g\t%g\t0x%08X\t0x%08X\t%g\n",
+ a, b, _mm_cvtsi128_si32( vi ), _mm_extract_epi32( vi, 1 ), diff );
+ printf( "%s", cstr( line ) );
+ }
+ }
+ }
+ }
+ return true;
+ }
+}
+
+HRESULT compareTraces( const CommandLineArgs& arguments )
+{
+ const wchar_t* pathA = arguments.inputs[ 0 ];
+ const wchar_t* pathB = arguments.inputs[ 1 ];
+
+ TraceReader a, b;
+ HRESULT hr = a.open( pathA );
+ if( FAILED( hr ) )
+ {
+ fwprintf( stderr, L"Unable to load trace A from \"%s\"", pathA );
+ printError( hr );
+ return hr;
+ }
+
+ hr = b.open( pathB );
+ if( FAILED( hr ) )
+ {
+ fwprintf( stderr, L"Unable to load trace B from \"%s\"", pathA );
+ printError( hr );
+ return hr;
+ }
+
+ wprintf( L"Trace A: %s\n", pathA );
+ wprintf( L"Trace B: %s\n", pathB );
+ const size_t sizeA = a.size();
+ const size_t sizeB = b.size();
+ const size_t count = std::min( sizeA, sizeB );
+
+ if( arguments.printDiff >= 0 )
+ {
+ if( arguments.printDiff >= (int64_t)count )
+ {
+ fprintf( stderr, "Trace A has %zu entries, trace B %zu entries; entry %zu ain't there\n",
+ sizeA, sizeB, (size_t)arguments.printDiff );
+ return E_INVALIDARG;
+ }
+ try
+ {
+ PrintDiff print{ a, b };
+ print.compare( arguments.printDiff );
+ return S_OK;
+ }
+ catch( HRESULT hr )
+ {
+ return hr;
+ }
+ }
+
+ printf( "Trace A has %zu entries, trace B %zu entries, comparing first %zu\n", sizeA, sizeB, count );
+
+ try
+ {
+ PrintSummary print{ a, b };
+ for( size_t i = 0; i < count; i++ )
+ if( !print.compare( i ) )
+ return S_FALSE;
+ return S_OK;
+ }
+ catch( HRESULT hr )
+ {
+ return hr;
+ }
+} \ No newline at end of file
diff --git a/Tools/compareTraces/compare.h b/Tools/compareTraces/compare.h
new file mode 100644
index 0000000..2a4cd86
--- /dev/null
+++ b/Tools/compareTraces/compare.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "CommandLineArgs.h"
+
+HRESULT compareTraces( const CommandLineArgs& arguments ); \ No newline at end of file
diff --git a/Tools/compareTraces/compareTraces.cpp b/Tools/compareTraces/compareTraces.cpp
new file mode 100644
index 0000000..8813500
--- /dev/null
+++ b/Tools/compareTraces/compareTraces.cpp
@@ -0,0 +1,16 @@
+#include "stdafx.h"
+#include <stdio.h>
+#include "compare.h"
+#include "CommandLineArgs.h"
+
+int wmain( int argc, wchar_t* argv[] )
+{
+ CommandLineArgs cla;
+ if( !cla.parse( argc, argv ) )
+ return 1;
+
+ HRESULT hr = compareTraces( cla );
+ if( SUCCEEDED( hr ) )
+ return 0;
+ return hr;
+} \ No newline at end of file
diff --git a/Tools/compareTraces/compareTraces.vcxproj b/Tools/compareTraces/compareTraces.vcxproj
new file mode 100644
index 0000000..a9670b3
--- /dev/null
+++ b/Tools/compareTraces/compareTraces.vcxproj
@@ -0,0 +1,103 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <VCProjectVersion>16.0</VCProjectVersion>
+ <Keyword>Win32Proj</Keyword>
+ <ProjectGuid>{8478a77c-d851-4c63-9511-1770cc82d33e}</ProjectGuid>
+ <RootNamespace>compareTraces</RootNamespace>
+ <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v143</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v143</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="Shared">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <SDLCheck>true</SDLCheck>
+ <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <ConformanceMode>true</ConformanceMode>
+ <LanguageStandard>stdcpp20</LanguageStandard>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <SDLCheck>true</SDLCheck>
+ <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <ConformanceMode>true</ConformanceMode>
+ <LanguageStandard>stdcpp20</LanguageStandard>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="CommandLineArgs.cpp" />
+ <ClCompile Include="compareTraces.cpp" />
+ <ClCompile Include="compare.cpp" />
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="testUtils.cpp" />
+ <ClCompile Include="TraceReader.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="CommandLineArgs.h" />
+ <ClInclude Include="compare.h" />
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="TraceReader.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Text Include="Readme.txt" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/Tools/compareTraces/compareTraces.vcxproj.filters b/Tools/compareTraces/compareTraces.vcxproj.filters
new file mode 100644
index 0000000..1d0d4c3
--- /dev/null
+++ b/Tools/compareTraces/compareTraces.vcxproj.filters
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="compareTraces.cpp" />
+ <ClCompile Include="TraceReader.cpp" />
+ <ClCompile Include="compare.cpp" />
+ <ClCompile Include="stdafx.cpp" />
+ <ClCompile Include="testUtils.cpp" />
+ <ClCompile Include="CommandLineArgs.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="TraceReader.h" />
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="compare.h" />
+ <ClInclude Include="CommandLineArgs.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Text Include="Readme.txt" />
+ </ItemGroup>
+</Project> \ No newline at end of file
diff --git a/Tools/compareTraces/stdafx.cpp b/Tools/compareTraces/stdafx.cpp
new file mode 100644
index 0000000..5c7f6c9
--- /dev/null
+++ b/Tools/compareTraces/stdafx.cpp
@@ -0,0 +1,30 @@
+#include "stdafx.h"
+
+namespace
+{
+ wchar_t* formatMessage( HRESULT hr )
+ {
+ wchar_t* err;
+ if( FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL,
+ hr,
+ MAKELANGID( LANG_NEUTRAL, SUBLANG_DEFAULT ),
+ (LPTSTR)&err,
+ 0,
+ NULL ) )
+ return err;
+ return nullptr;
+ }
+}
+
+void printError( HRESULT hr )
+{
+ const wchar_t* err = formatMessage( hr );
+ if( nullptr != err )
+ {
+ fwprintf( stderr, L"%s\n", err );
+ LocalFree( (HLOCAL)err );
+ }
+ else
+ fprintf( stderr, "Error code %i (0x%08X)\n", hr, hr );
+} \ No newline at end of file
diff --git a/Tools/compareTraces/stdafx.h b/Tools/compareTraces/stdafx.h
new file mode 100644
index 0000000..1e496f3
--- /dev/null
+++ b/Tools/compareTraces/stdafx.h
@@ -0,0 +1,40 @@
+#pragma once
+#include <stdint.h>
+#include <assert.h>
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include <atlstr.h>
+#include <d3d11.h>
+
+#include <vector>
+#include <array>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#define CHECK( hr ) { const HRESULT __hr = ( hr ); if( FAILED( __hr ) ) return __hr; }
+
+inline __m128i load16( const int* rsi )
+{
+ return _mm_loadu_si128( ( const __m128i* )rsi );
+}
+inline __m128i load16( const uint32_t* rsi )
+{
+ return _mm_loadu_si128( ( const __m128i* )rsi );
+}
+inline __m128i load( const std::array<uint32_t, 4>& arr )
+{
+ return load16( arr.data() );
+}
+
+inline bool vectorEqual( __m128i a, __m128i b )
+{
+ __m128i xx = _mm_xor_si128( a, b );
+ return (bool)_mm_testz_si128( xx, xx );
+}
+
+void printError( HRESULT hr );
+
+inline const char* cstr( const CStringA& s ) { return s; }
+inline const wchar_t* cstr( const CString& s ) { return s; } \ No newline at end of file
diff --git a/Tools/compareTraces/testUtils.cpp b/Tools/compareTraces/testUtils.cpp
new file mode 100644
index 0000000..f9fa465
--- /dev/null
+++ b/Tools/compareTraces/testUtils.cpp
@@ -0,0 +1,224 @@
+#include "stdafx.h"
+#include "../../Whisper/ML/testUtils.h"
+#include <immintrin.h>
+using namespace DirectCompute;
+
+namespace
+{
+ using DirectCompute::sTensorDiff;
+
+ __forceinline __m256 load( const float* rsi )
+ {
+ return _mm256_loadu_ps( rsi );
+ }
+
+ __forceinline __m256 load( const uint16_t* rsi )
+ {
+ const __m128i iv = _mm_load_si128( ( const __m128i* )rsi );
+ return _mm256_cvtph_ps( iv );
+ }
+
+ __forceinline void loadPartial( const uint16_t* x, const uint16_t* y, size_t count, __m256& fx, __m256& fy )
+ {
+ __m128i ix, iy;
+ switch( count )
+ {
+ case 1: // load 2 bytes
+ ix = _mm_cvtsi32_si128( *x );
+ iy = _mm_cvtsi32_si128( *y );
+ break;
+ case 2: // load 4 bytes
+ ix = _mm_cvtsi32_si128( *(const int*)x );
+ iy = _mm_cvtsi32_si128( *(const int*)y );
+ break;
+ case 3: // load 6 bytes
+ ix = _mm_cvtsi32_si128( *(const int*)x );
+ iy = _mm_cvtsi32_si128( *(const int*)y );
+ ix = _mm_insert_epi16( ix, x[ 2 ], 2 );
+ iy = _mm_insert_epi16( iy, y[ 2 ], 2 );
+ break;
+ case 4: // load 8 bytes
+ ix = _mm_cvtsi64_si128( *(const int64_t*)x );
+ iy = _mm_cvtsi64_si128( *(const int64_t*)y );
+ break;
+ case 5: // load 10 bytes
+ ix = _mm_cvtsi64_si128( *(const int64_t*)x );
+ iy = _mm_cvtsi64_si128( *(const int64_t*)y );
+ ix = _mm_insert_epi16( ix, x[ 4 ], 4 );
+ iy = _mm_insert_epi16( iy, y[ 4 ], 4 );
+ break;
+ case 6: // load 12 bytes
+ ix = _mm_cvtsi64_si128( *(const int64_t*)x );
+ iy = _mm_cvtsi64_si128( *(const int64_t*)y );
+ ix = _mm_insert_epi32( ix, *(const int*)( x + 4 ), 2 );
+ iy = _mm_insert_epi32( iy, *(const int*)( y + 4 ), 2 );
+ break;
+ case 7: // load 14 bytes
+ ix = _mm_cvtsi64_si128( *(const int64_t*)x );
+ iy = _mm_cvtsi64_si128( *(const int64_t*)y );
+ ix = _mm_insert_epi32( ix, *(const int*)( x + 4 ), 2 );
+ iy = _mm_insert_epi32( iy, *(const int*)( y + 4 ), 2 );
+ ix = _mm_insert_epi16( ix, x[ 6 ], 6 );
+ iy = _mm_insert_epi16( iy, y[ 6 ], 6 );
+ break;
+ default:
+ fx = fy = _mm256_setzero_ps();
+ return;
+ }
+
+ fx = _mm256_cvtph_ps( ix );
+ fy = _mm256_cvtph_ps( iy );
+ }
+
+ inline __m128 loadFloat2( const float* rsi )
+ {
+ return _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) );
+ }
+ inline __m128 loadFloat3( const float* rsi )
+ {
+ __m128 f = loadFloat2( rsi );
+ f = _mm_insert_ps( f, _mm_load_ss( rsi + 2 ), 0x20 );
+ return f;
+ }
+ __forceinline void loadPartial( const float* x, const float* y, size_t count, __m256& fx, __m256& fy )
+ {
+ __m128 low1, high1;
+ __m128 low2, high2;
+ high1 = high2 = _mm_setzero_ps();
+ switch( count )
+ {
+ case 1:
+ low1 = _mm_load_ss( x );
+ low2 = _mm_load_ss( y );
+ break;
+ case 2:
+ low1 = loadFloat2( x );
+ low2 = loadFloat2( y );
+ break;
+ case 3:
+ low1 = loadFloat3( x );
+ low2 = loadFloat3( y );
+ break;
+ case 4:
+ low1 = _mm_loadu_ps( x );
+ low2 = _mm_loadu_ps( y );
+ break;
+ case 5:
+ low1 = _mm_loadu_ps( x );
+ low2 = _mm_loadu_ps( y );
+ high1 = _mm_load_ss( x + 4 );
+ high2 = _mm_load_ss( y + 4 );
+ break;
+ case 6:
+ low1 = _mm_loadu_ps( x );
+ low2 = _mm_loadu_ps( y );
+ high1 = loadFloat2( x + 4 );
+ high2 = loadFloat2( y + 4 );
+ break;
+ case 7: // load 14 bytes
+ low1 = _mm_loadu_ps( x );
+ low2 = _mm_loadu_ps( y );
+ high1 = loadFloat3( x + 4 );
+ high2 = loadFloat3( y + 4 );
+ break;
+ default:
+ fx = fy = _mm256_setzero_ps();
+ return;
+ }
+
+ fx = _mm256_setr_m128( low1, high1 );
+ fy = _mm256_setr_m128( low2, high2 );
+ }
+
+ __forceinline float horizontalMaximum( __m256 v )
+ {
+ __m128 s = _mm256_extractf128_ps( v, 1 );
+ s = _mm_max_ps( s, _mm256_castps256_ps128( v ) );
+ s = _mm_max_ps( s, _mm_movehl_ps( s, s ) );
+ s = _mm_max_ss( s, _mm_movehdup_ps( s ) );
+ return _mm_cvtss_f32( s );
+ }
+
+ __forceinline double horizontalSum( __m256 v )
+ {
+ __m256d d = _mm256_cvtps_pd( _mm256_extractf128_ps( v, 1 ) );
+ d = _mm256_add_pd( d, _mm256_cvtps_pd( _mm256_castps256_ps128( v ) ) );
+
+ __m128d s = _mm256_extractf128_pd( d, 1 );
+ s = _mm_add_pd( s, _mm256_castpd256_pd128( d ) );
+ s = _mm_add_sd( s, _mm_unpackhi_pd( s, s ) );
+ return _mm_cvtsd_f64( s );
+ }
+
+ __m256 maskInfNan( __m256 diff, __m256 a, __m256 b )
+ {
+ __m256i ai = _mm256_castps_si256( a );
+ __m256i bi = _mm256_castps_si256( b );
+ __m256i eqi = _mm256_cmpeq_epi32( ai, bi );
+ __m256 eq = _mm256_castsi256_ps( eqi );
+ return _mm256_andnot_ps( eq, diff );
+ }
+
+ class DiffAcc
+ {
+ __m256 maxAbs = _mm256_setzero_ps();
+ __m256 sumSquares = _mm256_setzero_ps();
+
+ public:
+
+ __forceinline void add( __m256 a, __m256 b )
+ {
+ const __m256 neg0 = _mm256_set1_ps( -0.0f );
+ __m256 diff = _mm256_sub_ps( b, a );
+ diff = maskInfNan( diff, a, b );
+ sumSquares = _mm256_fmadd_ps( diff, diff, sumSquares );
+ const __m256 absDiff = _mm256_andnot_ps( neg0, diff );
+ maxAbs = _mm256_max_ps( maxAbs, absDiff );
+ }
+
+ __forceinline sTensorDiff reduce( size_t count )
+ {
+ sTensorDiff res;
+ res.maxAbsDiff = horizontalMaximum( maxAbs );
+ res.avgDiffSquared = (float)( horizontalSum( sumSquares ) / (double)(int64_t)count );
+ res.length = count;
+ return res;
+ }
+ };
+
+ template<class E>
+ static sTensorDiff __declspec( noinline ) diffVectors( const E* a, const E* b, size_t length )
+ {
+ // const E* const aEnd = a + length;
+ const E* const aEndAligned = a + ( length / 8 ) * 8;
+ const size_t remainder = length % 8;
+
+ DiffAcc acc;
+ for( ; a < aEndAligned; a += 8, b += 8 )
+ acc.add( load( a ), load( b ) );
+
+ if( remainder != 0 )
+ {
+ __m256 va, vb;
+ loadPartial( a, b, remainder, va, vb );
+ acc.add( va, vb );
+ }
+
+ return acc.reduce( length );
+ }
+}
+
+sTensorDiff DirectCompute::computeDiff( const float* a, const float* b, size_t length )
+{
+ return diffVectors( a, b, length );
+}
+
+sTensorDiff DirectCompute::computeDiff( const uint16_t* a, const uint16_t* b, size_t length )
+{
+ return diffVectors( a, b, length );
+}
+
+void DirectCompute::sTensorDiff::print() const
+{
+ printf( "%zu elements, maxAbsDiff = %g, avgDiffSquared = %g\n", length, maxAbsDiff, avgDiffSquared );
+} \ No newline at end of file