From fcf83dbf9effab3bd98bad2b83b2468b7eb05cfd Mon Sep 17 00:00:00 2001 From: Tim Foley Date: Fri, 9 Jun 2017 11:34:21 -0700 Subject: Initial import of code. --- .../dxsdk/AdaptiveTessellationCS40/Render.hlsl | 58 + .../dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl | 109 + .../TessellatorCS40_EdgeFactorCS.hlsl | 217 ++ .../TessellatorCS40_NumVerticesIndicesCS.hlsl | 56 + .../TessellatorCS40_ScatterIDCS.hlsl | 45 + .../TessellatorCS40_TessellateIndicesCS.hlsl | 628 +++++ .../TessellatorCS40_TessellateVerticesCS.hlsl | 206 ++ .../TessellatorCS40_common.hlsl | 411 ++++ .../TessellatorCS40_defines.h | 9 + .../dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl | 2567 ++++++++++++++++++++ .../dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl | 1908 +++++++++++++++ .../hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl | 72 + tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx | 158 ++ tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl | 51 + tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl | 49 + tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx | 181 ++ .../CascadedShadowMaps11/RenderCascadeScene.hlsl | 506 ++++ .../CascadedShadowMaps11/RenderCascadeShadow.hlsl | 53 + .../ComputeShaderSort11/ComputeShaderSort11.hlsl | 75 + .../Direct3D11Tutorials/Tutorial02/Tutorial02.fx | 23 + .../Tutorial02/Tutorial02_PS.hlsl | 3 + .../Tutorial02/Tutorial02_VS.hlsl | 3 + .../Direct3D11Tutorials/Tutorial03/Tutorial03.fx | 23 + .../Tutorial03/Tutorial03_PS.hlsl | 3 + .../Tutorial03/Tutorial03_VS.hlsl | 3 + .../Direct3D11Tutorials/Tutorial04/Tutorial04.fx | 46 + .../Tutorial04/Tutorial04_PS.hlsl | 3 + .../Tutorial04/Tutorial04_VS.hlsl | 3 + .../Direct3D11Tutorials/Tutorial05/Tutorial05.fx | 54 + .../Tutorial05/Tutorial05_PS.hlsl | 3 + .../Tutorial05/Tutorial05_VS.hlsl | 3 + .../Direct3D11Tutorials/Tutorial06/Tutorial06.fx | 76 + .../Tutorial06/Tutorial06_PS.hlsl | 3 + .../Tutorial06/Tutorial06_VS.hlsl | 3 + .../Direct3D11Tutorials/Tutorial07/Tutorial07.fx | 67 + .../Tutorial07/Tutorial07_PS.hlsl | 3 + 
.../Tutorial07/Tutorial07_VS.hlsl | 3 + .../Tutorial08/Tutorial08.fx | 56 + .../Tutorial09/Tutorial09.fx | 69 + .../Tutorial10/Tutorial10.fx | 73 + .../Tutorial11/Tutorial11.fx | 117 + .../Tutorial12/Tutorial12.fx | 129 + .../Tutorial13/Tutorial13.fx | 191 ++ .../Tutorial14/Tutorial14.fx | 294 +++ .../DynamicShaderLinkage11_LightPSH.h | 84 + .../DynamicShaderLinkage11_MaterialPSH.h | 103 + .../DynamicShaderLinkage11_PS.hlsl | 84 + .../DynamicShaderLinkage11_PSBuffers.h | 129 + .../DynamicShaderLinkage11_VS.hlsl | 66 + .../DynamicShaderLinkageFX11.fx | 192 ++ .../DynamicShaderLinkageFX11_LightPSH.h | 82 + .../DynamicShaderLinkageFX11_MaterialPSH.h | 103 + .../DynamicShaderLinkageFX11_PSBuffers.h | 152 ++ .../DynamicShaderLinkageFX11_ps.hlsl | 113 + .../DynamicShaderLinkageFX11_vs.hlsl | 65 + tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx | 468 ++++ .../hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl | 75 + tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl | 529 ++++ tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl | 112 + .../BrightPassAndHorizFilterCS.hlsl | 64 + .../dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl | 29 + tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl | 73 + tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl | 79 + .../hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl | 129 + .../dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl | 72 + .../dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl | 63 + tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl | 44 + tests/hlsl/dxsdk/InstancingFX11/Instancing.fx | 591 +++++ .../MultithreadedRendering11_PS.hlsl | 202 ++ .../MultithreadedRendering11_VS.hlsl | 75 + .../dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl | 103 + .../hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl | 128 + tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl | 277 +++ tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl | 56 + tests/hlsl/dxsdk/OIT11/SceneVS.hlsl | 36 + tests/hlsl/dxsdk/README.md | 5 + .../hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl | 230 ++ tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx | 
112 + tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl | 86 + tests/hlsl/dxsdk/SubD11/SubD11.hlsl | 1238 ++++++++++ .../dxsdk/VarianceShadows11/2DQuadShaders.hlsl | 211 ++ .../VarianceShadows11/RenderVarianceScene.hlsl | 412 ++++ .../VarianceShadows11/RenderVarianceShadow.hlsl | 45 + tests/hlsl/simple/compute-numthreads.hlsl | 11 + 84 files changed, 15341 insertions(+) create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h create mode 100644 tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl create mode 100644 tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl create mode 100644 tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl create mode 100644 tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx create mode 100644 tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl create mode 100644 tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl create mode 100644 tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx create mode 100644 tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeScene.hlsl create mode 100644 tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl create mode 100644 tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl create mode 100644 
tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx create mode 100644 
tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl create mode 100644 tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx create mode 100644 tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl create mode 100644 tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl create mode 100644 tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl create mode 100644 
tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl create mode 100644 tests/hlsl/dxsdk/InstancingFX11/Instancing.fx create mode 100644 tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_PS.hlsl create mode 100644 tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl create mode 100644 tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl create mode 100644 tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl create mode 100644 tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl create mode 100644 tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl create mode 100644 tests/hlsl/dxsdk/OIT11/SceneVS.hlsl create mode 100644 tests/hlsl/dxsdk/README.md create mode 100644 tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl create mode 100644 tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx create mode 100644 tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl create mode 100644 tests/hlsl/dxsdk/SubD11/SubD11.hlsl create mode 100644 tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl create mode 100644 tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl create mode 100644 tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl create mode 100644 tests/hlsl/simple/compute-numthreads.hlsl (limited to 'tests/hlsl') diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl new file mode 100644 index 000000000..b98b870da --- /dev/null +++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl @@ -0,0 +1,58 @@ +//TEST:COMPARE_HLSL: -profile vs_4_0 -entry RenderBaseVS -profile ps_4_0 -entry RenderPS -target dxbc-assembly +//-------------------------------------------------------------------------------------- +// File: Render.hlsl +// +// The shaders for rendering tessellated mesh and base mesh +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- +cbuffer cbPerObject : register( b0 ) +{ + row_major matrix g_mWorldViewProjection : packoffset( c0 ); +} + +// The tessellated vertex structure +struct TessedVertex +{ + uint BaseTriID; // Which triangle of the base mesh this tessellated vertex belongs to? + float2 bc; // Barycentric coordinates with regard to the base triangle +}; +Buffer g_base_vb_buffer : register(t0); // Base mesh vertex buffer +StructuredBuffer g_TessedVertices : register(t1); // Tessellated mesh vertex buffer + +float4 bary_centric(float4 v1, float4 v2, float4 v3, float2 bc) +{ + return (1 - bc.x - bc.y) * v1 + bc.x * v2 + bc.y * v3; +} + +float4 RenderVS( uint vertid : SV_VertexID ) : SV_POSITION +{ + TessedVertex input = g_TessedVertices[vertid]; + + // Get the positions of the three vertices of the base triangle + float4 v[3]; + [unroll] + for (int i = 0; i < 3; ++ i) + { + uint vert_id = input.BaseTriID * 3 + i; + v[i] = g_base_vb_buffer[vert_id]; + } + + // Calculate the position of this tessellated vertex from barycentric coordinates and then project it + return mul(bary_centric(v[0], v[1], v[2], input.bc), g_mWorldViewProjection); +} + +struct BaseVertex +{ + float4 pos : POSITION; +}; + +float4 RenderBaseVS( BaseVertex input ) : SV_POSITION +{ + return mul( input.pos, g_mWorldViewProjection ); +} + +float4 RenderPS() : SV_TARGET +{ + return float4( 1.0f, 1.0f, 0.0f, 1.0f ); +} \ No newline at end of file diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl new file mode 100644 index 000000000..46cdc1ed9 --- /dev/null +++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl @@ -0,0 +1,109 @@ +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSScanInBucket -entry CSScanBucketResult -entry CSScanAddBucketResult +//-------------------------------------------------------------------------------------- +// File: 
ScanCS.hlsl +// +// A simple inclusive prefix sum(scan) implemented in CS4.0, +// using a typical up sweep and down sweep scheme +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- +StructuredBuffer Input : register( t0 ); // Change uint2 here if scan other types, and +RWStructuredBuffer Result : register( u0 ); // also here + +#define groupthreads 128 +groupshared uint4 bucket[groupthreads]; // Change uint4 to the "type x2" if scan other types, e.g. + // if scan uint2, then put uint4 here, + // if scan float, then put float2 here + +void CSScan( uint3 DTid, uint GI, uint2 x ) // Change the type of x here if scan other types +{ + // since CS40 can only support one shared memory for one shader, we use .xy and .zw as ping-ponging buffers + // if scan a single element type like int, search and replace all .xy to .x and .zw to .y below + bucket[GI].xy = x; + bucket[GI].zw = 0; + + // Up sweep + [unroll] + for ( uint stride = 2; stride <= groupthreads; stride <<= 1 ) + { + GroupMemoryBarrierWithGroupSync(); + + if ( (GI & (stride - 1)) == (stride - 1) ) + { + bucket[GI].xy += bucket[GI - stride/2].xy; + } + } + + if ( GI == (groupthreads - 1) ) + { + bucket[GI].xy = 0; + } + + // Down sweep + bool n = true; + [unroll] + for ( stride = groupthreads / 2; stride >= 1; stride >>= 1 ) + { + GroupMemoryBarrierWithGroupSync(); + + uint a = stride - 1; + uint b = stride | a; + + if ( n ) // ping-pong between passes + { + if ( ( GI & b) == b ) + { + bucket[GI].zw = bucket[GI-stride].xy + bucket[GI].xy; + } else + if ( (GI & a) == a ) + { + bucket[GI].zw = bucket[GI+stride].xy; + } else + { + bucket[GI].zw = bucket[GI].xy; + } + } else + { + if ( ( GI & b) == b ) + { + bucket[GI].xy = bucket[GI-stride].zw + bucket[GI].zw; + } else + if ( (GI & a) == a ) + { + bucket[GI].xy = bucket[GI+stride].zw; + } else + { + bucket[GI].xy = bucket[GI].zw; + } + } + + n = !n; + } + + 
Result[DTid.x] = bucket[GI].zw + x; +} + +// scan in each bucket +[numthreads( groupthreads, 1, 1 )] +void CSScanInBucket( uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI: SV_GroupIndex ) +{ + uint2 x = Input[DTid.x]; // Change the type of x here if scan other types + CSScan( DTid, GI, x ); +} + +// record and scan the sum of each bucket +[numthreads( groupthreads, 1, 1 )] +void CSScanBucketResult( uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI: SV_GroupIndex ) +{ + uint2 x = Input[DTid.x*groupthreads - 1]; // Change the type of x here if scan other types + CSScan( DTid, GI, x ); +} + +StructuredBuffer Input1 : register( t1 ); + +// add the bucket scanned result to each bucket to get the final result +[numthreads( groupthreads, 1, 1 )] +void CSScanAddBucketResult( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI: SV_GroupIndex ) +{ + Result[DTid.x] = Input[DTid.x] + Input1[Gid.x]; +} diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl new file mode 100644 index 000000000..91ebca777 --- /dev/null +++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl @@ -0,0 +1,217 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSEdgeFactor +//-------------------------------------------------------------------------------------- +// File: TessellatorCS40_EdgeFactorCS.hlsl +// +// The CS to compute edge tessellation factor acoording to current world, view, projection matrix +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +// http://jgt.akpeters.com/papers/akeninemoller01/tribox.html +bool planeBoxOverlap(float3 normal, float d, float3 maxbox) +{ + float3 vmin = maxbox, vmax = maxbox; + [unroll] + for (int q = 0;q <= 2; ++ q) + { + if (normal[q] > 0.0f) + { + vmin[q] *= -1; + } + else + { + vmax[q] *= -1; + } + } + if (dot(normal, vmin) + d > 0.0f) + { + return false; + } + if (dot(normal, vmax) + d >= 0.0f) + { + return true; + } + + return false; +} + +/*======================== X-tests ========================*/ +bool AXISTEST_X01(float3 v0, float3 v2, float3 boxhalfsize, float2 ab, float2 fab) +{ + float p0 = ab.x * v0.y - ab.y * v0.z; + float p2 = ab.x * v2.y - ab.y * v2.z; + float min_v = min(p0, p2); + float max_v = max(p0, p2); + float rad = dot(fab, boxhalfsize.yz); + return (min_v < rad) && (max_v > -rad); +} + +bool AXISTEST_X2(float3 v0, float3 v1, float3 boxhalfsize, float2 ab, float2 fab) +{ + float p0 = ab.x * v0.y - ab.y * v0.z; + float p1 = ab.x * v1.y - ab.y * v1.z; + float min_v = min(p0, p1); + float max_v = max(p0, p1); + float rad = dot(fab, boxhalfsize.yz); + return (min_v < rad) && (max_v > -rad); +} + +/*======================== Y-tests ========================*/ +bool AXISTEST_Y02(float3 v0, float3 v2, float3 boxhalfsize, float2 ab, float2 fab) +{ + float p0 = -ab.x * v0.x + ab.y * v0.z; + float p2 = -ab.x * v2.x + ab.y * v2.z; + float min_v = min(p0, p2); + float max_v = max(p0, p2); + float rad = dot(fab, boxhalfsize.xz); + return (min_v < rad) && (max_v > -rad); +} + +bool AXISTEST_Y1(float3 v0, float3 v1, float3 boxhalfsize, float2 ab, float2 fab) +{ + float p0 = -ab.x * v0.x + ab.y * v0.z; + float p1 = -ab.x * v1.x + ab.y * v1.z; + float min_v = min(p0, p1); + float max_v = max(p0, p1); + float rad = dot(fab, boxhalfsize.xz); + return (min_v < rad) && (max_v > -rad); +} + +/*======================== Z-tests ========================*/ +bool AXISTEST_Z12(float3 
v1, float3 v2, float3 boxhalfsize, float2 ab, float2 fab) +{ + float p1 = ab.x * v1.x - ab.y * v1.y; + float p2 = ab.x * v2.x - ab.y * v2.y; + float min_v = min(p1, p2); + float max_v = max(p1, p2); + float rad = dot(fab, boxhalfsize.xy); + return (min_v < rad) && (max_v > -rad); +} + +bool AXISTEST_Z0(float3 v0, float3 v1, float3 boxhalfsize, float2 ab, float2 fab) +{ + float p0 = ab.x * v0.x - ab.y * v0.y; + float p1 = ab.x * v1.x - ab.y * v1.y; + float min_v = min(p0, p1); + float max_v = max(p0, p1); + float rad = dot(fab, boxhalfsize.xy); + return (min_v < rad) && (max_v > -rad); +} + +bool triBoxOverlap(float3 boxcenter,float3 boxhalfsize,float3 triverts0, float3 triverts1, float3 triverts2) +{ + /* use separating axis theorem to test overlap between triangle and box */ + /* need to test for overlap in these directions: */ + /* 1) the {x,y,z}-directions (actually, since we use the AABB of the triangle */ + /* we do not even need to test these) */ + /* 2) normal of the triangle */ + /* 3) crossproduct(edge from tri, {x,y,z}-directin) */ + /* this gives 3x3=9 more tests */ + + /* This is the fastest branch on Sun */ + /* move everything so that the boxcenter is in (0,0,0) */ + float3 v0 = triverts0 - boxcenter; + float3 v1 = triverts1 - boxcenter; + float3 v2 = triverts2 - boxcenter; + + /* compute triangle edges */ + float3 e0 = v1 - v0; /* tri edge 0 */ + float3 e1 = v2 - v1; /* tri edge 1 */ + float3 e2 = v0 - v2; /* tri edge 2 */ + + /* Bullet 3: */ + /* test the 9 tests first (this was faster) */ + float3 fe = abs(e0); + if (!AXISTEST_X01(v0, v2, boxhalfsize, e0.zy, fe.zy) + || !AXISTEST_Y02(v0, v2, boxhalfsize, e0.zx, fe.zx) + || !AXISTEST_Z12(v1, v2, boxhalfsize, e0.yx, fe.yx)) + { + return false; + } + + fe = abs(e1); + if (!AXISTEST_X01(v0, v2, boxhalfsize, e1.zy, fe.zy) + || !AXISTEST_Y02(v0, v2, boxhalfsize, e1.zx, fe.zx) + || !AXISTEST_Z0(v0, v1, boxhalfsize, e1.yx, fe.yx)) + { + return false; + } + + fe = abs(e2); + if (!AXISTEST_X2(v0, v1, 
boxhalfsize, e2.zy, fe.zy) + || !AXISTEST_Y1(v0, v1, boxhalfsize, e2.zx, fe.zx) + || !AXISTEST_Z12(v1, v2, boxhalfsize, e2.yx, fe.yx)) + { + return false; + } + + /* Bullet 1: */ + /* first test overlap in the {x,y,z}-directions */ + /* find min, max of the triangle each direction, and test for overlap in */ + /* that direction -- this is equivalent to testing a minimal AABB around */ + /* the triangle against the AABB */ + + float3 min_v = min(min(v0, v1), v2); + float3 max_v = max(max(v0, v1), v2); + if ((min_v.x > boxhalfsize.x || max_v.x < -boxhalfsize.x) + || (min_v.y > boxhalfsize.y || max_v.y < -boxhalfsize.y) + || (min_v.z > boxhalfsize.z || max_v.z < -boxhalfsize.z)) + { + return false; + } + + /* Bullet 2: */ + /* test if the box intersects the plane of the triangle */ + /* compute plane equation of triangle: normal*x+d=0 */ + float3 normal = cross(e0, e1); + float d = -dot(normal, v0); /* plane eq: normal.x+d=0 */ + if (!planeBoxOverlap(normal, d, boxhalfsize)) + { + return false; + } + + return true; /* box and triangle overlaps */ +} + + +Buffer InputVertices : register(t0); +RWStructuredBuffer EdgeFactorBufOut : register(u0); + +cbuffer cb +{ + row_major matrix g_matWVP; + float2 g_tess_edge_length_scale; + int num_triangles; + float dummy; +} + +[numthreads(128, 1, 1)] +void CSEdgeFactor( uint3 DTid : SV_DispatchThreadID ) +{ + if (DTid.x < num_triangles) + { + float4 p0 = mul(InputVertices[DTid.x*3+0], g_matWVP); + float4 p1 = mul(InputVertices[DTid.x*3+1], g_matWVP); + float4 p2 = mul(InputVertices[DTid.x*3+2], g_matWVP); + p0 = p0 / p0.w; + p1 = p1 / p1.w; + p2 = p2 / p2.w; + + float4 factor; + // Only triangles which are completely inside or intersect with the view frustum are taken into account + if ( triBoxOverlap( float3(0, 0, 0.5), float3(1.02, 1.02, 0.52), p0.xyz, p1.xyz, p2.xyz ) ) + { + factor.x = length((p0.xy - p2.xy) * g_tess_edge_length_scale); + factor.y = length((p1.xy - p0.xy) * g_tess_edge_length_scale); + factor.z = length((p2.xy 
- p1.xy) * g_tess_edge_length_scale); + factor.w = min(min(factor.x, factor.y), factor.z); + factor = clamp(factor, 0, 9); + } else + { + factor = 0; + } + + EdgeFactorBufOut[DTid.x] = factor; + } +} diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl new file mode 100644 index 000000000..4f2fb547b --- /dev/null +++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl @@ -0,0 +1,56 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSNumVerticesIndices +//-------------------------------------------------------------------------------------- +// File: TessellatorCS40_NumVerticesIndicesCS.hlsl +// +// The CS to compute number of vertices and triangles to be generated from edge tessellation factor +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +#include "TessellatorCS40_common.hlsl" + +StructuredBuffer InputEdgeFactor : register(t0); +RWStructuredBuffer NumVerticesIndicesOut : register(u0); + +cbuffer cbCS : register(b1) +{ + uint4 g_param; +} + +[numthreads(128, 1, 1)] +void CSNumVerticesIndices( uint3 DTid : SV_DispatchThreadID ) +{ + if (DTid.x < g_param.x) + { + float4 edge_factor = InputEdgeFactor[DTid.x]; + + PROCESSED_TESS_FACTORS_TRI processedTessFactors; + int num_points = TriProcessTessFactors(edge_factor, processedTessFactors, g_partitioning); + + int num_index; + if (0 == num_points) + { + num_index = 0; + } + else if (3 == num_points) + { + num_index = 4; + } + else + { + int numRings = ((processedTessFactors.numPointsForOutsideInside.w + 1) / 2); // +1 is so even tess includes the center point, which we want to now + + int4 outsideInsideHalfTessFactor = int4(ceil(processedTessFactors.outsideInsideHalfTessFactor)); + uint3 n = NumStitchTransition(outsideInsideHalfTessFactor, processedTessFactors.outsideInsideTessFactorParity); + num_index = n.x + n.y + n.z; + num_index += TotalNumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, numRings - 1) * 3; + if( processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD ) + { + num_index += 4; + } + } + + NumVerticesIndicesOut[DTid.x] = uint2(num_points, num_index); + } +} diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl new file mode 100644 index 000000000..17f003794 --- /dev/null +++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl @@ -0,0 +1,45 @@ +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSScatterVertexTriIDIndexID -entry CSScatterIndexTriIDIndexID 
+//-------------------------------------------------------------------------------------- +// File: TessellatorCS40_ScatterIDCS.hlsl +// +// The CS to scatter vertex ID and triangle ID +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- +StructuredBuffer InputScanned : register(t0); +RWStructuredBuffer TriIDIndexIDOut : register(u0); + +cbuffer cbCS : register(b1) +{ + uint4 g_param; +} + +[numthreads(128, 1, 1)] +void CSScatterVertexTriIDIndexID( uint3 DTid : SV_DispatchThreadID ) +{ + if (DTid.x < g_param.x) + { + uint start = InputScanned[DTid.x-1].x; + uint end = InputScanned[DTid.x].x; + + for ( uint i = start; i < end; ++i ) + { + TriIDIndexIDOut[i] = uint2(DTid.x, i - start); + } + } +} + +[numthreads(128, 1, 1)] +void CSScatterIndexTriIDIndexID( uint3 DTid : SV_DispatchThreadID ) +{ + if (DTid.x < g_param.x) + { + uint start = InputScanned[DTid.x-1].y; + uint end = InputScanned[DTid.x].y; + + for ( uint i = start; i < end; ++i ) + { + TriIDIndexIDOut[i] = uint2(DTid.x, i - start); + } + } +} diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl new file mode 100644 index 000000000..756f99e58 --- /dev/null +++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl @@ -0,0 +1,628 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSTessellationIndices +//-------------------------------------------------------------------------------------- +// File: TessellatorCS40_TessellateIndicesCS.hlsl +// +// The CS to tessellate indices +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +#include "TessellatorCS40_common.hlsl" + +StructuredBuffer InputTriIDIndexID : register(t0); +StructuredBuffer InputEdgeFactor : register(t1); +StructuredBuffer InputScanned : register(t2); + +RWByteAddressBuffer TessedIndicesOut : register(u0); + +cbuffer cbCS : register(b1) +{ + uint4 g_param; +} + + +int TransformIndex1(int index, int vertices_base) +{ + return vertices_base + index; +} + +int TransformIndex2(int index, int vertices_base, INDEX_PATCH_CONTEXT IndexPatchContext) +{ + if( index >= IndexPatchContext.outsidePointIndexPatchBase ) // assumed remapped outide indices are > remapped inside vertices + { + if( index == IndexPatchContext.outsidePointIndexBadValue ) + { + index = IndexPatchContext.outsidePointIndexReplacementValue; + } + else + { + index += IndexPatchContext.outsidePointIndexDeltaToRealValue; + } + } + else + { + if( index == IndexPatchContext.insidePointIndexBadValue ) + { + index = IndexPatchContext.insidePointIndexReplacementValue; + } + else + { + index += IndexPatchContext.insidePointIndexDeltaToRealValue; + } + } + + return vertices_base + index; +} + + +int AStitchRegular(bool bTrapezoid, int diagonals, + uint numInsideEdgePoints, + int2 outsideInsideEdgePointBaseOffset, + int i) +{ + if (bTrapezoid) + { + ++ outsideInsideEdgePointBaseOffset.x; + } + + int pt; + + if ((i < 4) && bTrapezoid) + { + if (i < 2) + { + pt = outsideInsideEdgePointBaseOffset.x - 1 + i; + } + else if (i == 2) + { + pt = outsideInsideEdgePointBaseOffset.y; + } + else + { + pt = -1; + } + } + + int index = i; + if (bTrapezoid) + { + index -= 4; + } + + if (index >= 0) + { + uint uindex = (uint)index; + + switch( diagonals ) + { + case DIAGONALS_INSIDE_TO_OUTSIDE: + if (uindex < 5 * numInsideEdgePoints - 5) + { + uint p = uindex / 5; + uint r = uindex - p * 5; + if (r < 2) + { + pt = outsideInsideEdgePointBaseOffset.x + p + r; + } + else if (r < 4) + { + pt = 
outsideInsideEdgePointBaseOffset.y + p + r; + } + else + { + pt = -1; + } + } + else + { + int r = i - (4 + 5 * numInsideEdgePoints - 5); + if (r < 2) + { + pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints - 1 + r; + } + else if (r == 2) + { + pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints - 1; + } + else + { + pt = -1; + } + } + break; + + case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation + if (uindex < (numInsideEdgePoints / 2 - 1) * 5) + { + // First half + uint p = uindex / 5; + uint r = uindex - p * 5; + if (r < 2) + { + pt = outsideInsideEdgePointBaseOffset.x + p + r; + } + else if (r < 4) + { + pt = outsideInsideEdgePointBaseOffset.y + p; + } + else + { + pt = -1; + } + } + else if (uindex < (numInsideEdgePoints / 2 - 1) * 5 + 8) + { + // Middle + uint r = uindex - (numInsideEdgePoints / 2 - 1) * 5; + if (0 == r) + { + pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints / 2 - 1; + } + else if (r < 3) + { + pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints / 2 - 1 + (2 - r); + } + else if (r == 3) + { + pt = -1; + } + else if (r < 6) + { + pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints / 2 - 1 + (r - 4); + } + else if (r == 6) + { + pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints / 2 - 1 + 1; + } + else if (r == 7) + { + pt = -1; + } + } + //else if (uindex < (numInsideEdgePoints/2-1) * 5 + 8 + (numInsideEdgePoints - numInsideEdgePoints/2 - 1) * 5) + else if (uindex < numInsideEdgePoints * 5 - 2) + { + // Second half + uint p = (uindex - (numInsideEdgePoints / 2 - 1) * 5 + 8) / 5 + numInsideEdgePoints / 2 + 1; + uint r = uindex - (numInsideEdgePoints / 2 - 1) * 5 + 8 - (p - (numInsideEdgePoints / 2 + 1)) * 5; + if (r < 2) + { + pt = outsideInsideEdgePointBaseOffset.x + p - 1 + r; + } + else if (r < 4) + { + pt = outsideInsideEdgePointBaseOffset.y + p - 1 + r; + } + else + { + pt = -1; + } + } + else + { + //int r = i - (4 + (numInsideEdgePoints/2-1) * 5 + 
8 + (numInsideEdgePoints - numInsideEdgePoints/2 - 1) * 5); + int r = i - (numInsideEdgePoints * 5 + 2); + if (r < 2) + { + pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints - 1 + r; + } + else if (r == 2) + { + pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints - 1; + } + else + { + pt = -1; + } + } + break; + + case DIAGONALS_MIRRORED: + if (uindex < (numInsideEdgePoints / 2 + 1) * 2) + { + uint p = uindex / 2; + uint r = uindex - p * 2; + if (0 == r) + { + pt = outsideInsideEdgePointBaseOffset.y + p; + } + else + { + pt = outsideInsideEdgePointBaseOffset.x + p; + } + } + else if (uindex == (numInsideEdgePoints / 2 + 1) * 2) + { + pt = -1; + } + else if (uindex == (numInsideEdgePoints / 2 + 1) * 2 + 1) + { + pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints / 2; + } + //else if (uindex < (numInsideEdgePoints / 2 + 1) * 2 + 2 + (numInsideEdgePoints - numInsideEdgePoints / 2) * 2) + else if (uindex < numInsideEdgePoints * 2 + 4) + { + uint p = (uindex - ((numInsideEdgePoints / 2 + 1) * 2 + 2)) / 2 + numInsideEdgePoints / 2; + uint r = uindex - ((numInsideEdgePoints / 2 + 1) * 2 + 2) - (p - numInsideEdgePoints / 2) * 2; + if (0 == r) + { + pt = outsideInsideEdgePointBaseOffset.x + p; + } + else + { + pt = outsideInsideEdgePointBaseOffset.y + p; + } + } + //else if (uindex == (numInsideEdgePoints / 2 + 1) * 2 + 2 + (numInsideEdgePoints - numInsideEdgePoints / 2) * 2) + else if (uindex == numInsideEdgePoints * 2 + 4) + { + pt = -1; + } + else + { + //int r = i - (4 + (numInsideEdgePoints / 2 + 1) * 2 + 2 + (numInsideEdgePoints - numInsideEdgePoints / 2) * 2 + 1); + uint r = i - (numInsideEdgePoints * 2 + 9); + if (r < 2) + { + pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints - 1 + r; + } + else if (r == 2) + { + pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints - 1; + } + else + { + pt = -1; + } + } + break; + } + } + + return pt; +} + +int AStitchTransition(int2 outsideInsideEdgePointBaseOffset, int2 
outsideInsideNumHalfTessFactorPoints, + int2 outsideInsideEdgeTessFactorParity, + uint i) +{ + outsideInsideNumHalfTessFactorPoints -= (TESSELLATOR_PARITY_ODD == outsideInsideEdgeTessFactorParity); + + uint2 out_in_first_half = uint2(outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][MAX_FACTOR / 2 + 1].y, insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][MAX_FACTOR / 2 + 1].y) * 4; + + uint3 out_in_middle = 0; + if ((outsideInsideEdgeTessFactorParity.y != outsideInsideEdgeTessFactorParity.x) || (outsideInsideEdgeTessFactorParity.y == TESSELLATOR_PARITY_ODD)) + { + if (outsideInsideEdgeTessFactorParity.y == outsideInsideEdgeTessFactorParity.x) + { + // Quad in the middle + out_in_middle.z = 5; + out_in_middle.xy = 1; + } + else if (TESSELLATOR_PARITY_EVEN == outsideInsideEdgeTessFactorParity.y) + { + // Triangle pointing inside + out_in_middle.z = 4; + out_in_middle.x = 1; + } + else + { + // Triangle pointing outside + out_in_middle.z = 4; + out_in_middle.y = 1; + } + } + + + int pt = -1; + + if (i < out_in_first_half.y) + { + // Advance inside + + uint p = i / 4; + uint r = i - p * 4; + p = insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].z; + if ((0 == r) || (2 == r)) + { + pt = outsideInsideEdgePointBaseOffset.y + insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].y + r / 2; + } + else if (1 == r) + { + pt = outsideInsideEdgePointBaseOffset.x + outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].y; + } + } + else + { + i -= out_in_first_half.y; + + if (i < out_in_first_half.x) + { + // Advance outside + + uint p = i / 4; + uint r = i - p * 4; + p = outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].z; + if (r < 2) + { + pt = outsideInsideEdgePointBaseOffset.x + outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].y + r; + } + else if (r == 2) + { + pt = outsideInsideEdgePointBaseOffset.y + insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].y; + if 
(insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].x) + { + ++ pt; + } + } + } + else + { + i -= out_in_first_half.x; + + if (i < out_in_middle.z) + { + uint r = i; + if (outsideInsideEdgeTessFactorParity.y == outsideInsideEdgeTessFactorParity.x) + { + // Quad in the middle + if ((0 == r) || (2 == r)) + { + pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + (2 == r);//r / 2; + } + else if ((1 == r) || (3 == r)) + { + pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + (3 == r);//(r - 1) / 2; + } + } + else if (TESSELLATOR_PARITY_EVEN == outsideInsideEdgeTessFactorParity.y) + { + // Triangle pointing inside + if (r == 0) + { + pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4; + } + else if (r < 3) + { + pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + r - 1; + } + } + else + { + // Triangle pointing outside + if ((0 == r) || (2 == r)) + { + pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + (2 == r);//r / 2; + } + else if (1 == r) + { + pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4; + } + } + } + else + { + i -= out_in_middle.z; + + if (i < out_in_first_half.x) + { + // Advance outside + + uint p = i / 4; + uint r = i - p * 4; + p = outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].z; + if (r < 2) + { + pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + out_in_middle.x + (outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][MAX_FACTOR / 2 + 1].y - outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p + 1].y) + r; + } + else if (r == 2) + { + pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + out_in_middle.y + (insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][MAX_FACTOR / 2 + 1].y - insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p + 1].y); + } + } + else + { + // Advance inside + + i -= out_in_first_half.x; + + uint p = i / 4; + uint r = i - p * 4; + p = 
insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].w; + if ((0 == r) || (2 == r)) + { + pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + out_in_middle.y + + (insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][MAX_FACTOR / 2 + 1].y - insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p + 1].y) + (2 == r);//r / 2; + } + else if (1 == r) + { + pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + out_in_middle.x + + (outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][MAX_FACTOR / 2 + 1].y - outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p + 1].y); + if (outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].x) + { + ++ pt; + } + } + } + } + } + } + + return pt; +} + +[numthreads(128, 1, 1)] +void CSTessellationIndices( uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex ) +{ + uint id = DTid.x; + //uint id = Gid.x * 128 + GI; // Workaround for some CS4x preview drivers + + if ( id < g_param.x ) + { + uint tri_id = InputTriIDIndexID[id].x; + uint index_id = InputTriIDIndexID[id].y; + uint base_vertex = InputScanned[tri_id-1].x; + + float4 outside_inside_factor = InputEdgeFactor[tri_id]; + + PROCESSED_TESS_FACTORS_TRI processedTessFactors; + int num_points = TriProcessTessFactors(outside_inside_factor, processedTessFactors, g_partitioning); + + uint tessed_indices; + if (3 == num_points) + { + if (index_id < 3) + { + tessed_indices = TransformIndex1(index_id, base_vertex); + } + else + { + tessed_indices = -1; + } + } + else + { + // Generate primitives for all the concentric rings, one side at a time for each ring + static const int startRing = 1; + int numRings = ((processedTessFactors.numPointsForOutsideInside.w + 1) / 2); // +1 is so even tess includes the center point, which we want to now + + int4 outsideInsideHalfTessFactor = int4(ceil(processedTessFactors.outsideInsideHalfTessFactor)); + uint3 num = NumStitchTransition(outsideInsideHalfTessFactor, 
processedTessFactors.outsideInsideTessFactorParity); + num.y += num.x; + num.z += num.y; + uint num_index = num.z; + num_index += TotalNumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, numRings - 1) * 3; + if( processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD ) + { + num_index += 4; + } + + int pt; + + if (index_id < num.x) + { + int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * startRing; + + pt = AStitchTransition(int2(0, processedTessFactors.insideEdgePointBaseOffset), + outsideInsideHalfTessFactor.xw, + processedTessFactors.outsideInsideTessFactorParity.xw, + index_id); + if (pt != -1) + { + pt = TransformIndex1(pt, base_vertex); + } + } + else if (index_id < num.y) + { + int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * startRing; + + pt = AStitchTransition( + int2(processedTessFactors.numPointsForOutsideInside.x - 1, processedTessFactors.insideEdgePointBaseOffset + numPointsForInsideEdge - 1), + outsideInsideHalfTessFactor.yw, + processedTessFactors.outsideInsideTessFactorParity.yw, + index_id - num.x); + if (pt != -1) + { + pt = TransformIndex1(pt, base_vertex); + } + } + else if (index_id < num.z) + { + int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * startRing; + + INDEX_PATCH_CONTEXT IndexPatchContext; + IndexPatchContext.insidePointIndexDeltaToRealValue = processedTessFactors.insideEdgePointBaseOffset + 2 * (numPointsForInsideEdge - 1); + IndexPatchContext.insidePointIndexBadValue = numPointsForInsideEdge - 1; + IndexPatchContext.insidePointIndexReplacementValue = processedTessFactors.insideEdgePointBaseOffset; + IndexPatchContext.outsidePointIndexPatchBase = IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range + IndexPatchContext.outsidePointIndexDeltaToRealValue = processedTessFactors.numPointsForOutsideInside.x + 
processedTessFactors.numPointsForOutsideInside.y - 2 + - IndexPatchContext.outsidePointIndexPatchBase; + IndexPatchContext.outsidePointIndexBadValue = IndexPatchContext.outsidePointIndexPatchBase + + processedTessFactors.numPointsForOutsideInside.z - 1; + IndexPatchContext.outsidePointIndexReplacementValue = 0; + + pt = AStitchTransition(int2(numPointsForInsideEdge, 0), + outsideInsideHalfTessFactor.zw, + processedTessFactors.outsideInsideTessFactorParity.zw, + index_id - num.y); + if (pt != -1) + { + pt = TransformIndex2(pt, base_vertex, IndexPatchContext); + } + } + else + { + if ((processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD) && (index_id >= num_index - 4)) + { + int outsideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset + + ((processedTessFactors.numPointsForOutsideInside.w + 1) - (numRings + startRing)) * (numRings - startRing - 1) * 3; + + if (index_id - (num_index - 4) != 3) + { + pt = TransformIndex1(outsideEdgePointBaseOffset + index_id - (num_index - 4), base_vertex); + } + else + { + pt = -1; + } + } + else + { + int ring = GetRingFromIndexStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, index_id - num.z); + + int tn = TotalNumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, ring - 1) * 3; + int n = NumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w - 2 * ring); + + int edge = (index_id - num.z - tn) / n; + int index = (index_id - num.z - tn) - edge * n; + + int2 outsideInsideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset + + int2(0, 3 * (processedTessFactors.numPointsForOutsideInside.w - 3)) + + ((processedTessFactors.numPointsForOutsideInside.w - (ring + startRing)) + int2(1, -1)) * (ring - startRing - 1) * 3; + + int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * ring; + int numLastPointsForInsideEdge = 
numPointsForInsideEdge + 2; + + if (edge < 2) + { + pt = AStitchRegular(true, DIAGONALS_MIRRORED, + numPointsForInsideEdge, + outsideInsideEdgePointBaseOffset + (int2(numLastPointsForInsideEdge, numPointsForInsideEdge) - 1) * edge, + index); + if (pt != -1) + { + pt = TransformIndex1(pt, base_vertex); + } + } + else + { + INDEX_PATCH_CONTEXT IndexPatchContext; + IndexPatchContext.insidePointIndexDeltaToRealValue = outsideInsideEdgePointBaseOffset.y + (numPointsForInsideEdge - 1) * 2; + IndexPatchContext.insidePointIndexBadValue = numPointsForInsideEdge - 1; + IndexPatchContext.insidePointIndexReplacementValue = outsideInsideEdgePointBaseOffset.y; + IndexPatchContext.outsidePointIndexPatchBase = IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range + IndexPatchContext.outsidePointIndexDeltaToRealValue = outsideInsideEdgePointBaseOffset.x + (numLastPointsForInsideEdge - 1) * 2 + - IndexPatchContext.outsidePointIndexPatchBase; + IndexPatchContext.outsidePointIndexBadValue = IndexPatchContext.outsidePointIndexPatchBase + + numLastPointsForInsideEdge - 1; + IndexPatchContext.outsidePointIndexReplacementValue = outsideInsideEdgePointBaseOffset.x; + + pt = AStitchRegular(true, DIAGONALS_MIRRORED, + numPointsForInsideEdge, + int2(numPointsForInsideEdge, 0), + index); + if (pt != -1) + { + pt = TransformIndex2(pt, base_vertex, IndexPatchContext); + } + } + } + } + + tessed_indices = pt; + } + + TessedIndicesOut.Store(id*4, tessed_indices); + } +} diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl new file mode 100644 index 000000000..55bf1be87 --- /dev/null +++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl @@ -0,0 +1,206 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. 
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSTessellationVertices
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_TessellateVerticesCS.hlsl
+//
+// The CS to tessellate vertices
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "TessellatorCS40_common.hlsl"
+
+StructuredBuffer<uint2> InputTriIDIndexID : register(t0); // per output vertex: .x = source triangle id, .y = vertex index within that triangle
+StructuredBuffer<float4> InputEdgeFactor : register(t1); // per triangle: tess factors handed to TriProcessTessFactors (xyz = edges, w = inside)
+
+struct TessedVertex
+{
+    uint BaseTriID; // id of the triangle this vertex was generated from
+    float2 bc; // parametric (u, v) location computed below; presumably barycentric with the third coordinate implicit
+};
+RWStructuredBuffer<TessedVertex> TessedVerticesOut : register(u0); // one element written per thread id
+
+cbuffer cbCS : register(b1)
+{
+    uint4 g_param; // .x = number of vertices to generate; other components are not read in this file
+}
+
+void PlacePointIn1D(PROCESSED_TESS_FACTORS_TRI processedTessFactors, int ctx_index, int pt, out float location, int parity) // maps point index pt on the tess factor selected by ctx_index (callers pass 0..2 for outside edges, 3 for inside) to its 1D parametric location
+{
+    int numHalfTessFactorPoints = int(ceil(processedTessFactors.outsideInsideHalfTessFactor[ctx_index]));
+
+    bool bFlip;
+    if( pt >= numHalfTessFactorPoints )
+    {
+        pt = (numHalfTessFactorPoints << 1) - pt; // reflect the index back into the first half
+        if( TESSELLATOR_PARITY_ODD == parity )
+        {
+            pt -= 1; // odd parity moves the reflected index down by one
+        }
+        bFlip = true;
+    }
+    else
+    {
+        bFlip = false;
+    }
+
+    if( pt == numHalfTessFactorPoints )
+    {
+        location = 0.5f; // exact midpoint
+    }
+    else
+    {
+        unsigned int indexOnCeilHalfTessFactor = pt;
+        unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor;
+        if( pt > processedTessFactors.outsideInsideSplitPointOnFloorHalfTessFactor[ctx_index] )
+        {
+            indexOnFloorHalfTessFactor -= 1; // past the split point the floor tessellation numbers points one lower
+        }
+        float locationOnFloorHalfTessFactor = indexOnFloorHalfTessFactor * processedTessFactors.outsideInsideInvNumSegmentsOnFloorTessFactor[ctx_index];
+        float locationOnCeilHalfTessFactor = indexOnCeilHalfTessFactor * processedTessFactors.outsideInsideInvNumSegmentsOnCeilTessFactor[ctx_index];
+
+        location = lerp(locationOnFloorHalfTessFactor, locationOnCeilHalfTessFactor, frac(processedTessFactors.outsideInsideHalfTessFactor[ctx_index])); // blend floor/ceil placements by the fractional part of the half tess factor
+
+        if( bFlip )
+        {
+            location = 1.0f - location; // second half mirrors the first
+        }
+    }
+}
+
+[numthreads(128, 1, 1)]
+void CSTessellationVertices( uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex ) // one thread per output vertex: reads InputTriIDIndexID[id], writes TessedVerticesOut[id]
+{
+    uint id = DTid.x;
+    //uint id = Gid.x * 128 + GI; // Workaround for some CS4x preview drivers
+
+    if ( id < g_param.x )
+    {
+        uint tri_id = InputTriIDIndexID[id].x;
+        uint vert_id = InputTriIDIndexID[id].y;
+
+        float4 outside_inside_factor = InputEdgeFactor[tri_id];
+
+        PROCESSED_TESS_FACTORS_TRI processedTessFactors;
+        int num_points = TriProcessTessFactors(outside_inside_factor, processedTessFactors, g_partitioning);
+
+        float2 uv; // NOTE(review): left unassigned if the final else below is reached with odd inside parity
+        if (3 == num_points) // TriProcessTessFactors' all-factors-are-1 case: emit the three corners only
+        {
+            if (0 == vert_id)
+            {
+                uv = float2(0, 1);
+            }
+            else if (1 == vert_id)
+            {
+                uv = float2(0, 0);
+            }
+            else
+            {
+                uv = float2(1, 0);
+            }
+        }
+        else
+        {
+            if (vert_id < processedTessFactors.insideEdgePointBaseOffset)
+            {
+                // Generate exterior ring edge points, clockwise starting from point V (VW, the U==0 edge)
+
+                int edge;
+                if (vert_id < processedTessFactors.numPointsForOutsideInside.x - 1)
+                {
+                    edge = 0;
+                }
+                else
+                {
+                    vert_id -= processedTessFactors.numPointsForOutsideInside.x - 1; // make vert_id relative to edge 1
+                    if (vert_id < processedTessFactors.numPointsForOutsideInside.y - 1)
+                    {
+                        edge = 1;
+                    }
+                    else
+                    {
+                        vert_id -= processedTessFactors.numPointsForOutsideInside.y - 1; // make vert_id relative to edge 2
+                        edge = 2;
+                    }
+                }
+
+                int p = vert_id;
+                int endPoint = processedTessFactors.numPointsForOutsideInside[edge] - 1;
+                float param;
+                int q = (edge & 0x1) ? p : endPoint - p; // whether to reverse point order given we are defining V or U (W implicit):
+                // edge0, VW, has V decreasing, so reverse 1D points below
+                // edge1, WU, has U increasing, so don't reverse 1D points below
+                // edge2, UV, has U decreasing, so reverse 1D points below
+                PlacePointIn1D(processedTessFactors, edge,q,param, processedTessFactors.outsideInsideTessFactorParity[edge]);
+                if (0 == edge)
+                {
+                    uv = float2(0, param);
+                }
+                else if (1 == edge)
+                {
+                    uv = float2(param, 0);
+                }
+                else
+                {
+                    uv = float2(param, 1 - param);
+                }
+            }
+            else
+            {
+                // Generate interior ring points, clockwise spiralling in
+
+                uint index = vert_id - processedTessFactors.insideEdgePointBaseOffset;
+                uint ring = 1 + (((3 * processedTessFactors.numPointsForOutsideInside.w - 6) - sqrt(sqr(3 * processedTessFactors.numPointsForOutsideInside.w - 6) - 4 * 3 * index)) + 0.001f) / 6; // quadratic-formula solve for the ring containing this index; the 0.001f presumably guards against rounding just below an integer
+                index -= 3 * (processedTessFactors.numPointsForOutsideInside.w - ring - 1) * (ring - 1); // make index relative to the start of this ring
+
+                uint startPoint = ring;
+                uint endPoint = processedTessFactors.numPointsForOutsideInside.w - 1 - startPoint;
+                if (index < 3 * (endPoint - startPoint))
+                {
+                    uint edge = index / (endPoint - startPoint);
+                    uint p = index - edge * (endPoint - startPoint) + startPoint;
+
+                    int perpendicularAxisPoint = startPoint;
+                    float perpParam;
+                    PlacePointIn1D(processedTessFactors, 3, perpendicularAxisPoint, perpParam, processedTessFactors.outsideInsideTessFactorParity.w);
+                    perpParam = perpParam * 2 / 3;
+
+                    float param;
+                    int q = (edge & 0x1) ? p : endPoint - (p - startPoint); // whether to reverse point given we are defining V or U (W implicit):
+                    // edge0, VW, has V decreasing, so reverse 1D points below
+                    // edge1, WU, has U increasing, so don't reverse 1D points below
+                    // edge2, UV, has U decreasing, so reverse 1D points below
+                    PlacePointIn1D(processedTessFactors, 3, q,param, processedTessFactors.outsideInsideTessFactorParity.w);
+                    // edge0 VW, has perpendicular parameter U constant
+                    // edge1 WU, has perpendicular parameter V constant
+                    // edge2 UV, has perpendicular parameter W constant
+                    const unsigned int deriv = 2; // reciprocal is the rate of change of edge-parallel parameters as they are pushed into the triangle
+                    if (0 == edge)
+                    {
+                        uv = float2(perpParam, param - perpParam / deriv);
+                    }
+                    else if (1 == edge)
+                    {
+                        uv = float2(param - perpParam / deriv, perpParam);
+                    }
+                    else
+                    {
+                        uv = float2(param - perpParam / deriv, 1 - (param - perpParam / deriv + perpParam));
+                    }
+                }
+                else
+                {
+                    if( processedTessFactors.outsideInsideTessFactorParity.w != TESSELLATOR_PARITY_ODD )
+                    {
+                        // Last point is the point at the center.
+                        uv = 1 / 3.0f;
+                    }
+                }
+            }
+        }
+
+        TessedVerticesOut[id].BaseTriID = tri_id;
+        TessedVerticesOut[id].bc = uv;
+    }
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl
new file mode 100644
index 000000000..309044cdb
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl
@@ -0,0 +1,411 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_common.hlsl
+//
+// The common utils included by other shaders in the sample
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "TessellatorCS40_defines.h"
+
+cbuffer cbNeverChanges : register(b0) // lookup tables read by the stitching code; presumably filled once by the CPU side - confirm
+{
+    uint4 insidePointIndex[MAX_FACTOR / 2 + 1][MAX_FACTOR / 2 + 2]; // first index: half-tess-factor point count; entry [MAX_FACTOR / 2 + 1].y is read as a per-row count (see NumStitchTransition)
+    uint4 outsidePointIndex[MAX_FACTOR / 2 + 1][MAX_FACTOR / 2 + 2]; // same layout, used for the outside edges
+}
+
+#define D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR ( 64 )
+#define D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR ( 63 )
+#define D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR ( 2 )
+#define D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR ( 1 )
+
+#define D3D11_TESSELLATOR_PARTITIONING_INTEGER ( 0 )
+#define D3D11_TESSELLATOR_PARTITIONING_POW2 ( 1 )
+#define D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD ( 2 )
+#define D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN ( 3 )
+
+#define TESSELLATOR_PARITY_EVEN ( 0 )
+#define TESSELLATOR_PARITY_ODD ( 1 )
+
+#define EPSILON 1e-6f
+#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON (D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON/2)
+
+#define DIAGONALS_INSIDE_TO_OUTSIDE ( 0 )
+#define DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE ( 1 )
+#define DIAGONALS_MIRRORED ( 2 )
+
+
+// This is moved to macro defines at shader compile time, so that the partitioning mode can be changed during runtime
+//#define g_partitioning (D3D11_TESSELLATOR_PARTITIONING_POW2)
+
+
+struct PROCESSED_TESS_FACTORS_TRI // per-triangle state filled by TriProcessTessFactors; in 4-component fields xyz = the three outside edges, w = inside
+{
+    float4 outsideInsideTessFactor; // factors after clamping and integer/pow2 rounding
+    int4 outsideInsideTessFactorParity; // TESSELLATOR_PARITY_EVEN or TESSELLATOR_PARITY_ODD per factor
+
+    float4 outsideInsideInvNumSegmentsOnFloorTessFactor; // 1 / segment count at floor(half factor)
+    float4 outsideInsideInvNumSegmentsOnCeilTessFactor; // 1 / segment count at ceil(half factor)
+    float4 outsideInsideHalfTessFactor; // factor / 2, plus 0.5 for odd parity or when the half factor is exactly 0.5
+    int4 outsideInsideSplitPointOnFloorHalfTessFactor; // set by ComputeTessFactorContext; placed past the last point when floor == ceil so it is never hit
+
+    // Stuff below is specific to the traversal order
+    uint4 numPointsForOutsideInside; // point count per factor; .w is raised to at least 3 (4 for odd parity)
+    uint insideEdgePointBaseOffset; // index of the first interior point (= x + y + z point counts - 3)
+};
+
+struct INDEX_PATCH_CONTEXT // index-remapping parameters; filled by the index shader and passed to TransformIndex2, which is defined outside this excerpt
+{
+    int insidePointIndexDeltaToRealValue;
+    int insidePointIndexBadValue;
+    int insidePointIndexReplacementValue;
+    int outsidePointIndexPatchBase;
+    int outsidePointIndexDeltaToRealValue;
+    int outsidePointIndexBadValue;
+    int outsidePointIndexReplacementValue;
+};
+
+bool4 isEven(float4 input) // per component: true when the value truncated to uint has bit 0 clear
+{
+    return (((uint4)input) & 1) ? false : true;
+}
+
+uint RemoveMSB(uint val) // returns val with its most significant set bit cleared (0 when val == 0)
+{
+    int check;
+    if( val <= 0x0000ffff ) // pick the top bit of the highest byte that can be non-zero
+    {
+        check = ( val <= 0x000000ff ) ? 0x00000080 : 0x00008000;
+    }
+    else
+    {
+        check = ( val <= 0x00ffffff ) ? 0x00800000 : 0x80000000;
+    }
+    for (int i = 0; i < 8; i++, check >>= 1) // scan that byte from its top bit downwards
+    {
+        if( val & check )
+        {
+            return (val & ~check);
+        }
+    }
+    return 0;
+}
+
+uint4 NumPointsForTessFactor(float4 tessFactor, int4 parity) // per component: odd parity -> 2 * ceil(0.5 + f / 2), even parity -> 2 * ceil(f / 2) + 1
+{
+    return TESSELLATOR_PARITY_ODD == parity ? uint4(ceil(0.5f + tessFactor / 2)) * 2 : uint4(ceil(tessFactor / 2)) * 2 + 1;
+}
+
+void ComputeTessFactorContext(float4 tessFactor, int4 parity, // derives, per component, the half tess factor, the reciprocal segment counts at its floor and ceil, and the floor/ceil split point
+    out float4 invNumSegmentsOnFloorTessFactor,
+    out float4 invNumSegmentsOnCeilTessFactor,
+    out float4 halfTessFactor,
+    out int4 splitPointOnFloorHalfTessFactor)
+{
+    halfTessFactor = tessFactor / 2;
+
+    halfTessFactor += 0.5 * ((TESSELLATOR_PARITY_ODD == parity) | (0.5f == halfTessFactor)); // add 0.5 for odd parity, or when the half factor is exactly 0.5
+
+    float4 floorHalfTessFactor = floor(halfTessFactor);
+    float4 ceilHalfTessFactor = ceil(halfTessFactor);
+    int4 numHalfTessFactorPoints = int4(ceilHalfTessFactor);
+
+    for (int index = 0; index < 4; ++ index)
+    {
+        if( ceilHalfTessFactor[index] == floorHalfTessFactor[index] )
+        {
+            splitPointOnFloorHalfTessFactor[index] = /*pick value to cause this to be ignored*/ numHalfTessFactorPoints[index]+1;
+        }
+        else if( TESSELLATOR_PARITY_ODD == parity[index] )
+        {
+            if( floorHalfTessFactor[index] == 1 )
+            {
+                splitPointOnFloorHalfTessFactor[index] = 0;
+            }
+            else
+            {
+                splitPointOnFloorHalfTessFactor[index] = (RemoveMSB(int(floorHalfTessFactor[index]) - 1) << 1) + 1;
+            }
+        }
+        else
+        {
+            splitPointOnFloorHalfTessFactor[index] = (RemoveMSB(int(floorHalfTessFactor[index])) << 1) + 1;
+        }
+    }
+
+    int4 numFloorSegments = int4(floorHalfTessFactor * 2);
+    int4 numCeilSegments = int4(ceilHalfTessFactor * 2);
+    int4 s = (TESSELLATOR_PARITY_ODD == parity);
+    numFloorSegments -= s; // the comparison result is subtracted directly; the intent appears to be one fewer segment for odd parity - confirm the bool-to-int value
+    numCeilSegments -= s;
+    invNumSegmentsOnFloorTessFactor = 1.0f / numFloorSegments;
+    invNumSegmentsOnCeilTessFactor = 1.0f / numCeilSegments;
+}
+
+int TriProcessTessFactors( inout float4 tessFactor, // clamps/rounds tessFactor in place, fills processedTessFactors, and returns the total point count (0 = culled, 3 = all-ones special case)
+    out PROCESSED_TESS_FACTORS_TRI processedTessFactors,
+    int partitioning )
+{
+    processedTessFactors = (PROCESSED_TESS_FACTORS_TRI)0;
+
+    int parity = TESSELLATOR_PARITY_EVEN;
+    switch( partitioning )
+    {
+    case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
+    default:
+        break;
+    case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
+        parity = TESSELLATOR_PARITY_ODD;
+        break;
+    case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
+        parity = TESSELLATOR_PARITY_EVEN;
+        break;
+    }
+
+    // Is the patch culled?
+    if( !(tessFactor.x > 0) || // NaN will pass
+        !(tessFactor.y > 0) ||
+        !(tessFactor.z > 0) )
+    {
+        return 0;
+    }
+
+    // Clamp edge TessFactors
+    float lowerBound, upperBound;
+    switch(partitioning)
+    {
+    case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
+    case D3D11_TESSELLATOR_PARTITIONING_POW2: // don't care about pow2 distinction for validation, just treat as integer
+    default:
+        lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
+        upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
+        break;
+
+    case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
+        lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR;
+        upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
+        break;
+
+    case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
+        lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
+        upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR;
+        break;
+    }
+
+    tessFactor.xyz = min( upperBound, max( lowerBound, tessFactor.xyz ) );
+
+    // Clamp inside TessFactors
+    if(D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD == partitioning)
+    {
+        if( (tessFactor.x > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
+            (tessFactor.y > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
+            (tessFactor.z > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON))
+            // Don't need the same check for insideTessFactor for tri patches,
+            // since there is only one insideTessFactor, as opposed to quad
+            // patches which have 2 insideTessFactors.
+        {
+            // Force picture frame
+            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON;
+        }
+    }
+
+    tessFactor.w = min( upperBound, max( lowerBound, tessFactor.w ) );
+    // Note the above clamps map NaN to lowerBound
+
+    if (partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)
+    {
+        tessFactor = ceil(tessFactor);
+    }
+    else if (partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)
+    {
+        static const int exponentMask = 0x7f800000;
+        static const int mantissaMask = 0x007fffff;
+        static const int exponentLSB = 0x00800000;
+
+        int4 bits = asint(tessFactor);
+        tessFactor = bits & mantissaMask ? asfloat((bits & exponentMask) + exponentLSB) : tessFactor; // non-zero mantissa: clear it and bump the exponent, i.e. round up to the next power of two
+    }
+
+    // Process tessFactors
+    if ((partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)|| (partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2))
+    {
+        bool4 e = isEven(tessFactor);
+        processedTessFactors.outsideInsideTessFactorParity.xyz = e.xyz ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
+        processedTessFactors.outsideInsideTessFactorParity.w = (e.w || (1 == tessFactor.w)) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; // an inside factor of exactly 1 is treated as even
+    }
+    else
+    {
+        processedTessFactors.outsideInsideTessFactorParity = parity;
+    }
+
+    processedTessFactors.outsideInsideTessFactor = tessFactor;
+
+    if (((partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)|| (partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)) || (parity == TESSELLATOR_PARITY_ODD))
+    {
+        // Special case if all TessFactors are 1
+        if( (1 == processedTessFactors.outsideInsideTessFactor.x) &&
+            (1 == processedTessFactors.outsideInsideTessFactor.y) &&
+            (1 == processedTessFactors.outsideInsideTessFactor.z) &&
+            (1 == processedTessFactors.outsideInsideTessFactor.w) )
+        {
+            return 3;
+        }
+    }
+
+    // Compute per-TessFactor metadata
+    ComputeTessFactorContext(processedTessFactors.outsideInsideTessFactor, processedTessFactors.outsideInsideTessFactorParity,
+        processedTessFactors.outsideInsideInvNumSegmentsOnFloorTessFactor,
+        processedTessFactors.outsideInsideInvNumSegmentsOnCeilTessFactor,
+        processedTessFactors.outsideInsideHalfTessFactor,
+        processedTessFactors.outsideInsideSplitPointOnFloorHalfTessFactor);
+
+    // Compute some initial data.
+
+    // outside edge offsets and storage
+    processedTessFactors.numPointsForOutsideInside = NumPointsForTessFactor(processedTessFactors.outsideInsideTessFactor, processedTessFactors.outsideInsideTessFactorParity);
+    int NumPoints = processedTessFactors.numPointsForOutsideInside.x + processedTessFactors.numPointsForOutsideInside.y + processedTessFactors.numPointsForOutsideInside.z - 3; // -3: presumably because each corner is shared by two edges
+
+    // inside edge offsets
+    {
+        uint pointCountMin = (processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD) ? 4 : 3;
+        // max() allows degenerate transition regions when inside TessFactor == 1
+        processedTessFactors.numPointsForOutsideInside.w = max(pointCountMin, processedTessFactors.numPointsForOutsideInside.w);
+    }
+
+    processedTessFactors.insideEdgePointBaseOffset = NumPoints;
+
+    // inside storage, including interior edges above
+    {
+        int numInteriorRings = (processedTessFactors.numPointsForOutsideInside.w >> 1) - 1;
+        int numInteriorPoints;
+        if( processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD )
+        {
+            numInteriorPoints = 3*(numInteriorRings*(numInteriorRings+1) - numInteriorRings);
+        }
+        else
+        {
+            numInteriorPoints = 3*(numInteriorRings*(numInteriorRings+1)) + 1; // +1 for the single centre point of an even tessellation
+        }
+        NumPoints += numInteriorPoints;
+    }
+
+    return NumPoints;
+}
+
+int NumStitchRegular(bool bTrapezoid, int diagonals, int numInsideEdgePoints) // index count for stitching one edge of a regular ring (8 extra when bTrapezoid); 0 or 8 for an unknown diagonals mode
+{
+    int num_index = 0;
+
+    if( bTrapezoid )
+    {
+        num_index += 8;
+    }
+    switch( diagonals )
+    {
+    case DIAGONALS_INSIDE_TO_OUTSIDE:
+        // Diagonals pointing from inside edge forward towards outside edge
+        num_index += 5 * numInsideEdgePoints - 5;
+        break;
+
+    case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation
+        // Diagonals pointing from outside edge forward towards inside edge
+        num_index += 5 * numInsideEdgePoints - 2;
+        break;
+
+    case DIAGONALS_MIRRORED:
+        num_index += 2 * numInsideEdgePoints + 5;
+        break;
+    }
+
+    return num_index;
+}
+
+uint TotalNumStitchRegular(bool bTrapezoid, int diagonals, // closed-form running total of per-edge regular-stitch indices up to 'ring' (0 when ring == 1); the index shader multiplies it by 3 for the three edges
+    int numPointsForInsideTessFactor, int ring)
+{
+    uint num_index = 0;
+
+    if( bTrapezoid )
+    {
+        num_index += 8 * (ring - 1);
+    }
+    switch( diagonals )
+    {
+    case DIAGONALS_INSIDE_TO_OUTSIDE:
+        // Diagonals pointing from inside edge forward towards outside edge
+        num_index += (5 * numPointsForInsideTessFactor - 35 - 5 * ring) * (ring - 1); // NOTE(review): the other two cases equal NumStitchRegular's constant minus 2x its slope (-2 -> -12, +5 -> +1); that pattern gives 15 here, not 35 - confirm before using this mode
+        break;
+
+    case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation
+        // Diagonals pointing from outside edge forward towards inside edge
+        num_index += (5 * numPointsForInsideTessFactor - 12 - 5 * ring) * (ring - 1);
+        break;
+
+    case DIAGONALS_MIRRORED:
+        num_index += (2 * numPointsForInsideTessFactor + 1 - 2 * ring) * (ring - 1);
+        break;
+    }
+
+    return num_index;
+}
+
+int sqr(int x) // integer square
+{
+    return x * x;
+}
+
+int GetRingFromIndexStitchRegular(bool bTrapezoid, int diagonals, int numPointsForInsideTessFactor, int index) // quadratic-formula inverse of TotalNumStitchRegular(..., ring - 1) * 3: finds the ring whose index range contains 'index'; -1 for an unknown diagonals mode
+{
+    int t = 0;
+    if (bTrapezoid)
+    {
+        t = 8;
+    }
+
+    switch( diagonals )
+    {
+    case DIAGONALS_INSIDE_TO_OUTSIDE:
+        t = (5 * numPointsForInsideTessFactor - (35 - t)) * 3;
+        return 1 + uint((t + 15) - sqrt(sqr(t + 15) - 4 * 15 * (t + index)) + 0.001f) / 30;
+
+    case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE:
+        t = (5 * numPointsForInsideTessFactor - (12 - t)) * 3;
+        return 1 + uint((t + 15) - sqrt(sqr(t + 15) - 4 * 15 * (t + index)) + 0.001f) / 30;
+
+    case DIAGONALS_MIRRORED:
+        t = ((t + 1) + 2 * numPointsForInsideTessFactor) * 3;
+        return 1 + uint((t + 6) - sqrt(sqr(t + 6) - 4 * 6 * (t + index)) + 0.001f) / 12; // the 0.001f presumably guards against the root landing just below an integer
+
+    default:
+        return -1;
+    }
+}
+
+uint3 NumStitchTransition(int4 outsideInsideNumHalfTessFactorPoints, // per outside edge (x, y, z): index count of the transition stitch between that edge and the inside ring
+    int4 outsideInsideEdgeTessFactorParity)
+{
+    outsideInsideNumHalfTessFactorPoints -= (TESSELLATOR_PARITY_ODD == outsideInsideEdgeTessFactorParity);
+
+    uint3 num_index = insidePointIndex[outsideInsideNumHalfTessFactorPoints.w][MAX_FACTOR / 2 + 1].y * 8; // inside-ring contribution, shared by all three edges
+
+    [unroll]
+    for (int edge = 0; edge < 3; ++ edge)
+    {
+        num_index[edge] += outsidePointIndex[outsideInsideNumHalfTessFactorPoints[edge]][MAX_FACTOR / 2 + 1].y * 8;
+
+        if( (outsideInsideEdgeTessFactorParity.w != outsideInsideEdgeTessFactorParity[edge]) || (outsideInsideEdgeTessFactorParity.w == TESSELLATOR_PARITY_ODD))
+        {
+            if( outsideInsideEdgeTessFactorParity.w == outsideInsideEdgeTessFactorParity[edge] )
+            {
+                num_index[edge] += 5; // both odd: 5 more indices for the middle piece
+            }
+            else
+            {
+                num_index[edge] += 4; // parities differ: 4 more indices for the middle piece
+            }
+        }
+    }
+
+    return num_index;
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h
b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h new file mode 100644 index 000000000..6b4382393 --- /dev/null +++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h @@ -0,0 +1,9 @@ +//-------------------------------------------------------------------------------------- +// File: TessellatorCS40_defines.h +// +// This file defines common constants which are included by both CPU code and shader code +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- + +#define MAX_FACTOR 16 diff --git a/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl new file mode 100644 index 000000000..1e40c80ef --- /dev/null +++ b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl @@ -0,0 +1,2567 @@ +//TEST_IGNORE_FILE: +//-------------------------------------------------------------------------------------- +// File: BC6HEncode.hlsl +// +// The Compute Shader for BC6H Encoder +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +//#define REF_DEVICE + +#define UINTLENGTH 32 +#define NCHANNELS 3 +#define SIGNED_F16 96 +#define UNSIGNED_F16 95 +#define MAX_FLOAT asfloat(0x7F7FFFFF) +#define MIN_FLOAT asfloat(0xFF7FFFFF) +#define MAX_INT asint(0x7FFFFFFF) +#define MIN_INT asint(0x80000000) + +cbuffer cbCS : register( b0 ) +{ + uint g_tex_width; + uint g_num_block_x; + uint g_format; //either SIGNED_F16 for DXGI_FORMAT_BC6H_SF16 or UNSIGNED_F16 for DXGI_FORMAT_BC6H_UF16 + uint g_mode_id; + uint g_start_block_id; + uint g_num_total_blocks; +}; + +static const uint candidateModeMemory[14] = { 0x00, 0x01, 0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F }; +static const uint candidateModeFlag[14] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; +static const bool candidateModeTransformed[14] = { true, true, true, true, true, true, true, true, true, false, false, true, true, true }; +static const uint4 candidateModePrec[14] = { uint4(10,5,5,5), uint4(7,6,6,6), + uint4(11,5,4,4), uint4(11,4,5,4), uint4(11,4,4,5), uint4(9,5,5,5), + uint4(8,6,5,5), uint4(8,5,6,5), uint4(8,5,5,6), uint4(6,6,6,6), + uint4(10,10,10,10), uint4(11,9,9,9), uint4(12,8,8,8), uint4(16,4,4,4) }; + +/*static const uint4x4 candidateSection[32] = +{ + {0,0,1,1, 0,0,1,1, 0,0,1,1, 0,0,1,1}, {0,0,0,1, 0,0,0,1, 0,0,0,1, 0,0,0,1}, {0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1}, {0,0,0,1, 0,0,1,1, 0,0,1,1, 0,1,1,1}, + {0,0,0,0, 0,0,0,1, 0,0,0,1, 0,0,1,1}, {0,0,1,1, 0,1,1,1, 0,1,1,1, 1,1,1,1}, {0,0,0,1, 0,0,1,1, 0,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,1, 0,0,1,1, 0,1,1,1}, + {0,0,0,0, 0,0,0,0, 0,0,0,1, 0,0,1,1}, {0,0,1,1, 0,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,1, 0,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 0,0,0,1, 0,1,1,1}, + {0,0,0,1, 0,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 1,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 0,0,0,0, 1,1,1,1}, + {0,0,0,0, 1,0,0,0, 1,1,1,0, 1,1,1,1}, {0,1,1,1, 0,0,0,1, 
0,0,0,0, 0,0,0,0}, {0,0,0,0, 0,0,0,0, 1,0,0,0, 1,1,1,0}, {0,1,1,1, 0,0,1,1, 0,0,0,1, 0,0,0,0}, + {0,0,1,1, 0,0,0,1, 0,0,0,0, 0,0,0,0}, {0,0,0,0, 1,0,0,0, 1,1,0,0, 1,1,1,0}, {0,0,0,0, 0,0,0,0, 1,0,0,0, 1,1,0,0}, {0,1,1,1, 0,0,1,1, 0,0,1,1, 0,0,0,1}, + {0,0,1,1, 0,0,0,1, 0,0,0,1, 0,0,0,0}, {0,0,0,0, 1,0,0,0, 1,0,0,0, 1,1,0,0}, {0,1,1,0, 0,1,1,0, 0,1,1,0, 0,1,1,0}, {0,0,1,1, 0,1,1,0, 0,1,1,0, 1,1,0,0}, + {0,0,0,1, 0,1,1,1, 1,1,1,0, 1,0,0,0}, {0,0,0,0, 1,1,1,1, 1,1,1,1, 0,0,0,0}, {0,1,1,1, 0,0,0,1, 1,0,0,0, 1,1,1,0}, {0,0,1,1, 1,0,0,1, 1,0,0,1, 1,1,0,0} +};*/ + +static const uint candidateSectionBit[32] = +{ + 0xCCCC, 0x8888, 0xEEEE, 0xECC8, + 0xC880, 0xFEEC, 0xFEC8, 0xEC80, + 0xC800, 0xFFEC, 0xFE80, 0xE800, + 0xFFE8, 0xFF00, 0xFFF0, 0xF000, + 0xF710, 0x008E, 0x7100, 0x08CE, + 0x008C, 0x7310, 0x3100, 0x8CCE, + 0x088C, 0x3110, 0x6666, 0x366C, + 0x17E8, 0x0FF0, 0x718E, 0x399C +}; + +static const uint candidateFixUpIndex1D[32] = +{ + 15,15,15,15, + 15,15,15,15, + 15,15,15,15, + 15,15,15,15, + 15, 2, 8, 2, + 2, 8, 8,15, + 2, 8, 2, 2, + 8, 8, 2, 2 +}; + +//0, 9, 18, 27, 37, 46, 55, 64 +static const uint aStep1[64] = {0,0,0,0,0,1,1,1, + 1,1,1,1,1,1,2,2, + 2,2,2,2,2,2,2,3, + 3,3,3,3,3,3,3,3, + 3,4,4,4,4,4,4,4, + 4,4,5,5,5,5,5,5, + 5,5,5,6,6,6,6,6, + 6,6,6,6,7,7,7,7}; + +//0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 +static const uint aStep2[64] = { 0, 0, 0, 1, 1, 1, 1, 2, + 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 7, 7, 7, + 7, 8, 8, 8, 8, 9, 9, 9, + 9,10,10,10,10,10,11,11, + 11,11,12,12,12,12,13,13, + 13,13,14,14,14,14,15,15}; + +static const float3 RGB2LUM = float3(0.2126f, 0.7152f, 0.0722f); + +#define THREAD_GROUP_SIZE 64 +#define BLOCK_SIZE_Y 4 +#define BLOCK_SIZE_X 4 +#define BLOCK_SIZE (BLOCK_SIZE_Y * BLOCK_SIZE_X) + + +//Forward declaration +uint3 float2half( float3 pixel_f ); +int3 start_quantize( uint3 pixel_h ); +void quantize( inout int2x3 endPoint, uint prec ); +void finish_quantize_0( inout int bBadQuantize, inout int2x3 
endPoint, uint4 prec, bool transformed ); +void finish_quantize_1( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ); +void finish_quantize( out bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ); + +void start_unquantize( inout int2x3 endPoint[2], uint4 prec, bool transformed ); +void start_unquantize( inout int2x3 endPoint, uint4 prec, bool transformed ); +void unquantize( inout int2x3 color, uint prec ); +uint3 finish_unquantize( int3 color ); +void generate_palette_unquantized8( out uint3 palette, int3 low, int3 high, int i ); +void generate_palette_unquantized16( out uint3 palette, int3 low, int3 high, int i ); +float3 half2float(uint3 color_h ); + +void block_package( inout uint4 block, int2x3 endPoint[2], uint mode_type, uint partition_index ); +void block_package( inout uint4 block, int2x3 endPoint, uint mode_type ); + +void swap(inout int3 lhs, inout int3 rhs) +{ + int3 tmp = lhs; + lhs = rhs; + rhs = tmp; +} + +Texture2D g_Input : register( t0 ); +StructuredBuffer g_InBuff : register( t1 ); + +RWStructuredBuffer g_OutBuff : register( u0 ); + +struct SharedData +{ + float3 pixel; + int3 pixel_ph; + float3 pixel_hr; + float pixel_lum; + float error; + uint best_mode; + uint best_partition; + int3 endPoint_low; + int3 endPoint_high; + float endPoint_lum_low; + float endPoint_lum_high; +}; + +groupshared SharedData shared_temp[THREAD_GROUP_SIZE]; + +[numthreads( THREAD_GROUP_SIZE, 1, 1 )] +void TryModeG10CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) +{ + const uint MAX_USED_THREAD = 16; + uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; + uint blockInGroup = GI / MAX_USED_THREAD; + uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; + uint threadBase = blockInGroup * MAX_USED_THREAD; + uint threadInBlock = GI - threadBase; + +#ifndef REF_DEVICE + if (blockID >= g_num_total_blocks) + { + return; + } +#endif + + uint block_y = blockID / g_num_block_x; + uint block_x 
= blockID - block_y * g_num_block_x; + uint base_x = block_x * BLOCK_SIZE_X; + uint base_y = block_y * BLOCK_SIZE_Y; + + if (threadInBlock < 16) + { + shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb; + uint3 pixel_h = float2half( shared_temp[GI].pixel ); + shared_temp[GI].pixel_hr = half2float(pixel_h); + shared_temp[GI].pixel_lum = dot(shared_temp[GI].pixel_hr, RGB2LUM); + shared_temp[GI].pixel_ph = start_quantize( pixel_h ); + + shared_temp[GI].endPoint_low = shared_temp[GI].pixel_ph; + shared_temp[GI].endPoint_high = shared_temp[GI].pixel_ph; + shared_temp[GI].endPoint_lum_low = shared_temp[GI].pixel_lum; + shared_temp[GI].endPoint_lum_high = shared_temp[GI].pixel_lum; + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + if (threadInBlock < 8) + { + if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 8].endPoint_lum_low) + { + shared_temp[GI].endPoint_low = shared_temp[GI + 8].endPoint_low; + shared_temp[GI].endPoint_lum_low = shared_temp[GI + 8].endPoint_lum_low; + } + if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 8].endPoint_lum_high) + { + shared_temp[GI].endPoint_high = shared_temp[GI + 8].endPoint_high; + shared_temp[GI].endPoint_lum_high = shared_temp[GI + 8].endPoint_lum_high; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 4) + { + if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 4].endPoint_lum_low) + { + shared_temp[GI].endPoint_low = shared_temp[GI + 4].endPoint_low; + shared_temp[GI].endPoint_lum_low = shared_temp[GI + 4].endPoint_lum_low; + } + if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 4].endPoint_lum_high) + { + shared_temp[GI].endPoint_high = shared_temp[GI + 4].endPoint_high; + shared_temp[GI].endPoint_lum_high = shared_temp[GI + 4].endPoint_lum_high; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 2) + { + if 
(shared_temp[GI].endPoint_lum_low > shared_temp[GI + 2].endPoint_lum_low) + { + shared_temp[GI].endPoint_low = shared_temp[GI + 2].endPoint_low; + shared_temp[GI].endPoint_lum_low = shared_temp[GI + 2].endPoint_lum_low; + } + if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 2].endPoint_lum_high) + { + shared_temp[GI].endPoint_high = shared_temp[GI + 2].endPoint_high; + shared_temp[GI].endPoint_lum_high = shared_temp[GI + 2].endPoint_lum_high; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 1) + { + if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 1].endPoint_lum_low) + { + shared_temp[GI].endPoint_low = shared_temp[GI + 1].endPoint_low; + shared_temp[GI].endPoint_lum_low = shared_temp[GI + 1].endPoint_lum_low; + } + if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 1].endPoint_lum_high) + { + shared_temp[GI].endPoint_high = shared_temp[GI + 1].endPoint_high; + shared_temp[GI].endPoint_lum_high = shared_temp[GI + 1].endPoint_lum_high; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + //ergod mode_type 11:14 + if ( threadInBlock == 0 ) + { + int2x3 endPoint; + // find_axis + endPoint[0] = shared_temp[threadBase + 0].endPoint_low; + endPoint[1] = shared_temp[threadBase + 0].endPoint_high; + + //compute_index + float3 span = endPoint[1] - endPoint[0];// fixed a bug in v0.2 + float span_norm_sqr = dot( span, span );// fixed a bug in v0.2 + float dotProduct = dot( span, shared_temp[threadBase + 0].pixel_ph - endPoint[0] );// fixed a bug in v0.2 + if ( span_norm_sqr > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr ) > 32 ) + { + swap(endPoint[0], endPoint[1]); + + shared_temp[GI].endPoint_low = endPoint[0]; + shared_temp[GI].endPoint_high = endPoint[1]; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + if (threadInBlock < 4) + { + int2x3 endPoint; + endPoint[0] = shared_temp[threadBase + 0].endPoint_low; + endPoint[1] = 
shared_temp[threadBase + 0].endPoint_high; + + float3 span = endPoint[1] - endPoint[0]; + float span_norm_sqr = dot( span, span ); + + uint4 prec = candidateModePrec[threadInBlock + 10]; + int2x3 endPoint_q = endPoint; + quantize( endPoint_q, prec.x ); + + bool transformed = candidateModeTransformed[threadInBlock + 10]; + if (transformed) + { + endPoint_q[1] -= endPoint_q[0]; + } + + bool bBadQuantize; + finish_quantize( bBadQuantize, endPoint_q, prec, transformed ); + + start_unquantize( endPoint_q, prec, transformed ); + + unquantize( endPoint_q, prec.x ); + + float error = 0; + [loop]for ( uint j = 0; j < 16; j ++ ) + { + float dotProduct = dot( span, shared_temp[threadBase + j].pixel_ph - endPoint[0] );// fixed a bug in v0.2 + uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0 + : ( ( dotProduct < span_norm_sqr ) ? aStep2[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep2[63] ); + + uint3 pixel_rh; + generate_palette_unquantized16( pixel_rh, endPoint_q[0], endPoint_q[1], index ); + float3 pixel_r = half2float( pixel_rh ); + pixel_r -= shared_temp[threadBase + j].pixel_hr; + error += dot(pixel_r, pixel_r); + } + if ( bBadQuantize ) + error = 1e20f; + + shared_temp[GI].error = error; + shared_temp[GI].best_mode = candidateModeFlag[threadInBlock + 10]; + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + if (threadInBlock < 2) + { + if ( shared_temp[GI].error > shared_temp[GI + 2].error ) + { + shared_temp[GI].error = shared_temp[GI + 2].error; + shared_temp[GI].best_mode = shared_temp[GI + 2].best_mode; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 1) + { + if ( shared_temp[GI].error > shared_temp[GI + 1].error ) + { + shared_temp[GI].error = shared_temp[GI + 1].error; + shared_temp[GI].best_mode = shared_temp[GI + 1].best_mode; + } + + g_OutBuff[blockID] = uint4(asuint(shared_temp[GI].error), shared_temp[GI].best_mode, 0, 0); + } +} + +[numthreads( THREAD_GROUP_SIZE, 1, 1 )] +void 
TryModeLE10CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) +{ + const uint MAX_USED_THREAD = 32; + uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; + uint blockInGroup = GI / MAX_USED_THREAD; + uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; + uint threadBase = blockInGroup * MAX_USED_THREAD; + uint threadInBlock = GI - threadBase; + +#ifndef REF_DEVICE + if (blockID >= g_num_total_blocks) + { + return; + } + + if (asfloat(g_InBuff[blockID].x) < 1e-6f) + { + g_OutBuff[blockID] = g_InBuff[blockID]; + return; + } +#endif + + uint block_y = blockID / g_num_block_x; + uint block_x = blockID - block_y * g_num_block_x; + uint base_x = block_x * BLOCK_SIZE_X; + uint base_y = block_y * BLOCK_SIZE_Y; + + if (threadInBlock < 16) + { + shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb; + uint3 pixel_h = float2half( shared_temp[GI].pixel ); + shared_temp[GI].pixel_hr = half2float(pixel_h); + shared_temp[GI].pixel_lum = dot(shared_temp[GI].pixel_hr, RGB2LUM); + shared_temp[GI].pixel_ph = start_quantize( pixel_h ); + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + //ergod mode_type 1:10 + if (threadInBlock < 32) + { + // find_axis + int2x3 endPoint[2]; + endPoint[0][0] = MAX_INT; + endPoint[0][1] = MIN_INT; + endPoint[1][0] = MAX_INT; + endPoint[1][1] = MIN_INT; + + float2 endPoint_lum[2]; + endPoint_lum[0][0] = MAX_FLOAT; + endPoint_lum[0][1] = MIN_FLOAT; + endPoint_lum[1][0] = MAX_FLOAT; + endPoint_lum[1][1] = MIN_FLOAT; + + uint bit = candidateSectionBit[threadInBlock]; + for ( uint i = 0; i < 16; i ++ ) + { + int3 pixel_ph = shared_temp[threadBase + i].pixel_ph; + float pixel_lum = shared_temp[threadBase + i].pixel_lum; + if ( (bit >> i) & 1 ) //It gets error when using "candidateSection" as "endPoint_ph" index + { + if (endPoint_lum[1][0] > pixel_lum) + { + endPoint[1][0] = pixel_ph; + endPoint_lum[1][0] = pixel_lum; + } + if 
(endPoint_lum[1][1] < pixel_lum) + { + endPoint[1][1] = pixel_ph; + endPoint_lum[1][1] = pixel_lum; + } + } + else + { + if (endPoint_lum[0][0] > pixel_lum) + { + endPoint[0][0] = pixel_ph; + endPoint_lum[0][0] = pixel_lum; + } + if (endPoint_lum[0][1] < pixel_lum) + { + endPoint[0][1] = pixel_ph; + endPoint_lum[0][1] = pixel_lum; + } + } + } + + //compute_index + float3 span[2];// fixed a bug in v0.2 + float span_norm_sqr[2];// fixed a bug in v0.2 + [unroll] + for (uint p = 0; p < 2; ++ p) + { + span[p] = endPoint[p][1] - endPoint[p][0]; + span_norm_sqr[p] = dot( span[p], span[p] ); + + float dotProduct = dot( span[p], shared_temp[threadBase + (0 == p ? 0 : candidateFixUpIndex1D[threadInBlock])].pixel_ph - endPoint[p][0] );// fixed a bug in v0.2 + if ( span_norm_sqr[p] > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr[p] ) > 32 ) + { + span[p] = -span[p]; + swap(endPoint[p][0], endPoint[p][1]); + } + } + + uint4 prec = candidateModePrec[g_mode_id]; + int2x3 endPoint_q[2] = endPoint; + quantize( endPoint_q[0], prec.x ); + quantize( endPoint_q[1], prec.x ); + + bool transformed = candidateModeTransformed[g_mode_id]; + if (transformed) + { + endPoint_q[0][1] -= endPoint_q[0][0]; + endPoint_q[1][0] -= endPoint_q[0][0]; + endPoint_q[1][1] -= endPoint_q[0][0]; + } + + int bBadQuantize = 0; + finish_quantize_0( bBadQuantize, endPoint_q[0], prec, transformed ); + finish_quantize_1( bBadQuantize, endPoint_q[1], prec, transformed ); + + start_unquantize( endPoint_q, prec, transformed ); + + unquantize( endPoint_q[0], prec.x ); + unquantize( endPoint_q[1], prec.x ); + + float error = 0; + for ( uint j = 0; j < 16; j ++ ) + { + uint3 pixel_rh; + if ((bit >> j) & 1) + { + float dotProduct = dot( span[1], shared_temp[threadBase + j].pixel_ph - endPoint[1][0] );// fixed a bug in v0.2 + uint index = ( span_norm_sqr[1] <= 0 || dotProduct <= 0 ) ? 0 + : ( ( dotProduct < span_norm_sqr[1] ) ? 
aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr[1] ) ] : aStep1[63] ); + generate_palette_unquantized8( pixel_rh, endPoint_q[1][0], endPoint_q[1][1], index ); + } + else + { + float dotProduct = dot( span[0], shared_temp[threadBase + j].pixel_ph - endPoint[0][0] );// fixed a bug in v0.2 + uint index = ( span_norm_sqr[0] <= 0 || dotProduct <= 0 ) ? 0 + : ( ( dotProduct < span_norm_sqr[0] ) ? aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr[0] ) ] : aStep1[63] ); + generate_palette_unquantized8( pixel_rh, endPoint_q[0][0], endPoint_q[0][1], index ); + } + + float3 pixel_r = half2float( pixel_rh ); + pixel_r -= shared_temp[threadBase + j].pixel_hr; + error += dot(pixel_r, pixel_r); + } + if ( bBadQuantize ) + error = 1e20f; + + shared_temp[GI].error = error; + shared_temp[GI].best_mode = candidateModeFlag[g_mode_id]; + shared_temp[GI].best_partition = threadInBlock; + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + if (threadInBlock < 16) + { + if ( shared_temp[GI].error > shared_temp[GI + 16].error ) + { + shared_temp[GI].error = shared_temp[GI + 16].error; + shared_temp[GI].best_mode = shared_temp[GI + 16].best_mode; + shared_temp[GI].best_partition = shared_temp[GI + 16].best_partition; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 8) + { + if ( shared_temp[GI].error > shared_temp[GI + 8].error ) + { + shared_temp[GI].error = shared_temp[GI + 8].error; + shared_temp[GI].best_mode = shared_temp[GI + 8].best_mode; + shared_temp[GI].best_partition = shared_temp[GI + 8].best_partition; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 4) + { + if ( shared_temp[GI].error > shared_temp[GI + 4].error ) + { + shared_temp[GI].error = shared_temp[GI + 4].error; + shared_temp[GI].best_mode = shared_temp[GI + 4].best_mode; + shared_temp[GI].best_partition = shared_temp[GI + 4].best_partition; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); 
+#endif + if (threadInBlock < 2) + { + if ( shared_temp[GI].error > shared_temp[GI + 2].error ) + { + shared_temp[GI].error = shared_temp[GI + 2].error; + shared_temp[GI].best_mode = shared_temp[GI + 2].best_mode; + shared_temp[GI].best_partition = shared_temp[GI + 2].best_partition; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 1) + { + if ( shared_temp[GI].error > shared_temp[GI + 1].error ) + { + shared_temp[GI].error = shared_temp[GI + 1].error; + shared_temp[GI].best_mode = shared_temp[GI + 1].best_mode; + shared_temp[GI].best_partition = shared_temp[GI + 1].best_partition; + } + + if (asfloat(g_InBuff[blockID].x) > shared_temp[GI].error) + { + g_OutBuff[blockID] = uint4(asuint(shared_temp[GI].error), shared_temp[GI].best_mode, shared_temp[GI].best_partition, 0); + } + else + { + g_OutBuff[blockID] = g_InBuff[blockID]; + } + } +} + +[numthreads( THREAD_GROUP_SIZE, 1, 1 )] +void EncodeBlockCS(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID) +{ + const uint MAX_USED_THREAD = 32; + uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; + uint blockInGroup = GI / MAX_USED_THREAD; + uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; + uint threadBase = blockInGroup * MAX_USED_THREAD; + uint threadInBlock = GI - threadBase; + +#ifndef REF_DEVICE + if (blockID >= g_num_total_blocks) + { + return; + } +#endif + + uint block_y = blockID / g_num_block_x; + uint block_x = blockID - block_y * g_num_block_x; + uint base_x = block_x * BLOCK_SIZE_X; + uint base_y = block_y * BLOCK_SIZE_Y; + + if (threadInBlock < 16) + { + shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb; + shared_temp[GI].pixel_lum = dot(shared_temp[GI].pixel, RGB2LUM); + uint3 pixel_h = float2half( shared_temp[GI].pixel ); + shared_temp[GI].pixel_ph = start_quantize( pixel_h ); + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + uint best_mode = 
g_InBuff[blockID].y; + uint best_partition = g_InBuff[blockID].z; + + uint4 block = 0; + + if (threadInBlock < 32) + { + int2x3 endPoint; + endPoint[0] = MAX_INT; + endPoint[1] = MIN_INT; + + float2 endPoint_lum; + endPoint_lum[0] = MAX_FLOAT; + endPoint_lum[1] = MIN_FLOAT; + + int2 endPoint_lum_index; + endPoint_lum_index[0] = -1; + endPoint_lum_index[1] = -1; + + int3 pixel_ph = shared_temp[threadBase + (threadInBlock & 0xF)].pixel_ph; + float pixel_lum = shared_temp[threadBase + (threadInBlock & 0xF)].pixel_lum; + if (threadInBlock < 16) + { + if (best_mode > 10) + { + endPoint[0] = endPoint[1] = pixel_ph; + endPoint_lum[0] = endPoint_lum[1] = pixel_lum; + } + else + { + uint bits = candidateSectionBit[best_partition]; + if (0 == ((bits >> threadInBlock) & 1)) + { + endPoint[0] = endPoint[1] = pixel_ph; + endPoint_lum[0] = endPoint_lum[1] = pixel_lum; + } + } + } + else + { + if (best_mode <= 10) + { + uint bits = candidateSectionBit[best_partition]; + if (1 == ((bits >> (threadInBlock & 0xF)) & 1)) + { + endPoint[0] = endPoint[1] = pixel_ph; + endPoint_lum[0] = endPoint_lum[1] = pixel_lum; + } + } + } + + shared_temp[GI].endPoint_low = endPoint[0]; + shared_temp[GI].endPoint_high = endPoint[1]; + + shared_temp[GI].endPoint_lum_low = endPoint_lum[0]; + shared_temp[GI].endPoint_lum_high = endPoint_lum[1]; + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if ((threadInBlock & 0xF) < 8) + { + if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 8].endPoint_lum_low) + { + shared_temp[GI].endPoint_low = shared_temp[GI + 8].endPoint_low; + shared_temp[GI].endPoint_lum_low = shared_temp[GI + 8].endPoint_lum_low; + } + if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 8].endPoint_lum_high) + { + shared_temp[GI].endPoint_high = shared_temp[GI + 8].endPoint_high; + shared_temp[GI].endPoint_lum_high = shared_temp[GI + 8].endPoint_lum_high; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if ((threadInBlock & 0xF) < 4) 
+ { + if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 4].endPoint_lum_low) + { + shared_temp[GI].endPoint_low = shared_temp[GI + 4].endPoint_low; + shared_temp[GI].endPoint_lum_low = shared_temp[GI + 4].endPoint_lum_low; + } + if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 4].endPoint_lum_high) + { + shared_temp[GI].endPoint_high = shared_temp[GI + 4].endPoint_high; + shared_temp[GI].endPoint_lum_high = shared_temp[GI + 4].endPoint_lum_high; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if ((threadInBlock & 0xF) < 2) + { + if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 2].endPoint_lum_low) + { + shared_temp[GI].endPoint_low = shared_temp[GI + 2].endPoint_low; + shared_temp[GI].endPoint_lum_low = shared_temp[GI + 2].endPoint_lum_low; + } + if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 2].endPoint_lum_high) + { + shared_temp[GI].endPoint_high = shared_temp[GI + 2].endPoint_high; + shared_temp[GI].endPoint_lum_high = shared_temp[GI + 2].endPoint_lum_high; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if ((threadInBlock & 0xF) < 1) + { + if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 1].endPoint_lum_low) + { + shared_temp[GI].endPoint_low = shared_temp[GI + 1].endPoint_low; + shared_temp[GI].endPoint_lum_low = shared_temp[GI + 1].endPoint_lum_low; + } + if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 1].endPoint_lum_high) + { + shared_temp[GI].endPoint_high = shared_temp[GI + 1].endPoint_high; + shared_temp[GI].endPoint_lum_high = shared_temp[GI + 1].endPoint_lum_high; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + if (threadInBlock < 2) + { + // find_axis + int2x3 endPoint; + endPoint[0] = shared_temp[threadBase + threadInBlock * 16].endPoint_low; + endPoint[1] = shared_temp[threadBase + threadInBlock * 16].endPoint_high; + + uint fixup = 0; + if ((1 == threadInBlock) && (best_mode <= 10)) + { + fixup = 
candidateFixUpIndex1D[best_partition]; + } + + float3 span = endPoint[1] - endPoint[0]; + float span_norm_sqr = dot( span, span ); + float dotProduct = dot( span, shared_temp[threadBase + fixup].pixel_ph - endPoint[0] ); + if ( span_norm_sqr > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr ) > 32 ) + { + swap(endPoint[0], endPoint[1]); + } + + shared_temp[GI].endPoint_low = endPoint[0]; + shared_temp[GI].endPoint_high = endPoint[1]; + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + if (threadInBlock < 16) + { + uint bits; + if (best_mode > 10) + { + bits = 0; + } + else + { + bits = candidateSectionBit[best_partition]; + } + + float3 span; + float dotProduct; + if ((bits >> threadInBlock) & 1) + { + span = shared_temp[threadBase + 1].endPoint_high - shared_temp[threadBase + 1].endPoint_low; + dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel_ph - shared_temp[threadBase + 1].endPoint_low ); + } + else + { + span = shared_temp[threadBase + 0].endPoint_high - shared_temp[threadBase + 0].endPoint_low; + dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel_ph - shared_temp[threadBase + 0].endPoint_low ); + } + float span_norm_sqr = dot( span, span ); + + if (best_mode > 10) + { + uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0 + : ( ( dotProduct < span_norm_sqr ) ? aStep2[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep2[63] ); + if (threadInBlock == 0) + { + block.z |= index << 1; + } + else if (threadInBlock < 8) + { + block.z |= index << (threadInBlock * 4); + } + else + { + block.w |= index << ((threadInBlock - 8) * 4); + } + } + else + { + uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0 + : ( ( dotProduct < span_norm_sqr ) ? 
aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep1[63] ); + + uint fixup = candidateFixUpIndex1D[best_partition]; + int2 offset = int2((fixup != 2), (fixup == 15)); + + if (threadInBlock == 0) + { + block.z |= index << 18; + } + else if (threadInBlock < 3) + { + block.z |= index << (20 + (threadInBlock - 1) * 3); + } + else if (threadInBlock < 5) + { + block.z |= index << (25 + (threadInBlock - 3) * 3 + offset.x); + } + else if (threadInBlock == 5) + { + block.w |= index >> !offset.x; + if (!offset.x) + { + block.z |= index << 31; + } + } + else if (threadInBlock < 9) + { + block.w |= index << (2 + (threadInBlock - 6) * 3 + offset.x); + } + else + { + block.w |= index << (11 + (threadInBlock - 9) * 3 + offset.y); + } + } + + shared_temp[GI].pixel_hr.xy = asfloat(block.zw); + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 8) + { + shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 8].pixel_hr.xy)); + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 4) + { + shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 4].pixel_hr.xy)); + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 2) + { + shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 2].pixel_hr.xy)); + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 1) + { + shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 1].pixel_hr.xy)); + + block.zw = asuint(shared_temp[GI].pixel_hr.xy); + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + bool transformed = candidateModeTransformed[best_mode - 1]; + uint4 prec = candidateModePrec[best_mode - 1]; + if (threadInBlock == 2) + { + int2x3 endPoint_q; + endPoint_q[0] = shared_temp[threadBase + 
0].endPoint_low; + endPoint_q[1] = shared_temp[threadBase + 0].endPoint_high; + + quantize( endPoint_q, prec.x ); + if (transformed) + { + endPoint_q[1] -= endPoint_q[0]; + } + + shared_temp[GI].endPoint_low = endPoint_q[0]; + shared_temp[GI].endPoint_high = endPoint_q[1]; + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock == 3) + { + int3 ep0 = shared_temp[threadBase + 2].endPoint_low; + int2x3 endPoint_q; + endPoint_q[0] = shared_temp[threadBase + 1].endPoint_low; + endPoint_q[1] = shared_temp[threadBase + 1].endPoint_high; + + if (best_mode <= 10) + { + quantize( endPoint_q, prec.x ); + if (transformed) + { + endPoint_q[0] -= ep0; + endPoint_q[1] -= ep0; + } + + shared_temp[GI].endPoint_low = endPoint_q[0]; + shared_temp[GI].endPoint_high = endPoint_q[1]; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + if (threadInBlock < 2) + { + int2x3 endPoint_q; + endPoint_q[0] = shared_temp[threadBase + threadInBlock + 2].endPoint_low; + endPoint_q[1] = shared_temp[threadBase + threadInBlock + 2].endPoint_high; + + int bBadQuantize = 0; + if (threadInBlock == 0) + { + if (best_mode > 10) + { + finish_quantize( bBadQuantize, endPoint_q, prec, transformed ); + } + else + { + finish_quantize_0( bBadQuantize, endPoint_q, prec, transformed ); + } + } + else // if (threadInBlock == 1) + { + if (best_mode <= 10) + { + finish_quantize_1( bBadQuantize, endPoint_q, prec, transformed ); + } + } + + shared_temp[GI].endPoint_low = endPoint_q[0]; + shared_temp[GI].endPoint_high = endPoint_q[1]; + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + if ( threadInBlock == 0 ) + { + int2x3 endPoint_q[2]; + endPoint_q[0][0] = shared_temp[threadBase + 0].endPoint_low; + endPoint_q[0][1] = shared_temp[threadBase + 0].endPoint_high; + endPoint_q[1][0] = shared_temp[threadBase + 1].endPoint_low; + endPoint_q[1][1] = shared_temp[threadBase + 1].endPoint_high; + + if ( best_mode > 10 ) + { + block_package( block, 
endPoint_q[0], best_mode ); + } + else + { + block_package( block, endPoint_q, best_mode, best_partition ); + } + + g_OutBuff[blockID] = block; + } +} + +uint float2half1( float f ) +{ + uint Result; + + uint IValue = asuint(f); + uint Sign = (IValue & 0x80000000U) >> 16U; + IValue = IValue & 0x7FFFFFFFU; + + if (IValue > 0x47FFEFFFU) + { + // The number is too large to be represented as a half. Saturate to infinity. + Result = 0x7FFFU; + } + else + { + if (IValue < 0x38800000U) + { + // The number is too small to be represented as a normalized half. + // Convert it to a denormalized value. + uint Shift = 113U - (IValue >> 23U); + IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized half. + IValue += 0xC8000000U; + } + + Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU; + } + return (Result|Sign); +} + +uint3 float2half( float3 endPoint_f ) +{ + //uint3 sign = asuint(endPoint_f) & 0x80000000; + //uint3 expo = asuint(endPoint_f) & 0x7F800000; + //uint3 base = asuint(endPoint_f) & 0x007FFFFF; + //return ( expo < 0x33800000 ) ? 0 + // //0x33800000 indicating 2^-24, which is minimal denormalized number that half can present + // : ( ( expo < 0x38800000 ) ? ( sign >> 16 ) | ( ( base + 0x00800000 ) >> ( 23 - ( ( expo - 0x33800000 ) >> 23 ) ) )//fixed a bug in v0.2 + // //0x38800000 indicating 2^-14, which is minimal normalized number that half can present, so need to use denormalized half presentation + // : ( ( expo == 0x7F800000 || expo > 0x47000000 ) ? 
( ( sign >> 16 ) | 0x7bff ) + // // treat NaN as INF, treat INF (including NaN) as the maximum/minimum number that half can present + // // 0x47000000 indicating 2^15, which is maximum exponent that half can present, so cut to 0x7bff which is the maximum half number + // : ( ( sign >> 16 ) | ( ( ( expo - 0x38000000 ) | base ) >> 13 ) ) ) ); + + + return uint3( float2half1( endPoint_f.x ), float2half1( endPoint_f.y ), float2half1( endPoint_f.z ) ); +} +/* start_quantize: maps half bit patterns to the integer range the encoder works in. UNSIGNED_F16: (h << 6) / 31. Otherwise: components below 0x8000 use (h << 5) / 31, except 0x7bff which maps to 0x7fff; components with bit 15 set return the negated scaled magnitude. NOTE(review): in the bit-15 branch pixel_h is at least 0x8000, so the comparison with 0x7bff can never select 0xffff8001 - possibly 0xfbff was intended; confirm before changing. */ int3 start_quantize( uint3 pixel_h ) +{ + if ( g_format == UNSIGNED_F16 ) + { + return asint( ( pixel_h << 6 ) / 31 ); + } + else + { + return ( pixel_h < 0x8000 ) ? ( ( pixel_h == 0x7bff ) ? 0x7fff : asint( ( pixel_h << 5 ) / 31 ) )// fixed a bug in v0.2 + : ( ( pixel_h == 0x7bff ) ? 0xffff8001 : -asint( ( ( 0x00007fff & pixel_h ) << 5 ) / 31 ) );// fixed a bug in v0.2 + } +} +/* quantize: reduces both rows of an endpoint pair to prec bits, in place. Zero components, and every component when prec is at least 15 (UNSIGNED_F16) or 16 (other format), are left unchanged. UNSIGNED_F16: 0xFFFF maps to the all-ones code, anything else to (x << prec) >> 16. Other format: handled symmetrically around zero with prec - 1 magnitude bits; a magnitude of 0x7FFF maps to the largest code, anything else to (magnitude << (prec - 1)) >> 15, with the sign reapplied. The asint(0x0000) terms add zero. */ void quantize( inout int2x3 endPoint, uint prec ) +{ + int iprec = asint( prec ); + if ( g_format == UNSIGNED_F16 ) + { + endPoint = ( ( iprec >= 15 ) | ( endPoint == 0 ) ) ? endPoint + : ( ( endPoint == asint(0xFFFF) ) ? ( ( 1 << iprec ) - 1 ) + : ( ( ( endPoint << iprec ) + asint(0x0000) ) >> 16 ) ); + } + else + { + endPoint = ( ( iprec >= 16 ) | ( endPoint == 0 ) ) ? endPoint + : ( ( endPoint >= 0 ) ? ( ( endPoint == asint(0x7FFF) ) ? ( ( 1 << ( iprec - 1 ) ) - 1 ) : ( ( ( endPoint << ( iprec - 1 ) ) + asint(0x0000) ) >> 15 ) ) + : ( ( -endPoint == asint(0x7FFF) ) ? -( ( 1 << ( iprec - 1 ) ) - 1 ) : -( ( ( -endPoint << ( iprec - 1 ) ) + asint(0x0000) ) >> 15 ) ) ); + } +} +/* finish_quantize_0: final range check and bit masking for an endpoint pair whose row 0 is absolute; the kernel above calls it from thread 0 when best_mode is 10 or less. When transformed, row 1 is a signed delta that must fit in prec.yzw bits per channel: any channel outside that range ORs a flag into bBadQuantize and is clamped (positive overflow to the largest positive code, negative overflow to 1 << (prec.yzw - 1)); in-range negative deltas are masked to prec.yzw bits. Row 0 is masked to prec.x bits. When not transformed, both rows are masked to prec.x bits and bBadQuantize is left as passed in. */ void finish_quantize_0( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ) +{ + if ( transformed ) + { + bool3 bBadComponent = ( endPoint[1] >= 0 ) ? ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) + : ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ); + bBadQuantize |= any(bBadComponent); + + endPoint[0] = endPoint[0] & ( ( 1 << prec.x ) - 1 ); + endPoint[1] = ( endPoint[1] >= 0 ) ? ( ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) ? 
( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1] ) + : ( ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ) ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) ) ); + } + else + { + endPoint &= ( ( 1 << prec.x ) - 1 ); + } +} +/* finish_quantize_1: final range check and bit masking for an endpoint pair in which both rows are deltas when transformed; the kernel calls it from thread 1 when best_mode is 10 or less. When transformed, each channel of both rows must fit in prec.yzw signed bits: any channel outside that range ORs a flag into bBadQuantize and is clamped (positive overflow to the largest positive code, negative overflow to 1 << (prec.yzw - 1)); in-range negative values are masked to prec.yzw bits. When not transformed, both rows are masked to prec.x bits and bBadQuantize is left as passed in. */ void finish_quantize_1( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ) +{ + if ( transformed ) + { + bool2x3 bBadComponent; + bBadComponent[0] = ( endPoint[0] >= 0 ) ? ( endPoint[0] >= ( 1 << ( prec.yzw - 1 ) ) ) + : ( -endPoint[0] > ( 1 << ( prec.yzw - 1 ) ) ); + bBadComponent[1] = ( endPoint[1] >= 0 ) ? ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) + : ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ); + bBadQuantize |= any(bBadComponent); + + endPoint[0] = ( endPoint[0] >= 0 ) ? ( ( endPoint[0] >= ( 1 << ( prec.yzw - 1 ) ) ) ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[0] ) + : ( ( -endPoint[0] > ( 1 << ( prec.yzw - 1 ) ) ) ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[0] & ( ( 1 << prec.yzw ) - 1 ) ) ); + endPoint[1] = ( endPoint[1] >= 0 ) ? ( ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1] ) + : ( ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ) ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) ) ); + } + else + { + endPoint &= ( ( 1 << prec.x ) - 1 ); + } +} +/* finish_quantize: variant the kernel calls from thread 0 when best_mode is greater than 10. The endpoint handling matches finish_quantize_0 (row 0 masked to prec.x bits; row 1 range-checked, clamped or masked against prec.yzw when transformed; both rows masked to prec.x otherwise). The difference is that bBadQuantize is an out parameter that is assigned rather than OR-ed: it receives any(out-of-range channel) when transformed and 0 otherwise. NOTE(review): the visible caller passes an int variable for this out bool parameter - relies on implicit conversion; confirm that is intended. */ void finish_quantize( out bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ) +{ + if ( transformed ) + { + bool3 bBadComponent; + bBadComponent = ( endPoint[1] >= 0 ) ? ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) + : ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ); + bBadQuantize = any( bBadComponent ); + + endPoint[0] = endPoint[0] & ( ( 1 << prec.x ) - 1 ); + endPoint[1] = ( endPoint[1] >= 0 ) ? ( ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1] ) + : ( ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ) ? 
( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) ) ); + } + else + { + endPoint &= ( ( 1 << prec.x ) - 1 ); + + bBadQuantize = 0; + } +} + +/* SIGN_EXTEND: sign-extends each channel of color from a prec-bit field, in place. If bit (prec - 1) is set, the bits below it are kept and 1 << (prec - 1) is subtracted; otherwise the channel is unchanged. */ void SIGN_EXTEND( uint3 prec, inout int3 color ) +{ + uint3 p = 1 << (prec - 1); + color = (color & p) ? (color & (p - 1)) - p : color; +} + +/* sign_extend (single pair): row 0 is sign-extended from prec.x bits only for SIGNED_F16; row 1 is sign-extended from prec.yzw bits for SIGNED_F16 or when transformed. */ void sign_extend( bool transformed, uint4 prec, inout int2x3 endPoint ) +{ + if ( g_format == SIGNED_F16 ) + SIGN_EXTEND( prec.x, endPoint[0] ); + if ( g_format == SIGNED_F16 || transformed ) + SIGN_EXTEND( prec.yzw, endPoint[1] ); +} + +/* sign_extend (two pairs): endPoint[0][0] is sign-extended from prec.x bits only for SIGNED_F16; the remaining three rows are sign-extended from prec.yzw bits for SIGNED_F16 or when transformed. */ void sign_extend( bool transformed, uint4 prec, inout int2x3 endPoint[2] ) +{ + if ( g_format == SIGNED_F16 ) + SIGN_EXTEND( prec.x, endPoint[0][0] ); + if ( g_format == SIGNED_F16 || transformed ) + { + SIGN_EXTEND( prec.yzw, endPoint[0][1] ); + SIGN_EXTEND( prec.yzw, endPoint[1][0] ); + SIGN_EXTEND( prec.yzw, endPoint[1][1] ); + } +} +/* start_unquantize (two pairs): sign-extends the stored fields, then, when transformed, converts the three delta rows back to absolute values by adding endPoint[0][0] to each. */ void start_unquantize( inout int2x3 endPoint[2], uint4 prec, bool transformed ) +{ + sign_extend( transformed, prec, endPoint ); + if ( transformed ) + { + endPoint[0][1] += endPoint[0][0]; + endPoint[1][0] += endPoint[0][0]; + endPoint[1][1] += endPoint[0][0]; + } +} +/* start_unquantize (single pair): sign-extends the stored fields, then, when transformed, adds row 0 to the delta in row 1. */ void start_unquantize( inout int2x3 endPoint, uint4 prec, bool transformed ) +{ + sign_extend( transformed, prec, endPoint ); + if ( transformed ) + endPoint[1] += endPoint[0]; +} +/* unquantize: expands prec-bit endpoint codes back to the 16-bit working range, in place. UNSIGNED_F16 with prec below 15: zero stays zero, the all-ones code becomes 0xFFFF, any other code c becomes ((c << 16) plus 0x8000) >> prec. Other format with prec below 16: the sign is remembered and the magnitude m is processed - zero stays zero, m at or above (1 << (prec - 1)) - 1 becomes 0x7FFF, otherwise ((m << 15) plus 0x4000) >> (prec - 1) - then the sign is reapplied. At or above those precisions the values are left unchanged. */ void unquantize( inout int2x3 color, uint prec ) +{ + int iprec = asint( prec ); + if (g_format == UNSIGNED_F16 ) + { + if (prec < 15) + { + color = (color != 0) ? (color == ((1 << iprec) - 1) ? 0xFFFF : (((color << 16) + 0x8000) >> iprec)) : color; + } + } + else + { + if (prec < 16) + { + uint2x3 s = color >= 0 ? 0 : 1; + color = abs(color); + color = (color != 0) ? (color >= ((1 << (iprec - 1)) - 1) ? 0x7FFF : (((color << 15) + 0x4000) >> (iprec - 1))) : color; + color = s > 0 ? -color : color; + } + } +} +/* finish_unquantize: scales a working-range color to the bit patterns returned to the caller. UNSIGNED_F16: (c * 31) >> 6. Other format: the magnitude is scaled by 31 and shifted right by 5 with the sign preserved, then negative results are rewritten as magnitude OR 0x8000 (sign in bit 15). The result is reinterpreted as uint3. */ uint3 finish_unquantize( int3 color ) +{ + if ( g_format == UNSIGNED_F16 ) + color = ( color * 31 ) >> 6; + else + { + color = ( color < 0 ) ? 
-( ( -color * 31 ) >> 5 ) : ( color * 31 ) >> 5; + color = ( color < 0 ) ? ( ( -color ) | 0x8000 ) : color; + } + return asuint(color); +} +/* generate_palette_unquantized8: computes palette entry i from an 8-entry weight table (i must index aWeight3, i.e. 0 to 7). low and high are blended with weights (64 - w) and w, 32 is added for rounding before the shift right by 6, and the result is converted with finish_unquantize. */ void generate_palette_unquantized8( out uint3 palette, int3 low, int3 high, int i ) +{ + static const int aWeight3[] = {0, 9, 18, 27, 37, 46, 55, 64}; + + int3 tmp = ( low * ( 64 - aWeight3[i] ) + high * aWeight3[i] + 32 ) >> 6; + palette = finish_unquantize( tmp ); +} +/* generate_palette_unquantized16: computes palette entry i from a 16-entry weight table (i must index aWeight4, i.e. 0 to 15). low and high are blended with weights (64 - w) and w, 32 is added for rounding before the shift right by 6, and the result is converted with finish_unquantize. */ void generate_palette_unquantized16( out uint3 palette, int3 low, int3 high, int i ) +{ + static const int aWeight4[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}; + + int3 tmp = ( low * ( 64 - aWeight4[i] ) + high * aWeight4[i] + 32 ) >> 6; + palette = finish_unquantize( tmp ); +} + +/* half2float1: converts a half bit pattern in the low 16 bits of Value to a float. A non-zero exponent field is copied as is; a zero exponent with non-zero mantissa is renormalized by shifting the mantissa left until bit 10 is set while decrementing the exponent; zero uses exponent -112 so that the rebias by 112 (127 minus 15) produces a zero exponent field. Sign moves from bit 15 to bit 31 and the 10-bit mantissa is shifted up by 13. An exponent field of 0x1F gets no special treatment here. */ float half2float1( uint Value ) +{ + uint Mantissa = (uint)(Value & 0x03FF); + + uint Exponent; + if ((Value & 0x7C00) != 0) // The value is normalized + { + Exponent = (uint)((Value >> 10) & 0x1F); + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x0400) == 0); + + Mantissa &= 0x03FF; + } + else // The value is zero + { + Exponent = (uint)(-112); + } + + uint Result = ((Value & 0x8000) << 16) | // Sign + ((Exponent + 112) << 23) | // Exponent + (Mantissa << 13); // Mantissa + + return asfloat(Result); +} + +/* half2float: per-channel half to float conversion; each component is passed through half2float1. The commented-out expression inside the body is a disabled vectorized alternative. */ float3 half2float(uint3 color_h ) +{ + //uint3 sign = color_h & 0x8000; + //uint3 expo = color_h & 0x7C00; + //uint3 base = color_h & 0x03FF; + //return ( expo == 0 ) ? 
asfloat( ( sign << 16 ) | asuint( float3(base) / 16777216 ) ) //16777216 = 2^24 + // : asfloat( ( sign << 16 ) | ( ( ( expo + 0x1C000 ) | base ) << 13 ) ); //0x1C000 = 0x1FC00 - 0x3C00 + + return float3( half2float1( color_h.x ), half2float1( color_h.y ), half2float1( color_h.z ) ); +} + +void block_package( inout uint4 block, int2x3 endPoint[2], uint mode_type, uint partition_index ) // for mode 1 - 10 +{ + block.xy = 0; + block.z &= 0xFFFC0000; + + //block.z |= (partition_index & 0x1f) << 13; + + if ( mode_type == candidateModeFlag[0]) + { + /*block.x = candidateModeMemory[0]; + block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 ); + block.x |= ( endPoint[1][0].g >> 2 ) & 0x00000004; + block.x |= ( endPoint[1][0].b >> 1 ) & 0x00000008; + block.x |= endPoint[1][1].b & 0x00000010; + block.y |= ( ( endPoint[0][0].b >> 7 ) & 0x00000007 ); + block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 ); + block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E); + block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000; + block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 ); + block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80); + block.yz |= ( ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000) ) | ( ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040) ); + block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/ + + block.x |= ((candidateModeMemory[0] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[0] >> 1) & 1) << 1; + block.x |= ((endPoint[1][0].g >> 4) & 1) << 2; + block.x |= ((endPoint[1][0].b >> 4) & 1) << 3; + block.x |= ((endPoint[1][1].b >> 4) & 1) << 4; + block.x |= ((endPoint[0][0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0][0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0][0].r >> 2) & 1) << 7; + 
block.x |= ((endPoint[0][0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0][0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0][0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0][0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0][0].r >> 7) & 1) << 12; + block.x |= ((endPoint[0][0].r >> 8) & 1) << 13; + block.x |= ((endPoint[0][0].r >> 9) & 1) << 14; + block.x |= ((endPoint[0][0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0][0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0][0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0][0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0][0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0][0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0][0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0][0].g >> 7) & 1) << 22; + block.x |= ((endPoint[0][0].g >> 8) & 1) << 23; + block.x |= ((endPoint[0][0].g >> 9) & 1) << 24; + block.x |= ((endPoint[0][0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0][0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0][0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0][0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0][0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0][0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0][0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0][0].b >> 7) & 1) << 0; + block.y |= ((endPoint[0][0].b >> 8) & 1) << 1; + block.y |= ((endPoint[0][0].b >> 9) & 1) << 2; + block.y |= ((endPoint[0][1].r >> 0) & 1) << 3; + block.y |= ((endPoint[0][1].r >> 1) & 1) << 4; + block.y |= ((endPoint[0][1].r >> 2) & 1) << 5; + block.y |= ((endPoint[0][1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0][1].r >> 4) & 1) << 7; + block.y |= ((endPoint[1][1].g >> 4) & 1) << 8; + block.y |= ((endPoint[1][0].g >> 0) & 1) << 9; + block.y |= ((endPoint[1][0].g >> 1) & 1) << 10; + block.y |= ((endPoint[1][0].g >> 2) & 1) << 11; + block.y |= ((endPoint[1][0].g >> 3) & 1) << 12; + block.y |= ((endPoint[0][1].g >> 0) & 1) << 13; + block.y |= ((endPoint[0][1].g >> 1) & 1) << 14; + block.y |= ((endPoint[0][1].g >> 2) & 1) << 15; + block.y |= 
((endPoint[0][1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0][1].g >> 4) & 1) << 17; + block.y |= ((endPoint[1][1].b >> 0) & 1) << 18; + block.y |= ((endPoint[1][1].g >> 0) & 1) << 19; + block.y |= ((endPoint[1][1].g >> 1) & 1) << 20; + block.y |= ((endPoint[1][1].g >> 2) & 1) << 21; + block.y |= ((endPoint[1][1].g >> 3) & 1) << 22; + block.y |= ((endPoint[0][1].b >> 0) & 1) << 23; + block.y |= ((endPoint[0][1].b >> 1) & 1) << 24; + block.y |= ((endPoint[0][1].b >> 2) & 1) << 25; + block.y |= ((endPoint[0][1].b >> 3) & 1) << 26; + block.y |= ((endPoint[0][1].b >> 4) & 1) << 27; + block.y |= ((endPoint[1][1].b >> 1) & 1) << 28; + block.y |= ((endPoint[1][0].b >> 0) & 1) << 29; + block.y |= ((endPoint[1][0].b >> 1) & 1) << 30; + block.y |= ((endPoint[1][0].b >> 2) & 1) << 31; + block.z |= ((endPoint[1][0].b >> 3) & 1) << 0; + block.z |= ((endPoint[1][0].r >> 0) & 1) << 1; + block.z |= ((endPoint[1][0].r >> 1) & 1) << 2; + block.z |= ((endPoint[1][0].r >> 2) & 1) << 3; + block.z |= ((endPoint[1][0].r >> 3) & 1) << 4; + block.z |= ((endPoint[1][0].r >> 4) & 1) << 5; + block.z |= ((endPoint[1][1].b >> 2) & 1) << 6; + block.z |= ((endPoint[1][1].r >> 0) & 1) << 7; + block.z |= ((endPoint[1][1].r >> 1) & 1) << 8; + block.z |= ((endPoint[1][1].r >> 2) & 1) << 9; + block.z |= ((endPoint[1][1].r >> 3) & 1) << 10; + block.z |= ((endPoint[1][1].r >> 4) & 1) << 11; + block.z |= ((endPoint[1][1].b >> 3) & 1) << 12; + block.z |= ((partition_index >> 0) & 1) << 13; + block.z |= ((partition_index >> 1) & 1) << 14; + block.z |= ((partition_index >> 2) & 1) << 15; + block.z |= ((partition_index >> 3) & 1) << 16; + block.z |= ((partition_index >> 4) & 1) << 17; + } + else if ( mode_type == candidateModeFlag[1]) + { + /*block.x = candidateModeMemory[1]; + block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00000FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x003F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 ); + block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 
) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 ); + block.x |= ( ( endPoint[1][0].g >> 3 ) & 0x00000004 ) | ( ( endPoint[1][0].g << 20 ) & 0x01000000 ); + block.x |= ( endPoint[1][1].g >> 1 ) & 0x00000018; + block.x |= ( ( endPoint[1][1].b << 21 ) & 0x00800000 ) | ( ( endPoint[1][1].b << 12 ) & 0x00003000 ); + block.x |= ( ( endPoint[1][0].b << 17 ) & 0x00400000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 ); + block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E); + block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000; + block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80); + block.y |= ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ) | ( ( endPoint[1][1].b >> 3 ) & 0x00000001 ); + block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/ + + block.x |= ((candidateModeMemory[1] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[1] >> 1) & 1) << 1; + block.x |= ((endPoint[1][0].g >> 5) & 1) << 2; + block.x |= ((endPoint[1][1].g >> 4) & 1) << 3; + block.x |= ((endPoint[1][1].g >> 5) & 1) << 4; + block.x |= ((endPoint[0][0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0][0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0][0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0][0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0][0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0][0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0][0].r >> 6) & 1) << 11; + block.x |= ((endPoint[1][1].b >> 0) & 1) << 12; + block.x |= ((endPoint[1][1].b >> 1) & 1) << 13; + block.x |= ((endPoint[1][0].b >> 4) & 1) << 14; + block.x |= ((endPoint[0][0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0][0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0][0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0][0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0][0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0][0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0][0].g >> 6) & 1) << 21; + block.x |= ((endPoint[1][0].b >> 5) & 1) 
<< 22; + block.x |= ((endPoint[1][1].b >> 2) & 1) << 23; + block.x |= ((endPoint[1][0].g >> 4) & 1) << 24; + block.x |= ((endPoint[0][0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0][0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0][0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0][0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0][0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0][0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0][0].b >> 6) & 1) << 31; + block.y |= ((endPoint[1][1].b >> 3) & 1) << 0; + block.y |= ((endPoint[1][1].b >> 5) & 1) << 1; + block.y |= ((endPoint[1][1].b >> 4) & 1) << 2; + block.y |= ((endPoint[0][1].r >> 0) & 1) << 3; + block.y |= ((endPoint[0][1].r >> 1) & 1) << 4; + block.y |= ((endPoint[0][1].r >> 2) & 1) << 5; + block.y |= ((endPoint[0][1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0][1].r >> 4) & 1) << 7; + block.y |= ((endPoint[0][1].r >> 5) & 1) << 8; + block.y |= ((endPoint[1][0].g >> 0) & 1) << 9; + block.y |= ((endPoint[1][0].g >> 1) & 1) << 10; + block.y |= ((endPoint[1][0].g >> 2) & 1) << 11; + block.y |= ((endPoint[1][0].g >> 3) & 1) << 12; + block.y |= ((endPoint[0][1].g >> 0) & 1) << 13; + block.y |= ((endPoint[0][1].g >> 1) & 1) << 14; + block.y |= ((endPoint[0][1].g >> 2) & 1) << 15; + block.y |= ((endPoint[0][1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0][1].g >> 4) & 1) << 17; + block.y |= ((endPoint[0][1].g >> 5) & 1) << 18; + block.y |= ((endPoint[1][1].g >> 0) & 1) << 19; + block.y |= ((endPoint[1][1].g >> 1) & 1) << 20; + block.y |= ((endPoint[1][1].g >> 2) & 1) << 21; + block.y |= ((endPoint[1][1].g >> 3) & 1) << 22; + block.y |= ((endPoint[0][1].b >> 0) & 1) << 23; + block.y |= ((endPoint[0][1].b >> 1) & 1) << 24; + block.y |= ((endPoint[0][1].b >> 2) & 1) << 25; + block.y |= ((endPoint[0][1].b >> 3) & 1) << 26; + block.y |= ((endPoint[0][1].b >> 4) & 1) << 27; + block.y |= ((endPoint[0][1].b >> 5) & 1) << 28; + block.y |= ((endPoint[1][0].b >> 0) & 1) << 29; + block.y |= ((endPoint[1][0].b >> 1) & 1) << 30; + 
block.y |= ((endPoint[1][0].b >> 2) & 1) << 31; + block.z |= ((endPoint[1][0].b >> 3) & 1) << 0; + block.z |= ((endPoint[1][0].r >> 0) & 1) << 1; + block.z |= ((endPoint[1][0].r >> 1) & 1) << 2; + block.z |= ((endPoint[1][0].r >> 2) & 1) << 3; + block.z |= ((endPoint[1][0].r >> 3) & 1) << 4; + block.z |= ((endPoint[1][0].r >> 4) & 1) << 5; + block.z |= ((endPoint[1][0].r >> 5) & 1) << 6; + block.z |= ((endPoint[1][1].r >> 0) & 1) << 7; + block.z |= ((endPoint[1][1].r >> 1) & 1) << 8; + block.z |= ((endPoint[1][1].r >> 2) & 1) << 9; + block.z |= ((endPoint[1][1].r >> 3) & 1) << 10; + block.z |= ((endPoint[1][1].r >> 4) & 1) << 11; + block.z |= ((endPoint[1][1].r >> 5) & 1) << 12; + block.z |= ((partition_index >> 0) & 1) << 13; + block.z |= ((partition_index >> 1) & 1) << 14; + block.z |= ((partition_index >> 2) & 1) << 15; + block.z |= ((partition_index >> 3) & 1) << 16; + block.z |= ((partition_index >> 4) & 1) << 17; + } + else if ( mode_type == candidateModeFlag[2]) + { + /*block.x = candidateModeMemory[2]; + block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 ); + block.y |= ( endPoint[0][0].r >> 2 ) & 0x00000100; + block.y |= ( endPoint[0][0].g << 7 ) & 0x00020000; + block.y |= ( ( endPoint[0][0].b << 17 ) & 0x08000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 ); + block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x07800000 ); + block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E); + block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000; + block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80); + block.yz |= ( ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000) ) | ( ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040) ); + block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/ + + block.x |= 
((candidateModeMemory[2] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[2] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[2] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[2] >> 3) & 1) << 3; + block.x |= ((candidateModeMemory[2] >> 4) & 1) << 4; + block.x |= ((endPoint[0][0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0][0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0][0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0][0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0][0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0][0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0][0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0][0].r >> 7) & 1) << 12; + block.x |= ((endPoint[0][0].r >> 8) & 1) << 13; + block.x |= ((endPoint[0][0].r >> 9) & 1) << 14; + block.x |= ((endPoint[0][0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0][0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0][0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0][0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0][0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0][0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0][0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0][0].g >> 7) & 1) << 22; + block.x |= ((endPoint[0][0].g >> 8) & 1) << 23; + block.x |= ((endPoint[0][0].g >> 9) & 1) << 24; + block.x |= ((endPoint[0][0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0][0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0][0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0][0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0][0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0][0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0][0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0][0].b >> 7) & 1) << 0; + block.y |= ((endPoint[0][0].b >> 8) & 1) << 1; + block.y |= ((endPoint[0][0].b >> 9) & 1) << 2; + block.y |= ((endPoint[0][1].r >> 0) & 1) << 3; + block.y |= ((endPoint[0][1].r >> 1) & 1) << 4; + block.y |= ((endPoint[0][1].r >> 2) & 1) << 5; + block.y |= ((endPoint[0][1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0][1].r >> 4) & 1) << 7; 
+ block.y |= ((endPoint[0][0].r >> 10) & 1) << 8; + block.y |= ((endPoint[1][0].g >> 0) & 1) << 9; + block.y |= ((endPoint[1][0].g >> 1) & 1) << 10; + block.y |= ((endPoint[1][0].g >> 2) & 1) << 11; + block.y |= ((endPoint[1][0].g >> 3) & 1) << 12; + block.y |= ((endPoint[0][1].g >> 0) & 1) << 13; + block.y |= ((endPoint[0][1].g >> 1) & 1) << 14; + block.y |= ((endPoint[0][1].g >> 2) & 1) << 15; + block.y |= ((endPoint[0][1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0][0].g >> 10) & 1) << 17; + block.y |= ((endPoint[1][1].b >> 0) & 1) << 18; + block.y |= ((endPoint[1][1].g >> 0) & 1) << 19; + block.y |= ((endPoint[1][1].g >> 1) & 1) << 20; + block.y |= ((endPoint[1][1].g >> 2) & 1) << 21; + block.y |= ((endPoint[1][1].g >> 3) & 1) << 22; + block.y |= ((endPoint[0][1].b >> 0) & 1) << 23; + block.y |= ((endPoint[0][1].b >> 1) & 1) << 24; + block.y |= ((endPoint[0][1].b >> 2) & 1) << 25; + block.y |= ((endPoint[0][1].b >> 3) & 1) << 26; + block.y |= ((endPoint[0][0].b >> 10) & 1) << 27; + block.y |= ((endPoint[1][1].b >> 1) & 1) << 28; + block.y |= ((endPoint[1][0].b >> 0) & 1) << 29; + block.y |= ((endPoint[1][0].b >> 1) & 1) << 30; + block.y |= ((endPoint[1][0].b >> 2) & 1) << 31; + block.z |= ((endPoint[1][0].b >> 3) & 1) << 0; + block.z |= ((endPoint[1][0].r >> 0) & 1) << 1; + block.z |= ((endPoint[1][0].r >> 1) & 1) << 2; + block.z |= ((endPoint[1][0].r >> 2) & 1) << 3; + block.z |= ((endPoint[1][0].r >> 3) & 1) << 4; + block.z |= ((endPoint[1][0].r >> 4) & 1) << 5; + block.z |= ((endPoint[1][1].b >> 2) & 1) << 6; + block.z |= ((endPoint[1][1].r >> 0) & 1) << 7; + block.z |= ((endPoint[1][1].r >> 1) & 1) << 8; + block.z |= ((endPoint[1][1].r >> 2) & 1) << 9; + block.z |= ((endPoint[1][1].r >> 3) & 1) << 10; + block.z |= ((endPoint[1][1].r >> 4) & 1) << 11; + block.z |= ((endPoint[1][1].b >> 3) & 1) << 12; + block.z |= ((partition_index >> 0) & 1) << 13; + block.z |= ((partition_index >> 1) & 1) << 14; + block.z |= ((partition_index >> 2) & 1) << 15; + block.z 
|= ((partition_index >> 3) & 1) << 16; + block.z |= ((partition_index >> 4) & 1) << 17; + } + else if ( mode_type == candidateModeFlag[3]) + { + /*block.x = candidateModeMemory[3]; + block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 ); + block.y |= ( endPoint[0][0].r >> 3 ) & 0x00000080; + block.y |= ( endPoint[0][0].g << 8 ) & 0x00040000; + block.y |= ( ( endPoint[0][0].b << 17 ) & 0x08000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 ); + block.y |= ( ( endPoint[0][1].r << 3 ) & 0x00000078 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x07800000 ); + block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000001E); + block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000; + block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 ); + block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000780); + block.yz |= ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000); + block.z |= ( ( endPoint[1][0].g << 7 ) & 0x00000800 ); + block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001; + block.z |= ( endPoint[1][1].b << 4 ) & 0x00000040; + block.z |= ( endPoint[1][1].b << 5 ) & 0x00000020;*/ + + block.x |= ((candidateModeMemory[3] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[3] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[3] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[3] >> 3) & 1) << 3; + block.x |= ((candidateModeMemory[3] >> 4) & 1) << 4; + block.x |= ((endPoint[0][0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0][0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0][0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0][0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0][0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0][0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0][0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0][0].r >> 7) & 1) << 12; + block.x |= ((endPoint[0][0].r >> 8) & 1) << 13; + 
block.x |= ((endPoint[0][0].r >> 9) & 1) << 14; + block.x |= ((endPoint[0][0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0][0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0][0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0][0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0][0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0][0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0][0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0][0].g >> 7) & 1) << 22; + block.x |= ((endPoint[0][0].g >> 8) & 1) << 23; + block.x |= ((endPoint[0][0].g >> 9) & 1) << 24; + block.x |= ((endPoint[0][0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0][0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0][0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0][0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0][0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0][0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0][0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0][0].b >> 7) & 1) << 0; + block.y |= ((endPoint[0][0].b >> 8) & 1) << 1; + block.y |= ((endPoint[0][0].b >> 9) & 1) << 2; + block.y |= ((endPoint[0][1].r >> 0) & 1) << 3; + block.y |= ((endPoint[0][1].r >> 1) & 1) << 4; + block.y |= ((endPoint[0][1].r >> 2) & 1) << 5; + block.y |= ((endPoint[0][1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0][0].r >> 10) & 1) << 7; + block.y |= ((endPoint[1][1].g >> 4) & 1) << 8; + block.y |= ((endPoint[1][0].g >> 0) & 1) << 9; + block.y |= ((endPoint[1][0].g >> 1) & 1) << 10; + block.y |= ((endPoint[1][0].g >> 2) & 1) << 11; + block.y |= ((endPoint[1][0].g >> 3) & 1) << 12; + block.y |= ((endPoint[0][1].g >> 0) & 1) << 13; + block.y |= ((endPoint[0][1].g >> 1) & 1) << 14; + block.y |= ((endPoint[0][1].g >> 2) & 1) << 15; + block.y |= ((endPoint[0][1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0][1].g >> 4) & 1) << 17; + block.y |= ((endPoint[0][0].g >> 10) & 1) << 18; + block.y |= ((endPoint[1][1].g >> 0) & 1) << 19; + block.y |= ((endPoint[1][1].g >> 1) & 1) << 20; + block.y |= ((endPoint[1][1].g >> 2) & 1) << 21; + block.y 
|= ((endPoint[1][1].g >> 3) & 1) << 22; + block.y |= ((endPoint[0][1].b >> 0) & 1) << 23; + block.y |= ((endPoint[0][1].b >> 1) & 1) << 24; + block.y |= ((endPoint[0][1].b >> 2) & 1) << 25; + block.y |= ((endPoint[0][1].b >> 3) & 1) << 26; + block.y |= ((endPoint[0][0].b >> 10) & 1) << 27; + block.y |= ((endPoint[1][1].b >> 1) & 1) << 28; + block.y |= ((endPoint[1][0].b >> 0) & 1) << 29; + block.y |= ((endPoint[1][0].b >> 1) & 1) << 30; + block.y |= ((endPoint[1][0].b >> 2) & 1) << 31; + block.z |= ((endPoint[1][0].b >> 3) & 1) << 0; + block.z |= ((endPoint[1][0].r >> 0) & 1) << 1; + block.z |= ((endPoint[1][0].r >> 1) & 1) << 2; + block.z |= ((endPoint[1][0].r >> 2) & 1) << 3; + block.z |= ((endPoint[1][0].r >> 3) & 1) << 4; + block.z |= ((endPoint[1][1].b >> 0) & 1) << 5; + block.z |= ((endPoint[1][1].b >> 2) & 1) << 6; + block.z |= ((endPoint[1][1].r >> 0) & 1) << 7; + block.z |= ((endPoint[1][1].r >> 1) & 1) << 8; + block.z |= ((endPoint[1][1].r >> 2) & 1) << 9; + block.z |= ((endPoint[1][1].r >> 3) & 1) << 10; + block.z |= ((endPoint[1][0].g >> 4) & 1) << 11; + block.z |= ((endPoint[1][1].b >> 3) & 1) << 12; + block.z |= ((partition_index >> 0) & 1) << 13; + block.z |= ((partition_index >> 1) & 1) << 14; + block.z |= ((partition_index >> 2) & 1) << 15; + block.z |= ((partition_index >> 3) & 1) << 16; + block.z |= ((partition_index >> 4) & 1) << 17; + } + else if ( mode_type == candidateModeFlag[4]) + { + /*block.x = candidateModeMemory[4]; + block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 ); + block.y |= ( endPoint[0][0].r >> 3 ) & 0x00000080; + block.y |= ( endPoint[0][0].g << 7 ) & 0x00020000; + block.y |= ( ( endPoint[0][0].b << 18 ) & 0x10000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 ); + block.y |= ( ( endPoint[0][1].r << 3 ) & 0x00000078 ) | ( ( endPoint[0][1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 ); + block.y |= ( ( 
endPoint[1][0].g << 9 ) & 0x00001E00 ) | ( ( endPoint[1][0].b << 4 ) & 0x00000100 ); + block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000; + block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000780); + block.yz |= ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000060); + block.z |= ( endPoint[1][0].r << 1 ) & 0x0000001E; + block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001; + block.z |= ( ( endPoint[1][1].b << 7 ) & 0x00000800 ) | ( ( endPoint[1][1].b << 9 ) & 0x00001000 );*/ + + block.x |= ((candidateModeMemory[4] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[4] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[4] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[4] >> 3) & 1) << 3; + block.x |= ((candidateModeMemory[4] >> 4) & 1) << 4; + block.x |= ((endPoint[0][0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0][0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0][0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0][0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0][0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0][0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0][0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0][0].r >> 7) & 1) << 12; + block.x |= ((endPoint[0][0].r >> 8) & 1) << 13; + block.x |= ((endPoint[0][0].r >> 9) & 1) << 14; + block.x |= ((endPoint[0][0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0][0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0][0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0][0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0][0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0][0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0][0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0][0].g >> 7) & 1) << 22; + block.x |= ((endPoint[0][0].g >> 8) & 1) << 23; + block.x |= ((endPoint[0][0].g >> 9) & 1) << 24; + block.x |= ((endPoint[0][0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0][0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0][0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0][0].b >> 3) & 1) << 28; + block.x 
|= ((endPoint[0][0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0][0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0][0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0][0].b >> 7) & 1) << 0; + block.y |= ((endPoint[0][0].b >> 8) & 1) << 1; + block.y |= ((endPoint[0][0].b >> 9) & 1) << 2; + block.y |= ((endPoint[0][1].r >> 0) & 1) << 3; + block.y |= ((endPoint[0][1].r >> 1) & 1) << 4; + block.y |= ((endPoint[0][1].r >> 2) & 1) << 5; + block.y |= ((endPoint[0][1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0][0].r >> 10) & 1) << 7; + block.y |= ((endPoint[1][0].b >> 4) & 1) << 8; + block.y |= ((endPoint[1][0].g >> 0) & 1) << 9; + block.y |= ((endPoint[1][0].g >> 1) & 1) << 10; + block.y |= ((endPoint[1][0].g >> 2) & 1) << 11; + block.y |= ((endPoint[1][0].g >> 3) & 1) << 12; + block.y |= ((endPoint[0][1].g >> 0) & 1) << 13; + block.y |= ((endPoint[0][1].g >> 1) & 1) << 14; + block.y |= ((endPoint[0][1].g >> 2) & 1) << 15; + block.y |= ((endPoint[0][1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0][0].g >> 10) & 1) << 17; + block.y |= ((endPoint[1][1].b >> 0) & 1) << 18; + block.y |= ((endPoint[1][1].g >> 0) & 1) << 19; + block.y |= ((endPoint[1][1].g >> 1) & 1) << 20; + block.y |= ((endPoint[1][1].g >> 2) & 1) << 21; + block.y |= ((endPoint[1][1].g >> 3) & 1) << 22; + block.y |= ((endPoint[0][1].b >> 0) & 1) << 23; + block.y |= ((endPoint[0][1].b >> 1) & 1) << 24; + block.y |= ((endPoint[0][1].b >> 2) & 1) << 25; + block.y |= ((endPoint[0][1].b >> 3) & 1) << 26; + block.y |= ((endPoint[0][1].b >> 4) & 1) << 27; + block.y |= ((endPoint[0][0].b >> 10) & 1) << 28; + block.y |= ((endPoint[1][0].b >> 0) & 1) << 29; + block.y |= ((endPoint[1][0].b >> 1) & 1) << 30; + block.y |= ((endPoint[1][0].b >> 2) & 1) << 31; + block.z |= ((endPoint[1][0].b >> 3) & 1) << 0; + block.z |= ((endPoint[1][0].r >> 0) & 1) << 1; + block.z |= ((endPoint[1][0].r >> 1) & 1) << 2; + block.z |= ((endPoint[1][0].r >> 2) & 1) << 3; + block.z |= ((endPoint[1][0].r >> 3) & 1) << 4; + block.z |= 
((endPoint[1][1].b >> 1) & 1) << 5; + block.z |= ((endPoint[1][1].b >> 2) & 1) << 6; + block.z |= ((endPoint[1][1].r >> 0) & 1) << 7; + block.z |= ((endPoint[1][1].r >> 1) & 1) << 8; + block.z |= ((endPoint[1][1].r >> 2) & 1) << 9; + block.z |= ((endPoint[1][1].r >> 3) & 1) << 10; + block.z |= ((endPoint[1][1].b >> 4) & 1) << 11; + block.z |= ((endPoint[1][1].b >> 3) & 1) << 12; + block.z |= ((partition_index >> 0) & 1) << 13; + block.z |= ((partition_index >> 1) & 1) << 14; + block.z |= ((partition_index >> 2) & 1) << 15; + block.z |= ((partition_index >> 3) & 1) << 16; + block.z |= ((partition_index >> 4) & 1) << 17; + } + else if ( mode_type == candidateModeFlag[5]) + { + /*block.x = candidateModeMemory[5]; + block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00003FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x00FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000); + block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000003; + block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 ); + block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 ); + block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E); + block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000; + block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ); + block.y |= ( ( endPoint[1][1].b << 27 ) & 0x10000000 ); + block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80); + block.yz |= ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040); + block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001; + block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 );*/ + + block.x |= ((candidateModeMemory[5] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[5] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[5] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[5] >> 3) & 1) << 3; + block.x |= 
((candidateModeMemory[5] >> 4) & 1) << 4; + block.x |= ((endPoint[0][0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0][0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0][0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0][0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0][0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0][0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0][0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0][0].r >> 7) & 1) << 12; + block.x |= ((endPoint[0][0].r >> 8) & 1) << 13; + block.x |= ((endPoint[1][0].b >> 4) & 1) << 14; + block.x |= ((endPoint[0][0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0][0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0][0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0][0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0][0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0][0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0][0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0][0].g >> 7) & 1) << 22; + block.x |= ((endPoint[0][0].g >> 8) & 1) << 23; + block.x |= ((endPoint[1][0].g >> 4) & 1) << 24; + block.x |= ((endPoint[0][0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0][0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0][0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0][0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0][0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0][0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0][0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0][0].b >> 7) & 1) << 0; + block.y |= ((endPoint[0][0].b >> 8) & 1) << 1; + block.y |= ((endPoint[1][1].b >> 4) & 1) << 2; + block.y |= ((endPoint[0][1].r >> 0) & 1) << 3; + block.y |= ((endPoint[0][1].r >> 1) & 1) << 4; + block.y |= ((endPoint[0][1].r >> 2) & 1) << 5; + block.y |= ((endPoint[0][1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0][1].r >> 4) & 1) << 7; + block.y |= ((endPoint[1][1].g >> 4) & 1) << 8; + block.y |= ((endPoint[1][0].g >> 0) & 1) << 9; + block.y |= ((endPoint[1][0].g >> 1) & 1) << 10; + block.y |= ((endPoint[1][0].g >> 2) & 1) << 11; + block.y |= 
((endPoint[1][0].g >> 3) & 1) << 12; + block.y |= ((endPoint[0][1].g >> 0) & 1) << 13; + block.y |= ((endPoint[0][1].g >> 1) & 1) << 14; + block.y |= ((endPoint[0][1].g >> 2) & 1) << 15; + block.y |= ((endPoint[0][1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0][1].g >> 4) & 1) << 17; + block.y |= ((endPoint[1][1].b >> 0) & 1) << 18; + block.y |= ((endPoint[1][1].g >> 0) & 1) << 19; + block.y |= ((endPoint[1][1].g >> 1) & 1) << 20; + block.y |= ((endPoint[1][1].g >> 2) & 1) << 21; + block.y |= ((endPoint[1][1].g >> 3) & 1) << 22; + block.y |= ((endPoint[0][1].b >> 0) & 1) << 23; + block.y |= ((endPoint[0][1].b >> 1) & 1) << 24; + block.y |= ((endPoint[0][1].b >> 2) & 1) << 25; + block.y |= ((endPoint[0][1].b >> 3) & 1) << 26; + block.y |= ((endPoint[0][1].b >> 4) & 1) << 27; + block.y |= ((endPoint[1][1].b >> 1) & 1) << 28; + block.y |= ((endPoint[1][0].b >> 0) & 1) << 29; + block.y |= ((endPoint[1][0].b >> 1) & 1) << 30; + block.y |= ((endPoint[1][0].b >> 2) & 1) << 31; + block.z |= ((endPoint[1][0].b >> 3) & 1) << 0; + block.z |= ((endPoint[1][0].r >> 0) & 1) << 1; + block.z |= ((endPoint[1][0].r >> 1) & 1) << 2; + block.z |= ((endPoint[1][0].r >> 2) & 1) << 3; + block.z |= ((endPoint[1][0].r >> 3) & 1) << 4; + block.z |= ((endPoint[1][0].r >> 4) & 1) << 5; + block.z |= ((endPoint[1][1].b >> 2) & 1) << 6; + block.z |= ((endPoint[1][1].r >> 0) & 1) << 7; + block.z |= ((endPoint[1][1].r >> 1) & 1) << 8; + block.z |= ((endPoint[1][1].r >> 2) & 1) << 9; + block.z |= ((endPoint[1][1].r >> 3) & 1) << 10; + block.z |= ((endPoint[1][1].r >> 4) & 1) << 11; + block.z |= ((endPoint[1][1].b >> 3) & 1) << 12; + block.z |= ((partition_index >> 0) & 1) << 13; + block.z |= ((partition_index >> 1) & 1) << 14; + block.z |= ((partition_index >> 2) & 1) << 15; + block.z |= ((partition_index >> 3) & 1) << 16; + block.z |= ((partition_index >> 4) & 1) << 17; + } + else if ( mode_type == candidateModeFlag[6]) + { + /*block.x = candidateModeMemory[6]; + block.x |= ( ( endPoint[0][0].r 
<< 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 ); + block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001; + block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 ); + block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000); + block.x |= ( ( endPoint[1][1].g << 9 ) & 0x00002000 ) | ( ( endPoint[1][1].b << 21 ) & 0x00800000); + block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E); + block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000; + block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80); + block.y |= ( ( endPoint[1][1].b >> 2 ) & 0x00000006 ); + block.y |= ( ( endPoint[1][1].b << 27 ) & 0x10000000 ) | ( ( endPoint[1][1].b << 18 ) & 0x00040000 ); + block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/ + + block.x |= ((candidateModeMemory[6] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[6] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[6] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[6] >> 3) & 1) << 3; + block.x |= ((candidateModeMemory[6] >> 4) & 1) << 4; + block.x |= ((endPoint[0][0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0][0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0][0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0][0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0][0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0][0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0][0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0][0].r >> 7) & 1) << 12; + block.x |= ((endPoint[1][1].g >> 4) & 1) << 13; + block.x |= ((endPoint[1][0].b >> 4) & 1) << 14; + block.x |= ((endPoint[0][0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0][0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0][0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0][0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0][0].g >> 4) & 1) << 19; + block.x |= 
((endPoint[0][0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0][0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0][0].g >> 7) & 1) << 22; + block.x |= ((endPoint[1][1].b >> 2) & 1) << 23; + block.x |= ((endPoint[1][0].g >> 4) & 1) << 24; + block.x |= ((endPoint[0][0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0][0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0][0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0][0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0][0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0][0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0][0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0][0].b >> 7) & 1) << 0; + block.y |= ((endPoint[1][1].b >> 3) & 1) << 1; + block.y |= ((endPoint[1][1].b >> 4) & 1) << 2; + block.y |= ((endPoint[0][1].r >> 0) & 1) << 3; + block.y |= ((endPoint[0][1].r >> 1) & 1) << 4; + block.y |= ((endPoint[0][1].r >> 2) & 1) << 5; + block.y |= ((endPoint[0][1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0][1].r >> 4) & 1) << 7; + block.y |= ((endPoint[0][1].r >> 5) & 1) << 8; + block.y |= ((endPoint[1][0].g >> 0) & 1) << 9; + block.y |= ((endPoint[1][0].g >> 1) & 1) << 10; + block.y |= ((endPoint[1][0].g >> 2) & 1) << 11; + block.y |= ((endPoint[1][0].g >> 3) & 1) << 12; + block.y |= ((endPoint[0][1].g >> 0) & 1) << 13; + block.y |= ((endPoint[0][1].g >> 1) & 1) << 14; + block.y |= ((endPoint[0][1].g >> 2) & 1) << 15; + block.y |= ((endPoint[0][1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0][1].g >> 4) & 1) << 17; + block.y |= ((endPoint[1][1].b >> 0) & 1) << 18; + block.y |= ((endPoint[1][1].g >> 0) & 1) << 19; + block.y |= ((endPoint[1][1].g >> 1) & 1) << 20; + block.y |= ((endPoint[1][1].g >> 2) & 1) << 21; + block.y |= ((endPoint[1][1].g >> 3) & 1) << 22; + block.y |= ((endPoint[0][1].b >> 0) & 1) << 23; + block.y |= ((endPoint[0][1].b >> 1) & 1) << 24; + block.y |= ((endPoint[0][1].b >> 2) & 1) << 25; + block.y |= ((endPoint[0][1].b >> 3) & 1) << 26; + block.y |= ((endPoint[0][1].b >> 4) & 1) << 27; + block.y |= 
((endPoint[1][1].b >> 1) & 1) << 28; + block.y |= ((endPoint[1][0].b >> 0) & 1) << 29; + block.y |= ((endPoint[1][0].b >> 1) & 1) << 30; + block.y |= ((endPoint[1][0].b >> 2) & 1) << 31; + block.z |= ((endPoint[1][0].b >> 3) & 1) << 0; + block.z |= ((endPoint[1][0].r >> 0) & 1) << 1; + block.z |= ((endPoint[1][0].r >> 1) & 1) << 2; + block.z |= ((endPoint[1][0].r >> 2) & 1) << 3; + block.z |= ((endPoint[1][0].r >> 3) & 1) << 4; + block.z |= ((endPoint[1][0].r >> 4) & 1) << 5; + block.z |= ((endPoint[1][0].r >> 5) & 1) << 6; + block.z |= ((endPoint[1][1].r >> 0) & 1) << 7; + block.z |= ((endPoint[1][1].r >> 1) & 1) << 8; + block.z |= ((endPoint[1][1].r >> 2) & 1) << 9; + block.z |= ((endPoint[1][1].r >> 3) & 1) << 10; + block.z |= ((endPoint[1][1].r >> 4) & 1) << 11; + block.z |= ((endPoint[1][1].r >> 5) & 1) << 12; + block.z |= ((partition_index >> 0) & 1) << 13; + block.z |= ((partition_index >> 1) & 1) << 14; + block.z |= ((partition_index >> 2) & 1) << 15; + block.z |= ((partition_index >> 3) & 1) << 16; + block.z |= ((partition_index >> 4) & 1) << 17; + } + else if ( mode_type == candidateModeFlag[7]) + { + /*block.x = candidateModeMemory[7]; + block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 ); + block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001; + block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 ); + block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 ); + block.x |= ( ( endPoint[1][0].g << 18 ) & 0x00800000 ); + block.x |= ( ( endPoint[1][1].b << 13 ) & 0x00002000 ); + block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E); + block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80); + block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000; + block.y |= ( ( endPoint[1][1].g >> 4 ) 
& 0x00000002 ) | ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ); + block.y |= ( endPoint[1][1].b << 27 ) & 0x10000000; + block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001; + block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 ) | ( ( endPoint[1][1].b << 4 ) & 0x00000040 );*/ + + block.x |= ((candidateModeMemory[7] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[7] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[7] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[7] >> 3) & 1) << 3; + block.x |= ((candidateModeMemory[7] >> 4) & 1) << 4; + block.x |= ((endPoint[0][0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0][0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0][0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0][0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0][0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0][0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0][0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0][0].r >> 7) & 1) << 12; + block.x |= ((endPoint[1][1].b >> 0) & 1) << 13; + block.x |= ((endPoint[1][0].b >> 4) & 1) << 14; + block.x |= ((endPoint[0][0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0][0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0][0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0][0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0][0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0][0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0][0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0][0].g >> 7) & 1) << 22; + block.x |= ((endPoint[1][0].g >> 5) & 1) << 23; + block.x |= ((endPoint[1][0].g >> 4) & 1) << 24; + block.x |= ((endPoint[0][0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0][0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0][0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0][0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0][0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0][0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0][0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0][0].b >> 7) & 1) << 0; + block.y 
|= ((endPoint[1][1].g >> 5) & 1) << 1; + block.y |= ((endPoint[1][1].b >> 4) & 1) << 2; + block.y |= ((endPoint[0][1].r >> 0) & 1) << 3; + block.y |= ((endPoint[0][1].r >> 1) & 1) << 4; + block.y |= ((endPoint[0][1].r >> 2) & 1) << 5; + block.y |= ((endPoint[0][1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0][1].r >> 4) & 1) << 7; + block.y |= ((endPoint[1][1].g >> 4) & 1) << 8; + block.y |= ((endPoint[1][0].g >> 0) & 1) << 9; + block.y |= ((endPoint[1][0].g >> 1) & 1) << 10; + block.y |= ((endPoint[1][0].g >> 2) & 1) << 11; + block.y |= ((endPoint[1][0].g >> 3) & 1) << 12; + block.y |= ((endPoint[0][1].g >> 0) & 1) << 13; + block.y |= ((endPoint[0][1].g >> 1) & 1) << 14; + block.y |= ((endPoint[0][1].g >> 2) & 1) << 15; + block.y |= ((endPoint[0][1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0][1].g >> 4) & 1) << 17; + block.y |= ((endPoint[0][1].g >> 5) & 1) << 18; + block.y |= ((endPoint[1][1].g >> 0) & 1) << 19; + block.y |= ((endPoint[1][1].g >> 1) & 1) << 20; + block.y |= ((endPoint[1][1].g >> 2) & 1) << 21; + block.y |= ((endPoint[1][1].g >> 3) & 1) << 22; + block.y |= ((endPoint[0][1].b >> 0) & 1) << 23; + block.y |= ((endPoint[0][1].b >> 1) & 1) << 24; + block.y |= ((endPoint[0][1].b >> 2) & 1) << 25; + block.y |= ((endPoint[0][1].b >> 3) & 1) << 26; + block.y |= ((endPoint[0][1].b >> 4) & 1) << 27; + block.y |= ((endPoint[1][1].b >> 1) & 1) << 28; + block.y |= ((endPoint[1][0].b >> 0) & 1) << 29; + block.y |= ((endPoint[1][0].b >> 1) & 1) << 30; + block.y |= ((endPoint[1][0].b >> 2) & 1) << 31; + block.z |= ((endPoint[1][0].b >> 3) & 1) << 0; + block.z |= ((endPoint[1][0].r >> 0) & 1) << 1; + block.z |= ((endPoint[1][0].r >> 1) & 1) << 2; + block.z |= ((endPoint[1][0].r >> 2) & 1) << 3; + block.z |= ((endPoint[1][0].r >> 3) & 1) << 4; + block.z |= ((endPoint[1][0].r >> 4) & 1) << 5; + block.z |= ((endPoint[1][1].b >> 2) & 1) << 6; + block.z |= ((endPoint[1][1].r >> 0) & 1) << 7; + block.z |= ((endPoint[1][1].r >> 1) & 1) << 8; + block.z |= 
((endPoint[1][1].r >> 2) & 1) << 9; + block.z |= ((endPoint[1][1].r >> 3) & 1) << 10; + block.z |= ((endPoint[1][1].r >> 4) & 1) << 11; + block.z |= ((endPoint[1][1].b >> 3) & 1) << 12; + block.z |= ((partition_index >> 0) & 1) << 13; + block.z |= ((partition_index >> 1) & 1) << 14; + block.z |= ((partition_index >> 2) & 1) << 15; + block.z |= ((partition_index >> 3) & 1) << 16; + block.z |= ((partition_index >> 4) & 1) << 17; + } + else if ( mode_type == candidateModeFlag[8]) + { + /*block.x = candidateModeMemory[8]; + block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 ); + block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001; + block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 ); + block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 ); + block.x |= ( ( endPoint[1][0].b << 18 ) & 0x00800000 ); + block.x |= ( endPoint[1][1].b << 12 ) & 0x00002000; + block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000; + block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ); + block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E); + block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80); + block.y |= ( endPoint[1][1].b << 18 ) & 0x00040000; + block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001; + block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 ) | ( ( endPoint[1][1].b << 4 ) & 0x00000040 );*/ + + block.x |= ((candidateModeMemory[8] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[8] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[8] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[8] >> 3) & 1) << 3; + block.x |= ((candidateModeMemory[8] >> 4) & 1) << 4; + block.x |= ((endPoint[0][0].r >> 0) & 1) << 5; + 
block.x |= ((endPoint[0][0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0][0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0][0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0][0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0][0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0][0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0][0].r >> 7) & 1) << 12; + block.x |= ((endPoint[1][1].b >> 1) & 1) << 13; + block.x |= ((endPoint[1][0].b >> 4) & 1) << 14; + block.x |= ((endPoint[0][0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0][0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0][0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0][0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0][0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0][0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0][0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0][0].g >> 7) & 1) << 22; + block.x |= ((endPoint[1][0].b >> 5) & 1) << 23; + block.x |= ((endPoint[1][0].g >> 4) & 1) << 24; + block.x |= ((endPoint[0][0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0][0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0][0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0][0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0][0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0][0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0][0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0][0].b >> 7) & 1) << 0; + block.y |= ((endPoint[1][1].b >> 5) & 1) << 1; + block.y |= ((endPoint[1][1].b >> 4) & 1) << 2; + block.y |= ((endPoint[0][1].r >> 0) & 1) << 3; + block.y |= ((endPoint[0][1].r >> 1) & 1) << 4; + block.y |= ((endPoint[0][1].r >> 2) & 1) << 5; + block.y |= ((endPoint[0][1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0][1].r >> 4) & 1) << 7; + block.y |= ((endPoint[1][1].g >> 4) & 1) << 8; + block.y |= ((endPoint[1][0].g >> 0) & 1) << 9; + block.y |= ((endPoint[1][0].g >> 1) & 1) << 10; + block.y |= ((endPoint[1][0].g >> 2) & 1) << 11; + block.y |= ((endPoint[1][0].g >> 3) & 1) << 12; + block.y |= ((endPoint[0][1].g >> 0) & 1) << 13; + block.y |= 
((endPoint[0][1].g >> 1) & 1) << 14; + block.y |= ((endPoint[0][1].g >> 2) & 1) << 15; + block.y |= ((endPoint[0][1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0][1].g >> 4) & 1) << 17; + block.y |= ((endPoint[1][1].b >> 0) & 1) << 18; + block.y |= ((endPoint[1][1].g >> 0) & 1) << 19; + block.y |= ((endPoint[1][1].g >> 1) & 1) << 20; + block.y |= ((endPoint[1][1].g >> 2) & 1) << 21; + block.y |= ((endPoint[1][1].g >> 3) & 1) << 22; + block.y |= ((endPoint[0][1].b >> 0) & 1) << 23; + block.y |= ((endPoint[0][1].b >> 1) & 1) << 24; + block.y |= ((endPoint[0][1].b >> 2) & 1) << 25; + block.y |= ((endPoint[0][1].b >> 3) & 1) << 26; + block.y |= ((endPoint[0][1].b >> 4) & 1) << 27; + block.y |= ((endPoint[0][1].b >> 5) & 1) << 28; + block.y |= ((endPoint[1][0].b >> 0) & 1) << 29; + block.y |= ((endPoint[1][0].b >> 1) & 1) << 30; + block.y |= ((endPoint[1][0].b >> 2) & 1) << 31; + block.z |= ((endPoint[1][0].b >> 3) & 1) << 0; + block.z |= ((endPoint[1][0].r >> 0) & 1) << 1; + block.z |= ((endPoint[1][0].r >> 1) & 1) << 2; + block.z |= ((endPoint[1][0].r >> 2) & 1) << 3; + block.z |= ((endPoint[1][0].r >> 3) & 1) << 4; + block.z |= ((endPoint[1][0].r >> 4) & 1) << 5; + block.z |= ((endPoint[1][1].b >> 2) & 1) << 6; + block.z |= ((endPoint[1][1].r >> 0) & 1) << 7; + block.z |= ((endPoint[1][1].r >> 1) & 1) << 8; + block.z |= ((endPoint[1][1].r >> 2) & 1) << 9; + block.z |= ((endPoint[1][1].r >> 3) & 1) << 10; + block.z |= ((endPoint[1][1].r >> 4) & 1) << 11; + block.z |= ((endPoint[1][1].b >> 3) & 1) << 12; + block.z |= ((partition_index >> 0) & 1) << 13; + block.z |= ((partition_index >> 1) & 1) << 14; + block.z |= ((partition_index >> 2) & 1) << 15; + block.z |= ((partition_index >> 3) & 1) << 16; + block.z |= ((partition_index >> 4) & 1) << 17; + } + else if ( mode_type == candidateModeFlag[9]) + { + /*block.x = candidateModeMemory[9]; + block.x |= ( ( endPoint[0][0].r << 5 ) & 0x000007E0 ) | ( ( endPoint[0][0].g << 15 ) & 0x001F8000 ) | ( ( endPoint[0][0].b << 25 ) & 
0x7E000000 ); + block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 ); + block.x |= ( ( endPoint[1][0].g << 16 ) & 0x00200000 ) | ( ( endPoint[1][0].g << 20 ) & 0x01000000 ); + block.x |= ( ( endPoint[1][0].b << 17 ) & 0x00400000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 ); + block.x |= ( ( endPoint[1][1].b << 21 ) & 0x00800000 ) | ( ( endPoint[1][1].b << 12 ) & 0x00003000 ); + block.x |= ( ( endPoint[1][1].g << 26 ) & 0x80000000 ) | ( ( endPoint[1][1].g << 7 ) & 0x00000800 ); + block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E); + block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80); + block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000; + block.y |= ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ) | ( ( endPoint[1][1].b >> 3 ) & 0x00000001 ); + block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/ + + block.x |= ((candidateModeMemory[9] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[9] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[9] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[9] >> 3) & 1) << 3; + block.x |= ((candidateModeMemory[9] >> 4) & 1) << 4; + block.x |= ((endPoint[0][0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0][0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0][0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0][0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0][0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0][0].r >> 5) & 1) << 10; + block.x |= ((endPoint[1][1].g >> 4) & 1) << 11; + block.x |= ((endPoint[1][1].b >> 0) & 1) << 12; + block.x |= ((endPoint[1][1].b >> 1) & 1) << 13; + block.x |= ((endPoint[1][0].b >> 4) & 1) << 14; + block.x |= ((endPoint[0][0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0][0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0][0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0][0].g >> 3) & 1) << 18; + block.x |= 
((endPoint[0][0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0][0].g >> 5) & 1) << 20; + block.x |= ((endPoint[1][0].g >> 5) & 1) << 21; + block.x |= ((endPoint[1][0].b >> 5) & 1) << 22; + block.x |= ((endPoint[1][1].b >> 2) & 1) << 23; + block.x |= ((endPoint[1][0].g >> 4) & 1) << 24; + block.x |= ((endPoint[0][0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0][0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0][0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0][0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0][0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0][0].b >> 5) & 1) << 30; + block.x |= ((endPoint[1][1].g >> 5) & 1) << 31; + block.y |= ((endPoint[1][1].b >> 3) & 1) << 0; + block.y |= ((endPoint[1][1].b >> 5) & 1) << 1; + block.y |= ((endPoint[1][1].b >> 4) & 1) << 2; + block.y |= ((endPoint[0][1].r >> 0) & 1) << 3; + block.y |= ((endPoint[0][1].r >> 1) & 1) << 4; + block.y |= ((endPoint[0][1].r >> 2) & 1) << 5; + block.y |= ((endPoint[0][1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0][1].r >> 4) & 1) << 7; + block.y |= ((endPoint[0][1].r >> 5) & 1) << 8; + block.y |= ((endPoint[1][0].g >> 0) & 1) << 9; + block.y |= ((endPoint[1][0].g >> 1) & 1) << 10; + block.y |= ((endPoint[1][0].g >> 2) & 1) << 11; + block.y |= ((endPoint[1][0].g >> 3) & 1) << 12; + block.y |= ((endPoint[0][1].g >> 0) & 1) << 13; + block.y |= ((endPoint[0][1].g >> 1) & 1) << 14; + block.y |= ((endPoint[0][1].g >> 2) & 1) << 15; + block.y |= ((endPoint[0][1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0][1].g >> 4) & 1) << 17; + block.y |= ((endPoint[0][1].g >> 5) & 1) << 18; + block.y |= ((endPoint[1][1].g >> 0) & 1) << 19; + block.y |= ((endPoint[1][1].g >> 1) & 1) << 20; + block.y |= ((endPoint[1][1].g >> 2) & 1) << 21; + block.y |= ((endPoint[1][1].g >> 3) & 1) << 22; + block.y |= ((endPoint[0][1].b >> 0) & 1) << 23; + block.y |= ((endPoint[0][1].b >> 1) & 1) << 24; + block.y |= ((endPoint[0][1].b >> 2) & 1) << 25; + block.y |= ((endPoint[0][1].b >> 3) & 1) << 26; + block.y |= 
((endPoint[0][1].b >> 4) & 1) << 27; + block.y |= ((endPoint[0][1].b >> 5) & 1) << 28; + block.y |= ((endPoint[1][0].b >> 0) & 1) << 29; + block.y |= ((endPoint[1][0].b >> 1) & 1) << 30; + block.y |= ((endPoint[1][0].b >> 2) & 1) << 31; + block.z |= ((endPoint[1][0].b >> 3) & 1) << 0; + block.z |= ((endPoint[1][0].r >> 0) & 1) << 1; + block.z |= ((endPoint[1][0].r >> 1) & 1) << 2; + block.z |= ((endPoint[1][0].r >> 2) & 1) << 3; + block.z |= ((endPoint[1][0].r >> 3) & 1) << 4; + block.z |= ((endPoint[1][0].r >> 4) & 1) << 5; + block.z |= ((endPoint[1][0].r >> 5) & 1) << 6; + block.z |= ((endPoint[1][1].r >> 0) & 1) << 7; + block.z |= ((endPoint[1][1].r >> 1) & 1) << 8; + block.z |= ((endPoint[1][1].r >> 2) & 1) << 9; + block.z |= ((endPoint[1][1].r >> 3) & 1) << 10; + block.z |= ((endPoint[1][1].r >> 4) & 1) << 11; + block.z |= ((endPoint[1][1].r >> 5) & 1) << 12; + block.z |= ((partition_index >> 0) & 1) << 13; + block.z |= ((partition_index >> 1) & 1) << 14; + block.z |= ((partition_index >> 2) & 1) << 15; + block.z |= ((partition_index >> 3) & 1) << 16; + block.z |= ((partition_index >> 4) & 1) << 17; + } +} +void block_package( inout uint4 block, int2x3 endPoint, uint mode_type ) // for mode 11 - 14 +{ + /*block.x = ( ( endPoint[0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0].b << 25 ) & 0xFE000000 ); + block.y |= ( endPoint[0].b >> 7 ) & 0x00000007;*/ + + block.xy = 0; + block.z &= 0xFFFFFFFE; + + + if ( mode_type == candidateModeFlag[10]) + { + /* block.x |= candidateModeMemory[10]; + block.y |= ( ( endPoint[1].r << 3 ) & 0x00001FF8 ) | ( ( endPoint[1].g << 13 ) & 0x007FE000 ) | ( ( endPoint[1].b << 23 ) & 0xFF800000 ); + block.z |= ( endPoint[1].b >> 9 ) & 0x00000001;*/ + + block.x |= ((candidateModeMemory[10] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[10] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[10] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[10] >> 3) & 1) << 3; + block.x |= 
((candidateModeMemory[10] >> 4) & 1) << 4; + block.x |= ((endPoint[0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0].r >> 7) & 1) << 12; + block.x |= ((endPoint[0].r >> 8) & 1) << 13; + block.x |= ((endPoint[0].r >> 9) & 1) << 14; + block.x |= ((endPoint[0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0].g >> 7) & 1) << 22; + block.x |= ((endPoint[0].g >> 8) & 1) << 23; + block.x |= ((endPoint[0].g >> 9) & 1) << 24; + block.x |= ((endPoint[0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0].b >> 7) & 1) << 0; + block.y |= ((endPoint[0].b >> 8) & 1) << 1; + block.y |= ((endPoint[0].b >> 9) & 1) << 2; + block.y |= ((endPoint[1].r >> 0) & 1) << 3; + block.y |= ((endPoint[1].r >> 1) & 1) << 4; + block.y |= ((endPoint[1].r >> 2) & 1) << 5; + block.y |= ((endPoint[1].r >> 3) & 1) << 6; + block.y |= ((endPoint[1].r >> 4) & 1) << 7; + block.y |= ((endPoint[1].r >> 5) & 1) << 8; + block.y |= ((endPoint[1].r >> 6) & 1) << 9; + block.y |= ((endPoint[1].r >> 7) & 1) << 10; + block.y |= ((endPoint[1].r >> 8) & 1) << 11; + block.y |= ((endPoint[1].r >> 9) & 1) << 12; + block.y |= ((endPoint[1].g >> 0) & 1) << 13; + block.y |= ((endPoint[1].g >> 1) & 1) << 
14; + block.y |= ((endPoint[1].g >> 2) & 1) << 15; + block.y |= ((endPoint[1].g >> 3) & 1) << 16; + block.y |= ((endPoint[1].g >> 4) & 1) << 17; + block.y |= ((endPoint[1].g >> 5) & 1) << 18; + block.y |= ((endPoint[1].g >> 6) & 1) << 19; + block.y |= ((endPoint[1].g >> 7) & 1) << 20; + block.y |= ((endPoint[1].g >> 8) & 1) << 21; + block.y |= ((endPoint[1].g >> 9) & 1) << 22; + block.y |= ((endPoint[1].b >> 0) & 1) << 23; + block.y |= ((endPoint[1].b >> 1) & 1) << 24; + block.y |= ((endPoint[1].b >> 2) & 1) << 25; + block.y |= ((endPoint[1].b >> 3) & 1) << 26; + block.y |= ((endPoint[1].b >> 4) & 1) << 27; + block.y |= ((endPoint[1].b >> 5) & 1) << 28; + block.y |= ((endPoint[1].b >> 6) & 1) << 29; + block.y |= ((endPoint[1].b >> 7) & 1) << 30; + block.y |= ((endPoint[1].b >> 8) & 1) << 31; + block.z |= ((endPoint[1].b >> 9) & 1) << 0; + } + else if (mode_type == candidateModeFlag[11]) + { + /*block.x |= candidateModeMemory[11]; + block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 ) | ( ( endPoint[0].g << 12 ) & 0x00400000 ); + block.y |= ( ( endPoint[1].r << 3 ) & 0x00000FF8 ) | ( ( endPoint[1].g << 13 ) & 0x003FE000 ) | ( ( endPoint[1].b << 23 ) & 0xFF800000 ); + block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;*/ + + block.x |= ((candidateModeMemory[11] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[11] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[11] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[11] >> 3) & 1) << 3; + block.x |= ((candidateModeMemory[11] >> 4) & 1) << 4; + block.x |= ((endPoint[0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0].r >> 7) & 1) << 12; + block.x |= ((endPoint[0].r >> 8) & 1) << 13; + block.x |= ((endPoint[0].r >> 9) & 1) << 14; + block.x |= 
((endPoint[0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0].g >> 7) & 1) << 22; + block.x |= ((endPoint[0].g >> 8) & 1) << 23; + block.x |= ((endPoint[0].g >> 9) & 1) << 24; + block.x |= ((endPoint[0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0].b >> 7) & 1) << 0; + block.y |= ((endPoint[0].b >> 8) & 1) << 1; + block.y |= ((endPoint[0].b >> 9) & 1) << 2; + block.y |= ((endPoint[1].r >> 0) & 1) << 3; + block.y |= ((endPoint[1].r >> 1) & 1) << 4; + block.y |= ((endPoint[1].r >> 2) & 1) << 5; + block.y |= ((endPoint[1].r >> 3) & 1) << 6; + block.y |= ((endPoint[1].r >> 4) & 1) << 7; + block.y |= ((endPoint[1].r >> 5) & 1) << 8; + block.y |= ((endPoint[1].r >> 6) & 1) << 9; + block.y |= ((endPoint[1].r >> 7) & 1) << 10; + block.y |= ((endPoint[1].r >> 8) & 1) << 11; + block.y |= ((endPoint[0].r >> 10) & 1) << 12; + block.y |= ((endPoint[1].g >> 0) & 1) << 13; + block.y |= ((endPoint[1].g >> 1) & 1) << 14; + block.y |= ((endPoint[1].g >> 2) & 1) << 15; + block.y |= ((endPoint[1].g >> 3) & 1) << 16; + block.y |= ((endPoint[1].g >> 4) & 1) << 17; + block.y |= ((endPoint[1].g >> 5) & 1) << 18; + block.y |= ((endPoint[1].g >> 6) & 1) << 19; + block.y |= ((endPoint[1].g >> 7) & 1) << 20; + block.y |= ((endPoint[1].g >> 8) & 1) << 21; + block.y |= ((endPoint[0].g >> 10) & 1) << 22; + block.y |= ((endPoint[1].b >> 0) & 1) << 23; + block.y |= ((endPoint[1].b >> 1) & 1) << 24; + block.y |= ((endPoint[1].b >> 2) & 1) << 25; 
+ block.y |= ((endPoint[1].b >> 3) & 1) << 26; + block.y |= ((endPoint[1].b >> 4) & 1) << 27; + block.y |= ((endPoint[1].b >> 5) & 1) << 28; + block.y |= ((endPoint[1].b >> 6) & 1) << 29; + block.y |= ((endPoint[1].b >> 7) & 1) << 30; + block.y |= ((endPoint[1].b >> 8) & 1) << 31; + block.z |= ((endPoint[0].b >> 10) & 1) << 0; + } + else if (mode_type == candidateModeFlag[12])// violate the spec in [0].low + { + /*block.x |= candidateModeMemory[12]; + block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 ) | ( ( endPoint[0].g << 12 ) & 0x00400000 ); + block.y |= ( ( endPoint[0].r << 0 ) & 0x00000800 ) | ( ( endPoint[0].g << 10 ) & 0x00200000 ); + block.y |= ( endPoint[0].b << 20 ) & 0x80000000; + block.y |= ( ( endPoint[1].r << 3 ) & 0x000007F8 ) | ( ( endPoint[1].g << 13 ) & 0x001FE000 ) | ( ( endPoint[1].b << 23 ) & 0x7F800000 ); + block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;*/ + + block.x |= ((candidateModeMemory[12] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[12] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[12] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[12] >> 3) & 1) << 3; + block.x |= ((candidateModeMemory[12] >> 4) & 1) << 4; + block.x |= ((endPoint[0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0].r >> 7) & 1) << 12; + block.x |= ((endPoint[0].r >> 8) & 1) << 13; + block.x |= ((endPoint[0].r >> 9) & 1) << 14; + block.x |= ((endPoint[0].g >> 0) & 1) << 15; + block.x |= ((endPoint[0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0].g >> 
7) & 1) << 22; + block.x |= ((endPoint[0].g >> 8) & 1) << 23; + block.x |= ((endPoint[0].g >> 9) & 1) << 24; + block.x |= ((endPoint[0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0].b >> 7) & 1) << 0; + block.y |= ((endPoint[0].b >> 8) & 1) << 1; + block.y |= ((endPoint[0].b >> 9) & 1) << 2; + block.y |= ((endPoint[1].r >> 0) & 1) << 3; + block.y |= ((endPoint[1].r >> 1) & 1) << 4; + block.y |= ((endPoint[1].r >> 2) & 1) << 5; + block.y |= ((endPoint[1].r >> 3) & 1) << 6; + block.y |= ((endPoint[1].r >> 4) & 1) << 7; + block.y |= ((endPoint[1].r >> 5) & 1) << 8; + block.y |= ((endPoint[1].r >> 6) & 1) << 9; + block.y |= ((endPoint[1].r >> 7) & 1) << 10; + block.y |= ((endPoint[0].r >> 11) & 1) << 11; + block.y |= ((endPoint[0].r >> 10) & 1) << 12; + block.y |= ((endPoint[1].g >> 0) & 1) << 13; + block.y |= ((endPoint[1].g >> 1) & 1) << 14; + block.y |= ((endPoint[1].g >> 2) & 1) << 15; + block.y |= ((endPoint[1].g >> 3) & 1) << 16; + block.y |= ((endPoint[1].g >> 4) & 1) << 17; + block.y |= ((endPoint[1].g >> 5) & 1) << 18; + block.y |= ((endPoint[1].g >> 6) & 1) << 19; + block.y |= ((endPoint[1].g >> 7) & 1) << 20; + block.y |= ((endPoint[0].g >> 11) & 1) << 21; + block.y |= ((endPoint[0].g >> 10) & 1) << 22; + block.y |= ((endPoint[1].b >> 0) & 1) << 23; + block.y |= ((endPoint[1].b >> 1) & 1) << 24; + block.y |= ((endPoint[1].b >> 2) & 1) << 25; + block.y |= ((endPoint[1].b >> 3) & 1) << 26; + block.y |= ((endPoint[1].b >> 4) & 1) << 27; + block.y |= ((endPoint[1].b >> 5) & 1) << 28; + block.y |= ((endPoint[1].b >> 6) & 1) << 29; + block.y |= ((endPoint[1].b >> 7) & 1) << 30; + block.y |= ((endPoint[0].b >> 11) & 1) << 31; + block.z |= ((endPoint[0].b >> 10) & 1) << 0; + } + else if 
(mode_type == candidateModeFlag[13]) + { + /*block.x |= candidateModeMemory[13]; + block.y |= ( ( endPoint[0].r >> 8 ) & 0x00000080 ); + block.y |= ( ( endPoint[0].r >> 6 ) & 0x00000100 ); + block.y |= ( ( endPoint[0].r >> 4 ) & 0x00000200 ); + block.y |= ( ( endPoint[0].r >> 2 ) & 0x00000400 ); + block.y |= ( ( endPoint[0].r >> 0 ) & 0x00000800 ); + block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 ); + block.y |= ( ( endPoint[0].g << 2 ) & 0x00020000 ); + block.y |= ( ( endPoint[0].g << 4 ) & 0x00040000 ); + block.y |= ( ( endPoint[0].g << 6 ) & 0x00080000 ); + block.y |= ( ( endPoint[0].g << 8 ) & 0x00100000 ); + block.y |= ( ( endPoint[0].g << 10 ) & 0x00200000 ); + block.y |= ( ( endPoint[0].g << 12 ) & 0x00400000 ); + block.y |= ( ( endPoint[0].b << 12 ) & 0x08000000 ); + block.y |= ( ( endPoint[0].b << 14 ) & 0x10000000 ); + block.y |= ( ( endPoint[0].b << 16 ) & 0x20000000 ); + block.y |= ( ( endPoint[0].b << 18 ) & 0x40000000 ); + block.y |= ( ( endPoint[0].b << 20 ) & 0x80000000 ); + block.y |= ( ( endPoint[1].r << 3 ) & 0x00000078 ) | ( ( endPoint[1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[1].b << 23 ) & 0x07800000 ); + block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;*/ + + block.x |= ((candidateModeMemory[13] >> 0) & 1) << 0; + block.x |= ((candidateModeMemory[13] >> 1) & 1) << 1; + block.x |= ((candidateModeMemory[13] >> 2) & 1) << 2; + block.x |= ((candidateModeMemory[13] >> 3) & 1) << 3; + block.x |= ((candidateModeMemory[13] >> 4) & 1) << 4; + block.x |= ((endPoint[0].r >> 0) & 1) << 5; + block.x |= ((endPoint[0].r >> 1) & 1) << 6; + block.x |= ((endPoint[0].r >> 2) & 1) << 7; + block.x |= ((endPoint[0].r >> 3) & 1) << 8; + block.x |= ((endPoint[0].r >> 4) & 1) << 9; + block.x |= ((endPoint[0].r >> 5) & 1) << 10; + block.x |= ((endPoint[0].r >> 6) & 1) << 11; + block.x |= ((endPoint[0].r >> 7) & 1) << 12; + block.x |= ((endPoint[0].r >> 8) & 1) << 13; + block.x |= ((endPoint[0].r >> 9) & 1) << 14; + block.x |= ((endPoint[0].g >> 0) & 1) << 15; + 
block.x |= ((endPoint[0].g >> 1) & 1) << 16; + block.x |= ((endPoint[0].g >> 2) & 1) << 17; + block.x |= ((endPoint[0].g >> 3) & 1) << 18; + block.x |= ((endPoint[0].g >> 4) & 1) << 19; + block.x |= ((endPoint[0].g >> 5) & 1) << 20; + block.x |= ((endPoint[0].g >> 6) & 1) << 21; + block.x |= ((endPoint[0].g >> 7) & 1) << 22; + block.x |= ((endPoint[0].g >> 8) & 1) << 23; + block.x |= ((endPoint[0].g >> 9) & 1) << 24; + block.x |= ((endPoint[0].b >> 0) & 1) << 25; + block.x |= ((endPoint[0].b >> 1) & 1) << 26; + block.x |= ((endPoint[0].b >> 2) & 1) << 27; + block.x |= ((endPoint[0].b >> 3) & 1) << 28; + block.x |= ((endPoint[0].b >> 4) & 1) << 29; + block.x |= ((endPoint[0].b >> 5) & 1) << 30; + block.x |= ((endPoint[0].b >> 6) & 1) << 31; + block.y |= ((endPoint[0].b >> 7) & 1) << 0; + block.y |= ((endPoint[0].b >> 8) & 1) << 1; + block.y |= ((endPoint[0].b >> 9) & 1) << 2; + block.y |= ((endPoint[1].r >> 0) & 1) << 3; + block.y |= ((endPoint[1].r >> 1) & 1) << 4; + block.y |= ((endPoint[1].r >> 2) & 1) << 5; + block.y |= ((endPoint[1].r >> 3) & 1) << 6; + block.y |= ((endPoint[0].r >> 15) & 1) << 7; + block.y |= ((endPoint[0].r >> 14) & 1) << 8; + block.y |= ((endPoint[0].r >> 13) & 1) << 9; + block.y |= ((endPoint[0].r >> 12) & 1) << 10; + block.y |= ((endPoint[0].r >> 11) & 1) << 11; + block.y |= ((endPoint[0].r >> 10) & 1) << 12; + block.y |= ((endPoint[1].g >> 0) & 1) << 13; + block.y |= ((endPoint[1].g >> 1) & 1) << 14; + block.y |= ((endPoint[1].g >> 2) & 1) << 15; + block.y |= ((endPoint[1].g >> 3) & 1) << 16; + block.y |= ((endPoint[0].g >> 15) & 1) << 17; + block.y |= ((endPoint[0].g >> 14) & 1) << 18; + block.y |= ((endPoint[0].g >> 13) & 1) << 19; + block.y |= ((endPoint[0].g >> 12) & 1) << 20; + block.y |= ((endPoint[0].g >> 11) & 1) << 21; + block.y |= ((endPoint[0].g >> 10) & 1) << 22; + block.y |= ((endPoint[1].b >> 0) & 1) << 23; + block.y |= ((endPoint[1].b >> 1) & 1) << 24; + block.y |= ((endPoint[1].b >> 2) & 1) << 25; + block.y |= 
((endPoint[1].b >> 3) & 1) << 26; + block.y |= ((endPoint[0].b >> 15) & 1) << 27; + block.y |= ((endPoint[0].b >> 14) & 1) << 28; + block.y |= ((endPoint[0].b >> 13) & 1) << 29; + block.y |= ((endPoint[0].b >> 12) & 1) << 30; + block.y |= ((endPoint[0].b >> 11) & 1) << 31; + block.z |= ((endPoint[0].b >> 10) & 1) << 0; + } +} diff --git a/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl new file mode 100644 index 000000000..6a57c3862 --- /dev/null +++ b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl @@ -0,0 +1,1908 @@ +//TEST_IGNORE_FILE: +//-------------------------------------------------------------------------------------- +// File: BC7Encode.hlsl +// +// The Compute Shader for BC7 Encoder +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- + +//#define REF_DEVICE + +#define CHAR_LENGTH 8 +#define NCHANNELS 4 +#define BC7_UNORM 98 +#define MAX_UINT 0xFFFFFFFF +#define MIN_UINT 0 + +static const uint candidateSectionBit[64] = //Associated to partition 0-63; bit i is the subset (0 or 1) of pixel i, as read by TryMode137CS +{ + 0xCCCC, 0x8888, 0xEEEE, 0xECC8, + 0xC880, 0xFEEC, 0xFEC8, 0xEC80, + 0xC800, 0xFFEC, 0xFE80, 0xE800, + 0xFFE8, 0xFF00, 0xFFF0, 0xF000, + 0xF710, 0x008E, 0x7100, 0x08CE, + 0x008C, 0x7310, 0x3100, 0x8CCE, + 0x088C, 0x3110, 0x6666, 0x366C, + 0x17E8, 0x0FF0, 0x718E, 0x399C, + 0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, + 0x3c3c, 0x55aa, 0x9696, 0xa55a, + 0x73ce, 0x13c8, 0x324c, 0x3bdc, + 0x6996, 0xc33c, 0x9966, 0x660, + 0x272, 0x4e4, 0x4e40, 0x2720, + 0xc936, 0x936c, 0x39c6, 0x639c, + 0x9336, 0x9cc6, 0x817e, 0xe718, + 0xccf0, 0xfcc, 0x7744, 0xee22, +}; +static const uint candidateSectionBit2[64] = //Associated to partition 64-127; bits 2*i and 2*i+1 are the subset index of pixel i, as read by TryMode02CS +{ + 0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8, + 0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050, + 0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090, + 0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250, + 0xa5945040, 
0x0a425054, 0xa5a5a500, 0x55a0a0a0, + 0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500, + 0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400, + 0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200, + 0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424, + 0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50, + 0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0, + 0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600, + 0xaa444444, 0x54a854a8, 0x95809580, 0x96969600, + 0xa85454a8, 0x80959580, 0xaa141414, 0x96960000, + 0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000, + 0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254, +}; +static const uint2 candidateFixUpIndex1D[128] = /* .x is the pixel index TryMode137CS uses to orient the end points of subset 1 */ +{ + {15, 0},{15, 0},{15, 0},{15, 0}, + {15, 0},{15, 0},{15, 0},{15, 0}, + {15, 0},{15, 0},{15, 0},{15, 0}, + {15, 0},{15, 0},{15, 0},{15, 0}, + {15, 0},{ 2, 0},{ 8, 0},{ 2, 0}, + { 2, 0},{ 8, 0},{ 8, 0},{15, 0}, + { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0}, + { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0}, + + {15, 0},{15, 0},{ 6, 0},{ 8, 0}, + { 2, 0},{ 8, 0},{15, 0},{15, 0}, + { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0}, + { 2, 0},{15, 0},{15, 0},{ 6, 0}, + { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0}, + {15, 0},{15, 0},{ 2, 0},{ 2, 0}, + {15, 0},{15, 0},{15, 0},{15, 0}, + {15, 0},{ 2, 0},{ 2, 0},{15, 0}, + //candidateFixUpIndex1D[i][1], i < 64 should not be used + + { 3,15},{ 3, 8},{15, 8},{15, 3}, + { 8,15},{ 3,15},{15, 3},{15, 8}, + { 8,15},{ 8,15},{ 6,15},{ 6,15}, + { 6,15},{ 5,15},{ 3,15},{ 3, 8}, + { 3,15},{ 3, 8},{ 8,15},{15, 3}, + { 3,15},{ 3, 8},{ 6,15},{10, 8}, + { 5, 3},{ 8,15},{ 8, 6},{ 6,10}, + { 8,15},{ 5,15},{15,10},{15, 8}, + + { 8,15},{15, 3},{ 3,15},{ 5,10}, + { 6,10},{10, 8},{ 8, 9},{15,10}, + {15, 6},{ 3,15},{15, 8},{ 5,15}, + {15, 3},{15, 6},{15, 6},{15, 8}, //The Spec doesn't mark the first fixed up index in this row, so I apply 15 for them, and seems correct + { 3,15},{15, 3},{ 5,15},{ 5,15}, + { 5,15},{ 8,15},{ 5,15},{10,15}, + { 5,15},{10,15},{ 8,15},{13,15}, + {15, 3},{12,15},{ 3,15},{ 3, 8}, +}; +static const uint2 candidateFixUpIndex1DOrdered[128] = //Same 
with candidateFixUpIndex1D but order the result when i >= 64 +{ + {15, 0},{15, 0},{15, 0},{15, 0}, + {15, 0},{15, 0},{15, 0},{15, 0}, + {15, 0},{15, 0},{15, 0},{15, 0}, + {15, 0},{15, 0},{15, 0},{15, 0}, + {15, 0},{ 2, 0},{ 8, 0},{ 2, 0}, + { 2, 0},{ 8, 0},{ 8, 0},{15, 0}, + { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0}, + { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0}, + + {15, 0},{15, 0},{ 6, 0},{ 8, 0}, + { 2, 0},{ 8, 0},{15, 0},{15, 0}, + { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0}, + { 2, 0},{15, 0},{15, 0},{ 6, 0}, + { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0}, + {15, 0},{15, 0},{ 2, 0},{ 2, 0}, + {15, 0},{15, 0},{15, 0},{15, 0}, + {15, 0},{ 2, 0},{ 2, 0},{15, 0}, + //candidateFixUpIndex1DOrdered[i][1], i < 64 should not be used + + { 3,15},{ 3, 8},{ 8,15},{ 3,15}, + { 8,15},{ 3,15},{ 3,15},{ 8,15}, + { 8,15},{ 8,15},{ 6,15},{ 6,15}, + { 6,15},{ 5,15},{ 3,15},{ 3, 8}, + { 3,15},{ 3, 8},{ 8,15},{ 3,15}, + { 3,15},{ 3, 8},{ 6,15},{ 8,10}, + { 3, 5},{ 8,15},{ 6, 8},{ 6,10}, + { 8,15},{ 5,15},{10,15},{ 8,15}, + + { 8,15},{ 3,15},{ 3,15},{ 5,10}, + { 6,10},{ 8,10},{ 8, 9},{10,15}, + { 6,15},{ 3,15},{ 8,15},{ 5,15}, + { 3,15},{ 6,15},{ 6,15},{ 8,15}, //The Spec doesn't mark the first fixed up index in this row, so I apply 15 for them, and seems correct + { 3,15},{ 3,15},{ 5,15},{ 5,15}, + { 5,15},{ 8,15},{ 5,15},{10,15}, + { 5,15},{10,15},{ 8,15},{13,15}, + { 3,15},{12,15},{ 3,15},{ 3, 8}, +}; +//static const uint4x4 candidateRotation[4] = +//{ +// {1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}, +// {0,0,0,1},{0,1,0,0},{0,0,1,0},{1,0,0,0}, +// {1,0,0,0},{0,0,0,1},{0,0,1,0},{0,1,0,0}, +// {1,0,0,0},{0,1,0,0},{0,0,0,1},{0,0,1,0} +//}; +//static const uint2 candidateIndexPrec[8] = {{3,0},{3,0},{2,0},{2,0}, +// {2,3}, //color index and alpha index can exchange +// {2,2},{4,4},{2,2}}; + +static const uint aWeight[3][16] = /* interpolation weights out of 64, indexed [row][index value]; row 0 serves 4 bit indices, row 1 serves 3 bit, row 2 serves 2 bit */ { {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}, + {0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + + //4 bit index: 0, 4, 9, 
13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 +static const uint aStep[3][64] = /* maps a position 0..63 along the end point span to an index value; rows are ordered as in aWeight */ { { 0, 0, 0, 1, 1, 1, 1, 2, + 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 7, 7, 7, + 7, 8, 8, 8, 8, 9, 9, 9, + 9,10,10,10,10,10,11,11, + 11,11,12,12,12,12,13,13, + 13,13,14,14,14,14,15,15 }, + //3 bit index: 0, 9, 18, 27, 37, 46, 55, 64 + { 0,0,0,0,0,1,1,1, + 1,1,1,1,1,1,2,2, + 2,2,2,2,2,2,2,3, + 3,3,3,3,3,3,3,3, + 3,4,4,4,4,4,4,4, + 4,4,5,5,5,5,5,5, + 5,5,5,6,6,6,6,6, + 6,6,6,6,7,7,7,7 }, + //2 bit index: 0, 21, 43, 64 + { 0,0,0,0,0,0,0,0, + 0,0,0,1,1,1,1,1, + 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, + 1,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,3,3, + 3,3,3,3,3,3,3,3 } }; + +cbuffer cbCS : register( b0 ) +{ + uint g_tex_width; + uint g_num_block_x; /* divisor that splits a block id into block row and block column */ + uint g_format; + uint g_mode_id; /* mode evaluated by TryMode137CS and TryMode02CS */ + uint g_start_block_id; /* offset added to the block index derived from the group id */ + uint g_num_total_blocks; /* TryMode456CS returns early for block ids at or beyond this count unless REF_DEVICE is defined */ + float g_alpha_weight; /* weight of the alpha term in ComputeError */ +}; + +//Forward declaration +uint2x4 compress_endpoints0( inout uint2x4 endPoint, uint2 P ); //Mode = 0 +uint2x4 compress_endpoints1( inout uint2x4 endPoint, uint2 P ); //Mode = 1 +uint2x4 compress_endpoints2( inout uint2x4 endPoint ); //Mode = 2 +uint2x4 compress_endpoints3( inout uint2x4 endPoint, uint2 P ); //Mode = 3 +uint2x4 compress_endpoints7( inout uint2x4 endPoint, uint2 P ); //Mode = 7 +uint2x4 compress_endpoints6( inout uint2x4 endPoint, uint2 P ); //Mode = 6 +uint2x4 compress_endpoints4( inout uint2x4 endPoint ); //Mode = 4 +uint2x4 compress_endpoints5( inout uint2x4 endPoint ); //Mode = 5 + +void block_package0( out uint4 block, uint partition, uint threadBase ); //Mode0 +void block_package1( out uint4 block, uint partition, uint threadBase ); //Mode1 +void block_package2( out uint4 block, uint partition, uint threadBase ); //Mode2 +void block_package3( out uint4 block, uint partition, uint threadBase ); //Mode3 +void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase ); //Mode4 +void block_package5( out uint4 block, uint rotation, uint threadBase ); 
//Mode5 +void block_package6( out uint4 block, uint threadBase ); //Mode6 +void block_package7( out uint4 block, uint partition, uint threadBase ); //Mode7 + + +void swap(inout uint4 lhs, inout uint4 rhs) /* exchanges lhs and rhs; overloads for uint3 and uint follow */ +{ + uint4 tmp = lhs; + lhs = rhs; + rhs = tmp; +} +void swap(inout uint3 lhs, inout uint3 rhs) +{ + uint3 tmp = lhs; + lhs = rhs; + rhs = tmp; +} +void swap(inout uint lhs, inout uint rhs) +{ + uint tmp = lhs; + lhs = rhs; + rhs = tmp; +} + +uint ComputeError(in uint4 a, in uint4 b) /* dot product of the rgb parts plus g_alpha_weight times the product of the alpha parts; the kernels pass one difference vector as both arguments to get a weighted squared error */ +{ + return dot(a.rgb, b.rgb) + g_alpha_weight * a.a*b.a; +} + +void Ensure_A_Is_Larger( inout uint4 a, inout uint4 b ) /* swaps per component so that a >= b, letting the caller subtract b from a without unsigned wrap-around */ +{ + if ( a.x < b.x ) + swap( a.x, b.x ); + if ( a.y < b.y ) + swap( a.y, b.y ); + if ( a.z < b.z ) + swap( a.z, b.z ); + if ( a.w < b.w ) + swap( a.w, b.w ); +} + + +Texture2D g_Input : register( t0 ); +StructuredBuffer<uint4> g_InBuff : register( t1 ); /* per block candidate read by TryMode137CS; .x is the error it has to beat */ + +RWStructuredBuffer<uint4> g_OutBuff : register( u0 ); /* per block result: x = error, y = mode (TryMode456CS also stores index_selector in bit 31), z = partition, w = rotation or p bits */ + +#define THREAD_GROUP_SIZE 64 +#define BLOCK_SIZE_Y 4 +#define BLOCK_SIZE_X 4 +#define BLOCK_SIZE (BLOCK_SIZE_Y * BLOCK_SIZE_X) + +struct BufferShared +{ + uint4 pixel; /* source texel converted to an integer 0..255 per channel */ + uint error; /* error of the candidate tried by this thread; later reduced to the minimum */ + uint mode; + uint partition; + uint index_selector; + uint rotation; /* channel rotation in modes 4 5; reused to carry p bits in modes 1 3 6 7 */ + uint4 endPoint_low; /* per channel minimum while TryMode456CS reduces the block */ + uint4 endPoint_high; /* per channel maximum while TryMode456CS reduces the block */ + uint4 endPoint_low_quantized; + uint4 endPoint_high_quantized; +}; +groupshared BufferShared shared_temp[THREAD_GROUP_SIZE]; /* one slot per thread of the group */ + +[numthreads( THREAD_GROUP_SIZE, 1, 1 )] +void TryMode456CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // mode 4 5 6 all have 1 subset per block, and fix-up index is always index 0 +{ + // we process 4 BC blocks per thread group + const uint MAX_USED_THREAD = 16; // pixels in a BC (block compressed) block + uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // the number of BC blocks a thread group processes = 64 / 16 = 4 + uint blockInGroup = GI / MAX_USED_THREAD; // what BC block this thread is on within this thread group + uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; // what global BC block this 
thread is on + uint threadBase = blockInGroup * MAX_USED_THREAD; // the first id of the pixel in this BC block in this thread group + uint threadInBlock = GI - threadBase; // id of the pixel in this BC block + +#ifndef REF_DEVICE + if (blockID >= g_num_total_blocks) /* nothing to do for block ids at or beyond the total */ + { + return; + } +#endif + + uint block_y = blockID / g_num_block_x; /* block row and column, then the texel coordinates of the block's first pixel */ + uint block_x = blockID - block_y * g_num_block_x; + uint base_x = block_x * BLOCK_SIZE_X; + uint base_y = block_y * BLOCK_SIZE_Y; + + if (threadInBlock < 16) /* each thread loads one texel of its 4x4 block and seeds the min/max reduction with it */ + { + shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255); + + shared_temp[GI].endPoint_low = shared_temp[GI].pixel; + shared_temp[GI].endPoint_high = shared_temp[GI].pixel; + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); /* NOTE(review): the barriers in this kernel are compiled in only when REF_DEVICE is defined; otherwise the reductions presumably rely on the 16 threads of a block running in lockstep - confirm */ +#endif + + if (threadInBlock < 8) /* pairwise min/max reduction in four halving steps; slot threadBase ends up holding the block's per channel bounds */ + { + shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low); + shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high); + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 4) + { + shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low); + shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high); + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 2) + { + shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low); + shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high); + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 1) + { + shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low); + shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high); + } +#ifdef REF_DEVICE + 
GroupMemoryBarrierWithGroupSync(); +#endif + + uint2x4 endPoint; /* row 0 = per channel minimum, row 1 = per channel maximum of the block */ + endPoint[0] = shared_temp[threadBase].endPoint_low; + endPoint[1] = shared_temp[threadBase].endPoint_high; + + uint error = 0xFFFFFFFF; + uint mode = 0; + uint index_selector = 0; + uint rotation = 0; + + uint2 indexPrec; /* .x and .y pick the aStep and aWeight row for the color index and the alpha index */ + if (threadInBlock < 8) // all threads of threadInBlock < 8 will be working on trying out mode 4, since only mode 4 has index selector bit + { + if (0 == (threadInBlock & 1)) // thread 0, 2, 4, 6 + { + //2 represents 2bit index precision; 1 represents 3bit index precision + index_selector = 0; + indexPrec = uint2( 2, 1 ); + } + else // thread 1, 3, 5, 7 + { + //2 represents 2bit index precision; 1 represents 3bit index precision + index_selector = 1; + indexPrec = uint2( 1, 2 ); + } + } + else + { + //2 represents 2bit index precision + indexPrec = uint2( 2, 2 ); + } + + uint4 pixel_r; + uint color_index; + uint alpha_index; + int4 span; + int2 span_norm_sqr; + int2 dotProduct; + if (threadInBlock < 12) // Try mode 4 5 in threads 0..11 + { + // mode 4 5 have component rotation + if ((threadInBlock < 2) || (8 == threadInBlock)) // rotation = 0 in thread 0, 1 (mode 4) and thread 8 (mode 5) + { + rotation = 0; + } + else if ((threadInBlock < 4) || (9 == threadInBlock)) // rotation = 1 in thread 2, 3 (mode 4) and thread 9 (mode 5) + { + endPoint[0].ra = endPoint[0].ar; + endPoint[1].ra = endPoint[1].ar; + + rotation = 1; + } + else if ((threadInBlock < 6) || (10 == threadInBlock)) // rotation = 2 in thread 4, 5 (mode 4) and thread 10 (mode 5) + { + endPoint[0].ga = endPoint[0].ag; + endPoint[1].ga = endPoint[1].ag; + + rotation = 2; + } + else if ((threadInBlock < 8) || (11 == threadInBlock)) // rotation = 3 in thread 6, 7 (mode 4) and thread 11 (mode 5) + { + endPoint[0].ba = endPoint[0].ab; + endPoint[1].ba = endPoint[1].ab; + + rotation = 3; + } + + if (threadInBlock < 8) // try mode 4 in threads 0..7 + { + // mode 4 thread distribution + // Thread 0 1 2 3 4 5 6 7 + // Rotation 0 0 1 1 2 2 3 3 + // Index selector 0 1 0 1 0 1 0 1 + + mode = 4; + compress_endpoints4( endPoint ); + } + else // try mode 5 in 
threads 8..11 + { + // mode 5 thread distribution + // Thread 8 9 10 11 + // Rotation 0 1 2 3 + + mode = 5; + compress_endpoints5( endPoint ); + } + + uint4 pixel = shared_temp[threadBase + 0].pixel; /* pixel 0 of the block, rotated below the same way as the end points */ + if (1 == rotation) + { + pixel.ra = pixel.ar; + } + else if (2 == rotation) + { + pixel.ga = pixel.ag; + } + else if (3 == rotation) + { + pixel.ba = pixel.ab; + } + + span = endPoint[1] - endPoint[0]; + span_norm_sqr = uint2( dot( span.rgb, span.rgb ), span.a * span.a ); + + // in mode 4 5 6, end point 0 must be closer to pixel 0 than end point 1, because of the fix-up index is always index 0 + // TODO: this shouldn't be necessary here in error calculation + /* + dotProduct = int2( dot( span.rgb, pixel.rgb - endPoint[0].rgb ), span.a * ( pixel.a - endPoint[0].a ) ); + if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) ) + { + span.rgb = -span.rgb; + swap(endPoint[0].rgb, endPoint[1].rgb); + } + if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && uint( dotProduct.y * 63.49999 ) > uint( 32 * span_norm_sqr.y ) ) + { + span.a = -span.a; + swap(endPoint[0].a, endPoint[1].a); + } + */ + + // should be the same as above: compare the squared distances from pixel 0 to each end point and swap so that end point 0 is the closer one + dotProduct = int2( dot( pixel.rgb - endPoint[0].rgb, pixel.rgb - endPoint[0].rgb ), dot( pixel.rgb - endPoint[1].rgb, pixel.rgb - endPoint[1].rgb ) ); + if ( dotProduct.x > dotProduct.y ) + { + span.rgb = -span.rgb; + swap(endPoint[0].rgb, endPoint[1].rgb); + } + dotProduct = int2( dot( pixel.a - endPoint[0].a, pixel.a - endPoint[0].a ), dot( pixel.a - endPoint[1].a, pixel.a - endPoint[1].a ) ); + if ( dotProduct.x > dotProduct.y ) + { + span.a = -span.a; + swap(endPoint[0].a, endPoint[1].a); + } + + error = 0; /* accumulates the weighted squared error over all 16 pixels */ + for ( uint i = 0; i < 16; i ++ ) + { + pixel = shared_temp[threadBase + i].pixel; + if (1 == rotation) + { + pixel.ra = pixel.ar; + } + else if (2 == rotation) + { + pixel.ga = pixel.ag; + } + else if (3 == rotation) + { + pixel.ba = pixel.ab; + } + + dotProduct.x = dot( span.rgb, pixel.rgb 
- endPoint[0].rgb ); + color_index = ( span_norm_sqr.x <= 0 /*endPoint[0] == endPoint[1]*/ || dotProduct.x <= 0 /*pixel == endPoint[0]*/ ) ? 0 + : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[indexPrec.x][ uint( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] ); + dotProduct.y = dot( span.a, pixel.a - endPoint[0].a ); + alpha_index = ( span_norm_sqr.y <= 0 || dotProduct.y <= 0 ) ? 0 + : ( ( dotProduct.y < span_norm_sqr.y ) ? aStep[indexPrec.y][ uint( dotProduct.y * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] ); + + // the same color_index and alpha_index should be used for reconstruction, so this should be left commented out + /*if (index_selector) + { + swap(color_index, alpha_index); + }*/ + + pixel_r.rgb = ( ( 64 - aWeight[indexPrec.x][color_index] ) * endPoint[0].rgb + + aWeight[indexPrec.x][color_index] * endPoint[1].rgb + + 32 ) >> 6; + pixel_r.a = ( ( 64 - aWeight[indexPrec.y][alpha_index] ) * endPoint[0].a + + aWeight[indexPrec.y][alpha_index] * endPoint[1].a + + 32 ) >> 6; + + Ensure_A_Is_Larger( pixel_r, pixel ); /* order each component pair so the unsigned subtraction below yields the absolute difference */ + pixel_r -= pixel; /* per channel reconstruction error, still in rotated channel order */ + if (1 == rotation) /* rotate the error back so ComputeError applies g_alpha_weight to the original alpha channel */ + { + pixel_r.ra = pixel_r.ar; + } + else if (2 == rotation) + { + pixel_r.ga = pixel_r.ag; + } + else if (3 == rotation) + { + pixel_r.ba = pixel_r.ab; + } + error += ComputeError(pixel_r, pixel_r); + } + } + else if (threadInBlock < 16) // Try mode 6 in threads 12..15, since in mode 4 5 6, only mode 6 has p bit + { + uint p = threadInBlock - 12; + + compress_endpoints6( endPoint, uint2(p >> 0, p >> 1) & 1 ); /* bits 0 and 1 of p are the two p bits tried by this thread */ + + uint4 pixel = shared_temp[threadBase + 0].pixel; + + span = endPoint[1] - endPoint[0]; + span_norm_sqr = dot( span, span ); + dotProduct = dot( span, pixel - endPoint[0] ); + if ( span_norm_sqr.x > 0 && dotProduct.x >= 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) ) + { + span = -span; + swap(endPoint[0], endPoint[1]); + } + + error = 0; + for ( uint i = 0; i < 16; i ++ ) + { + pixel = shared_temp[threadBase + i].pixel; + + dotProduct.x = 
dot( span, pixel - endPoint[0] ); + color_index = ( span_norm_sqr.x <= 0 || dotProduct.x <= 0 ) ? 0 + : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[0][ uint( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[0][63] ); /* row 0 of aStep and aWeight is the 4 bit index row */ + + pixel_r = ( ( 64 - aWeight[0][color_index] ) * endPoint[0] + + aWeight[0][color_index] * endPoint[1] + 32 ) >> 6; + + Ensure_A_Is_Larger( pixel_r, pixel ); + pixel_r -= pixel; + error += ComputeError(pixel_r, pixel_r); + } + + mode = 6; + rotation = p; // Borrow rotation for p + } + + shared_temp[GI].error = error; /* publish this thread's candidate for the min error reduction below */ + shared_temp[GI].mode = mode; + shared_temp[GI].index_selector = index_selector; + shared_temp[GI].rotation = rotation; + +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + + if (threadInBlock < 8) /* halving reduction: keep the lower error candidate together with its mode, index selector and rotation */ + { + if ( shared_temp[GI].error > shared_temp[GI + 8].error ) + { + shared_temp[GI].error = shared_temp[GI + 8].error; + shared_temp[GI].mode = shared_temp[GI + 8].mode; + shared_temp[GI].index_selector = shared_temp[GI + 8].index_selector; + shared_temp[GI].rotation = shared_temp[GI + 8].rotation; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 4) + { + if ( shared_temp[GI].error > shared_temp[GI + 4].error ) + { + shared_temp[GI].error = shared_temp[GI + 4].error; + shared_temp[GI].mode = shared_temp[GI + 4].mode; + shared_temp[GI].index_selector = shared_temp[GI + 4].index_selector; + shared_temp[GI].rotation = shared_temp[GI + 4].rotation; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 2) + { + if ( shared_temp[GI].error > shared_temp[GI + 2].error ) + { + shared_temp[GI].error = shared_temp[GI + 2].error; + shared_temp[GI].mode = shared_temp[GI + 2].mode; + shared_temp[GI].index_selector = shared_temp[GI + 2].index_selector; + shared_temp[GI].rotation = shared_temp[GI + 2].rotation; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 1) + { + if ( shared_temp[GI].error > 
shared_temp[GI + 1].error ) + { + shared_temp[GI].error = shared_temp[GI + 1].error; + shared_temp[GI].mode = shared_temp[GI + 1].mode; + shared_temp[GI].index_selector = shared_temp[GI + 1].index_selector; + shared_temp[GI].rotation = shared_temp[GI + 1].rotation; + } + + g_OutBuff[blockID] = uint4(shared_temp[GI].error, (shared_temp[GI].index_selector << 31) | shared_temp[GI].mode, + 0, shared_temp[GI].rotation); // rotation is indeed rotation for mode 4 5. for mode 6, rotation is p bit; bit 31 of the second component carries index_selector + } +} + +[numthreads( THREAD_GROUP_SIZE, 1, 1 )] +void TryMode137CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // mode 1 3 7 all have 2 subsets per block +{ + const uint MAX_USED_THREAD = 64; /* all 64 threads of the group work on one BC block, so BLOCK_IN_GROUP is 1 */ + uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; + uint blockInGroup = GI / MAX_USED_THREAD; + uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; + uint threadBase = blockInGroup * MAX_USED_THREAD; + uint threadInBlock = GI - threadBase; + + uint block_y = blockID / g_num_block_x; + uint block_x = blockID - block_y * g_num_block_x; + uint base_x = block_x * BLOCK_SIZE_X; + uint base_y = block_y * BLOCK_SIZE_Y; + + if (threadInBlock < 16) + { + shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255); + } + GroupMemoryBarrierWithGroupSync(); + + shared_temp[GI].error = 0xFFFFFFFF; /* worst possible error until a candidate has been evaluated */ + + uint4 pixel_r; + uint2x4 endPoint[2]; // endPoint[0..1 for subset id][0..1 for low and high in the subset] + uint2x4 endPointBackup[2]; + uint color_index; + if (threadInBlock < 64) + { + uint partition = threadInBlock; /* each thread evaluates one of the 64 two subset partitions */ + + endPoint[0][0] = MAX_UINT; + endPoint[0][1] = MIN_UINT; + endPoint[1][0] = MAX_UINT; + endPoint[1][1] = MIN_UINT; + uint bits = candidateSectionBit[partition]; + for ( uint i = 0; i < 16; i ++ ) + { + uint4 pixel = shared_temp[threadBase + i].pixel; + if ( (( bits >> i ) & 0x01) == 1 ) + { + endPoint[1][0] = min( endPoint[1][0], pixel ); + endPoint[1][1] = max( 
endPoint[1][1], pixel ); + } + else + { + endPoint[0][0] = min( endPoint[0][0], pixel ); + endPoint[0][1] = max( endPoint[0][1], pixel ); + } + } + + endPointBackup[0] = endPoint[0]; + endPointBackup[1] = endPoint[1]; + + uint max_p; + if (1 == g_mode_id) + { + // in mode 1, there is only one p bit per subset + max_p = 4; + } + else + { + // in mode 3 7, there are two p bits per subset, one for each end point + max_p = 16; + } + + uint rotation = 0; + uint error = MAX_UINT; + for ( uint p = 0; p < max_p; p ++ ) + { + endPoint[0] = endPointBackup[0]; + endPoint[1] = endPointBackup[1]; + + for ( i = 0; i < 2; i ++ ) // loop through 2 subsets + { + if (g_mode_id == 1) + { + compress_endpoints1( endPoint[i], (p >> i) & 1 ); + } + else if (g_mode_id == 3) + { + compress_endpoints3( endPoint[i], uint2(p >> (i * 2 + 0), p >> (i * 2 + 1)) & 1 ); + } + else if (g_mode_id == 7) + { + compress_endpoints7( endPoint[i], uint2(p >> (i * 2 + 0), p >> (i * 2 + 1)) & 1 ); + } + } + + int4 span[2]; + span[0] = endPoint[0][1] - endPoint[0][0]; + span[1] = endPoint[1][1] - endPoint[1][0]; + + if (g_mode_id != 7) + { + span[0].w = span[1].w = 0; + } + + int span_norm_sqr[2]; + span_norm_sqr[0] = dot( span[0], span[0] ); + span_norm_sqr[1] = dot( span[1], span[1] ); + + // TODO: again, this shouldn't be necessary here in error calculation + int dotProduct = dot( span[0], shared_temp[threadBase + 0].pixel - endPoint[0][0] ); + if ( span_norm_sqr[0] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[0] ) ) + { + span[0] = -span[0]; + swap(endPoint[0][0], endPoint[0][1]); + } + dotProduct = dot( span[1], shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel - endPoint[1][0] ); + if ( span_norm_sqr[1] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[1] ) ) + { + span[1] = -span[1]; + swap(endPoint[1][0], endPoint[1][1]); + } + + uint step_selector; + if (g_mode_id != 1) + { + step_selector = 2; // mode 3 7 have 2 
bit index + } + else + { + step_selector = 1; // mode 1 has 3 bit index + } + + uint p_error = 0; + for ( i = 0; i < 16; i ++ ) + { + if (((bits >> i) & 0x01) == 1) + { + dotProduct = dot( span[1], shared_temp[threadBase + i].pixel - endPoint[1][0] ); + color_index = (span_norm_sqr[1] <= 0 || dotProduct <= 0) ? 0 + : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[1])] : aStep[step_selector][63]); + } + else + { + dotProduct = dot( span[0], shared_temp[threadBase + i].pixel - endPoint[0][0] ); + color_index = (span_norm_sqr[0] <= 0 || dotProduct <= 0) ? 0 + : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[0])] : aStep[step_selector][63]); + } + + uint subset_index = (bits >> i) & 0x01; + + pixel_r = ((64 - aWeight[step_selector][color_index]) * endPoint[subset_index][0] + + aWeight[step_selector][color_index] * endPoint[subset_index][1] + 32) >> 6; + if (g_mode_id != 7) + { + pixel_r.a = 255; + } + + uint4 pixel = shared_temp[threadBase + i].pixel; + Ensure_A_Is_Larger( pixel_r, pixel ); + pixel_r -= pixel; + p_error += ComputeError(pixel_r, pixel_r); + } + + if (p_error < error) + { + error = p_error; + rotation = p; + } + } + + shared_temp[GI].error = error; + shared_temp[GI].mode = g_mode_id; + shared_temp[GI].partition = partition; + shared_temp[GI].rotation = rotation; // mode 1 3 7 don't have rotation, we use rotation for p bits + } + GroupMemoryBarrierWithGroupSync(); + + if (threadInBlock < 32) + { + if ( shared_temp[GI].error > shared_temp[GI + 32].error ) + { + shared_temp[GI].error = shared_temp[GI + 32].error; + shared_temp[GI].mode = shared_temp[GI + 32].mode; + shared_temp[GI].partition = shared_temp[GI + 32].partition; + shared_temp[GI].rotation = shared_temp[GI + 32].rotation; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif +if (threadInBlock < 16) + { + if ( shared_temp[GI].error > shared_temp[GI + 16].error ) + { + 
shared_temp[GI].error = shared_temp[GI + 16].error; + shared_temp[GI].mode = shared_temp[GI + 16].mode; + shared_temp[GI].partition = shared_temp[GI + 16].partition; + shared_temp[GI].rotation = shared_temp[GI + 16].rotation; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 8) + { + if ( shared_temp[GI].error > shared_temp[GI + 8].error ) + { + shared_temp[GI].error = shared_temp[GI + 8].error; + shared_temp[GI].mode = shared_temp[GI + 8].mode; + shared_temp[GI].partition = shared_temp[GI + 8].partition; + shared_temp[GI].rotation = shared_temp[GI + 8].rotation; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 4) + { + if ( shared_temp[GI].error > shared_temp[GI + 4].error ) + { + shared_temp[GI].error = shared_temp[GI + 4].error; + shared_temp[GI].mode = shared_temp[GI + 4].mode; + shared_temp[GI].partition = shared_temp[GI + 4].partition; + shared_temp[GI].rotation = shared_temp[GI + 4].rotation; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 2) + { + if ( shared_temp[GI].error > shared_temp[GI + 2].error ) + { + shared_temp[GI].error = shared_temp[GI + 2].error; + shared_temp[GI].mode = shared_temp[GI + 2].mode; + shared_temp[GI].partition = shared_temp[GI + 2].partition; + shared_temp[GI].rotation = shared_temp[GI + 2].rotation; + } + } +#ifdef REF_DEVICE + GroupMemoryBarrierWithGroupSync(); +#endif + if (threadInBlock < 1) + { + if ( shared_temp[GI].error > shared_temp[GI + 1].error ) + { + shared_temp[GI].error = shared_temp[GI + 1].error; + shared_temp[GI].mode = shared_temp[GI + 1].mode; + shared_temp[GI].partition = shared_temp[GI + 1].partition; + shared_temp[GI].rotation = shared_temp[GI + 1].rotation; + } + + if (g_InBuff[blockID].x > shared_temp[GI].error) + { + g_OutBuff[blockID] = uint4(shared_temp[GI].error, shared_temp[GI].mode, shared_temp[GI].partition, shared_temp[GI].rotation); // mode 1 3 7 don't have 
rotation, we use rotation for p bits
+        }
+        else
+        {
+            g_OutBuff[blockID] = g_InBuff[blockID];
+        }
+    }
+}
+
+// TryMode02CS: evaluates the three-subset modes (0 and 2) for one 4x4 block.
+// MAX_USED_THREAD (64) consecutive threads cooperate on a block. Threads 0..15 load
+// the pixels; every thread below num_partitions scores one partition; a shared-memory
+// min-reduction then selects the partition with the lowest error. Thread 0 writes
+// (error, mode, partition, p-bits) to g_OutBuff only when it beats g_InBuff[blockID].x,
+// otherwise it forwards the previous best entry unchanged.
+// NOTE(review): unlike EncodeBlockCS there is no blockID >= g_num_total_blocks guard
+// here - confirm the dispatch never produces out-of-range block ids.
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void TryMode02CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // mode 0 2 have 3 subsets per block
+{
+    const uint MAX_USED_THREAD = 64;
+    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
+    uint blockInGroup = GI / MAX_USED_THREAD;
+    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
+    uint threadBase = blockInGroup * MAX_USED_THREAD;
+    uint threadInBlock = GI - threadBase;
+
+    uint block_y = blockID / g_num_block_x;
+    uint block_x = blockID - block_y * g_num_block_x;
+    uint base_x = block_x * BLOCK_SIZE_X;
+    uint base_y = block_y * BLOCK_SIZE_Y;
+
+    // The first 16 threads of the block each fetch one texel, scaled to 0..255.
+    if (threadInBlock < 16)
+    {
+        shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);
+    }
+    GroupMemoryBarrierWithGroupSync();
+
+    // Threads that do not evaluate a partition keep the maximum error so they never win the reduction.
+    shared_temp[GI].error = 0xFFFFFFFF;
+
+    uint num_partitions;
+    if (0 == g_mode_id)
+    {
+        num_partitions = 16;
+    }
+    else
+    {
+        num_partitions = 64;
+    }
+
+    uint4 pixel_r;
+    uint2x4 endPoint[3]; // endPoint[0..1 for subset id][0..1 for low and high in the subset]
+    uint2x4 endPointBackup[3];
+    uint color_index[16];
+    if (threadInBlock < num_partitions)
+    {
+        // Three-subset partitions are numbered from 64; the 2-bits-per-pixel table is indexed from 0.
+        uint partition = threadInBlock + 64;
+
+        endPoint[0][0] = MAX_UINT;
+        endPoint[0][1] = MIN_UINT;
+        endPoint[1][0] = MAX_UINT;
+        endPoint[1][1] = MIN_UINT;
+        endPoint[2][0] = MAX_UINT;
+        endPoint[2][1] = MIN_UINT;
+        uint bits2 = candidateSectionBit2[partition - 64];
+        // Per-subset component-wise min/max of the pixels gives the initial end points.
+        for ( uint i = 0; i < 16; i ++ )
+        {
+            uint4 pixel = shared_temp[threadBase + i].pixel;
+            uint subset_index = ( bits2 >> ( i * 2 ) ) & 0x03;
+            if ( subset_index == 2 )
+            {
+                endPoint[2][0] = min( endPoint[2][0], pixel );
+                endPoint[2][1] = max( endPoint[2][1], pixel );
+            }
+            else if ( subset_index == 1 )
+            {
+                endPoint[1][0] = min( endPoint[1][0], pixel );
+                endPoint[1][1] = max( endPoint[1][1], pixel );
+            }
+            else
+            {
+                endPoint[0][0] = min( endPoint[0][0], pixel );
+                endPoint[0][1] = max( endPoint[0][1], pixel );
+            }
+        }
+
+        endPointBackup[0] = endPoint[0];
+        endPointBackup[1] = endPoint[1];
+        endPointBackup[2] = endPoint[2];
+
+        // Mode 0 tries every combination of its 6 p-bits (2 per subset); mode 2 has none.
+        uint max_p;
+        if (0 == g_mode_id)
+        {
+            max_p = 64; // changed from 32 to 64
+        }
+        else
+        {
+            max_p = 1;
+        }
+
+        uint rotation = 0;
+        uint error = MAX_UINT;
+        for ( uint p = 0; p < max_p; p ++ )
+        {
+            // compress_endpoints* modify their argument, so restart from the unquantized end points.
+            endPoint[0] = endPointBackup[0];
+            endPoint[1] = endPointBackup[1];
+            endPoint[2] = endPointBackup[2];
+
+            for ( i = 0; i < 3; i ++ )
+            {
+                if (0 == g_mode_id)
+                {
+                    compress_endpoints0( endPoint[i], uint2(p >> (i * 2 + 0), p >> (i * 2 + 1)) & 1 );
+                }
+                else
+                {
+                    compress_endpoints2( endPoint[i] );
+                }
+            }
+
+            // Selects the aStep/aWeight row: 1 for mode 0, 2 for mode 2.
+            uint step_selector = 1 + (2 == g_mode_id);
+
+            int4 span[3];
+            span[0] = endPoint[0][1] - endPoint[0][0];
+            span[1] = endPoint[1][1] - endPoint[1][0];
+            span[2] = endPoint[2][1] - endPoint[2][0];
+            span[0].w = span[1].w = span[2].w = 0;
+            int span_norm_sqr[3];
+            span_norm_sqr[0] = dot( span[0], span[0] );
+            span_norm_sqr[1] = dot( span[1], span[1] );
+            span_norm_sqr[2] = dot( span[2], span[2] );
+
+            // TODO: again, this shouldn't be necessary here in error calculation
+            // Swap a subset's end points when its anchor pixel projects past the midpoint of the span.
+            uint ci[3] = { 0, candidateFixUpIndex1D[partition].x, candidateFixUpIndex1D[partition].y };
+            for (i = 0; i < 3; i ++)
+            {
+                int dotProduct = dot( span[i], shared_temp[threadBase + ci[i]].pixel - endPoint[i][0] );
+                if ( span_norm_sqr[i] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[i] ) )
+                {
+                    span[i] = -span[i];
+                    swap(endPoint[i][0], endPoint[i][1]);
+                }
+            }
+
+            // Accumulate the reconstruction error of all 16 pixels for this p-bit choice.
+            uint p_error = 0;
+            for ( i = 0; i < 16; i ++ )
+            {
+                uint subset_index = ( bits2 >> ( i * 2 ) ) & 0x03;
+                if ( subset_index == 2 )
+                {
+                    int dotProduct = dot( span[2], shared_temp[threadBase + i].pixel - endPoint[2][0] );
+                    color_index[i] = ( span_norm_sqr[2] <= 0 || dotProduct <= 0 ) ? 0
+                        : ( ( dotProduct < span_norm_sqr[2] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[2] ) ] : aStep[step_selector][63] );
+                }
+                else if ( subset_index == 1 )
+                {
+                    int dotProduct = dot( span[1], shared_temp[threadBase + i].pixel - endPoint[1][0] );
+                    color_index[i] = ( span_norm_sqr[1] <= 0 || dotProduct <= 0 ) ? 0
+                        : ( ( dotProduct < span_norm_sqr[1] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[1] ) ] : aStep[step_selector][63] );
+                }
+                else
+                {
+                    int dotProduct = dot( span[0], shared_temp[threadBase + i].pixel - endPoint[0][0] );
+                    color_index[i] = ( span_norm_sqr[0] <= 0 || dotProduct <= 0 ) ? 0
+                        : ( ( dotProduct < span_norm_sqr[0] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[0] ) ] : aStep[step_selector][63] );
+                }
+
+                // Interpolate between the end points with 6-bit weights, rounding to nearest.
+                pixel_r = ( ( 64 - aWeight[step_selector][color_index[i]] ) * endPoint[subset_index][0] +
+                    aWeight[step_selector][color_index[i]] * endPoint[subset_index][1] + 32 ) >> 6;
+                pixel_r.a = 255;
+
+                uint4 pixel = shared_temp[threadBase + i].pixel;
+                Ensure_A_Is_Larger( pixel_r, pixel );
+                pixel_r -= pixel;
+                p_error += ComputeError(pixel_r, pixel_r);
+            }
+
+            if (p_error < error)
+            {
+                error = p_error;
+                rotation = p; // Borrow rotation for p
+            }
+        }
+
+        shared_temp[GI].error = error;
+        shared_temp[GI].partition = partition;
+        shared_temp[GI].rotation = rotation;
+    }
+    GroupMemoryBarrierWithGroupSync();
+
+    // Min-reduction over the block's 64 slots (stride 32, 16, ... 1); the winner ends up in slot 0.
+    // The barriers between steps are only compiled when REF_DEVICE is defined.
+    // NOTE(review): presumably the non-REF path relies on the participating threads
+    // executing in lockstep - confirm for the target hardware.
+    if (threadInBlock < 32)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 32].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 32].error;
+            shared_temp[GI].partition = shared_temp[GI + 32].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 32].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 16)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 16].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 16].error;
+            shared_temp[GI].partition = shared_temp[GI + 16].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 16].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 8)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 8].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 8].error;
+            shared_temp[GI].partition = shared_temp[GI + 8].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 4)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 4].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 4].error;
+            shared_temp[GI].partition = shared_temp[GI + 4].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 2)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 2].error;
+            shared_temp[GI].partition = shared_temp[GI + 2].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 1)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 1].error;
+            shared_temp[GI].partition = shared_temp[GI + 1].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
+        }
+
+        // Keep whichever candidate has the lower error: this pass or the one already in g_InBuff.
+        if (g_InBuff[blockID].x > shared_temp[GI].error)
+        {
+            g_OutBuff[blockID] = uint4(shared_temp[GI].error, g_mode_id, shared_temp[GI].partition, shared_temp[GI].rotation); // rotation is actually p bit for mode 0. for mode 2, rotation is always 0
+        }
+        else
+        {
+            g_OutBuff[blockID] = g_InBuff[blockID];
+        }
+    }
+}
+
+// EncodeBlockCS: emits the final 128-bit block for the configuration stored in
+// g_InBuff[blockID]: .y = mode (bit 31 = index selector), .z = partition,
+// .w = rotation (or p-bits for the modes that have no rotation).
+// MAX_USED_THREAD (16) consecutive threads cooperate on one 4x4 block, one thread per pixel:
+//   1. load the pixels (swapping a colour channel with alpha for modes 4/5 per rotation),
+//   2. min/max-reduce the end points of each subset,
+//   3. threads 0..2 quantize one subset each and orient its end points against the anchor pixel,
+//   4. every thread computes the index(es) of its own pixel,
+//   5. thread 0 packs the block with the mode-specific block_packageN and writes g_OutBuff.
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void EncodeBlockCS(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID)
+{
+    const uint MAX_USED_THREAD = 16;
+    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
+    uint blockInGroup = GI / MAX_USED_THREAD;
+    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
+    uint threadBase = blockInGroup * MAX_USED_THREAD;
+    uint threadInBlock = GI - threadBase;
+
+    // The early-out is compiled out under REF_DEVICE, where the barriers below are compiled in.
+#ifndef REF_DEVICE
+    if (blockID >= g_num_total_blocks)
+    {
+        return;
+    }
+#endif
+
+    uint block_y = blockID / g_num_block_x;
+    uint block_x = blockID - block_y * g_num_block_x;
+    uint base_x = block_x * BLOCK_SIZE_X;
+    uint base_y = block_y * BLOCK_SIZE_Y;
+
+    uint mode = g_InBuff[blockID].y & 0x7FFFFFFF;
+    uint partition = g_InBuff[blockID].z;
+    uint index_selector = (g_InBuff[blockID].y >> 31) & 1;
+    uint rotation = g_InBuff[blockID].w;
+
+    if (threadInBlock < 16)
+    {
+        uint4 pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);
+
+        // Modes 4 and 5: rotation 1/2/3 swaps alpha with the r/g/b channel respectively.
+        if ((4 == mode) || (5 == mode))
+        {
+            if (1 == rotation)
+            {
+                pixel.ra = pixel.ar;
+            }
+            else if (2 == rotation)
+            {
+                pixel.ga = pixel.ag;
+            }
+            else if (3 == rotation)
+            {
+                pixel.ba = pixel.ab;
+            }
+        }
+
+        shared_temp[GI].pixel = pixel;
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    // Both tables are read unconditionally; only the one matching the mode's subset count is used below.
+    // NOTE(review): partition - 64 wraps for partitions below 64 - confirm the out-of-range read is benign.
+    uint bits = candidateSectionBit[partition];
+    uint bits2 = candidateSectionBit2[partition - 64];
+
+    uint2x4 ep;
+    uint2x4 ep_quantized;
+    // One reduction pass per subset (2, 1, 0); afterwards thread ii holds the end points of subset ii.
+    [unroll]
+    for (int ii = 2; ii >= 0; -- ii)
+    {
+        if (threadInBlock < 16)
+        {
+            // Local ep shadows the outer one: pixels outside subset ii contribute the
+            // neutral (MAX_UINT, MIN_UINT) pair to the min/max reduction.
+            uint2x4 ep;
+            ep[0] = MAX_UINT;
+            ep[1] = MIN_UINT;
+
+            uint4 pixel = shared_temp[GI].pixel;
+
+            uint subset_index = ( bits >> threadInBlock ) & 0x01;
+            uint subset_index2 = ( bits2 >> ( threadInBlock * 2 ) ) & 0x03;
+            if (0 == ii)
+            {
+                if ((0 == mode) || (2 == mode))
+                {
+                    if (0 == subset_index2)
+                    {
+                        ep[0] = ep[1] = pixel;
+                    }
+                }
+                else if ((1 == mode) || (3 == mode) || (7 == mode))
+                {
+                    if (0 == subset_index)
+                    {
+                        ep[0] = ep[1] = pixel;
+                    }
+                }
+                else if ((4 == mode) || (5 == mode) || (6 == mode))
+                {
+                    ep[0] = ep[1] = pixel;
+                }
+            }
+            else if (1 == ii)
+            {
+                if ((0 == mode) || (2 == mode))
+                {
+                    if (1 == subset_index2)
+                    {
+                        ep[0] = ep[1] = pixel;
+                    }
+                }
+                else if ((1 == mode) || (3 == mode) || (7 == mode))
+                {
+                    if (1 == subset_index)
+                    {
+                        ep[0] = ep[1] = pixel;
+                    }
+                }
+            }
+            else
+            {
+                if ((0 == mode) || (2 == mode))
+                {
+                    if (2 == subset_index2)
+                    {
+                        ep[0] = ep[1] = pixel;
+                    }
+                }
+            }
+
+            shared_temp[GI].endPoint_low = ep[0];
+            shared_temp[GI].endPoint_high = ep[1];
+        }
+#ifdef REF_DEVICE
+        GroupMemoryBarrierWithGroupSync();
+#endif
+
+        if (threadInBlock < 8)
+        {
+            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
+            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
+        }
+#ifdef REF_DEVICE
+        GroupMemoryBarrierWithGroupSync();
+#endif
+        if (threadInBlock < 4)
+        {
+            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
+            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
+        }
+#ifdef REF_DEVICE
+        GroupMemoryBarrierWithGroupSync();
+#endif
+        if (threadInBlock < 2)
+        {
+            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
+            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
+        }
+#ifdef REF_DEVICE
+        GroupMemoryBarrierWithGroupSync();
+#endif
+        if (threadInBlock < 1)
+        {
+            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
+            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
+        }
+#ifdef REF_DEVICE
+        GroupMemoryBarrierWithGroupSync();
+#endif
+
+        // Outer ep: thread ii keeps the reduced end points of subset ii for the quantization step.
+        if (ii == (int)threadInBlock)
+        {
+            ep[0] = shared_temp[threadBase].endPoint_low;
+            ep[1] = shared_temp[threadBase].endPoint_high;
+        }
+    }
+
+    // Threads 0..2 each quantize the end points of "their" subset.
+    if (threadInBlock < 3)
+    {
+        // Extract this subset's p-bit(s) from the value carried in 'rotation':
+        // mode 1 uses one bit per subset, the other modes two consecutive bits.
+        uint2 P;
+        if (1 == mode)
+        {
+            P = (rotation >> threadInBlock) & 1;
+        }
+        else
+        {
+            P = uint2(rotation >> (threadInBlock * 2 + 0), rotation >> (threadInBlock * 2 + 1)) & 1;
+        }
+
+        if (0 == mode)
+        {
+            ep_quantized = compress_endpoints0( ep, P );
+        }
+        else if (1 == mode)
+        {
+            ep_quantized = compress_endpoints1( ep, P );
+        }
+        else if (2 == mode)
+        {
+            ep_quantized = compress_endpoints2( ep );
+        }
+        else if (3 == mode)
+        {
+            ep_quantized = compress_endpoints3( ep, P );
+        }
+        else if (4 == mode)
+        {
+            ep_quantized = compress_endpoints4( ep );
+        }
+        else if (5 == mode)
+        {
+            ep_quantized = compress_endpoints5( ep );
+        }
+        else if (6 == mode)
+        {
+            ep_quantized = compress_endpoints6( ep, P );
+        }
+        else //if (7 == mode)
+        {
+            ep_quantized = compress_endpoints7( ep, P );
+        }
+
+        int4 span = ep[1] - ep[0];
+        if (mode < 4)
+        {
+            span.w = 0;
+        }
+
+        // Orient the end points so the anchor pixel projects into the lower half of the span:
+        // swap low/high whenever its projection exceeds the midpoint.
+        if ((4 == mode) || (5 == mode))
+        {
+            // Colour and alpha are handled as two independent spans; pixel 0 is the anchor.
+            if (0 == threadInBlock)
+            {
+                int2 span_norm_sqr = uint2( dot( span.rgb, span.rgb ), span.a * span.a );
+                int2 dotProduct = int2( dot( span.rgb, shared_temp[threadBase + 0].pixel.rgb - ep[0].rgb ), span.a * ( shared_temp[threadBase + 0].pixel.a - ep[0].a ) );
+                if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
+                {
+                    swap(ep[0].rgb, ep[1].rgb);
+                    swap(ep_quantized[0].rgb, ep_quantized[1].rgb);
+                }
+                if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && uint( dotProduct.y * 63.49999 ) > uint( 32 * span_norm_sqr.y ) )
+                {
+                    swap(ep[0].a, ep[1].a);
+                    swap(ep_quantized[0].a, ep_quantized[1].a);
+                }
+            }
+        }
+        else //if ((0 == mode) || (2 == mode) || (1 == mode) || (3 == mode) || (7 == mode) || (6 == mode))
+        {
+            // p = anchor pixel of this thread's subset: pixel 0, then the two fix-up indices.
+            int p;
+            if (0 == threadInBlock)
+            {
+                p = 0;
+            }
+            else if (1 == threadInBlock)
+            {
+                p = candidateFixUpIndex1D[partition].x;
+            }
+            else //if (2 == threadInBlock)
+            {
+                p = candidateFixUpIndex1D[partition].y;
+            }
+
+            int span_norm_sqr = dot( span, span );
+            int dotProduct = dot( span, shared_temp[threadBase + p].pixel - ep[0] );
+            if ( span_norm_sqr > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr ) )
+            {
+                swap(ep[0], ep[1]);
+                swap(ep_quantized[0], ep_quantized[1]);
+            }
+        }
+
+        shared_temp[GI].endPoint_low = ep[0];
+        shared_temp[GI].endPoint_high = ep[1];
+        shared_temp[GI].endPoint_low_quantized = ep_quantized[0];
+        shared_temp[GI].endPoint_high_quantized = ep_quantized[1];
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    // Every thread now computes the index (and, for modes 4/5, the second index) of its own pixel.
+    if (threadInBlock < 16)
+    {
+        uint color_index = 0;
+        uint alpha_index = 0;
+
+        uint2x4 ep;
+
+        // indexPrec selects the aStep row; .x is used for the colour index, .y for the alpha index.
+        uint2 indexPrec;
+        if ((0 == mode) || (1 == mode))
+        {
+            indexPrec = 1;
+        }
+        else if (6 == mode)
+        {
+            indexPrec = 0;
+        }
+        else if (4 == mode)
+        {
+            if (0 == index_selector)
+            {
+                indexPrec = uint2(2, 1);
+            }
+            else
+            {
+                indexPrec = uint2(1, 2);
+            }
+        }
+        else
+        {
+            indexPrec = 2;
+        }
+
+        int subset_index;
+        if ((0 == mode) || (2 == mode))
+        {
+            subset_index = (bits2 >> (threadInBlock * 2)) & 0x03;
+        }
+        else if ((1 == mode) || (3 == mode) || (7 == mode))
+        {
+            subset_index = (bits >> threadInBlock) & 0x01;
+        }
+        else
+        {
+            subset_index = 0;
+        }
+
+        // Slot (threadBase + s) holds the end points written by the thread that quantized subset s.
+        ep[0] = shared_temp[threadBase + subset_index].endPoint_low;
+        ep[1] = shared_temp[threadBase + subset_index].endPoint_high;
+
+        int4 span = ep[1] - ep[0];
+        if (mode < 4)
+        {
+            span.w = 0;
+        }
+
+        if ((4 == mode) || (5 == mode))
+        {
+            int2 span_norm_sqr;
+            span_norm_sqr.x = dot( span.rgb, span.rgb );
+            span_norm_sqr.y = span.a * span.a;
+
+            int dotProduct = dot( span.rgb, shared_temp[threadBase + threadInBlock].pixel.rgb - ep[0].rgb );
+            color_index = ( span_norm_sqr.x <= 0 || dotProduct <= 0 ) ? 0
+                : ( ( dotProduct < span_norm_sqr.x ) ? aStep[indexPrec.x][ uint( dotProduct * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] );
+            dotProduct = dot( span.a, shared_temp[threadBase + threadInBlock].pixel.a - ep[0].a );
+            alpha_index = ( span_norm_sqr.y <= 0 || dotProduct <= 0 ) ? 0
+                : ( ( dotProduct < span_norm_sqr.y ) ? aStep[indexPrec.y][ uint( dotProduct * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );
+
+            if (index_selector)
+            {
+                swap(color_index, alpha_index);
+            }
+        }
+        else
+        {
+            int span_norm_sqr = dot( span, span );
+
+            int dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel - ep[0] );
+            color_index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
+                : ( ( dotProduct < span_norm_sqr ) ? aStep[indexPrec.x][ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep[indexPrec.x][63] );
+        }
+
+        // The .error and .mode slots are reused to hand the indices to block_packageN
+        // (read back there through get_color_index / get_alpha_index).
+        shared_temp[GI].error = color_index;
+        shared_temp[GI].mode = alpha_index;
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    // Thread 0 assembles the 128-bit block and writes it out.
+    if (0 == threadInBlock)
+    {
+        uint4 block;
+        if (0 == mode)
+        {
+            block_package0( block, partition, threadBase );
+        }
+        else if (1 == mode)
+        {
+            block_package1( block, partition, threadBase );
+        }
+        else if (2 == mode)
+        {
+            block_package2( block, partition, threadBase );
+        }
+        else if (3 == mode)
+        {
+            block_package3( block, partition, threadBase );
+        }
+        else if (4 == mode)
+        {
+            block_package4( block, rotation, index_selector, threadBase );
+        }
+        else if (5 == mode)
+        {
+            block_package5( block, rotation, threadBase );
+        }
+        else if (6 == mode)
+        {
+            block_package6( block, threadBase );
+        }
+        else //if (7 == mode)
+        {
+            block_package7( block, partition, threadBase );
+        }
+
+        g_OutBuff[blockID] = block;
+    }
+}
+
+//uint4 truncate_and_round( uint4 color, uint bits)
+//{
+//    uint precisionMask = ((1 << bits) - 1) << (8 - bits);
+//    uint precisionHalf = (1 << (7-bits));
+//
+//    uint4 truncated = color & precisionMask;
+//    uint4 rounded = min(255, color + precisionHalf) & precisionMask;
+//
+//    uint4 truncated_bak = truncated =
truncated | (truncated >> bits);
+//    uint4 rounded_bak = rounded = rounded | (rounded >> bits);
+//
+//    uint4 color_bak = color;
+//
+//    Ensure_A_Is_Larger( rounded, color );
+//    Ensure_A_Is_Larger( truncated, color_bak );
+//
+//    if (dot(rounded - color, rounded - color) <
+//        dot(truncated - color_bak, truncated - color_bak))
+//    {
+//        return rounded_bak;
+//    }
+//    else
+//    {
+//        return truncated_bak;
+//    }
+//}
+
+// Rounds each 8-bit component of 'color' to the nearest value representable in uPrec bits
+// (half a step is added, clamped to 255) and returns the uPrec-bit result.
+uint4 quantize( uint4 color, uint uPrec )
+{
+    uint4 rnd = min(255, color + (1 << (7 - uPrec)));
+    return rnd >> (8 - uPrec);
+}
+
+// Expands uPrec-bit components back to 8 bits by left-aligning them and replicating
+// the top bits into the vacated low bits.
+uint4 unquantize( uint4 color, uint uPrec )
+{
+    color = color << (8 - uPrec);
+    return color | (color >> uPrec);
+}
+
+// compress_endpointsN( endPoint, P ): quantizes both end points of one subset to the
+// precision used for mode N.
+//   endPoint (inout) - replaced by the values the quantized end points expand back to,
+//                      so callers measure error against what will actually be stored.
+//   P                - p-bit for end point 0 (P.x) and end point 1 (P.y); it replaces the
+//                      least significant bit of every quantized component.
+//   returns          - the quantized end points, shifted left so that their most
+//                      significant bit sits at bit 7 (the layout block_packageN masks from).
+
+// Mode 0: RGB 4 bits + p-bit (5 bits total), alpha forced to 0xFF.
+uint2x4 compress_endpoints0( inout uint2x4 endPoint, uint2 P )
+{
+    uint2x4 quantized;
+    for ( uint j = 0; j < 2; j ++ )
+    {
+        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb & 0xFFFFFFFE;
+        quantized[j].rgb |= P[j];
+        quantized[j].a = 0xFF;
+
+        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
+        endPoint[j].a = 0xFF;
+
+        quantized[j] <<= 3;
+    }
+    return quantized;
+}
+// Mode 1: RGB 6 bits + p-bit (7 bits total), alpha forced to 0xFF.
+uint2x4 compress_endpoints1( inout uint2x4 endPoint, uint2 P )
+{
+    uint2x4 quantized;
+    for ( uint j = 0; j < 2; j ++ )
+    {
+        quantized[j].rgb = quantize(endPoint[j].rgbb, 7).rgb & 0xFFFFFFFE;
+        quantized[j].rgb |= P[j];
+        quantized[j].a = 0xFF;
+
+        endPoint[j].rgb = unquantize(quantized[j].rgbb, 7).rgb;
+        endPoint[j].a = 0xFF;
+
+        quantized[j] <<= 1;
+    }
+    return quantized;
+}
+// Mode 2: RGB 5 bits, no p-bit, alpha forced to 0xFF.
+uint2x4 compress_endpoints2( inout uint2x4 endPoint )
+{
+    uint2x4 quantized;
+    for ( uint j = 0; j < 2; j ++ )
+    {
+        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb;
+        quantized[j].a = 0xFF;
+
+        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
+        endPoint[j].a = 0xFF;
+
+        quantized[j] <<= 3;
+    }
+    return quantized;
+}
+// Mode 3: RGB 7 bits + p-bit (the full 8 bits, so no quantize/unquantize step), alpha forced to 0xFF.
+uint2x4 compress_endpoints3( inout uint2x4 endPoint, uint2 P )
+{
+    uint2x4 quantized;
+    for ( uint j = 0; j < 2; j ++ )
+    {
+        quantized[j].rgb = endPoint[j].rgb & 0xFFFFFFFE;
+        quantized[j].rgb |= P[j];
+        quantized[j].a = 0xFF;
+
+        endPoint[j].rgb = quantized[j].rgb;
+        endPoint[j].a = 0xFF;
+    }
+    return quantized;
+}
+// Mode 4: RGB 5 bits, alpha 6 bits, no p-bit.
+uint2x4 compress_endpoints4( inout uint2x4 endPoint )
+{
+    uint2x4 quantized;
+    for ( uint j = 0; j < 2; j ++ )
+    {
+        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb;
+        quantized[j].a = quantize(endPoint[j].a, 6).r;
+
+        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
+        endPoint[j].a = unquantize(quantized[j].a, 6).r;
+
+        quantized[j].rgb <<= 3;
+        quantized[j].a <<= 2;
+    }
+    return quantized;
+}
+// Mode 5: RGB 7 bits, alpha kept at full 8-bit precision, no p-bit.
+uint2x4 compress_endpoints5( inout uint2x4 endPoint )
+{
+    uint2x4 quantized;
+    for ( uint j = 0; j < 2; j ++ )
+    {
+        quantized[j].rgb = quantize(endPoint[j].rgbb, 7).rgb;
+        quantized[j].a = endPoint[j].a;
+
+        endPoint[j].rgb = unquantize(quantized[j].rgbb, 7).rgb;
+        // endPoint[j].a   Alpha is full precision
+
+        quantized[j].rgb <<= 1;
+    }
+    return quantized;
+}
+// Mode 6: RGBA 7 bits + p-bit (the full 8 bits, so no quantize/unquantize step).
+uint2x4 compress_endpoints6( inout uint2x4 endPoint, uint2 P )
+{
+    uint2x4 quantized;
+    for ( uint j = 0; j < 2; j ++ )
+    {
+        quantized[j] = endPoint[j] & 0xFFFFFFFE;
+        quantized[j] |= P[j];
+
+        endPoint[j] = quantized[j];
+    }
+    return quantized;
+}
+// Mode 7: RGBA 5 bits + p-bit (6 bits total).
+uint2x4 compress_endpoints7( inout uint2x4 endPoint, uint2 P )
+{
+    uint2x4 quantized;
+    for ( uint j = 0; j < 2; j ++ )
+    {
+        quantized[j] = quantize(endPoint[j], 6) & 0xFFFFFFFE;
+        quantized[j] |= P[j];
+
+        endPoint[j] = unquantize(quantized[j], 6);
+    }
+    return quantized << 2;
+}
+
+// Accessors used by block_packageN. They expect a variable named threadBase in scope.
+// The per-pixel indices are read from the .error / .mode slots, which EncodeBlockCS
+// reuses to store color_index / alpha_index before packing.
+#define get_end_point_l(subset) shared_temp[threadBase + (subset)].endPoint_low_quantized
+#define get_end_point_h(subset) shared_temp[threadBase + (subset)].endPoint_high_quantized
+#define get_color_index(index) shared_temp[threadBase + (index)].error
+#define get_alpha_index(index) shared_temp[threadBase + (index)].mode
+
+// block_packageN( block, ... ): packs the mode-N bit stream into the four 32-bit words
+// of 'block' (x = bits 0..31 ... w = bits 96..127): mode bit, partition / rotation,
+// end points channel by channel, p-bits, then the per-pixel indices.
+// Anchor indices (pixel 0 and each subset's fix-up index) are stored with one bit less
+// than the others, which is why the index shifts change after every fix-up position.
+
+// Mode 0: 3 subsets, 4-bit RGB end points + 6 p-bits, 3-bit indices starting at bit 19 of block.z.
+void block_package0( out uint4 block, uint partition, uint threadBase )
+{
+    block.x = 0x01 | ( (partition - 64) << 1 )
+        | ( ( get_end_point_l(0).r & 0xF0 ) << 1 ) | ( ( get_end_point_h(0).r & 0xF0 ) << 5 )
+        | ( ( get_end_point_l(1).r & 0xF0 ) << 9 ) | ( ( get_end_point_h(1).r & 0xF0 ) << 13 )
+        | ( ( get_end_point_l(2).r & 0xF0 ) << 17 ) | ( ( get_end_point_h(2).r & 0xF0 ) << 21 )
+        | ( ( get_end_point_l(0).g & 0xF0 ) << 25 );
+    block.y = ( ( get_end_point_l(0).g & 0xF0 ) >> 7 ) | ( ( get_end_point_h(0).g & 0xF0 ) >> 3 )
+        | ( ( get_end_point_l(1).g & 0xF0 ) << 1 ) | ( ( get_end_point_h(1).g & 0xF0 ) << 5 )
+        | ( ( get_end_point_l(2).g & 0xF0 ) << 9 ) | ( ( get_end_point_h(2).g & 0xF0 ) << 13 )
+        | ( ( get_end_point_l(0).b & 0xF0 ) << 17 ) | ( ( get_end_point_h(0).b & 0xF0 ) << 21 )
+        | ( ( get_end_point_l(1).b & 0xF0 ) << 25 );
+    block.z = ( ( get_end_point_l(1).b & 0xF0 ) >> 7 ) | ( ( get_end_point_h(1).b & 0xF0 ) >> 3 )
+        | ( ( get_end_point_l(2).b & 0xF0 ) << 1 ) | ( ( get_end_point_h(2).b & 0xF0 ) << 5 )
+        | ( ( get_end_point_l(0).r & 0x08 ) << 10 ) | ( ( get_end_point_h(0).r & 0x08 ) << 11 )
+        | ( ( get_end_point_l(1).r & 0x08 ) << 12 ) | ( ( get_end_point_h(1).r & 0x08 ) << 13 )
+        | ( ( get_end_point_l(2).r & 0x08 ) << 14 ) | ( ( get_end_point_h(2).r & 0x08 ) << 15 )
+        | ( get_color_index(0) << 19 );
+    block.w = 0;
+    uint i = 1;
+    // Indices up to pixel 4 live in block.z; index 4 straddles the z/w boundary unless a
+    // fix-up index before it has already shortened the stream by one bit.
+    for ( ; i <= min( candidateFixUpIndex1DOrdered[partition][0], 4 ); i ++ )
+    {
+        block.z |= get_color_index(i) << ( i * 3 + 18 );
+    }
+    if ( candidateFixUpIndex1DOrdered[partition][0] < 4 ) //i = 4
+    {
+        block.z |= get_color_index(4) << 29;
+        i += 1;
+    }
+    else //i = 5
+    {
+        block.w |= ( get_color_index(4) & 0x04 ) >> 2;
+        for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ )
+            block.w |= get_color_index(i) << ( i * 3 - 14 );
+    }
+    for ( ; i <= candidateFixUpIndex1DOrdered[partition][1]; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 3 - 15 );
+    }
+    for ( ; i < 16; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 3 - 16 );
+    }
+}
+// Mode 1: 2 subsets, 6-bit RGB end points + 2 shared p-bits, 3-bit indices starting at
+// bit 18 of block.z. Index 0 takes 2 bits (18..19), indices 1..4 follow at 20, 23, 26, 29,
+// and index 5 starts block.w - except when the fix-up index is 2, where everything after
+// pixel 2 moves down one bit and index 5 straddles the z/w boundary.
+// One branch per possible fix-up index of the second subset (15, 2, 8, otherwise 6).
+void block_package1( out uint4 block, uint partition, uint threadBase )
+{
+    block.x = 0x02 | ( partition << 2 )
+        | ( ( get_end_point_l(0).r & 0xFC ) << 6 ) | ( ( get_end_point_h(0).r & 0xFC ) << 12 )
+        | ( ( get_end_point_l(1).r & 0xFC ) << 18 ) | ( ( get_end_point_h(1).r & 0xFC ) << 24 );
+    block.y = ( ( get_end_point_l(0).g & 0xFC ) >> 2 ) | ( ( get_end_point_h(0).g & 0xFC ) << 4 )
+        | ( ( get_end_point_l(1).g & 0xFC ) << 10 ) | ( ( get_end_point_h(1).g & 0xFC ) << 16 )
+        | ( ( get_end_point_l(0).b & 0xFC ) << 22 ) | ( ( get_end_point_h(0).b & 0xFC ) << 28 );
+    block.z = ( ( get_end_point_h(0).b & 0xFC ) >> 4 ) | ( ( get_end_point_l(1).b & 0xFC ) << 2 )
+        | ( ( get_end_point_h(1).b & 0xFC ) << 8 )
+        | ( ( get_end_point_l(0).r & 0x02 ) << 15 ) | ( ( get_end_point_l(1).r & 0x02 ) << 16 )
+        | ( get_color_index(0) << 18 );
+    if ( candidateFixUpIndex1DOrdered[partition][0] == 15 )
+    {
+        // Only the last index is shortened: index 15 takes the top 2 bits of block.w.
+        block.w = (get_color_index(15) << 30) | (get_color_index(14) << 27) | (get_color_index(13) << 24) | (get_color_index(12) << 21) | (get_color_index(11) << 18) | (get_color_index(10) << 15)
+            | (get_color_index(9) << 12) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
+        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+    else if ( candidateFixUpIndex1DOrdered[partition][0] == 2 )
+    {
+        // Index 2 is 2 bits wide, so index 5 starts at bit 31 of block.z and spills into block.w.
+        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
+            | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 2) | (get_color_index(5) >> 1);
+        block.z |= (get_color_index(5) << 31) | (get_color_index(4) << 28) | (get_color_index(3) << 25) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+    else if ( candidateFixUpIndex1DOrdered[partition][0] == 8 )
+    {
+        // Index 8 is 2 bits wide (bits 9..10), so indices 9..15 sit one bit lower than in the == 15 case.
+        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
+            | (get_color_index(9) << 11) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
+        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+    else //candidateFixUpIndex1DOrdered[partition] == 6
+    {
+        // Index 5 fills bits 0..2, the 2-bit anchor index 6 bits 3..4, index 7 bits 5..7 and
+        // index 8 bits 8..10. (The shifts for indices 6 and 7 used to be 4 and 6, which left
+        // bit 3 unused and made index 7 collide with index 8.)
+        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
+            | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 3) | get_color_index(5);
+        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+}
+// Mode 2: 3 subsets, 5-bit RGB end points, no p-bits, 2-bit indices starting at bit 3 of block.w.
+void block_package2( out uint4 block, uint partition, uint threadBase )
+{
+    block.x = 0x04 | ( (partition - 64) << 3 )
+        | ( ( get_end_point_l(0).r & 0xF8 ) << 6 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 11 )
+        | ( ( get_end_point_l(1).r & 0xF8 ) << 16 ) | ( ( get_end_point_h(1).r & 0xF8 ) << 21 )
+        | ( ( get_end_point_l(2).r & 0xF8 ) << 26 );
+    block.y = ( ( get_end_point_l(2).r & 0xF8 ) >> 6 ) | ( ( get_end_point_h(2).r & 0xF8 ) >> 1 )
+        | ( ( get_end_point_l(0).g & 0xF8 ) << 4 ) | ( ( get_end_point_h(0).g & 0xF8 ) << 9 )
+        | ( ( get_end_point_l(1).g & 0xF8 ) << 14 ) | ( ( get_end_point_h(1).g & 0xF8 ) << 19 )
+        | ( ( get_end_point_l(2).g & 0xF8 ) << 24 );
+    block.z = ( ( get_end_point_h(2).g & 0xF8 ) >> 3 ) | ( ( get_end_point_l(0).b & 0xF8 ) << 2 )
+        | ( ( get_end_point_h(0).b & 0xF8 ) << 7 ) | ( ( get_end_point_l(1).b & 0xF8 ) << 12 )
+        | ( ( get_end_point_h(1).b & 0xF8 ) << 17 ) | ( ( get_end_point_l(2).b & 0xF8 ) << 22 )
+        | ( ( get_end_point_h(2).b & 0xF8 ) << 27 );
+    block.w = ( ( get_end_point_h(2).b & 0xF8 ) >> 5 )
+        | ( get_color_index(0) << 3 );
+    uint i = 1;
+    // The shift drops by one after each fix-up index, since that index used one bit less.
+    for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 + 2 );
+    }
+    for ( ; i <= candidateFixUpIndex1DOrdered[partition][1]; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 + 1 );
+    }
+    for ( ; i < 16; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 );
+    }
+}
+// Mode 3: 2 subsets, 7-bit RGB end points + 4 p-bits, 2-bit indices starting at bit 2 of block.w.
+void block_package3( out uint4 block, uint partition, uint threadBase )
+{
+    block.x = 0x08 | ( partition << 4 )
+        | ( ( get_end_point_l(0).r & 0xFE ) << 9 ) | ( ( get_end_point_h(0).r & 0xFE ) << 16 )
+        | ( ( get_end_point_l(1).r & 0xFE ) << 23 ) | ( ( get_end_point_h(1).r & 0xFE ) << 30 );
+    block.y = ( ( get_end_point_h(1).r & 0xFE ) >> 2 ) | ( ( get_end_point_l(0).g & 0xFE ) << 5 )
+        | ( ( get_end_point_h(0).g & 0xFE ) << 12 ) | ( ( get_end_point_l(1).g & 0xFE ) << 19 )
+        | ( ( get_end_point_h(1).g & 0xFE ) << 26 );
+    block.z = ( ( get_end_point_h(1).g & 0xFE ) >> 6 ) | ( ( get_end_point_l(0).b & 0xFE ) << 1 )
+        | ( ( get_end_point_h(0).b & 0xFE ) << 8 ) | ( ( get_end_point_l(1).b & 0xFE ) << 15 )
+        | ( ( get_end_point_h(1).b & 0xFE ) << 22 )
+        | ( ( get_end_point_l(0).r & 0x01 ) << 30 ) | ( ( get_end_point_h(0).r & 0x01 ) << 31 );
+    block.w = ( ( get_end_point_l(1).r & 0x01 ) << 0 ) | ( ( get_end_point_h(1).r & 0x01 ) << 1 )
+        | ( get_color_index(0) << 2 );
+    uint i = 1;
+    for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 + 1 );
+    }
+    for ( ; i < 16; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 );
+    }
+}
+// Mode 4: 1 subset, 2-bit rotation, 1-bit index selector, 5-bit RGB + 6-bit alpha end points,
+// a 2-bit index stream (from get_color_index) followed by a 3-bit index stream (from get_alpha_index).
+void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase )
+{
+    block.x = 0x10 | ( (rotation & 3) << 5 ) | ( (index_selector & 1) << 7 )
+        | ( ( get_end_point_l(0).r & 0xF8 ) << 5 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 10 )
+        | ( ( get_end_point_l(0).g & 0xF8 ) << 15 ) | ( ( get_end_point_h(0).g & 0xF8 ) << 20 )
+        | ( ( get_end_point_l(0).b & 0xF8 ) << 25 );
+
+    block.y = ( ( get_end_point_l(0).b & 0xF8 ) >> 7 ) | ( ( get_end_point_h(0).b & 0xF8 ) >> 2 )
+        | ( ( get_end_point_l(0).a & 0xFC ) << 4 ) | ( ( get_end_point_h(0).a & 0xFC ) << 10 )
+        | ( (get_color_index(0) & 1) << 18 ) | ( get_color_index(1) << 19 ) | ( get_color_index(2) << 21 ) | ( get_color_index(3) << 23 )
+        | ( get_color_index(4) << 25 ) | ( get_color_index(5) << 27 ) | ( get_color_index(6) << 29 ) | ( get_color_index(7) << 31 );
+
+    block.z = ( get_color_index(7) >> 1 ) | ( get_color_index(8) << 1 ) | ( get_color_index(9) << 3 ) | ( get_color_index(10)<< 5 )
+        | ( get_color_index(11)<< 7 ) | ( get_color_index(12)<< 9 ) | ( get_color_index(13)<< 11 ) | ( get_color_index(14)<< 13 )
+        | ( get_color_index(15)<< 15 ) | ( (get_alpha_index(0) & 3) << 17 ) | ( get_alpha_index(1) << 19 ) | ( get_alpha_index(2) << 22 )
+        | ( get_alpha_index(3) << 25 ) | ( get_alpha_index(4) << 28 ) | ( get_alpha_index(5) << 31 );
+
+    block.w = ( get_alpha_index(5) >> 1 ) | ( get_alpha_index(6) << 2 ) | ( get_alpha_index(7) << 5 ) | ( get_alpha_index(8) << 8 )
+        | ( get_alpha_index(9) << 11 ) | ( get_alpha_index(10)<< 14 ) | ( get_alpha_index(11)<< 17 ) | ( get_alpha_index(12)<< 20 )
+        | ( get_alpha_index(13)<< 23 ) | ( get_alpha_index(14)<< 26 ) | ( get_alpha_index(15)<< 29 );
+}
+// Mode 5: 1 subset, 2-bit rotation, 7-bit RGB + 8-bit alpha end points,
+// 2-bit colour indices followed by 2-bit alpha indices.
+void block_package5( out uint4 block, uint rotation, uint threadBase )
+{
+    block.x = 0x20 | ( rotation << 6 )
+        | ( ( get_end_point_l(0).r & 0xFE ) << 7 ) | ( ( get_end_point_h(0).r & 0xFE ) << 14 )
+        | ( ( get_end_point_l(0).g & 0xFE ) << 21 ) | ( ( get_end_point_h(0).g & 0xFE ) << 28 );
+    block.y = ( ( get_end_point_h(0).g & 0xFE ) >> 4 ) | ( ( get_end_point_l(0).b & 0xFE ) << 3 )
+        | ( ( get_end_point_h(0).b & 0xFE ) << 10 ) | ( get_end_point_l(0).a << 18 ) | ( get_end_point_h(0).a << 26 );
+    block.z = ( get_end_point_h(0).a >> 6 )
+        | ( get_color_index(0) << 2 ) | ( get_color_index(1) << 3 ) | ( get_color_index(2) << 5 ) | ( get_color_index(3) << 7 )
+        | ( get_color_index(4) << 9 ) | ( get_color_index(5) << 11 ) | ( get_color_index(6) << 13 ) | ( get_color_index(7) << 15 )
+        | ( get_color_index(8) << 17 ) | ( get_color_index(9) << 19 ) | ( get_color_index(10)<< 21 ) | ( get_color_index(11)<< 23 )
+        | ( get_color_index(12)<< 25 ) | ( get_color_index(13)<< 27 ) | ( get_color_index(14)<< 29 ) | ( get_color_index(15)<< 31 );
+    block.w = ( get_color_index(15)>> 1 ) | ( get_alpha_index(0) << 1 ) | ( get_alpha_index(1) << 2 ) | ( get_alpha_index(2) << 4 )
+        | ( get_alpha_index(3) << 6 ) | ( get_alpha_index(4) << 8 ) | ( get_alpha_index(5) << 10 ) | ( get_alpha_index(6) << 12 )
+        | ( get_alpha_index(7) << 14 ) | ( get_alpha_index(8) << 16 ) | ( get_alpha_index(9) << 18 ) | ( get_alpha_index(10)<< 20 )
+        | ( get_alpha_index(11)<< 22 ) | ( get_alpha_index(12)<< 24 ) | ( get_alpha_index(13)<< 26 ) | ( get_alpha_index(14)<< 28 )
+        | ( get_alpha_index(15)<< 30 );
+}
+// Mode 6: 1 subset, 7-bit RGBA end points + 2 p-bits, 4-bit indices starting at bit 1 of block.z.
+void block_package6( out uint4 block, uint threadBase )
+{
+    block.x = 0x40
+        | ( ( get_end_point_l(0).r & 0xFE ) << 6 ) | ( ( get_end_point_h(0).r & 0xFE ) << 13 )
+        | ( ( get_end_point_l(0).g & 0xFE ) << 20 ) | ( ( get_end_point_h(0).g & 0xFE ) << 27 );
+    block.y = ( ( get_end_point_h(0).g & 0xFE ) >> 5 ) | ( ( get_end_point_l(0).b & 0xFE ) << 2 )
+        | ( ( get_end_point_h(0).b & 0xFE ) << 9 ) | ( ( get_end_point_l(0).a & 0xFE ) << 16 )
+        | ( ( get_end_point_h(0).a & 0xFE ) << 23 )
+        | ( get_end_point_l(0).r & 0x01 ) << 31;
+    block.z = ( get_end_point_h(0).r & 0x01 )
+        | ( get_color_index(0) << 1 ) | ( get_color_index(1) << 4 ) | ( get_color_index(2) << 8 ) | ( get_color_index(3) << 12 )
+        | ( get_color_index(4) << 16 ) | ( get_color_index(5) << 20 ) | ( get_color_index(6) << 24 ) | ( get_color_index(7) << 28 );
+    block.w = ( get_color_index(8) << 0 ) | ( get_color_index(9) << 4 ) | ( get_color_index(10)<< 8 ) | ( get_color_index(11)<< 12 )
+        | ( get_color_index(12)<< 16 ) | ( get_color_index(13)<< 20 ) | ( get_color_index(14)<< 24 ) | ( get_color_index(15)<< 28 );
+}
+// Mode 7: 2 subsets, 5-bit RGBA end points + 4 p-bits, 2-bit indices starting at bit 2 of block.w.
+void block_package7( out uint4 block, uint partition, uint threadBase )
+{
+    block.x = 0x80 | ( partition << 8 )
+        | ( ( get_end_point_l(0).r & 0xF8 ) << 11 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 16 )
+        | ( ( get_end_point_l(1).r & 0xF8 ) << 21 ) | ( ( get_end_point_h(1).r & 0xF8 ) << 26 );
+    block.y = ( ( get_end_point_h(1).r & 0xF8 ) >> 6 ) | ( ( get_end_point_l(0).g & 0xF8 ) >> 1 )
+        | ( ( get_end_point_h(0).g & 0xF8 ) << 4 ) | ( ( get_end_point_l(1).g & 0xF8 ) << 9 )
+        | ( ( get_end_point_h(1).g & 0xF8 ) << 14 ) | ( ( get_end_point_l(0).b & 0xF8 ) << 19 )
+        | ( ( get_end_point_h(0).b & 0xF8 ) << 24 );
+    block.z = ( ( get_end_point_l(1).b & 0xF8 ) >> 3 ) | ( ( get_end_point_h(1).b & 0xF8 ) << 2 )
+        | ( ( get_end_point_l(0).a & 0xF8 ) << 7 ) | ( ( get_end_point_h(0).a & 0xF8 ) << 12 )
+        | ( ( get_end_point_l(1).a & 0xF8 ) << 17 ) | ( ( get_end_point_h(1).a & 0xF8 ) << 22 )
+        | ( ( get_end_point_l(0).r & 0x04 ) << 28 ) | ( ( get_end_point_h(0).r & 0x04 ) << 29 );
+    block.w = ( ( get_end_point_l(1).r & 0x04 ) >> 2 ) | ( ( get_end_point_h(1).r & 0x04 ) >> 1 )
+        | ( get_color_index(0) << 2 );
+    uint i = 1;
+    for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 + 1 );
+    }
+    for ( ; i < 16; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 );
+    }
+}
\ No newline at end of file
diff --git a/tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl b/tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl
new file mode 100644
index 000000000..798eea2ff
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl
@@ -0,0 +1,72 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//--------------------------------------------------------------------------------------
+// File: BasicCompute11.hlsl
+//
+// This file contains the Compute Shader to perform array A + array B
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-------------------------------------------------------------------------------------- + +#ifdef USE_STRUCTURED_BUFFERS + +struct BufType +{ + int i; + float f; +#ifdef TEST_DOUBLE + double d; +#endif +}; + +StructuredBuffer Buffer0 : register(t0); +StructuredBuffer Buffer1 : register(t1); +RWStructuredBuffer BufferOut : register(u0); + +[numthreads(1, 1, 1)] +void CSMain( uint3 DTid : SV_DispatchThreadID ) +{ + BufferOut[DTid.x].i = Buffer0[DTid.x].i + Buffer1[DTid.x].i; + BufferOut[DTid.x].f = Buffer0[DTid.x].f + Buffer1[DTid.x].f; +#ifdef TEST_DOUBLE + BufferOut[DTid.x].d = Buffer0[DTid.x].d + Buffer1[DTid.x].d; +#endif +} + +#else // The following code is for raw buffers + +ByteAddressBuffer Buffer0 : register(t0); +ByteAddressBuffer Buffer1 : register(t1); +RWByteAddressBuffer BufferOut : register(u0); + +[numthreads(1, 1, 1)] +void CSMain( uint3 DTid : SV_DispatchThreadID ) +{ +#ifdef TEST_DOUBLE + int i0 = asint( Buffer0.Load( DTid.x*16 ) ); + float f0 = asfloat( Buffer0.Load( DTid.x*16+4 ) ); + double d0 = asdouble( Buffer0.Load( DTid.x*16+8 ), Buffer0.Load( DTid.x*16+12 ) ); + int i1 = asint( Buffer1.Load( DTid.x*16 ) ); + float f1 = asfloat( Buffer1.Load( DTid.x*16+4 ) ); + double d1 = asdouble( Buffer1.Load( DTid.x*16+8 ), Buffer1.Load( DTid.x*16+12 ) ); + + BufferOut.Store( DTid.x*16, asuint(i0 + i1) ); + BufferOut.Store( DTid.x*16+4, asuint(f0 + f1) ); + + uint dl, dh; + asuint( d0 + d1, dl, dh ); + + BufferOut.Store( DTid.x*16+8, dl ); + BufferOut.Store( DTid.x*16+12, dh ); +#else + int i0 = asint( Buffer0.Load( DTid.x*8 ) ); + float f0 = asfloat( Buffer0.Load( DTid.x*8+4 ) ); + int i1 = asint( Buffer1.Load( DTid.x*8 ) ); + float f1 = asfloat( Buffer1.Load( DTid.x*8+4 ) ); + + BufferOut.Store( DTid.x*8, asuint(i0 + i1) ); + BufferOut.Store( DTid.x*8+4, asuint(f0 + f1) ); +#endif // TEST_DOUBLE +} + +#endif // USE_STRUCTURED_BUFFERS diff --git a/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx new file mode 
100644 index 000000000..bd28f862b --- /dev/null +++ b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx @@ -0,0 +1,158 @@ +//TEST_IGNORE_FILE: +//-------------------------------------------------------------------------------------- +// File: BasicHLSL.fx +// +// The effect file for the BasicHLSL sample. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- + + +//-------------------------------------------------------------------------------------- +// Global variables +//-------------------------------------------------------------------------------------- +float4 g_MaterialAmbientColor; // Material's ambient color +float4 g_MaterialDiffuseColor; // Material's diffuse color +int g_nNumLights; + +float3 g_LightDir; // Light's direction in world space +float4 g_LightDiffuse; // Light's diffuse color +float4 g_LightAmbient; // Light's ambient color + +texture g_MeshTexture; // Color texture for mesh + +float g_fTime; // App's time in seconds +float4x4 g_mWorld; // World matrix for object +float4x4 g_mWorldViewProjection; // World * View * Projection matrix + + + +//-------------------------------------------------------------------------------------- +// Texture samplers +//-------------------------------------------------------------------------------------- +sampler MeshTextureSampler = +sampler_state +{ + Texture = ; + MipFilter = LINEAR; + MinFilter = LINEAR; + MagFilter = LINEAR; +}; + + +//-------------------------------------------------------------------------------------- +// Vertex shader output structure +//-------------------------------------------------------------------------------------- +struct VS_OUTPUT +{ + float4 Position : POSITION; // vertex position + float4 Diffuse : COLOR0; // vertex diffuse color (note that COLOR0 is clamped from 0..1) + float2 TextureUV : TEXCOORD0; // vertex texture coords +}; + + 
+//-------------------------------------------------------------------------------------- +// This shader computes standard transform and lighting +//-------------------------------------------------------------------------------------- +VS_OUTPUT RenderSceneVS( float4 vPos : POSITION, + float3 vNormal : NORMAL, + float2 vTexCoord0 : TEXCOORD0, + uniform int nNumLights, + uniform bool bTexture, + uniform bool bAnimate ) +{ + + VS_OUTPUT Output; + float3 vNormalWorldSpace; + + // Transform the position from object space to homogeneous projection space + Output.Position = mul(vPos, g_mWorldViewProjection); + + // Transform the normal from object space to world space + vNormalWorldSpace = normalize(mul(vNormal, (float3x3)g_mWorld)); // normal (world space) + + // Compute simple directional lighting equation + float3 vTotalLightDiffuse = float3(0,0,0); + for(int i=0; i 1 ) + int fBlendIntervalbelowIndex = min(0, iCurrentCascadeIndex-1); + fPixelDepth -= m_fCascadeFrustumsEyeSpaceDepthsFloat4[ fBlendIntervalbelowIndex ].x; + fBlendInterval -= m_fCascadeFrustumsEyeSpaceDepthsFloat4[ fBlendIntervalbelowIndex ].x; + + // The current pixel's blend band location will be used to determine when we need to blend and by how much. + fCurrentPixelsBlendBandLocation = fPixelDepth / fBlendInterval; + fCurrentPixelsBlendBandLocation = 1.0f - fCurrentPixelsBlendBandLocation; + // The fBlendBetweenCascadesAmount is our location in the blend band. + fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea; +} + + + +//-------------------------------------------------------------------------------------- +// Calculate amount to blend between two cascades and the band where blending will occure. 
+//-------------------------------------------------------------------------------------- +void CalculateBlendAmountForMap ( in float4 vShadowMapTextureCoord, + in out float fCurrentPixelsBlendBandLocation, + out float fBlendBetweenCascadesAmount ) +{ + // Calcaulte the blend band for the map based selection. + float2 distanceToOne = float2 ( 1.0f - vShadowMapTextureCoord.x, 1.0f - vShadowMapTextureCoord.y ); + fCurrentPixelsBlendBandLocation = min( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ); + float fCurrentPixelsBlendBandLocation2 = min( distanceToOne.x, distanceToOne.y ); + fCurrentPixelsBlendBandLocation = + min( fCurrentPixelsBlendBandLocation, fCurrentPixelsBlendBandLocation2 ); + fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea; +} + +//-------------------------------------------------------------------------------------- +// Calculate the shadow based on several options and rende the scene. +//-------------------------------------------------------------------------------------- +float4 PSMain( VS_OUTPUT Input ) : SV_TARGET +{ + float4 vDiffuse = g_txDiffuse.Sample( g_samLinear, Input.vTexcoord ); + + float4 vShadowMapTextureCoord = 0.0f; + float4 vShadowMapTextureCoord_blend = 0.0f; + + float4 vVisualizeCascadeColor = float4(0.0f,0.0f,0.0f,1.0f); + + float fPercentLit = 0.0f; + float fPercentLit_blend = 0.0f; + + + float fUpTextDepthWeight=0; + float fRightTextDepthWeight=0; + float fUpTextDepthWeight_blend=0; + float fRightTextDepthWeight_blend=0; + + int iBlurRowSize = m_iPCFBlurForLoopEnd - m_iPCFBlurForLoopStart; + iBlurRowSize *= iBlurRowSize; + float fBlurRowSize = (float)iBlurRowSize; + + int iCascadeFound = 0; + int iNextCascadeIndex = 1; + + float fCurrentPixelDepth; + + // The interval based selection technique compares the pixel's depth against the frustum's cascade divisions. 
+ fCurrentPixelDepth = Input.vDepth; + + // This for loop is not necessary when the frustum is uniformaly divided and interval based selection is used. + // In this case fCurrentPixelDepth could be used as an array lookup into the correct frustum. + int iCurrentCascadeIndex; + + float4 vShadowMapTextureCoordViewSpace = Input.vTexShadow; + if( SELECT_CASCADE_BY_INTERVAL_FLAG ) + { + iCurrentCascadeIndex = 0; + if ( CASCADE_COUNT_FLAG > 1 ) + { + float4 vCurrentPixelDepth = Input.vDepth; + float4 fComparison = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsFloat[0]); + float4 fComparison2 = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsFloat[1]); + float fIndex = dot( + float4( CASCADE_COUNT_FLAG > 0, + CASCADE_COUNT_FLAG > 1, + CASCADE_COUNT_FLAG > 2, + CASCADE_COUNT_FLAG > 3) + , fComparison ) + + dot( + float4( + CASCADE_COUNT_FLAG > 4, + CASCADE_COUNT_FLAG > 5, + CASCADE_COUNT_FLAG > 6, + CASCADE_COUNT_FLAG > 7) + , fComparison2 ) ; + + fIndex = min( fIndex, CASCADE_COUNT_FLAG - 1 ); + iCurrentCascadeIndex = (int)fIndex; + } + } + + if ( !SELECT_CASCADE_BY_INTERVAL_FLAG ) + { + iCurrentCascadeIndex = 0; + if ( CASCADE_COUNT_FLAG == 1 ) + { + vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[0]; + vShadowMapTextureCoord += m_vCascadeOffset[0]; + } + if ( CASCADE_COUNT_FLAG > 1 ) { + for( int iCascadeIndex = 0; iCascadeIndex < CASCADE_COUNT_FLAG && iCascadeFound == 0; ++iCascadeIndex ) + { + vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iCascadeIndex]; + vShadowMapTextureCoord += m_vCascadeOffset[iCascadeIndex]; + + if ( min( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) > m_fMinBorderPadding + && max( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) < m_fMaxBorderPadding ) + { + iCurrentCascadeIndex = iCascadeIndex; + iCascadeFound = 1; + } + } + } + } + + float4 color = 0; + + if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG ) + { + // Repeat text coord calculations for the next cascade. 
+ // The next cascade index is used for blurring between maps. + iNextCascadeIndex = min ( CASCADE_COUNT_FLAG - 1, iCurrentCascadeIndex + 1 ); + } + + float fBlendBetweenCascadesAmount = 1.0f; + float fCurrentPixelsBlendBandLocation = 1.0f; + + if( SELECT_CASCADE_BY_INTERVAL_FLAG ) + { + if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1 ) + { + CalculateBlendAmountForInterval ( iCurrentCascadeIndex, fCurrentPixelDepth, + fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount ); + } + } + else + { + + if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG ) + { + CalculateBlendAmountForMap ( vShadowMapTextureCoord, + fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount ); + } + } + + float3 vShadowMapTextureCoordDDX; + float3 vShadowMapTextureCoordDDY; + // The derivatives are used to find the slope of the current plane. + // The derivative calculation has to be inside of the loop in order to prevent divergent flow control artifacts. + if( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG ) + { + vShadowMapTextureCoordDDX = ddx( vShadowMapTextureCoordViewSpace ); + vShadowMapTextureCoordDDY = ddy( vShadowMapTextureCoordViewSpace ); + + vShadowMapTextureCoordDDX *= m_vCascadeScale[iCurrentCascadeIndex]; + vShadowMapTextureCoordDDY *= m_vCascadeScale[iCurrentCascadeIndex]; + } + + ComputeCoordinatesTransform( iCurrentCascadeIndex, + Input.vInterpPos, + vShadowMapTextureCoord, + vShadowMapTextureCoordViewSpace ); + + + vVisualizeCascadeColor = vCascadeColorsMultiplier[iCurrentCascadeIndex]; + + if( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG ) + { + CalculateRightAndUpTexelDepthDeltas ( vShadowMapTextureCoordDDX, vShadowMapTextureCoordDDY, + fUpTextDepthWeight, fRightTextDepthWeight ); + } + + CalculatePCFPercentLit ( vShadowMapTextureCoord, fRightTextDepthWeight, + fUpTextDepthWeight, fBlurRowSize, fPercentLit ); + + if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1 ) + { + if( fCurrentPixelsBlendBandLocation < m_fCascadeBlendArea) + { // the current pixel is 
within the blend band. + + // Repeat text coord calculations for the next cascade. + // The next cascade index is used for blurring between maps. + if( !SELECT_CASCADE_BY_INTERVAL_FLAG ) + { + vShadowMapTextureCoord_blend = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iNextCascadeIndex]; + vShadowMapTextureCoord_blend += m_vCascadeOffset[iNextCascadeIndex]; + } + + ComputeCoordinatesTransform( iNextCascadeIndex, Input.vInterpPos, + vShadowMapTextureCoord_blend, + vShadowMapTextureCoordViewSpace ); + + // We repeat the calcuation for the next cascade layer, when blending between maps. + if( fCurrentPixelsBlendBandLocation < m_fCascadeBlendArea) + { // the current pixel is within the blend band. + if( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG ) + { + + CalculateRightAndUpTexelDepthDeltas ( vShadowMapTextureCoordDDX, + vShadowMapTextureCoordDDY, + fUpTextDepthWeight_blend, + fRightTextDepthWeight_blend ); + } + CalculatePCFPercentLit ( vShadowMapTextureCoord_blend, fRightTextDepthWeight_blend, + fUpTextDepthWeight_blend, fBlurRowSize, fPercentLit_blend ); + fPercentLit = lerp( fPercentLit_blend, fPercentLit, fBlendBetweenCascadesAmount ); + // Blend the two calculated shadows by the blend amount. + } + } + } + + + if( !m_iVisualizeCascades ) vVisualizeCascadeColor = float4(1.0f,1.0f,1.0f,1.0f); + + float3 vLightDir1 = float3( -1.0f, 1.0f, -1.0f ); + float3 vLightDir2 = float3( 1.0f, 1.0f, -1.0f ); + float3 vLightDir3 = float3( 0.0f, -1.0f, 0.0f ); + float3 vLightDir4 = float3( 1.0f, 1.0f, 1.0f ); + // Some ambient-like lighting. 
+ float fLighting = + saturate( dot( vLightDir1 , Input.vNormal ) )*0.05f + + saturate( dot( vLightDir2 , Input.vNormal ) )*0.05f + + saturate( dot( vLightDir3 , Input.vNormal ) )*0.05f + + saturate( dot( vLightDir4 , Input.vNormal ) )*0.05f ; + + float4 vShadowLighting = fLighting * 0.5f; + fLighting += saturate( dot( m_vLightDir , Input.vNormal ) ); + fLighting = lerp( vShadowLighting, fLighting, fPercentLit ); + + return fLighting * vVisualizeCascadeColor * vDiffuse; + +} + diff --git a/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl b/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl new file mode 100644 index 000000000..3b4d32a0d --- /dev/null +++ b/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl @@ -0,0 +1,53 @@ +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -entry VSMainPancake +//-------------------------------------------------------------------------------------- +// File: RenderCascadeShadow.hlsl +// +// The shader file for the RenderCascadeScene sample. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + + +//-------------------------------------------------------------------------------------- +// Globals +//-------------------------------------------------------------------------------------- +cbuffer cbPerObject : register( b0 ) +{ + matrix g_mWorldViewProjection : packoffset( c0 ); +}; + +//-------------------------------------------------------------------------------------- +// Input / Output structures +//-------------------------------------------------------------------------------------- +struct VS_INPUT +{ + float4 vPosition : POSITION; +}; + +struct VS_OUTPUT +{ + float4 vPosition : SV_POSITION; +}; + +//-------------------------------------------------------------------------------------- +// Vertex Shader +//-------------------------------------------------------------------------------------- +VS_OUTPUT VSMain( VS_INPUT Input ) +{ + VS_OUTPUT Output; + + // There is nothing special here, just transform and write out the depth. + Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection ); + + return Output; +} + + +VS_OUTPUT VSMainPancake( VS_INPUT Input ) +{ + VS_OUTPUT Output; + // after transform move clipped geometry to near plane + Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection ); + //Output.vPosition.z = max( Output.vPosition.z, 0.0f ); + return Output; +} \ No newline at end of file diff --git a/tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl b/tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl new file mode 100644 index 000000000..db7bd5136 --- /dev/null +++ b/tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl @@ -0,0 +1,75 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. 
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry BitonicSort -entry MatrixTranspose +//-------------------------------------------------------------------------------------- +// File: ComputeShaderSort11.hlsl +// +// This file contains the compute shaders to perform GPU sorting using DirectX 11. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- + +#define BITONIC_BLOCK_SIZE 512 + +#define TRANSPOSE_BLOCK_SIZE 16 + +//-------------------------------------------------------------------------------------- +// Constant Buffers +//-------------------------------------------------------------------------------------- +cbuffer CB : register( b0 ) +{ + unsigned int g_iLevel; + unsigned int g_iLevelMask; + unsigned int g_iWidth; + unsigned int g_iHeight; +}; + +//-------------------------------------------------------------------------------------- +// Structured Buffers +//-------------------------------------------------------------------------------------- +StructuredBuffer Input : register( t0 ); +RWStructuredBuffer Data : register( u0 ); + +//-------------------------------------------------------------------------------------- +// Bitonic Sort Compute Shader +//-------------------------------------------------------------------------------------- +groupshared unsigned int shared_data[BITONIC_BLOCK_SIZE]; + +[numthreads(BITONIC_BLOCK_SIZE, 1, 1)] +void BitonicSort( uint3 Gid : SV_GroupID, + uint3 DTid : SV_DispatchThreadID, + uint3 GTid : SV_GroupThreadID, + uint GI : SV_GroupIndex ) +{ + // Load shared data + shared_data[GI] = Data[DTid.x]; + GroupMemoryBarrierWithGroupSync(); + + // Sort the shared data + for (unsigned int j = g_iLevel >> 1 ; j > 0 ; j >>= 1) + { + unsigned int result = ((shared_data[GI & ~j] <= shared_data[GI | j]) == (bool)(g_iLevelMask & DTid.x))? 
shared_data[GI ^ j] : shared_data[GI]; + GroupMemoryBarrierWithGroupSync(); + shared_data[GI] = result; + GroupMemoryBarrierWithGroupSync(); + } + + // Store shared data + Data[DTid.x] = shared_data[GI]; +} + +//-------------------------------------------------------------------------------------- +// Matrix Transpose Compute Shader +//-------------------------------------------------------------------------------------- +groupshared unsigned int transpose_shared_data[TRANSPOSE_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE]; + +[numthreads(TRANSPOSE_BLOCK_SIZE, TRANSPOSE_BLOCK_SIZE, 1)] +void MatrixTranspose( uint3 Gid : SV_GroupID, + uint3 DTid : SV_DispatchThreadID, + uint3 GTid : SV_GroupThreadID, + uint GI : SV_GroupIndex ) +{ + transpose_shared_data[GI] = Input[DTid.y * g_iWidth + DTid.x]; + GroupMemoryBarrierWithGroupSync(); + uint2 XY = DTid.yx - GTid.yx + GTid.xy; + Data[XY.y * g_iHeight + XY.x] = transpose_shared_data[GTid.x * TRANSPOSE_BLOCK_SIZE + GTid.y]; +} diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx new file mode 100644 index 000000000..941e001b3 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx @@ -0,0 +1,23 @@ +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS +//-------------------------------------------------------------------------------------- +// File: Tutorial02.fx +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Vertex Shader +//-------------------------------------------------------------------------------------- +float4 VS( float4 Pos : POSITION ) : SV_POSITION +{ + return Pos; +} + + +//-------------------------------------------------------------------------------------- +// Pixel Shader +//-------------------------------------------------------------------------------------- +float4 PS( float4 Pos : SV_POSITION ) : SV_Target +{ + return float4( 1.0f, 1.0f, 0.0f, 1.0f ); // Yellow, with Alpha = 1 +} diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl new file mode 100644 index 000000000..5a59aadc6 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS +#include "Tutorial02.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl new file mode 100644 index 000000000..d58459b78 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. 
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS +#include "Tutorial02.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx new file mode 100644 index 000000000..941e001b3 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx @@ -0,0 +1,23 @@ +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS +//-------------------------------------------------------------------------------------- +// File: Tutorial02.fx +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Vertex Shader +//-------------------------------------------------------------------------------------- +float4 VS( float4 Pos : POSITION ) : SV_POSITION +{ + return Pos; +} + + +//-------------------------------------------------------------------------------------- +// Pixel Shader +//-------------------------------------------------------------------------------------- +float4 PS( float4 Pos : SV_POSITION ) : SV_Target +{ + return float4( 1.0f, 1.0f, 0.0f, 1.0f ); // Yellow, with Alpha = 1 +} diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl new file mode 100644 index 000000000..29b6e8b2c --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. 
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS +#include "Tutorial03.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl new file mode 100644 index 000000000..db47ead28 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS +#include "Tutorial03.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx new file mode 100644 index 000000000..deb7b585f --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx @@ -0,0 +1,46 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS +//-------------------------------------------------------------------------------------- +// File: Tutorial04.fx +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Constant Buffer Variables +//-------------------------------------------------------------------------------------- +cbuffer ConstantBuffer : register( b0 ) +{ + matrix World; + matrix View; + matrix Projection; +} + +//-------------------------------------------------------------------------------------- +struct VS_OUTPUT +{ + float4 Pos : SV_POSITION; + float4 Color : COLOR0; +}; + +//-------------------------------------------------------------------------------------- +// Vertex Shader +//-------------------------------------------------------------------------------------- +VS_OUTPUT VS( float4 Pos : POSITION, float4 Color : COLOR ) +{ + VS_OUTPUT output = (VS_OUTPUT)0; + output.Pos = mul( Pos, World ); + output.Pos = mul( output.Pos, View ); + output.Pos = mul( output.Pos, Projection ); + output.Color = Color; + return output; +} + + +//-------------------------------------------------------------------------------------- +// Pixel Shader +//-------------------------------------------------------------------------------------- +float4 PS( VS_OUTPUT input ) : SV_Target +{ + return input.Color; +} diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl new file mode 100644 index 000000000..dc627637c --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. 
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS +#include "Tutorial04.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl new file mode 100644 index 000000000..96d0a642c --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS +#include "Tutorial04.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx new file mode 100644 index 000000000..b15c99e49 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx @@ -0,0 +1,54 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS +//-------------------------------------------------------------------------------------- +// File: Tutorial05.fx +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Constant Buffer Variables +//-------------------------------------------------------------------------------------- +cbuffer ConstantBuffer : register( b0 ) +{ + matrix World; + matrix View; + matrix Projection; +} + +//-------------------------------------------------------------------------------------- +struct VS_INPUT +{ + float4 Pos : POSITION; + float4 Color : COLOR; +}; + +struct PS_INPUT +{ + float4 Pos : SV_POSITION; + float4 Color : COLOR; +}; + + +//-------------------------------------------------------------------------------------- +// Vertex Shader +//-------------------------------------------------------------------------------------- +PS_INPUT VS( VS_INPUT input ) +{ + PS_INPUT output = (PS_INPUT)0; + output.Pos = mul( input.Pos, World ); + output.Pos = mul( output.Pos, View ); + output.Pos = mul( output.Pos, Projection ); + output.Color = input.Color; + + return output; +} + + +//-------------------------------------------------------------------------------------- +// Pixel Shader +//-------------------------------------------------------------------------------------- +float4 PS( PS_INPUT input) : SV_Target +{ + return input.Color; +} diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl new file mode 100644 index 000000000..acc900ff5 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. 
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS +#include "Tutorial05.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl new file mode 100644 index 000000000..726f05979 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS +#include "Tutorial05.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx new file mode 100644 index 000000000..7d839009d --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx @@ -0,0 +1,76 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS -entry PSSolid +//-------------------------------------------------------------------------------------- +// File: Tutorial06.fx +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+cbuffer ConstantBuffer : register( b0 )
+{
+    matrix World;
+    matrix View;
+    matrix Projection;
+    float4 vLightDir[2];    // PS reads only .xyz of each entry
+    float4 vLightColor[2];  // one colour per entry of vLightDir
+    float4 vOutputColor;    // returned as-is by PSSolid
+}
+
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 Pos : POSITION;
+    float3 Norm : NORMAL;
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float3 Norm : TEXCOORD0;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: transforms Pos by World, View, Projection; rotates/scales Norm by World
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+    output.Pos = mul( input.Pos, World );
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Norm = mul( float4( input.Norm, 0 ), World ).xyz;   // w = 0: a normal is a direction, so World's translation row must not be added
+
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: sums the clamped N.L contribution of both lights; alpha is forced to 1
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    float4 finalColor = 0;
+
+    //do NdotL lighting for 2 lights: saturate( dot(lightDir.xyz, Norm) * lightColor ) per light
+    for(int i=0; i<2; i++)
+    {
+        finalColor += saturate( dot( (float3)vLightDir[i],input.Norm) * vLightColor[i] );
+    }
+    finalColor.a = 1;
+    return finalColor;
+}
+
+
+//--------------------------------------------------------------------------------------
+// PSSolid - render a solid color (ignores its input and returns vOutputColor)
+//--------------------------------------------------------------------------------------
+float4 PSSolid( PS_INPUT input) : SV_Target
+{
+    return vOutputColor;
+}
diff --git
a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl new file mode 100644 index 000000000..31ed082e7 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS +#include "Tutorial06.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl new file mode 100644 index 000000000..a5512efb6 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS +#include "Tutorial06.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx new file mode 100644 index 000000000..0baad7a0c --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx @@ -0,0 +1,67 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS +//-------------------------------------------------------------------------------------- +// File: Tutorial07.fx +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables (one cbuffer per matrix, bound to b0, b1 and b2)
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 );
+SamplerState samLinear : register( s0 );
+
+cbuffer cbNeverChanges : register( b0 )
+{
+    matrix View;
+};
+
+cbuffer cbChangeOnResize : register( b1 )
+{
+    matrix Projection;
+};
+
+cbuffer cbChangesEveryFrame : register( b2 )
+{
+    matrix World;
+    float4 vMeshColor;  // multiplied into the texture sample by PS
+};
+
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 Pos : POSITION;
+    float2 Tex : TEXCOORD0;
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float2 Tex : TEXCOORD0;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: multiplies input.Pos by World, View, Projection (in that order); copies Tex
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+    output.Pos = mul( input.Pos, World );
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Tex = input.Tex;
+
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: samples txDiffuse through samLinear at input.Tex and multiplies by vMeshColor
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    return txDiffuse.Sample( samLinear, input.Tex ) * vMeshColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl
new file mode 100644
index 000000000..c3c101943
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS +#include "Tutorial07.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl new file mode 100644 index 000000000..4c287c790 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl @@ -0,0 +1,3 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS +#include "Tutorial07.fx" diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx new file mode 100644 index 000000000..6ff313b97 --- /dev/null +++ b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx @@ -0,0 +1,56 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS +//-------------------------------------------------------------------------------------- +// File: Tutorial08.fx +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 );
+SamplerState samLinear : register( s0 );
+
+cbuffer cbChangesEveryFrame : register( b0 )
+{
+    matrix WorldViewProj;   // the only matrix VS applies to the position
+    matrix World;           // declared here but not referenced by VS or PS in this file
+    float4 vMeshColor;      // multiplied into the texture sample by PS
+};
+
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 Pos : POSITION;
+    float2 Tex : TEXCOORD;
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float2 Tex : TEXCOORD0;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: multiplies input.Pos by WorldViewProj and copies Tex through
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+    output.Pos = mul( input.Pos, WorldViewProj );
+    output.Tex = input.Tex;
+
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: samples txDiffuse through samLinear at input.Tex and multiplies by vMeshColor
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    return txDiffuse.Sample( samLinear, input.Tex ) * vMeshColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx
new file mode 100644
index 000000000..04a395588
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx
@@ -0,0 +1,69 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial09.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 );
+SamplerState samLinear : register( s0 );
+
+cbuffer cbNeverChanges : register( b0 )
+{
+    float3 vLightDir;   // dotted with the world-space normal in VS
+};
+
+cbuffer cbChangesEveryFrame : register( b1 )
+{
+    matrix WorldViewProj;   // applied to the position
+    matrix World;           // only its upper-left 3x3 is used, to transform the normal
+};
+
+struct VS_INPUT
+{
+    float3 Pos : POSITION; //position
+    float3 Norm : NORMAL; //normal
+    float2 Tex : TEXCOORD0; //texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float4 Diffuse : COLOR0;    // rgb = lighting term computed in VS, a = 1
+    float2 Tex : TEXCOORD1;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: transforms the position and computes a per-vertex diffuse term
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+    output.Pos = mul( float4(input.Pos,1), WorldViewProj );
+    float3 vNormalWorldSpace = normalize( mul( input.Norm, (float3x3)World ) );
+
+    float fLighting = saturate( dot( vNormalWorldSpace, vLightDir ) );  // clamped to [0,1]
+    output.Diffuse.rgb = fLighting;     // scalar assigned to all three colour channels
+    output.Diffuse.a = 1.0f;
+
+    output.Tex = input.Tex;
+
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: multiplies the texture sample by input.Diffuse and forces alpha to 1
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    //calculate lighting assuming light color is <1,1,1,1>
+    float4 outputColor = txDiffuse.Sample( samLinear, input.Tex ) * input.Diffuse;
+    outputColor.a = 1;
+    return outputColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx
new file mode 100644
index 000000000..e9bded408
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx
@@ -0,0 +1,73 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial10.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 );
+SamplerState samLinear : register( s0 );
+
+cbuffer cbNeverChanges : register( b0 )
+{
+    float3 vLightDir;   // dotted with the world-space normal in VS
+};
+
+cbuffer cbChangesEveryFrame : register( b1 )
+{
+    matrix WorldViewProj;   // applied to the (displaced) position
+    matrix World;           // only its upper-left 3x3 is used, to transform the normal
+    float Puffiness;        // distance each vertex is moved along its normal before transforming
+};
+
+struct VS_INPUT
+{
+    float3 Pos : POSITION; //position
+    float3 Norm : NORMAL; //normal
+    float2 Tex : TEXCOORD0; //texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float4 Diffuse : COLOR0;    // rgb = lighting term computed in VS, a = 1
+    float2 Tex : TEXCOORD1;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: displaces the vertex along its normal, then transforms and lights it
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+
+    input.Pos += input.Norm * Puffiness;    // modifies the local copy of the input vertex
+
+    output.Pos = mul( float4(input.Pos,1), WorldViewProj );
+    float3 vNormalWorldSpace = normalize( mul( input.Norm, (float3x3)World ) );
+
+    float fLighting = saturate( dot( vNormalWorldSpace, vLightDir ) );  // clamped to [0,1]
+    output.Diffuse.rgb = fLighting;     // scalar assigned to all three colour channels
+    output.Diffuse.a = 1.0f;
+
+    output.Tex = input.Tex;
+
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: multiplies the texture sample by input.Diffuse and forces alpha to 1
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    //calculate lighting assuming light color is <1,1,1,1>
+    float4 outputColor = txDiffuse.Sample( samLinear, input.Tex ) * input.Diffuse;
+    outputColor.a = 1;
+    return outputColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx
new file mode 100644
index 000000000..a647a9079
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx
@@ -0,0 +1,117 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Tutorial11.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse;
+SamplerState samLinear
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+cbuffer cbConstant
+{
+    float3 vLightDir = float3(-0.577,0.577,-0.577);
+};
+
+cbuffer cbChangesEveryFrame
+{
+    matrix World;
+    matrix View;
+    matrix Projection;
+    float Time;         // phase offset of the sine displacement in VS
+};
+
+cbuffer cbUserChanges
+{
+    float Waviness;     // amplitude of the sine displacement in VS
+};
+
+struct VS_INPUT
+{
+    float3 Pos : POSITION;
+    float3 Norm : NORMAL;
+    float2 Tex : TEXCOORD0;
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float3 Norm : TEXCOORD0;
+    float2 Tex : TEXCOORD1;
+};
+
+//--------------------------------------------------------------------------------------
+// DepthStates
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth
+{
+    DepthEnable = TRUE;
+    DepthWriteMask = ALL;
+    DepthFunc = LESS_EQUAL;
+};
+
+BlendState NoBlending
+{
+    AlphaToCoverageEnable = FALSE;
+    BlendEnable[0] = FALSE;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: World transform, sine offset on x, then View and Projection
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+
+    output.Pos = mul( float4(input.Pos,1), World );
+
+    output.Pos.x += sin( output.Pos.y*0.1f + Time )*Waviness;   // offset depends on the world-space y computed just above
+
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Norm = mul( input.Norm, (float3x3)World );   // explicit 3x3 cast instead of implicit truncation of the 4x4 matrix
+    output.Tex = input.Tex;
+
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    // Calculate lighting assuming light color is <1,1,1,1>
+    float fLighting = saturate( dot( input.Norm, vLightDir ) );     // clamped to [0,1]
+    float4 outputColor = g_txDiffuse.Sample( samLinear, input.Tex ) * fLighting;
+    outputColor.a = 1;      // alpha is overwritten after the multiply
+    return outputColor;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Technique: VS + PS compiled for shader model 4_0, no geometry shader
+//--------------------------------------------------------------------------------------
+technique11 Render
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, PS() ) );
+
+        SetDepthStencilState( EnableDepth, 0 );
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx
new file mode 100644
index 000000000..aae7f9a87
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx
@@ -0,0 +1,129 @@
+//TEST_IGNORE_FILE:
+//
+// Constant Buffer Variables
+//
+
+Texture2D g_txDiffuse;
+SamplerState samLinear
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+TextureCube g_txEnvMap;     // sampled in PS with the ViewR vector computed in VS
+SamplerState samLinearClamp
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Clamp;
+    AddressV = Clamp;
+};
+
+cbuffer cbConstant
+{
+    float3 vLightDir = float3(-0.577,0.577,-0.577);
+};
+
+cbuffer cbChangesEveryFrame
+{
+    matrix World;
+    matrix View;
+    matrix Projection;
+    float Time;         // phase offset of the sine displacement in VS
+};
+
+cbuffer cbUserChanges
+{
+    float Waviness;     // amplitude of the sine displacement in VS
+};
+
+struct VS_INPUT
+{
+    float3 Pos : POSITION; //position
+    float3 Norm : NORMAL; //normal
+    float2 Tex : TEXCOORD0; //texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float3 Norm : TEXCOORD0;
+    float2 Tex : TEXCOORD1;
+    float3 ViewR : TEXCOORD2;   // cube-map lookup vector written by VS
+};
+
+//--------------------------------------------------------------------------------------
+// DepthStates
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth
+{
+    DepthEnable = TRUE;
+    DepthWriteMask = ALL;
+    DepthFunc = LESS_EQUAL;
+};
+
+BlendState NoBlending
+{
+    AlphaToCoverageEnable = FALSE;
+    BlendEnable[0] = FALSE;
+};
+
+//
+// Vertex Shader: World transform, sine offset on x, View and Projection, plus the ViewR vector
+//
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+
+    output.Pos = mul( float4(input.Pos,1), World );
+
+    output.Pos.x += sin( output.Pos.y*0.1f + Time )*Waviness;   // offset depends on the world-space y computed just above
+
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Norm = mul( input.Norm, (float3x3)World );
+    output.Tex = input.Tex;
+
+    // Calculate the reflection vector
+    float3 viewNorm = mul( output.Norm, (float3x3)View );
+    output.ViewR = reflect( viewNorm, float3(0,0,-1.0) );   // viewNorm is passed as the incident vector, (0,0,-1) as the normal
+
+    return output;
+}
+
+
+//
+// Pixel Shader: lit diffuse texture plus the environment-map sample; alpha forced to 1
+//
+float4 PS( PS_INPUT input) : SV_Target
+{
+    // Calculate lighting assuming light color is <1,1,1,1>
+    float fLighting = saturate( dot( input.Norm, vLightDir ) );
+
+    // Load the environment map texture
+    float4 cReflect = g_txEnvMap.Sample( samLinearClamp, input.ViewR );
+
+    // Load the diffuse texture and multiply by the lighting amount
+    float4 cDiffuse = g_txDiffuse.Sample( samLinear, input.Tex ) * fLighting;
+
+    // Add diffuse to reflection and go
+    float4 cTotal = cDiffuse + cReflect;
+    cTotal.a = 1;
+    return cTotal;
+}
+
+//
+// Technique: VS + PS compiled for shader model 4_0, no geometry shader
+//
+technique11 Render
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, PS() ) );
+
+        SetDepthStencilState( EnableDepth, 0 );
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx
new file mode
100644
index 000000000..a6f09ecc7
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx
@@ -0,0 +1,191 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Tutorial13.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse;
+SamplerState samLinear
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+TextureCube g_txEnvMap;     // declared but not sampled by any shader in this file
+SamplerState samLinearClamp
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Clamp;
+    AddressV = Clamp;
+};
+
+cbuffer cbConstant
+{
+    float3 vLightDir = float3(-0.577,0.577,-0.577);
+};
+
+cbuffer cbChangesEveryFrame
+{
+    matrix World;
+    matrix View;
+    matrix Projection;
+    float Time;
+};
+
+cbuffer cbUserChanges
+{
+    float Explode;      // distance each triangle is pushed along its face normal in GS
+};
+
+struct VS_INPUT
+{
+    float3 Pos : POSITION;
+    float3 Norm : NORMAL;
+    float2 Tex : TEXCOORD0;
+};
+
+struct GSPS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float3 Norm : TEXCOORD0;
+    float2 Tex : TEXCOORD1;
+};
+
+//--------------------------------------------------------------------------------------
+// DepthStates
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth
+{
+    DepthEnable = TRUE;
+    DepthWriteMask = ALL;
+    DepthFunc = LESS_EQUAL;
+};
+
+BlendState NoBlending
+{
+    AlphaToCoverageEnable = FALSE;
+    BlendEnable[0] = FALSE;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: applies World only; View and Projection are applied later, in GS
+//--------------------------------------------------------------------------------------
+GSPS_INPUT VS( VS_INPUT input )
+{
+    GSPS_INPUT output = (GSPS_INPUT)0;
+
+    output.Pos = mul( float4(input.Pos,1), World );
+    output.Norm = mul( input.Norm, (float3x3)World );
+    output.Tex = input.Tex;
+
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Geometry Shader: emits 3 pyramid faces + 1 reversed base per input triangle (12 vertices)
+//--------------------------------------------------------------------------------------
+[maxvertexcount(12)]
+void GS( triangle GSPS_INPUT input[3], inout TriangleStream<GSPS_INPUT> TriStream )
+{
+    GSPS_INPUT output;
+
+    //
+    // Calculate the face normal
+    //
+    float3 faceEdgeA = input[1].Pos.xyz - input[0].Pos.xyz;   // .xyz makes the float4 -> float3 truncation explicit
+    float3 faceEdgeB = input[2].Pos.xyz - input[0].Pos.xyz;
+    float3 faceNormal = normalize( cross(faceEdgeA, faceEdgeB) );
+    float3 ExplodeAmt = faceNormal*Explode;
+
+    //
+    // Calculate the face center
+    //
+    float3 centerPos = (input[0].Pos.xyz + input[1].Pos.xyz + input[2].Pos.xyz)/3.0;
+    float2 centerTex = (input[0].Tex + input[1].Tex + input[2].Tex)/3.0;
+    centerPos += faceNormal*Explode;    // ExplodeAmt is added once more when the apex is emitted below
+
+    //
+    // Output the pyramid
+    //
+    for( int i=0; i<3; i++ )
+    {
+        output.Pos = input[i].Pos + float4(ExplodeAmt,0);
+        output.Pos = mul( output.Pos, View );
+        output.Pos = mul( output.Pos, Projection );
+        output.Norm = input[i].Norm;
+        output.Tex = input[i].Tex;
+        TriStream.Append( output );
+
+        int iNext = (i+1)%3;
+        output.Pos = input[iNext].Pos + float4(ExplodeAmt,0);
+        output.Pos = mul( output.Pos, View );
+        output.Pos = mul( output.Pos, Projection );
+        output.Norm = input[iNext].Norm;
+        output.Tex = input[iNext].Tex;
+        TriStream.Append( output );
+
+        output.Pos = float4(centerPos,1) + float4(ExplodeAmt,0);
+        output.Pos = mul( output.Pos, View );
+        output.Pos = mul( output.Pos, Projection );
+        output.Norm = faceNormal;
+        output.Tex = centerTex;
+        TriStream.Append( output );
+
+        TriStream.RestartStrip();
+    }
+
+    for( int i=2; i>=0; i-- )
+    {
+        output.Pos = input[i].Pos + float4(ExplodeAmt,0);
+        output.Pos = mul( output.Pos, View );
+        output.Pos = mul( output.Pos, Projection );
+        output.Norm = -input[i].Norm;
+        output.Tex = input[i].Tex;
+        TriStream.Append( output );
+    }
+    TriStream.RestartStrip();
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: lit diffuse texture with alpha forced to 1
+//--------------------------------------------------------------------------------------
+float4 PS( GSPS_INPUT input) : SV_Target
+{
+    // Calculate lighting assuming light color is <1,1,1,1>
+    float fLighting = saturate( dot( input.Norm, vLightDir ) );
+
+    // Load the diffuse texture and multiply by the lighting amount
+    float4 cDiffuse = g_txDiffuse.Sample( samLinear, input.Tex ) * fLighting;
+    cDiffuse.a = 1;
+
+    // return diffuse
+    return cDiffuse;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Technique: VS, GS and PS compiled for shader model 4_0
+//--------------------------------------------------------------------------------------
+technique11 Render
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VS() ) );
+        SetGeometryShader( CompileShader( gs_4_0, GS() ) );
+        SetPixelShader( CompileShader( ps_4_0, PS() ) );
+
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+        SetDepthStencilState( EnableDepth, 0 );
+    }
+}
+
+
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx
new file mode 100644
index 000000000..b1e45b842
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx
@@ -0,0 +1,294 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Tutorial14.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse;
+SamplerState samLinear
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+cbuffer cbConstant
+{
+    float3 vLightDir = float3(-0.577,0.577,-0.577);
+};
+
+cbuffer cbChangesEveryFrame
+{
+    matrix World;
+    matrix View;
+    matrix Projection;
+};
+
+struct VS_INPUT
+{
+    float3 Pos : POSITION; //position
+    float3 Norm : NORMAL; //normal
+    float2 Tex : TEXCOORD0; //texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float3 Norm : TEXCOORD0;
+    float2 Tex : TEXCOORD1;
+};
+
+struct QUADVS_INPUT
+{
+    float4 Pos : POSITION;
+    float2 Tex : TEXCOORD0;
+};
+
+struct QUADVS_OUTPUT
+{
+    float4 Pos : SV_POSITION; // Transformed position
+    float2 Tex : TEXCOORD0;
+};
+
+//--------------------------------------------------------------------------------------
+// Blending States (the Src* states differ from one another only in SrcBlend and BlendOp)
+//--------------------------------------------------------------------------------------
+BlendState NoBlending
+{
+    BlendEnable[0] = FALSE;
+};
+
+BlendState SrcAlphaBlendingAdd
+{
+    BlendEnable[0] = TRUE;
+    SrcBlend = SRC_ALPHA;
+    DestBlend = ONE;
+    BlendOp = ADD;
+    SrcBlendAlpha = ZERO;
+    DestBlendAlpha = ZERO;
+    BlendOpAlpha = ADD;
+    RenderTargetWriteMask[0] = 0x0F;
+};
+
+BlendState SrcAlphaBlendingSub
+{
+    BlendEnable[0] = TRUE;
+    SrcBlend = SRC_ALPHA;
+    DestBlend = ONE;
+    BlendOp = SUBTRACT;     // the only setting that differs from SrcAlphaBlendingAdd
+    SrcBlendAlpha = ZERO;
+    DestBlendAlpha = ZERO;
+    BlendOpAlpha = ADD;
+    RenderTargetWriteMask[0] = 0x0F;
+};
+
+BlendState SrcColorBlendingAdd
+{
+    BlendEnable[0] = TRUE;
+    SrcBlend = SRC_COLOR;   // the only setting that differs from SrcAlphaBlendingAdd
+    DestBlend = ONE;
+    BlendOp = ADD;
+    SrcBlendAlpha = ZERO;
+    DestBlendAlpha = ZERO;
+    BlendOpAlpha = ADD;
+    RenderTargetWriteMask[0] = 0x0F;
+};
+
+BlendState SrcColorBlendingSub
+{
+    BlendEnable[0] = TRUE;
+    SrcBlend = SRC_COLOR;
+    DestBlend = ONE;
+    BlendOp = SUBTRACT;
+    SrcBlendAlpha = ZERO;
+    DestBlendAlpha = ZERO;
+    BlendOpAlpha = ADD;
+    RenderTargetWriteMask[0] = 0x0F;
+};
+
+//--------------------------------------------------------------------------------------
+// Depth/Stencil States
+//--------------------------------------------------------------------------------------
+DepthStencilState RenderWithStencilState
+{
+    DepthEnable = false;
+    DepthWriteMask = ZERO;
+    DepthFunc = Less;
+
+    // Setup stencil states: stencil is read (mask 0xFF) but never written (mask 0x00)
+    StencilEnable = true;
+    StencilReadMask = 0xFF;
+    StencilWriteMask = 0x00;
+
+    FrontFaceStencilFunc = Not_Equal;
+    FrontFaceStencilPass = Keep;
+    FrontFaceStencilFail = Zero;
+
+    BackFaceStencilFunc = Not_Equal;
+    BackFaceStencilPass = Keep;
+    BackFaceStencilFail = Zero;
+};
+
+
+
+//--------------------------------------------------------------------------------------
+// Scene Vertex Shader: World, View, Projection on the position; World 3x3 on the normal
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+
+    output.Pos = mul( float4(input.Pos,1), World );
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Norm = mul( input.Norm, (float3x3)World );   // explicit 3x3 cast instead of implicit truncation of the 4x4 matrix
+    output.Tex = input.Tex;
+
+    return output;
+}
+
+//-----------------------------------------------------------------------------
+// Quad Vertex Shaders: QuadVS transforms the position, ScreenQuadVS passes it through
+//-----------------------------------------------------------------------------
+QUADVS_OUTPUT QuadVS( QUADVS_INPUT Input )
+{
+    QUADVS_OUTPUT Output;
+    Output.Pos = mul( Input.Pos, World );
+    Output.Pos = mul( Output.Pos, View );
+    Output.Pos = mul( Output.Pos, Projection );
+    Output.Tex = Input.Tex;
+    return Output;
+}
+
+QUADVS_OUTPUT ScreenQuadVS( QUADVS_INPUT Input )
+{
+    QUADVS_OUTPUT Output;
+    Output.Pos = Input.Pos;     // no matrix is applied; the input position is used as-is
+    Output.Tex = Input.Tex;
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: lit diffuse texture with alpha forced to 1
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    // Calculate lighting assuming light color is <1,1,1,1>
+    float fLighting = saturate( dot( input.Norm, vLightDir ) );
+    float4 outputColor = g_txDiffuse.Sample( samLinear, input.Tex ) * fLighting;
+    outputColor.a = 1;
+    return outputColor;
+}
+
+//--------------------------------------------------------------------------------------
+// Quad Pixel Shader: returns the texture sample unmodified (alpha included)
+//--------------------------------------------------------------------------------------
+float4 QuadPS( QUADVS_OUTPUT input) : SV_Target
+{
+    return g_txDiffuse.Sample( samLinear, input.Tex );
+}
+
+
+//--------------------------------------------------------------------------------------
+// Scene Techniques
+//--------------------------------------------------------------------------------------
+technique11 RenderScene
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, PS() ) );
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+// RenderWithStencil - set the depth stencil state inside of the technique
+//--------------------------------------------------------------------------------------
+technique11 RenderWithStencil
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, ScreenQuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );
+
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+        SetDepthStencilState( RenderWithStencilState, 0 );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+// Quad Techniques: Alpha blending state is set inside the technique (they differ only in the blend state)
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSolid
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );
+
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcAlphaAdd
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );
+
+        SetBlendState( SrcAlphaBlendingAdd, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcAlphaSub
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );
+
+        SetBlendState( SrcAlphaBlendingSub, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcColorAdd
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );
+
+        SetBlendState( SrcColorBlendingAdd, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcColorSub
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );
+
+        SetBlendState( SrcColorBlendingSub, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h
b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h
new file mode 100644
index 000000000..b44251829
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h
@@ -0,0 +1,84 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_LightPSH.h
+//
+// The pixel shader light header file for the DynamicShaderLinkage11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseLight   // every light class below implements these three lighting terms
+{
+    float3 IlluminateAmbient(float3 vNormal);
+
+    float3 IlluminateDiffuse(float3 vNormal);
+
+    float3 IlluminateSpecular(float3 vNormal, int specularPower );
+
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cAmbientLight : iBaseLight
+{
+    float3 m_vLightColor;
+    bool m_bEnable;   // the out-of-class method bodies multiply their result by this flag
+
+    float3 IlluminateAmbient(float3 vNormal);   // declared here, defined out of class
+
+    float3 IlluminateDiffuse(float3 vNormal)
+    {
+        return (float3)0;   // an ambient light contributes no diffuse term
+    }
+
+    float3 IlluminateSpecular(float3 vNormal, int specularPower )
+    {
+        return (float3)0;   // ...and no specular term
+    }
+};
+
+class cHemiAmbientLight : cAmbientLight
+{
+    // inherited float3 m_vLightColor (from cAmbientLight) is the SkyColor
+    float4 m_vGroundColor;
+    float4 m_vDirUp;
+
+    float3 IlluminateAmbient(float3 vNormal);   // overrides the cAmbientLight version
+
+};
+
+class cDirectionalLight : cAmbientLight
+{
+    // inherited float3 m_vLightColor (from cAmbientLight) is the LightColor
+    float4 m_vLightDir;
+
+    float3 IlluminateDiffuse( float3 vNormal );
+
+    float3 IlluminateSpecular( float3 vNormal, int specularPower );
+
+};
+
+class cOmniLight : cAmbientLight
+{
+    float3 m_vLightPosition;
+    float radius;
+
+    float3 IlluminateDiffuse( float3 vNormal );
+
+};
+
+class cSpotLight : cAmbientLight   // data members only; inherits all three methods unchanged
+{
+    float3 m_vLightPosition;
+    float3 m_vLightDir;
+};
+
+class cEnvironmentLight : cAmbientLight
+{
+    float3 IlluminateSpecular( float3 vNormal, int specularPower );
+};
+
+
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h
new file mode 100644
index 000000000..7f6bc3d22
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h
@@ -0,0 +1,103 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_MaterialPSH.h
+//
+// The pixel shader material header file for the DynamicShaderLinkage11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseMaterial   // every material class below implements these three queries
+{
+    float3 GetAmbientColor(float2 vTexcoord);
+
+    float3 GetDiffuseColor(float2 vTexcoord);
+
+    int GetSpecularPower();
+
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cBaseMaterial : iBaseMaterial
+{
+    float3 m_vColor;
+    int m_iSpecPower;
+
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return m_vColor;   // flat colour; vTexcoord is ignored
+    }
+
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return (float3)m_vColor;   // cast is a no-op: m_vColor is already float3
+    }
+
+    int GetSpecularPower()
+    {
+        return m_iSpecPower;
+    }
+
+};
+
+class cPlasticMaterial : cBaseMaterial   // adds nothing; everything comes from cBaseMaterial
+{
+
+};
+
+class cPlasticTexturedMaterial : cPlasticMaterial
+{
+    float3 GetAmbientColor(float2 vTexcoord);   // declared here, defined out of class
+
+    float3 GetDiffuseColor(float2 vTexcoord);   // declared here, defined out of class
+
+};
+
+class cPlasticLightingOnlyMaterial : cBaseMaterial
+{
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return (float3)1.0f;   // white, so only the lighting term is visible
+    }
+
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return (float3)1.0f;
+    }
+
+};
+
+class cRoughMaterial : cBaseMaterial
+{
+    int GetSpecularPower()
+    {
+        return m_iSpecPower;   // same body as cBaseMaterial::GetSpecularPower
+    }
+};
+
+class cRoughTexturedMaterial : cRoughMaterial
+{
+    float3 GetAmbientColor(float2 vTexcoord);   // declared here, defined out of class
+
+    float3 GetDiffuseColor(float2 vTexcoord);   // declared here, defined out of class
+
+};
+
+
+class cRoughLightingOnlyMaterial : cRoughMaterial
+{
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return (float3)1.0f;   // white, so only the lighting term is visible
+    }
+
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return (float3)1.0f;
+    }
+
+};
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl
new file mode 100644
index 000000000..c3ee93057
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl
@@ -0,0 +1,84 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_PS.hlsl
+//
+// The pixel shader file for the DynamicShaderLinkage11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Header Includes
+//--------------------------------------------------------------------------------------
+#include "DynamicShaderLinkage11_PSBuffers.h"
+
+// Defines for the default statically permuted setting (SPECULAR_ENABLE is not tested anywhere in this file)
+#if defined( STATIC_PERMUTE )
+    #define HEMI_AMBIENT //CONST_AMBIENT //HEMI_AMBIENT
+    #define TEXTURE_ENABLE
+    #define SPECULAR_ENABLE
+#endif
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct PS_INPUT
+{
+    float4 vPosition : SV_POSITION;
+    float3 vNormal : NORMAL;
+    float2 vTexcoord : TEXCOORD0;
+    float4 vMatrix : TEXCOORD1;   // not read by PSMain
+};
+
+//--------------------------------------------------------------------------------------
+// Abstract Interface Instances for dynamic linkage / permutation
+//--------------------------------------------------------------------------------------
+#if !defined( STATIC_PERMUTE )
+    iBaseLight g_abstractAmbientLighting;
+    iBaseLight g_abstractDirectLighting;
+    iBaseLight g_abstractEnvironmentLighting;
+    iBaseMaterial g_abstractMaterial;
+#else
+//--------------------------------------------------------------------------------------
+// Concrete Instances for STATIC_PERMUTE - the g_abstract* names become macros for cbuffer instances
+//--------------------------------------------------------------------------------------
+    #if defined( HEMI_AMBIENT )
+        #define g_abstractAmbientLighting g_hemiAmbientLight
+    #else
+        // CONST_AMBIENT
+        #define g_abstractAmbientLighting g_ambientLight
+    #endif
+    #define g_abstractDirectLighting g_directionalLight
+    #define g_abstractEnvironmentLighting g_environmentLight
+    #if defined( TEXTURE_ENABLE )
+        #define g_abstractMaterial g_plasticTexturedMaterial
+    #else
+        #define g_abstractMaterial g_plasticMaterial
+    #endif
+#endif
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: saturate( ambient + diffuse + specular ), alpha fixed at 1
+//--------------------------------------------------------------------------------------
+float4 PSMain( PS_INPUT Input ) : SV_TARGET
+{
+    // Compute the Ambient term
+    float3 Ambient = (float3)0.0f;   // overwritten by the assignment on the next line
+    Ambient = g_abstractMaterial.GetAmbientColor( Input.vTexcoord ) * g_abstractAmbientLighting.IlluminateAmbient( Input.vNormal );
+
+    // Accumulate the Diffuse contribution
+    float3 Diffuse = (float3)0.0f;
+
+    Diffuse += g_abstractMaterial.GetDiffuseColor( Input.vTexcoord ) * g_abstractDirectLighting.IlluminateDiffuse( Input.vNormal );
+
+    // Compute the Specular contribution (direct light plus environment light; not scaled by a material colour)
+    float3 Specular = (float3)0.0f;
+    Specular += g_abstractDirectLighting.IlluminateSpecular( Input.vNormal, g_abstractMaterial.GetSpecularPower() );
+    Specular += g_abstractEnvironmentLighting.IlluminateSpecular( Input.vNormal, g_abstractMaterial.GetSpecularPower() );
+
+    // Accumulate the lighting with saturation
+    float3 Lighting = saturate( Ambient + Diffuse + Specular );   // clamps each channel to [0,1]
+
+    return float4(Lighting,1.0f);
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h
new file mode 100644
index 000000000..e2263b832
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h
@@ -0,0 +1,129 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_PSBuffers.h
+//
+// Constant buffers, textures/samplers and class method bodies for the DynamicShaderLinkage11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "DynamicShaderLinkage11_LightPSH.h"
+#include "DynamicShaderLinkage11_MaterialPSH.h"
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerFrame : register( b0 )
+{
+    cAmbientLight g_ambientLight;
+    cHemiAmbientLight g_hemiAmbientLight;
+    cDirectionalLight g_directionalLight;
+    cEnvironmentLight g_environmentLight;
+    float4 g_vEyeDir;   // only .xyz is used by the lighting methods below
+};
+
+cbuffer cbPerPrimitive : register( b1 )
+{
+    cPlasticMaterial g_plasticMaterial;
+    cPlasticTexturedMaterial g_plasticTexturedMaterial;
+    cPlasticLightingOnlyMaterial g_plasticLightingOnlyMaterial;
+    cRoughMaterial g_roughMaterial;
+    cRoughTexturedMaterial g_roughTexturedMaterial;
+    cRoughLightingOnlyMaterial g_roughLightingOnlyMaterial;
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse : register( t0 );
+Texture2D g_txNormalMap : register( t1 );   // not sampled anywhere in this file
+TextureCube g_txEnvironmentMap : register( t2 );
+
+SamplerState g_samLinear : register( s0 );
+
+//--------------------------------------------------------------------------------------
+// Lighting Class Methods (float4 members are swizzled to .xyz explicitly, no implicit truncation)
+//--------------------------------------------------------------------------------------
+// Ambient Lighting Class Methods
+float3 cAmbientLight::IlluminateAmbient(float3 vNormal)
+{
+    return m_vLightColor * m_bEnable;   // was wrapped in a float4 that the float3 return type truncated again
+}
+
+float3 cHemiAmbientLight::IlluminateAmbient(float3 vNormal)
+{
+    float thetha = (dot( vNormal, m_vDirUp.xyz ) + 1.0f) / 2.0f;   // maps a dot product in [-1,1] to a [0,1] blend weight (assumes unit-length inputs)
+
+    return lerp( m_vGroundColor.xyz, m_vLightColor, thetha) * m_bEnable;   // ground colour at weight 0, sky colour at weight 1
+}
+
+// Directional Light class
+float3 cDirectionalLight::IlluminateDiffuse( float3 vNormal )
+{
+    float lambert = saturate(dot( vNormal, m_vLightDir.xyz ));
+    return ((float3)lambert * m_vLightColor * m_bEnable);
+}
+
+float3 cDirectionalLight::IlluminateSpecular( float3 vNormal, int specularPower )
+{
+    float3 H = -normalize(g_vEyeDir.xyz) + m_vLightDir.xyz;   // normalize xyz only, so a non-zero g_vEyeDir.w cannot shorten the vector
+    float3 halfAngle = normalize( H );
+    float specular = pow( max(0,dot( halfAngle, normalize(vNormal) )), specularPower );
+
+    return ((float3)specular * m_vLightColor * m_bEnable);
+}
+
+// Omni Light Class
+float3 cOmniLight::IlluminateDiffuse( float3 vNormal )
+{
+    return (float3)0.0f; // TO DO!
+}
+
+// Environment Lighting
+float3 cEnvironmentLight::IlluminateSpecular( float3 vNormal, int specularPower )
+{
+    // compute reflection vector taking into account a cheap fresnel falloff;
+    float3 N = normalize(vNormal);
+    float3 E = normalize(g_vEyeDir.xyz);   // normalize xyz only (see cDirectionalLight::IlluminateSpecular)
+    float3 R = reflect( E, N );
+    float fresnel = 1 - dot( -E, N );
+    fresnel = (fresnel * fresnel * fresnel );
+
+    float3 specular = g_txEnvironmentMap.Sample( g_samLinear, R ).xyz * fresnel;   // specularPower is unused here
+
+    return (specular * (float3)m_bEnable);
+// return ((float3)fresnel);
+
+}
+
+//--------------------------------------------------------------------------------------
+// Material Class Methods (texel alpha is dropped explicitly with .xyz)
+//--------------------------------------------------------------------------------------
+// Plastic Material Methods
+float3 cPlasticTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{
+    float4 vDiffuse = (float4)1.0f;   // overwritten by the sample on the next line
+    vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+    return m_vColor * vDiffuse.xyz;
+}
+
+float3 cPlasticTexturedMaterial::GetDiffuseColor(float2 vTexcoord)
+{
+    float4 vDiffuse = (float4)1.0f;
+    vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+    return m_vColor * vDiffuse.xyz;
+}
+
+// Rough Material Methods
+float3 cRoughTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{
+    float4 vDiffuse = (float4)1.0f;
+    vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+    return m_vColor * vDiffuse.xyz;
+}
+
+float3 cRoughTexturedMaterial::GetDiffuseColor(float2 vTexcoord)
+{
+    float4 vDiffuse = (float4)1.0f;
+    vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+    return m_vColor * vDiffuse.xyz;
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl
new file mode 100644
index 000000000..800dbf3b3
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl
@@ -0,0 +1,66 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_VS.hlsl
+//
+// The vertex shader file for the DynamicShaderLinkage11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+    float4x4 g_mWorldViewProjection : packoffset( c0 );
+    float4x4 g_mWorld : packoffset( c4 );
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 vPosition : POSITION;
+    float3 vNormal : NORMAL;
+    float2 vTexcoord : TEXCOORD0;
+};
+
+struct VS_OUTPUT
+{
+    float4 vPosition : SV_POSITION;
+    float3 vNormal : NORMAL;
+    float2 vTexcoord0 : TEXCOORD0;
+    float4 vMatrix : TEXCOORD1; // DEBUG
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+// We aliased signed vectors as an unsigned format.
+// Need to recover signed values. The values 1.0 and 2.0
+// are slightly inaccurate here.
+float3 R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( in float3 vVec )
+{
+    vVec *= 2.0f;
+    return vVec >= 1.0f ? ( vVec - 2.0f ) : vVec;   // per component: [1,2] wraps down to [-1,0], values below 1 pass through
+}
+
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+
+    VS_OUTPUT Output;
+    float3 tmpNormal;
+
+    Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+
+    // Expand compressed vectors
+    tmpNormal = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( Input.vNormal );
+    Output.vNormal = mul( tmpNormal, (float3x3)g_mWorld );   // rotated by the upper 3x3 of the world matrix; not renormalized here
+
+    Output.vTexcoord0 = Input.vTexcoord;
+
+    Output.vMatrix = (float4)g_mWorld[0]; // DEBUG
+    return Output;
+}
+
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx
new file mode 100644
index 000000000..c72b98843
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx
@@ -0,0 +1,192 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11.fx
+//
+// The effect file for the DynamicShaderLinkageFX11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "DynamicShaderLinkageFX11_vs.hlsl"
+#include "DynamicShaderLinkageFX11_ps.hlsl"
+
+//
+// Settings for static permutations.
+// All of the pre-5.0 targets need static specialization
+// since they don't support late binding. The below
+// just selects a single specialization but you could
+// create any number of them, each one representing
+// a new shader with the interfaces compiled out
+// due to the compile-time class references.
+//
+
+#define StaticMaterial g_plasticTexturedMaterial
+#define StaticAmbientLight g_ambientLight
+#define StaticDirectLight g_directionalLight
+#define StaticEnvironmentLight g_environmentLight
+
+technique11 FeatureLevel10   // statically specialized: concrete class instances are passed for the interface parameters
+{
+    pass
+    {
+        SetRasterizerState(g_rasterizerState[g_fillMode]);   // g_fillMode indexes the two-entry state array
+        SetVertexShader(CompileShader(vs_4_0,
+                                      VSMain()));
+        SetPixelShader(CompileShader(ps_4_0,
+                                     PSMainUniform(StaticAmbientLight,
+                                                   StaticDirectLight,
+                                                   StaticEnvironmentLight,
+                                                   StaticMaterial)));
+    }
+}
+
+technique11 FeatureLevel10_1   // identical to FeatureLevel10 except for the vs_4_1 / ps_4_1 profiles
+{
+    pass
+    {
+        SetRasterizerState(g_rasterizerState[g_fillMode]);
+        SetVertexShader(CompileShader(vs_4_1,
+                                      VSMain()));
+        SetPixelShader(CompileShader(ps_4_1,
+                                     PSMainUniform(StaticAmbientLight,
+                                                   StaticDirectLight,
+                                                   StaticEnvironmentLight,
+                                                   StaticMaterial)));
+    }
+}
+
+//
+// Variables for dynamic shader linkage.
+// There are two variations here for dynamic usage.
+// In the first we use the uniform entry point
+// and pass in global interface variables. This
+// creates a shader which refers to the global
+// interface variables when running and we can bind
+// concrete instances in our C++ code by using
+// ID3DX11EffectInterfaceVariable::SetClassInstance.
+// This approach works well when you have several
+// independent variations and want to bind them
+// individually in your C++ code, such as the
+// different lighting and material parameters in
+// this sample.
+//
+
+iBaseLight g_abstractAmbientLighting;
+iBaseLight g_abstractDirectLighting;
+iBaseLight g_abstractEnvironmentLighting;
+iBaseMaterial g_abstractMaterial;
+
+technique11 FeatureLevel11   // same entry points as the FeatureLevel10 techniques, but the interface globals are passed instead of concrete instances
+{
+    pass
+    {
+        SetRasterizerState(g_rasterizerState[g_fillMode]);
+        SetVertexShader(CompileShader(vs_5_0,
+                                      VSMain()));
+        SetPixelShader(CompileShader(ps_5_0,
+                                     PSMainUniform(g_abstractAmbientLighting,
+                                                   g_abstractDirectLighting,
+                                                   g_abstractEnvironmentLighting,
+                                                   g_abstractMaterial)));
+    }
+}
+
+//
+// In this second variation we use the non-uniform
+// entry point so that we don't have to specify
+// any interfaces when compiling the shader. We
+// then reuse the compiled shader with different
+// BindInterfaces calls so that all bindings are
+// handled automatically by the effect runtime.
+// Below we have multiple techniques where
+// we've given a concrete binding for the material.
+// Lighting parameters are left as interfaces for
+// binding via effect variables, but could also
+// be specified concretely if the number of variations
+// is manageable.
+// This approach works well for a small number of variations
+// that are known in advance, as you can just list them
+// in your effect and you don't need to do the
+// binding work explicitly in your C++ code.
+//
+
+VertexShader g_NonUniVS = CompileShader(vs_5_0, VSMain());   // compiled once, shared by the six techniques below
+PixelShader g_NonUniPS = CompileShader(ps_5_0, PSMainNonUniform());   // compiled once; each technique rebinds its interfaces
+
+technique11 FeatureLevel11_g_plasticMaterial   // the six techniques below differ only in the material bound last; none sets a rasterizer state
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_plasticMaterial));
+    }
+}
+
+technique11 FeatureLevel11_g_plasticTexturedMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_plasticTexturedMaterial));
+    }
+}
+
+technique11 FeatureLevel11_g_plasticLightingOnlyMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_plasticLightingOnlyMaterial));
+    }
+}
+
+technique11 FeatureLevel11_g_roughMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_roughMaterial));
+    }
+}
+
+technique11 FeatureLevel11_g_roughTexturedMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_roughTexturedMaterial));
+    }
+}
+
+technique11 FeatureLevel11_g_roughLightingOnlyMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_roughLightingOnlyMaterial));
+    }
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h
new file mode 100644
index 000000000..6f9a0f4d8
--- /dev/null
+++
b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h
@@ -0,0 +1,82 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_LightPSH.h
+//
+// The pixel shader light header file for the DynamicShaderLinkageFX11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseLight   // every light class below implements these three lighting terms
+{
+    float3 IlluminateAmbient(float3 vNormal);
+
+    float3 IlluminateDiffuse(float3 vNormal);
+
+    float3 IlluminateSpecular(float3 vNormal, int specularPower );
+
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cAmbientLight : iBaseLight
+{
+    float3 m_vLightColor;
+    bool m_bEnable;   // the out-of-class method bodies multiply their result by this flag
+
+    float3 IlluminateAmbient(float3 vNormal);   // declared here, defined out of class
+
+    float3 IlluminateDiffuse(float3 vNormal)
+    {
+        return (float3)0;   // an ambient light contributes no diffuse term
+    }
+
+    float3 IlluminateSpecular(float3 vNormal, int specularPower )
+    {
+        return (float3)0;   // ...and no specular term
+    }
+};
+
+class cHemiAmbientLight : cAmbientLight
+{
+    // inherited float3 m_vLightColor (from cAmbientLight) is the SkyColor
+    float4 m_vGroundColor;
+    float4 m_vDirUp;
+
+    float3 IlluminateAmbient(float3 vNormal);   // overrides the cAmbientLight version
+
+};
+
+class cDirectionalLight : cAmbientLight
+{
+    // inherited float3 m_vLightColor (from cAmbientLight) is the LightColor
+    float4 m_vLightDir;
+
+    float3 IlluminateDiffuse( float3 vNormal );
+
+    float3 IlluminateSpecular( float3 vNormal, int specularPower );
+
+};
+
+class cOmniLight : cAmbientLight
+{
+    float3 m_vLightPosition;
+    float radius;
+
+    float3 IlluminateDiffuse( float3 vNormal );
+
+};
+
+class cSpotLight : cAmbientLight   // data members only; inherits all three methods unchanged
+{
+    float3 m_vLightPosition;
+    float3 m_vLightDir;
+};
+
+class cEnvironmentLight : cAmbientLight
+{
+    float3 IlluminateSpecular( float3 vNormal, int specularPower );
+};
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h
new file mode 100644
index 000000000..cd54a283d
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h
@@ -0,0 +1,103 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_MaterialPSH.h
+//
+// The pixel shader material header file for the DynamicShaderLinkageFX11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseMaterial   // every material class below implements these three queries
+{
+    float3 GetAmbientColor(float2 vTexcoord);
+
+    float3 GetDiffuseColor(float2 vTexcoord);
+
+    int GetSpecularPower();
+
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cBaseMaterial : iBaseMaterial
+{
+    float3 m_vColor;
+    int m_iSpecPower;
+
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return m_vColor;   // flat colour; vTexcoord is ignored
+    }
+
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return (float3)m_vColor;   // cast is a no-op: m_vColor is already float3
+    }
+
+    int GetSpecularPower()
+    {
+        return m_iSpecPower;
+    }
+
+};
+
+class cPlasticMaterial : cBaseMaterial   // adds nothing; everything comes from cBaseMaterial
+{
+
+};
+
+class cPlasticTexturedMaterial : cPlasticMaterial
+{
+    float3 GetAmbientColor(float2 vTexcoord);   // declared here, defined out of class
+
+    float3 GetDiffuseColor(float2 vTexcoord);   // declared here, defined out of class
+
+};
+
+class cPlasticLightingOnlyMaterial : cBaseMaterial
+{
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return (float3)1.0f;   // white, so only the lighting term is visible
+    }
+
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return (float3)1.0f;
+    }
+
+};
+
+class cRoughMaterial : cBaseMaterial
+{
+    int GetSpecularPower()
+    {
+        return m_iSpecPower;   // same body as cBaseMaterial::GetSpecularPower
+    }
+};
+
+class cRoughTexturedMaterial : cRoughMaterial
+{
+    float3 GetAmbientColor(float2 vTexcoord);   // declared here, defined out of class
+
+    float3 GetDiffuseColor(float2 vTexcoord);   // declared here, defined out of class
+
+};
+
+
+class cRoughLightingOnlyMaterial : cRoughMaterial
+{
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return (float3)1.0f;   // white, so only the lighting term is visible
+    }
+
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return (float3)1.0f;
+    }
+
+};
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h
new file mode 100644
index 000000000..3b4c528be
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h
@@ -0,0 +1,152 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_PSBuffers.h
+//
+// Constant buffers, resources, effect state and class method bodies for the DynamicShaderLinkageFX11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "DynamicShaderLinkageFX11_LightPSH.h"
+#include "DynamicShaderLinkageFX11_MaterialPSH.h"
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerFrame : register( b0 )
+{
+    cAmbientLight g_ambientLight;
+    cHemiAmbientLight g_hemiAmbientLight;
+    cDirectionalLight g_directionalLight;
+    cEnvironmentLight g_environmentLight;
+    float4 g_vEyeDir;   // only .xyz is read by the lighting methods in this file
+};
+
+cbuffer cbPerPrimitive : register( b1 )
+{
+    cPlasticMaterial g_plasticMaterial;
+    cPlasticTexturedMaterial g_plasticTexturedMaterial;
+    cPlasticLightingOnlyMaterial g_plasticLightingOnlyMaterial;
+    cRoughMaterial g_roughMaterial;
+    cRoughTexturedMaterial g_roughTexturedMaterial;
+    cRoughLightingOnlyMaterial g_roughLightingOnlyMaterial;
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse : register( t0 );
+Texture2D g_txNormalMap : register( t1 );   // not sampled anywhere in this file
+TextureCube g_txEnvironmentMap : register( t2 );
+
+SamplerState g_samLinear : register( s0 )
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = WRAP;
+    AddressV = WRAP;
+    AddressW = WRAP;
+};
+
+//--------------------------------------------------------------------------------------
+// Rasterization State
+//--------------------------------------------------------------------------------------
+uint g_fillMode = 0;   // index into g_rasterizerState: 0 = SOLID, 1 = WIREFRAME
+
+RasterizerState g_rasterizerState[2]
+{
+{
+    FillMode = SOLID;
+    MultisampleEnable = true;
+},
+{
+    FillMode = WIREFRAME;
+    MultisampleEnable = true;
+}
+};
+
+//--------------------------------------------------------------------------------------
+// Lighting Class Methods
+//--------------------------------------------------------------------------------------
+// Ambient Lighting Class Methods
+float3 cAmbientLight::IlluminateAmbient(float3 vNormal)
+{
+    return m_vLightColor * m_bEnable;   // vNormal is unused: constant ambient colour, gated by the enable flag
+}
+
+float3 cHemiAmbientLight::IlluminateAmbient(float3 vNormal)
+{
+    float thetha = (dot( vNormal, m_vDirUp.xyz ) + 1.0f) / 2.0f;   // maps a dot product in [-1,1] to a [0,1] blend weight (assumes unit-length inputs)
+
+    return lerp( m_vGroundColor.xyz, m_vLightColor, thetha) * m_bEnable;   // ground colour at weight 0, sky colour at weight 1
+}
+
+// Directional Light class
+float3 cDirectionalLight::IlluminateDiffuse( float3 vNormal )
+{
+    float lambert = saturate(dot( vNormal, m_vLightDir.xyz ));   // clamped N.L
+    return ((float3)lambert * m_vLightColor * m_bEnable);
+}
+
+float3 cDirectionalLight::IlluminateSpecular( float3 vNormal, int specularPower )
+{
+    float3 H = -normalize(g_vEyeDir.xyz) + m_vLightDir.xyz;   // unnormalized half vector; m_vLightDir is not normalized here
+    float3 halfAngle = normalize( H );
+    float specular = pow( max(0,dot( halfAngle, normalize(vNormal) )), specularPower );   // max() keeps the pow() base non-negative
+
+    return ((float3)specular * m_vLightColor * m_bEnable);
+}
+
+// Omni Light Class
+float3 cOmniLight::IlluminateDiffuse( float3 vNormal )
+{
+    return (float3)0.0f; // TO DO!
+}
+
+// Environment Lighting
+float3 cEnvironmentLight::IlluminateSpecular( float3 vNormal, int specularPower )
+{
+    // compute reflection vector taking into account a cheap fresnel falloff;
+    float3 N = normalize(vNormal);
+    float3 E = normalize(g_vEyeDir.xyz);
+    float3 R = reflect( E, N );
+    float fresnel = 1 - dot( -E, N );
+    fresnel = (fresnel * fresnel * fresnel );   // cubed falloff
+
+    float3 specular = g_txEnvironmentMap.Sample( g_samLinear, R ).xyz * fresnel;   // specularPower is unused here
+
+    return (specular * (float3)m_bEnable);   // bool cast yields 0 or 1 per component
+// return ((float3)fresnel);
+
+}
+
+//--------------------------------------------------------------------------------------
+// Material Class Methods
+//--------------------------------------------------------------------------------------
+// Plastic Material Methods
+float3 cPlasticTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{
+    float4 vDiffuse = (float4)1.0f;   // overwritten by the sample on the next line
+    vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+    return m_vColor * vDiffuse.xyz;   // texel alpha is dropped
+}
+
+float3 cPlasticTexturedMaterial::GetDiffuseColor(float2 vTexcoord)   // same body as GetAmbientColor
+{
+    float4 vDiffuse = (float4)1.0f;
+    vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+    return m_vColor * vDiffuse.xyz;
+}
+
+// Rough Material Methods
+float3 cRoughTexturedMaterial::GetAmbientColor(float2 vTexcoord)   // same body as the plastic variant
+{
+    float4 vDiffuse = (float4)1.0f;
+    vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+    return m_vColor * vDiffuse.xyz;
+}
+
+float3 cRoughTexturedMaterial::GetDiffuseColor(float2 vTexcoord)   // same body as the plastic variant
+{
+    float4 vDiffuse = (float4)1.0f;
+    vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+    return m_vColor * vDiffuse.xyz;
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl
new file mode 100644
index 000000000..55d206259
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl
@@ -0,0 +1,113 @@
+//TEST_IGNORE_FILE:
+//-------------------------------------------------------------------------------------- +// File: DynamicShaderLinkageFX11_ps.hlsl +// +// The pixel shader file for the DynamicShaderLinkageFX11 sample. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Header Includes +//-------------------------------------------------------------------------------------- +#include "DynamicShaderLinkageFX11_PSBuffers.h" + +//-------------------------------------------------------------------------------------- +// Input / Output structures +//-------------------------------------------------------------------------------------- +struct PS_INPUT +{ + float4 vPosition : SV_POSITION; + float3 vNormal : NORMAL; + float2 vTexcoord : TEXCOORD0; + float4 vMatrix : TEXCOORD1; // not read by PSMainWorker +}; + +//-------------------------------------------------------------------------------------- +// Pixel Shader +//-------------------------------------------------------------------------------------- + +// This pixel shader uses several interfaces during its +// work. We show three different ways of providing interface +// bindings for the PS and those have two different +// entry points so we've separated the base PS code +// into a worker routine that's called by the entry +// points. Normally only one technique would be used +// and this layering of entry point and worker would +// not be necessary. 
+float4 PSMainWorker( iBaseLight ambientLighting, + iBaseLight directLighting, + iBaseLight environmentLighting, + iBaseMaterial material, + PS_INPUT Input ) +{ + // Compute the Ambient term + float3 Ambient = (float3)0.0f; + Ambient = material.GetAmbientColor( Input.vTexcoord ) * ambientLighting.IlluminateAmbient( Input.vNormal ); + + // Accumulate the Diffuse contribution (only the direct light contributes) + float3 Diffuse = (float3)0.0f; + + Diffuse += material.GetDiffuseColor( Input.vTexcoord ) * directLighting.IlluminateDiffuse( Input.vNormal ); + + // Compute the Specular contribution (direct light plus environment light) + float3 Specular = (float3)0.0f; + Specular += directLighting.IlluminateSpecular( Input.vNormal, material.GetSpecularPower() ); + Specular += environmentLighting.IlluminateSpecular( Input.vNormal, material.GetSpecularPower() ); + + // Sum the three terms and clamp each channel to [0,1] + float3 Lighting = saturate( Ambient + Diffuse + Specular); + + return float4(Lighting,1.0f); // alpha is always 1 +} + +// One way to provide bindings for shaders in Effects 11 is +// to use uniform interface parameters. As with non-interface +// uniform parameters you must specify a value for these +// parameters in your CompileShader invocations in the effect. +// You can provide concrete class instances if you want +// to statically specialize your shaders, such as for targets +// that don't support abstract interfaces; or you can provide +// other interfaces that you bind using effect variables. +// Both are shown in this sample's technique passes. +float4 PSMainUniform( uniform iBaseLight ambientLighting, + uniform iBaseLight directLighting, + uniform iBaseLight environmentLighting, + uniform iBaseMaterial material, + PS_INPUT Input ) : SV_Target +{ + return PSMainWorker(ambientLighting, + directLighting, + environmentLighting, + material, + Input); +} + +// Another way to use Effects 11 with interfaces is +// to have non-uniform parameters, which then are +// bound with a BindInterfaces in a technique pass. 
+// BindInterfaces gives concrete instances to use +// with a shader but does not do static specialization; +// it just saves information for the effect runtime +// to use when setting up the shader to run. +// This lets you share a single shader, compiled with +// interface usage, while still getting the convenience +// of declaring concrete bindings in the effect and +// not needing explicit binding in code via effect +// variable updates. If you have many different +// variations it may be simpler to use bindings +// through effect variables, as then you don't +// need to list every possible binding set in your +// techniques. +float4 PSMainNonUniform( iBaseLight ambientLighting, + iBaseLight directLighting, + iBaseLight environmentLighting, + iBaseMaterial material, + PS_INPUT Input ) : SV_Target +{ + return PSMainWorker(ambientLighting, + directLighting, + environmentLighting, + material, + Input); +} diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl new file mode 100644 index 000000000..4791e5786 --- /dev/null +++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl @@ -0,0 +1,65 @@ +//TEST_IGNORE_FILE: +//-------------------------------------------------------------------------------------- +// File: DynamicShaderLinkageFX11_VS.hlsl +// +// The vertex shader file for the DynamicShaderLinkageFX11 sample. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Globals +//-------------------------------------------------------------------------------------- +cbuffer cbPerObject : register( b0 ) +{ + float4x4 g_mWorldViewProjection : packoffset( c0 ); + float4x4 g_mWorld : packoffset( c4 ); +}; + +//-------------------------------------------------------------------------------------- +// Input / Output structures +//-------------------------------------------------------------------------------------- +struct VS_INPUT +{ + float4 vPosition : POSITION; + float3 vNormal : NORMAL; + float2 vTexcoord : TEXCOORD0; +}; + +struct VS_OUTPUT +{ + float4 vPosition : SV_POSITION; + float3 vNormal : NORMAL; + float2 vTexcoord0 : TEXCOORD0; + float4 vMatrix : TEXCOORD1; // DEBUG +}; + +//-------------------------------------------------------------------------------------- +// Vertex Shader +//-------------------------------------------------------------------------------------- +// We aliased signed vectors as an unsigned format. +// Need to recover signed values. The values 1.0 and 2.0 +// are slightly inaccurate here. +float3 R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( in float3 vVec ) +{ + vVec *= 2.0f; // components at or above 1.0 after doubling are shifted down by 2.0 below + return vVec >= 1.0f ? 
( vVec - 2.0f ) : vVec; +} + +VS_OUTPUT VSMain( VS_INPUT Input ) +{ + + VS_OUTPUT Output; + float3 tmpNormal; + + Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection ); + + // Expand compressed vectors + tmpNormal = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( Input.vNormal ); + Output.vNormal = mul( tmpNormal, (float3x3)g_mWorld ); // rotated by the upper 3x3 of the world matrix; not renormalized here + + Output.vTexcoord0 = Input.vTexcoord; + + Output.vMatrix = (float4)g_mWorld[0]; // DEBUG: first row of the world matrix + return Output; +} diff --git a/tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx b/tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx new file mode 100644 index 000000000..699df8655 --- /dev/null +++ b/tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx @@ -0,0 +1,468 @@ +//TEST_IGNORE_FILE: +// FixedFuncEMU.fx +// Copyright (c) 2005 Microsoft Corporation. All rights reserved. +// + +struct VSSceneIn +{ + float3 pos : POSITION; //vertex position + float3 norm : NORMAL; //vertex normal (transformed by g_mWorld in VSScenemain) + float2 tex : TEXTURE0; //tex coords +}; + +struct VSSceneOut +{ + float4 pos : SV_Position; //position + float2 tex : TEXTURE0; //texture coordinate + float3 wPos : TEXTURE1; //world space pos + float3 wNorm : TEXTURE2; //world space normal + float4 colorD : COLOR0; //color for gouraud and flat shading + float4 colorS : COLOR1; //color for specular + float fogDist : FOGDISTANCE; //distance used for fog calculations + float3 planeDist : SV_ClipDistance0; //clip distance for 3 planes +}; + +struct PSSceneIn +{ + float4 pos : SV_Position; //position + float2 tex : TEXTURE0; //texture coordinate + float3 wPos : TEXTURE1; //world space pos + float3 wNorm : TEXTURE2; //world space normal + float4 colorD : COLOR0; //color for gouraud and flat shading + float4 colorS : COLOR1; //color for specular + float fogDist : FOGDISTANCE; //distance used for fog calculations +}; + +struct Light +{ + float4 Position; + float4 Diffuse; + float4 Specular; + float4 Ambient; + float4 Atten; +}; + +#define FOGMODE_NONE 0 +#define FOGMODE_LINEAR 1 +#define 
FOGMODE_EXP 2 +#define FOGMODE_EXP2 3 +#define E 2.71828 + +cbuffer cbLights +{ + float4 g_clipplanes[3]; + Light g_lights[8]; +}; + +cbuffer cbPerFrame +{ + float4x4 g_mWorld; + float4x4 g_mView; + float4x4 g_mProj; + float4x4 g_mInvProj; + float4x4 g_mLightViewProj; +}; + +cbuffer cbPerTechnique +{ + bool g_bEnableLighting = true; + bool g_bEnableClipping = true; + bool g_bPointScaleEnable = false; + float g_pointScaleA; + float g_pointScaleB; + float g_pointScaleC; + float g_pointSize; + + //fog params + int g_fogMode = FOGMODE_NONE; + float g_fogStart; + float g_fogEnd; + float g_fogDensity; + float4 g_fogColor; +}; + +cbuffer cbPerViewChange +{ + //viewport params + float g_viewportHeight; + float g_viewportWidth; + float g_nearPlane; +}; + +cbuffer cbImmutable +{ + float3 g_positions[4] = + { + float3( -0.5, 0.5, 0 ), + float3( 0.5, 0.5, 0 ), + float3( -0.5, -0.5, 0 ), + float3( 0.5, -0.5, 0 ), + }; +}; + +Texture2D g_txDiffuse; +Texture2D g_txProjected; +SamplerState g_samLinear +{ + Filter = MIN_MAG_MIP_LINEAR; + AddressU = Clamp; + AddressV = Clamp; +}; + +DepthStencilState DisableDepth +{ + DepthEnable = FALSE; + DepthWriteMask = ZERO; +}; + +DepthStencilState EnableDepth +{ + DepthEnable = TRUE; + DepthWriteMask = ALL; +}; + +struct ColorsOutput +{ + float4 Diffuse; + float4 Specular; +}; + +ColorsOutput CalcLighting( float3 worldNormal, float3 worldPos, float3 cameraPos ) +{ + ColorsOutput output = (ColorsOutput)0.0; + + for(int i=0; i<8; i++) + { + float3 toLight = g_lights[i].Position.xyz - worldPos; + float lightDist = length( toLight ); + float fAtten = 1.0/dot( g_lights[i].Atten, float4(1,lightDist,lightDist*lightDist,0) ); + float3 lightDir = normalize( toLight ); + float3 halfAngle = normalize( normalize(-cameraPos) + lightDir ); + + output.Diffuse += max(0,dot( lightDir, worldNormal ) * g_lights[i].Diffuse * fAtten) + g_lights[i].Ambient; + output.Specular += max(0,pow( max(0,dot( halfAngle, worldNormal )), 64 ) * g_lights[i].Specular * fAtten ); // clamp the base: pow() is not defined for a negative base + } + 
+ return output; +} + +// +// VS for emulating fixed function pipeline +// +VSSceneOut VSScenemain(VSSceneIn input) +{ + VSSceneOut output = (VSSceneOut)0.0; + + //output our final position in clipspace + float4 worldPos = mul( float4( input.pos, 1 ), g_mWorld ); + float4 cameraPos = mul( worldPos, g_mView ); //Save cameraPos for fog calculations + output.pos = mul( cameraPos, g_mProj ); + + //save world pos for later + output.wPos = worldPos.xyz; + + //save the fog distance for later + output.fogDist = cameraPos.z; + + //find our clipping planes (fixed function clipping is done in world space) + if( g_bEnableClipping ) + { + worldPos.w = 1; + + //calc the distance from the 3 clipping planes + output.planeDist.x = dot( worldPos, g_clipplanes[0] ); + output.planeDist.y = dot( worldPos, g_clipplanes[1] ); + output.planeDist.z = dot( worldPos, g_clipplanes[2] ); + } + else + { + output.planeDist.x = 1; + output.planeDist.y = 1; + output.planeDist.z = 1; + } + + //do gouraud lighting + if( g_bEnableLighting ) + { + float3 worldNormal = normalize( mul( input.norm, (float3x3)g_mWorld ) ); + output.wNorm = worldNormal; + ColorsOutput cOut = CalcLighting( worldNormal, worldPos.xyz, cameraPos.xyz ); + output.colorD = cOut.Diffuse; + output.colorS = cOut.Specular; + } + else + { + output.colorD = float4(1,1,1,1); + } + + //propagate texture coordinate + output.tex = input.tex; + + return output; +} + +// +// VS for rendering in screen space +// +PSSceneIn VSScreenSpacemain(VSSceneIn input) +{ + PSSceneIn output = (PSSceneIn)0.0; + + //output our final position + output.pos.x = (input.pos.x / (g_viewportWidth/2.0)) -1; + output.pos.y = -(input.pos.y / (g_viewportHeight/2.0)) +1; + output.pos.z = input.pos.z; + output.pos.w = 1; + + //propagate texture coordinate + output.tex = input.tex; + output.colorD = float4(1,1,1,1); + + return output; +} + +// +// GS for flat shaded rendering +// + +[maxvertexcount(3)] +void GSFlatmain( triangle VSSceneOut input[3], inout TriangleStream<VSSceneOut> 
FlatTriStream ) +{ + VSSceneOut output; + + // + // Calculate the face normal + // + float3 faceEdgeA = input[1].wPos - input[0].wPos; + float3 faceEdgeB = input[2].wPos - input[0].wPos; + + // + // Cross product + // + float3 faceNormal = cross(faceEdgeA, faceEdgeB); + + // + //calculate the face center + // + float3 faceCenter = (input[0].wPos + input[1].wPos + input[2].wPos)/3.0; + + //find world pos and camera pos + float4 worldPos = float4( faceCenter, 1 ); + float4 cameraPos = mul( worldPos, g_mView ); + + //do shading + float3 worldNormal = normalize( faceNormal ); + ColorsOutput cOut = CalcLighting( worldNormal, worldPos.xyz, cameraPos.xyz ); + + for(int i=0; i<3; i++) + { + output = input[i]; + output.colorD = cOut.Diffuse; + output.colorS = cOut.Specular; + + FlatTriStream.Append( output ); + } + FlatTriStream.RestartStrip(); +} + +// +// GS for point rendering +// +[maxvertexcount(12)] +void GSPointmain( triangle VSSceneOut input[3], inout TriangleStream<VSSceneOut> PointTriStream ) +{ + VSSceneOut output; + + // + // Calculate the point size + // + //float fSizeX = (g_pointSize/g_viewportWidth)/4.0; + float fSizeY = (g_pointSize/g_viewportHeight)/4.0; + float fSizeX = fSizeY; + + for(int i=0; i<3; i++) + { + output = input[i]; + + //find world pos and camera pos + float4 worldPos = float4(input[i].wPos,1); + float4 cameraPos = mul( worldPos, g_mView ); + + //find our size + if( g_bPointScaleEnable ) + { + float dEye = length( cameraPos.xyz ); + fSizeX = fSizeY = g_viewportHeight * g_pointSize * + sqrt( 1.0f/( g_pointScaleA + g_pointScaleB*dEye + g_pointScaleC*(dEye*dEye) ) ); + } + + //do shading + if(g_bEnableLighting) + { + float3 worldNormal = input[i].wNorm; + ColorsOutput cOut = CalcLighting( worldNormal, worldPos.xyz, cameraPos.xyz ); + + output.colorD = cOut.Diffuse; + output.colorS = cOut.Specular; + } + else + { + output.colorD = float4(1,1,1,1); + } + + output.tex = input[i].tex; + + // + // Emit two new triangles (one 4-vertex strip per input vertex) + // + for(int j=0; j<4; j++) // separate index so the outer loop variable i is not redeclared + { + float4 outPos = mul( 
worldPos, g_mView ); + output.pos = mul( outPos, g_mProj ); + float zoverNear = (outPos.z)/g_nearPlane; + float4 posSize = float4( g_positions[j].x*fSizeX*zoverNear, + g_positions[j].y*fSizeY*zoverNear, + 0, + 0 ); + output.pos += posSize; + + PointTriStream.Append(output); + } + PointTriStream.RestartStrip(); + } +} + +// +// Calculates fog factor based upon distance +// +float CalcFogFactor( float d ) +{ + float fogCoeff = 1.0; + + if( FOGMODE_LINEAR == g_fogMode ) + { + fogCoeff = (g_fogEnd - d)/(g_fogEnd - g_fogStart); + } + else if( FOGMODE_EXP == g_fogMode ) + { + fogCoeff = 1.0 / pow( E, d*g_fogDensity ); + } + else if( FOGMODE_EXP2 == g_fogMode ) + { + fogCoeff = 1.0 / pow( E, d*d*g_fogDensity*g_fogDensity ); + } + + return clamp( fogCoeff, 0, 1 ); +} + +// +// PS for rendering with clip planes +// +float4 PSScenemain(PSSceneIn input) : SV_Target +{ + //calculate the fog factor + float fog = CalcFogFactor( input.fogDist ); + + //calculate the color based off of the normal, textures, etc + float4 normalColor = g_txDiffuse.Sample( g_samLinear, input.tex ) * input.colorD + input.colorS; + + //calculate the color from the projected texture + float4 cookieCoord = mul( float4(input.wPos,1), g_mLightViewProj ); + //since we don't have texldp, we must perform the w divide ourselves before the texture lookup + cookieCoord.xy = 0.5 * cookieCoord.xy / cookieCoord.w + float2( 0.5, 0.5 ); + float4 cookieColor = float4(0,0,0,0); + if( cookieCoord.z > 0 ) + cookieColor = g_txProjected.Sample( g_samLinear, cookieCoord.xy ); + + //for standard light-modulating effects just multiply normalColor and cookieColor (here they are added) + normalColor += cookieColor; + + return fog * normalColor + (1.0 - fog)*g_fogColor; +} + +// +// PS for rendering with alpha test +// +float4 PSAlphaTestmain(PSSceneIn input) : SV_Target +{ + float4 color = g_txDiffuse.Sample( g_samLinear, input.tex ) * input.colorD; + if( color.a < 0.5 ) + discard; + return color; +} + +// +// RenderSceneGouraud - renders 
gouraud-shaded primitives +// +technique10 RenderSceneGouraud +{ + pass p0 + { + SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) ); + SetGeometryShader( NULL ); + SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) ); + + SetDepthStencilState( EnableDepth, 0 ); + } +} + +// +// RenderSceneFlat - renders flat-shaded primitives +// +technique10 RenderSceneFlat +{ + pass p0 + { + SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) ); + SetGeometryShader( CompileShader( gs_4_0, GSFlatmain() ) ); + SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) ); + + SetDepthStencilState( EnableDepth, 0 ); + } +} + +// +// RenderScenePoint - replaces d3dfill_point +// +technique10 RenderScenePoint +{ + pass p0 + { + SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) ); + SetGeometryShader( CompileShader( gs_4_0, GSPointmain() ) ); + SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) ); + + SetDepthStencilState( EnableDepth, 0 ); + } +} + +// +// RenderScreenSpaceAlphaTest - shows how to render something in screenspace (alpha tested, depth disabled) +// +technique10 RenderScreenSpaceAlphaTest +{ + pass p0 + { + SetVertexShader( CompileShader( vs_4_0, VSScreenSpacemain() ) ); + SetGeometryShader( NULL ); + SetPixelShader( CompileShader( ps_4_0, PSAlphaTestmain() ) ); + + SetDepthStencilState( DisableDepth, 0 ); + } +} + +// +// RenderTextureOnly - same shaders and state as RenderSceneGouraud (VSScenemain, no GS, PSScenemain) +// +technique10 RenderTextureOnly +{ + pass p0 + { + SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) ); + SetGeometryShader( NULL ); + SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) ); + + SetDepthStencilState( EnableDepth, 0 ); + } +} + diff --git a/tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl b/tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl new file mode 100644 index 000000000..db7bd5136 --- /dev/null +++ b/tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl @@ -0,0 +1,75 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. 
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry BitonicSort -entry MatrixTranspose +//-------------------------------------------------------------------------------------- +// File: ComputeShaderSort11.hlsl +// +// This file contains the compute shaders to perform GPU sorting using DirectX 11. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- + +#define BITONIC_BLOCK_SIZE 512 + +#define TRANSPOSE_BLOCK_SIZE 16 + +//-------------------------------------------------------------------------------------- +// Constant Buffers +//-------------------------------------------------------------------------------------- +cbuffer CB : register( b0 ) +{ + unsigned int g_iLevel; + unsigned int g_iLevelMask; + unsigned int g_iWidth; + unsigned int g_iHeight; +}; + +//-------------------------------------------------------------------------------------- +// Structured Buffers (element type matches the unsigned int groupshared arrays below) +//-------------------------------------------------------------------------------------- +StructuredBuffer<unsigned int> Input : register( t0 ); +RWStructuredBuffer<unsigned int> Data : register( u0 ); + +//-------------------------------------------------------------------------------------- +// Bitonic Sort Compute Shader +//-------------------------------------------------------------------------------------- +groupshared unsigned int shared_data[BITONIC_BLOCK_SIZE]; + +[numthreads(BITONIC_BLOCK_SIZE, 1, 1)] +void BitonicSort( uint3 Gid : SV_GroupID, + uint3 DTid : SV_DispatchThreadID, + uint3 GTid : SV_GroupThreadID, + uint GI : SV_GroupIndex ) +{ + // Load shared data + shared_data[GI] = Data[DTid.x]; + GroupMemoryBarrierWithGroupSync(); + + // Sort the shared data + for (unsigned int j = g_iLevel >> 1 ; j > 0 ; j >>= 1) + { + unsigned int result = ((shared_data[GI & ~j] <= shared_data[GI | j]) == (bool)(g_iLevelMask & DTid.x))? 
shared_data[GI ^ j] : shared_data[GI]; + GroupMemoryBarrierWithGroupSync(); + shared_data[GI] = result; + GroupMemoryBarrierWithGroupSync(); + } + + // Store shared data + Data[DTid.x] = shared_data[GI]; +} + +//-------------------------------------------------------------------------------------- +// Matrix Transpose Compute Shader +//-------------------------------------------------------------------------------------- +groupshared unsigned int transpose_shared_data[TRANSPOSE_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE]; + +[numthreads(TRANSPOSE_BLOCK_SIZE, TRANSPOSE_BLOCK_SIZE, 1)] +void MatrixTranspose( uint3 Gid : SV_GroupID, + uint3 DTid : SV_DispatchThreadID, + uint3 GTid : SV_GroupThreadID, + uint GI : SV_GroupIndex ) +{ + transpose_shared_data[GI] = Input[DTid.y * g_iWidth + DTid.x]; + GroupMemoryBarrierWithGroupSync(); + uint2 XY = DTid.yx - GTid.yx + GTid.xy; // swapped group origin plus this thread's in-group offset + Data[XY.y * g_iHeight + XY.x] = transpose_shared_data[GTid.x * TRANSPOSE_BLOCK_SIZE + GTid.y]; +} diff --git a/tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl b/tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl new file mode 100644 index 000000000..26e6cdf60 --- /dev/null +++ b/tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl @@ -0,0 +1,529 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry BuildGridCS -entry ClearGridIndicesCS -entry BuildGridIndicesCS -entry RearrangeParticlesCS -entry DensityCS_Simple -entry DensityCS_Shared -entry DensityCS_Grid -entry ForceCS_Simple -entry ForceCS_Shared -entry ForceCS_Grid -entry IntegrateCS +//-------------------------------------------------------------------------------------- +// File: FluidCS11.hlsl +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Smoothed Particle Hydrodynamics Algorithm Based Upon: +// Particle-Based Fluid Simulation for Interactive Applications +// Matthias Müller +//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Optimized Grid Algorithm Based Upon: +// Broad-Phase Collision Detection with CUDA +// Scott Le Grand +//-------------------------------------------------------------------------------------- + +struct Particle +{ + float2 position; + float2 velocity; +}; + +struct ParticleForces +{ + float2 acceleration; +}; + +struct ParticleDensity +{ + float density; +}; + +cbuffer cbSimulationConstants : register( b0 ) +{ + uint g_iNumParticles; + float g_fTimeStep; + float g_fSmoothlen; + float g_fPressureStiffness; + float g_fRestDensity; + float g_fDensityCoef; + float g_fGradPressureCoef; + float g_fLapViscosityCoef; + float g_fWallStiffness; + + float4 g_vGravity; + float4 g_vGridDim; + float3 g_vPlanes[4]; +}; + +//-------------------------------------------------------------------------------------- +// Fluid Simulation +//-------------------------------------------------------------------------------------- + +#define SIMULATION_BLOCK_SIZE 256 + +//-------------------------------------------------------------------------------------- +// Structured Buffers (every RW buffer is declared at u0; each entry point uses only one of them) +//-------------------------------------------------------------------------------------- +RWStructuredBuffer<Particle> ParticlesRW : register( u0 ); +StructuredBuffer<Particle> ParticlesRO : register( t0 ); + +RWStructuredBuffer<ParticleDensity> ParticlesDensityRW : register( u0 ); +StructuredBuffer<ParticleDensity> ParticlesDensityRO : register( t1 ); + +RWStructuredBuffer<ParticleForces> ParticlesForcesRW : register( u0 ); +StructuredBuffer<ParticleForces> ParticlesForcesRO : register( t2 ); + 
+RWStructuredBuffer<unsigned int> GridRW : register( u0 ); +StructuredBuffer<unsigned int> GridRO : register( t3 ); + +RWStructuredBuffer<uint2> GridIndicesRW : register( u0 ); +StructuredBuffer<uint2> GridIndicesRO : register( t4 ); + + +//-------------------------------------------------------------------------------------- +// Grid Construction +//-------------------------------------------------------------------------------------- + +// For simplicity, this sample uses a 16-bit hash based on the grid cell and +// a 16-bit particle ID to keep track of the particles while sorting. +// This imposes a limitation of 64K particles and a 256x256 grid. +// You could extend the implementation to support larger scenarios by using a uint2. + +float2 GridCalculateCell(float2 position) +{ + return clamp(position * g_vGridDim.xy + g_vGridDim.zw, float2(0, 0), float2(255, 255)); +} + +unsigned int GridConstuctKey(uint2 xy) +{ + // Bit pack [-----UNUSED-----][----Y---][----X---] + // 16-bit 8-bit 8-bit + return dot(xy.yx, uint2(256, 1)); +} + +unsigned int GridConstuctKeyValuePair(uint2 xy, uint value) +{ + // Bit pack [----Y---][----X---][-----VALUE------] + // 8-bit 8-bit 16-bit + return dot(uint3(xy.yx, value), uint3(256*256*256, 256*256, 1)); +} + +unsigned int GridGetKey(unsigned int keyvaluepair) +{ + return (keyvaluepair >> 16); +} + +unsigned int GridGetValue(unsigned int keyvaluepair) +{ + return (keyvaluepair & 0xFFFF); +} + + +//-------------------------------------------------------------------------------------- +// Build Grid +//-------------------------------------------------------------------------------------- + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void BuildGridCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int P_ID = DTid.x; // Particle ID to operate on + + float2 position = ParticlesRO[P_ID].position; + float2 grid_xy = GridCalculateCell( position ); + + GridRW[P_ID] = 
GridConstuctKeyValuePair((uint2)grid_xy, P_ID); +} + + +//-------------------------------------------------------------------------------------- +// Build Grid Indices +//-------------------------------------------------------------------------------------- + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void ClearGridIndicesCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + GridIndicesRW[DTid.x] = uint2(0, 0); +} + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void BuildGridIndicesCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int G_ID = DTid.x; // Grid ID to operate on + unsigned int G_ID_PREV = (G_ID == 0)? g_iNumParticles : G_ID; G_ID_PREV--; // previous index, wrapping 0 to g_iNumParticles - 1 + unsigned int G_ID_NEXT = G_ID + 1; if (G_ID_NEXT == g_iNumParticles) { G_ID_NEXT = 0; } // next index, wrapping back to 0 + + unsigned int cell = GridGetKey( GridRO[G_ID] ); + unsigned int cell_prev = GridGetKey( GridRO[G_ID_PREV] ); + unsigned int cell_next = GridGetKey( GridRO[G_ID_NEXT] ); + if (cell != cell_prev) + { + // I'm the start of a cell + GridIndicesRW[cell].x = G_ID; + } + if (cell != cell_next) + { + // I'm the end of a cell (stored as one past the last index) + GridIndicesRW[cell].y = G_ID + 1; + } +} + + +//-------------------------------------------------------------------------------------- +// Rearrange Particles +//-------------------------------------------------------------------------------------- + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void RearrangeParticlesCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int ID = DTid.x; // Particle ID to operate on + const unsigned int G_ID = GridGetValue( GridRO[ ID ] ); // particle ID stored in the low 16 bits of the grid entry + ParticlesRW[ID] = ParticlesRO[ G_ID ]; +} + + +//-------------------------------------------------------------------------------------- +// Density Calculation 
+//-------------------------------------------------------------------------------------- + +float CalculateDensity(float r_sq) +{ + const float h_sq = g_fSmoothlen * g_fSmoothlen; + // Implements this equation: + // W_poly6(r, h) = 315 / (64 * pi * h^9) * (h^2 - r^2)^3 + // g_fDensityCoef = fParticleMass * 315.0f / (64.0f * PI * fSmoothlen^9) + return g_fDensityCoef * (h_sq - r_sq) * (h_sq - r_sq) * (h_sq - r_sq); +} + + +//-------------------------------------------------------------------------------------- +// Simple N^2 Algorithm +//-------------------------------------------------------------------------------------- + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void DensityCS_Simple( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int P_ID = DTid.x; + const float h_sq = g_fSmoothlen * g_fSmoothlen; + float2 P_position = ParticlesRO[P_ID].position; + + float density = 0; + + // Calculate the density based on all neighbors (the particle itself is included: r_sq == 0 passes the test) + for (uint N_ID = 0 ; N_ID < g_iNumParticles ; N_ID++) + { + float2 N_position = ParticlesRO[N_ID].position; + + float2 diff = N_position - P_position; + float r_sq = dot(diff, diff); + if (r_sq < h_sq) // only particles closer than g_fSmoothlen contribute + { + density += CalculateDensity(r_sq); + } + } + + ParticlesDensityRW[P_ID].density = density; +} + + +//-------------------------------------------------------------------------------------- +// Shared Memory Optimized N^2 Algorithm +//-------------------------------------------------------------------------------------- + +groupshared float2 density_shared_pos[SIMULATION_BLOCK_SIZE]; + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void DensityCS_Shared( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int P_ID = DTid.x; + const float h_sq = g_fSmoothlen * g_fSmoothlen; + float2 P_position = ParticlesRO[P_ID].position; + + float density = 0; + + // Calculate the density 
based on all neighbors + [loop] + for (uint N_block_ID = 0 ; N_block_ID < g_iNumParticles ; N_block_ID += SIMULATION_BLOCK_SIZE) + { + // Cache a tile of particles unto shared memory to increase IO efficiency + // NOTE(review): index N_block_ID + GI is not checked against g_iNumParticles; presumably the count is a multiple of SIMULATION_BLOCK_SIZE (confirm) + density_shared_pos[GI] = ParticlesRO[N_block_ID + GI].position; + + GroupMemoryBarrierWithGroupSync(); + + for (uint N_tile_ID = 0; N_tile_ID < SIMULATION_BLOCK_SIZE; N_tile_ID++) + { + float2 N_position = density_shared_pos[N_tile_ID]; + + float2 diff = N_position - P_position; + float r_sq = dot(diff, diff); + if (r_sq < h_sq) + { + density += CalculateDensity(r_sq); + } + } + + GroupMemoryBarrierWithGroupSync(); // every thread must finish reading the tile before the next iteration overwrites it + } + + ParticlesDensityRW[P_ID].density = density; +} + + +//-------------------------------------------------------------------------------------- +// Optimized Grid + Sort Algorithm +//-------------------------------------------------------------------------------------- + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void DensityCS_Grid( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int P_ID = DTid.x; + const float h_sq = g_fSmoothlen * g_fSmoothlen; + float2 P_position = ParticlesRO[P_ID].position; + + float density = 0; + + // Calculate the density based on neighbors from the 8 adjacent cells + current cell + int2 G_XY = (int2)GridCalculateCell( P_position ); + for (int Y = max(G_XY.y - 1, 0) ; Y <= min(G_XY.y + 1, 255) ; Y++) // rows clamped to the 0..255 cell range + { + for (int X = max(G_XY.x - 1, 0) ; X <= min(G_XY.x + 1, 255) ; X++) + { + unsigned int G_CELL = GridConstuctKey(uint2(X, Y)); + uint2 G_START_END = GridIndicesRO[G_CELL]; // x is the first index of the cell, y is one past the last + for (unsigned int N_ID = G_START_END.x ; N_ID < G_START_END.y ; N_ID++) + { + float2 N_position = ParticlesRO[N_ID].position; + + float2 diff = N_position - P_position; + float r_sq = dot(diff, diff); + if (r_sq < h_sq) + { + density += CalculateDensity(r_sq); + } + } + } + } + + ParticlesDensityRW[P_ID].density = density; +} + + 
+//-------------------------------------------------------------------------------------- +// Force Calculation +//-------------------------------------------------------------------------------------- + +float CalculatePressure(float density) +{ + // Implements this equation: + // Pressure = B * ((rho / rho_0)^y - 1), with y = 3 and negative results clamped to 0 + return g_fPressureStiffness * max(pow(density / g_fRestDensity, 3) - 1, 0); +} + +float2 CalculateGradPressure(float r, float P_pressure, float N_pressure, float N_density, float2 diff) +{ + const float h = g_fSmoothlen; + float avg_pressure = 0.5f * (N_pressure + P_pressure); + // Implements this equation: + // W_spikey(r, h) = 15 / (pi * h^6) * (h - r)^3 + // GRAD( W_spikey(r, h) ) = -45 / (pi * h^6) * (h - r)^2 + // g_fGradPressureCoef = fParticleMass * -45.0f / (PI * fSmoothlen^6) + return g_fGradPressureCoef * avg_pressure / N_density * (h - r) * (h - r) / r * (diff); // divides by r, so callers must not pass r == 0 +} + +float2 CalculateLapVelocity(float r, float2 P_velocity, float2 N_velocity, float N_density) +{ + const float h = g_fSmoothlen; + float2 vel_diff = (N_velocity - P_velocity); + // Implements this equation: + // W_viscosity(r, h) = 15 / (2 * pi * h^3) * (-r^3 / (2 * h^3) + r^2 / h^2 + h / (2 * r) - 1) + // LAPLACIAN( W_viscosity(r, h) ) = 45 / (pi * h^6) * (h - r) + // g_fLapViscosityCoef = fParticleMass * fViscosity * 45.0f / (PI * fSmoothlen^6) + return g_fLapViscosityCoef / N_density * (h - r) * vel_diff; +} + + +//-------------------------------------------------------------------------------------- +// Simple N^2 Algorithm +//-------------------------------------------------------------------------------------- + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void ForceCS_Simple( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int P_ID = DTid.x; // Particle ID to operate on + + float2 P_position = ParticlesRO[P_ID].position; + float2 P_velocity = ParticlesRO[P_ID].velocity; + float 
P_density = ParticlesDensityRO[P_ID].density; + float P_pressure = CalculatePressure(P_density); + + const float h_sq = g_fSmoothlen * g_fSmoothlen; + + float2 acceleration = float2(0, 0); + + // Calculate the acceleration based on all neighbors + for (uint N_ID = 0 ; N_ID < g_iNumParticles ; N_ID++) + { + float2 N_position = ParticlesRO[N_ID].position; + + float2 diff = N_position - P_position; + float r_sq = dot(diff, diff); + if (r_sq < h_sq && P_ID != N_ID) // skip self: r would be 0 and CalculateGradPressure divides by r + { + float2 N_velocity = ParticlesRO[N_ID].velocity; + float N_density = ParticlesDensityRO[N_ID].density; + float N_pressure = CalculatePressure(N_density); + float r = sqrt(r_sq); + + // Pressure Term + acceleration += CalculateGradPressure(r, P_pressure, N_pressure, N_density, diff); + + // Viscosity Term + acceleration += CalculateLapVelocity(r, P_velocity, N_velocity, N_density); + } + } + + ParticlesForcesRW[P_ID].acceleration = acceleration / P_density; +} + + +//-------------------------------------------------------------------------------------- +// Shared Memory Optimized N^2 Algorithm +//-------------------------------------------------------------------------------------- + +groupshared struct { float2 position; float2 velocity; float density; } force_shared_pos[SIMULATION_BLOCK_SIZE]; + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void ForceCS_Shared( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int P_ID = DTid.x; // Particle ID to operate on + + float2 P_position = ParticlesRO[P_ID].position; + float2 P_velocity = ParticlesRO[P_ID].velocity; + float P_density = ParticlesDensityRO[P_ID].density; + float P_pressure = CalculatePressure(P_density); + + const float h_sq = g_fSmoothlen * g_fSmoothlen; + + float2 acceleration = float2(0, 0); + + // Calculate the acceleration based on all neighbors + [loop] + for (uint N_block_ID = 0 ; N_block_ID < g_iNumParticles ; N_block_ID += SIMULATION_BLOCK_SIZE) + { + // 
Cache a tile of particles unto shared memory to increase IO efficiency + force_shared_pos[GI].position = ParticlesRO[N_block_ID + GI].position; + force_shared_pos[GI].velocity = ParticlesRO[N_block_ID + GI].velocity; + force_shared_pos[GI].density = ParticlesDensityRO[N_block_ID + GI].density; + + GroupMemoryBarrierWithGroupSync(); + + [loop] + for (uint N_tile_ID = 0; N_tile_ID < SIMULATION_BLOCK_SIZE; N_tile_ID++ ) + { + uint N_ID = N_block_ID + N_tile_ID; + float2 N_position = force_shared_pos[N_tile_ID].position; + + float2 diff = N_position - P_position; + float r_sq = dot(diff, diff); + if (r_sq < h_sq && P_ID != N_ID) + { + float2 N_velocity = force_shared_pos[N_tile_ID].velocity; + float N_density = force_shared_pos[N_tile_ID].density; + float N_pressure = CalculatePressure(N_density); + float r = sqrt(r_sq); + + // Pressure Term + acceleration += CalculateGradPressure(r, P_pressure, N_pressure, N_density, diff); + + // Viscosity Term + acceleration += CalculateLapVelocity(r, P_velocity, N_velocity, N_density); + } + } + + GroupMemoryBarrierWithGroupSync(); // every thread must finish reading the tile before the next iteration overwrites it + } + + ParticlesForcesRW[P_ID].acceleration = acceleration / P_density; +} + + +//-------------------------------------------------------------------------------------- +// Optimized Grid + Sort Algorithm +//-------------------------------------------------------------------------------------- + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void ForceCS_Grid( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int P_ID = DTid.x; // Particle ID to operate on + + float2 P_position = ParticlesRO[P_ID].position; + float2 P_velocity = ParticlesRO[P_ID].velocity; + float P_density = ParticlesDensityRO[P_ID].density; + float P_pressure = CalculatePressure(P_density); + + const float h_sq = g_fSmoothlen * g_fSmoothlen; + + float2 acceleration = float2(0, 0); + + // Calculate the acceleration based on neighbors from the 8 adjacent 
cells + current cell + int2 G_XY = (int2)GridCalculateCell( P_position ); + for (int Y = max(G_XY.y - 1, 0) ; Y <= min(G_XY.y + 1, 255) ; Y++) + { + for (int X = max(G_XY.x - 1, 0) ; X <= min(G_XY.x + 1, 255) ; X++) + { + unsigned int G_CELL = GridConstuctKey(uint2(X, Y)); + uint2 G_START_END = GridIndicesRO[G_CELL]; + for (unsigned int N_ID = G_START_END.x ; N_ID < G_START_END.y ; N_ID++) + { + float2 N_position = ParticlesRO[N_ID].position; + + float2 diff = N_position - P_position; + float r_sq = dot(diff, diff); + if (r_sq < h_sq && P_ID != N_ID) + { + float2 N_velocity = ParticlesRO[N_ID].velocity; + float N_density = ParticlesDensityRO[N_ID].density; + float N_pressure = CalculatePressure(N_density); + float r = sqrt(r_sq); + + // Pressure Term + acceleration += CalculateGradPressure(r, P_pressure, N_pressure, N_density, diff); + + // Viscosity Term + acceleration += CalculateLapVelocity(r, P_velocity, N_velocity, N_density); + } + } + } + } + + ParticlesForcesRW[P_ID].acceleration = acceleration / P_density; +} + + +//-------------------------------------------------------------------------------------- +// Integration +//-------------------------------------------------------------------------------------- + +[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)] +void IntegrateCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int P_ID = DTid.x; // Particle ID to operate on + + float2 position = ParticlesRO[P_ID].position; + float2 velocity = ParticlesRO[P_ID].velocity; + float2 acceleration = ParticlesForcesRO[P_ID].acceleration; + + // Apply the forces from the map walls + [unroll] + for (unsigned int i = 0 ; i < 4 ; i++) + { + float dist = dot(float3(position, 1), g_vPlanes[i]); + acceleration += min(dist, 0) * -g_fWallStiffness * g_vPlanes[i].xy; + } + + // Apply gravity + acceleration += g_vGravity.xy; + + // Integrate + velocity += g_fTimeStep * acceleration; + position += 
g_fTimeStep * velocity; + + // Update + ParticlesRW[P_ID].position = position; + ParticlesRW[P_ID].velocity = velocity; +} diff --git a/tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl b/tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl new file mode 100644 index 000000000..d7e24b7bc --- /dev/null +++ b/tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl @@ -0,0 +1,112 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry ParticleVS -profile gs_4_0 -entry ParticleGS -profile ps_4_0 -entry ParticlePS +//-------------------------------------------------------------------------------------- +// File: FluidRender.hlsl +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Particle Rendering +//-------------------------------------------------------------------------------------- + +struct Particle { + float2 position; + float2 velocity; +}; + +struct ParticleDensity { + float density; +}; + +StructuredBuffer ParticlesRO : register( t0 ); +StructuredBuffer ParticleDensityRO : register( t1 ); + +cbuffer cbRenderConstants : register( b0 ) +{ + matrix g_mViewProjection; + float g_fParticleSize; +}; + +struct VSParticleOut +{ + float2 position : POSITION; + float4 color : COLOR; +}; + +struct GSParticleOut +{ + float4 position : SV_Position; + float4 color : COLOR; + float2 texcoord : TEXCOORD; +}; + + +//-------------------------------------------------------------------------------------- +// Visualization Helper +//-------------------------------------------------------------------------------------- + +static const float4 Rainbow[5] = { + float4(1, 0, 0, 1), // red + float4(1, 1, 0, 1), // yellow + float4(0, 1, 0, 1), // green + float4(0, 1, 1, 1), // teal + float4(0, 0, 1, 1), // blue +}; + +float4
VisualizeNumber(float n) +{ + return lerp( Rainbow[ floor(n * 4.0f) ], Rainbow[ ceil(n * 4.0f) ], frac(n * 4.0f) ); +} + +float4 VisualizeNumber(float n, float lower, float upper) +{ + return VisualizeNumber( saturate( (n - lower) / (upper - lower) ) ); +} + + +//-------------------------------------------------------------------------------------- +// Vertex Shader +//-------------------------------------------------------------------------------------- + +VSParticleOut ParticleVS(uint ID : SV_VertexID) +{ + VSParticleOut Out = (VSParticleOut)0; + Out.position = ParticlesRO[ID].position; + Out.color = VisualizeNumber(ParticleDensityRO[ID].density, 1000.0f, 2000.0f); + return Out; +} + + +//-------------------------------------------------------------------------------------- +// Particle Geometry Shader +//-------------------------------------------------------------------------------------- + +static const float2 g_positions[4] = { float2(-1, 1), float2(1, 1), float2(-1, -1), float2(1, -1) }; +static const float2 g_texcoords[4] = { float2(0, 1), float2(1, 1), float2(0, 0), float2(1, 0) }; + +[maxvertexcount(4)] +void ParticleGS(point VSParticleOut In[1], inout TriangleStream SpriteStream) +{ + [unroll] + for (int i = 0; i < 4; i++) + { + GSParticleOut Out = (GSParticleOut)0; + float4 position = float4(In[0].position, 0, 1) + g_fParticleSize * float4(g_positions[i], 0, 0); + Out.position = mul(position, g_mViewProjection); + Out.color = In[0].color; + Out.texcoord = g_texcoords[i]; + SpriteStream.Append(Out); + } + SpriteStream.RestartStrip(); +} + + +//-------------------------------------------------------------------------------------- +// Pixel Shader +//-------------------------------------------------------------------------------------- + +float4 ParticlePS(GSParticleOut In) : SV_Target +{ + return In.color; +} diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl 
b/tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl new file mode 100644 index 000000000..87bad46ed --- /dev/null +++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl @@ -0,0 +1,64 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain +//-------------------------------------------------------------------------------------- +// File: BrightPassAndHorizFilterCS.hlsl +// +// The CS for bright pass and horizontal blur, used in CS path of +// HDRToneMappingCS11 sample +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- +static const float MIDDLE_GRAY = 0.72f; +static const float LUM_WHITE = 1.5f; +static const float BRIGHT_THRESHOLD = 0.5f; + +Texture2D Input : register( t0 ); +StructuredBuffer lum : register( t1 ); +RWStructuredBuffer Result : register( u0 ); + +cbuffer cb0 +{ + float4 g_avSampleWeights[15]; + uint g_outputwidth; + float g_inverse; + int2 g_inputsize; +} + +#define kernelhalf 7 +#define groupthreads 128 +groupshared float4 temp[groupthreads]; + +[numthreads( groupthreads, 1, 1 )] +void CSMain( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex ) +{ + int2 coord = int2( GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x, Gid.y ); + coord = coord.xy * 8 + int2(4, 3); + coord = clamp( coord, int2(0, 0), int2(g_inputsize.x-1, g_inputsize.y-1) ); + float4 vColor = Input.Load( int3(coord, 0) ); + + float fLum = lum[0]*g_inverse; + + // Bright pass and tone mapping + vColor = max( 0.0f, vColor - BRIGHT_THRESHOLD ); + vColor *= MIDDLE_GRAY / (fLum + 0.001f); + vColor *= (1.0f + vColor/LUM_WHITE); + vColor /= (1.0f + vColor); + + temp[GI] = vColor; + + GroupMemoryBarrierWithGroupSync(); + + // Horizontal blur + if ( GI >= kernelhalf && + GI < (groupthreads - kernelhalf) && + ( (Gid.x * (groupthreads - 2 * kernelhalf) + GI - 
kernelhalf) < g_outputwidth) ) + { + float4 vOut = 0; + + [unroll] + for ( int i = -kernelhalf; i <= kernelhalf; ++i ) + vOut += temp[GI + i] * g_avSampleWeights[i + kernelhalf]; + + Result[GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x + Gid.y * g_outputwidth] = float4(vOut.rgb, 1.0f); + } +} diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl new file mode 100644 index 000000000..d2d9611ce --- /dev/null +++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl @@ -0,0 +1,29 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PSDump +//-------------------------------------------------------------------------------------- +// File: DumpToTexture.hlsl +// +// The PS for converting CS output buffer to a texture, used in CS path of +// HDRToneMappingCS11 sample +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- +StructuredBuffer buffer : register( t0 ); + +struct QuadVS_Output +{ + float4 Pos : SV_POSITION; + float2 Tex : TEXCOORD0; +}; + +cbuffer cbPS : register( b0 ) +{ + uint4 g_param; +}; + +float4 PSDump( QuadVS_Output Input ) : SV_TARGET +{ + // To calculate the buffer offset, it is natural to use the screen space coordinates, + // Input.Pos is the screen space coordinates of the pixel being written + return buffer[ (Input.Pos.x - 0.5) + (Input.Pos.y - 0.5) * g_param.x ]; +} diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl new file mode 100644 index 000000000..09c91669a --- /dev/null +++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl @@ -0,0 +1,73 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. 
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSVerticalFilter -entry CSHorizFilter +//-------------------------------------------------------------------------------------- +// File: FilterCS.hlsl +// +// The CSs for doing vertical and horizontal blur, used in CS path of +// HDRToneMappingCS11 sample +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- +StructuredBuffer InputBuf : register( t0 ); +Texture2D InputTex : register( t1 ); +RWStructuredBuffer Result : register( u0 ); + +cbuffer cb0 +{ + float4 g_avSampleWeights[15]; + int2 g_outputsize; + int2 g_inputsize; +} + +#define kernelhalf 7 +#define groupthreads 128 +groupshared float4 temp[groupthreads]; + +[numthreads( groupthreads, 1, 1 )] +void CSVerticalFilter( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex ) +{ + int offsety = GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.y; + offsety = clamp( offsety, 0, g_inputsize.y-1 ); + int offset = Gid.x + offsety * g_inputsize.x; + temp[GI] = InputBuf[offset]; + + GroupMemoryBarrierWithGroupSync(); + + // Vertical blur + if ( GI >= kernelhalf && + GI < (groupthreads - kernelhalf) && + ( (GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.y) < g_outputsize.y) ) + { + float4 vOut = 0; + + [unroll] + for ( int i = -kernelhalf; i <= kernelhalf; ++i ) + vOut += temp[GI + i] * g_avSampleWeights[i + kernelhalf]; + + Result[Gid.x + (GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.y) * g_outputsize.x] = float4(vOut.rgb, 1.0f); + } +} + +[numthreads( groupthreads, 1, 1 )] +void CSHorizFilter( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex ) +{ + int2 coord = int2( GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x, Gid.y ); + coord = clamp( coord, int2(0, 0), int2(g_inputsize.x-1, g_inputsize.y-1) ); + temp[GI] = InputTex.Load( int3(coord, 0) ); + + GroupMemoryBarrierWithGroupSync(); + + // Horizontal blur + if ( GI >= 
kernelhalf && + GI < (groupthreads - kernelhalf) && + ( (Gid.x * (groupthreads - 2 * kernelhalf) + GI - kernelhalf) < g_outputsize.x) ) + { + float4 vOut = 0; + + [unroll] + for ( int i = -kernelhalf; i <= kernelhalf; ++i ) + vOut += temp[GI + i] * g_avSampleWeights[i + kernelhalf]; + + Result[GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x + Gid.y * g_outputsize.x] = float4(vOut.rgb, 1.0f); + } +} diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl new file mode 100644 index 000000000..a4673c237 --- /dev/null +++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl @@ -0,0 +1,79 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry QuadVS -profile ps_4_0 -entry PSFinalPass -entry PSFinalPassForCPUReduction +//-------------------------------------------------------------------------------------- +// File: FinalPass.hlsl +// +// The PSs for doing tone-mapping based on the input luminance, used in CS path of +// HDRToneMappingCS11 sample +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- +struct QuadVS_Input +{ + float4 Pos : POSITION; + float2 Tex : TEXCOORD0; +}; + +struct QuadVS_Output +{ + float4 Pos : SV_POSITION; + float2 Tex : TEXCOORD0; +}; + +QuadVS_Output QuadVS( QuadVS_Input Input ) +{ + QuadVS_Output Output; + Output.Pos = Input.Pos; + Output.Tex = Input.Tex; + return Output; +} + +Texture2D tex : register( t0 ); +StructuredBuffer lum : register( t1 ); +Texture2D bloom : register( t2 ); + +SamplerState PointSampler : register (s0); +SamplerState LinearSampler : register (s1); + + +static const float MIDDLE_GRAY = 0.72f; +static const float LUM_WHITE = 1.5f; + +cbuffer cbPS : register( b0 ) +{ + float4 g_param; +}; + +float4 PSFinalPass( QuadVS_Output Input ) : SV_TARGET +{ + float4 vColor = tex.Sample( PointSampler, Input.Tex ); + float fLum = lum[0]*g_param.x; + float3 vBloom = bloom.Sample( LinearSampler, Input.Tex ); + + // Tone mapping + vColor.rgb *= MIDDLE_GRAY / (fLum + 0.001f); + vColor.rgb *= (1.0f + vColor/LUM_WHITE); + vColor.rgb /= (1.0f + vColor); + + vColor.rgb += 0.6f * vBloom; + vColor.a = 1.0f; + + return vColor; +} + +float4 PSFinalPassForCPUReduction( QuadVS_Output Input ) : SV_TARGET +{ + float4 vColor = tex.Sample( PointSampler, Input.Tex ); + float fLum = g_param.x; + float3 vBloom = bloom.Sample( LinearSampler, Input.Tex ); + + // Tone mapping + vColor.rgb *= MIDDLE_GRAY / (fLum + 0.001f); + vColor.rgb *= (1.0f + vColor/LUM_WHITE); + vColor.rgb /= (1.0f + vColor); + + vColor.rgb += 0.6f * vBloom; + vColor.a = 1.0f; + + return vColor; +} diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl new file mode 100644 index 000000000..2b18cf0a1 --- /dev/null +++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl @@ -0,0 +1,129 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. 
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry DownScale2x2_Lum -entry DownScale3x3 -entry FinalPass -entry DownScale3x3_BrightPass -entry Bloom +//-------------------------------------------------------------------------------------- +// File: PSApproach.hlsl +// +// The PSs for doing post-processing, used in PS path of +// HDRToneMappingCS11 sample +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- +static const float4 LUM_VECTOR = float4(.299, .587, .114, 0); +static const float MIDDLE_GRAY = 0.72f; +static const float LUM_WHITE = 1.5f; +static const float BRIGHT_THRESHOLD = 0.5f; + +SamplerState PointSampler : register (s0); +SamplerState LinearSampler : register (s1); + +struct QuadVS_Output +{ + float4 Pos : SV_POSITION; + float2 Tex : TEXCOORD0; +}; + +Texture2D s0 : register(t0); +Texture2D s1 : register(t1); +Texture2D s2 : register(t2); + +float4 DownScale2x2_Lum ( QuadVS_Output Input ) : SV_TARGET +{ + float4 vColor = 0.0f; + float fAvg = 0.0f; + + for( int y = -1; y < 1; y++ ) + { + for( int x = -1; x < 1; x++ ) + { + // Compute the sum of color values + vColor = s0.Sample( PointSampler, Input.Tex, int2(x,y) ); + + fAvg += dot( vColor, LUM_VECTOR ); + } + } + + fAvg /= 4; + + return float4(fAvg, fAvg, fAvg, 1.0f); +} + +float4 DownScale3x3( QuadVS_Output Input ) : SV_TARGET +{ + float fAvg = 0.0f; + float4 vColor; + + for( int y = -1; y <= 1; y++ ) + { + for( int x = -1; x <= 1; x++ ) + { + // Compute the sum of color values + vColor = s0.Sample( PointSampler, Input.Tex, int2(x,y) ); + + fAvg += vColor.r; + } + } + + // Divide the sum to complete the average + fAvg /= 9; + + return float4(fAvg, fAvg, fAvg, 1.0f); +} + +float4 FinalPass( QuadVS_Output Input ) : SV_TARGET +{ + //float4 vColor = 0; + float4 vColor = s0.Sample( PointSampler, Input.Tex ); + float4 vLum = s1.Sample( PointSampler, float2(0,0) ); + float3 vBloom = 
s2.Sample( LinearSampler, Input.Tex ); + + // Tone mapping + vColor.rgb *= MIDDLE_GRAY / (vLum.r + 0.001f); + vColor.rgb *= (1.0f + vColor/LUM_WHITE); + vColor.rgb /= (1.0f + vColor); + + vColor.rgb += 0.6f * vBloom; + vColor.a = 1.0f; + + return vColor; +} + +float4 DownScale3x3_BrightPass( QuadVS_Output Input ) : SV_TARGET +{ + float3 vColor = 0.0f; + float4 vLum = s1.Sample( PointSampler, float2(0, 0) ); + float fLum = vLum.r; + + vColor = s0.Sample( PointSampler, Input.Tex ).rgb; + + // Bright pass and tone mapping + vColor = max( 0.0f, vColor - BRIGHT_THRESHOLD ); + vColor *= MIDDLE_GRAY / (fLum + 0.001f); + vColor *= (1.0f + vColor/LUM_WHITE); + vColor /= (1.0f + vColor); + + return float4(vColor, 1.0f); +} + +cbuffer cb0 +{ + float2 g_avSampleOffsets[15]; + float4 g_avSampleWeights[15]; +} + +float4 Bloom( QuadVS_Output Input ) : SV_TARGET +{ + float4 vSample = 0.0f; + float4 vColor = 0.0f; + float2 vSamplePosition; + + for( int iSample = 0; iSample < 15; iSample++ ) + { + // Sample from adjacent points + vSamplePosition = Input.Tex + g_avSampleOffsets[iSample]; + vColor = s0.Sample( PointSampler, vSamplePosition); + + vSample += g_avSampleWeights[iSample]*vColor; + } + + return vSample; +} diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl new file mode 100644 index 000000000..027838743 --- /dev/null +++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl @@ -0,0 +1,72 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain +//----------------------------------------------------------------------------- +// File: ReduceTo1DCS.hlsl +// +// Desc: Reduce an input Texture2D to a buffer +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//----------------------------------------------------------------------------- +Texture2D Input : register( t0 ); +RWStructuredBuffer Result : register( u0 ); + +cbuffer cbCS : register( b0 ) +{ + uint4 g_param; // (g_param.x, g_param.y) is the x and y dimensions of the Dispatch call + // (g_param.z, g_param.w) is the size of the above Input Texture2D +}; + +//#define CS_FULL_PIXEL_REDUCITON // Defining this or not must be the same as in HDRToneMappingCS11.cpp + +#define blocksize 8 +#define blocksizeY 8 +#define groupthreads (blocksize*blocksizeY) +groupshared float accum[groupthreads]; + +static const float4 LUM_VECTOR = float4(.299, .587, .114, 0); + +[numthreads(blocksize,blocksizeY,1)] +void CSMain( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + float4 s = +#ifdef CS_FULL_PIXEL_REDUCITON + Input.Load( uint3(DTid.xy , 0) )+ + Input.Load( uint3(DTid.xy + uint2(blocksize*g_param.x, 0), 0) ) + + Input.Load( uint3(DTid.xy + uint2(0, blocksizeY*g_param.y), 0) ) + + Input.Load( uint3(DTid.xy + uint2(blocksize*g_param.x, blocksizeY*g_param.y), 0) ); +#else + Input.Load( uint3((float)DTid.x/81.0f*g_param.z, (float)DTid.y/81.0f*g_param.w, 0) ); +#endif + + accum[GI] = dot( s, LUM_VECTOR ); + + // Parallel reduction algorithm follows + GroupMemoryBarrierWithGroupSync(); + if ( GI < 32 ) + accum[GI] += accum[32+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 16 ) + accum[GI] += accum[16+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 8 ) + accum[GI] += accum[8+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 4 ) + accum[GI] += accum[4+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 2 ) + accum[GI] += accum[2+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 1 ) + accum[GI] += accum[1+GI]; + + if ( GI == 0 ) + { + Result[Gid.y*g_param.x+Gid.x] = accum[0]; + } +} diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl 
b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl new file mode 100644 index 000000000..cf506283e --- /dev/null +++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl @@ -0,0 +1,63 @@ +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain +//----------------------------------------------------------------------------- +// File: ReduceToSingleCS.hlsl +// +// Desc: Reduce an input buffer by a factor of groupthreads +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//----------------------------------------------------------------------------- + +StructuredBuffer Input : register( t0 ); +RWStructuredBuffer Result : register( u0 ); + +cbuffer cbCS : register( b0 ) +{ + uint4 g_param; // g_param.x is the actual elements contained in Input + // g_param.y is the x dimension of the Dispatch call +}; + +#define groupthreads 128 +groupshared float accum[groupthreads]; + +[numthreads(groupthreads,1,1)] +void CSMain( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + if ( DTid.x < g_param.x ) + accum[GI] = Input[DTid.x]; + else + accum[GI] = 0; + + // Parallel reduction algorithm follows + GroupMemoryBarrierWithGroupSync(); + if ( GI < 64 ) + accum[GI] += accum[64+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 32 ) + accum[GI] += accum[32+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 16 ) + accum[GI] += accum[16+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 8 ) + accum[GI] += accum[8+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 4 ) + accum[GI] += accum[4+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 2 ) + accum[GI] += accum[2+GI]; + + GroupMemoryBarrierWithGroupSync(); + if ( GI < 1 ) + accum[GI] += accum[1+GI]; + + if ( GI == 0 ) + { + Result[Gid.x] = accum[0]; + } +} diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl new file mode 100644 
index 000000000..2728665e2 --- /dev/null +++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl @@ -0,0 +1,44 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry SkyboxVS -profile ps_4_0 -entry SkyboxPS +//----------------------------------------------------------------------------- +// File: SkyBox11.hlsl +// +// Desc: +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//----------------------------------------------------------------------------- + +cbuffer cbPerObject : register( b0 ) +{ + row_major matrix g_mWorldViewProjection : packoffset( c0 ); +} + +TextureCube g_EnvironmentTexture : register( t0 ); +SamplerState g_sam : register( s0 ); + +struct SkyboxVS_Input +{ + float4 Pos : POSITION; +}; + +struct SkyboxVS_Output +{ + float4 Pos : SV_POSITION; + float3 Tex : TEXCOORD0; +}; + +SkyboxVS_Output SkyboxVS( SkyboxVS_Input Input ) +{ + SkyboxVS_Output Output; + + Output.Pos = Input.Pos; + Output.Tex = normalize( mul(Input.Pos, g_mWorldViewProjection) ); + + return Output; +} + +float4 SkyboxPS( SkyboxVS_Output Input ) : SV_TARGET +{ + float4 color = g_EnvironmentTexture.Sample( g_sam, Input.Tex ); + return color; +} diff --git a/tests/hlsl/dxsdk/InstancingFX11/Instancing.fx b/tests/hlsl/dxsdk/InstancingFX11/Instancing.fx new file mode 100644 index 000000000..3c8d45078 --- /dev/null +++ b/tests/hlsl/dxsdk/InstancingFX11/Instancing.fx @@ -0,0 +1,591 @@ +//TEST_IGNORE_FILE: +//-------------------------------------------------------------------------------------- +// File: Instancing.fx +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Input and output structures +//-------------------------------------------------------------------------------------- +struct VSInstIn +{ + float3 pos : POSITION; + float3 norm : NORMAL; + float2 tex : TEXTURE0; + row_major float4x4 mTransform : mTransform; +}; + +struct VSSceneIn +{ + float3 pos : POSITION; + float3 norm : NORMAL; + float2 tex : TEXTURE0; +}; + +struct VSGrassIn +{ + float3 pos : POSITION; + float3 norm : NORMAL; + float2 tex : TEXTURE0; + row_major float4x4 mTransform : mTransform; + uint VertexID : SV_VertexID; +}; + +struct VSGrassOut +{ + float3 pos : POSITION; + float3 norm : NORMAL; + float2 tex : TEXTURE0; + uint VertexID : VERTID; +}; + +struct VSQuadIn +{ + float3 pos : POSITION; + float2 tex : TEXTURE0; + row_major float4x4 mTransform : mTransform; + float fOcc : fOcc; + uint InstanceId : SV_InstanceID; +}; + +struct PSSceneIn +{ + float4 pos : SV_Position; + float2 tex : TEXTURE0; + float4 color : COLOR0; +}; + +struct PSQuadIn +{ + float4 pos : SV_Position; + float3 tex : TEXTURE0; + float4 color : COLOR0; +}; + +//-------------------------------------------------------------------------------------- +// Constant buffers +//-------------------------------------------------------------------------------------- +cbuffer crarely +{ + float4x4 g_mTreeMatrices[50]; + uint g_iNumTrees; +}; + +cbuffer ceveryframe +{ + float4x4 g_mWorldViewProj; + float4x4 g_mWorldView; +}; + +cbuffer cmultipleperframe +{ + float g_GrassWidth; + float g_GrassHeight; + uint g_iGrassCoverage; +}; + +cbuffer cusercontrolled +{ + float g_GrassMessiness; +}; + +struct light_struct +{ + float4 direction; + float4 color; +}; + +cbuffer cimmutable +{ + light_struct g_lights[4] = { + { float4(0.620275, 0.683659, 0.384537, 1), float4(0.75, 0.599, 0.405, 1) }, //sun + { float4(0.063288, 
-0.987444, 0.144735, 1), float4(0.192, 0.273, 0.275, 1) }, //bottom + { float4(0.23007, 0.785579, -0.574422, 1), float4(0.300, 0.292, 0.223, 1) }, //highlight + { float4(-0.620275, -0.683659, -0.384537, 1), float4(0.0, 0.0, 0.1, 1) } //blue rim-light + }; + + float4 g_ambient = float4(0.4945,0.465,0.5,1); + + float g_occDimHeight = 2400.0; //scalar that tells us how much to darken the tree near the top +}; + +cbuffer cgrassblade +{ + float3 g_positions[6] = + { + float3( -1, 0, 0 ), + float3( -1, 2, 0 ), + float3( 1, 0, 0 ), + float3( 1, 2, 0 ), + + float3( -1, 0, 0 ), + float3( -1, 2, 0 ), + }; + float2 g_texcoords[6] = + { + float2(0,1), + float2(0,0), + float2(1,1), + float2(1,0), + + float2(0,1), + float2(0,0), + }; +}; + +//-------------------------------------------------------------------------------------- +// Textures and Samplers +//-------------------------------------------------------------------------------------- +Texture2D g_txDiffuse; +Texture2DArray g_tx2dArray; +SamplerState g_samLinear +{ + Filter = ANISOTROPIC; + AddressU = Wrap; + AddressV = Wrap; +}; + +Texture1D g_txRandom; +SamplerState g_samPoint +{ + Filter = MIN_MAG_MIP_POINT; + AddressU = Wrap; + AddressV = Wrap; +}; + +//-------------------------------------------------------------------------------------- +// State structures +//-------------------------------------------------------------------------------------- +BlendState QuadAlphaBlendState +{ + AlphaToCoverageEnable = TRUE; + RenderTargetWriteMask[0] = 0x0F; +}; + +RasterizerState EnableMSAA +{ + CullMode = BACK; + MultisampleEnable = TRUE; +}; + +DepthStencilState DisableDepthTestWrite +{ + DepthEnable = FALSE; + DepthWriteMask = ZERO; +}; + +DepthStencilState EnableDepthTestWrite +{ + DepthEnable = TRUE; + DepthWriteMask = ALL; +}; + +BlendState NoBlending +{ + AlphaToCoverageEnable = FALSE; + BlendEnable[0] = FALSE; +}; + +//-------------------------------------------------------------------------------------- +// Sky vertex 
shader
+//--------------------------------------------------------------------------------------
+PSSceneIn VSSkymain(VSSceneIn input)
+{
+    PSSceneIn output;
+
+    // Project the vertex with g_mWorldViewProj.  The texture coordinate is forwarded
+    // untouched and the colour is a constant opaque white (no lighting is evaluated).
+    output.pos   = mul(float4(input.pos, 1), g_mWorldViewProj);
+    output.tex   = input.tex;
+    output.color = float4(1,1,1,1);
+
+    return output;
+}
+
+//--------------------------------------------------------------------------------------
+// CalcLighting helper function.  Sums the clamped N.L contribution of the 4 entries of
+// g_lights, adds g_ambient, and blends in a fixed tint in proportion to depth.
+//   norm  - normal used for the N.L terms
+//   depth - depth value; depth / 10000 is the attenuation factor
+//--------------------------------------------------------------------------------------
+float4 CalcLighting( float3 norm, float depth )
+{
+    // accumulate the 4 directional lights
+    float4 litColor = float4(0,0,0,0);
+    [unroll] for( int iLight=0; iLight<4; iLight++ )
+    {
+        float fNDotL = saturate( dot(g_lights[iLight].direction,norm) );
+        litColor += fNDotL*g_lights[iLight].color;
+    }
+
+    // depth attenuation factor and the tint that is added as it grows
+    float fAtten = depth / 10000.0;
+    float4 attenColor = float4(0.15, 0.2, 0.3, 0);
+
+    // lit + ambient is scaled down by 23% of the factor, then the tint is added
+    return (1-fAtten*0.23)*(litColor + g_ambient) + attenColor*fAtten;
+}
+
+//--------------------------------------------------------------------------------------
+// Instancing vertex shader.  Positions the vertices based upon the matrix stored
+// in the second vertex stream.
+//--------------------------------------------------------------------------------------
+PSSceneIn VSInstmain(VSInstIn input)
+{
+    PSSceneIn output;
+
+    // Apply the per-instance matrix first; the result feeds both the view-space
+    // transform (only its z is needed, as the depth for lighting) and the projection.
+    float4 vInstPos = mul(float4(input.pos, 1), input.mTransform);
+    float fViewZ = mul(vInstPos, g_mWorldView ).z;
+    output.pos = mul(vInstPos, g_mWorldViewProj);
+
+    // Texture coordinate passes straight through
+    output.tex = input.tex;
+
+    // Light with the normal transformed by the upper 3x3 of the instance matrix
+    float3 vInstNorm = mul(input.norm,(float3x3)input.mTransform);
+    float4 vLit = CalcLighting( vInstNorm, fViewZ );
+
+    // Dim the color by how far up the tree we are (untransformed y against
+    // g_occDimHeight).  This is a nice way to fake occlusion of the branches
+    // by the leaves.
+    float fOccDim = 1.0f - saturate(input.pos.y/g_occDimHeight);
+    output.color = vLit * fOccDim;
+
+    return output;
+}
+
+//--------------------------------------------------------------------------------------
+// Quad (leaf) vertex shader.  Instances the quad over multiple leaf positions and
+// multiple trees.  This demonstrates how to do double instancing.
+//--------------------------------------------------------------------------------------
+PSQuadIn VSQuadmain(VSQuadIn input)
+{
+    PSQuadIn output;
+
+    // InstanceId / g_iNumTrees is the leaf number; the leaf number modulo 3 is
+    // stored in tex.z as the texture selector.
+    uint iLeaf = input.InstanceId/g_iNumTrees;
+    output.tex = float3(input.tex, float(iLeaf%3) );
+
+    // The remainder of the same division picks the tree matrix
+    int iTree = input.InstanceId - (input.InstanceId/g_iNumTrees)*g_iNumTrees;
+
+    // Leaf-instance matrix first, then the matrix of the selected tree
+    float4 vLeafPos = mul( float4(input.pos, 1), input.mTransform );
+    float4 vTreePos = mul(vLeafPos, g_mTreeMatrices[iTree] );
+    output.pos = mul(vTreePos, g_mWorldViewProj);
+
+    // rgb carries the occlusion value, alpha carries view-space z
+    // (GSQuadmain reads color.a as the depth argument of CalcLighting)
+    float fViewZ = mul(vTreePos, g_mWorldView ).z;
+    output.color = float4(input.fOcc,input.fOcc,input.fOcc,fViewZ);
+
+    return output;
+}
+
+//--------------------------------------------------------------------------------------
+// Grass vertex shader.  Basically a passthrough except for instancing the island base
+// mesh.
+//--------------------------------------------------------------------------------------
+VSGrassOut VSGrassmain(VSGrassIn input)
+{
+    VSGrassOut output;
+
+    // Position and normal go through the per-instance matrix (the normal through its
+    // upper 3x3 only); texture coordinate and vertex id are copied unchanged.
+    output.VertexID = input.VertexID;
+    output.tex = input.tex;
+    output.norm = mul(input.norm, (float3x3)input.mTransform);
+    output.pos = mul(float4(input.pos, 1), input.mTransform);
+
+    return output;
+}
+
+//--------------------------------------------------------------------------------------
+// Quad (leaf) GS.  Calculates the normal and lighting for the leaf.
+//--------------------------------------------------------------------------------------
+[maxvertexcount(3)]
+void GSQuadmain(triangle PSQuadIn input[3], inout TriangleStream<PSQuadIn> QuadStream)
+{
+    // Face normal from the two edges leaving vertex 0.  Only xyz takes part in the
+    // cross product, so the swizzle is written out instead of relying on an implicit
+    // float4 -> float3 truncation of the arguments.
+    float3 edgeA = input[1].pos.xyz - input[0].pos.xyz;
+    float3 edgeB = input[2].pos.xyz - input[0].pos.xyz;
+    float3 faceNormal = normalize( cross(edgeA, edgeB) );
+
+    // One lighting value for the whole triangle: color.a of vertex 0 is the depth
+    // argument, and the lit result is modulated by that vertex's colour.
+    float4 faceColor = CalcLighting( faceNormal, input[0].color.a ) * input[0].color;
+
+    // Make sure we always have an alpha of 1
+    faceColor.a = 1.0;
+
+    // Emit the triangle again with the shared colour
+    PSQuadIn output;
+    for(int i=0; i<3; i++)
+    {
+        output.pos = input[i].pos;
+        output.color = faceColor;
+        output.tex = input[i].tex;
+        QuadStream.Append(output);
+    }
+    QuadStream.RestartStrip();
+}
+
+//--------------------------------------------------------------------------------------
+// RandomDir helper.  Samples a random dir out of our 1d random texture.  In this case
+// we use a texture because the offset could be anywhere.  If we were sampling linearly
+// then we would probably just use a buffer and load from that.
+//--------------------------------------------------------------------------------------
+float3 RandomDir(float fOffset)
+{
+    // The offset is scaled by 1/300 to form the 1D texture coordinate; only the xyz
+    // channels of the point-sampled texel are returned.
+    float tCoord = (fOffset) / 300.0;
+    return g_txRandom.SampleLevel( g_samPoint, tCoord, 0 ).xyz;
+}
+
+//--------------------------------------------------------------------------------------
+// Helper to determine if a point is within a triangle.
+// Returns true when the cross products formed at edges AB, BC and CA with the point
+// pairwise have a positive dot product.
+//--------------------------------------------------------------------------------------
+bool IsInTriangle( float3 P, float3 A, float3 B, float3 C )
+{
+    float3 crossA = cross( B-A, P-A );
+    float3 crossB = cross( C-B, P-B );
+    float3 crossC = cross( A-C, P-C );
+
+    return dot( crossA, crossB ) > 0 &&
+           dot( crossB, crossC ) > 0;
+}
+
+//--------------------------------------------------------------------------------------
+// Gets a random orientation matrix based upon the RandomDir function.
+// Rows of the result are: tangent, Norm, bitangent, Pos (w = 0, 0, 0, 1).
+//--------------------------------------------------------------------------------------
+float4x4 GetRandomOrientation( float3 Pos, float3 Norm, float fRandOffset )
+{
+    // Start from a random direction, then rebuild tangent and bitangent from cross
+    // products with Norm
+    float3 Tangent = RandomDir(fRandOffset);
+    float3 Bitangent = normalize( cross( Tangent, Norm ) );
+    Tangent = normalize( cross( Bitangent, Norm ) );
+
+    float4x4 matWorld = { float4( Tangent, 0 ),
+                          float4( Norm, 0 ),
+                          float4( Bitangent, 0 ),
+                          float4( Pos, 1 ) };
+    return matWorld;
+}
+
+//--------------------------------------------------------------------------------------
+// Generates an actual grass blade: 6 vertices taken from g_positions / g_texcoords,
+// scaled by g_GrassWidth / g_GrassHeight and placed at midPoint.
+//   midPoint   - position, normal and VertexID used to place and light the blade
+//   iGrassTex  - value written to tex.z of every emitted vertex
+//--------------------------------------------------------------------------------------
+void OutputGrassBlade( VSGrassOut midPoint, inout TriangleStream<PSQuadIn> GrassStream, int iGrassTex )
+{
+    // Blade frame at the mid point; VertexID serves as the random-texture offset
+    float4x4 mWorld = GetRandomOrientation( midPoint.pos.xyz, midPoint.norm, (float)midPoint.VertexID );
+
+    // All vertices of the blade share one lighting value, computed from the mid point
+    // normal and the view-space z of the mid point
+    float fViewZ = mul( midPoint.pos, g_mWorldView ).z;
+    float4 bladeColor = CalcLighting( midPoint.norm, fViewZ );
+
+    PSQuadIn output;
+    for(int v=0; v<6; v++)
+    {
+        float3 pos = g_positions[v];
+        pos.x *= g_GrassWidth;
+        pos.y *= g_GrassHeight;
+
+        output.pos = mul( mul( float4(pos,1), mWorld ), g_mWorldViewProj );
+        output.tex = float3( g_texcoords[v], iGrassTex );
+        output.color = bladeColor;
+
+        GrassStream.Append( output );
+    }
+
+    GrassStream.RestartStrip();
+}
+
+//--------------------------------------------------------------------------------------
+// Midpoint of the three vertices A,B,C.
+// pos, norm and tex are averaged; VertexID is the sum of the three ids.
+//--------------------------------------------------------------------------------------
+VSGrassOut CalcMidPoint( VSGrassOut A, VSGrassOut B, VSGrassOut C )
+{
+    VSGrassOut MidPoint;
+
+    MidPoint.pos = (A.pos + B.pos + C.pos)/3.0f;
+    MidPoint.norm = (A.norm + B.norm + C.norm)/3.0f;
+    MidPoint.tex = (A.tex + B.tex + C.tex)/3.0f;
+    MidPoint.VertexID = A.VertexID + B.VertexID + C.VertexID;
+
+    return MidPoint;
+}
+
+//--------------------------------------------------------------------------------------
+// The actual grass geometry shader.  This generates grass blades based upon an input
+// mesh (the tops of the islands) and a coverage texture.  Each of the textures channels
+// determines how much of each of the 4 types of grass to place at a particular spot.
+//--------------------------------------------------------------------------------------
+[maxvertexcount(90)]
+void GSGrassmain(triangle VSGrassOut input[3], inout TriangleStream GrassStream )
+{
+    VSGrassOut MidPoint = CalcMidPoint( input[0], input[1], input[2] );
+
+    float4 CoverageMask = g_tx2dArray.SampleLevel( g_samPoint, float3(MidPoint.tex,4), 0 );
+    float cm[4];
+    cm[0] = CoverageMask.r;
+    cm[1] = CoverageMask.g;
+    cm[2] = CoverageMask.b;
+    cm[3] = CoverageMask.a;
+
+    for(int g=0; g<4; g++)
+    {
+        float MaxBlades = float(g_iGrassCoverage)*cm[g];
+        for(float i=0; i= vLightSpaceDepth ) ?
1.0f : 0.0f;
+    return dot( vBilinearWeights, vShadowTests );
+}
+
+//--------------------------------------------------------------------------------------
+// Diffuse lighting calculation, with angle and distance falloff.
+// Returns the colour of light iLight scaled by N.L, a distance term and an angle term.
+//--------------------------------------------------------------------------------------
+float4 CalcLightingColor( int iLight, float3 vPosWorld, float3 vPerPixelNormal )
+{
+    float4 vFalloffs = g_LightData[iLight].m_vFalloffs;
+
+    // Vector from the light to the shaded point, its length, and its direction
+    float3 vToPixel = vPosWorld - g_LightData[iLight].m_vLightPos.xyz;
+    float fDist = length( vToPixel );
+    float3 vToPixelDir = vToPixel / fDist;
+
+    // Dist falloff = 0 where fDist == vFalloffs.x, 1 where fDist == vFalloffs.x - vFalloffs.y
+    float fDistFalloff = saturate( ( vFalloffs.x - fDist ) / vFalloffs.y );
+
+    // Angle falloff = 0 where the cosine == vFalloffs.z, 1 where it == vFalloffs.z + vFalloffs.w
+    float fCosAngle = dot( vToPixelDir, g_LightData[iLight].m_vLightDir.xyz );
+    float fAngleFalloff = saturate( ( fCosAngle - vFalloffs.z ) / vFalloffs.w );
+
+    // Diffuse contribution
+    float fNDotL = saturate( -dot( vToPixelDir, vPerPixelNormal ) );
+
+    return g_LightData[iLight].m_vLightColor * fNDotL * fDistFalloff * fAngleFalloff;
+}
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PSMain( PS_INPUT Input ) : SV_TARGET
+{
+    // Manual clip test, so that objects which are behind the mirror
+    // don't show up in the mirror.
+    clip( dot( g_vMirrorPlane.xyz, Input.vPosWorld.xyz ) + g_vMirrorPlane.w );
+
+    // Surface colour: constant 0.5 when the diffuse map is compiled out
+#ifdef NO_DIFFUSE_MAP
+    float4 vDiffuse = 0.5f;
+#else // #ifdef NO_DIFFUSE_MAP
+    float4 vDiffuse = g_txDiffuse.Sample( g_samLinearWrap, Input.vTexcoord );
+#endif // #ifdef NO_DIFFUSE_MAP #else
+
+    // Per-pixel normal: the interpolated normal when normal mapping is compiled out
+#ifdef NO_NORMAL_MAP
+    float3 vPerPixelNormal = Input.vNormal;
+#else // #ifdef NO_NORMAL_MAP
+    float3 vPerPixelNormal = CalcPerPixelNormal( Input.vTexcoord, Input.vNormal, Input.vTangent );
+#endif // #ifdef NO_NORMAL_MAP #else
+
+    // Lighting accumulator starts at the ambient colour (or zero)
+#ifdef NO_AMBIENT
+    float4 vLightSum = 0.0f;
+#else // #ifdef NO_AMBIENT
+    float4 vLightSum = g_vAmbientColor;
+#endif // #ifdef NO_AMBIENT #else
+
+#ifndef NO_DYNAMIC_LIGHTING
+    for ( int iLight = 0; iLight < g_iNumLights; ++iLight )
+    {
+        float4 vContribution = CalcLightingColor( iLight, Input.vPosWorld, vPerPixelNormal );
+#ifndef NO_SHADOW_MAP
+        // Don't bother checking shadow map if the pixel is unlit
+        if ( iLight < g_iNumShadows && any( vContribution.xyz ) )
+        {
+            vContribution *= CalcUnshadowedAmountPCF2x2( iLight, Input.vPosWorld );
+        }
+#endif // #ifndef NO_SHADOW_MAP
+        vLightSum += vContribution;
+    }
+#endif // #ifndef NO_DYNAMIC_LIGHTING
+
+    return vDiffuse * g_vTintColor * g_vObjectColor * vLightSum;
+}
diff --git a/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl b/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl
new file mode 100644
index 000000000..0d8d32ffa
--- /dev/null
+++ b/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl
@@ -0,0 +1,75 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain
+//--------------------------------------------------------------------------------------
+// File: MultithreadedRendering11_VS.hlsl
+//
+// The vertex shader file for the MultithreadedRendering11 sample.
+//
+// Copyright (c) Microsoft Corporation.
All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// Various debug options
+//#define UNCOMPRESSED_VERTEX_DATA            // The sdkmesh file contained uncompressed vertex data
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+    matrix g_mWorld : packoffset( c0 );
+};
+cbuffer cbPerScene : register( b1 )
+{
+    matrix g_mViewProj : packoffset( c0 );
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 vPosition : POSITION;
+    float3 vNormal : NORMAL;
+    float2 vTexcoord : TEXCOORD0;
+    float3 vTangent : TANGENT;
+};
+
+struct VS_OUTPUT
+{
+    float3 vNormal : NORMAL;
+    float3 vTangent : TANGENT;
+    float2 vTexcoord : TEXCOORD0;
+    float4 vPosWorld : TEXCOORD1;
+    float4 vPosition : SV_POSITION;
+};
+
+// We aliased signed vectors as a unsigned format.
+// Need to recover signed values.  The values 1.0 and 2.0
+// are slightly inaccurate here.
+// Each component is doubled; a doubled component that is >= 1 has 2 subtracted.
+float3 R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( in float3 vVec )
+{
+    float3 vDoubled = vVec * 2.0f;
+    return vDoubled >= 1.0f ? ( vDoubled - 2.0f ) : vDoubled;
+}
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+    float3 vNormal = Input.vNormal;
+    float3 vTangent = Input.vTangent;
+
+#ifndef UNCOMPRESSED_VERTEX_DATA
+    // Expand compressed vectors
+    vNormal = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( vNormal );
+    vTangent = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( vTangent );
+#endif // #ifndef UNCOMPRESSED_VERTEX_DATA
+
+    VS_OUTPUT Output;
+
+    // World-space position is kept for the pixel shader and then projected
+    Output.vPosWorld = mul( Input.vPosition, g_mWorld );
+    Output.vPosition = mul( Output.vPosWorld, g_mViewProj );
+
+    // Normal and tangent use only the upper 3x3 of the world matrix
+    Output.vNormal = mul( vNormal, (float3x3)g_mWorld );
+    Output.vTangent = mul( vTangent, (float3x3)g_mWorld );
+    Output.vTexcoord = Input.vTexcoord;
+
+    return Output;
+}
+
diff --git a/tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl b/tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl
new file mode 100644
index 000000000..0a694450c
--- /dev/null
+++ b/tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl
@@ -0,0 +1,103 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//--------------------------------------------------------------------------------------
+// File: NBodyGravityCS11.hlsl
+//
+// Demonstrates how to use Compute Shader to do n-body gravity computation
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+static float softeningSquared = 0.0012500000*0.0012500000;
+static float g_fG = 6.67300e-11f * 10000.0f;
+static float g_fParticleMass = g_fG*10000.0f * 10000.0f;
+
+#define blocksize 128
+groupshared float4 sharedPos[blocksize];
+
+// Body to body interaction, acceleration of the particle at position bi is updated.
+//   ai        - acceleration accumulator of the particle at bi (updated in place)
+//   bj, bi    - positions of the other particle and of this particle (xyz is used)
+//   mass      - mass factor applied to the contribution
+//   particles - multiplier for the contribution; CSMain passes a negative count to
+//               remove the contribution of particles that do not exist
+void bodyBodyInteraction(inout float3 ai, float4 bj, float4 bi, float mass, int particles )
+{
+    float3 r = bj.xyz - bi.xyz;
+
+    // Squared distance plus the softening constant, so the value is never zero
+    float distSqr = dot(r, r) + softeningSquared;
+
+    float invDist = 1.0f / sqrt(distSqr);
+    float invDistCube = invDist * invDist * invDist;
+
+    ai += r * (mass * invDistCube * particles);
+}
+
+cbuffer cbCS : register( b0 )
+{
+    uint4   g_param;    // pcbCS->param[0] = MAX_PARTICLES;
+                        // pcbCS->param[1] = dimx;
+    float4  g_paramf;   // pcbCS->paramf[0] = 0.1f;
+                        // pcbCS->paramf[1] = 1;
+};
+
+struct PosVelo
+{
+    float4 pos;
+    float4 velo;
+};
+
+StructuredBuffer<PosVelo> oldPosVelo;
+RWStructuredBuffer<PosVelo> newPosVelo;
+
+[numthreads(blocksize, 1, 1)]
+void CSMain( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    // Each thread of the CS updates one of the particles
+
+    float4 pos = oldPosVelo[DTid.x].pos;
+    float4 vel = oldPosVelo[DTid.x].velo;
+    float3 accel = 0;
+    float mass = g_fParticleMass;
+
+    // Update current particle using all other particles
+    [loop]
+    for (uint tile = 0; tile < g_param.y; tile++)
+    {
+        // Cache a tile of particles into shared memory to increase IO efficiency
+        sharedPos[GI] = oldPosVelo[tile * blocksize + GI].pos;
+
+        // Wait until every thread of the group has stored its entry
+        GroupMemoryBarrierWithGroupSync();
+
+        [unroll]
+        for (uint counter = 0; counter < blocksize; counter+=8 )
+        {
+            bodyBodyInteraction(accel, sharedPos[counter], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+1], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+2], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+3], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+4], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+5], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+6], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+7], pos, mass, 1);
+        }
+
+        // Wait until every thread is done reading before the tile is overwritten
+        GroupMemoryBarrierWithGroupSync();
+    }
+
+    // g_param.x is the number of our particles, however this number might not be an exact multiple of the tile size.
+    // In such cases, out of bound reads occur in the process above, which means there will be
+    // tooManyParticles "phantom" particles generating false gravity at position (0, 0, 0), so we have to subtract them here.
+    // NOTE, out of bound reads always return 0 in CS
+    const uint tooManyParticles = g_param.y * blocksize - g_param.x;
+    // The count is converted to int before negation so the sign change is explicit
+    bodyBodyInteraction(accel, float4(0, 0, 0, 0), pos, mass, -(int)tooManyParticles);
+
+    // Update the velocity and position of current particle using the acceleration computed above
+    vel.xyz += accel.xyz * g_paramf.x;        //deltaTime;
+    vel.xyz *= g_paramf.y;                    //damping;
+    pos.xyz += vel.xyz * g_paramf.x;          //deltaTime;
+
+    // Threads past the particle count took part in the tile loads but write nothing
+    if ( DTid.x < g_param.x )
+    {
+        newPosVelo[DTid.x].pos = pos;
+        newPosVelo[DTid.x].velo = float4(vel.xyz, length(accel));
+    }
+}
diff --git a/tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl b/tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl
new file mode 100644
index 000000000..ea56e20e9
--- /dev/null
+++ b/tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl
@@ -0,0 +1,128 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSParticleDraw -profile gs_4_0 -entry GSParticleDraw -profile ps_4_0 -entry PSParticleDraw
+//--------------------------------------------------------------------------------------
+// File: ParticleDraw.hlsl
+//
+// Shaders for rendering the particle as point sprite
+//
+// Copyright (c) Microsoft Corporation.
All rights reserved.
+//--------------------------------------------------------------------------------------
+
+struct VSParticleIn
+{
+    float4 color : COLOR;
+    uint id : SV_VERTEXID;
+};
+
+struct VSParticleDrawOut
+{
+    float3 pos : POSITION;
+    float4 color : COLOR;
+};
+
+struct GSParticleDrawOut
+{
+    float2 tex : TEXCOORD0;
+    float4 color : COLOR;
+    float4 pos : SV_POSITION;
+};
+
+struct PSParticleDrawIn
+{
+    float2 tex : TEXCOORD0;
+    float4 color : COLOR;
+};
+
+struct PosVelo
+{
+    float4 pos;
+    float4 velo;
+};
+
+Texture2D g_txDiffuse;
+StructuredBuffer<PosVelo> g_bufPosVelo;
+
+
+SamplerState g_samLinear
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Clamp;
+    AddressV = Clamp;
+};
+
+cbuffer cb0
+{
+    row_major float4x4 g_mWorldViewProj;
+    row_major float4x4 g_mInvView;
+};
+
+cbuffer cb1
+{
+    static float g_fParticleRad = 10.0f;
+};
+
+cbuffer cbImmutable
+{
+    static float3 g_positions[4] =
+    {
+        float3( -1, 1, 0 ),
+        float3( 1, 1, 0 ),
+        float3( -1, -1, 0 ),
+        float3( 1, -1, 0 ),
+    };
+
+    static float2 g_texcoords[4] =
+    {
+        float2(0,0),
+        float2(1,0),
+        float2(0,1),
+        float2(1,1),
+    };
+};
+
+//
+// Vertex shader for drawing the point-sprite particles.
+// The vertex id indexes g_bufPosVelo; velo.w / 9 blends from a fixed red towards
+// the input colour.
+//
+VSParticleDrawOut VSParticleDraw(VSParticleIn input)
+{
+    VSParticleDrawOut output;
+
+    // Only xyz of the stored float4 position is used
+    output.pos = g_bufPosVelo[input.id].pos.xyz;
+
+    float mag = g_bufPosVelo[input.id].velo.w/9;
+    output.color = lerp( float4(1,0.1,0.1,1), input.color, mag );
+
+    return output;
+}
+
+//
+// GS for rendering point sprite particles.  Takes a point and turns it into 2 tris.
+//
+[maxvertexcount(4)]
+void GSParticleDraw(point VSParticleDrawOut input[1], inout TriangleStream<GSParticleDrawOut> SpriteStream)
+{
+    GSParticleDrawOut output;
+
+    //
+    // Emit two new triangles as one 4-vertex strip
+    //
+    for(int i=0; i<4; i++)
+    {
+        // Corner offset scaled by the particle radius, rotated by the upper 3x3 of
+        // g_mInvView, then moved to the particle position
+        float3 corner = g_positions[i] * g_fParticleRad;
+        corner = mul( corner, (float3x3)g_mInvView ) + input[0].pos;
+        output.pos = mul( float4(corner,1.0), g_mWorldViewProj );
+
+        output.color = input[0].color;
+        output.tex = g_texcoords[i];
+        SpriteStream.Append(output);
+    }
+    SpriteStream.RestartStrip();
+}
+
+//
+// PS for drawing particles: sampled texel modulated by the interpolated colour
+//
+float4 PSParticleDraw(PSParticleDrawIn input) : SV_Target
+{
+    return g_txDiffuse.Sample( g_samLinear, input.tex ) * input.color;
+}
\ No newline at end of file
diff --git a/tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl b/tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl
new file mode 100644
index 000000000..dfc98b217
--- /dev/null
+++ b/tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl
@@ -0,0 +1,277 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_5_0 -entry CreatePrefixSum_Pass0_CS -entry CreatePrefixSum_Pass1_CS -entry SortAndRenderCS
+//-----------------------------------------------------------------------------
+// File: OIT_CS.hlsl
+//
+// Desc: Compute shaders used in the Order Independent Transparency sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+// TODO: use structured buffers
+RWBuffer<float> deepBufferDepth : register( u0 );
+RWBuffer<uint> deepBufferColorUINT : register( u1 );
+RWTexture2D<float4> frameBuffer : register( u2 );
+RWBuffer<uint> prefixSum : register( u3 );
+
+Texture2D<uint> fragmentCount : register ( t0 );
+
+cbuffer CB : register( b0 )
+{
+    uint g_nFrameWidth : packoffset( c0.x );
+    uint g_nFrameHeight : packoffset( c0.y );
+    uint g_nPassSize : packoffset( c0.z );
+    uint g_nReserved : packoffset( c0.w );
+}
+
+#define blocksize 1
+#define groupthreads (blocksize*blocksize)
+groupshared float accum[groupthreads];
+
+// First pass of the prefix sum creation algorithm.  Converts a 2D buffer to a 1D buffer,
+// and sums every other value with the previous value.
+// Only groups with an even linear index do work: each one writes its own fragment
+// count to its bin and the sum of its count and the next pixel's count to the next bin.
+[numthreads(1,1,1)]
+void CreatePrefixSum_Pass0_CS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    int nThreadNum = nGid.y*g_nFrameWidth + nGid.x;
+    if( nThreadNum%2 == 0 )
+    {
+        // Keep the count in a local so it does not have to be read back from the UAV
+        uint nCount = fragmentCount[nGid.xy];
+        prefixSum[nThreadNum] = nCount;
+
+        // Add the Fragment count to the next bin, unless that bin is past the last pixel
+        if( (nThreadNum+1) < g_nFrameWidth * g_nFrameHeight )
+        {
+            int2 nextUV;
+            nextUV.x = (nThreadNum+1) % g_nFrameWidth;
+            nextUV.y = (nThreadNum+1) / g_nFrameWidth;
+            prefixSum[ nThreadNum+1 ] = nCount + fragmentCount[ nextUV ];
+        }
+    }
+}
+
+// Second and following passes.  Each pass distributes the sum of the first half of the group
+// to the second half of the group.  There are n/groupsize groups in each pass.
+// Each pass increases the group size until it is the size of the buffer.
+// The resulting buffer holds the prefix sum of all preceding values in each
+// position
+[numthreads(1,1,1)]
+void CreatePrefixSum_Pass1_CS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    // Group nGid.x owns the g_nPassSize bins starting at nGroupStart
+    uint nGroupStart = nGid.x*g_nPassSize;
+    uint nHalf = g_nPassSize/2;
+    uint nTotal = g_nFrameWidth*g_nFrameHeight;
+
+    // Last bin of the first half holds the sum that the second half is missing
+    uint nCarry = prefixSum[nGroupStart + nHalf - 1];
+    for( uint i = nGroupStart + nHalf; i < nGroupStart + g_nPassSize && i < nTotal; i++ )
+    {
+        prefixSum[i] = prefixSum[i] + nCarry;
+    }
+}
+
+#if 1
+
+// Sort the fragments using a bitonic sort, then accumulate the fragments into the final result.
+// nIndex and the local depth array hold MAX_SORTED_FRAGMENTS entries, so the per-pixel
+// fragment count is clamped to that size before either array is indexed.
+#define MAX_SORTED_FRAGMENTS 32
+groupshared int nIndex[MAX_SORTED_FRAGMENTS];
+#define NUM_THREADS 8
+[numthreads(1,1,1)]
+void SortAndRenderCS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    uint nThreadNum = nGid.y * g_nFrameWidth + nGid.x;
+
+    // First deep-buffer slot of this pixel: the prefix sum of the previous pixel, or 0
+    // for pixel 0 (the same rule FillDeepBufferPS uses when it stores the fragments).
+    uint nBase = 0;
+    if( nThreadNum != 0 )
+    {
+        nBase = prefixSum[nThreadNum-1];
+    }
+
+    uint N = min( fragmentCount[nDTid.xy], (uint)MAX_SORTED_FRAGMENTS );
+
+    // Smallest power of two >= N (1 when N is 0), computed with integers so the
+    // result cannot exceed the array size through float rounding
+    uint N2 = 1;
+    while( N2 < N )
+    {
+        N2 *= 2;
+    }
+
+    // Load the fragment depths; entries past N are padded with a depth of 1.1 so they
+    // sort behind every stored fragment
+    float fDepth[MAX_SORTED_FRAGMENTS];
+    for( uint n = 0; n < N2; n++ )
+    {
+        nIndex[n] = n;
+        if( n < N )
+        {
+            fDepth[n] = deepBufferDepth[ nBase + n ];
+        }
+        else
+        {
+            fDepth[n] = 1.1f;
+        }
+    }
+
+    uint idx = blocksize*nGTid.y + nGTid.x;
+
+    // Bitonic sort of nIndex by depth (ascending)
+    for( uint k = 2; k <= N2; k = 2*k )
+    {
+        for( uint j = k>>1; j > 0 ; j = j>>1 )
+        {
+            for( uint i = 0; i < N2; i++ )
+            {
+                uint ixj = i^j;
+                if ( ixj > i )
+                {
+                    float di = fDepth[ nIndex[ i ] ];
+                    float dixj = fDepth[ nIndex[ ixj ] ];
+
+                    // Bit k of i selects the direction of this compare/exchange
+                    bool bSwap = ( ( i&k ) == 0 ) ? ( di > dixj ) : ( di < dixj );
+                    if ( bSwap )
+                    {
+                        int temp = nIndex[ i ];
+                        nIndex[ i ] = nIndex[ ixj ];
+                        nIndex[ ixj ] = temp;
+                    }
+                }
+            }
+        }
+    }
+
+    // Output the final result to the frame buffer
+    if( idx == 0 )
+    {
+        // Accumulate fragments into final result, from the largest depth to the smallest
+        float4 result = 0.0f;
+        for( int x = (int)N-1; x >= 0; x-- )
+        {
+            // Colour is packed as 8 bits per channel, r in the low byte
+            uint bufferValue = deepBufferColorUINT[ nBase + nIndex[ x ] ];
+            float4 color;
+            color.r = ( bufferValue >> 0  & 0xFF ) / 255.0f;
+            color.g = ( bufferValue >> 8  & 0xFF ) / 255.0f;
+            color.b = ( bufferValue >> 16 & 0xFF ) / 255.0f;
+            color.a = ( bufferValue >> 24 & 0xFF ) / 255.0f;
+            result = lerp( result, color, color.a );
+        }
+        result.a = 1.0f;
+        frameBuffer[ nGid.xy ] = result;
+    }
+}
+
+#else
+[numthreads(1,1,1)]
+void SortAndRenderCS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    uint nThreadNum = nDTid.y * g_nFrameWidth + nDTid.x;
+    float d0 = deepBufferDepth[nThreadNum*8];
+    float d1 = deepBufferDepth[nThreadNum*8+1];
+    float d2 = deepBufferDepth[nThreadNum*8+2];
+
+    uint s0 = deepBufferColorUINT[nThreadNum*8 + 0];
+    uint s1 = deepBufferColorUINT[nThreadNum*8 + 1];
+    uint s2 = deepBufferColorUINT[nThreadNum*8 + 2];
+
+    uint r0, r1, r2;
+    float rd0, rd1, rd2;
+    if( d0 < d1 && d0 < d2 )
+    {
+        r0 = s0;
+        rd0 = d0;
+        if( d1 < d2 )
+        {
+            r1 = s1;
+            r2 = s2;
+
+            rd1 = d1;
+            rd2 = d2;
+        }
+        else
+        {
+            r1 = s2;
+            r2 = s1;
+
+            rd1 = d2;
+            rd2 = d1;
+        }
+    }
+    else if( d1 < d2 )
+    {
+        r0 = s1;
+        rd0 = d1;
+        if( d0 < d2 )
+        {
+            r1 = s0;
+            r2 = s2;
+
+            rd1 = d0;
+            rd2 = d2;
+        }
+        else
+        {
+            r1 = s2;
+            r2 = s0;
+
+            rd1 = d2;
+            rd2 = d0;
+        }
+    }
+    else
+    {
+        r0 = s2;
+        rd0 = d2;
+        if( d1 < d0 )
+        {
+            r1 = s1;
+            r2 = s0;
+
+            rd1 = d1;
+            rd2 = d0;
+        }
+        else
+        {
+            r1 = s0;
+            r2 = s1;
+
+            rd1 = d0;
+            rd2 = d1;
+        }
+    }
+
+    deepBufferDepth[nThreadNum*8] = rd0;
+    deepBufferDepth[nThreadNum*8+1] = rd1;
+    deepBufferDepth[nThreadNum*8+2] = rd2;
+
+    deepBufferColorUINT[nThreadNum*8] = r0;
+    deepBufferColorUINT[nThreadNum*8+1] = r1;
+    deepBufferColorUINT[nThreadNum*8+2] = r2;
+
+    // convert the color to floats
+    float4 color[3];
+    color[0].r = (r0 >> 0  & 0xFF) / 255.0f;
+    color[0].g = (r0 >> 8  & 0xFF) / 255.0f;
+    color[0].b = (r0 >> 16 & 0xFF) / 255.0f;
+    color[0].a = (r0 >> 24 & 0xFF) / 255.0f;
+
+    color[1].r = (r1 >> 0  & 0xFF) / 255.0f;
+    color[1].g = (r1 >> 8  & 0xFF) / 255.0f;
+    color[1].b = (r1 >> 16 & 0xFF) / 255.0f;
+    color[1].a = (r1 >> 24 & 0xFF) / 255.0f;
+
+    color[2].r = (r2 >> 0  & 0xFF) / 255.0f;
+    color[2].g = (r2 >> 8  & 0xFF) / 255.0f;
+    color[2].b = (r2 >> 16 & 0xFF) / 255.0f;
+    color[2].a = (r2 >> 24 & 0xFF) / 255.0f;
+
+    float4 result = lerp(lerp(lerp(0, color[2], color[2].a), color[1], color[1].a), color[0], color[0].a);
+    result.a = 1.0f;
+
+    frameBuffer[nDTid.xy] = result;
+}
+
+#endif
\ No newline at end of file
diff --git a/tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl b/tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl
new file mode 100644
index 000000000..1fdb31622
--- /dev/null
+++ b/tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl
@@ -0,0 +1,56 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry FragmentCountPS -entry FillDeepBufferPS
+//-----------------------------------------------------------------------------
+// File: OIT_PS.hlsl
+//
+// Desc: Pixel shaders used in the Order Independent Transparency sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+//TODO: Use structured buffers
+RWTexture2D<uint> fragmentCount : register( u1 );
+RWBuffer<float> deepBufferDepth : register( u2 );
+RWBuffer<uint4> deepBufferColor : register( u3 );
+RWBuffer<uint> prefixSum : register( u4 );
+
+cbuffer CB : register( b0 )
+{
+    uint g_nFrameWidth : packoffset( c0.x );
+    uint g_nFrameHeight : packoffset( c0.y );
+    uint g_nReserved0 : packoffset( c0.z );
+    uint g_nReserved1 : packoffset( c0.w );
+}
+
+struct SceneVS_Output
+{
+    float4 pos : SV_POSITION;
+    float4 color : COLOR0;
+};
+
+// Counts the fragments that land on each pixel.
+void FragmentCountPS( SceneVS_Output input)
+{
+    // Increments need to be done atomically
+    InterlockedAdd(fragmentCount[input.pos.xy], 1);
+}
+
+// Stores depth and colour of the fragment in the deep buffer, in the range of slots
+// that the prefix sum assigns to this pixel.
+void FillDeepBufferPS( SceneVS_Output input )
+{
+    uint x = input.pos.x;
+    uint y = input.pos.y;
+
+    // Atomically allocate space in the deep buffer; fc receives the count from
+    // before the increment, i.e. this fragment's slot within the pixel
+    uint fc;
+    InterlockedAdd(fragmentCount[input.pos.xy], 1, fc);
+
+    // Slots of a pixel start at the prefix sum of the previous pixel; pixel 0 starts at 0
+    uint nPrefixSumPos = y*g_nFrameWidth + x;
+    uint nPixelBase = 0;
+    if( nPrefixSumPos != 0 )
+        nPixelBase = prefixSum[nPrefixSumPos-1];
+    uint nDeepBufferPos = nPixelBase + fc;
+
+    // Store fragment data into the allocated space; colour is scaled from [0,1] to [0,255]
+    deepBufferDepth[nDeepBufferPos] = input.pos.z;
+    deepBufferColor[nDeepBufferPos] = clamp(input.color, 0, 1)*255;
+}
+
diff --git a/tests/hlsl/dxsdk/OIT11/SceneVS.hlsl b/tests/hlsl/dxsdk/OIT11/SceneVS.hlsl
new file mode 100644
index 000000000..2f985d1d1
--- /dev/null
+++ b/tests/hlsl/dxsdk/OIT11/SceneVS.hlsl
@@ -0,0 +1,36 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry SceneVS
+//-----------------------------------------------------------------------------
+// File: SceneVS.hlsl
+//
+// Desc: Vertex shader for the scene.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//----------------------------------------------------------------------------- + + +cbuffer cbPerObject : register( b0 ) +{ + row_major matrix g_mWorldViewProjection : packoffset( c0 ); +} + +struct SceneVS_Input +{ + float4 pos : POSITION; + float4 color : COLOR; +}; + +struct SceneVS_Output +{ + float4 pos : SV_POSITION; + float4 color : COLOR0; +}; + +SceneVS_Output SceneVS( SceneVS_Input input ) +{ + SceneVS_Output output; + + output.color = input.color; + output.pos = mul(input.pos, g_mWorldViewProjection ); + + return output; +} diff --git a/tests/hlsl/dxsdk/README.md b/tests/hlsl/dxsdk/README.md new file mode 100644 index 000000000..dd0c0fb6b --- /dev/null +++ b/tests/hlsl/dxsdk/README.md @@ -0,0 +1,5 @@ +DirectX SDK Sample Shaders +========================== + +This directory contains shaders that have shipped as part of the DirectX SDK. +The license terms for these shaders are specified at the top of the source files. \ No newline at end of file diff --git a/tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl b/tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl new file mode 100644 index 000000000..7b7a1489c --- /dev/null +++ b/tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl @@ -0,0 +1,230 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry BezierVS -profile hs_5_0 -entry BezierHS -profile ds_5_0 -entry BezierDS -profile ps_4_0 -entry BezierPS -entry SolidColorPS +//-------------------------------------------------------------------------------------- +// File: SimpleBezier11.hlsl +// +// This sample shows a simple implementation of the DirectX 11 Hardware Tessellator +// for rendering a Bezier Patch. +// +// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// This allows us to compile the shader with a #define to choose
+// the different partition modes for the hull shader.
+// See the hull shader: [partitioning(BEZIER_HS_PARTITION)]
+// This sample demonstrates "integer", "fractional_even", and "fractional_odd"
+#ifndef BEZIER_HS_PARTITION
+#define BEZIER_HS_PARTITION "integer"
+#endif // BEZIER_HS_PARTITION
+
+// The input patch size.  In this sample, it is 16 control points.
+// This value should match the call to IASetPrimitiveTopology()
+#define INPUT_PATCH_SIZE 16
+
+// The output patch size.  In this sample, it is also 16 control points.
+#define OUTPUT_PATCH_SIZE 16
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerFrame : register( b0 )
+{
+    matrix g_mViewProjection;
+    float3 g_vCameraPosWorld;
+    float  g_fTessellationFactor;   // written to every edge and inside factor by BezierConstantHS
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex shader section
+//--------------------------------------------------------------------------------------
+struct VS_CONTROL_POINT_INPUT
+{
+    float3 vPosition        : POSITION;
+};
+
+struct VS_CONTROL_POINT_OUTPUT
+{
+    float3 vPosition        : POSITION;
+};
+
+// This simple vertex shader passes the control points straight through to the
+// hull shader.  In a more complex scene, you might transform the control points
+// or perform skinning at this step.
+
+// The input to the vertex shader comes from the vertex buffer.
+
+// The output from the vertex shader will go into the hull shader.
+
+VS_CONTROL_POINT_OUTPUT BezierVS( VS_CONTROL_POINT_INPUT Input )
+{
+    VS_CONTROL_POINT_OUTPUT Output;
+
+    Output.vPosition = Input.vPosition;   // pass-through, no transform
+
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Constant data function for the BezierHS.  This is executed once per patch.
+//--------------------------------------------------------------------------------------
+struct HS_CONSTANT_DATA_OUTPUT
+{
+    float Edges[4]  : SV_TessFactor;
+    float Inside[2] : SV_InsideTessFactor;
+};
+
+struct HS_OUTPUT
+{
+    float3 vPosition : BEZIERPOS;
+};
+
+// This constant hull shader is executed once per patch.  For the simple Mobius strip
+// model, it will be executed 4 times.  In this sample, we set the tessellation factor
+// via SV_TessFactor and SV_InsideTessFactor for each patch.  In a more complex scene,
+// you might calculate a variable tessellation factor based on the camera's distance.
+
+HS_CONSTANT_DATA_OUTPUT BezierConstantHS( InputPatch<VS_CONTROL_POINT_OUTPUT, INPUT_PATCH_SIZE> ip,
+                                          uint PatchID : SV_PrimitiveID )
+{
+    HS_CONSTANT_DATA_OUTPUT Output;
+
+    float TessAmount = g_fTessellationFactor;
+
+    Output.Edges[0] = Output.Edges[1] = Output.Edges[2] = Output.Edges[3] = TessAmount;   // same factor on all four edges
+    Output.Inside[0] = Output.Inside[1] = TessAmount;                                     // and on both inside factors
+
+    return Output;
+}
+
+// The hull shader is called once per output control point, which is specified with
+// outputcontrolpoints.  For this sample, we take the control points from the vertex
+// shader and pass them directly off to the domain shader.  In a more complex scene,
+// you might perform a basis conversion from the input control points into a Bezier
+// patch, such as the SubD11 Sample.
+
+// The input to the hull shader comes from the vertex shader
+
+// The output from the hull shader will go to the domain shader.
+// The tessellation factor, topology, and partition mode will go to the fixed function
+// tessellator stage to calculate the UVW and domain points.
+
+[domain("quad")]
+[partitioning(BEZIER_HS_PARTITION)]
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(OUTPUT_PATCH_SIZE)]
+[patchconstantfunc("BezierConstantHS")]
+HS_OUTPUT BezierHS( InputPatch<VS_CONTROL_POINT_OUTPUT, INPUT_PATCH_SIZE> p,
+                    uint i : SV_OutputControlPointID,
+                    uint PatchID : SV_PrimitiveID )
+{
+    HS_OUTPUT Output;
+    Output.vPosition = p[i].vPosition;   // control point i is copied through unchanged
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Bezier evaluation domain shader section
+//--------------------------------------------------------------------------------------
+struct DS_OUTPUT
+{
+    float4 vPosition        : SV_POSITION;
+    float3 vWorldPos        : WORLDPOS;
+    float3 vNormal          : NORMAL;
+};
+
+//--------------------------------------------------------------------------------------
+float4 BernsteinBasis(float t)   // the four cubic Bernstein weights at parameter t
+{
+    float invT = 1.0f - t;
+
+    return float4( invT * invT * invT,
+                   3.0f * t * invT * invT,
+                   3.0f * t * t * invT,
+                   t * t * t );
+}
+
+//--------------------------------------------------------------------------------------
+float4 dBernsteinBasis(float t)   // derivatives with respect to t of the four weights above
+{
+    float invT = 1.0f - t;
+
+    return float4( -3 * invT * invT,
+                   3 * invT * invT - 6 * t * invT,
+                   6 * t * invT - 3 * t * t,
+                   3 * t * t );
+}
+
+//--------------------------------------------------------------------------------------
+float3 EvaluateBezier( const OutputPatch<HS_OUTPUT, OUTPUT_PATCH_SIZE> bezpatch,
+                       float4 BasisU,
+                       float4 BasisV )
+{
+    // Weighted sum over the 16 control points: each group of four consecutive points is
+    // weighted by BasisU, and each group as a whole by one component of BasisV.
+    float3 Value = float3(0,0,0);
+    Value  = BasisV.x * ( bezpatch[0].vPosition * BasisU.x + bezpatch[1].vPosition * BasisU.y + bezpatch[2].vPosition * BasisU.z + bezpatch[3].vPosition * BasisU.w );
+    Value += BasisV.y * ( bezpatch[4].vPosition * BasisU.x + bezpatch[5].vPosition * BasisU.y + bezpatch[6].vPosition * BasisU.z + bezpatch[7].vPosition * BasisU.w );
+    Value += BasisV.z * ( bezpatch[8].vPosition * BasisU.x + bezpatch[9].vPosition * BasisU.y + bezpatch[10].vPosition * BasisU.z + bezpatch[11].vPosition * BasisU.w );
+    Value += BasisV.w * ( bezpatch[12].vPosition * BasisU.x + bezpatch[13].vPosition * BasisU.y + bezpatch[14].vPosition * BasisU.z + bezpatch[15].vPosition * BasisU.w );
+
+    return Value;
+}
+
+// The domain shader is run once per vertex and calculates the final vertex's position
+// and attributes.  It receives the UVW from the fixed function tessellator and the
+// control point outputs from the hull shader.  Since we are using the DirectX 11
+// Tessellation pipeline, it is the domain shader's responsibility to calculate the
+// final SV_POSITION for each vertex.  In this sample, we evaluate the vertex's
+// position using a Bernstein polynomial and the normal is calculated as the cross
+// product of the U and V derivatives.
+
+// The input SV_DomainLocation to the domain shader comes from fixed function
+// tessellator.  And the OutputPatch comes from the hull shader.  From these, you
+// must calculate the final vertex position, color, texcoords, and other attributes.
+
+// The output from the domain shader will be a vertex that will go to the video card's
+// rasterization pipeline and get drawn to the screen.
+
+[domain("quad")]
+DS_OUTPUT BezierDS( HS_CONSTANT_DATA_OUTPUT input,
+                    float2 UV : SV_DomainLocation,
+                    const OutputPatch<HS_OUTPUT, OUTPUT_PATCH_SIZE> bezpatch )
+{
+    float4 BasisU = BernsteinBasis( UV.x );
+    float4 BasisV = BernsteinBasis( UV.y );
+    float4 dBasisU = dBernsteinBasis( UV.x );
+    float4 dBasisV = dBernsteinBasis( UV.y );
+
+    float3 WorldPos = EvaluateBezier( bezpatch, BasisU, BasisV );
+    float3 Tangent = EvaluateBezier( bezpatch, dBasisU, BasisV );     // derivative basis along U
+    float3 BiTangent = EvaluateBezier( bezpatch, BasisU, dBasisV );   // derivative basis along V
+    float3 Norm = normalize( cross( Tangent, BiTangent ) );
+
+    DS_OUTPUT Output;
+    Output.vPosition = mul( float4(WorldPos,1), g_mViewProjection );
+    Output.vWorldPos = WorldPos;
+    Output.vNormal = Norm;
+
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Smooth shading pixel shader section
+//--------------------------------------------------------------------------------------
+
+// The pixel shader works the same as it would in a normal graphics pipeline.
+// In this sample, it performs very simple N dot L lighting.
+
+float4 BezierPS( DS_OUTPUT Input ) : SV_TARGET
+{
+    float3 N = normalize(Input.vNormal);
+    float3 L = normalize(Input.vWorldPos - g_vCameraPosWorld);   // direction from the camera position to the surface point
+    return abs(dot(N, L)) * float4(1, 0, 0, 1);                  // abs() gives both sides of the surface the same shade
+}
+
+//--------------------------------------------------------------------------------------
+// Solid color shading pixel shader (used for wireframe overlay)
+//--------------------------------------------------------------------------------------
+float4 SolidColorPS( DS_OUTPUT Input ) : SV_TARGET
+{
+    // Return a solid green color
+    return float4( 0, 1, 0, 1 );
+}
diff --git a/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx
new file mode 100644
index 000000000..00883ce70
--- /dev/null
+++ b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx
@@ -0,0 +1,112 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: SimpleSample.fx
+//
+// The effect file for the SimpleSample sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Global variables
+//--------------------------------------------------------------------------------------
+float4 g_MaterialAmbientColor;      // Material's ambient color
+float4 g_MaterialDiffuseColor;      // Material's diffuse color
+float3 g_LightDir;                  // Light's direction in world space
+float4 g_LightDiffuse;              // Light's diffuse color
+texture g_MeshTexture;              // Color texture for mesh
+
+float    g_fTime;                   // App's time in seconds
+float4x4 g_mWorld;                  // World matrix for object
+float4x4 g_mWorldViewProjection;    // World * View * Projection matrix
+
+
+
+//--------------------------------------------------------------------------------------
+// Texture samplers
+//--------------------------------------------------------------------------------------
+sampler MeshTextureSampler =
+sampler_state
+{
+    Texture = <g_MeshTexture>;
+    MipFilter = LINEAR;
+    MinFilter = LINEAR;
+    MagFilter = LINEAR;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex shader output structure
+//--------------------------------------------------------------------------------------
+struct VS_OUTPUT
+{
+    float4 Position   : POSITION;   // vertex position
+    float4 Diffuse    : COLOR0;     // vertex diffuse color (note that COLOR0 is clamped from 0..1)
+    float2 TextureUV  : TEXCOORD0;  // vertex texture coords
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader computes standard transform and lighting
+//--------------------------------------------------------------------------------------
+VS_OUTPUT RenderSceneVS( float4 vPos : POSITION,
+                         float3 vNormal : NORMAL,
+                         float2 vTexCoord0 : TEXCOORD0 )
+{
+    VS_OUTPUT Output;
+    float3 vNormalWorldSpace;
+
+    // Transform the position from object space to homogeneous projection space
+    Output.Position = mul(vPos, g_mWorldViewProjection);
+
+    // Transform the normal from object space to world space
+    vNormalWorldSpace = normalize(mul(vNormal, (float3x3)g_mWorld)); // normal (world space)
+
+    // Calc diffuse color (the float4 result is implicitly truncated to .rgb)
+    Output.Diffuse.rgb = g_MaterialDiffuseColor * g_LightDiffuse * max(0,dot(vNormalWorldSpace, g_LightDir)) +
+                         g_MaterialAmbientColor;
+    Output.Diffuse.a = 1.0f;
+
+    // Just copy the texture coordinate through
+    Output.TextureUV = vTexCoord0;
+
+    return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel shader output structure
+//--------------------------------------------------------------------------------------
+struct PS_OUTPUT
+{
+    float4 RGBColor : COLOR0;  // Pixel color
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader outputs the pixel's color by modulating the texture's
+// color with diffuse material color
+//--------------------------------------------------------------------------------------
+PS_OUTPUT RenderScenePS( VS_OUTPUT In )
+{
+    PS_OUTPUT Output;
+
+    // Lookup mesh texture and modulate it with diffuse
+    Output.RGBColor = tex2D(MeshTextureSampler, In.TextureUV) * In.Diffuse;
+
+    return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Renders scene
+//--------------------------------------------------------------------------------------
+technique RenderScene
+{
+    pass P0
+    {
+        VertexShader = compile vs_2_0 RenderSceneVS();
+        PixelShader  = compile ps_2_0 RenderScenePS();
+    }
+}
diff --git a/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl
new file mode 100644
index 000000000..12f368f86
--- /dev/null
+++ b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl
@@ -0,0 +1,86 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry RenderSceneVS -profile ps_4_0 -entry RenderScenePS
+//--------------------------------------------------------------------------------------
+// File: SimpleSample.hlsl
+//
+// The HLSL file for the SimpleSample sample for the Direct3D 11 device
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+    matrix  g_mWorldViewProjection  : packoffset( c0 );
+    matrix  g_mWorld                : packoffset( c4 );
+    float4  g_MaterialAmbientColor  : packoffset( c8 );
+    float4  g_MaterialDiffuseColor  : packoffset( c9 );
+}
+
+cbuffer cbPerFrame : register( b1 )
+{
+    float3  g_vLightDir             : packoffset( c0 );
+    float   g_fTime                 : packoffset( c0.w );   // shares register c0 with g_vLightDir
+    float4  g_LightDiffuse          : packoffset( c1 );
+};
+
+//-----------------------------------------------------------------------------------------
+// Textures and Samplers
+//-----------------------------------------------------------------------------------------
+Texture2D    g_txDiffuse : register( t0 );
+SamplerState g_samLinear : register( s0 );
+
+//--------------------------------------------------------------------------------------
+// shader input/output structure
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 Position     : POSITION; // vertex position
+    float3 Normal       : NORMAL;   // this normal comes in per-vertex
+    float2 TextureUV    : TEXCOORD0;// vertex texture coords
+};
+
+struct VS_OUTPUT
+{
+    float4 Position     : SV_POSITION; // vertex position
+    float4 Diffuse      : COLOR0;      // vertex diffuse color (note that COLOR0 is clamped from 0..1)
+    float2 TextureUV    : TEXCOORD0;   // vertex texture coords
+};
+
+//--------------------------------------------------------------------------------------
+// This shader computes standard transform and lighting
+//--------------------------------------------------------------------------------------
+VS_OUTPUT RenderSceneVS( VS_INPUT input )
+{
+    VS_OUTPUT Output;
+    float3 vNormalWorldSpace;
+
+    // Transform the position from object space to homogeneous projection space
+    Output.Position = mul( input.Position, g_mWorldViewProjection );
+
+    // Transform the normal from object space to world space
+    vNormalWorldSpace = normalize(mul(input.Normal, (float3x3)g_mWorld)); // normal (world space)
+
+    // Calc diffuse color (the float4 result is implicitly truncated to .rgb)
+    Output.Diffuse.rgb = g_MaterialDiffuseColor * g_LightDiffuse * max(0,dot(vNormalWorldSpace, g_vLightDir)) +
+                         g_MaterialAmbientColor;
+    Output.Diffuse.a = 1.0f;
+
+    // Just copy the texture coordinate through
+    Output.TextureUV = input.TextureUV;
+
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// This shader outputs the pixel's color by modulating the texture's
+// color with diffuse material color
+//--------------------------------------------------------------------------------------
+float4 RenderScenePS( VS_OUTPUT In ) : SV_TARGET
+{
+    // Lookup mesh texture and modulate it with diffuse
+    return g_txDiffuse.Sample( g_samLinear, In.TextureUV ) * In.Diffuse;
+}
diff --git a/tests/hlsl/dxsdk/SubD11/SubD11.hlsl b/tests/hlsl/dxsdk/SubD11/SubD11.hlsl
new file mode 100644
index 000000000..c4ebf9620
--- /dev/null
+++ b/tests/hlsl/dxsdk/SubD11/SubD11.hlsl
@@ -0,0 +1,1238 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry PatchSkinningVS -entry MeshSkinningVS -profile hs_5_0 -entry SubDToBezierHS -entry SubDToBezierHS4444 -profile ds_5_0 -entry BezierEvalDS -profile ps_4_0 -entry SmoothPS -entry SolidColorPS
+//--------------------------------------------------------------------------------------
+// File: SubD11.hlsl
+//
+// This file contains functions to convert from a Catmull-Clark subdivision
+// representation to a bicubic patch representation.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//Work-around for an optimization rule problem in the June 2010 HLSL Compiler (9.29.952.3111)
+//see http://support.microsoft.com/kb/2448404
+#if D3DX_VERSION == 0xa2b
+#pragma ruledisable 0x0802405f
+#endif
+
+//--------------------------------------------------------------------------------------
+// A sample extraordinary SubD quad is represented by the following diagram:
+//
+//                                  15              Valences:
+//                                 /  \               Vertex 0: 5
+//                                /    14             Vertex 1: 4
+//                    17---------16   /  \            Vertex 2: 5
+//                    | \         |  /    \           Vertex 3: 3
+//                    |  \        | /      13
+//                    |   \       |/      /         Prefixes:
+//                    |    3------2------12           Vertex 0: 9
+//                    |    |      |      |            Vertex 1: 12
+//                    |    |      |      |            Vertex 2: 16
+//                    4----0------1------11           Vertex 3: 18
+//                   /    /|      |      |
+//                  /    / |      |      |
+//                 5    /  8------9------10
+//                  \  /  /
+//                   6   /
+//                    \ /
+//                     7
+//
+// Where the quad bounded by vertices 0,1,2,3 represents the actual subd surface of interest
+// The 1-ring neighborhood of the quad is represented by vertices 4 through 17.  The counter-
+// clockwise winding of this 1-ring neighborhood is important, especially when it comes to computing
+// the corner vertices of the bicubic patch that we will use to approximate the subd quad (0,1,2,3).
+//
+// The resulting bicubic patch fits within the subd quad (0,1,2,3) and has the following control
+// point layout:
+//
+//     12--13--14--15
+//      8---9--10--11
+//      4---5---6---7
+//      0---1---2---3
+//
+// The inner 4 control points of the bicubic patch are a combination of only the vertices (0,1,2,3)
+// of the subd quad.  However, the corner control points for the bicubic patch (0,3,15,12) are actually
+// a much more complex weighting of the subd patch and the 1-ring neighborhood.  In the example above
+// the bicubic control point 0 is actually a weighted combination of subd points 0,1,2,3 and 1-ring
+// neighborhood points 17, 4, 5, 6, 7, 8, and 9.  We can see that the 1-ring neighborhood is simply
+// walked from the prefix value of the previous corner (corner 3 in this case) to the
+// prefix value for the current corner.  We add one more vertex on either side of the prefix values
+// and we have all the data necessary to calculate the value for the corner points.
+//
+// The edge control points of the bicubic patch (1,2,13,14,4,8,7,11) are also combinations of their
+// neighbors, but fortunately each one is only a combination of 6 values and no walk is required.
+//--------------------------------------------------------------------------------------
+
+#define MOD4(x) ((x)&3)
+#ifndef MAX_POINTS
+#define MAX_POINTS 32
+#endif
+#define MAX_BONE_MATRICES 80
+
+//--------------------------------------------------------------------------------------
+// Textures
+//--------------------------------------------------------------------------------------
+Texture2D g_txHeight : register( t0 );           // Height and Bump texture
+Texture2D g_txDiffuse : register( t1 );          // Diffuse texture
+Texture2D g_txSpecular : register( t2 );         // Specular texture
+
+//--------------------------------------------------------------------------------------
+// Samplers
+//--------------------------------------------------------------------------------------
+SamplerState g_samLinear : register( s0 );
+SamplerState g_samPoint : register( s0 );        // NOTE(review): same register as g_samLinear - confirm this is intended
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbTangentStencilConstants : register( b0 )
+{
+    float g_TanM[1024]; // Tangent patch stencils precomputed by the application
+    float g_fCi[16];    // Valence coefficients precomputed by the application
+};
+
+cbuffer cbPerMesh : register( b1 )
+{
+    matrix g_mConstBoneWorld[MAX_BONE_MATRICES];
+};
+
+cbuffer cbPerFrame : register( b2 )
+{
+    matrix g_mViewProjection;
+    float3 g_vCameraPosWorld;
+    float  g_fTessellationFactor;
+    float  g_fDisplacementHeight;
+    float3 g_vSolidColor;
+};
+
+cbuffer cbPerSubset : register( b3 )
+{
+    int g_iPatchStartIndex;
+}
+
+//--------------------------------------------------------------------------------------
+Buffer g_ValencePrefixBuffer : register( t0 );   // NOTE(review): element type may have been lost (other <...> arguments were); verify against its Load() call sites
+
+//--------------------------------------------------------------------------------------
+struct VS_CONTROL_POINT_OUTPUT
+{
+    float3 vPosition        : WORLDPOS;
+    float2 vUV              : TEXCOORD0;
+    float3 vTangent         : TANGENT;
+};
+
+struct BEZIER_CONTROL_POINT
+{
+    float3 vPosition    : BEZIERPOS;
+};
+
+struct PS_INPUT
+{
+    float3 vWorldPos        : POSITION;
+    float3 vNormal          : NORMAL;
+    float2 vUV              : TEXCOORD;
+    float3 vTangent         : TANGENT;
+    float3 vBiTangent       : BITANGENT;
+};
+
+//--------------------------------------------------------------------------------------
+// SubD to Bezier helper functions
+//--------------------------------------------------------------------------------------
+// Helps with getting tangent stencils from the g_TanM constant array
+#define TANM(a,v) ( g_TanM[ Val[v]*64 + (a) ] )   // expands against an array named Val in the calling scope
+
+//--------------------------------------------------------------------------------------
+float3 ComputeInteriorVertex( uint index,
+                              uint Val[4],
+                              const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip )
+{
+    switch( index )
+    {
+    case 0:
+        return (ip[0].vPosition*Val[0] + ip[1].vPosition*2 +      ip[2].vPosition +        ip[3].vPosition*2)      / (5+Val[0]);
+    case 1:
+        return (ip[0].vPosition*2 +      ip[1].vPosition*Val[1] + ip[2].vPosition*2 +      ip[3].vPosition)        / (5+Val[1]);
+    case 2:
+        return (ip[0].vPosition +        ip[1].vPosition*2 +      ip[2].vPosition*Val[2] + ip[3].vPosition*2)      / (5+Val[2]);
+    case 3:
+        return (ip[0].vPosition*2 +      ip[1].vPosition +        ip[2].vPosition*2 +      ip[3].vPosition*Val[3]) / (5+Val[3]);
+    }
+
+    return float3(0,0,0);   // index outside 0..3
+}
+
+//--------------------------------------------------------------------------------------
+// Computes the corner vertices of the output UV patch.  The corner vertices are
+// a weighted combination of all points that are "connected" to that corner by an edge.
+// The interior 4 points of the original subd quad are easy to get.  The points in the
+// 1-ring neighborhood around the interior quad are not.
+//
+// Because the valence of that corner could be any number between 3 and 16, we need to
+// walk around the subd patch vertices connected to that point.  This is where the
+// Pref (prefix) values come into play.  Each corner has a prefix value that is the index
+// of the last value around the 1-ring neighborhood that should be used in calculating
+// the coefficient of that corner.  The walk goes from the prefix value of the previous
+// corner to the prefix value of the current corner.
+//--------------------------------------------------------------------------------------
+void ComputeCornerVertex( uint index,
+                          out float3 CornerB, // Corner for the Bezier patch
+                          out float3 CornerU, // Corner for the tangent patch
+                          out float3 CornerV, // Corner for the bitangent patch
+                          const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
+                          const in uint Val[4],
+                          const in uint Pref[4] )
+{
+    const float fOWt = 1;
+    const float fEWt = 4;
+
+    // Figure out where to start the walk by using the previous corner's prefix value
+    uint PrefIm1 = 0;
+    uint uStart = 4;
+    if( index )
+    {
+        PrefIm1 = Pref[index-1];
+        uStart = PrefIm1;
+    }
+
+    // Setup the walk indices
+    uint uTIndexStart = 2 - (index&1);
+    uint uTIndex = uTIndexStart;
+
+    // Calculate the N*N weight for the final value
+    CornerB = (Val[index]*Val[index])*ip[index].vPosition; // n^2 part
+
+    // Zero out the corners
+    CornerU = float3(0,0,0);
+    CornerV = float3(0,0,0);
+
+    const uint uV = Val[index] + ( ( index & 1 ) ? 1 : -1 );
+
+    // Start the walk with the uStart prefix (the prefix of the corner before us)
+    CornerB += ip[uStart].vPosition * fEWt;
+    CornerU += ip[uStart].vPosition * TANM( uTIndex * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index);
+
+    // Gather all vertices between the previous corner's prefix and our own prefix
+    // We'll do two at a time, since they always come in twos
+    while(uStart < Pref[index]-1)
+    {
+        ++uStart;
+        CornerB += ip[uStart].vPosition * fOWt;
+        CornerU += ip[uStart].vPosition * TANM( uTIndex * 2 + 1, index );
+        CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+        ++uTIndex;
+        ++uStart;
+        CornerB += ip[uStart].vPosition * fEWt;
+        CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+        CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex+uV)%Val[index]) * 2, index );
+    }
+    ++uStart;
+
+    // Add in the last guy and make sure to wrap to the beginning if we're the last corner
+    if (index == 3)
+        uStart = 4;
+    CornerB += ip[uStart].vPosition * fOWt;
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    // Add in the guy before the prefix as well
+    if (index)
+        uStart = PrefIm1-1;
+    else
+        uStart = Pref[3]-1;
+    uTIndex = uTIndexStart-1;
+
+    CornerB += ip[uStart].vPosition * fOWt;
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    // We're done with the walk now.  Now we need to add the contributions of the original subd quad.
+    CornerB += ip[MOD4(index+1)].vPosition * fEWt;
+    CornerB += ip[MOD4(index+2)].vPosition * fOWt;
+    CornerB += ip[MOD4(index+3)].vPosition * fEWt;
+
+    uTIndex = 0 + (index&1)*(Val[index]-1);
+    uStart = MOD4(index+1);
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+
+    uStart = MOD4(index+2);
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    uStart = MOD4(index+3);
+    uTIndex = (uTIndex+1)%Val[index];
+
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+
+    // Normalize the corner weights
+    CornerB *= 1.0f / ( Val[index] * Val[index] + 5 * Val[index] ); // normalize
+
+    // fixup signs from directional derivatives...
+    if( !((index - 1) & 2) ) // 1 and 2
+        CornerU *= -1;
+
+    if( index >= 2 ) // 2 and 3
+        CornerV *= -1;
+}
+
+void ComputeCornerVertex4444( uint index,
+                          out float3 CornerB, // Corner for the Bezier patch
+                          out float3 CornerU, // Corner for the tangent patch
+                          out float3 CornerV, // Corner for the bitangent patch
+                          const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
+                          const in uint Val[4],
+                          const in uint Pref[4] )
+{
+    const float fOWt = 1;
+    const float fEWt = 4;
+
+    // Figure out where to start the walk by using the previous corner's prefix value
+    uint PrefIm1 = 0;
+    uint uStart = 4;
+    if( index )
+    {
+        PrefIm1 = Pref[index-1];
+        uStart = PrefIm1;
+    }
+
+    // Setup the walk indices
+    uint uTIndexStart = 2 - (index&1);
+    uint uTIndex = uTIndexStart;
+
+    // Calculate the N*N weight for the final value
+    CornerB = (Val[index]*Val[index])*ip[index].vPosition; // n^2 part
+
+    // Zero out the corners
+    CornerU = float3(0,0,0);
+    CornerV = float3(0,0,0);
+
+    const uint uV = Val[index] + ( ( index & 1 ) ? 1 : -1 );
+
+    // Start the walk with the uStart prefix (the prefix of the corner before us)
+    CornerB += ip[uStart].vPosition * fEWt;
+    CornerU += ip[uStart].vPosition * TANM( uTIndex * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index);
+
+    // Gather all vertices between the previous corner's prefix and our own prefix
+    // We'll do two at a time, since they always come in twos
+    while(uStart < Pref[index]-1)
+    {
+        ++uStart;
+        CornerB += ip[uStart].vPosition * fOWt;
+        CornerU += ip[uStart].vPosition * TANM( uTIndex * 2 + 1, index );
+        CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+        ++uTIndex;
+        ++uStart;
+        CornerB += ip[uStart].vPosition * fEWt;
+        CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+        CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex+uV)%Val[index]) * 2, index );
+    }
+    ++uStart;
+
+    // Add in the last guy and make sure to wrap to the beginning if we're the last corner
+    if (index == 3)
+        uStart = 4;
+    CornerB += ip[uStart].vPosition * fOWt;
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    // Add in the guy before the prefix as well
+    if (index)
+        uStart = PrefIm1-1;
+    else
+        uStart = Pref[3]-1;
+    uTIndex = uTIndexStart-1;
+
+    CornerB += ip[uStart].vPosition * fOWt;
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    // We're done with the walk now.  Now we need to add the contributions of the original subd quad.
+    CornerB += ip[MOD4(index+1)].vPosition * fEWt;
+    CornerB += ip[MOD4(index+2)].vPosition * fOWt;
+    CornerB += ip[MOD4(index+3)].vPosition * fEWt;
+
+    uTIndex = 0 + (index&1)*(Val[index]-1);
+    uStart = MOD4(index+1);
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+
+    uStart = MOD4(index+2);
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    uStart = MOD4(index+3);
+    uTIndex = (uTIndex+1)%Val[index];
+
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+
+    // Normalize the corner weights
+    CornerB *= 1.0f / ( Val[index] * Val[index] + 5 * Val[index] ); // normalize
+
+    // fixup signs from directional derivatives...
+    if( !((index - 1) & 2) ) // 1 and 2
+        CornerU *= -1;
+
+    if( index >= 2 ) // 2 and 3
+        CornerV *= -1;
+}
+
+//--------------------------------------------------------------------------------------
+// Computes the edge vertices of the output bicubic patch.  The edge vertices
+// (1,2,4,7,8,11,13,14) are a weighted (by valence) combination of 6 interior and 1-ring
+// neighborhood points.  However, we don't have to do the walk on this one since we
+// don't need all of the neighbor points attached to this vertex.
+//--------------------------------------------------------------------------------------
+float3 ComputeEdgeVertex( in uint index /* 0-7 */,
+                          const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
+                          const in uint Val[4],
+                          const in uint Pref[4] )
+{
+    float val1 = 2 * Val[0] + 10;    // divisors: the numeric suffix is the bicubic control point each one belongs to
+    float val2 = 2 * Val[1] + 10;
+    float val13 = 2 * Val[3] + 10;
+    float val14 = 2 * Val[2] + 10;
+    float val4 = val1;
+    float val8 = val13;
+    float val7 = val2;
+    float val11 = val14;
+
+    float3 vRetVal = float3(0,0,0);  // stays zero when index is outside 0..7
+    switch( index )
+    {
+    // Horizontal
+    case 0:
+        vRetVal = (Val[0]*2*ip[0].vPosition + 4*ip[1].vPosition + ip[2].vPosition + ip[3].vPosition*2 +
+              2*ip[Pref[0]-1].vPosition + ip[Pref[0]].vPosition) / val1;
+        break;
+    case 1:
+        vRetVal = (4*ip[0].vPosition + Val[1]*2*ip[1].vPosition + ip[2].vPosition*2 + ip[3].vPosition +
+              ip[Pref[0]-1].vPosition + 2*ip[Pref[0]].vPosition) / val2;
+        break;
+    case 2:
+        vRetVal = (2*ip[0].vPosition + ip[1].vPosition + 4*ip[2].vPosition + ip[3].vPosition*2*Val[3] +
+               2*ip[Pref[2]].vPosition + ip[Pref[2]-1].vPosition) / val13;
+        break;
+    case 3:
+        vRetVal = (ip[0].vPosition + 2*ip[1].vPosition + Val[2]*2*ip[2].vPosition + ip[3].vPosition*4 +
+               ip[Pref[2]].vPosition + 2*ip[Pref[2]-1].vPosition) / val14;
+        break;
+    // Vertical
+    case 4:
+        vRetVal = (Val[0]*2*ip[0].vPosition + 2*ip[1].vPosition + ip[2].vPosition + ip[3].vPosition*4 +
+               2*ip[4].vPosition + ip[Pref[3]-1].vPosition) / val4;
+        break;
+    case 5:
+        vRetVal = (4*ip[0].vPosition + ip[1].vPosition + 2*ip[2].vPosition + ip[3].vPosition*2*Val[3] +
+               ip[4].vPosition + 2*ip[Pref[3]-1].vPosition) / val8;
+        break;
+    case 6:
+        vRetVal = (2*ip[0].vPosition + Val[1]*2*ip[1].vPosition + 4*ip[2].vPosition + ip[3].vPosition +
+               2*ip[Pref[1]-1].vPosition + ip[Pref[1]].vPosition) / val7;
+        break;
+    case 7:
+        vRetVal = (ip[0].vPosition + 4*ip[1].vPosition + Val[2]*2*ip[2].vPosition + 2*ip[3].vPosition +
+               ip[Pref[1]-1].vPosition + 2*ip[Pref[1]].vPosition) / val11;
+        break;
+    }
+
+    return vRetVal;
+}
+
+//-------------------------------------------------------------------------------------- +// Helper function +//-------------------------------------------------------------------------------------- +void BezierRaise(inout float3 pQ[3], out float3 pC[4]) +{ + pC[0] = pQ[0]; + pC[3] = pQ[2]; + + for( int i=1; i<3; i++ ) + { + pC[i] = ( 1.0f / 3.0f ) * ( pQ[i - 1] * i + ( 3.0f - i ) * pQ[i] ); + } +} + +//-------------------------------------------------------------------------------------- +// Computes the tangent patch from the input bezier patch +//-------------------------------------------------------------------------------------- +void ComputeTanPatch( const OutputPatch bezpatch, + inout float3 vOut[16], + in float fCWts[4], + in float3 vCorner[4], + in float3 vCornerLocal[4], + in const uint cX, + in const uint cY) +{ + float3 vQuad[3]; + float3 vQuadB[3]; + float3 vCubic[4]; + + // boundary edges are really simple... + vQuad[0] = vCornerLocal[0]; + vQuad[2] = vCornerLocal[1]; + vQuad[1] = 3.0f*(bezpatch[2*cX+0*cY].vPosition-bezpatch[1*cX+0*cY].vPosition); + + BezierRaise(vQuad,vCubic); + vOut[1*cX + 0*cY] = vCubic[1]; + vOut[2*cX + 0*cY] = vCubic[2]; + + vQuad[0] = vCornerLocal[2]; + vQuad[2] = vCornerLocal[3]; + vQuad[1] = 3.0f*(bezpatch[2*cX+3*cY].vPosition-bezpatch[1*cX+3*cY].vPosition); + + BezierRaise(vQuad,vCubic); + vOut[1*cX + 3*cY] = vCubic[1]; + vOut[2*cX + 3*cY] = vCubic[2]; + + // two internal edges - this is where work happens... + float3 vA,vB,vC,vD,vE; + float fC0,fC1; + vQuad[1] = 3.0f*(bezpatch[2*cX+2*cY].vPosition-bezpatch[1*cX+2*cY].vPosition); + // also do "second" scan line + vQuadB[1] = 3.0f*(bezpatch[2*cX+1*cY].vPosition-bezpatch[1*cX+1*cY].vPosition); + + vD = 3.0f*(bezpatch[1*cX + 2*cY].vPosition - bezpatch[0*cX + 2*cY].vPosition); + vE = 3.0f*(bezpatch[1*cX + 1*cY].vPosition - bezpatch[0*cX + 1*cY].vPosition); // used later... 
+ + fC0 = fCWts[3]; + fC1 = fCWts[0]; + + // sign flip + vA = -vCorner[3]; + vB = 3.0f*(bezpatch[0*cX + 1*cY].vPosition - bezpatch[0*cX + 2*cY].vPosition); + vC = -vCorner[0]; + + vQuad[0] = 1.0f/3.0f*(2.0f*fC0*vB - fC1*vA) + vD; + vQuadB[0] = 1.0f/3.0f*(fC0*vC - 2.0f*fC1*vB) + vE; + + // do end of strip - same as before, but stuff is switched around... + vC = vCorner[2]; + vB = 3.0f*(bezpatch[3*cX + 2*cY].vPosition - bezpatch[3*cX + 1*cY].vPosition); + vA = vCorner[1]; + + vD = 3.0f*(bezpatch[2*cX + 1*cY].vPosition - bezpatch[3*cX + 1*cY].vPosition); + vE = 3.0f*(bezpatch[2*cX + 2*cY].vPosition - bezpatch[3*cX + 2*cY].vPosition); + + fC0 = fCWts[1]; + fC1 = fCWts[2]; + + vQuadB[2] = 1.0f/3.0f*(2.0f*fC0*vB - fC1*vA) + vD; + vQuad[2] = 1.0f/3.0f*(fC0*vC - 2.0f*fC1*vB) + vE; + + vQuadB[2] *= -1.0f; + vQuad[2] *= -1.0f; + + BezierRaise(vQuad,vCubic); + + vOut[0*cX + 2*cY] = vCubic[0]; + vOut[1*cX + 2*cY] = vCubic[1]; + vOut[2*cX + 2*cY] = vCubic[2]; + vOut[3*cX + 2*cY] = vCubic[3]; + + BezierRaise(vQuadB,vCubic); + + vOut[0*cX + 1*cY] = vCubic[0]; + vOut[1*cX + 1*cY] = vCubic[1]; + vOut[2*cX + 1*cY] = vCubic[2]; + vOut[3*cX + 1*cY] = vCubic[3]; +} + +//-------------------------------------------------------------------------------------- +// Skinning vertex shader Section +//-------------------------------------------------------------------------------------- +struct VS_CONTROL_POINT_INPUT +{ + float3 vPosition : POSITION; + float2 vUV : TEXCOORD0; + float3 vTangent : TANGENT; + uint4 vBones : BONES; + float4 vWeights : WEIGHTS; +}; + +VS_CONTROL_POINT_OUTPUT PatchSkinningVS( VS_CONTROL_POINT_INPUT Input ) +{ + VS_CONTROL_POINT_OUTPUT Output; + + float4 vInputPos = float4( Input.vPosition, 1 ); + float4 vWorldPos = float4( 0, 0, 0, 0 ); + + vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.x ] ) * Input.vWeights.x; + vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.y ] ) * Input.vWeights.y; + vWorldPos += mul( vInputPos, g_mConstBoneWorld[ 
Input.vBones.z ] ) * Input.vWeights.z; + vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.w ] ) * Input.vWeights.w; + + float3 vWorldTan = float3( 0, 0, 0 ); + vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.x ] ) * Input.vWeights.x; + vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.y ] ) * Input.vWeights.y; + vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.z ] ) * Input.vWeights.z; + vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.w ] ) * Input.vWeights.w; + + Output.vPosition = vWorldPos; + Output.vUV = Input.vUV; + Output.vTangent = vWorldTan; + + return Output; +} + +struct VS_MESH_POINT_INPUT +{ + float3 vPosition : POSITION; + float2 vUV : TEXCOORD0; + float3 vNormal : NORMAL; + float3 vTangent : TANGENT; + uint4 vBones : BONES; + float4 vWeights : WEIGHTS; +}; + +struct VS_MESH_POINT_OUTPUT +{ + float3 vWorldPos : POSITION; + float3 vNormal : NORMAL; + float2 vUV : TEXCOORD; + float3 vTangent : TANGENT; + float3 vBiTangent : BITANGENT; + + float4 vPosition : SV_POSITION; +}; + +VS_MESH_POINT_OUTPUT MeshSkinningVS( VS_MESH_POINT_INPUT Input ) +{ + VS_MESH_POINT_OUTPUT Output; + + float4 vInputPos = float4( Input.vPosition, 1 ); + float4 vWorldPos = float4( 0, 0, 0, 0 ); + + vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.x ] ) * Input.vWeights.x; + vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.y ] ) * Input.vWeights.y; + vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.z ] ) * Input.vWeights.z; + vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.w ] ) * Input.vWeights.w; + + float3 vWorldTan = float3( 0, 0, 0 ); + vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.x ] ) * Input.vWeights.x; + vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.y ] ) * Input.vWeights.y; + vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ 
Input.vBones.z ] ) * Input.vWeights.z; + vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.w ] ) * Input.vWeights.w; + + float3 vWorldNormal = float3( 0, 0, 0 ); + vWorldNormal += mul( Input.vNormal, (float3x3)g_mConstBoneWorld[ Input.vBones.x ] ) * Input.vWeights.x; + vWorldNormal += mul( Input.vNormal, (float3x3)g_mConstBoneWorld[ Input.vBones.y ] ) * Input.vWeights.y; + vWorldNormal += mul( Input.vNormal, (float3x3)g_mConstBoneWorld[ Input.vBones.z ] ) * Input.vWeights.z; + vWorldNormal += mul( Input.vNormal, (float3x3)g_mConstBoneWorld[ Input.vBones.w ] ) * Input.vWeights.w; + + Output.vWorldPos = vWorldPos.xyz; + Output.vPosition = mul( float4( vWorldPos.xyz, 1 ), g_mViewProjection ); + Output.vUV = Input.vUV; + Output.vTangent = vWorldTan; + Output.vNormal = vWorldNormal; + Output.vBiTangent = cross( vWorldNormal, vWorldTan ); + + return Output; +} + +//-------------------------------------------------------------------------------------- +// SubD to Bezier hull shader Section +//-------------------------------------------------------------------------------------- +struct HS_CONSTANT_DATA_OUTPUT +{ + float Edges[4] : SV_TessFactor; + float Inside[2] : SV_InsideTessFactor; + + float3 vTangent[4] : TANGENT; + float2 vUV[4] : TEXCOORD; + float3 vTanUCorner[4] : TANUCORNER; + float3 vTanVCorner[4] : TANVCORNER; + float4 vCWts : TANWEIGHTS; +}; + +//-------------------------------------------------------------------------------------- +// Load per-patch valence and prefix data +//-------------------------------------------------------------------------------------- +void LoadValenceAndPrefixData( in uint PatchID, out uint Val[4], out uint Prefixes[4] ) +{ + PatchID += g_iPatchStartIndex; + uint4 ValPack = g_ValencePrefixBuffer.Load( PatchID * 2 ); + uint4 PrefPack = g_ValencePrefixBuffer.Load( PatchID * 2 + 1 ); + + Val[0] = ValPack.x; + Val[1] = ValPack.y; + Val[2] = ValPack.z; + Val[3] = ValPack.w; + + Prefixes[0] = PrefPack.x; + 
Prefixes[1] = PrefPack.y; + Prefixes[2] = PrefPack.z; + Prefixes[3] = PrefPack.w; +} + + +//-------------------------------------------------------------------------------------- +// Constant data function for the SubDToBezierHS. This is executed once per patch. +//-------------------------------------------------------------------------------------- +HS_CONSTANT_DATA_OUTPUT SubDToBezierConstantsHS( InputPatch ip, + uint PatchID : SV_PrimitiveID ) +{ + HS_CONSTANT_DATA_OUTPUT Output; + + float TessAmount = g_fTessellationFactor; + + Output.Edges[0] = Output.Edges[1] = Output.Edges[2] = Output.Edges[3] = TessAmount; + Output.Inside[0] = Output.Inside[1] = TessAmount; + + Output.vTangent[0] = ip[0].vTangent; + Output.vTangent[1] = ip[1].vTangent; + Output.vTangent[2] = ip[2].vTangent; + Output.vTangent[3] = ip[3].vTangent; + + Output.vUV[0] = ip[0].vUV; + Output.vUV[1] = ip[1].vUV; + Output.vUV[2] = ip[2].vUV; + Output.vUV[3] = ip[3].vUV; + + // Compute part of our tangent patch here + uint Val[4]; + uint Prefixes[4]; + LoadValenceAndPrefixData( PatchID, Val, Prefixes ); + + [unroll] + for( int i=0; i<4; i++ ) + { + float3 CornerB, CornerU, CornerV; + ComputeCornerVertex( i, CornerB, CornerU, CornerV, ip, Val, Prefixes ); + Output.vTanUCorner[i] = CornerU; + Output.vTanVCorner[i] = CornerV; + } + + float fCWts[4]; + Output.vCWts.x = g_fCi[ Val[0]-3 ]; + Output.vCWts.y = g_fCi[ Val[1]-3 ]; + Output.vCWts.z = g_fCi[ Val[2]-3 ]; + Output.vCWts.w = g_fCi[ Val[3]-3 ]; + + return Output; +} + +HS_CONSTANT_DATA_OUTPUT SubDToBezierConstantsHS4444( InputPatch ip, + uint PatchID : SV_PrimitiveID ) +{ + HS_CONSTANT_DATA_OUTPUT Output; + + float TessAmount = g_fTessellationFactor; + + Output.Edges[0] = Output.Edges[1] = Output.Edges[2] = Output.Edges[3] = TessAmount; + Output.Inside[0] = Output.Inside[1] = TessAmount; + + Output.vTangent[0] = ip[0].vTangent; + Output.vTangent[1] = ip[1].vTangent; + Output.vTangent[2] = ip[2].vTangent; + Output.vTangent[3] = ip[3].vTangent; + + 
Output.vUV[0] = ip[0].vUV; + Output.vUV[1] = ip[1].vUV; + Output.vUV[2] = ip[2].vUV; + Output.vUV[3] = ip[3].vUV; + + // Compute part of our tangent patch here + static const uint Val[4] = (uint[4])uint4(4,4,4,4); + static const uint Prefixes[4] = (uint[4])uint4(7,10,13,16); + + [unroll] + for( int i=0; i<4; i++ ) + { + float3 CornerB, CornerU, CornerV; + ComputeCornerVertex4444( i, CornerB, CornerU, CornerV, ip, Val, Prefixes ); + Output.vTanUCorner[i] = CornerU; + Output.vTanVCorner[i] = CornerV; + } + + float fCWts[4]; + Output.vCWts.x = g_fCi[ Val[0]-3 ]; + Output.vCWts.y = g_fCi[ Val[1]-3 ]; + Output.vCWts.z = g_fCi[ Val[2]-3 ]; + Output.vCWts.w = g_fCi[ Val[3]-3 ]; + + return Output; +} + + +//-------------------------------------------------------------------------------------- +// HS for SubDToBezier. This outputcontrolpoints(16) specifies that we will produce +// 16 control points. Therefore this function will be invoked 16x, one for each output +// control point. +// +// !! PERFORMANCE NOTE: This hull shader is written for maximum readability, and its +// performance is not expected to be optimal on D3D11 hardware. The switch statement +// below that determines the codepath for each patch control point generates sub-optimal +// code for parallel execution on the GPU. A future implementation of this hull shader +// will combine the 16 codepaths and 3 variants (corner, edge, interior) into one shared +// codepath; this change is expected to increase performance at the expense of readability. 
+//-------------------------------------------------------------------------------------- +[domain("quad")] +[partitioning("integer")] +[outputtopology("triangle_cw")] +[outputcontrolpoints(16)] +[patchconstantfunc("SubDToBezierConstantsHS")] +BEZIER_CONTROL_POINT SubDToBezierHS( InputPatch p, + uint i : SV_OutputControlPointID, + uint PatchID : SV_PrimitiveID ) +{ + // Valences and prefixes are loaded from a buffer + uint Val[4]; + uint Prefixes[4]; + LoadValenceAndPrefixData( PatchID, Val, Prefixes ); + + float3 CornerB = float3(0,0,0); + float3 CornerU = float3(0,0,0); + float3 CornerV = float3(0,0,0); + + BEZIER_CONTROL_POINT Output; + Output.vPosition = float3(0,0,0); + + // !! PERFORMANCE NOTE: As mentioned above, this switch statement generates + // inefficient code for the sake of readability. + switch( i ) + { + // Interior vertices + case 5: + Output.vPosition = ComputeInteriorVertex( 0, Val, p ); + break; + case 6: + Output.vPosition = ComputeInteriorVertex( 1, Val, p ); + break; + case 10: + Output.vPosition = ComputeInteriorVertex( 2, Val, p ); + break; + case 9: + Output.vPosition = ComputeInteriorVertex( 3, Val, p ); + break; + + // Corner vertices + case 0: + ComputeCornerVertex( 0, CornerB, CornerU, CornerV, p, Val, Prefixes ); + Output.vPosition = CornerB; + break; + case 3: + ComputeCornerVertex( 1, CornerB, CornerU, CornerV, p, Val, Prefixes ); + Output.vPosition = CornerB; + break; + case 15: + ComputeCornerVertex( 2, CornerB, CornerU, CornerV, p, Val, Prefixes ); + Output.vPosition = CornerB; + break; + case 12: + ComputeCornerVertex( 3, CornerB, CornerU, CornerV, p, Val, Prefixes ); + Output.vPosition = CornerB; + break; + + // Edge vertices + case 1: + Output.vPosition = ComputeEdgeVertex( 0, p, Val, Prefixes ); + break; + case 2: + Output.vPosition = ComputeEdgeVertex( 1, p, Val, Prefixes ); + break; + case 13: + Output.vPosition = ComputeEdgeVertex( 2, p, Val, Prefixes ); + break; + case 14: + Output.vPosition = ComputeEdgeVertex( 3, p, 
Val, Prefixes ); + break; + case 4: + Output.vPosition = ComputeEdgeVertex( 4, p, Val, Prefixes ); + break; + case 8: + Output.vPosition = ComputeEdgeVertex( 5, p, Val, Prefixes ); + break; + case 7: + Output.vPosition = ComputeEdgeVertex( 6, p, Val, Prefixes ); + break; + case 11: + Output.vPosition = ComputeEdgeVertex( 7, p, Val, Prefixes ); + break; + } + + return Output; +} + +//-------------------------------------------------------------------------------------- +// Specialised version for Regular (4,4,4,4) patches, this is much simpler and has less +// branching compared to the general one above +//-------------------------------------------------------------------------------------- +[domain("quad")] +[partitioning("integer")] +[outputtopology("triangle_cw")] +[outputcontrolpoints(16)] +[patchconstantfunc("SubDToBezierConstantsHS4444")] +BEZIER_CONTROL_POINT SubDToBezierHS4444( InputPatch p, + uint i : SV_OutputControlPointID, + uint PatchID : SV_PrimitiveID ) +{ + // Valences and prefixes are Constant for this case (4,4,4,4) + static const uint Val[4] = (uint[4])uint4(4,4,4,4); + static const uint Prefixes[4] = (uint[4])uint4(7,10,13,16); + + float3 CornerB = float3(0,0,0); + float3 CornerU = float3(0,0,0); + float3 CornerV = float3(0,0,0); + + BEZIER_CONTROL_POINT Output; + Output.vPosition = float3(0,0,0); + + // !! PERFORMANCE NOTE: As mentioned above, this switch statement generates + // inefficient code for the sake of readability. 
+ switch( i ) + { + // Interior vertices + case 5: + Output.vPosition = ComputeInteriorVertex( 0, Val, p ); + break; + case 6: + Output.vPosition = ComputeInteriorVertex( 1, Val, p ); + break; + case 10: + Output.vPosition = ComputeInteriorVertex( 2, Val, p ); + break; + case 9: + Output.vPosition = ComputeInteriorVertex( 3, Val, p ); + break; + + // Corner vertices + case 0: + ComputeCornerVertex4444( 0, CornerB, CornerU, CornerV, p, Val, Prefixes ); + Output.vPosition = CornerB; + break; + case 3: + ComputeCornerVertex4444( 1, CornerB, CornerU, CornerV, p, Val, Prefixes ); + Output.vPosition = CornerB; + break; + case 15: + ComputeCornerVertex4444( 2, CornerB, CornerU, CornerV, p, Val, Prefixes ); + Output.vPosition = CornerB; + break; + case 12: + ComputeCornerVertex4444( 3, CornerB, CornerU, CornerV, p, Val, Prefixes ); + Output.vPosition = CornerB; + break; + + // Edge vertices + case 1: + Output.vPosition = ComputeEdgeVertex( 0, p, Val, Prefixes ); + break; + case 2: + Output.vPosition = ComputeEdgeVertex( 1, p, Val, Prefixes ); + break; + case 13: + Output.vPosition = ComputeEdgeVertex( 2, p, Val, Prefixes ); + break; + case 14: + Output.vPosition = ComputeEdgeVertex( 3, p, Val, Prefixes ); + break; + case 4: + Output.vPosition = ComputeEdgeVertex( 4, p, Val, Prefixes ); + break; + case 8: + Output.vPosition = ComputeEdgeVertex( 5, p, Val, Prefixes ); + break; + case 7: + Output.vPosition = ComputeEdgeVertex( 6, p, Val, Prefixes ); + break; + case 11: + Output.vPosition = ComputeEdgeVertex( 7, p, Val, Prefixes ); + break; + } + + return Output; +} + + +//-------------------------------------------------------------------------------------- +// Bezier evaluation domain shader section +//-------------------------------------------------------------------------------------- +struct DS_OUTPUT +{ + float3 vWorldPos : POSITION; + float3 vNormal : NORMAL; + float2 vUV : TEXCOORD; + float3 vTangent : TANGENT; + float3 vBiTangent : BITANGENT; + + float4 vPosition : 
SV_POSITION; +}; + +//-------------------------------------------------------------------------------------- +float4 BernsteinBasis(float t) +{ + float invT = 1.0f - t; + + return float4( invT * invT * invT, + 3.0f * t * invT * invT, + 3.0f * t * t * invT, + t * t * t ); +} + +//-------------------------------------------------------------------------------------- +float4 dBernsteinBasis(float t) +{ + float invT = 1.0f - t; + + return float4( -3 * invT * invT, + 3 * invT * invT - 6 * t * invT, + 6 * t * invT - 3 * t * t, + 3 * t * t ); +} + +//-------------------------------------------------------------------------------------- +float3 EvaluateBezier( const OutputPatch bezpatch, + float4 BasisU, + float4 BasisV ) +{ + float3 Value = float3(0,0,0); + Value = BasisV.x * ( bezpatch[0].vPosition * BasisU.x + bezpatch[1].vPosition * BasisU.y + bezpatch[2].vPosition * BasisU.z + bezpatch[3].vPosition * BasisU.w ); + Value += BasisV.y * ( bezpatch[4].vPosition * BasisU.x + bezpatch[5].vPosition * BasisU.y + bezpatch[6].vPosition * BasisU.z + bezpatch[7].vPosition * BasisU.w ); + Value += BasisV.z * ( bezpatch[8].vPosition * BasisU.x + bezpatch[9].vPosition * BasisU.y + bezpatch[10].vPosition * BasisU.z + bezpatch[11].vPosition * BasisU.w ); + Value += BasisV.w * ( bezpatch[12].vPosition * BasisU.x + bezpatch[13].vPosition * BasisU.y + bezpatch[14].vPosition * BasisU.z + bezpatch[15].vPosition * BasisU.w ); + + return Value; +} + +//-------------------------------------------------------------------------------------- +float3 EvaluateBezierTan( const float3 bezpatch[16], + float4 BasisU, + float4 BasisV ) +{ + float3 Value = float3(0,0,0); + Value = BasisV.x * ( bezpatch[0] * BasisU.x + bezpatch[1] * BasisU.y + bezpatch[2] * BasisU.z + bezpatch[3] * BasisU.w ); + Value += BasisV.y * ( bezpatch[4] * BasisU.x + bezpatch[5] * BasisU.y + bezpatch[6] * BasisU.z + bezpatch[7] * BasisU.w ); + Value += BasisV.z * ( bezpatch[8] * BasisU.x + bezpatch[9] * BasisU.y + bezpatch[10] * 
BasisU.z + bezpatch[11] * BasisU.w ); + Value += BasisV.w * ( bezpatch[12] * BasisU.x + bezpatch[13] * BasisU.y + bezpatch[14] * BasisU.z + bezpatch[15] * BasisU.w ); + + return Value; +} + +//-------------------------------------------------------------------------------------- +// Computes the two full tangent patches (TanU and TanV) from the tangent corner data created in the +// HS constant data function. +//-------------------------------------------------------------------------------------- +void CreatTangentPatches( in HS_CONSTANT_DATA_OUTPUT input, + const OutputPatch bezpatch, + out float3 TanU[16], + out float3 TanV[16] ) +{ + TanV[0] = input.vTanVCorner[0]; + TanV[3] = input.vTanVCorner[1]; + TanV[15] = input.vTanVCorner[2]; + TanV[12] = input.vTanVCorner[3]; + + TanU[0] = input.vTanUCorner[0]; + TanU[3] = input.vTanUCorner[1]; + TanU[15] = input.vTanUCorner[2]; + TanU[12] = input.vTanUCorner[3]; + + float fCWts[4]; + fCWts[0] = input.vCWts.x; + fCWts[1] = input.vCWts.y; + fCWts[2] = input.vCWts.z; + fCWts[3] = input.vCWts.w; + + float3 vCorner[4]; + float3 vCornerLocal[4]; + + vCorner[0] = TanV[0]; + vCorner[1] = TanV[3]; + vCorner[2] = TanV[15]; + vCorner[3] = TanV[12]; + vCornerLocal[0] = TanU[0]; + vCornerLocal[1] = TanU[3]; + vCornerLocal[2] = TanU[12]; + vCornerLocal[3] = TanU[15]; + + ComputeTanPatch( bezpatch, TanU, fCWts, vCorner, vCornerLocal, 1, 4 ); + + fCWts[3] = input.vCWts.y; + fCWts[1] = input.vCWts.w; + + vCorner[0] = TanU[0]; + vCorner[3] = TanU[3]; + vCorner[2] = TanU[15]; + vCorner[1] = TanU[12]; + vCornerLocal[0] = TanV[0]; + vCornerLocal[1] = TanV[12]; + vCornerLocal[2] = TanV[3]; + vCornerLocal[3] = TanV[15]; + + ComputeTanPatch( bezpatch, TanV, fCWts, vCorner, vCornerLocal, 4, 1 ); +} + +//-------------------------------------------------------------------------------------- +// For each input UV (from the Tessellator), evaluate the Bezier patch at this position. 
+//-------------------------------------------------------------------------------------- +[domain("quad")] +DS_OUTPUT BezierEvalDS( HS_CONSTANT_DATA_OUTPUT input, + float2 UV : SV_DomainLocation, + const OutputPatch bezpatch ) +{ + float4 BasisU = BernsteinBasis( UV.x ); + float4 BasisV = BernsteinBasis( UV.y ); + + float3 WorldPos = EvaluateBezier( bezpatch, BasisU, BasisV ); + + float3 TanU[16]; + float3 TanV[16]; + CreatTangentPatches( input, bezpatch, TanU, TanV ); + float3 Tangent = EvaluateBezierTan( TanU, BasisU, BasisV ); + float3 BiTangent = EvaluateBezierTan( TanV, BasisU, BasisV ); + + // To see what the patch looks like without using the tangent patches to fix the normals, uncomment this section + /* + float4 dBasisU = dBernsteinBasis( UV.x ); + float4 dBasisV = dBernsteinBasis( UV.y ); + Tangent = EvaluateBezier( bezpatch, dBasisU, BasisV ); + BiTangent = EvaluateBezier( bezpatch, BasisU, dBasisV ); + */ + + float3 Norm = normalize( cross( Tangent, BiTangent ) ); + + DS_OUTPUT Output; + Output.vNormal = Norm; + + // Evaluate the tangent vectors through bilinear interpolation. + // These tangents are the texture-space tangents. They should not be confused with the parametric + // tangents that we use to get the normals for the bicubic patch. + float3 TextureTanU0 = input.vTangent[0]; + float3 TextureTanU1 = input.vTangent[1]; + float3 TextureTanU2 = input.vTangent[2]; + float3 TextureTanU3 = input.vTangent[3]; + + float3 UVbottom = lerp( TextureTanU0, TextureTanU1, UV.x ); + float3 UVtop = lerp( TextureTanU3, TextureTanU2, UV.x ); + float3 Tan = lerp( UVbottom, UVtop, UV.y ); + + Output.vTangent = Tan; + + // This is an optimization. We assume that the UV mapping of the mesh will result in a "relatively" orthogonal + // tangent basis. If we assume this, then we can avoid fetching and bilerping the BiTangent along with the tangent. 
+ Output.vBiTangent = cross( Norm, Tan ); + + // bilerp the texture coordinates + float2 tex0 = input.vUV[0]; + float2 tex1 = input.vUV[1]; + float2 tex2 = input.vUV[2]; + float2 tex3 = input.vUV[3]; + + float2 bottom = lerp( tex0, tex1, UV.x ); + float2 top = lerp( tex3, tex2, UV.x ); + float2 TexUV = lerp( bottom, top, UV.y ); + Output.vUV = TexUV; + + if( g_fDisplacementHeight > 0 ) + { + // In this sample, displacement can go into or out of the mesh. This is why we bias the height amount. + float height = g_fDisplacementHeight * ( g_txHeight.SampleLevel( g_samPoint, TexUV, 0 ).a * 2 - 1 ); + float3 WorldPosMiddle = Norm * height; + WorldPos += WorldPosMiddle; + } + + Output.vPosition = mul( float4(WorldPos,1), g_mViewProjection ); + Output.vWorldPos = WorldPos; + + return Output; +} + +//-------------------------------------------------------------------------------------- +// Smooth shading pixel shader section +//-------------------------------------------------------------------------------------- + +float3 safe_normalize( float3 vInput ) +{ + float len2 = dot( vInput, vInput ); + if( len2 > 0 ) + { + return vInput * rsqrt( len2 ); + } + return vInput; +} + +static const float g_fSpecularExponent = 32.0f; +static const float g_fSpecularIntensity = 0.6f; +static const float g_fNormalMapIntensity = 1.5f; + +float2 ComputeDirectionalLight( float3 vWorldPos, float3 vWorldNormal, float3 vDirLightDir ) +{ + // Result.x is diffuse illumination, Result.y is specular illumination + float2 Result = float2( 0, 0 ); + Result.x = pow( saturate( dot( vWorldNormal, -vDirLightDir ) ), 2 ); + + float3 vPointToCamera = normalize( g_vCameraPosWorld - vWorldPos ); + float3 vHalfAngle = normalize( vPointToCamera - vDirLightDir ); + Result.y = pow( saturate( dot( vHalfAngle, vWorldNormal ) ), g_fSpecularExponent ); + + return Result; +} + +float3 ColorGamma( float3 Input ) +{ + return pow( Input, 2.2f ); +} + +float4 SmoothPS( PS_INPUT Input ) : SV_TARGET +{ + float4 
vNormalMapSampleRaw = g_txHeight.Sample( g_samLinear, Input.vUV ); + float3 vNormalMapSampleBiased = ( vNormalMapSampleRaw.xyz * 2 ) - 1; + vNormalMapSampleBiased.xy *= g_fNormalMapIntensity; + float3 vNormalMapSample = normalize( vNormalMapSampleBiased ); + + float3 vNormal = safe_normalize( Input.vNormal ) * vNormalMapSample.z; + vNormal += safe_normalize( Input.vTangent ) * vNormalMapSample.x; + vNormal += safe_normalize( Input.vBiTangent ) * vNormalMapSample.y; + + //float3 vColor = float3( 1, 1, 1 ); + float3 vColor = g_txDiffuse.Sample( g_samLinear, Input.vUV ).rgb; + float vSpecular = g_txSpecular.Sample( g_samLinear, Input.vUV ).r * g_fSpecularIntensity; + + const float3 DirLightDirections[4] = + { + // key light + normalize( float3( -63.345150, -58.043934, 27.785097 ) ), + // fill light + normalize( float3( 23.652107, -17.391443, 54.972504 ) ), + // back light 1 + normalize( float3( 20.470509, -22.939510, -33.929531 ) ), + // back light 2 + normalize( float3( -31.003685, 24.242104, -41.352859 ) ), + }; + + const float3 DirLightColors[4] = + { + // key light + ColorGamma( float3( 1.0f, 0.964f, 0.706f ) * 1.0f ), + // fill light + ColorGamma( float3( 0.446f, 0.641f, 1.0f ) * 1.0f ), + // back light 1 + ColorGamma( float3( 1.0f, 0.862f, 0.419f ) * 1.0f ), + // back light 2 + ColorGamma( float3( 0.405f, 0.630f, 1.0f ) * 1.0f ), + }; + + float3 fLightColor = 0; + for( int i = 0; i < 4; ++i ) + { + float2 LightDiffuseSpecular = ComputeDirectionalLight( Input.vWorldPos, vNormal, DirLightDirections[i] ); + fLightColor += DirLightColors[i] * vColor * LightDiffuseSpecular.x; + fLightColor += DirLightColors[i] * LightDiffuseSpecular.y * vSpecular; + } + + return float4( fLightColor, 1 ); +} + +//-------------------------------------------------------------------------------------- +// Solid color shading pixel shader (used for wireframe overlay) +//-------------------------------------------------------------------------------------- +float4 SolidColorPS( PS_INPUT 
Input ) : SV_TARGET +{ + return float4( g_vSolidColor, 1 ); +} diff --git a/tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl b/tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl new file mode 100644 index 000000000..c4401f010 --- /dev/null +++ b/tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl @@ -0,0 +1,211 @@ +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSBlurX -entry PSBlurY +//-------------------------------------------------------------------------------------- +// File: 2DQuadShaders.hlsl +// +// Full-screen quad vertex shader and separable blur pixel shaders for the VarianceShadows11 sample. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//-------------------------------------------------------------------------------------- + +#ifndef SEPERABLE_BLUR_KERNEL_SIZE +#define SEPERABLE_BLUR_KERNEL_SIZE 3 +#endif + +static const int BLUR_KERNEL_BEGIN = SEPERABLE_BLUR_KERNEL_SIZE / -2; +static const int BLUR_KERNEL_END = SEPERABLE_BLUR_KERNEL_SIZE / 2 + 1; +static const float FLOAT_BLUR_KERNEL_SIZE = (float)SEPERABLE_BLUR_KERNEL_SIZE; + +cbuffer cbblurVS : register( b2) +{ + int2 g_iWidthHeight : packoffset( c0 ); + int g_iKernelStart : packoffset( c0.z ); + int g_iKernelEnd : packoffset( c0.w ); +}; + +//-------------------------------------------------------------------------------------- +// Shadow texture array and its sampler +//-------------------------------------------------------------------------------------- + +Texture2DArray g_txShadow : register( t5 ); +SamplerState g_samShadow : register( s5 ); + +//-------------------------------------------------------------------------------------- +// Input/Output structures +//-------------------------------------------------------------------------------------- + +struct PSIn +{ + float4 Pos : SV_Position; //Position + float2 Tex : TEXCOORD; //Texture coordinate + float2 ITex : TEXCOORD2; +}; + +struct VSIn +{ + uint Pos : SV_VertexID ; +}; + + +PSIn VSMain(VSIn inn) +{ + PSIn output; + + output.Pos.y = -1.0f + 
(inn.Pos%2) * 2.0f ; + output.Pos.x = -1.0f + (inn.Pos/2) * 2.0f; + output.Pos.z = .5; + output.Pos.w = 1; + output.Tex.x = inn.Pos/2; + output.Tex.y = 1.0f - inn.Pos%2; + output.ITex.x = (float)(g_iWidthHeight.x * output.Tex.x); + output.ITex.y = (float)(g_iWidthHeight.y * output.Tex.y); + return output; +} + +//float PSDepth + +//------------------------------------------------------------------------------ +// Logarithmic filtering +//------------------------------------------------------------------------------ + +float log_conv ( float x0, float X, float y0, float Y ) +{ + return (X + log(x0 + (y0 * exp(Y - X)))); +} + + +//-------------------------------------------------------------------------------------- +// Pixel shader for the horizontal pass of the separable blur: averages g_txShadow samples offset along X +//-------------------------------------------------------------------------------------- +float2 PSBlurX(PSIn input) : SV_Target +{ +/* + float2 centerDistance; + if ( input.Tex.x < .5 ) centerDistance.x = (1.0 - input.Tex.x); + else centerDistance.x = input.Tex.x; + if ( input.Tex.y < .5 ) centerDistance.y = (1.0 - input.Tex.y); + else centerDistance.y = input.Tex.y; + if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y; + centerDistance.x -= .2; + centerDistance.x *= (1.0f / .8); + + float store_samples[8]; + int ind = 0; + for (int x = g_iKernelStart; x < g_iKernelEnd; ++x) { + store_samples[ind] = g_txShadow.Load( int3(input.ITex.x+(float)x * centerDistance.x , input.ITex.y, 0) ).r; + ind++; + } + const float c = (1.f/5.f); + + float accum; + accum = log_conv( c, store_samples[0], c, store_samples[1] ); + + ind = 0; + for (x = g_iKernelStart - 2; x < g_iKernelEnd; ++x) { + ind++; + accum += log_conv( 1.0f, accum, c, store_samples[ind] ); + } + float2 rt; + rt.x = accum; + return rt; + */ + /* + float2 dep = 0; + float2 centerDistance; + if ( input.Tex.x < .5 ) centerDistance.x = (1.0 - input.Tex.x); + else centerDistance.x = input.Tex.x; + if ( input.Tex.y < .5 ) 
centerDistance.y = (1.0 - input.Tex.y); + else centerDistance.y = input.Tex.y; + if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y; + centerDistance.x -= .2; + centerDistance.x *= ( 1.0f / 0.8f ); + + for (int x = g_iKernelStart; x < g_iKernelEnd; ++x) { + dep += g_txShadow.Load( int3(input.ITex.x+(float)x * centerDistance.x , input.ITex.y, 0) ).rg; + } + dep /= (g_iKernelEnd - g_iKernelStart); + return dep; + */ + + float2 dep=0; + [unroll]for ( int x = BLUR_KERNEL_BEGIN; x < BLUR_KERNEL_END; ++x ) { + dep += g_txShadow.Sample( g_samShadow, float3( input.Tex.x, input.Tex.y, 0 ), int2( x,0 ) ).rg; + } + dep /= FLOAT_BLUR_KERNEL_SIZE; + return dep; + +// return g_txShadow.Sample(g_samShadow, float3(input.Tex.x, input.Tex.y, 0) ).rg; + +} + +//-------------------------------------------------------------------------------------- +// Pixel shader for the vertical pass of the separable blur: averages g_txShadow samples offset along Y +//-------------------------------------------------------------------------------------- +float2 PSBlurY(PSIn input) : SV_Target +{ +/* + float2 centerDistance; + if ( input.Tex.x < .5 ) centerDistance.x = (1.0 - input.Tex.x); + else centerDistance.x = input.Tex.x; + if ( input.Tex.y < .5 ) centerDistance.y = (1.0 - input.Tex.y); + else centerDistance.y = input.Tex.y; + if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y; + centerDistance.x -= .2; + centerDistance.x *= (1.0f / .8); + + float store_samples[8]; + int ind = 0; + for (int y = g_iKernelStart; y < g_iKernelEnd; ++y) { + store_samples[ind] = g_txShadow.Load( int3(input.ITex.x, input.ITex.y+(float)y * centerDistance.x, 0) ).r; + } + const float c = (1.f/5.f); + + float accum; + accum = log_conv( c, store_samples[0], c, store_samples[1] ); + + ind = 0; + for (y = g_iKernelStart; y < g_iKernelEnd; ++y) { + ind++; + accum += log_conv( 1.0f, accum, c, store_samples[ind] ); + } + float2 rt; + rt.x = accum; + return rt; + */ + + + /* + float2 dep = 0; + + float2 
centerDistance; + if ( input.Tex.x < .5 ) centerDistance.x = (1.0 - input.Tex.x); + else centerDistance.x = input.Tex.x; + if ( input.Tex.y < .5 ) centerDistance.y = (1.0 - input.Tex.y); + else centerDistance.y = input.Tex.y; + if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y; + centerDistance.x -= 0; + centerDistance.x *= (1.0f / 1.0f); + + if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y; + for (int y = g_iKernelStart; y < g_iKernelEnd; ++y) { + dep += g_txShadow.Load( int3(input.ITex.x, input.ITex.y+(float)y * centerDistance.x, 0) ).rg; + } + + + dep /= (g_iKernelEnd - g_iKernelStart); + return dep; + + */ + + + float2 dep=0; + [unroll]for ( int y = BLUR_KERNEL_BEGIN; y < BLUR_KERNEL_END; ++y ) { + dep += g_txShadow.Sample( g_samShadow, float3( input.Tex.x, input.Tex.y, 0 ), int2( 0,y ) ).rg; + } + dep /= FLOAT_BLUR_KERNEL_SIZE; + return dep; + + //return g_txShadow.Sample(g_samShadow, float3(input.Tex.x, input.Tex.y, 0) ).rg; +} + + + diff --git a/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl new file mode 100644 index 000000000..0b2e43b5c --- /dev/null +++ b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl @@ -0,0 +1,412 @@ +//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues. +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSBlurX -entry PSBlurY +//-------------------------------------------------------------------------------------- +// File: RenderCascadeScene.hlsl +// +// This is the main shader file. This shader is compiled with several different flags +// to provide different customizations based on user controls. +// +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//-------------------------------------------------------------------------------------- + +//-------------------------------------------------------------------------------------- +// Globals +//-------------------------------------------------------------------------------------- + +// This flag enables the shadow to blend between cascades. This is most useful when the +// shadow maps are small and artifacts can be seen between the various cascade layers. +#ifndef BLEND_BETWEEN_CASCADE_LAYERS_FLAG +#define BLEND_BETWEEN_CASCADE_LAYERS_FLAG 0 +#endif + +// There are two methods for selecting the proper cascade a fragment lies in. Interval selection +// compares the depth of the fragment against the frustum's depth partition. +// Map based selection compares the texture coordinates against the actual cascade maps. +// Map based selection gives better coverage. +// Interval based selection is easier to extend and understand. +#ifndef SELECT_CASCADE_BY_INTERVAL_FLAG +#define SELECT_CASCADE_BY_INTERVAL_FLAG 0 +#endif + +// The number of cascades +#ifndef CASCADE_COUNT_FLAG +#define CASCADE_COUNT_FLAG 3 +#endif + + +// Most titles will find that 3-4 cascades with +// BLEND_BETWEEN_CASCADE_LAYERS_FLAG, is good for lower end PCs. + +cbuffer cbAllShadowData : register( b0 ) +{ + matrix m_mWorldViewProjection; + matrix m_mWorld; + matrix m_mWorldView; + matrix m_mShadow; + float4 m_vCascadeOffset[8]; + float4 m_vCascadeScale[8]; + int m_nCascadeLevels; // Number of Cascades + int m_iVisualizeCascades; // 1 is to visualize the cascades in different colors. 0 is to just draw the scene + + // For Map based selection scheme, this keeps the pixels inside of the valid range. + // When there is no border, these values are 0 and 1 respectively. + float m_fMinBorderPadding; + float m_fMaxBorderPadding; + + float m_fCascadeBlendArea; // Amount to overlap when blending between cascades. 
+ float m_fTexelSize; // Padding variables exist because CBs must be a multiple of 16 bytes. + float m_fNativeTexelSizeInX; + float4 m_fCascadeFrustumsEyeSpaceDepthsData[2]; // The values along Z that separate the cascades. + // This code creates an array based pointer that points towards the vectorized input data. + // This is the only way to index arbitrary arrays of data. + // If the array is used at run time, the compiler will generate code that uses logic to index the correct component. + + static float m_fCascadeFrustumsEyeSpaceDepths[8] = (float[8])m_fCascadeFrustumsEyeSpaceDepthsData; + + float3 m_vLightDir; + float m_fPaddingCB4; + +}; + + + +//-------------------------------------------------------------------------------------- +// Textures and Samplers +//-------------------------------------------------------------------------------------- +Texture2D g_txDiffuse : register( t0 ); +Texture2DArray g_txShadow : register( t5 ); + +SamplerState g_samLinear : register( s0 ); +SamplerState g_samShadow : register( s5 ); + +//-------------------------------------------------------------------------------------- +// Input / Output structures +//-------------------------------------------------------------------------------------- +struct VS_INPUT +{ + float4 vPosition : POSITION; + float3 vNormal : NORMAL; + float2 vTexcoord : TEXCOORD0; +}; + +struct VS_OUTPUT +{ + float3 vNormal : NORMAL; + float2 vTexcoord : COLOR0; + float4 vTexShadow : TEXCOORD1; + float4 vPosition : SV_POSITION; + float4 vInterpPos : TEXCOORD2; + float vDepth : TEXCOORD3; +}; + +//-------------------------------------------------------------------------------------- +// Vertex Shader +//-------------------------------------------------------------------------------------- +VS_OUTPUT VSMain( VS_INPUT Input ) +{ + VS_OUTPUT Output; + + Output.vPosition = mul( Input.vPosition, m_mWorldViewProjection ); + Output.vNormal = mul( Input.vNormal, (float3x3)m_mWorld ); + Output.vTexcoord = 
Input.vTexcoord; + Output.vInterpPos = Input.vPosition; + Output.vDepth = mul( Input.vPosition, m_mWorldView ).z ; + + // Transform the shadow texture coordinates for all the cascades. + Output.vTexShadow = mul( Input.vPosition, m_mShadow ); + + return Output; +} + + + +static const float4 vCascadeColorsMultiplier[8] = +{ + float4 ( 1.5f, 0.0f, 0.0f, 1.0f ), + float4 ( 0.0f, 1.5f, 0.0f, 1.0f ), + float4 ( 0.0f, 0.0f, 5.5f, 1.0f ), + float4 ( 1.5f, 0.0f, 5.5f, 1.0f ), + float4 ( 1.5f, 1.5f, 0.0f, 1.0f ), + float4 ( 1.0f, 1.0f, 1.0f, 1.0f ), + float4 ( 0.0f, 1.0f, 5.5f, 1.0f ), + float4 ( 0.5f, 3.5f, 0.75f, 1.0f ) +}; + + +void ComputeCoordinatesTransform( in int iCascadeIndex, + in float4 InterpolatedPosition, + in out float4 vShadowTexCoord, + in out float4 vShadowTexCoordViewSpace ) +{ + // Now that we know the correct map, we can transform the world space position of the current fragment + if( SELECT_CASCADE_BY_INTERVAL_FLAG ) + { + vShadowTexCoord = vShadowTexCoordViewSpace * m_vCascadeScale[iCascadeIndex]; + vShadowTexCoord += m_vCascadeOffset[iCascadeIndex]; + } + vShadowTexCoord.w = vShadowTexCoord.z; // We put the z value in w so that we can index the texture array with Z. + vShadowTexCoord.z = iCascadeIndex; + +} + +//-------------------------------------------------------------------------------------- +// Sample the shadow map's filtered depth and depth-squared and derive a percent lit value from their variance. +//-------------------------------------------------------------------------------------- +void CalculateVarianceShadow ( in float4 vShadowTexCoord, in float4 vShadowMapTextureCoordViewSpace, int iCascade, out float fPercentLit ) +{ + fPercentLit = 0.0f; + // This loop could be unrolled, and texture immediate offsets could be used if the kernel size were fixed. + // This would be a performance improvement. 
+ + float2 mapDepth = 0; + + + // In order to pull the derivative out of divergent flow control we calculate the + // derivative off of the view space coordinates and then scale the derivative. + + float3 vShadowTexCoordDDX = + ddx(vShadowMapTextureCoordViewSpace ); + vShadowTexCoordDDX *= m_vCascadeScale[iCascade].xyz; + float3 vShadowTexCoordDDY = + ddy(vShadowMapTextureCoordViewSpace ); + vShadowTexCoordDDY *= m_vCascadeScale[iCascade].xyz; + + mapDepth += g_txShadow.SampleGrad( g_samShadow, vShadowTexCoord.xyz, + vShadowTexCoordDDX, + vShadowTexCoordDDY); + // The sample instruction uses gradients for some filters. + + float fAvgZ = mapDepth.x; // Filtered z + float fAvgZ2 = mapDepth.y; // Filtered z-squared + + if ( vShadowTexCoord.w <= fAvgZ ) // We put the z value in w so that we can index the texture array with Z. + { + fPercentLit = 1; + } + else + { + float variance = ( fAvgZ2 ) - ( fAvgZ * fAvgZ ); + variance = min( 1.0f, max( 0.0f, variance + 0.00001f ) ); + + float mean = fAvgZ; + float d = vShadowTexCoord.w - mean; // We put the z value in w so that we can index the texture array with Z. + float p_max = variance / ( variance + d*d ); + + // To combat light-bleeding, experiment with raising p_max to some power + // (Try values from 0.1 to 100.0, if you like.) + fPercentLit = pow( p_max, 4 ); + + } + +} + +//-------------------------------------------------------------------------------------- +// Calculate amount to blend between two cascades and the band where blending will occur. +//-------------------------------------------------------------------------------------- +void CalculateBlendAmountForInterval ( in int iNextCascadeIndex, + in out float fPixelDepth, + in out float fCurrentPixelsBlendBandLocation, + out float fBlendBetweenCascadesAmount + ) +{ + + // We need to calculate the band of the current shadow map where it will fade into the next cascade. + // We can then early out of the expensive PCF for loop. 
+ // + float fBlendInterval = m_fCascadeFrustumsEyeSpaceDepths[ iNextCascadeIndex - 1 ]; + if( iNextCascadeIndex > 1 ) + { + fPixelDepth -= m_fCascadeFrustumsEyeSpaceDepths[ iNextCascadeIndex-2 ]; + fBlendInterval -= m_fCascadeFrustumsEyeSpaceDepths[ iNextCascadeIndex-2 ]; + } + // The current pixel's blend band location will be used to determine when we need to blend and by how much. + fCurrentPixelsBlendBandLocation = fPixelDepth / fBlendInterval; + fCurrentPixelsBlendBandLocation = 1.0f - fCurrentPixelsBlendBandLocation; + // The fBlendBetweenCascadesAmount is our location in the blend band. + fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea; +} + + +//-------------------------------------------------------------------------------------- +// Calculate amount to blend between two cascades and the band where blending will occur. +//-------------------------------------------------------------------------------------- +void CalculateBlendAmountForMap ( in float4 vShadowMapTextureCoord, + in out float fCurrentPixelsBlendBandLocation, + out float fBlendBetweenCascadesAmount ) +{ + // Calculate the blend band for the map based selection. + float2 distanceToOne = float2 ( 1.0f - vShadowMapTextureCoord.x, 1.0f - vShadowMapTextureCoord.y ); + fCurrentPixelsBlendBandLocation = min( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ); + float fCurrentPixelsBlendBandLocation2 = min( distanceToOne.x, distanceToOne.y ); + fCurrentPixelsBlendBandLocation = + min( fCurrentPixelsBlendBandLocation, fCurrentPixelsBlendBandLocation2 ); + fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea; +} + +//-------------------------------------------------------------------------------------- +// Calculate the shadow based on several options and render the scene. 
+//-------------------------------------------------------------------------------------- + +float4 PSMain( VS_OUTPUT Input ) : SV_TARGET +{ + float4 vDiffuse = g_txDiffuse.Sample( g_samLinear, Input.vTexcoord ); + + + float4 vShadowMapTextureCoordViewSpace = 0.0f; + float4 vShadowMapTextureCoord = 0.0f; + float4 vShadowMapTextureCoord_blend = 0.0f; + + float4 vVisualizeCascadeColor = float4(0.0f,0.0f,0.0f,1.0f); + + float fPercentLit = 0.0f; + float fPercentLit_blend = 0.0f; + + int iCascadeFound = 0; + int iCurrentCascadeIndex=1; + int iNextCascadeIndex = 0; + + float fCurrentPixelDepth; + + // The interval based selection technique compares the pixel's depth against the frustum's cascade divisions. + fCurrentPixelDepth = Input.vDepth; + + // This for loop is not necessary when the frustum is uniformly divided and interval based selection is used. + // In this case fCurrentPixelDepth could be used as an array lookup into the correct frustum. + vShadowMapTextureCoordViewSpace = Input.vTexShadow; + + + if( SELECT_CASCADE_BY_INTERVAL_FLAG ) + { + iCurrentCascadeIndex = 0; + if (CASCADE_COUNT_FLAG > 1 ) + { + float4 vCurrentPixelDepth = Input.vDepth; + float4 fComparison = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsData[0]); + float4 fComparison2 = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsData[1]); + float fIndex = dot( + float4( CASCADE_COUNT_FLAG > 0, + CASCADE_COUNT_FLAG > 1, + CASCADE_COUNT_FLAG > 2, + CASCADE_COUNT_FLAG > 3) + , fComparison ) + + dot( + float4( + CASCADE_COUNT_FLAG > 4, + CASCADE_COUNT_FLAG > 5, + CASCADE_COUNT_FLAG > 6, + CASCADE_COUNT_FLAG > 7) + , fComparison2 ) ; + + fIndex = min( fIndex, CASCADE_COUNT_FLAG - 1 ); + iCurrentCascadeIndex = (int)fIndex; + } + } + + if ( !SELECT_CASCADE_BY_INTERVAL_FLAG ) + { + iCurrentCascadeIndex = 0; + if ( CASCADE_COUNT_FLAG == 1 ) + { + vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[0]; + vShadowMapTextureCoord += m_vCascadeOffset[0]; + } + if ( 
CASCADE_COUNT_FLAG > 1 ) { + for( int iCascadeIndex = 0; iCascadeIndex < CASCADE_COUNT_FLAG && iCascadeFound == 0; ++iCascadeIndex ) + { + vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iCascadeIndex]; + vShadowMapTextureCoord += m_vCascadeOffset[iCascadeIndex]; + + if ( min( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) > m_fMinBorderPadding + && max( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) < m_fMaxBorderPadding ) + { + iCurrentCascadeIndex = iCascadeIndex; + iCascadeFound = 1; + } + } + } + } + // Found the correct map. + vVisualizeCascadeColor = vCascadeColorsMultiplier[iCurrentCascadeIndex]; + + ComputeCoordinatesTransform( iCurrentCascadeIndex, Input.vInterpPos, vShadowMapTextureCoord, vShadowMapTextureCoordViewSpace ); + + if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1 ) + { + // Repeat the texture coordinate calculations for the next cascade. + // The next cascade index is used for blurring between maps. + iNextCascadeIndex = min ( CASCADE_COUNT_FLAG - 1, iCurrentCascadeIndex + 1 ); + if( !SELECT_CASCADE_BY_INTERVAL_FLAG ) + { + vShadowMapTextureCoord_blend = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iNextCascadeIndex]; + vShadowMapTextureCoord_blend += m_vCascadeOffset[iNextCascadeIndex]; + } + ComputeCoordinatesTransform( iNextCascadeIndex, Input.vInterpPos, vShadowMapTextureCoord_blend, vShadowMapTextureCoordViewSpace ); + } + float fBlendBetweenCascadesAmount = 1.0f; + float fCurrentPixelsBlendBandLocation = 1.0f; + + if( SELECT_CASCADE_BY_INTERVAL_FLAG ) + { + if( CASCADE_COUNT_FLAG > 1 && BLEND_BETWEEN_CASCADE_LAYERS_FLAG ) + { + CalculateBlendAmountForInterval ( iNextCascadeIndex, fCurrentPixelDepth, + fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount ); + + } + } + else + { + if( CASCADE_COUNT_FLAG > 1 && BLEND_BETWEEN_CASCADE_LAYERS_FLAG ) + { + CalculateBlendAmountForMap ( vShadowMapTextureCoord, + fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount ); + } + } + + // 
Because the Z coordinate specifies the texture array, + // the derivative will be 0 when there is no divergence + //float fDivergence = abs( ddy( vShadowMapTextureCoord.z ) ) + abs( ddx( vShadowMapTextureCoord.z ) ); + CalculateVarianceShadow ( vShadowMapTextureCoord, vShadowMapTextureCoordViewSpace, + iCurrentCascadeIndex, fPercentLit); + + // We repeat the calculation for the next cascade layer, when blending between maps. + if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1 ) + { + if( fCurrentPixelsBlendBandLocation < m_fCascadeBlendArea ) + { // the current pixel is within the blend band. + + // Because the Z coordinate specifies the texture array, + // the derivative will be 0 when there is no divergence + float fDivergence = abs( ddy( vShadowMapTextureCoord_blend.z ) ) + + abs( ddx( vShadowMapTextureCoord_blend.z) ); + CalculateVarianceShadow ( vShadowMapTextureCoord_blend, vShadowMapTextureCoordViewSpace, + iNextCascadeIndex, fPercentLit_blend ); + + // Blend the two calculated shadows by the blend amount. + fPercentLit = lerp( fPercentLit_blend, fPercentLit, fBlendBetweenCascadesAmount ); + + } + } + + if( !m_iVisualizeCascades ) vVisualizeCascadeColor = float4( 1.0f, 1.0f, 1.0f, 1.0f ); + + float3 vLightDir1 = float3( -1.0f, 1.0f, -1.0f ); + float3 vLightDir2 = float3( 1.0f, 1.0f, -1.0f ); + float3 vLightDir3 = float3( 0.0f, -1.0f, 0.0f ); + float3 vLightDir4 = float3( 1.0f, 1.0f, 1.0f ); + // Some ambient-like lighting. 
+ float fLighting = + saturate( dot( vLightDir1 , Input.vNormal ) )*0.05f + + saturate( dot( vLightDir2 , Input.vNormal ) )*0.05f + + saturate( dot( vLightDir3 , Input.vNormal ) )*0.05f + + saturate( dot( vLightDir4 , Input.vNormal ) )*0.05f ; + + float4 vShadowLighting = fLighting * 0.5f; + fLighting += saturate( dot( m_vLightDir , Input.vNormal ) ); + fLighting = lerp( vShadowLighting, fLighting, fPercentLit ); + + return fLighting * vVisualizeCascadeColor * vDiffuse; + +} + diff --git a/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl new file mode 100644 index 000000000..9837bf299 --- /dev/null +++ b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl @@ -0,0 +1,45 @@ +//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSMain + + +//-------------------------------------------------------------------------------------- +// Globals +//-------------------------------------------------------------------------------------- +cbuffer cbPerObject : register( b0 ) +{ + matrix g_mWorldViewProjection : packoffset( c0 ); +}; + +//-------------------------------------------------------------------------------------- +// Input / Output structures +//-------------------------------------------------------------------------------------- +struct VS_INPUT +{ + float4 vPosition : POSITION; +}; + +struct VS_OUTPUT +{ + float4 vPosition : SV_POSITION; +}; + +//-------------------------------------------------------------------------------------- +// Vertex Shader +//-------------------------------------------------------------------------------------- +VS_OUTPUT VSMain( VS_INPUT Input ) +{ + VS_OUTPUT Output; + + + Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection ); + + return Output; +} + + +float2 PSMain (VS_OUTPUT Input) : SV_TARGET +{ + float2 rt; + rt.x = Input.vPosition.z; + rt.y = rt.x * rt.x; + return rt; +} \ No newline at end of 
file diff --git a/tests/hlsl/simple/compute-numthreads.hlsl b/tests/hlsl/simple/compute-numthreads.hlsl new file mode 100644 index 000000000..3843c401f --- /dev/null +++ b/tests/hlsl/simple/compute-numthreads.hlsl @@ -0,0 +1,11 @@ +//TEST:COMPARE_HLSL: -no-checking -target dxbc-assembly -profile cs_5_0 -entry main + +// Confirm that we properly pass along the `numthreads` attribute on an entry point. Thread tid.x stores b[tid.x + 1] + 1.0f into b[tid.x]. + +RWStructuredBuffer<float> b; + +[numthreads(32,1,1)] +void main(uint3 tid : SV_DispatchThreadID) +{ + b[tid.x] = b[tid.x + 1] + 1.0f; +} \ No newline at end of file -- cgit v1.2.3