summaryrefslogtreecommitdiffstats
path: root/tests/hlsl
diff options
context:
space:
mode:
authorTim Foley <tfoley@nvidia.com>2017-06-09 11:34:21 -0700
committerTim Foley <tfoley@nvidia.com>2017-06-09 13:44:59 -0700
commitfcf83dbf9effab3bd98bad2b83b2468b7eb05cfd (patch)
tree41047c94883b86ec085a81597391ce3ef557cd43 /tests/hlsl
parent52e8d4b9a27ab0060f874c3a63ab531847be35c0 (diff)
Initial import of code.
Diffstat (limited to 'tests/hlsl')
-rw-r--r--tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl58
-rw-r--r--tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl109
-rw-r--r--tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl217
-rw-r--r--tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl56
-rw-r--r--tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl45
-rw-r--r--tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl628
-rw-r--r--tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl206
-rw-r--r--tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl411
-rw-r--r--tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h9
-rw-r--r--tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl2567
-rw-r--r--tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl1908
-rw-r--r--tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl72
-rw-r--r--tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx158
-rw-r--r--tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl51
-rw-r--r--tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl49
-rw-r--r--tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx181
-rw-r--r--tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeScene.hlsl506
-rw-r--r--tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl53
-rw-r--r--tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl75
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx23
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx23
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx46
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx54
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx76
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx67
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl3
-rw-r--r--tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx56
-rw-r--r--tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx69
-rw-r--r--tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx73
-rw-r--r--tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx117
-rw-r--r--tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx129
-rw-r--r--tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx191
-rw-r--r--tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx294
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h84
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h103
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl84
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h129
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl66
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx192
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h82
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h103
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h152
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl113
-rw-r--r--tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl65
-rw-r--r--tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx468
-rw-r--r--tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl75
-rw-r--r--tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl529
-rw-r--r--tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl112
-rw-r--r--tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl64
-rw-r--r--tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl29
-rw-r--r--tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl73
-rw-r--r--tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl79
-rw-r--r--tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl129
-rw-r--r--tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl72
-rw-r--r--tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl63
-rw-r--r--tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl44
-rw-r--r--tests/hlsl/dxsdk/InstancingFX11/Instancing.fx591
-rw-r--r--tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_PS.hlsl202
-rw-r--r--tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl75
-rw-r--r--tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl103
-rw-r--r--tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl128
-rw-r--r--tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl277
-rw-r--r--tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl56
-rw-r--r--tests/hlsl/dxsdk/OIT11/SceneVS.hlsl36
-rw-r--r--tests/hlsl/dxsdk/README.md5
-rw-r--r--tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl230
-rw-r--r--tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx112
-rw-r--r--tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl86
-rw-r--r--tests/hlsl/dxsdk/SubD11/SubD11.hlsl1238
-rw-r--r--tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl211
-rw-r--r--tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl412
-rw-r--r--tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl45
-rw-r--r--tests/hlsl/simple/compute-numthreads.hlsl11
84 files changed, 15341 insertions, 0 deletions
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl
new file mode 100644
index 000000000..b98b870da
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl
@@ -0,0 +1,58 @@
+//TEST:COMPARE_HLSL: -profile vs_4_0 -entry RenderBaseVS -profile ps_4_0 -entry RenderPS -target dxbc-assembly
+//--------------------------------------------------------------------------------------
+// File: Render.hlsl
+//
+// The shaders for rendering tessellated mesh and base mesh
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+// Per-object shader constants, bound to constant-buffer register b0.
+cbuffer cbPerObject : register( b0 )
+{
+ row_major matrix g_mWorldViewProjection : packoffset( c0 ); // Matrix applied to positions via mul(position, matrix) in RenderVS and RenderBaseVS
+}
+
+// The tessellated vertex structure: one element of g_TessedVertices.
+// It references a triangle of the base mesh and locates the vertex inside that triangle.
+struct TessedVertex
+{
+ uint BaseTriID; // Which triangle of the base mesh this tessellated vertex belongs to
+ float2 bc; // Barycentric coordinates with regard to the base triangle (bary_centric weights the 2nd vertex by bc.x, the 3rd by bc.y and the 1st by 1 - bc.x - bc.y)
+};
+Buffer<float4> g_base_vb_buffer : register(t0); // Base mesh vertex buffer; RenderVS reads three consecutive entries per base triangle
+StructuredBuffer<TessedVertex> g_TessedVertices : register(t1); // Tessellated mesh vertex buffer; RenderVS indexes it by SV_VertexID
+
+// Weighted sum of three values using barycentric weights: bc.x applies to v2,
+// bc.y applies to v3, and v1 receives the remainder (1 - bc.x - bc.y).
+float4 bary_centric(float4 v1, float4 v2, float4 v3, float2 bc)
+{
+    float weightV1 = 1 - bc.x - bc.y;
+    return weightV1 * v1 + bc.x * v2 + bc.y * v3;
+}
+
+// Vertex shader for the tessellated mesh. Fetches the tessellated vertex for this
+// vertex ID, interpolates its position inside the base triangle it belongs to, and
+// transforms the result by g_mWorldViewProjection.
+float4 RenderVS( uint vertid : SV_VertexID ) : SV_POSITION
+{
+    TessedVertex tessed = g_TessedVertices[vertid];
+
+    // The base vertex buffer holds three consecutive positions per base triangle
+    float4 corner[3];
+    [unroll]
+    for (int k = 0; k < 3; ++ k)
+    {
+        corner[k] = g_base_vb_buffer[tessed.BaseTriID * 3 + k];
+    }
+
+    // Interpolate inside the base triangle, then project
+    float4 blended = bary_centric(corner[0], corner[1], corner[2], tessed.bc);
+    return mul(blended, g_mWorldViewProjection);
+}
+
+// Vertex input consumed by RenderBaseVS.
+struct BaseVertex
+{
+ float4 pos : POSITION; // Position taken from the POSITION semantic; RenderBaseVS multiplies it by g_mWorldViewProjection
+};
+
+// Vertex shader for the base mesh: transforms the incoming position by
+// g_mWorldViewProjection and outputs it as SV_POSITION.
+float4 RenderBaseVS( BaseVertex input ) : SV_POSITION
+{
+    float4 projected = mul( input.pos, g_mWorldViewProjection );
+    return projected;
+}
+
+// Pixel shader: outputs a constant color (R=1, G=1, B=0, A=1) for every pixel.
+float4 RenderPS() : SV_TARGET
+{
+    const float4 solidColor = float4( 1.0f, 1.0f, 0.0f, 1.0f );
+    return solidColor;
+} \ No newline at end of file
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl
new file mode 100644
index 000000000..46cdc1ed9
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl
@@ -0,0 +1,109 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSScanInBucket -entry CSScanBucketResult -entry CSScanAddBucketResult
+//--------------------------------------------------------------------------------------
+// File: ScanCS.hlsl
+//
+// A simple inclusive prefix sum(scan) implemented in CS4.0,
+// using a typical up sweep and down sweep scheme
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+StructuredBuffer<uint2> Input : register( t0 ); // Values to scan. Change uint2 here if scanning other types, and
+RWStructuredBuffer<uint2> Result : register( u0 ); // also here. Receives the scan output.
+
+// Number of threads per group; also the number of bucket slots, i.e. elements scanned by one group.
+#define groupthreads 128
+groupshared uint4 bucket[groupthreads]; // One slot per thread. Change uint4 to the "type x2" if scanning other types, e.g.
+ // if scanning uint2, then put uint4 here,
+ // if scanning float, then put float2 here
+
+// Scans the values supplied by the threads of one group through the groupshared `bucket`
+// array (up sweep followed by down sweep) and writes each thread's result to Result[DTid.x].
+//   DTid : dispatch thread ID; only .x is used, as the output index into Result
+//   GI   : index of this thread within its group; selects this thread's slot in bucket
+//   x    : this thread's input value
+// The body contains group-wide barriers, so it is presumably meant to be reached by every
+// thread of the group — confirm against the callers' dispatch setup.
+void CSScan( uint3 DTid, uint GI, uint2 x ) // Change the type of x here if scan other types
+{
+ // since CS40 can only support one shared memory for one shader, we use .xy and .zw as ping-ponging buffers
+ // if scan a single element type like int, search and replace all .xy to .x and .zw to .y below
+ bucket[GI].xy = x;
+ bucket[GI].zw = 0;
+
+ // Up sweep: on each pass, threads whose low bits below `stride` are all ones add in the
+ // partial sum stored stride/2 slots earlier.
+ [unroll]
+ for ( uint stride = 2; stride <= groupthreads; stride <<= 1 )
+ {
+ GroupMemoryBarrierWithGroupSync();
+
+ if ( (GI & (stride - 1)) == (stride - 1) )
+ {
+ bucket[GI].xy += bucket[GI - stride/2].xy;
+ }
+ }
+
+ // Clear the last slot before the down sweep. The same thread wrote this slot on the final
+ // up-sweep pass, so no barrier is placed in between.
+ if ( GI == (groupthreads - 1) )
+ {
+ bucket[GI].xy = 0;
+ }
+
+ // Down sweep
+ // `n` selects the ping-pong direction: true reads .xy and writes .zw, false the reverse.
+ bool n = true;
+ // Note: `stride` is not redeclared here; this loop reuses the variable declared by the up-sweep `for`.
+ [unroll]
+ for ( stride = groupthreads / 2; stride >= 1; stride >>= 1 )
+ {
+ GroupMemoryBarrierWithGroupSync();
+
+ uint a = stride - 1; // mask of the bits below `stride`
+ uint b = stride | a; // the same mask with the `stride` bit added
+
+ if ( n ) // ping-pong between passes
+ {
+ if ( ( GI & b) == b )
+ {
+ // all bits of b set: combine the value `stride` slots earlier with this slot's value
+ bucket[GI].zw = bucket[GI-stride].xy + bucket[GI].xy;
+ } else
+ if ( (GI & a) == a )
+ {
+ // only the bits of a set: take the value `stride` slots later
+ bucket[GI].zw = bucket[GI+stride].xy;
+ } else
+ {
+ // not involved in this pass: carry the value over to the other half
+ bucket[GI].zw = bucket[GI].xy;
+ }
+ } else
+ {
+ // same three cases as above with the roles of .xy and .zw swapped
+ if ( ( GI & b) == b )
+ {
+ bucket[GI].xy = bucket[GI-stride].zw + bucket[GI].zw;
+ } else
+ if ( (GI & a) == a )
+ {
+ bucket[GI].xy = bucket[GI+stride].zw;
+ } else
+ {
+ bucket[GI].xy = bucket[GI].zw;
+ }
+ }
+
+ n = !n;
+ }
+
+ // With groupthreads = 128 the down sweep runs 7 passes, so the final pass writes .zw, which is
+ // the half read here. A group size that gives an even number of passes would leave the final
+ // values in .xy instead.
+ // Adding this thread's own input to the down-sweep value presumably yields the inclusive
+ // result described in the file header — confirm.
+ Result[DTid.x] = bucket[GI].zw + x;
+}
+
+// Scan within each bucket: every thread feeds its own element Input[DTid.x] into CSScan.
+// (If scanning other types, the element type passed to CSScan changes accordingly.)
+[numthreads( groupthreads, 1, 1 )]
+void CSScanInBucket( uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI: SV_GroupIndex )
+{
+    CSScan( DTid, GI, Input[DTid.x] );
+}
+
+// Record and scan the sum of each bucket: thread DTid.x feeds CSScan with the element
+// stored just before position DTid.x*groupthreads in Input.
+// NOTE(review): for DTid.x == 0 the unsigned index wraps around, so this relies on the
+// result of an out-of-range structured-buffer read — confirm that is intended.
+[numthreads( groupthreads, 1, 1 )]
+void CSScanBucketResult( uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI: SV_GroupIndex )
+{
+    uint precedingElement = DTid.x*groupthreads - 1;
+    CSScan( DTid, GI, Input[precedingElement] );
+}
+
+StructuredBuffer<uint2> Input1 : register( t1 ); // Per-group values; CSScanAddBucketResult indexes this by SV_GroupID.x
+
+// Add the scanned bucket result to each element of the bucket to get the final result:
+// Result[DTid.x] = Input[DTid.x] + Input1[group ID].
+[numthreads( groupthreads, 1, 1 )]
+void CSScanAddBucketResult( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI: SV_GroupIndex )
+{
+    uint2 withinBucket = Input[DTid.x];
+    uint2 bucketOffset = Input1[Gid.x];
+    Result[DTid.x] = withinBucket + bucketOffset;
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl
new file mode 100644
index 000000000..91ebca777
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl
@@ -0,0 +1,217 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSEdgeFactor
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_EdgeFactorCS.hlsl
+//
+// The CS to compute edge tessellation factor according to current world, view, projection matrix
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// http://jgt.akpeters.com/papers/akeninemoller01/tribox.html
+// Tests whether the plane dot(normal, p) + d = 0 overlaps the origin-centered box with
+// half extents maxbox. Two opposite box corners are picked per axis from the sign of the
+// normal; the result is true only when the first corner gives a value <= 0 and the second
+// a value >= 0 in the plane equation.
+bool planeBoxOverlap(float3 normal, float d, float3 maxbox)
+{
+    float3 cornerLo = maxbox;
+    float3 cornerHi = maxbox;
+    [unroll]
+    for (int axis = 0; axis <= 2; ++ axis)
+    {
+        // Flip one corner's component per axis, chosen by the sign of the normal
+        if (normal[axis] > 0.0f)
+        {
+            cornerLo[axis] *= -1;
+        }
+        else
+        {
+            cornerHi[axis] *= -1;
+        }
+    }
+
+    if (dot(normal, cornerLo) + d > 0.0f)
+    {
+        return false;
+    }
+
+    return dot(normal, cornerHi) + d >= 0.0f;
+}
+
+/*======================== X-tests ========================*/
+// Axis test using the Y/Z components of vertices v0 and v2.
+//   ab  : the two edge components that define the axis
+//   fab : their absolute values, used to project the box half size
+// Returns true when the projected vertex interval overlaps (-rad, rad), i.e. this axis
+// does not separate the triangle from the box.
+bool AXISTEST_X01(float3 v0, float3 v2, float3 boxhalfsize, float2 ab, float2 fab)
+{
+    float projV0 = ab.x * v0.y - ab.y * v0.z;
+    float projV2 = ab.x * v2.y - ab.y * v2.z;
+    float boxRadius = dot(fab, boxhalfsize.yz);
+
+    bool belowUpper = min(projV0, projV2) < boxRadius;
+    bool aboveLower = max(projV0, projV2) > -boxRadius;
+    return belowUpper && aboveLower;
+}
+
+// Axis test using the Y/Z components of vertices v0 and v1.
+//   ab  : the two edge components that define the axis
+//   fab : their absolute values, used to project the box half size
+// Returns true when the projected vertex interval overlaps (-rad, rad), i.e. this axis
+// does not separate the triangle from the box.
+bool AXISTEST_X2(float3 v0, float3 v1, float3 boxhalfsize, float2 ab, float2 fab)
+{
+    float projV0 = ab.x * v0.y - ab.y * v0.z;
+    float projV1 = ab.x * v1.y - ab.y * v1.z;
+    float boxRadius = dot(fab, boxhalfsize.yz);
+
+    bool belowUpper = min(projV0, projV1) < boxRadius;
+    bool aboveLower = max(projV0, projV1) > -boxRadius;
+    return belowUpper && aboveLower;
+}
+
+/*======================== Y-tests ========================*/
+// Axis test using the X/Z components of vertices v0 and v2.
+//   ab  : the two edge components that define the axis
+//   fab : their absolute values, used to project the box half size
+// Returns true when the projected vertex interval overlaps (-rad, rad), i.e. this axis
+// does not separate the triangle from the box.
+bool AXISTEST_Y02(float3 v0, float3 v2, float3 boxhalfsize, float2 ab, float2 fab)
+{
+    float projV0 = -ab.x * v0.x + ab.y * v0.z;
+    float projV2 = -ab.x * v2.x + ab.y * v2.z;
+    float boxRadius = dot(fab, boxhalfsize.xz);
+
+    bool belowUpper = min(projV0, projV2) < boxRadius;
+    bool aboveLower = max(projV0, projV2) > -boxRadius;
+    return belowUpper && aboveLower;
+}
+
+// Axis test using the X/Z components of vertices v0 and v1.
+//   ab  : the two edge components that define the axis
+//   fab : their absolute values, used to project the box half size
+// Returns true when the projected vertex interval overlaps (-rad, rad), i.e. this axis
+// does not separate the triangle from the box.
+bool AXISTEST_Y1(float3 v0, float3 v1, float3 boxhalfsize, float2 ab, float2 fab)
+{
+    float projV0 = -ab.x * v0.x + ab.y * v0.z;
+    float projV1 = -ab.x * v1.x + ab.y * v1.z;
+    float boxRadius = dot(fab, boxhalfsize.xz);
+
+    bool belowUpper = min(projV0, projV1) < boxRadius;
+    bool aboveLower = max(projV0, projV1) > -boxRadius;
+    return belowUpper && aboveLower;
+}
+
+/*======================== Z-tests ========================*/
+// Axis test using the X/Y components of vertices v1 and v2.
+//   ab  : the two edge components that define the axis
+//   fab : their absolute values, used to project the box half size
+// Returns true when the projected vertex interval overlaps (-rad, rad), i.e. this axis
+// does not separate the triangle from the box.
+bool AXISTEST_Z12(float3 v1, float3 v2, float3 boxhalfsize, float2 ab, float2 fab)
+{
+    float projV1 = ab.x * v1.x - ab.y * v1.y;
+    float projV2 = ab.x * v2.x - ab.y * v2.y;
+    float boxRadius = dot(fab, boxhalfsize.xy);
+
+    bool belowUpper = min(projV1, projV2) < boxRadius;
+    bool aboveLower = max(projV1, projV2) > -boxRadius;
+    return belowUpper && aboveLower;
+}
+
+// Axis test using the X/Y components of vertices v0 and v1.
+//   ab  : the two edge components that define the axis
+//   fab : their absolute values, used to project the box half size
+// Returns true when the projected vertex interval overlaps (-rad, rad), i.e. this axis
+// does not separate the triangle from the box.
+bool AXISTEST_Z0(float3 v0, float3 v1, float3 boxhalfsize, float2 ab, float2 fab)
+{
+    float projV0 = ab.x * v0.x - ab.y * v0.y;
+    float projV1 = ab.x * v1.x - ab.y * v1.y;
+    float boxRadius = dot(fab, boxhalfsize.xy);
+
+    bool belowUpper = min(projV0, projV1) < boxRadius;
+    bool aboveLower = max(projV0, projV1) > -boxRadius;
+    return belowUpper && aboveLower;
+}
+
+// Triangle / axis-aligned box overlap test based on the separating axis theorem.
+//   boxcenter, boxhalfsize : center and half extents of the box
+//   triverts0..triverts2   : the triangle's vertices
+// Returns false as soon as one tested direction separates the two shapes, true otherwise.
+// Tested directions, in order: the nine edge-based axis tests (three per triangle edge),
+// the three box axes, and the triangle's plane.
+bool triBoxOverlap(float3 boxcenter,float3 boxhalfsize,float3 triverts0, float3 triverts1, float3 triverts2)
+{
+    /* work in box-local space: the box center becomes the origin */
+    float3 v0 = triverts0 - boxcenter;
+    float3 v1 = triverts1 - boxcenter;
+    float3 v2 = triverts2 - boxcenter;
+
+    /* triangle edges */
+    float3 e0 = v1 - v0;
+    float3 e1 = v2 - v1;
+    float3 e2 = v0 - v2;
+
+    /* nine edge-based axis tests: edge 0 */
+    float3 absE0 = abs(e0);
+    bool edge0Overlaps = AXISTEST_X01(v0, v2, boxhalfsize, e0.zy, absE0.zy)
+        && AXISTEST_Y02(v0, v2, boxhalfsize, e0.zx, absE0.zx)
+        && AXISTEST_Z12(v1, v2, boxhalfsize, e0.yx, absE0.yx);
+    if (!edge0Overlaps)
+    {
+        return false;
+    }
+
+    /* edge 1 */
+    float3 absE1 = abs(e1);
+    bool edge1Overlaps = AXISTEST_X01(v0, v2, boxhalfsize, e1.zy, absE1.zy)
+        && AXISTEST_Y02(v0, v2, boxhalfsize, e1.zx, absE1.zx)
+        && AXISTEST_Z0(v0, v1, boxhalfsize, e1.yx, absE1.yx);
+    if (!edge1Overlaps)
+    {
+        return false;
+    }
+
+    /* edge 2 */
+    float3 absE2 = abs(e2);
+    bool edge2Overlaps = AXISTEST_X2(v0, v1, boxhalfsize, e2.zy, absE2.zy)
+        && AXISTEST_Y1(v0, v1, boxhalfsize, e2.zx, absE2.zx)
+        && AXISTEST_Z12(v1, v2, boxhalfsize, e2.yx, absE2.yx);
+    if (!edge2Overlaps)
+    {
+        return false;
+    }
+
+    /* box axes: compare the triangle's bounding box against the box, one axis at a time */
+    float3 triMin = min(min(v0, v1), v2);
+    float3 triMax = max(max(v0, v1), v2);
+    if (triMin.x > boxhalfsize.x || triMax.x < -boxhalfsize.x)
+    {
+        return false;
+    }
+    if (triMin.y > boxhalfsize.y || triMax.y < -boxhalfsize.y)
+    {
+        return false;
+    }
+    if (triMin.z > boxhalfsize.z || triMax.z < -boxhalfsize.z)
+    {
+        return false;
+    }
+
+    /* triangle plane: dot(normal, p) + d = 0; the box must straddle or touch it */
+    float3 normal = cross(e0, e1);
+    float d = -dot(normal, v0);
+    return planeBoxOverlap(normal, d, boxhalfsize);
+}
+
+
+Buffer<float4> InputVertices : register(t0); // Input vertices; CSEdgeFactor reads three consecutive entries per triangle
+RWStructuredBuffer<float4> EdgeFactorBufOut : register(u0); // Output; CSEdgeFactor writes one float4 of factors per triangle
+
+// Constants read by CSEdgeFactor.
+cbuffer cb
+{
+ row_major matrix g_matWVP; // Matrix applied to each input vertex via mul(vertex, matrix)
+ float2 g_tess_edge_length_scale; // Per-component scale applied to the projected edge vectors before their length is taken
+ int num_triangles; // Threads with DTid.x >= num_triangles do nothing
+ float dummy; // Not referenced by CSEdgeFactor; presumably padding — TODO confirm
+}
+
+// Computes the edge factors of one triangle per thread and stores them in
+// EdgeFactorBufOut[DTid.x].
+// The triangle's three vertices are transformed by g_matWVP and divided by their w.
+// If the transformed triangle overlaps the box passed to triBoxOverlap, the result is
+// (x, y, z) = scaled lengths of the three edges and w = their minimum, each clamped to
+// [0, 9]; otherwise the result is all zeros.
+[numthreads(128, 1, 1)]
+void CSEdgeFactor( uint3 DTid : SV_DispatchThreadID )
+{
+    if (DTid.x < num_triangles)
+    {
+        uint firstVertex = DTid.x*3;
+
+        float4 p0 = mul(InputVertices[firstVertex+0], g_matWVP);
+        float4 p1 = mul(InputVertices[firstVertex+1], g_matWVP);
+        float4 p2 = mul(InputVertices[firstVertex+2], g_matWVP);
+        p0 /= p0.w;
+        p1 /= p1.w;
+        p2 /= p2.w;
+
+        // Only triangles which are completely inside or intersect with the view frustum get non-zero factors
+        float4 factor = 0;
+        if ( triBoxOverlap( float3(0, 0, 0.5), float3(1.02, 1.02, 0.52), p0.xyz, p1.xyz, p2.xyz ) )
+        {
+            float2 edge20 = (p0.xy - p2.xy) * g_tess_edge_length_scale;
+            float2 edge01 = (p1.xy - p0.xy) * g_tess_edge_length_scale;
+            float2 edge12 = (p2.xy - p1.xy) * g_tess_edge_length_scale;
+
+            float3 edgeLength = float3(length(edge20), length(edge01), length(edge12));
+            float shortest = min(min(edgeLength.x, edgeLength.y), edgeLength.z);
+            factor = clamp(float4(edgeLength, shortest), 0, 9);
+        }
+
+        EdgeFactorBufOut[DTid.x] = factor;
+    }
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl
new file mode 100644
index 000000000..4f2fb547b
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl
@@ -0,0 +1,56 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSNumVerticesIndices
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_NumVerticesIndicesCS.hlsl
+//
+// The CS to compute number of vertices and triangles to be generated from edge tessellation factor
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "TessellatorCS40_common.hlsl"
+
+StructuredBuffer<float4> InputEdgeFactor : register(t0); // Edge factors; CSNumVerticesIndices reads one float4 per DTid.x
+RWStructuredBuffer<uint2> NumVerticesIndicesOut : register(u0); // Output per DTid.x: (.x = number of points, .y = number of indices)
+
+// Constants for the compute shader, bound to constant-buffer register b1.
+cbuffer cbCS : register(b1)
+{
+ uint4 g_param; // CSNumVerticesIndices uses .x as the exclusive upper bound on DTid.x
+}
+
+// For each DTid.x below g_param.x, derives the number of points and the number of indices
+// from the edge factors stored in InputEdgeFactor[DTid.x] and writes both to
+// NumVerticesIndicesOut[DTid.x].
+// Index count: 0 when there are no points, 4 when there are exactly 3 points, otherwise
+// the stitch counts returned by NumStitchTransition and TotalNumStitchRegular, plus 4 more
+// when the .w tess-factor parity is odd.
+[numthreads(128, 1, 1)]
+void CSNumVerticesIndices( uint3 DTid : SV_DispatchThreadID )
+{
+    if (DTid.x < g_param.x)
+    {
+        float4 edge_factor = InputEdgeFactor[DTid.x];
+
+        PROCESSED_TESS_FACTORS_TRI processedTessFactors;
+        int num_points = TriProcessTessFactors(edge_factor, processedTessFactors, g_partitioning);
+
+        int num_index = 0;
+        if (num_points == 3)
+        {
+            num_index = 4;
+        }
+        else if (num_points != 0)
+        {
+            // +1 so that an even point count still includes the center point
+            int ringCount = ((processedTessFactors.numPointsForOutsideInside.w + 1) / 2);
+
+            int4 halfTessFactor = int4(ceil(processedTessFactors.outsideInsideHalfTessFactor));
+            uint3 transitionCounts = NumStitchTransition(halfTessFactor, processedTessFactors.outsideInsideTessFactorParity);
+            num_index = transitionCounts.x + transitionCounts.y + transitionCounts.z;
+            num_index += TotalNumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, ringCount - 1) * 3;
+            if( processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD )
+            {
+                num_index += 4;
+            }
+        }
+
+        NumVerticesIndicesOut[DTid.x] = uint2(num_points, num_index);
+    }
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl
new file mode 100644
index 000000000..17f003794
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl
@@ -0,0 +1,45 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSScatterVertexTriIDIndexID -entry CSScatterIndexTriIDIndexID
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_ScatterIDCS.hlsl
+//
+// The CS to scatter vertex ID and triangle ID
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+StructuredBuffer<uint2> InputScanned : register(t0); // Scanned counts; entries DTid.x-1 and DTid.x bound the output range of each thread (.x for the vertex pass, .y for the index pass)
+RWStructuredBuffer<uint2> TriIDIndexIDOut : register(u0); // Output: (thread's DTid.x, offset within that thread's range)
+
+// Constants for the compute shaders, bound to constant-buffer register b1.
+cbuffer cbCS : register(b1)
+{
+ uint4 g_param; // Both scatter shaders use .x as the exclusive upper bound on DTid.x
+}
+
+// For each DTid.x below g_param.x, fills TriIDIndexIDOut over the range
+// [InputScanned[DTid.x-1].x, InputScanned[DTid.x].x) with (DTid.x, offset within the range).
+// NOTE(review): for DTid.x == 0 the index DTid.x-1 wraps around, so the start of the first
+// range relies on the result of an out-of-range read — confirm that is intended.
+[numthreads(128, 1, 1)]
+void CSScatterVertexTriIDIndexID( uint3 DTid : SV_DispatchThreadID )
+{
+    uint triID = DTid.x;
+    if (triID < g_param.x)
+    {
+        uint rangeBegin = InputScanned[triID-1].x;
+        uint rangeEnd = InputScanned[triID].x;
+
+        for ( uint slot = rangeBegin; slot < rangeEnd; ++slot )
+        {
+            TriIDIndexIDOut[slot] = uint2(triID, slot - rangeBegin);
+        }
+    }
+}
+
+// For each DTid.x below g_param.x, fills TriIDIndexIDOut over the range
+// [InputScanned[DTid.x-1].y, InputScanned[DTid.x].y) with (DTid.x, offset within the range).
+// NOTE(review): for DTid.x == 0 the index DTid.x-1 wraps around, so the start of the first
+// range relies on the result of an out-of-range read — confirm that is intended.
+[numthreads(128, 1, 1)]
+void CSScatterIndexTriIDIndexID( uint3 DTid : SV_DispatchThreadID )
+{
+    uint triID = DTid.x;
+    if (triID < g_param.x)
+    {
+        uint rangeBegin = InputScanned[triID-1].y;
+        uint rangeEnd = InputScanned[triID].y;
+
+        for ( uint slot = rangeBegin; slot < rangeEnd; ++slot )
+        {
+            TriIDIndexIDOut[slot] = uint2(triID, slot - rangeBegin);
+        }
+    }
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl
new file mode 100644
index 000000000..756f99e58
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl
@@ -0,0 +1,628 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSTessellationIndices
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_TessellateIndicesCS.hlsl
+//
+// The CS to tessellate indices
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "TessellatorCS40_common.hlsl"
+
+// Shader inputs (registers t0-t2). The element types match the outputs declared by the other
+// TessellatorCS40 shaders; presumably these are the results of those passes — TODO confirm.
+StructuredBuffer<uint2> InputTriIDIndexID : register(t0);
+StructuredBuffer<float4> InputEdgeFactor : register(t1);
+StructuredBuffer<uint2> InputScanned : register(t2);
+
+// Raw byte-addressed output buffer (register u0).
+RWByteAddressBuffer TessedIndicesOut : register(u0);
+
+// Constants for the compute shader, bound to constant-buffer register b1.
+cbuffer cbCS : register(b1)
+{
+ uint4 g_param;
+}
+
+
+// Returns index shifted by vertices_base.
+int TransformIndex1(int index, int vertices_base)
+{
+    int shifted = vertices_base + index;
+    return shifted;
+}
+
+// Remaps index using the patch context, then shifts it by vertices_base.
+// Indices at or above outsidePointIndexPatchBase use the "outside" fields of the context,
+// all others use the "inside" fields. Within each group, the designated bad value is
+// swapped for its replacement value; any other index has the group's delta added.
+// (Assumes remapped outside indices are greater than remapped inside indices.)
+int TransformIndex2(int index, int vertices_base, INDEX_PATCH_CONTEXT IndexPatchContext)
+{
+    bool isOutsidePoint = ( index >= IndexPatchContext.outsidePointIndexPatchBase );
+
+    if( isOutsidePoint && ( index == IndexPatchContext.outsidePointIndexBadValue ) )
+    {
+        index = IndexPatchContext.outsidePointIndexReplacementValue;
+    }
+    else if( isOutsidePoint )
+    {
+        index += IndexPatchContext.outsidePointIndexDeltaToRealValue;
+    }
+    else if( index == IndexPatchContext.insidePointIndexBadValue )
+    {
+        index = IndexPatchContext.insidePointIndexReplacementValue;
+    }
+    else
+    {
+        index += IndexPatchContext.insidePointIndexDeltaToRealValue;
+    }
+
+    return vertices_base + index;
+}
+
+
+// Returns entry number i of the index sequence generated for one stitched edge region.
+//   bTrapezoid : when true, entries 0..3 form a leading group and the outside base offset is
+//                advanced by one before the remaining entries are produced
+//   diagonals  : one of the DIAGONALS_* constants handled by the switch below
+//   numInsideEdgePoints : point count used to size the sections of the sequence (per its name,
+//                the number of points on the inside edge)
+//   outsideInsideEdgePointBaseOffset : .x is the base added to outside-edge points, .y the base
+//                added to inside-edge points (per the name and its use below)
+//   i : position within the generated sequence
+// Returns a point index, or -1.
+// NOTE(review): -1 looks like a separator between primitives — confirm against the consumer.
+// NOTE(review): pt has no initializer. If bTrapezoid is false and i < 0, or if diagonals matches
+// none of the cases, the returned value is uninitialized.
+// NOTE(review): the trailing `else` branches compute r from i with an offset that includes the
+// four leading entries, so they appear to assume bTrapezoid is true — confirm.
+int AStitchRegular(bool bTrapezoid, int diagonals,
+ uint numInsideEdgePoints,
+ int2 outsideInsideEdgePointBaseOffset,
+ int i)
+{
+ if (bTrapezoid)
+ {
+ ++ outsideInsideEdgePointBaseOffset.x;
+ }
+
+ int pt;
+
+ // Leading group (trapezoid only): two outside points, one inside point, then -1
+ if ((i < 4) && bTrapezoid)
+ {
+ if (i < 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x - 1 + i;
+ }
+ else if (i == 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y;
+ }
+ else
+ {
+ pt = -1;
+ }
+ }
+
+ // index = position relative to the end of the leading group (negative while inside it)
+ int index = i;
+ if (bTrapezoid)
+ {
+ index -= 4;
+ }
+
+ if (index >= 0)
+ {
+ uint uindex = (uint)index;
+
+ switch( diagonals )
+ {
+ case DIAGONALS_INSIDE_TO_OUTSIDE:
+ // Groups of 5 entries (p = group number, r = position within the group), then a trailing group
+ if (uindex < 5 * numInsideEdgePoints - 5)
+ {
+ uint p = uindex / 5;
+ uint r = uindex - p * 5;
+ if (r < 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + p + r;
+ }
+ else if (r < 4)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + p + r;
+ }
+ else
+ {
+ pt = -1;
+ }
+ }
+ else
+ {
+ // Trailing group: r is counted from i, i.e. it includes the 4 leading entries
+ int r = i - (4 + 5 * numInsideEdgePoints - 5);
+ if (r < 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints - 1 + r;
+ }
+ else if (r == 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints - 1;
+ }
+ else
+ {
+ pt = -1;
+ }
+ }
+ break;
+
+ case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation
+ // Layout: first half (groups of 5), middle (8 entries), second half (groups of 5), trailing group
+ if (uindex < (numInsideEdgePoints / 2 - 1) * 5)
+ {
+ // First half
+ uint p = uindex / 5;
+ uint r = uindex - p * 5;
+ if (r < 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + p + r;
+ }
+ else if (r < 4)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + p;
+ }
+ else
+ {
+ pt = -1;
+ }
+ }
+ else if (uindex < (numInsideEdgePoints / 2 - 1) * 5 + 8)
+ {
+ // Middle
+ uint r = uindex - (numInsideEdgePoints / 2 - 1) * 5;
+ if (0 == r)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints / 2 - 1;
+ }
+ else if (r < 3)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints / 2 - 1 + (2 - r);
+ }
+ else if (r == 3)
+ {
+ pt = -1;
+ }
+ else if (r < 6)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints / 2 - 1 + (r - 4);
+ }
+ else if (r == 6)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints / 2 - 1 + 1;
+ }
+ else if (r == 7)
+ {
+ pt = -1;
+ }
+ }
+ //else if (uindex < (numInsideEdgePoints/2-1) * 5 + 8 + (numInsideEdgePoints - numInsideEdgePoints/2 - 1) * 5)
+ else if (uindex < numInsideEdgePoints * 5 - 2)
+ {
+ // Second half
+ // NOTE(review): the two expressions below evaluate `uindex - (...) * 5 + 8`, which ADDS 8.
+ // The DIAGONALS_MIRRORED case parenthesizes its section offset so that the whole offset is
+ // subtracted. Confirm whether `uindex - ((...) * 5 + 8)` was intended here.
+ uint p = (uindex - (numInsideEdgePoints / 2 - 1) * 5 + 8) / 5 + numInsideEdgePoints / 2 + 1;
+ uint r = uindex - (numInsideEdgePoints / 2 - 1) * 5 + 8 - (p - (numInsideEdgePoints / 2 + 1)) * 5;
+ if (r < 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + p - 1 + r;
+ }
+ else if (r < 4)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + p - 1 + r;
+ }
+ else
+ {
+ pt = -1;
+ }
+ }
+ else
+ {
+ // Trailing group: r is counted from i, i.e. it includes the 4 leading entries
+ //int r = i - (4 + (numInsideEdgePoints/2-1) * 5 + 8 + (numInsideEdgePoints - numInsideEdgePoints/2 - 1) * 5);
+ int r = i - (numInsideEdgePoints * 5 + 2);
+ if (r < 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints - 1 + r;
+ }
+ else if (r == 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints - 1;
+ }
+ else
+ {
+ pt = -1;
+ }
+ }
+ break;
+
+ case DIAGONALS_MIRRORED:
+ // Layout: first run of inside/outside pairs, -1, one outside point, second run of
+ // outside/inside pairs, -1, trailing group
+ if (uindex < (numInsideEdgePoints / 2 + 1) * 2)
+ {
+ uint p = uindex / 2;
+ uint r = uindex - p * 2;
+ if (0 == r)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + p;
+ }
+ else
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + p;
+ }
+ }
+ else if (uindex == (numInsideEdgePoints / 2 + 1) * 2)
+ {
+ pt = -1;
+ }
+ else if (uindex == (numInsideEdgePoints / 2 + 1) * 2 + 1)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints / 2;
+ }
+ //else if (uindex < (numInsideEdgePoints / 2 + 1) * 2 + 2 + (numInsideEdgePoints - numInsideEdgePoints / 2) * 2)
+ else if (uindex < numInsideEdgePoints * 2 + 4)
+ {
+ uint p = (uindex - ((numInsideEdgePoints / 2 + 1) * 2 + 2)) / 2 + numInsideEdgePoints / 2;
+ uint r = uindex - ((numInsideEdgePoints / 2 + 1) * 2 + 2) - (p - numInsideEdgePoints / 2) * 2;
+ if (0 == r)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + p;
+ }
+ else
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + p;
+ }
+ }
+ //else if (uindex == (numInsideEdgePoints / 2 + 1) * 2 + 2 + (numInsideEdgePoints - numInsideEdgePoints / 2) * 2)
+ else if (uindex == numInsideEdgePoints * 2 + 4)
+ {
+ pt = -1;
+ }
+ else
+ {
+ // Trailing group: r is counted from i, i.e. it includes the 4 leading entries
+ //int r = i - (4 + (numInsideEdgePoints / 2 + 1) * 2 + 2 + (numInsideEdgePoints - numInsideEdgePoints / 2) * 2 + 1);
+ uint r = i - (numInsideEdgePoints * 2 + 9);
+ if (r < 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints - 1 + r;
+ }
+ else if (r == 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints - 1;
+ }
+ else
+ {
+ pt = -1;
+ }
+ }
+ break;
+ }
+ }
+
+ return pt;
+}
+
+int AStitchTransition(int2 outsideInsideEdgePointBaseOffset, int2 outsideInsideNumHalfTessFactorPoints,
+ int2 outsideInsideEdgeTessFactorParity,
+ uint i)
+{
+ outsideInsideNumHalfTessFactorPoints -= (TESSELLATOR_PARITY_ODD == outsideInsideEdgeTessFactorParity);
+
+ uint2 out_in_first_half = uint2(outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][MAX_FACTOR / 2 + 1].y, insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][MAX_FACTOR / 2 + 1].y) * 4;
+
+ uint3 out_in_middle = 0;
+ if ((outsideInsideEdgeTessFactorParity.y != outsideInsideEdgeTessFactorParity.x) || (outsideInsideEdgeTessFactorParity.y == TESSELLATOR_PARITY_ODD))
+ {
+ if (outsideInsideEdgeTessFactorParity.y == outsideInsideEdgeTessFactorParity.x)
+ {
+ // Quad in the middle
+ out_in_middle.z = 5;
+ out_in_middle.xy = 1;
+ }
+ else if (TESSELLATOR_PARITY_EVEN == outsideInsideEdgeTessFactorParity.y)
+ {
+ // Triangle pointing inside
+ out_in_middle.z = 4;
+ out_in_middle.x = 1;
+ }
+ else
+ {
+ // Triangle pointing outside
+ out_in_middle.z = 4;
+ out_in_middle.y = 1;
+ }
+ }
+
+
+ int pt = -1;
+
+ if (i < out_in_first_half.y)
+ {
+ // Advance inside
+
+ uint p = i / 4;
+ uint r = i - p * 4;
+ p = insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].z;
+ if ((0 == r) || (2 == r))
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].y + r / 2;
+ }
+ else if (1 == r)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].y;
+ }
+ }
+ else
+ {
+ i -= out_in_first_half.y;
+
+ if (i < out_in_first_half.x)
+ {
+ // Advance outside
+
+ uint p = i / 4;
+ uint r = i - p * 4;
+ p = outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].z;
+ if (r < 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].y + r;
+ }
+ else if (r == 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].y;
+ if (insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].x)
+ {
+ ++ pt;
+ }
+ }
+ }
+ else
+ {
+ i -= out_in_first_half.x;
+
+ if (i < out_in_middle.z)
+ {
+ uint r = i;
+ if (outsideInsideEdgeTessFactorParity.y == outsideInsideEdgeTessFactorParity.x)
+ {
+ // Quad in the middle
+ if ((0 == r) || (2 == r))
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + (2 == r);//r / 2;
+ }
+ else if ((1 == r) || (3 == r))
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + (3 == r);//(r - 1) / 2;
+ }
+ }
+ else if (TESSELLATOR_PARITY_EVEN == outsideInsideEdgeTessFactorParity.y)
+ {
+ // Triangle pointing inside
+ if (r == 0)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4;
+ }
+ else if (r < 3)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + r - 1;
+ }
+ }
+ else
+ {
+ // Triangle pointing outside
+ if ((0 == r) || (2 == r))
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + (2 == r);//r / 2;
+ }
+ else if (1 == r)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4;
+ }
+ }
+ }
+ else
+ {
+ i -= out_in_middle.z;
+
+ if (i < out_in_first_half.x)
+ {
+ // Advance outside
+
+ uint p = i / 4;
+ uint r = i - p * 4;
+ p = outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].z;
+ if (r < 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + out_in_middle.x + (outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][MAX_FACTOR / 2 + 1].y - outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p + 1].y) + r;
+ }
+ else if (r == 2)
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + out_in_middle.y + (insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][MAX_FACTOR / 2 + 1].y - insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p + 1].y);
+ }
+ }
+ else
+ {
+ // Advance inside
+
+ i -= out_in_first_half.x;
+
+ uint p = i / 4;
+ uint r = i - p * 4;
+ p = insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].w;
+ if ((0 == r) || (2 == r))
+ {
+ pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + out_in_middle.y
+ + (insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][MAX_FACTOR / 2 + 1].y - insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p + 1].y) + (2 == r);//r / 2;
+ }
+ else if (1 == r)
+ {
+ pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + out_in_middle.x
+ + (outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][MAX_FACTOR / 2 + 1].y - outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p + 1].y);
+ if (outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].x)
+ {
+ ++ pt;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return pt;
+}
+
+[numthreads(128, 1, 1)]
+void CSTessellationIndices( uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex ) // entry point: each thread resolves one tessellated index and stores it in TessedIndicesOut
+{
+ uint id = DTid.x; // global thread index; also the slot read from InputTriIDIndexID and written to TessedIndicesOut
+ //uint id = Gid.x * 128 + GI; // Workaround for some CS4x preview drivers
+
+ if ( id < g_param.x ) // threads at or beyond g_param.x do nothing
+ {
+ uint tri_id = InputTriIDIndexID[id].x; // triangle whose edge factors are looked up below
+ uint index_id = InputTriIDIndexID[id].y; // position of this index inside that triangle's index stream
+ uint base_vertex = InputScanned[tri_id-1].x; // NOTE(review): tri_id == 0 wraps the uint index; presumably relies on the out-of-range read yielding 0 - confirm
+
+ float4 outside_inside_factor = InputEdgeFactor[tri_id];
+
+ PROCESSED_TESS_FACTORS_TRI processedTessFactors;
+ int num_points = TriProcessTessFactors(outside_inside_factor, processedTessFactors, g_partitioning);
+
+ uint tessed_indices;
+ if (3 == num_points) // TriProcessTessFactors returns 3 for its "all TessFactors are 1" special case
+ {
+ if (index_id < 3)
+ {
+ tessed_indices = TransformIndex1(index_id, base_vertex);
+ }
+ else
+ {
+ tessed_indices = -1; // -1 marks a slot that carries no index
+ }
+ }
+ else
+ {
+ // Generate primitives for all the concentric rings, one side at a time for each ring
+ static const int startRing = 1;
+ int numRings = ((processedTessFactors.numPointsForOutsideInside.w + 1) / 2); // +1 is so even tess includes the center point, which we want to now
+
+ int4 outsideInsideHalfTessFactor = int4(ceil(processedTessFactors.outsideInsideHalfTessFactor));
+ uint3 num = NumStitchTransition(outsideInsideHalfTessFactor, processedTessFactors.outsideInsideTessFactorParity); // per-edge slot counts for the three transition strips
+ num.y += num.x; // turn the counts into running end offsets: num.x, num.y, num.z
+ num.z += num.y;
+ uint num_index = num.z;
+ num_index += TotalNumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, numRings - 1) * 3; // regular rings: per-edge total times three edges
+ if( processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD )
+ {
+ num_index += 4; // four trailing slots, consumed by the odd-parity branch further down
+ }
+
+ int pt;
+
+ if (index_id < num.x) // slot lies in the first transition strip (outside factor .x)
+ {
+ int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * startRing; // NOTE(review): not read in this branch
+
+ pt = AStitchTransition(int2(0, processedTessFactors.insideEdgePointBaseOffset),
+ outsideInsideHalfTessFactor.xw,
+ processedTessFactors.outsideInsideTessFactorParity.xw,
+ index_id);
+ if (pt != -1) // -1 from the stitcher is passed through untouched
+ {
+ pt = TransformIndex1(pt, base_vertex);
+ }
+ }
+ else if (index_id < num.y) // second transition strip (outside factor .y)
+ {
+ int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * startRing;
+
+ pt = AStitchTransition(
+ int2(processedTessFactors.numPointsForOutsideInside.x - 1, processedTessFactors.insideEdgePointBaseOffset + numPointsForInsideEdge - 1),
+ outsideInsideHalfTessFactor.yw,
+ processedTessFactors.outsideInsideTessFactorParity.yw,
+ index_id - num.x); // rebase the slot to the start of this strip
+ if (pt != -1)
+ {
+ pt = TransformIndex1(pt, base_vertex);
+ }
+ }
+ else if (index_id < num.z) // third transition strip (outside factor .z); indices go through TransformIndex2 with a patch context
+ {
+ int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * startRing;
+
+ INDEX_PATCH_CONTEXT IndexPatchContext;
+ IndexPatchContext.insidePointIndexDeltaToRealValue = processedTessFactors.insideEdgePointBaseOffset + 2 * (numPointsForInsideEdge - 1);
+ IndexPatchContext.insidePointIndexBadValue = numPointsForInsideEdge - 1;
+ IndexPatchContext.insidePointIndexReplacementValue = processedTessFactors.insideEdgePointBaseOffset;
+ IndexPatchContext.outsidePointIndexPatchBase = IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range
+ IndexPatchContext.outsidePointIndexDeltaToRealValue = processedTessFactors.numPointsForOutsideInside.x + processedTessFactors.numPointsForOutsideInside.y - 2
+ - IndexPatchContext.outsidePointIndexPatchBase;
+ IndexPatchContext.outsidePointIndexBadValue = IndexPatchContext.outsidePointIndexPatchBase
+ + processedTessFactors.numPointsForOutsideInside.z - 1;
+ IndexPatchContext.outsidePointIndexReplacementValue = 0;
+
+ pt = AStitchTransition(int2(numPointsForInsideEdge, 0),
+ outsideInsideHalfTessFactor.zw,
+ processedTessFactors.outsideInsideTessFactorParity.zw,
+ index_id - num.y);
+ if (pt != -1)
+ {
+ pt = TransformIndex2(pt, base_vertex, IndexPatchContext);
+ }
+ }
+ else // slot lies past the three transition strips
+ {
+ if ((processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD) && (index_id >= num_index - 4)) // one of the four trailing odd-parity slots
+ {
+ int outsideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset
+ + ((processedTessFactors.numPointsForOutsideInside.w + 1) - (numRings + startRing)) * (numRings - startRing - 1) * 3;
+
+ if (index_id - (num_index - 4) != 3) // first three trailing slots hold consecutive point indices
+ {
+ pt = TransformIndex1(outsideEdgePointBaseOffset + index_id - (num_index - 4), base_vertex);
+ }
+ else
+ {
+ pt = -1; // fourth trailing slot carries no index
+ }
+ }
+ else
+ {
+ int ring = GetRingFromIndexStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, index_id - num.z); // ring that owns this slot
+
+ int tn = TotalNumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, ring - 1) * 3; // slots used by all earlier regular rings
+ int n = NumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w - 2 * ring); // slots per edge of this ring
+
+ int edge = (index_id - num.z - tn) / n; // which of the ring's edges (0..2)
+ int index = (index_id - num.z - tn) - edge * n; // slot within that edge
+
+ int2 outsideInsideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset
+ + int2(0, 3 * (processedTessFactors.numPointsForOutsideInside.w - 3))
+ + ((processedTessFactors.numPointsForOutsideInside.w - (ring + startRing)) + int2(1, -1)) * (ring - startRing - 1) * 3;
+
+ int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * ring;
+ int numLastPointsForInsideEdge = numPointsForInsideEdge + 2; // the enclosing ring's edge has two more points
+
+ if (edge < 2)
+ {
+ pt = AStitchRegular(true, DIAGONALS_MIRRORED,
+ numPointsForInsideEdge,
+ outsideInsideEdgePointBaseOffset + (int2(numLastPointsForInsideEdge, numPointsForInsideEdge) - 1) * edge,
+ index);
+ if (pt != -1)
+ {
+ pt = TransformIndex1(pt, base_vertex);
+ }
+ }
+ else // third edge of the ring: same patch-context scheme as the third transition strip
+ {
+ INDEX_PATCH_CONTEXT IndexPatchContext;
+ IndexPatchContext.insidePointIndexDeltaToRealValue = outsideInsideEdgePointBaseOffset.y + (numPointsForInsideEdge - 1) * 2;
+ IndexPatchContext.insidePointIndexBadValue = numPointsForInsideEdge - 1;
+ IndexPatchContext.insidePointIndexReplacementValue = outsideInsideEdgePointBaseOffset.y;
+ IndexPatchContext.outsidePointIndexPatchBase = IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range
+ IndexPatchContext.outsidePointIndexDeltaToRealValue = outsideInsideEdgePointBaseOffset.x + (numLastPointsForInsideEdge - 1) * 2
+ - IndexPatchContext.outsidePointIndexPatchBase;
+ IndexPatchContext.outsidePointIndexBadValue = IndexPatchContext.outsidePointIndexPatchBase
+ + numLastPointsForInsideEdge - 1;
+ IndexPatchContext.outsidePointIndexReplacementValue = outsideInsideEdgePointBaseOffset.x;
+
+ pt = AStitchRegular(true, DIAGONALS_MIRRORED,
+ numPointsForInsideEdge,
+ int2(numPointsForInsideEdge, 0),
+ index);
+ if (pt != -1)
+ {
+ pt = TransformIndex2(pt, base_vertex, IndexPatchContext);
+ }
+ }
+ }
+ }
+
+ tessed_indices = pt;
+ }
+
+ TessedIndicesOut.Store(id*4, tessed_indices); // byte offset id*4: one 32-bit value per thread
+ }
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl
new file mode 100644
index 000000000..55bf1be87
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl
@@ -0,0 +1,206 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSTessellationVertices
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_TessellateVerticesCS.hlsl
+//
+// The CS to tessellate vertices
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "TessellatorCS40_common.hlsl"
+
+StructuredBuffer<uint2> InputTriIDIndexID : register(t0); // per thread: .x is read as the triangle id, .y as the vertex id within that triangle
+StructuredBuffer<float4> InputEdgeFactor : register(t1); // per triangle: tess factors handed to TriProcessTessFactors
+
+struct TessedVertex // record written once per thread by CSTessellationVertices
+{
+ uint BaseTriID; // triangle id the vertex was generated for
+ float2 bc; // (u, v) location computed for the vertex - presumably barycentric with the third coordinate implicit; confirm against the consumer
+};
+RWStructuredBuffer<TessedVertex> TessedVerticesOut : register(u0); // output, indexed by the dispatch thread id
+
+cbuffer cbCS : register(b1)
+{
+ uint4 g_param; // only .x is read in this file: threads with id >= g_param.x do nothing
+}
+
+// Computes the 1D parametric location of point `pt` along an edge.
+//   processedTessFactors : metadata filled in by TriProcessTessFactors
+//   ctx_index            : component of the per-factor metadata to use (callers pass 0..2 for outside edges, 3 for inside)
+//   pt                   : point index along the edge; indices at or past the half-way count are mirrored
+//   location             : receives the result; exactly 0.5 for the middle point
+//   parity               : TESSELLATOR_PARITY_* of the selected factor
+void PlacePointIn1D(PROCESSED_TESS_FACTORS_TRI processedTessFactors, int ctx_index, int pt, out float location, int parity)
+{
+ float halfTessFactor = processedTessFactors.outsideInsideHalfTessFactor[ctx_index];
+ int halfPointCount = int(ceil(halfTessFactor));
+
+ // Points in the second half are placed by reflecting them onto the first half.
+ bool mirrored = (pt >= halfPointCount);
+ if( mirrored )
+ {
+ pt = (halfPointCount << 1) - pt;
+ pt -= (TESSELLATOR_PARITY_ODD == parity) ? 1 : 0;
+ }
+
+ if( pt != halfPointCount )
+ {
+ // Locate the point on both the floor and the ceil subdivision, then blend the
+ // two by the fractional part of the half tess factor.
+ unsigned int ceilIndex = pt;
+ unsigned int floorIndex = ( pt > processedTessFactors.outsideInsideSplitPointOnFloorHalfTessFactor[ctx_index] ) ? ceilIndex - 1 : ceilIndex;
+
+ float floorLocation = floorIndex * processedTessFactors.outsideInsideInvNumSegmentsOnFloorTessFactor[ctx_index];
+ float ceilLocation = ceilIndex * processedTessFactors.outsideInsideInvNumSegmentsOnCeilTessFactor[ctx_index];
+ float blended = lerp(floorLocation, ceilLocation, frac(halfTessFactor));
+
+ location = mirrored ? 1.0f - blended : blended;
+ }
+ else
+ {
+ // The middle point always sits at the centre of the edge.
+ location = 0.5f;
+ }
+}
+
+// Compute-shader entry point: each thread places one tessellated vertex.
+// Reads (triangle id, vertex id) from InputTriIDIndexID[id], derives the vertex's
+// (u, v) location from the triangle's tess factors and writes a TessedVertex to
+// TessedVerticesOut[id]. Threads with id >= g_param.x do nothing.
+[numthreads(128, 1, 1)]
+void CSTessellationVertices( uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex )
+{
+ uint id = DTid.x;
+ //uint id = Gid.x * 128 + GI; // Workaround for some CS4x preview drivers
+
+ if ( id < g_param.x )
+ {
+ uint tri_id = InputTriIDIndexID[id].x;
+ uint vert_id = InputTriIDIndexID[id].y;
+
+ float4 outside_inside_factor = InputEdgeFactor[tri_id];
+
+ PROCESSED_TESS_FACTORS_TRI processedTessFactors;
+ int num_points = TriProcessTessFactors(outside_inside_factor, processedTessFactors, g_partitioning);
+
+ // Initialised because the interior branch below assigns nothing when the
+ // inside parity is odd and the index falls past the last ring edge; without
+ // this an undefined value would be stored to the output buffer.
+ float2 uv = float2(0, 0);
+ if (3 == num_points)
+ {
+ // All tess factors are 1: the patch is just its three corners.
+ if (0 == vert_id)
+ {
+ uv = float2(0, 1);
+ }
+ else if (1 == vert_id)
+ {
+ uv = float2(0, 0);
+ }
+ else
+ {
+ uv = float2(1, 0);
+ }
+ }
+ else
+ {
+ if (vert_id < processedTessFactors.insideEdgePointBaseOffset)
+ {
+ // Generate exterior ring edge points, clockwise starting from point V (VW, the U==0 edge)
+
+ // Walk the three outside edges, rebasing vert_id to the edge it falls on.
+ // Each edge owns numPoints - 1 points because its last point is the next edge's first.
+ int edge;
+ if (vert_id < processedTessFactors.numPointsForOutsideInside.x - 1)
+ {
+ edge = 0;
+ }
+ else
+ {
+ vert_id -= processedTessFactors.numPointsForOutsideInside.x - 1;
+ if (vert_id < processedTessFactors.numPointsForOutsideInside.y - 1)
+ {
+ edge = 1;
+ }
+ else
+ {
+ vert_id -= processedTessFactors.numPointsForOutsideInside.y - 1;
+ edge = 2;
+ }
+ }
+
+ int p = vert_id;
+ int endPoint = processedTessFactors.numPointsForOutsideInside[edge] - 1;
+ float param;
+ int q = (edge & 0x1) ? p : endPoint - p; // whether to reverse point order given we are defining V or U (W implicit):
+ // edge0, VW, has V decreasing, so reverse 1D points below
+ // edge1, WU, has U increasing, so don't reverse 1D points below
+ // edge2, UV, has U decreasing, so reverse 1D points below
+ PlacePointIn1D(processedTessFactors, edge,q,param, processedTessFactors.outsideInsideTessFactorParity[edge]);
+ if (0 == edge)
+ {
+ uv = float2(0, param);
+ }
+ else if (1 == edge)
+ {
+ uv = float2(param, 0);
+ }
+ else
+ {
+ uv = float2(param, 1 - param);
+ }
+ }
+ else
+ {
+ // Generate interior ring points, clockwise spiralling in
+
+ // Recover the ring number from the flat interior index by solving the
+ // quadratic that counts the points of all preceding rings; the 0.001f
+ // bias guards the truncation against floating-point error.
+ uint index = vert_id - processedTessFactors.insideEdgePointBaseOffset;
+ uint ring = 1 + (((3 * processedTessFactors.numPointsForOutsideInside.w - 6) - sqrt(sqr(3 * processedTessFactors.numPointsForOutsideInside.w - 6) - 4 * 3 * index)) + 0.001f) / 6;
+ index -= 3 * (processedTessFactors.numPointsForOutsideInside.w - ring - 1) * (ring - 1);
+
+ uint startPoint = ring;
+ uint endPoint = processedTessFactors.numPointsForOutsideInside.w - 1 - startPoint;
+ if (index < 3 * (endPoint - startPoint))
+ {
+ uint edge = index / (endPoint - startPoint);
+ uint p = index - edge * (endPoint - startPoint) + startPoint;
+
+ int perpendicularAxisPoint = startPoint;
+ float perpParam;
+ PlacePointIn1D(processedTessFactors, 3, perpendicularAxisPoint, perpParam, processedTessFactors.outsideInsideTessFactorParity.w);
+ perpParam = perpParam * 2 / 3;
+
+ float param;
+ int q = (edge & 0x1) ? p : endPoint - (p - startPoint); // whether to reverse point given we are defining V or U (W implicit):
+ // edge0, VW, has V decreasing, so reverse 1D points below
+ // edge1, WU, has U increasing, so don't reverse 1D points below
+ // edge2, UV, has U decreasing, so reverse 1D points below
+ PlacePointIn1D(processedTessFactors, 3, q,param, processedTessFactors.outsideInsideTessFactorParity.w);
+ // edge0 VW, has perpendicular parameter U constant
+ // edge1 WU, has perpendicular parameter V constant
+ // edge2 UV, has perpendicular parameter W constant
+ const unsigned int deriv = 2; // reciprocal is the rate of change of edge-parallel parameters as they are pushed into the triangle
+ if (0 == edge)
+ {
+ uv = float2(perpParam, param - perpParam / deriv);
+ }
+ else if (1 == edge)
+ {
+ uv = float2(param - perpParam / deriv, perpParam);
+ }
+ else
+ {
+ uv = float2(param - perpParam / deriv, 1 - (param - perpParam / deriv + perpParam));
+ }
+ }
+ else
+ {
+ if( processedTessFactors.outsideInsideTessFactorParity.w != TESSELLATOR_PARITY_ODD )
+ {
+ // Last point is the point at the center.
+ uv = 1 / 3.0f;
+ }
+ // Odd inside parity: uv keeps its initial value of (0, 0).
+ }
+ }
+ }
+
+ TessedVerticesOut[id].BaseTriID = tri_id;
+ TessedVerticesOut[id].bc = uv;
+ }
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl
new file mode 100644
index 000000000..309044cdb
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl
@@ -0,0 +1,411 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_common.hlsl
+//
+// The common utils included by other shaders in the sample
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "TessellatorCS40_defines.h"
+
+cbuffer cbNeverChanges : register(b0)
+{
+ uint4 insidePointIndex[MAX_FACTOR / 2 + 1][MAX_FACTOR / 2 + 2]; // lookup table; first index is a half-tess-factor point count, and column MAX_FACTOR / 2 + 1 is read as a per-row total (.y)
+ uint4 outsidePointIndex[MAX_FACTOR / 2 + 1][MAX_FACTOR / 2 + 2]; // same layout, used for the outside edge; contents are supplied by the host - TODO confirm the meaning of .x/.z/.w
+}
+
+#define D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR ( 64 ) // upper clamp for integer, pow2 and fractional-even partitioning
+#define D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR ( 63 ) // upper clamp for fractional-odd partitioning
+#define D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR ( 2 ) // lower clamp for fractional-even partitioning
+#define D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR ( 1 ) // lower clamp for integer, pow2 and fractional-odd partitioning
+
+#define D3D11_TESSELLATOR_PARTITIONING_INTEGER ( 0 ) // values compared against the `partitioning` argument of TriProcessTessFactors
+#define D3D11_TESSELLATOR_PARTITIONING_POW2 ( 1 )
+#define D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD ( 2 )
+#define D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN ( 3 )
+
+#define TESSELLATOR_PARITY_EVEN ( 0 ) // values stored in outsideInsideTessFactorParity
+#define TESSELLATOR_PARITY_ODD ( 1 )
+
+#define EPSILON 1e-6f
+#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON (D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON/2) // threshold above which an edge factor forces the inside factor's lower bound to 1 + EPSILON
+
+#define DIAGONALS_INSIDE_TO_OUTSIDE ( 0 ) // values accepted by the `diagonals` parameter of the *StitchRegular helpers
+#define DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE ( 1 )
+#define DIAGONALS_MIRRORED ( 2 )
+
+
+// This is moved to macro defines at shader compile time, so that the partitioning mode can be changed during runtime
+//#define g_partitioning (D3D11_TESSELLATOR_PARTITIONING_POW2)
+
+
+struct PROCESSED_TESS_FACTORS_TRI // output of TriProcessTessFactors; in every 4-vector .xyz are the three outside edges and .w is the inside
+{
+ float4 outsideInsideTessFactor; // tess factors after clamping and, for integer/pow2 partitioning, rounding
+ int4 outsideInsideTessFactorParity; // TESSELLATOR_PARITY_* per factor
+
+ float4 outsideInsideInvNumSegmentsOnFloorTessFactor; // 1 / segment count derived from floor(half tess factor)
+ float4 outsideInsideInvNumSegmentsOnCeilTessFactor; // 1 / segment count derived from ceil(half tess factor)
+ float4 outsideInsideHalfTessFactor; // tessFactor / 2, plus 0.5 for odd parity or when the half factor is exactly 0.5
+ int4 outsideInsideSplitPointOnFloorHalfTessFactor; // point index above which PlacePointIn1D shifts the floor index down by one
+
+ // Stuff below is specific to the traversal order
+ uint4 numPointsForOutsideInside; // point count per factor; .w is raised to at least 3 (even) or 4 (odd)
+ uint insideEdgePointBaseOffset; // number of outside-ring points, i.e. index of the first interior point
+};
+
+struct INDEX_PATCH_CONTEXT // parameters handed to TransformIndex2 for the third edge of a ring; exact remapping rule lives in TransformIndex2 (not visible here) - TODO confirm field semantics there
+{
+ int insidePointIndexDeltaToRealValue; // presumably added to a local inside index to obtain the real point index
+ int insidePointIndexBadValue; // callers set this to numPointsForInsideEdge - 1
+ int insidePointIndexReplacementValue; // presumably substituted when the local inside index equals the bad value
+ int outsidePointIndexPatchBase; // callers set this to insidePointIndexBadValue + 1 (past the inside patched index range)
+ int outsidePointIndexDeltaToRealValue; // outside counterpart of the inside delta
+ int outsidePointIndexBadValue; // outside counterpart of the inside bad value
+ int outsidePointIndexReplacementValue; // outside counterpart of the inside replacement value
+};
+
+// Per-component test: true where the value, truncated to uint, has its lowest bit clear.
+bool4 isEven(float4 input)
+{
+ uint4 lowBit = ((uint4)input) & 1;
+ return lowBit == 0;
+}
+
+// Returns val with its most significant set bit cleared (0 when val is 0).
+uint RemoveMSB(uint val)
+{
+ // Start probing at the top bit of the byte that contains val's highest set bit,
+ // so at most 8 iterations are needed.
+ // The mask is unsigned so the shift below is a logical shift; a signed mask of
+ // 0x80000000 would smear the sign bit into the mask on every iteration.
+ uint check;
+ if( val <= 0x0000ffff )
+ {
+ check = ( val <= 0x000000ff ) ? 0x00000080u : 0x00008000u;
+ }
+ else
+ {
+ check = ( val <= 0x00ffffff ) ? 0x00800000u : 0x80000000u;
+ }
+ for (int i = 0; i < 8; i++, check >>= 1)
+ {
+ if( val & check )
+ {
+ return (val & ~check);
+ }
+ }
+ // Only reached when val has no bit set.
+ return 0;
+}
+
+// Per-component point count for a tess factor: an even count for odd parity,
+// an odd count (one more than twice the rounded-up half factor) for even parity.
+uint4 NumPointsForTessFactor(float4 tessFactor, int4 parity)
+{
+ float4 halfFactor = tessFactor / 2;
+ uint4 oddParityCount = uint4(ceil(0.5f + halfFactor)) * 2;
+ uint4 evenParityCount = uint4(ceil(halfFactor)) * 2 + 1;
+ return (TESSELLATOR_PARITY_ODD == parity) ? oddParityCount : evenParityCount;
+}
+
+void ComputeTessFactorContext(float4 tessFactor, int4 parity, // derives the per-factor placement metadata consumed by PlacePointIn1D; all four components are processed together
+ out float4 invNumSegmentsOnFloorTessFactor, // 1 / segment count for floor(halfTessFactor)
+ out float4 invNumSegmentsOnCeilTessFactor, // 1 / segment count for ceil(halfTessFactor)
+ out float4 halfTessFactor, // tessFactor / 2 with the odd-parity adjustment below
+ out int4 splitPointOnFloorHalfTessFactor) // per-component split index, see the loop
+{
+ halfTessFactor = tessFactor / 2;
+
+ halfTessFactor += 0.5 * ((TESSELLATOR_PARITY_ODD == parity) | (0.5f == halfTessFactor)); // the bool4 acts as 0/1: add 0.5 where parity is odd or the half factor is exactly 0.5
+
+ float4 floorHalfTessFactor = floor(halfTessFactor);
+ float4 ceilHalfTessFactor = ceil(halfTessFactor);
+ int4 numHalfTessFactorPoints = int4(ceilHalfTessFactor);
+
+ for (int index = 0; index < 4; ++ index) // one pass per factor: three outside edges plus the inside
+ {
+ if( ceilHalfTessFactor[index] == floorHalfTessFactor[index] ) // half factor is integral: floor and ceil subdivisions coincide
+ {
+ splitPointOnFloorHalfTessFactor[index] = /*pick value to cause this to be ignored*/ numHalfTessFactorPoints[index]+1;
+ }
+ else if( TESSELLATOR_PARITY_ODD == parity[index] )
+ {
+ if( floorHalfTessFactor[index] == 1 )
+ {
+ splitPointOnFloorHalfTessFactor[index] = 0;
+ }
+ else
+ {
+ splitPointOnFloorHalfTessFactor[index] = (RemoveMSB(int(floorHalfTessFactor[index]) - 1) << 1) + 1; // odd parity uses floor - 1 as the RemoveMSB input
+ }
+ }
+ else
+ {
+ splitPointOnFloorHalfTessFactor[index] = (RemoveMSB(int(floorHalfTessFactor[index])) << 1) + 1;
+ }
+ }
+
+ int4 numFloorSegments = int4(floorHalfTessFactor * 2);
+ int4 numCeilSegments = int4(ceilHalfTessFactor * 2);
+ int4 s = (TESSELLATOR_PARITY_ODD == parity); // 1 where parity is odd, 0 otherwise
+ numFloorSegments -= s; // odd parity has one segment fewer
+ numCeilSegments -= s;
+ invNumSegmentsOnFloorTessFactor = 1.0f / numFloorSegments;
+ invNumSegmentsOnCeilTessFactor = 1.0f / numCeilSegments;
+}
+
+int TriProcessTessFactors( inout float4 tessFactor, // in: raw factors (.xyz outside edges, .w inside); out: clamped/rounded factors
+ out PROCESSED_TESS_FACTORS_TRI processedTessFactors, // zeroed first, then filled unless the patch is culled or hits the all-ones case
+ int partitioning ) // one of D3D11_TESSELLATOR_PARTITIONING_*; returns the point count (0 = culled, 3 = all-ones case)
+{
+ processedTessFactors = (PROCESSED_TESS_FACTORS_TRI)0;
+
+ int parity = TESSELLATOR_PARITY_EVEN;
+ switch( partitioning )
+ {
+ case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
+ default: // POW2 also lands here; its per-factor parity is derived later from the rounded factors
+ break;
+ case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
+ parity = TESSELLATOR_PARITY_ODD;
+ break;
+ case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
+ parity = TESSELLATOR_PARITY_EVEN;
+ break;
+ }
+
+ // Is the patch culled?
+ if( !(tessFactor.x > 0) || // NaN will pass
+ !(tessFactor.y > 0) ||
+ !(tessFactor.z > 0) )
+ {
+ return 0; // processedTessFactors stays all-zero on this path
+ }
+
+ // Clamp edge TessFactors
+ float lowerBound, upperBound;
+ switch(partitioning)
+ {
+ case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
+ case D3D11_TESSELLATOR_PARTITIONING_POW2: // don't care about pow2 distinction for validation, just treat as integer
+ default:
+ lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
+ upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
+ break;
+
+ case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
+ lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR;
+ upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
+ break;
+
+ case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
+ lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
+ upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR;
+ break;
+ }
+
+ tessFactor.xyz = min( upperBound, max( lowerBound, tessFactor.xyz ) );
+
+ // Clamp inside TessFactors
+ if(D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD == partitioning)
+ {
+ if( (tessFactor.x > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
+ (tessFactor.y > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
+ (tessFactor.z > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON))
+ // Don't need the same check for insideTessFactor for tri patches,
+ // since there is only one insideTessFactor, as opposed to quad
+ // patches which have 2 insideTessFactors.
+ {
+ // Force picture frame
+ lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON; // only affects the inside-factor clamp that follows
+ }
+ }
+
+ tessFactor.w = min( upperBound, max( lowerBound, tessFactor.w ) );
+ // Note the above clamps map NaN to lowerBound
+
+ if (partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)
+ {
+ tessFactor = ceil(tessFactor);
+ }
+ else if (partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)
+ {
+ static const int exponentMask = 0x7f800000;
+ static const int mantissaMask = 0x007fffff;
+ static const int exponentLSB = 0x00800000;
+
+ int4 bits = asint(tessFactor);
+ tessFactor = bits & mantissaMask ? asfloat((bits & exponentMask) + exponentLSB) : tessFactor; // non-zero mantissa: drop it and bump the exponent, i.e. round up to the next power of two
+ }
+
+ // Process tessFactors
+ if ((partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)|| (partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2))
+ {
+ bool4 e = isEven(tessFactor);
+ processedTessFactors.outsideInsideTessFactorParity.xyz = e.xyz ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
+ processedTessFactors.outsideInsideTessFactorParity.w = (e.w || (1 == tessFactor.w)) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; // an inside factor of exactly 1 is treated as even
+ }
+ else
+ {
+ processedTessFactors.outsideInsideTessFactorParity = parity; // fractional modes: same parity for all four factors
+ }
+
+ processedTessFactors.outsideInsideTessFactor = tessFactor;
+
+ if (((partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)|| (partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)) || (parity == TESSELLATOR_PARITY_ODD))
+ {
+ // Special case if all TessFactors are 1
+ if( (1 == processedTessFactors.outsideInsideTessFactor.x) &&
+ (1 == processedTessFactors.outsideInsideTessFactor.y) &&
+ (1 == processedTessFactors.outsideInsideTessFactor.z) &&
+ (1 == processedTessFactors.outsideInsideTessFactor.w) )
+ {
+ return 3; // callers check for exactly 3; the metadata below is left zeroed on this path
+ }
+ }
+
+ // Compute per-TessFactor metadata
+ ComputeTessFactorContext(processedTessFactors.outsideInsideTessFactor, processedTessFactors.outsideInsideTessFactorParity,
+ processedTessFactors.outsideInsideInvNumSegmentsOnFloorTessFactor,
+ processedTessFactors.outsideInsideInvNumSegmentsOnCeilTessFactor,
+ processedTessFactors.outsideInsideHalfTessFactor,
+ processedTessFactors.outsideInsideSplitPointOnFloorHalfTessFactor);
+
+ // Compute some initial data.
+
+ // outside edge offsets and storage
+ processedTessFactors.numPointsForOutsideInside = NumPointsForTessFactor(processedTessFactors.outsideInsideTessFactor, processedTessFactors.outsideInsideTessFactorParity);
+ int NumPoints = processedTessFactors.numPointsForOutsideInside.x + processedTessFactors.numPointsForOutsideInside.y + processedTessFactors.numPointsForOutsideInside.z - 3; // -3: each corner is shared by two edges
+
+ // inside edge offsets
+ {
+ uint pointCountMin = (processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD) ? 4 : 3;
+ // max() allows degenerate transition regions when inside TessFactor == 1
+ processedTessFactors.numPointsForOutsideInside.w = max(pointCountMin, processedTessFactors.numPointsForOutsideInside.w);
+ }
+
+ processedTessFactors.insideEdgePointBaseOffset = NumPoints; // interior points are numbered after all outside-ring points
+
+ // inside storage, including interior edges above
+ {
+ int numInteriorRings = (processedTessFactors.numPointsForOutsideInside.w >> 1) - 1;
+ int numInteriorPoints;
+ if( processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD )
+ {
+ numInteriorPoints = 3*(numInteriorRings*(numInteriorRings+1) - numInteriorRings);
+ }
+ else
+ {
+ numInteriorPoints = 3*(numInteriorRings*(numInteriorRings+1)) + 1; // +1 for the center point present with even parity
+ }
+ NumPoints += numInteriorPoints;
+ }
+
+ return NumPoints;
+}
+
+// Number of index slots produced when stitching one edge of a regular ring.
+//   bTrapezoid          : adds a fixed 8 slots when set
+//   diagonals           : one of DIAGONALS_*; any other value contributes nothing
+//   numInsideEdgePoints : point count on the ring's inside edge
+int NumStitchRegular(bool bTrapezoid, int diagonals, int numInsideEdgePoints)
+{
+ int num_index = bTrapezoid ? 8 : 0;
+
+ if( DIAGONALS_INSIDE_TO_OUTSIDE == diagonals )
+ {
+ // Diagonals pointing from inside edge forward towards outside edge
+ num_index += 5 * numInsideEdgePoints - 5;
+ }
+ else if( DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE == diagonals )
+ {
+ // Assumes ODD tessellation.
+ // Diagonals pointing from outside edge forward towards inside edge
+ num_index += 5 * numInsideEdgePoints - 2;
+ }
+ else if( DIAGONALS_MIRRORED == diagonals )
+ {
+ num_index += 2 * numInsideEdgePoints + 5;
+ }
+
+ return num_index;
+}
+
+// Closed-form running total of per-edge index slots for the regular rings that
+// precede `ring`; every term is scaled by (ring - 1).
+//   bTrapezoid                   : adds 8 slots per counted ring when set
+//   diagonals                    : one of DIAGONALS_*; any other value contributes nothing
+//   numPointsForInsideTessFactor : point count of the inside tess factor
+//   ring                         : ring number the total runs up to
+uint TotalNumStitchRegular(bool bTrapezoid, int diagonals, int numPointsForInsideTessFactor, int ring)
+{
+ int ringsCounted = ring - 1;
+ uint num_index = 0;
+
+ if( bTrapezoid )
+ {
+ num_index += 8 * ringsCounted;
+ }
+
+ if( DIAGONALS_INSIDE_TO_OUTSIDE == diagonals )
+ {
+ // Diagonals pointing from inside edge forward towards outside edge
+ // NOTE(review): summing NumStitchRegular's 5 * n - 5 over the rings suggests 15 rather than 35 here;
+ // the callers visible in this sample only pass DIAGONALS_MIRRORED - confirm before relying on this path.
+ num_index += (5 * numPointsForInsideTessFactor - 35 - 5 * ring) * ringsCounted;
+ }
+ else if( DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE == diagonals )
+ {
+ // Assumes ODD tessellation.
+ // Diagonals pointing from outside edge forward towards inside edge
+ num_index += (5 * numPointsForInsideTessFactor - 12 - 5 * ring) * ringsCounted;
+ }
+ else if( DIAGONALS_MIRRORED == diagonals )
+ {
+ num_index += (2 * numPointsForInsideTessFactor + 1 - 2 * ring) * ringsCounted;
+ }
+
+ return num_index;
+}
+
+// Integer square: returns x multiplied by itself.
+int sqr(int x)
+{
+ int squared = x;
+ squared *= x;
+ return squared;
+}
+
+// Inverse of TotalNumStitchRegular: given a flat slot index into the regular-ring
+// index stream, returns the ring that owns it by solving the corresponding
+// quadratic. The 0.001f bias guards the truncation against floating-point error.
+// Returns -1 for an unrecognised `diagonals` value.
+int GetRingFromIndexStitchRegular(bool bTrapezoid, int diagonals, int numPointsForInsideTessFactor, int index)
+{
+ int trapezoidSlots = bTrapezoid ? 8 : 0;
+
+ if( DIAGONALS_MIRRORED == diagonals )
+ {
+ int t = ((trapezoidSlots + 1) + 2 * numPointsForInsideTessFactor) * 3;
+ return 1 + uint((t + 6) - sqrt(sqr(t + 6) - 4 * 6 * (t + index)) + 0.001f) / 12;
+ }
+
+ if( (DIAGONALS_INSIDE_TO_OUTSIDE == diagonals) || (DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE == diagonals) )
+ {
+ // The two modes share the same quadratic and differ only in this constant.
+ int modeConstant = (DIAGONALS_INSIDE_TO_OUTSIDE == diagonals) ? 35 : 12;
+ int t = (5 * numPointsForInsideTessFactor - (modeConstant - trapezoidSlots)) * 3;
+ return 1 + uint((t + 15) - sqrt(sqr(t + 15) - 4 * 15 * (t + index)) + 0.001f) / 30;
+ }
+
+ return -1;
+}
+
+// Index-slot count of the transition strip for each of the three outside edges.
+//   outsideInsideNumHalfTessFactorPoints : half-tess-factor point counts (.xyz outside, .w inside)
+//   outsideInsideEdgeTessFactorParity    : TESSELLATOR_PARITY_* per factor (.xyz outside, .w inside)
+// Returns one count per outside edge.
+uint3 NumStitchTransition(int4 outsideInsideNumHalfTessFactorPoints,
+ int4 outsideInsideEdgeTessFactorParity)
+{
+ // Odd-parity factors use the table row one below their point count.
+ outsideInsideNumHalfTessFactorPoints -= (TESSELLATOR_PARITY_ODD == outsideInsideEdgeTessFactorParity);
+
+ int insideParity = outsideInsideEdgeTessFactorParity.w;
+ uint insideSlots = insidePointIndex[outsideInsideNumHalfTessFactorPoints.w][MAX_FACTOR / 2 + 1].y * 8;
+ uint3 num_index = insideSlots;
+
+ [unroll]
+ for (int edge = 0; edge < 3; ++ edge)
+ {
+ uint edgeSlots = outsidePointIndex[outsideInsideNumHalfTessFactorPoints[edge]][MAX_FACTOR / 2 + 1].y * 8;
+
+ if( insideParity != outsideInsideEdgeTessFactorParity[edge] )
+ {
+ // Parities differ: 4 extra slots for the middle.
+ edgeSlots += 4;
+ }
+ else if( insideParity == TESSELLATOR_PARITY_ODD )
+ {
+ // Both odd: 5 extra slots for the middle.
+ edgeSlots += 5;
+ }
+
+ num_index[edge] += edgeSlots;
+ }
+
+ return num_index;
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h
new file mode 100644
index 000000000..6b4382393
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h
@@ -0,0 +1,9 @@
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_defines.h
+//
+// This file defines common constants which are included by both CPU code and shader code
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#define MAX_FACTOR 16
diff --git a/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl
new file mode 100644
index 000000000..1e40c80ef
--- /dev/null
+++ b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl
@@ -0,0 +1,2567 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: BC6HEncode.hlsl
+//
+// The Compute Shader for BC6H Encoder
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//#define REF_DEVICE // uncomment to compile in the group barriers between phases and compile out the early returns guarded by #ifndef REF_DEVICE
+
+#define UINTLENGTH 32
+#define NCHANNELS 3
+#define SIGNED_F16 96 // g_format value for DXGI_FORMAT_BC6H_SF16
+#define UNSIGNED_F16 95 // g_format value for DXGI_FORMAT_BC6H_UF16
+#define MAX_FLOAT asfloat(0x7F7FFFFF) // bit pattern of the largest finite positive float
+#define MIN_FLOAT asfloat(0xFF7FFFFF) // bit pattern of the most negative finite float
+#define MAX_INT asint(0x7FFFFFFF) // largest 32-bit signed integer
+#define MIN_INT asint(0x80000000) // smallest 32-bit signed integer
+
+cbuffer cbCS : register( b0 ) // Per-dispatch parameters shared by the three entry points
+{
+ uint g_tex_width;
+ uint g_num_block_x; // number of 4x4 blocks per row; used to turn a block ID into block x/y
+ uint g_format; //either SIGNED_F16 for DXGI_FORMAT_BC6H_SF16 or UNSIGNED_F16 for DXGI_FORMAT_BC6H_UF16
+ uint g_mode_id; // 0-based slot into the candidateMode* tables; read by TryModeLE10CS
+ uint g_start_block_id; // block ID of the first block handled by this dispatch
+ uint g_num_total_blocks; // threads whose block ID is >= this value exit early
+};
+
+static const uint candidateModeMemory[14] = { 0x00, 0x01, 0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F }; // NOTE(review): presumably the mode bits written to the block header - confirm in block_package
+static const uint candidateModeFlag[14] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; // 1-based mode number stored as best_mode for each table slot
+static const bool candidateModeTransformed[14] = { true, true, true, true, true, true, true, true, true, false, false, true, true, true }; // true when the slot stores the non-base endpoints as differences from the first endpoint
+static const uint4 candidateModePrec[14] = { uint4(10,5,5,5), uint4(7,6,6,6), // x is the precision passed to quantize(); yzw are the per-channel bit widths checked in finish_quantize_*
+ uint4(11,5,4,4), uint4(11,4,5,4), uint4(11,4,4,5), uint4(9,5,5,5),
+ uint4(8,6,5,5), uint4(8,5,6,5), uint4(8,5,5,6), uint4(6,6,6,6),
+ uint4(10,10,10,10), uint4(11,9,9,9), uint4(12,8,8,8), uint4(16,4,4,4) }; // slots 10..13 are the ones tried by TryModeG10CS
+
+/*static const uint4x4 candidateSection[32] =
+{
+ {0,0,1,1, 0,0,1,1, 0,0,1,1, 0,0,1,1}, {0,0,0,1, 0,0,0,1, 0,0,0,1, 0,0,0,1}, {0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1}, {0,0,0,1, 0,0,1,1, 0,0,1,1, 0,1,1,1},
+ {0,0,0,0, 0,0,0,1, 0,0,0,1, 0,0,1,1}, {0,0,1,1, 0,1,1,1, 0,1,1,1, 1,1,1,1}, {0,0,0,1, 0,0,1,1, 0,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,1, 0,0,1,1, 0,1,1,1},
+ {0,0,0,0, 0,0,0,0, 0,0,0,1, 0,0,1,1}, {0,0,1,1, 0,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,1, 0,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 0,0,0,1, 0,1,1,1},
+ {0,0,0,1, 0,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 1,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 0,0,0,0, 1,1,1,1},
+ {0,0,0,0, 1,0,0,0, 1,1,1,0, 1,1,1,1}, {0,1,1,1, 0,0,0,1, 0,0,0,0, 0,0,0,0}, {0,0,0,0, 0,0,0,0, 1,0,0,0, 1,1,1,0}, {0,1,1,1, 0,0,1,1, 0,0,0,1, 0,0,0,0},
+ {0,0,1,1, 0,0,0,1, 0,0,0,0, 0,0,0,0}, {0,0,0,0, 1,0,0,0, 1,1,0,0, 1,1,1,0}, {0,0,0,0, 0,0,0,0, 1,0,0,0, 1,1,0,0}, {0,1,1,1, 0,0,1,1, 0,0,1,1, 0,0,0,1},
+ {0,0,1,1, 0,0,0,1, 0,0,0,1, 0,0,0,0}, {0,0,0,0, 1,0,0,0, 1,0,0,0, 1,1,0,0}, {0,1,1,0, 0,1,1,0, 0,1,1,0, 0,1,1,0}, {0,0,1,1, 0,1,1,0, 0,1,1,0, 1,1,0,0},
+ {0,0,0,1, 0,1,1,1, 1,1,1,0, 1,0,0,0}, {0,0,0,0, 1,1,1,1, 1,1,1,1, 0,0,0,0}, {0,1,1,1, 0,0,0,1, 1,0,0,0, 1,1,1,0}, {0,0,1,1, 1,0,0,1, 1,0,0,1, 1,1,0,0}
+};*/
+
+static const uint candidateSectionBit[32] = // One 16-bit mask per partition: bit i is the subset (0 or 1) of pixel i in the 4x4 block
+{
+ 0xCCCC, 0x8888, 0xEEEE, 0xECC8,
+ 0xC880, 0xFEEC, 0xFEC8, 0xEC80,
+ 0xC800, 0xFFEC, 0xFE80, 0xE800,
+ 0xFFE8, 0xFF00, 0xFFF0, 0xF000,
+ 0xF710, 0x008E, 0x7100, 0x08CE,
+ 0x008C, 0x7310, 0x3100, 0x8CCE,
+ 0x088C, 0x3110, 0x6666, 0x366C,
+ 0x17E8, 0x0FF0, 0x718E, 0x399C
+};
+
+static const uint candidateFixUpIndex1D[32] = // Per partition: the pixel used as the reference (fix-up) pixel of subset 1; subset 0 always uses pixel 0
+{
+ 15,15,15,15,
+ 15,15,15,15,
+ 15,15,15,15,
+ 15,15,15,15,
+ 15, 2, 8, 2,
+ 2, 8, 8,15,
+ 2, 8, 2, 2,
+ 8, 8, 2, 2
+};
+
+//Maps a position scaled to 0..63 along the endpoint span to an index 0..7. Presumably the nearest of the weights 0, 9, 18, 27, 37, 46, 55, 64 - TODO confirm
+static const uint aStep1[64] = {0,0,0,0,0,1,1,1,
+ 1,1,1,1,1,1,2,2,
+ 2,2,2,2,2,2,2,3,
+ 3,3,3,3,3,3,3,3,
+ 3,4,4,4,4,4,4,4,
+ 4,4,5,5,5,5,5,5,
+ 5,5,5,6,6,6,6,6,
+ 6,6,6,6,7,7,7,7};
+
+//Maps a position scaled to 0..63 along the endpoint span to an index 0..15. Presumably the nearest of the weights 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 - TODO confirm
+static const uint aStep2[64] = { 0, 0, 0, 1, 1, 1, 1, 2,
+ 2, 2, 2, 2, 3, 3, 3, 3,
+ 4, 4, 4, 4, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 7, 7, 7,
+ 7, 8, 8, 8, 8, 9, 9, 9,
+ 9,10,10,10,10,10,11,11,
+ 11,11,12,12,12,12,13,13,
+ 13,13,14,14,14,14,15,15};
+
+static const float3 RGB2LUM = float3(0.2126f, 0.7152f, 0.0722f); // weights dotted with RGB to get the luminance used for picking endpoints
+
+#define THREAD_GROUP_SIZE 64 // threads per group for all three entry points
+#define BLOCK_SIZE_Y 4 // block height in pixels
+#define BLOCK_SIZE_X 4 // block width in pixels
+#define BLOCK_SIZE (BLOCK_SIZE_Y * BLOCK_SIZE_X)
+
+
+//Forward declarations of the helpers called by the entry points below
+uint3 float2half( float3 pixel_f ); // float RGB -> half bit patterns, one per channel
+int3 start_quantize( uint3 pixel_h ); // half bit patterns -> integer values that quantize() consumes
+void quantize( inout int2x3 endPoint, uint prec ); // reduces both endpoints to prec bits
+void finish_quantize_0( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ); // first endpoint pair of a two-subset mode
+void finish_quantize_1( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ); // second endpoint pair of a two-subset mode
+void finish_quantize( out bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ); // single-subset modes. NOTE(review): EncodeBlockCS passes an int for bBadQuantize
+
+void start_unquantize( inout int2x3 endPoint[2], uint4 prec, bool transformed );
+void start_unquantize( inout int2x3 endPoint, uint4 prec, bool transformed );
+void unquantize( inout int2x3 color, uint prec );
+uint3 finish_unquantize( int3 color );
+void generate_palette_unquantized8( out uint3 palette, int3 low, int3 high, int i ); // palette entry i for the 8-level index tables (used with aStep1)
+void generate_palette_unquantized16( out uint3 palette, int3 low, int3 high, int i ); // palette entry i for the 16-level index tables (used with aStep2)
+float3 half2float(uint3 color_h );
+
+void block_package( inout uint4 block, int2x3 endPoint[2], uint mode_type, uint partition_index ); // two-subset overload
+void block_package( inout uint4 block, int2x3 endPoint, uint mode_type ); // single-subset overload
+
+void swap(inout int3 lhs, inout int3 rhs) // Exchanges the contents of two int3 values in place.
+{
+    const int3 held = rhs;
+    rhs = lhs;
+    lhs = held;
+}
+
+Texture2D<float4> g_Input : register( t0 ); // source image; one texel is loaded per pixel of each 4x4 block
+StructuredBuffer<uint4> g_InBuff : register( t1 ); // per-block input: x = error as float bits, y = best mode, z = best partition
+
+RWStructuredBuffer<uint4> g_OutBuff : register( u0 ); // per-block output: (error bits, mode, partition, 0) from the TryMode* shaders, or the packed block from EncodeBlockCS
+
+struct SharedData // Per-thread scratch slot in group-shared memory
+{
+ float3 pixel; // RGB as loaded from g_Input
+ int3 pixel_ph; // result of start_quantize() on the half bit patterns
+ float3 pixel_hr; // pixel after a float -> half -> float round trip; EncodeBlockCS later reuses .xy to hold index bits
+ float pixel_lum; // luminance (dot with RGB2LUM)
+ float error; // accumulated squared error of a candidate
+ uint best_mode; // mode flag that goes with error
+ uint best_partition; // partition index that goes with error (two-subset modes)
+ int3 endPoint_low; // endpoint carried by the lowest-luminance pixel seen so far
+ int3 endPoint_high; // endpoint carried by the highest-luminance pixel seen so far
+ float endPoint_lum_low; // luminance belonging to endPoint_low
+ float endPoint_lum_high; // luminance belonging to endPoint_high
+};
+
+groupshared SharedData shared_temp[THREAD_GROUP_SIZE]; // one slot per thread in the group, indexed by SV_GroupIndex
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void TryModeG10CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // For each 4x4 block, tries table slots 10..13 (mode flags 11..14, single endpoint pair) and writes the lowest error and its mode flag to g_OutBuff
+{
+ const uint MAX_USED_THREAD = 16; // threads cooperating on one block
+ uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // blocks handled per thread group
+ uint blockInGroup = GI / MAX_USED_THREAD;
+ uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
+ uint threadBase = blockInGroup * MAX_USED_THREAD; // shared_temp slot of this block's thread 0
+ uint threadInBlock = GI - threadBase; // 0..15
+
+#ifndef REF_DEVICE
+ if (blockID >= g_num_total_blocks)
+ {
+ return;
+ }
+#endif
+
+ uint block_y = blockID / g_num_block_x;
+ uint block_x = blockID - block_y * g_num_block_x;
+ uint base_x = block_x * BLOCK_SIZE_X; // top-left pixel of the block
+ uint base_y = block_y * BLOCK_SIZE_Y;
+
+ if (threadInBlock < 16) // each thread loads one pixel and seeds both endpoints with it
+ {
+ shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb;
+ uint3 pixel_h = float2half( shared_temp[GI].pixel );
+ shared_temp[GI].pixel_hr = half2float(pixel_h);
+ shared_temp[GI].pixel_lum = dot(shared_temp[GI].pixel_hr, RGB2LUM);
+ shared_temp[GI].pixel_ph = start_quantize( pixel_h );
+
+ shared_temp[GI].endPoint_low = shared_temp[GI].pixel_ph;
+ shared_temp[GI].endPoint_high = shared_temp[GI].pixel_ph;
+ shared_temp[GI].endPoint_lum_low = shared_temp[GI].pixel_lum;
+ shared_temp[GI].endPoint_lum_high = shared_temp[GI].pixel_lum;
+ }
+#ifdef REF_DEVICE // NOTE(review): barriers exist only in REF_DEVICE builds; otherwise the phases presumably rely on the cooperating threads running in lockstep - confirm
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 8) // reduction step 16 -> 8: keep the lower-luminance low endpoint and the higher-luminance high endpoint
+ {
+ if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 8].endPoint_lum_low)
+ {
+ shared_temp[GI].endPoint_low = shared_temp[GI + 8].endPoint_low;
+ shared_temp[GI].endPoint_lum_low = shared_temp[GI + 8].endPoint_lum_low;
+ }
+ if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 8].endPoint_lum_high)
+ {
+ shared_temp[GI].endPoint_high = shared_temp[GI + 8].endPoint_high;
+ shared_temp[GI].endPoint_lum_high = shared_temp[GI + 8].endPoint_lum_high;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 4) // reduction step 8 -> 4
+ {
+ if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 4].endPoint_lum_low)
+ {
+ shared_temp[GI].endPoint_low = shared_temp[GI + 4].endPoint_low;
+ shared_temp[GI].endPoint_lum_low = shared_temp[GI + 4].endPoint_lum_low;
+ }
+ if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 4].endPoint_lum_high)
+ {
+ shared_temp[GI].endPoint_high = shared_temp[GI + 4].endPoint_high;
+ shared_temp[GI].endPoint_lum_high = shared_temp[GI + 4].endPoint_lum_high;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 2) // reduction step 4 -> 2
+ {
+ if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 2].endPoint_lum_low)
+ {
+ shared_temp[GI].endPoint_low = shared_temp[GI + 2].endPoint_low;
+ shared_temp[GI].endPoint_lum_low = shared_temp[GI + 2].endPoint_lum_low;
+ }
+ if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 2].endPoint_lum_high)
+ {
+ shared_temp[GI].endPoint_high = shared_temp[GI + 2].endPoint_high;
+ shared_temp[GI].endPoint_lum_high = shared_temp[GI + 2].endPoint_lum_high;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 1) // reduction step 2 -> 1: slot threadBase now holds the block's min/max-luminance endpoints
+ {
+ if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 1].endPoint_lum_low)
+ {
+ shared_temp[GI].endPoint_low = shared_temp[GI + 1].endPoint_low;
+ shared_temp[GI].endPoint_lum_low = shared_temp[GI + 1].endPoint_lum_low;
+ }
+ if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 1].endPoint_lum_high)
+ {
+ shared_temp[GI].endPoint_high = shared_temp[GI + 1].endPoint_high;
+ shared_temp[GI].endPoint_lum_high = shared_temp[GI + 1].endPoint_lum_high;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ // Try mode types 11..14
+ if ( threadInBlock == 0 ) // orient the endpoints relative to pixel 0
+ {
+ int2x3 endPoint;
+ // find_axis
+ endPoint[0] = shared_temp[threadBase + 0].endPoint_low;
+ endPoint[1] = shared_temp[threadBase + 0].endPoint_high;
+
+ //compute_index
+ float3 span = endPoint[1] - endPoint[0];// fixed a bug in v0.2
+ float span_norm_sqr = dot( span, span );// fixed a bug in v0.2
+ float dotProduct = dot( span, shared_temp[threadBase + 0].pixel_ph - endPoint[0] );// fixed a bug in v0.2
+ if ( span_norm_sqr > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr ) > 32 ) // pixel 0 projects past step 32 of 0..63: swap, presumably so its index keeps a zero top bit - TODO confirm
+ {
+ swap(endPoint[0], endPoint[1]);
+
+ shared_temp[GI].endPoint_low = endPoint[0]; // written back only when swapped; the slot already holds the unswapped pair
+ shared_temp[GI].endPoint_high = endPoint[1];
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 4) // threads 0..3 each evaluate one table slot: 10 + threadInBlock
+ {
+ int2x3 endPoint;
+ endPoint[0] = shared_temp[threadBase + 0].endPoint_low;
+ endPoint[1] = shared_temp[threadBase + 0].endPoint_high;
+
+ float3 span = endPoint[1] - endPoint[0];
+ float span_norm_sqr = dot( span, span );
+
+ uint4 prec = candidateModePrec[threadInBlock + 10];
+ int2x3 endPoint_q = endPoint;
+ quantize( endPoint_q, prec.x );
+
+ bool transformed = candidateModeTransformed[threadInBlock + 10];
+ if (transformed)
+ {
+ endPoint_q[1] -= endPoint_q[0]; // store the second endpoint as a difference from the first
+ }
+
+ bool bBadQuantize;
+ finish_quantize( bBadQuantize, endPoint_q, prec, transformed );
+
+ start_unquantize( endPoint_q, prec, transformed );
+
+ unquantize( endPoint_q, prec.x );
+
+ float error = 0;
+ [loop]for ( uint j = 0; j < 16; j ++ ) // sum the squared error over the block's 16 pixels
+ {
+ float dotProduct = dot( span, shared_temp[threadBase + j].pixel_ph - endPoint[0] );// fixed a bug in v0.2
+ uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0 // index chosen from the unquantized endpoints via the 16-level table
+ : ( ( dotProduct < span_norm_sqr ) ? aStep2[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep2[63] );
+
+ uint3 pixel_rh;
+ generate_palette_unquantized16( pixel_rh, endPoint_q[0], endPoint_q[1], index ); // reconstructed half value from the quantized endpoints
+ float3 pixel_r = half2float( pixel_rh );
+ pixel_r -= shared_temp[threadBase + j].pixel_hr;
+ error += dot(pixel_r, pixel_r);
+ }
+ if ( bBadQuantize )
+ error = 1e20f; // sentinel so a candidate flagged by finish_quantize loses the comparison
+
+ shared_temp[GI].error = error;
+ shared_temp[GI].best_mode = candidateModeFlag[threadInBlock + 10];
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 2) // min-error reduction 4 -> 2
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 2].error;
+ shared_temp[GI].best_mode = shared_temp[GI + 2].best_mode;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 1) // min-error reduction 2 -> 1, then publish
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 1].error;
+ shared_temp[GI].best_mode = shared_temp[GI + 1].best_mode;
+ }
+
+ g_OutBuff[blockID] = uint4(asuint(shared_temp[GI].error), shared_temp[GI].best_mode, 0, 0); // x = error bits, y = mode flag
+ }
+}
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void TryModeLE10CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // For table slot g_mode_id (two endpoint pairs), evaluates all 32 partitions of each block and keeps the result only if it beats the error already in g_InBuff
+{
+ const uint MAX_USED_THREAD = 32; // threads cooperating on one block: one per partition
+ uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // blocks handled per thread group
+ uint blockInGroup = GI / MAX_USED_THREAD;
+ uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
+ uint threadBase = blockInGroup * MAX_USED_THREAD; // shared_temp slot of this block's thread 0
+ uint threadInBlock = GI - threadBase; // 0..31, also the partition index this thread tries
+
+#ifndef REF_DEVICE
+ if (blockID >= g_num_total_blocks)
+ {
+ return;
+ }
+
+ if (asfloat(g_InBuff[blockID].x) < 1e-6f) // previous result is already (nearly) exact: pass it through unchanged
+ {
+ g_OutBuff[blockID] = g_InBuff[blockID];
+ return;
+ }
+#endif
+
+ uint block_y = blockID / g_num_block_x;
+ uint block_x = blockID - block_y * g_num_block_x;
+ uint base_x = block_x * BLOCK_SIZE_X; // top-left pixel of the block
+ uint base_y = block_y * BLOCK_SIZE_Y;
+
+ if (threadInBlock < 16) // the first 16 threads each load one pixel
+ {
+ shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb;
+ uint3 pixel_h = float2half( shared_temp[GI].pixel );
+ shared_temp[GI].pixel_hr = half2float(pixel_h);
+ shared_temp[GI].pixel_lum = dot(shared_temp[GI].pixel_hr, RGB2LUM);
+ shared_temp[GI].pixel_ph = start_quantize( pixel_h );
+ }
+#ifdef REF_DEVICE // NOTE(review): barriers exist only in REF_DEVICE builds; otherwise the phases presumably rely on the cooperating threads running in lockstep - confirm
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ // Try mode types 1..10
+ if (threadInBlock < 32)
+ {
+ // find_axis
+ int2x3 endPoint[2]; // [subset][0 = low, 1 = high]
+ endPoint[0][0] = MAX_INT;
+ endPoint[0][1] = MIN_INT;
+ endPoint[1][0] = MAX_INT;
+ endPoint[1][1] = MIN_INT;
+
+ float2 endPoint_lum[2]; // luminance of each endpoint, seeded so the first pixel of a subset always replaces it
+ endPoint_lum[0][0] = MAX_FLOAT;
+ endPoint_lum[0][1] = MIN_FLOAT;
+ endPoint_lum[1][0] = MAX_FLOAT;
+ endPoint_lum[1][1] = MIN_FLOAT;
+
+ uint bit = candidateSectionBit[threadInBlock]; // subset mask of the partition tried by this thread
+ for ( uint i = 0; i < 16; i ++ ) // per subset, find the min- and max-luminance pixels
+ {
+ int3 pixel_ph = shared_temp[threadBase + i].pixel_ph;
+ float pixel_lum = shared_temp[threadBase + i].pixel_lum;
+ if ( (bit >> i) & 1 ) //It gets error when using "candidateSection" as "endPoint_ph" index
+ {
+ if (endPoint_lum[1][0] > pixel_lum)
+ {
+ endPoint[1][0] = pixel_ph;
+ endPoint_lum[1][0] = pixel_lum;
+ }
+ if (endPoint_lum[1][1] < pixel_lum)
+ {
+ endPoint[1][1] = pixel_ph;
+ endPoint_lum[1][1] = pixel_lum;
+ }
+ }
+ else
+ {
+ if (endPoint_lum[0][0] > pixel_lum)
+ {
+ endPoint[0][0] = pixel_ph;
+ endPoint_lum[0][0] = pixel_lum;
+ }
+ if (endPoint_lum[0][1] < pixel_lum)
+ {
+ endPoint[0][1] = pixel_ph;
+ endPoint_lum[0][1] = pixel_lum;
+ }
+ }
+ }
+
+ //compute_index
+ float3 span[2];// fixed a bug in v0.2
+ float span_norm_sqr[2];// fixed a bug in v0.2
+ [unroll]
+ for (uint p = 0; p < 2; ++ p) // orient each subset's endpoints relative to its reference pixel
+ {
+ span[p] = endPoint[p][1] - endPoint[p][0];
+ span_norm_sqr[p] = dot( span[p], span[p] );
+
+ float dotProduct = dot( span[p], shared_temp[threadBase + (0 == p ? 0 : candidateFixUpIndex1D[threadInBlock])].pixel_ph - endPoint[p][0] );// fixed a bug in v0.2
+ if ( span_norm_sqr[p] > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr[p] ) > 32 ) // reference pixel projects past step 32 of 0..63: flip the axis
+ {
+ span[p] = -span[p];
+ swap(endPoint[p][0], endPoint[p][1]);
+ }
+ }
+
+ uint4 prec = candidateModePrec[g_mode_id];
+ int2x3 endPoint_q[2] = endPoint;
+ quantize( endPoint_q[0], prec.x );
+ quantize( endPoint_q[1], prec.x );
+
+ bool transformed = candidateModeTransformed[g_mode_id];
+ if (transformed) // store the other three endpoints as differences from endPoint_q[0][0]
+ {
+ endPoint_q[0][1] -= endPoint_q[0][0];
+ endPoint_q[1][0] -= endPoint_q[0][0];
+ endPoint_q[1][1] -= endPoint_q[0][0];
+ }
+
+ int bBadQuantize = 0;
+ finish_quantize_0( bBadQuantize, endPoint_q[0], prec, transformed );
+ finish_quantize_1( bBadQuantize, endPoint_q[1], prec, transformed );
+
+ start_unquantize( endPoint_q, prec, transformed );
+
+ unquantize( endPoint_q[0], prec.x );
+ unquantize( endPoint_q[1], prec.x );
+
+ float error = 0;
+ for ( uint j = 0; j < 16; j ++ ) // sum the squared error over the 16 pixels, each against its own subset
+ {
+ uint3 pixel_rh;
+ if ((bit >> j) & 1)
+ {
+ float dotProduct = dot( span[1], shared_temp[threadBase + j].pixel_ph - endPoint[1][0] );// fixed a bug in v0.2
+ uint index = ( span_norm_sqr[1] <= 0 || dotProduct <= 0 ) ? 0 // index chosen from the unquantized endpoints via the 8-level table
+ : ( ( dotProduct < span_norm_sqr[1] ) ? aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr[1] ) ] : aStep1[63] );
+ generate_palette_unquantized8( pixel_rh, endPoint_q[1][0], endPoint_q[1][1], index );
+ }
+ else
+ {
+ float dotProduct = dot( span[0], shared_temp[threadBase + j].pixel_ph - endPoint[0][0] );// fixed a bug in v0.2
+ uint index = ( span_norm_sqr[0] <= 0 || dotProduct <= 0 ) ? 0
+ : ( ( dotProduct < span_norm_sqr[0] ) ? aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr[0] ) ] : aStep1[63] );
+ generate_palette_unquantized8( pixel_rh, endPoint_q[0][0], endPoint_q[0][1], index );
+ }
+
+ float3 pixel_r = half2float( pixel_rh );
+ pixel_r -= shared_temp[threadBase + j].pixel_hr;
+ error += dot(pixel_r, pixel_r);
+ }
+ if ( bBadQuantize )
+ error = 1e20f; // sentinel so a candidate flagged by finish_quantize_* loses the comparison
+
+ shared_temp[GI].error = error;
+ shared_temp[GI].best_mode = candidateModeFlag[g_mode_id];
+ shared_temp[GI].best_partition = threadInBlock;
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 16) // min-error reduction 32 -> 16
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 16].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 16].error;
+ shared_temp[GI].best_mode = shared_temp[GI + 16].best_mode;
+ shared_temp[GI].best_partition = shared_temp[GI + 16].best_partition;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 8) // 16 -> 8
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 8].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 8].error;
+ shared_temp[GI].best_mode = shared_temp[GI + 8].best_mode;
+ shared_temp[GI].best_partition = shared_temp[GI + 8].best_partition;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 4) // 8 -> 4
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 4].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 4].error;
+ shared_temp[GI].best_mode = shared_temp[GI + 4].best_mode;
+ shared_temp[GI].best_partition = shared_temp[GI + 4].best_partition;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 2) // 4 -> 2
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 2].error;
+ shared_temp[GI].best_mode = shared_temp[GI + 2].best_mode;
+ shared_temp[GI].best_partition = shared_temp[GI + 2].best_partition;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 1) // 2 -> 1, then compare with the previous best and publish
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 1].error;
+ shared_temp[GI].best_mode = shared_temp[GI + 1].best_mode;
+ shared_temp[GI].best_partition = shared_temp[GI + 1].best_partition;
+ }
+
+ if (asfloat(g_InBuff[blockID].x) > shared_temp[GI].error) // strictly better than the incoming result
+ {
+ g_OutBuff[blockID] = uint4(asuint(shared_temp[GI].error), shared_temp[GI].best_mode, shared_temp[GI].best_partition, 0);
+ }
+ else
+ {
+ g_OutBuff[blockID] = g_InBuff[blockID]; // keep the incoming result
+ }
+ }
+}
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void EncodeBlockCS(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID) // Encodes each 4x4 block with the mode and partition recorded in g_InBuff and writes the packed uint4 block to g_OutBuff
+{
+ const uint MAX_USED_THREAD = 32; // threads cooperating on one block
+ uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // blocks handled per thread group
+ uint blockInGroup = GI / MAX_USED_THREAD;
+ uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
+ uint threadBase = blockInGroup * MAX_USED_THREAD; // shared_temp slot of this block's thread 0
+ uint threadInBlock = GI - threadBase; // 0..31
+
+#ifndef REF_DEVICE
+ if (blockID >= g_num_total_blocks)
+ {
+ return;
+ }
+#endif
+
+ uint block_y = blockID / g_num_block_x;
+ uint block_x = blockID - block_y * g_num_block_x;
+ uint base_x = block_x * BLOCK_SIZE_X; // top-left pixel of the block
+ uint base_y = block_y * BLOCK_SIZE_Y;
+
+ if (threadInBlock < 16) // the first 16 threads each load one pixel
+ {
+ shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb;
+ shared_temp[GI].pixel_lum = dot(shared_temp[GI].pixel, RGB2LUM); // NOTE(review): luminance of the raw pixel here, whereas the TryMode* shaders use the half-rounded pixel_hr - confirm intended
+ uint3 pixel_h = float2half( shared_temp[GI].pixel );
+ shared_temp[GI].pixel_ph = start_quantize( pixel_h );
+ }
+#ifdef REF_DEVICE // NOTE(review): barriers exist only in REF_DEVICE builds; otherwise the phases presumably rely on the cooperating threads running in lockstep - confirm
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ uint best_mode = g_InBuff[blockID].y; // 1-based mode flag chosen by the TryMode* passes
+ uint best_partition = g_InBuff[blockID].z;
+
+ uint4 block = 0; // output block being assembled
+
+ if (threadInBlock < 32) // threads 0..15 seed subset 0 (or the whole block when best_mode > 10), threads 16..31 seed subset 1
+ {
+ int2x3 endPoint;
+ endPoint[0] = MAX_INT;
+ endPoint[1] = MIN_INT;
+
+ float2 endPoint_lum; // sentinels make pixels outside this thread's subset lose every comparison in the reduction
+ endPoint_lum[0] = MAX_FLOAT;
+ endPoint_lum[1] = MIN_FLOAT;
+
+ int2 endPoint_lum_index;
+ endPoint_lum_index[0] = -1;
+ endPoint_lum_index[1] = -1;
+
+ int3 pixel_ph = shared_temp[threadBase + (threadInBlock & 0xF)].pixel_ph;
+ float pixel_lum = shared_temp[threadBase + (threadInBlock & 0xF)].pixel_lum;
+ if (threadInBlock < 16)
+ {
+ if (best_mode > 10) // single endpoint pair: every pixel takes part
+ {
+ endPoint[0] = endPoint[1] = pixel_ph;
+ endPoint_lum[0] = endPoint_lum[1] = pixel_lum;
+ }
+ else
+ {
+ uint bits = candidateSectionBit[best_partition];
+ if (0 == ((bits >> threadInBlock) & 1)) // pixel belongs to subset 0
+ {
+ endPoint[0] = endPoint[1] = pixel_ph;
+ endPoint_lum[0] = endPoint_lum[1] = pixel_lum;
+ }
+ }
+ }
+ else
+ {
+ if (best_mode <= 10)
+ {
+ uint bits = candidateSectionBit[best_partition];
+ if (1 == ((bits >> (threadInBlock & 0xF)) & 1)) // pixel belongs to subset 1
+ {
+ endPoint[0] = endPoint[1] = pixel_ph;
+ endPoint_lum[0] = endPoint_lum[1] = pixel_lum;
+ }
+ }
+ }
+
+ shared_temp[GI].endPoint_low = endPoint[0];
+ shared_temp[GI].endPoint_high = endPoint[1];
+
+ shared_temp[GI].endPoint_lum_low = endPoint_lum[0];
+ shared_temp[GI].endPoint_lum_high = endPoint_lum[1];
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if ((threadInBlock & 0xF) < 8) // min/max-luminance reduction 16 -> 8, run independently in each 16-thread half
+ {
+ if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 8].endPoint_lum_low)
+ {
+ shared_temp[GI].endPoint_low = shared_temp[GI + 8].endPoint_low;
+ shared_temp[GI].endPoint_lum_low = shared_temp[GI + 8].endPoint_lum_low;
+ }
+ if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 8].endPoint_lum_high)
+ {
+ shared_temp[GI].endPoint_high = shared_temp[GI + 8].endPoint_high;
+ shared_temp[GI].endPoint_lum_high = shared_temp[GI + 8].endPoint_lum_high;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if ((threadInBlock & 0xF) < 4) // 8 -> 4
+ {
+ if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 4].endPoint_lum_low)
+ {
+ shared_temp[GI].endPoint_low = shared_temp[GI + 4].endPoint_low;
+ shared_temp[GI].endPoint_lum_low = shared_temp[GI + 4].endPoint_lum_low;
+ }
+ if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 4].endPoint_lum_high)
+ {
+ shared_temp[GI].endPoint_high = shared_temp[GI + 4].endPoint_high;
+ shared_temp[GI].endPoint_lum_high = shared_temp[GI + 4].endPoint_lum_high;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if ((threadInBlock & 0xF) < 2) // 4 -> 2
+ {
+ if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 2].endPoint_lum_low)
+ {
+ shared_temp[GI].endPoint_low = shared_temp[GI + 2].endPoint_low;
+ shared_temp[GI].endPoint_lum_low = shared_temp[GI + 2].endPoint_lum_low;
+ }
+ if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 2].endPoint_lum_high)
+ {
+ shared_temp[GI].endPoint_high = shared_temp[GI + 2].endPoint_high;
+ shared_temp[GI].endPoint_lum_high = shared_temp[GI + 2].endPoint_lum_high;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if ((threadInBlock & 0xF) < 1) // 2 -> 1: slots threadBase and threadBase + 16 now hold the endpoints of subsets 0 and 1
+ {
+ if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 1].endPoint_lum_low)
+ {
+ shared_temp[GI].endPoint_low = shared_temp[GI + 1].endPoint_low;
+ shared_temp[GI].endPoint_lum_low = shared_temp[GI + 1].endPoint_lum_low;
+ }
+ if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 1].endPoint_lum_high)
+ {
+ shared_temp[GI].endPoint_high = shared_temp[GI + 1].endPoint_high;
+ shared_temp[GI].endPoint_lum_high = shared_temp[GI + 1].endPoint_lum_high;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 2) // thread p orients subset p's endpoints and moves them to slot threadBase + p
+ {
+ // find_axis
+ int2x3 endPoint;
+ endPoint[0] = shared_temp[threadBase + threadInBlock * 16].endPoint_low;
+ endPoint[1] = shared_temp[threadBase + threadInBlock * 16].endPoint_high;
+
+ uint fixup = 0; // reference pixel: pixel 0 for subset 0, the table entry for subset 1
+ if ((1 == threadInBlock) && (best_mode <= 10))
+ {
+ fixup = candidateFixUpIndex1D[best_partition];
+ }
+
+ float3 span = endPoint[1] - endPoint[0];
+ float span_norm_sqr = dot( span, span );
+ float dotProduct = dot( span, shared_temp[threadBase + fixup].pixel_ph - endPoint[0] );
+ if ( span_norm_sqr > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr ) > 32 ) // reference pixel projects past step 32 of 0..63: swap the endpoints
+ {
+ swap(endPoint[0], endPoint[1]);
+ }
+
+ shared_temp[GI].endPoint_low = endPoint[0];
+ shared_temp[GI].endPoint_high = endPoint[1];
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 16) // each thread computes its pixel's index and places it at that pixel's bit position in block.zw
+ {
+ uint bits;
+ if (best_mode > 10)
+ {
+ bits = 0; // single endpoint pair: treat every pixel as subset 0
+ }
+ else
+ {
+ bits = candidateSectionBit[best_partition];
+ }
+
+ float3 span;
+ float dotProduct;
+ if ((bits >> threadInBlock) & 1)
+ {
+ span = shared_temp[threadBase + 1].endPoint_high - shared_temp[threadBase + 1].endPoint_low;
+ dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel_ph - shared_temp[threadBase + 1].endPoint_low );
+ }
+ else
+ {
+ span = shared_temp[threadBase + 0].endPoint_high - shared_temp[threadBase + 0].endPoint_low;
+ dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel_ph - shared_temp[threadBase + 0].endPoint_low );
+ }
+ float span_norm_sqr = dot( span, span );
+
+ if (best_mode > 10) // 16-level indices, 4-bit stride
+ {
+ uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
+ : ( ( dotProduct < span_norm_sqr ) ? aStep2[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep2[63] );
+ if (threadInBlock == 0)
+ {
+ block.z |= index << 1; // pixel 0 starts at bit 1 and is followed by pixel 1 at bit 4, so it gets a 3-bit slot
+ }
+ else if (threadInBlock < 8)
+ {
+ block.z |= index << (threadInBlock * 4);
+ }
+ else
+ {
+ block.w |= index << ((threadInBlock - 8) * 4);
+ }
+ }
+ else // 8-level indices, 3-bit stride with narrower slots for the two reference pixels
+ {
+ uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
+ : ( ( dotProduct < span_norm_sqr ) ? aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep1[63] );
+
+ uint fixup = candidateFixUpIndex1D[best_partition];
+ int2 offset = int2((fixup != 2), (fixup == 15)); // x: 1 unless subset 1's reference pixel is 2; y: 1 when it is 15. Each shifts the later slots by one bit
+
+ if (threadInBlock == 0)
+ {
+ block.z |= index << 18; // pixel 0 occupies bits 18..19
+ }
+ else if (threadInBlock < 3)
+ {
+ block.z |= index << (20 + (threadInBlock - 1) * 3);
+ }
+ else if (threadInBlock < 5)
+ {
+ block.z |= index << (25 + (threadInBlock - 3) * 3 + offset.x);
+ }
+ else if (threadInBlock == 5) // pixel 5 either straddles the z/w boundary (offset.x == 0) or starts at bit 0 of w
+ {
+ block.w |= index >> !offset.x;
+ if (!offset.x)
+ {
+ block.z |= index << 31; // lowest index bit lands in the top bit of z
+ }
+ }
+ else if (threadInBlock < 9)
+ {
+ block.w |= index << (2 + (threadInBlock - 6) * 3 + offset.x);
+ }
+ else
+ {
+ block.w |= index << (11 + (threadInBlock - 9) * 3 + offset.y);
+ }
+ }
+
+ shared_temp[GI].pixel_hr.xy = asfloat(block.zw); // pixel_hr.xy is reused as raw bit storage for the OR-reduction below
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 8) // bitwise-OR reduction of the index bits, 16 -> 8
+ {
+ shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 8].pixel_hr.xy));
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 4) // 8 -> 4
+ {
+ shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 4].pixel_hr.xy));
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 2) // 4 -> 2
+ {
+ shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 2].pixel_hr.xy));
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 1) // 2 -> 1: thread 0 ends up with all index bits in block.zw
+ {
+ shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 1].pixel_hr.xy));
+
+ block.zw = asuint(shared_temp[GI].pixel_hr.xy);
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ bool transformed = candidateModeTransformed[best_mode - 1]; // best_mode is 1-based, the tables are 0-based
+ uint4 prec = candidateModePrec[best_mode - 1];
+ if (threadInBlock == 2) // thread 2 quantizes subset 0's endpoints into slot threadBase + 2
+ {
+ int2x3 endPoint_q;
+ endPoint_q[0] = shared_temp[threadBase + 0].endPoint_low;
+ endPoint_q[1] = shared_temp[threadBase + 0].endPoint_high;
+
+ quantize( endPoint_q, prec.x );
+ if (transformed)
+ {
+ endPoint_q[1] -= endPoint_q[0];
+ }
+
+ shared_temp[GI].endPoint_low = endPoint_q[0];
+ shared_temp[GI].endPoint_high = endPoint_q[1];
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock == 3) // thread 3 quantizes subset 1's endpoints into slot threadBase + 3 (two-subset modes only)
+ {
+ int3 ep0 = shared_temp[threadBase + 2].endPoint_low; // quantized first endpoint of subset 0, written by thread 2 above
+ int2x3 endPoint_q;
+ endPoint_q[0] = shared_temp[threadBase + 1].endPoint_low;
+ endPoint_q[1] = shared_temp[threadBase + 1].endPoint_high;
+
+ if (best_mode <= 10)
+ {
+ quantize( endPoint_q, prec.x );
+ if (transformed)
+ {
+ endPoint_q[0] -= ep0;
+ endPoint_q[1] -= ep0;
+ }
+
+ shared_temp[GI].endPoint_low = endPoint_q[0];
+ shared_temp[GI].endPoint_high = endPoint_q[1];
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 2) // thread p finishes subset p's quantization and stores it back in slot threadBase + p
+ {
+ int2x3 endPoint_q;
+ endPoint_q[0] = shared_temp[threadBase + threadInBlock + 2].endPoint_low;
+ endPoint_q[1] = shared_temp[threadBase + threadInBlock + 2].endPoint_high;
+
+ int bBadQuantize = 0; // result is not read afterwards in this shader
+ if (threadInBlock == 0)
+ {
+ if (best_mode > 10)
+ {
+ finish_quantize( bBadQuantize, endPoint_q, prec, transformed );
+ }
+ else
+ {
+ finish_quantize_0( bBadQuantize, endPoint_q, prec, transformed );
+ }
+ }
+ else // if (threadInBlock == 1)
+ {
+ if (best_mode <= 10)
+ {
+ finish_quantize_1( bBadQuantize, endPoint_q, prec, transformed );
+ }
+ }
+
+ shared_temp[GI].endPoint_low = endPoint_q[0];
+ shared_temp[GI].endPoint_high = endPoint_q[1];
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if ( threadInBlock == 0 ) // thread 0 packs the endpoints around the index bits and writes the block
+ {
+ int2x3 endPoint_q[2];
+ endPoint_q[0][0] = shared_temp[threadBase + 0].endPoint_low;
+ endPoint_q[0][1] = shared_temp[threadBase + 0].endPoint_high;
+ endPoint_q[1][0] = shared_temp[threadBase + 1].endPoint_low;
+ endPoint_q[1][1] = shared_temp[threadBase + 1].endPoint_high;
+
+ if ( best_mode > 10 )
+ {
+ block_package( block, endPoint_q[0], best_mode );
+ }
+ else
+ {
+ block_package( block, endPoint_q, best_mode, best_partition );
+ }
+
+ g_OutBuff[blockID] = block;
+ }
+}
+
+uint float2half1( float f ) // Converts one 32-bit float to a 16-bit half bit pattern (in the low 16 bits of the result), rounding to nearest even.
+{
+    uint Result;
+
+    uint IValue = asuint(f);
+    uint Sign = (IValue & 0x80000000U) >> 16U; // sign bit moved down to bit 15
+    IValue = IValue & 0x7FFFFFFFU; // magnitude bits only
+
+    if (IValue > 0x47FFEFFFU)
+    {
+        // Too large for a half: saturate to 0x7FFF (all exponent and mantissa bits set). NOTE(review): IEEE half infinity is 0x7C00 and the commented-out float2half clamps to 0x7bff - confirm which is intended.
+        Result = 0x7FFFU;
+    }
+    else
+    {
+        if (IValue < 0x38800000U)
+        {
+            // The number is too small to be represented as a normalized half. Convert it to a denormalized value.
+            // The count is clamped to 31 because shader shifts use only the low 5 bits of the count; without the clamp, inputs with a biased exponent of 81 or less produce non-zero halves instead of flushing to zero.
+            uint Shift = min( 113U - (IValue >> 23U), 31U );
+            IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift; // restore the implicit leading 1, then shift into denormal position
+        }
+        else
+        {
+            // Rebias the exponent to represent the value as a normalized half.
+            IValue += 0xC8000000U; // wraps around, i.e. subtracts 0x38000000 (112 exponent steps)
+        }
+
+        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU; // drop 13 mantissa bits, rounding half to even
+    }
+    return (Result|Sign);
+}
+
+uint3 float2half( float3 endPoint_f )
+{
+    //uint3 sign = asuint(endPoint_f) & 0x80000000;
+    //uint3 expo = asuint(endPoint_f) & 0x7F800000;
+    //uint3 base = asuint(endPoint_f) & 0x007FFFFF;
+    //return ( expo < 0x33800000 ) ? 0
+    // //0x33800000 indicating 2^-24, which is minimal denormalized number that half can present
+    // : ( ( expo < 0x38800000 ) ? ( sign >> 16 ) | ( ( base + 0x00800000 ) >> ( 23 - ( ( expo - 0x33800000 ) >> 23 ) ) )//fixed a bug in v0.2
+    // //0x38800000 indicating 2^-14, which is minimal normalized number that half can present, so need to use denormalized half presentation
+    // : ( ( expo == 0x7F800000 || expo > 0x47000000 ) ? ( ( sign >> 16 ) | 0x7bff )
+    // // treat NaN as INF, treat INF (including NaN) as the maximum/minimum number that half can present
+    // // 0x47000000 indicating 2^15, which is maximum exponent that half can present, so cut to 0x7bff which is the maximum half number
+    // : ( ( sign >> 16 ) | ( ( ( expo - 0x38000000 ) | base ) >> 13 ) ) ) );
+    // Active path: convert each channel with the scalar helper (the vectorized version above is disabled).
+    const uint r = float2half1( endPoint_f.x ), g = float2half1( endPoint_f.y ), b = float2half1( endPoint_f.z );
+    return uint3( r, g, b );
+}
+int3 start_quantize( uint3 pixel_h ) // Maps half bit patterns to the integer range that quantize() expects for the current g_format.
+{
+    if ( g_format == UNSIGNED_F16 )
+    {
+        return asint( ( pixel_h << 6 ) / 31 ); // unsigned format: scale the bit pattern by 64/31
+    }
+    else
+    {
+        // Signed format: scale the magnitude by 32/31 and reapply the sign; the largest finite half of either sign is pinned to +/-0x7fff.
+        return ( pixel_h < 0x8000 ) ? ( ( pixel_h == 0x7bff ) ? 0x7fff : asint( ( pixel_h << 5 ) / 31 ) )// fixed a bug in v0.2
+            : ( ( pixel_h == 0xfbff ) ? 0xffff8001 : -asint( ( ( 0x00007fff & pixel_h ) << 5 ) / 31 ) );// negative branch must test 0xfbff: 0x7bff can never occur when pixel_h >= 0x8000
+    }
+}
+// Rescales every endpoint component in place to 'prec' bits.
+//   endPoint : two RGB endpoints, updated in place.
+//   prec     : target bit count.
+// UNSIGNED_F16: untouched when prec >= 15 or the component is 0; 0xFFFF becomes
+//               ( 1 << prec ) - 1; anything else becomes ( x << prec ) >> 16.
+// otherwise   : untouched when prec >= 16 or the component is 0; magnitude 0x7FFF becomes
+//               ( 1 << ( prec - 1 ) ) - 1; anything else becomes ( |x| << ( prec - 1 ) ) >> 15,
+//               with the original sign re-applied.
+void quantize( inout int2x3 endPoint, uint prec )
+{
+    int iprec = asint( prec );
+
+    if ( g_format == UNSIGNED_F16 )
+    {
+        if ( iprec < 15 )
+        {
+            int fullCode = ( 1 << iprec ) - 1;
+            endPoint = ( endPoint == 0 ) ? endPoint
+                     : ( ( endPoint == asint(0xFFFF) ) ? fullCode : ( ( endPoint << iprec ) >> 16 ) );
+        }
+    }
+    else
+    {
+        if ( iprec < 16 )
+        {
+            int shift = iprec - 1;
+            int fullCode = ( 1 << shift ) - 1;
+            endPoint = ( endPoint == 0 ) ? endPoint
+                     : ( ( endPoint >= 0 ) ? ( ( endPoint == asint(0x7FFF) ) ? fullCode : ( ( endPoint << shift ) >> 15 ) )
+                                           : ( ( -endPoint == asint(0x7FFF) ) ? -fullCode : -( ( -endPoint << shift ) >> 15 ) ) );
+        }
+    }
+}
+// Range-checks and bit-masks a quantized endpoint pair; row 1 is treated as a delta when
+// 'transformed' is set.
+//   bBadQuantize : OR-accumulated with 1 when any row-1 component does not fit.
+//   endPoint     : endpoint pair, updated in place.
+//   prec         : x = bit count for row 0, yzw = per-channel bit counts for row 1.
+//   transformed  : false -> every component is just masked to prec.x bits.
+void finish_quantize_0( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed )
+{
+    if ( !transformed )
+    {
+        endPoint &= ( ( 1 << prec.x ) - 1 );
+        return;
+    }
+
+    // Classify each row-1 component against the signed range of prec.yzw bits.
+    bool3 isNonNegative = ( endPoint[1] >= 0 );
+    bool3 overflowsHigh = ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) );
+    bool3 overflowsLow  = ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) );
+
+    bool3 bBadComponent = isNonNegative ? overflowsHigh : overflowsLow;
+    bBadQuantize |= any(bBadComponent);
+
+    endPoint[0] &= ( ( 1 << prec.x ) - 1 );
+
+    // Out-of-range values are clamped; in-range negatives are masked to their field width.
+    endPoint[1] = isNonNegative ? ( overflowsHigh ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1] )
+                                : ( overflowsLow ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) ) );
+}
+// Range-checks and bit-masks a quantized endpoint pair in which both rows are deltas when
+// 'transformed' is set.
+//   bBadQuantize : OR-accumulated with 1 when any component of either row does not fit.
+//   endPoint     : endpoint pair, updated in place.
+//   prec         : yzw = per-channel bit counts for the transformed case,
+//                  x = bit count used for the untransformed mask.
+//   transformed  : false -> every component is just masked to prec.x bits.
+void finish_quantize_1( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed )
+{
+    if ( !transformed )
+    {
+        endPoint &= ( ( 1 << prec.x ) - 1 );
+        return;
+    }
+
+    // Row 0 classification.
+    bool3 nonNeg0 = ( endPoint[0] >= 0 );
+    bool3 tooBig0 = ( endPoint[0] >= ( 1 << ( prec.yzw - 1 ) ) );
+    bool3 tooLow0 = ( -endPoint[0] > ( 1 << ( prec.yzw - 1 ) ) );
+
+    // Row 1 classification.
+    bool3 nonNeg1 = ( endPoint[1] >= 0 );
+    bool3 tooBig1 = ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) );
+    bool3 tooLow1 = ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) );
+
+    bool3 badRow0 = nonNeg0 ? tooBig0 : tooLow0;
+    bool3 badRow1 = nonNeg1 ? tooBig1 : tooLow1;
+    bBadQuantize |= ( any( badRow0 ) || any( badRow1 ) );
+
+    // Out-of-range values are clamped; in-range negatives are masked to their field width.
+    endPoint[0] = nonNeg0 ? ( tooBig0 ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[0] )
+                          : ( tooLow0 ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[0] & ( ( 1 << prec.yzw ) - 1 ) ) );
+    endPoint[1] = nonNeg1 ? ( tooBig1 ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1] )
+                          : ( tooLow1 ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) ) );
+}
+// Range-checks and bit-masks a quantized endpoint pair; row 1 is treated as a delta when
+// 'transformed' is set.
+//   bBadQuantize : written (not accumulated): true when any row-1 component does not fit,
+//                  always false for the untransformed case.
+//   endPoint     : endpoint pair, updated in place.
+//   prec         : x = bit count for row 0, yzw = per-channel bit counts for row 1.
+//   transformed  : false -> every component is just masked to prec.x bits.
+void finish_quantize( out bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed )
+{
+    if ( !transformed )
+    {
+        endPoint &= ( ( 1 << prec.x ) - 1 );
+        bBadQuantize = false;
+        return;
+    }
+
+    // Classify each row-1 component against the signed range of prec.yzw bits.
+    bool3 isNonNegative = ( endPoint[1] >= 0 );
+    bool3 overflowsHigh = ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) );
+    bool3 overflowsLow  = ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) );
+
+    bool3 bBadComponent = isNonNegative ? overflowsHigh : overflowsLow;
+    bBadQuantize = any( bBadComponent );
+
+    endPoint[0] &= ( ( 1 << prec.x ) - 1 );
+
+    // Out-of-range values are clamped; in-range negatives are masked to their field width.
+    endPoint[1] = isNonNegative ? ( overflowsHigh ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1] )
+                                : ( overflowsLow ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) ) );
+}
+
+// Sign-extends each component of 'color' from a 'prec'-bit two's-complement field.
+//   prec  : field width per component.
+//   color : updated in place; components whose bit (prec - 1) is clear are left unchanged.
+void SIGN_EXTEND( uint3 prec, inout int3 color )
+{
+    uint3 signBit = 1 << (prec - 1);
+    bool3 isNegative = (color & signBit) != 0;
+
+    // Keep the bits below the sign bit and subtract the sign bit's weight.
+    color = isNegative ? (color & (signBit - 1)) - signBit : color;
+}
+
+// Sign-extends a single endpoint pair where required.
+//   transformed : true when row 1 holds values that must be sign-extended regardless of format.
+//   prec        : x = field width for row 0, yzw = per-channel widths for row 1.
+//   endPoint    : updated in place.
+// Row 0 is extended only for SIGNED_F16; row 1 is extended for SIGNED_F16 or when transformed.
+void sign_extend( bool transformed, uint4 prec, inout int2x3 endPoint )
+{
+    bool isSigned = ( g_format == SIGNED_F16 );
+
+    if ( isSigned )
+    {
+        SIGN_EXTEND( prec.x, endPoint[0] );
+    }
+    if ( isSigned || transformed )
+    {
+        SIGN_EXTEND( prec.yzw, endPoint[1] );
+    }
+}
+
+// Sign-extends two endpoint pairs (four RGB endpoints) where required.
+//   transformed : true when the three non-base endpoints must be sign-extended regardless of format.
+//   prec        : x = field width for endPoint[0][0], yzw = per-channel widths for the others.
+//   endPoint    : updated in place.
+// endPoint[0][0] is extended only for SIGNED_F16; the other three endpoints are extended
+// for SIGNED_F16 or when transformed.
+void sign_extend( bool transformed, uint4 prec, inout int2x3 endPoint[2] )
+{
+    bool isSigned = ( g_format == SIGNED_F16 );
+
+    if ( isSigned )
+    {
+        SIGN_EXTEND( prec.x, endPoint[0][0] );
+    }
+    if ( isSigned || transformed )
+    {
+        SIGN_EXTEND( prec.yzw, endPoint[0][1] );
+        SIGN_EXTEND( prec.yzw, endPoint[1][0] );
+        SIGN_EXTEND( prec.yzw, endPoint[1][1] );
+    }
+}
+// First unquantization step for two endpoint pairs: sign-extend, then (when transformed)
+// add endPoint[0][0] to the other three endpoints.
+//   endPoint    : updated in place.
+//   prec        : field widths, forwarded to sign_extend.
+//   transformed : whether the non-base endpoints are offsets from endPoint[0][0].
+void start_unquantize( inout int2x3 endPoint[2], uint4 prec, bool transformed )
+{
+    sign_extend( transformed, prec, endPoint );
+
+    if ( !transformed )
+        return;
+
+    int3 base = endPoint[0][0];
+    endPoint[0][1] = endPoint[0][1] + base;
+    endPoint[1][0] = endPoint[1][0] + base;
+    endPoint[1][1] = endPoint[1][1] + base;
+}
+// First unquantization step for a single endpoint pair: sign-extend, then (when
+// transformed) add row 0 to row 1.
+//   endPoint    : updated in place.
+//   prec        : field widths, forwarded to sign_extend.
+//   transformed : whether row 1 is an offset from row 0.
+void start_unquantize( inout int2x3 endPoint, uint4 prec, bool transformed )
+{
+    sign_extend( transformed, prec, endPoint );
+
+    if ( transformed )
+    {
+        endPoint[1] = endPoint[1] + endPoint[0];
+    }
+}
+// Expands 'prec'-bit endpoint components back to the full integer scale, in place.
+//   color : endpoint pair, updated in place.
+//   prec  : bit count the components were quantized to.
+// UNSIGNED_F16: no-op for prec >= 15; 0 stays 0, the all-ones code becomes 0xFFFF,
+//               anything else becomes ( ( x << 16 ) + 0x8000 ) >> prec.
+// otherwise   : no-op for prec >= 16; works on the magnitude, where 0 stays 0, codes at or
+//               above ( 1 << ( prec - 1 ) ) - 1 become 0x7FFF, anything else becomes
+//               ( ( x << 15 ) + 0x4000 ) >> ( prec - 1 ); the sign is restored afterwards.
+void unquantize( inout int2x3 color, uint prec )
+{
+    int iprec = asint( prec );
+
+    if ( g_format == UNSIGNED_F16 )
+    {
+        if ( prec >= 15 )
+            return;
+
+        int fullCode = ( 1 << iprec ) - 1;
+        color = ( color == 0 ) ? color
+              : ( ( color == fullCode ) ? 0xFFFF : ( ( ( color << 16 ) + 0x8000 ) >> iprec ) );
+    }
+    else
+    {
+        if ( prec >= 16 )
+            return;
+
+        // Remember the sign, process the magnitude, then put the sign back.
+        bool2x3 wasNegative = ( color < 0 );
+        color = abs( color );
+
+        int topCode = ( 1 << ( iprec - 1 ) ) - 1;
+        color = ( color == 0 ) ? color
+              : ( ( color >= topCode ) ? 0x7FFF : ( ( ( color << 15 ) + 0x4000 ) >> ( iprec - 1 ) ) );
+
+        color = wasNegative ? -color : color;
+    }
+}
+// Last unquantization step: scales an interpolated integer colour by 31/64 (unsigned
+// format) or 31/32 (otherwise) and returns the resulting bit patterns.
+//   color : interpolated integer colour.
+// For the non-unsigned format a negative scaled result is returned as its magnitude with
+// bit 15 (0x8000) set; a result of 0 never gets that bit.
+uint3 finish_unquantize( int3 color )
+{
+    if ( g_format == UNSIGNED_F16 )
+    {
+        return asuint( ( color * 31 ) >> 6 );
+    }
+
+    // Scale the magnitude, keeping the sign as an ordinary negative integer for now.
+    int3 scaled = ( color < 0 ) ? -( ( -color * 31 ) >> 5 ) : ( color * 31 ) >> 5;
+
+    // Re-encode negatives as magnitude plus sign bit.
+    int3 encoded = ( scaled < 0 ) ? ( ( -scaled ) | 0x8000 ) : scaled;
+    return asuint( encoded );
+}
+// Produces entry 'i' of an 8-entry palette interpolated between two endpoints.
+//   palette : receives finish_unquantize() of the blended colour.
+//   low     : endpoint weighted by ( 64 - w ).
+//   high    : endpoint weighted by w.
+//   i       : palette index used to look up w in the 8-entry weight table.
+void generate_palette_unquantized8( out uint3 palette, int3 low, int3 high, int i )
+{
+    static const int kWeights8[] = {0, 9, 18, 27, 37, 46, 55, 64};
+
+    int weightHigh = kWeights8[i];
+    int weightLow = 64 - weightHigh;
+
+    // Weighted sum in 1/64 units; +32 rounds before the shift.
+    int3 blended = ( low * weightLow + high * weightHigh + 32 ) >> 6;
+    palette = finish_unquantize( blended );
+}
+// Produces entry 'i' of a 16-entry palette interpolated between two endpoints.
+//   palette : receives finish_unquantize() of the blended colour.
+//   low     : endpoint weighted by ( 64 - w ).
+//   high    : endpoint weighted by w.
+//   i       : palette index used to look up w in the 16-entry weight table.
+void generate_palette_unquantized16( out uint3 palette, int3 low, int3 high, int i )
+{
+    static const int kWeights16[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};
+
+    int weightHigh = kWeights16[i];
+    int weightLow = 64 - weightHigh;
+
+    // Weighted sum in 1/64 units; +32 rounds before the shift.
+    int3 blended = ( low * weightLow + high * weightHigh + 32 ) >> 6;
+    palette = finish_unquantize( blended );
+}
+
+// Converts one 16-bit half-float bit pattern to a 32-bit float.
+//   Value : half bits (sign in bit 15, exponent in bits 14..10, mantissa in bits 9..0).
+// Returns the float assembled from the sign, the exponent rebiased by +112 and the
+// mantissa shifted into float position.  Denormal inputs are renormalized; zero maps to
+// a zero exponent field.
+float half2float1( uint Value )
+{
+    uint SignBits = (Value & 0x8000) << 16;
+    uint Mantissa = (uint)(Value & 0x03FF);
+    uint Exponent;
+
+    if ((Value & 0x7C00) != 0)
+    {
+        // Normalized input: take the exponent field as-is.
+        Exponent = (uint)((Value >> 10) & 0x1F);
+    }
+    else if (Mantissa == 0)
+    {
+        // Zero: chosen so that adding the 112 rebias below wraps to 0.
+        Exponent = (uint)(-112);
+    }
+    else
+    {
+        // Denormalized input: shift the mantissa up until its implicit bit (0x0400)
+        // appears, decrementing the exponent for every shift after the first.
+        Exponent = 0;
+        Mantissa <<= 1;
+        while ((Mantissa & 0x0400) == 0)
+        {
+            Exponent--;
+            Mantissa <<= 1;
+        }
+
+        Mantissa &= 0x03FF;
+    }
+
+    uint ExponentBits = (Exponent + 112) << 23;
+    uint MantissaBits = Mantissa << 13;
+
+    return asfloat(SignBits | ExponentBits | MantissaBits);
+}
+
+// Component-wise half -> float conversion of an RGB triple.
+//   color_h : three half bit patterns.
+// Returns the three floats produced by half2float1, one per component.
+float3 half2float(uint3 color_h )
+{
+    // An earlier vectorised mask-and-shift formulation used to live here; conversion is
+    // now delegated to the scalar helper.
+    float3 color_f;
+    color_f.x = half2float1( color_h.x );
+    color_f.y = half2float1( color_h.y );
+    color_f.z = half2float1( color_h.z );
+    return color_f;
+}
+
+void block_package( inout uint4 block, int2x3 endPoint[2], uint mode_type, uint partition_index ) // for mode 1 - 10
+{
+ block.xy = 0;
+ block.z &= 0xFFFC0000;
+
+ //block.z |= (partition_index & 0x1f) << 13;
+
+ if ( mode_type == candidateModeFlag[0])
+ {
+ /*block.x = candidateModeMemory[0];
+ block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+ block.x |= ( endPoint[1][0].g >> 2 ) & 0x00000004;
+ block.x |= ( endPoint[1][0].b >> 1 ) & 0x00000008;
+ block.x |= endPoint[1][1].b & 0x00000010;
+ block.y |= ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
+ block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
+ block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
+ block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+ block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 );
+ block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
+ block.yz |= ( ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000) ) | ( ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040) );
+ block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/
+
+ block.x |= ((candidateModeMemory[0] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[0] >> 1) & 1) << 1;
+ block.x |= ((endPoint[1][0].g >> 4) & 1) << 2;
+ block.x |= ((endPoint[1][0].b >> 4) & 1) << 3;
+ block.x |= ((endPoint[1][1].b >> 4) & 1) << 4;
+ block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[0][0].r >> 8) & 1) << 13;
+ block.x |= ((endPoint[0][0].r >> 9) & 1) << 14;
+ block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[0][0].g >> 8) & 1) << 23;
+ block.x |= ((endPoint[0][0].g >> 9) & 1) << 24;
+ block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[0][0].b >> 8) & 1) << 1;
+ block.y |= ((endPoint[0][0].b >> 9) & 1) << 2;
+ block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[1][1].g >> 4) & 1) << 8;
+ block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+ block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+ block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+ block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+ block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+ block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+ block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+ block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+ block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+ block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+ block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+ block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+ block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+ block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+ block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+ block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+ block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+ block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+ block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+ block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+ block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+ block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+ block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+ block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+ block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+ block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+ block.z |= ((partition_index >> 0) & 1) << 13;
+ block.z |= ((partition_index >> 1) & 1) << 14;
+ block.z |= ((partition_index >> 2) & 1) << 15;
+ block.z |= ((partition_index >> 3) & 1) << 16;
+ block.z |= ((partition_index >> 4) & 1) << 17;
+ }
+ else if ( mode_type == candidateModeFlag[1])
+ {
+ /*block.x = candidateModeMemory[1];
+ block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00000FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x003F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+ block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 );
+ block.x |= ( ( endPoint[1][0].g >> 3 ) & 0x00000004 ) | ( ( endPoint[1][0].g << 20 ) & 0x01000000 );
+ block.x |= ( endPoint[1][1].g >> 1 ) & 0x00000018;
+ block.x |= ( ( endPoint[1][1].b << 21 ) & 0x00800000 ) | ( ( endPoint[1][1].b << 12 ) & 0x00003000 );
+ block.x |= ( ( endPoint[1][0].b << 17 ) & 0x00400000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
+ block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E);
+ block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+ block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80);
+ block.y |= ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ) | ( ( endPoint[1][1].b >> 3 ) & 0x00000001 );
+ block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/
+
+ block.x |= ((candidateModeMemory[1] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[1] >> 1) & 1) << 1;
+ block.x |= ((endPoint[1][0].g >> 5) & 1) << 2;
+ block.x |= ((endPoint[1][1].g >> 4) & 1) << 3;
+ block.x |= ((endPoint[1][1].g >> 5) & 1) << 4;
+ block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[1][1].b >> 0) & 1) << 12;
+ block.x |= ((endPoint[1][1].b >> 1) & 1) << 13;
+ block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+ block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[1][0].b >> 5) & 1) << 22;
+ block.x |= ((endPoint[1][1].b >> 2) & 1) << 23;
+ block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+ block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[1][1].b >> 3) & 1) << 0;
+ block.y |= ((endPoint[1][1].b >> 5) & 1) << 1;
+ block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+ block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[0][1].r >> 5) & 1) << 8;
+ block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+ block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+ block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+ block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+ block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[0][1].g >> 5) & 1) << 18;
+ block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+ block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+ block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+ block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+ block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[0][1].b >> 5) & 1) << 28;
+ block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+ block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+ block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+ block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+ block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+ block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+ block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+ block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+ block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+ block.z |= ((endPoint[1][0].r >> 5) & 1) << 6;
+ block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+ block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+ block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+ block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+ block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+ block.z |= ((endPoint[1][1].r >> 5) & 1) << 12;
+ block.z |= ((partition_index >> 0) & 1) << 13;
+ block.z |= ((partition_index >> 1) & 1) << 14;
+ block.z |= ((partition_index >> 2) & 1) << 15;
+ block.z |= ((partition_index >> 3) & 1) << 16;
+ block.z |= ((partition_index >> 4) & 1) << 17;
+ }
+ else if ( mode_type == candidateModeFlag[2])
+ {
+ /*block.x = candidateModeMemory[2];
+ block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+ block.y |= ( endPoint[0][0].r >> 2 ) & 0x00000100;
+ block.y |= ( endPoint[0][0].g << 7 ) & 0x00020000;
+ block.y |= ( ( endPoint[0][0].b << 17 ) & 0x08000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
+ block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x07800000 );
+ block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
+ block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+ block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
+ block.yz |= ( ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000) ) | ( ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040) );
+ block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/
+
+ block.x |= ((candidateModeMemory[2] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[2] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[2] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[2] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[2] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[0][0].r >> 8) & 1) << 13;
+ block.x |= ((endPoint[0][0].r >> 9) & 1) << 14;
+ block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[0][0].g >> 8) & 1) << 23;
+ block.x |= ((endPoint[0][0].g >> 9) & 1) << 24;
+ block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[0][0].b >> 8) & 1) << 1;
+ block.y |= ((endPoint[0][0].b >> 9) & 1) << 2;
+ block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[0][0].r >> 10) & 1) << 8;
+ block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+ block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+ block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+ block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+ block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0][0].g >> 10) & 1) << 17;
+ block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+ block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+ block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+ block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+ block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+ block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0][0].b >> 10) & 1) << 27;
+ block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+ block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+ block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+ block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+ block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+ block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+ block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+ block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+ block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+ block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+ block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+ block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+ block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+ block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+ block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+ block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+ block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+ block.z |= ((partition_index >> 0) & 1) << 13;
+ block.z |= ((partition_index >> 1) & 1) << 14;
+ block.z |= ((partition_index >> 2) & 1) << 15;
+ block.z |= ((partition_index >> 3) & 1) << 16;
+ block.z |= ((partition_index >> 4) & 1) << 17;
+ }
+ else if ( mode_type == candidateModeFlag[3])
+ {
+ /*block.x = candidateModeMemory[3];
+ block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+ block.y |= ( endPoint[0][0].r >> 3 ) & 0x00000080;
+ block.y |= ( endPoint[0][0].g << 8 ) & 0x00040000;
+ block.y |= ( ( endPoint[0][0].b << 17 ) & 0x08000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
+ block.y |= ( ( endPoint[0][1].r << 3 ) & 0x00000078 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x07800000 );
+ block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000001E);
+ block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+ block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 );
+ block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000780);
+ block.yz |= ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000);
+ block.z |= ( ( endPoint[1][0].g << 7 ) & 0x00000800 );
+ block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
+ block.z |= ( endPoint[1][1].b << 4 ) & 0x00000040;
+ block.z |= ( endPoint[1][1].b << 5 ) & 0x00000020;*/
+
+ block.x |= ((candidateModeMemory[3] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[3] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[3] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[3] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[3] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[0][0].r >> 8) & 1) << 13;
+ block.x |= ((endPoint[0][0].r >> 9) & 1) << 14;
+ block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[0][0].g >> 8) & 1) << 23;
+ block.x |= ((endPoint[0][0].g >> 9) & 1) << 24;
+ block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[0][0].b >> 8) & 1) << 1;
+ block.y |= ((endPoint[0][0].b >> 9) & 1) << 2;
+ block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0][0].r >> 10) & 1) << 7;
+ block.y |= ((endPoint[1][1].g >> 4) & 1) << 8;
+ block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+ block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+ block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+ block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+ block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[0][0].g >> 10) & 1) << 18;
+ block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+ block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+ block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+ block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+ block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0][0].b >> 10) & 1) << 27;
+ block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+ block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+ block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+ block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+ block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+ block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+ block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+ block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+ block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+ block.z |= ((endPoint[1][1].b >> 0) & 1) << 5;
+ block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+ block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+ block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+ block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+ block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+ block.z |= ((endPoint[1][0].g >> 4) & 1) << 11;
+ block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+ block.z |= ((partition_index >> 0) & 1) << 13;
+ block.z |= ((partition_index >> 1) & 1) << 14;
+ block.z |= ((partition_index >> 2) & 1) << 15;
+ block.z |= ((partition_index >> 3) & 1) << 16;
+ block.z |= ((partition_index >> 4) & 1) << 17;
+ }
+ else if ( mode_type == candidateModeFlag[4])
+ {
+ /*block.x = candidateModeMemory[4];
+ block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+ block.y |= ( endPoint[0][0].r >> 3 ) & 0x00000080;
+ block.y |= ( endPoint[0][0].g << 7 ) & 0x00020000;
+ block.y |= ( ( endPoint[0][0].b << 18 ) & 0x10000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
+ block.y |= ( ( endPoint[0][1].r << 3 ) & 0x00000078 ) | ( ( endPoint[0][1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
+ block.y |= ( ( endPoint[1][0].g << 9 ) & 0x00001E00 ) | ( ( endPoint[1][0].b << 4 ) & 0x00000100 );
+ block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+ block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000780);
+ block.yz |= ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000060);
+ block.z |= ( endPoint[1][0].r << 1 ) & 0x0000001E;
+ block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
+ block.z |= ( ( endPoint[1][1].b << 7 ) & 0x00000800 ) | ( ( endPoint[1][1].b << 9 ) & 0x00001000 );*/
+
+ block.x |= ((candidateModeMemory[4] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[4] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[4] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[4] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[4] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[0][0].r >> 8) & 1) << 13;
+ block.x |= ((endPoint[0][0].r >> 9) & 1) << 14;
+ block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[0][0].g >> 8) & 1) << 23;
+ block.x |= ((endPoint[0][0].g >> 9) & 1) << 24;
+ block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[0][0].b >> 8) & 1) << 1;
+ block.y |= ((endPoint[0][0].b >> 9) & 1) << 2;
+ block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0][0].r >> 10) & 1) << 7;
+ block.y |= ((endPoint[1][0].b >> 4) & 1) << 8;
+ block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+ block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+ block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+ block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+ block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0][0].g >> 10) & 1) << 17;
+ block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+ block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+ block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+ block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+ block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+ block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[0][0].b >> 10) & 1) << 28;
+ block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+ block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+ block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+ block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+ block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+ block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+ block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+ block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+ block.z |= ((endPoint[1][1].b >> 1) & 1) << 5;
+ block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+ block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+ block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+ block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+ block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+ block.z |= ((endPoint[1][1].b >> 4) & 1) << 11;
+ block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+ block.z |= ((partition_index >> 0) & 1) << 13;
+ block.z |= ((partition_index >> 1) & 1) << 14;
+ block.z |= ((partition_index >> 2) & 1) << 15;
+ block.z |= ((partition_index >> 3) & 1) << 16;
+ block.z |= ((partition_index >> 4) & 1) << 17;
+ }
+ else if ( mode_type == candidateModeFlag[5])
+ {
+ /*block.x = candidateModeMemory[5];
+ block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00003FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x00FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000);
+ block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000003;
+ block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
+ block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
+ block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
+ block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+ block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 );
+ block.y |= ( ( endPoint[1][1].b << 27 ) & 0x10000000 );
+ block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
+ block.yz |= ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040);
+ block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
+ block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 );*/
+
+ block.x |= ((candidateModeMemory[5] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[5] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[5] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[5] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[5] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[0][0].r >> 8) & 1) << 13;
+ block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+ block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[0][0].g >> 8) & 1) << 23;
+ block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+ block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[0][0].b >> 8) & 1) << 1;
+ block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+ block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[1][1].g >> 4) & 1) << 8;
+ block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+ block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+ block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+ block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+ block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+ block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+ block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+ block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+ block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+ block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+ block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+ block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+ block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+ block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+ block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+ block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+ block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+ block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+ block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+ block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+ block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+ block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+ block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+ block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+ block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+ block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+ block.z |= ((partition_index >> 0) & 1) << 13;
+ block.z |= ((partition_index >> 1) & 1) << 14;
+ block.z |= ((partition_index >> 2) & 1) << 15;
+ block.z |= ((partition_index >> 3) & 1) << 16;
+ block.z |= ((partition_index >> 4) & 1) << 17;
+ }
+ else if ( mode_type == candidateModeFlag[6])
+ {
+ /*block.x = candidateModeMemory[6];
+ block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+ block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001;
+ block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
+ block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000);
+ block.x |= ( ( endPoint[1][1].g << 9 ) & 0x00002000 ) | ( ( endPoint[1][1].b << 21 ) & 0x00800000);
+ block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E);
+ block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+ block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80);
+ block.y |= ( ( endPoint[1][1].b >> 2 ) & 0x00000006 );
+ block.y |= ( ( endPoint[1][1].b << 27 ) & 0x10000000 ) | ( ( endPoint[1][1].b << 18 ) & 0x00040000 );
+ block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/
+
+ block.x |= ((candidateModeMemory[6] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[6] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[6] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[6] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[6] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[1][1].g >> 4) & 1) << 13;
+ block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+ block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[1][1].b >> 2) & 1) << 23;
+ block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+ block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[1][1].b >> 3) & 1) << 1;
+ block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+ block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[0][1].r >> 5) & 1) << 8;
+ block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+ block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+ block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+ block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+ block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+ block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+ block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+ block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+ block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+ block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+ block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+ block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+ block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+ block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+ block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+ block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+ block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+ block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+ block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+ block.z |= ((endPoint[1][0].r >> 5) & 1) << 6;
+ block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+ block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+ block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+ block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+ block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+ block.z |= ((endPoint[1][1].r >> 5) & 1) << 12;
+ block.z |= ((partition_index >> 0) & 1) << 13;
+ block.z |= ((partition_index >> 1) & 1) << 14;
+ block.z |= ((partition_index >> 2) & 1) << 15;
+ block.z |= ((partition_index >> 3) & 1) << 16;
+ block.z |= ((partition_index >> 4) & 1) << 17;
+ }
+ else if ( mode_type == candidateModeFlag[7])
+ {
+ /*block.x = candidateModeMemory[7];
+ block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+ block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001;
+ block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
+ block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
+ block.x |= ( ( endPoint[1][0].g << 18 ) & 0x00800000 );
+ block.x |= ( ( endPoint[1][1].b << 13 ) & 0x00002000 );
+ block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
+ block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
+ block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+ block.y |= ( ( endPoint[1][1].g >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 );
+ block.y |= ( endPoint[1][1].b << 27 ) & 0x10000000;
+ block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
+ block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 ) | ( ( endPoint[1][1].b << 4 ) & 0x00000040 );*/
+
+ block.x |= ((candidateModeMemory[7] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[7] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[7] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[7] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[7] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[1][1].b >> 0) & 1) << 13;
+ block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+ block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[1][0].g >> 5) & 1) << 23;
+ block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+ block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[1][1].g >> 5) & 1) << 1;
+ block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+ block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[1][1].g >> 4) & 1) << 8;
+ block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+ block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+ block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+ block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+ block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[0][1].g >> 5) & 1) << 18;
+ block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+ block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+ block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+ block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+ block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+ block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+ block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+ block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+ block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+ block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+ block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+ block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+ block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+ block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+ block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+ block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+ block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+ block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+ block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+ block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+ block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+ block.z |= ((partition_index >> 0) & 1) << 13;
+ block.z |= ((partition_index >> 1) & 1) << 14;
+ block.z |= ((partition_index >> 2) & 1) << 15;
+ block.z |= ((partition_index >> 3) & 1) << 16;
+ block.z |= ((partition_index >> 4) & 1) << 17;
+ }
+ else if ( mode_type == candidateModeFlag[8])
+ {
+ /*block.x = candidateModeMemory[8];
+ block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+ block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001;
+ block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 );
+ block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
+ block.x |= ( ( endPoint[1][0].b << 18 ) & 0x00800000 );
+ block.x |= ( endPoint[1][1].b << 12 ) & 0x00002000;
+ block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+ block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 );
+ block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
+ block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
+ block.y |= ( endPoint[1][1].b << 18 ) & 0x00040000;
+ block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
+ block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 ) | ( ( endPoint[1][1].b << 4 ) & 0x00000040 );*/
+
+ block.x |= ((candidateModeMemory[8] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[8] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[8] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[8] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[8] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[1][1].b >> 1) & 1) << 13;
+ block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+ block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[1][0].b >> 5) & 1) << 23;
+ block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+ block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[1][1].b >> 5) & 1) << 1;
+ block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+ block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[1][1].g >> 4) & 1) << 8;
+ block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+ block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+ block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+ block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+ block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+ block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+ block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+ block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+ block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+ block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[0][1].b >> 5) & 1) << 28;
+ block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+ block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+ block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+ block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+ block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+ block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+ block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+ block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+ block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+ block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+ block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+ block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+ block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+ block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+ block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+ block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+ block.z |= ((partition_index >> 0) & 1) << 13;
+ block.z |= ((partition_index >> 1) & 1) << 14;
+ block.z |= ((partition_index >> 2) & 1) << 15;
+ block.z |= ((partition_index >> 3) & 1) << 16;
+ block.z |= ((partition_index >> 4) & 1) << 17;
+ }
+ else if ( mode_type == candidateModeFlag[9])
+ {
+ /*block.x = candidateModeMemory[9];
+ block.x |= ( ( endPoint[0][0].r << 5 ) & 0x000007E0 ) | ( ( endPoint[0][0].g << 15 ) & 0x001F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0x7E000000 );
+ block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 );
+ block.x |= ( ( endPoint[1][0].g << 16 ) & 0x00200000 ) | ( ( endPoint[1][0].g << 20 ) & 0x01000000 );
+ block.x |= ( ( endPoint[1][0].b << 17 ) & 0x00400000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
+ block.x |= ( ( endPoint[1][1].b << 21 ) & 0x00800000 ) | ( ( endPoint[1][1].b << 12 ) & 0x00003000 );
+ block.x |= ( ( endPoint[1][1].g << 26 ) & 0x80000000 ) | ( ( endPoint[1][1].g << 7 ) & 0x00000800 );
+ block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E);
+ block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80);
+ block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+ block.y |= ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ) | ( ( endPoint[1][1].b >> 3 ) & 0x00000001 );
+ block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/
+
+ block.x |= ((candidateModeMemory[9] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[9] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[9] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[9] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[9] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[1][1].g >> 4) & 1) << 11;
+ block.x |= ((endPoint[1][1].b >> 0) & 1) << 12;
+ block.x |= ((endPoint[1][1].b >> 1) & 1) << 13;
+ block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+ block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[1][0].g >> 5) & 1) << 21;
+ block.x |= ((endPoint[1][0].b >> 5) & 1) << 22;
+ block.x |= ((endPoint[1][1].b >> 2) & 1) << 23;
+ block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+ block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[1][1].g >> 5) & 1) << 31;
+ block.y |= ((endPoint[1][1].b >> 3) & 1) << 0;
+ block.y |= ((endPoint[1][1].b >> 5) & 1) << 1;
+ block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+ block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[0][1].r >> 5) & 1) << 8;
+ block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+ block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+ block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+ block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+ block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[0][1].g >> 5) & 1) << 18;
+ block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+ block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+ block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+ block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+ block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[0][1].b >> 5) & 1) << 28;
+ block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+ block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+ block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+ block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+ block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+ block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+ block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+ block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+ block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+ block.z |= ((endPoint[1][0].r >> 5) & 1) << 6;
+ block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+ block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+ block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+ block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+ block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+ block.z |= ((endPoint[1][1].r >> 5) & 1) << 12;
+ block.z |= ((partition_index >> 0) & 1) << 13;
+ block.z |= ((partition_index >> 1) & 1) << 14;
+ block.z |= ((partition_index >> 2) & 1) << 15;
+ block.z |= ((partition_index >> 3) & 1) << 16;
+ block.z |= ((partition_index >> 4) & 1) << 17;
+ }
+}
+void block_package( inout uint4 block, int2x3 endPoint, uint mode_type ) // for mode 11 - 14
+{
+ /*block.x = ( ( endPoint[0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0].b << 25 ) & 0xFE000000 );
+ block.y |= ( endPoint[0].b >> 7 ) & 0x00000007;*/
+
+ block.xy = 0;
+ block.z &= 0xFFFFFFFE;
+
+
+ if ( mode_type == candidateModeFlag[10])
+ {
+ /* block.x |= candidateModeMemory[10];
+ block.y |= ( ( endPoint[1].r << 3 ) & 0x00001FF8 ) | ( ( endPoint[1].g << 13 ) & 0x007FE000 ) | ( ( endPoint[1].b << 23 ) & 0xFF800000 );
+ block.z |= ( endPoint[1].b >> 9 ) & 0x00000001;*/
+
+ block.x |= ((candidateModeMemory[10] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[10] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[10] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[10] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[10] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[0].r >> 8) & 1) << 13;
+ block.x |= ((endPoint[0].r >> 9) & 1) << 14;
+ block.x |= ((endPoint[0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[0].g >> 8) & 1) << 23;
+ block.x |= ((endPoint[0].g >> 9) & 1) << 24;
+ block.x |= ((endPoint[0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[0].b >> 8) & 1) << 1;
+ block.y |= ((endPoint[0].b >> 9) & 1) << 2;
+ block.y |= ((endPoint[1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[1].r >> 5) & 1) << 8;
+ block.y |= ((endPoint[1].r >> 6) & 1) << 9;
+ block.y |= ((endPoint[1].r >> 7) & 1) << 10;
+ block.y |= ((endPoint[1].r >> 8) & 1) << 11;
+ block.y |= ((endPoint[1].r >> 9) & 1) << 12;
+ block.y |= ((endPoint[1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[1].g >> 5) & 1) << 18;
+ block.y |= ((endPoint[1].g >> 6) & 1) << 19;
+ block.y |= ((endPoint[1].g >> 7) & 1) << 20;
+ block.y |= ((endPoint[1].g >> 8) & 1) << 21;
+ block.y |= ((endPoint[1].g >> 9) & 1) << 22;
+ block.y |= ((endPoint[1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[1].b >> 5) & 1) << 28;
+ block.y |= ((endPoint[1].b >> 6) & 1) << 29;
+ block.y |= ((endPoint[1].b >> 7) & 1) << 30;
+ block.y |= ((endPoint[1].b >> 8) & 1) << 31;
+ block.z |= ((endPoint[1].b >> 9) & 1) << 0;
+ }
+ else if (mode_type == candidateModeFlag[11])
+ {
+ /*block.x |= candidateModeMemory[11];
+ block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 ) | ( ( endPoint[0].g << 12 ) & 0x00400000 );
+ block.y |= ( ( endPoint[1].r << 3 ) & 0x00000FF8 ) | ( ( endPoint[1].g << 13 ) & 0x003FE000 ) | ( ( endPoint[1].b << 23 ) & 0xFF800000 );
+ block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;*/
+
+ block.x |= ((candidateModeMemory[11] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[11] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[11] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[11] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[11] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[0].r >> 8) & 1) << 13;
+ block.x |= ((endPoint[0].r >> 9) & 1) << 14;
+ block.x |= ((endPoint[0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[0].g >> 8) & 1) << 23;
+ block.x |= ((endPoint[0].g >> 9) & 1) << 24;
+ block.x |= ((endPoint[0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[0].b >> 8) & 1) << 1;
+ block.y |= ((endPoint[0].b >> 9) & 1) << 2;
+ block.y |= ((endPoint[1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[1].r >> 5) & 1) << 8;
+ block.y |= ((endPoint[1].r >> 6) & 1) << 9;
+ block.y |= ((endPoint[1].r >> 7) & 1) << 10;
+ block.y |= ((endPoint[1].r >> 8) & 1) << 11;
+ block.y |= ((endPoint[0].r >> 10) & 1) << 12;
+ block.y |= ((endPoint[1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[1].g >> 5) & 1) << 18;
+ block.y |= ((endPoint[1].g >> 6) & 1) << 19;
+ block.y |= ((endPoint[1].g >> 7) & 1) << 20;
+ block.y |= ((endPoint[1].g >> 8) & 1) << 21;
+ block.y |= ((endPoint[0].g >> 10) & 1) << 22;
+ block.y |= ((endPoint[1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[1].b >> 5) & 1) << 28;
+ block.y |= ((endPoint[1].b >> 6) & 1) << 29;
+ block.y |= ((endPoint[1].b >> 7) & 1) << 30;
+ block.y |= ((endPoint[1].b >> 8) & 1) << 31;
+ block.z |= ((endPoint[0].b >> 10) & 1) << 0;
+ }
+ else if (mode_type == candidateModeFlag[12])// violate the spec in [0].low
+ {
+ /*block.x |= candidateModeMemory[12];
+ block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 ) | ( ( endPoint[0].g << 12 ) & 0x00400000 );
+ block.y |= ( ( endPoint[0].r << 0 ) & 0x00000800 ) | ( ( endPoint[0].g << 10 ) & 0x00200000 );
+ block.y |= ( endPoint[0].b << 20 ) & 0x80000000;
+ block.y |= ( ( endPoint[1].r << 3 ) & 0x000007F8 ) | ( ( endPoint[1].g << 13 ) & 0x001FE000 ) | ( ( endPoint[1].b << 23 ) & 0x7F800000 );
+ block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;*/
+
+ block.x |= ((candidateModeMemory[12] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[12] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[12] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[12] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[12] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[0].r >> 8) & 1) << 13;
+ block.x |= ((endPoint[0].r >> 9) & 1) << 14;
+ block.x |= ((endPoint[0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[0].g >> 8) & 1) << 23;
+ block.x |= ((endPoint[0].g >> 9) & 1) << 24;
+ block.x |= ((endPoint[0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[0].b >> 8) & 1) << 1;
+ block.y |= ((endPoint[0].b >> 9) & 1) << 2;
+ block.y |= ((endPoint[1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[1].r >> 4) & 1) << 7;
+ block.y |= ((endPoint[1].r >> 5) & 1) << 8;
+ block.y |= ((endPoint[1].r >> 6) & 1) << 9;
+ block.y |= ((endPoint[1].r >> 7) & 1) << 10;
+ block.y |= ((endPoint[0].r >> 11) & 1) << 11;
+ block.y |= ((endPoint[0].r >> 10) & 1) << 12;
+ block.y |= ((endPoint[1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[1].g >> 4) & 1) << 17;
+ block.y |= ((endPoint[1].g >> 5) & 1) << 18;
+ block.y |= ((endPoint[1].g >> 6) & 1) << 19;
+ block.y |= ((endPoint[1].g >> 7) & 1) << 20;
+ block.y |= ((endPoint[0].g >> 11) & 1) << 21;
+ block.y |= ((endPoint[0].g >> 10) & 1) << 22;
+ block.y |= ((endPoint[1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[1].b >> 4) & 1) << 27;
+ block.y |= ((endPoint[1].b >> 5) & 1) << 28;
+ block.y |= ((endPoint[1].b >> 6) & 1) << 29;
+ block.y |= ((endPoint[1].b >> 7) & 1) << 30;
+ block.y |= ((endPoint[0].b >> 11) & 1) << 31;
+ block.z |= ((endPoint[0].b >> 10) & 1) << 0;
+ }
+ else if (mode_type == candidateModeFlag[13])
+ {
+ /*block.x |= candidateModeMemory[13];
+ block.y |= ( ( endPoint[0].r >> 8 ) & 0x00000080 );
+ block.y |= ( ( endPoint[0].r >> 6 ) & 0x00000100 );
+ block.y |= ( ( endPoint[0].r >> 4 ) & 0x00000200 );
+ block.y |= ( ( endPoint[0].r >> 2 ) & 0x00000400 );
+ block.y |= ( ( endPoint[0].r >> 0 ) & 0x00000800 );
+ block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 );
+ block.y |= ( ( endPoint[0].g << 2 ) & 0x00020000 );
+ block.y |= ( ( endPoint[0].g << 4 ) & 0x00040000 );
+ block.y |= ( ( endPoint[0].g << 6 ) & 0x00080000 );
+ block.y |= ( ( endPoint[0].g << 8 ) & 0x00100000 );
+ block.y |= ( ( endPoint[0].g << 10 ) & 0x00200000 );
+ block.y |= ( ( endPoint[0].g << 12 ) & 0x00400000 );
+ block.y |= ( ( endPoint[0].b << 12 ) & 0x08000000 );
+ block.y |= ( ( endPoint[0].b << 14 ) & 0x10000000 );
+ block.y |= ( ( endPoint[0].b << 16 ) & 0x20000000 );
+ block.y |= ( ( endPoint[0].b << 18 ) & 0x40000000 );
+ block.y |= ( ( endPoint[0].b << 20 ) & 0x80000000 );
+ block.y |= ( ( endPoint[1].r << 3 ) & 0x00000078 ) | ( ( endPoint[1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[1].b << 23 ) & 0x07800000 );
+ block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;*/
+
+ block.x |= ((candidateModeMemory[13] >> 0) & 1) << 0;
+ block.x |= ((candidateModeMemory[13] >> 1) & 1) << 1;
+ block.x |= ((candidateModeMemory[13] >> 2) & 1) << 2;
+ block.x |= ((candidateModeMemory[13] >> 3) & 1) << 3;
+ block.x |= ((candidateModeMemory[13] >> 4) & 1) << 4;
+ block.x |= ((endPoint[0].r >> 0) & 1) << 5;
+ block.x |= ((endPoint[0].r >> 1) & 1) << 6;
+ block.x |= ((endPoint[0].r >> 2) & 1) << 7;
+ block.x |= ((endPoint[0].r >> 3) & 1) << 8;
+ block.x |= ((endPoint[0].r >> 4) & 1) << 9;
+ block.x |= ((endPoint[0].r >> 5) & 1) << 10;
+ block.x |= ((endPoint[0].r >> 6) & 1) << 11;
+ block.x |= ((endPoint[0].r >> 7) & 1) << 12;
+ block.x |= ((endPoint[0].r >> 8) & 1) << 13;
+ block.x |= ((endPoint[0].r >> 9) & 1) << 14;
+ block.x |= ((endPoint[0].g >> 0) & 1) << 15;
+ block.x |= ((endPoint[0].g >> 1) & 1) << 16;
+ block.x |= ((endPoint[0].g >> 2) & 1) << 17;
+ block.x |= ((endPoint[0].g >> 3) & 1) << 18;
+ block.x |= ((endPoint[0].g >> 4) & 1) << 19;
+ block.x |= ((endPoint[0].g >> 5) & 1) << 20;
+ block.x |= ((endPoint[0].g >> 6) & 1) << 21;
+ block.x |= ((endPoint[0].g >> 7) & 1) << 22;
+ block.x |= ((endPoint[0].g >> 8) & 1) << 23;
+ block.x |= ((endPoint[0].g >> 9) & 1) << 24;
+ block.x |= ((endPoint[0].b >> 0) & 1) << 25;
+ block.x |= ((endPoint[0].b >> 1) & 1) << 26;
+ block.x |= ((endPoint[0].b >> 2) & 1) << 27;
+ block.x |= ((endPoint[0].b >> 3) & 1) << 28;
+ block.x |= ((endPoint[0].b >> 4) & 1) << 29;
+ block.x |= ((endPoint[0].b >> 5) & 1) << 30;
+ block.x |= ((endPoint[0].b >> 6) & 1) << 31;
+ block.y |= ((endPoint[0].b >> 7) & 1) << 0;
+ block.y |= ((endPoint[0].b >> 8) & 1) << 1;
+ block.y |= ((endPoint[0].b >> 9) & 1) << 2;
+ block.y |= ((endPoint[1].r >> 0) & 1) << 3;
+ block.y |= ((endPoint[1].r >> 1) & 1) << 4;
+ block.y |= ((endPoint[1].r >> 2) & 1) << 5;
+ block.y |= ((endPoint[1].r >> 3) & 1) << 6;
+ block.y |= ((endPoint[0].r >> 15) & 1) << 7;
+ block.y |= ((endPoint[0].r >> 14) & 1) << 8;
+ block.y |= ((endPoint[0].r >> 13) & 1) << 9;
+ block.y |= ((endPoint[0].r >> 12) & 1) << 10;
+ block.y |= ((endPoint[0].r >> 11) & 1) << 11;
+ block.y |= ((endPoint[0].r >> 10) & 1) << 12;
+ block.y |= ((endPoint[1].g >> 0) & 1) << 13;
+ block.y |= ((endPoint[1].g >> 1) & 1) << 14;
+ block.y |= ((endPoint[1].g >> 2) & 1) << 15;
+ block.y |= ((endPoint[1].g >> 3) & 1) << 16;
+ block.y |= ((endPoint[0].g >> 15) & 1) << 17;
+ block.y |= ((endPoint[0].g >> 14) & 1) << 18;
+ block.y |= ((endPoint[0].g >> 13) & 1) << 19;
+ block.y |= ((endPoint[0].g >> 12) & 1) << 20;
+ block.y |= ((endPoint[0].g >> 11) & 1) << 21;
+ block.y |= ((endPoint[0].g >> 10) & 1) << 22;
+ block.y |= ((endPoint[1].b >> 0) & 1) << 23;
+ block.y |= ((endPoint[1].b >> 1) & 1) << 24;
+ block.y |= ((endPoint[1].b >> 2) & 1) << 25;
+ block.y |= ((endPoint[1].b >> 3) & 1) << 26;
+ block.y |= ((endPoint[0].b >> 15) & 1) << 27;
+ block.y |= ((endPoint[0].b >> 14) & 1) << 28;
+ block.y |= ((endPoint[0].b >> 13) & 1) << 29;
+ block.y |= ((endPoint[0].b >> 12) & 1) << 30;
+ block.y |= ((endPoint[0].b >> 11) & 1) << 31;
+ block.z |= ((endPoint[0].b >> 10) & 1) << 0;
+ }
+}
diff --git a/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl
new file mode 100644
index 000000000..6a57c3862
--- /dev/null
+++ b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl
@@ -0,0 +1,1908 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: BC7Encode.hlsl
+//
+// The Compute Shader for BC7 Encoder
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//#define REF_DEVICE
+
+#define CHAR_LENGTH 8
+#define NCHANNELS 4
+#define BC7_UNORM 98 // NOTE(review): presumably the DXGI_FORMAT_BC7_UNORM enum value - confirm
+#define MAX_UINT 0xFFFFFFFF // largest 32-bit uint: initial "worst" error and seed for min() reductions
+#define MIN_UINT 0 // seed for max() reductions
+
+static const uint candidateSectionBit[64] = // Two-subset partitions 0-63, one 16-bit mask each: bit i set => pixel i is in subset 1, clear => subset 0
+{
+    0xCCCC, 0x8888, 0xEEEE, 0xECC8,
+    0xC880, 0xFEEC, 0xFEC8, 0xEC80,
+    0xC800, 0xFFEC, 0xFE80, 0xE800,
+    0xFFE8, 0xFF00, 0xFFF0, 0xF000,
+    0xF710, 0x008E, 0x7100, 0x08CE,
+    0x008C, 0x7310, 0x3100, 0x8CCE,
+    0x088C, 0x3110, 0x6666, 0x366C,
+    0x17E8, 0x0FF0, 0x718E, 0x399C,
+    0xAAAA, 0xF0F0, 0x5A5A, 0x33CC,
+    0x3C3C, 0x55AA, 0x9696, 0xA55A,
+    0x73CE, 0x13C8, 0x324C, 0x3BDC,
+    0x6996, 0xC33C, 0x9966, 0x0660,
+    0x0272, 0x04E4, 0x4E40, 0x2720,
+    0xC936, 0x936C, 0x39C6, 0x639C,
+    0x9336, 0x9CC6, 0x817E, 0xE718,
+    0xCCF0, 0x0FCC, 0x7744, 0xEE22,
+};
+static const uint candidateSectionBit2[64] = //Masks for partitions 64-127 (entry i belongs to partition 64 + i). NOTE(review): presumably 2 bits per pixel selecting one of three subsets - confirm against the shader that reads this table
+{
+ 0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
+ 0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
+ 0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
+ 0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
+ 0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
+ 0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
+ 0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
+ 0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
+ 0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
+ 0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
+ 0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
+ 0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
+ 0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
+ 0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
+ 0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
+ 0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
+};
+static const uint2 candidateFixUpIndex1D[128] = //Fix-up pixel indices per partition: entries 0-63 carry one index in [0], entries 64-127 carry two (in spec order, not sorted)
+{
+ {15, 0},{15, 0},{15, 0},{15, 0},
+ {15, 0},{15, 0},{15, 0},{15, 0},
+ {15, 0},{15, 0},{15, 0},{15, 0},
+ {15, 0},{15, 0},{15, 0},{15, 0},
+ {15, 0},{ 2, 0},{ 8, 0},{ 2, 0},
+ { 2, 0},{ 8, 0},{ 8, 0},{15, 0},
+ { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+ { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+
+ {15, 0},{15, 0},{ 6, 0},{ 8, 0},
+ { 2, 0},{ 8, 0},{15, 0},{15, 0},
+ { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+ { 2, 0},{15, 0},{15, 0},{ 6, 0},
+ { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0},
+ {15, 0},{15, 0},{ 2, 0},{ 2, 0},
+ {15, 0},{15, 0},{15, 0},{15, 0},
+ {15, 0},{ 2, 0},{ 2, 0},{15, 0},
+ //For i < 64, candidateFixUpIndex1D[i][1] is only a placeholder and must not be used
+
+ { 3,15},{ 3, 8},{15, 8},{15, 3},
+ { 8,15},{ 3,15},{15, 3},{15, 8},
+ { 8,15},{ 8,15},{ 6,15},{ 6,15},
+ { 6,15},{ 5,15},{ 3,15},{ 3, 8},
+ { 3,15},{ 3, 8},{ 8,15},{15, 3},
+ { 3,15},{ 3, 8},{ 6,15},{10, 8},
+ { 5, 3},{ 8,15},{ 8, 6},{ 6,10},
+ { 8,15},{ 5,15},{15,10},{15, 8},
+
+ { 8,15},{15, 3},{ 3,15},{ 5,10},
+ { 6,10},{10, 8},{ 8, 9},{15,10},
+ {15, 6},{ 3,15},{15, 8},{ 5,15},
+ {15, 3},{15, 6},{15, 6},{15, 8}, //The spec does not mark the first fix-up index for the entries in this row, so 15 is used for them; this appears to be correct
+ { 3,15},{15, 3},{ 5,15},{ 5,15},
+ { 5,15},{ 8,15},{ 5,15},{10,15},
+ { 5,15},{10,15},{ 8,15},{13,15},
+ {15, 3},{12,15},{ 3,15},{ 3, 8},
+};
+static const uint2 candidateFixUpIndex1DOrdered[128] = //Same values as candidateFixUpIndex1D, but for i >= 64 each pair is sorted so that [0] < [1]
+{
+ {15, 0},{15, 0},{15, 0},{15, 0},
+ {15, 0},{15, 0},{15, 0},{15, 0},
+ {15, 0},{15, 0},{15, 0},{15, 0},
+ {15, 0},{15, 0},{15, 0},{15, 0},
+ {15, 0},{ 2, 0},{ 8, 0},{ 2, 0},
+ { 2, 0},{ 8, 0},{ 8, 0},{15, 0},
+ { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+ { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+
+ {15, 0},{15, 0},{ 6, 0},{ 8, 0},
+ { 2, 0},{ 8, 0},{15, 0},{15, 0},
+ { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+ { 2, 0},{15, 0},{15, 0},{ 6, 0},
+ { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0},
+ {15, 0},{15, 0},{ 2, 0},{ 2, 0},
+ {15, 0},{15, 0},{15, 0},{15, 0},
+ {15, 0},{ 2, 0},{ 2, 0},{15, 0},
+ //For i < 64, candidateFixUpIndex1DOrdered[i][1] is only a placeholder and must not be used
+
+ { 3,15},{ 3, 8},{ 8,15},{ 3,15},
+ { 8,15},{ 3,15},{ 3,15},{ 8,15},
+ { 8,15},{ 8,15},{ 6,15},{ 6,15},
+ { 6,15},{ 5,15},{ 3,15},{ 3, 8},
+ { 3,15},{ 3, 8},{ 8,15},{ 3,15},
+ { 3,15},{ 3, 8},{ 6,15},{ 8,10},
+ { 3, 5},{ 8,15},{ 6, 8},{ 6,10},
+ { 8,15},{ 5,15},{10,15},{ 8,15},
+
+ { 8,15},{ 3,15},{ 3,15},{ 5,10},
+ { 6,10},{ 8,10},{ 8, 9},{10,15},
+ { 6,15},{ 3,15},{ 8,15},{ 5,15},
+ { 3,15},{ 6,15},{ 6,15},{ 8,15}, //The spec does not mark the first fix-up index for the entries in this row, so 15 is used for them; this appears to be correct
+ { 3,15},{ 3,15},{ 5,15},{ 5,15},
+ { 5,15},{ 8,15},{ 5,15},{10,15},
+ { 5,15},{10,15},{ 8,15},{13,15},
+ { 3,15},{12,15},{ 3,15},{ 3, 8},
+};
+//static const uint4x4 candidateRotation[4] =
+//{
+// {1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1},
+// {0,0,0,1},{0,1,0,0},{0,0,1,0},{1,0,0,0},
+// {1,0,0,0},{0,0,0,1},{0,0,1,0},{0,1,0,0},
+// {1,0,0,0},{0,1,0,0},{0,0,0,1},{0,0,1,0}
+//};
+//static const uint2 candidateIndexPrec[8] = {{3,0},{3,0},{2,0},{2,0},
+// {2,3}, //color index and alpha index can exchange
+// {2,2},{4,4},{2,2}};
+
+static const uint aWeight[3][16] = { {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}, // interpolation weights out of 64, indexed [row][palette index]; row 0: 4-bit indices
+ {0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0}, // row 1: 3-bit indices (8 weights, zero padded)
+ {0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; // row 2: 2-bit indices (4 weights, zero padded)
+
+ //4 bit index: 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
+static const uint aStep[3][64] = { { 0, 0, 0, 1, 1, 1, 1, 2, // aStep[row][s]: palette index chosen for position s (0..63) along the endpoint span; rows match aWeight (0: 4-bit, 1: 3-bit, 2: 2-bit)
+ 2, 2, 2, 2, 3, 3, 3, 3,
+ 4, 4, 4, 4, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 7, 7, 7,
+ 7, 8, 8, 8, 8, 9, 9, 9,
+ 9,10,10,10,10,10,11,11,
+ 11,11,12,12,12,12,13,13,
+ 13,13,14,14,14,14,15,15 },
+ //3 bit index: 0, 9, 18, 27, 37, 46, 55, 64
+ { 0,0,0,0,0,1,1,1,
+ 1,1,1,1,1,1,2,2,
+ 2,2,2,2,2,2,2,3,
+ 3,3,3,3,3,3,3,3,
+ 3,4,4,4,4,4,4,4,
+ 4,4,5,5,5,5,5,5,
+ 5,5,5,6,6,6,6,6,
+ 6,6,6,6,7,7,7,7 },
+ //2 bit index: 0, 21, 43, 64
+ { 0,0,0,0,0,0,0,0,
+ 0,0,0,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,
+ 1,2,2,2,2,2,2,2,
+ 2,2,2,2,2,2,2,2,
+ 2,2,2,2,2,2,3,3,
+ 3,3,3,3,3,3,3,3 } };
+
+cbuffer cbCS : register( b0 ) // per-dispatch constants shared by the BC7 encoder entry points
+{
+ uint g_tex_width; // NOTE(review): presumably the source texture width in texels - confirm
+ uint g_num_block_x; // number of 4x4 blocks per row; used to turn a block ID into block x/y
+ uint g_format; // NOTE(review): presumably the target format (see BC7_UNORM) - confirm
+ uint g_mode_id; // BC7 mode tried by the current dispatch (e.g. 1, 3 or 7 in TryMode137CS)
+ uint g_start_block_id; // ID of the first block handled by this dispatch
+ uint g_num_total_blocks; // total block count; TryMode456CS exits early for block IDs >= this value
+ float g_alpha_weight; // weight applied to the alpha term in ComputeError
+};
+
+// Prototypes for the per-mode helpers; listed in mode order.
+uint2x4 compress_endpoints0( inout uint2x4 endPoint, uint2 P ); // mode 0
+uint2x4 compress_endpoints1( inout uint2x4 endPoint, uint2 P ); // mode 1
+uint2x4 compress_endpoints2( inout uint2x4 endPoint );          // mode 2
+uint2x4 compress_endpoints3( inout uint2x4 endPoint, uint2 P ); // mode 3
+uint2x4 compress_endpoints4( inout uint2x4 endPoint );          // mode 4
+uint2x4 compress_endpoints5( inout uint2x4 endPoint );          // mode 5
+uint2x4 compress_endpoints6( inout uint2x4 endPoint, uint2 P ); // mode 6
+uint2x4 compress_endpoints7( inout uint2x4 endPoint, uint2 P ); // mode 7
+
+void block_package0( out uint4 block, uint partition, uint threadBase );                    // mode 0
+void block_package1( out uint4 block, uint partition, uint threadBase );                    // mode 1
+void block_package2( out uint4 block, uint partition, uint threadBase );                    // mode 2
+void block_package3( out uint4 block, uint partition, uint threadBase );                    // mode 3
+void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase ); // mode 4
+void block_package5( out uint4 block, uint rotation, uint threadBase );                     // mode 5
+void block_package6( out uint4 block, uint threadBase );                                    // mode 6
+void block_package7( out uint4 block, uint partition, uint threadBase );                    // mode 7
+
+
+void swap(inout uint4 lhs, inout uint4 rhs) // exchanges two uint4 values
+{
+    const uint4 previousRhs = rhs;
+    rhs = lhs;
+    lhs = previousRhs;
+}
+void swap(inout uint3 lhs, inout uint3 rhs) // exchanges two uint3 values
+{
+    const uint3 previousRhs = rhs;
+    rhs = lhs;
+    lhs = previousRhs;
+}
+void swap(inout uint lhs, inout uint rhs) // exchanges two scalar uint values
+{
+    const uint previousRhs = rhs;
+    rhs = lhs;
+    lhs = previousRhs;
+}
+
+// Returns dot(a.rgb, b.rgb) plus the alpha product a.a * b.a scaled by g_alpha_weight.
+// The sum is evaluated in floating point and implicitly converted to the uint return type.
+// Callers in this file pass the same difference vector twice to obtain a weighted squared error.
+uint ComputeError(in uint4 a, in uint4 b)
+{
+    const float alphaTerm = g_alpha_weight * a.a * b.a;
+    return dot(a.rgb, b.rgb) + alphaTerm;
+}
+
+// Reorders a and b per component so that, on return, a holds the larger and b the
+// smaller value of every channel. Callers use this so that a - b cannot wrap around.
+void Ensure_A_Is_Larger( inout uint4 a, inout uint4 b )
+{
+    const uint4 larger = max( a, b );
+    const uint4 smaller = min( a, b );
+    a = larger;
+    b = smaller;
+}
+
+
+Texture2D g_Input : register( t0 ); // source texture; the entry points read texels with Load() at mip 0 and scale them to 0..255
+StructuredBuffer<uint4> g_InBuff : register( t1 ); // NOTE(review): presumably per-block results of an earlier pass - confirm
+
+RWStructuredBuffer<uint4> g_OutBuff : register( u0 ); // one uint4 result per block, indexed by block ID
+
+#define THREAD_GROUP_SIZE 64 // threads per group (numthreads x)
+#define BLOCK_SIZE_Y 4 // block height in pixels
+#define BLOCK_SIZE_X 4 // block width in pixels
+#define BLOCK_SIZE (BLOCK_SIZE_Y * BLOCK_SIZE_X) // pixels per block
+
+struct BufferShared // per-thread scratch record kept in group-shared memory
+{
+ uint4 pixel; // source pixel, channels clamped to 0..255
+ uint error; // error of this thread's candidate (0xFFFFFFFF when none was evaluated)
+ uint mode; // BC7 mode of the candidate
+ uint partition; // NOTE(review): presumably the partition of the candidate for multi-subset modes - confirm
+ uint index_selector; // mode 4 index selector bit
+ uint rotation; // component rotation for modes 4 and 5; TryMode456CS stores the p bits here for mode 6
+ uint4 endPoint_low; // per-channel minimum during the min/max reduction
+ uint4 endPoint_high; // per-channel maximum during the min/max reduction
+ uint4 endPoint_low_quantized; // NOTE(review): presumably the quantized low endpoint - confirm
+ uint4 endPoint_high_quantized; // NOTE(review): presumably the quantized high endpoint - confirm
+};
+groupshared BufferShared shared_temp[THREAD_GROUP_SIZE]; // one record per thread in the group
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void TryMode456CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // Tries modes 4, 5 and 6 (all have 1 subset per block, and the fix-up index is always index 0) and writes the lowest-error candidate of each block to g_OutBuff
+{
+ // we process 4 BC blocks per thread group
+ const uint MAX_USED_THREAD = 16; // pixels in a BC (block compressed) block
+ uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // the number of BC blocks a thread group processes = 64 / 16 = 4
+ uint blockInGroup = GI / MAX_USED_THREAD; // what BC block this thread is on within this thread group
+ uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; // what global BC block this thread is on
+ uint threadBase = blockInGroup * MAX_USED_THREAD; // the first id of the pixel in this BC block in this thread group
+ uint threadInBlock = GI - threadBase; // id of the pixel in this BC block
+
+#ifndef REF_DEVICE
+ if (blockID >= g_num_total_blocks) // threads past the last block do no work (this early exit is compiled out for REF_DEVICE, where barriers follow)
+ {
+ return;
+ }
+#endif
+
+ uint block_y = blockID / g_num_block_x; // block row
+ uint block_x = blockID - block_y * g_num_block_x; // block column
+ uint base_x = block_x * BLOCK_SIZE_X; // x of the block's first pixel
+ uint base_y = block_y * BLOCK_SIZE_Y; // y of the block's first pixel
+
+ if (threadInBlock < 16) // always true here: threadInBlock is in 0..15
+ {
+ shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255); // one pixel per thread, scaled to 0..255
+
+ shared_temp[GI].endPoint_low = shared_temp[GI].pixel; // seed the min/max reduction with this thread's pixel
+ shared_temp[GI].endPoint_high = shared_temp[GI].pixel;
+ }
+#ifdef REF_DEVICE // barriers in this function are only compiled for REF_DEVICE builds. NOTE(review): other builds appear to rely on the 16 threads of a block running in lockstep - confirm
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 8) // reduction step 1 of 4: fold the upper 8 pixels into the lower 8
+ {
+ shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
+ shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 4) // reduction step 2 of 4
+ {
+ shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
+ shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 2) // reduction step 3 of 4
+ {
+ shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
+ shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 1) // reduction step 4 of 4: shared_temp[threadBase] now holds the per-channel min/max of the block
+ {
+ shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
+ shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ uint2x4 endPoint; // [0] = low endpoint, [1] = high endpoint (bounding box of the block)
+ endPoint[0] = shared_temp[threadBase].endPoint_low;
+ endPoint[1] = shared_temp[threadBase].endPoint_high;
+
+ uint error = 0xFFFFFFFF; // worst possible error until a candidate has been evaluated
+ uint mode = 0;
+ uint index_selector = 0;
+ uint rotation = 0;
+
+ uint2 indexPrec; // aStep/aWeight row numbers: .x for the color index, .y for the alpha index
+ if (threadInBlock < 8) // all threads of threadInBlock < 8 will be working on trying out mode 4, since only mode 4 has index selector bit
+ {
+ if (0 == (threadInBlock & 1)) // thread 0, 2, 4, 6
+ {
+ //row 2 means 2-bit index precision; row 1 means 3-bit index precision
+ index_selector = 0;
+ indexPrec = uint2( 2, 1 );
+ }
+ else // thread 1, 3, 5, 7
+ {
+ //row 2 means 2-bit index precision; row 1 means 3-bit index precision
+ index_selector = 1;
+ indexPrec = uint2( 1, 2 );
+ }
+ }
+ else
+ {
+ //row 2 means 2-bit index precision (used for both color and alpha)
+ indexPrec = uint2( 2, 2 );
+ }
+
+ uint4 pixel_r; // reconstructed pixel, later reused for the per-channel absolute difference
+ uint color_index;
+ uint alpha_index;
+ int4 span; // endPoint[1] - endPoint[0]
+ int2 span_norm_sqr; // squared length of the span: .x color (all channels for mode 6), .y alpha
+ int2 dotProduct;
+ if (threadInBlock < 12) // Try mode 4 5 in threads 0..11
+ {
+ // mode 4 5 have component rotation
+ if ((threadInBlock < 2) || (8 == threadInBlock)) // rotation = 0 in threads 0, 1 (mode 4) and 8 (mode 5)
+ {
+ rotation = 0;
+ }
+ else if ((threadInBlock < 4) || (9 == threadInBlock)) // rotation = 1 in threads 2, 3 (mode 4) and 9 (mode 5)
+ {
+ endPoint[0].ra = endPoint[0].ar; // exchange R and A
+ endPoint[1].ra = endPoint[1].ar;
+
+ rotation = 1;
+ }
+ else if ((threadInBlock < 6) || (10 == threadInBlock)) // rotation = 2 in threads 4, 5 (mode 4) and 10 (mode 5)
+ {
+ endPoint[0].ga = endPoint[0].ag; // exchange G and A
+ endPoint[1].ga = endPoint[1].ag;
+
+ rotation = 2;
+ }
+ else if ((threadInBlock < 8) || (11 == threadInBlock)) // rotation = 3 in threads 6, 7 (mode 4) and 11 (mode 5)
+ {
+ endPoint[0].ba = endPoint[0].ab; // exchange B and A
+ endPoint[1].ba = endPoint[1].ab;
+
+ rotation = 3;
+ }
+
+ if (threadInBlock < 8) // try mode 4 in threads 0..7
+ {
+ // mode 4 thread distribution
+ // Thread 0 1 2 3 4 5 6 7
+ // Rotation 0 0 1 1 2 2 3 3
+ // Index selector 0 1 0 1 0 1 0 1
+
+ mode = 4;
+ compress_endpoints4( endPoint ); // updates endPoint in place (inout); the returned value is ignored
+ }
+ else // try mode 5 in threads 8..11
+ {
+ // mode 5 thread distribution
+ // Thread 8 9 10 11
+ // Rotation 0 1 2 3
+
+ mode = 5;
+ compress_endpoints5( endPoint ); // updates endPoint in place (inout); the returned value is ignored
+ }
+
+ uint4 pixel = shared_temp[threadBase + 0].pixel; // pixel 0 decides the endpoint order below
+ if (1 == rotation)
+ {
+ pixel.ra = pixel.ar;
+ }
+ else if (2 == rotation)
+ {
+ pixel.ga = pixel.ag;
+ }
+ else if (3 == rotation)
+ {
+ pixel.ba = pixel.ab;
+ }
+
+ span = endPoint[1] - endPoint[0];
+ span_norm_sqr = uint2( dot( span.rgb, span.rgb ), span.a * span.a );
+
+ // in mode 4 5 6, end point 0 must be closer to pixel 0 than end point 1, because of the fix-up index is always index 0
+ // TODO: this shouldn't be necessary here in error calculation
+ /*
+ dotProduct = int2( dot( span.rgb, pixel.rgb - endPoint[0].rgb ), span.a * ( pixel.a - endPoint[0].a ) );
+ if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
+ {
+ span.rgb = -span.rgb;
+ swap(endPoint[0].rgb, endPoint[1].rgb);
+ }
+ if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && uint( dotProduct.y * 63.49999 ) > uint( 32 * span_norm_sqr.y ) )
+ {
+ span.a = -span.a;
+ swap(endPoint[0].a, endPoint[1].a);
+ }
+ */
+
+ // should be the same as above
+ dotProduct = int2( dot( pixel.rgb - endPoint[0].rgb, pixel.rgb - endPoint[0].rgb ), dot( pixel.rgb - endPoint[1].rgb, pixel.rgb - endPoint[1].rgb ) ); // squared color distance from pixel 0 to endpoint 0 (.x) and to endpoint 1 (.y)
+ if ( dotProduct.x > dotProduct.y ) // pixel 0 is closer to endpoint 1: exchange the color endpoints
+ {
+ span.rgb = -span.rgb;
+ swap(endPoint[0].rgb, endPoint[1].rgb);
+ }
+ dotProduct = int2( dot( pixel.a - endPoint[0].a, pixel.a - endPoint[0].a ), dot( pixel.a - endPoint[1].a, pixel.a - endPoint[1].a ) ); // the same test for the separately indexed alpha channel
+ if ( dotProduct.x > dotProduct.y )
+ {
+ span.a = -span.a;
+ swap(endPoint[0].a, endPoint[1].a);
+ }
+
+ error = 0;
+ for ( uint i = 0; i < 16; i ++ ) // accumulate the reconstruction error over all 16 pixels of the block
+ {
+ pixel = shared_temp[threadBase + i].pixel;
+ if (1 == rotation)
+ {
+ pixel.ra = pixel.ar;
+ }
+ else if (2 == rotation)
+ {
+ pixel.ga = pixel.ag;
+ }
+ else if (3 == rotation)
+ {
+ pixel.ba = pixel.ab;
+ }
+
+ dotProduct.x = dot( span.rgb, pixel.rgb - endPoint[0].rgb );
+ color_index = ( span_norm_sqr.x <= 0 /*endPoint[0] == endPoint[1]*/ || dotProduct.x <= 0 /*pixel == endPoint[0]*/ ) ? 0
+ : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[indexPrec.x][ uint( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] );
+ dotProduct.y = dot( span.a, pixel.a - endPoint[0].a );
+ alpha_index = ( span_norm_sqr.y <= 0 || dotProduct.y <= 0 ) ? 0
+ : ( ( dotProduct.y < span_norm_sqr.y ) ? aStep[indexPrec.y][ uint( dotProduct.y * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );
+
+ // the same color_index and alpha_index should be used for reconstruction, so this should be left commented out
+ /*if (index_selector)
+ {
+ swap(color_index, alpha_index);
+ }*/
+
+ pixel_r.rgb = ( ( 64 - aWeight[indexPrec.x][color_index] ) * endPoint[0].rgb +
+ aWeight[indexPrec.x][color_index] * endPoint[1].rgb +
+ 32 ) >> 6; // weighted blend of the endpoints, rounded (+32) and divided by 64
+ pixel_r.a = ( ( 64 - aWeight[indexPrec.y][alpha_index] ) * endPoint[0].a +
+ aWeight[indexPrec.y][alpha_index] * endPoint[1].a +
+ 32 ) >> 6;
+
+ Ensure_A_Is_Larger( pixel_r, pixel ); // order every channel so the unsigned subtraction below cannot wrap
+ pixel_r -= pixel; // per-channel absolute difference
+ if (1 == rotation) // undo the rotation so that g_alpha_weight is applied to the original alpha channel
+ {
+ pixel_r.ra = pixel_r.ar;
+ }
+ else if (2 == rotation)
+ {
+ pixel_r.ga = pixel_r.ag;
+ }
+ else if (3 == rotation)
+ {
+ pixel_r.ba = pixel_r.ab;
+ }
+ error += ComputeError(pixel_r, pixel_r);
+ }
+ }
+ else if (threadInBlock < 16) // Try mode 6 in threads 12..15, since in mode 4 5 6, only mode 6 has p bit
+ {
+ uint p = threadInBlock - 12; // 0..3: one p-bit combination per thread
+
+ compress_endpoints6( endPoint, uint2(p >> 0, p >> 1) & 1 ); // bit 0 and bit 1 of p are passed as the two P values
+
+ uint4 pixel = shared_temp[threadBase + 0].pixel; // pixel 0 decides the endpoint order below
+
+ span = endPoint[1] - endPoint[0];
+ span_norm_sqr = dot( span, span ); // scalar result; only .x is used in this branch
+ dotProduct = dot( span, pixel - endPoint[0] );
+ if ( span_norm_sqr.x > 0 && dotProduct.x >= 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) ) // pixel 0 projects past the middle of the span: exchange the endpoints
+ {
+ span = -span;
+ swap(endPoint[0], endPoint[1]);
+ }
+
+ error = 0;
+ for ( uint i = 0; i < 16; i ++ ) // accumulate the reconstruction error over all 16 pixels of the block
+ {
+ pixel = shared_temp[threadBase + i].pixel;
+
+ dotProduct.x = dot( span, pixel - endPoint[0] );
+ color_index = ( span_norm_sqr.x <= 0 || dotProduct.x <= 0 ) ? 0
+ : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[0][ uint( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[0][63] ); // row 0: mode 6 uses 4-bit indices
+
+ pixel_r = ( ( 64 - aWeight[0][color_index] ) * endPoint[0]
+ + aWeight[0][color_index] * endPoint[1] + 32 ) >> 6;
+
+ Ensure_A_Is_Larger( pixel_r, pixel ); // order every channel so the unsigned subtraction below cannot wrap
+ pixel_r -= pixel;
+ error += ComputeError(pixel_r, pixel_r);
+ }
+
+ mode = 6;
+ rotation = p; // Borrow rotation for p
+ }
+
+ shared_temp[GI].error = error; // publish this thread's candidate for the minimum-error reduction below
+ shared_temp[GI].mode = mode;
+ shared_temp[GI].index_selector = index_selector;
+ shared_temp[GI].rotation = rotation;
+
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 8) // error reduction step 1 of 4: keep the candidate with the smaller error
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 8].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 8].error;
+ shared_temp[GI].mode = shared_temp[GI + 8].mode;
+ shared_temp[GI].index_selector = shared_temp[GI + 8].index_selector;
+ shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 4) // error reduction step 2 of 4
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 4].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 4].error;
+ shared_temp[GI].mode = shared_temp[GI + 4].mode;
+ shared_temp[GI].index_selector = shared_temp[GI + 4].index_selector;
+ shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 2) // error reduction step 3 of 4
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 2].error;
+ shared_temp[GI].mode = shared_temp[GI + 2].mode;
+ shared_temp[GI].index_selector = shared_temp[GI + 2].index_selector;
+ shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 1) // error reduction step 4 of 4; thread 0 of the block then writes the winner
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 1].error;
+ shared_temp[GI].mode = shared_temp[GI + 1].mode;
+ shared_temp[GI].index_selector = shared_temp[GI + 1].index_selector;
+ shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
+ }
+
+ g_OutBuff[blockID] = uint4(shared_temp[GI].error, (shared_temp[GI].index_selector << 31) | shared_temp[GI].mode, // x = error, y = mode with index_selector in bit 31
+ 0, shared_temp[GI].rotation); // rotation is indeed rotation for mode 4 5. for mode 6, rotation is p bit
+ }
+}
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void TryMode137CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // mode 1 3 7 all have 2 subsets per block. Each thread scores one candidate partition for mode g_mode_id; the lowest-error candidate is written to g_OutBuff only if it beats the entry already in g_InBuff
+{
+ const uint MAX_USED_THREAD = 64; // threads assigned to each block
+ uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // number of blocks processed by one thread group
+ uint blockInGroup = GI / MAX_USED_THREAD; // which block of the group this thread works on
+ uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; // global block index
+ uint threadBase = blockInGroup * MAX_USED_THREAD; // group index of the first thread working on this block
+ uint threadInBlock = GI - threadBase; // this thread's index within its block (0..63)
+
+ uint block_y = blockID / g_num_block_x; // block row
+ uint block_x = blockID - block_y * g_num_block_x; // block column
+ uint base_x = block_x * BLOCK_SIZE_X; // texel coordinates of the block's first texel
+ uint base_y = block_y * BLOCK_SIZE_Y;
+
+ if (threadInBlock < 16) // the first 16 threads each fetch one texel of the 4x4 block
+ {
+ shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255); // scaled by 255 and clamped to 0..255
+ }
+ GroupMemoryBarrierWithGroupSync(); // pixels must be visible to every thread of the block
+
+ shared_temp[GI].error = 0xFFFFFFFF; // worst possible error until this thread stores a real result
+
+ uint4 pixel_r;
+ uint2x4 endPoint[2]; // endPoint[0..1 for subset id][0..1 for low and high in the subset]
+ uint2x4 endPointBackup[2];
+ uint color_index;
+ if (threadInBlock < 64) // always true here (threadInBlock is 0..63): every thread evaluates one partition
+ {
+ uint partition = threadInBlock;
+
+ endPoint[0][0] = MAX_UINT; // seed each min with MAX_UINT and each max with MIN_UINT so the first pixel replaces them
+ endPoint[0][1] = MIN_UINT;
+ endPoint[1][0] = MAX_UINT;
+ endPoint[1][1] = MIN_UINT;
+ uint bits = candidateSectionBit[partition]; // bit i set: pixel i belongs to subset 1, clear: subset 0
+ for ( uint i = 0; i < 16; i ++ ) // component-wise min/max of each subset's pixels
+ {
+ uint4 pixel = shared_temp[threadBase + i].pixel;
+ if ( (( bits >> i ) & 0x01) == 1 )
+ {
+ endPoint[1][0] = min( endPoint[1][0], pixel );
+ endPoint[1][1] = max( endPoint[1][1], pixel );
+ }
+ else
+ {
+ endPoint[0][0] = min( endPoint[0][0], pixel );
+ endPoint[0][1] = max( endPoint[0][1], pixel );
+ }
+ }
+
+ endPointBackup[0] = endPoint[0]; // keep the unquantized end points; every p-bit trial below restarts from them
+ endPointBackup[1] = endPoint[1];
+
+ uint max_p; // number of p-bit combinations to try
+ if (1 == g_mode_id)
+ {
+ // in mode 1, there is only one p bit per subset
+ max_p = 4;
+ }
+ else
+ {
+ // in mode 3 7, there are two p bits per subset, one for each end point
+ max_p = 16;
+ }
+
+ uint rotation = 0; // best p-bit combination found so far
+ uint error = MAX_UINT; // error of that combination
+ for ( uint p = 0; p < max_p; p ++ )
+ {
+ endPoint[0] = endPointBackup[0];
+ endPoint[1] = endPointBackup[1];
+
+ for ( i = 0; i < 2; i ++ ) // loop through 2 subsets
+ {
+ if (g_mode_id == 1)
+ {
+ compress_endpoints1( endPoint[i], (p >> i) & 1 ); // subset i uses bit i of p; endPoint[i] is updated in place (inout), return value unused
+ }
+ else if (g_mode_id == 3)
+ {
+ compress_endpoints3( endPoint[i], uint2(p >> (i * 2 + 0), p >> (i * 2 + 1)) & 1 ); // subset i uses bits 2i and 2i+1 of p
+ }
+ else if (g_mode_id == 7)
+ {
+ compress_endpoints7( endPoint[i], uint2(p >> (i * 2 + 0), p >> (i * 2 + 1)) & 1 );
+ }
+ }
+
+ int4 span[2]; // signed vector from the low to the high end point of each subset
+ span[0] = endPoint[0][1] - endPoint[0][0];
+ span[1] = endPoint[1][1] - endPoint[1][0];
+
+ if (g_mode_id != 7)
+ {
+ span[0].w = span[1].w = 0; // alpha takes no part in the projection unless the mode is 7
+ }
+
+ int span_norm_sqr[2];
+ span_norm_sqr[0] = dot( span[0], span[0] );
+ span_norm_sqr[1] = dot( span[1], span[1] );
+
+ // TODO: again, this shouldn't be necessary here in error calculation
+ int dotProduct = dot( span[0], shared_temp[threadBase + 0].pixel - endPoint[0][0] ); // projection of pixel 0 (checked for subset 0) onto the span
+ if ( span_norm_sqr[0] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[0] ) ) // pixel 0 lies past the middle of the span: flip the end points
+ {
+ span[0] = -span[0];
+ swap(endPoint[0][0], endPoint[0][1]);
+ }
+ dotProduct = dot( span[1], shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel - endPoint[1][0] ); // same check for subset 1, using the pixel named by candidateFixUpIndex1D
+ if ( span_norm_sqr[1] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[1] ) )
+ {
+ span[1] = -span[1];
+ swap(endPoint[1][0], endPoint[1][1]);
+ }
+
+ uint step_selector; // row of the aStep / aWeight tables (declared elsewhere in the file) to use
+ if (g_mode_id != 1)
+ {
+ step_selector = 2; // mode 3 7 have 2 bit index
+ }
+ else
+ {
+ step_selector = 1; // mode 1 has 3 bit index
+ }
+
+ uint p_error = 0; // error of this p-bit trial summed over the 16 pixels
+ for ( i = 0; i < 16; i ++ )
+ {
+ if (((bits >> i) & 0x01) == 1)
+ {
+ dotProduct = dot( span[1], shared_temp[threadBase + i].pixel - endPoint[1][0] );
+ color_index = (span_norm_sqr[1] <= 0 || dotProduct <= 0) ? 0
+ : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[1])] : aStep[step_selector][63]); // projection scaled to 0..63 and looked up in aStep; first/last entry used outside the span
+ }
+ else
+ {
+ dotProduct = dot( span[0], shared_temp[threadBase + i].pixel - endPoint[0][0] );
+ color_index = (span_norm_sqr[0] <= 0 || dotProduct <= 0) ? 0
+ : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[0])] : aStep[step_selector][63]);
+ }
+
+ uint subset_index = (bits >> i) & 0x01;
+
+ pixel_r = ((64 - aWeight[step_selector][color_index]) * endPoint[subset_index][0]
+ + aWeight[step_selector][color_index] * endPoint[subset_index][1] + 32) >> 6; // reconstructed pixel: end points blended with weight/64, +32 rounds
+ if (g_mode_id != 7)
+ {
+ pixel_r.a = 255; // alpha is forced to 255 unless the mode is 7
+ }
+
+ uint4 pixel = shared_temp[threadBase + i].pixel;
+ Ensure_A_Is_Larger( pixel_r, pixel ); // NOTE(review): helper defined elsewhere; presumably orders the pair so the unsigned subtraction below cannot wrap - confirm
+ pixel_r -= pixel;
+ p_error += ComputeError(pixel_r, pixel_r); // ComputeError is defined elsewhere; it receives the difference as both arguments
+ }
+
+ if (p_error < error)
+ {
+ error = p_error;
+ rotation = p;
+ }
+ }
+
+ shared_temp[GI].error = error;
+ shared_temp[GI].mode = g_mode_id;
+ shared_temp[GI].partition = partition;
+ shared_temp[GI].rotation = rotation; // mode 1 3 7 don't have rotation, we use rotation for p bits
+ }
+ GroupMemoryBarrierWithGroupSync(); // every candidate is stored before the reduction reads it
+
+ if (threadInBlock < 32) // pairwise keep-the-smaller-error reduction: 64 -> 32 candidates
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 32].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 32].error;
+ shared_temp[GI].mode = shared_temp[GI + 32].mode;
+ shared_temp[GI].partition = shared_temp[GI + 32].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 32].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync(); // NOTE(review): only compiled under REF_DEVICE; otherwise the remaining steps run without a barrier, presumably relying on these 32 threads executing in lockstep - confirm for the target hardware
+#endif
+if (threadInBlock < 16) // 32 -> 16
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 16].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 16].error;
+ shared_temp[GI].mode = shared_temp[GI + 16].mode;
+ shared_temp[GI].partition = shared_temp[GI + 16].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 16].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 8) // 16 -> 8
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 8].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 8].error;
+ shared_temp[GI].mode = shared_temp[GI + 8].mode;
+ shared_temp[GI].partition = shared_temp[GI + 8].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 4) // 8 -> 4
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 4].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 4].error;
+ shared_temp[GI].mode = shared_temp[GI + 4].mode;
+ shared_temp[GI].partition = shared_temp[GI + 4].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 2) // 4 -> 2
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 2].error;
+ shared_temp[GI].mode = shared_temp[GI + 2].mode;
+ shared_temp[GI].partition = shared_temp[GI + 2].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 1) // thread 0 of the block finishes the reduction and publishes the result
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 1].error;
+ shared_temp[GI].mode = shared_temp[GI + 1].mode;
+ shared_temp[GI].partition = shared_temp[GI + 1].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
+ }
+
+ if (g_InBuff[blockID].x > shared_temp[GI].error) // .x of the incoming entry is compared as an error value: replace it only when this mode is strictly better
+ {
+ g_OutBuff[blockID] = uint4(shared_temp[GI].error, shared_temp[GI].mode, shared_temp[GI].partition, shared_temp[GI].rotation); // mode 1 3 7 don't have rotation, we use rotation for p bits
+ }
+ else
+ {
+ g_OutBuff[blockID] = g_InBuff[blockID]; // otherwise forward the incoming entry unchanged
+ }
+ }
+}
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void TryMode02CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // mode 0 2 have 3 subsets per block. Each participating thread scores one 3-subset partition for mode g_mode_id; the lowest-error candidate is written to g_OutBuff only if it beats the entry already in g_InBuff
+{
+ const uint MAX_USED_THREAD = 64; // threads assigned to each block
+ uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // number of blocks processed by one thread group
+ uint blockInGroup = GI / MAX_USED_THREAD; // which block of the group this thread works on
+ uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; // global block index
+ uint threadBase = blockInGroup * MAX_USED_THREAD; // group index of the first thread working on this block
+ uint threadInBlock = GI - threadBase; // this thread's index within its block (0..63)
+
+ uint block_y = blockID / g_num_block_x; // block row
+ uint block_x = blockID - block_y * g_num_block_x; // block column
+ uint base_x = block_x * BLOCK_SIZE_X; // texel coordinates of the block's first texel
+ uint base_y = block_y * BLOCK_SIZE_Y;
+
+ if (threadInBlock < 16) // the first 16 threads each fetch one texel of the 4x4 block
+ {
+ shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255); // scaled by 255 and clamped to 0..255
+ }
+ GroupMemoryBarrierWithGroupSync(); // pixels must be visible to every thread of the block
+
+ shared_temp[GI].error = 0xFFFFFFFF; // threads that evaluate no partition keep this value and therefore never win the reduction
+
+ uint num_partitions; // how many partitions (and therefore threads) take part
+ if (0 == g_mode_id)
+ {
+ num_partitions = 16;
+ }
+ else
+ {
+ num_partitions = 64;
+ }
+
+ uint4 pixel_r;
+ uint2x4 endPoint[3]; // endPoint[0..1 for subset id][0..1 for low and high in the subset]
+ uint2x4 endPointBackup[3];
+ uint color_index[16];
+ if (threadInBlock < num_partitions)
+ {
+ uint partition = threadInBlock + 64; // 3-subset partitions carry an offset of 64: candidateFixUpIndex1D is indexed with it, candidateSectionBit2 without it
+
+ endPoint[0][0] = MAX_UINT; // seed each min with MAX_UINT and each max with MIN_UINT so the first pixel replaces them
+ endPoint[0][1] = MIN_UINT;
+ endPoint[1][0] = MAX_UINT;
+ endPoint[1][1] = MIN_UINT;
+ endPoint[2][0] = MAX_UINT;
+ endPoint[2][1] = MIN_UINT;
+ uint bits2 = candidateSectionBit2[partition - 64]; // two bits per pixel: bits 2i..2i+1 hold the subset id of pixel i
+ for ( uint i = 0; i < 16; i ++ ) // component-wise min/max of each subset's pixels
+ {
+ uint4 pixel = shared_temp[threadBase + i].pixel;
+ uint subset_index = ( bits2 >> ( i * 2 ) ) & 0x03;
+ if ( subset_index == 2 )
+ {
+ endPoint[2][0] = min( endPoint[2][0], pixel );
+ endPoint[2][1] = max( endPoint[2][1], pixel );
+ }
+ else if ( subset_index == 1 )
+ {
+ endPoint[1][0] = min( endPoint[1][0], pixel );
+ endPoint[1][1] = max( endPoint[1][1], pixel );
+ }
+ else
+ {
+ endPoint[0][0] = min( endPoint[0][0], pixel );
+ endPoint[0][1] = max( endPoint[0][1], pixel );
+ }
+ }
+
+ endPointBackup[0] = endPoint[0]; // keep the unquantized end points; every p-bit trial below restarts from them
+ endPointBackup[1] = endPoint[1];
+ endPointBackup[2] = endPoint[2];
+
+ uint max_p; // number of p-bit combinations to try
+ if (0 == g_mode_id)
+ {
+ max_p = 64; // changed from 32 to 64
+ }
+ else
+ {
+ max_p = 1; // compress_endpoints2 takes no p bits, so one trial is enough
+ }
+
+ uint rotation = 0; // best p-bit combination found so far
+ uint error = MAX_UINT; // error of that combination
+ for ( uint p = 0; p < max_p; p ++ )
+ {
+ endPoint[0] = endPointBackup[0];
+ endPoint[1] = endPointBackup[1];
+ endPoint[2] = endPointBackup[2];
+
+ for ( i = 0; i < 3; i ++ ) // quantize the end points of each subset in place (inout argument, return value unused)
+ {
+ if (0 == g_mode_id)
+ {
+ compress_endpoints0( endPoint[i], uint2(p >> (i * 2 + 0), p >> (i * 2 + 1)) & 1 ); // subset i uses bits 2i and 2i+1 of p
+ }
+ else
+ {
+ compress_endpoints2( endPoint[i] );
+ }
+ }
+
+ uint step_selector = 1 + (2 == g_mode_id); // 2 when g_mode_id is 2, otherwise 1: row of the aStep / aWeight tables (declared elsewhere in the file)
+
+ int4 span[3]; // signed vector from the low to the high end point of each subset
+ span[0] = endPoint[0][1] - endPoint[0][0];
+ span[1] = endPoint[1][1] - endPoint[1][0];
+ span[2] = endPoint[2][1] - endPoint[2][0];
+ span[0].w = span[1].w = span[2].w = 0; // alpha takes no part in the projection
+ int span_norm_sqr[3];
+ span_norm_sqr[0] = dot( span[0], span[0] );
+ span_norm_sqr[1] = dot( span[1], span[1] );
+ span_norm_sqr[2] = dot( span[2], span[2] );
+
+ // TODO: again, this shouldn't be necessary here in error calculation
+ uint ci[3] = { 0, candidateFixUpIndex1D[partition].x, candidateFixUpIndex1D[partition].y }; // pixel checked for each subset: pixel 0, then the two entries of candidateFixUpIndex1D
+ for (i = 0; i < 3; i ++)
+ {
+ int dotProduct = dot( span[i], shared_temp[threadBase + ci[i]].pixel - endPoint[i][0] );
+ if ( span_norm_sqr[i] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[i] ) ) // that pixel lies past the middle of the span: flip the end points
+ {
+ span[i] = -span[i];
+ swap(endPoint[i][0], endPoint[i][1]);
+ }
+ }
+
+ uint p_error = 0; // error of this p-bit trial summed over the 16 pixels
+ for ( i = 0; i < 16; i ++ )
+ {
+ uint subset_index = ( bits2 >> ( i * 2 ) ) & 0x03;
+ if ( subset_index == 2 )
+ {
+ int dotProduct = dot( span[2], shared_temp[threadBase + i].pixel - endPoint[2][0] );
+ color_index[i] = ( span_norm_sqr[2] <= 0 || dotProduct <= 0 ) ? 0
+ : ( ( dotProduct < span_norm_sqr[2] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[2] ) ] : aStep[step_selector][63] ); // projection scaled to 0..63 and looked up in aStep; first/last entry used outside the span
+ }
+ else if ( subset_index == 1 )
+ {
+ int dotProduct = dot( span[1], shared_temp[threadBase + i].pixel - endPoint[1][0] );
+ color_index[i] = ( span_norm_sqr[1] <= 0 || dotProduct <= 0 ) ? 0
+ : ( ( dotProduct < span_norm_sqr[1] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[1] ) ] : aStep[step_selector][63] );
+ }
+ else
+ {
+ int dotProduct = dot( span[0], shared_temp[threadBase + i].pixel - endPoint[0][0] );
+ color_index[i] = ( span_norm_sqr[0] <= 0 || dotProduct <= 0 ) ? 0
+ : ( ( dotProduct < span_norm_sqr[0] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[0] ) ] : aStep[step_selector][63] );
+ }
+
+ pixel_r = ( ( 64 - aWeight[step_selector][color_index[i]] ) * endPoint[subset_index][0]
+ + aWeight[step_selector][color_index[i]] * endPoint[subset_index][1] + 32 ) >> 6; // reconstructed pixel: end points blended with weight/64, +32 rounds
+ pixel_r.a = 255; // alpha is always forced to 255 in this kernel
+
+ uint4 pixel = shared_temp[threadBase + i].pixel;
+ Ensure_A_Is_Larger( pixel_r, pixel ); // NOTE(review): helper defined elsewhere; presumably orders the pair so the unsigned subtraction below cannot wrap - confirm
+ pixel_r -= pixel;
+ p_error += ComputeError(pixel_r, pixel_r); // ComputeError is defined elsewhere; it receives the difference as both arguments
+ }
+
+ if (p_error < error)
+ {
+ error = p_error;
+ rotation = p; // Borrow rotation for p
+ }
+ }
+
+ shared_temp[GI].error = error; // the mode field is not stored here; the final write uses g_mode_id directly
+ shared_temp[GI].partition = partition;
+ shared_temp[GI].rotation = rotation;
+ }
+ GroupMemoryBarrierWithGroupSync(); // every candidate is stored before the reduction reads it
+
+ if (threadInBlock < 32) // pairwise keep-the-smaller-error reduction: 64 -> 32 candidates
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 32].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 32].error;
+ shared_temp[GI].partition = shared_temp[GI + 32].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 32].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync(); // NOTE(review): only compiled under REF_DEVICE; otherwise the remaining steps run without a barrier, presumably relying on these 32 threads executing in lockstep - confirm for the target hardware
+#endif
+ if (threadInBlock < 16) // 32 -> 16
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 16].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 16].error;
+ shared_temp[GI].partition = shared_temp[GI + 16].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 16].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 8) // 16 -> 8
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 8].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 8].error;
+ shared_temp[GI].partition = shared_temp[GI + 8].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 4) // 8 -> 4
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 4].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 4].error;
+ shared_temp[GI].partition = shared_temp[GI + 4].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 2) // 4 -> 2
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 2].error;
+ shared_temp[GI].partition = shared_temp[GI + 2].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
+ }
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 1) // thread 0 of the block finishes the reduction and publishes the result
+ {
+ if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+ {
+ shared_temp[GI].error = shared_temp[GI + 1].error;
+ shared_temp[GI].partition = shared_temp[GI + 1].partition;
+ shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
+ }
+
+ if (g_InBuff[blockID].x > shared_temp[GI].error) // .x of the incoming entry is compared as an error value: replace it only when this mode is strictly better
+ {
+ g_OutBuff[blockID] = uint4(shared_temp[GI].error, g_mode_id, shared_temp[GI].partition, shared_temp[GI].rotation); // rotation is actually p bit for mode 0. for mode 2, rotation is always 0
+ }
+ else
+ {
+ g_OutBuff[blockID] = g_InBuff[blockID]; // otherwise forward the incoming entry unchanged
+ }
+ }
+}
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void EncodeBlockCS(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID) // reads the chosen (mode, partition, index selector, rotation / p bits) of each block from g_InBuff and writes the packed 128-bit block to g_OutBuff
+{
+ const uint MAX_USED_THREAD = 16; // one thread per texel of the 4x4 block
+ uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // number of blocks processed by one thread group
+ uint blockInGroup = GI / MAX_USED_THREAD; // which block of the group this thread works on
+ uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; // global block index
+ uint threadBase = blockInGroup * MAX_USED_THREAD; // group index of the first thread working on this block
+ uint threadInBlock = GI - threadBase; // this thread's index within its block (0..15)
+
+#ifndef REF_DEVICE
+ if (blockID >= g_num_total_blocks) // threads beyond the last block have nothing to do (this check is compiled out under REF_DEVICE)
+ {
+ return;
+ }
+#endif
+
+ uint block_y = blockID / g_num_block_x; // block row
+ uint block_x = blockID - block_y * g_num_block_x; // block column
+ uint base_x = block_x * BLOCK_SIZE_X; // texel coordinates of the block's first texel
+ uint base_y = block_y * BLOCK_SIZE_Y;
+
+ uint mode = g_InBuff[blockID].y & 0x7FFFFFFF; // low 31 bits of .y
+ uint partition = g_InBuff[blockID].z;
+ uint index_selector = (g_InBuff[blockID].y >> 31) & 1; // top bit of .y
+ uint rotation = g_InBuff[blockID].w; // used as channel rotation for modes 4/5 and as p bits further down
+
+ if (threadInBlock < 16) // always true: threadInBlock is 0..15
+ {
+ uint4 pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255); // scaled by 255 and clamped to 0..255
+
+ if ((4 == mode) || (5 == mode)) // modes 4/5: exchange alpha with the channel selected by rotation
+ {
+ if (1 == rotation)
+ {
+ pixel.ra = pixel.ar;
+ }
+ else if (2 == rotation)
+ {
+ pixel.ga = pixel.ag;
+ }
+ else if (3 == rotation)
+ {
+ pixel.ba = pixel.ab;
+ }
+ }
+
+ shared_temp[GI].pixel = pixel;
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync(); // NOTE(review): every barrier in this kernel is compiled only under REF_DEVICE; otherwise the 16 threads of a block presumably rely on lockstep execution - confirm for the target hardware
+#endif
+
+ uint bits = candidateSectionBit[partition]; // 2-subset mask: one bit per pixel
+ uint bits2 = candidateSectionBit2[partition - 64]; // 3-subset mask: two bits per pixel. NOTE(review): the unsigned index wraps when partition < 64; bits2 is only consumed for modes 0 and 2 - confirm the out-of-range read is harmless
+
+ uint2x4 ep; // end points of the subset this thread is responsible for (filled by thread ii in the loop below)
+ uint2x4 ep_quantized;
+ [unroll]
+ for (int ii = 2; ii >= 0; -- ii) // one pass per subset id (2, 1, 0): reduce that subset's min/max over the block, then thread ii keeps the result
+ {
+ if (threadInBlock < 16)
+ {
+ uint2x4 ep; // shadows the outer ep: this pixel's contribution to the min/max of subset ii
+ ep[0] = MAX_UINT; // values kept when the pixel does not belong to subset ii, so it does not affect min/max
+ ep[1] = MIN_UINT;
+
+ uint4 pixel = shared_temp[GI].pixel;
+
+ uint subset_index = ( bits >> threadInBlock ) & 0x01;
+ uint subset_index2 = ( bits2 >> ( threadInBlock * 2 ) ) & 0x03;
+ if (0 == ii)
+ {
+ if ((0 == mode) || (2 == mode))
+ {
+ if (0 == subset_index2)
+ {
+ ep[0] = ep[1] = pixel;
+ }
+ }
+ else if ((1 == mode) || (3 == mode) || (7 == mode))
+ {
+ if (0 == subset_index)
+ {
+ ep[0] = ep[1] = pixel;
+ }
+ }
+ else if ((4 == mode) || (5 == mode) || (6 == mode))
+ {
+ ep[0] = ep[1] = pixel; // modes 4 5 6: every pixel contributes to subset 0
+ }
+ }
+ else if (1 == ii)
+ {
+ if ((0 == mode) || (2 == mode))
+ {
+ if (1 == subset_index2)
+ {
+ ep[0] = ep[1] = pixel;
+ }
+ }
+ else if ((1 == mode) || (3 == mode) || (7 == mode))
+ {
+ if (1 == subset_index)
+ {
+ ep[0] = ep[1] = pixel;
+ }
+ }
+ }
+ else
+ {
+ if ((0 == mode) || (2 == mode))
+ {
+ if (2 == subset_index2)
+ {
+ ep[0] = ep[1] = pixel;
+ }
+ }
+ }
+
+ shared_temp[GI].endPoint_low = ep[0];
+ shared_temp[GI].endPoint_high = ep[1];
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 8) // min/max reduction over the block's 16 entries: 16 -> 8
+ {
+ shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
+ shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 4) // 8 -> 4
+ {
+ shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
+ shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 2) // 4 -> 2
+ {
+ shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
+ shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+ if (threadInBlock < 1) // 2 -> 1: the result ends up in shared_temp[threadBase]
+ {
+ shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
+ shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (ii == (int)threadInBlock) // thread ii copies the finished min/max of subset ii into the outer ep
+ {
+ ep[0] = shared_temp[threadBase].endPoint_low;
+ ep[1] = shared_temp[threadBase].endPoint_high;
+ }
+ }
+
+ if (threadInBlock < 3) // threads 0..2 each quantize the end points of subset threadInBlock
+ {
+ uint2 P; // p bits for this subset, taken from the rotation field
+ if (1 == mode)
+ {
+ P = (rotation >> threadInBlock) & 1; // mode 1: a single bit per subset, replicated into both components
+ }
+ else
+ {
+ P = uint2(rotation >> (threadInBlock * 2 + 0), rotation >> (threadInBlock * 2 + 1)) & 1; // otherwise two bits per subset, one per end point
+ }
+
+ if (0 == mode) // each compress_endpointsN updates ep in place and returns the values to be packed
+ {
+ ep_quantized = compress_endpoints0( ep, P );
+ }
+ else if (1 == mode)
+ {
+ ep_quantized = compress_endpoints1( ep, P );
+ }
+ else if (2 == mode)
+ {
+ ep_quantized = compress_endpoints2( ep );
+ }
+ else if (3 == mode)
+ {
+ ep_quantized = compress_endpoints3( ep, P );
+ }
+ else if (4 == mode)
+ {
+ ep_quantized = compress_endpoints4( ep );
+ }
+ else if (5 == mode)
+ {
+ ep_quantized = compress_endpoints5( ep );
+ }
+ else if (6 == mode)
+ {
+ ep_quantized = compress_endpoints6( ep, P );
+ }
+ else //if (7 == mode)
+ {
+ ep_quantized = compress_endpoints7( ep, P );
+ }
+
+ int4 span = ep[1] - ep[0]; // signed vector from the low to the high end point
+ if (mode < 4)
+ {
+ span.w = 0; // modes 0..3: alpha takes no part in the projection
+ }
+
+ if ((4 == mode) || (5 == mode))
+ {
+ if (0 == threadInBlock) // modes 4/5 use subset 0 only; color (x) and alpha (y) are checked separately against pixel 0
+ {
+ int2 span_norm_sqr = uint2( dot( span.rgb, span.rgb ), span.a * span.a );
+ int2 dotProduct = int2( dot( span.rgb, shared_temp[threadBase + 0].pixel.rgb - ep[0].rgb ), span.a * ( shared_temp[threadBase + 0].pixel.a - ep[0].a ) );
+ if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) ) // pixel 0 lies past the middle of the color span: flip the color end points
+ {
+ swap(ep[0].rgb, ep[1].rgb);
+ swap(ep_quantized[0].rgb, ep_quantized[1].rgb);
+ }
+ if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && uint( dotProduct.y * 63.49999 ) > uint( 32 * span_norm_sqr.y ) ) // same test for the alpha end points
+ {
+ swap(ep[0].a, ep[1].a);
+ swap(ep_quantized[0].a, ep_quantized[1].a);
+ }
+ }
+ }
+ else //if ((0 == mode) || (2 == mode) || (1 == mode) || (3 == mode) || (7 == mode) || (6 == mode))
+ {
+ int p; // pixel checked for this thread's subset
+ if (0 == threadInBlock)
+ {
+ p = 0;
+ }
+ else if (1 == threadInBlock)
+ {
+ p = candidateFixUpIndex1D[partition].x;
+ }
+ else //if (2 == threadInBlock)
+ {
+ p = candidateFixUpIndex1D[partition].y;
+ }
+
+ int span_norm_sqr = dot( span, span );
+ int dotProduct = dot( span, shared_temp[threadBase + p].pixel - ep[0] );
+ if ( span_norm_sqr > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr ) ) // that pixel lies past the middle of the span: flip both the working and the quantized end points
+ {
+ swap(ep[0], ep[1]);
+ swap(ep_quantized[0], ep_quantized[1]);
+ }
+ }
+
+ shared_temp[GI].endPoint_low = ep[0]; // publish subset threadInBlock's end points at slot threadBase + threadInBlock
+ shared_temp[GI].endPoint_high = ep[1];
+ shared_temp[GI].endPoint_low_quantized = ep_quantized[0];
+ shared_temp[GI].endPoint_high_quantized = ep_quantized[1];
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (threadInBlock < 16) // every thread now computes the index (or indices) of its own pixel
+ {
+ uint color_index = 0;
+ uint alpha_index = 0;
+
+ uint2 ep;
+
+ uint2 indexPrec; // rows of the aStep table (declared elsewhere in the file) used for the color (x) and alpha (y) index
+ if ((0 == mode) || (1 == mode))
+ {
+ indexPrec = 1;
+ }
+ else if (6 == mode)
+ {
+ indexPrec = 0;
+ }
+ else if (4 == mode)
+ {
+ if (0 == index_selector)
+ {
+ indexPrec = uint2(2, 1);
+ }
+ else
+ {
+ indexPrec = uint2(1, 2);
+ }
+ }
+ else
+ {
+ indexPrec = 2;
+ }
+
+ int subset_index; // subset this pixel belongs to
+ if ((0 == mode) || (2 == mode))
+ {
+ subset_index = (bits2 >> (threadInBlock * 2)) & 0x03;
+ }
+ else if ((1 == mode) || (3 == mode) || (7 == mode))
+ {
+ subset_index = (bits >> threadInBlock) & 0x01;
+ }
+ else
+ {
+ subset_index = 0;
+ }
+
+ ep[0] = shared_temp[threadBase + subset_index].endPoint_low;
+ ep[1] = shared_temp[threadBase + subset_index].endPoint_high;
+
+ int4 span = ep[1] - ep[0];
+ if (mode < 4)
+ {
+ span.w = 0;
+ }
+
+ if ((4 == mode) || (5 == mode))
+ {
+ int2 span_norm_sqr;
+ span_norm_sqr.x = dot( span.rgb, span.rgb );
+ span_norm_sqr.y = span.a * span.a;
+
+ int dotProduct = dot( span.rgb, shared_temp[threadBase + threadInBlock].pixel.rgb - ep[0].rgb );
+ color_index = ( span_norm_sqr.x <= 0 || dotProduct <= 0 ) ? 0
+ : ( ( dotProduct < span_norm_sqr.x ) ? aStep[indexPrec.x][ uint( dotProduct * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] ); // projection scaled to 0..63 and looked up in aStep; first/last entry used outside the span
+ dotProduct = dot( span.a, shared_temp[threadBase + threadInBlock].pixel.a - ep[0].a );
+ alpha_index = ( span_norm_sqr.y <= 0 || dotProduct <= 0 ) ? 0
+ : ( ( dotProduct < span_norm_sqr.y ) ? aStep[indexPrec.y][ uint( dotProduct * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );
+
+ if (index_selector)
+ {
+ swap(color_index, alpha_index); // the index selector exchanges the two index values
+ }
+ }
+ else
+ {
+ int span_norm_sqr = dot( span, span );
+
+ int dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel - ep[0] );
+ color_index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
+ : ( ( dotProduct < span_norm_sqr ) ? aStep[indexPrec.x][ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep[indexPrec.x][63] );
+ }
+
+ shared_temp[GI].error = color_index; // the error and mode fields are reused to hand the indices to the packing helpers (read back through get_color_index / get_alpha_index)
+ shared_temp[GI].mode = alpha_index;
+ }
+#ifdef REF_DEVICE
+ GroupMemoryBarrierWithGroupSync();
+#endif
+
+ if (0 == threadInBlock) // thread 0 packs the block from the values stored in shared_temp and writes it out
+ {
+ uint4 block;
+ if (0 == mode)
+ {
+ block_package0( block, partition, threadBase );
+ }
+ else if (1 == mode)
+ {
+ block_package1( block, partition, threadBase );
+ }
+ else if (2 == mode)
+ {
+ block_package2( block, partition, threadBase );
+ }
+ else if (3 == mode)
+ {
+ block_package3( block, partition, threadBase );
+ }
+ else if (4 == mode)
+ {
+ block_package4( block, rotation, index_selector, threadBase );
+ }
+ else if (5 == mode)
+ {
+ block_package5( block, rotation, threadBase );
+ }
+ else if (6 == mode)
+ {
+ block_package6( block, threadBase );
+ }
+ else //if (7 == mode)
+ {
+ block_package7( block, partition, threadBase );
+ }
+
+ g_OutBuff[blockID] = block;
+ }
+}
+
+//uint4 truncate_and_round( uint4 color, uint bits)
+//{
+// uint precisionMask = ((1 << bits) - 1) << (8 - bits);
+// uint precisionHalf = (1 << (7-bits));
+//
+// uint4 truncated = color & precisionMask;
+// uint4 rounded = min(255, color + precisionHalf) & precisionMask;
+//
+// uint4 truncated_bak = truncated = truncated | (truncated >> bits);
+// uint4 rounded_bak = rounded = rounded | (rounded >> bits);
+//
+// uint4 color_bak = color;
+//
+// Ensure_A_Is_Larger( rounded, color );
+// Ensure_A_Is_Larger( truncated, color_bak );
+//
+// if (dot(rounded - color, rounded - color) <
+// dot(truncated - color_bak, truncated - color_bak))
+// {
+// return rounded_bak;
+// }
+// else
+// {
+// return truncated_bak;
+// }
+//}
+
+// Reduces each 8-bit channel of `color` to uPrec bits: half of one
+// quantization step is added first, the sum is capped at 255, and the low
+// (8 - uPrec) bits are then shifted out.
+uint4 quantize( uint4 color, uint uPrec )
+{
+    const uint4 biased = min(255, color + (1 << (7 - uPrec)));
+    const uint dropBits = 8 - uPrec;
+    return biased >> dropBits;
+}
+
+// Expands uPrec-bit channel values back to 8 bits: the value is moved to the
+// top of the byte and its high bits are repeated into the vacated low bits.
+uint4 unquantize( uint4 color, uint uPrec )
+{
+    const uint4 widened = color << (8 - uPrec);
+    return widened | (widened >> uPrec);
+}
+
+// Mode 0 end point quantization. For each of the two end points the RGB
+// channels are quantized to 5 bits, the lowest of those bits is replaced by
+// the p bit P[e], and alpha is set to 0xFF.
+//   endPoint (inout): overwritten with the 8-bit colors the quantized values expand to.
+//   returns: the quantized values shifted left by 3 (alpha included).
+uint2x4 compress_endpoints0( inout uint2x4 endPoint, uint2 P )
+{
+    uint2x4 packed;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        // 5-bit RGB with the low bit forced to the p bit
+        const uint3 rgb5 = ( quantize(endPoint[e].rgbb, 5).rgb & 0xFFFFFFFE ) | P[e];
+
+        endPoint[e] = uint4( unquantize(rgb5.rgbb, 5).rgb, 0xFF );
+        packed[e] = uint4( rgb5, 0xFF ) << 3;
+    }
+    return packed;
+}
+// Mode 1 end point quantization. For each of the two end points the RGB
+// channels are quantized to 7 bits, the lowest of those bits is replaced by
+// P[e], and alpha is set to 0xFF.
+//   endPoint (inout): overwritten with the 8-bit colors the quantized values expand to.
+//   returns: the quantized values shifted left by 1 (alpha included).
+uint2x4 compress_endpoints1( inout uint2x4 endPoint, uint2 P )
+{
+    uint2x4 packed;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        // 7-bit RGB with the low bit forced to the p bit
+        const uint3 rgb7 = ( quantize(endPoint[e].rgbb, 7).rgb & 0xFFFFFFFE ) | P[e];
+
+        endPoint[e] = uint4( unquantize(rgb7.rgbb, 7).rgb, 0xFF );
+        packed[e] = uint4( rgb7, 0xFF ) << 1;
+    }
+    return packed;
+}
+// Mode 2 end point quantization. For each of the two end points the RGB
+// channels are quantized to 5 bits (no p bit) and alpha is set to 0xFF.
+//   endPoint (inout): overwritten with the 8-bit colors the quantized values expand to.
+//   returns: the quantized values shifted left by 3 (alpha included).
+uint2x4 compress_endpoints2( inout uint2x4 endPoint )
+{
+    uint2x4 packed;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        const uint3 rgb5 = quantize(endPoint[e].rgbb, 5).rgb;
+
+        endPoint[e] = uint4( unquantize(rgb5.rgbb, 5).rgb, 0xFF );
+        packed[e] = uint4( rgb5, 0xFF ) << 3;
+    }
+    return packed;
+}
+// Mode 3 end point quantization. The RGB channels keep their upper 7 bits and
+// the lowest bit is replaced by P[e]; alpha is set to 0xFF.
+//   endPoint (inout): overwritten with the same values that are returned.
+//   returns: the adjusted end points, unshifted.
+uint2x4 compress_endpoints3( inout uint2x4 endPoint, uint2 P )
+{
+    uint2x4 packed;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        const uint4 withPBit = uint4( ( endPoint[e].rgb & 0xFFFFFFFE ) | P[e], 0xFF );
+
+        packed[e] = withPBit;
+        endPoint[e] = withPBit;
+    }
+    return packed;
+}
+// Mode 4 end point quantization: RGB to 5 bits, alpha to 6 bits, no p bit.
+//   endPoint (inout): overwritten with the 8-bit values the quantized ones expand to.
+//   returns: quantized RGB shifted left by 3 and quantized alpha shifted left by 2.
+uint2x4 compress_endpoints4( inout uint2x4 endPoint )
+{
+    uint2x4 packed;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        const uint3 rgb5 = quantize(endPoint[e].rgbb, 5).rgb;
+        const uint alpha6 = quantize(endPoint[e].a, 6).r;
+
+        endPoint[e] = uint4( unquantize(rgb5.rgbb, 5).rgb, unquantize(alpha6, 6).r );
+        packed[e] = uint4( rgb5 << 3, alpha6 << 2 );
+    }
+    return packed;
+}
+// Mode 5 end point quantization: RGB to 7 bits, alpha copied through unchanged.
+//   endPoint (inout): RGB is overwritten with the 8-bit values the quantized
+//                     ones expand to; alpha is left untouched.
+//   returns: quantized RGB shifted left by 1 together with the original alpha.
+uint2x4 compress_endpoints5( inout uint2x4 endPoint )
+{
+    uint2x4 packed;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        const uint3 rgb7 = quantize(endPoint[e].rgbb, 7).rgb;
+
+        // Alpha is full precision, so it is stored as is.
+        packed[e] = uint4( rgb7 << 1, endPoint[e].a );
+        endPoint[e].rgb = unquantize(rgb7.rgbb, 7).rgb;
+    }
+    return packed;
+}
+// Mode 6 end point quantization: all four channels keep their upper 7 bits
+// and the lowest bit is replaced by P[e].
+//   endPoint (inout): overwritten with the same values that are returned.
+//   returns: the adjusted end points, unshifted.
+uint2x4 compress_endpoints6( inout uint2x4 endPoint, uint2 P )
+{
+    uint2x4 packed;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        const uint4 withPBit = ( endPoint[e] & 0xFFFFFFFE ) | P[e];
+
+        packed[e] = withPBit;
+        endPoint[e] = withPBit;
+    }
+    return packed;
+}
+// Mode 7 end point quantization: all four channels are quantized to 6 bits
+// and the lowest of those bits is replaced by P[e].
+//   endPoint (inout): overwritten with the 8-bit values the quantized ones expand to.
+//   returns: the quantized values shifted left by 2.
+uint2x4 compress_endpoints7( inout uint2x4 endPoint, uint2 P )
+{
+    uint2x4 packed;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        const uint4 rgba6 = ( quantize(endPoint[e], 6) & 0xFFFFFFFE ) | P[e];
+
+        endPoint[e] = unquantize(rgba6, 6);
+        packed[e] = rgba6 << 2;
+    }
+    return packed;
+}
+
+// Accessors used by the block_packageN helpers. They expand to reads of
+// shared_temp relative to a `threadBase` variable that must be in scope at
+// the point of use. The color and alpha indices are read from the `error`
+// and `mode` fields, which is where EncodeBlockCS stores them.
+#define get_end_point_l(subsetId) shared_temp[threadBase + subsetId].endPoint_low_quantized
+#define get_end_point_h(subsetId) shared_temp[threadBase + subsetId].endPoint_high_quantized
+#define get_color_index(pixelId) shared_temp[threadBase + pixelId].error
+#define get_alpha_index(pixelId) shared_temp[threadBase + pixelId].mode
+
+void block_package0( out uint4 block, uint partition, uint threadBase ) // assembles the 128-bit mode 0 block from the quantized end points and color indices held in shared_temp (slots threadBase + n)
+{
+ block.x = 0x01 | ( (partition - 64) << 1 ) // bit 0: mode marker; from bit 1: partition with its offset of 64 removed; then the top 4 bits of every red end point
+ | ( ( get_end_point_l(0).r & 0xF0 ) << 1 ) | ( ( get_end_point_h(0).r & 0xF0 ) << 5 )
+ | ( ( get_end_point_l(1).r & 0xF0 ) << 9 ) | ( ( get_end_point_h(1).r & 0xF0 ) << 13 )
+ | ( ( get_end_point_l(2).r & 0xF0 ) << 17 ) | ( ( get_end_point_h(2).r & 0xF0 ) << 21 )
+ | ( ( get_end_point_l(0).g & 0xF0 ) << 25 ); // only the low 3 of these 4 bits fit into block.x ...
+ block.y = ( ( get_end_point_l(0).g & 0xF0 ) >> 7 ) | ( ( get_end_point_h(0).g & 0xF0 ) >> 3 ) // ... the remaining bit opens block.y, followed by the other green and the first blue end points
+ | ( ( get_end_point_l(1).g & 0xF0 ) << 1 ) | ( ( get_end_point_h(1).g & 0xF0 ) << 5 )
+ | ( ( get_end_point_l(2).g & 0xF0 ) << 9 ) | ( ( get_end_point_h(2).g & 0xF0 ) << 13 )
+ | ( ( get_end_point_l(0).b & 0xF0 ) << 17 ) | ( ( get_end_point_h(0).b & 0xF0 ) << 21 )
+ | ( ( get_end_point_l(1).b & 0xF0 ) << 25 ); // again split across two words
+ block.z = ( ( get_end_point_l(1).b & 0xF0 ) >> 7 ) | ( ( get_end_point_h(1).b & 0xF0 ) >> 3 )
+ | ( ( get_end_point_l(2).b & 0xF0 ) << 1 ) | ( ( get_end_point_h(2).b & 0xF0 ) << 5 )
+ | ( ( get_end_point_l(0).r & 0x08 ) << 10 ) | ( ( get_end_point_h(0).r & 0x08 ) << 11 ) // bit 3 of each quantized red value is that end point's p bit (compress_endpoints0 puts it there); the six p bits go to z bits 13..18
+ | ( ( get_end_point_l(1).r & 0x08 ) << 12 ) | ( ( get_end_point_h(1).r & 0x08 ) << 13 )
+ | ( ( get_end_point_l(2).r & 0x08 ) << 14 ) | ( ( get_end_point_h(2).r & 0x08 ) << 15 )
+ | ( get_color_index(0) << 19 ); // index of pixel 0 starts at z bit 19
+ block.w = 0;
+ uint i = 1;
+ for ( ; i <= min( candidateFixUpIndex1DOrdered[partition][0], 4 ); i ++ ) // pixels up to the first fix-up index, capped at pixel 4 (the last pixel that starts inside block.z)
+ {
+ block.z |= get_color_index(i) << ( i * 3 + 18 );
+ }
+ if ( candidateFixUpIndex1DOrdered[partition][0] < 4 ) //i = 4
+ {
+ block.z |= get_color_index(4) << 29; // pixel 4 comes after the first fix-up pixel, so it sits one bit lower and fits entirely into block.z. NOTE(review): pixels between the fix-up index and 4 are not written here, so this presumably relies on the first fix-up index never being below 3 - confirm against the table
+ i += 1;
+ }
+ else //i = 5
+ {
+ block.w |= ( get_color_index(4) & 0x04 ) >> 2; // top bit of pixel 4's index did not fit into block.z and becomes w bit 0
+ for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ )
+ block.w |= get_color_index(i) << ( i * 3 - 14 );
+ }
+ for ( ; i <= candidateFixUpIndex1DOrdered[partition][1]; i ++ ) // pixels after the first fix-up index are shifted one bit lower ...
+ {
+ block.w |= get_color_index(i) << ( i * 3 - 15 );
+ }
+ for ( ; i < 16; i ++ ) // ... and pixels after the second fix-up index one more bit lower
+ {
+ block.w |= get_color_index(i) << ( i * 3 - 16 );
+ }
+}
+// Assembles the 128-bit mode 1 block from the quantized end points and the
+// color indices held in shared_temp (slots threadBase + n).
+//
+// Bit layout produced (positions within the 128-bit block):
+//     0..1    mode marker (0x02)
+//     2..7    partition
+//     8..79   top 6 bits of every R, G and B end point (subset 0 low/high, subset 1 low/high)
+//    80..81   p bit of subset 0 and of subset 1 (bit 1 of the quantized red low end point)
+//    82..127  color indices, 3 bits per pixel, except pixel 0 and the fix-up
+//             pixel of subset 1, which take 2 bits each
+//
+// block     (out): receives the packed block.
+// partition      : selects the fix-up pixel through candidateFixUpIndex1DOrdered.
+// threadBase     : first shared_temp slot of the block being packed.
+void block_package1( out uint4 block, uint partition, uint threadBase )
+{
+    block.x = 0x02 | ( partition << 2 )
+        | ( ( get_end_point_l(0).r & 0xFC ) << 6 ) | ( ( get_end_point_h(0).r & 0xFC ) << 12 )
+        | ( ( get_end_point_l(1).r & 0xFC ) << 18 ) | ( ( get_end_point_h(1).r & 0xFC ) << 24 );
+    block.y = ( ( get_end_point_l(0).g & 0xFC ) >> 2 ) | ( ( get_end_point_h(0).g & 0xFC ) << 4 )
+        | ( ( get_end_point_l(1).g & 0xFC ) << 10 ) | ( ( get_end_point_h(1).g & 0xFC ) << 16 )
+        | ( ( get_end_point_l(0).b & 0xFC ) << 22 ) | ( ( get_end_point_h(0).b & 0xFC ) << 28 );
+    block.z = ( ( get_end_point_h(0).b & 0xFC ) >> 4 ) | ( ( get_end_point_l(1).b & 0xFC ) << 2 )
+        | ( ( get_end_point_h(1).b & 0xFC ) << 8 )
+        | ( ( get_end_point_l(0).r & 0x02 ) << 15 ) | ( ( get_end_point_l(1).r & 0x02 ) << 16 )
+        | ( get_color_index(0) << 18 );
+    // Every pixel after the fix-up pixel sits one bit lower than it would
+    // otherwise, because the fix-up pixel only occupies 2 bits.
+    if ( candidateFixUpIndex1DOrdered[partition][0] == 15 )
+    {
+        block.w = (get_color_index(15) << 30) | (get_color_index(14) << 27) | (get_color_index(13) << 24) | (get_color_index(12) << 21) | (get_color_index(11) << 18) | (get_color_index(10) << 15)
+            | (get_color_index(9) << 12) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
+        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+    else if ( candidateFixUpIndex1DOrdered[partition][0] == 2 )
+    {
+        // Pixel 5 straddles the two words: its low bit is z bit 31, the rest starts block.w.
+        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
+            | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 2) | (get_color_index(5) >> 1);
+        block.z |= (get_color_index(5) << 31) | (get_color_index(4) << 28) | (get_color_index(3) << 25) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+    else if ( candidateFixUpIndex1DOrdered[partition][0] == 8 )
+    {
+        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
+            | (get_color_index(9) << 11) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
+        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+    else //candidateFixUpIndex1DOrdered[partition][0] == 6
+    {
+        // Pixel 5 fills w bits 0..2, so the 2-bit fix-up pixel 6 belongs at
+        // bits 3..4 and pixel 7 at bits 5..7, directly below pixel 8 at bits
+        // 8..10. The shifts used to be 4 and 6, which left bit 3 unused and
+        // made pixel 7 overlap pixel 8.
+        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
+            | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 3) | get_color_index(5);
+        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+}
+void block_package2( out uint4 block, uint partition, uint threadBase ) // Packs one 128-bit block into block.xyzw: tag 0x04, (partition - 64), top-5-bit R/G/B endpoints for subsets 0..2, then the color indices. NOTE(review): presumably BC7 mode 2 - confirm.
+{ // threadBase is not referenced directly in this body; presumably consumed by the get_* helpers - TODO confirm.
+ block.x = 0x04 | ( (partition - 64) << 3 ) // tag in the low bits; partition is stored biased by -64 starting at bit 3
+ | ( ( get_end_point_l(0).r & 0xF8 ) << 6 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 11 ) // & 0xF8 keeps the top five bits of a channel
+ | ( ( get_end_point_l(1).r & 0xF8 ) << 16 ) | ( ( get_end_point_h(1).r & 0xF8 ) << 21 )
+ | ( ( get_end_point_l(2).r & 0xF8 ) << 26 ); // only three of the five kept bits land below bit 32
+ block.y = ( ( get_end_point_l(2).r & 0xF8 ) >> 6 ) | ( ( get_end_point_h(2).r & 0xF8 ) >> 1 ) // first term: the two bits of l(2).r that did not fit in block.x
+ | ( ( get_end_point_l(0).g & 0xF8 ) << 4 ) | ( ( get_end_point_h(0).g & 0xF8 ) << 9 )
+ | ( ( get_end_point_l(1).g & 0xF8 ) << 14 ) | ( ( get_end_point_h(1).g & 0xF8 ) << 19 )
+ | ( ( get_end_point_l(2).g & 0xF8 ) << 24 );
+ block.z = ( ( get_end_point_h(2).g & 0xF8 ) >> 3 ) | ( ( get_end_point_l(0).b & 0xF8 ) << 2 )
+ | ( ( get_end_point_h(0).b & 0xF8 ) << 7 ) | ( ( get_end_point_l(1).b & 0xF8 ) << 12 )
+ | ( ( get_end_point_h(1).b & 0xF8 ) << 17 ) | ( ( get_end_point_l(2).b & 0xF8 ) << 22 )
+ | ( ( get_end_point_h(2).b & 0xF8 ) << 27 ); // two of the five kept bits land in bits 30-31
+ block.w = ( ( get_end_point_h(2).b & 0xF8 ) >> 5 ) // remaining three bits of h(2).b in bits 0-2
+ | ( get_color_index(0) << 3 ); // index 0 starts at bit 3 and is followed by index 1 at bit 4, i.e. it has one bit of room
+ uint i = 1;
+ for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ ) // indices up to and including the first fix-up position
+ {
+ block.w |= get_color_index(i) << ( i * 2 + 2 );
+ }
+ for ( ; i <= candidateFixUpIndex1DOrdered[partition][1]; i ++ ) // offset drops by one: the index at the first fix-up position only had one bit of room
+ {
+ block.w |= get_color_index(i) << ( i * 2 + 1 );
+ }
+ for ( ; i < 16; i ++ ) // offset drops by one again after the second fix-up position
+ {
+ block.w |= get_color_index(i) << ( i * 2 ); // values are OR-ed unmasked; assumes each index fits its slot - TODO confirm
+ }
+}
+void block_package3( out uint4 block, uint partition, uint threadBase ) // Packs one 128-bit block into block.xyzw: tag 0x08, partition, top-7-bit R/G/B endpoints for subsets 0..1, four low .r bits, then the color indices. NOTE(review): presumably BC7 mode 3 - confirm.
+{ // threadBase is not referenced directly in this body; presumably consumed by the get_* helpers - TODO confirm.
+ block.x = 0x08 | ( partition << 4 ) // tag in the low bits, partition starting at bit 4 (not masked here)
+ | ( ( get_end_point_l(0).r & 0xFE ) << 9 ) | ( ( get_end_point_h(0).r & 0xFE ) << 16 ) // & 0xFE keeps the top seven bits of a channel
+ | ( ( get_end_point_l(1).r & 0xFE ) << 23 ) | ( ( get_end_point_h(1).r & 0xFE ) << 30 ); // only one kept bit of h(1).r lands at bit 31
+ block.y = ( ( get_end_point_h(1).r & 0xFE ) >> 2 ) | ( ( get_end_point_l(0).g & 0xFE ) << 5 ) // first term: the six bits of h(1).r that did not fit in block.x
+ | ( ( get_end_point_h(0).g & 0xFE ) << 12 ) | ( ( get_end_point_l(1).g & 0xFE ) << 19 )
+ | ( ( get_end_point_h(1).g & 0xFE ) << 26 );
+ block.z = ( ( get_end_point_h(1).g & 0xFE ) >> 6 ) | ( ( get_end_point_l(0).b & 0xFE ) << 1 ) // first term: the top two bits of h(1).g
+ | ( ( get_end_point_h(0).b & 0xFE ) << 8 ) | ( ( get_end_point_l(1).b & 0xFE ) << 15 )
+ | ( ( get_end_point_h(1).b & 0xFE ) << 22 )
+ | ( ( get_end_point_l(0).r & 0x01 ) << 30 ) | ( ( get_end_point_h(0).r & 0x01 ) << 31 ); // lowest bit of .r stored separately per endpoint; NOTE(review): presumably the per-endpoint p-bit - confirm
+ block.w = ( ( get_end_point_l(1).r & 0x01 ) << 0 ) | ( ( get_end_point_h(1).r & 0x01 ) << 1 )
+ | ( get_color_index(0) << 2 ); // index 0 starts at bit 2 and is followed by index 1 at bit 3, i.e. it has one bit of room
+ uint i = 1;
+ for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ ) // indices up to and including the fix-up position
+ {
+ block.w |= get_color_index(i) << ( i * 2 + 1 );
+ }
+ for ( ; i < 16; i ++ ) // offset drops by one: the index at the fix-up position only had one bit of room
+ {
+ block.w |= get_color_index(i) << ( i * 2 ); // values are OR-ed unmasked; assumes each index fits its slot - TODO confirm
+ }
+}
+void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase ) // Packs one 128-bit block into block.xyzw: tag 0x10, 2-bit rotation, 1-bit index selector, subset-0 endpoints, 16 color indices and 16 alpha indices. NOTE(review): presumably BC7 mode 4 - confirm.
+{ // threadBase is not referenced directly in this body; presumably consumed by the get_* helpers - TODO confirm.
+ block.x = 0x10 | ( (rotation & 3) << 5 ) | ( (index_selector & 1) << 7 ) // tag, then rotation at bits 5-6 and selector at bit 7
+ | ( ( get_end_point_l(0).r & 0xF8 ) << 5 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 10 ) // & 0xF8 keeps the top five bits of r/g/b
+ | ( ( get_end_point_l(0).g & 0xF8 ) << 15 ) | ( ( get_end_point_h(0).g & 0xF8 ) << 20 )
+ | ( ( get_end_point_l(0).b & 0xF8 ) << 25 ); // four of the five kept bits land in bits 28-31
+
+ block.y = ( ( get_end_point_l(0).b & 0xF8 ) >> 7 ) | ( ( get_end_point_h(0).b & 0xF8 ) >> 2 ) // first term: the top bit of l(0).b that did not fit in block.x
+ | ( ( get_end_point_l(0).a & 0xFC ) << 4 ) | ( ( get_end_point_h(0).a & 0xFC ) << 10 ) // & 0xFC keeps the top six bits of alpha
+ | ( (get_color_index(0) & 1) << 18 ) | ( get_color_index(1) << 19 ) | ( get_color_index(2) << 21 ) | ( get_color_index(3) << 23 ) // color index 0 contributes one bit, the others are spaced two bits apart
+ | ( get_color_index(4) << 25 ) | ( get_color_index(5) << 27 ) | ( get_color_index(6) << 29 ) | ( get_color_index(7) << 31 ); // only the low bit of index 7 lands at bit 31
+
+ block.z = ( get_color_index(7) >> 1 ) | ( get_color_index(8) << 1 ) | ( get_color_index(9) << 3 ) | ( get_color_index(10)<< 5 ) // first term: the rest of color index 7
+ | ( get_color_index(11)<< 7 ) | ( get_color_index(12)<< 9 ) | ( get_color_index(13)<< 11 ) | ( get_color_index(14)<< 13 )
+ | ( get_color_index(15)<< 15 ) | ( (get_alpha_index(0) & 3) << 17 ) | ( get_alpha_index(1) << 19 ) | ( get_alpha_index(2) << 22 ) // alpha index 0 contributes two bits, the others are spaced three bits apart
+ | ( get_alpha_index(3) << 25 ) | ( get_alpha_index(4) << 28 ) | ( get_alpha_index(5) << 31 ); // only the low bit of alpha index 5 lands at bit 31
+
+ block.w = ( get_alpha_index(5) >> 1 ) | ( get_alpha_index(6) << 2 ) | ( get_alpha_index(7) << 5 ) | ( get_alpha_index(8) << 8 ) // first term: the rest of alpha index 5
+ | ( get_alpha_index(9) << 11 ) | ( get_alpha_index(10)<< 14 ) | ( get_alpha_index(11)<< 17 ) | ( get_alpha_index(12)<< 20 )
+ | ( get_alpha_index(13)<< 23 ) | ( get_alpha_index(14)<< 26 ) | ( get_alpha_index(15)<< 29 ); // indices other than the two masked ones are OR-ed unmasked; assumes each fits its slot - TODO confirm
+}
+void block_package5( out uint4 block, uint rotation, uint threadBase ) // Packs one 128-bit block into block.xyzw: tag 0x20, rotation, subset-0 endpoints (top 7 bits of r/g/b, alpha as-is), 16 color indices and 16 alpha indices. NOTE(review): presumably BC7 mode 5 - confirm.
+{ // threadBase is not referenced directly in this body; presumably consumed by the get_* helpers - TODO confirm.
+ block.x = 0x20 | ( rotation << 6 ) // tag, then rotation from bit 6; rotation is not masked here, assumes it is < 4 - TODO confirm
+ | ( ( get_end_point_l(0).r & 0xFE ) << 7 ) | ( ( get_end_point_h(0).r & 0xFE ) << 14 ) // & 0xFE keeps the top seven bits of r/g/b
+ | ( ( get_end_point_l(0).g & 0xFE ) << 21 ) | ( ( get_end_point_h(0).g & 0xFE ) << 28 ); // three of the seven kept bits of h(0).g land in bits 29-31
+ block.y = ( ( get_end_point_h(0).g & 0xFE ) >> 4 ) | ( ( get_end_point_l(0).b & 0xFE ) << 3 ) // first term: the four bits of h(0).g that did not fit in block.x
+ | ( ( get_end_point_h(0).b & 0xFE ) << 10 ) | ( get_end_point_l(0).a << 18 ) | ( get_end_point_h(0).a << 26 ); // alpha is written unmasked; assumes it fits in 8 bits - TODO confirm
+ block.z = ( get_end_point_h(0).a >> 6 ) // bits of h(0).a above bit 5, which did not fit in block.y
+ | ( get_color_index(0) << 2 ) | ( get_color_index(1) << 3 ) | ( get_color_index(2) << 5 ) | ( get_color_index(3) << 7 ) // color index 0 has one bit of room, the others are spaced two bits apart
+ | ( get_color_index(4) << 9 ) | ( get_color_index(5) << 11 ) | ( get_color_index(6) << 13 ) | ( get_color_index(7) << 15 )
+ | ( get_color_index(8) << 17 ) | ( get_color_index(9) << 19 ) | ( get_color_index(10)<< 21 ) | ( get_color_index(11)<< 23 )
+ | ( get_color_index(12)<< 25 ) | ( get_color_index(13)<< 27 ) | ( get_color_index(14)<< 29 ) | ( get_color_index(15)<< 31 ); // only the low bit of index 15 lands at bit 31
+ block.w = ( get_color_index(15)>> 1 ) | ( get_alpha_index(0) << 1 ) | ( get_alpha_index(1) << 2 ) | ( get_alpha_index(2) << 4 ) // first term: the rest of color index 15; alpha index 0 has one bit of room
+ | ( get_alpha_index(3) << 6 ) | ( get_alpha_index(4) << 8 ) | ( get_alpha_index(5) << 10 ) | ( get_alpha_index(6) << 12 )
+ | ( get_alpha_index(7) << 14 ) | ( get_alpha_index(8) << 16 ) | ( get_alpha_index(9) << 18 ) | ( get_alpha_index(10)<< 20 )
+ | ( get_alpha_index(11)<< 22 ) | ( get_alpha_index(12)<< 24 ) | ( get_alpha_index(13)<< 26 ) | ( get_alpha_index(14)<< 28 )
+ | ( get_alpha_index(15)<< 30 ); // indices are OR-ed unmasked; assumes each fits its slot - TODO confirm
+}
+void block_package6( out uint4 block, uint threadBase ) // Packs one 128-bit block into block.xyzw: tag 0x40, subset-0 RGBA endpoints (top 7 bits each), two low .r bits, then 16 color indices. NOTE(review): presumably BC7 mode 6 - confirm.
+{ // threadBase is not referenced directly in this body; presumably consumed by the get_* helpers - TODO confirm.
+ block.x = 0x40
+ | ( ( get_end_point_l(0).r & 0xFE ) << 6 ) | ( ( get_end_point_h(0).r & 0xFE ) << 13 ) // & 0xFE keeps the top seven bits of a channel
+ | ( ( get_end_point_l(0).g & 0xFE ) << 20 ) | ( ( get_end_point_h(0).g & 0xFE ) << 27 ); // four of the seven kept bits of h(0).g land in bits 28-31
+ block.y = ( ( get_end_point_h(0).g & 0xFE ) >> 5 ) | ( ( get_end_point_l(0).b & 0xFE ) << 2 ) // first term: the three bits of h(0).g that did not fit in block.x
+ | ( ( get_end_point_h(0).b & 0xFE ) << 9 ) | ( ( get_end_point_l(0).a & 0xFE ) << 16 )
+ | ( ( get_end_point_h(0).a & 0xFE ) << 23 )
+ | ( get_end_point_l(0).r & 0x01 ) << 31; // << binds tighter than |, so this is (l(0).r & 1) << 31 OR-ed into the rest
+ block.z = ( get_end_point_h(0).r & 0x01 ) // lowest bit of h(0).r at bit 0; NOTE(review): presumably the two per-endpoint p-bits - confirm
+ | ( get_color_index(0) << 1 ) | ( get_color_index(1) << 4 ) | ( get_color_index(2) << 8 ) | ( get_color_index(3) << 12 ) // index 0 has three bits of room, the others are spaced four bits apart
+ | ( get_color_index(4) << 16 ) | ( get_color_index(5) << 20 ) | ( get_color_index(6) << 24 ) | ( get_color_index(7) << 28 );
+ block.w = ( get_color_index(8) << 0 ) | ( get_color_index(9) << 4 ) | ( get_color_index(10)<< 8 ) | ( get_color_index(11)<< 12 )
+ | ( get_color_index(12)<< 16 ) | ( get_color_index(13)<< 20 ) | ( get_color_index(14)<< 24 ) | ( get_color_index(15)<< 28 ); // indices are OR-ed unmasked; assumes each fits its slot - TODO confirm
+}
+void block_package7( out uint4 block, uint partition, uint threadBase ) // Packs one 128-bit block into block.xyzw: tag 0x80, partition, top-5-bit RGBA endpoints for subsets 0..1, four extra .r bits, then the color indices. NOTE(review): presumably BC7 mode 7 - confirm.
+{ // threadBase is not referenced directly in this body; presumably consumed by the get_* helpers - TODO confirm.
+ block.x = 0x80 | ( partition << 8 ) // tag in the low bits, partition starting at bit 8 (not masked here)
+ | ( ( get_end_point_l(0).r & 0xF8 ) << 11 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 16 ) // & 0xF8 keeps the top five bits of a channel
+ | ( ( get_end_point_l(1).r & 0xF8 ) << 21 ) | ( ( get_end_point_h(1).r & 0xF8 ) << 26 ); // three of the five kept bits of h(1).r land in bits 29-31
+ block.y = ( ( get_end_point_h(1).r & 0xF8 ) >> 6 ) | ( ( get_end_point_l(0).g & 0xF8 ) >> 1 ) // first term: the two bits of h(1).r that did not fit in block.x
+ | ( ( get_end_point_h(0).g & 0xF8 ) << 4 ) | ( ( get_end_point_l(1).g & 0xF8 ) << 9 )
+ | ( ( get_end_point_h(1).g & 0xF8 ) << 14 ) | ( ( get_end_point_l(0).b & 0xF8 ) << 19 )
+ | ( ( get_end_point_h(0).b & 0xF8 ) << 24 );
+ block.z = ( ( get_end_point_l(1).b & 0xF8 ) >> 3 ) | ( ( get_end_point_h(1).b & 0xF8 ) << 2 )
+ | ( ( get_end_point_l(0).a & 0xF8 ) << 7 ) | ( ( get_end_point_h(0).a & 0xF8 ) << 12 )
+ | ( ( get_end_point_l(1).a & 0xF8 ) << 17 ) | ( ( get_end_point_h(1).a & 0xF8 ) << 22 )
+ | ( ( get_end_point_l(0).r & 0x04 ) << 28 ) | ( ( get_end_point_h(0).r & 0x04 ) << 29 ); // bit 2 of .r (the bit just below the kept five) stored separately; NOTE(review): presumably the per-endpoint p-bit - confirm
+ block.w = ( ( get_end_point_l(1).r & 0x04 ) >> 2 ) | ( ( get_end_point_h(1).r & 0x04 ) >> 1 )
+ | ( get_color_index(0) << 2 ); // index 0 starts at bit 2 and is followed by index 1 at bit 3, i.e. it has one bit of room
+ uint i = 1;
+ for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ ) // indices up to and including the fix-up position
+ {
+ block.w |= get_color_index(i) << ( i * 2 + 1 );
+ }
+ for ( ; i < 16; i ++ ) // offset drops by one: the index at the fix-up position only had one bit of room
+ {
+ block.w |= get_color_index(i) << ( i * 2 ); // values are OR-ed unmasked; assumes each index fits its slot - TODO confirm
+ }
+} \ No newline at end of file
diff --git a/tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl b/tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl
new file mode 100644
index 000000000..798eea2ff
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl
@@ -0,0 +1,72 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//--------------------------------------------------------------------------------------
+// File: BasicCompute11.hlsl
+//
+// This file contains the Compute Shader to perform array A + array B
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#ifdef USE_STRUCTURED_BUFFERS
+
+struct BufType // Element type of the three structured buffers declared below
+{
+ int i; // summed as an integer by CSMain
+ float f; // summed as a float by CSMain
+#ifdef TEST_DOUBLE
+ double d; // only present (and only summed) when TEST_DOUBLE is defined
+#endif
+};
+
+StructuredBuffer<BufType> Buffer0 : register(t0); // first read-only input
+StructuredBuffer<BufType> Buffer1 : register(t1); // second read-only input
+RWStructuredBuffer<BufType> BufferOut : register(u0); // writable output, receives Buffer0 + Buffer1 per element
+
+[numthreads(1, 1, 1)]
+void CSMain( uint3 DTid : SV_DispatchThreadID )
+{
+    // Structured-buffer path: each thread adds one element of Buffer0 to the
+    // matching element of Buffer1, field by field, and writes it to BufferOut.
+    const uint uElem = DTid.x;
+    const BufType lhs = Buffer0[uElem];
+    const BufType rhs = Buffer1[uElem];
+
+    BufferOut[uElem].i = lhs.i + rhs.i;
+    BufferOut[uElem].f = lhs.f + rhs.f;
+#ifdef TEST_DOUBLE
+    // The double member only exists when TEST_DOUBLE is defined.
+    BufferOut[uElem].d = lhs.d + rhs.d;
+#endif
+}
+
+#else // The following code is for raw buffers
+
+ByteAddressBuffer Buffer0 : register(t0); // first read-only input, addressed by byte offset in CSMain
+ByteAddressBuffer Buffer1 : register(t1); // second read-only input, same record layout as Buffer0
+RWByteAddressBuffer BufferOut : register(u0); // writable output, receives the per-record sums
+
+[numthreads(1, 1, 1)]
+void CSMain( uint3 DTid : SV_DispatchThreadID )
+{
+    // Raw-buffer path: every record is an int followed by a float, plus a
+    // double (stored as two 32-bit words) when TEST_DOUBLE is defined.
+#ifdef TEST_DOUBLE
+    const uint uRecordBytes = 16;
+#else
+    const uint uRecordBytes = 8;
+#endif
+    const uint uIntAddr = DTid.x * uRecordBytes;
+    const uint uFloatAddr = uIntAddr + 4;
+
+    // Reinterpret the raw words and add them in their real types.
+    const int iSum = asint( Buffer0.Load( uIntAddr ) ) + asint( Buffer1.Load( uIntAddr ) );
+    const float fSum = asfloat( Buffer0.Load( uFloatAddr ) ) + asfloat( Buffer1.Load( uFloatAddr ) );
+
+#ifdef TEST_DOUBLE
+    const uint uLoAddr = uIntAddr + 8;
+    const uint uHiAddr = uIntAddr + 12;
+    const double dLhs = asdouble( Buffer0.Load( uLoAddr ), Buffer0.Load( uHiAddr ) );
+    const double dRhs = asdouble( Buffer1.Load( uLoAddr ), Buffer1.Load( uHiAddr ) );
+#endif
+
+    BufferOut.Store( uIntAddr, asuint( iSum ) );
+    BufferOut.Store( uFloatAddr, asuint( fSum ) );
+
+#ifdef TEST_DOUBLE
+    // Split the double sum back into its low and high 32-bit words.
+    uint uSumLo, uSumHi;
+    asuint( dLhs + dRhs, uSumLo, uSumHi );
+
+    BufferOut.Store( uLoAddr, uSumLo );
+    BufferOut.Store( uHiAddr, uSumHi );
+#endif
+}
+
+#endif // USE_STRUCTURED_BUFFERS
diff --git a/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx
new file mode 100644
index 000000000..bd28f862b
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx
@@ -0,0 +1,158 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: BasicHLSL.fx
+//
+// The effect file for the BasicHLSL sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Global variables
+//--------------------------------------------------------------------------------------
+float4 g_MaterialAmbientColor; // Material's ambient color
+float4 g_MaterialDiffuseColor; // Material's diffuse color
+int g_nNumLights; // not referenced by the shaders in this file; RenderSceneVS takes its light count as a uniform parameter
+
+float3 g_LightDir; // Light's direction in world space
+float4 g_LightDiffuse; // Light's diffuse color
+float4 g_LightAmbient; // Light's ambient color
+
+texture g_MeshTexture; // Color texture for mesh
+
+float g_fTime; // App's time in seconds (not referenced by the shaders in this file)
+float4x4 g_mWorld; // World matrix for object
+float4x4 g_mWorldViewProjection; // World * View * Projection matrix
+
+
+
+//--------------------------------------------------------------------------------------
+// Texture samplers
+//--------------------------------------------------------------------------------------
+sampler MeshTextureSampler = // Sampler used by RenderScenePS to read g_MeshTexture
+sampler_state
+{
+ Texture = <g_MeshTexture>; // binds the sampler to the mesh color texture
+ MipFilter = LINEAR; // linear filtering between mip levels
+ MinFilter = LINEAR; // linear filtering when minifying
+ MagFilter = LINEAR; // linear filtering when magnifying
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex shader output structure
+//--------------------------------------------------------------------------------------
+struct VS_OUTPUT // Returned by RenderSceneVS and taken as input by RenderScenePS
+{
+ float4 Position : POSITION; // vertex position
+ float4 Diffuse : COLOR0; // vertex diffuse color (note that COLOR0 is clamped from 0..1)
+ float2 TextureUV : TEXCOORD0; // vertex texture coords
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader computes standard transform and lighting
+//--------------------------------------------------------------------------------------
+VS_OUTPUT RenderSceneVS( float4 vPos : POSITION,
+ float3 vNormal : NORMAL,
+ float2 vTexCoord0 : TEXCOORD0,
+ uniform int nNumLights,
+ uniform bool bTexture,
+ uniform bool bAnimate )
+{
+    // Note: bAnimate is accepted for interface compatibility but is not used
+    // anywhere in this body.
+    VS_OUTPUT Result;
+
+    // Object space -> homogeneous projection space.
+    Result.Position = mul( vPos, g_mWorldViewProjection );
+
+    // Object-space normal -> unit-length world-space normal.
+    const float3 vWorldNormal = normalize( mul( vNormal, (float3x3)g_mWorld ) );
+
+    // Simple directional lighting: the same light term is accumulated once
+    // per requested light.
+    float3 vLitDiffuse = float3( 0, 0, 0 );
+    for( int iLight = 0; iLight < nNumLights; iLight++ )
+    {
+        vLitDiffuse += g_LightDiffuse * max( 0, dot( vWorldNormal, g_LightDir ) );
+    }
+
+    Result.Diffuse.rgb = g_MaterialDiffuseColor * vLitDiffuse +
+                         g_MaterialAmbientColor * g_LightAmbient;
+    Result.Diffuse.a = 1.0f;
+
+    // Pass the texture coordinate through, or zero it when texturing is off.
+    Result.TextureUV = bTexture ? vTexCoord0 : float2( 0, 0 );
+
+    return Result;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel shader output structure
+//--------------------------------------------------------------------------------------
+struct PS_OUTPUT // Returned by RenderScenePS
+{
+ float4 RGBColor : COLOR0; // Pixel color
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader outputs the pixel's color by modulating the texture's
+// color with diffuse material color
+//--------------------------------------------------------------------------------------
+PS_OUTPUT RenderScenePS( VS_OUTPUT In,
+ uniform bool bTexture )
+{
+    PS_OUTPUT Result;
+
+    // Untextured default: the interpolated diffuse color alone.
+    Result.RGBColor = In.Diffuse;
+
+    // Textured: modulate the mesh texture sample with the diffuse color.
+    if( bTexture )
+    {
+        Result.RGBColor = tex2D( MeshTextureSampler, In.TextureUV ) * In.Diffuse;
+    }
+
+    return Result;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Renders scene to render target
+//--------------------------------------------------------------------------------------
+technique RenderSceneWithTexture1Light // Textured rendering with one light
+{
+ pass P0
+ {
+ VertexShader = compile vs_2_0 RenderSceneVS( 1, true, true ); // arguments are ( nNumLights, bTexture, bAnimate )
+ PixelShader = compile ps_2_0 RenderScenePS( true ); // trivial pixel shader (could use FF instead if desired)
+ }
+}
+
+technique RenderSceneWithTexture2Light // Textured rendering with two lights
+{
+ pass P0
+ {
+ VertexShader = compile vs_2_0 RenderSceneVS( 2, true, true ); // arguments are ( nNumLights, bTexture, bAnimate )
+ PixelShader = compile ps_2_0 RenderScenePS( true ); // trivial pixel shader (could use FF instead if desired)
+ }
+}
+
+technique RenderSceneWithTexture3Light // Textured rendering with three lights
+{
+ pass P0
+ {
+ VertexShader = compile vs_2_0 RenderSceneVS( 3, true, true ); // arguments are ( nNumLights, bTexture, bAnimate )
+ PixelShader = compile ps_2_0 RenderScenePS( true ); // trivial pixel shader (could use FF instead if desired)
+ }
+}
+
+technique RenderSceneNoTexture // Untextured rendering with one light
+{
+ pass P0
+ {
+ VertexShader = compile vs_2_0 RenderSceneVS( 1, false, false ); // arguments are ( nNumLights, bTexture, bAnimate ); bTexture = false zeroes the UV output
+ PixelShader = compile ps_2_0 RenderScenePS( false ); // trivial pixel shader (could use FF instead if desired)
+ }
+}
diff --git a/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl
new file mode 100644
index 000000000..78fff9eeb
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl
@@ -0,0 +1,51 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: BasicHLSL11_PS.hlsl
+//
+// The pixel shader file for the BasicHLSL11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 ) // Constant buffer in slot b0
+{
+ float4 g_vObjectColor : packoffset( c0 ); // placed at c0; not referenced by PSMain in this file
+};
+
+cbuffer cbPerFrame : register( b1 ) // Constant buffer in slot b1; both members are read by PSMain
+{
+ float3 g_vLightDir : packoffset( c0 ); // light direction, dotted with the surface normal in PSMain
+ float g_fAmbient : packoffset( c0.w ); // shares register c0 with g_vLightDir (w component); lower bound for the lighting term
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse : register( t0 ); // diffuse texture sampled by PSMain
+SamplerState g_samLinear : register( s0 ); // sampler used for g_txDiffuse; NOTE(review): name suggests linear filtering, but the filter state is set outside this file - confirm
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct PS_INPUT // Interpolated per-pixel inputs consumed by PSMain
+{
+ float3 vNormal : NORMAL; // dotted with g_vLightDir as-is (not renormalized in PSMain)
+ float2 vTexcoord : TEXCOORD0; // coordinates used to sample g_txDiffuse
+};
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PSMain( PS_INPUT Input ) : SV_TARGET
+{
+    // Clamped N.L term, never allowed to drop below the ambient level.
+    const float fNdotL = saturate( dot( g_vLightDir, Input.vNormal ) );
+    const float fLight = max( fNdotL, g_fAmbient );
+
+    // Modulate the diffuse texture sample by the lighting factor.
+    const float4 vTexel = g_txDiffuse.Sample( g_samLinear, Input.vTexcoord );
+    return vTexel * fLight;
+}
+
diff --git a/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl
new file mode 100644
index 000000000..cb2c1b950
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl
@@ -0,0 +1,49 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain
+//--------------------------------------------------------------------------------------
+// File: BasicHLSL11_VS.hlsl
+//
+// The vertex shader file for the BasicHLSL11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 ) // Constant buffer in slot b0; both matrices are read by VSMain
+{
+ matrix g_mWorldViewProjection : packoffset( c0 ); // transforms the input position in VSMain
+ matrix g_mWorld : packoffset( c4 ); // placed at c4; its upper-left 3x3 transforms the normal in VSMain
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT // Per-vertex inputs consumed by VSMain
+{
+ float4 vPosition : POSITION; // multiplied by g_mWorldViewProjection
+ float3 vNormal : NORMAL; // multiplied by the 3x3 part of g_mWorld
+ float2 vTexcoord : TEXCOORD0; // passed through unchanged
+};
+
+struct VS_OUTPUT // Values written by VSMain
+{
+ float3 vNormal : NORMAL; // normal after the g_mWorld 3x3 transform (not normalized by VSMain)
+ float2 vTexcoord : TEXCOORD0; // copy of the input texture coordinate
+ float4 vPosition : SV_POSITION; // position after the g_mWorldViewProjection transform
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+    VS_OUTPUT Result;
+
+    // Texture coordinates pass straight through.
+    Result.vTexcoord = Input.vTexcoord;
+
+    // Normal is rotated by the 3x3 part of the world matrix (no renormalization).
+    Result.vNormal = mul( Input.vNormal, (float3x3)g_mWorld );
+
+    // Position goes to clip space.
+    Result.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+
+    return Result;
+}
+
diff --git a/tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx b/tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx
new file mode 100644
index 000000000..1ecc1930a
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx
@@ -0,0 +1,181 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: BasicHLSLFX11.fx
+//
+// The effect file for the BasicHLSL sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Global variables
+//--------------------------------------------------------------------------------------
+float4 g_MaterialAmbientColor; // Material's ambient color
+float4 g_MaterialDiffuseColor; // Material's diffuse color
+int g_nNumLights; // not referenced by the shaders in this file; RenderSceneVS takes its light count as a uniform parameter
+
+float3 g_LightDir[3]; // Light's direction in world space, one entry per light (indexed by the loop in RenderSceneVS)
+float4 g_LightDiffuse[3]; // Light's diffuse color, one entry per light
+float4 g_LightAmbient; // Light's ambient color
+
+Texture2D g_MeshTexture; // Color texture for mesh
+
+float g_fTime; // App's time in seconds (drives the vertex animation in RenderSceneVS)
+float4x4 g_mWorld; // World matrix for object
+float4x4 g_mWorldViewProjection; // World * View * Projection matrix
+
+//--------------------------------------------------------------------------------------
+// DepthStates
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth // Depth state applied by every technique11 pass in this file
+{
+ DepthEnable = TRUE; // depth testing on
+ DepthWriteMask = ALL; // depth writes on
+ DepthFunc = LESS_EQUAL; // comparison function used for the depth test
+};
+
+//--------------------------------------------------------------------------------------
+// Texture samplers
+//--------------------------------------------------------------------------------------
+SamplerState MeshTextureSampler // Sampler used by RenderScenePS to read g_MeshTexture
+{
+ Filter = MIN_MAG_MIP_LINEAR; // linear filtering for minification, magnification and mip selection
+ AddressU = Wrap; // wrap addressing along U
+ AddressV = Wrap; // wrap addressing along V
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex shader output structure
+//--------------------------------------------------------------------------------------
+struct VS_OUTPUT // Returned by RenderSceneVS and taken as input by RenderScenePS
+{
+ float4 Position : SV_POSITION; // vertex position
+ float4 Diffuse : COLOR0; // vertex diffuse color (note that COLOR0 is clamped from 0..1)
+ float2 TextureUV : TEXCOORD0; // vertex texture coords
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader computes standard transform and lighting
+//--------------------------------------------------------------------------------------
+VS_OUTPUT RenderSceneVS( float4 vPos : POSITION,
+ float3 vNormal : NORMAL,
+ float2 vTexCoord0 : TEXCOORD,
+ uniform int nNumLights,
+ uniform bool bTexture,
+ uniform bool bAnimate )
+{
+    VS_OUTPUT Result;
+
+    // Animate the vertex along its normal, based on time.
+    float4 vMovedPos = vPos;
+    if( bAnimate )
+    {
+        const float fWave = sin( g_fTime + 5.5 ) + 0.5;
+        vMovedPos += float4( vNormal, 0 ) * fWave * 5;
+    }
+
+    // Object space -> homogeneous projection space.
+    Result.Position = mul( vMovedPos, g_mWorldViewProjection );
+
+    // Object-space normal -> unit-length world-space normal.
+    const float3 vWorldNormal = normalize( mul( vNormal, (float3x3)g_mWorld ) );
+
+    // Simple directional lighting, summed over the first nNumLights lights.
+    float3 vLitDiffuse = float3( 0, 0, 0 );
+    for( int iLight = 0; iLight < nNumLights; iLight++ )
+    {
+        vLitDiffuse += g_LightDiffuse[iLight] * max( 0, dot( vWorldNormal, g_LightDir[iLight] ) );
+    }
+
+    Result.Diffuse.rgb = g_MaterialDiffuseColor * vLitDiffuse +
+                         g_MaterialAmbientColor * g_LightAmbient;
+    Result.Diffuse.a = 1.0f;
+
+    // Pass the texture coordinate through, or zero it when texturing is off.
+    Result.TextureUV = bTexture ? vTexCoord0 : float2( 0, 0 );
+
+    return Result;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel shader output structure
+//--------------------------------------------------------------------------------------
+struct PS_OUTPUT // Returned by RenderScenePS
+{
+ float4 RGBColor : SV_Target; // Pixel color
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader outputs the pixel's color by modulating the texture's
+// color with diffuse material color
+//--------------------------------------------------------------------------------------
+PS_OUTPUT RenderScenePS( VS_OUTPUT In,
+ uniform bool bTexture )
+{
+    PS_OUTPUT Result;
+
+    // Untextured default: the interpolated diffuse color alone.
+    Result.RGBColor = In.Diffuse;
+
+    // Textured: modulate the mesh texture sample with the diffuse color.
+    if( bTexture )
+    {
+        Result.RGBColor = g_MeshTexture.Sample( MeshTextureSampler, In.TextureUV ) * In.Diffuse;
+    }
+
+    return Result;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Renders scene to render target using D3D11 Techniques
+//--------------------------------------------------------------------------------------
+technique11 RenderSceneWithTexture1Light // Textured, animated rendering with one light
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0_level_9_1, RenderSceneVS( 1, true, true ) ) ); // arguments are ( nNumLights, bTexture, bAnimate )
+ SetGeometryShader( NULL ); // no geometry shader stage
+ SetPixelShader( CompileShader( ps_4_0_level_9_1, RenderScenePS( true ) ) ); // argument is bTexture
+
+ SetDepthStencilState( EnableDepth, 0 );
+ }
+}
+
+technique11 RenderSceneWithTexture2Light // Textured, animated rendering with two lights
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0_level_9_1, RenderSceneVS( 2, true, true ) ) ); // arguments are ( nNumLights, bTexture, bAnimate )
+ SetGeometryShader( NULL ); // no geometry shader stage
+ SetPixelShader( CompileShader( ps_4_0_level_9_1, RenderScenePS( true ) ) ); // argument is bTexture
+
+ SetDepthStencilState( EnableDepth, 0 );
+ }
+}
+
+technique11 RenderSceneWithTexture3Light // Textured, animated rendering with three lights
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0_level_9_1, RenderSceneVS( 3, true, true ) ) ); // arguments are ( nNumLights, bTexture, bAnimate )
+ SetGeometryShader( NULL ); // no geometry shader stage
+ SetPixelShader( CompileShader( ps_4_0_level_9_1, RenderScenePS( true ) ) ); // argument is bTexture
+
+ SetDepthStencilState( EnableDepth, 0 );
+ }
+}
+
+technique11 RenderSceneNoTexture // Untextured rendering with one light
+{
+ pass P0
+ {
+ // Arguments are ( nNumLights, bTexture, bAnimate ). The untextured technique
+ // compiles the vertex shader with texturing and animation off, so the VS
+ // variant agrees with the bTexture = false pixel shader below.
+ SetVertexShader( CompileShader( vs_4_0_level_9_1, RenderSceneVS( 1, false, false ) ) );
+ SetGeometryShader( NULL ); // no geometry shader stage
+ SetPixelShader( CompileShader( ps_4_0_level_9_1, RenderScenePS( false ) ) ); // argument is bTexture
+
+ SetDepthStencilState( EnableDepth, 0 );
+ }
+} \ No newline at end of file
diff --git a/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeScene.hlsl b/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeScene.hlsl
new file mode 100644
index 000000000..6a6dca0c4
--- /dev/null
+++ b/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeScene.hlsl
@@ -0,0 +1,506 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: RenderCascadeScene.hlsl
+//
+// This is the main shader file. This shader is compiled with several different flags
+// to provide different customizations based on user controls.
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+
+// This flag uses the derivative information to map the texels in a shadow map to the
+// view space plane of the primitive being rendered. This depth is then used as the
+// comparison depth and reduces self shadowing aliasing. This technique is expensive
+// and is only valid when objects are planar ( such as a ground plane ).
+#ifndef USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG
+#define USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG 0
+#endif
+
+// This flag enables the shadow to blend between cascades. This is most useful when
+// the shadow maps are small and artifacts can be seen between the various cascade layers.
+#ifndef BLEND_BETWEEN_CASCADE_LAYERS_FLAG
+#define BLEND_BETWEEN_CASCADE_LAYERS_FLAG 0
+#endif
+
+// There are two methods for selecting the proper cascade a fragment lies in. Interval selection
+// compares the depth of the fragment against the frustum's depth partition.
+// Map based selection compares the texture coordinates against the actual cascade maps.
+// Map based selection gives better coverage.
+// Interval based selection is easier to extend and understand.
+#ifndef SELECT_CASCADE_BY_INTERVAL_FLAG
+#define SELECT_CASCADE_BY_INTERVAL_FLAG 0
+#endif
+
+// The number of cascades ( the per-cascade arrays in this file hold 8 entries ).
+#ifndef CASCADE_COUNT_FLAG
+#define CASCADE_COUNT_FLAG 3
+#endif
+
+
+// Most titles will find that 3-4 cascades with
+// BLEND_BETWEEN_CASCADE_LAYERS_FLAG enabled are good for lower end PCs.
+// High end PCs will be able to handle more cascades, and larger blur bands.
+// In some cases such as when large PCF kernels are used, derivative based depth offsets could be used
+// with larger PCF blur kernels on high end PCs for the ground plane.
+
+cbuffer cbAllShadowData : register( b0 )
+{
+ matrix m_mWorldViewProjection; // Takes the vertex position to SV_POSITION in VSMain.
+ matrix m_mWorld; // Its upper 3x3 transforms the vertex normal in VSMain.
+ matrix m_mWorldView; // The z it produces is the per-pixel depth used by interval selection.
+ matrix m_mShadow; // Takes the vertex position to the shadow coordinate shared by all cascades.
+ float4 m_vCascadeOffset[8]; // Per-cascade offset added after m_vCascadeScale.
+ float4 m_vCascadeScale[8]; // Per-cascade scale applied to the shared shadow coordinate.
+ int m_nCascadeLevels; // Number of Cascades
+ int m_iVisualizeCascades; // 1 is to visualize the cascades in different colors. 0 is to just draw the scene
+ int m_iPCFBlurForLoopStart; // For loop begin value. For a 5x5 kernel this would be -2.
+ int m_iPCFBlurForLoopEnd; // For loop end value. For a 5x5 kernel this would be 3.
+
+ // For Map based selection scheme, this keeps the pixels inside of the valid range.
+ // When there is no border, these values are 0 and 1 respectively.
+ float m_fMinBorderPadding;
+ float m_fMaxBorderPadding;
+ float m_fShadowBiasFromGUI; // A shadow map offset to deal with self shadow artifacts.
+ //These artifacts are aggravated by PCF.
+ float m_fShadowPartitionSize; // Width in x of one cascade's band in the shadow texture coordinate.
+ float m_fCascadeBlendArea; // Amount to overlap when blending between cascades.
+ float m_fTexelSize; // Step in y between PCF taps; also the texel step used for the depth deltas.
+ float m_fNativeTexelSizeInX; // Step in x between PCF taps.
+ float m_fPaddingForCB3; // Padding variables exist because CBs must be a multiple of 16 bytes.
+ float4 m_fCascadeFrustumsEyeSpaceDepthsFloat[2]; // The values along Z that separate the cascades.
+ float4 m_fCascadeFrustumsEyeSpaceDepthsFloat4[8]; // The values along Z that separate the cascades.
+ // Wastefully stored in float4 so they are array indexable.
+ float3 m_vLightDir; // Dotted with the surface normal for the direct lighting term in PSMain.
+ float m_fPaddingCB4;
+
+};
+
+
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse : register( t0 );
+Texture2D g_txShadow : register( t5 );
+
+
+SamplerState g_samLinear : register( s0 );
+SamplerComparisonState g_samShadow : register( s5 );
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+ float4 vPosition : POSITION; // Untransformed vertex position; VSMain applies every matrix to it.
+ float3 vNormal : NORMAL; // Vertex normal; VSMain multiplies it by the upper 3x3 of m_mWorld.
+ float2 vTexcoord : TEXCOORD0; // Copied to the output unchanged.
+};
+
+struct VS_OUTPUT
+{
+ float3 vNormal : NORMAL; // Input normal times the upper 3x3 of m_mWorld ( not renormalized in this file ).
+ float2 vTexcoord : TEXCOORD0; // Coordinate used to sample g_txDiffuse.
+ float4 vTexShadow : TEXCOORD1; // Input position times m_mShadow; scaled and offset per cascade in the pixel shader.
+ float4 vPosition : SV_POSITION; // Input position times m_mWorldViewProjection.
+ float4 vInterpPos : TEXCOORD2; // Untransformed input position.
+ float vDepth : TEXCOORD3; // z of the input position times m_mWorldView.
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+    // Every transform below starts from the same untransformed vertex position.
+    const float4 vObjectPos = Input.vPosition;
+    const float4 vViewPos = mul( vObjectPos, m_mWorldView );
+
+    VS_OUTPUT Result;
+    Result.vInterpPos = vObjectPos;
+    Result.vTexcoord  = Input.vTexcoord;
+    Result.vNormal    = mul( Input.vNormal, (float3x3)m_mWorld );
+    Result.vDepth     = vViewPos.z;                          // depth handed to the pixel shader
+    Result.vTexShadow = mul( vObjectPos, m_mShadow );        // one coordinate shared by all the cascades
+    Result.vPosition  = mul( vObjectPos, m_mWorldViewProjection );
+    return Result;
+}
+
+
+
+static const float4 vCascadeColorsMultiplier[8] = // Tint per cascade; PSMain indexes it with the selected cascade and multiplies it into the result.
+{
+ float4 ( 1.5f, 0.0f, 0.0f, 1.0f ), // cascade 0
+ float4 ( 0.0f, 1.5f, 0.0f, 1.0f ), // cascade 1
+ float4 ( 0.0f, 0.0f, 5.5f, 1.0f ), // cascade 2
+ float4 ( 1.5f, 0.0f, 5.5f, 1.0f ), // cascade 3
+ float4 ( 1.5f, 1.5f, 0.0f, 1.0f ), // cascade 4
+ float4 ( 1.0f, 1.0f, 1.0f, 1.0f ), // cascade 5
+ float4 ( 0.0f, 1.0f, 5.5f, 1.0f ), // cascade 6
+ float4 ( 0.5f, 3.5f, 0.75f, 1.0f ) // cascade 7
+};
+
+
+void ComputeCoordinatesTransform( in int iCascadeIndex,
+                                  in float4 InterpolatedPosition ,
+                                  in out float4 vShadowTexCoord ,
+                                  in out float4 vShadowTexCoordViewSpace )
+{
+    // InterpolatedPosition is part of the signature but is not read here.
+    // With interval selection the caller has only picked a cascade index, so the
+    // view-space shadow coordinate is scaled and offset for that cascade here.
+    if( SELECT_CASCADE_BY_INTERVAL_FLAG )
+    {
+        vShadowTexCoord = vShadowTexCoordViewSpace * m_vCascadeScale[iCascadeIndex]
+                        + m_vCascadeOffset[iCascadeIndex];
+    }
+
+    // Shrink x to one partition width, then shift it by iCascadeIndex partitions.
+    vShadowTexCoord.x = vShadowTexCoord.x * m_fShadowPartitionSize + (m_fShadowPartitionSize * (float)iCascadeIndex );
+}
+
+
+//--------------------------------------------------------------------------------------
+// This function calculates the screen space depth for shadow space texels
+//--------------------------------------------------------------------------------------
+void CalculateRightAndUpTexelDepthDeltas ( in float3 vShadowTexDDX,
+ in float3 vShadowTexDDY,
+ out float fUpTextDepthWeight,
+ out float fRightTextDepthWeight
+ ) {
+ // Inputs: screen-space x / y derivatives of the shadow coordinate. Outputs: depth change per shadow texel step up / right.
+ // We use the derivatives in X and Y to create a transformation matrix. Because these derivatives give us the
+ // transformation from screen space to shadow space, we need the inverse matrix to take us from shadow space
+ // to screen space. This new matrix will allow us to map shadow map texels to screen space. This will allow
+ // us to find the screen space depth of a corresponding depth pixel.
+ // This is not a perfect solution as it assumes the underlying geometry of the scene is a plane. A more
+ // accurate way of finding the actual depth would be to do a deferred rendering approach and actually
+ // sample the depth.
+
+ // Using an offset, or using variance shadow maps is a better approach to reducing these artifacts in most cases.
+
+ float2x2 matScreentoShadow = float2x2( vShadowTexDDX.xy, vShadowTexDDY.xy );
+ float fDeterminant = determinant ( matScreentoShadow );
+
+ float fInvDeterminant = 1.0f / fDeterminant; // NOTE(review): a zero determinant is not guarded against.
+
+ float2x2 matShadowToScreen = float2x2 ( // Adjugate of matScreentoShadow scaled by 1 / determinant, i.e. its inverse.
+ matScreentoShadow._22 * fInvDeterminant, matScreentoShadow._12 * -fInvDeterminant,
+ matScreentoShadow._21 * -fInvDeterminant, matScreentoShadow._11 * fInvDeterminant );
+
+ float2 vRightShadowTexelLocation = float2( m_fTexelSize, 0.0f );
+ float2 vUpShadowTexelLocation = float2( 0.0f, m_fTexelSize );
+
+ // Transform the right and up texel steps by the shadow space to screen space matrix.
+ float2 vRightTexelDepthRatio = mul( vRightShadowTexelLocation, matShadowToScreen );
+ float2 vUpTexelDepthRatio = mul( vUpShadowTexelLocation, matShadowToScreen );
+
+ // We can now calculate how much depth changes when you move up or right in the shadow map.
+ // We use the ratio of change in x and y times the derivative in X and Y of the screen space
+ // depth to calculate this change.
+ fUpTextDepthWeight =
+ vUpTexelDepthRatio.x * vShadowTexDDX.z
+ + vUpTexelDepthRatio.y * vShadowTexDDY.z;
+ fRightTextDepthWeight =
+ vRightTexelDepthRatio.x * vShadowTexDDX.z
+ + vRightTexelDepthRatio.y * vShadowTexDDY.z;
+
+}
+
+
+//--------------------------------------------------------------------------------------
+// Use PCF to sample the depth map and return a percent lit value.
+//--------------------------------------------------------------------------------------
+void CalculatePCFPercentLit ( in float4 vShadowTexCoord,
+                              in float fRightTexelDepthDelta,
+                              in float fUpTexelDepthDelta,
+                              in float fBlurRowSize,
+                              out float fPercentLit
+                              )
+{
+    // Running sum of the comparison results; divided by fBlurRowSize at the end.
+    float fLitSum = 0.0f;
+
+    // The kernel bounds come from the constant buffer. With a fixed kernel size the loops
+    // could be unrolled and immediate texture offsets used, which would be faster.
+    for( int iCol = m_iPCFBlurForLoopStart; iCol < m_iPCFBlurForLoopEnd; ++iCol )
+    {
+        for( int iRow = m_iPCFBlurForLoopStart; iRow < m_iPCFBlurForLoopEnd; ++iRow )
+        {
+            const float fColOffset = (float) iCol;
+            const float fRowOffset = (float) iRow;
+
+            // Constant bias against self shadowing: too much causes Peter-panning ( shadows
+            // near the base of an object disappear ), too little leaves shadow acne.
+            float fReferenceDepth = vShadowTexCoord.z - m_fShadowBiasFromGUI;
+            if ( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG )
+            {
+                // Derivative based depth correction for the texel this tap lands on.
+                fReferenceDepth += fRightTexelDepthDelta * fColOffset + fUpTexelDepthDelta * fRowOffset;
+            }
+            // Compare the reference depth against the shadow map at this tap.
+            const float2 vTapCoord = float2( vShadowTexCoord.x + ( fColOffset * m_fNativeTexelSizeInX ),
+                                             vShadowTexCoord.y + ( fRowOffset * m_fTexelSize ) );
+            fLitSum += g_txShadow.SampleCmpLevelZero( g_samShadow, vTapCoord, fReferenceDepth );
+        }
+    }
+    fPercentLit = fLitSum / fBlurRowSize;
+}
+
+//--------------------------------------------------------------------------------------
+// Calculate amount to blend between two cascades and the band where blending will occur.
+//--------------------------------------------------------------------------------------
+void CalculateBlendAmountForInterval ( in int iCurrentCascadeIndex,
+                                       in out float fPixelDepth,
+                                       in out float fCurrentPixelsBlendBandLocation,
+                                       out float fBlendBetweenCascadesAmount
+                                       )
+{
+    // Locates the pixel inside its cascade's depth interval. fPixelDepth is in/out and is rebased to the interval start.
+    float fBlendInterval = m_fCascadeFrustumsEyeSpaceDepthsFloat4[ iCurrentCascadeIndex ].x;
+    // The interval begins at the split depth of the previous cascade. Cascade 0 has no previous
+    // cascade and stays measured from depth 0. The former min( 0, index - 1 ) lookup read
+    // element -1 for cascade 0 and element 0 for every other cascade.
+    if( iCurrentCascadeIndex > 0 )
+    {
+        float fIntervalStart = m_fCascadeFrustumsEyeSpaceDepthsFloat4[ iCurrentCascadeIndex - 1 ].x;
+        fPixelDepth -= fIntervalStart;
+        fBlendInterval -= fIntervalStart;
+    }
+    // 1 at the start of the interval, 0 at its far end; the caller compares this with m_fCascadeBlendArea.
+    fCurrentPixelsBlendBandLocation = 1.0f - fPixelDepth / fBlendInterval;
+    // The fBlendBetweenCascadesAmount is our location in the blend band.
+    fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea;
+}
+
+
+
+//--------------------------------------------------------------------------------------
+// Calculate amount to blend between two cascades and the band where blending will occur.
+//--------------------------------------------------------------------------------------
+void CalculateBlendAmountForMap ( in float4 vShadowMapTextureCoord,
+                                  in out float fCurrentPixelsBlendBandLocation,
+                                  out float fBlendBetweenCascadesAmount )
+{
+    // Blend band location for map based selection: the smallest of x, y, 1 - x and 1 - y,
+    // i.e. how far the coordinate is from the closest edge of the unit square.
+    const float2 vFromLowEdges = vShadowMapTextureCoord.xy;
+    const float2 vFromHighEdges = 1.0f - vFromLowEdges;
+    const float2 vNearestPerAxis = min( vFromLowEdges, vFromHighEdges );
+    fCurrentPixelsBlendBandLocation = min( vNearestPerAxis.x, vNearestPerAxis.y );
+    fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea;
+}
+
+//--------------------------------------------------------------------------------------
+// Calculate the shadow based on several options and render the scene.
+//--------------------------------------------------------------------------------------
+float4 PSMain( VS_OUTPUT Input ) : SV_TARGET
+{ // Picks a cascade, PCF-filters its shadow map ( optionally blending with the next cascade ) and shades the pixel.
+ float4 vDiffuse = g_txDiffuse.Sample( g_samLinear, Input.vTexcoord );
+
+ float4 vShadowMapTextureCoord = 0.0f; // Shadow coordinate for the selected cascade.
+ float4 vShadowMapTextureCoord_blend = 0.0f; // Shadow coordinate for the next cascade, used only when blending.
+
+ float4 vVisualizeCascadeColor = float4(0.0f,0.0f,0.0f,1.0f);
+
+ float fPercentLit = 0.0f;
+ float fPercentLit_blend = 0.0f;
+
+ // The four depth weights below stay 0 unless USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG is set.
+ float fUpTextDepthWeight=0;
+ float fRightTextDepthWeight=0;
+ float fUpTextDepthWeight_blend=0;
+ float fRightTextDepthWeight_blend=0;
+
+ int iBlurRowSize = m_iPCFBlurForLoopEnd - m_iPCFBlurForLoopStart; // Taps along one side of the PCF kernel.
+ iBlurRowSize *= iBlurRowSize; // Squared: total taps, the divisor for the PCF sum.
+ float fBlurRowSize = (float)iBlurRowSize;
+
+ int iCascadeFound = 0;
+ int iNextCascadeIndex = 1;
+
+ float fCurrentPixelDepth;
+
+ // The interval based selection technique compares the pixel's depth against the frustum's cascade divisions.
+ fCurrentPixelDepth = Input.vDepth;
+
+ // This for loop is not necessary when the frustum is uniformly divided and interval based selection is used.
+ // In this case fCurrentPixelDepth could be used as an array lookup into the correct frustum.
+ int iCurrentCascadeIndex;
+
+ float4 vShadowMapTextureCoordViewSpace = Input.vTexShadow;
+ if( SELECT_CASCADE_BY_INTERVAL_FLAG )
+ {
+ iCurrentCascadeIndex = 0;
+ if ( CASCADE_COUNT_FLAG > 1 )
+ {
+ float4 vCurrentPixelDepth = Input.vDepth; // Scalar depth replicated into all four lanes.
+ float4 fComparison = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsFloat[0]); // Per lane: is the pixel beyond that split depth?
+ float4 fComparison2 = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsFloat[1]);
+ float fIndex = dot( // Adds up the comparison lanes whose lane number is below CASCADE_COUNT_FLAG.
+ float4( CASCADE_COUNT_FLAG > 0,
+ CASCADE_COUNT_FLAG > 1,
+ CASCADE_COUNT_FLAG > 2,
+ CASCADE_COUNT_FLAG > 3)
+ , fComparison )
+ + dot(
+ float4(
+ CASCADE_COUNT_FLAG > 4,
+ CASCADE_COUNT_FLAG > 5,
+ CASCADE_COUNT_FLAG > 6,
+ CASCADE_COUNT_FLAG > 7)
+ , fComparison2 ) ;
+
+ fIndex = min( fIndex, CASCADE_COUNT_FLAG - 1 ); // Never go past the last cascade.
+ iCurrentCascadeIndex = (int)fIndex;
+ }
+ }
+
+ if ( !SELECT_CASCADE_BY_INTERVAL_FLAG )
+ {
+ iCurrentCascadeIndex = 0;
+ if ( CASCADE_COUNT_FLAG == 1 )
+ {
+ vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[0];
+ vShadowMapTextureCoord += m_vCascadeOffset[0];
+ }
+ if ( CASCADE_COUNT_FLAG > 1 ) {
+ for( int iCascadeIndex = 0; iCascadeIndex < CASCADE_COUNT_FLAG && iCascadeFound == 0; ++iCascadeIndex ) // Stops at the first cascade that contains the pixel.
+ {
+ vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iCascadeIndex];
+ vShadowMapTextureCoord += m_vCascadeOffset[iCascadeIndex];
+
+ if ( min( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) > m_fMinBorderPadding
+ && max( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) < m_fMaxBorderPadding )
+ {
+ iCurrentCascadeIndex = iCascadeIndex;
+ iCascadeFound = 1;
+ }
+ } // NOTE(review): if no cascade matches, the index stays 0 while vShadowMapTextureCoord holds the last cascade's value.
+ }
+ }
+
+ float4 color = 0; // NOTE(review): not read anywhere in this function.
+
+ if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG )
+ {
+ // Repeat tex coord calculations for the next cascade.
+ // The next cascade index is used for blurring between maps.
+ iNextCascadeIndex = min ( CASCADE_COUNT_FLAG - 1, iCurrentCascadeIndex + 1 );
+ }
+
+ float fBlendBetweenCascadesAmount = 1.0f;
+ float fCurrentPixelsBlendBandLocation = 1.0f;
+
+ if( SELECT_CASCADE_BY_INTERVAL_FLAG )
+ {
+ if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1 )
+ {
+ CalculateBlendAmountForInterval ( iCurrentCascadeIndex, fCurrentPixelDepth,
+ fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount );
+ }
+ }
+ else
+ {
+
+ if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG )
+ {
+ CalculateBlendAmountForMap ( vShadowMapTextureCoord,
+ fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount );
+ }
+ }
+
+ float3 vShadowMapTextureCoordDDX;
+ float3 vShadowMapTextureCoordDDY;
+ // The derivatives are used to find the slope of the current plane.
+ // The derivative calculation has to be inside of the loop in order to prevent divergent flow control artifacts.
+ if( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG )
+ {
+ vShadowMapTextureCoordDDX = ddx( vShadowMapTextureCoordViewSpace ); // The float4 result is implicitly truncated to float3.
+ vShadowMapTextureCoordDDY = ddy( vShadowMapTextureCoordViewSpace );
+
+ vShadowMapTextureCoordDDX *= m_vCascadeScale[iCurrentCascadeIndex];
+ vShadowMapTextureCoordDDY *= m_vCascadeScale[iCurrentCascadeIndex];
+ }
+
+ ComputeCoordinatesTransform( iCurrentCascadeIndex,
+ Input.vInterpPos,
+ vShadowMapTextureCoord,
+ vShadowMapTextureCoordViewSpace );
+
+ // Tint for the selected cascade; replaced with white further down unless m_iVisualizeCascades is set.
+ vVisualizeCascadeColor = vCascadeColorsMultiplier[iCurrentCascadeIndex];
+
+ if( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG )
+ {
+ CalculateRightAndUpTexelDepthDeltas ( vShadowMapTextureCoordDDX, vShadowMapTextureCoordDDY,
+ fUpTextDepthWeight, fRightTextDepthWeight );
+ }
+
+ CalculatePCFPercentLit ( vShadowMapTextureCoord, fRightTextDepthWeight,
+ fUpTextDepthWeight, fBlurRowSize, fPercentLit );
+
+ if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1 )
+ {
+ if( fCurrentPixelsBlendBandLocation < m_fCascadeBlendArea)
+ { // the current pixel is within the blend band.
+
+ // Repeat tex coord calculations for the next cascade.
+ // The next cascade index is used for blurring between maps.
+ if( !SELECT_CASCADE_BY_INTERVAL_FLAG )
+ {
+ vShadowMapTextureCoord_blend = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iNextCascadeIndex];
+ vShadowMapTextureCoord_blend += m_vCascadeOffset[iNextCascadeIndex];
+ }
+
+ ComputeCoordinatesTransform( iNextCascadeIndex, Input.vInterpPos,
+ vShadowMapTextureCoord_blend,
+ vShadowMapTextureCoordViewSpace );
+
+ // We repeat the calculation for the next cascade layer, when blending between maps.
+ if( fCurrentPixelsBlendBandLocation < m_fCascadeBlendArea) // Same condition as the enclosing if.
+ { // the current pixel is within the blend band.
+ if( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG )
+ {
+
+ CalculateRightAndUpTexelDepthDeltas ( vShadowMapTextureCoordDDX,
+ vShadowMapTextureCoordDDY,
+ fUpTextDepthWeight_blend,
+ fRightTextDepthWeight_blend );
+ }
+ CalculatePCFPercentLit ( vShadowMapTextureCoord_blend, fRightTextDepthWeight_blend,
+ fUpTextDepthWeight_blend, fBlurRowSize, fPercentLit_blend );
+ fPercentLit = lerp( fPercentLit_blend, fPercentLit, fBlendBetweenCascadesAmount );
+ // Blend the two calculated shadows by the blend amount.
+ }
+ }
+ }
+
+ // Cascade visualization off: use a white multiplier so the tint has no effect.
+ if( !m_iVisualizeCascades ) vVisualizeCascadeColor = float4(1.0f,1.0f,1.0f,1.0f);
+
+ float3 vLightDir1 = float3( -1.0f, 1.0f, -1.0f );
+ float3 vLightDir2 = float3( 1.0f, 1.0f, -1.0f );
+ float3 vLightDir3 = float3( 0.0f, -1.0f, 0.0f );
+ float3 vLightDir4 = float3( 1.0f, 1.0f, 1.0f );
+ // Some ambient-like lighting.
+ float fLighting =
+ saturate( dot( vLightDir1 , Input.vNormal ) )*0.05f +
+ saturate( dot( vLightDir2 , Input.vNormal ) )*0.05f +
+ saturate( dot( vLightDir3 , Input.vNormal ) )*0.05f +
+ saturate( dot( vLightDir4 , Input.vNormal ) )*0.05f ;
+
+ float4 vShadowLighting = fLighting * 0.5f; // Fully shadowed level: half of the ambient-like term.
+ fLighting += saturate( dot( m_vLightDir , Input.vNormal ) );
+ fLighting = lerp( vShadowLighting, fLighting, fPercentLit ); // The float4 lerp result is implicitly truncated to a float.
+
+ return fLighting * vVisualizeCascadeColor * vDiffuse;
+
+}
+
diff --git a/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl b/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl
new file mode 100644
index 000000000..3b4d32a0d
--- /dev/null
+++ b/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl
@@ -0,0 +1,53 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -entry VSMainPancake
+//--------------------------------------------------------------------------------------
+// File: RenderCascadeShadow.hlsl
+//
+// The shader file for the RenderCascadeScene sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+ matrix g_mWorldViewProjection : packoffset( c0 ); // The only constant; both vertex shaders multiply the position by it.
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+ float4 vPosition : POSITION; // Untransformed vertex position.
+};
+
+struct VS_OUTPUT
+{
+ float4 vPosition : SV_POSITION; // Position times g_mWorldViewProjection; nothing else is output.
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+    // Shadow pass vertex shader: nothing special, the only output is the position
+    // multiplied by g_mWorldViewProjection, which carries the depth.
+    const float4 vProjected = mul( Input.vPosition, g_mWorldViewProjection );
+    VS_OUTPUT Result;
+    Result.vPosition = vProjected;
+    return Result;
+}
+
+
+VS_OUTPUT VSMainPancake( VS_INPUT Input )
+{
+    // "Pancake" entry point. The clamp that would move clipped geometry onto the near
+    // plane ( z = max( z, 0.0f ) ) is disabled, so the body currently matches VSMain.
+    VS_OUTPUT Result;
+    Result.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+    return Result;
+} \ No newline at end of file
diff --git a/tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl b/tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl
new file mode 100644
index 000000000..db7bd5136
--- /dev/null
+++ b/tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl
@@ -0,0 +1,75 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry BitonicSort -entry MatrixTranspose
+//--------------------------------------------------------------------------------------
+// File: ComputeShaderSort11.hlsl
+//
+// This file contains the compute shaders to perform GPU sorting using DirectX 11.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#define BITONIC_BLOCK_SIZE 512
+
+#define TRANSPOSE_BLOCK_SIZE 16
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer CB : register( b0 )
+{
+ unsigned int g_iLevel; // BitonicSort starts its compare distance at g_iLevel >> 1.
+ unsigned int g_iLevelMask; // ANDed with the dispatch thread id in BitonicSort to choose the compare direction.
+ unsigned int g_iWidth; // Row pitch used when MatrixTranspose reads Input.
+ unsigned int g_iHeight; // Row pitch used when MatrixTranspose writes Data.
+};
+
+//--------------------------------------------------------------------------------------
+// Structured Buffers
+//--------------------------------------------------------------------------------------
+StructuredBuffer<unsigned int> Input : register( t0 ); // Source of MatrixTranspose; BitonicSort does not use it.
+RWStructuredBuffer<unsigned int> Data : register( u0 ); // Read and rewritten in place by BitonicSort; destination of MatrixTranspose.
+
+//--------------------------------------------------------------------------------------
+// Bitonic Sort Compute Shader
+//--------------------------------------------------------------------------------------
+groupshared unsigned int shared_data[BITONIC_BLOCK_SIZE];
+
+[numthreads(BITONIC_BLOCK_SIZE, 1, 1)]
+void BitonicSort( uint3 Gid : SV_GroupID,
+                  uint3 DTid : SV_DispatchThreadID,
+                  uint3 GTid : SV_GroupThreadID,
+                  uint GI : SV_GroupIndex )
+{
+    // Stage this thread's element in group shared memory and wait for the whole group.
+    shared_data[GI] = Data[DTid.x];
+    GroupMemoryBarrierWithGroupSync();
+    const bool bLevelBitSet = (bool)(g_iLevelMask & DTid.x);
+    for (unsigned int stride = g_iLevel >> 1 ; stride > 0 ; stride >>= 1)
+    {
+        // Look at the pair { GI & ~stride, GI | stride }: take the partner's value or keep our own.
+        const bool bPairInOrder = shared_data[GI & ~stride] <= shared_data[GI | stride];
+        unsigned int kept = (bPairInOrder == bLevelBitSet) ? shared_data[GI ^ stride] : shared_data[GI];
+        GroupMemoryBarrierWithGroupSync();   // every read finishes before any slot is overwritten
+        shared_data[GI] = kept;
+        GroupMemoryBarrierWithGroupSync();   // every write lands before the next stride reads
+    }
+    // Write the element back to the slot it was loaded from.
+    Data[DTid.x] = shared_data[GI];
+}
+
+//--------------------------------------------------------------------------------------
+// Matrix Transpose Compute Shader
+//--------------------------------------------------------------------------------------
+groupshared unsigned int transpose_shared_data[TRANSPOSE_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE];
+// One thread per element of a TRANSPOSE_BLOCK_SIZE x TRANSPOSE_BLOCK_SIZE tile.
+[numthreads(TRANSPOSE_BLOCK_SIZE, TRANSPOSE_BLOCK_SIZE, 1)]
+void MatrixTranspose( uint3 Gid : SV_GroupID,
+                      uint3 DTid : SV_DispatchThreadID,
+                      uint3 GTid : SV_GroupThreadID,
+                      uint GI : SV_GroupIndex )
+{
+    transpose_shared_data[GI] = Input[DTid.y * g_iWidth + DTid.x];   // read with a row pitch of g_iWidth
+    GroupMemoryBarrierWithGroupSync();                                // the whole tile is staged past this point
+    const uint2 vDst = DTid.yx - GTid.yx + GTid.xy;                   // swapped tile origin plus unswapped in-tile offset
+    Data[vDst.y * g_iHeight + vDst.x] = transpose_shared_data[GTid.x * TRANSPOSE_BLOCK_SIZE + GTid.y];
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx
new file mode 100644
index 000000000..941e001b3
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx
@@ -0,0 +1,23 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial02.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+float4 VS( float4 Pos : POSITION ) : SV_POSITION
+{
+ return Pos; // Pass-through: the input position is output unchanged, no matrix is applied.
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PS( float4 Pos : SV_POSITION ) : SV_Target
+{
+ return float4( 1.0f, 1.0f, 0.0f, 1.0f ); // Yellow, with Alpha = 1; Pos is not read.
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl
new file mode 100644
index 000000000..5a59aadc6
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial02.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl
new file mode 100644
index 000000000..d58459b78
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial02.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx
new file mode 100644
index 000000000..941e001b3
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx
@@ -0,0 +1,23 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial03.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+float4 VS( float4 Pos : POSITION ) : SV_POSITION
+{
+ return Pos; // Pass-through: the input position is output unchanged, no matrix is applied.
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PS( float4 Pos : SV_POSITION ) : SV_Target
+{
+ return float4( 1.0f, 1.0f, 0.0f, 1.0f ); // Yellow, with Alpha = 1; Pos is not read.
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl
new file mode 100644
index 000000000..29b6e8b2c
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial03.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl
new file mode 100644
index 000000000..db47ead28
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial03.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx
new file mode 100644
index 000000000..deb7b585f
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx
@@ -0,0 +1,46 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial04.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+cbuffer ConstantBuffer : register( b0 )
+{
+ matrix World; // Applied first to the vertex position in VS.
+ matrix View; // Applied second.
+ matrix Projection; // Applied last; its result is the SV_POSITION output.
+}
+
+//--------------------------------------------------------------------------------------
+struct VS_OUTPUT
+{
+ float4 Pos : SV_POSITION; // Position after World, View and Projection.
+ float4 Color : COLOR0; // Vertex color copied from the input; PS returns it as is.
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VS( float4 Pos : POSITION, float4 Color : COLOR )
+{
+    const float4 worldPos = mul( Pos, World );       // input position times World
+    const float4 viewPos = mul( worldPos, View );    // then times View
+    VS_OUTPUT result = (VS_OUTPUT)0;
+    result.Pos = mul( viewPos, Projection );         // then times Projection
+    result.Color = Color;                            // color is forwarded unchanged
+    return result;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PS( VS_OUTPUT input ) : SV_Target
+{
+ return input.Color; // Outputs the interpolated vertex color; input.Pos is not read.
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl
new file mode 100644
index 000000000..dc627637c
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial04.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl
new file mode 100644
index 000000000..96d0a642c
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial04.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx
new file mode 100644
index 000000000..b15c99e49
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx
@@ -0,0 +1,54 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial05.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+cbuffer ConstantBuffer : register( b0 ) // explicitly assigned to register b0
+{
+ matrix World; // applied first to input.Pos in VS
+ matrix View; // applied second
+ matrix Projection; // applied last
+}
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT // parameter type of VS
+{
+ float4 Pos : POSITION;
+ float4 Color : COLOR;
+};
+
+struct PS_INPUT // return type of VS and parameter type of PS
+{
+ float4 Pos : SV_POSITION;
+ float4 Color : COLOR; // copied unchanged from VS_INPUT.Color by VS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: multiplies input.Pos by World, then View, then Projection, and passes Color through
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+ PS_INPUT output = (PS_INPUT)0; // zero every field before filling it in
+ output.Pos = mul( input.Pos, World );
+ output.Pos = mul( output.Pos, View );
+ output.Pos = mul( output.Pos, Projection );
+ output.Color = input.Color;
+
+ return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: returns the Color field of its input without modification
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+ return input.Color;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl
new file mode 100644
index 000000000..acc900ff5
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial05.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl
new file mode 100644
index 000000000..726f05979
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial05.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx
new file mode 100644
index 000000000..7d839009d
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx
@@ -0,0 +1,76 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS -entry PSSolid
+//--------------------------------------------------------------------------------------
+// File: Tutorial06.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+cbuffer ConstantBuffer : register( b0 ) // explicitly assigned to register b0
+{
+ matrix World; // used by VS for both the position and the normal
+ matrix View;
+ matrix Projection;
+ float4 vLightDir[2]; // one entry per light; PS casts each to float3 before the dot product
+ float4 vLightColor[2]; // per-light color multiplied by the dot-product term in PS
+ float4 vOutputColor; // returned as-is by PSSolid
+}
+
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT // parameter type of VS
+{
+ float4 Pos : POSITION;
+ float3 Norm : NORMAL;
+};
+
+struct PS_INPUT // return type of VS; parameter type of PS and PSSolid
+{
+ float4 Pos : SV_POSITION;
+ float3 Norm : TEXCOORD0; // normal written by VS and read by PS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: multiplies the position by World, View, Projection; transforms the normal by World as a direction
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+ PS_INPUT output = (PS_INPUT)0; // zero every field before filling it in
+ output.Pos = mul( input.Pos, World );
+ output.Pos = mul( output.Pos, View );
+ output.Pos = mul( output.Pos, Projection );
+ output.Norm = mul( float4( input.Norm, 0 ), World ).xyz; // w = 0 keeps World's fourth row out of the normal
+
+ return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: sums a saturated (light direction . normal) * light color term over both lights; alpha forced to 1
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+ float4 finalColor = 0;
+
+ //do NdotL lighting for 2 lights
+ for(int i=0; i<2; i++)
+ {
+ finalColor += saturate( dot( (float3)vLightDir[i],input.Norm) * vLightColor[i] ); // saturate is applied to each light's term separately
+ }
+ finalColor.a = 1; // overwrite whatever alpha the loop accumulated
+ return finalColor;
+}
+
+
+//--------------------------------------------------------------------------------------
+// PSSolid - render a solid color: returns vOutputColor from the constant buffer; input is not read
+//--------------------------------------------------------------------------------------
+float4 PSSolid( PS_INPUT input) : SV_Target
+{
+ return vOutputColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl
new file mode 100644
index 000000000..31ed082e7
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial06.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl
new file mode 100644
index 000000000..a5512efb6
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial06.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx
new file mode 100644
index 000000000..0baad7a0c
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx
@@ -0,0 +1,67 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial07.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 ); // sampled in PS
+SamplerState samLinear : register( s0 ); // sampler passed to txDiffuse.Sample in PS
+
+cbuffer cbNeverChanges : register( b0 ) // NOTE(review): buffer names suggest update frequency; the updating code is not in this file
+{
+ matrix View; // second matrix applied to the position in VS
+};
+
+cbuffer cbChangeOnResize : register( b1 )
+{
+ matrix Projection; // last matrix applied to the position in VS
+};
+
+cbuffer cbChangesEveryFrame : register( b2 )
+{
+ matrix World; // first matrix applied to the position in VS
+ float4 vMeshColor; // multiplied with the texture sample in PS
+};
+
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT // parameter type of VS
+{
+ float4 Pos : POSITION;
+ float2 Tex : TEXCOORD0;
+};
+
+struct PS_INPUT // return type of VS and parameter type of PS
+{
+ float4 Pos : SV_POSITION;
+ float2 Tex : TEXCOORD0; // copied unchanged from VS_INPUT.Tex by VS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: multiplies input.Pos by World, then View, then Projection, and passes Tex through
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+ PS_INPUT output = (PS_INPUT)0; // zero every field before filling it in
+ output.Pos = mul( input.Pos, World );
+ output.Pos = mul( output.Pos, View );
+ output.Pos = mul( output.Pos, Projection );
+ output.Tex = input.Tex;
+
+ return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: samples txDiffuse at input.Tex with samLinear and multiplies the result by vMeshColor
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+ return txDiffuse.Sample( samLinear, input.Tex ) * vMeshColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl
new file mode 100644
index 000000000..c3c101943
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial07.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl
new file mode 100644
index 000000000..4c287c790
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial07.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx
new file mode 100644
index 000000000..6ff313b97
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx
@@ -0,0 +1,56 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial08.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 ); // sampled in PS
+SamplerState samLinear : register( s0 ); // sampler passed to txDiffuse.Sample in PS
+
+cbuffer cbChangesEveryFrame : register( b0 )
+{
+ matrix WorldViewProj; // the only matrix VS applies to the position
+ matrix World; // not referenced by VS or PS in this file
+ float4 vMeshColor; // multiplied with the texture sample in PS
+};
+
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT // parameter type of VS
+{
+ float4 Pos : POSITION;
+ float2 Tex : TEXCOORD;
+};
+
+struct PS_INPUT // return type of VS and parameter type of PS
+{
+ float4 Pos : SV_POSITION;
+ float2 Tex : TEXCOORD0; // copied unchanged from VS_INPUT.Tex by VS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: multiplies input.Pos by the single WorldViewProj matrix and passes Tex through
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+ PS_INPUT output = (PS_INPUT)0; // zero every field before filling it in
+ output.Pos = mul( input.Pos, WorldViewProj );
+ output.Tex = input.Tex;
+
+ return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: samples txDiffuse at input.Tex with samLinear and multiplies the result by vMeshColor
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+ return txDiffuse.Sample( samLinear, input.Tex ) * vMeshColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx
new file mode 100644
index 000000000..04a395588
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx
@@ -0,0 +1,69 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial09.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 ); // sampled in PS
+SamplerState samLinear : register( s0 ); // sampler passed to txDiffuse.Sample in PS
+
+cbuffer cbNeverChanges : register( b0 )
+{
+ float3 vLightDir; // dotted with the transformed normal in VS
+};
+
+cbuffer cbChangesEveryFrame : register( b1 )
+{
+ matrix WorldViewProj; // applied to the position in VS
+ matrix World; // VS uses only its float3x3 cast, for the normal
+};
+
+struct VS_INPUT
+{
+ float3 Pos : POSITION; //position
+ float3 Norm : NORMAL; //normal
+ float2 Tex : TEXCOORD0; //texture coordinate
+};
+
+struct PS_INPUT
+{
+ float4 Pos : SV_POSITION;
+ float4 Diffuse : COLOR0; // rgb = lighting term computed in VS, a = 1
+ float2 Tex : TEXCOORD1; // copied unchanged from VS_INPUT.Tex by VS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: multiplies the position by WorldViewProj and stores a per-vertex lighting term in Diffuse
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+ PS_INPUT output = (PS_INPUT)0;
+ output.Pos = mul( float4(input.Pos,1), WorldViewProj ); // float3 position extended with w = 1
+ float3 vNormalWorldSpace = normalize( mul( input.Norm, (float3x3)World ) );
+
+ float fLighting = saturate( dot( vNormalWorldSpace, vLightDir ) );
+ output.Diffuse.rgb = fLighting; // scalar assigned to all three color channels
+ output.Diffuse.a = 1.0f;
+
+ output.Tex = input.Tex;
+
+ return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: multiplies the txDiffuse sample by the Diffuse value from VS and forces alpha to 1
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+ //calculate lighting assuming light color is <1,1,1,1>
+ float4 outputColor = txDiffuse.Sample( samLinear, input.Tex ) * input.Diffuse;
+ outputColor.a = 1; // discard the alpha produced by the multiply
+ return outputColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx
new file mode 100644
index 000000000..e9bded408
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx
@@ -0,0 +1,73 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial10.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 ); // sampled in PS
+SamplerState samLinear : register( s0 ); // sampler passed to txDiffuse.Sample in PS
+
+cbuffer cbNeverChanges : register( b0 )
+{
+ float3 vLightDir; // dotted with the transformed normal in VS
+};
+
+cbuffer cbChangesEveryFrame : register( b1 )
+{
+ matrix WorldViewProj; // applied to the displaced position in VS
+ matrix World; // VS uses only its float3x3 cast, for the normal
+ float Puffiness; // scales the normal that VS adds to the position
+};
+
+struct VS_INPUT
+{
+ float3 Pos : POSITION; //position
+ float3 Norm : NORMAL; //normal
+ float2 Tex : TEXCOORD0; //texture coordinate
+};
+
+struct PS_INPUT
+{
+ float4 Pos : SV_POSITION;
+ float4 Diffuse : COLOR0; // rgb = lighting term computed in VS, a = 1
+ float2 Tex : TEXCOORD1; // copied unchanged from VS_INPUT.Tex by VS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: offsets the position along the normal by Puffiness, then transforms and lights it per vertex
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+ PS_INPUT output = (PS_INPUT)0;
+
+ input.Pos += input.Norm * Puffiness; // modifies the local copy of the input before it is transformed
+
+ output.Pos = mul( float4(input.Pos,1), WorldViewProj ); // float3 position extended with w = 1
+ float3 vNormalWorldSpace = normalize( mul( input.Norm, (float3x3)World ) );
+
+ float fLighting = saturate( dot( vNormalWorldSpace, vLightDir ) );
+ output.Diffuse.rgb = fLighting; // scalar assigned to all three color channels
+ output.Diffuse.a = 1.0f;
+
+ output.Tex = input.Tex;
+
+ return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: multiplies the txDiffuse sample by the Diffuse value from VS and forces alpha to 1
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+ //calculate lighting assuming light color is <1,1,1,1>
+ float4 outputColor = txDiffuse.Sample( samLinear, input.Tex ) * input.Diffuse;
+ outputColor.a = 1; // discard the alpha produced by the multiply
+ return outputColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx
new file mode 100644
index 000000000..a647a9079
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx
@@ -0,0 +1,117 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Tutorial11.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse; // sampled in PS; no explicit register is given
+SamplerState samLinear // sampler passed to g_txDiffuse.Sample in PS
+{
+ Filter = MIN_MAG_MIP_LINEAR;
+ AddressU = Wrap;
+ AddressV = Wrap;
+};
+
+cbuffer cbConstant
+{
+ float3 vLightDir = float3(-0.577,0.577,-0.577); // default value; each component has magnitude 0.577, so length is about 1
+};
+
+cbuffer cbChangesEveryFrame
+{
+ matrix World;
+ matrix View;
+ matrix Projection;
+ float Time; // added to the sin() argument in VS
+};
+
+cbuffer cbUserChanges
+{
+ float Waviness; // scales the sin() offset VS adds to Pos.x
+};
+
+struct VS_INPUT // parameter type of VS
+{
+ float3 Pos : POSITION;
+ float3 Norm : NORMAL;
+ float2 Tex : TEXCOORD0;
+};
+
+struct PS_INPUT // return type of VS and parameter type of PS
+{
+ float4 Pos : SV_POSITION;
+ float3 Norm : TEXCOORD0; // normal written by VS, dotted with vLightDir in PS
+ float2 Tex : TEXCOORD1;
+};
+
+//--------------------------------------------------------------------------------------
+// DepthStates (followed by the blend state; both are set by the Render technique's pass P0)
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth
+{
+ DepthEnable = TRUE;
+ DepthWriteMask = ALL;
+ DepthFunc = LESS_EQUAL;
+};
+
+BlendState NoBlending
+{
+ AlphaToCoverageEnable = FALSE;
+ BlendEnable[0] = FALSE; // only index 0 is set here
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: moves the position by World, adds a sine offset to x, then applies View and Projection
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+ PS_INPUT output = (PS_INPUT)0;
+
+ output.Pos = mul( float4(input.Pos,1), World ); // float3 position extended with w = 1
+
+ output.Pos.x += sin( output.Pos.y*0.1f + Time )*Waviness; // offset is applied after World, before View
+
+ output.Pos = mul( output.Pos, View );
+ output.Pos = mul( output.Pos, Projection );
+ output.Norm = mul( input.Norm, (float3x3)World ); // explicit 3x3 cast: a float3 cannot multiply the full 4x4 without implicit truncation
+ output.Tex = input.Tex;
+
+ return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: scales the g_txDiffuse sample by saturate(dot(Norm, vLightDir)) and forces alpha to 1
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+ // Calculate lighting assuming light color is <1,1,1,1>
+ float fLighting = saturate( dot( input.Norm, vLightDir ) ); // input.Norm is used as received, without normalize()
+ float4 outputColor = g_txDiffuse.Sample( samLinear, input.Tex ) * fLighting;
+ outputColor.a = 1; // the multiply above also scaled alpha; reset it
+ return outputColor;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Technique: single pass P0 that compiles VS as vs_4_0 and PS as ps_4_0, with no geometry shader
+//--------------------------------------------------------------------------------------
+technique11 Render
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VS() ) );
+ SetGeometryShader( NULL ); // no geometry shader in this pass
+ SetPixelShader( CompileShader( ps_4_0, PS() ) );
+
+ SetDepthStencilState( EnableDepth, 0 );
+ SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ }
+}
+
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx
new file mode 100644
index 000000000..aae7f9a87
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx
@@ -0,0 +1,129 @@
+//TEST_IGNORE_FILE:
+//
+// Constant Buffer Variables
+//
+
+Texture2D g_txDiffuse; // sampled in PS with samLinear
+SamplerState samLinear
+{
+ Filter = MIN_MAG_MIP_LINEAR;
+ AddressU = Wrap;
+ AddressV = Wrap;
+};
+
+TextureCube g_txEnvMap; // sampled in PS with samLinearClamp, using the ViewR vector
+SamplerState samLinearClamp // same filter as samLinear; differs only in Clamp addressing
+{
+ Filter = MIN_MAG_MIP_LINEAR;
+ AddressU = Clamp;
+ AddressV = Clamp;
+};
+
+cbuffer cbConstant
+{
+ float3 vLightDir = float3(-0.577,0.577,-0.577); // default value; each component has magnitude 0.577, so length is about 1
+};
+
+cbuffer cbChangesEveryFrame
+{
+ matrix World;
+ matrix View;
+ matrix Projection;
+ float Time; // added to the sin() argument in VS
+};
+
+cbuffer cbUserChanges
+{
+ float Waviness; // scales the sin() offset VS adds to Pos.x
+};
+
+struct VS_INPUT
+{
+ float3 Pos : POSITION; //position
+ float3 Norm : NORMAL; //normal
+ float2 Tex : TEXCOORD0; //texture coordinate
+};
+
+struct PS_INPUT
+{
+ float4 Pos : SV_POSITION;
+ float3 Norm : TEXCOORD0; // normal multiplied by the 3x3 part of World in VS
+ float2 Tex : TEXCOORD1;
+ float3 ViewR : TEXCOORD2; // reflection vector computed in VS, used to sample g_txEnvMap
+};
+
+//--------------------------------------------------------------------------------------
+// DepthStates (followed by the blend state; both are set by the Render technique's pass P0)
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth
+{
+ DepthEnable = TRUE;
+ DepthWriteMask = ALL;
+ DepthFunc = LESS_EQUAL;
+};
+
+BlendState NoBlending
+{
+ AlphaToCoverageEnable = FALSE;
+ BlendEnable[0] = FALSE; // only index 0 is set here
+};
+
+//
+// Vertex Shader: wavy position transform as World -> sine offset -> View -> Projection, plus a reflection vector
+//
+PS_INPUT VS( VS_INPUT input )
+{
+ PS_INPUT output = (PS_INPUT)0;
+
+ output.Pos = mul( float4(input.Pos,1), World ); // float3 position extended with w = 1
+
+ output.Pos.x += sin( output.Pos.y*0.1f + Time )*Waviness; // offset is applied after World, before View
+
+ output.Pos = mul( output.Pos, View );
+ output.Pos = mul( output.Pos, Projection );
+ output.Norm = mul( input.Norm, (float3x3)World );
+ output.Tex = input.Tex;
+
+ // Calculate the reflection vector
+ float3 viewNorm = mul( output.Norm, (float3x3)View ); // NOTE(review): not normalized before reflect(); confirm this is intended
+ output.ViewR = reflect( viewNorm, float3(0,0,-1.0) ); // with this fixed second argument the result is viewNorm with z negated
+
+ return output;
+}
+
+
+//
+// Pixel Shader: adds the lit diffuse sample to the environment-map sample and forces alpha to 1
+//
+float4 PS( PS_INPUT input) : SV_Target
+{
+ // Calculate lighting assuming light color is <1,1,1,1>
+ float fLighting = saturate( dot( input.Norm, vLightDir ) );
+
+ // Load the environment map texture
+ float4 cReflect = g_txEnvMap.Sample( samLinearClamp, input.ViewR );
+
+ // Load the diffuse texture and multiply by the lighting amount
+ float4 cDiffuse = g_txDiffuse.Sample( samLinear, input.Tex ) * fLighting;
+
+ // Add diffuse to reflection and go
+ float4 cTotal = cDiffuse + cReflect; // the reflection term is not scaled by fLighting
+ cTotal.a = 1;
+ return cTotal;
+}
+
+//
+// Technique: single pass P0 that compiles VS as vs_4_0 and PS as ps_4_0, with no geometry shader
+//
+technique11 Render
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VS() ) );
+ SetGeometryShader( NULL ); // no geometry shader in this pass
+ SetPixelShader( CompileShader( ps_4_0, PS() ) );
+
+ SetDepthStencilState( EnableDepth, 0 );
+ SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ }
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx
new file mode 100644
index 000000000..a6f09ecc7
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx
@@ -0,0 +1,191 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Tutorial13.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse; // sampled in PS with samLinear
+SamplerState samLinear
+{
+ Filter = MIN_MAG_MIP_LINEAR;
+ AddressU = Wrap;
+ AddressV = Wrap;
+};
+
+TextureCube g_txEnvMap; // not referenced by VS, GS or PS in this file
+SamplerState samLinearClamp // not referenced by VS, GS or PS in this file
+{
+ Filter = MIN_MAG_MIP_LINEAR;
+ AddressU = Clamp;
+ AddressV = Clamp;
+};
+
+cbuffer cbConstant
+{
+ float3 vLightDir = float3(-0.577,0.577,-0.577); // default value; each component has magnitude 0.577, so length is about 1
+};
+
+cbuffer cbChangesEveryFrame
+{
+ matrix World; // applied in VS
+ matrix View; // applied in GS
+ matrix Projection; // applied in GS
+ float Time; // not referenced by VS, GS or PS in this file
+};
+
+cbuffer cbUserChanges
+{
+ float Explode; // scales the face normal that GS adds to every emitted position
+};
+
+struct VS_INPUT
+{
+ float3 Pos : POSITION;
+ float3 Norm : NORMAL;
+ float2 Tex : TEXCOORD0;
+};
+
+struct GSPS_INPUT // output of VS, input and output of GS, input of PS
+{
+ float4 Pos : SV_POSITION;
+ float3 Norm : TEXCOORD0;
+ float2 Tex : TEXCOORD1;
+};
+
+//--------------------------------------------------------------------------------------
+// DepthStates (followed by the blend state; both are set by the Render technique's pass P0)
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth
+{
+ DepthEnable = TRUE;
+ DepthWriteMask = ALL;
+ DepthFunc = LESS_EQUAL;
+};
+
+BlendState NoBlending
+{
+ AlphaToCoverageEnable = FALSE;
+ BlendEnable[0] = FALSE; // only index 0 is set here
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: applies only World to position and normal; View and Projection are applied later in GS
+//--------------------------------------------------------------------------------------
+GSPS_INPUT VS( VS_INPUT input )
+{
+ GSPS_INPUT output = (GSPS_INPUT)0;
+
+ output.Pos = mul( float4(input.Pos,1), World ); // float3 position extended with w = 1
+ output.Norm = mul( input.Norm, (float3x3)World );
+ output.Tex = input.Tex;
+
+ return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Geometry Shader: per input triangle, emits one triangle per edge joined to a raised center, then the triangle reversed
+//--------------------------------------------------------------------------------------
+[maxvertexcount(12)]
+void GS( triangle GSPS_INPUT input[3], inout TriangleStream<GSPS_INPUT> TriStream )
+{
+ GSPS_INPUT output;
+
+ //
+ // Calculate the face normal (.xyz is explicit so the float4 positions are not implicitly truncated)
+ //
+ float3 faceEdgeA = input[1].Pos.xyz - input[0].Pos.xyz;
+ float3 faceEdgeB = input[2].Pos.xyz - input[0].Pos.xyz;
+ float3 faceNormal = normalize( cross(faceEdgeA, faceEdgeB) );
+ float3 ExplodeAmt = faceNormal*Explode;
+
+ //
+ // Calculate the face center, moved along the normal by ExplodeAmt
+ //
+ float3 centerPos = (input[0].Pos.xyz + input[1].Pos.xyz + input[2].Pos.xyz)/3.0;
+ float2 centerTex = (input[0].Tex + input[1].Tex + input[2].Tex)/3.0;
+ centerPos += ExplodeAmt;
+
+ //
+ // Output the pyramid: 3 strips of 3 vertices (edge start, edge end, center)
+ //
+ for( int i=0; i<3; i++ )
+ {
+ output.Pos = input[i].Pos + float4(ExplodeAmt,0);
+ output.Pos = mul( output.Pos, View );
+ output.Pos = mul( output.Pos, Projection );
+ output.Norm = input[i].Norm;
+ output.Tex = input[i].Tex;
+ TriStream.Append( output );
+
+ int iNext = (i+1)%3;
+ output.Pos = input[iNext].Pos + float4(ExplodeAmt,0);
+ output.Pos = mul( output.Pos, View );
+ output.Pos = mul( output.Pos, Projection );
+ output.Norm = input[iNext].Norm;
+ output.Tex = input[iNext].Tex;
+ TriStream.Append( output );
+
+ output.Pos = float4(centerPos,1) + float4(ExplodeAmt,0); // center receives ExplodeAmt a second time here
+ output.Pos = mul( output.Pos, View );
+ output.Pos = mul( output.Pos, Projection );
+ output.Norm = faceNormal;
+ output.Tex = centerTex;
+ TriStream.Append( output );
+
+ TriStream.RestartStrip();
+ }
+
+ for( int j=2; j>=0; j-- ) // distinct index name: re-declaring 'i' here conflicts with the first loop's 'i' under fxc
+ {
+ output.Pos = input[j].Pos + float4(ExplodeAmt,0);
+ output.Pos = mul( output.Pos, View );
+ output.Pos = mul( output.Pos, Projection );
+ output.Norm = -input[j].Norm; // reversed vertex order is paired with negated normals
+ output.Tex = input[j].Tex;
+ TriStream.Append( output );
+ }
+ TriStream.RestartStrip();
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: scales the g_txDiffuse sample by saturate(dot(Norm, vLightDir)) and forces alpha to 1
+//--------------------------------------------------------------------------------------
+float4 PS( GSPS_INPUT input) : SV_Target
+{
+ // Calculate lighting assuming light color is <1,1,1,1>
+ float fLighting = saturate( dot( input.Norm, vLightDir ) );
+
+ // Load the diffuse texture and multiply by the lighting amount
+ float4 cDiffuse = g_txDiffuse.Sample( samLinear, input.Tex ) * fLighting;
+ cDiffuse.a = 1; // the multiply above also scaled alpha; reset it
+
+ // return diffuse
+ return cDiffuse;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Technique: single pass P0 that compiles VS (vs_4_0), GS (gs_4_0) and PS (ps_4_0)
+//--------------------------------------------------------------------------------------
+technique11 Render
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VS() ) );
+ SetGeometryShader( CompileShader( gs_4_0, GS() ) ); // unlike Tutorial11/12, a geometry shader is bound
+ SetPixelShader( CompileShader( ps_4_0, PS() ) );
+
+ SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ SetDepthStencilState( EnableDepth, 0 );
+ }
+}
+
+
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx
new file mode 100644
index 000000000..b1e45b842
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx
@@ -0,0 +1,294 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Tutorial14.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse; // texture sampled by both PS and QuadPS
+SamplerState samLinear // sampler used for every g_txDiffuse lookup in this file
+{
+ Filter = MIN_MAG_MIP_LINEAR; // linear filtering for minification, magnification and mip selection
+ AddressU = Wrap; // coordinates outside [0,1] wrap around in U
+ AddressV = Wrap; // ... and in V
+};
+// Light direction; the initializer is a default value, roughly (-1,1,-1)/sqrt(3).
+cbuffer cbConstant
+{
+ float3 vLightDir = float3(-0.577,0.577,-0.577); // dotted with the surface normal in PS
+};
+// Transform matrices, applied in the order World, View, Projection by VS and QuadVS.
+cbuffer cbChangesEveryFrame
+{
+ matrix World;
+ matrix View;
+ matrix Projection;
+};
+
+struct VS_INPUT // Per-vertex input consumed by the scene vertex shader VS
+{
+ float3 Pos : POSITION; // position; VS widens it to float4 with w = 1 before transforming
+ float3 Norm : NORMAL; // normal; VS multiplies it by World
+ float2 Tex : TEXCOORD0; // texture coordinate; copied to the output unchanged
+};
+
+struct PS_INPUT // Output of VS and input of the scene pixel shader PS
+{
+ float4 Pos : SV_POSITION; // position after the World, View and Projection multiplies
+ float3 Norm : TEXCOORD0; // normal after the World multiply; carried in a TEXCOORD slot
+ float2 Tex : TEXCOORD1; // texture coordinate used to sample g_txDiffuse
+};
+
+struct QUADVS_INPUT // Vertex input shared by QuadVS and ScreenQuadVS
+{
+ float4 Pos : POSITION; // already a float4, unlike VS_INPUT.Pos
+ float2 Tex : TEXCOORD0; // texture coordinate; both quad vertex shaders forward it unchanged
+};
+
+struct QUADVS_OUTPUT // Output of QuadVS / ScreenQuadVS and input of QuadPS
+{
+ float4 Pos : SV_POSITION; // Transformed position (ScreenQuadVS forwards the input position untransformed)
+ float2 Tex : TEXCOORD0; // texture coordinate sampled by QuadPS
+};
+
+//--------------------------------------------------------------------------------------
+// Blending States
+//--------------------------------------------------------------------------------------
+BlendState NoBlending // Blending switched off for render target 0
+{
+ BlendEnable[0] = FALSE;
+};
+
+BlendState SrcAlphaBlendingAdd // Additive blend with the source colour weighted by source alpha
+{
+ BlendEnable[0] = TRUE;
+ SrcBlend = SRC_ALPHA; // source colour factor: source alpha
+ DestBlend = ONE; // destination colour factor: 1
+ BlendOp = ADD; // colour: the two weighted terms are added
+ SrcBlendAlpha = ZERO; // both alpha factors are zero ...
+ DestBlendAlpha = ZERO;
+ BlendOpAlpha = ADD; // ... so the blended alpha is 0 + 0
+ RenderTargetWriteMask[0] = 0x0F; // low four bits set: all four colour channels are written
+};
+
+BlendState SrcAlphaBlendingSub // Same factors as SrcAlphaBlendingAdd, but the colour terms are combined with SUBTRACT
+{
+ BlendEnable[0] = TRUE;
+ SrcBlend = SRC_ALPHA; // source colour factor: source alpha
+ DestBlend = ONE; // destination colour factor: 1
+ BlendOp = SUBTRACT; // the only setting that differs from SrcAlphaBlendingAdd
+ SrcBlendAlpha = ZERO; // both alpha factors are zero ...
+ DestBlendAlpha = ZERO;
+ BlendOpAlpha = ADD; // ... and alpha still uses ADD, giving 0 + 0
+ RenderTargetWriteMask[0] = 0x0F; // low four bits set: all four colour channels are written
+};
+
+BlendState SrcColorBlendingAdd // Additive blend with the source colour weighted by itself
+{
+ BlendEnable[0] = TRUE;
+ SrcBlend = SRC_COLOR; // source colour factor: the source colour
+ DestBlend = ONE; // destination colour factor: 1
+ BlendOp = ADD; // colour: the two weighted terms are added
+ SrcBlendAlpha = ZERO; // both alpha factors are zero ...
+ DestBlendAlpha = ZERO;
+ BlendOpAlpha = ADD; // ... so the blended alpha is 0 + 0
+ RenderTargetWriteMask[0] = 0x0F; // low four bits set: all four colour channels are written
+};
+
+BlendState SrcColorBlendingSub // Same factors as SrcColorBlendingAdd, but the colour terms are combined with SUBTRACT
+{
+ BlendEnable[0] = TRUE;
+ SrcBlend = SRC_COLOR; // source colour factor: the source colour
+ DestBlend = ONE; // destination colour factor: 1
+ BlendOp = SUBTRACT; // the only setting that differs from SrcColorBlendingAdd
+ SrcBlendAlpha = ZERO; // both alpha factors are zero ...
+ DestBlendAlpha = ZERO;
+ BlendOpAlpha = ADD; // ... and alpha still uses ADD, giving 0 + 0
+ RenderTargetWriteMask[0] = 0x0F; // low four bits set: all four colour channels are written
+};
+
+//--------------------------------------------------------------------------------------
+// Depth/Stencil States
+//--------------------------------------------------------------------------------------
+DepthStencilState RenderWithStencilState // Depth test off, stencil test on, write masks zero for both buffers
+{
+ DepthEnable = false;
+ DepthWriteMask = ZERO;
+ DepthFunc = Less; // NOTE(review): presumably irrelevant while DepthEnable is false - confirm
+
+ // Setup stencil states
+ StencilEnable = true;
+ StencilReadMask = 0xFF; // all eight stencil bits take part in the comparison
+ StencilWriteMask = 0x00; // no stencil bit may be modified
+ // Front and back faces use identical rules; the reference value comes from SetDepthStencilState.
+ FrontFaceStencilFunc = Not_Equal;
+ FrontFaceStencilPass = Keep;
+ FrontFaceStencilFail = Zero; // NOTE(review): with a write mask of 0x00 this presumably never changes the buffer
+
+ BackFaceStencilFunc = Not_Equal;
+ BackFaceStencilPass = Keep;
+ BackFaceStencilFail = Zero;
+};
+
+
+
+//--------------------------------------------------------------------------------------
+// Scene Vertex Shader
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+    // Position: widen to float4 (w = 1), then apply World, View and Projection in turn.
+    output.Pos = mul( float4(input.Pos,1), World );
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    // Normal: only the upper-left 3x3 of World applies to a float3; the cast makes that truncation explicit.
+    output.Norm = mul( input.Norm, (float3x3)World );
+    output.Tex = input.Tex; // texture coordinate passes through unchanged
+    return output;
+}
+
+//-----------------------------------------------------------------------------
+// Quad Vertex Shaders
+//-----------------------------------------------------------------------------
+QUADVS_OUTPUT QuadVS( QUADVS_INPUT Input )
+{
+    // World-space quad: the corner position goes through World, View and Projection in turn.
+    QUADVS_OUTPUT Output;
+    const float4 posView = mul( mul( Input.Pos, World ), View );
+    Output.Pos = mul( posView, Projection );
+    Output.Tex = Input.Tex; // texture coordinate passes through untouched
+    return Output;
+}
+
+QUADVS_OUTPUT ScreenQuadVS( QUADVS_INPUT Input )
+{
+    // Pass-through: the incoming position is emitted as-is, with no
+    // World/View/Projection multiply, alongside the texture coordinate.
+    QUADVS_OUTPUT Output = { Input.Pos, Input.Tex };
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    // Clamped N.L diffuse term; the light colour is implicitly white.
+    const float nDotL = saturate( dot( input.Norm, vLightDir ) );
+    const float4 texel = g_txDiffuse.Sample( samLinear, input.Tex );
+    // Only RGB is attenuated by the lighting term; alpha is always written as 1.
+    return float4( texel.rgb * nDotL, 1 );
+}
+
+//--------------------------------------------------------------------------------------
+// Quad Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 QuadPS( QUADVS_OUTPUT input) : SV_Target // Unlit pixel shader used by the quad and stencil techniques
+{
+ return g_txDiffuse.Sample( samLinear, input.Tex ); // sampled colour is returned as-is, alpha included
+}
+
+
+//--------------------------------------------------------------------------------------
+// Scene Techniques
+//--------------------------------------------------------------------------------------
+technique11 RenderScene // Draws the lit, textured scene geometry in one pass
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VS() ) ); // scene vertex shader, shader model 4.0
+ SetGeometryShader( NULL ); // no geometry shader stage
+ SetPixelShader( CompileShader( ps_4_0, PS() ) ); // diffuse-lit texture lookup
+ SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF ); // blending off; this pass sets no depth-stencil state
+ }
+}
+
+//--------------------------------------------------------------------------------------
+// RenderWithStencil - set the depth stencil state inside of the technique
+//--------------------------------------------------------------------------------------
+technique11 RenderWithStencil // Draws an untransformed textured quad, gated by the stencil test
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0, ScreenQuadVS() ) ); // positions are forwarded without any matrix multiply
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, QuadPS() ) ); // unlit texture lookup
+ // Blending off; stencil reference 0, which RenderWithStencilState compares with Not_Equal.
+ SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ SetDepthStencilState( RenderWithStencilState, 0 );
+ }
+}
+
+//--------------------------------------------------------------------------------------
+// Quad Techniques: Alpha blending state is set inside the technique
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSolid // World-transformed textured quad drawn with blending disabled
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0, QuadVS() ) ); // applies World, View and Projection
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, QuadPS() ) ); // unlit texture lookup
+ // The blend state is the only thing that differs between the RenderQuad* techniques.
+ SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcAlphaAdd // Same shaders as RenderQuadSolid, blended with SrcAlphaBlendingAdd
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );
+ // Source colour times source alpha, added to the destination colour.
+ SetBlendState( SrcAlphaBlendingAdd, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcAlphaSub // Same shaders as RenderQuadSolid, blended with SrcAlphaBlendingSub
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );
+ // Source-alpha weighting with the SUBTRACT colour operator.
+ SetBlendState( SrcAlphaBlendingSub, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcColorAdd // Same shaders as RenderQuadSolid, blended with SrcColorBlendingAdd
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );
+ // Source colour times itself, added to the destination colour.
+ SetBlendState( SrcColorBlendingAdd, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcColorSub // Same shaders as RenderQuadSolid, blended with SrcColorBlendingSub
+{
+ pass P0
+ {
+ SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );
+ // Source-colour weighting with the SUBTRACT colour operator.
+ SetBlendState( SrcColorBlendingSub, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ }
+}
+
+
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h
new file mode 100644
index 000000000..b44251829
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h
@@ -0,0 +1,84 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_LightPSH.h
+//
+// The pixel shader light header file for the DynamicShaderLinkage11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseLight // Contract implemented by every light class; each method returns a float3 contribution
+{
+ float3 IlluminateAmbient(float3 vNormal);
+ // Diffuse contribution for a surface with the given normal.
+ float3 IlluminateDiffuse(float3 vNormal);
+ // Specular contribution; PSMain passes the material's GetSpecularPower() as specularPower.
+ float3 IlluminateSpecular(float3 vNormal, int specularPower );
+
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cAmbientLight : iBaseLight
+{
+    float3 m_vLightColor; // light colour, also read by the derived light classes
+    bool m_bEnable;       // multiplied into each contribution, so false yields zero
+
+    float3 IlluminateAmbient(float3 vNormal); // body is in DynamicShaderLinkage11_PSBuffers.h
+    // A purely ambient light contributes no diffuse term ...
+    float3 IlluminateDiffuse(float3 vNormal)
+    {
+        return float3( 0.0f, 0.0f, 0.0f );
+    }
+    // ... and no specular term; derived classes override what they need.
+    float3 IlluminateSpecular(float3 vNormal, int specularPower )
+    {
+        return float3( 0.0f, 0.0f, 0.0f );
+    }
+};
+
+class cHemiAmbientLight : cAmbientLight // Hemispheric ambient: blends a ground colour and a sky colour by normal direction
+{
+ // inherited float3 m_vLightColor is the SkyColor
+ float4 m_vGroundColor; // colour selected when the normal opposes m_vDirUp
+ float4 m_vDirUp; // direction the normal is compared against
+ // Body is in DynamicShaderLinkage11_PSBuffers.h.
+ float3 IlluminateAmbient(float3 vNormal);
+
+};
+
+class cDirectionalLight : cAmbientLight // Directional light providing diffuse and specular terms
+{
+ // inherited float3 m_vLightColor is the LightColor
+ float4 m_vLightDir; // direction dotted with the normal (diffuse) and added to the reversed eye direction (specular)
+ // Both bodies are in DynamicShaderLinkage11_PSBuffers.h.
+ float3 IlluminateDiffuse( float3 vNormal );
+
+ float3 IlluminateSpecular( float3 vNormal, int specularPower );
+
+};
+
+class cOmniLight : cAmbientLight // Omni light; its IlluminateDiffuse body currently returns zero (marked TO DO)
+{
+ float3 m_vLightPosition; // not read by the current IlluminateDiffuse body
+ float radius; // not read by the current IlluminateDiffuse body
+
+ float3 IlluminateDiffuse( float3 vNormal );
+
+};
+
+class cSpotLight : cAmbientLight // Declares data only; every Illuminate* method is inherited from cAmbientLight
+{
+ float3 m_vLightPosition;
+ float3 m_vLightDir;
+};
+
+class cEnvironmentLight : cAmbientLight // Environment-map light; overrides only the specular term
+{
+ float3 IlluminateSpecular( float3 vNormal, int specularPower ); // body is in DynamicShaderLinkage11_PSBuffers.h
+};
+
+
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h
new file mode 100644
index 000000000..7f6bc3d22
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h
@@ -0,0 +1,103 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_MaterialPSH.h
+//
+// The pixel shader material header file for the DynamicShaderLinkage11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseMaterial // Contract implemented by every material class
+{
+ float3 GetAmbientColor(float2 vTexcoord);
+ // PSMain multiplies the ambient colour by the ambient light term and the diffuse colour by the direct light term.
+ float3 GetDiffuseColor(float2 vTexcoord);
+ // PSMain forwards this value to IlluminateSpecular as specularPower.
+ int GetSpecularPower();
+
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cBaseMaterial : iBaseMaterial
+{
+    float3 m_vColor;  // flat colour reported for both ambient and diffuse
+    int m_iSpecPower; // value reported by GetSpecularPower
+
+    // Untextured baseline material: vTexcoord is accepted to satisfy
+    // iBaseMaterial but is never read by these implementations.
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return m_vColor;
+    }
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return m_vColor;
+    }
+
+    int GetSpecularPower()
+    {
+        return m_iSpecPower;
+    }
+};
+
+class cPlasticMaterial : cBaseMaterial // Adds nothing of its own: inherits cBaseMaterial's members and methods unchanged
+{
+
+};
+
+class cPlasticTexturedMaterial : cPlasticMaterial // Overrides both colour queries; GetSpecularPower is inherited
+{
+ float3 GetAmbientColor(float2 vTexcoord);
+ // Both bodies are in DynamicShaderLinkage11_PSBuffers.h and multiply m_vColor by a g_txDiffuse sample.
+ float3 GetDiffuseColor(float2 vTexcoord);
+
+};
+
+class cPlasticLightingOnlyMaterial : cBaseMaterial
+{
+    // Both colour queries report plain white, ignoring m_vColor and vTexcoord.
+    // GetSpecularPower is inherited from cBaseMaterial.
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return float3( 1.0f, 1.0f, 1.0f );
+    }
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return float3( 1.0f, 1.0f, 1.0f );
+    }
+};
+
+class cRoughMaterial : cBaseMaterial
+{
+    // Re-declares GetSpecularPower with a body that reads the same inherited
+    // m_iSpecPower member as the cBaseMaterial version.
+    // NOTE(review): the override looks redundant - confirm before removing it.
+    int GetSpecularPower() { return m_iSpecPower; }
+};
+
+class cRoughTexturedMaterial : cRoughMaterial // Overrides both colour queries; GetSpecularPower comes from cRoughMaterial
+{
+ float3 GetAmbientColor(float2 vTexcoord);
+ // Both bodies are in DynamicShaderLinkage11_PSBuffers.h and multiply m_vColor by a g_txDiffuse sample.
+ float3 GetDiffuseColor(float2 vTexcoord);
+
+};
+
+
+class cRoughLightingOnlyMaterial : cRoughMaterial
+{
+    // Both colour queries report plain white, ignoring m_vColor and vTexcoord.
+    // GetSpecularPower is inherited from cRoughMaterial.
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return float3( 1.0f, 1.0f, 1.0f );
+    }
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return float3( 1.0f, 1.0f, 1.0f );
+    }
+};
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl
new file mode 100644
index 000000000..c3ee93057
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl
@@ -0,0 +1,84 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_PS.hlsl
+//
+// The pixel shader file for the DynamicShaderLinkage11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Header Includes
+//--------------------------------------------------------------------------------------
+#include "DynamicShaderLinkage11_PSBuffers.h"
+
+// Default settings for the statically permuted build (applied only when STATIC_PERMUTE is defined)
+#if defined( STATIC_PERMUTE )
+ #define HEMI_AMBIENT //CONST_AMBIENT //HEMI_AMBIENT
+ #define TEXTURE_ENABLE // selects g_plasticTexturedMaterial further down
+ #define SPECULAR_ENABLE // NOTE(review): not tested in this file or the headers it includes - confirm it is still needed
+#endif
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct PS_INPUT // Interpolated inputs of PSMain
+{
+ float4 vPosition : SV_POSITION;
+ float3 vNormal : NORMAL; // passed to every Illuminate* call
+ float2 vTexcoord : TEXCOORD0; // passed to the material colour queries
+ float4 vMatrix : TEXCOORD1; // not read by PSMain
+};
+
+//--------------------------------------------------------------------------------------
+// Abstract Interface Instances for dynamic linkage / permutation
+//--------------------------------------------------------------------------------------
+#if !defined( STATIC_PERMUTE )
+ iBaseLight g_abstractAmbientLighting; // supplies PSMain's ambient light term
+ iBaseLight g_abstractDirectLighting; // supplies a diffuse and a specular term
+ iBaseLight g_abstractEnvironmentLighting; // supplies a second specular term
+ iBaseMaterial g_abstractMaterial; // supplies the colours and the specular power
+#else
+//--------------------------------------------------------------------------------------
+// Concrete Instances for STATIC_PERMUTE - the same names become macros for cbuffer instances
+//--------------------------------------------------------------------------------------
+ #if defined( HEMI_AMBIENT )
+ #define g_abstractAmbientLighting g_hemiAmbientLight
+ #else
+ // CONST_AMBIENT
+ #define g_abstractAmbientLighting g_ambientLight
+ #endif
+ #define g_abstractDirectLighting g_directionalLight
+ #define g_abstractEnvironmentLighting g_environmentLight
+ #if defined( TEXTURE_ENABLE )
+ #define g_abstractMaterial g_plasticTexturedMaterial
+ #else
+ #define g_abstractMaterial g_plasticMaterial
+ #endif
+#endif
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PSMain( PS_INPUT Input ) : SV_TARGET
+{
+    // Ambient: material ambient colour modulated by the ambient light.
+    const float3 vAmbient = g_abstractMaterial.GetAmbientColor( Input.vTexcoord )
+                          * g_abstractAmbientLighting.IlluminateAmbient( Input.vNormal );
+
+    // Diffuse: material diffuse colour modulated by the direct light.
+    const float3 vDiffuse = g_abstractMaterial.GetDiffuseColor( Input.vTexcoord )
+                          * g_abstractDirectLighting.IlluminateDiffuse( Input.vNormal );
+
+    // Specular: the direct light first, then the environment light, both
+    // driven by the specular power reported by the material.
+    const int iSpecPower = g_abstractMaterial.GetSpecularPower();
+    const float3 vSpecDirect = g_abstractDirectLighting.IlluminateSpecular( Input.vNormal, iSpecPower );
+    const float3 vSpecEnv = g_abstractEnvironmentLighting.IlluminateSpecular( Input.vNormal, iSpecPower );
+
+    // Sum the three terms, clamp each channel to [0,1] and emit an opaque colour.
+    const float3 vLit = saturate( vAmbient + vDiffuse + ( vSpecDirect + vSpecEnv ) );
+
+    return float4( vLit, 1.0f );
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h
new file mode 100644
index 000000000..e2263b832
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h
@@ -0,0 +1,129 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_PSBuffers.h
+//
+// Constant buffers, textures and class method bodies for the DynamicShaderLinkage11 pixel shader.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "DynamicShaderLinkage11_LightPSH.h"
+#include "DynamicShaderLinkage11_MaterialPSH.h"
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerFrame : register( b0 ) // Light class instances plus the eye direction, bound to slot b0
+{
+ cAmbientLight g_ambientLight; // ambient choice when HEMI_AMBIENT is not defined (static build)
+ cHemiAmbientLight g_hemiAmbientLight; // ambient choice when HEMI_AMBIENT is defined (static build)
+ cDirectionalLight g_directionalLight;
+ cEnvironmentLight g_environmentLight;
+ float4 g_vEyeDir; // read by the two IlluminateSpecular bodies below
+};
+
+cbuffer cbPerPrimitive : register( b1 ) // One instance of each concrete material class, bound to slot b1
+{
+ cPlasticMaterial g_plasticMaterial; // static-build material when TEXTURE_ENABLE is not defined
+ cPlasticTexturedMaterial g_plasticTexturedMaterial; // static-build material when TEXTURE_ENABLE is defined
+ cPlasticLightingOnlyMaterial g_plasticLightingOnlyMaterial;
+ cRoughMaterial g_roughMaterial;
+ cRoughTexturedMaterial g_roughTexturedMaterial;
+ cRoughLightingOnlyMaterial g_roughLightingOnlyMaterial;
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse : register( t0 ); // sampled by the textured material colour queries
+Texture2D g_txNormalMap : register( t1 ); // not sampled anywhere in this file
+TextureCube g_txEnvironmentMap : register( t2 ); // sampled by cEnvironmentLight::IlluminateSpecular
+
+SamplerState g_samLinear : register( s0 ); // the only sampler; used for every lookup in this file
+
+//--------------------------------------------------------------------------------------
+// Lighting Class Methods
+//--------------------------------------------------------------------------------------
+// Ambient Lighting Class Methods
+float3 cAmbientLight::IlluminateAmbient(float3 vNormal) // constant ambient term; vNormal is not used
+{
+    return m_vLightColor * m_bEnable; // float3 built directly, so no float4 is implicitly truncated on return
+}
+
+float3 cHemiAmbientLight::IlluminateAmbient(float3 vNormal)
+{
+    // Blend weight (dot + 1) / 2: for unit vectors 0 selects the ground colour, 1 the sky colour.
+    const float fBlend = ( dot( vNormal, m_vDirUp.xyz ) + 1.0f ) / 2.0f;
+    return lerp( m_vGroundColor.xyz, m_vLightColor, fBlend ) * m_bEnable; // explicit .xyz on the float4 members
+}
+
+// Directional Light class
+float3 cDirectionalLight::IlluminateDiffuse( float3 vNormal )
+{
+    const float fLambert = saturate( dot( vNormal, m_vLightDir.xyz ) ); // explicit .xyz: m_vLightDir is a float4
+    return fLambert * m_vLightColor * m_bEnable; // clamped N.L times the light colour; zero when disabled
+}
+
+float3 cDirectionalLight::IlluminateSpecular( float3 vNormal, int specularPower )
+{
+    // Half vector: the float4 sum is narrowed with an explicit .xyz, then normalized.
+    // NOTE(review): normalize( g_vEyeDir ) includes w in its length - confirm g_vEyeDir.w is 0.
+    const float3 vHalf = normalize( ( -normalize( g_vEyeDir ) + m_vLightDir ).xyz );
+    const float fSpec = pow( max( 0, dot( vHalf, normalize( vNormal ) ) ), specularPower );
+    return fSpec * m_vLightColor * m_bEnable;
+}
+
+// Omni Light Class
+float3 cOmniLight::IlluminateDiffuse( float3 vNormal ) // placeholder: no omni lighting is computed yet
+{
+    return float3( 0.0f, 0.0f, 0.0f ); // TO DO!
+}
+
+// Environment Lighting
+float3 cEnvironmentLight::IlluminateSpecular( float3 vNormal, int specularPower )
+{
+    // Reflect the eye direction about the normal and look the result up in the
+    // environment cube map. specularPower is not used by this light.
+    const float3 N = normalize( vNormal );
+    const float3 E = normalize( g_vEyeDir ).xyz; // float4 normalize as before, narrowed explicitly
+    const float3 R = reflect( E, N );
+
+    // Cheap Fresnel falloff: ( 1 - dot( -E, N ) ) cubed.
+    const float fFacing = 1 - dot( -E, N );
+    const float fFresnel = fFacing * fFacing * fFacing;
+
+    const float3 vEnv = g_txEnvironmentMap.Sample( g_samLinear, R ).rgb * fFresnel; // alpha dropped explicitly
+    return vEnv * (float3)m_bEnable;
+}
+
+//--------------------------------------------------------------------------------------
+// Material Class Methods
+//--------------------------------------------------------------------------------------
+// Plastic Material Methods
+float3 cPlasticTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{
+    // Material colour modulated by the diffuse texel; alpha is dropped with an explicit swizzle.
+    const float3 vTexel = g_txDiffuse.Sample( g_samLinear, vTexcoord ).rgb;
+    return m_vColor * vTexel;
+}
+
+float3 cPlasticTexturedMaterial::GetDiffuseColor(float2 vTexcoord)
+{
+    // Material colour modulated by the diffuse texel; alpha is dropped with an explicit swizzle.
+    const float3 vTexel = g_txDiffuse.Sample( g_samLinear, vTexcoord ).rgb;
+    return m_vColor * vTexel;
+}
+
+// Rough Material Methods
+float3 cRoughTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{
+    // Material colour modulated by the diffuse texel; alpha is dropped with an explicit swizzle.
+    const float3 vTexel = g_txDiffuse.Sample( g_samLinear, vTexcoord ).rgb;
+    return m_vColor * vTexel;
+}
+
+float3 cRoughTexturedMaterial::GetDiffuseColor(float2 vTexcoord)
+{
+    // Material colour modulated by the diffuse texel; alpha is dropped with an explicit swizzle.
+    const float3 vTexel = g_txDiffuse.Sample( g_samLinear, vTexcoord ).rgb;
+    return m_vColor * vTexel;
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl
new file mode 100644
index 000000000..800dbf3b3
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl
@@ -0,0 +1,66 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_VS.hlsl
+//
+// The vertex shader file for the DynamicShaderLinkage11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 ) // Per-object matrices with explicit register placement
+{
+ float4x4 g_mWorldViewProjection : packoffset( c0 ); // multiplied with the input position in VSMain
+ float4x4 g_mWorld : packoffset( c4 ); // starts at c4; its 3x3 part transforms the normal in VSMain
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT // Per-vertex input of VSMain
+{
+ float4 vPosition : POSITION;
+ float3 vNormal : NORMAL; // packed normal; VSMain decodes it with R10G10B10A2_UNORM_TO_R32G32B32_FLOAT
+ float2 vTexcoord : TEXCOORD0; // copied to VS_OUTPUT.vTexcoord0 unchanged
+};
+
+struct VS_OUTPUT // Output of VSMain
+{
+ float4 vPosition : SV_POSITION; // input position multiplied by g_mWorldViewProjection
+ float3 vNormal : NORMAL; // decoded normal multiplied by the 3x3 part of g_mWorld
+ float2 vTexcoord0 : TEXCOORD0;
+ float4 vMatrix : TEXCOORD1; // DEBUG
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+// Signed vectors were aliased onto an unsigned format, so the signed values
+// have to be recovered: double the input, then subtract 2 from every component
+// that reaches 1. The values 1.0 and 2.0 are slightly inaccurate here.
+float3 R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( in float3 vVec )
+{
+    const float3 vScaled = vVec * 2.0f;
+    return vScaled >= 1.0f ? ( vScaled - 2.0f ) : vScaled;
+}
+
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+    // Every member of Output is assigned below, so it needs no initializer.
+    VS_OUTPUT Output;
+
+    // Clip-space position in a single multiply by the combined matrix.
+    Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+
+    // Decode the packed normal, then transform it by the 3x3 part of the world matrix.
+    const float3 vDecodedNormal = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( Input.vNormal );
+    Output.vNormal = mul( vDecodedNormal, (float3x3)g_mWorld );
+
+    Output.vTexcoord0 = Input.vTexcoord;
+
+    Output.vMatrix = g_mWorld[0]; // DEBUG: first row of the world matrix
+    return Output;
+}
+
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx
new file mode 100644
index 000000000..c72b98843
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx
@@ -0,0 +1,192 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11.fx
+//
+// The effect file for the DynamicShaderLinkageFX11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "DynamicShaderLinkageFX11_VS.hlsl"
+#include "DynamicShaderLinkageFX11_PS.hlsl"
+
+//
+// Settings for static permutations.
+// All of the pre-5.0 targets need static specialization
+// since they don't support late binding. The below
+// just selects a single specialization but you could
+// create any number of them, each one representing
+// a new shader with the interfaces compiled out
+// due to the compile-time class references.
+//
+
+#define StaticMaterial g_plasticTexturedMaterial // material argument for the shader model 4.x techniques
+#define StaticAmbientLight g_ambientLight // ambient light argument
+#define StaticDirectLight g_directionalLight // direct light argument
+#define StaticEnvironmentLight g_environmentLight // environment light argument
+
+technique11 FeatureLevel10 // Statically specialized variant compiled for vs_4_0 / ps_4_0
+{
+ pass
+ {
+ SetRasterizerState(g_rasterizerState[g_fillMode]); // both names are declared outside this file's visible text
+ SetVertexShader(CompileShader(vs_4_0,
+ VSMain()));
+ SetPixelShader(CompileShader(ps_4_0,
+ PSMainUniform(StaticAmbientLight, // the Static* macros expand to concrete class instances
+ StaticDirectLight,
+ StaticEnvironmentLight,
+ StaticMaterial)));
+ }
+}
+
+technique11 FeatureLevel10_1 // Same static specialization as FeatureLevel10, compiled for vs_4_1 / ps_4_1
+{
+ pass
+ {
+ SetRasterizerState(g_rasterizerState[g_fillMode]); // both names are declared outside this file's visible text
+ SetVertexShader(CompileShader(vs_4_1,
+ VSMain()));
+ SetPixelShader(CompileShader(ps_4_1,
+ PSMainUniform(StaticAmbientLight, // the Static* macros expand to concrete class instances
+ StaticDirectLight,
+ StaticEnvironmentLight,
+ StaticMaterial)));
+ }
+}
+
+//
+// Variables for dynamic shader linkage.
+// There are two variations here for dynamic usage.
+// In the first we use the uniform entry point
+// and pass in global interface variables. This
+// creates a shader which refers to the global
+// interface variables when running and we can bind
+// concrete instances in our C++ code by using
+// ID3DX11EffectInterfaceVariable::SetClassInstance.
+// This approach works well when you have several
+// independent variations and want to bind them
+// individually in your C++ code, such as the
+// different lighting and material parameters in
+// this sample.
+//
+
+iBaseLight g_abstractAmbientLighting; // interface variable: ambient light slot
+iBaseLight g_abstractDirectLighting; // interface variable: direct light slot
+iBaseLight g_abstractEnvironmentLighting; // interface variable: environment light slot
+iBaseMaterial g_abstractMaterial; // interface variable: material slot (bound concretely by the FeatureLevel11_g_* techniques)
+
+technique11 FeatureLevel11 // Dynamic-linkage variant: the uniform entry point receives the global interface variables
+{
+ pass
+ {
+ SetRasterizerState(g_rasterizerState[g_fillMode]); // both names are declared outside this file's visible text
+ SetVertexShader(CompileShader(vs_5_0,
+ VSMain()));
+ SetPixelShader(CompileShader(ps_5_0,
+ PSMainUniform(g_abstractAmbientLighting, // same parameter order as the static techniques: ambient, direct, environment, material
+ g_abstractDirectLighting,
+ g_abstractEnvironmentLighting,
+ g_abstractMaterial)));
+ }
+}
+
+//
+// In this second variation we use the non-uniform
+// entry point so that we don't have to specify
+// any interfaces when compiling the shader. We
+// then reuse the compiled shader with different
+// BindInterfaces calls so that all bindings are
+// handled automatically by the effect runtime.
+// Below we have multiple techniques where
+// we've given a concrete binding for the material.
+// Lighting parameters are left as interfaces for
+// binding via effect variables, but could also
+// be specified concretely if the number of variations
+// is manageable.
+// This approach works well for a small number of variations
+// that are known in advance, as you can just list them
+// in your effect and you don't need to do the
+// binding work explicitly in your C++ code.
+//
+
+VertexShader g_NonUniVS = CompileShader(vs_5_0, VSMain()); // compiled once and shared by every FeatureLevel11_g_* technique
+PixelShader g_NonUniPS = CompileShader(ps_5_0, PSMainNonUniform()); // non-uniform entry point; interfaces are bound per technique via BindInterfaces
+
+technique11 FeatureLevel11_g_plasticMaterial // Shared non-uniform shaders with the material bound to g_plasticMaterial
+{
+ pass
+ {
+ SetVertexShader(g_NonUniVS); // NOTE(review): unlike FeatureLevel11, this pass sets no rasterizer state - confirm intended
+ SetPixelShader(BindInterfaces(g_NonUniPS,
+ g_abstractAmbientLighting, // the three light slots stay abstract interface variables
+ g_abstractDirectLighting,
+ g_abstractEnvironmentLighting,
+ g_plasticMaterial)); // concrete instance bound as the material
+ }
+}
+
+technique11 FeatureLevel11_g_plasticTexturedMaterial // Shared non-uniform shaders with the material bound to g_plasticTexturedMaterial
+{
+ pass
+ {
+ SetVertexShader(g_NonUniVS); // NOTE(review): unlike FeatureLevel11, this pass sets no rasterizer state - confirm intended
+ SetPixelShader(BindInterfaces(g_NonUniPS,
+ g_abstractAmbientLighting, // the three light slots stay abstract interface variables
+ g_abstractDirectLighting,
+ g_abstractEnvironmentLighting,
+ g_plasticTexturedMaterial)); // concrete instance bound as the material
+ }
+}
+
+technique11 FeatureLevel11_g_plasticLightingOnlyMaterial // Shared non-uniform shaders with the material bound to g_plasticLightingOnlyMaterial
+{
+ pass
+ {
+ SetVertexShader(g_NonUniVS); // NOTE(review): unlike FeatureLevel11, this pass sets no rasterizer state - confirm intended
+ SetPixelShader(BindInterfaces(g_NonUniPS,
+ g_abstractAmbientLighting, // the three light slots stay abstract interface variables
+ g_abstractDirectLighting,
+ g_abstractEnvironmentLighting,
+ g_plasticLightingOnlyMaterial)); // concrete instance bound as the material
+ }
+}
+
+technique11 FeatureLevel11_g_roughMaterial // Shared non-uniform shaders with the material bound to g_roughMaterial
+{
+ pass
+ {
+ SetVertexShader(g_NonUniVS); // NOTE(review): unlike FeatureLevel11, this pass sets no rasterizer state - confirm intended
+ SetPixelShader(BindInterfaces(g_NonUniPS,
+ g_abstractAmbientLighting, // the three light slots stay abstract interface variables
+ g_abstractDirectLighting,
+ g_abstractEnvironmentLighting,
+ g_roughMaterial)); // concrete instance bound as the material
+ }
+}
+
+technique11 FeatureLevel11_g_roughTexturedMaterial // Shared non-uniform shaders with the material bound to g_roughTexturedMaterial
+{
+ pass
+ {
+ SetVertexShader(g_NonUniVS); // NOTE(review): unlike FeatureLevel11, this pass sets no rasterizer state - confirm intended
+ SetPixelShader(BindInterfaces(g_NonUniPS,
+ g_abstractAmbientLighting, // the three light slots stay abstract interface variables
+ g_abstractDirectLighting,
+ g_abstractEnvironmentLighting,
+ g_roughTexturedMaterial)); // concrete instance bound as the material
+ }
+}
+
+technique11 FeatureLevel11_g_roughLightingOnlyMaterial // Shared non-uniform shaders with the material bound to g_roughLightingOnlyMaterial
+{
+ pass
+ {
+ SetVertexShader(g_NonUniVS); // NOTE(review): unlike FeatureLevel11, this pass sets no rasterizer state - confirm intended
+ SetPixelShader(BindInterfaces(g_NonUniPS,
+ g_abstractAmbientLighting, // the three light slots stay abstract interface variables
+ g_abstractDirectLighting,
+ g_abstractEnvironmentLighting,
+ g_roughLightingOnlyMaterial)); // concrete instance bound as the material
+ }
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h
new file mode 100644
index 000000000..6f9a0f4d8
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h
@@ -0,0 +1,82 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_LightPSH.h
+//
+// The pixel shader light header file for the DynamicShaderLinkageFX11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseLight // Contract implemented by every light class; each method returns a float3 contribution
+{
+ float3 IlluminateAmbient(float3 vNormal);
+ // Diffuse contribution for a surface with the given normal.
+ float3 IlluminateDiffuse(float3 vNormal);
+ // Specular contribution; specularPower is supplied by the caller.
+ float3 IlluminateSpecular(float3 vNormal, int specularPower );
+
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cAmbientLight : iBaseLight
+{
+    float3 m_vLightColor; // light colour, inherited by the derived light classes
+    bool m_bEnable;       // enable flag, inherited by the derived light classes
+
+    float3 IlluminateAmbient(float3 vNormal); // declared only; the body is not in this header
+    // A purely ambient light contributes no diffuse term ...
+    float3 IlluminateDiffuse(float3 vNormal)
+    {
+        return float3( 0.0f, 0.0f, 0.0f );
+    }
+    // ... and no specular term; derived classes override what they need.
+    float3 IlluminateSpecular(float3 vNormal, int specularPower )
+    {
+        return float3( 0.0f, 0.0f, 0.0f );
+    }
+};
+
+class cHemiAmbientLight : cAmbientLight // Ambient light with a ground colour and an up direction next to the inherited sky colour
+{
+ // inherited float3 m_vLightColor is the SkyColor
+ float4 m_vGroundColor;
+ float4 m_vDirUp;
+ // Overrides the ambient term; the body is not defined in this header.
+ float3 IlluminateAmbient(float3 vNormal);
+
+};
+
+class cDirectionalLight : cAmbientLight // Directional light overriding the diffuse and specular terms
+{
+ // inherited float3 m_vLightColor is the LightColor
+ float4 m_vLightDir;
+ // Both overrides are declared only; their bodies are not in this header.
+ float3 IlluminateDiffuse( float3 vNormal );
+
+ float3 IlluminateSpecular( float3 vNormal, int specularPower );
+
+};
+
+class cOmniLight : cAmbientLight // Omni light overriding only the diffuse term
+{
+ float3 m_vLightPosition;
+ float radius;
+ // Declared only; the body is not in this header.
+ float3 IlluminateDiffuse( float3 vNormal );
+
+};
+
+class cSpotLight : cAmbientLight // Declares data only; every Illuminate* method is inherited from cAmbientLight
+{
+ float3 m_vLightPosition;
+ float3 m_vLightDir;
+};
+
+class cEnvironmentLight : cAmbientLight // Overrides only the specular term
+{
+ float3 IlluminateSpecular( float3 vNormal, int specularPower ); // declared only; the body is not in this header
+};
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h
new file mode 100644
index 000000000..cd54a283d
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h
@@ -0,0 +1,103 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_MaterialPSH.h
+//
+// The pixel shader material header file for the DynamicShaderLinkageFX11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseMaterial
+{
+ // Color multiplied into the ambient lighting term.
+ float3 GetAmbientColor(float2 vTexcoord);
+ // Color multiplied into the diffuse lighting term.
+ float3 GetDiffuseColor(float2 vTexcoord);
+ // Exponent handed to the lights' IlluminateSpecular.
+ int GetSpecularPower();
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cBaseMaterial : iBaseMaterial
+{
+    float3 m_vColor;       // flat material color
+    int    m_iSpecPower;   // specular exponent
+    // Ambient color: the flat color; the texture coordinate is ignored.
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return m_vColor;
+    }
+    // Diffuse color: the same flat color (m_vColor is already a float3, so no cast).
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return m_vColor;
+    }
+    // Specular exponent stored on the material.
+    int GetSpecularPower()
+    {
+        return m_iSpecPower;
+    }
+
+};
+
+class cPlasticMaterial : cBaseMaterial
+{
+ // adds no members or overrides: behaves like cBaseMaterial under a distinct class name
+};
+
+class cPlasticTexturedMaterial : cPlasticMaterial
+{
+ // Both overrides are defined out of line and multiply m_vColor by a g_txDiffuse sample.
+ float3 GetAmbientColor(float2 vTexcoord);
+
+ float3 GetDiffuseColor(float2 vTexcoord);
+};
+
+class cPlasticLightingOnlyMaterial : cBaseMaterial
+{
+    // Lighting-only variant: both color queries return white, so the products
+    // formed with the light terms equal the light terms themselves.
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return float3(1.0f, 1.0f, 1.0f);
+    }
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return float3(1.0f, 1.0f, 1.0f);
+    }
+};
+
+class cRoughMaterial : cBaseMaterial
+{
+ int GetSpecularPower() // NOTE(review): body matches cBaseMaterial::GetSpecularPower, so this override looks redundant
+ {
+ return m_iSpecPower;
+ }
+};
+
+class cRoughTexturedMaterial : cRoughMaterial
+{
+ // Both overrides are defined out of line and multiply m_vColor by a g_txDiffuse sample.
+ float3 GetAmbientColor(float2 vTexcoord);
+
+ float3 GetDiffuseColor(float2 vTexcoord);
+};
+
+
+class cRoughLightingOnlyMaterial : cRoughMaterial
+{
+    // Lighting-only variant: both color queries return white, so the products
+    // formed with the light terms equal the light terms themselves.
+    float3 GetAmbientColor(float2 vTexcoord)
+    {
+        return float3(1.0f, 1.0f, 1.0f);
+    }
+    float3 GetDiffuseColor(float2 vTexcoord)
+    {
+        return float3(1.0f, 1.0f, 1.0f);
+    }
+};
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h
new file mode 100644
index 000000000..3b4c528be
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h
@@ -0,0 +1,152 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_PSBuffers.h
+//
+// Constant buffers, resources and class method definitions for the DynamicShaderLinkageFX11 pixel shaders.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "DynamicShaderLinkageFX11_LightPSH.h"
+#include "DynamicShaderLinkageFX11_MaterialPSH.h"
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerFrame : register( b0 ) // instances of four light classes (no cOmniLight or cSpotLight) plus the eye direction
+{
+ cAmbientLight g_ambientLight;
+ cHemiAmbientLight g_hemiAmbientLight;
+ cDirectionalLight g_directionalLight;
+ cEnvironmentLight g_environmentLight;
+ float4 g_vEyeDir; // only .xyz is read, by the directional and environment IlluminateSpecular bodies
+};
+
+cbuffer cbPerPrimitive : register( b1 ) // one instance of every material class except cBaseMaterial
+{
+ cPlasticMaterial g_plasticMaterial;
+ cPlasticTexturedMaterial g_plasticTexturedMaterial;
+ cPlasticLightingOnlyMaterial g_plasticLightingOnlyMaterial;
+ cRoughMaterial g_roughMaterial;
+ cRoughTexturedMaterial g_roughTexturedMaterial;
+ cRoughLightingOnlyMaterial g_roughLightingOnlyMaterial;
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse : register( t0 ); // sampled by the textured material classes
+Texture2D g_txNormalMap : register( t1 ); // not sampled by any method body in this header
+TextureCube g_txEnvironmentMap : register( t2 ); // sampled by cEnvironmentLight::IlluminateSpecular
+
+SamplerState g_samLinear : register( s0 ) // used by every Sample call in this header
+{
+ Filter = MIN_MAG_MIP_LINEAR;
+ AddressU = WRAP;
+ AddressV = WRAP;
+ AddressW = WRAP;
+};
+
+//--------------------------------------------------------------------------------------
+// Rasterization State
+//--------------------------------------------------------------------------------------
+uint g_fillMode = 0; // NOTE(review): presumably an index into g_rasterizerState - confirm against the effect file
+
+RasterizerState g_rasterizerState[2]
+{
+{ // [0]: solid fill
+ FillMode = SOLID;
+ MultisampleEnable = true;
+},
+{ // [1]: wireframe fill
+ FillMode = WIREFRAME;
+ MultisampleEnable = true;
+}
+};
+
+//--------------------------------------------------------------------------------------
+// Lighting Class Methods
+//--------------------------------------------------------------------------------------
+// Ambient light: the light color, zeroed when the light is disabled (vNormal is unused).
+float3 cAmbientLight::IlluminateAmbient(float3 vNormal)
+{
+    return m_bEnable * m_vLightColor;
+}
+
+float3 cHemiAmbientLight::IlluminateAmbient(float3 vNormal)
+{
+    // For unit vectors this remaps dot(normal, up) from [-1,1] to [0,1]: 0 picks the ground color, 1 the sky color.
+    float skyWeight = (dot( vNormal, m_vDirUp.xyz ) + 1.0f) * 0.5f;
+    return lerp( m_vGroundColor.xyz, m_vLightColor, skyWeight ) * m_bEnable;
+}
+
+// Directional light: clamped dot(normal, light direction), scaled by color and enable flag.
+float3 cDirectionalLight::IlluminateDiffuse( float3 vNormal )
+{
+    float3 nDotL = (float3)saturate( dot( vNormal, m_vLightDir.xyz ) );
+    return ( nDotL * m_vLightColor * m_bEnable );
+}
+
+float3 cDirectionalLight::IlluminateSpecular( float3 vNormal, int specularPower )
+{
+    // Half vector: the negated, normalized eye direction plus the light direction, renormalized.
+    float3 negEye = -normalize( g_vEyeDir.xyz );
+    float3 halfVec = normalize( negEye + m_vLightDir.xyz );
+    float nDotH = max( 0, dot( halfVec, normalize( vNormal ) ) );  // the clamp keeps pow's base non-negative
+    return ( (float3)pow( nDotH, specularPower ) * m_vLightColor * m_bEnable );
+}
+
+// Omni light: the diffuse term is not implemented yet and contributes nothing.
+float3 cOmniLight::IlluminateDiffuse( float3 vNormal )
+{
+    return float3( 0.0f, 0.0f, 0.0f ); // TO DO!
+}
+
+// Environment light: cube-map reflection weighted by a cheap fresnel falloff.
+float3 cEnvironmentLight::IlluminateSpecular( float3 vNormal, int specularPower )
+{
+    // specularPower is not used by this light.
+    float3 unitNormal = normalize( vNormal );
+    float3 unitEye = normalize( g_vEyeDir.xyz );
+
+    // Cheap fresnel falloff: (1 - dot(-E, N)) cubed.
+    float falloff = 1 - dot( -unitEye, unitNormal );
+    falloff = ( falloff * falloff * falloff );
+
+    // Look up the environment along the eye vector reflected about the normal.
+    float3 reflected = reflect( unitEye, unitNormal );
+    float3 envColor = g_txEnvironmentMap.Sample( g_samLinear, reflected ).xyz * falloff;
+    return ( envColor * (float3)m_bEnable );
+}
+
+//--------------------------------------------------------------------------------------
+// Material Class Methods
+//--------------------------------------------------------------------------------------
+// Plastic material: the ambient color is m_vColor multiplied by the diffuse texture.
+float3 cPlasticTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{
+    // Only the RGB channels of the sample are used; alpha is dropped.
+    float3 texel = g_txDiffuse.Sample( g_samLinear, vTexcoord ).xyz;
+    return m_vColor * texel;
+}
+
+float3 cPlasticTexturedMaterial::GetDiffuseColor(float2 vTexcoord)
+{
+    // Flat color multiplied by the RGB of the diffuse texture sample.
+    float3 texel = g_txDiffuse.Sample( g_samLinear, vTexcoord ).xyz;
+    return m_vColor * texel;
+}
+
+// Rough material: the ambient color is m_vColor multiplied by the diffuse texture.
+float3 cRoughTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{
+    // Only the RGB channels of the sample are used; alpha is dropped.
+    float3 texel = g_txDiffuse.Sample( g_samLinear, vTexcoord ).xyz;
+    return m_vColor * texel;
+}
+
+float3 cRoughTexturedMaterial::GetDiffuseColor(float2 vTexcoord)
+{
+    // Flat color multiplied by the RGB of the diffuse texture sample.
+    float3 texel = g_txDiffuse.Sample( g_samLinear, vTexcoord ).xyz;
+    return m_vColor * texel;
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl
new file mode 100644
index 000000000..55d206259
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl
@@ -0,0 +1,113 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_ps.hlsl
+//
+// The pixel shader file for the DynamicShaderLinkageFX11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Header Includes
+//--------------------------------------------------------------------------------------
+#include "DynamicShaderLinkageFX11_PSBuffers.h"
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct PS_INPUT
+{
+ float4 vPosition : SV_POSITION;
+ float3 vNormal : NORMAL; // passed to every Illuminate* call in PSMainWorker
+ float2 vTexcoord : TEXCOORD0; // passed to the material color queries
+ float4 vMatrix : TEXCOORD1; // not read by PSMainWorker
+};
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+
+// This pixel shader uses several interfaces during its
+// work. We show three different ways of providing interface
+// bindings for the PS and those have two different
+// entry points so we've separated the base PS code
+// into a worker routine that's called by the entry
+// points. Normally only one technique would be used
+// and this layering of entry point and worker would
+// not be necessary.
+float4 PSMainWorker( iBaseLight ambientLighting,
+                     iBaseLight directLighting,
+                     iBaseLight environmentLighting,
+                     iBaseMaterial material,
+                     PS_INPUT Input )
+{
+    // Shades one pixel: the material colors scale the ambient and diffuse light
+    // terms, the specular terms of the direct and environment lights are added
+    // unscaled, and the sum is clamped to [0,1] with alpha set to 1.
+
+    // Ambient term
+    float3 ambientTerm = material.GetAmbientColor( Input.vTexcoord ) * ambientLighting.IlluminateAmbient( Input.vNormal );
+
+    // Diffuse term
+    float3 diffuseTerm = material.GetDiffuseColor( Input.vTexcoord ) * directLighting.IlluminateDiffuse( Input.vNormal );
+
+    // Specular term: both lights receive the material's specular power
+    int specPower = material.GetSpecularPower();
+    float3 specularTerm = directLighting.IlluminateSpecular( Input.vNormal, specPower )
+                        + environmentLighting.IlluminateSpecular( Input.vNormal, specPower );
+
+    // Accumulate the lighting with saturation
+    float3 litColor = saturate( ambientTerm + diffuseTerm + specularTerm );
+    return float4( litColor, 1.0f );
+}
+
+// One way to provide bindings for shaders in Effects 11 is
+// to use uniform interface parameters. As with non-interface
+// uniform parameters you must specify a value for these
+// parameters in your CompileShader invocations in the effect.
+// You can provide concrete class instances if you want
+// to statically specialize your shaders, such as for targets
+// that don't support abstract interfaces; or you can provide
+// other interfaces that you bind using effect variables.
+// Both are shown in this sample's technique passes.
+float4 PSMainUniform( uniform iBaseLight ambientLighting,
+                      uniform iBaseLight directLighting,
+                      uniform iBaseLight environmentLighting,
+                      uniform iBaseMaterial material,
+                      PS_INPUT Input ) : SV_Target
+{
+    // Entry point taking the interfaces as uniform parameters. All shading
+    // lives in PSMainWorker; this wrapper forwards its arguments unchanged.
+    float4 shadedColor = PSMainWorker( ambientLighting, directLighting,
+                                       environmentLighting, material, Input );
+    return shadedColor;
+}
+
+// Another way to use Effects 11 with interfaces is
+// to have non-uniform parameters, which then are
+// bound with a BindInterfaces in a technique pass.
+// BindInterfaces gives concrete instances to use
+// with a shader but does not do static specialization,
+// it just saves information for the effect runtime
+// to use when setting up the shader to run.
+// This lets you share a single shader, compiled with
+// interface usage, while still getting the convenience
+// of declaring concrete bindings in the effect and
+// not needing explicit binding in code via effect
+// variable updates. If you have many different
+// variations it may be simpler to use bindings
+// through effect variables, as then you don't
+// need to list every possible binding set in your
+// techniques.
+float4 PSMainNonUniform( iBaseLight ambientLighting,
+                         iBaseLight directLighting,
+                         iBaseLight environmentLighting,
+                         iBaseMaterial material,
+                         PS_INPUT Input ) : SV_Target
+{
+    // Entry point taking the interfaces as non-uniform parameters. All shading
+    // lives in PSMainWorker; this wrapper forwards its arguments unchanged.
+    float4 shadedColor = PSMainWorker( ambientLighting, directLighting,
+                                       environmentLighting, material, Input );
+    return shadedColor;
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl
new file mode 100644
index 000000000..4791e5786
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl
@@ -0,0 +1,65 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_VS.hlsl
+//
+// The vertex shader file for the DynamicShaderLinkageFX11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+ float4x4 g_mWorldViewProjection : packoffset( c0 ); // starts at c0; transforms the vertex position in VSMain
+ float4x4 g_mWorld : packoffset( c4 ); // starts at c4; cast to float3x3 to transform the normal in VSMain
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+ float4 vPosition : POSITION;
+ float3 vNormal : NORMAL; // compressed; expanded by R10G10B10A2_UNORM_TO_R32G32B32_FLOAT in VSMain
+ float2 vTexcoord : TEXCOORD0;
+};
+
+struct VS_OUTPUT
+{
+ float4 vPosition : SV_POSITION;
+ float3 vNormal : NORMAL; // the expanded normal after the g_mWorld transform in VSMain
+ float2 vTexcoord0 : TEXCOORD0; // copied from VS_INPUT.vTexcoord
+ float4 vMatrix : TEXCOORD1; // DEBUG: VSMain stores g_mWorld[0] here
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+// The signed vectors were aliased as an unsigned format, so the signed values
+// have to be recovered here. The values 1.0 and 2.0 are slightly inaccurate
+// for this conversion.
+float3 R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( in float3 vVec )
+{
+    float3 doubled = vVec * 2.0f;
+    return doubled >= 1.0f ? ( doubled - 2.0f ) : doubled;
+}
+
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+    VS_OUTPUT Output;
+
+    // Transform the position by the combined world-view-projection matrix.
+    Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+
+    // Expand the compressed normal, then transform it by the upper-left 3x3 of the world matrix.
+    float3 expandedNormal = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( Input.vNormal );
+    Output.vNormal = mul( expandedNormal, (float3x3)g_mWorld );
+
+    // Texture coordinates pass through unchanged.
+    Output.vTexcoord0 = Input.vTexcoord;
+
+    Output.vMatrix = g_mWorld[0]; // DEBUG
+    return Output;
+}
diff --git a/tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx b/tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx
new file mode 100644
index 000000000..699df8655
--- /dev/null
+++ b/tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx
@@ -0,0 +1,468 @@
+//TEST_IGNORE_FILE:
+// FixedFuncEMU.fx
+// Copyright (c) 2005 Microsoft Corporation. All rights reserved.
+//
+
+struct VSSceneIn
+{
+ float3 pos : POSITION; //vertex position
+ float3 norm : NORMAL; //vertex normal (transformed by g_mWorld and normalized in VSScenemain)
+ float2 tex : TEXTURE0; //tex coords
+};
+
+struct VSSceneOut // PSSceneIn repeats these fields without planeDist
+{
+ float4 pos : SV_Position; //position
+ float2 tex : TEXTURE0; //texture coordinate
+ float3 wPos : TEXTURE1; //world space pos
+ float3 wNorm : TEXTURE2; //world space normal
+ float4 colorD : COLOR0; //color for gouraud and flat shading
+ float4 colorS : COLOR1; //color for specular
+ float fogDist : FOGDISTANCE; //distance used for fog calculations
+ float3 planeDist : SV_ClipDistance0; //clip distance for 3 planes (one component per g_clipplanes entry)
+};
+
+struct PSSceneIn // same fields as VSSceneOut minus planeDist
+{
+ float4 pos : SV_Position; //position
+ float2 tex : TEXTURE0; //texture coordinate
+ float3 wPos : TEXTURE1; //world space pos
+ float3 wNorm : TEXTURE2; //world space normal
+ float4 colorD : COLOR0; //color for gouraud and flat shading
+ float4 colorS : COLOR1; //color for specular
+ float fogDist : FOGDISTANCE; //distance used for fog calculations
+};
+
+struct Light
+{
+ float4 Position; // CalcLighting reads .xyz and subtracts the world position from it
+ float4 Diffuse;
+ float4 Specular;
+ float4 Ambient; // added once per light, without attenuation
+ float4 Atten; // .x, .y, .z weight 1, distance and distance squared in the attenuation divisor; .w is multiplied by 0
+};
+
+#define FOGMODE_NONE 0 // fog mode selectors stored in g_fogMode
+#define FOGMODE_LINEAR 1
+#define FOGMODE_EXP 2
+#define FOGMODE_EXP2 3
+#define E 2.71828 // Euler's number truncated to six significant digits
+
+cbuffer cbLights
+{
+ float4 g_clipplanes[3]; // one plane per component of VSSceneOut.planeDist
+ Light g_lights[8]; // CalcLighting always iterates all 8 entries
+};
+
+cbuffer cbPerFrame
+{
+ float4x4 g_mWorld;
+ float4x4 g_mView;
+ float4x4 g_mProj;
+ float4x4 g_mInvProj; // not referenced by any shader in this file
+ float4x4 g_mLightViewProj; // transforms world positions for the g_txProjected lookup in PSScenemain
+};
+
+cbuffer cbPerTechnique
+{
+ bool g_bEnableLighting = true; // gates the CalcLighting call in VSScenemain and GSPointmain
+ bool g_bEnableClipping = true; // gates the clip-plane distances in VSScenemain
+ bool g_bPointScaleEnable = false; // gates the distance-based point size in GSPointmain
+ float g_pointScaleA; // constant term of the point-scale divisor
+ float g_pointScaleB; // multiplies the eye distance
+ float g_pointScaleC; // multiplies the squared eye distance
+ float g_pointSize;
+
+ //fog params
+ int g_fogMode = FOGMODE_NONE;
+ float g_fogStart;
+ float g_fogEnd;
+ float g_fogDensity;
+ float4 g_fogColor;
+};
+
+cbuffer cbPerViewChange
+{
+ //viewport params
+ float g_viewportHeight; // read by VSScreenSpacemain and GSPointmain
+ float g_viewportWidth; // read by VSScreenSpacemain
+ float g_nearPlane; // divides the camera-space z when sizing point quads in GSPointmain
+};
+
+cbuffer cbImmutable
+{
+ float3 g_positions[4] = // quad corner offsets, appended in this order as one strip by GSPointmain
+ {
+ float3( -0.5, 0.5, 0 ),
+ float3( 0.5, 0.5, 0 ),
+ float3( -0.5, -0.5, 0 ),
+ float3( 0.5, -0.5, 0 ),
+ };
+};
+
+Texture2D g_txDiffuse; // base texture sampled by both pixel shaders
+Texture2D g_txProjected; // projected texture ("cookie") sampled by PSScenemain
+SamplerState g_samLinear // used by every Sample call in this file
+{
+ Filter = MIN_MAG_MIP_LINEAR;
+ AddressU = Clamp;
+ AddressV = Clamp;
+};
+
+DepthStencilState DisableDepth // set only by the RenderScreenSpaceAlphaTest technique
+{
+ DepthEnable = FALSE;
+ DepthWriteMask = ZERO;
+};
+
+DepthStencilState EnableDepth // set by every other technique in this file
+{
+ DepthEnable = TRUE;
+ DepthWriteMask = ALL;
+};
+
+struct ColorsOutput // result pair returned by CalcLighting
+{
+ float4 Diffuse; // summed diffuse plus ambient over all lights
+ float4 Specular; // summed specular over all lights
+};
+
+ColorsOutput CalcLighting( float3 worldNormal, float3 worldPos, float3 cameraPos )
+{
+    ColorsOutput output = (ColorsOutput)0.0; //accumulates over all 8 entries of g_lights
+    for(int i=0; i<8; i++)
+    {
+        float3 toLight = g_lights[i].Position.xyz - worldPos;
+        float lightDist = length( toLight );
+        //Atten.xyz weight the constant, linear and quadratic distance terms
+        float fAtten = 1.0/dot( g_lights[i].Atten, float4(1,lightDist,lightDist*lightDist,0) );
+        float3 lightDir = normalize( toLight );
+        float3 halfAngle = normalize( normalize(-cameraPos) + lightDir );
+        //pow() of a negative base is NaN, so clamp N.H before raising it
+        float nDotH = max( 0, dot( halfAngle, worldNormal ) );
+        output.Diffuse += max(0,dot( lightDir, worldNormal ) * g_lights[i].Diffuse * fAtten) + g_lights[i].Ambient;
+        output.Specular += max(0,pow( nDotH, 64 ) * g_lights[i].Specular * fAtten );
+    }
+    return output;
+}
+
+//
+// VS for emulating fixed function pipeline: transforms the vertex, writes the three
+// clip-plane distances and, when enabled, computes per-vertex gouraud lighting.
+VSSceneOut VSScenemain(VSSceneIn input)
+{
+    VSSceneOut output = (VSSceneOut)0.0;
+
+    //output our final position in clipspace
+    float4 worldPos = mul( float4( input.pos, 1 ), g_mWorld );
+    float4 cameraPos = mul( worldPos, g_mView ); //Save cameraPos for fog calculations
+    output.pos = mul( cameraPos, g_mProj );
+
+    //save world pos for later (explicit .xyz here and below replaces implicit float4->float3 truncation)
+    output.wPos = worldPos.xyz;
+
+    //save the fog distance for later
+    output.fogDist = cameraPos.z;
+
+    //find our clipping planes (fixed function clipping is done in world space)
+    if( g_bEnableClipping )
+    {
+        worldPos.w = 1;
+
+        //calc the distance from the 3 clipping planes
+        output.planeDist.x = dot( worldPos, g_clipplanes[0] );
+        output.planeDist.y = dot( worldPos, g_clipplanes[1] );
+        output.planeDist.z = dot( worldPos, g_clipplanes[2] );
+    }
+    else
+    {
+        //clipping disabled: the same constant positive distance for all three planes
+        output.planeDist = float3( 1, 1, 1 );
+    }
+
+    //do gouraud lighting
+    if( g_bEnableLighting )
+    {
+        float3 worldNormal = normalize( mul( input.norm, (float3x3)g_mWorld ) );
+        output.wNorm = worldNormal;
+        ColorsOutput cOut = CalcLighting( worldNormal, worldPos.xyz, cameraPos.xyz );
+        output.colorD = cOut.Diffuse;
+        output.colorS = cOut.Specular;
+    }
+    else
+    {
+        //unlit: white diffuse; colorS and wNorm keep the zero from the initializer
+        output.colorD = float4(1,1,1,1);
+    }
+
+    //propagate texture coordinate
+    output.tex = input.tex;
+
+    return output;
+}
+
+//
+// VS for rendering in screen space
+//
+PSSceneIn VSScreenSpacemain(VSSceneIn input)
+{
+    PSSceneIn output = (PSSceneIn)0.0;
+
+    // Map x from [0, g_viewportWidth] to [-1, 1] and y from [0, g_viewportHeight]
+    // to [1, -1]; z is passed through and w is fixed at 1.
+    float2 halfViewport = float2( g_viewportWidth, g_viewportHeight ) / 2.0;
+    float2 scaled = input.pos.xy / halfViewport;
+    output.pos = float4( scaled.x - 1, -scaled.y + 1, input.pos.z, 1 );
+
+    // Texture coordinate passes through; the diffuse color is white.
+    output.tex = input.tex;
+    output.colorD = float4(1,1,1,1);
+
+    return output;
+}
+
+//
+// GS for flat shaded rendering: lights the triangle once, at its center and with its
+// face normal, and writes that single color pair to all three vertices.
+//
+[maxvertexcount(3)]
+void GSFlatmain( triangle VSSceneOut input[3], inout TriangleStream<VSSceneOut> FlatTriStream )
+{
+    //
+    // Calculate the face normal from the two edges that share vertex 0
+    //
+    float3 faceEdgeA = input[1].wPos - input[0].wPos;
+    float3 faceEdgeB = input[2].wPos - input[0].wPos;
+    // NOTE(review): a degenerate triangle gives a zero-length cross product, which
+    // normalize() cannot turn into a valid direction.
+    float3 worldNormal = normalize( cross( faceEdgeA, faceEdgeB ) );
+
+    //
+    //calculate the face center
+    //
+    float3 faceCenter = (input[0].wPos + input[1].wPos + input[2].wPos)/3.0;
+
+    //find world pos and camera pos
+    float4 worldPos = float4( faceCenter, 1 );
+    float4 cameraPos = mul( worldPos, g_mView );
+
+    //do shading (CalcLighting takes float3 positions, so swizzle explicitly)
+    ColorsOutput cOut = CalcLighting( worldNormal, worldPos.xyz, cameraPos.xyz );
+
+    //
+    // Emit the triangle with the shared colors; every other attribute is copied
+    // from the corresponding input vertex.
+    //
+    VSSceneOut output;
+    for(int i=0; i<3; i++)
+    {
+        output = input[i];
+        output.colorD = cOut.Diffuse;
+        output.colorS = cOut.Specular;
+
+        FlatTriStream.Append( output );
+    }
+    FlatTriStream.RestartStrip();
+}
+
+//
+// GS for point rendering: replaces each triangle vertex with a 4-vertex strip
+// (two triangles) built around that vertex.
+[maxvertexcount(12)]
+void GSPointmain( triangle VSSceneOut input[3], inout TriangleStream<VSSceneOut> PointTriStream )
+{
+    VSSceneOut output;
+
+    //
+    // Calculate the point size (the X size reuses the height-based Y size)
+    //
+    float fSizeY = (g_pointSize/g_viewportHeight)/4.0;
+    float fSizeX = fSizeY;
+
+    for(int i=0; i<3; i++)
+    {
+        output = input[i]; //copies every attribute, including tex
+
+        //find world pos and camera pos
+        float4 worldPos = float4(input[i].wPos,1);
+        float4 cameraPos = mul( worldPos, g_mView );
+
+        //find our size
+        if( g_bPointScaleEnable )
+        {
+            float dEye = length( cameraPos.xyz );
+            fSizeX = fSizeY = g_viewportHeight * g_pointSize *
+                sqrt( 1.0f/( g_pointScaleA + g_pointScaleB*dEye + g_pointScaleC*(dEye*dEye) ) );
+        }
+
+        //do shading
+        if(g_bEnableLighting)
+        {
+            //explicit .xyz: CalcLighting takes float3 positions
+            ColorsOutput cOut = CalcLighting( input[i].wNorm, worldPos.xyz, cameraPos.xyz );
+
+            output.colorD = cOut.Diffuse;
+            output.colorS = cOut.Specular;
+        }
+        else
+        {
+            output.colorD = float4(1,1,1,1);
+        }
+
+        //the projected center and the depth ratio are identical for all four
+        //corners, so compute them once here rather than inside the corner loop
+        float4 centerPos = mul( cameraPos, g_mProj );
+        float zoverNear = (cameraPos.z)/g_nearPlane;
+
+        //
+        // Emit two new triangles. The corner index gets its own name so that it
+        // no longer shadows the outer loop's i.
+        //
+        for(int corner=0; corner<4; corner++)
+        {
+            float4 posSize = float4( g_positions[corner].x*fSizeX*zoverNear,
+                                     g_positions[corner].y*fSizeY*zoverNear,
+                                     0,
+                                     0 );
+            output.pos = centerPos + posSize;
+
+            PointTriStream.Append(output);
+        }
+        PointTriStream.RestartStrip();
+    }
+}
+
+//
+// Calculates fog factor based upon distance d. The caller blends with it so that 1 keeps
+// the scene color and 0 gives the fog color. Stays 1 when g_fogMode matches no fog mode.
+float CalcFogFactor( float d )
+{
+    float fogCoeff = 1.0;
+
+    if( FOGMODE_LINEAR == g_fogMode )
+    {
+        fogCoeff = (g_fogEnd - d)/(g_fogEnd - g_fogStart);
+    }
+    else if( FOGMODE_EXP == g_fogMode )
+    {
+        fogCoeff = exp( -d*g_fogDensity ); //exp() uses the exact base e instead of a 6-digit constant
+    }
+    else if( FOGMODE_EXP2 == g_fogMode )
+    {
+        fogCoeff = exp( -d*d*g_fogDensity*g_fogDensity );
+    }
+
+    return clamp( fogCoeff, 0, 1 );
+}
+
+//
+// PS for rendering with clip planes
+//
+float4 PSScenemain(PSSceneIn input) : SV_Target
+{
+    //base color: diffuse texture times the interpolated diffuse color, plus specular
+    float4 litColor = g_txDiffuse.Sample( g_samLinear, input.tex ) * input.colorD + input.colorS;
+
+    //transform the world position by the light's view-projection for the cookie lookup
+    float4 lightClip = mul( float4(input.wPos,1), g_mLightViewProj );
+    //since we don't have texldp, we must perform the w divide ourselves before the texture lookup
+    float2 cookieUV = 0.5 * lightClip.xy / lightClip.w + float2( 0.5, 0.5 );
+
+    //only sample the cookie where the projected z is positive
+    float4 cookieColor = float4(0,0,0,0);
+    if( lightClip.z > 0 )
+        cookieColor = g_txProjected.Sample( g_samLinear, cookieUV );
+
+    //the cookie is added here; a standard light-modulating effect would multiply instead
+    litColor += cookieColor;
+    //blend toward the fog color as the fog factor falls from 1 to 0
+    float fogFactor = CalcFogFactor( input.fogDist );
+    return fogFactor * litColor + (1.0 - fogFactor)*g_fogColor;
+}
+
+//
+// PS for rendering with alpha test: pixels whose alpha is below 0.5 are discarded
+//
+float4 PSAlphaTestmain(PSSceneIn input) : SV_Target
+{
+    float4 texel = g_txDiffuse.Sample( g_samLinear, input.tex );
+    float4 color = texel * input.colorD;
+    clip( color.a - 0.5 ); //discards the pixel when color.a < 0.5
+    return color;
+}
+
+//
+// RenderSceneGouraud - renders gouraud-shaded primitives
+//
+technique10 RenderSceneGouraud
+{
+ pass p0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) ); // per-vertex lighting is computed here
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+
+ SetDepthStencilState( EnableDepth, 0 );
+ }
+}
+
+//
+// RenderSceneFlat - renders flat-shaded primitives
+//
+technique10 RenderSceneFlat
+{
+ pass p0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) );
+ SetGeometryShader( CompileShader( gs_4_0, GSFlatmain() ) ); // overwrites the vertex colors with one per-face result
+ SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+
+ SetDepthStencilState( EnableDepth, 0 );
+ }
+}
+
+//
+// RenderScenePoint - replaces d3dfill_point
+//
+technique10 RenderScenePoint
+{
+ pass p0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) );
+ SetGeometryShader( CompileShader( gs_4_0, GSPointmain() ) ); // emits a 4-vertex strip per input vertex
+ SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+
+ SetDepthStencilState( EnableDepth, 0 );
+ }
+}
+
+//
+// RenderScreenSpaceAlphaTest - shows how to render something in screenspace with an alpha test
+//
+technique10 RenderScreenSpaceAlphaTest
+{
+ pass p0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VSScreenSpacemain() ) );
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, PSAlphaTestmain() ) );
+
+ SetDepthStencilState( DisableDepth, 0 ); // the only technique in this file that uses DisableDepth
+ }
+}
+
+//
+// RenderTextureOnly - binds the same shaders and depth state as RenderSceneGouraud
+//
+technique10 RenderTextureOnly
+{
+ pass p0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) ); // NOTE(review): presumably run with g_bEnableLighting off - confirm in host code
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+
+ SetDepthStencilState( EnableDepth, 0 );
+ }
+}
+
diff --git a/tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl b/tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl
new file mode 100644
index 000000000..db7bd5136
--- /dev/null
+++ b/tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl
@@ -0,0 +1,75 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry BitonicSort -entry MatrixTranspose
+//--------------------------------------------------------------------------------------
+// File: ComputeShaderSort11.hlsl
+//
+// This file contains the compute shaders to perform GPU sorting using DirectX 11.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#define BITONIC_BLOCK_SIZE 512
+
+#define TRANSPOSE_BLOCK_SIZE 16
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer CB : register( b0 )
+{
+ unsigned int g_iLevel; // BitonicSort's first compare stride is g_iLevel >> 1
+ unsigned int g_iLevelMask; // ANDed with DTid.x in BitonicSort to choose the compare direction
+ unsigned int g_iWidth; // row pitch used to index Input in MatrixTranspose
+ unsigned int g_iHeight; // row pitch used to index Data in MatrixTranspose
+};
+
+//--------------------------------------------------------------------------------------
+// Structured Buffers
+//--------------------------------------------------------------------------------------
+StructuredBuffer<unsigned int> Input : register( t0 ); // read only by MatrixTranspose
+RWStructuredBuffer<unsigned int> Data : register( u0 ); // read and written by BitonicSort; written by MatrixTranspose
+
+//--------------------------------------------------------------------------------------
+// Bitonic Sort Compute Shader
+//--------------------------------------------------------------------------------------
+groupshared unsigned int shared_data[BITONIC_BLOCK_SIZE];
+// Each thread owns one element: it loads it, runs compare-exchange passes at strides g_iLevel/2 .. 1, and stores it back.
+[numthreads(BITONIC_BLOCK_SIZE, 1, 1)]
+void BitonicSort( uint3 Gid : SV_GroupID,
+ uint3 DTid : SV_DispatchThreadID,
+ uint3 GTid : SV_GroupThreadID,
+ uint GI : SV_GroupIndex )
+{
+ // Load shared data
+ shared_data[GI] = Data[DTid.x];
+ GroupMemoryBarrierWithGroupSync();
+ // (Gid and GTid are not used by this shader.)
+ // Sort the shared data
+ for (unsigned int j = g_iLevel >> 1 ; j > 0 ; j >>= 1)
+ {
+ unsigned int result = ((shared_data[GI & ~j] <= shared_data[GI | j]) == (bool)(g_iLevelMask & DTid.x))? shared_data[GI ^ j] : shared_data[GI]; // take the partner's value (index GI ^ j) or keep our own
+ GroupMemoryBarrierWithGroupSync(); // every thread has read its pair before any thread overwrites
+ shared_data[GI] = result;
+ GroupMemoryBarrierWithGroupSync(); // all writes are done before the next stride reads
+ }
+
+ // Store shared data: each thread writes its element back to the slot it loaded from
+ Data[DTid.x] = shared_data[GI];
+}
+
+//--------------------------------------------------------------------------------------
+// Matrix Transpose Compute Shader
+//--------------------------------------------------------------------------------------
+groupshared unsigned int transpose_shared_data[TRANSPOSE_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE];
+// Each group stages TRANSPOSE_BLOCK_SIZE x TRANSPOSE_BLOCK_SIZE elements of Input in group-shared memory and writes them to Data transposed.
+[numthreads(TRANSPOSE_BLOCK_SIZE, TRANSPOSE_BLOCK_SIZE, 1)]
+void MatrixTranspose( uint3 Gid : SV_GroupID,
+ uint3 DTid : SV_DispatchThreadID,
+ uint3 GTid : SV_GroupThreadID,
+ uint GI : SV_GroupIndex )
+{
+ transpose_shared_data[GI] = Input[DTid.y * g_iWidth + DTid.x]; // stage the element at column DTid.x, row DTid.y
+ GroupMemoryBarrierWithGroupSync(); // the whole tile is staged before any thread reads it
+ uint2 XY = DTid.yx - GTid.yx + GTid.xy; // swap x/y of the tile origin (DTid - GTid), keep this thread's in-tile offset
+ Data[XY.y * g_iHeight + XY.x] = transpose_shared_data[GTid.x * TRANSPOSE_BLOCK_SIZE + GTid.y]; // x/y swapped, assuming GI == GTid.y * TRANSPOSE_BLOCK_SIZE + GTid.x
+}
diff --git a/tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl b/tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl
new file mode 100644
index 000000000..26e6cdf60
--- /dev/null
+++ b/tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl
@@ -0,0 +1,529 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry BuildGridCS -entry ClearGridIndicesCS -entry BuildGridIndicesCS -entry RearrangeParticlesCS -entry DensityCS_Simple -entry DensityCS_Shared -entry DensityCS_Grid -entry ForceCS_Simple -entry ForceCS_Shared -entry ForceCS_Grid -entry IntegrateCS
+//--------------------------------------------------------------------------------------
+// File: FluidCS11.hlsl
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Smoothed Particle Hydrodynamics Algorithm Based Upon:
+// Particle-Based Fluid Simulation for Interactive Applications
+// Matthias Müller
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Optimized Grid Algorithm Based Upon:
+// Broad-Phase Collision Detection with CUDA
+// Scott Le Grand
+//--------------------------------------------------------------------------------------
+
+struct Particle
+{
+ float2 position; // advanced by IntegrateCS: position += g_fTimeStep * velocity
+ float2 velocity; // advanced by IntegrateCS: velocity += g_fTimeStep * acceleration
+};
+
+struct ParticleForces
+{
+ float2 acceleration; // written by the ForceCS_* kernels, read by IntegrateCS
+};
+
+struct ParticleDensity
+{
+ float density; // written by the DensityCS_* kernels, read by the ForceCS_* kernels
+};
+
+cbuffer cbSimulationConstants : register( b0 )
+{
+ uint g_iNumParticles; // loop bound of the N^2 kernels; wrap-around point in BuildGridIndicesCS
+ float g_fTimeStep; // integration step used by IntegrateCS
+ float g_fSmoothlen; // smoothing length h of the SPH kernels
+ float g_fPressureStiffness; // B in CalculatePressure
+ float g_fRestDensity; // rho_0 in CalculatePressure
+ float g_fDensityCoef; // fParticleMass * 315 / (64 * PI * fSmoothlen^9), see CalculateDensity
+ float g_fGradPressureCoef; // fParticleMass * -45 / (PI * fSmoothlen^6), see CalculateGradPressure
+ float g_fLapViscosityCoef; // fParticleMass * fViscosity * 45 / (PI * fSmoothlen^6), see CalculateLapVelocity
+ float g_fWallStiffness; // scales the wall push-back in IntegrateCS
+
+ float4 g_vGravity; // only .xy is read (IntegrateCS)
+ float4 g_vGridDim; // .xy scales and .zw offsets a position into cell coordinates (GridCalculateCell)
+ float3 g_vPlanes[4]; // wall planes; IntegrateCS evaluates dot(float3(position, 1), plane)
+};
+
+//--------------------------------------------------------------------------------------
+// Fluid Simulation
+//--------------------------------------------------------------------------------------
+
+#define SIMULATION_BLOCK_SIZE 256 // thread-group width of every kernel in this file; also the tile size of the *_Shared kernels
+
+//--------------------------------------------------------------------------------------
+// Structured Buffers
+//--------------------------------------------------------------------------------------
+RWStructuredBuffer<Particle> ParticlesRW : register( u0 ); // every RW view below is also bound to u0; each entry point writes through only one of them
+StructuredBuffer<Particle> ParticlesRO : register( t0 );
+
+RWStructuredBuffer<ParticleDensity> ParticlesDensityRW : register( u0 ); // output of the DensityCS_* kernels
+StructuredBuffer<ParticleDensity> ParticlesDensityRO : register( t1 ); // input of the ForceCS_* kernels
+
+RWStructuredBuffer<ParticleForces> ParticlesForcesRW : register( u0 ); // output of the ForceCS_* kernels
+StructuredBuffer<ParticleForces> ParticlesForcesRO : register( t2 ); // input of IntegrateCS
+
+RWStructuredBuffer<unsigned int> GridRW : register( u0 ); // packed key/value pairs written by BuildGridCS
+StructuredBuffer<unsigned int> GridRO : register( t3 ); // packed key/value pairs read by BuildGridIndicesCS and RearrangeParticlesCS
+
+RWStructuredBuffer<uint2> GridIndicesRW : register( u0 ); // per-cell (start, end) written by BuildGridIndicesCS
+StructuredBuffer<uint2> GridIndicesRO : register( t4 ); // per-cell (start, end) read by the *_Grid kernels
+
+
+//--------------------------------------------------------------------------------------
+// Grid Construction
+//--------------------------------------------------------------------------------------
+
+// For simplicity, this sample uses a 16-bit hash based on the grid cell and
+// a 16-bit particle ID to keep track of the particles while sorting.
+// This imposes a limit of 64K particles and a 256x256 grid.
+// You could extend the implementation to support larger scenarios by using a uint2.
+
+float2 GridCalculateCell(float2 position)
+{
+    // Scale by g_vGridDim.xy, offset by g_vGridDim.zw, then pin the result to the 0..255 cell range.
+    const float2 cell = position * g_vGridDim.xy + g_vGridDim.zw;
+    return clamp(cell, float2(0, 0), float2(255, 255));
+}
+
+unsigned int GridConstuctKey(uint2 xy)
+{
+    // Packed layout: [-----UNUSED-----][----Y---][----X---]
+    //                      16-bit        8-bit     8-bit
+    const uint2 weights = uint2(256, 1); // Y is multiplied by 256, X by 1
+    return dot(xy.yx, weights);
+}
+
+unsigned int GridConstuctKeyValuePair(uint2 xy, uint value)
+{
+    // Packed layout: [----Y---][----X---][-----VALUE------]
+    //                  8-bit     8-bit         16-bit
+    const uint3 fields = uint3(xy.yx, value);
+    const uint3 weights = uint3(256*256*256, 256*256, 1);
+    return dot(fields, weights);
+}
+
+unsigned int GridGetKey(unsigned int keyvaluepair)
+{
+    // The cell key is everything above the low 16 bits of the pair.
+    const unsigned int key = keyvaluepair >> 16;
+    return key;
+}
+
+unsigned int GridGetValue(unsigned int keyvaluepair)
+{
+    // The value is the low 16 bits of the pair.
+    const unsigned int value = keyvaluepair & 0xFFFF;
+    return value;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Build Grid
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void BuildGridCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    // One thread per particle: store a packed (cell key, particle index) pair for it.
+    const unsigned int particle = DTid.x;
+    const float2 cell = GridCalculateCell( ParticlesRO[particle].position );
+
+    GridRW[particle] = GridConstuctKeyValuePair((uint2)cell, particle);
+}
+
+
+//--------------------------------------------------------------------------------------
+// Build Grid Indices
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void ClearGridIndicesCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    // Reset this thread's (start, end) entry to the empty range.
+    const uint2 empty_range = uint2(0, 0);
+    GridIndicesRW[DTid.x] = empty_range;
+}
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void BuildGridIndicesCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    // Walks the key/value array in GridRO and records, per cell key, the half-open
+    // range [x, y) of entries carrying that key. A boundary is detected by comparing
+    // an entry's key with its neighbours, so the ranges are only meaningful when
+    // equal keys are contiguous in GridRO.
+    const unsigned int G_ID = DTid.x; // Grid ID to operate on
+    unsigned int G_ID_PREV = (G_ID == 0)? g_iNumParticles : G_ID; G_ID_PREV--;
+    unsigned int G_ID_NEXT = G_ID + 1; if (G_ID_NEXT == g_iNumParticles) { G_ID_NEXT = 0; }
+
+    unsigned int cell = GridGetKey( GridRO[G_ID] );
+    unsigned int cell_prev = GridGetKey( GridRO[G_ID_PREV] );
+    unsigned int cell_next = GridGetKey( GridRO[G_ID_NEXT] );
+
+    // The first entry always opens a cell. Relying only on the wrapped-around
+    // comparison with the last entry misses the start when every entry shares one key.
+    if (G_ID == 0 || cell != cell_prev)
+    {
+        // I'm the start of a cell
+        GridIndicesRW[cell].x = G_ID;
+    }
+
+    // Likewise the last entry (the one whose successor wrapped to 0) always closes its cell.
+    if (G_ID_NEXT == 0 || cell != cell_next)
+    {
+        // I'm the end of a cell
+        GridIndicesRW[cell].y = G_ID + 1;
+    }
+}
+
+
+//--------------------------------------------------------------------------------------
+// Rearrange Particles
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void RearrangeParticlesCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    // The value half of grid entry `slot` names the particle that is copied into `slot`.
+    const unsigned int slot = DTid.x;
+    const unsigned int source = GridGetValue( GridRO[ slot ] );
+    ParticlesRW[slot] = ParticlesRO[ source ];
+}
+
+
+//--------------------------------------------------------------------------------------
+// Density Calculation
+//--------------------------------------------------------------------------------------
+
+float CalculateDensity(float r_sq)
+{
+    // Poly6 kernel:
+    //   W_poly6(r, h) = 315 / (64 * pi * h^9) * (h^2 - r^2)^3
+    // with the constant factor folded into
+    //   g_fDensityCoef = fParticleMass * 315.0f / (64.0f * PI * fSmoothlen^9)
+    const float falloff = g_fSmoothlen * g_fSmoothlen - r_sq;
+    return g_fDensityCoef * falloff * falloff * falloff;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Simple N^2 Algorithm
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void DensityCS_Simple( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const unsigned int self = DTid.x;
+    const float2 self_pos = ParticlesRO[self].position;
+    const float radius_sq = g_fSmoothlen * g_fSmoothlen;
+
+    // Brute force: add the kernel contribution of every particle inside the smoothing radius
+    // (the particle itself is included, at distance zero).
+    float sum = 0;
+    for (uint other = 0 ; other < g_iNumParticles ; other++)
+    {
+        const float2 delta = ParticlesRO[other].position - self_pos;
+        const float dist_sq = dot(delta, delta);
+        if (dist_sq < radius_sq)
+        {
+            sum += CalculateDensity(dist_sq);
+        }
+    }
+
+    ParticlesDensityRW[self].density = sum;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Shared Memory Optimized N^2 Algorithm
+//--------------------------------------------------------------------------------------
+
+groupshared float2 density_shared_pos[SIMULATION_BLOCK_SIZE]; // one cached position per thread of the group
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void DensityCS_Shared( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+ const unsigned int P_ID = DTid.x; // particle this thread accumulates density for
+ const float h_sq = g_fSmoothlen * g_fSmoothlen;
+ float2 P_position = ParticlesRO[P_ID].position;
+
+ float density = 0;
+
+ // Calculate the density based on all neighbors, visited one tile of SIMULATION_BLOCK_SIZE particles at a time
+ [loop]
+ for (uint N_block_ID = 0 ; N_block_ID < g_iNumParticles ; N_block_ID += SIMULATION_BLOCK_SIZE)
+ {
+ // Cache a tile of particles into shared memory to increase IO efficiency
+ density_shared_pos[GI] = ParticlesRO[N_block_ID + GI].position; // NOTE(review): index is not checked against g_iNumParticles - presumably the count is a multiple of the block size; confirm
+
+ GroupMemoryBarrierWithGroupSync(); // the tile must be complete before any thread reads it
+
+ for (uint N_tile_ID = 0; N_tile_ID < SIMULATION_BLOCK_SIZE; N_tile_ID++)
+ {
+ float2 N_position = density_shared_pos[N_tile_ID];
+
+ float2 diff = N_position - P_position;
+ float r_sq = dot(diff, diff);
+ if (r_sq < h_sq)
+ {
+ density += CalculateDensity(r_sq);
+ }
+ }
+
+ GroupMemoryBarrierWithGroupSync(); // nobody may still be reading when the next tile is loaded
+ }
+
+ ParticlesDensityRW[P_ID].density = density;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Optimized Grid + Sort Algorithm
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void DensityCS_Grid( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const unsigned int self = DTid.x;
+    const float2 self_pos = ParticlesRO[self].position;
+    const float radius_sq = g_fSmoothlen * g_fSmoothlen;
+
+    // Cell window: the particle's own cell plus its 8 neighbours, clipped to the 0..255 grid
+    const int2 home = (int2)GridCalculateCell( self_pos );
+    const int2 first = int2(max(home.x - 1, 0), max(home.y - 1, 0));
+    const int2 last = int2(min(home.x + 1, 255), min(home.y + 1, 255));
+
+    float sum = 0;
+    for (int cy = first.y ; cy <= last.y ; cy++)
+    {
+        for (int cx = first.x ; cx <= last.x ; cx++)
+        {
+            // GridIndicesRO gives the [x, y) range of particle slots filed under this cell
+            const uint2 range = GridIndicesRO[GridConstuctKey(uint2(cx, cy))];
+            for (unsigned int other = range.x ; other < range.y ; other++)
+            {
+                const float2 delta = ParticlesRO[other].position - self_pos;
+                const float dist_sq = dot(delta, delta);
+                if (dist_sq < radius_sq)
+                {
+                    sum += CalculateDensity(dist_sq);
+                }
+            }
+        }
+    }
+
+    ParticlesDensityRW[self].density = sum;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Force Calculation
+//--------------------------------------------------------------------------------------
+
+float CalculatePressure(float density)
+{
+    // Pressure = B * ((rho / rho_0)^y - 1)
+    // evaluated with y = 3 and the bracketed term floored at zero.
+    const float ratio = density / g_fRestDensity;
+    return g_fPressureStiffness * max(pow(ratio, 3) - 1, 0);
+}
+
+float2 CalculateGradPressure(float r, float P_pressure, float N_pressure, float N_density, float2 diff)
+{
+    // Pressure contribution of one neighbour.
+    //   r           distance between the two particles
+    //   P_pressure  pressure at the particle being processed
+    //   N_pressure  pressure at the neighbour
+    //   N_density   density at the neighbour
+    //   diff        neighbour position minus particle position
+    const float h = g_fSmoothlen;
+    float avg_pressure = 0.5f * (N_pressure + P_pressure);
+    // Implements this equation:
+    // W_spikey(r, h) = 15 / (pi * h^6) * (h - r)^3
+    // GRAD( W_spikey(r, h) ) = -45 / (pi * h^6) * (h - r)^2
+    // g_fGradPressureCoef = fParticleMass * -45.0f / (PI * fSmoothlen^6)
+    const float scale = g_fGradPressureCoef * avg_pressure / N_density * (h - r) * (h - r);
+    // Two distinct particles can sit at exactly the same position; dividing by r == 0
+    // would yield inf * 0 = NaN. The direction is undefined there, so contribute nothing.
+    return (r > 0.0f) ? scale / r * (diff) : float2(0, 0);
+}
+
+float2 CalculateLapVelocity(float r, float2 P_velocity, float2 N_velocity, float N_density)
+{
+    // Viscosity kernel:
+    //   W_viscosity(r, h) = 15 / (2 * pi * h^3) * (-r^3 / (2 * h^3) + r^2 / h^2 + h / (2 * r) - 1)
+    //   LAPLACIAN( W_viscosity(r, h) ) = 45 / (pi * h^6) * (h - r)
+    // with the constant factor folded into
+    //   g_fLapViscosityCoef = fParticleMass * fViscosity * 45.0f / (PI * fSmoothlen^6)
+    const float2 relative_velocity = N_velocity - P_velocity;
+    const float weight = g_fLapViscosityCoef / N_density * (g_fSmoothlen - r);
+    return weight * relative_velocity;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Simple N^2 Algorithm
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void ForceCS_Simple( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const unsigned int self = DTid.x;
+    const float radius_sq = g_fSmoothlen * g_fSmoothlen;
+
+    // State of the particle this thread owns
+    const float2 self_pos = ParticlesRO[self].position;
+    const float2 self_vel = ParticlesRO[self].velocity;
+    const float self_density = ParticlesDensityRO[self].density;
+    const float self_pressure = CalculatePressure(self_density);
+
+    // Brute force: sum pressure and viscosity contributions of every other particle in range
+    float2 total = float2(0, 0);
+    for (uint other = 0 ; other < g_iNumParticles ; other++)
+    {
+        const float2 delta = ParticlesRO[other].position - self_pos;
+        const float dist_sq = dot(delta, delta);
+        if (dist_sq < radius_sq && self != other)
+        {
+            const float other_density = ParticlesDensityRO[other].density;
+            const float other_pressure = CalculatePressure(other_density);
+            const float dist = sqrt(dist_sq);
+
+            // Pressure term
+            total += CalculateGradPressure(dist, self_pressure, other_pressure, other_density, delta);
+
+            // Viscosity term
+            total += CalculateLapVelocity(dist, self_vel, ParticlesRO[other].velocity, other_density);
+        }
+    }
+
+    ParticlesForcesRW[self].acceleration = total / self_density;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Shared Memory Optimized N^2 Algorithm
+//--------------------------------------------------------------------------------------
+
+groupshared struct { float2 position; float2 velocity; float density; } force_shared_pos[SIMULATION_BLOCK_SIZE]; // one cached neighbour record per thread of the group
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void ForceCS_Shared( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+ const unsigned int P_ID = DTid.x; // Particle ID to operate on
+
+ float2 P_position = ParticlesRO[P_ID].position;
+ float2 P_velocity = ParticlesRO[P_ID].velocity;
+ float P_density = ParticlesDensityRO[P_ID].density;
+ float P_pressure = CalculatePressure(P_density);
+
+ const float h_sq = g_fSmoothlen * g_fSmoothlen;
+
+ float2 acceleration = float2(0, 0);
+
+ // Calculate the acceleration based on all neighbors, visited one tile of SIMULATION_BLOCK_SIZE particles at a time
+ [loop]
+ for (uint N_block_ID = 0 ; N_block_ID < g_iNumParticles ; N_block_ID += SIMULATION_BLOCK_SIZE)
+ {
+ // Cache a tile of particles into shared memory to increase IO efficiency
+ force_shared_pos[GI].position = ParticlesRO[N_block_ID + GI].position; // NOTE(review): index is not checked against g_iNumParticles - presumably the count is a multiple of the block size; confirm
+ force_shared_pos[GI].velocity = ParticlesRO[N_block_ID + GI].velocity;
+ force_shared_pos[GI].density = ParticlesDensityRO[N_block_ID + GI].density;
+
+ GroupMemoryBarrierWithGroupSync(); // the tile must be complete before any thread reads it
+
+ [loop]
+ for (uint N_tile_ID = 0; N_tile_ID < SIMULATION_BLOCK_SIZE; N_tile_ID++ )
+ {
+ uint N_ID = N_block_ID + N_tile_ID; // global index of the cached neighbour, needed for the self test below
+ float2 N_position = force_shared_pos[N_tile_ID].position;
+
+ float2 diff = N_position - P_position;
+ float r_sq = dot(diff, diff);
+ if (r_sq < h_sq && P_ID != N_ID)
+ {
+ float2 N_velocity = force_shared_pos[N_tile_ID].velocity;
+ float N_density = force_shared_pos[N_tile_ID].density;
+ float N_pressure = CalculatePressure(N_density);
+ float r = sqrt(r_sq);
+
+ // Pressure Term
+ acceleration += CalculateGradPressure(r, P_pressure, N_pressure, N_density, diff);
+
+ // Viscosity Term
+ acceleration += CalculateLapVelocity(r, P_velocity, N_velocity, N_density);
+ }
+ }
+
+ GroupMemoryBarrierWithGroupSync(); // nobody may still be reading when the next tile is loaded
+ }
+
+ ParticlesForcesRW[P_ID].acceleration = acceleration / P_density;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Optimized Grid + Sort Algorithm
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void ForceCS_Grid( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const unsigned int self = DTid.x;
+    const float radius_sq = g_fSmoothlen * g_fSmoothlen;
+
+    // State of the particle this thread owns
+    const float2 self_pos = ParticlesRO[self].position;
+    const float2 self_vel = ParticlesRO[self].velocity;
+    const float self_density = ParticlesDensityRO[self].density;
+    const float self_pressure = CalculatePressure(self_density);
+
+    // Cell window: the particle's own cell plus its 8 neighbours, clipped to the 0..255 grid
+    const int2 home = (int2)GridCalculateCell( self_pos );
+    const int2 first = int2(max(home.x - 1, 0), max(home.y - 1, 0));
+    const int2 last = int2(min(home.x + 1, 255), min(home.y + 1, 255));
+
+    float2 total = float2(0, 0);
+    for (int cy = first.y ; cy <= last.y ; cy++)
+    {
+        for (int cx = first.x ; cx <= last.x ; cx++)
+        {
+            // GridIndicesRO gives the [x, y) range of particle slots filed under this cell
+            const uint2 range = GridIndicesRO[GridConstuctKey(uint2(cx, cy))];
+            for (unsigned int other = range.x ; other < range.y ; other++)
+            {
+                const float2 delta = ParticlesRO[other].position - self_pos;
+                const float dist_sq = dot(delta, delta);
+                if (dist_sq < radius_sq && self != other)
+                {
+                    const float other_density = ParticlesDensityRO[other].density;
+                    const float other_pressure = CalculatePressure(other_density);
+                    const float dist = sqrt(dist_sq);
+
+                    // Pressure term
+                    total += CalculateGradPressure(dist, self_pressure, other_pressure, other_density, delta);
+
+                    // Viscosity term
+                    total += CalculateLapVelocity(dist, self_vel, ParticlesRO[other].velocity, other_density);
+                }
+            }
+        }
+    }
+
+    ParticlesForcesRW[self].acceleration = total / self_density;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Integration
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void IntegrateCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const unsigned int self = DTid.x;
+
+    float2 pos = ParticlesRO[self].position;
+    float2 vel = ParticlesRO[self].velocity;
+    float2 accel = ParticlesForcesRO[self].acceleration;
+
+    // Wall response: a negative plane distance pushes the particle back along the plane's .xy
+    [unroll]
+    for (unsigned int wall = 0 ; wall < 4 ; wall++)
+    {
+        const float3 plane = g_vPlanes[wall];
+        const float dist = dot(float3(pos, 1), plane);
+        accel += min(dist, 0) * -g_fWallStiffness * plane.xy;
+    }
+
+    // Gravity
+    accel += g_vGravity.xy;
+
+    // Step velocity first, then position with the updated velocity
+    vel += g_fTimeStep * accel;
+    pos += g_fTimeStep * vel;
+
+    // Write the new state
+    ParticlesRW[self].position = pos;
+    ParticlesRW[self].velocity = vel;
+}
diff --git a/tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl b/tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl
new file mode 100644
index 000000000..d7e24b7bc
--- /dev/null
+++ b/tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl
@@ -0,0 +1,112 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry ParticleVS -profile gs_4_0 -entry ParticleGS -profile ps_4_0 -entry ParticlePS
+//--------------------------------------------------------------------------------------
+// File: FluidRender.hlsl
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Particle Rendering
+//--------------------------------------------------------------------------------------
+
+struct Particle {
+ float2 position; // forwarded to the geometry shader by ParticleVS
+ float2 velocity; // not read by any shader in this file
+};
+
+struct ParticleDensity {
+ float density; // mapped to a colour by ParticleVS
+};
+
+StructuredBuffer<Particle> ParticlesRO : register( t0 ); // indexed by SV_VertexID in ParticleVS
+StructuredBuffer<ParticleDensity> ParticleDensityRO : register( t1 ); // indexed by SV_VertexID in ParticleVS
+
+cbuffer cbRenderConstants : register( b0 )
+{
+ matrix g_mViewProjection; // applied to each sprite corner in ParticleGS
+ float g_fParticleSize; // scales the corner offsets in ParticleGS
+};
+
+struct VSParticleOut
+{
+ float2 position : POSITION; // particle position, not yet transformed
+ float4 color : COLOR;
+};
+
+struct GSParticleOut
+{
+ float4 position : SV_Position; // corner position after g_mViewProjection
+ float4 color : COLOR;
+ float2 texcoord : TEXCOORD; // written by ParticleGS; ParticlePS does not read it
+};
+
+
+//--------------------------------------------------------------------------------------
+// Visualization Helper
+//--------------------------------------------------------------------------------------
+
+static const float4 Rainbow[5] = { // palette that VisualizeNumber interpolates across
+ float4(1, 0, 0, 1), // red
+ float4(1, 1, 0, 1), // yellow
+ float4(0, 1, 0, 1), // green
+ float4(0, 1, 1, 1), // cyan
+ float4(0, 0, 1, 1), // blue
+};
+
+float4 VisualizeNumber(float n)
+{
+    // Blend between the two palette entries that bracket n * 4.
+    const float scaled = n * 4.0f;
+    return lerp( Rainbow[ floor(scaled) ], Rainbow[ ceil(scaled) ], frac(scaled) );
+}
+
+float4 VisualizeNumber(float n, float lower, float upper)
+{
+    // Remap [lower, upper] onto [0, 1], saturate, then look the result up in the palette.
+    const float normalized = saturate( (n - lower) / (upper - lower) );
+    return VisualizeNumber( normalized );
+}
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+
+VSParticleOut ParticleVS(uint ID : SV_VertexID)
+{
+    // One vertex per particle: forward its position and colour it by density.
+    VSParticleOut result;
+    result.position = ParticlesRO[ID].position;
+    result.color = VisualizeNumber(ParticleDensityRO[ID].density, 1000.0f, 2000.0f);
+    return result;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Particle Geometry Shader
+//--------------------------------------------------------------------------------------
+
+// Corner offsets and matching texture coordinates of the sprite, in emission order.
+static const float2 g_positions[4] = { float2(-1, 1), float2(1, 1), float2(-1, -1), float2(1, -1) };
+static const float2 g_texcoords[4] = { float2(0, 1), float2(1, 1), float2(0, 0), float2(1, 0) };
+
+[maxvertexcount(4)]
+void ParticleGS(point VSParticleOut In[1], inout TriangleStream<GSParticleOut> SpriteStream)
+{
+    // Expand the incoming point into a four-vertex strip.
+    [unroll]
+    for (int corner = 0; corner < 4; corner++)
+    {
+        const float4 offset = g_fParticleSize * float4(g_positions[corner], 0, 0);
+
+        GSParticleOut vertex;
+        vertex.position = mul(float4(In[0].position, 0, 1) + offset, g_mViewProjection);
+        vertex.color = In[0].color;
+        vertex.texcoord = g_texcoords[corner];
+        SpriteStream.Append(vertex);
+    }
+    SpriteStream.RestartStrip();
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+
+float4 ParticlePS(GSParticleOut In) : SV_Target
+{
+    // Flat shading: output the interpolated vertex colour unchanged.
+    const float4 shaded = In.color;
+    return shaded;
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl
new file mode 100644
index 000000000..87bad46ed
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl
@@ -0,0 +1,64 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//--------------------------------------------------------------------------------------
+// File: BrightPassAndHorizFilterCS.hlsl
+//
+// The CS for bright pass and horizontal blur, used in CS path of
+// HDRToneMappingCS11 sample
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+static const float MIDDLE_GRAY = 0.72f; // numerator of the exposure scale in CSMain
+static const float LUM_WHITE = 1.5f; // divisor in the tone-mapping step of CSMain
+static const float BRIGHT_THRESHOLD = 0.5f; // subtracted from the colour by the bright pass
+
+Texture2D Input : register( t0 ); // source image, read with Load
+StructuredBuffer<float> lum : register( t1 ); // only element 0 is read
+RWStructuredBuffer<float4> Result : register( u0 ); // output, indexed as x + y * g_outputwidth
+
+cbuffer cb0
+{
+ float4 g_avSampleWeights[15]; // one weight per blur tap (2 * kernelhalf + 1 taps)
+ uint g_outputwidth; // output row pitch and horizontal write limit
+ float g_inverse; // multiplied with lum[0] to obtain fLum
+ int2 g_inputsize; // input texture size, used to clamp Load coordinates
+}
+
+#define kernelhalf 7 // blur taps on each side of the centre sample
+#define groupthreads 128 // threads per group; 2 * kernelhalf of them only provide border samples
+groupshared float4 temp[groupthreads]; // one bright-passed sample per thread
+
+[numthreads( groupthreads, 1, 1 )]
+void CSMain( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex )
+{
+ int2 coord = int2( GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x, Gid.y ); // output pixel of this thread; groups advance by groupthreads - 2 * kernelhalf, so neighbouring groups overlap by the blur border
+ coord = coord.xy * 8 + int2(4, 3); // input texel sampled for that output pixel
+ coord = clamp( coord, int2(0, 0), int2(g_inputsize.x-1, g_inputsize.y-1) ); // keep border threads inside the texture
+ float4 vColor = Input.Load( int3(coord, 0) );
+
+ float fLum = lum[0]*g_inverse;
+
+ // Bright pass and tone mapping
+ vColor = max( 0.0f, vColor - BRIGHT_THRESHOLD );
+ vColor *= MIDDLE_GRAY / (fLum + 0.001f);
+ vColor *= (1.0f + vColor/LUM_WHITE);
+ vColor /= (1.0f + vColor);
+
+ temp[GI] = vColor; // share the processed sample with the rest of the group
+
+ GroupMemoryBarrierWithGroupSync(); // all samples must be stored before the blur reads its neighbours
+
+ // Horizontal blur: only threads whose full kernel footprint lies inside temp[] and whose output column is in range write a result
+ if ( GI >= kernelhalf &&
+ GI < (groupthreads - kernelhalf) &&
+ ( (Gid.x * (groupthreads - 2 * kernelhalf) + GI - kernelhalf) < g_outputwidth) )
+ {
+ float4 vOut = 0;
+
+ [unroll]
+ for ( int i = -kernelhalf; i <= kernelhalf; ++i )
+ vOut += temp[GI + i] * g_avSampleWeights[i + kernelhalf];
+
+ Result[GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x + Gid.y * g_outputwidth] = float4(vOut.rgb, 1.0f); // alpha is forced to 1
+ }
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl
new file mode 100644
index 000000000..d2d9611ce
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl
@@ -0,0 +1,29 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PSDump
+//--------------------------------------------------------------------------------------
+// File: DumpToTexture.hlsl
+//
+// The PS for converting CS output buffer to a texture, used in CS path of
+// HDRToneMappingCS11 sample
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+StructuredBuffer<float4> buffer : register( t0 ); // one float4 per pixel, indexed as x + y * g_param.x by PSDump
+
+struct QuadVS_Output
+{
+ float4 Pos : SV_POSITION; // PSDump derives the buffer index from Pos.xy
+ float2 Tex : TEXCOORD0; // not read by PSDump
+};
+
+cbuffer cbPS : register( b0 )
+{
+ uint4 g_param; // only .x is read: the number of buffer elements per row
+};
+
+float4 PSDump( QuadVS_Output Input ) : SV_TARGET
+{
+    // The buffer offset comes straight from the screen-space coordinates in Input.Pos:
+    // subtracting 0.5 from each gives the column and row, and g_param.x is the row pitch.
+    const float column = Input.Pos.x - 0.5;
+    const float row = Input.Pos.y - 0.5;
+    return buffer[ column + row * g_param.x ];
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl
new file mode 100644
index 000000000..09c91669a
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl
@@ -0,0 +1,73 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSVerticalFilter -entry CSHorizFilter
+//--------------------------------------------------------------------------------------
+// File: FilterCS.hlsl
+//
+// The CSs for doing vertical and horizontal blur, used in CS path of
+// HDRToneMappingCS11 sample
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+StructuredBuffer<float4> InputBuf : register( t0 ); // source of CSVerticalFilter, indexed as x + y * g_inputsize.x
+Texture2D InputTex : register( t1 ); // source of CSHorizFilter, read with Load
+RWStructuredBuffer<float4> Result : register( u0 ); // output of both filters, indexed as x + y * g_outputsize.x
+
+cbuffer cb0
+{
+ float4 g_avSampleWeights[15]; // one weight per blur tap (2 * kernelhalf + 1 taps)
+ int2 g_outputsize; // output extent; .x is also the output row pitch
+ int2 g_inputsize; // input extent, used for clamping and as the InputBuf row pitch
+}
+
+#define kernelhalf 7 // blur taps on each side of the centre sample
+#define groupthreads 128 // threads per group; 2 * kernelhalf of them only provide border samples
+groupshared float4 temp[groupthreads]; // one input sample per thread
+
+[numthreads( groupthreads, 1, 1 )]
+void CSVerticalFilter( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex )
+{
+ int offsety = GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.y; // row handled by this thread; groups advance by groupthreads - 2 * kernelhalf rows
+ offsety = clamp( offsety, 0, g_inputsize.y-1 ); // border threads re-read the edge row
+ int offset = Gid.x + offsety * g_inputsize.x; // Gid.x selects the column
+ temp[GI] = InputBuf[offset];
+
+ GroupMemoryBarrierWithGroupSync(); // the whole column segment must be stored before neighbours are read
+
+ // Vertical blur: only threads whose full kernel footprint lies inside temp[] and whose output row is in range write a result
+ if ( GI >= kernelhalf &&
+ GI < (groupthreads - kernelhalf) &&
+ ( (GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.y) < g_outputsize.y) )
+ {
+ float4 vOut = 0;
+
+ [unroll]
+ for ( int i = -kernelhalf; i <= kernelhalf; ++i )
+ vOut += temp[GI + i] * g_avSampleWeights[i + kernelhalf];
+
+ Result[Gid.x + (GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.y) * g_outputsize.x] = float4(vOut.rgb, 1.0f); // alpha is forced to 1
+ }
+}
+
+[numthreads( groupthreads, 1, 1 )]
+void CSHorizFilter( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex )
+{
+ int2 coord = int2( GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x, Gid.y ); // pixel handled by this thread; groups advance by groupthreads - 2 * kernelhalf columns
+ coord = clamp( coord, int2(0, 0), int2(g_inputsize.x-1, g_inputsize.y-1) ); // border threads re-read the edge texel
+ temp[GI] = InputTex.Load( int3(coord, 0) );
+
+ GroupMemoryBarrierWithGroupSync(); // the whole row segment must be stored before neighbours are read
+
+ // Horizontal blur: only threads whose full kernel footprint lies inside temp[] and whose output column is in range write a result
+ if ( GI >= kernelhalf &&
+ GI < (groupthreads - kernelhalf) &&
+ ( (Gid.x * (groupthreads - 2 * kernelhalf) + GI - kernelhalf) < g_outputsize.x) )
+ {
+ float4 vOut = 0;
+
+ [unroll]
+ for ( int i = -kernelhalf; i <= kernelhalf; ++i )
+ vOut += temp[GI + i] * g_avSampleWeights[i + kernelhalf];
+
+ Result[GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x + Gid.y * g_outputsize.x] = float4(vOut.rgb, 1.0f); // alpha is forced to 1
+ }
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl
new file mode 100644
index 000000000..a4673c237
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl
@@ -0,0 +1,79 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry QuadVS -profile ps_4_0 -entry PSFinalPass -entry PSFinalPassForCPUReduction
+//--------------------------------------------------------------------------------------
+// File: FinalPass.hlsl
+//
+// The PSs for doing tone-mapping based on the input luminance, used in CS path of
+// HDRToneMappingCS11 sample
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+struct QuadVS_Input
+{
+ float4 Pos : POSITION; // copied to the output unchanged by QuadVS
+ float2 Tex : TEXCOORD0;
+};
+
+struct QuadVS_Output
+{
+ float4 Pos : SV_POSITION;
+ float2 Tex : TEXCOORD0; // sampling coordinate used by both pixel shaders
+};
+
+QuadVS_Output QuadVS( QuadVS_Input Input )
+{
+    // Pass-through: position and texture coordinate are forwarded unchanged.
+    QuadVS_Output result;
+    result.Tex = Input.Tex;
+    result.Pos = Input.Pos;
+    return result;
+}
+
+Texture2D<float4> tex : register( t0 ); // scene colour, point sampled
+StructuredBuffer<float> lum : register( t1 ); // only element 0 is read (PSFinalPass)
+Texture2D<float4> bloom : register( t2 ); // bloom colour, linearly sampled
+
+SamplerState PointSampler : register (s0);
+SamplerState LinearSampler : register (s1);
+
+
+static const float MIDDLE_GRAY = 0.72f; // numerator of the exposure scale
+static const float LUM_WHITE = 1.5f; // divisor in the tone-mapping step
+
+cbuffer cbPS : register( b0 )
+{
+ float4 g_param; // only .x is read: a scale on lum[0] in PSFinalPass, the luminance itself in PSFinalPassForCPUReduction
+};
+
+float4 PSFinalPass( QuadVS_Output Input ) : SV_TARGET
+{
+    // Tone-maps the scene colour with the luminance read from lum[0] (scaled by
+    // g_param.x), then adds 60% of the bloom colour. Output alpha is always 1.
+    float4 vColor = tex.Sample( PointSampler, Input.Tex );
+    float fLum = lum[0]*g_param.x;
+    // Only the colour channels of the bloom sample are used; say so instead of
+    // relying on an implicit float4 -> float3 truncation.
+    float3 vBloom = bloom.Sample( LinearSampler, Input.Tex ).rgb;
+
+    // Tone mapping (all three steps touch .rgb only, so the right-hand sides are
+    // written as float3 as well)
+    vColor.rgb *= MIDDLE_GRAY / (fLum + 0.001f);
+    vColor.rgb *= (1.0f + vColor.rgb/LUM_WHITE);
+    vColor.rgb /= (1.0f + vColor.rgb);
+
+    vColor.rgb += 0.6f * vBloom;
+    vColor.a = 1.0f;
+
+    return vColor;
+}
+
+float4 PSFinalPassForCPUReduction( QuadVS_Output Input ) : SV_TARGET
+{
+    // Tone-maps the scene colour with the luminance passed directly in g_param.x,
+    // then adds 60% of the bloom colour. Output alpha is always 1.
+    float4 vColor = tex.Sample( PointSampler, Input.Tex );
+    float fLum = g_param.x;
+    // Only the colour channels of the bloom sample are used; say so instead of
+    // relying on an implicit float4 -> float3 truncation.
+    float3 vBloom = bloom.Sample( LinearSampler, Input.Tex ).rgb;
+
+    // Tone mapping (all three steps touch .rgb only, so the right-hand sides are
+    // written as float3 as well)
+    vColor.rgb *= MIDDLE_GRAY / (fLum + 0.001f);
+    vColor.rgb *= (1.0f + vColor.rgb/LUM_WHITE);
+    vColor.rgb /= (1.0f + vColor.rgb);
+
+    vColor.rgb += 0.6f * vBloom;
+    vColor.a = 1.0f;
+
+    return vColor;
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl
new file mode 100644
index 000000000..2b18cf0a1
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl
@@ -0,0 +1,129 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry DownScale2x2_Lum -entry DownScale3x3 -entry FinalPass -entry DownScale3x3_BrightPass -entry Bloom
+//--------------------------------------------------------------------------------------
+// File: PSApproach.hlsl
+//
+// The PSs for doing post-processing, used in PS path of
+// HDRToneMappingCS11 sample
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+// Per-channel weights dotted with a colour in DownScale2x2_Lum (alpha weight is 0).
+static const float4 LUM_VECTOR = float4(.299, .587, .114, 0);
+// Tone-mapping constants used by FinalPass and DownScale3x3_BrightPass.
+static const float MIDDLE_GRAY = 0.72f;
+static const float LUM_WHITE = 1.5f;
+// Subtracted from the colour in DownScale3x3_BrightPass before tone mapping.
+static const float BRIGHT_THRESHOLD = 0.5f;
+
+SamplerState PointSampler : register (s0);
+SamplerState LinearSampler : register (s1);
+
+// Interpolated input of every pixel shader in this file.
+struct QuadVS_Output
+{
+ float4 Pos : SV_POSITION;
+ float2 Tex : TEXCOORD0;
+};
+
+// Despite the s* names these are textures bound to t0-t2 (the samplers above use the
+// s0/s1 registers). FinalPass reads s0 as colour, s1 as luminance and s2 as bloom.
+Texture2D s0 : register(t0);
+Texture2D s1 : register(t1);
+Texture2D s2 : register(t2);
+
+// Averages the LUM_VECTOR-weighted value of the 2x2 texel block at offsets
+// (-1..0, -1..0) around the input coordinate and returns it as a grey colour.
+float4 DownScale2x2_Lum ( QuadVS_Output Input ) : SV_TARGET
+{
+    float fLumSum = 0.0f;
+
+    for( int dy = -1; dy < 1; dy++ )
+    {
+        for( int dx = -1; dx < 1; dx++ )
+        {
+            // Weighted sum of the channels of one neighbouring texel
+            float4 vTexel = s0.Sample( PointSampler, Input.Tex, int2(dx,dy) );
+            fLumSum += dot( vTexel, LUM_VECTOR );
+        }
+    }
+
+    // Four samples were accumulated
+    fLumSum /= 4;
+
+    return float4(fLumSum, fLumSum, fLumSum, 1.0f);
+}
+
+// Averages the red channel of the 3x3 texel neighbourhood around the input
+// coordinate and returns it as a grey colour.
+float4 DownScale3x3( QuadVS_Output Input ) : SV_TARGET
+{
+    float fRedSum = 0.0f;
+
+    for( int dy = -1; dy <= 1; dy++ )
+    {
+        for( int dx = -1; dx <= 1; dx++ )
+        {
+            // Only the red channel of each neighbour contributes
+            fRedSum += s0.Sample( PointSampler, Input.Tex, int2(dx,dy) ).r;
+        }
+    }
+
+    // Nine samples were accumulated
+    fRedSum /= 9;
+
+    return float4(fRedSum, fRedSum, fRedSum, 1.0f);
+}
+
+// Final tone-mapping pass of the PS path. The luminance is the red channel of s1
+// sampled at (0,0); the colour from s0 is scaled by MIDDLE_GRAY / luminance, run
+// through the tone-mapping curve, and bloom from s2 is added. Alpha is forced to 1.
+float4 FinalPass( QuadVS_Output Input ) : SV_TARGET
+{
+    float4 vColor = s0.Sample( PointSampler, Input.Tex );
+    float4 vLum = s1.Sample( PointSampler, float2(0,0) );
+    // Swizzle explicitly instead of relying on implicit float4 -> float3 truncation.
+    float3 vBloom = s2.Sample( LinearSampler, Input.Tex ).rgb;
+
+    // Tone mapping (0.001f keeps the divisor non-zero when vLum.r is 0)
+    vColor.rgb *= MIDDLE_GRAY / (vLum.r + 0.001f);
+    vColor.rgb *= (1.0f + vColor.rgb/LUM_WHITE);
+    vColor.rgb /= (1.0f + vColor.rgb);
+
+    vColor.rgb += 0.6f * vBloom;
+    vColor.a = 1.0f;
+
+    return vColor;
+}
+
+// Bright pass: subtracts BRIGHT_THRESHOLD from the colour (clamped at 0) and then
+// applies the same tone-mapping curve as FinalPass, using s1's red channel at (0,0)
+// as the luminance.
+float4 DownScale3x3_BrightPass( QuadVS_Output Input ) : SV_TARGET
+{
+    float fLum = s1.Sample( PointSampler, float2(0, 0) ).r;
+    float3 vSource = s0.Sample( PointSampler, Input.Tex ).rgb;
+
+    // Keep only what exceeds the threshold
+    float3 vBright = max( 0.0f, vSource - BRIGHT_THRESHOLD );
+
+    // Tone mapping
+    float3 vScaled = vBright * ( MIDDLE_GRAY / (fLum + 0.001f) );
+    float3 vCurved = vScaled * (1.0f + vScaled/LUM_WHITE);
+    float3 vMapped = vCurved / (1.0f + vCurved);
+
+    return float4(vMapped, 1.0f);
+}
+
+// Bloom kernel: 15 texture-coordinate offsets and the matching per-channel weights,
+// consumed pairwise by Bloom().
+// NOTE(review): under HLSL constant-buffer packing each array element starts on a
+// 16-byte register, so the float2 array is presumably padded - confirm the CPU-side layout.
+cbuffer cb0
+{
+ float2 g_avSampleOffsets[15];
+ float4 g_avSampleWeights[15];
+}
+
+// Weighted sum of 15 point samples of s0 taken at Input.Tex + g_avSampleOffsets[i],
+// each multiplied component-wise by g_avSampleWeights[i].
+float4 Bloom( QuadVS_Output Input ) : SV_TARGET
+{
+    float4 vAccum = 0.0f;
+
+    for( int iTap = 0; iTap < 15; iTap++ )
+    {
+        // Fetch the neighbouring point for this tap
+        float2 vTapCoord = Input.Tex + g_avSampleOffsets[iTap];
+        float4 vTap = s0.Sample( PointSampler, vTapCoord );
+
+        vAccum += g_avSampleWeights[iTap]*vTap;
+    }
+
+    return vAccum;
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl
new file mode 100644
index 000000000..027838743
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl
@@ -0,0 +1,72 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//-----------------------------------------------------------------------------
+// File: ReduceTo1DCS.hlsl
+//
+// Desc: Reduce an input Texture2D to a buffer
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+// Input is the texture being reduced; CSMain writes one float per thread group to Result.
+Texture2D Input : register( t0 );
+RWStructuredBuffer<float> Result : register( u0 );
+
+cbuffer cbCS : register( b0 )
+{
+ uint4 g_param; // (g_param.x, g_param.y) is the x and y dimensions of the Dispatch call
+ // (g_param.z, g_param.w) is the size of the above Input Texture2D
+};
+
+// Selects which Input.Load expression CSMain uses (see the #ifdef there).
+//#define CS_FULL_PIXEL_REDUCITON // Defining this or not must be the same as in HDRToneMappingCS11.cpp
+
+// Thread-group dimensions; groupthreads (8*8 = 64) is also the length of the
+// groupshared accumulator, one slot per thread.
+#define blocksize 8
+#define blocksizeY 8
+#define groupthreads (blocksize*blocksizeY)
+groupshared float accum[groupthreads];
+
+// Per-channel weights dotted with the loaded texel value before the reduction.
+static const float4 LUM_VECTOR = float4(.299, .587, .114, 0);
+
+// Reduces the input texture to one float per thread group: every thread stores a
+// LUM_VECTOR-weighted value in accum[], the group sums the 64 slots by repeated
+// halving, and thread 0 writes the total to Result at the group's linear index.
+[numthreads(blocksize,blocksizeY,1)]
+void CSMain( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+ float4 s =
+#ifdef CS_FULL_PIXEL_REDUCITON
+ // Sum of four texels: this thread's own plus three offset by the dispatch extent
+ // (blocksize*g_param.x horizontally, blocksizeY*g_param.y vertically, and both).
+ Input.Load( uint3(DTid.xy , 0) )+
+ Input.Load( uint3(DTid.xy + uint2(blocksize*g_param.x, 0), 0) ) +
+ Input.Load( uint3(DTid.xy + uint2(0, blocksizeY*g_param.y), 0) ) +
+ Input.Load( uint3(DTid.xy + uint2(blocksize*g_param.x, blocksizeY*g_param.y), 0) );
+#else
+ // One texel per thread: the thread id is rescaled by texture size / 81.
+ // NOTE(review): 81.0f is hard-coded; presumably it mirrors a constant in
+ // HDRToneMappingCS11.cpp - confirm before changing either side.
+ Input.Load( uint3((float)DTid.x/81.0f*g_param.z, (float)DTid.y/81.0f*g_param.w, 0) );
+#endif
+
+ accum[GI] = dot( s, LUM_VECTOR );
+
+ // Parallel reduction algorithm follows
+ // Each barrier makes the previous step's writes visible before the active range is halved.
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 32 )
+ accum[GI] += accum[32+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 16 )
+ accum[GI] += accum[16+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 8 )
+ accum[GI] += accum[8+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 4 )
+ accum[GI] += accum[4+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 2 )
+ accum[GI] += accum[2+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 1 )
+ accum[GI] += accum[1+GI];
+
+ // No barrier needed here: thread 0 performed the last addition itself.
+ if ( GI == 0 )
+ {
+ Result[Gid.y*g_param.x+Gid.x] = accum[0];
+ }
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl
new file mode 100644
index 000000000..cf506283e
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl
@@ -0,0 +1,63 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//-----------------------------------------------------------------------------
+// File: ReduceToSingleCS.hlsl
+//
+// Desc: Reduce an input buffer by a factor of groupthreads
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+
+// Input holds the values to sum; CSMain writes one partial sum per thread group to Result.
+StructuredBuffer<float> Input : register( t0 );
+RWStructuredBuffer<float> Result : register( u0 );
+
+cbuffer cbCS : register( b0 )
+{
+ uint4 g_param; // g_param.x is the actual elements contained in Input
+ // g_param.y is the x dimension of the Dispatch call
+};
+
+// Threads per group; also the length of the groupshared accumulator (one slot per thread).
+#define groupthreads 128
+groupshared float accum[groupthreads];
+
+// Sums up to 128 consecutive elements of Input per thread group and writes the
+// group's total to Result[Gid.x].
+[numthreads(groupthreads,1,1)]
+void CSMain( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+ // Threads past the end of Input contribute 0 so the reduction below stays uniform.
+ if ( DTid.x < g_param.x )
+ accum[GI] = Input[DTid.x];
+ else
+ accum[GI] = 0;
+
+ // Parallel reduction algorithm follows
+ // Each barrier makes the previous step's writes visible before the active range is halved.
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 64 )
+ accum[GI] += accum[64+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 32 )
+ accum[GI] += accum[32+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 16 )
+ accum[GI] += accum[16+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 8 )
+ accum[GI] += accum[8+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 4 )
+ accum[GI] += accum[4+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 2 )
+ accum[GI] += accum[2+GI];
+
+ GroupMemoryBarrierWithGroupSync();
+ if ( GI < 1 )
+ accum[GI] += accum[1+GI];
+
+ // No barrier needed here: thread 0 performed the last addition itself.
+ if ( GI == 0 )
+ {
+ Result[Gid.x] = accum[0];
+ }
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl
new file mode 100644
index 000000000..2728665e2
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl
@@ -0,0 +1,44 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry SkyboxVS -profile ps_4_0 -entry SkyboxPS
+//-----------------------------------------------------------------------------
+// File: SkyBox11.hlsl
+//
+// Desc:
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+
+// Matrix applied to the vertex position in SkyboxVS to derive the cube-map lookup vector.
+cbuffer cbPerObject : register( b0 )
+{
+ row_major matrix g_mWorldViewProjection : packoffset( c0 );
+}
+
+// Cube map sampled by SkyboxPS, and the sampler it uses.
+TextureCube g_EnvironmentTexture : register( t0 );
+SamplerState g_sam : register( s0 );
+
+// Input to SkyboxVS: position only.
+struct SkyboxVS_Input
+{
+ float4 Pos : POSITION;
+};
+
+// Output of SkyboxVS: the unmodified position plus the cube-map lookup vector in Tex.
+struct SkyboxVS_Output
+{
+ float4 Pos : SV_POSITION;
+ float3 Tex : TEXCOORD0;
+};
+
+// Skybox vertex shader: the position is passed through untouched, and the position
+// transformed by g_mWorldViewProjection becomes the cube-map lookup vector.
+SkyboxVS_Output SkyboxVS( SkyboxVS_Input Input )
+{
+    SkyboxVS_Output Output;
+
+    Output.Pos = Input.Pos;
+
+    // The full float4 is normalized first and xyz taken afterwards, so Tex is not
+    // unit length in general. Swizzle explicitly instead of relying on implicit
+    // float4 -> float3 truncation.
+    Output.Tex = normalize( mul(Input.Pos, g_mWorldViewProjection) ).xyz;
+
+    return Output;
+}
+
+// Skybox pixel shader: returns the environment cube map sampled along Input.Tex.
+float4 SkyboxPS( SkyboxVS_Output Input ) : SV_TARGET
+{
+    return g_EnvironmentTexture.Sample( g_sam, Input.Tex );
+}
diff --git a/tests/hlsl/dxsdk/InstancingFX11/Instancing.fx b/tests/hlsl/dxsdk/InstancingFX11/Instancing.fx
new file mode 100644
index 000000000..3c8d45078
--- /dev/null
+++ b/tests/hlsl/dxsdk/InstancingFX11/Instancing.fx
@@ -0,0 +1,591 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Instancing.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Input and output structures
+//--------------------------------------------------------------------------------------
+// Input to VSInstmain: vertex attributes plus the instance matrix (mTransform)
+// that VSInstmain applies to the position and normal.
+struct VSInstIn
+{
+ float3 pos : POSITION;
+ float3 norm : NORMAL;
+ float2 tex : TEXTURE0;
+ row_major float4x4 mTransform : mTransform;
+};
+
+// Input to VSSkymain: vertex attributes without any instance data.
+struct VSSceneIn
+{
+ float3 pos : POSITION;
+ float3 norm : NORMAL;
+ float2 tex : TEXTURE0;
+};
+
+// Input to VSGrassmain: vertex attributes, the instance matrix, and the system
+// vertex id that VSGrassmain forwards to the geometry shader.
+struct VSGrassIn
+{
+ float3 pos : POSITION;
+ float3 norm : NORMAL;
+ float2 tex : TEXTURE0;
+ row_major float4x4 mTransform : mTransform;
+ uint VertexID : SV_VertexID;
+};
+
+// Output of VSGrassmain and per-vertex input of GSGrassmain. VertexID is carried
+// under a user semantic (VERTID) and is used downstream as a random-lookup offset.
+struct VSGrassOut
+{
+ float3 pos : POSITION;
+ float3 norm : NORMAL;
+ float2 tex : TEXTURE0;
+ uint VertexID : VERTID;
+};
+
+// Input to VSQuadmain: quad vertex, the leaf's matrix (mTransform), a scalar fOcc that
+// VSQuadmain copies into the colour rgb, and the system instance id used to pick the
+// tree matrix and leaf texture.
+struct VSQuadIn
+{
+ float3 pos : POSITION;
+ float2 tex : TEXTURE0;
+ row_major float4x4 mTransform : mTransform;
+ float fOcc : fOcc;
+ uint InstanceId : SV_InstanceID;
+};
+
+// Output of VSSkymain / VSInstmain and input of PSScenemain.
+struct PSSceneIn
+{
+ float4 pos : SV_Position;
+ float2 tex : TEXTURE0;
+ float4 color : COLOR0;
+};
+
+// Vertex format for leaves and grass: tex.xy is the texture coordinate and tex.z the
+// texture-array slice (see VSQuadmain / OutputGrassBlade); input of PSQuadmain.
+struct PSQuadIn
+{
+ float4 pos : SV_Position;
+ float3 tex : TEXTURE0;
+ float4 color : COLOR0;
+};
+
+//--------------------------------------------------------------------------------------
+// Constant buffers
+//--------------------------------------------------------------------------------------
+// Tree placement: VSQuadmain indexes g_mTreeMatrices with (InstanceId mod g_iNumTrees).
+// The array holds 50 matrices; nothing in this file checks g_iNumTrees against that.
+cbuffer crarely
+{
+ float4x4 g_mTreeMatrices[50];
+ uint g_iNumTrees;
+};
+
+// Transforms: g_mWorldViewProj produces the output position, g_mWorldView is used
+// only to obtain a depth (z) for lighting attenuation.
+cbuffer ceveryframe
+{
+ float4x4 g_mWorldViewProj;
+ float4x4 g_mWorldView;
+};
+
+// Grass parameters: width/height scale the blade quad in OutputGrassBlade;
+// g_iGrassCoverage times a coverage-mask channel bounds the blade loop in GSGrassmain.
+cbuffer cmultipleperframe
+{
+ float g_GrassWidth;
+ float g_GrassHeight;
+ uint g_iGrassCoverage;
+};
+
+// Scales the random positional shift applied to each grass blade in GSGrassmain.
+cbuffer cusercontrolled
+{
+ float g_GrassMessiness;
+};
+
+// One directional light as consumed by CalcLighting: direction is dotted with the
+// normal, color scales the result.
+struct light_struct
+{
+ float4 direction;
+ float4 color;
+};
+
+// Lighting constants with effect-file default values: four lights and an ambient
+// term used by CalcLighting, plus the height scale used by VSInstmain.
+cbuffer cimmutable
+{
+ light_struct g_lights[4] = {
+ { float4(0.620275, 0.683659, 0.384537, 1), float4(0.75, 0.599, 0.405, 1) }, //sun
+ { float4(0.063288, -0.987444, 0.144735, 1), float4(0.192, 0.273, 0.275, 1) }, //bottom
+ { float4(0.23007, 0.785579, -0.574422, 1), float4(0.300, 0.292, 0.223, 1) }, //highlight
+ { float4(-0.620275, -0.683659, -0.384537, 1), float4(0.0, 0.0, 0.1, 1) } //blue rim-light
+ };
+
+ float4 g_ambient = float4(0.4945,0.465,0.5,1);
+
+ float g_occDimHeight = 2400.0; //scalar that tells us how much to darken the tree near the top
+};
+
+// Grass-blade template emitted by OutputGrassBlade as one 6-vertex strip. The last two
+// entries of each array repeat the first two. x is scaled by g_GrassWidth and y by
+// g_GrassHeight before use.
+cbuffer cgrassblade
+{
+ float3 g_positions[6] =
+ {
+ float3( -1, 0, 0 ),
+ float3( -1, 2, 0 ),
+ float3( 1, 0, 0 ),
+ float3( 1, 2, 0 ),
+
+ float3( -1, 0, 0 ),
+ float3( -1, 2, 0 ),
+ };
+ float2 g_texcoords[6] =
+ {
+ float2(0,1),
+ float2(0,0),
+ float2(1,1),
+ float2(1,0),
+
+ float2(0,1),
+ float2(0,0),
+ };
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+// g_txDiffuse is sampled by PSScenemain; g_tx2dArray by PSQuadmain (leaf/grass slices)
+// and by GSGrassmain (slice 4, the coverage mask). Both pixel shaders use g_samLinear.
+Texture2D g_txDiffuse;
+Texture2DArray g_tx2dArray;
+SamplerState g_samLinear
+{
+ Filter = ANISOTROPIC;
+ AddressU = Wrap;
+ AddressV = Wrap;
+};
+
+// 1D texture read by RandomDir through the point sampler below.
+Texture1D g_txRandom;
+SamplerState g_samPoint
+{
+ Filter = MIN_MAG_MIP_POINT;
+ AddressU = Wrap;
+ AddressV = Wrap;
+};
+
+//--------------------------------------------------------------------------------------
+// State structures
+//--------------------------------------------------------------------------------------
+// Blend state for leaves and grass (RenderQuad / RenderGrass): alpha-to-coverage on.
+BlendState QuadAlphaBlendState
+{
+ AlphaToCoverageEnable = TRUE;
+ RenderTargetWriteMask[0] = 0x0F;
+};
+
+// Rasterizer state shared by every technique in this file.
+RasterizerState EnableMSAA
+{
+ CullMode = BACK;
+ MultisampleEnable = TRUE;
+};
+
+// Depth state used by RenderSkybox: no depth test, no depth writes.
+DepthStencilState DisableDepthTestWrite
+{
+ DepthEnable = FALSE;
+ DepthWriteMask = ZERO;
+};
+
+// Depth state used by every other technique: depth test and writes enabled.
+DepthStencilState EnableDepthTestWrite
+{
+ DepthEnable = TRUE;
+ DepthWriteMask = ALL;
+};
+
+// Blend state for opaque geometry (RenderInstancedVertLighting / RenderSkybox).
+BlendState NoBlending
+{
+ AlphaToCoverageEnable = FALSE;
+ BlendEnable[0] = FALSE;
+};
+
+//--------------------------------------------------------------------------------------
+// Sky vertex shader
+//--------------------------------------------------------------------------------------
+// Sky vertex shader: transforms the position by g_mWorldViewProj, forwards the
+// texture coordinate and outputs a constant white colour.
+PSSceneIn VSSkymain(VSSceneIn input)
+{
+    PSSceneIn result;
+
+    // Attributes that need no computation
+    result.tex = input.tex;
+    result.color = float4(1,1,1,1);
+
+    // Position transformed by the combined world-view-projection matrix
+    result.pos = mul(float4(input.pos, 1), g_mWorldViewProj);
+
+    return result;
+}
+
+//--------------------------------------------------------------------------------------
+// CalcLighting helper function. Calculates lighting from 4 light sources, adds ambient
+// and attenuates for depth. Used by all techniques for lighting.
+//--------------------------------------------------------------------------------------
+// Sums the clamped N.L contribution of the four lights in g_lights, adds g_ambient,
+// then blends towards a fixed tint in proportion to depth / 10000.
+//   norm  - surface normal dotted with each light direction
+//   depth - depth value driving the attenuation
+float4 CalcLighting( float3 norm, float depth )
+{
+    float4 lightSum = float4(0,0,0,0);
+
+    // Only xyz of the stored float4 direction takes part in the dot product
+    [unroll] for( int l=0; l<4; l++ )
+    {
+        float nDotL = saturate( dot(g_lights[l].direction.xyz, norm) );
+        lightSum += nDotL*g_lights[l].color;
+    }
+
+    // Depth-based attenuation factor and the tint it blends in
+    float depthFactor = depth / 10000.0;
+    float4 depthTint = float4(0.15, 0.2, 0.3, 0);
+
+    // Lights plus ambient are dimmed slightly with depth; the tint grows with it
+    return (1-depthFactor*0.23)*(lightSum + g_ambient) + depthTint*depthFactor;
+}
+
+//--------------------------------------------------------------------------------------
+// Instancing vertex shader. Positions the vertices based upon the matrix stored
+// in the second vertex stream.
+//--------------------------------------------------------------------------------------
+// Instancing vertex shader: applies the instance matrix to the vertex, projects it,
+// lights it with CalcLighting and dims the colour with the vertex's local height.
+PSSceneIn VSInstmain(VSInstIn input)
+{
+    PSSceneIn result;
+
+    // Position after the instance matrix; every later transform starts from it
+    float4 instancePos = mul(float4(input.pos, 1), input.mTransform);
+
+    // Projected output position and the depth used for lighting attenuation
+    result.pos = mul(instancePos, g_mWorldViewProj);
+    float viewDepth = mul(instancePos, g_mWorldView ).z;
+
+    // Texture coordinate is forwarded unchanged
+    result.tex = input.tex;
+
+    // Normal transformed by the upper 3x3 of the instance matrix, then lit
+    float3 instanceNorm = mul(input.norm,(float3x3)input.mTransform);
+    float4 litColor = CalcLighting( instanceNorm, viewDepth );
+
+    // Dim the colour by how far up the tree we are.
+    // This is a nice way to fake occlusion of the branches by the leaves.
+    float heightDim = 1.0f - saturate(input.pos.y/g_occDimHeight);
+    result.color = litColor * heightDim;
+
+    return result;
+}
+
+//--------------------------------------------------------------------------------------
+// Quad (leaf) vertex shader. Instances the quad over multiple leaf positions and
+// multiple trees. This demonstrates how to do double instancing.
+//--------------------------------------------------------------------------------------
+// Leaf vertex shader. The instance id encodes both the leaf (id / g_iNumTrees) and
+// the tree (remainder); the leaf picks one of three textures, the tree picks the
+// matrix applied after the leaf's own transform.
+PSQuadIn VSQuadmain(VSQuadIn input)
+{
+    PSQuadIn result;
+
+    // Leaf index -> texture-array slice (three leaf textures)
+    uint leafIndex = input.InstanceId/g_iNumTrees;
+    result.tex = float3(input.tex, float(leafIndex%3) );
+
+    // Remainder of the division selects the tree
+    int treeIndex = input.InstanceId - leafIndex*g_iNumTrees;
+
+    // Leaf transform first, then the tree's matrix
+    float4 leafPos = mul( float4(input.pos, 1), input.mTransform );
+    float4 treePos = mul(leafPos, g_mTreeMatrices[treeIndex] );
+
+    // Projected output position
+    result.pos = mul(treePos, g_mWorldViewProj);
+
+    // rgb carries fOcc; alpha carries the view-space depth for the geometry shader
+    float viewDepth = mul(treePos, g_mWorldView ).z;
+    result.color = float4(input.fOcc,input.fOcc,input.fOcc,viewDepth);
+
+    return result;
+}
+
+//--------------------------------------------------------------------------------------
+// Grass vertex shader. Basically a passthrough except for instancing the island base
+// mesh.
+//--------------------------------------------------------------------------------------
+// Grass vertex shader: moves position and normal into instance space and forwards
+// the texture coordinate and vertex id to the geometry shader.
+VSGrassOut VSGrassmain(VSGrassIn input)
+{
+    VSGrassOut result;
+
+    // Pass-through attributes
+    result.tex = input.tex;
+    result.VertexID = input.VertexID;
+
+    // Instance-space position (w is discarded) and normal
+    result.pos = mul(float4(input.pos, 1), input.mTransform).xyz;
+    result.norm = mul(input.norm, (float3x3)input.mTransform);
+
+    return result;
+}
+
+//--------------------------------------------------------------------------------------
+// Quad (leaf) GS. Calculates the normal and lighting for the leaf.
+//--------------------------------------------------------------------------------------
+// Leaf geometry shader: derives one face normal from the triangle's positions, lights
+// it once, and re-emits the triangle with that flat colour (alpha forced to 1).
+[maxvertexcount(3)]
+void GSQuadmain(triangle PSQuadIn input[3], inout TriangleStream<PSQuadIn> QuadStream)
+{
+    // Two edges leaving vertex 0; only xyz takes part in the cross product
+    float3 edge01 = (input[1].pos - input[0].pos).xyz;
+    float3 edge02 = (input[2].pos - input[0].pos).xyz;
+    float3 faceNormal = normalize( cross(edge01, edge02) );
+
+    // Vertex 0's colour alpha holds the depth packed by VSQuadmain
+    float4 faceColor = CalcLighting( faceNormal, input[0].color.a ) * input[0].color;
+
+    // Make sure we always have an alpha of 1
+    faceColor.a = 1.0;
+
+    // Re-emit the triangle with the shared colour
+    PSQuadIn vert;
+    for(int v=0; v<3; v++)
+    {
+        vert.pos = input[v].pos;
+        vert.tex = input[v].tex;
+        vert.color = faceColor;
+        QuadStream.Append(vert);
+    }
+    QuadStream.RestartStrip();
+}
+
+//--------------------------------------------------------------------------------------
+// RandomDir helper. Samples a random dir out of our 1d random texture. In this case
+// we use a texture because the offset could be anywhere. If we were sampling linearly
+// then we would probably just use a buffer and load from that.
+//--------------------------------------------------------------------------------------
+// Looks up a direction in the 1D random texture at coordinate fOffset / 300
+// (mip 0, point sampled) and returns its xyz.
+float3 RandomDir(float fOffset)
+{
+    return g_txRandom.SampleLevel( g_samPoint, fOffset / 300.0, 0 ).xyz;
+}
+
+//--------------------------------------------------------------------------------------
+// Helper to determine if a point is within a triangle
+//--------------------------------------------------------------------------------------
+// Returns true when the cross products of each edge with the vector from the edge's
+// start to P point the same way for consecutive edges (A->B vs B->C, B->C vs C->A).
+// A dot product of exactly 0 counts as outside.
+bool IsInTriangle( float3 P, float3 A, float3 B, float3 C )
+{
+    float3 sideAB = cross( B-A, P-A );
+    float3 sideBC = cross( C-B, P-B );
+    float3 sideCA = cross( A-C, P-C );
+
+    return dot( sideAB, sideBC ) > 0 && dot( sideBC, sideCA ) > 0;
+}
+
+//--------------------------------------------------------------------------------------
+// Gets a random orientation matrix based upon the RandomDir function
+//--------------------------------------------------------------------------------------
+// Builds a matrix whose rows are (tangent, Norm, bitangent, Pos). The tangent frame is
+// derived from a random direction looked up with fRandOffset and made perpendicular to
+// Norm through two cross products.
+float4x4 GetRandomOrientation( float3 Pos, float3 Norm, float fRandOffset )
+{
+    float3 randomDir = RandomDir(fRandOffset);
+
+    float3 bitangent = normalize( cross( randomDir, Norm ) );
+    float3 tangent = normalize( cross( bitangent, Norm ) );
+
+    return float4x4( float4( tangent, 0 ),
+                     float4( Norm, 0 ),
+                     float4( bitangent, 0 ),
+                     float4( Pos, 1 ) );
+}
+
+//--------------------------------------------------------------------------------------
+// Generates an actual grass blade
+//--------------------------------------------------------------------------------------
+// Emits one grass blade (a 6-vertex strip built from g_positions / g_texcoords) at
+// midPoint, randomly oriented around midPoint.norm.
+//   midPoint    - position, normal and random seed (VertexID) of the blade
+//   GrassStream - output stream; the strip is restarted after the blade
+//   iGrassTex   - texture-array slice written to tex.z
+void OutputGrassBlade( VSGrassOut midPoint, inout TriangleStream<PSQuadIn> GrassStream, int iGrassTex )
+{
+    PSQuadIn output;
+
+    float4x4 mWorld = GetRandomOrientation( midPoint.pos, midPoint.norm, (float)midPoint.VertexID );
+
+    // Transform the position as a point (w = 1), as VSInstmain and VSQuadmain do.
+    // Passing the bare float3 truncates the matrix and drops its translation row,
+    // which gives CalcLighting the wrong depth.
+    float4 ViewPos = mul( float4(midPoint.pos, 1), g_mWorldView );
+
+    // The whole blade shares one colour, lit with the midpoint's normal
+    float3 grassNorm = midPoint.norm;
+    float4 color1 = CalcLighting( grassNorm, ViewPos.z );
+
+    for(int v=0; v<6; v++)
+    {
+        // Scale the unit blade template to the configured size
+        float3 pos = g_positions[v];
+        pos.x *= g_GrassWidth;
+        pos.y *= g_GrassHeight;
+
+        output.pos = mul( float4(pos,1), mWorld );
+        output.pos = mul( output.pos, g_mWorldViewProj );
+        output.tex = float3( g_texcoords[v], iGrassTex );
+        output.color = color1;
+
+        GrassStream.Append( output );
+    }
+
+    GrassStream.RestartStrip();
+}
+
+//--------------------------------------------------------------------------------------
+// Midpoint of the three vertices A,B,C
+//--------------------------------------------------------------------------------------
+// Averages position, normal and texture coordinate of three vertices. VertexID is the
+// plain sum of the three ids (not averaged).
+VSGrassOut CalcMidPoint( VSGrassOut A, VSGrassOut B, VSGrassOut C )
+{
+    VSGrassOut centre;
+
+    centre.VertexID = A.VertexID + B.VertexID + C.VertexID;
+    centre.tex = (A.tex + B.tex + C.tex)/3.0f;
+    centre.norm = (A.norm + B.norm + C.norm)/3.0f;
+    centre.pos = (A.pos + B.pos + C.pos)/3.0f;
+
+    return centre;
+}
+
+//--------------------------------------------------------------------------------------
+// The actual grass geometry shader. This generates grass blades based upon an input
+// mesh (the tops of the islands) and a coverage texture. Each of the texture's channels
+// determines how much of each of the 4 types of grass to place at a particular spot.
+//--------------------------------------------------------------------------------------
+// Each blade is 6 vertices, so 90 allows 15 blades per input triangle.
+// NOTE(review): appends beyond maxvertexcount are presumably discarded - confirm
+// against the geometry-shader documentation.
+[maxvertexcount(90)]
+void GSGrassmain(triangle VSGrassOut input[3], inout TriangleStream<PSQuadIn> GrassStream )
+{
+ // All blades for this triangle are seeded from its midpoint
+ VSGrassOut MidPoint = CalcMidPoint( input[0], input[1], input[2] );
+
+ // Slice 4 of the texture array is read as the coverage mask: one channel per grass type
+ float4 CoverageMask = g_tx2dArray.SampleLevel( g_samPoint, float3(MidPoint.tex,4), 0 );
+ float cm[4];
+ cm[0] = CoverageMask.r;
+ cm[1] = CoverageMask.g;
+ cm[2] = CoverageMask.b;
+ cm[3] = CoverageMask.a;
+
+ // g doubles as the grass type and the texture slice handed to OutputGrassBlade
+ for(int g=0; g<4; g++)
+ {
+ float MaxBlades = float(g_iGrassCoverage)*cm[g];
+ for(float i=0; i<MaxBlades; i++)
+ {
+ // Per-blade offset into the random texture, distinct for each (g, i) pair in practice
+ float randOffset = g*5 + (i+1);
+ float3 Tan = RandomDir( MidPoint.pos.x + randOffset );
+ float3 Len = normalize( RandomDir( MidPoint.pos.z + randOffset ) );
+ // Shift the blade perpendicular to the normal, scaled by the messiness setting
+ float3 Shift = Len.x*g_GrassMessiness*normalize( cross( Tan, MidPoint.norm ) );
+ VSGrassOut grassPoint = MidPoint;
+ grassPoint.VertexID += randOffset;
+ grassPoint.pos += Shift;
+
+ //uncomment this to make the grass strictly conform to the mesh
+ //if( IsInTriangle( grassPoint.pos, input[0].pos, input[1].pos, input[2].pos ) )
+ {
+ OutputGrassBlade( grassPoint, GrassStream, g );
+ }
+ }
+ }
+}
+
+//--------------------------------------------------------------------------------------
+// PS for non-leaf or grass items.
+//--------------------------------------------------------------------------------------
+// Scene pixel shader: diffuse texture modulated by the interpolated vertex colour.
+float4 PSScenemain(PSSceneIn input) : SV_Target
+{
+    return g_txDiffuse.Sample( g_samLinear, input.tex ) * input.color;
+}
+
+//--------------------------------------------------------------------------------------
+// PS for leaves and grass
+//--------------------------------------------------------------------------------------
+// Leaf/grass pixel shader: samples the texture array (slice in tex.z) and modulates
+// only rgb by the vertex colour; the texture's alpha is returned untouched.
+float4 PSQuadmain(PSQuadIn input) : SV_Target
+{
+    float4 texel = g_tx2dArray.Sample( g_samLinear, input.tex );
+    return float4( texel.xyz * input.color.xyz, texel.w );
+}
+
+//--------------------------------------------------------------------------------------
+// Render instanced meshes with vertex lighting
+//--------------------------------------------------------------------------------------
+// VSInstmain + PSScenemain, no geometry shader; blending off, depth test/write on.
+technique10 RenderInstancedVertLighting
+{
+ pass p0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VSInstmain() ) );
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+
+ SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ SetDepthStencilState( EnableDepthTestWrite, 0 );
+ SetRasterizerState( EnableMSAA );
+ }
+}
+
+//--------------------------------------------------------------------------------------
+// Skybox
+//--------------------------------------------------------------------------------------
+// VSSkymain + PSScenemain, no geometry shader; blending off, depth test/write off.
+technique10 RenderSkybox
+{
+ pass p0
+ {
+ SetVertexShader( CompileShader( vs_4_0, VSSkymain() ) );
+ SetGeometryShader( NULL );
+ SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+
+ SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ SetDepthStencilState( DisableDepthTestWrite, 0 );
+ SetRasterizerState( EnableMSAA );
+ }
+}
+
+//--------------------------------------------------------------------------------------
+// Render leaves
+//--------------------------------------------------------------------------------------
+// VSQuadmain + GSQuadmain + PSQuadmain; alpha-to-coverage blend state, depth test/write on.
+technique10 RenderQuad
+{
+ pass p0
+ {
+
+ SetVertexShader( CompileShader( vs_4_0, VSQuadmain() ) );
+ SetGeometryShader( CompileShader( gs_4_0, GSQuadmain() ) );
+ SetPixelShader( CompileShader( ps_4_0, PSQuadmain() ) );
+
+ SetBlendState( QuadAlphaBlendState, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ SetDepthStencilState( EnableDepthTestWrite, 0 );
+ SetRasterizerState( EnableMSAA );
+ }
+}
+
+//--------------------------------------------------------------------------------------
+// Render grass
+//--------------------------------------------------------------------------------------
+// VSGrassmain + GSGrassmain + PSQuadmain; alpha-to-coverage blend state, depth test/write on.
+technique10 RenderGrass
+{
+ pass p0
+ {
+
+ SetVertexShader( CompileShader( vs_4_0, VSGrassmain() ) );
+ SetGeometryShader( CompileShader( gs_4_0, GSGrassmain() ) );
+ SetPixelShader( CompileShader( ps_4_0, PSQuadmain() ) );
+
+ SetBlendState( QuadAlphaBlendState, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+ SetDepthStencilState( EnableDepthTestWrite, 0 );
+ SetRasterizerState( EnableMSAA );
+ }
+}
diff --git a/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_PS.hlsl b/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_PS.hlsl
new file mode 100644
index 000000000..dbeb87f33
--- /dev/null
+++ b/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_PS.hlsl
@@ -0,0 +1,202 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: MultithreadedRendering11_PS.hlsl
+//
+// The pixel shader file for the MultithreadedRendering11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// Various debug options
+//#define NO_DIFFUSE_MAP
+//#define NO_NORMAL_MAP
+//#define NO_AMBIENT
+//#define NO_DYNAMIC_LIGHTING
+//#define NO_SHADOW_MAP
+
+// Subtracted from the light-space depth in CalcUnshadowedAmountPCF2x2 before the
+// comparison against the shadow map.
+#define SHADOW_DEPTH_BIAS 0.0005f
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+// g_iNumLights sizes g_LightData and bounds the light loop in PSMain;
+// g_iNumShadows sizes g_txShadow.
+static const int g_iNumLights = 4;
+static const int g_iNumShadows = 1; // by convention, the first n lights cast shadows
+
+// Per-object colour; one of the factors multiplied into PSMain's result.
+cbuffer cbPerObject : register( b0 )
+{
+ float4 g_vObjectColor : packoffset( c0 );
+};
+
+// Per-light data, one entry per light. m_mLightViewProj is used only for the shadow
+// lookup; the remaining fields drive CalcLightingColor.
+cbuffer cbPerLight : register( b1 )
+{
+ struct LightDataStruct
+ {
+ matrix m_mLightViewProj;
+ float4 m_vLightPos;
+ float4 m_vLightDir;
+ float4 m_vLightColor;
+ float4 m_vFalloffs; // x = dist end, y = dist range, z = cos angle end, w = cos range
+ } g_LightData[g_iNumLights] : packoffset( c0 );
+};
+
+// Per-scene constants: the plane used by PSMain's clip test (xyz dotted with the
+// world position, plus w), the ambient lighting term, and a tint multiplied into the result.
+cbuffer cbPerScene : register( b2 )
+{
+ float4 g_vMirrorPlane : packoffset( c0 );
+ float4 g_vAmbientColor : packoffset( c1 );
+ float4 g_vTintColor : packoffset( c2 );
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+// Diffuse and normal maps are sampled with g_samLinearWrap; the shadow maps (one per
+// shadow-casting light, starting at t2) are sampled with g_samPointClamp.
+Texture2D g_txDiffuse : register( t0 );
+Texture2D g_txNormal : register( t1 );
+Texture2D g_txShadow[g_iNumShadows] : register( t2 );
+
+SamplerState g_samPointClamp : register( s0 );
+SamplerState g_samLinearWrap : register( s1 );
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+// Interpolated input of PSMain. vPosWorld is used for the mirror-plane clip, the
+// lighting vectors and the shadow-map projection.
+struct PS_INPUT
+{
+ float3 vNormal : NORMAL;
+ float3 vTangent : TANGENT;
+ float2 vTexcoord : TEXCOORD0;
+ float4 vPosWorld : TEXCOORD1;
+};
+
+//--------------------------------------------------------------------------------------
+// Sample normal map, convert to signed, apply tangent-to-world space transform
+//--------------------------------------------------------------------------------------
+// Samples the normal map, remaps it from [0,1] to [-1,1], and transforms it by the
+// frame whose rows are (tangent, binormal, normal) built from the vertex attributes.
+float3 CalcPerPixelNormal( float2 vTexcoord, float3 vVertNormal, float3 vVertTangent )
+{
+    // Tangent frame from the re-normalized interpolated attributes
+    float3 vN = normalize( vVertNormal );
+    float3 vT = normalize( vVertTangent );
+    float3 vB = normalize( cross( vT, vN ) );
+    float3x3 mTangentToWorld = float3x3( vT, vB, vN );
+
+    // Normal-map texel converted to a signed vector
+    float3 vBump = g_txNormal.Sample( g_samLinearWrap, vTexcoord ).xyz;
+    vBump = 2.0f * vBump - 1.0f;
+
+    return mul( vBump, mTangentToWorld );
+}
+
+//--------------------------------------------------------------------------------------
+// Test how much pixel is in shadow, using 2x2 percentage-closer filtering
+//--------------------------------------------------------------------------------------
+// Returns the bilinearly weighted fraction of a 2x2 shadow-map footprint that is not
+// nearer to the light than the pixel (1 = fully lit), replicated to all four components.
+//   iShadow   - index into g_LightData / g_txShadow
+//   vPosWorld - world-space position of the pixel
+float4 CalcUnshadowedAmountPCF2x2( int iShadow, float4 vPosWorld )
+{
+    matrix mLightViewProj = g_LightData[iShadow].m_mLightViewProj;
+    Texture2D txShadow = g_txShadow[iShadow];
+
+    // Compute pixel position in light space
+    float4 vLightSpacePos = mul( vPosWorld, mLightViewProj );
+    vLightSpacePos.xyz /= vLightSpacePos.w;
+
+    // Translate from surface coords to texture coords
+    // Could fold these into the matrix
+    float2 vShadowTexCoord = 0.5f * vLightSpacePos.xy + 0.5f;
+    vShadowTexCoord.y = 1.0f - vShadowTexCoord.y;
+
+    // Depth bias to avoid pixel self-shadowing
+    float vLightSpaceDepth = vLightSpacePos.z - SHADOW_DEPTH_BIAS;
+
+    // Find sub-pixel weights: xy = fraction inside the texel, zw = 1 - that fraction.
+    // Reading .xy explicitly avoids touching the not-yet-written zw components.
+    float2 vShadowMapDims = float2( 2048.0f, 2048.0f ); // need to keep in sync with .cpp file
+    float4 vSubPixelCoords;
+    vSubPixelCoords.xy = frac( vShadowMapDims * vShadowTexCoord );
+    vSubPixelCoords.zw = 1.0f - vSubPixelCoords.xy;
+    // Weights in the order (0,0), (+x,0), (0,+y), (+x,+y) to match the samples below
+    float4 vBilinearWeights = vSubPixelCoords.zxzx * vSubPixelCoords.wwyy;
+
+    // 2x2 percentage closer filtering; the depth is the first channel of each sample
+    float2 vTexelUnits = 1.0f / vShadowMapDims;
+    float4 vShadowDepths;
+    vShadowDepths.x = txShadow.Sample( g_samPointClamp, vShadowTexCoord ).x;
+    vShadowDepths.y = txShadow.Sample( g_samPointClamp, vShadowTexCoord + float2( vTexelUnits.x, 0.0f ) ).x;
+    vShadowDepths.z = txShadow.Sample( g_samPointClamp, vShadowTexCoord + float2( 0.0f, vTexelUnits.y ) ).x;
+    vShadowDepths.w = txShadow.Sample( g_samPointClamp, vShadowTexCoord + vTexelUnits ).x;
+
+    // A sample passes (1) when its stored depth is at or beyond the biased pixel depth;
+    // the result is the bilinearly weighted fraction of passing samples.
+    float4 vShadowTests = ( vShadowDepths >= vLightSpaceDepth ) ? 1.0f : 0.0f;
+    return dot( vBilinearWeights, vShadowTests );
+}
+
+//--------------------------------------------------------------------------------------
+// Diffuse lighting calculation, with angle and distance falloff
+//--------------------------------------------------------------------------------------
+// Diffuse contribution of light iLight at vPosWorld: light colour scaled by N.L, a
+// distance falloff and an angle falloff, each clamped to [0,1].
+float4 CalcLightingColor( int iLight, float3 vPosWorld, float3 vPerPixelNormal )
+{
+    float4 vFalloffs = g_LightData[iLight].m_vFalloffs;
+
+    // Vector from the light to the pixel, its length and its direction
+    float3 vToPixel = vPosWorld - g_LightData[iLight].m_vLightPos.xyz;
+    float fDist = length( vToPixel );
+    float3 vToPixelDir = vToPixel / fDist;
+
+    // Dist falloff = 0 at vFalloffs.x, 1 at vFalloffs.x - vFalloffs.y
+    float fDistFalloff = saturate( ( vFalloffs.x - fDist ) / vFalloffs.y );
+
+    // Angle falloff from the cosine between the light direction and the pixel
+    // direction, measured relative to vFalloffs.z and scaled by vFalloffs.w
+    float fCosAngle = dot( vToPixelDir, g_LightData[iLight].m_vLightDir.xyz );
+    float fAngleFalloff = saturate( ( fCosAngle - vFalloffs.z ) / vFalloffs.w );
+
+    // Diffuse term: the normal against the direction back towards the light
+    float fNDotL = saturate( -dot( vToPixelDir, vPerPixelNormal ) );
+
+    return g_LightData[iLight].m_vLightColor * fNDotL * fDistFalloff * fAngleFalloff;
+}
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+// Pixel shader: clips against the mirror plane, then returns
+// diffuse * tint * object colour * (ambient + sum of per-light contributions),
+// with the shadow-casting lights attenuated by the PCF shadow test.
+// The NO_* macros at the top of the file disable individual terms for debugging.
+float4 PSMain( PS_INPUT Input ) : SV_TARGET
+{
+    // Manual clip test, so that objects which are behind the mirror
+    // don't show up in the mirror.
+    clip( dot( g_vMirrorPlane.xyz, Input.vPosWorld.xyz ) + g_vMirrorPlane.w );
+
+#ifdef NO_DIFFUSE_MAP
+    float4 vDiffuse = 0.5f;
+#else // #ifdef NO_DIFFUSE_MAP
+    float4 vDiffuse = g_txDiffuse.Sample( g_samLinearWrap, Input.vTexcoord );
+#endif // #ifdef NO_DIFFUSE_MAP #else
+
+    // Compute per-pixel normal
+#ifdef NO_NORMAL_MAP
+    float3 vPerPixelNormal = Input.vNormal;
+#else // #ifdef NO_NORMAL_MAP
+    float3 vPerPixelNormal = CalcPerPixelNormal( Input.vTexcoord, Input.vNormal, Input.vTangent );
+#endif // #ifdef NO_NORMAL_MAP #else
+
+    // Compute lighting contribution
+#ifdef NO_AMBIENT
+    float4 vTotalLightingColor = 0.0f;
+#else // #ifdef NO_AMBIENT
+    float4 vTotalLightingColor = g_vAmbientColor;
+#endif // #ifdef NO_AMBIENT #else
+
+#ifndef NO_DYNAMIC_LIGHTING
+    for ( int iLight = 0; iLight < g_iNumLights; ++iLight )
+    {
+        // CalcLightingColor takes a float3 position; pass xyz explicitly
+        float4 vLightingColor = CalcLightingColor( iLight, Input.vPosWorld.xyz, vPerPixelNormal );
+#ifndef NO_SHADOW_MAP
+        // Don't bother checking shadow map if the pixel is unlit.
+        // any() already yields a bool, so no further comparison is needed.
+        if ( iLight < g_iNumShadows && any( vLightingColor.xyz ) )
+        {
+            vLightingColor *= CalcUnshadowedAmountPCF2x2( iLight, Input.vPosWorld );
+        }
+#endif // #ifndef NO_SHADOW_MAP
+        vTotalLightingColor += vLightingColor;
+    }
+#endif // #ifndef NO_DYNAMIC_LIGHTING
+
+    return vDiffuse * g_vTintColor * g_vObjectColor * vTotalLightingColor;
+}
diff --git a/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl b/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl
new file mode 100644
index 000000000..0d8d32ffa
--- /dev/null
+++ b/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl
@@ -0,0 +1,75 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain
+//--------------------------------------------------------------------------------------
+// File: MultithreadedRendering11_VS.hlsl
+//
+// The vertex shader file for the MultithreadedRendering11 sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// Various debug options
+//#define UNCOMPRESSED_VERTEX_DATA // The sdkmesh file contained uncompressed vertex data
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+// Constants bound to constant buffer slot b0.
+cbuffer cbPerObject : register( b0 )
+{
+ // Applied to the input position, normal and tangent in VSMain.
+ matrix g_mWorld : packoffset( c0 );
+};
+// Constants bound to constant buffer slot b1.
+cbuffer cbPerScene : register( b1 )
+{
+ // Applied to the world-space position in VSMain to produce SV_POSITION.
+ matrix g_mViewProj : packoffset( c0 );
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+// Per-vertex input of VSMain. Unless UNCOMPRESSED_VERTEX_DATA is defined,
+// VSMain expands vNormal and vTangent with R10G10B10A2_UNORM_TO_R32G32B32_FLOAT
+// before using them.
+struct VS_INPUT
+{
+ float4 vPosition : POSITION;
+ float3 vNormal : NORMAL;
+ float2 vTexcoord : TEXCOORD0;
+ float3 vTangent : TANGENT;
+};
+
+// Output of VSMain.
+struct VS_OUTPUT
+{
+ float3 vNormal : NORMAL; // input normal multiplied by the upper 3x3 of g_mWorld
+ float3 vTangent : TANGENT; // input tangent multiplied by the upper 3x3 of g_mWorld
+ float2 vTexcoord : TEXCOORD0; // copied from the input unchanged
+ float4 vPosWorld : TEXCOORD1; // input position multiplied by g_mWorld
+ float4 vPosition : SV_POSITION; // vPosWorld multiplied by g_mViewProj
+};
+
+// Signed vectors were aliased as an unsigned format, so the signed values have
+// to be recovered: every component is doubled, and doubled values of 1.0 or more
+// are shifted down by 2.0. The values 1.0 and 2.0 are slightly inaccurate here.
+float3 R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( in float3 vVec )
+{
+    const float3 vDoubled = vVec * 2.0f;
+    const float3 vShifted = vDoubled - 2.0f;
+
+    // Component-wise select between the shifted and the plain doubled value.
+    return vDoubled >= 1.0f ? vShifted : vDoubled;
+}
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+// Transforms one vertex: position to world space and then to clip space,
+// normal and tangent by the upper 3x3 of the world matrix, texcoord passed through.
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+#ifndef UNCOMPRESSED_VERTEX_DATA
+    // Expand compressed vectors
+    const float3 vNormalIn = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( Input.vNormal );
+    const float3 vTangentIn = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( Input.vTangent );
+#else // #ifndef UNCOMPRESSED_VERTEX_DATA
+    const float3 vNormalIn = Input.vNormal;
+    const float3 vTangentIn = Input.vTangent;
+#endif // #ifndef UNCOMPRESSED_VERTEX_DATA #else
+
+    const float4 vWorldPos = mul( Input.vPosition, g_mWorld );
+
+    VS_OUTPUT Output;
+    Output.vNormal = mul( vNormalIn, (float3x3)g_mWorld );
+    Output.vTangent = mul( vTangentIn, (float3x3)g_mWorld );
+    Output.vTexcoord = Input.vTexcoord;
+    Output.vPosWorld = vWorldPos;
+    Output.vPosition = mul( vWorldPos, g_mViewProj );
+
+    return Output;
+}
+
diff --git a/tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl b/tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl
new file mode 100644
index 000000000..0a694450c
--- /dev/null
+++ b/tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl
@@ -0,0 +1,103 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//--------------------------------------------------------------------------------------
+// File: NBodyGravityCS11.hlsl
+//
+// Demonstrates how to use Compute Shader to do n-body gravity computation
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// Added to the squared distance in bodyBodyInteraction, so the inverse distance
+// stays finite when two positions coincide.
+static float softeningSquared = 0.0012500000*0.0012500000;
+// NOTE(review): presumably a scaled gravitational constant - confirm the units.
+static float g_fG = 6.67300e-11f * 10000.0f;
+// Used as the mass of every particle in CSMain.
+static float g_fParticleMass = g_fG*10000.0f * 10000.0f;
+
+// Threads per group in CSMain, and the number of positions cached per tile.
+#define blocksize 128
+// One tile of particle positions, filled cooperatively by the group in CSMain.
+groupshared float4 sharedPos[blocksize];
+
+// Body to body interaction: adds to ai the acceleration that a body at bj exerts
+// on the particle at bi. The contribution is mass / dist^3 along (bj - bi),
+// multiplied by 'particles', with softeningSquared added to the squared distance.
+void bodyBodyInteraction(inout float3 ai, float4 bj, float4 bi, float mass, int particles )
+{
+    float3 toOther = bj.xyz - bi.xyz;
+
+    // Softened squared distance
+    float softDistSqr = dot(toOther, toOther) + softeningSquared;
+
+    float rcpDist = 1.0f / sqrt(softDistSqr);
+    float rcpDistCubed = rcpDist * rcpDist * rcpDist;
+
+    float scale = mass * rcpDistCubed * particles;
+
+    ai += toOther * scale;
+}
+
+// Parameters of CSMain, bound to constant buffer slot b0.
+// CSMain reads g_param.x as the particle count, g_param.y as the number of
+// tiles to loop over, g_paramf.x as the time step and g_paramf.y as the damping factor.
+cbuffer cbCS : register( b0 )
+{
+ uint4 g_param; // pcbCS->param[0] = MAX_PARTICLES;
+ // pcbCS->param[1] = dimx;
+ float4 g_paramf; // pcbCS->paramf[0] = 0.1f;
+ // pcbCS->paramf[1] = 1;
+};
+
+// State of one particle as stored in the structured buffers.
+struct PosVelo
+{
+ float4 pos; // CSMain integrates only .xyz
+ float4 velo; // .xyz is the velocity; CSMain writes length(accel) into .w
+};
+
+// Particle state read by CSMain.
+StructuredBuffer<PosVelo> oldPosVelo;
+// Particle state written by CSMain.
+RWStructuredBuffer<PosVelo> newPosVelo;
+
+// Each thread of the CS updates one particle: it accumulates the acceleration
+// caused by all particles, one shared-memory tile at a time, and then
+// integrates the particle's velocity and position.
+[numthreads(blocksize, 1, 1)]
+void CSMain( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const uint particleId = DTid.x;
+
+    float4 pos = oldPosVelo[particleId].pos;
+    float4 vel = oldPosVelo[particleId].velo;
+    float3 accel = 0;
+    float mass = g_fParticleMass;
+
+    // Update current particle using all other particles
+    [loop]
+    for (uint tile = 0; tile < g_param.y; tile++)
+    {
+        // Cache a tile of particles into shared memory to increase IO efficiency
+        sharedPos[GI] = oldPosVelo[tile * blocksize + GI].pos;
+
+        // The whole tile must be written before any thread reads it
+        GroupMemoryBarrierWithGroupSync();
+
+        // Interact with every cached position, in index order
+        [unroll]
+        for (uint slot = 0; slot < blocksize; slot++ )
+        {
+            bodyBodyInteraction(accel, sharedPos[slot], pos, mass, 1);
+        }
+
+        // All reads must finish before the next iteration overwrites the tile
+        GroupMemoryBarrierWithGroupSync();
+    }
+
+    // g_param.x is the number of our particles, however this number might not be an exact multiple of the tile size.
+    // In such cases, out of bound reads occur in the process above, which means there will be
+    // phantomCount "phantom" particles generating false gravity at position (0, 0, 0), so we have to subtract them here.
+    // NOTE, out of bound reads always return 0 in CS
+    const uint phantomCount = g_param.y * blocksize - g_param.x;
+    bodyBodyInteraction(accel, float4(0, 0, 0, 0), pos, mass, -phantomCount);
+
+    // Update the velocity and position of current particle using the acceleration computed above
+    const float deltaTime = g_paramf.x;
+    const float damping = g_paramf.y;
+    vel.xyz += accel.xyz * deltaTime;
+    vel.xyz *= damping;
+    pos.xyz += vel.xyz * deltaTime;
+
+    // Threads past the last particle do not write a result
+    if ( particleId < g_param.x )
+    {
+        newPosVelo[particleId].pos = pos;
+        newPosVelo[particleId].velo = float4(vel.xyz, length(accel));
+    }
+}
diff --git a/tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl b/tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl
new file mode 100644
index 000000000..ea56e20e9
--- /dev/null
+++ b/tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl
@@ -0,0 +1,128 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSParticleDraw -profile gs_4_0 -entry GSParticleDraw -profile ps_4_0 -entry PSParticleDraw
+//--------------------------------------------------------------------------------------
+// File: ParticleDraw.hlsl
+//
+// Shaders for rendering the particle as point sprite
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// Input of VSParticleDraw.
+struct VSParticleIn
+{
+ float4 color : COLOR;
+ uint id : SV_VERTEXID; // used as the index into g_bufPosVelo
+};
+
+// Output of VSParticleDraw and per-point input of GSParticleDraw.
+struct VSParticleDrawOut
+{
+ float3 pos : POSITION;
+ float4 color : COLOR;
+};
+
+// Vertex emitted by GSParticleDraw (one per sprite corner).
+struct GSParticleDrawOut
+{
+ float2 tex : TEXCOORD0;
+ float4 color : COLOR;
+ float4 pos : SV_POSITION;
+};
+
+// Input of PSParticleDraw.
+struct PSParticleDrawIn
+{
+ float2 tex : TEXCOORD0;
+ float4 color : COLOR;
+};
+
+// Element layout of g_bufPosVelo.
+struct PosVelo
+{
+ float4 pos;
+ float4 velo; // VSParticleDraw uses .w / 9 as a color blend weight
+};
+
+// Sprite texture sampled in PSParticleDraw.
+Texture2D g_txDiffuse;
+// Particle state, indexed by vertex id in VSParticleDraw.
+StructuredBuffer<PosVelo> g_bufPosVelo;
+
+
+// Sampler used by PSParticleDraw: linear min/mag/mip filtering, clamped in U and V.
+SamplerState g_samLinear
+{
+ Filter = MIN_MAG_MIP_LINEAR;
+ AddressU = Clamp;
+ AddressV = Clamp;
+};
+
+// Matrices used by GSParticleDraw.
+cbuffer cb0
+{
+ row_major float4x4 g_mWorldViewProj; // applied to each finished corner position
+ row_major float4x4 g_mInvView; // its upper 3x3 is applied to the corner offset
+};
+
+cbuffer cb1
+{
+ // Scale applied to the corner offsets in GSParticleDraw.
+ // NOTE(review): declared 'static' with an initializer although it sits inside a cbuffer - confirm intent.
+ static float g_fParticleRad = 10.0f;
+};
+
+// Per-corner tables indexed by the loop counter (0..3) in GSParticleDraw.
+cbuffer cbImmutable
+{
+ // Corner offsets, scaled by g_fParticleRad before use.
+ static float3 g_positions[4] =
+ {
+ float3( -1, 1, 0 ),
+ float3( 1, 1, 0 ),
+ float3( -1, -1, 0 ),
+ float3( 1, -1, 0 ),
+ };
+
+ // Texture coordinate emitted with the corner of the same index.
+ static float2 g_texcoords[4] =
+ {
+ float2(0,0),
+ float2(1,0),
+ float2(0,1),
+ float2(1,1),
+ };
+};
+
+//
+// Vertex shader for drawing the point-sprite particles.
+// Looks up the particle selected by the vertex id and outputs its position
+// together with a color blended from a fixed color towards the vertex color.
+//
+VSParticleDrawOut VSParticleDraw(VSParticleIn input)
+{
+    VSParticleDrawOut output;
+
+    // The buffer stores a float4 but the output is a float3: select .xyz
+    // explicitly instead of relying on implicit vector truncation.
+    output.pos = g_bufPosVelo[input.id].pos.xyz;
+
+    // velo.w / 9 is the blend weight between float4(1,0.1,0.1,1) and the vertex color.
+    float mag = g_bufPosVelo[input.id].velo.w/9;
+    output.color = lerp( float4(1,0.1,0.1,1), input.color, mag );
+
+    return output;
+}
+
+//
+// GS for rendering point sprite particles. Expands one point into a four-vertex
+// triangle strip (2 tris) built from the g_positions / g_texcoords corner tables.
+//
+[maxvertexcount(4)]
+void GSParticleDraw(point VSParticleDrawOut input[1], inout TriangleStream<GSParticleDrawOut> SpriteStream)
+{
+    const float3 center = input[0].pos;
+    const float4 tint = input[0].color;
+
+    //
+    // Emit two new triangles
+    //
+    for(int corner=0; corner<4; corner++)
+    {
+        GSParticleDrawOut vert;
+
+        // Scale the corner offset, rotate it by the inverse view matrix, move it to the particle
+        float3 cornerPos = g_positions[corner] * g_fParticleRad;
+        cornerPos = mul( cornerPos, (float3x3)g_mInvView ) + center;
+
+        vert.pos = mul( float4(cornerPos,1.0), g_mWorldViewProj );
+        vert.color = tint;
+        vert.tex = g_texcoords[corner];
+        SpriteStream.Append(vert);
+    }
+    SpriteStream.RestartStrip();
+}
+
+//
+// PS for drawing particles: the sprite texel multiplied by the interpolated color
+//
+float4 PSParticleDraw(PSParticleDrawIn input) : SV_Target
+{
+    float4 texel = g_txDiffuse.Sample( g_samLinear, input.tex );
+    return texel * input.color;
+} \ No newline at end of file
diff --git a/tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl b/tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl
new file mode 100644
index 000000000..dfc98b217
--- /dev/null
+++ b/tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl
@@ -0,0 +1,277 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CreatePrefixSum_Pass0_CS -entry CreatePrefixSum_Pass1_CS -entry SortAndRenderCS
+//-----------------------------------------------------------------------------
+// File: OIT_CS.hlsl
+//
+// Desc: Compute shaders used in the Order Independent Transparency sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+// TODO: use structured buffers
+// Fragment depths; a pixel's fragments start at its prefix-sum offset (see SortAndRenderCS).
+RWBuffer<float> deepBufferDepth : register( u0 );
+// Fragment colors, four 8-bit channels packed into one uint (unpacked in SortAndRenderCS).
+RWBuffer<uint> deepBufferColorUINT : register( u1 );
+// Final image written by SortAndRenderCS.
+RWTexture2D<float4> frameBuffer : register( u2 );
+// Running sum of the per-pixel fragment counts, built by the CreatePrefixSum passes.
+RWBuffer<uint> prefixSum : register( u3 );
+
+// Number of fragments recorded for each pixel.
+Texture2D<uint> fragmentCount : register ( t0 );
+
+// Constants shared by the compute shaders in this file, bound to slot b0.
+cbuffer CB : register( b0 )
+{
+ uint g_nFrameWidth : packoffset( c0.x ); // row pitch used to flatten (x, y) into a 1D index
+ uint g_nFrameHeight : packoffset( c0.y ); // with the width, bounds the prefix-sum indices
+ uint g_nPassSize : packoffset( c0.z ); // group size of the current CreatePrefixSum_Pass1_CS pass
+ uint g_nReserved : packoffset( c0.w ); // not read by the shaders in this file
+}
+
+// Row pitch used when SortAndRenderCS flattens nGTid into idx.
+#define blocksize 1
+#define groupthreads (blocksize*blocksize)
+// NOTE(review): accum is not referenced by any shader in this file.
+groupshared float accum[groupthreads];
+
+// First pass of the prefix sum creation algorithm. Converts the 2D fragment-count
+// texture to the 1D prefixSum buffer: every even bin stores its own count and
+// also writes (its count + the next pixel's count) into the odd bin after it.
+[numthreads(1,1,1)]
+void CreatePrefixSum_Pass0_CS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    int nBin = nGid.y*g_nFrameWidth + nGid.x;
+
+    // Odd bins are filled in by the even bin in front of them
+    if( nBin%2 != 0 )
+    {
+        return;
+    }
+
+    prefixSum[nBin] = fragmentCount[nGid.xy];
+
+    // Add the Fragment count to the next bin, if that bin exists
+    int nNextBin = nBin+1;
+    if( nNextBin < g_nFrameWidth * g_nFrameHeight )
+    {
+        // Convert the flat index of the next bin back to a 2D pixel coordinate
+        int2 nextUV;
+        nextUV.x = nNextBin % g_nFrameWidth;
+        nextUV.y = nNextBin / g_nFrameWidth;
+        prefixSum[ nNextBin ] = prefixSum[ nBin ] + fragmentCount[ nextUV ];
+    }
+}
+
+// Second and following passes. Each group of g_nPassSize entries adds the last
+// value of its first half to every entry of its second half. There are
+// n/groupsize groups in each pass, and each pass increases the group size until
+// it is the size of the buffer. The resulting buffer holds the prefix sum of
+// all preceding values in each position.
+[numthreads(1,1,1)]
+void CreatePrefixSum_Pass1_CS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    int nGroup = nGid.x;
+
+    uint nGroupBegin = nGroup*g_nPassSize;
+    uint nSecondHalf = nGroupBegin + g_nPassSize/2;
+    uint nGroupEnd = nGroupBegin + g_nPassSize;
+    uint nBufferSize = g_nFrameWidth*g_nFrameHeight;
+
+    // Sum accumulated by the first half of the group
+    int nCarry = prefixSum[nSecondHalf - 1];
+
+    // Spread it over the second half, without running past the end of the buffer
+    for(int i = nSecondHalf; i < nGroupEnd && i < nBufferSize; i++)
+    {
+        prefixSum[i] = prefixSum[i] + nCarry;
+    }
+}
+
+#if 1
+
+// Sort the fragments using a bitonic sort, then accumulate the fragments into the final result.
+// One single-thread group runs per pixel (nGid.xy): it gathers the pixel's N
+// fragment depths, sorts an index array by depth, and blends the fragment
+// colors from the last sorted index down to the first into frameBuffer.
+// NOTE(review): fDepth and nIndex hold 32 entries and nothing here limits N
+// (or N2) to 32 - confirm the application bounds the per-pixel fragment count.
+groupshared int nIndex[32];
+#define NUM_THREADS 8
+[numthreads(1,1,1)]
+void SortAndRenderCS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    uint nThreadNum = nGid.y * g_nFrameWidth + nGid.x;
+
+    // Offset of this pixel's first fragment in the deep buffers. Pixel 0 has no
+    // preceding prefix-sum entry, so its fragments start at 0; indexing
+    // prefixSum[nThreadNum-1] there would wrap the unsigned index around.
+    // This matches the explicit check in FillDeepBufferPS.
+    uint nFragmentBase = 0;
+    if( nThreadNum != 0 )
+    {
+        nFragmentBase = prefixSum[nThreadNum-1];
+    }
+
+//  uint r0, r1, r2;
+//  float rd0, rd1, rd2, rd3, rd4, rd5, rd6, rd7;
+
+    uint N = fragmentCount[nDTid.xy];
+
+    // Sort size: N rounded up to a power of two
+    uint N2 = 1 << (int)(ceil(log2(N)));
+
+    // Load the real fragments ...
+    float fDepth[32];
+    for(int i = 0; i < N; i++)
+    {
+        nIndex[i] = i;
+        fDepth[i] = deepBufferDepth[ nFragmentBase + i ];
+    }
+    // ... and pad up to N2 with depth 1.1f
+    // (presumably beyond any stored depth, so padding sorts last - confirm)
+    for(int i = N; i < N2; i++)
+    {
+        nIndex[i] = i;
+        fDepth[i] = 1.1f;
+    }
+
+    uint idx = blocksize*nGTid.y + nGTid.x;
+
+    // Bitonic sort of nIndex by fDepth; fDepth itself is never reordered
+    for( int k = 2; k <= N2; k = 2*k )
+    {
+        for( int j = k>>1; j > 0 ; j = j>>1 )
+        {
+            for( int i = 0; i < N2; i++ )
+            {
+//              GroupMemoryBarrierWithGroupSync();
+                //i = idx;
+
+                float di = fDepth[ nIndex[ i ] ];
+                int ixj = i^j;
+                if ( ( ixj ) > i )
+                {
+                    float dixj = fDepth[ nIndex[ ixj ] ];
+                    // Ascending half of the bitonic sequence
+                    if ( ( i&k ) == 0 && di > dixj )
+                    {
+                        int temp = nIndex[ i ];
+                        nIndex[ i ] = nIndex[ ixj ];
+                        nIndex[ ixj ] = temp;
+                    }
+                    // Descending half of the bitonic sequence
+                    if ( ( i&k ) != 0 && di < dixj )
+                    {
+                        int temp = nIndex[ i ];
+                        nIndex[ i ] = nIndex[ ixj ];
+                        nIndex[ ixj ] = temp;
+                    }
+                }
+            }
+        }
+    }
+
+    // Output the final result to the frame buffer
+    if( idx == 0 )
+    {
+
+        /*
+        // Debug
+        uint color[8];
+        for(int i = 0; i < 8; i++)
+        {
+            color[i] = deepBufferColorUINT[nFragmentBase + i];
+        }
+
+        for(int i = 0; i < 8; i++)
+        {
+            deepBufferDepth[nThreadNum*8+i] = fDepth[i];//fDepth[nIndex[i]];
+            deepBufferColorUINT[nThreadNum*8+i] = color[nIndex[i]];
+        }
+        */
+
+        // Accumulate fragments into final result
+        float4 result = 0.0f;
+        for( int x = N-1; x >= 0; x-- )
+        {
+            uint bufferValue = deepBufferColorUINT[ nFragmentBase + nIndex[ x ] ];
+            // Unpack four 8-bit channels into [0, 1] floats
+            float4 color;
+            color.r = ( ( bufferValue >> 0 & 0xFF )) / 255.0f;
+            color.g = ( bufferValue >> 8 & 0xFF ) / 255.0f;
+            color.b = ( bufferValue >> 16 & 0xFF ) / 255.0f;
+            color.a = ( bufferValue >> 24 & 0xFF ) / 255.0f;
+            result = lerp( result, color, color.a );
+        }
+        result.a = 1.0f;
+        frameBuffer[ nGid.xy ] = result;
+    }
+}
+
+#else
+// Alternative SortAndRenderCS, compiled only when the '#if 1' above is changed.
+// Handles exactly three fragments per pixel, stored at nThreadNum*8 + {0,1,2}:
+// orders them by depth with explicit comparisons, writes the ordered depths and
+// colors back, and blends the three colors into frameBuffer.
+[numthreads(1,1,1)]
+void SortAndRenderCS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+ uint nThreadNum = nDTid.y * g_nFrameWidth + nDTid.x;
+ float d0 = deepBufferDepth[nThreadNum*8];
+ float d1 = deepBufferDepth[nThreadNum*8+1];
+ float d2 = deepBufferDepth[nThreadNum*8+2];
+
+ uint s0 = deepBufferColorUINT[nThreadNum*8 + 0];
+ uint s1 = deepBufferColorUINT[nThreadNum*8 + 1];
+ uint s2 = deepBufferColorUINT[nThreadNum*8 + 2];
+
+ // r0..r2 / rd0..rd2 receive the colors / depths in ascending depth order.
+ uint r0, r1, r2;
+ float rd0, rd1, rd2;
+ // Case 1: d0 is the smallest depth
+ if( d0 < d1 && d0 < d2 )
+ {
+ r0 = s0;
+ rd0 = d0;
+ if( d1 < d2 )
+ {
+ r1 = s1;
+ r2 = s2;
+
+ rd1 = d1;
+ rd2 = d2;
+ }
+ else
+ {
+ r1 = s2;
+ r2 = s1;
+
+ rd1 = d2;
+ rd2 = d1;
+ }
+ }
+ // Case 2: d0 is not the smallest and d1 is below d2, so d1 comes first
+ else if( d1 < d2 )
+ {
+ r0 = s1;
+ rd0 = d1;
+ if( d0 < d2 )
+ {
+ r1 = s0;
+ r2 = s2;
+
+ rd1 = d0;
+ rd2 = d2;
+ }
+ else
+ {
+ r1 = s2;
+ r2 = s0;
+
+ rd1 = d2;
+ rd2 = d0;
+ }
+ }
+ // Case 3: otherwise d2 comes first
+ else
+ {
+ r0 = s2;
+ rd0 = d2;
+ if( d1 < d0 )
+ {
+ r1 = s1;
+ r2 = s0;
+
+ rd1 = d1;
+ rd2 = d0;
+ }
+ else
+ {
+ r1 = s0;
+ r2 = s1;
+
+ rd1 = d0;
+ rd2 = d1;
+ }
+ }
+
+ // Write the ordered fragments back to the deep buffers
+ deepBufferDepth[nThreadNum*8] = rd0;
+ deepBufferDepth[nThreadNum*8+1] = rd1;
+ deepBufferDepth[nThreadNum*8+2] = rd2;
+
+ deepBufferColorUINT[nThreadNum*8] = r0;
+ deepBufferColorUINT[nThreadNum*8+1] = r1;
+ deepBufferColorUINT[nThreadNum*8+2] = r2;
+
+ // convert the color to floats
+ float4 color[3];
+ color[0].r = (r0 >> 0 & 0xFF) / 255.0f;
+ color[0].g = (r0 >> 8 & 0xFF) / 255.0f;
+ color[0].b = (r0 >> 16 & 0xFF) / 255.0f;
+ color[0].a = (r0 >> 24 & 0xFF) / 255.0f;
+
+ color[1].r = (r1 >> 0 & 0xFF) / 255.0f;
+ color[1].g = (r1 >> 8 & 0xFF) / 255.0f;
+ color[1].b = (r1 >> 16 & 0xFF) / 255.0f;
+ color[1].a = (r1 >> 24 & 0xFF) / 255.0f;
+
+ color[2].r = (r2 >> 0 & 0xFF) / 255.0f;
+ color[2].g = (r2 >> 8 & 0xFF) / 255.0f;
+ color[2].b = (r2 >> 16 & 0xFF) / 255.0f;
+ color[2].a = (r2 >> 24 & 0xFF) / 255.0f;
+
+ // Blend starting from color[2] (largest depth) and ending with color[0] (smallest depth)
+ float4 result = lerp(lerp(lerp(0, color[2], color[2].a), color[1], color[1].a), color[0], color[0].a);
+ result.a = 1.0f;
+
+ frameBuffer[nDTid.xy] = result;
+}
+
+#endif \ No newline at end of file
diff --git a/tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl b/tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl
new file mode 100644
index 000000000..1fdb31622
--- /dev/null
+++ b/tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl
@@ -0,0 +1,56 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry FragmentCountPS -entry FillDeepBufferPS
+//-----------------------------------------------------------------------------
+// File: OIT_PS.hlsl
+//
+// Desc: Pixel shaders used in the Order Independent Transparency sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+//TODO: Use structured buffers
+// Per-pixel fragment counter, incremented atomically by both pixel shaders.
+RWTexture2D<uint> fragmentCount : register( u1 );
+// Fragment depths, written by FillDeepBufferPS.
+RWBuffer<float> deepBufferDepth : register( u2 );
+// Fragment colors scaled to 0..255, written by FillDeepBufferPS.
+RWBuffer<uint4> deepBufferColor : register( u3 );
+// Read by FillDeepBufferPS to find where a pixel's fragments start.
+RWBuffer<uint> prefixSum : register( u4 );
+
+// Constants bound to slot b0. Only g_nFrameWidth is read by the shaders in this file.
+cbuffer CB : register( b0 )
+{
+ uint g_nFrameWidth : packoffset( c0.x ); // row pitch used to flatten (x, y) in FillDeepBufferPS
+ uint g_nFrameHeight : packoffset( c0.y );
+ uint g_nReserved0 : packoffset( c0.z );
+ uint g_nReserved1 : packoffset( c0.w );
+}
+
+// Input of both pixel shaders in this file.
+struct SceneVS_Output
+{
+ float4 pos : SV_POSITION; // .xy selects the pixel, .z is stored as the fragment depth
+ float4 color : COLOR0;
+};
+
+// Counts the fragments covering each pixel: adds one to that pixel's entry
+// of fragmentCount.
+void FragmentCountPS( SceneVS_Output input)
+{
+    const uint nOne = 1;
+
+    // Increments need to be done atomically
+    InterlockedAdd(fragmentCount[input.pos.xy], nOne);
+}
+
+// Stores one fragment's depth and color in the deep buffers. The slot is the
+// pixel's start offset (prefix sum of the preceding pixel, or 0 for pixel 0)
+// plus the value the pixel's fragment counter held before this increment.
+void FillDeepBufferPS( SceneVS_Output input )
+{
+    // Atomically allocate space in the deep buffer
+    uint nSlotInPixel;
+    InterlockedAdd(fragmentCount[input.pos.xy], 1, nSlotInPixel);
+
+    uint nCol = input.pos.x;
+    uint nRow = input.pos.y;
+    uint nPixel = nRow*g_nFrameWidth + nCol;
+
+    // Pixel 0 has no preceding prefix-sum entry; its fragments start at 0
+    uint nPixelBase = 0;
+    if( nPixel != 0 )
+        nPixelBase = prefixSum[nPixel-1];
+
+    uint nDeepBufferPos = nPixelBase + nSlotInPixel;
+
+    // Store fragment data into the allocated space
+    deepBufferDepth[nDeepBufferPos] = input.pos.z;
+    deepBufferColor[nDeepBufferPos] = clamp(input.color, 0, 1)*255;
+}
+
diff --git a/tests/hlsl/dxsdk/OIT11/SceneVS.hlsl b/tests/hlsl/dxsdk/OIT11/SceneVS.hlsl
new file mode 100644
index 000000000..2f985d1d1
--- /dev/null
+++ b/tests/hlsl/dxsdk/OIT11/SceneVS.hlsl
@@ -0,0 +1,36 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry SceneVS
+//-----------------------------------------------------------------------------
+// File: SceneVS.hlsl
+//
+// Desc: Vertex shader for the scene.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+
+
+// Constants bound to constant buffer slot b0.
+cbuffer cbPerObject : register( b0 )
+{
+ // Applied to the input position in SceneVS to produce SV_POSITION.
+ row_major matrix g_mWorldViewProjection : packoffset( c0 );
+}
+
+// Per-vertex input of SceneVS.
+struct SceneVS_Input
+{
+ float4 pos : POSITION;
+ float4 color : COLOR;
+};
+
+// Output of SceneVS.
+struct SceneVS_Output
+{
+ float4 pos : SV_POSITION; // input position multiplied by g_mWorldViewProjection
+ float4 color : COLOR0; // input color, unchanged
+};
+
+// Transforms the vertex position by g_mWorldViewProjection and passes the
+// vertex color through unchanged.
+SceneVS_Output SceneVS( SceneVS_Input input )
+{
+    const float4 clipPos = mul(input.pos, g_mWorldViewProjection );
+
+    SceneVS_Output output;
+    output.pos = clipPos;
+    output.color = input.color;
+    return output;
+}
diff --git a/tests/hlsl/dxsdk/README.md b/tests/hlsl/dxsdk/README.md
new file mode 100644
index 000000000..dd0c0fb6b
--- /dev/null
+++ b/tests/hlsl/dxsdk/README.md
@@ -0,0 +1,5 @@
+DirectX SDK Sample Shaders
+==========================
+
+This directory contains shaders that have shipped as part of the DirectX SDK.
+The license terms for these shaders are specified at the top of the source files. \ No newline at end of file
diff --git a/tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl b/tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl
new file mode 100644
index 000000000..7b7a1489c
--- /dev/null
+++ b/tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl
@@ -0,0 +1,230 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry BezierVS -profile hs_5_0 -entry BezierHS -profile ds_5_0 -entry BezierDS -profile ps_4_0 -entry BezierPS -entry SolidColorPS
+//--------------------------------------------------------------------------------------
+// File: SimpleBezier11.hlsl
+//
+// This sample shows a simple implementation of the DirectX 11 Hardware Tessellator
+// for rendering a Bezier Patch.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// This allows us to compile the shader with a #define to choose
+// the different partition modes for the hull shader.
+// See the hull shader: [partitioning(BEZIER_HS_PARTITION)]
+// This sample demonstrates "integer", "fractional_even", and "fractional_odd"
+// Default partition mode, used only when the build does not define one.
+#ifndef BEZIER_HS_PARTITION
+#define BEZIER_HS_PARTITION "integer"
+#endif // BEZIER_HS_PARTITION
+
+// The input patch size. In this sample, it is 16 control points.
+// This value should match the call to IASetPrimitiveTopology()
+// It sizes the InputPatch<> argument of BezierConstantHS and BezierHS.
+#define INPUT_PATCH_SIZE 16
+
+// The output patch size. In this sample, it is also 16 control points.
+// It is the [outputcontrolpoints] count of BezierHS and sizes the OutputPatch<>
+// arguments of EvaluateBezier and BezierDS.
+#define OUTPUT_PATCH_SIZE 16
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+// Constants bound to constant buffer slot b0.
+cbuffer cbPerFrame : register( b0 )
+{
+ matrix g_mViewProjection; // applied to the evaluated world position in BezierDS
+ float3 g_vCameraPosWorld; // subtracted from the world position in BezierPS
+ float g_fTessellationFactor; // written to every edge and inside factor in BezierConstantHS
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex shader section
+//--------------------------------------------------------------------------------------
+// One control point as read by BezierVS.
+struct VS_CONTROL_POINT_INPUT
+{
+ float3 vPosition : POSITION;
+};
+
+// One control point as written by BezierVS; element type of the hull shader's InputPatch.
+struct VS_CONTROL_POINT_OUTPUT
+{
+ float3 vPosition : POSITION;
+};
+
+// This simple vertex shader passes the control points straight through to the
+// hull shader. In a more complex scene, you might transform the control points
+// or perform skinning at this step.
+
+// The input to the vertex shader comes from the vertex buffer.
+
+// The output from the vertex shader will go into the hull shader.
+
+// Pass-through vertex shader: copies the control point position unchanged.
+VS_CONTROL_POINT_OUTPUT BezierVS( VS_CONTROL_POINT_INPUT Input )
+{
+    const float3 vControlPoint = Input.vPosition;
+
+    VS_CONTROL_POINT_OUTPUT Output;
+    Output.vPosition = vControlPoint;
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Constant data function for the BezierHS. This is executed once per patch.
+//--------------------------------------------------------------------------------------
+// Per-patch tessellation factors produced by BezierConstantHS.
+struct HS_CONSTANT_DATA_OUTPUT
+{
+ float Edges[4] : SV_TessFactor;
+ float Inside[2] : SV_InsideTessFactor;
+};
+
+// One output control point of BezierHS; element type of the domain shader's OutputPatch.
+struct HS_OUTPUT
+{
+ float3 vPosition : BEZIERPOS;
+};
+
+// This constant hull shader is executed once per patch. For the simple Mobius strip
+// model, it will be executed 4 times. In this sample, we set the tessellation factor
+// via SV_TessFactor and SV_InsideTessFactor for each patch. In a more complex scene,
+// you might calculate a variable tessellation factor based on the camera's distance.
+
+// Patch-constant function: writes g_fTessellationFactor to all four edge
+// factors and both inside factors. The patch and its id are not inspected.
+HS_CONSTANT_DATA_OUTPUT BezierConstantHS( InputPatch<VS_CONTROL_POINT_OUTPUT, INPUT_PATCH_SIZE> ip,
+                                          uint PatchID : SV_PrimitiveID )
+{
+    const float fFactor = g_fTessellationFactor;
+
+    HS_CONSTANT_DATA_OUTPUT Output;
+
+    Output.Edges[0] = fFactor;
+    Output.Edges[1] = fFactor;
+    Output.Edges[2] = fFactor;
+    Output.Edges[3] = fFactor;
+
+    Output.Inside[0] = fFactor;
+    Output.Inside[1] = fFactor;
+
+    return Output;
+}
+
+// The hull shader is called once per output control point, which is specified with
+// outputcontrolpoints. For this sample, we take the control points from the vertex
+// shader and pass them directly off to the domain shader. In a more complex scene,
+// you might perform a basis conversion from the input control points into a Bezier
+// patch, such as the SubD11 Sample.
+
+// The input to the hull shader comes from the vertex shader
+
+// The output from the hull shader will go to the domain shader.
+// The tessellation factor, topology, and partition mode will go to the fixed function
+// tessellator stage to calculate the UVW and domain points.
+
+// Per-control-point hull shader: forwards input control point i as output control point i.
+[domain("quad")]
+[partitioning(BEZIER_HS_PARTITION)]
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(OUTPUT_PATCH_SIZE)]
+[patchconstantfunc("BezierConstantHS")]
+HS_OUTPUT BezierHS( InputPatch<VS_CONTROL_POINT_OUTPUT, INPUT_PATCH_SIZE> p,
+                    uint i : SV_OutputControlPointID,
+                    uint PatchID : SV_PrimitiveID )
+{
+    const float3 vControlPoint = p[i].vPosition;
+
+    HS_OUTPUT Output;
+    Output.vPosition = vControlPoint;
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Bezier evaluation domain shader section
+//--------------------------------------------------------------------------------------
+// Output of BezierDS and input of the pixel shaders.
+struct DS_OUTPUT
+{
+ float4 vPosition : SV_POSITION; // evaluated position multiplied by g_mViewProjection
+ float3 vWorldPos : WORLDPOS; // evaluated position before that multiplication
+ float3 vNormal : NORMAL; // normalized cross product of the two surface derivatives
+};
+
+//--------------------------------------------------------------------------------------
+// Returns the four cubic Bernstein basis values at t:
+// ( (1-t)^3, 3t(1-t)^2, 3t^2(1-t), t^3 ).
+float4 BernsteinBasis(float t)
+{
+    float invT = 1.0f - t;
+
+    float b0 = invT * invT * invT;
+    float b1 = 3.0f * t * invT * invT;
+    float b2 = 3.0f * t * t * invT;
+    float b3 = t * t * t;
+
+    return float4( b0, b1, b2, b3 );
+}
+
+//--------------------------------------------------------------------------------------
+// Returns the derivatives with respect to t of the four cubic Bernstein basis values.
+float4 dBernsteinBasis(float t)
+{
+    float invT = 1.0f - t;
+
+    float d0 = -3 * invT * invT;
+    float d1 = 3 * invT * invT - 6 * t * invT;
+    float d2 = 6 * t * invT - 3 * t * t;
+    float d3 = 3 * t * t;
+
+    return float4( d0, d1, d2, d3 );
+}
+
+//--------------------------------------------------------------------------------------
+// Weights the 4x4 grid of control points: each row of four points is combined
+// with BasisU, and the four row results are combined with BasisV.
+float3 EvaluateBezier( const OutputPatch<HS_OUTPUT, OUTPUT_PATCH_SIZE> bezpatch,
+                       float4 BasisU,
+                       float4 BasisV )
+{
+    float3 Row0 = bezpatch[0].vPosition * BasisU.x + bezpatch[1].vPosition * BasisU.y + bezpatch[2].vPosition * BasisU.z + bezpatch[3].vPosition * BasisU.w;
+    float3 Row1 = bezpatch[4].vPosition * BasisU.x + bezpatch[5].vPosition * BasisU.y + bezpatch[6].vPosition * BasisU.z + bezpatch[7].vPosition * BasisU.w;
+    float3 Row2 = bezpatch[8].vPosition * BasisU.x + bezpatch[9].vPosition * BasisU.y + bezpatch[10].vPosition * BasisU.z + bezpatch[11].vPosition * BasisU.w;
+    float3 Row3 = bezpatch[12].vPosition * BasisU.x + bezpatch[13].vPosition * BasisU.y + bezpatch[14].vPosition * BasisU.z + bezpatch[15].vPosition * BasisU.w;
+
+    float3 Value = BasisV.x * Row0;
+    Value += BasisV.y * Row1;
+    Value += BasisV.z * Row2;
+    Value += BasisV.w * Row3;
+
+    return Value;
+}
+
+// The domain shader is run once per vertex and calculates the final vertex's position
+// and attributes. It receives the UVW from the fixed function tessellator and the
+// control point outputs from the hull shader. Since we are using the DirectX 11
+// Tessellation pipeline, it is the domain shader's responsibility to calculate the
+// final SV_POSITION for each vertex. In this sample, we evaluate the vertex's
+// position using a Bernstein polynomial and the normal is calculated as the cross
+// product of the U and V derivatives.
+
+// The input SV_DomainLocation to the domain shader comes from fixed function
+// tessellator. And the OutputPatch comes from the hull shader. From these, you
+// must calculate the final vertex position, color, texcoords, and other attributes.
+
+// The output from the domain shader will be a vertex that will go to the video card's
+// rasterization pipeline and get drawn to the screen.
+
+// Domain shader: evaluates the patch position at UV, derives the normal from
+// the cross product of the U and V derivatives, and outputs the position
+// multiplied by g_mViewProjection. The patch-constant input is not read.
+[domain("quad")]
+DS_OUTPUT BezierDS( HS_CONSTANT_DATA_OUTPUT input,
+                    float2 UV : SV_DomainLocation,
+                    const OutputPatch<HS_OUTPUT, OUTPUT_PATCH_SIZE> bezpatch )
+{
+    // Basis values and their derivatives along each parameter
+    float4 vBasisU = BernsteinBasis( UV.x );
+    float4 vBasisV = BernsteinBasis( UV.y );
+    float4 vDerivU = dBernsteinBasis( UV.x );
+    float4 vDerivV = dBernsteinBasis( UV.y );
+
+    // Surface point and its two partial derivatives
+    float3 vSurfacePos = EvaluateBezier( bezpatch, vBasisU, vBasisV );
+    float3 vTangentU = EvaluateBezier( bezpatch, vDerivU, vBasisV );
+    float3 vTangentV = EvaluateBezier( bezpatch, vBasisU, vDerivV );
+
+    DS_OUTPUT Output;
+    Output.vPosition = mul( float4(vSurfacePos,1), g_mViewProjection );
+    Output.vWorldPos = vSurfacePos;
+    Output.vNormal = normalize( cross( vTangentU, vTangentV ) );
+
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Smooth shading pixel shader section
+//--------------------------------------------------------------------------------------
+
+// The pixel shader works the same as it would in a normal graphics pipeline.
+// In this sample, it performs very simple N dot L lighting.
+
+// Red shading scaled by |N . L|, where L is the normalized vector from the
+// camera position to the surface point.
+float4 BezierPS( DS_OUTPUT Input ) : SV_TARGET
+{
+    float3 vNormal = normalize(Input.vNormal);
+    float3 vFromCamera = normalize(Input.vWorldPos - g_vCameraPosWorld);
+
+    float fIntensity = abs(dot(vNormal, vFromCamera));
+    return fIntensity * float4(1, 0, 0, 1);
+}
+
+//--------------------------------------------------------------------------------------
+// Solid color shading pixel shader (used for wireframe overlay)
+//--------------------------------------------------------------------------------------
+// Ignores its input and always outputs opaque green.
+float4 SolidColorPS( DS_OUTPUT Input ) : SV_TARGET
+{
+    // Solid green color
+    const float4 vGreen = float4( 0, 1, 0, 1 );
+    return vGreen;
+}
diff --git a/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx
new file mode 100644
index 000000000..00883ce70
--- /dev/null
+++ b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx
@@ -0,0 +1,112 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: SimpleSample.fx
+//
+// The effect file for the SimpleSample sample.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Global variables
+//--------------------------------------------------------------------------------------
+float4 g_MaterialAmbientColor; // Material's ambient color
+float4 g_MaterialDiffuseColor; // Material's diffuse color
+float3 g_LightDir; // Light's direction in world space
+float4 g_LightDiffuse; // Light's diffuse color
+texture g_MeshTexture; // Color texture for mesh
+
+float g_fTime; // App's time in seconds
+float4x4 g_mWorld; // World matrix for object
+float4x4 g_mWorldViewProjection; // World * View * Projection matrix
+
+
+
+//--------------------------------------------------------------------------------------
+// Texture samplers
+//--------------------------------------------------------------------------------------
+sampler MeshTextureSampler =
+sampler_state
+{
+ Texture = <g_MeshTexture>;
+ MipFilter = LINEAR;
+ MinFilter = LINEAR;
+ MagFilter = LINEAR;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex shader output structure
+//--------------------------------------------------------------------------------------
+struct VS_OUTPUT
+{
+ float4 Position : POSITION; // vertex position
+ float4 Diffuse : COLOR0; // vertex diffuse color (note that COLOR0 is clamped from 0..1)
+ float2 TextureUV : TEXCOORD0; // vertex texture coords
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader computes standard transform and lighting
+//--------------------------------------------------------------------------------------
+VS_OUTPUT RenderSceneVS( float4 vPos : POSITION,
+ float3 vNormal : NORMAL,
+ float2 vTexCoord0 : TEXCOORD0 )
+{
+ VS_OUTPUT Output;
+ float3 vNormalWorldSpace;
+
+ // Transform the position from object space to homogeneous projection space
+ Output.Position = mul(vPos, g_mWorldViewProjection);
+
+ // Transform the normal from object space to world space (upper-left 3x3 of the world matrix)
+ vNormalWorldSpace = normalize(mul(vNormal, (float3x3)g_mWorld)); // normal (world space)
+
+ // Calc diffuse color; the float4 colors are swizzled to .rgb explicitly instead of being implicitly truncated
+ Output.Diffuse.rgb = g_MaterialDiffuseColor.rgb * g_LightDiffuse.rgb * max(0,dot(vNormalWorldSpace, g_LightDir)) +
+ g_MaterialAmbientColor.rgb;
+ Output.Diffuse.a = 1.0f;
+
+ // Just copy the texture coordinate through
+ Output.TextureUV = vTexCoord0;
+
+ return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel shader output structure
+//--------------------------------------------------------------------------------------
+struct PS_OUTPUT
+{
+ float4 RGBColor : COLOR0; // Pixel color
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader outputs the pixel's color by modulating the texture's
+// color with diffuse material color
+//--------------------------------------------------------------------------------------
+PS_OUTPUT RenderScenePS( VS_OUTPUT In )
+{
+ // Fetch the mesh texture at the interpolated UV, then tint it with the interpolated diffuse color
+ float4 vTexel = tex2D(MeshTextureSampler, In.TextureUV);
+
+ PS_OUTPUT Output;
+ Output.RGBColor = vTexel * In.Diffuse;
+ return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Renders scene
+//--------------------------------------------------------------------------------------
+technique RenderScene
+{
+ pass P0
+ {
+ VertexShader = compile vs_2_0 RenderSceneVS();
+ PixelShader = compile ps_2_0 RenderScenePS();
+ }
+}
diff --git a/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl
new file mode 100644
index 000000000..12f368f86
--- /dev/null
+++ b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl
@@ -0,0 +1,86 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry RenderSceneVS -profile ps_4_0 -entry RenderScenePS
+//--------------------------------------------------------------------------------------
+// File: SimpleSample.hlsl
+//
+// The HLSL file for the SimpleSample sample for the Direct3D 11 device
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+ matrix g_mWorldViewProjection : packoffset( c0 );
+ matrix g_mWorld : packoffset( c4 );
+ float4 g_MaterialAmbientColor : packoffset( c8 );
+ float4 g_MaterialDiffuseColor : packoffset( c9 );
+}
+
+cbuffer cbPerFrame : register( b1 )
+{
+ float3 g_vLightDir : packoffset( c0 );
+ float g_fTime : packoffset( c0.w );
+ float4 g_LightDiffuse : packoffset( c1 );
+};
+
+//-----------------------------------------------------------------------------------------
+// Textures and Samplers
+//-----------------------------------------------------------------------------------------
+Texture2D g_txDiffuse : register( t0 );
+SamplerState g_samLinear : register( s0 );
+
+//--------------------------------------------------------------------------------------
+// shader input/output structure
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+ float4 Position : POSITION; // vertex position
+ float3 Normal : NORMAL; // this normal comes in per-vertex
+ float2 TextureUV : TEXCOORD0;// vertex texture coords
+};
+
+struct VS_OUTPUT
+{
+ float4 Position : SV_POSITION; // vertex position
+ float4 Diffuse : COLOR0; // vertex diffuse color (note that COLOR0 is clamped from 0..1)
+ float2 TextureUV : TEXCOORD0; // vertex texture coords
+};
+
+//--------------------------------------------------------------------------------------
+// This shader computes standard transform and lighting
+//--------------------------------------------------------------------------------------
+VS_OUTPUT RenderSceneVS( VS_INPUT input )
+{
+ VS_OUTPUT Output;
+ float3 vNormalWorldSpace;
+
+ // Transform the position from object space to homogeneous projection space
+ Output.Position = mul( input.Position, g_mWorldViewProjection );
+
+ // Transform the normal from object space to world space (upper-left 3x3 of the world matrix)
+ vNormalWorldSpace = normalize(mul(input.Normal, (float3x3)g_mWorld)); // normal (world space)
+
+ // Calc diffuse color; the float4 colors are swizzled to .rgb explicitly instead of being implicitly truncated
+ Output.Diffuse.rgb = g_MaterialDiffuseColor.rgb * g_LightDiffuse.rgb * max(0,dot(vNormalWorldSpace, g_vLightDir)) +
+ g_MaterialAmbientColor.rgb;
+ Output.Diffuse.a = 1.0f;
+
+ // Just copy the texture coordinate through
+ Output.TextureUV = input.TextureUV;
+
+ return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// This shader outputs the pixel's color by modulating the texture's
+// color with diffuse material color
+//--------------------------------------------------------------------------------------
+float4 RenderScenePS( VS_OUTPUT In ) : SV_TARGET
+{
+ float4 vTexel = g_txDiffuse.Sample( g_samLinear, In.TextureUV ); // texture color at the interpolated UV
+ return vTexel * In.Diffuse; // modulated by the interpolated vertex diffuse color
+}
diff --git a/tests/hlsl/dxsdk/SubD11/SubD11.hlsl b/tests/hlsl/dxsdk/SubD11/SubD11.hlsl
new file mode 100644
index 000000000..c4ebf9620
--- /dev/null
+++ b/tests/hlsl/dxsdk/SubD11/SubD11.hlsl
@@ -0,0 +1,1238 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry PatchSkinningVS -entry MeshSkinningVS -profile hs_5_0 -entry SubDToBezierHS -entry SubDToBezierHS4444 -profile ds_5_0 -entry BezierEvalDS -profile ps_4_0 -entry SmoothPS -entry SolidColorPS
+//--------------------------------------------------------------------------------------
+// File: SubD11.hlsl
+//
+// This file contains functions to convert from a Catmull-Clark subdivision
+// representation to a bicubic patch representation.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//Work-around for an optimization rule problem in the June 2010 HLSL Compiler (9.29.952.3111)
+//see http://support.microsoft.com/kb/2448404
+#if D3DX_VERSION == 0xa2b
+#pragma ruledisable 0x0802405f
+#endif
+
+//--------------------------------------------------------------------------------------
+// A sample extraordinary SubD quad is represented by the following diagram:
+//
+// 15 Valences:
+// / \ Vertex 0: 5
+// / 14 Vertex 1: 4
+// 17---------16 / \ Vertex 2: 5
+// | \ | / \ Vertex 3: 3
+// | \ | / 13
+// | \ |/ / Prefixes:
+// | 3------2------12 Vertex 0: 9
+// | | | | Vertex 1: 12
+// | | | | Vertex 2: 16
+// 4----0------1------11 Vertex 3: 18
+// / /| | |
+// / / | | |
+// 5 / 8------9------10
+// \ / /
+// 6 /
+// \ /
+// 7
+//
+// Where the quad bounded by vertices 0,1,2,3 represents the actual subd surface of interest
+// The 1-ring neighborhood of the quad is represented by vertices 4 through 17. The counter-
+// clockwise winding of this 1-ring neighborhood is important, especially when it comes to computing
+// the corner vertices of the bicubic patch that we will use to approximate the subd quad (0,1,2,3).
+//
+// The resulting bicubic patch fits within the subd quad (0,1,2,3) and has the following control
+// point layout:
+//
+// 12--13--14--15
+// 8---9--10--11
+// 4---5---6---7
+// 0---1---2---3
+//
+// The inner 4 control points of the bicubic patch are a combination of only the vertices (0,1,2,3)
+// of the subd quad. However, the corner control points for the bicubic patch (0,3,15,12) are actually
+// a much more complex weighting of the subd patch and the 1-ring neighborhood. In the example above
+// the bicubic control point 0 is actually a weighted combination of subd points 0,1,2,3 and 1-ring
+// neighborhood points 17, 4, 5, 6, 7, 8, and 9. We can see that the 1-ring neighborhood is simply
+// walked from the prefix value of the previous corner (corner 3 in this case) to the prefix
+// value for the current corner. We add one more vertex on either side of the prefix values
+// and we have all the data necessary to calculate the value for the corner points.
+//
+// The edge control points of the bicubic patch (1,2,13,14,4,8,7,11) are also combinations of their
+// neighbors, but fortunately each one is only a combination of 6 values and no walk is required.
+//--------------------------------------------------------------------------------------
+
+#define MOD4(x) ((x)&3)
+#ifndef MAX_POINTS
+#define MAX_POINTS 32
+#endif
+#define MAX_BONE_MATRICES 80
+
+//--------------------------------------------------------------------------------------
+// Textures
+//--------------------------------------------------------------------------------------
+Texture2D g_txHeight : register( t0 ); // Height and Bump texture
+Texture2D g_txDiffuse : register( t1 ); // Diffuse texture
+Texture2D g_txSpecular : register( t2 ); // Specular texture
+
+//--------------------------------------------------------------------------------------
+// Samplers
+//--------------------------------------------------------------------------------------
+SamplerState g_samLinear : register( s0 );
+SamplerState g_samPoint : register( s0 ); // NOTE(review): same slot (s0) as g_samLinear - confirm the aliasing is intended
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbTangentStencilConstants : register( b0 )
+{
+ float g_TanM[1024]; // Tangent patch stencils precomputed by the application; the TANM macro reads g_TanM[valence*64 + a]
+ float g_fCi[16]; // Valence coefficients precomputed by the application
+};
+
+cbuffer cbPerMesh : register( b1 )
+{
+ matrix g_mConstBoneWorld[MAX_BONE_MATRICES];
+};
+
+cbuffer cbPerFrame : register( b2 )
+{
+ matrix g_mViewProjection;
+ float3 g_vCameraPosWorld;
+ float g_fTessellationFactor;
+ float g_fDisplacementHeight;
+ float3 g_vSolidColor;
+};
+
+cbuffer cbPerSubset : register( b3 )
+{
+ int g_iPatchStartIndex;
+}
+
+//--------------------------------------------------------------------------------------
+Buffer<uint4> g_ValencePrefixBuffer : register( t0 ); // Two uint4 records per patch (valences, then prefixes). NOTE(review): t0 is also declared for g_txHeight - confirm they are never bound in the same stage
+
+//--------------------------------------------------------------------------------------
+struct VS_CONTROL_POINT_OUTPUT
+{
+ float3 vPosition : WORLDPOS;
+ float2 vUV : TEXCOORD0;
+ float3 vTangent : TANGENT;
+};
+
+struct BEZIER_CONTROL_POINT
+{
+ float3 vPosition : BEZIERPOS;
+};
+
+struct PS_INPUT
+{
+ float3 vWorldPos : POSITION;
+ float3 vNormal : NORMAL;
+ float2 vUV : TEXCOORD;
+ float3 vTangent : TANGENT;
+ float3 vBiTangent : BITANGENT;
+};
+
+//--------------------------------------------------------------------------------------
+// SubD to Bezier helper functions
+//--------------------------------------------------------------------------------------
+// Fetches entry a of the tangent stencil for corner v from g_TanM (64 floats per valence); requires an array named Val in the calling scope
+#define TANM(a,v) ( g_TanM[ Val[v]*64 + (a) ] )
+
+//--------------------------------------------------------------------------------------
+float3 ComputeInteriorVertex( uint index,
+ uint Val[4],
+ const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip )
+{
+ // Corner `index` is weighted by its valence, corners index+1 and index+3 (mod 4) by 2 and
+ // corner index+2 (mod 4) by 1; the sum is then divided by (5 + valence).
+ if( index == 0 )
+ return (ip[0].vPosition*Val[0] + ip[1].vPosition*2 + ip[2].vPosition + ip[3].vPosition*2) / (5+Val[0]);
+ if( index == 1 )
+ return (ip[0].vPosition*2 + ip[1].vPosition*Val[1] + ip[2].vPosition*2 + ip[3].vPosition) / (5+Val[1]);
+ if( index == 2 )
+ return (ip[0].vPosition + ip[1].vPosition*2 + ip[2].vPosition*Val[2] + ip[3].vPosition*2) / (5+Val[2]);
+ if( index == 3 )
+ return (ip[0].vPosition*2 + ip[1].vPosition + ip[2].vPosition*2 + ip[3].vPosition*Val[3]) / (5+Val[3]);
+
+ // Any index outside 0-3 yields the origin
+ return float3(0,0,0);
+}
+
+//--------------------------------------------------------------------------------------
+// Computes the corner vertices of the output UV patch. The corner vertices are
+// a weighted combination of all points that are "connected" to that corner by an edge.
+// The interior 4 points of the original subd quad are easy to get. The points in the
+// 1-ring neighborhood around the interior quad are not.
+//
+// Because the valence of that corner could be any number between 3 and 16, we need to
+// walk around the subd patch vertices connected to that point. This is where the
+// Pref (prefix) values come into play. Each corner has a prefix value that is the index
+// of the last value around the 1-ring neighborhood that should be used in calculating
+// the coefficient of that corner. The walk goes from the prefix value of the previous
+// corner to the prefix value of the current corner.
+//--------------------------------------------------------------------------------------
+void ComputeCornerVertex( uint index,
+ out float3 CornerB, // Corner for the Bezier patch
+ out float3 CornerU, // Corner for the tangent patch
+ out float3 CornerV, // Corner for the bitangent patch
+ const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
+ const in uint Val[4],
+ const in uint Pref[4] )
+{
+ const float fOWt = 1; // weight of ip[MOD4(index+2)] and of every other vertex on the ring walk
+ const float fEWt = 4; // weight of ip[MOD4(index+1)], ip[MOD4(index+3)] and the alternating ring vertices
+
+ // Figure out where to start the walk by using the previous corner's prefix value
+ uint PrefIm1 = 0;
+ uint uStart = 4; // corner 0 has no previous corner: its walk starts at ring vertex 4
+ if( index )
+ {
+ PrefIm1 = Pref[index-1];
+ uStart = PrefIm1;
+ }
+
+ // Setup the walk indices
+ uint uTIndexStart = 2 - (index&1);
+ uint uTIndex = uTIndexStart;
+
+ // Calculate the N*N weight for the final value
+ CornerB = (Val[index]*Val[index])*ip[index].vPosition; // n^2 part
+
+ // Zero out the tangent and bitangent accumulators (float3, matching the out parameters)
+ CornerU = float3(0,0,0);
+ CornerV = float3(0,0,0);
+
+ const uint uV = Val[index] + ( ( index & 1 ) ? 1 : -1 ); // stencil offset used for CornerV; always applied modulo Val[index]
+
+ // Start the walk with the uStart prefix (the prefix of the corner before us)
+ CornerB += ip[uStart].vPosition * fEWt;
+ CornerU += ip[uStart].vPosition * TANM( uTIndex * 2, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index);
+
+ // Gather all vertices between the previous corner's prefix and our own prefix
+ // We'll do two at a time, since they always come in twos
+ while(uStart < Pref[index]-1)
+ {
+ ++uStart;
+ CornerB += ip[uStart].vPosition * fOWt;
+ CornerU += ip[uStart].vPosition * TANM( uTIndex * 2 + 1, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+ ++uTIndex;
+ ++uStart;
+ CornerB += ip[uStart].vPosition * fEWt;
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex+uV)%Val[index]) * 2, index );
+ }
+ ++uStart;
+
+ // Add in the last guy and make sure to wrap to the beginning if we're the last corner
+ if (index == 3)
+ uStart = 4;
+ CornerB += ip[uStart].vPosition * fOWt;
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+ // Add in the guy before the prefix as well
+ if (index)
+ uStart = PrefIm1-1;
+ else
+ uStart = Pref[3]-1;
+ uTIndex = uTIndexStart-1;
+
+ CornerB += ip[uStart].vPosition * fOWt;
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+ // We're done with the walk now. Now we need to add the contributions of the original subd quad.
+ CornerB += ip[MOD4(index+1)].vPosition * fEWt;
+ CornerB += ip[MOD4(index+2)].vPosition * fOWt;
+ CornerB += ip[MOD4(index+3)].vPosition * fEWt;
+
+ uTIndex = 0 + (index&1)*(Val[index]-1);
+ uStart = MOD4(index+1);
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+
+ uStart = MOD4(index+2);
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+ uStart = MOD4(index+3);
+ uTIndex = (uTIndex+1)%Val[index];
+
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+
+ // Normalize the corner weights
+ CornerB *= 1.0f / ( Val[index] * Val[index] + 5 * Val[index] ); // normalize
+
+ // fixup signs from directional derivatives...
+ if( !((index - 1) & 2) ) // 1 and 2
+ CornerU *= -1;
+
+ if( index >= 2 ) // 2 and 3
+ CornerV *= -1;
+}
+
+void ComputeCornerVertex4444( uint index,
+ out float3 CornerB, // Corner for the Bezier patch
+ out float3 CornerU, // Corner for the tangent patch
+ out float3 CornerV, // Corner for the bitangent patch
+ const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
+ const in uint Val[4],
+ const in uint Pref[4] )
+{
+ const float fOWt = 1; // weight of ip[MOD4(index+2)] and of every other vertex on the ring walk
+ const float fEWt = 4; // weight of ip[MOD4(index+1)], ip[MOD4(index+3)] and the alternating ring vertices
+ // NOTE(review): the name suggests a variant for patches whose four valences are all 4 - confirm with the caller
+ // Figure out where to start the walk by using the previous corner's prefix value
+ uint PrefIm1 = 0;
+ uint uStart = 4; // corner 0 has no previous corner: its walk starts at ring vertex 4
+ if( index )
+ {
+ PrefIm1 = Pref[index-1];
+ uStart = PrefIm1;
+ }
+
+ // Setup the walk indices
+ uint uTIndexStart = 2 - (index&1);
+ uint uTIndex = uTIndexStart;
+
+ // Calculate the N*N weight for the final value
+ CornerB = (Val[index]*Val[index])*ip[index].vPosition; // n^2 part
+
+ // Zero out the tangent and bitangent accumulators (float3, matching the out parameters)
+ CornerU = float3(0,0,0);
+ CornerV = float3(0,0,0);
+
+ const uint uV = Val[index] + ( ( index & 1 ) ? 1 : -1 ); // stencil offset used for CornerV; always applied modulo Val[index]
+
+ // Start the walk with the uStart prefix (the prefix of the corner before us)
+ CornerB += ip[uStart].vPosition * fEWt;
+ CornerU += ip[uStart].vPosition * TANM( uTIndex * 2, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index);
+
+ // Gather all vertices between the previous corner's prefix and our own prefix
+ // We'll do two at a time, since they always come in twos
+ while(uStart < Pref[index]-1)
+ {
+ ++uStart;
+ CornerB += ip[uStart].vPosition * fOWt;
+ CornerU += ip[uStart].vPosition * TANM( uTIndex * 2 + 1, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+ ++uTIndex;
+ ++uStart;
+ CornerB += ip[uStart].vPosition * fEWt;
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex+uV)%Val[index]) * 2, index );
+ }
+ ++uStart;
+
+ // Add in the last guy and make sure to wrap to the beginning if we're the last corner
+ if (index == 3)
+ uStart = 4;
+ CornerB += ip[uStart].vPosition * fOWt;
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+ // Add in the guy before the prefix as well
+ if (index)
+ uStart = PrefIm1-1;
+ else
+ uStart = Pref[3]-1;
+ uTIndex = uTIndexStart-1;
+
+ CornerB += ip[uStart].vPosition * fOWt;
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+ // We're done with the walk now. Now we need to add the contributions of the original subd quad.
+ CornerB += ip[MOD4(index+1)].vPosition * fEWt;
+ CornerB += ip[MOD4(index+2)].vPosition * fOWt;
+ CornerB += ip[MOD4(index+3)].vPosition * fEWt;
+
+ uTIndex = 0 + (index&1)*(Val[index]-1);
+ uStart = MOD4(index+1);
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+
+ uStart = MOD4(index+2);
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+ uStart = MOD4(index+3);
+ uTIndex = (uTIndex+1)%Val[index];
+
+ CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+ CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+
+ // Normalize the corner weights
+ CornerB *= 1.0f / ( Val[index] * Val[index] + 5 * Val[index] ); // normalize
+
+ // fixup signs from directional derivatives...
+ if( !((index - 1) & 2) ) // 1 and 2
+ CornerU *= -1;
+
+ if( index >= 2 ) // 2 and 3
+ CornerV *= -1;
+}
+
+//--------------------------------------------------------------------------------------
+// Computes the edge vertices of the output bicubic patch. The edge vertices
+// (1,2,4,7,8,11,13,14) are a weighted (by valence) combination of 6 interior and 1-ring
+// neighborhood points. However, we don't have to do the walk on this one since we
+// don't need all of the neighbor points attached to this vertex.
+//--------------------------------------------------------------------------------------
+float3 ComputeEdgeVertex( in uint index /* 0-7 */,
+ const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
+ const in uint Val[4],
+ const in uint Pref[4] )
+{
+ float val1 = 2 * Val[0] + 10; // Divisors: each one equals the sum of the six weights
+ float val2 = 2 * Val[1] + 10; // used in the case below that divides by it
+ float val13 = 2 * Val[3] + 10;
+ float val14 = 2 * Val[2] + 10;
+ float val4 = val1;
+ float val8 = val13;
+ float val7 = val2;
+ float val11 = val14;
+
+ float3 vRetVal = float3(0,0,0); // returned unchanged when index is outside 0-7
+ switch( index )
+ {
+ // Horizontal
+ case 0: // divisor val1
+ vRetVal = (Val[0]*2*ip[0].vPosition + 4*ip[1].vPosition + ip[2].vPosition + ip[3].vPosition*2 +
+ 2*ip[Pref[0]-1].vPosition + ip[Pref[0]].vPosition) / val1;
+ break;
+ case 1: // divisor val2
+ vRetVal = (4*ip[0].vPosition + Val[1]*2*ip[1].vPosition + ip[2].vPosition*2 + ip[3].vPosition +
+ ip[Pref[0]-1].vPosition + 2*ip[Pref[0]].vPosition) / val2;
+ break;
+ case 2: // divisor val13
+ vRetVal = (2*ip[0].vPosition + ip[1].vPosition + 4*ip[2].vPosition + ip[3].vPosition*2*Val[3] +
+ 2*ip[Pref[2]].vPosition + ip[Pref[2]-1].vPosition) / val13;
+ break;
+ case 3: // divisor val14
+ vRetVal = (ip[0].vPosition + 2*ip[1].vPosition + Val[2]*2*ip[2].vPosition + ip[3].vPosition*4 +
+ ip[Pref[2]].vPosition + 2*ip[Pref[2]-1].vPosition) / val14;
+ break;
+ // Vertical
+ case 4: // divisor val4; reads ring vertex 4 directly instead of going through Pref
+ vRetVal = (Val[0]*2*ip[0].vPosition + 2*ip[1].vPosition + ip[2].vPosition + ip[3].vPosition*4 +
+ 2*ip[4].vPosition + ip[Pref[3]-1].vPosition) / val4;
+ break;
+ case 5: // divisor val8; reads ring vertex 4 directly instead of going through Pref
+ vRetVal = (4*ip[0].vPosition + ip[1].vPosition + 2*ip[2].vPosition + ip[3].vPosition*2*Val[3] +
+ ip[4].vPosition + 2*ip[Pref[3]-1].vPosition) / val8;
+ break;
+ case 6: // divisor val7
+ vRetVal = (2*ip[0].vPosition + Val[1]*2*ip[1].vPosition + 4*ip[2].vPosition + ip[3].vPosition +
+ 2*ip[Pref[1]-1].vPosition + ip[Pref[1]].vPosition) / val7;
+ break;
+ case 7: // divisor val11
+ vRetVal = (ip[0].vPosition + 4*ip[1].vPosition + Val[2]*2*ip[2].vPosition + 2*ip[3].vPosition +
+ ip[Pref[1]-1].vPosition + 2*ip[Pref[1]].vPosition) / val11;
+ break;
+ }
+
+ return vRetVal;
+}
+
+//--------------------------------------------------------------------------------------
+// Helper function
+//--------------------------------------------------------------------------------------
+void BezierRaise(inout float3 pQ[3], out float3 pC[4])
+{
+ // Turns the three points in pQ into four points in pC: the ends carry over unchanged and
+ // each inner point is a 1/3 : 2/3 blend of two neighbouring pQ entries. pQ is only read.
+ const float fThird = 1.0f / 3.0f;
+ pC[0] = pQ[0];
+ pC[1] = fThird * ( pQ[0] * 1 + ( 3.0f - 1 ) * pQ[1] );
+ pC[2] = fThird * ( pQ[1] * 2 + ( 3.0f - 2 ) * pQ[2] );
+ pC[3] = pQ[2];
+}
+
+//--------------------------------------------------------------------------------------
+// Computes the tangent patch from the input bezier patch
+//--------------------------------------------------------------------------------------
+void ComputeTanPatch( const OutputPatch<BEZIER_CONTROL_POINT, 16> bezpatch,
+ inout float3 vOut[16],
+ in float fCWts[4],
+ in float3 vCorner[4],
+ in float3 vCornerLocal[4],
+ in const uint cX, // control points are addressed as a*cX + b*cY, a and b in 0-3;
+ in const uint cY) // presumably the caller passes the strides of the two patch axes - TODO confirm
+{
+ float3 vQuad[3]; // three-point curve for the b = 0, b = 3 and then b = 2 lines
+ float3 vQuadB[3]; // three-point curve for the b = 1 line
+ float3 vCubic[4]; // BezierRaise output, reused for every line
+
+ // boundary edges are really simple...
+ vQuad[0] = vCornerLocal[0];
+ vQuad[2] = vCornerLocal[1];
+ vQuad[1] = 3.0f*(bezpatch[2*cX+0*cY].vPosition-bezpatch[1*cX+0*cY].vPosition);
+
+ BezierRaise(vQuad,vCubic);
+ vOut[1*cX + 0*cY] = vCubic[1]; // only the two inner points of the boundary lines are written here;
+ vOut[2*cX + 0*cY] = vCubic[2]; // the four corner entries of vOut are left untouched by this function
+
+ vQuad[0] = vCornerLocal[2];
+ vQuad[2] = vCornerLocal[3];
+ vQuad[1] = 3.0f*(bezpatch[2*cX+3*cY].vPosition-bezpatch[1*cX+3*cY].vPosition);
+
+ BezierRaise(vQuad,vCubic);
+ vOut[1*cX + 3*cY] = vCubic[1];
+ vOut[2*cX + 3*cY] = vCubic[2];
+
+ // two internal edges - this is where work happens...
+ float3 vA,vB,vC,vD,vE;
+ float fC0,fC1;
+ vQuad[1] = 3.0f*(bezpatch[2*cX+2*cY].vPosition-bezpatch[1*cX+2*cY].vPosition);
+ // also do "second" scan line
+ vQuadB[1] = 3.0f*(bezpatch[2*cX+1*cY].vPosition-bezpatch[1*cX+1*cY].vPosition);
+
+ vD = 3.0f*(bezpatch[1*cX + 2*cY].vPosition - bezpatch[0*cX + 2*cY].vPosition);
+ vE = 3.0f*(bezpatch[1*cX + 1*cY].vPosition - bezpatch[0*cX + 1*cY].vPosition); // used later...
+
+ fC0 = fCWts[3];
+ fC1 = fCWts[0];
+
+ // sign flip
+ vA = -vCorner[3];
+ vB = 3.0f*(bezpatch[0*cX + 1*cY].vPosition - bezpatch[0*cX + 2*cY].vPosition);
+ vC = -vCorner[0];
+
+ vQuad[0] = 1.0f/3.0f*(2.0f*fC0*vB - fC1*vA) + vD;
+ vQuadB[0] = 1.0f/3.0f*(fC0*vC - 2.0f*fC1*vB) + vE;
+
+ // do end of strip - same as before, but stuff is switched around...
+ vC = vCorner[2];
+ vB = 3.0f*(bezpatch[3*cX + 2*cY].vPosition - bezpatch[3*cX + 1*cY].vPosition);
+ vA = vCorner[1];
+
+ vD = 3.0f*(bezpatch[2*cX + 1*cY].vPosition - bezpatch[3*cX + 1*cY].vPosition);
+ vE = 3.0f*(bezpatch[2*cX + 2*cY].vPosition - bezpatch[3*cX + 2*cY].vPosition);
+
+ fC0 = fCWts[1];
+ fC1 = fCWts[2];
+
+ vQuadB[2] = 1.0f/3.0f*(2.0f*fC0*vB - fC1*vA) + vD;
+ vQuad[2] = 1.0f/3.0f*(fC0*vC - 2.0f*fC1*vB) + vE;
+
+ vQuadB[2] *= -1.0f; // the end-of-strip differences run from a = 3 towards a = 2, so both results are negated
+ vQuad[2] *= -1.0f;
+
+ BezierRaise(vQuad,vCubic);
+
+ vOut[0*cX + 2*cY] = vCubic[0]; // the interior lines write all four points
+ vOut[1*cX + 2*cY] = vCubic[1];
+ vOut[2*cX + 2*cY] = vCubic[2];
+ vOut[3*cX + 2*cY] = vCubic[3];
+
+ BezierRaise(vQuadB,vCubic);
+
+ vOut[0*cX + 1*cY] = vCubic[0];
+ vOut[1*cX + 1*cY] = vCubic[1];
+ vOut[2*cX + 1*cY] = vCubic[2];
+ vOut[3*cX + 1*cY] = vCubic[3];
+}
+
+//--------------------------------------------------------------------------------------
+// Skinning vertex shader Section
+//--------------------------------------------------------------------------------------
+struct VS_CONTROL_POINT_INPUT
+{
+ float3 vPosition : POSITION;
+ float2 vUV : TEXCOORD0;
+ float3 vTangent : TANGENT;
+ uint4 vBones : BONES;
+ float4 vWeights : WEIGHTS;
+};
+
+VS_CONTROL_POINT_OUTPUT PatchSkinningVS( VS_CONTROL_POINT_INPUT Input )
+{
+ VS_CONTROL_POINT_OUTPUT Output;
+ // Skins one patch control point: position and tangent are blended over the four bone influences
+ float4 vInputPos = float4( Input.vPosition, 1 );
+ float4 vWorldPos = float4( 0, 0, 0, 0 );
+
+ vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.x ] ) * Input.vWeights.x;
+ vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.y ] ) * Input.vWeights.y;
+ vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.z ] ) * Input.vWeights.z;
+ vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.w ] ) * Input.vWeights.w;
+ // The tangent is transformed by the upper-left 3x3 of each bone matrix only
+ float3 vWorldTan = float3( 0, 0, 0 );
+ vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.x ] ) * Input.vWeights.x;
+ vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.y ] ) * Input.vWeights.y;
+ vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.z ] ) * Input.vWeights.z;
+ vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.w ] ) * Input.vWeights.w;
+ // Output.vPosition is a float3: drop w explicitly (as MeshSkinningVS does) instead of truncating implicitly
+ Output.vPosition = vWorldPos.xyz;
+ Output.vUV = Input.vUV;
+ Output.vTangent = vWorldTan;
+
+ return Output;
+}
+
+struct VS_MESH_POINT_INPUT
+{
+ float3 vPosition : POSITION;
+ float2 vUV : TEXCOORD0;
+ float3 vNormal : NORMAL;
+ float3 vTangent : TANGENT;
+ uint4 vBones : BONES;
+ float4 vWeights : WEIGHTS;
+};
+
+struct VS_MESH_POINT_OUTPUT
+{
+ float3 vWorldPos : POSITION;
+ float3 vNormal : NORMAL;
+ float2 vUV : TEXCOORD;
+ float3 vTangent : TANGENT;
+ float3 vBiTangent : BITANGENT;
+
+ float4 vPosition : SV_POSITION;
+};
+
+VS_MESH_POINT_OUTPUT MeshSkinningVS( VS_MESH_POINT_INPUT Input )
+{
+ // Skins one mesh vertex: position, tangent and normal are each the weight-blended result of
+ // transforming the input by the four bone matrices selected by Input.vBones.
+ float4 vBindPos = float4( Input.vPosition, 1 );
+
+ float4 vSkinnedPos = float4( 0, 0, 0, 0 );
+ float3 vSkinnedTan = float3( 0, 0, 0 );
+ float3 vSkinnedNrm = float3( 0, 0, 0 );
+
+ // Influences are accumulated in x, y, z, w order
+ [unroll]
+ for( uint iInfluence = 0; iInfluence < 4; ++iInfluence )
+ {
+ matrix mBone = g_mConstBoneWorld[ Input.vBones[iInfluence] ];
+ float fWeight = Input.vWeights[iInfluence];
+
+ vSkinnedPos += mul( vBindPos, mBone ) * fWeight;
+ // Direction vectors use only the upper-left 3x3 of the bone matrix
+ vSkinnedTan += mul( Input.vTangent, (float3x3)mBone ) * fWeight;
+ vSkinnedNrm += mul( Input.vNormal, (float3x3)mBone ) * fWeight;
+ }
+
+ VS_MESH_POINT_OUTPUT Output;
+ Output.vWorldPos = vSkinnedPos.xyz;
+ // w is reset to 1 before projecting, whatever the blended w came out as
+ Output.vPosition = mul( float4( vSkinnedPos.xyz, 1 ), g_mViewProjection );
+ Output.vUV = Input.vUV;
+ Output.vTangent = vSkinnedTan;
+ Output.vNormal = vSkinnedNrm;
+ Output.vBiTangent = cross( vSkinnedNrm, vSkinnedTan );
+ return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// SubD to Bezier hull shader Section
+//--------------------------------------------------------------------------------------
+// Per-patch constant data written by the patch-constant functions
+// (SubDToBezierConstantsHS / SubDToBezierConstantsHS4444) and consumed by BezierEvalDS.
+struct HS_CONSTANT_DATA_OUTPUT
+{
+ float Edges[4] : SV_TessFactor; // edge tessellation factors
+ float Inside[2] : SV_InsideTessFactor; // interior tessellation factors
+
+ float3 vTangent[4] : TANGENT; // texture-space tangents copied from input control points 0..3
+ float2 vUV[4] : TEXCOORD; // texture coordinates copied from input control points 0..3
+ float3 vTanUCorner[4] : TANUCORNER; // CornerU result of the corner-vertex computation, one per corner
+ float3 vTanVCorner[4] : TANVCORNER; // CornerV result of the corner-vertex computation, one per corner
+ float4 vCWts : TANWEIGHTS; // g_fCi[ valence - 3 ] for corners 0..3 (x,y,z,w)
+};
+
+//--------------------------------------------------------------------------------------
+// Load per-patch valence and prefix data
+//--------------------------------------------------------------------------------------
+void LoadValenceAndPrefixData( in uint PatchID, out uint Val[4], out uint Prefixes[4] )
+{
+ // The patch index is offset by g_iPatchStartIndex. Each patch then owns two
+ // consecutive buffer elements: element 2n holds the four valences and element
+ // 2n+1 holds the four prefixes.
+ const uint uPatch = PatchID + g_iPatchStartIndex;
+ const uint4 vValences = g_ValencePrefixBuffer.Load( uPatch * 2 );
+ const uint4 vPrefixes = g_ValencePrefixBuffer.Load( uPatch * 2 + 1 );
+
+ // Unpack the vector components (x,y,z,w) into the output arrays (0..3).
+ [unroll]
+ for( uint iCorner = 0; iCorner < 4; ++iCorner )
+ {
+ Val[iCorner] = vValences[iCorner];
+ Prefixes[iCorner] = vPrefixes[iCorner];
+ }
+}
+
+
+//--------------------------------------------------------------------------------------
+// Constant data function for the SubDToBezierHS. This is executed once per patch.
+//--------------------------------------------------------------------------------------
+HS_CONSTANT_DATA_OUTPUT SubDToBezierConstantsHS( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
+ uint PatchID : SV_PrimitiveID )
+{
+ // ip      - input control points of the patch
+ // PatchID - primitive id, used to look up the patch's valence/prefix data
+ // Returns the tessellation factors plus the per-corner data the domain shader needs.
+ HS_CONSTANT_DATA_OUTPUT Output;
+
+ // Uniform tessellation: every edge and both interior factors use the same amount.
+ float TessAmount = g_fTessellationFactor;
+
+ Output.Edges[0] = Output.Edges[1] = Output.Edges[2] = Output.Edges[3] = TessAmount;
+ Output.Inside[0] = Output.Inside[1] = TessAmount;
+
+ // Pass through the texture-space tangents and UVs of the first four control points.
+ Output.vTangent[0] = ip[0].vTangent;
+ Output.vTangent[1] = ip[1].vTangent;
+ Output.vTangent[2] = ip[2].vTangent;
+ Output.vTangent[3] = ip[3].vTangent;
+
+ Output.vUV[0] = ip[0].vUV;
+ Output.vUV[1] = ip[1].vUV;
+ Output.vUV[2] = ip[2].vUV;
+ Output.vUV[3] = ip[3].vUV;
+
+ // Compute part of our tangent patch here
+ uint Val[4];
+ uint Prefixes[4];
+ LoadValenceAndPrefixData( PatchID, Val, Prefixes );
+
+ [unroll]
+ for( int i=0; i<4; i++ )
+ {
+ // Only the two corner tangents are kept; the corner position (CornerB) is unused here.
+ float3 CornerB, CornerU, CornerV;
+ ComputeCornerVertex( i, CornerB, CornerU, CornerV, ip, Val, Prefixes );
+ Output.vTanUCorner[i] = CornerU;
+ Output.vTanVCorner[i] = CornerV;
+ }
+
+ // Per-corner weights, looked up by valence; the table is indexed from valence 3.
+ Output.vCWts.x = g_fCi[ Val[0]-3 ];
+ Output.vCWts.y = g_fCi[ Val[1]-3 ];
+ Output.vCWts.z = g_fCi[ Val[2]-3 ];
+ Output.vCWts.w = g_fCi[ Val[3]-3 ];
+
+ return Output;
+}
+
+// Patch-constant function for regular patches: same outputs as SubDToBezierConstantsHS,
+// but the valences and prefixes are compile-time constants instead of buffer loads.
+HS_CONSTANT_DATA_OUTPUT SubDToBezierConstantsHS4444( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
+ uint PatchID : SV_PrimitiveID )
+{
+ // ip      - input control points of the patch
+ // PatchID - primitive id; not read in this variant
+ HS_CONSTANT_DATA_OUTPUT Output;
+
+ // Uniform tessellation: every edge and both interior factors use the same amount.
+ float TessAmount = g_fTessellationFactor;
+
+ Output.Edges[0] = Output.Edges[1] = Output.Edges[2] = Output.Edges[3] = TessAmount;
+ Output.Inside[0] = Output.Inside[1] = TessAmount;
+
+ // Pass through the texture-space tangents and UVs of the first four control points.
+ Output.vTangent[0] = ip[0].vTangent;
+ Output.vTangent[1] = ip[1].vTangent;
+ Output.vTangent[2] = ip[2].vTangent;
+ Output.vTangent[3] = ip[3].vTangent;
+
+ Output.vUV[0] = ip[0].vUV;
+ Output.vUV[1] = ip[1].vUV;
+ Output.vUV[2] = ip[2].vUV;
+ Output.vUV[3] = ip[3].vUV;
+
+ // Compute part of our tangent patch here
+ static const uint Val[4] = (uint[4])uint4(4,4,4,4);
+ static const uint Prefixes[4] = (uint[4])uint4(7,10,13,16);
+
+ [unroll]
+ for( int i=0; i<4; i++ )
+ {
+ // Only the two corner tangents are kept; the corner position (CornerB) is unused here.
+ float3 CornerB, CornerU, CornerV;
+ ComputeCornerVertex4444( i, CornerB, CornerU, CornerV, ip, Val, Prefixes );
+ Output.vTanUCorner[i] = CornerU;
+ Output.vTanVCorner[i] = CornerV;
+ }
+
+ // Per-corner weights, looked up by valence; the table is indexed from valence 3.
+ Output.vCWts.x = g_fCi[ Val[0]-3 ];
+ Output.vCWts.y = g_fCi[ Val[1]-3 ];
+ Output.vCWts.z = g_fCi[ Val[2]-3 ];
+ Output.vCWts.w = g_fCi[ Val[3]-3 ];
+
+ return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// HS for SubDToBezier. This outputcontrolpoints(16) specifies that we will produce
+// 16 control points. Therefore this function will be invoked 16x, one for each output
+// control point.
+//
+// !! PERFORMANCE NOTE: This hull shader is written for maximum readability, and its
+// performance is not expected to be optimal on D3D11 hardware. The switch statement
+// below that determines the codepath for each patch control point generates sub-optimal
+// code for parallel execution on the GPU. A future implementation of this hull shader
+// will combine the 16 codepaths and 3 variants (corner, edge, interior) into one shared
+// codepath; this change is expected to increase performance at the expense of readability.
+//--------------------------------------------------------------------------------------
+[domain("quad")]
+[partitioning("integer")]
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(16)]
+[patchconstantfunc("SubDToBezierConstantsHS")]
+BEZIER_CONTROL_POINT SubDToBezierHS( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> p,
+ uint i : SV_OutputControlPointID,
+ uint PatchID : SV_PrimitiveID )
+{
+ // p       - input control points of the patch
+ // i       - id (0..15) of the output control point this invocation produces
+ // PatchID - primitive id, used to look up the patch's valence/prefix data
+
+ // Valences and prefixes are loaded from a buffer
+ uint Val[4];
+ uint Prefixes[4];
+ LoadValenceAndPrefixData( PatchID, Val, Prefixes );
+
+ // Outputs of ComputeCornerVertex; only CornerB (the corner position) is used below.
+ float3 CornerB = float3(0,0,0);
+ float3 CornerU = float3(0,0,0);
+ float3 CornerV = float3(0,0,0);
+
+ // All 16 ids are handled by the switch; the zero only provides a defined initial value.
+ BEZIER_CONTROL_POINT Output;
+ Output.vPosition = float3(0,0,0);
+
+ // !! PERFORMANCE NOTE: As mentioned above, this switch statement generates
+ // inefficient code for the sake of readability.
+ switch( i )
+ {
+ // Interior vertices
+ // Output ids 5, 6, 10, 9 map to interior vertices 0..3.
+ case 5:
+ Output.vPosition = ComputeInteriorVertex( 0, Val, p );
+ break;
+ case 6:
+ Output.vPosition = ComputeInteriorVertex( 1, Val, p );
+ break;
+ case 10:
+ Output.vPosition = ComputeInteriorVertex( 2, Val, p );
+ break;
+ case 9:
+ Output.vPosition = ComputeInteriorVertex( 3, Val, p );
+ break;
+
+ // Corner vertices
+ // Output ids 0, 3, 15, 12 map to corners 0..3.
+ case 0:
+ ComputeCornerVertex( 0, CornerB, CornerU, CornerV, p, Val, Prefixes );
+ Output.vPosition = CornerB;
+ break;
+ case 3:
+ ComputeCornerVertex( 1, CornerB, CornerU, CornerV, p, Val, Prefixes );
+ Output.vPosition = CornerB;
+ break;
+ case 15:
+ ComputeCornerVertex( 2, CornerB, CornerU, CornerV, p, Val, Prefixes );
+ Output.vPosition = CornerB;
+ break;
+ case 12:
+ ComputeCornerVertex( 3, CornerB, CornerU, CornerV, p, Val, Prefixes );
+ Output.vPosition = CornerB;
+ break;
+
+ // Edge vertices
+ // Output ids 1, 2, 13, 14, 4, 8, 7, 11 map to edge vertices 0..7.
+ case 1:
+ Output.vPosition = ComputeEdgeVertex( 0, p, Val, Prefixes );
+ break;
+ case 2:
+ Output.vPosition = ComputeEdgeVertex( 1, p, Val, Prefixes );
+ break;
+ case 13:
+ Output.vPosition = ComputeEdgeVertex( 2, p, Val, Prefixes );
+ break;
+ case 14:
+ Output.vPosition = ComputeEdgeVertex( 3, p, Val, Prefixes );
+ break;
+ case 4:
+ Output.vPosition = ComputeEdgeVertex( 4, p, Val, Prefixes );
+ break;
+ case 8:
+ Output.vPosition = ComputeEdgeVertex( 5, p, Val, Prefixes );
+ break;
+ case 7:
+ Output.vPosition = ComputeEdgeVertex( 6, p, Val, Prefixes );
+ break;
+ case 11:
+ Output.vPosition = ComputeEdgeVertex( 7, p, Val, Prefixes );
+ break;
+ }
+
+ return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Specialised version for Regular (4,4,4,4) patches, this is much simpler and has less
+// branching compared to the general one above
+//--------------------------------------------------------------------------------------
+[domain("quad")]
+[partitioning("integer")]
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(16)]
+[patchconstantfunc("SubDToBezierConstantsHS4444")]
+BEZIER_CONTROL_POINT SubDToBezierHS4444( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> p,
+ uint i : SV_OutputControlPointID,
+ uint PatchID : SV_PrimitiveID )
+{
+ // p       - input control points of the patch
+ // i       - id (0..15) of the output control point this invocation produces
+ // PatchID - primitive id; not read in this variant because nothing is loaded per patch
+
+ // Valences and prefixes are Constant for this case (4,4,4,4)
+ static const uint Val[4] = (uint[4])uint4(4,4,4,4);
+ static const uint Prefixes[4] = (uint[4])uint4(7,10,13,16);
+
+ // Outputs of ComputeCornerVertex4444; only CornerB (the corner position) is used below.
+ float3 CornerB = float3(0,0,0);
+ float3 CornerU = float3(0,0,0);
+ float3 CornerV = float3(0,0,0);
+
+ // All 16 ids are handled by the switch; the zero only provides a defined initial value.
+ BEZIER_CONTROL_POINT Output;
+ Output.vPosition = float3(0,0,0);
+
+ // !! PERFORMANCE NOTE: As mentioned above, this switch statement generates
+ // inefficient code for the sake of readability.
+ switch( i )
+ {
+ // Interior vertices
+ // Output ids 5, 6, 10, 9 map to interior vertices 0..3.
+ case 5:
+ Output.vPosition = ComputeInteriorVertex( 0, Val, p );
+ break;
+ case 6:
+ Output.vPosition = ComputeInteriorVertex( 1, Val, p );
+ break;
+ case 10:
+ Output.vPosition = ComputeInteriorVertex( 2, Val, p );
+ break;
+ case 9:
+ Output.vPosition = ComputeInteriorVertex( 3, Val, p );
+ break;
+
+ // Corner vertices
+ // Output ids 0, 3, 15, 12 map to corners 0..3; only this group uses the 4444 helper.
+ case 0:
+ ComputeCornerVertex4444( 0, CornerB, CornerU, CornerV, p, Val, Prefixes );
+ Output.vPosition = CornerB;
+ break;
+ case 3:
+ ComputeCornerVertex4444( 1, CornerB, CornerU, CornerV, p, Val, Prefixes );
+ Output.vPosition = CornerB;
+ break;
+ case 15:
+ ComputeCornerVertex4444( 2, CornerB, CornerU, CornerV, p, Val, Prefixes );
+ Output.vPosition = CornerB;
+ break;
+ case 12:
+ ComputeCornerVertex4444( 3, CornerB, CornerU, CornerV, p, Val, Prefixes );
+ Output.vPosition = CornerB;
+ break;
+
+ // Edge vertices
+ // Output ids 1, 2, 13, 14, 4, 8, 7, 11 map to edge vertices 0..7.
+ case 1:
+ Output.vPosition = ComputeEdgeVertex( 0, p, Val, Prefixes );
+ break;
+ case 2:
+ Output.vPosition = ComputeEdgeVertex( 1, p, Val, Prefixes );
+ break;
+ case 13:
+ Output.vPosition = ComputeEdgeVertex( 2, p, Val, Prefixes );
+ break;
+ case 14:
+ Output.vPosition = ComputeEdgeVertex( 3, p, Val, Prefixes );
+ break;
+ case 4:
+ Output.vPosition = ComputeEdgeVertex( 4, p, Val, Prefixes );
+ break;
+ case 8:
+ Output.vPosition = ComputeEdgeVertex( 5, p, Val, Prefixes );
+ break;
+ case 7:
+ Output.vPosition = ComputeEdgeVertex( 6, p, Val, Prefixes );
+ break;
+ case 11:
+ Output.vPosition = ComputeEdgeVertex( 7, p, Val, Prefixes );
+ break;
+ }
+
+ return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Bezier evaluation domain shader section
+//--------------------------------------------------------------------------------------
+// Output of the domain shader BezierEvalDS: one evaluated surface vertex.
+struct DS_OUTPUT
+{
+ float3 vWorldPos : POSITION; // evaluated (and optionally displaced) surface position
+ float3 vNormal : NORMAL; // normalized cross product of the two parametric tangents
+ float2 vUV : TEXCOORD; // bilinearly interpolated texture coordinate
+ float3 vTangent : TANGENT; // bilinearly interpolated texture-space tangent
+ float3 vBiTangent : BITANGENT; // cross( vNormal, vTangent )
+
+ float4 vPosition : SV_POSITION; // vWorldPos transformed by g_mViewProjection
+};
+
+//--------------------------------------------------------------------------------------
+float4 BernsteinBasis(float t)
+{
+ // Cubic Bernstein polynomials evaluated at t:
+ // ( (1-t)^3, 3t(1-t)^2, 3t^2(1-t), t^3 )
+ const float fOneMinusT = 1.0f - t;
+
+ float4 vBasis;
+ vBasis.x = fOneMinusT * fOneMinusT * fOneMinusT;
+ vBasis.y = 3.0f * t * fOneMinusT * fOneMinusT;
+ vBasis.z = 3.0f * t * t * fOneMinusT;
+ vBasis.w = t * t * t;
+ return vBasis;
+}
+
+//--------------------------------------------------------------------------------------
+float4 dBernsteinBasis(float t)
+{
+ // Derivatives with respect to t of the four cubic Bernstein polynomials.
+ const float fOneMinusT = 1.0f - t;
+
+ float4 vDerivative;
+ vDerivative.x = -3 * fOneMinusT * fOneMinusT;
+ vDerivative.y = 3 * fOneMinusT * fOneMinusT - 6 * t * fOneMinusT;
+ vDerivative.z = 6 * t * fOneMinusT - 3 * t * t;
+ vDerivative.w = 3 * t * t;
+ return vDerivative;
+}
+
+//--------------------------------------------------------------------------------------
+float3 EvaluateBezier( const OutputPatch<BEZIER_CONTROL_POINT, 16> bezpatch,
+ float4 BasisU,
+ float4 BasisV )
+{
+ // Tensor-product evaluation of the 4x4 control grid: each row of four control
+ // points is blended with BasisU, then the four rows are blended with BasisV.
+ const float3 vRow0 = bezpatch[0].vPosition * BasisU.x + bezpatch[1].vPosition * BasisU.y + bezpatch[2].vPosition * BasisU.z + bezpatch[3].vPosition * BasisU.w;
+ const float3 vRow1 = bezpatch[4].vPosition * BasisU.x + bezpatch[5].vPosition * BasisU.y + bezpatch[6].vPosition * BasisU.z + bezpatch[7].vPosition * BasisU.w;
+ const float3 vRow2 = bezpatch[8].vPosition * BasisU.x + bezpatch[9].vPosition * BasisU.y + bezpatch[10].vPosition * BasisU.z + bezpatch[11].vPosition * BasisU.w;
+ const float3 vRow3 = bezpatch[12].vPosition * BasisU.x + bezpatch[13].vPosition * BasisU.y + bezpatch[14].vPosition * BasisU.z + bezpatch[15].vPosition * BasisU.w;
+
+ float3 vResult = BasisV.x * ( vRow0 );
+ vResult += BasisV.y * ( vRow1 );
+ vResult += BasisV.z * ( vRow2 );
+ vResult += BasisV.w * ( vRow3 );
+ return vResult;
+}
+
+//--------------------------------------------------------------------------------------
+float3 EvaluateBezierTan( const float3 bezpatch[16],
+ float4 BasisU,
+ float4 BasisV )
+{
+ // Same tensor-product evaluation as EvaluateBezier, but over a plain array of 16
+ // vectors (a tangent patch) instead of an OutputPatch of control points.
+ const float3 vRow0 = bezpatch[0] * BasisU.x + bezpatch[1] * BasisU.y + bezpatch[2] * BasisU.z + bezpatch[3] * BasisU.w;
+ const float3 vRow1 = bezpatch[4] * BasisU.x + bezpatch[5] * BasisU.y + bezpatch[6] * BasisU.z + bezpatch[7] * BasisU.w;
+ const float3 vRow2 = bezpatch[8] * BasisU.x + bezpatch[9] * BasisU.y + bezpatch[10] * BasisU.z + bezpatch[11] * BasisU.w;
+ const float3 vRow3 = bezpatch[12] * BasisU.x + bezpatch[13] * BasisU.y + bezpatch[14] * BasisU.z + bezpatch[15] * BasisU.w;
+
+ float3 vResult = BasisV.x * ( vRow0 );
+ vResult += BasisV.y * ( vRow1 );
+ vResult += BasisV.z * ( vRow2 );
+ vResult += BasisV.w * ( vRow3 );
+ return vResult;
+}
+
+//--------------------------------------------------------------------------------------
+// Compute two full tangent patches (TanU and TanV) from the tangent corner data
+// created in the HS constant data function.
+//
+// input    - per-patch constants (corner tangents and corner weights)
+// bezpatch - the 16 Bezier control points of the patch
+// TanU     - receives the 16-entry tangent patch for the U direction
+// TanV     - receives the 16-entry tangent patch for the V direction
+//--------------------------------------------------------------------------------------
+void CreatTangentPatches( in HS_CONSTANT_DATA_OUTPUT input,
+ const OutputPatch<BEZIER_CONTROL_POINT, 16> bezpatch,
+ out float3 TanU[16],
+ out float3 TanV[16] )
+{
+ // Seed the four corner slots (0, 3, 15, 12) of both patches from the corner tangents.
+ TanV[0] = input.vTanVCorner[0];
+ TanV[3] = input.vTanVCorner[1];
+ TanV[15] = input.vTanVCorner[2];
+ TanV[12] = input.vTanVCorner[3];
+
+ TanU[0] = input.vTanUCorner[0];
+ TanU[3] = input.vTanUCorner[1];
+ TanU[15] = input.vTanUCorner[2];
+ TanU[12] = input.vTanUCorner[3];
+
+ // Corner weights in corner order 0..3 for the first (TanU) pass.
+ float fCWts[4];
+ fCWts[0] = input.vCWts.x;
+ fCWts[1] = input.vCWts.y;
+ fCWts[2] = input.vCWts.z;
+ fCWts[3] = input.vCWts.w;
+
+ float3 vCorner[4];
+ float3 vCornerLocal[4];
+
+ // NOTE(review): vCornerLocal lists slots 12 and 15 in the opposite order from
+ // vCorner (15, 12). The second pass below has the same swap, so this presumably
+ // matches the ordering ComputeTanPatch expects - confirm against its definition.
+ vCorner[0] = TanV[0];
+ vCorner[1] = TanV[3];
+ vCorner[2] = TanV[15];
+ vCorner[3] = TanV[12];
+ vCornerLocal[0] = TanU[0];
+ vCornerLocal[1] = TanU[3];
+ vCornerLocal[2] = TanU[12];
+ vCornerLocal[3] = TanU[15];
+
+ // First pass fills TanU; the trailing arguments are ( 1, 4 ).
+ ComputeTanPatch( bezpatch, TanU, fCWts, vCorner, vCornerLocal, 1, 4 );
+
+ // For the second pass the weights of corners 1 and 3 trade places.
+ fCWts[3] = input.vCWts.y;
+ fCWts[1] = input.vCWts.w;
+
+ // The roles of TanU and TanV are exchanged and corner slots 1 and 3 are swapped.
+ // These reads of TanU happen after the first ComputeTanPatch call wrote to it.
+ vCorner[0] = TanU[0];
+ vCorner[3] = TanU[3];
+ vCorner[2] = TanU[15];
+ vCorner[1] = TanU[12];
+ vCornerLocal[0] = TanV[0];
+ vCornerLocal[1] = TanV[12];
+ vCornerLocal[2] = TanV[3];
+ vCornerLocal[3] = TanV[15];
+
+ // Second pass fills TanV; the trailing arguments are reversed to ( 4, 1 ).
+ ComputeTanPatch( bezpatch, TanV, fCWts, vCorner, vCornerLocal, 4, 1 );
+}
+
+//--------------------------------------------------------------------------------------
+// For each input UV (from the Tessellator), evaluate the Bezier patch at this position.
+//--------------------------------------------------------------------------------------
+[domain("quad")]
+DS_OUTPUT BezierEvalDS( HS_CONSTANT_DATA_OUTPUT input,
+ float2 UV : SV_DomainLocation,
+ const OutputPatch<BEZIER_CONTROL_POINT, 16> bezpatch )
+{
+ // Cubic Bernstein weights for the domain location supplied by the tessellator.
+ const float4 vWeightsU = BernsteinBasis( UV.x );
+ const float4 vWeightsV = BernsteinBasis( UV.y );
+
+ // Surface position from the 16 Bezier control points.
+ float3 vSurfacePos = EvaluateBezier( bezpatch, vWeightsU, vWeightsV );
+
+ // The parametric tangents are evaluated from two dedicated tangent patches.
+ float3 vPatchTanU[16];
+ float3 vPatchTanV[16];
+ CreatTangentPatches( input, bezpatch, vPatchTanU, vPatchTanV );
+ float3 vParamTanU = EvaluateBezierTan( vPatchTanU, vWeightsU, vWeightsV );
+ float3 vParamTanV = EvaluateBezierTan( vPatchTanV, vWeightsU, vWeightsV );
+
+ // To see what the patch looks like without using the tangent patches to fix the
+ // normals, uncomment this section. It differentiates the position patch directly.
+ /*
+ vParamTanU = EvaluateBezier( bezpatch, dBernsteinBasis( UV.x ), vWeightsV );
+ vParamTanV = EvaluateBezier( bezpatch, vWeightsU, dBernsteinBasis( UV.y ) );
+ */
+
+ const float3 vSurfaceNormal = normalize( cross( vParamTanU, vParamTanV ) );
+
+ // Evaluate the texture-space tangent through bilinear interpolation of the four
+ // corner tangents. These are not the parametric tangents used for the normal above.
+ const float3 vTexTanLow = lerp( input.vTangent[0], input.vTangent[1], UV.x );
+ const float3 vTexTanHigh = lerp( input.vTangent[3], input.vTangent[2], UV.x );
+ const float3 vTexTangent = lerp( vTexTanLow, vTexTanHigh, UV.y );
+
+ // Bilinearly interpolate the texture coordinates the same way.
+ const float2 vUVLow = lerp( input.vUV[0], input.vUV[1], UV.x );
+ const float2 vUVHigh = lerp( input.vUV[3], input.vUV[2], UV.x );
+ const float2 vTexUV = lerp( vUVLow, vUVHigh, UV.y );
+
+ if( g_fDisplacementHeight > 0 )
+ {
+ // Displacement can go into or out of the mesh, so the sampled alpha is remapped
+ // from [0,1] to [-1,1] before scaling by the displacement height.
+ const float fHeight = g_fDisplacementHeight * ( g_txHeight.SampleLevel( g_samPoint, vTexUV, 0 ).a * 2 - 1 );
+ vSurfacePos += vSurfaceNormal * fHeight;
+ }
+
+ DS_OUTPUT Output;
+ Output.vNormal = vSurfaceNormal;
+ Output.vTangent = vTexTangent;
+ // Optimization: assuming the UV mapping yields a roughly orthogonal tangent basis,
+ // the bitangent is derived instead of being fetched and interpolated.
+ Output.vBiTangent = cross( vSurfaceNormal, vTexTangent );
+ Output.vUV = vTexUV;
+ Output.vPosition = mul( float4(vSurfacePos,1), g_mViewProjection );
+ Output.vWorldPos = vSurfacePos;
+
+ return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Smooth shading pixel shader section
+//--------------------------------------------------------------------------------------
+
+float3 safe_normalize( float3 vInput )
+{
+ // Normalize vInput, but hand it back untouched when its squared length is not
+ // strictly positive, so a zero vector never goes through rsqrt( 0 ).
+ const float fLengthSq = dot( vInput, vInput );
+ if( !( fLengthSq > 0 ) )
+ {
+ return vInput;
+ }
+ return vInput * rsqrt( fLengthSq );
+}
+
+// Exponent applied to the half-angle term in ComputeDirectionalLight.
+static const float g_fSpecularExponent = 32.0f;
+// Scale applied to the specular texture sample in SmoothPS.
+static const float g_fSpecularIntensity = 0.6f;
+// Scale applied to the x/y components of the biased normal-map sample in SmoothPS.
+static const float g_fNormalMapIntensity = 1.5f;
+
+float2 ComputeDirectionalLight( float3 vWorldPos, float3 vWorldNormal, float3 vDirLightDir )
+{
+ // Returns ( diffuse, specular ) illumination for one directional light.
+
+ // Diffuse: clamped cosine between the normal and the reversed light direction, squared.
+ const float fNdotL = saturate( dot( vWorldNormal, -vDirLightDir ) );
+ const float fDiffuse = pow( fNdotL, 2 );
+
+ // Specular: half-angle vector between the view direction and the reversed light
+ // direction, raised to g_fSpecularExponent.
+ const float3 vToCamera = normalize( g_vCameraPosWorld - vWorldPos );
+ const float3 vHalf = normalize( vToCamera - vDirLightDir );
+ const float fSpecular = pow( saturate( dot( vHalf, vWorldNormal ) ), g_fSpecularExponent );
+
+ return float2( fDiffuse, fSpecular );
+}
+
+float3 ColorGamma( float3 Input )
+{
+ // Raise every channel of the color to the power 2.2.
+ const float fGammaExponent = 2.2f;
+ return pow( Input, fGammaExponent );
+}
+
+float4 SmoothPS( PS_INPUT Input ) : SV_TARGET
+{
+ // Fixed four-light rig: directions first, colors (same order) second.
+ const float3 vLightDirs[4] =
+ {
+ // key light
+ normalize( float3( -63.345150, -58.043934, 27.785097 ) ),
+ // fill light
+ normalize( float3( 23.652107, -17.391443, 54.972504 ) ),
+ // back light 1
+ normalize( float3( 20.470509, -22.939510, -33.929531 ) ),
+ // back light 2
+ normalize( float3( -31.003685, 24.242104, -41.352859 ) ),
+ };
+
+ const float3 vLightColors[4] =
+ {
+ // key light
+ ColorGamma( float3( 1.0f, 0.964f, 0.706f ) * 1.0f ),
+ // fill light
+ ColorGamma( float3( 0.446f, 0.641f, 1.0f ) * 1.0f ),
+ // back light 1
+ ColorGamma( float3( 1.0f, 0.862f, 0.419f ) * 1.0f ),
+ // back light 2
+ ColorGamma( float3( 0.405f, 0.630f, 1.0f ) * 1.0f ),
+ };
+
+ // Material inputs. For untextured debugging, use float3( 1, 1, 1 ) as the albedo.
+ const float3 vAlbedo = g_txDiffuse.Sample( g_samLinear, Input.vUV ).rgb;
+ const float fSpecularMask = g_txSpecular.Sample( g_samLinear, Input.vUV ).r * g_fSpecularIntensity;
+
+ // Decode the normal map: remap xyz from [0,1] to [-1,1], exaggerate x/y, renormalize.
+ float3 vTexNormal = ( g_txHeight.Sample( g_samLinear, Input.vUV ).xyz * 2 ) - 1;
+ vTexNormal.xy *= g_fNormalMapIntensity;
+ vTexNormal = normalize( vTexNormal );
+
+ // Rebuild the shading normal in the interpolated normal/tangent/bitangent basis.
+ float3 vShadingNormal = safe_normalize( Input.vNormal ) * vTexNormal.z;
+ vShadingNormal += safe_normalize( Input.vTangent ) * vTexNormal.x;
+ vShadingNormal += safe_normalize( Input.vBiTangent ) * vTexNormal.y;
+
+ // Accumulate the diffuse and specular contribution of each light.
+ float3 vLit = 0;
+ for( int iLight = 0; iLight < 4; ++iLight )
+ {
+ const float2 vDiffuseSpecular = ComputeDirectionalLight( Input.vWorldPos, vShadingNormal, vLightDirs[iLight] );
+ vLit += vLightColors[iLight] * vAlbedo * vDiffuseSpecular.x;
+ vLit += vLightColors[iLight] * vDiffuseSpecular.y * fSpecularMask;
+ }
+
+ return float4( vLit, 1 );
+}
+
+//--------------------------------------------------------------------------------------
+// Solid color shading pixel shader (used for wireframe overlay)
+//--------------------------------------------------------------------------------------
+float4 SolidColorPS( PS_INPUT Input ) : SV_TARGET
+{
+ // Every pixel gets the constant g_vSolidColor with alpha 1; Input is not read.
+ float4 vResult;
+ vResult.rgb = g_vSolidColor;
+ vResult.a = 1;
+ return vResult;
+}
diff --git a/tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl b/tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl
new file mode 100644
index 000000000..c4401f010
--- /dev/null
+++ b/tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl
@@ -0,0 +1,211 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSBlurX -entry PSBlurY
+//--------------------------------------------------------------------------------------
+// File: 2DQuadShaders.hlsl
+//
+// Full-screen quad vertex shader and separable blur pixel shaders (PSBlurX, PSBlurY).
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// Width of the separable blur kernel in texels; may be overridden before this point.
+#ifndef SEPERABLE_BLUR_KERNEL_SIZE
+#define SEPERABLE_BLUR_KERNEL_SIZE 3
+#endif
+
+// Half-open range [BLUR_KERNEL_BEGIN, BLUR_KERNEL_END) of texel offsets visited by the
+// blur loops.
+static const int BLUR_KERNEL_BEGIN = SEPERABLE_BLUR_KERNEL_SIZE / -2;
+static const int BLUR_KERNEL_END = SEPERABLE_BLUR_KERNEL_SIZE / 2 + 1;
+// Divisor used to average the taps. It is derived from the offset range so that it
+// always equals the number of taps actually summed; dividing by the requested kernel
+// size would be wrong whenever the range holds a different number of offsets (e.g.
+// for even kernel sizes).
+static const float FLOAT_BLUR_KERNEL_SIZE = (float)( BLUR_KERNEL_END - BLUR_KERNEL_BEGIN );
+
+// Constants for the blur passes, bound to constant buffer slot b2.
+cbuffer cbblurVS : register( b2)
+{
+ int2 g_iWidthHeight : packoffset( c0 ); // multiplied with the 0..1 texture coordinate in VSMain to form ITex
+ int g_iKernelStart : packoffset( c0.z ); // only referenced by the disabled (commented-out) blur variants
+ int g_iKernelEnd : packoffset( c0.w ); // only referenced by the disabled (commented-out) blur variants
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and samplers
+//--------------------------------------------------------------------------------------
+
+// Texture array sampled by the blur pixel shaders; they always read slice 0.
+Texture2DArray g_txShadow : register( t5 );
+// Sampler used for every g_txShadow.Sample call in this file.
+SamplerState g_samShadow : register( s5 );
+
+//--------------------------------------------------------------------------------------
+// Input/Output structures
+//--------------------------------------------------------------------------------------
+
+// Output of VSMain and input of the blur pixel shaders.
+struct PSIn
+{
+ float4 Pos : SV_Position; //Position
+ float2 Tex : TEXCOORD; //Texture coordinate
+ float2 ITex : TEXCOORD2; //Tex scaled by g_iWidthHeight; only read by the disabled blur variants
+};
+
+// Vertex shader input: no vertex data, only the system-generated vertex id.
+struct VSIn
+{
+ uint Pos : SV_VertexID ; // vertex index; VSMain derives the quad corner from it
+};
+
+
+PSIn VSMain(VSIn inn)
+{
+ // The quad corner comes from the vertex id alone: id / 2 drives x and id % 2
+ // drives y, so ids 0..3 produce (-1,-1), (-1,1), (1,-1), (1,1).
+ const uint uColumn = inn.Pos / 2;
+ const uint uRow = inn.Pos % 2;
+
+ PSIn output;
+ output.Pos = float4( -1.0f + uColumn * 2.0f, -1.0f + uRow * 2.0f, .5, 1 );
+ // The texture coordinate is flipped vertically relative to the clip-space y.
+ output.Tex = float2( uColumn, 1.0f - uRow );
+ // Same coordinate scaled by g_iWidthHeight.
+ output.ITex = float2( (float)(g_iWidthHeight.x * output.Tex.x),
+ (float)(g_iWidthHeight.y * output.Tex.y) );
+ return output;
+}
+
+//float PSDepth
+
+//------------------------------------------------------------------------------
+// Logarithmic filtering
+//------------------------------------------------------------------------------
+
+float log_conv ( float x0, float X, float y0, float Y )
+{
+ // Evaluates X + log( x0 + y0 * exp( Y - X ) ), which is algebraically
+ // log( x0 * exp( X ) + y0 * exp( Y ) ) without exponentiating X and Y separately.
+ const float fScaledTerm = y0 * exp( Y - X );
+ return X + log( x0 + fScaledTerm );
+}
+
+
+//--------------------------------------------------------------------------------------
+// Horizontal pass of the separable blur: sums the .rg channels of slice 0 of
+// g_txShadow at texel offsets ( x, 0 ) for x in [BLUR_KERNEL_BEGIN, BLUR_KERNEL_END)
+// and divides the sum by FLOAT_BLUR_KERNEL_SIZE.
+//--------------------------------------------------------------------------------------
+float2 PSBlurX(PSIn input) : SV_Target
+{
+// Disabled variant: logarithmic-space filtering built on log_conv.
+/*
+ float2 centerDistance;
+ if ( input.Tex.x < .5 ) centerDistance.x = (1.0 - input.Tex.x);
+ else centerDistance.x = input.Tex.x;
+ if ( input.Tex.y < .5 ) centerDistance.y = (1.0 - input.Tex.y);
+ else centerDistance.y = input.Tex.y;
+ if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y;
+ centerDistance.x -= .2;
+ centerDistance.x *= (1.0f / .8);
+
+ float store_samples[8];
+ int ind = 0;
+ for (int x = g_iKernelStart; x < g_iKernelEnd; ++x) {
+ store_samples[ind] = g_txShadow.Load( int3(input.ITex.x+(float)x * centerDistance.x , input.ITex.y, 0) ).r;
+ ind++;
+ }
+ const float c = (1.f/5.f);
+
+ float accum;
+ accum = log_conv( c, store_samples[0], c, store_samples[1] );
+
+ ind = 0;
+ for (x = g_iKernelStart - 2; x < g_iKernelEnd; ++x) {
+ ind++;
+ accum += log_conv( 1.0f, accum, c, store_samples[ind] );
+ }
+ float2 rt;
+ rt.x = accum;
+ return rt;
+ */
+ // Disabled variant: runtime-sized kernel (g_iKernelStart..g_iKernelEnd) using Load
+ // with a step scaled by the distance from the texture center.
+ /*
+ float2 dep = 0;
+ float2 centerDistance;
+ if ( input.Tex.x < .5 ) centerDistance.x = (1.0 - input.Tex.x);
+ else centerDistance.x = input.Tex.x;
+ if ( input.Tex.y < .5 ) centerDistance.y = (1.0 - input.Tex.y);
+ else centerDistance.y = input.Tex.y;
+ if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y;
+ centerDistance.x -= .2;
+ centerDistance.x *= ( 1.0f / 0.8f );
+
+ for (int x = g_iKernelStart; x < g_iKernelEnd; ++x) {
+ dep += g_txShadow.Load( int3(input.ITex.x+(float)x * centerDistance.x , input.ITex.y, 0) ).rg;
+ }
+ dep /= (g_iKernelEnd - g_iKernelStart);
+ return dep;
+ */
+
+ // Active path: the loop bounds are compile-time constants and the loop is unrolled,
+ // so each Sample call receives a literal texel offset.
+ float2 dep=0;
+ [unroll]for ( int x = BLUR_KERNEL_BEGIN; x < BLUR_KERNEL_END; ++x ) {
+ dep += g_txShadow.Sample( g_samShadow, float3( input.Tex.x, input.Tex.y, 0 ), int2( x,0 ) ).rg;
+ }
+ dep /= FLOAT_BLUR_KERNEL_SIZE;
+ return dep;
+
+// Pass-through alternative (no blur):
+// return g_txShadow.Sample(g_samShadow, float3(input.Tex.x, input.Tex.y, 0) ).rg;
+
+}
+
+//--------------------------------------------------------------------------------------
+// Vertical pass of the separable blur: sums the .rg channels of slice 0 of
+// g_txShadow at texel offsets ( 0, y ) for y in [BLUR_KERNEL_BEGIN, BLUR_KERNEL_END)
+// and divides the sum by FLOAT_BLUR_KERNEL_SIZE.
+//--------------------------------------------------------------------------------------
+float2 PSBlurY(PSIn input) : SV_Target
+{
+// Disabled variant: logarithmic-space filtering built on log_conv.
+/*
+ float2 centerDistance;
+ if ( input.Tex.x < .5 ) centerDistance.x = (1.0 - input.Tex.x);
+ else centerDistance.x = input.Tex.x;
+ if ( input.Tex.y < .5 ) centerDistance.y = (1.0 - input.Tex.y);
+ else centerDistance.y = input.Tex.y;
+ if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y;
+ centerDistance.x -= .2;
+ centerDistance.x *= (1.0f / .8);
+
+ float store_samples[8];
+ int ind = 0;
+ for (int y = g_iKernelStart; y < g_iKernelEnd; ++y) {
+ store_samples[ind] = g_txShadow.Load( int3(input.ITex.x, input.ITex.y+(float)y * centerDistance.x, 0) ).r;
+ }
+ const float c = (1.f/5.f);
+
+ float accum;
+ accum = log_conv( c, store_samples[0], c, store_samples[1] );
+
+ ind = 0;
+ for (y = g_iKernelStart; y < g_iKernelEnd; ++y) {
+ ind++;
+ accum += log_conv( 1.0f, accum, c, store_samples[ind] );
+ }
+ float2 rt;
+ rt.x = accum;
+ return rt;
+ */
+
+
+ // Disabled variant: runtime-sized kernel (g_iKernelStart..g_iKernelEnd) using Load
+ // with a step scaled by the distance from the texture center.
+ /*
+ float2 dep = 0;
+
+ float2 centerDistance;
+ if ( input.Tex.x < .5 ) centerDistance.x = (1.0 - input.Tex.x);
+ else centerDistance.x = input.Tex.x;
+ if ( input.Tex.y < .5 ) centerDistance.y = (1.0 - input.Tex.y);
+ else centerDistance.y = input.Tex.y;
+ if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y;
+ centerDistance.x -= 0;
+ centerDistance.x *= (1.0f / 1.0f);
+
+ if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y;
+ for (int y = g_iKernelStart; y < g_iKernelEnd; ++y) {
+ dep += g_txShadow.Load( int3(input.ITex.x, input.ITex.y+(float)y * centerDistance.x, 0) ).rg;
+ }
+
+
+ dep /= (g_iKernelEnd - g_iKernelStart);
+ return dep;
+
+ */
+
+
+ // Active path: the loop bounds are compile-time constants and the loop is unrolled,
+ // so each Sample call receives a literal texel offset.
+ float2 dep=0;
+ [unroll]for ( int y = BLUR_KERNEL_BEGIN; y < BLUR_KERNEL_END; ++y ) {
+ dep += g_txShadow.Sample( g_samShadow, float3( input.Tex.x, input.Tex.y, 0 ), int2( 0,y ) ).rg;
+ }
+ dep /= FLOAT_BLUR_KERNEL_SIZE;
+ return dep;
+
+ // Pass-through alternative (no blur):
+ //return g_txShadow.Sample(g_samShadow, float3(input.Tex.x, input.Tex.y, 0) ).rg;
+}
+
+
+
diff --git a/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl
new file mode 100644
index 000000000..0b2e43b5c
--- /dev/null
+++ b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl
@@ -0,0 +1,412 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: RenderVarianceScene.hlsl
+//
+// This is the main shader file. This shader is compiled with several different flags
+// to provide different customizations based on user controls.
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+
+// This flag enables the shadow to blend between cascades. This is most useful when
+// the shadow maps are small and artifacts can be seen between the various cascade layers.
+#ifndef BLEND_BETWEEN_CASCADE_LAYERS_FLAG
+#define BLEND_BETWEEN_CASCADE_LAYERS_FLAG 0
+#endif
+
+// There are two methods for selecting the proper cascade a fragment lies in. Interval selection
+// compares the depth of the fragment against the frustum's depth partition.
+// Map based selection compares the texture coordinates against the actual cascade maps.
+// Map based selection gives better coverage.
+// Interval based selection is easier to extend and understand. Map based selection (0) is the default here.
+#ifndef SELECT_CASCADE_BY_INTERVAL_FLAG
+#define SELECT_CASCADE_BY_INTERVAL_FLAG 0
+#endif
+
+// The number of cascades (compile-time; the per-cascade arrays in this file hold 8 entries).
+#ifndef CASCADE_COUNT_FLAG
+#define CASCADE_COUNT_FLAG 3
+#endif
+
+
+// Most titles will find that 3-4 cascades with
+// BLEND_BETWEEN_CASCADE_LAYERS_FLAG, is good for lower end PCs.
+
+cbuffer cbAllShadowData : register( b0 ) // Constants read by VSMain, PSMain and their helpers; bound to constant-buffer register b0.
+{
+ matrix m_mWorldViewProjection; // Multiplied with the input position to produce SV_POSITION (VSMain).
+ matrix m_mWorld; // Its float3x3 part is multiplied with the input normal (VSMain).
+ matrix m_mWorldView; // Only the z of the transformed position is kept, as the pixel depth (VSMain).
+ matrix m_mShadow; // Multiplied with the input position to produce the shadow coordinate shared by all cascades (VSMain).
+ float4 m_vCascadeOffset[8]; // Per-cascade offset, added after m_vCascadeScale has been applied.
+ float4 m_vCascadeScale[8]; // Per-cascade scale applied to the shadow coordinate and to its screen-space derivatives.
+ int m_nCascadeLevels; // Number of Cascades (not read by the code in this file, which uses CASCADE_COUNT_FLAG)
+ int m_iVisualizeCascades; // 1 is to visualize the cascades in different colors. 0 is to just draw the scene
+
+ // For the map based selection scheme, this keeps the pixels inside of the valid range.
+ // When there is no border, these values are 0 and 1 respectively.
+ float m_fMinBorderPadding;
+ float m_fMaxBorderPadding;
+
+ float m_fCascadeBlendArea; // Amount to overlap when blending between cascades.
+ float m_fTexelSize; // Padding variables exist because CBs must be a multiple of 16 bytes.
+ float m_fNativeTexelSizeInX;
+ float4 m_fCascadeFrustumsEyeSpaceDepthsData[2]; // The values along Z that separate the cascades.
+ // This code creates an array based pointer that points towards the vectorized input data.
+ // This is the only way to index arbitrary arrays of data.
+ // If the array is used at run time, the compiler will generate code that uses logic to index the correct component.
+
+ static float m_fCascadeFrustumsEyeSpaceDepths[8] = (float[8])m_fCascadeFrustumsEyeSpaceDepthsData;
+
+ float3 m_vLightDir; // Dotted with the surface normal in PSMain for the main light term.
+ float m_fPaddingCB4;
+
+};
+
+
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse : register( t0 ); // Sampled at the interpolated texcoord in PSMain and multiplied into the final color.
+Texture2DArray g_txShadow : register( t5 ); // Shadow map array sampled in CalculateVarianceShadow; the coordinate's z is set to the cascade index, and x/y of the result are read as depth and depth squared.
+
+SamplerState g_samLinear : register( s0 ); // Used with g_txDiffuse.
+SamplerState g_samShadow : register( s5 ); // Used with g_txShadow.
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT // Per-vertex attributes consumed by VSMain.
+{
+ float4 vPosition : POSITION; // Multiplied by m_mWorldViewProjection, m_mWorldView and m_mShadow in VSMain; also forwarded untransformed.
+ float3 vNormal : NORMAL; // Multiplied by the float3x3 part of m_mWorld in VSMain.
+ float2 vTexcoord : TEXCOORD0; // Forwarded unchanged.
+};
+
+struct VS_OUTPUT // Written by VSMain and taken as the input of PSMain.
+{
+ float3 vNormal : NORMAL; // Input normal multiplied by the float3x3 part of m_mWorld.
+ float2 vTexcoord : COLOR0; // Texture coordinate, passed through unchanged. NOTE(review): carried in COLOR0 rather than a TEXCOORD slot - confirm this is intended.
+ float4 vTexShadow : TEXCOORD1; // Input position multiplied by m_mShadow; scaled and offset per cascade in the pixel shader.
+ float4 vPosition : SV_POSITION;
+ float4 vInterpPos : TEXCOORD2; // Copy of the untransformed input position.
+ float vDepth : TEXCOORD3; // z of the input position multiplied by m_mWorldView.
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: transforms the vertex and forwards the values PSMain uses for cascade selection and lighting.
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+ VS_OUTPUT Output;
+
+ Output.vPosition = mul( Input.vPosition, m_mWorldViewProjection );
+ Output.vNormal = mul( Input.vNormal, (float3x3)m_mWorld );
+ Output.vTexcoord = Input.vTexcoord;
+ Output.vInterpPos = Input.vPosition; // Untransformed position, passed through as-is.
+ Output.vDepth = mul( Input.vPosition, m_mWorldView ).z ; // PSMain compares this against the cascade split depths when interval selection is enabled.
+
+ // Transform the shadow texture coordinates for all the cascades.
+ Output.vTexShadow = mul( Input.vPosition, m_mShadow );
+
+ return Output;
+}
+
+
+
+static const float4 vCascadeColorsMultiplier[8] = // One tint per cascade index; PSMain multiplies the final color by the selected entry unless m_iVisualizeCascades is 0 (then white is used).
+{
+ float4 ( 1.5f, 0.0f, 0.0f, 1.0f ),
+ float4 ( 0.0f, 1.5f, 0.0f, 1.0f ),
+ float4 ( 0.0f, 0.0f, 5.5f, 1.0f ),
+ float4 ( 1.5f, 0.0f, 5.5f, 1.0f ),
+ float4 ( 1.5f, 1.5f, 0.0f, 1.0f ),
+ float4 ( 1.0f, 1.0f, 1.0f, 1.0f ),
+ float4 ( 0.0f, 1.0f, 5.5f, 1.0f ),
+ float4 ( 0.5f, 3.5f, 0.75f, 1.0f )
+};
+
+
+void ComputeCoordinatesTransform( in int iCascadeIndex, // Finalizes the shadow-map coordinate for cascade iCascadeIndex.
+ in float4 InterpolatedPosition, // Not read by the body.
+ in out float4 vShadowTexCoord, // On return: z holds the cascade index and w holds the previous z.
+ in out float4 vShadowTexCoordViewSpace ) // Coordinate before the per-cascade scale/offset; only read here.
+{
+ // Now that we know the correct map, we can transform the world space position of the current fragment
+ if( SELECT_CASCADE_BY_INTERVAL_FLAG )
+ { // With map based selection PSMain has already applied the scale/offset, so this branch is for interval selection only.
+ vShadowTexCoord = vShadowTexCoordViewSpace * m_vCascadeScale[iCascadeIndex];
+ vShadowTexCoord += m_vCascadeOffset[iCascadeIndex];
+ }
+ vShadowTexCoord.w = vShadowTexCoord.z; // We put the z value in w so that we can index the texture array with Z.
+ vShadowTexCoord.z = iCascadeIndex;
+
+}
+
+//--------------------------------------------------------------------------------------
+// Sample the variance shadow map of one cascade and return a percent lit value in fPercentLit.
+//--------------------------------------------------------------------------------------
+void CalculateVarianceShadow ( in float4 vShadowTexCoord, in float4 vShadowMapTextureCoordViewSpace, int iCascade, out float fPercentLit )
+{
+ fPercentLit = 0.0f;
+ // A single gradient-based sample of the shadow map is taken below; there is no filter loop in this function.
+ // x of the sample is used as the filtered depth and y as the filtered depth squared.
+
+ float2 mapDepth = 0;
+
+
+ // In order to pull the derivative out of divergent flow control we calculate the
+ // derivative off of the view space coordinates and then scale the derivative.
+
+ float3 vShadowTexCoordDDX =
+ ddx(vShadowMapTextureCoordViewSpace );
+ vShadowTexCoordDDX *= m_vCascadeScale[iCascade].xyz;
+ float3 vShadowTexCoordDDY =
+ ddy(vShadowMapTextureCoordViewSpace );
+ vShadowTexCoordDDY *= m_vCascadeScale[iCascade].xyz;
+
+ mapDepth += g_txShadow.SampleGrad( g_samShadow, vShadowTexCoord.xyz,
+ vShadowTexCoordDDX,
+ vShadowTexCoordDDY);
+ // The sample instruction uses gradients for some filters.
+
+ float fAvgZ = mapDepth.x; // Filtered z
+ float fAvgZ2 = mapDepth.y; // Filtered z-squared
+
+ if ( vShadowTexCoord.w <= fAvgZ ) // We put the z value in w so that we can index the texture array with Z.
+ {
+ fPercentLit = 1;
+ }
+ else
+ {
+ float variance = ( fAvgZ2 ) - ( fAvgZ * fAvgZ ); // Filtered z-squared minus the square of filtered z.
+ variance = min( 1.0f, max( 0.0f, variance + 0.00001f ) ); // Add a small bias, then clamp to [0, 1].
+
+ float mean = fAvgZ;
+ float d = vShadowTexCoord.w - mean; // We put the z value in w so that we can index the texture array with Z.
+ float p_max = variance / ( variance + d*d ); // One-sided Chebyshev upper bound used as the lit fraction.
+
+ // To combat light-bleeding, experiment with raising p_max to some power
+ // (Try values from 0.1 to 100.0, if you like.)
+ fPercentLit = pow( p_max, 4 );
+
+ }
+
+}
+
+//--------------------------------------------------------------------------------------
+// Calculate amount to blend between two cascades and the band where blending will occur (interval based selection).
+//--------------------------------------------------------------------------------------
+void CalculateBlendAmountForInterval ( in int iNextCascadeIndex, // Indexed as [iNextCascadeIndex - 1] below, so it is expected to be at least 1.
+ in out float fPixelDepth, // Modified in place: made relative to the previous split when iNextCascadeIndex > 1.
+ in out float fCurrentPixelsBlendBandLocation, // Overwritten; the incoming value is not read.
+ out float fBlendBetweenCascadesAmount
+ )
+{
+
+ // We need to calculate the band of the current shadow map where it will fade into the next cascade.
+ // The caller can then skip the second shadow lookup for pixels outside of that band.
+ //
+ float fBlendInterval = m_fCascadeFrustumsEyeSpaceDepths[ iNextCascadeIndex - 1 ];
+ if( iNextCascadeIndex > 1 )
+ {
+ fPixelDepth -= m_fCascadeFrustumsEyeSpaceDepths[ iNextCascadeIndex-2 ];
+ fBlendInterval -= m_fCascadeFrustumsEyeSpaceDepths[ iNextCascadeIndex-2 ];
+ }
+ // The current pixel's blend band location will be used to determine when we need to blend and by how much.
+ fCurrentPixelsBlendBandLocation = fPixelDepth / fBlendInterval;
+ fCurrentPixelsBlendBandLocation = 1.0f - fCurrentPixelsBlendBandLocation;
+ // The fBlendBetweenCascadesAmount is our location in the blend band.
+ fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Calculate amount to blend between two cascades and the band where blending will occur (map based selection).
+//--------------------------------------------------------------------------------------
+void CalculateBlendAmountForMap ( in float4 vShadowMapTextureCoord,
+ in out float fCurrentPixelsBlendBandLocation, // Overwritten; the incoming value is not read.
+ out float fBlendBetweenCascadesAmount )
+{
+ // Calculate the blend band for the map based selection: the smallest of x, y, 1 - x and 1 - y.
+ float2 distanceToOne = float2 ( 1.0f - vShadowMapTextureCoord.x, 1.0f - vShadowMapTextureCoord.y );
+ fCurrentPixelsBlendBandLocation = min( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y );
+ float fCurrentPixelsBlendBandLocation2 = min( distanceToOne.x, distanceToOne.y );
+ fCurrentPixelsBlendBandLocation =
+ min( fCurrentPixelsBlendBandLocation, fCurrentPixelsBlendBandLocation2 );
+ fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea;
+}
+
+//--------------------------------------------------------------------------------------
+// Calculate the shadow based on several options and render the scene.
+//--------------------------------------------------------------------------------------
+
+float4 PSMain( VS_OUTPUT Input ) : SV_TARGET
+{
+ float4 vDiffuse = g_txDiffuse.Sample( g_samLinear, Input.vTexcoord );
+
+
+ float4 vShadowMapTextureCoordViewSpace = 0.0f;
+ float4 vShadowMapTextureCoord = 0.0f;
+ float4 vShadowMapTextureCoord_blend = 0.0f;
+
+ float4 vVisualizeCascadeColor = float4(0.0f,0.0f,0.0f,1.0f);
+
+ float fPercentLit = 0.0f;
+ float fPercentLit_blend = 0.0f;
+
+ int iCascadeFound = 0;
+ int iCurrentCascadeIndex=1; // Reset to 0 by whichever selection branch runs below.
+ int iNextCascadeIndex = 0;
+
+ float fCurrentPixelDepth;
+
+ // The interval based selection technique compares the pixel's depth against the frustum's cascade divisions.
+ fCurrentPixelDepth = Input.vDepth;
+
+ // This for loop is not necessary when the frustum is uniformly divided and interval based selection is used.
+ // In this case fCurrentPixelDepth could be used as an array lookup into the correct frustum.
+ vShadowMapTextureCoordViewSpace = Input.vTexShadow;
+
+
+ if( SELECT_CASCADE_BY_INTERVAL_FLAG )
+ {
+ iCurrentCascadeIndex = 0;
+ if (CASCADE_COUNT_FLAG > 1 )
+ {
+ float4 vCurrentPixelDepth = Input.vDepth;
+ float4 fComparison = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsData[0]);
+ float4 fComparison2 = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsData[1]);
+ float fIndex = dot( // Counts how many of the first CASCADE_COUNT_FLAG split depths the pixel depth exceeds.
+ float4( CASCADE_COUNT_FLAG > 0,
+ CASCADE_COUNT_FLAG > 1,
+ CASCADE_COUNT_FLAG > 2,
+ CASCADE_COUNT_FLAG > 3)
+ , fComparison )
+ + dot(
+ float4(
+ CASCADE_COUNT_FLAG > 4,
+ CASCADE_COUNT_FLAG > 5,
+ CASCADE_COUNT_FLAG > 6,
+ CASCADE_COUNT_FLAG > 7)
+ , fComparison2 ) ;
+
+ fIndex = min( fIndex, CASCADE_COUNT_FLAG - 1 );
+ iCurrentCascadeIndex = (int)fIndex;
+ }
+ }
+
+ if ( !SELECT_CASCADE_BY_INTERVAL_FLAG )
+ {
+ iCurrentCascadeIndex = 0;
+ if ( CASCADE_COUNT_FLAG == 1 )
+ {
+ vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[0];
+ vShadowMapTextureCoord += m_vCascadeOffset[0];
+ }
+ if ( CASCADE_COUNT_FLAG > 1 ) {
+ for( int iCascadeIndex = 0; iCascadeIndex < CASCADE_COUNT_FLAG && iCascadeFound == 0; ++iCascadeIndex )
+ {
+ vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iCascadeIndex];
+ vShadowMapTextureCoord += m_vCascadeOffset[iCascadeIndex];
+
+ if ( min( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) > m_fMinBorderPadding
+ && max( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) < m_fMaxBorderPadding )
+ { // First cascade whose x and y both fall inside the padded range wins; the loop condition then stops the search.
+ iCurrentCascadeIndex = iCascadeIndex;
+ iCascadeFound = 1;
+ }
+ }
+ }
+ }
+ // Found the correct map.
+ vVisualizeCascadeColor = vCascadeColorsMultiplier[iCurrentCascadeIndex];
+
+ ComputeCoordinatesTransform( iCurrentCascadeIndex, Input.vInterpPos, vShadowMapTextureCoord, vShadowMapTextureCoordViewSpace );
+
+ if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1 )
+ {
+ // Repeat texture coord calculations for the next cascade.
+ // The next cascade index is used for blurring between maps.
+ iNextCascadeIndex = min ( CASCADE_COUNT_FLAG - 1, iCurrentCascadeIndex + 1 );
+ if( !SELECT_CASCADE_BY_INTERVAL_FLAG )
+ {
+ vShadowMapTextureCoord_blend = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iNextCascadeIndex];
+ vShadowMapTextureCoord_blend += m_vCascadeOffset[iNextCascadeIndex];
+ }
+ ComputeCoordinatesTransform( iNextCascadeIndex, Input.vInterpPos, vShadowMapTextureCoord_blend, vShadowMapTextureCoordViewSpace );
+ }
+ float fBlendBetweenCascadesAmount = 1.0f;
+ float fCurrentPixelsBlendBandLocation = 1.0f;
+
+ if( SELECT_CASCADE_BY_INTERVAL_FLAG )
+ {
+ if( CASCADE_COUNT_FLAG > 1 && BLEND_BETWEEN_CASCADE_LAYERS_FLAG )
+ {
+ CalculateBlendAmountForInterval ( iNextCascadeIndex, fCurrentPixelDepth,
+ fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount );
+
+ }
+ }
+ else
+ {
+ if( CASCADE_COUNT_FLAG > 1 && BLEND_BETWEEN_CASCADE_LAYERS_FLAG )
+ {
+ CalculateBlendAmountForMap ( vShadowMapTextureCoord,
+ fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount );
+ }
+ }
+
+ // Because the Z coordinate specifies the texture array,
+ // the derivative will be 0 when there is no divergence
+ //float fDivergence = abs( ddy( vShadowMapTextureCoord.z ) ) + abs( ddx( vShadowMapTextureCoord.z ) );
+ CalculateVarianceShadow ( vShadowMapTextureCoord, vShadowMapTextureCoordViewSpace,
+ iCurrentCascadeIndex, fPercentLit);
+
+ // We repeat the calculation for the next cascade layer, when blending between maps.
+ if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1 )
+ {
+ if( fCurrentPixelsBlendBandLocation < m_fCascadeBlendArea )
+ { // the current pixel is within the blend band.
+
+ // Because the Z coordinate specifies the texture array,
+ // the derivative will be 0 when there is no divergence
+ float fDivergence = abs( ddy( vShadowMapTextureCoord_blend.z ) ) + // NOTE: fDivergence is not read after this statement.
+ abs( ddx( vShadowMapTextureCoord_blend.z) );
+ CalculateVarianceShadow ( vShadowMapTextureCoord_blend, vShadowMapTextureCoordViewSpace,
+ iNextCascadeIndex, fPercentLit_blend );
+
+ // Blend the two calculated shadows by the blend amount.
+ fPercentLit = lerp( fPercentLit_blend, fPercentLit, fBlendBetweenCascadesAmount );
+
+ }
+ }
+
+ if( !m_iVisualizeCascades ) vVisualizeCascadeColor = float4( 1.0f, 1.0f, 1.0f, 1.0f );
+
+ float3 vLightDir1 = float3( -1.0f, 1.0f, -1.0f );
+ float3 vLightDir2 = float3( 1.0f, 1.0f, -1.0f );
+ float3 vLightDir3 = float3( 0.0f, -1.0f, 0.0f );
+ float3 vLightDir4 = float3( 1.0f, 1.0f, 1.0f );
+ // Some ambient-like lighting.
+ float fLighting =
+ saturate( dot( vLightDir1 , Input.vNormal ) )*0.05f +
+ saturate( dot( vLightDir2 , Input.vNormal ) )*0.05f +
+ saturate( dot( vLightDir3 , Input.vNormal ) )*0.05f +
+ saturate( dot( vLightDir4 , Input.vNormal ) )*0.05f ;
+
+ float4 vShadowLighting = fLighting * 0.5f; // Fully shadowed level: half of the ambient-like term, replicated to four components.
+ fLighting += saturate( dot( m_vLightDir , Input.vNormal ) );
+ fLighting = lerp( vShadowLighting, fLighting, fPercentLit ); // The float4 lerp result is stored into a scalar float.
+
+ return fLighting * vVisualizeCascadeColor * vDiffuse;
+
+}
+
diff --git a/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl
new file mode 100644
index 000000000..9837bf299
--- /dev/null
+++ b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl
@@ -0,0 +1,45 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSMain
+
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 ) // Constants for this file's VSMain; bound to constant-buffer register b0.
+{
+ matrix g_mWorldViewProjection : packoffset( c0 ); // Multiplied with the input position in VSMain; explicitly placed at offset c0.
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT // Vertex shader input: position only.
+{
+ float4 vPosition : POSITION;
+};
+
+struct VS_OUTPUT // Vertex shader output and pixel shader input: position only.
+{
+ float4 vPosition : SV_POSITION; // PSMain reads its z component.
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: multiplies the input position by g_mWorldViewProjection and outputs nothing else.
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+ VS_OUTPUT Output;
+
+
+ Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+
+ return Output;
+}
+
+
+float2 PSMain (VS_OUTPUT Input) : SV_TARGET // Outputs two channels: the z of SV_POSITION and that z squared.
+{
+ float2 rt;
+ rt.x = Input.vPosition.z; // First channel: depth.
+ rt.y = rt.x * rt.x; // Second channel: depth squared.
+ return rt;
+} \ No newline at end of file
diff --git a/tests/hlsl/simple/compute-numthreads.hlsl b/tests/hlsl/simple/compute-numthreads.hlsl
new file mode 100644
index 000000000..3843c401f
--- /dev/null
+++ b/tests/hlsl/simple/compute-numthreads.hlsl
@@ -0,0 +1,11 @@
+//TEST:COMPARE_HLSL: -no-checking -target dxbc-assembly -profile cs_5_0 -entry main
+
+// Confirm that we properly pass along the `numthreads` attribute on an entry point.
+
+RWStructuredBuffer<float> b; // Read and written by main below.
+
+[numthreads(32,1,1)] // Thread-group size of 32x1x1; this attribute is what the test is checking.
+void main(uint3 tid : SV_DispatchThreadID)
+{
+ b[tid.x] = b[tid.x + 1] + 1.0f; // Reads the next element, which another thread writes; NOTE(review): no ordering between threads is established here - presumably irrelevant to what this test checks.
+} \ No newline at end of file