From fcf83dbf9effab3bd98bad2b83b2468b7eb05cfd Mon Sep 17 00:00:00 2001
From: Tim Foley <tfoley@nvidia.com>
Date: Fri, 9 Jun 2017 11:34:21 -0700
Subject: Initial import of code.

---
 .../dxsdk/AdaptiveTessellationCS40/Render.hlsl     |   58 +
 .../dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl     |  109 +
 .../TessellatorCS40_EdgeFactorCS.hlsl              |  217 ++
 .../TessellatorCS40_NumVerticesIndicesCS.hlsl      |   56 +
 .../TessellatorCS40_ScatterIDCS.hlsl               |   45 +
 .../TessellatorCS40_TessellateIndicesCS.hlsl       |  628 +++++
 .../TessellatorCS40_TessellateVerticesCS.hlsl      |  206 ++
 .../TessellatorCS40_common.hlsl                    |  411 ++++
 .../TessellatorCS40_defines.h                      |    9 +
 .../dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl | 2567 ++++++++++++++++++++
 .../dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl  | 1908 +++++++++++++++
 .../hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl  |   72 +
 tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx          |  158 ++
 tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl   |   51 +
 tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl   |   49 +
 tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx    |  181 ++
 .../CascadedShadowMaps11/RenderCascadeScene.hlsl   |  506 ++++
 .../CascadedShadowMaps11/RenderCascadeShadow.hlsl  |   53 +
 .../ComputeShaderSort11/ComputeShaderSort11.hlsl   |   75 +
 .../Direct3D11Tutorials/Tutorial02/Tutorial02.fx   |   23 +
 .../Tutorial02/Tutorial02_PS.hlsl                  |    3 +
 .../Tutorial02/Tutorial02_VS.hlsl                  |    3 +
 .../Direct3D11Tutorials/Tutorial03/Tutorial03.fx   |   23 +
 .../Tutorial03/Tutorial03_PS.hlsl                  |    3 +
 .../Tutorial03/Tutorial03_VS.hlsl                  |    3 +
 .../Direct3D11Tutorials/Tutorial04/Tutorial04.fx   |   46 +
 .../Tutorial04/Tutorial04_PS.hlsl                  |    3 +
 .../Tutorial04/Tutorial04_VS.hlsl                  |    3 +
 .../Direct3D11Tutorials/Tutorial05/Tutorial05.fx   |   54 +
 .../Tutorial05/Tutorial05_PS.hlsl                  |    3 +
 .../Tutorial05/Tutorial05_VS.hlsl                  |    3 +
 .../Direct3D11Tutorials/Tutorial06/Tutorial06.fx   |   76 +
 .../Tutorial06/Tutorial06_PS.hlsl                  |    3 +
 .../Tutorial06/Tutorial06_VS.hlsl                  |    3 +
 .../Direct3D11Tutorials/Tutorial07/Tutorial07.fx   |   67 +
 .../Tutorial07/Tutorial07_PS.hlsl                  |    3 +
 .../Tutorial07/Tutorial07_VS.hlsl                  |    3 +
 .../Tutorial08/Tutorial08.fx                       |   56 +
 .../Tutorial09/Tutorial09.fx                       |   69 +
 .../Tutorial10/Tutorial10.fx                       |   73 +
 .../Tutorial11/Tutorial11.fx                       |  117 +
 .../Tutorial12/Tutorial12.fx                       |  129 +
 .../Tutorial13/Tutorial13.fx                       |  191 ++
 .../Tutorial14/Tutorial14.fx                       |  294 +++
 .../DynamicShaderLinkage11_LightPSH.h              |   84 +
 .../DynamicShaderLinkage11_MaterialPSH.h           |  103 +
 .../DynamicShaderLinkage11_PS.hlsl                 |   84 +
 .../DynamicShaderLinkage11_PSBuffers.h             |  129 +
 .../DynamicShaderLinkage11_VS.hlsl                 |   66 +
 .../DynamicShaderLinkageFX11.fx                    |  192 ++
 .../DynamicShaderLinkageFX11_LightPSH.h            |   82 +
 .../DynamicShaderLinkageFX11_MaterialPSH.h         |  103 +
 .../DynamicShaderLinkageFX11_PSBuffers.h           |  152 ++
 .../DynamicShaderLinkageFX11_ps.hlsl               |  113 +
 .../DynamicShaderLinkageFX11_vs.hlsl               |   65 +
 tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx  |  468 ++++
 .../hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl  |   75 +
 tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl          |  529 ++++
 tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl        |  112 +
 .../BrightPassAndHorizFilterCS.hlsl                |   64 +
 .../dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl    |   29 +
 tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl  |   73 +
 tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl |   79 +
 .../hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl  |  129 +
 .../dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl     |   72 +
 .../dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl |   63 +
 tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl  |   44 +
 tests/hlsl/dxsdk/InstancingFX11/Instancing.fx      |  591 +++++
 .../MultithreadedRendering11_PS.hlsl               |  202 ++
 .../MultithreadedRendering11_VS.hlsl               |   75 +
 .../dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl   |  103 +
 .../hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl  |  128 +
 tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl                 |  277 +++
 tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl                 |   56 +
 tests/hlsl/dxsdk/OIT11/SceneVS.hlsl                |   36 +
 tests/hlsl/dxsdk/README.md                         |    5 +
 .../hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl  |  230 ++
 tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx    |  112 +
 tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl  |   86 +
 tests/hlsl/dxsdk/SubD11/SubD11.hlsl                | 1238 ++++++++++
 .../dxsdk/VarianceShadows11/2DQuadShaders.hlsl     |  211 ++
 .../VarianceShadows11/RenderVarianceScene.hlsl     |  412 ++++
 .../VarianceShadows11/RenderVarianceShadow.hlsl    |   45 +
 tests/hlsl/simple/compute-numthreads.hlsl          |   11 +
 84 files changed, 15341 insertions(+)
 create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl
 create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl
 create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl
 create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl
 create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl
 create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl
 create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl
 create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl
 create mode 100644 tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h
 create mode 100644 tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl
 create mode 100644 tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl
 create mode 100644 tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl
 create mode 100644 tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx
 create mode 100644 tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl
 create mode 100644 tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl
 create mode 100644 tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx
 create mode 100644 tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeScene.hlsl
 create mode 100644 tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl
 create mode 100644 tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl
 create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx
 create mode 100644 tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl
 create mode 100644 tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl
 create mode 100644 tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx
 create mode 100644 tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl
 create mode 100644 tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl
 create mode 100644 tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl
 create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl
 create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl
 create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl
 create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl
 create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl
 create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl
 create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl
 create mode 100644 tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl
 create mode 100644 tests/hlsl/dxsdk/InstancingFX11/Instancing.fx
 create mode 100644 tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_PS.hlsl
 create mode 100644 tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl
 create mode 100644 tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl
 create mode 100644 tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl
 create mode 100644 tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl
 create mode 100644 tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl
 create mode 100644 tests/hlsl/dxsdk/OIT11/SceneVS.hlsl
 create mode 100644 tests/hlsl/dxsdk/README.md
 create mode 100644 tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl
 create mode 100644 tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx
 create mode 100644 tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl
 create mode 100644 tests/hlsl/dxsdk/SubD11/SubD11.hlsl
 create mode 100644 tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl
 create mode 100644 tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl
 create mode 100644 tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl
 create mode 100644 tests/hlsl/simple/compute-numthreads.hlsl

(limited to 'tests/hlsl')

diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl
new file mode 100644
index 000000000..b98b870da
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/Render.hlsl
@@ -0,0 +1,58 @@
+//TEST:COMPARE_HLSL: -profile vs_4_0 -entry RenderBaseVS -profile ps_4_0 -entry RenderPS -target dxbc-assembly
+//--------------------------------------------------------------------------------------
+// File: Render.hlsl
+//
+// The shaders for rendering tessellated mesh and base mesh
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+    row_major matrix    g_mWorldViewProjection    : packoffset( c0 );  // Right-hand operand of mul() in RenderVS and RenderBaseVS
+}
+
+// The tessellated vertex structure: one generated vertex, stored relative to its base triangle
+struct TessedVertex
+{
+    uint BaseTriID;         // Index of the base-mesh triangle this vertex belongs to (RenderVS reads base vertices BaseTriID*3 + 0..2)
+    float2 bc;              // Barycentric coordinates with regard to the base triangle
+};
+Buffer<float4>                  g_base_vb_buffer : register(t0);  // Base mesh vertex buffer
+StructuredBuffer<TessedVertex>  g_TessedVertices : register(t1);  // Tessellated mesh vertex buffer
+
+float4 bary_centric(float4 v1, float4 v2, float4 v3, float2 bc)  // Blend of v1, v2, v3 with weights (1 - bc.x - bc.y), bc.x, bc.y
+{
+    return (1 - bc.x - bc.y) * v1 + bc.x * v2 + bc.y * v3;
+}
+
+float4 RenderVS( uint vertid : SV_VertexID ) : SV_POSITION
+{
+    TessedVertex tessed = g_TessedVertices[vertid];   // record addressed by the incoming vertex ID
+
+    // Gather the three corners of the base triangle this vertex was generated from
+    float4 corner[3];
+    [unroll]
+    for (int k = 0; k < 3; ++ k)
+    {
+        uint baseIndex = tessed.BaseTriID * 3 + k;
+        corner[k] = g_base_vb_buffer[baseIndex];
+    }
+
+    // Blend the corners with the barycentric weights, then transform the result by g_mWorldViewProjection
+    return mul(bary_centric(corner[0], corner[1], corner[2], tessed.bc), g_mWorldViewProjection);
+}
+
+struct BaseVertex
+{
+    float4 pos : POSITION;  // Multiplied by g_mWorldViewProjection in RenderBaseVS
+};
+
+float4 RenderBaseVS( BaseVertex input ) : SV_POSITION  // Vertex shader for the base mesh: transforms input.pos by g_mWorldViewProjection
+{
+    return mul( input.pos, g_mWorldViewProjection );
+}
+
+float4 RenderPS() : SV_TARGET  // Takes no inputs; every invocation outputs the same constant (1, 1, 0, 1)
+{
+    return float4( 1.0f, 1.0f, 0.0f, 1.0f );
+}
\ No newline at end of file
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl
new file mode 100644
index 000000000..46cdc1ed9
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/ScanCS.hlsl
@@ -0,0 +1,109 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSScanInBucket -entry CSScanBucketResult -entry CSScanAddBucketResult
+//--------------------------------------------------------------------------------------
+// File: ScanCS.hlsl
+//
+// A simple inclusive prefix sum(scan) implemented in CS4.0, 
+// using a typical up sweep and down sweep scheme
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+StructuredBuffer<uint2> Input : register( t0 );     // Change uint2 here if scan other types, and
+RWStructuredBuffer<uint2> Result : register( u0 );  // also here
+
+#define groupthreads 128
+groupshared uint4 bucket[groupthreads];             // Change uint4 to the "type x2" if scan other types, e.g.
+                                                    // if scan uint2, then put uint4 here,
+                                                    // if scan float, then put float2 here
+
+void CSScan( uint3 DTid, uint GI, uint2 x )         // Scans x across the thread group and writes Result[DTid.x]. Change the type of x here if scanning other types
+{
+    // since CS40 can only support one shared memory for one shader, we use .xy and .zw as ping-ponging buffers
+    // if scanning a single element type like int, search and replace all .xy with .x and .zw with .y below
+    bucket[GI].xy = x; 
+    bucket[GI].zw = 0;
+
+    // Up sweep: stride takes the values 2, 4, ..., groupthreads
+    [unroll]
+    for ( uint stride = 2; stride <= groupthreads; stride <<= 1 )
+    {
+        GroupMemoryBarrierWithGroupSync();              // wait until the previous pass's shared-memory writes are done
+        
+        if ( (GI & (stride - 1)) == (stride - 1) )      // only threads whose bits below stride are all set take part
+        {
+            bucket[GI].xy += bucket[GI - stride/2].xy;  // add the partial sum stored stride/2 slots to the left
+        }
+    }
+
+    if ( GI == (groupthreads - 1) )                     // the last thread clears its slot before the down sweep
+    {
+        bucket[GI].xy = 0;
+    }
+
+    // Down sweep: stride takes the values groupthreads/2, ..., 2, 1
+    bool n = true;                                      // true: read .xy and write .zw; false: read .zw and write .xy
+    [unroll]
+    for ( stride = groupthreads / 2; stride >= 1; stride >>= 1 )   // reuses 'stride' declared in the up-sweep loop header
+    {
+        GroupMemoryBarrierWithGroupSync();
+
+        uint a = stride - 1;                            // mask of the bits below stride
+        uint b = stride | a;                            // mask of the stride bit and every bit below it
+
+        if ( n )        // ping-pong between passes
+        {
+            if ( ( GI & b) == b )                       // stride bit and all lower bits set: sum with the slot stride to the left
+            {
+                bucket[GI].zw = bucket[GI-stride].xy + bucket[GI].xy;
+            } else
+            if ( (GI & a) == a )                        // only the lower bits set: take the slot stride to the right
+            {
+                bucket[GI].zw = bucket[GI+stride].xy;
+            } else        
+            {
+                bucket[GI].zw = bucket[GI].xy;          // not involved in this pass: carry the value over to the other buffer
+            }
+        } else                                          // same three cases with the roles of .xy and .zw swapped
+        {
+            if ( ( GI & b) == b )
+            {
+                bucket[GI].xy = bucket[GI-stride].zw + bucket[GI].zw;
+            } else
+            if ( (GI & a) == a )
+            {
+                bucket[GI].xy = bucket[GI+stride].zw;
+            } else        
+            {
+                bucket[GI].xy = bucket[GI].zw;
+            }
+        }
+        
+        n = !n;
+    }    
+
+    Result[DTid.x] = bucket[GI].zw + x;                 // groupthreads = 128 gives 7 down-sweep passes, so the last pass wrote .zw; NOTE(review): an even pass count would leave the latest data in .xy - confirm before changing groupthreads
+}
+
+// First entry point: each thread loads its own element of Input and the thread group (bucket) is scanned by CSScan
+[numthreads( groupthreads, 1, 1 )]
+void CSScanInBucket( uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI: SV_GroupIndex )
+{
+    uint2 element = Input[DTid.x];              // Change the type of element here if scanning other types
+    CSScan( DTid, GI, element );
+}
+
+// Record and scan the sum of each bucket: thread DTid.x loads the element at index DTid.x*groupthreads - 1 and the loaded values are scanned
+[numthreads( groupthreads, 1, 1 )]
+void CSScanBucketResult( uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI: SV_GroupIndex )
+{
+    uint2 bucketTail = Input[DTid.x*groupthreads - 1];   // Change the type here if scanning other types. For DTid.x == 0 the uint index wraps around; NOTE(review): presumably relies on out-of-range reads yielding 0 - confirm
+    CSScan( DTid, GI, bucketTail );
+}
+
+StructuredBuffer<uint2> Input1 : register( t1 );
+
+// Add the scanned bucket result to each bucket to get the final result: every element of Input is offset by the Input1 entry of its thread group (indexed by Gid.x)
+[numthreads( groupthreads, 1, 1 )]
+void CSScanAddBucketResult( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI: SV_GroupIndex )
+{
+    Result[DTid.x] = Input[DTid.x] + Input1[Gid.x];
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl
new file mode 100644
index 000000000..91ebca777
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_EdgeFactorCS.hlsl
@@ -0,0 +1,217 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSEdgeFactor
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_EdgeFactorCS.hlsl
+//
+// The CS to compute edge tessellation factor according to current world, view, projection matrix
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// http://jgt.akpeters.com/papers/akeninemoller01/tribox.html
+bool planeBoxOverlap(float3 normal, float d, float3 maxbox)  // Tests the plane dot(normal, p) + d = 0 against the box corners built from +/-maxbox (triBoxOverlap passes the box half size)
+{
+    float3 vmin = maxbox, vmax = maxbox;    // both start as maxbox; exactly one of them is negated per axis below
+    [unroll]
+    for (int q = 0;q <= 2; ++ q)
+    {
+        if (normal[q] > 0.0f)               // positive normal component: vmin gets -maxbox[q], vmax keeps +maxbox[q]
+        {
+            vmin[q] *= -1;
+        }
+        else                                // otherwise: vmin keeps +maxbox[q], vmax gets -maxbox[q]
+        {
+            vmax[q] *= -1;
+        }
+    }
+    if (dot(normal, vmin) + d > 0.0f)       // vmin corner strictly on the positive side: report no overlap
+    {
+        return false;
+    }
+    if (dot(normal, vmax) + d >= 0.0f)      // vmin on the non-positive side and vmax on the non-negative side: overlap
+    {
+        return true;
+    }
+
+    return false;
+}
+
+/*======================== X-tests ========================*/
+bool AXISTEST_X01(float3 v0, float3 v2, float3 boxhalfsize, float2 ab, float2 fab)  // Axis test on the y/z components of v0 and v2; fab is passed as abs() of the edge components by triBoxOverlap
+{
+    float p0 = ab.x * v0.y - ab.y * v0.z;
+    float p2 = ab.x * v2.y - ab.y * v2.z;
+    float min_v = min(p0, p2);
+    float max_v = max(p0, p2);
+    float rad = dot(fab, boxhalfsize.yz);       // extent of the box along the tested axis
+    return (min_v < rad) && (max_v > -rad);     // strict comparisons: an interval that only touches +/-rad yields false
+}
+
+bool AXISTEST_X2(float3 v0, float3 v1, float3 boxhalfsize, float2 ab, float2 fab)  // Axis test on the y/z components of v0 and v1; same arithmetic as AXISTEST_X01 with a different vertex pair
+{
+    float p0 = ab.x * v0.y - ab.y * v0.z;
+    float p1 = ab.x * v1.y - ab.y * v1.z;
+    float min_v = min(p0, p1);
+    float max_v = max(p0, p1);
+    float rad = dot(fab, boxhalfsize.yz);       // extent of the box along the tested axis
+    return (min_v < rad) && (max_v > -rad);     // strict comparisons: an interval that only touches +/-rad yields false
+}
+
+/*======================== Y-tests ========================*/
+bool AXISTEST_Y02(float3 v0, float3 v2, float3 boxhalfsize, float2 ab, float2 fab)  // Axis test on the x/z components of v0 and v2; note the sign pattern (-ab.x, +ab.y) differs from the X and Z tests
+{
+    float p0 = -ab.x * v0.x + ab.y * v0.z;
+    float p2 = -ab.x * v2.x + ab.y * v2.z;
+    float min_v = min(p0, p2);
+    float max_v = max(p0, p2);
+    float rad = dot(fab, boxhalfsize.xz);       // extent of the box along the tested axis
+    return (min_v < rad) && (max_v > -rad);     // strict comparisons: an interval that only touches +/-rad yields false
+}
+
+bool AXISTEST_Y1(float3 v0, float3 v1, float3 boxhalfsize, float2 ab, float2 fab)  // Axis test on the x/z components of v0 and v1; same arithmetic as AXISTEST_Y02 with a different vertex pair
+{
+    float p0 = -ab.x * v0.x + ab.y * v0.z;
+    float p1 = -ab.x * v1.x + ab.y * v1.z;
+    float min_v = min(p0, p1);
+    float max_v = max(p0, p1);
+    float rad = dot(fab, boxhalfsize.xz);       // extent of the box along the tested axis
+    return (min_v < rad) && (max_v > -rad);     // strict comparisons: an interval that only touches +/-rad yields false
+}
+
+/*======================== Z-tests ========================*/
+bool AXISTEST_Z12(float3 v1, float3 v2, float3 boxhalfsize, float2 ab, float2 fab)  // Axis test on the x/y components of v1 and v2
+{
+    float p1 = ab.x * v1.x - ab.y * v1.y;
+    float p2 = ab.x * v2.x - ab.y * v2.y;
+    float min_v = min(p1, p2);
+    float max_v = max(p1, p2);
+    float rad = dot(fab, boxhalfsize.xy);       // extent of the box along the tested axis
+    return (min_v < rad) && (max_v > -rad);     // strict comparisons: an interval that only touches +/-rad yields false
+}
+
+bool AXISTEST_Z0(float3 v0, float3 v1, float3 boxhalfsize, float2 ab, float2 fab)  // Axis test on the x/y components of v0 and v1; same arithmetic as AXISTEST_Z12 with a different vertex pair
+{
+    float p0 = ab.x * v0.x - ab.y * v0.y;
+    float p1 = ab.x * v1.x - ab.y * v1.y;
+    float min_v = min(p0, p1);
+    float max_v = max(p0, p1);
+    float rad = dot(fab, boxhalfsize.xy);       // extent of the box along the tested axis
+    return (min_v < rad) && (max_v > -rad);     // strict comparisons: an interval that only touches +/-rad yields false
+}
+
+bool triBoxOverlap(float3 boxcenter,float3 boxhalfsize,float3 triverts0, float3 triverts1, float3 triverts2)  // Returns true when no test below separates the triangle from the box (boxcenter, boxhalfsize)
+{
+    /*    use separating axis theorem to test overlap between triangle and box */
+    /*    need to test for overlap in these directions: */
+    /*    1) the {x,y,z}-directions (actually, since we use the AABB of the triangle */
+    /*       we do not even need to test these) */
+    /*    2) normal of the triangle */
+    /*    3) crossproduct(edge from tri, {x,y,z}-direction) */
+    /*       this gives 3x3=9 more tests */
+
+    /* This is the fastest branch on Sun */
+    /* move everything so that the boxcenter is in (0,0,0) */
+    float3 v0 = triverts0 - boxcenter;
+    float3 v1 = triverts1 - boxcenter;
+    float3 v2 = triverts2 - boxcenter;
+
+    /* compute triangle edges */
+    float3 e0 = v1 - v0;      /* tri edge 0 */
+    float3 e1 = v2 - v1;      /* tri edge 1 */
+    float3 e2 = v0 - v2;      /* tri edge 2 */
+
+    /* Bullet 3:  */
+    /*  test the 9 tests first (this was faster) */
+    float3 fe = abs(e0);      /* absolute edge components feed the radius term of each axis test */
+    if (!AXISTEST_X01(v0, v2, boxhalfsize, e0.zy, fe.zy)
+        || !AXISTEST_Y02(v0, v2, boxhalfsize, e0.zx, fe.zx)
+        || !AXISTEST_Z12(v1, v2, boxhalfsize, e0.yx, fe.yx))
+    {
+        return false;         /* one of the three axes derived from edge 0 separates triangle and box */
+    }
+
+    fe = abs(e1);
+    if (!AXISTEST_X01(v0, v2, boxhalfsize, e1.zy, fe.zy)
+        || !AXISTEST_Y02(v0, v2, boxhalfsize, e1.zx, fe.zx)
+        || !AXISTEST_Z0(v0, v1, boxhalfsize, e1.yx, fe.yx))
+    {
+        return false;         /* one of the three axes derived from edge 1 separates triangle and box */
+    }
+
+    fe = abs(e2);
+    if (!AXISTEST_X2(v0, v1, boxhalfsize, e2.zy, fe.zy)
+        || !AXISTEST_Y1(v0, v1, boxhalfsize, e2.zx, fe.zx)
+        || !AXISTEST_Z12(v1, v2, boxhalfsize, e2.yx, fe.yx))
+    {
+        return false;         /* one of the three axes derived from edge 2 separates triangle and box */
+    }
+
+    /* Bullet 1: */
+    /*  first test overlap in the {x,y,z}-directions */
+    /*  find min, max of the triangle each direction, and test for overlap in */
+    /*  that direction -- this is equivalent to testing a minimal AABB around */
+    /*  the triangle against the AABB */
+
+    float3 min_v = min(min(v0, v1), v2);
+    float3 max_v = max(max(v0, v1), v2);
+    if ((min_v.x > boxhalfsize.x || max_v.x < -boxhalfsize.x)
+        || (min_v.y > boxhalfsize.y || max_v.y < -boxhalfsize.y)
+        || (min_v.z > boxhalfsize.z || max_v.z < -boxhalfsize.z))
+    {
+        return false;
+    }
+
+    /* Bullet 2: */
+    /*  test if the box intersects the plane of the triangle */
+    /*  compute plane equation of triangle: normal*x+d=0 */
+    float3 normal = cross(e0, e1);
+    float d = -dot(normal, v0);  /* plane eq: normal.x+d=0 */
+    if (!planeBoxOverlap(normal, d, boxhalfsize))
+    {
+        return false;
+    }
+
+    return true;   /* box and triangle overlap */
+}
+
+
+Buffer<float4> InputVertices : register(t0);
+RWStructuredBuffer<float4> EdgeFactorBufOut : register(u0);
+
+cbuffer cb
+{
+    row_major matrix    g_matWVP;                   // applied to every input vertex in CSEdgeFactor
+    float2              g_tess_edge_length_scale;   // per-component scale applied to x/y edge vectors before length() in CSEdgeFactor
+    int                 num_triangles;              // upper bound for DTid.x in CSEdgeFactor
+    float               dummy;                      // not referenced in this file; NOTE(review): presumably padding - confirm against the host-side struct
+}
+
+[numthreads(128, 1, 1)]
+void CSEdgeFactor( uint3 DTid : SV_DispatchThreadID )  // One thread per triangle: writes a float4 of edge factors to EdgeFactorBufOut[DTid.x]
+{
+    if (DTid.x < num_triangles)                        // threads beyond the triangle count write nothing
+    {
+        float4 p0 = mul(InputVertices[DTid.x*3+0], g_matWVP);   // triangle DTid.x uses three consecutive entries of InputVertices
+        float4 p1 = mul(InputVertices[DTid.x*3+1], g_matWVP);
+        float4 p2 = mul(InputVertices[DTid.x*3+2], g_matWVP);
+        p0 = p0 / p0.w;                                // divide each transformed position by its own w
+        p1 = p1 / p1.w;
+        p2 = p2 / p2.w;
+
+        float4 factor;
+        // Only triangles which are completely inside or intersect with the view frustum are taken into account 
+        if ( triBoxOverlap( float3(0, 0, 0.5), float3(1.02, 1.02, 0.52), p0.xyz, p1.xyz, p2.xyz ) )  // box spans x,y in [-1.02, 1.02] and z in [-0.02, 1.02]
+        {
+            factor.x = length((p0.xy - p2.xy) * g_tess_edge_length_scale);   // scaled x/y length of edge p2 -> p0
+            factor.y = length((p1.xy - p0.xy) * g_tess_edge_length_scale);   // scaled x/y length of edge p0 -> p1
+            factor.z = length((p2.xy - p1.xy) * g_tess_edge_length_scale);   // scaled x/y length of edge p1 -> p2
+            factor.w = min(min(factor.x, factor.y), factor.z);               // smallest of the three edge values
+            factor = clamp(factor, 0, 9);                                    // keep all four components within [0, 9]
+        } else
+        {
+            factor = 0;                                // triangles failing the box test get all-zero factors
+        }
+
+        EdgeFactorBufOut[DTid.x] = factor;
+    }
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl
new file mode 100644
index 000000000..4f2fb547b
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_NumVerticesIndicesCS.hlsl
@@ -0,0 +1,56 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSNumVerticesIndices
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_NumVerticesIndicesCS.hlsl
+//
+// The CS to compute number of vertices and triangles to be generated from edge tessellation factor
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "TessellatorCS40_common.hlsl"
+
+StructuredBuffer<float4> InputEdgeFactor : register(t0);
+RWStructuredBuffer<uint2> NumVerticesIndicesOut : register(u0);
+
+cbuffer cbCS : register(b1)
+{
+    uint4 g_param;      // only .x is read in this file: the upper bound for DTid.x in CSNumVerticesIndices
+}
+
+[numthreads(128, 1, 1)]
+void CSNumVerticesIndices( uint3 DTid : SV_DispatchThreadID )  // One thread per InputEdgeFactor entry: writes uint2(num_points, num_index) to NumVerticesIndicesOut[DTid.x]
+{
+    if (DTid.x < g_param.x)                            // threads at or beyond g_param.x write nothing
+    {
+        float4 edge_factor = InputEdgeFactor[DTid.x];
+        
+        PROCESSED_TESS_FACTORS_TRI processedTessFactors;
+        int num_points = TriProcessTessFactors(edge_factor, processedTessFactors, g_partitioning);   // helper, type and g_partitioning are not defined in this file; presumably from TessellatorCS40_common.hlsl
+
+        int num_index;
+        if (0 == num_points)                           // no points: no indices either
+        {
+            num_index = 0;
+        }
+        else if (3 == num_points)                      // exactly three points: fixed count of 4 indices
+        {
+            num_index = 4;
+        }
+        else
+        {
+            int numRings = ((processedTessFactors.numPointsForOutsideInside.w + 1) / 2); // +1 is so even tess includes the center point, which we want to now
+
+            int4 outsideInsideHalfTessFactor = int4(ceil(processedTessFactors.outsideInsideHalfTessFactor));
+            uint3 n = NumStitchTransition(outsideInsideHalfTessFactor, processedTessFactors.outsideInsideTessFactorParity);
+            num_index = n.x + n.y + n.z;               // sum of the three components returned by NumStitchTransition
+            num_index += TotalNumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, numRings - 1) * 3;
+            if( processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD )
+            {
+                num_index += 4;                        // odd .w parity adds 4 more indices
+            }
+        }
+
+        NumVerticesIndicesOut[DTid.x] = uint2(num_points, num_index);
+    }
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl
new file mode 100644
index 000000000..17f003794
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_ScatterIDCS.hlsl
@@ -0,0 +1,45 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSScatterVertexTriIDIndexID -entry CSScatterIndexTriIDIndexID
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_ScatterIDCS.hlsl
+//
+// The CS to scatter vertex ID and triangle ID
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+StructuredBuffer<uint2> InputScanned : register(t0);
+RWStructuredBuffer<uint2> TriIDIndexIDOut : register(u0);
+
+cbuffer cbCS : register(b1)
+{
+    uint4 g_param;      // only .x is read in this file: the upper bound for DTid.x in both scatter entry points
+}
+
+[numthreads(128, 1, 1)]
+void CSScatterVertexTriIDIndexID( uint3 DTid : SV_DispatchThreadID )
+{
+    if (DTid.x < g_param.x)
+    {
+        // Output range owned by this thread: from the previous entry's .x up to (not including) this entry's .x
+        uint rangeBegin = InputScanned[DTid.x-1].x;   // for DTid.x == 0 the uint index wraps around; NOTE(review): presumably relies on out-of-range reads yielding 0 - confirm
+        uint rangeEnd = InputScanned[DTid.x].x;
+        for ( uint slot = rangeBegin; slot < rangeEnd; ++slot )
+        {
+            TriIDIndexIDOut[slot] = uint2(DTid.x, slot - rangeBegin);   // (thread ID, offset within this thread's range)
+        }
+    }
+}
+
+[numthreads(128, 1, 1)]
+void CSScatterIndexTriIDIndexID( uint3 DTid : SV_DispatchThreadID )
+{
+    if (DTid.x < g_param.x)
+    {
+        // Output range owned by this thread: from the previous entry's .y up to (not including) this entry's .y
+        uint rangeBegin = InputScanned[DTid.x-1].y;   // for DTid.x == 0 the uint index wraps around; NOTE(review): presumably relies on out-of-range reads yielding 0 - confirm
+        uint rangeEnd = InputScanned[DTid.x].y;
+        for ( uint slot = rangeBegin; slot < rangeEnd; ++slot )
+        {
+            TriIDIndexIDOut[slot] = uint2(DTid.x, slot - rangeBegin);   // (thread ID, offset within this thread's range)
+        }
+    }
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl
new file mode 100644
index 000000000..756f99e58
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateIndicesCS.hlsl
@@ -0,0 +1,628 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSTessellationIndices
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_TessellateIndicesCS.hlsl
+//
+// The CS to tessellate indices
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "TessellatorCS40_common.hlsl"
+
+StructuredBuffer<uint2> InputTriIDIndexID : register(t0);  // per output index: .x = triangle ID, .y = index ID within that triangle
+StructuredBuffer<float4> InputEdgeFactor : register(t1);   // per triangle: value handed to TriProcessTessFactors as the outside/inside factors
+StructuredBuffer<uint2> InputScanned : register(t2);       // per triangle: .x of the previous triangle's entry is used as this triangle's base vertex
+
+RWByteAddressBuffer TessedIndicesOut : register(u0);       // output: one 32-bit value per thread, stored at byte offset id*4
+
+cbuffer cbCS : register(b1)
+{
+    uint4 g_param;  // only .x is read in this file: upper bound (exclusive) on the thread IDs that do work
+}
+
+
+int TransformIndex1(int index, int vertices_base)
+{
+    return index + vertices_base; // shift the point index by the base vertex supplied by the caller
+}
+
+int TransformIndex2(int index, int vertices_base, INDEX_PATCH_CONTEXT IndexPatchContext)
+{
+    // Remaps a point index through IndexPatchContext, then shifts it by vertices_base.
+    // Indices at or above outsidePointIndexPatchBase take the outside-point rules
+    // (remapped outside indices are assumed to be greater than remapped inside ones).
+    const bool isOutside = (index >= IndexPatchContext.outsidePointIndexPatchBase);
+
+    int remapped;
+    if( isOutside )
+    {
+        // The designated "bad" outside index is swapped for its replacement value;
+        // every other outside index is shifted by the outside delta.
+        remapped = (index == IndexPatchContext.outsidePointIndexBadValue)
+            ? IndexPatchContext.outsidePointIndexReplacementValue
+            : index + IndexPatchContext.outsidePointIndexDeltaToRealValue;
+    }
+    else
+    {
+        // Same scheme for inside points, driven by the inside-point fields.
+        remapped = (index == IndexPatchContext.insidePointIndexBadValue)
+            ? IndexPatchContext.insidePointIndexReplacementValue
+            : index + IndexPatchContext.insidePointIndexDeltaToRealValue;
+    }
+
+    // Finally shift by the base vertex supplied by the caller.
+    return vertices_base + remapped;
+}
+
+
+int AStitchRegular(bool bTrapezoid, int diagonals,            // returns the point index for stream position i, or -1
+                                 uint numInsideEdgePoints,
+                                 int2 outsideInsideEdgePointBaseOffset, // .x = base of the outside-edge points, .y = base of the inside-edge points
+                                 int i)
+{
+    if (bTrapezoid)
+    {
+        ++ outsideInsideEdgePointBaseOffset.x;
+    }
+    // Start from -1 so paths that assign nothing (unmatched 'diagonals', or i < 0 without a trapezoid) return a defined value.
+    int pt = -1;
+
+    if ((i < 4) && bTrapezoid) // the first 4 stream entries belong to the leading trapezoid triangle
+    {
+        if (i < 2)
+        {
+            pt = outsideInsideEdgePointBaseOffset.x - 1 + i; 
+        }
+        else if (i == 2)
+        {
+            pt = outsideInsideEdgePointBaseOffset.y;
+        }
+        else
+        {
+            pt = -1;
+        }
+    }
+
+    int index = i; // position within the regular part, i.e. after the 4 leading trapezoid entries (if any)
+    if (bTrapezoid)
+    {
+        index -= 4;
+    }
+
+    if (index >= 0)
+    {
+        uint uindex = (uint)index;
+        
+        switch( diagonals )
+        {
+        case DIAGONALS_INSIDE_TO_OUTSIDE:
+            if (uindex < 5 * numInsideEdgePoints - 5)
+            {
+                uint p = uindex / 5;
+                uint r = uindex - p * 5;
+                if (r < 2)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.x + p + r;
+                }
+                else if (r < 4)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.y + p + r;
+                }
+                else
+                {
+                    pt = -1;
+                }
+            }
+            else
+            {
+                int r = i - (4 + 5 * numInsideEdgePoints - 5);
+                if (r < 2)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints - 1 + r;
+                }
+                else if (r == 2)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints - 1;
+                }
+                else
+                {
+                    pt = -1;
+                }
+            }
+            break;
+
+        case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation
+            if (uindex < (numInsideEdgePoints / 2 - 1) * 5)
+            {
+                // First half
+                uint p = uindex / 5;
+                uint r = uindex - p * 5;
+                if (r < 2)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.x + p + r;
+                }
+                else if (r < 4)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.y + p;
+                }
+                else
+                {
+                    pt = -1;
+                }
+            }
+            else if (uindex < (numInsideEdgePoints / 2 - 1) * 5 + 8)
+            {
+                // Middle
+                uint r = uindex - (numInsideEdgePoints / 2 - 1) * 5;
+                if (0 == r)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints / 2 - 1;
+                }
+                else if (r < 3)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints / 2 - 1 + (2 - r);
+                }
+                else if (r == 3)
+                {
+                    pt = -1;
+                }
+                else if (r < 6)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints / 2 - 1 + (r - 4);
+                }
+                else if (r == 6)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints / 2 - 1 + 1;
+                }
+                else if (r == 7)
+                {
+                    pt = -1;
+                }
+            }
+            //else if (uindex < (numInsideEdgePoints/2-1) * 5 + 8 + (numInsideEdgePoints - numInsideEdgePoints/2 - 1) * 5)
+            else if (uindex < numInsideEdgePoints * 5 - 2)
+            {
+                // Second half
+                // NOTE(review): "- (...) * 5 + 8" adds 8 here, whereas the mirrored case below subtracts its whole parenthesized offset - confirm intent.
+                uint p = (uindex - (numInsideEdgePoints / 2 - 1) * 5 + 8) / 5 + numInsideEdgePoints / 2 + 1;
+                uint r = uindex - (numInsideEdgePoints / 2 - 1) * 5 + 8 - (p - (numInsideEdgePoints / 2 + 1)) * 5;
+                if (r < 2)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.x + p - 1 + r;
+                }
+                else if (r < 4)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.y + p - 1 + r;
+                }
+                else
+                {
+                    pt = -1;
+                }
+            }
+            else
+            {
+                //int r = i - (4 + (numInsideEdgePoints/2-1) * 5 + 8 + (numInsideEdgePoints - numInsideEdgePoints/2 - 1) * 5);
+                int r = i - (numInsideEdgePoints * 5 + 2);
+                if (r < 2)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints - 1 + r;
+                }
+                else if (r == 2)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints - 1;
+                }
+                else
+                {
+                    pt = -1;
+                }
+            }
+            break;
+
+        case DIAGONALS_MIRRORED: // the only mode passed by CSTessellationIndices in this file
+            if (uindex < (numInsideEdgePoints / 2 + 1) * 2)
+            {
+                uint p = uindex / 2;
+                uint r = uindex - p * 2;
+                if (0 == r)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.y + p;
+                }
+                else
+                {
+                    pt = outsideInsideEdgePointBaseOffset.x + p;
+                }
+            }
+            else if (uindex == (numInsideEdgePoints / 2 + 1) * 2)
+            {
+                pt = -1;
+            }
+            else if (uindex == (numInsideEdgePoints / 2 + 1) * 2 + 1)
+            {
+                pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints / 2;
+            }
+            //else if (uindex < (numInsideEdgePoints / 2 + 1) * 2 + 2 + (numInsideEdgePoints - numInsideEdgePoints / 2) * 2)
+            else if (uindex < numInsideEdgePoints * 2 + 4)
+            {
+                uint p = (uindex - ((numInsideEdgePoints / 2 + 1) * 2 + 2)) / 2 + numInsideEdgePoints / 2;
+                uint r = uindex - ((numInsideEdgePoints / 2 + 1) * 2 + 2) - (p - numInsideEdgePoints / 2) * 2;
+                if (0 == r)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.x + p;
+                }
+                else
+                {
+                    pt = outsideInsideEdgePointBaseOffset.y + p;
+                }
+            }
+            //else if (uindex == (numInsideEdgePoints / 2 + 1) * 2 + 2 + (numInsideEdgePoints - numInsideEdgePoints / 2) * 2)
+            else if (uindex == numInsideEdgePoints * 2 + 4)
+            {
+                pt = -1;
+            }
+            else
+            {
+                //int r = i - (4 + (numInsideEdgePoints / 2 + 1) * 2 + 2 + (numInsideEdgePoints - numInsideEdgePoints / 2) * 2 + 1);
+                uint r = i - (numInsideEdgePoints * 2 + 9);
+                if (r < 2)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.x + numInsideEdgePoints - 1 + r;
+                }
+                else if (r == 2)
+                {
+                    pt = outsideInsideEdgePointBaseOffset.y + numInsideEdgePoints - 1;
+                }
+                else
+                {
+                    pt = -1;
+                }
+            }
+            break;
+        }
+    }
+
+    return pt; // callers in this file leave -1 untransformed and only rebase other values
+}
+
+int AStitchTransition(int2 outsideInsideEdgePointBaseOffset, int2 outsideInsideNumHalfTessFactorPoints, 
+                                    int2 outsideInsideEdgeTessFactorParity,
+                                    uint i) // returns the point index for stream position i, or -1; in every int2, .x is the outside edge and .y the inside edge
+{
+    outsideInsideNumHalfTessFactorPoints -= (TESSELLATOR_PARITY_ODD == outsideInsideEdgeTessFactorParity);
+    // Stream entries taken by each first-half section: table value * 4 (each step spans 4 entries, see the "i / 4" splits below).
+    uint2 out_in_first_half = uint2(outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][MAX_FACTOR / 2 + 1].y, insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][MAX_FACTOR / 2 + 1].y) * 4;
+    // out_in_middle: .z = stream entries taken by the middle section; .x/.y = amount added to outside/inside offsets in the sections after it.
+    uint3 out_in_middle = 0;
+    if ((outsideInsideEdgeTessFactorParity.y != outsideInsideEdgeTessFactorParity.x) || (outsideInsideEdgeTessFactorParity.y == TESSELLATOR_PARITY_ODD))
+    {
+        if (outsideInsideEdgeTessFactorParity.y == outsideInsideEdgeTessFactorParity.x)
+        {
+            // Quad in the middle
+            out_in_middle.z = 5;
+            out_in_middle.xy = 1;
+        }
+        else if (TESSELLATOR_PARITY_EVEN == outsideInsideEdgeTessFactorParity.y)
+        {
+            // Triangle pointing inside
+            out_in_middle.z = 4;
+            out_in_middle.x = 1;
+        }
+        else
+        {
+            // Triangle pointing outside
+            out_in_middle.z = 4;
+            out_in_middle.y = 1;
+        }
+    }
+
+    // Default result: stream positions that no branch below assigns stay at -1.
+    int pt = -1;
+
+    if (i < out_in_first_half.y)
+    {
+        // Advance inside
+
+        uint p = i / 4;
+        uint r = i - p * 4;
+        p = insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].z;
+        if ((0 == r) || (2 == r))
+        {
+            pt = outsideInsideEdgePointBaseOffset.y + insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].y + r / 2;
+        }
+        else if (1 == r)
+        {
+            pt = outsideInsideEdgePointBaseOffset.x + outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].y;
+        }
+    }
+    else
+    {
+        i -= out_in_first_half.y; // i is now relative to the end of the first inside section
+        
+        if (i < out_in_first_half.x)
+        {
+            // Advance outside
+
+            uint p = i / 4;
+            uint r = i - p * 4;
+            p = outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].z;
+            if (r < 2)
+            {
+                pt = outsideInsideEdgePointBaseOffset.x + outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].y + r;
+            }
+            else if (r == 2)
+            {
+                pt = outsideInsideEdgePointBaseOffset.y + insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].y;
+                if (insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].x)
+                {
+                    ++ pt;
+                }
+            }
+        }
+        else
+        {
+            i -= out_in_first_half.x; // i is now relative to the start of the middle section
+            
+            if (i < out_in_middle.z)
+            {
+                uint r = i;
+                if (outsideInsideEdgeTessFactorParity.y == outsideInsideEdgeTessFactorParity.x)
+                {
+                    // Quad in the middle
+                    if ((0 == r) || (2 == r))
+                    {
+                        pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + (2 == r);//r / 2;
+                    }
+                    else if ((1 == r) || (3 == r))
+                    {
+                        pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + (3 == r);//(r - 1) / 2;
+                    }
+                }
+                else if (TESSELLATOR_PARITY_EVEN == outsideInsideEdgeTessFactorParity.y)
+                {
+                    // Triangle pointing inside
+                    if (r == 0)
+                    {
+                        pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4;
+                    }
+                    else if (r < 3)
+                    {
+                        pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + r - 1;
+                    }
+                }
+                else
+                {
+                    // Triangle pointing outside
+                    if ((0 == r) || (2 == r))
+                    {
+                        pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + (2 == r);//r / 2;
+                    }
+                    else if (1 == r)
+                    {
+                        pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4;
+                    }
+                }
+            }
+            else
+            {
+                i -= out_in_middle.z; // i is now relative to the end of the middle section
+                
+                if (i < out_in_first_half.x)
+                {
+                    // Advance outside
+
+                    uint p = i / 4;
+                    uint r = i - p * 4;
+                    p = outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].z;
+                    if (r < 2)
+                    {
+                        pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + out_in_middle.x + (outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][MAX_FACTOR / 2 + 1].y - outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p + 1].y) + r;
+                    }
+                    else if (r == 2)
+                    {
+                        pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + out_in_middle.y + (insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][MAX_FACTOR / 2 + 1].y - insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p + 1].y);
+                    }
+                }
+                else
+                {
+                    // Advance inside
+                    
+                    i -= out_in_first_half.x;
+
+                    uint p = i / 4;
+                    uint r = i - p * 4;
+                    p = insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p].w;
+                    if ((0 == r) || (2 == r))
+                    {
+                        pt = outsideInsideEdgePointBaseOffset.y + out_in_first_half.y / 4 + out_in_middle.y
+                            + (insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][MAX_FACTOR / 2 + 1].y - insidePointIndex[outsideInsideNumHalfTessFactorPoints.y][p + 1].y) + (2 == r);//r / 2;
+                    }
+                    else if (1 == r)
+                    {
+                        pt = outsideInsideEdgePointBaseOffset.x + out_in_first_half.x / 4 + out_in_middle.x
+                            + (outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][MAX_FACTOR / 2 + 1].y - outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p + 1].y);
+                        if (outsidePointIndex[outsideInsideNumHalfTessFactorPoints.x][p].x)
+                        {
+                            ++ pt;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return pt;
+}
+
+[numthreads(128, 1, 1)] // each thread produces one 32-bit entry of TessedIndicesOut
+void CSTessellationIndices( uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex )
+{
+    uint id = DTid.x;
+    //uint id = Gid.x * 128 + GI; // Workaround for some CS4x preview drivers
+    
+    if ( id < g_param.x )
+    {
+        uint tri_id = InputTriIDIndexID[id].x;
+        uint index_id = InputTriIDIndexID[id].y;
+        uint base_vertex = (tri_id > 0) ? InputScanned[max(tri_id, 1u) - 1].x : 0; // triangle 0 has no predecessor: base 0, not a read at index 0xFFFFFFFF
+        
+        float4 outside_inside_factor = InputEdgeFactor[tri_id];
+        
+        PROCESSED_TESS_FACTORS_TRI processedTessFactors;
+        int num_points = TriProcessTessFactors(outside_inside_factor, processedTessFactors, g_partitioning);
+
+        uint tessed_indices; // value stored for this thread; -1 entries are written out as-is
+        if (3 == num_points)
+        {
+            if (index_id < 3)
+            {
+                tessed_indices = TransformIndex1(index_id, base_vertex);
+            }
+            else
+            {
+                tessed_indices = -1;
+            }
+        }
+        else
+        {
+            // Generate primitives for all the concentric rings, one side at a time for each ring
+            static const int startRing = 1;
+            int numRings = ((processedTessFactors.numPointsForOutsideInside.w + 1) / 2); // +1 is so even tess includes the center point, which we want to now
+
+            int4 outsideInsideHalfTessFactor = int4(ceil(processedTessFactors.outsideInsideHalfTessFactor));
+            uint3 num = NumStitchTransition(outsideInsideHalfTessFactor, processedTessFactors.outsideInsideTessFactorParity);
+            num.y += num.x; // num becomes a running total: index_id below num.x / num.y / num.z selects transition edge 0 / 1 / 2
+            num.z += num.y;
+            uint num_index = num.z;
+            num_index += TotalNumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, numRings - 1) * 3;
+            if( processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD )
+            {
+                num_index += 4; // odd inside parity: 4 trailing entries, handled in the last branch below
+            }
+
+            int pt;
+
+            if (index_id < num.x)
+            {
+                // First transition edge (the inside-edge point count is not needed here).
+
+                pt = AStitchTransition(int2(0, processedTessFactors.insideEdgePointBaseOffset),
+                        outsideInsideHalfTessFactor.xw,
+                        processedTessFactors.outsideInsideTessFactorParity.xw,
+                        index_id);
+                if (pt != -1)
+                {
+                    pt = TransformIndex1(pt, base_vertex);
+                }
+            }
+            else if (index_id < num.y)
+            {
+                int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * startRing;
+
+                pt = AStitchTransition(
+                        int2(processedTessFactors.numPointsForOutsideInside.x - 1, processedTessFactors.insideEdgePointBaseOffset + numPointsForInsideEdge - 1),
+                        outsideInsideHalfTessFactor.yw,
+                        processedTessFactors.outsideInsideTessFactorParity.yw,
+                        index_id - num.x);
+                if (pt != -1)
+                {
+                    pt = TransformIndex1(pt, base_vertex);
+                }
+            }
+            else if (index_id < num.z)
+            {
+                int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * startRing;
+
+                INDEX_PATCH_CONTEXT IndexPatchContext;
+                IndexPatchContext.insidePointIndexDeltaToRealValue    = processedTessFactors.insideEdgePointBaseOffset + 2 * (numPointsForInsideEdge - 1);
+                IndexPatchContext.insidePointIndexBadValue            = numPointsForInsideEdge - 1;
+                IndexPatchContext.insidePointIndexReplacementValue    = processedTessFactors.insideEdgePointBaseOffset;
+                IndexPatchContext.outsidePointIndexPatchBase          = IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range
+                IndexPatchContext.outsidePointIndexDeltaToRealValue   = processedTessFactors.numPointsForOutsideInside.x + processedTessFactors.numPointsForOutsideInside.y - 2 
+                                                                                    - IndexPatchContext.outsidePointIndexPatchBase;
+                IndexPatchContext.outsidePointIndexBadValue           = IndexPatchContext.outsidePointIndexPatchBase
+                                                                                    + processedTessFactors.numPointsForOutsideInside.z - 1;
+                IndexPatchContext.outsidePointIndexReplacementValue   = 0;
+
+                pt = AStitchTransition(int2(numPointsForInsideEdge, 0),
+                            outsideInsideHalfTessFactor.zw,
+                            processedTessFactors.outsideInsideTessFactorParity.zw,
+                            index_id - num.y);
+                if (pt != -1)
+                {
+                    pt = TransformIndex2(pt, base_vertex, IndexPatchContext);
+                }
+            }
+            else
+            {
+                if ((processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD) && (index_id >= num_index - 4))
+                {
+                    int outsideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset
+                        + ((processedTessFactors.numPointsForOutsideInside.w + 1) - (numRings + startRing)) * (numRings - startRing - 1) * 3;
+
+                    if (index_id - (num_index - 4) != 3)
+                    {
+                        pt = TransformIndex1(outsideEdgePointBaseOffset + index_id - (num_index - 4), base_vertex);
+                    }
+                    else
+                    {
+                        pt = -1;
+                    }
+                }
+                else
+                {
+                    int ring = GetRingFromIndexStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, index_id - num.z);
+
+                    int tn = TotalNumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w, ring - 1) * 3;
+                    int n = NumStitchRegular(true, DIAGONALS_MIRRORED, processedTessFactors.numPointsForOutsideInside.w - 2 * ring);
+
+                    int edge = (index_id - num.z - tn) / n;
+                    int index = (index_id - num.z - tn) - edge * n;
+
+                    int2 outsideInsideEdgePointBaseOffset = processedTessFactors.insideEdgePointBaseOffset
+                        + int2(0, 3 * (processedTessFactors.numPointsForOutsideInside.w - 3))
+                        + ((processedTessFactors.numPointsForOutsideInside.w - (ring + startRing)) + int2(1, -1)) * (ring - startRing - 1) * 3;
+
+                    int numPointsForInsideEdge = processedTessFactors.numPointsForOutsideInside.w - 2 * ring;
+                    int numLastPointsForInsideEdge = numPointsForInsideEdge + 2;
+
+                    if (edge < 2)
+                    {
+                        pt = AStitchRegular(true, DIAGONALS_MIRRORED,
+                                    numPointsForInsideEdge,
+                                    outsideInsideEdgePointBaseOffset + (int2(numLastPointsForInsideEdge, numPointsForInsideEdge) - 1) * edge,
+                                    index);
+                        if (pt != -1)
+                        {
+                            pt = TransformIndex1(pt, base_vertex);
+                        }
+                    }
+                    else
+                    {
+                        INDEX_PATCH_CONTEXT IndexPatchContext;
+                        IndexPatchContext.insidePointIndexDeltaToRealValue    = outsideInsideEdgePointBaseOffset.y + (numPointsForInsideEdge - 1) * 2;
+                        IndexPatchContext.insidePointIndexBadValue            = numPointsForInsideEdge - 1;
+                        IndexPatchContext.insidePointIndexReplacementValue    = outsideInsideEdgePointBaseOffset.y;
+                        IndexPatchContext.outsidePointIndexPatchBase          = IndexPatchContext.insidePointIndexBadValue+1; // past inside patched index range
+                        IndexPatchContext.outsidePointIndexDeltaToRealValue   = outsideInsideEdgePointBaseOffset.x + (numLastPointsForInsideEdge - 1) * 2 
+                                                                                    - IndexPatchContext.outsidePointIndexPatchBase;
+                        IndexPatchContext.outsidePointIndexBadValue           = IndexPatchContext.outsidePointIndexPatchBase
+                                                                                    + numLastPointsForInsideEdge - 1;
+                        IndexPatchContext.outsidePointIndexReplacementValue   = outsideInsideEdgePointBaseOffset.x;
+
+                        pt = AStitchRegular(true, DIAGONALS_MIRRORED,
+                                        numPointsForInsideEdge,
+                                        int2(numPointsForInsideEdge, 0),
+                                        index);
+                        if (pt != -1)
+                        {
+                            pt = TransformIndex2(pt, base_vertex, IndexPatchContext);
+                        }
+                    }
+                }
+            }
+
+            tessed_indices = pt;
+        }
+
+        TessedIndicesOut.Store(id*4, tessed_indices);
+    }       
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl
new file mode 100644
index 000000000..55bf1be87
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_TessellateVerticesCS.hlsl
@@ -0,0 +1,206 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSTessellationVertices
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_TessellateVerticesCS.hlsl
+//
+// The CS to tessellate vertices
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "TessellatorCS40_common.hlsl"
+
+StructuredBuffer<uint2> InputTriIDIndexID : register(t0);  // per output vertex: .x = triangle ID, .y = vertex ID within that triangle
+StructuredBuffer<float4> InputEdgeFactor : register(t1);   // per triangle: value handed to TriProcessTessFactors as the outside/inside factors
+
+struct TessedVertex
+{
+    uint BaseTriID;  // written with the ID of the triangle this vertex was generated from
+    float2 bc;       // written with the (u, v) pair computed by CSTessellationVertices
+};
+RWStructuredBuffer<TessedVertex> TessedVerticesOut : register(u0);  // output: one TessedVertex per thread, indexed by thread ID
+
+cbuffer cbCS : register(b1)
+{
+    uint4 g_param;  // only .x is read in this file: upper bound (exclusive) on the thread IDs that do work
+}
+
+void PlacePointIn1D(PROCESSED_TESS_FACTORS_TRI processedTessFactors, int ctx_index, int pt, out float location, int parity)
+{
+    // Writes to 'location' the 1D position of point 'pt' along the edge selected by ctx_index
+    // (the component of the outsideInside* fields to use); 'parity' is that edge's factor parity.
+    const float halfTessFactor = processedTessFactors.outsideInsideHalfTessFactor[ctx_index];
+    const int halfPointCount = int(ceil(halfTessFactor));
+
+    // Points at or beyond halfPointCount are folded back onto the first half here, and the
+    // computed location is reflected (1 - location) at the end.
+    const bool mirrored = (pt >= halfPointCount);
+    if( mirrored )
+    {
+        pt = (halfPointCount << 1) - pt;
+        if( TESSELLATOR_PARITY_ODD == parity )
+        {
+            pt -= 1;
+        }
+    }
+
+    if( pt == halfPointCount )
+    {
+        // The folded index landed exactly on the half-way count: the point sits at the middle.
+        location = 0.5f;
+    }
+    else
+    {
+        // Index of the point when counted on the ceil layout and on the floor layout; beyond the
+        // split point the floor index is one less than the ceil index.
+        const unsigned int ceilIndex = pt;
+        const unsigned int floorIndex = (pt > processedTessFactors.outsideInsideSplitPointOnFloorHalfTessFactor[ctx_index]) ? (ceilIndex - 1) : ceilIndex;
+        const float floorLocation = floorIndex * processedTessFactors.outsideInsideInvNumSegmentsOnFloorTessFactor[ctx_index];
+        const float ceilLocation = ceilIndex * processedTessFactors.outsideInsideInvNumSegmentsOnCeilTessFactor[ctx_index];
+
+        // Blend the two candidate locations by the fractional part of the half tess factor.
+        location = lerp(floorLocation, ceilLocation, frac(halfTessFactor));
+
+        if( mirrored )
+        {
+            location = 1.0f - location;
+        }
+    }
+}
+
+[numthreads(128, 1, 1)] // each thread produces one TessedVertex (source triangle ID + uv) in TessedVerticesOut
+void CSTessellationVertices( uint3 DTid : SV_DispatchThreadID, uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex  )
+{
+    uint id = DTid.x;
+    //uint id = Gid.x * 128 + GI; // Workaround for some CS4x preview drivers
+    
+    if ( id < g_param.x )
+    {
+        uint tri_id = InputTriIDIndexID[id].x;
+        uint vert_id = InputTriIDIndexID[id].y;
+        
+        float4 outside_inside_factor = InputEdgeFactor[tri_id];
+
+        PROCESSED_TESS_FACTORS_TRI processedTessFactors;
+        int num_points = TriProcessTessFactors(outside_inside_factor, processedTessFactors, g_partitioning);
+
+        float2 uv = float2(0, 0); // defined default: the odd-parity "past the last ring edge" path below assigns nothing
+        if (3 == num_points)
+        {
+            if (0 == vert_id)
+            {
+                uv = float2(0, 1);
+            }
+            else if (1 == vert_id)
+            {
+                uv = float2(0, 0);
+            }
+            else
+            {
+                uv = float2(1, 0);
+            }
+        }
+        else
+        {
+            if (vert_id < processedTessFactors.insideEdgePointBaseOffset)
+            {
+                // Generate exterior ring edge points, clockwise starting from point V (VW, the U==0 edge)
+
+                int edge; // which outside edge the vertex lies on; vert_id is made relative to that edge below
+                if (vert_id < processedTessFactors.numPointsForOutsideInside.x - 1)
+                {
+                    edge = 0;
+                }
+                else
+                {
+                    vert_id -= processedTessFactors.numPointsForOutsideInside.x - 1;
+                    if (vert_id < processedTessFactors.numPointsForOutsideInside.y - 1)
+                    {
+                        edge = 1;
+                    }
+                    else
+                    {
+                        vert_id -= processedTessFactors.numPointsForOutsideInside.y - 1;
+                        edge = 2;
+                    }
+                }
+                
+                int p = vert_id;
+                int endPoint = processedTessFactors.numPointsForOutsideInside[edge] - 1;
+                float param;
+                int q = (edge & 0x1) ? p : endPoint - p; // whether to reverse point order given we are defining V or U (W implicit):
+                                                     // edge0, VW, has V decreasing, so reverse 1D points below
+                                                     // edge1, WU, has U increasing, so don't reverse 1D points  below
+                                                     // edge2, UV, has U decreasing, so reverse 1D points below
+                PlacePointIn1D(processedTessFactors, edge,q,param, processedTessFactors.outsideInsideTessFactorParity[edge]);
+                if (0 == edge)
+                {
+                    uv = float2(0, param);
+                }
+                else if (1 == edge)
+                {
+                    uv = float2(param, 0);
+                }
+                else
+                {
+                    uv = float2(param, 1 - param);
+                }
+            }
+            else
+            {
+                // Generate interior ring points, clockwise spiralling in
+
+                uint index = vert_id - processedTessFactors.insideEdgePointBaseOffset;
+                uint ring = 1 + (((3 * processedTessFactors.numPointsForOutsideInside.w - 6) - sqrt(sqr(3 * processedTessFactors.numPointsForOutsideInside.w - 6) - 4 * 3 * index)) + 0.001f) / 6;
+                index -= 3 * (processedTessFactors.numPointsForOutsideInside.w - ring - 1) * (ring - 1); // index is now relative to the first point of its ring
+
+                uint startPoint = ring;
+                uint endPoint = processedTessFactors.numPointsForOutsideInside.w - 1 - startPoint;
+                if (index < 3 * (endPoint - startPoint))
+                {
+                    uint edge = index / (endPoint - startPoint);
+                    uint p = index - edge * (endPoint - startPoint) + startPoint;
+
+                    int perpendicularAxisPoint = startPoint;
+                    float perpParam;
+                    PlacePointIn1D(processedTessFactors, 3, perpendicularAxisPoint, perpParam, processedTessFactors.outsideInsideTessFactorParity.w);
+                    perpParam = perpParam * 2 / 3;
+                    
+                    float param;
+                    int q = (edge & 0x1) ? p : endPoint - (p - startPoint); // whether to reverse point given we are defining V or U (W implicit):
+                                                             // edge0, VW, has V decreasing, so reverse 1D points below
+                                                             // edge1, WU, has U increasing, so don't reverse 1D points  below
+                                                             // edge2, UV, has U decreasing, so reverse 1D points below
+                    PlacePointIn1D(processedTessFactors, 3, q,param, processedTessFactors.outsideInsideTessFactorParity.w);
+                    // edge0 VW, has perpendicular parameter U constant
+                    // edge1 WU, has perpendicular parameter V constant
+                    // edge2 UV, has perpendicular parameter W constant 
+                    const unsigned int deriv = 2; // reciprocal is the rate of change of edge-parallel parameters as they are pushed into the triangle
+                    if (0 == edge)
+                    {
+                        uv = float2(perpParam, param - perpParam / deriv);
+                    }
+                    else if (1 == edge)
+                    {
+                        uv = float2(param - perpParam / deriv, perpParam);
+                    }
+                    else
+                    {
+                        uv = float2(param - perpParam / deriv, 1 - (param - perpParam / deriv + perpParam));
+                    }
+                }
+                else
+                {
+                    if( processedTessFactors.outsideInsideTessFactorParity.w != TESSELLATOR_PARITY_ODD )
+                    {
+                        // Last point is the point at the center.
+                        uv = 1 / 3.0f;
+                    }
+                }
+            }
+        }
+        
+        TessedVerticesOut[id].BaseTriID = tri_id;
+        TessedVerticesOut[id].bc = uv;
+    }    
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl
new file mode 100644
index 000000000..309044cdb
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_common.hlsl
@@ -0,0 +1,411 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_common.hlsl
+//
+// The common utils included by other shaders in the sample
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "TessellatorCS40_defines.h"
+
+cbuffer cbNeverChanges : register(b0) // Lookup tables; NumStitchTransition reads the .y component of column MAX_FACTOR / 2 + 1 of a row.
+{
+    uint4 insidePointIndex[MAX_FACTOR / 2 + 1][MAX_FACTOR / 2 + 2];
+    uint4 outsidePointIndex[MAX_FACTOR / 2 + 1][MAX_FACTOR / 2 + 2];
+}
+
+#define D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR    ( 64 ) // upper clamp for integer, pow2 and fractional-even partitioning
+#define D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR     ( 63 ) // upper clamp for fractional-odd partitioning
+#define D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR     ( 2 ) // lower clamp for fractional-even partitioning
+#define D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR      ( 1 ) // lower clamp for integer, pow2 and fractional-odd partitioning
+
+#define D3D11_TESSELLATOR_PARTITIONING_INTEGER            ( 0 ) // values accepted by the `partitioning` argument of TriProcessTessFactors
+#define D3D11_TESSELLATOR_PARTITIONING_POW2               ( 1 )
+#define D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD     ( 2 )
+#define D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN    ( 3 )
+    
+#define TESSELLATOR_PARITY_EVEN                           ( 0 ) // per-factor parity tags stored in outsideInsideTessFactorParity
+#define TESSELLATOR_PARITY_ODD                            ( 1 )
+
+#define EPSILON 1e-6f
+#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON (D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON/2) // edge-factor threshold tested under fractional-odd partitioning
+
+#define DIAGONALS_INSIDE_TO_OUTSIDE                       ( 0 ) // diagonal layouts selected by the `diagonals` argument of the *StitchRegular helpers
+#define DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE         ( 1 )
+#define DIAGONALS_MIRRORED                                ( 2 )
+
+
+// This is moved to macro defines at shader compile time, so that the partitioning mode can be changed during runtime
+//#define g_partitioning (D3D11_TESSELLATOR_PARTITIONING_POW2)
+
+
+struct PROCESSED_TESS_FACTORS_TRI // Filled by TriProcessTessFactors; in every 4-vector xyz are the three edge (outside) factors and w is the inside factor.
+{
+    float4 outsideInsideTessFactor; // factors after clamping and integer/pow2 rounding
+    int4 outsideInsideTessFactorParity; // TESSELLATOR_PARITY_EVEN or TESSELLATOR_PARITY_ODD per factor
+
+    float4 outsideInsideInvNumSegmentsOnFloorTessFactor; // the next four members are the outputs of ComputeTessFactorContext
+    float4 outsideInsideInvNumSegmentsOnCeilTessFactor;
+    float4 outsideInsideHalfTessFactor;
+    int4 outsideInsideSplitPointOnFloorHalfTessFactor; 
+
+    // Stuff below is specific to the traversal order 
+    uint4 numPointsForOutsideInside; // NumPointsForTessFactor result; .w is raised to at least 3 (even parity) or 4 (odd parity)
+    uint insideEdgePointBaseOffset; // sum of the three edge point counts minus 3
+};
+
+struct INDEX_PATCH_CONTEXT // NOTE(review): not referenced by the other code in this file; presumably consumed by the index-tessellation shader -- confirm.
+{
+    int insidePointIndexDeltaToRealValue;
+    int insidePointIndexBadValue;
+    int insidePointIndexReplacementValue;
+    int outsidePointIndexPatchBase;
+    int outsidePointIndexDeltaToRealValue;
+    int outsidePointIndexBadValue;
+    int outsidePointIndexReplacementValue;
+};
+
+bool4 isEven(float4 input) // Per component: true when the value, converted to uint, has its lowest bit clear.
+{
+    return (((uint4)input) & 1) == 0;
+}
+
+uint RemoveMSB(uint val) // Returns val with its most-significant set bit cleared; returns 0 when val is 0.
+{
+    // Start probing at the top bit of the byte that can hold val's highest set bit.
+    int probe;
+    if( val > 0x0000ffff )
+    {
+        probe = ( val > 0x00ffffff ) ? 0x80000000 : 0x00800000;
+    }
+    else
+    {
+        probe = ( val > 0x000000ff ) ? 0x00008000 : 0x00000080;
+    }
+    // Walk down through that byte's eight bit positions; the first hit is the top set bit.
+    for (int step = 0; step < 8; ++step)
+    {
+        if( val & probe ) return (val & ~probe);
+        probe >>= 1;
+    }
+    return 0;
+}
+
+uint4 NumPointsForTessFactor(float4 tessFactor, int4 parity) // Per component: 2*ceil(f/2 + 0.5) for odd parity, 2*ceil(f/2) + 1 otherwise.
+{
+    return (parity != TESSELLATOR_PARITY_ODD) ? 2 * uint4(ceil(tessFactor / 2)) + 1 : 2 * uint4(ceil(tessFactor / 2 + 0.5f));
+}
+
+void ComputeTessFactorContext(float4 tessFactor, int4 parity, // Derives per-component half-factor data from each tess factor and its parity tag.
+    out float4 invNumSegmentsOnFloorTessFactor, // 1 / segment count derived from floor(halfTessFactor)
+    out float4 invNumSegmentsOnCeilTessFactor, // 1 / segment count derived from ceil(halfTessFactor)
+    out float4 halfTessFactor, // tessFactor / 2, plus 0.5 where the adjustment below applies
+    out int4 splitPointOnFloorHalfTessFactor) // see the per-component loop below
+{
+    halfTessFactor = tessFactor / 2;
+    // Add 0.5 where parity is odd or where the half factor is exactly 0.5 (i.e. tessFactor == 1).
+    halfTessFactor += 0.5 * ((TESSELLATOR_PARITY_ODD == parity) | (0.5f == halfTessFactor));
+    
+    float4 floorHalfTessFactor = floor(halfTessFactor);
+    float4 ceilHalfTessFactor = ceil(halfTessFactor);
+    int4 numHalfTessFactorPoints = int4(ceilHalfTessFactor);
+    
+    for (int index = 0; index < 4; ++ index)
+    {
+        if( ceilHalfTessFactor[index] == floorHalfTessFactor[index] ) // half factor is a whole number
+        {
+            splitPointOnFloorHalfTessFactor[index] =  /*pick value to cause this to be ignored*/ numHalfTessFactorPoints[index]+1;
+        }
+        else if( TESSELLATOR_PARITY_ODD == parity[index] )
+        {
+            if( floorHalfTessFactor[index] == 1 )
+            {
+                splitPointOnFloorHalfTessFactor[index] = 0;
+            }
+            else
+            {
+                splitPointOnFloorHalfTessFactor[index] = (RemoveMSB(int(floorHalfTessFactor[index]) - 1) << 1) + 1; // odd parity uses floor - 1
+            }
+        }
+        else
+        {
+            splitPointOnFloorHalfTessFactor[index] = (RemoveMSB(int(floorHalfTessFactor[index])) << 1) + 1;
+        }
+    }
+    // Segment counts are twice the floor/ceil half factor, reduced by s for odd parity.
+    int4 numFloorSegments = int4(floorHalfTessFactor * 2);
+    int4 numCeilSegments = int4(ceilHalfTessFactor * 2);
+    int4 s = (TESSELLATOR_PARITY_ODD == parity); // comparison result converted to int4
+    numFloorSegments -= s;
+    numCeilSegments -= s;
+    invNumSegmentsOnFloorTessFactor = 1.0f / numFloorSegments;
+    invNumSegmentsOnCeilTessFactor = 1.0f / numCeilSegments;
+}
+
+int TriProcessTessFactors( inout float4 tessFactor, // clamped and rounded in place; xyz = edge factors, w = inside factor
+                           out PROCESSED_TESS_FACTORS_TRI processedTessFactors, // zero-initialised first, then filled as far as the function gets
+                           int partitioning ) // one of D3D11_TESSELLATOR_PARTITIONING_*
+{ // Returns 0 when the patch is culled, 3 for the all-factors-equal-1 special case, otherwise the total point count.
+    processedTessFactors = (PROCESSED_TESS_FACTORS_TRI)0;
+    
+    int parity = TESSELLATOR_PARITY_EVEN;
+    switch( partitioning )
+    {
+        case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
+        default: // D3D11_TESSELLATOR_PARTITIONING_POW2 has no case label here, so it also lands on default
+            break;
+        case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
+            parity = TESSELLATOR_PARITY_ODD;
+            break;
+        case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
+            parity = TESSELLATOR_PARITY_EVEN;
+            break;
+    }
+
+    // Is the patch culled?
+    if( !(tessFactor.x > 0) || // NaN will pass
+        !(tessFactor.y > 0) ||
+        !(tessFactor.z > 0) )
+    {
+        return 0; // processedTessFactors stays all-zero on this path
+    }
+
+    // Clamp edge TessFactors
+    float lowerBound, upperBound;
+    switch(partitioning)
+    {
+        case D3D11_TESSELLATOR_PARTITIONING_INTEGER:
+        case D3D11_TESSELLATOR_PARTITIONING_POW2: // don't care about pow2 distinction for validation, just treat as integer
+        default:
+            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
+            upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
+            break;
+         
+        case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN:
+            lowerBound = D3D11_TESSELLATOR_MIN_EVEN_TESSELLATION_FACTOR;
+            upperBound = D3D11_TESSELLATOR_MAX_EVEN_TESSELLATION_FACTOR;
+            break;
+
+        case D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD:
+            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR;
+            upperBound = D3D11_TESSELLATOR_MAX_ODD_TESSELLATION_FACTOR;
+            break;
+    }
+
+    tessFactor.xyz = min( upperBound, max( lowerBound, tessFactor.xyz ) );
+
+    // Clamp inside TessFactors
+    if(D3D11_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD == partitioning)
+    {
+        if( (tessFactor.x > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
+            (tessFactor.y > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) ||
+            (tessFactor.z > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON)) 
+            // Don't need the same check for insideTessFactor for tri patches, 
+            // since there is only one insideTessFactor, as opposed to quad 
+            // patches which have 2 insideTessFactors.
+        {
+            // Force picture frame
+            lowerBound = D3D11_TESSELLATOR_MIN_ODD_TESSELLATION_FACTOR + EPSILON; // raises only the inside factor's lower clamp; edges were clamped above
+        }
+    }
+
+    tessFactor.w = min( upperBound, max( lowerBound, tessFactor.w ) );
+    // Note the above clamps map NaN to lowerBound
+
+    if (partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)
+    {
+        tessFactor = ceil(tessFactor);
+    }
+    else if (partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)
+    {
+        static const int exponentMask = 0x7f800000;
+        static const int mantissaMask = 0x007fffff;
+        static const int exponentLSB = 0x00800000;
+
+        int4 bits = asint(tessFactor);
+        tessFactor = bits & mantissaMask ? asfloat((bits & exponentMask) + exponentLSB) : tessFactor; // non-zero mantissa: clear it and bump the exponent, i.e. round up to the next power of two
+    }
+
+    // Process tessFactors
+    if ((partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)|| (partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2))
+    {
+        bool4 e = isEven(tessFactor);
+        processedTessFactors.outsideInsideTessFactorParity.xyz = e.xyz ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD;
+        processedTessFactors.outsideInsideTessFactorParity.w = (e.w || (1 == tessFactor.w)) ? TESSELLATOR_PARITY_EVEN : TESSELLATOR_PARITY_ODD; // an inside factor of exactly 1 is tagged even
+    }
+    else
+    {
+        processedTessFactors.outsideInsideTessFactorParity = parity; // fractional modes: one parity for all four factors
+    }
+    
+    processedTessFactors.outsideInsideTessFactor = tessFactor;
+
+    if (((partitioning == D3D11_TESSELLATOR_PARTITIONING_INTEGER)|| (partitioning == D3D11_TESSELLATOR_PARTITIONING_POW2)) || (parity == TESSELLATOR_PARITY_ODD))
+    {
+        // Special case if all TessFactors are 1 
+        if( (1 == processedTessFactors.outsideInsideTessFactor.x) &&
+            (1 == processedTessFactors.outsideInsideTessFactor.y) &&
+            (1 == processedTessFactors.outsideInsideTessFactor.z) &&
+            (1 == processedTessFactors.outsideInsideTessFactor.w) )
+        {
+            return 3; // only the factor and parity members have been filled at this point
+        }
+    }
+
+    // Compute per-TessFactor metadata
+    ComputeTessFactorContext(processedTessFactors.outsideInsideTessFactor, processedTessFactors.outsideInsideTessFactorParity,
+                             processedTessFactors.outsideInsideInvNumSegmentsOnFloorTessFactor,
+                             processedTessFactors.outsideInsideInvNumSegmentsOnCeilTessFactor,
+                             processedTessFactors.outsideInsideHalfTessFactor,
+                             processedTessFactors.outsideInsideSplitPointOnFloorHalfTessFactor);
+
+    // Compute some initial data.
+
+    // outside edge offsets and storage
+    processedTessFactors.numPointsForOutsideInside = NumPointsForTessFactor(processedTessFactors.outsideInsideTessFactor, processedTessFactors.outsideInsideTessFactorParity);
+    int NumPoints = processedTessFactors.numPointsForOutsideInside.x + processedTessFactors.numPointsForOutsideInside.y + processedTessFactors.numPointsForOutsideInside.z - 3; // presumably -3 because adjacent edges share corner points -- confirm
+
+    // inside edge offsets
+    {
+        uint pointCountMin = (processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD) ? 4 : 3;
+        // max() allows degenerate transition regions when inside TessFactor == 1
+        processedTessFactors.numPointsForOutsideInside.w = max(pointCountMin, processedTessFactors.numPointsForOutsideInside.w);
+    }
+
+    processedTessFactors.insideEdgePointBaseOffset = NumPoints; // outside-edge point total, recorded before interior points are added
+
+    // inside storage, including interior edges above
+    {
+        int numInteriorRings = (processedTessFactors.numPointsForOutsideInside.w >> 1) - 1; 
+        int numInteriorPoints;
+        if( processedTessFactors.outsideInsideTessFactorParity.w == TESSELLATOR_PARITY_ODD )
+        {
+            numInteriorPoints = 3*(numInteriorRings*(numInteriorRings+1) - numInteriorRings);
+        }
+        else
+        {
+            numInteriorPoints = 3*(numInteriorRings*(numInteriorRings+1)) + 1; // the +1 only appears on the even-parity path
+        }
+        NumPoints += numInteriorPoints;
+    }
+    
+    return NumPoints;
+}
+
+int NumStitchRegular(bool bTrapezoid, int diagonals, int numInsideEdgePoints)
+{
+    // Index count for one stitch: 8 extra indices when bTrapezoid is set,
+    // plus a term that is linear in numInsideEdgePoints and depends on the
+    // diagonal layout.
+    // A `diagonals` value matching none of the three layouts contributes
+    // nothing beyond the trapezoid term.
+    const int trapezoidCount = bTrapezoid ? 8 : 0;
+
+    int layoutCount = 0;
+    if( DIAGONALS_INSIDE_TO_OUTSIDE == diagonals )
+    {
+        // Diagonals pointing from inside edge forward towards outside edge
+        layoutCount = 5 * numInsideEdgePoints - 5;
+    }
+    else if( DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE == diagonals ) // Assumes ODD tessellation
+    {
+        // Diagonals pointing from outside edge forward towards inside edge
+        layoutCount = 5 * numInsideEdgePoints - 2;
+    }
+    else if( DIAGONALS_MIRRORED == diagonals )
+    {
+        layoutCount = 2 * numInsideEdgePoints + 5;
+    }
+
+    return trapezoidCount + layoutCount;
+}
+
+uint TotalNumStitchRegular(bool bTrapezoid, int diagonals,
+                                 int numPointsForInsideTessFactor, int ring)
+{
+    // Every term below is multiplied by (ring - 1), so the result is 0 for
+    // ring == 1.  Presumably this is the running index total for the rings
+    // that precede `ring` -- confirm against the callers.
+    const int ringsBefore = ring - 1;
+
+    uint total = 0;
+    if( bTrapezoid ) total += 8 * ringsBefore;
+
+    if( DIAGONALS_INSIDE_TO_OUTSIDE == diagonals )
+    {
+        // Diagonals pointing from inside edge forward towards outside edge
+        total += (5 * numPointsForInsideTessFactor - 35 - 5 * ring) * ringsBefore;
+    }
+    else if( DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE == diagonals ) // Assumes ODD tessellation
+    {
+        // Diagonals pointing from outside edge forward towards inside edge
+        total += (5 * numPointsForInsideTessFactor - 12 - 5 * ring) * ringsBefore;
+    }
+    else if( DIAGONALS_MIRRORED == diagonals ) // any other layout value adds nothing
+    {
+        total += (2 * numPointsForInsideTessFactor + 1 - 2 * ring) * ringsBefore;
+    }
+
+    return total;
+}
+
+int sqr(int x) // Returns x multiplied by itself.
+{
+    const int squared = x * x; return squared;
+}
+
+int GetRingFromIndexStitchRegular(bool bTrapezoid, int diagonals, int numPointsForInsideTessFactor, int index)
+{
+    // Maps `index` to a ring number with a quadratic-formula style expression
+    // (b - sqrt(b*b - 4*c*k)) / (2*c), where b = t + c and k = t + index.
+    // Returns -1 for an unrecognised layout.  NOTE(review): looks like the
+    // inverse of TotalNumStitchRegular with counts scaled by 3 -- confirm.
+    const int trapezoidTerm = bTrapezoid ? 8 : 0;
+
+    int t;  // layout-dependent linear coefficient
+    int c;  // layout-dependent quadratic coefficient (15 or 6)
+    if( DIAGONALS_INSIDE_TO_OUTSIDE == diagonals )
+    {
+        t = (5 * numPointsForInsideTessFactor - (35 - trapezoidTerm)) * 3; c = 15;
+    }
+    else if( DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE == diagonals )
+    {
+        t = (5 * numPointsForInsideTessFactor - (12 - trapezoidTerm)) * 3; c = 15;
+    }
+    else if( DIAGONALS_MIRRORED == diagonals )
+    {
+        t = ((trapezoidTerm + 1) + 2 * numPointsForInsideTessFactor) * 3; c = 6;
+    }
+    else return -1;
+
+    return 1 + uint((t + c) - sqrt(sqr(t + c) - 4 * c * (t + index)) + 0.001f) / (2 * c);
+}
+
+uint3 NumStitchTransition(int4 outsideInsideNumHalfTessFactorPoints,
+                                    int4 outsideInsideEdgeTessFactorParity)
+{
+    // Per-edge index counts built from the constant-buffer lookup tables.
+    const int lastColumn = MAX_FACTOR / 2 + 1;
+    const int insideParity = outsideInsideEdgeTessFactorParity.w;
+
+    // Components with odd parity use the table row one below their point count.
+    int4 row = outsideInsideNumHalfTessFactorPoints - (TESSELLATOR_PARITY_ODD == outsideInsideEdgeTessFactorParity);
+
+    // The inside-factor contribution is the starting value of all three outputs.
+    uint3 num_index = insidePointIndex[row.w][lastColumn].y * 8;
+
+    [unroll]
+    for (int edge = 0; edge < 3; ++ edge)
+    {
+        const int edgeParity = outsideInsideEdgeTessFactorParity[edge];
+        num_index[edge] += outsidePointIndex[row[edge]][lastColumn].y * 8;
+
+        if( insideParity != edgeParity )
+            num_index[edge] += 4;   // parities differ
+        else if( insideParity == TESSELLATOR_PARITY_ODD )
+            num_index[edge] += 5;   // both odd; both even adds nothing
+    }
+
+    return num_index;
+}
diff --git a/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h
new file mode 100644
index 000000000..6b4382393
--- /dev/null
+++ b/tests/hlsl/dxsdk/AdaptiveTessellationCS40/TessellatorCS40_defines.h
@@ -0,0 +1,9 @@
+//--------------------------------------------------------------------------------------
+// File: TessellatorCS40_defines.h
+//
+// This file defines common constants which are included by both CPU code and shader code
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#define MAX_FACTOR 16 // sizes the insidePointIndex / outsidePointIndex tables declared in TessellatorCS40_common.hlsl
diff --git a/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl
new file mode 100644
index 000000000..1e40c80ef
--- /dev/null
+++ b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC6HEncode.hlsl
@@ -0,0 +1,2567 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: BC6HEncode.hlsl
+//
+// The Compute Shader for BC6H Encoder
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//#define REF_DEVICE
+
+#define UINTLENGTH            32
+#define NCHANNELS             3
+#define SIGNED_F16            96 // the two values g_format can take (see cbCS below)
+#define UNSIGNED_F16          95
+#define MAX_FLOAT             asfloat(0x7F7FFFFF) // bit pattern of the largest finite float
+#define MIN_FLOAT             asfloat(0xFF7FFFFF) // bit pattern of the most negative finite float
+#define MAX_INT               asint(0x7FFFFFFF)
+#define MIN_INT               asint(0x80000000)
+
+cbuffer cbCS : register( b0 )
+{
+    uint g_tex_width;
+    uint g_num_block_x;       // divisor that turns a block ID into (block_x, block_y)
+    uint g_format;            //either SIGNED_F16 for DXGI_FORMAT_BC6H_SF16 or UNSIGNED_F16 for DXGI_FORMAT_BC6H_UF16
+    uint g_mode_id;           // index into candidateModePrec used by TryModeLE10CS
+    uint g_start_block_id;    // added to the group-derived block index
+    uint g_num_total_blocks;  // threads whose block ID is >= this return early (unless REF_DEVICE is defined)
+};
+
+static const uint candidateModeMemory[14] = { 0x00, 0x01, 0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F }; // NOTE(review): presumably the mode bits written into the encoded block -- confirm at the use site
+static const uint candidateModeFlag[14] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; // 1-based mode number for each table entry
+static const bool candidateModeTransformed[14] = { true, true, true, true, true, true, true, true, true, false, false, true, true, true }; // when true, TryModeG10CS stores the second endpoint as a delta from the first before finish_quantize
+static const uint4 candidateModePrec[14] = { uint4(10,5,5,5), uint4(7,6,6,6), // per-mode precisions; .x is the value passed to quantize/unquantize
+    uint4(11,5,4,4), uint4(11,4,5,4), uint4(11,4,4,5), uint4(9,5,5,5),
+    uint4(8,6,5,5), uint4(8,5,6,5), uint4(8,5,5,6), uint4(6,6,6,6),
+    uint4(10,10,10,10), uint4(11,9,9,9), uint4(12,8,8,8), uint4(16,4,4,4) };
+
+/*static const uint4x4 candidateSection[32] = 
+{
+    {0,0,1,1, 0,0,1,1, 0,0,1,1, 0,0,1,1}, {0,0,0,1, 0,0,0,1, 0,0,0,1, 0,0,0,1}, {0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1}, {0,0,0,1, 0,0,1,1, 0,0,1,1, 0,1,1,1},
+    {0,0,0,0, 0,0,0,1, 0,0,0,1, 0,0,1,1}, {0,0,1,1, 0,1,1,1, 0,1,1,1, 1,1,1,1}, {0,0,0,1, 0,0,1,1, 0,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,1, 0,0,1,1, 0,1,1,1},
+    {0,0,0,0, 0,0,0,0, 0,0,0,1, 0,0,1,1}, {0,0,1,1, 0,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,1, 0,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 0,0,0,1, 0,1,1,1},
+    {0,0,0,1, 0,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 1,1,1,1, 1,1,1,1, 1,1,1,1}, {0,0,0,0, 0,0,0,0, 0,0,0,0, 1,1,1,1},
+    {0,0,0,0, 1,0,0,0, 1,1,1,0, 1,1,1,1}, {0,1,1,1, 0,0,0,1, 0,0,0,0, 0,0,0,0}, {0,0,0,0, 0,0,0,0, 1,0,0,0, 1,1,1,0}, {0,1,1,1, 0,0,1,1, 0,0,0,1, 0,0,0,0},
+    {0,0,1,1, 0,0,0,1, 0,0,0,0, 0,0,0,0}, {0,0,0,0, 1,0,0,0, 1,1,0,0, 1,1,1,0}, {0,0,0,0, 0,0,0,0, 1,0,0,0, 1,1,0,0}, {0,1,1,1, 0,0,1,1, 0,0,1,1, 0,0,0,1},
+    {0,0,1,1, 0,0,0,1, 0,0,0,1, 0,0,0,0}, {0,0,0,0, 1,0,0,0, 1,0,0,0, 1,1,0,0}, {0,1,1,0, 0,1,1,0, 0,1,1,0, 0,1,1,0}, {0,0,1,1, 0,1,1,0, 0,1,1,0, 1,1,0,0},
+    {0,0,0,1, 0,1,1,1, 1,1,1,0, 1,0,0,0}, {0,0,0,0, 1,1,1,1, 1,1,1,1, 0,0,0,0}, {0,1,1,1, 0,0,0,1, 1,0,0,0, 1,1,1,0}, {0,0,1,1, 1,0,0,1, 1,0,0,1, 1,1,0,0}
+};*/
+
+static const uint candidateSectionBit[32] = // one 16-bit mask per partition: bit i set means pixel i belongs to the second endpoint pair (tested as (bit >> i) & 1)
+{
+    0xCCCC, 0x8888, 0xEEEE, 0xECC8,
+    0xC880, 0xFEEC, 0xFEC8, 0xEC80,
+    0xC800, 0xFFEC, 0xFE80, 0xE800,
+    0xFFE8, 0xFF00, 0xFFF0, 0xF000,
+    0xF710, 0x008E, 0x7100, 0x08CE,
+    0x008C, 0x7310, 0x3100, 0x8CCE,
+    0x088C, 0x3110, 0x6666, 0x366C,
+    0x17E8, 0x0FF0, 0x718E, 0x399C
+};
+
+static const uint candidateFixUpIndex1D[32] = // per partition: pixel index used for the second endpoint pair's swap test (the first pair uses pixel 0)
+{
+    15,15,15,15,
+    15,15,15,15,
+    15,15,15,15,
+    15,15,15,15,
+    15, 2, 8, 2,
+     2, 8, 8,15,
+     2, 8, 2, 2,
+     8, 8, 2, 2
+};
+
+// aStep1 maps a position 0..63 to the index (0..7) of the nearest of the weights 0, 9, 18, 27, 37, 46, 55, 64
+static const uint aStep1[64] = {0,0,0,0,0,1,1,1,
+                              1,1,1,1,1,1,2,2,
+                              2,2,2,2,2,2,2,3,
+                              3,3,3,3,3,3,3,3,
+                              3,4,4,4,4,4,4,4,
+                              4,4,5,5,5,5,5,5,
+                              5,5,5,6,6,6,6,6,
+                              6,6,6,6,7,7,7,7};
+                                  
+// aStep2 maps a position 0..63 to the index (0..15) of the nearest of the weights 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
+static const uint aStep2[64] = { 0, 0, 0, 1, 1, 1, 1, 2,
+                               2, 2, 2, 2, 3, 3, 3, 3,
+                               4, 4, 4, 4, 5, 5, 5, 5,
+                               6, 6, 6, 6, 6, 7, 7, 7,
+                               7, 8, 8, 8, 8, 9, 9, 9,
+                               9,10,10,10,10,10,11,11,
+                              11,11,12,12,12,12,13,13,
+                              13,13,14,14,14,14,15,15};
+
+static const float3 RGB2LUM = float3(0.2126f, 0.7152f, 0.0722f); // weights dotted with an RGB pixel to get the luminance used for endpoint selection
+
+#define THREAD_GROUP_SIZE    64 // numthreads X dimension of the entry points and length of shared_temp
+#define BLOCK_SIZE_Y         4
+#define BLOCK_SIZE_X         4
+#define BLOCK_SIZE           (BLOCK_SIZE_Y * BLOCK_SIZE_X) // 16 pixels per block
+
+
+// Forward declarations of helpers called by the entry points below.
+uint3 float2half( float3 pixel_f );
+int3 start_quantize( uint3 pixel_h );
+void quantize( inout int2x3 endPoint, uint prec );
+void finish_quantize_0( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed );
+void finish_quantize_1( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed );
+void finish_quantize( out bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed );
+
+void start_unquantize( inout int2x3 endPoint[2], uint4 prec, bool transformed ); // two-endpoint-pair overload
+void start_unquantize( inout int2x3 endPoint, uint4 prec, bool transformed ); // single-endpoint-pair overload
+void unquantize( inout int2x3 color, uint prec );
+uint3 finish_unquantize( int3 color );
+void generate_palette_unquantized8( out uint3 palette, int3 low, int3 high, int i );
+void generate_palette_unquantized16( out uint3 palette, int3 low, int3 high, int i );
+float3 half2float(uint3 color_h );
+
+void block_package( inout uint4 block, int2x3 endPoint[2], uint mode_type, uint partition_index ); // two-endpoint-pair overload
+void block_package( inout uint4 block, int2x3 endPoint, uint mode_type ); // single-endpoint-pair overload
+
+void swap(inout int3 lhs, inout int3 rhs) // Exchanges the two int3 values in place.
+{
+    const int3 previousRhs = rhs;
+    rhs = lhs;
+    lhs = previousRhs;
+}
+
+Texture2D<float4> g_Input : register( t0 ); // source image; the entry points load one texel per pixel of a 4x4 block
+StructuredBuffer<uint4> g_InBuff : register( t1 ); // per-block input records; TryModeLE10CS reads .x as a float error
+
+RWStructuredBuffer<uint4> g_OutBuff : register( u0 ); // per-block output records, indexed by block ID
+
+struct SharedData // Per-thread scratch record held in group-shared memory.
+{
+    float3 pixel;             // RGB as loaded from g_Input
+    int3 pixel_ph;            // start_quantize() of the half-float pixel
+    float3 pixel_hr;          // pixel after a float -> half -> float round trip
+    float pixel_lum;          // dot(pixel_hr, RGB2LUM)
+    float error;              // accumulated squared error of a candidate mode
+    uint best_mode;           // candidateModeFlag value of the best candidate so far
+    uint best_partition;
+    int3 endPoint_low;        // pixel_ph of the lowest-luminance pixel found so far
+    int3 endPoint_high;       // pixel_ph of the highest-luminance pixel found so far
+    float endPoint_lum_low;   // luminance paired with endPoint_low
+    float endPoint_lum_high;  // luminance paired with endPoint_high
+};
+
+groupshared SharedData shared_temp[THREAD_GROUP_SIZE]; // one slot per thread in the group, indexed by SV_GroupIndex
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void TryModeG10CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // Scores candidate table entries 10..13 (mode flags 11..14) per block and writes the best (error, mode) to g_OutBuff.
+{
+    const uint MAX_USED_THREAD = 16; // threads cooperating on one 4x4 block
+    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // 4 blocks per thread group
+    uint blockInGroup = GI / MAX_USED_THREAD;
+    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
+    uint threadBase = blockInGroup * MAX_USED_THREAD; // shared_temp index of this block's first thread
+    uint threadInBlock = GI - threadBase;
+
+#ifndef REF_DEVICE
+    if (blockID >= g_num_total_blocks)
+    {
+        return;
+    }
+#endif
+    
+    uint block_y = blockID / g_num_block_x;
+    uint block_x = blockID - block_y * g_num_block_x;
+    uint base_x = block_x * BLOCK_SIZE_X;
+    uint base_y = block_y * BLOCK_SIZE_Y;
+    
+    if (threadInBlock < 16) // one thread per pixel: load, convert, and seed both endpoints with the thread's own pixel
+    {
+        shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb;
+        uint3 pixel_h = float2half( shared_temp[GI].pixel );
+        shared_temp[GI].pixel_hr = half2float(pixel_h);
+        shared_temp[GI].pixel_lum = dot(shared_temp[GI].pixel_hr, RGB2LUM);
+        shared_temp[GI].pixel_ph = start_quantize( pixel_h );
+        
+        shared_temp[GI].endPoint_low = shared_temp[GI].pixel_ph;
+        shared_temp[GI].endPoint_high = shared_temp[GI].pixel_ph;
+        shared_temp[GI].endPoint_lum_low = shared_temp[GI].pixel_lum;
+        shared_temp[GI].endPoint_lum_high = shared_temp[GI].pixel_lum;
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync(); // barriers are compiled in only for REF_DEVICE. NOTE(review): otherwise the phases presumably rely on the 16 cooperating threads running in lockstep -- confirm
+#endif
+    
+    if (threadInBlock < 8) // reduction step, stride 8: keep the lower-luminance low endpoint and higher-luminance high endpoint
+    {
+        if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 8].endPoint_lum_low)
+        {
+            shared_temp[GI].endPoint_low = shared_temp[GI + 8].endPoint_low;
+            shared_temp[GI].endPoint_lum_low = shared_temp[GI + 8].endPoint_lum_low;
+        }
+        if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 8].endPoint_lum_high)
+        {
+            shared_temp[GI].endPoint_high = shared_temp[GI + 8].endPoint_high;
+            shared_temp[GI].endPoint_lum_high = shared_temp[GI + 8].endPoint_lum_high;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 4) // reduction step, stride 4
+    {
+        if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 4].endPoint_lum_low)
+        {
+            shared_temp[GI].endPoint_low = shared_temp[GI + 4].endPoint_low;
+            shared_temp[GI].endPoint_lum_low = shared_temp[GI + 4].endPoint_lum_low;
+        }
+        if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 4].endPoint_lum_high)
+        {
+            shared_temp[GI].endPoint_high = shared_temp[GI + 4].endPoint_high;
+            shared_temp[GI].endPoint_lum_high = shared_temp[GI + 4].endPoint_lum_high;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 2) // reduction step, stride 2
+    {
+        if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 2].endPoint_lum_low)
+        {
+            shared_temp[GI].endPoint_low = shared_temp[GI + 2].endPoint_low;
+            shared_temp[GI].endPoint_lum_low = shared_temp[GI + 2].endPoint_lum_low;
+        }
+        if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 2].endPoint_lum_high)
+        {
+            shared_temp[GI].endPoint_high = shared_temp[GI + 2].endPoint_high;
+            shared_temp[GI].endPoint_lum_high = shared_temp[GI + 2].endPoint_lum_high;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 1) // final reduction step: slot threadBase now holds the block's min/max-luminance endpoints
+    {
+        if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 1].endPoint_lum_low)
+        {
+            shared_temp[GI].endPoint_low = shared_temp[GI + 1].endPoint_low;
+            shared_temp[GI].endPoint_lum_low = shared_temp[GI + 1].endPoint_lum_low;
+        }
+        if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 1].endPoint_lum_high)
+        {
+            shared_temp[GI].endPoint_high = shared_temp[GI + 1].endPoint_high;
+            shared_temp[GI].endPoint_lum_high = shared_temp[GI + 1].endPoint_lum_high;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    // try mode_type 11..14 (candidate table entries 10..13)
+    if ( threadInBlock == 0 )
+    {
+        int2x3 endPoint;
+        // find_axis
+        endPoint[0] = shared_temp[threadBase + 0].endPoint_low;
+        endPoint[1] = shared_temp[threadBase + 0].endPoint_high;
+        
+        //compute_index
+        float3 span = endPoint[1] - endPoint[0];// fixed a bug in v0.2
+        float span_norm_sqr = dot( span, span );// fixed a bug in v0.2
+        float dotProduct = dot( span, shared_temp[threadBase + 0].pixel_ph - endPoint[0] );// fixed a bug in v0.2
+        if ( span_norm_sqr > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr ) > 32 ) // pixel 0 projects past position 32 of 0..63: swap so it falls in the lower half
+        {
+            swap(endPoint[0], endPoint[1]);
+
+            shared_temp[GI].endPoint_low = endPoint[0]; // GI == threadBase here, so this updates the block's shared endpoints
+            shared_temp[GI].endPoint_high = endPoint[1];
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    if (threadInBlock < 4) // threads 0..3 each score one candidate table entry (threadInBlock + 10)
+    {
+        int2x3 endPoint;
+        endPoint[0] = shared_temp[threadBase + 0].endPoint_low;
+        endPoint[1] = shared_temp[threadBase + 0].endPoint_high;
+        
+        float3 span = endPoint[1] - endPoint[0];
+        float span_norm_sqr = dot( span, span );
+            
+        uint4 prec = candidateModePrec[threadInBlock + 10];
+        int2x3 endPoint_q = endPoint;
+        quantize( endPoint_q, prec.x );
+
+        bool transformed = candidateModeTransformed[threadInBlock + 10];
+        if (transformed)
+        {
+            endPoint_q[1] -= endPoint_q[0]; // store the second endpoint as a delta from the first
+        }
+        
+        bool bBadQuantize;
+        finish_quantize( bBadQuantize, endPoint_q, prec, transformed );
+        
+        start_unquantize( endPoint_q, prec, transformed );
+        
+        unquantize( endPoint_q, prec.x );
+        
+        float error = 0;
+        [loop]for ( uint j = 0; j < 16; j ++ ) // sum the squared error over the block's 16 pixels
+        {
+            float dotProduct = dot( span, shared_temp[threadBase + j].pixel_ph - endPoint[0] );// fixed a bug in v0.2
+            uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
+                : ( ( dotProduct < span_norm_sqr ) ? aStep2[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep2[63] );
+                
+            uint3 pixel_rh;
+            generate_palette_unquantized16( pixel_rh, endPoint_q[0], endPoint_q[1], index );
+            float3 pixel_r = half2float( pixel_rh );
+            pixel_r -= shared_temp[threadBase + j].pixel_hr;
+            error += dot(pixel_r, pixel_r);
+        }
+        if ( bBadQuantize )
+            error = 1e20f; // a large sentinel so this candidate loses the comparisons below
+
+        shared_temp[GI].error = error;
+        shared_temp[GI].best_mode = candidateModeFlag[threadInBlock + 10];
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    
+    if (threadInBlock < 2) // min-error reduction, stride 2
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 2].error;
+            shared_temp[GI].best_mode = shared_temp[GI + 2].best_mode;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 1) // final min-error step, then publish the result for this block
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 1].error;
+            shared_temp[GI].best_mode = shared_temp[GI + 1].best_mode;
+        }
+        
+        g_OutBuff[blockID] = uint4(asuint(shared_temp[GI].error), shared_temp[GI].best_mode, 0, 0); // x = error bits, y = best mode flag
+    }
+}
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void TryModeLE10CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // Scores the mode selected by g_mode_id for 32 partition candidates of a 4x4 block and records the best one if it beats the result already in g_InBuff.
+{ // Each run of MAX_USED_THREAD (32) consecutive threads works on one block; thread k of a run evaluates partition candidate k.
+    const uint MAX_USED_THREAD = 32;
+    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // blocks handled per thread group
+    uint blockInGroup = GI / MAX_USED_THREAD; // which block of the group this thread belongs to
+    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; // linear block index
+    uint threadBase = blockInGroup * MAX_USED_THREAD; // first shared_temp slot owned by this block
+    uint threadInBlock = GI - threadBase; // 0..31 within the run
+    // NOTE(review): the early-outs below exist only without REF_DEVICE, the group barriers only with it; non-REF_DEVICE builds presumably rely on each 32-thread run executing in lockstep - confirm for the target hardware.
+#ifndef REF_DEVICE
+    if (blockID >= g_num_total_blocks)
+    {
+        return;
+    }
+    // g_InBuff[blockID].x carries the bits of the best error recorded so far; when it is already below 1e-6 the entry is passed through unchanged.
+    if (asfloat(g_InBuff[blockID].x) < 1e-6f)
+    {
+        g_OutBuff[blockID] = g_InBuff[blockID];
+        return;
+    }
+#endif
+    // Turn the linear block index into the coordinates of the block's first texel.
+    uint block_y = blockID / g_num_block_x;
+    uint block_x = blockID - block_y * g_num_block_x;
+    uint base_x = block_x * BLOCK_SIZE_X;
+    uint base_y = block_y * BLOCK_SIZE_Y;
+    // Threads 0..15 of the run each load one texel (four per row) and cache its derived forms in shared memory.
+    if (threadInBlock < 16)
+    {
+        shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb;
+        uint3 pixel_h = float2half( shared_temp[GI].pixel ); // half-float bit patterns
+        shared_temp[GI].pixel_hr = half2float(pixel_h); // texel after the round trip through half precision
+        shared_temp[GI].pixel_lum = dot(shared_temp[GI].pixel_hr, RGB2LUM); // luminance used to pick endpoints
+        shared_temp[GI].pixel_ph = start_quantize( pixel_h ); // integer form used for endpoint math
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    
+    //covers mode types 1:10: every thread scores one partition candidate for the mode selected by g_mode_id
+    if (threadInBlock < 32) // always true, since threadInBlock is GI modulo MAX_USED_THREAD
+    {
+        // find_axis
+        int2x3 endPoint[2]; // endPoint[s][0] / endPoint[s][1]: darkest / brightest pixel of subset s
+        endPoint[0][0] = MAX_INT;
+        endPoint[0][1] = MIN_INT;
+        endPoint[1][0] = MAX_INT;
+        endPoint[1][1] = MIN_INT;
+        // Luminance of the pixels currently held in endPoint, same indexing.
+        float2 endPoint_lum[2];
+        endPoint_lum[0][0] = MAX_FLOAT;
+        endPoint_lum[0][1] = MIN_FLOAT;
+        endPoint_lum[1][0] = MAX_FLOAT;
+        endPoint_lum[1][1] = MIN_FLOAT;
+        // Bit i of the mask assigns pixel i to subset 1 (set) or subset 0 (clear) for this candidate.
+        uint bit = candidateSectionBit[threadInBlock];
+        for ( uint i = 0; i < 16; i ++ )
+        {
+            int3 pixel_ph = shared_temp[threadBase + i].pixel_ph;
+            float pixel_lum = shared_temp[threadBase + i].pixel_lum;
+            if ( (bit >> i) & 1 ) //It gets error when using "candidateSection" as "endPoint_ph" index
+            {
+                if (endPoint_lum[1][0] > pixel_lum)
+                {
+                    endPoint[1][0] = pixel_ph;
+                    endPoint_lum[1][0] = pixel_lum;
+                }
+                if (endPoint_lum[1][1] < pixel_lum)
+                {
+                    endPoint[1][1] = pixel_ph;
+                    endPoint_lum[1][1] = pixel_lum;
+                }
+            }
+            else
+            {
+                if (endPoint_lum[0][0] > pixel_lum)
+                {
+                    endPoint[0][0] = pixel_ph;
+                    endPoint_lum[0][0] = pixel_lum;
+                }
+                if (endPoint_lum[0][1] < pixel_lum)
+                {
+                    endPoint[0][1] = pixel_ph;
+                    endPoint_lum[0][1] = pixel_lum;
+                }
+            }
+        }
+        
+        //compute_index
+        float3 span[2];// fixed a bug in v0.2
+        float span_norm_sqr[2];// fixed a bug in v0.2
+        [unroll]
+        for (uint p = 0; p < 2; ++ p)
+        {
+            span[p] = endPoint[p][1] - endPoint[p][0];
+            span_norm_sqr[p] = dot( span[p], span[p] );
+            // Reference pixel: pixel 0 for subset 0, the candidate's fix-up pixel for subset 1. If its position on the 0..63 scale exceeds 32, reverse the axis.
+            float dotProduct = dot( span[p], shared_temp[threadBase + (0 == p ? 0 : candidateFixUpIndex1D[threadInBlock])].pixel_ph - endPoint[p][0] );// fixed a bug in v0.2
+            if ( span_norm_sqr[p] > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr[p] ) > 32 )
+            {
+                span[p] = -span[p];
+                swap(endPoint[p][0], endPoint[p][1]);
+            }
+        }
+        // Quantize both endpoint pairs to the base precision (prec.x) of the mode under test.
+        uint4 prec = candidateModePrec[g_mode_id];
+        int2x3 endPoint_q[2] = endPoint;
+        quantize( endPoint_q[0], prec.x );
+        quantize( endPoint_q[1], prec.x );
+        // Transformed modes keep the other three endpoints as differences from endPoint_q[0][0].
+        bool transformed = candidateModeTransformed[g_mode_id];
+        if (transformed)
+        {
+            endPoint_q[0][1] -= endPoint_q[0][0];
+            endPoint_q[1][0] -= endPoint_q[0][0];
+            endPoint_q[1][1] -= endPoint_q[0][0];
+        }
+        // finish_quantize_* set bBadQuantize when a difference does not fit its field, and clamp/mask the values to field width.
+        int bBadQuantize = 0;
+        finish_quantize_0( bBadQuantize, endPoint_q[0], prec, transformed );
+        finish_quantize_1( bBadQuantize, endPoint_q[1], prec, transformed );
+        // Rebuild endpoint values from the stored fields (sign-extend, undo the transform, unquantize) before measuring the error.
+        start_unquantize( endPoint_q, prec, transformed );
+        
+        unquantize( endPoint_q[0], prec.x );
+        unquantize( endPoint_q[1], prec.x );
+        // Accumulate the squared difference between every reconstructed pixel and its half-precision source (pixel_hr).
+        float error = 0;
+        for ( uint j = 0; j < 16; j ++ )
+        {
+            uint3 pixel_rh;
+            if ((bit >> j) & 1)
+            {
+                float dotProduct = dot( span[1], shared_temp[threadBase + j].pixel_ph - endPoint[1][0] );// fixed a bug in v0.2
+                uint index = ( span_norm_sqr[1] <= 0 || dotProduct <= 0 ) ? 0
+                        : ( ( dotProduct < span_norm_sqr[1] ) ? aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr[1] ) ] : aStep1[63] );
+                generate_palette_unquantized8( pixel_rh, endPoint_q[1][0], endPoint_q[1][1], index );
+            }
+            else
+            {
+                float dotProduct = dot( span[0], shared_temp[threadBase + j].pixel_ph - endPoint[0][0] );// fixed a bug in v0.2
+                uint index = ( span_norm_sqr[0] <= 0 || dotProduct <= 0 ) ? 0
+                        : ( ( dotProduct < span_norm_sqr[0] ) ? aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr[0] ) ] : aStep1[63] );
+                generate_palette_unquantized8( pixel_rh, endPoint_q[0][0], endPoint_q[0][1], index );
+            }
+            // pixel_rh is the palette entry produced above; it is converted with half2float and compared in float.
+            float3 pixel_r = half2float( pixel_rh );
+            pixel_r -= shared_temp[threadBase + j].pixel_hr;
+            error += dot(pixel_r, pixel_r);
+        }
+        if ( bBadQuantize )
+            error = 1e20f; // rule this candidate out
+
+        shared_temp[GI].error = error;
+        shared_temp[GI].best_mode = candidateModeFlag[g_mode_id];
+        shared_temp[GI].best_partition = threadInBlock;
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    // Min-reduction over the 32 candidates: each step folds the upper half of the remaining slots into the lower half.
+    if (threadInBlock < 16)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 16].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 16].error;
+            shared_temp[GI].best_mode = shared_temp[GI + 16].best_mode;
+            shared_temp[GI].best_partition = shared_temp[GI + 16].best_partition;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 8)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 8].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 8].error;
+            shared_temp[GI].best_mode = shared_temp[GI + 8].best_mode;
+            shared_temp[GI].best_partition = shared_temp[GI + 8].best_partition;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 4)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 4].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 4].error;
+            shared_temp[GI].best_mode = shared_temp[GI + 4].best_mode;
+            shared_temp[GI].best_partition = shared_temp[GI + 4].best_partition;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 2)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 2].error;
+            shared_temp[GI].best_mode = shared_temp[GI + 2].best_mode;
+            shared_temp[GI].best_partition = shared_temp[GI + 2].best_partition;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 1)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 1].error;
+            shared_temp[GI].best_mode = shared_temp[GI + 1].best_mode;
+            shared_temp[GI].best_partition = shared_temp[GI + 1].best_partition;
+        }
+        // Publish (error bits, mode, partition) only if this mode beats the result already recorded for the block; otherwise forward the old entry.
+        if (asfloat(g_InBuff[blockID].x) > shared_temp[GI].error)
+        {
+            g_OutBuff[blockID] = uint4(asuint(shared_temp[GI].error), shared_temp[GI].best_mode, shared_temp[GI].best_partition, 0);
+        }
+        else
+        {
+            g_OutBuff[blockID] = g_InBuff[blockID];
+        }
+    }
+}
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void EncodeBlockCS(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID) // Builds the final 128-bit block for each 4x4 tile from the mode/partition stored in g_InBuff and writes it to g_OutBuff.
+{ // Each run of MAX_USED_THREAD (32) consecutive threads cooperates on one block, exchanging data through shared_temp.
+    const uint MAX_USED_THREAD = 32;
+    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // blocks handled per thread group
+    uint blockInGroup = GI / MAX_USED_THREAD; // which block of the group this thread belongs to
+    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; // linear block index
+    uint threadBase = blockInGroup * MAX_USED_THREAD; // first shared_temp slot owned by this block
+    uint threadInBlock = GI - threadBase; // 0..31 within the run
+    // NOTE(review): barriers are compiled in only under REF_DEVICE; other builds presumably rely on the 32 threads of a run executing in lockstep - confirm.
+#ifndef REF_DEVICE
+    if (blockID >= g_num_total_blocks)
+    {
+        return;
+    }
+#endif
+    // Coordinates of the block's first texel.
+    uint block_y = blockID / g_num_block_x;
+    uint block_x = blockID - block_y * g_num_block_x;
+    uint base_x = block_x * BLOCK_SIZE_X;
+    uint base_y = block_y * BLOCK_SIZE_Y;
+    // Threads 0..15 each load one texel and cache its luminance and integer form.
+    if (threadInBlock < 16)
+    {
+        shared_temp[GI].pixel = g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ).rgb;
+        shared_temp[GI].pixel_lum = dot(shared_temp[GI].pixel, RGB2LUM); // NOTE(review): taken from the raw texel here, while TryModeLE10CS uses the half-rounded pixel_hr - confirm this difference is intended
+        uint3 pixel_h = float2half( shared_temp[GI].pixel );
+        shared_temp[GI].pixel_ph = start_quantize( pixel_h );
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    // Decisions read back from g_InBuff: .y = mode number, .z = partition index.
+    uint best_mode = g_InBuff[blockID].y;
+    uint best_partition = g_InBuff[blockID].z;
+    // Per-thread partial result; threads 0..15 set their index bits in .zw, thread 0 later completes and writes it.
+    uint4 block = 0;
+    // Stage 1: threads 0..15 nominate their pixel as both endpoints of subset 0 (of the whole block when best_mode > 10); threads 16..31 do the same for subset 1. Non-members keep the sentinels.
+    if (threadInBlock < 32)
+    {
+        int2x3 endPoint;
+        endPoint[0] = MAX_INT;
+        endPoint[1] = MIN_INT;
+
+        float2 endPoint_lum;
+        endPoint_lum[0] = MAX_FLOAT;
+        endPoint_lum[1] = MIN_FLOAT;
+        
+        int2 endPoint_lum_index; // initialised but not read anywhere in this function
+        endPoint_lum_index[0] = -1;
+        endPoint_lum_index[1] = -1;
+
+        int3 pixel_ph = shared_temp[threadBase + (threadInBlock & 0xF)].pixel_ph;
+        float pixel_lum = shared_temp[threadBase + (threadInBlock & 0xF)].pixel_lum;
+        if (threadInBlock < 16)
+        {
+            if (best_mode > 10)
+            {
+                endPoint[0] = endPoint[1] = pixel_ph;
+                endPoint_lum[0] = endPoint_lum[1] = pixel_lum;
+            }
+            else
+            {
+                uint bits = candidateSectionBit[best_partition];
+                if (0 == ((bits >> threadInBlock) & 1))
+                {
+                    endPoint[0] = endPoint[1] = pixel_ph;
+                    endPoint_lum[0] = endPoint_lum[1] = pixel_lum;
+                }
+            }
+        }
+        else
+        {
+            if (best_mode <= 10)
+            {
+                uint bits = candidateSectionBit[best_partition];
+                if (1 == ((bits >> (threadInBlock & 0xF)) & 1))
+                {
+                    endPoint[0] = endPoint[1] = pixel_ph;
+                    endPoint_lum[0] = endPoint_lum[1] = pixel_lum;
+                }
+            }
+        }
+        // Publish the nomination so the reductions below can find the darkest / brightest pixel of each subset.
+        shared_temp[GI].endPoint_low = endPoint[0];
+        shared_temp[GI].endPoint_high = endPoint[1];
+        
+        shared_temp[GI].endPoint_lum_low = endPoint_lum[0];
+        shared_temp[GI].endPoint_lum_high = endPoint_lum[1];
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if ((threadInBlock & 0xF) < 8) // both 16-thread halves reduce side by side: slot 0 ends up with subset 0, slot 16 with subset 1
+    {
+        if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 8].endPoint_lum_low)
+        {
+            shared_temp[GI].endPoint_low = shared_temp[GI + 8].endPoint_low;
+            shared_temp[GI].endPoint_lum_low = shared_temp[GI + 8].endPoint_lum_low;
+        }
+        if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 8].endPoint_lum_high)
+        {
+            shared_temp[GI].endPoint_high = shared_temp[GI + 8].endPoint_high;
+            shared_temp[GI].endPoint_lum_high = shared_temp[GI + 8].endPoint_lum_high;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if ((threadInBlock & 0xF) < 4)
+    {
+        if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 4].endPoint_lum_low)
+        {
+            shared_temp[GI].endPoint_low = shared_temp[GI + 4].endPoint_low;
+            shared_temp[GI].endPoint_lum_low = shared_temp[GI + 4].endPoint_lum_low;
+        }
+        if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 4].endPoint_lum_high)
+        {
+            shared_temp[GI].endPoint_high = shared_temp[GI + 4].endPoint_high;
+            shared_temp[GI].endPoint_lum_high = shared_temp[GI + 4].endPoint_lum_high;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if ((threadInBlock & 0xF) < 2)
+    {
+        if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 2].endPoint_lum_low)
+        {
+            shared_temp[GI].endPoint_low = shared_temp[GI + 2].endPoint_low;
+            shared_temp[GI].endPoint_lum_low = shared_temp[GI + 2].endPoint_lum_low;
+        }
+        if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 2].endPoint_lum_high)
+        {
+            shared_temp[GI].endPoint_high = shared_temp[GI + 2].endPoint_high;
+            shared_temp[GI].endPoint_lum_high = shared_temp[GI + 2].endPoint_lum_high;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if ((threadInBlock & 0xF) < 1)
+    {
+        if (shared_temp[GI].endPoint_lum_low > shared_temp[GI + 1].endPoint_lum_low)
+        {
+            shared_temp[GI].endPoint_low = shared_temp[GI + 1].endPoint_low;
+            shared_temp[GI].endPoint_lum_low = shared_temp[GI + 1].endPoint_lum_low;
+        }
+        if (shared_temp[GI].endPoint_lum_high < shared_temp[GI + 1].endPoint_lum_high)
+        {
+            shared_temp[GI].endPoint_high = shared_temp[GI + 1].endPoint_high;
+            shared_temp[GI].endPoint_lum_high = shared_temp[GI + 1].endPoint_lum_high;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    // Stage 2: thread 0 / thread 1 fetch the reduced endpoints of subset 0 / subset 1 (slots 0 and 16) and orient them.
+    if (threadInBlock < 2)
+    {
+        // find_axis
+        int2x3 endPoint;
+        endPoint[0] = shared_temp[threadBase + threadInBlock * 16].endPoint_low;
+        endPoint[1] = shared_temp[threadBase + threadInBlock * 16].endPoint_high;
+        // Reference pixel for the orientation test: pixel 0, or the partition's fix-up pixel for subset 1 in two-subset modes.
+        uint fixup = 0;
+        if ((1 == threadInBlock) && (best_mode <= 10))
+        {
+            fixup = candidateFixUpIndex1D[best_partition];
+        }
+        
+        float3 span = endPoint[1] - endPoint[0];
+        float span_norm_sqr = dot( span, span );
+        float dotProduct = dot( span, shared_temp[threadBase + fixup].pixel_ph - endPoint[0] );
+        if ( span_norm_sqr > 0 && dotProduct >= 0 && uint( dotProduct * 63.49999 / span_norm_sqr ) > 32 )
+        {
+            swap(endPoint[0], endPoint[1]);
+        }
+        // Store the oriented pairs in slots 0 and 1 of the run.
+        shared_temp[GI].endPoint_low = endPoint[0];
+        shared_temp[GI].endPoint_high = endPoint[1];
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    // Stage 3: threads 0..15 compute the palette index of their pixel and place it at its bit position in block.zw.
+    if (threadInBlock < 16)
+    {
+        uint bits;
+        if (best_mode > 10)
+        {
+            bits = 0;
+        }
+        else
+        {
+            bits = candidateSectionBit[best_partition];
+        }
+        // Project the pixel onto the axis of the subset it belongs to (slot 1 = subset 1, slot 0 = subset 0).
+        float3 span;
+        float dotProduct;
+        if ((bits >> threadInBlock) & 1)
+        {
+            span = shared_temp[threadBase + 1].endPoint_high - shared_temp[threadBase + 1].endPoint_low;
+            dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel_ph - shared_temp[threadBase + 1].endPoint_low );
+        }
+        else
+        {
+            span = shared_temp[threadBase + 0].endPoint_high - shared_temp[threadBase + 0].endPoint_low;
+            dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel_ph - shared_temp[threadBase + 0].endPoint_low );
+        }
+        float span_norm_sqr = dot( span, span );
+        // best_mode > 10: indices come from aStep2; pixel 0 is placed at bit 1 of block.z, the others at 4-bit strides across block.z and block.w.
+        if (best_mode > 10)
+        {
+            uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
+                    : ( ( dotProduct < span_norm_sqr ) ? aStep2[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep2[63] );
+            if (threadInBlock == 0)
+            {
+                block.z |= index << 1;
+            }
+            else if (threadInBlock < 8)
+            {
+                block.z |= index << (threadInBlock * 4);
+            }
+            else
+            {
+                block.w |= index << ((threadInBlock - 8) * 4);
+            }
+        }
+        else
+        {
+            uint index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
+                    : ( ( dotProduct < span_norm_sqr ) ? aStep1[ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep1[63] );
+            // offset.x / offset.y move later fields up by one bit depending on the fix-up pixel. NOTE(review): presumably because the fix-up pixel's index is stored one bit shorter - confirm against the BC6H layout.
+            uint fixup = candidateFixUpIndex1D[best_partition];
+            int2 offset = int2((fixup != 2), (fixup == 15));
+
+            if (threadInBlock == 0)
+            {
+                block.z |= index << 18;
+            }
+            else if (threadInBlock < 3)
+            {
+                block.z |= index << (20 + (threadInBlock - 1) * 3);
+            }
+            else if (threadInBlock < 5)
+            {
+                block.z |= index << (25 + (threadInBlock - 3) * 3 + offset.x);
+            }
+            else if (threadInBlock == 5)
+            {
+                block.w |= index >> !offset.x; // with offset.x == 0 pixel 5 straddles the words: its low bit goes to bit 31 of block.z, the rest to block.w
+                if (!offset.x)
+                {
+                    block.z |= index << 31;
+                }
+            }
+            else if (threadInBlock < 9)
+            {
+                block.w |= index << (2 + (threadInBlock - 6) * 3 + offset.x);
+            }
+            else
+            {
+                block.w |= index << (11 + (threadInBlock - 9) * 3 + offset.y);
+            }
+        }
+        // pixel_hr.xy is reused as scratch space (bit patterns carried via asfloat) so the per-thread bits can be OR-ed together below.
+        shared_temp[GI].pixel_hr.xy = asfloat(block.zw);
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 8)
+    {
+        shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 8].pixel_hr.xy));
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 4)
+    {
+        shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 4].pixel_hr.xy));
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 2)
+    {
+        shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 2].pixel_hr.xy));
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 1)
+    {
+        shared_temp[GI].pixel_hr.xy = asfloat(asuint(shared_temp[GI].pixel_hr.xy) | asuint(shared_temp[GI + 1].pixel_hr.xy));
+        // Thread 0 now holds the index bits of all 16 pixels.
+        block.zw = asuint(shared_temp[GI].pixel_hr.xy);
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    // Stage 4: quantize the endpoints. The mode tables are indexed by best_mode - 1.
+    bool transformed = candidateModeTransformed[best_mode - 1];
+    uint4 prec = candidateModePrec[best_mode - 1];
+    if (threadInBlock == 2) // thread 2 quantizes pair 0 (slot 0) and parks the result in slot 2
+    {
+        int2x3 endPoint_q;
+        endPoint_q[0] = shared_temp[threadBase + 0].endPoint_low;
+        endPoint_q[1] = shared_temp[threadBase + 0].endPoint_high;
+
+        quantize( endPoint_q, prec.x );
+        if (transformed)
+        {
+            endPoint_q[1] -= endPoint_q[0];
+        }
+
+        shared_temp[GI].endPoint_low = endPoint_q[0];
+        shared_temp[GI].endPoint_high = endPoint_q[1];
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock == 3) // thread 3 quantizes pair 1 (slot 1, two-subset modes only) relative to pair 0's base endpoint and parks the result in slot 3
+    {
+        int3 ep0 = shared_temp[threadBase + 2].endPoint_low;
+        int2x3 endPoint_q;
+        endPoint_q[0] = shared_temp[threadBase + 1].endPoint_low;
+        endPoint_q[1] = shared_temp[threadBase + 1].endPoint_high;
+
+        if (best_mode <= 10)
+        {
+            quantize( endPoint_q, prec.x );
+            if (transformed)
+            {
+                endPoint_q[0] -= ep0;
+                endPoint_q[1] -= ep0;
+            }
+
+            shared_temp[GI].endPoint_low = endPoint_q[0];
+            shared_temp[GI].endPoint_high = endPoint_q[1];
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    // Threads 0 and 1 pick up slots 2 and 3, finish the quantization and write the final fields back to slots 0 and 1.
+    if (threadInBlock < 2)
+    {
+        int2x3 endPoint_q;
+        endPoint_q[0] = shared_temp[threadBase + threadInBlock + 2].endPoint_low;
+        endPoint_q[1] = shared_temp[threadBase + threadInBlock + 2].endPoint_high;
+        // bBadQuantize is required by the helpers' signatures; its value is not read afterwards in this function.
+        int bBadQuantize = 0;
+        if (threadInBlock == 0)
+        {
+            if (best_mode > 10)
+            {
+                finish_quantize( bBadQuantize, endPoint_q, prec, transformed );
+            }
+            else
+            {
+                finish_quantize_0( bBadQuantize, endPoint_q, prec, transformed );
+            }
+        }
+        else // if (threadInBlock == 1)
+        {
+            if (best_mode <= 10)
+            {
+                finish_quantize_1( bBadQuantize, endPoint_q, prec, transformed );
+            }
+        }
+
+        shared_temp[GI].endPoint_low = endPoint_q[0];
+        shared_temp[GI].endPoint_high = endPoint_q[1];
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    // Stage 5: thread 0 hands the endpoints, mode and partition to block_package together with the index bits, then writes the block.
+    if ( threadInBlock == 0 )
+    {
+        int2x3 endPoint_q[2];
+        endPoint_q[0][0] = shared_temp[threadBase + 0].endPoint_low;
+        endPoint_q[0][1] = shared_temp[threadBase + 0].endPoint_high;
+        endPoint_q[1][0] = shared_temp[threadBase + 1].endPoint_low;
+        endPoint_q[1][1] = shared_temp[threadBase + 1].endPoint_high;
+
+        if ( best_mode > 10 )
+        {
+            block_package( block, endPoint_q[0], best_mode );
+        }
+        else
+        {
+            block_package( block, endPoint_q, best_mode, best_partition );
+        }
+        
+        g_OutBuff[blockID] = block;
+    }
+}
+
+uint float2half1( float f ) // Converts one 32-bit float to a 16-bit half bit pattern (returned in the low 16 bits), rounding to nearest even.
+{
+    uint Result;
+    // Work on the raw bits: move the sign to bit 15 and keep the magnitude (exponent + mantissa) in IValue.
+    uint IValue = asuint(f);
+    uint Sign = (IValue & 0x80000000U) >> 16U;
+    IValue = IValue & 0x7FFFFFFFU;
+    
+    if (IValue > 0x47FFEFFFU)
+    {
+        // The number is too large to be represented as a half.  Saturate to 0x7FFF (all exponent and mantissa bits set).
+        Result = 0x7FFFU;
+    }
+    else
+    {
+        if (IValue < 0x38800000U)
+        {
+            // The number is too small to be represented as a normalized half.
+            // Convert it to a denormalized value; magnitudes that need a shift of 32 or more flush to zero.
+            uint Shift = 113U - (IValue >> 23U);
+            IValue = (Shift < 32U) ? ((0x800000U | (IValue & 0x7FFFFFU)) >> Shift) : 0U; // guard needed: the shift uses only the low 5 bits of its count, so an unguarded Shift >= 32 wraps around
+        }
+        else
+        {
+            // Rebias the exponent to represent the value as a normalized half.
+            IValue += 0xC8000000U;
+        }
+        // Drop the 13 excess mantissa bits with round-to-nearest-even, then keep the 15 magnitude bits.
+        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU; 
+    }
+    return (Result|Sign);
+}
+
+uint3 float2half( float3 endPoint_f ) // Converts an RGB float triple to three half bit patterns, one per channel.
+{
+    //uint3 sign = asuint(endPoint_f) & 0x80000000;
+    //uint3 expo = asuint(endPoint_f) & 0x7F800000;
+    //uint3 base = asuint(endPoint_f) & 0x007FFFFF;
+    //return ( expo < 0x33800000 ) ? 0 
+    //                    //0x33800000 indicating 2^-24, which is minimal denormalized number that half can present 
+    //    : ( ( expo < 0x38800000 ) ? ( sign >> 16 ) | ( ( base + 0x00800000 ) >> ( 23 - ( ( expo - 0x33800000 ) >> 23 ) ) )//fixed a bug in v0.2
+    //                    //0x38800000 indicating 2^-14, which is minimal normalized number that half can present, so need to use denormalized half presentation
+    //    : ( ( expo == 0x7F800000 || expo > 0x47000000 ) ? ( ( sign >> 16 ) | 0x7bff )
+    //                    // treat NaN as INF, treat INF (including NaN) as the maximum/minimum number that half can present
+    //                    // 0x47000000 indicating 2^15, which is maximum exponent that half can present, so cut to 0x7bff which is the maximum half number
+    //    : ( ( sign >> 16 ) | ( ( ( expo - 0x38000000 ) | base ) >> 13 ) ) ) );
+    // The commented-out lines above are a disabled vectorized variant; the live path runs the scalar helper on each channel.
+    const uint3 half_bits = uint3( float2half1( endPoint_f.x ), float2half1( endPoint_f.y ), float2half1( endPoint_f.z ) );
+    return half_bits;
+}
+int3 start_quantize( uint3 pixel_h ) // Maps half bit patterns (per channel) to the integer domain used for endpoint math.
+{
+    if ( g_format == UNSIGNED_F16 )
+    {
+        return asint( ( pixel_h << 6 ) / 31 ); // unsigned format: scale the bit pattern by 64/31
+    }
+    else
+    {
+        return ( pixel_h < 0x8000 ) ? ( ( pixel_h == 0x7bff ) ? 0x7fff : asint( ( pixel_h << 5 ) / 31 ) )// sign bit clear: largest half (0x7bff) saturates to 0x7fff, the rest is scaled by 32/31
+            : ( ( pixel_h == 0xfbff ) ? 0xffff8001 : -asint( ( ( 0x00007fff & pixel_h ) << 5 ) / 31 ) );// sign bit set: most negative half is 0xfbff (0x7bff can never match in this branch) and saturates to -0x7fff
+    }
+}
+void quantize( inout int2x3 endPoint, uint prec ) // Scales both endpoints (all channels) down to prec bits, in place.
+{
+    int iprec = asint( prec );
+    if ( g_format == UNSIGNED_F16 )
+    {
+        endPoint = ( ( iprec >= 15 ) | ( endPoint == 0 ) ) ? endPoint // 15 or more bits, or a zero value: left unchanged
+            : ( ( endPoint == asint(0xFFFF) ) ? ( ( 1 << iprec ) - 1 ) // 0xFFFF maps to the all-ones prec-bit value
+            : ( ( ( endPoint << iprec ) + asint(0x0000) ) >> 16 ) ); // otherwise value * 2^prec / 2^16 (the added constant is zero)
+    }
+    else
+    {
+        endPoint = ( ( iprec >= 16 ) | ( endPoint == 0 ) ) ? endPoint // 16 or more bits, or a zero value: left unchanged
+            : ( ( endPoint >= 0 ) ? ( ( endPoint == asint(0x7FFF) ) ? ( ( 1 << ( iprec - 1 ) ) - 1 ) : ( ( ( endPoint << ( iprec - 1 ) ) + asint(0x0000) ) >> 15 ) ) // non-negative: 0x7FFF maps to the largest (prec-1)-bit magnitude, the rest is value * 2^(prec-1) / 2^15
+            : ( ( -endPoint == asint(0x7FFF) ) ? -( ( 1 << ( iprec - 1 ) ) - 1 ) : -( ( ( -endPoint << ( iprec - 1 ) ) + asint(0x0000) ) >> 15 ) ) ); // negative: same rule applied to the magnitude, then negated
+    }
+}
+void finish_quantize_0( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ) // Fits an endpoint pair into its bit fields: row 0 uses prec.x bits; when transformed, row 1 is range-checked and clamped as a signed prec.yzw-bit value per channel.
+{
+    if ( transformed )
+    {
+        bool3 bBadComponent = ( endPoint[1] >= 0 ) ? ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) // non-negative value needs 2^(p-1) or more: does not fit
+            : ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ); // negative value with magnitude above 2^(p-1): does not fit
+        bBadQuantize |= any(bBadComponent); // only ever sets the caller's flag, never clears it
+        // Row 0: keep the low prec.x bits. Row 1: clamp on overflow, otherwise keep the low prec.yzw bits of the (possibly negative) value.
+        endPoint[0] = endPoint[0] & ( ( 1 << prec.x ) - 1 );
+        endPoint[1] = ( endPoint[1] >= 0 ) ? ( ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1] )
+            : ( ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ) ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) ) );
+    }
+    else
+    {
+        endPoint &= ( ( 1 << prec.x ) - 1 ); // untransformed: every value is masked to prec.x bits
+    }
+}
+void finish_quantize_1( inout int bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ) // Same fitting as finish_quantize_0, except that when transformed BOTH rows are range-checked and clamped as signed prec.yzw-bit values.
+{
+    if ( transformed )
+    {
+        bool2x3 bBadComponent;
+        bBadComponent[0] = ( endPoint[0] >= 0 ) ? ( endPoint[0] >= ( 1 << ( prec.yzw - 1 ) ) ) // non-negative value needs 2^(p-1) or more: does not fit
+            : ( -endPoint[0] > ( 1 << ( prec.yzw - 1 ) ) ); // negative value with magnitude above 2^(p-1): does not fit
+        bBadComponent[1] = ( endPoint[1] >= 0 ) ? ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) )
+            : ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) );
+        bBadQuantize |= any(bBadComponent); // only ever sets the caller's flag, never clears it
+        // Clamp on overflow (largest positive value, or the 2^(p-1) bit pattern for negatives); otherwise keep the low prec.yzw bits.
+        endPoint[0] = ( endPoint[0] >= 0 ) ? ( ( endPoint[0] >= ( 1 << ( prec.yzw - 1 ) ) ) ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[0] )
+            : ( ( -endPoint[0] > ( 1 << ( prec.yzw - 1 ) ) ) ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[0] & ( ( 1 << prec.yzw ) - 1 ) ) );
+        endPoint[1] = ( endPoint[1] >= 0 ) ? ( ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1] )
+            : ( ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ) ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) ) );
+    }
+    else
+    {
+        endPoint &= ( ( 1 << prec.x ) - 1 ); // untransformed: every value is masked to prec.x bits
+    }
+}
+void finish_quantize( out bool bBadQuantize, inout int2x3 endPoint, uint4 prec, bool transformed ) // Variant called by EncodeBlockCS for best_mode > 10; unlike finish_quantize_0/_1 the flag is an out parameter that is overwritten, not OR-ed into.
+{
+    if ( transformed )
+    {
+        bool3 bBadComponent;
+        bBadComponent = ( endPoint[1] >= 0 ) ? ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) // non-negative value needs 2^(p-1) or more: does not fit
+            : ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ); // negative value with magnitude above 2^(p-1): does not fit
+        bBadQuantize = any( bBadComponent );
+        // Row 0: keep the low prec.x bits. Row 1: clamp on overflow, otherwise keep the low prec.yzw bits of the (possibly negative) value.
+        endPoint[0] = endPoint[0] & ( ( 1 << prec.x ) - 1 );
+        endPoint[1] = ( endPoint[1] >= 0 ) ? ( ( endPoint[1] >= ( 1 << ( prec.yzw - 1 ) ) ) ? ( ( 1 << ( prec.yzw - 1 ) ) - 1 ) : endPoint[1] )
+            : ( ( -endPoint[1] > ( 1 << ( prec.yzw - 1 ) ) ) ? ( 1 << ( prec.yzw - 1 ) ) : ( endPoint[1] & ( ( 1 << prec.yzw ) - 1 ) ) );            
+    }
+    else
+    {
+        endPoint &= ( ( 1 << prec.x ) - 1 ); // untransformed: every value is masked to prec.x bits
+        
+        bBadQuantize = 0; // nothing can overflow on this path
+    }
+}
+
+void SIGN_EXTEND( uint3 prec, inout int3 color ) // Sign-extends each channel of color from a prec-bit two's-complement field, in place.
+{
+    uint3 p = 1 << (prec - 1); // per-channel mask of the field's sign bit
+    color = (color & p) ? (color & (p - 1)) - p : color; // sign bit set: value = low bits - 2^(prec-1); otherwise unchanged
+}
+
+void sign_extend( bool transformed, uint4 prec, inout int2x3 endPoint )
+{
+    // Single endpoint pair: row 0 is a prec.x-bit field, row 1 uses the per-channel widths prec.yzw.
+    const bool signedFormat = ( SIGNED_F16 == g_format );
+    if ( signedFormat ) SIGN_EXTEND( prec.x, endPoint[0] ); // row 0 carries a sign only in the signed format
+    if ( signedFormat || transformed ) SIGN_EXTEND( prec.yzw, endPoint[1] ); // row 1 is signed in the signed format or when transformed
+}
+
+void sign_extend( bool transformed, uint4 prec, inout int2x3 endPoint[2] ) // Two endpoint pairs: endPoint[0][0] is a prec.x-bit field, the other three rows use prec.yzw.
+{
+    const bool signedFormat = ( SIGNED_F16 == g_format );
+    if ( signedFormat ) SIGN_EXTEND( prec.x, endPoint[0][0] ); // the first row carries a sign only in the signed format
+    if ( signedFormat || transformed )
+    {
+        SIGN_EXTEND( prec.yzw, endPoint[0][1] ); // remaining rows are signed in the signed format or when transformed
+        SIGN_EXTEND( prec.yzw, endPoint[1][0] );
+        SIGN_EXTEND( prec.yzw, endPoint[1][1] );
+    }
+}
+void start_unquantize( inout int2x3 endPoint[2], uint4 prec, bool transformed )
+{
+    sign_extend( transformed, prec, endPoint );
+    // After sign extension, transformed endpoints hold differences from endPoint[0][0];
+    // adding that row back yields absolute values. Untransformed endpoints get a zero added.
+    const int3 delta = transformed ? endPoint[0][0] : int3( 0, 0, 0 );
+    endPoint[0][1] = endPoint[0][1] + delta;
+    endPoint[1][0] = endPoint[1][0] + delta;
+    endPoint[1][1] = endPoint[1][1] + delta;
+}
+void start_unquantize( inout int2x3 endPoint, uint4 prec, bool transformed )
+{
+    sign_extend( transformed, prec, endPoint ); // sign-extend the stored fields first
+    // When transformed, row 1 is a difference from row 0; add row 0 back (zero is added otherwise).
+    const int3 delta = transformed ? endPoint[0] : int3( 0, 0, 0 );
+    endPoint[1] = endPoint[1] + delta;
+}
+void unquantize( inout int2x3 color, uint prec ) // Expands prec-bit endpoint values (both rows, all channels) back to the wider integer domain, in place.
+{
+    int iprec = asint( prec );
+    if (g_format == UNSIGNED_F16 )
+    {
+        if (prec < 15) // values stored with 15 or more bits are left as they are
+        {
+            color = (color != 0) ? (color == ((1 << iprec) - 1) ? 0xFFFF : (((color << 16) + 0x8000) >> iprec)) : color; // 0 stays 0, all-ones maps to 0xFFFF, the rest is rescaled with a half-step added
+        }
+    }
+    else
+    {
+        if (prec < 16) // values stored with 16 or more bits are left as they are
+        {
+            uint2x3 s = color >= 0 ? 0 : 1; // remember which components are negative, then work on magnitudes
+            color = abs(color);
+            color = (color != 0) ? (color >= ((1 << (iprec - 1)) - 1) ? 0x7FFF : (((color << 15) + 0x4000) >> (iprec - 1))) : color; // 0 stays 0, the top magnitude saturates to 0x7FFF, the rest is rescaled with a half-step added
+            color = s > 0 ? -color : color; // reapply the sign
+        }
+    }
+}
+uint3 finish_unquantize( int3 color )
+{
+    // Final scaling of an interpolated colour: the unsigned format multiplies by 31/64,
+    // the signed format multiplies the magnitude by 31/32 and returns negative
+    // results as magnitude with bit 0x8000 set.
+    if ( UNSIGNED_F16 == g_format )
+        return asuint( ( color * 31 ) >> 6 );
+    const int3 scaled = ( color < 0 ) ? -( ( -color * 31 ) >> 5 ) : ( ( color * 31 ) >> 5 );
+    const int3 signMagnitude = ( scaled < 0 ) ? ( ( -scaled ) | 0x8000 ) : scaled;
+    return asuint( signMagnitude );
+}
+void generate_palette_unquantized8( out uint3 palette, int3 low, int3 high, int i )
+{
+    // Palette entry i of an 8-entry ramp: weighted blend of low and high (weights out of 64, rounded), then finish_unquantize().
+    static const int kWeights[8] = { 0, 9, 18, 27, 37, 46, 55, 64 };
+    const int w = kWeights[i];
+    const int3 blended = ( high * w + low * ( 64 - w ) + 32 ) >> 6;
+    palette = finish_unquantize( blended );
+}
+void generate_palette_unquantized16( out uint3 palette, int3 low, int3 high, int i )
+{
+    // Palette entry i of a 16-entry ramp: weighted blend of low and high (weights out of 64, rounded), then finish_unquantize().
+    static const int kWeights[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
+    const int w = kWeights[i];
+    const int3 blended = ( high * w + low * ( 64 - w ) + 32 ) >> 6;
+    palette = finish_unquantize( blended );
+}
+
+float half2float1( uint Value ) // Expands a 16-bit half bit pattern (low 16 bits of Value) to a 32-bit float.
+{
+    uint Mantissa = (uint)(Value & 0x03FF);
+    // Exponent is kept in the half's own bias; 112 is added when the result is assembled below.
+    uint Exponent;
+    if ((Value & 0x7C00) != 0)  // The value is normalized
+    {
+        Exponent = (uint)((Value >> 10) & 0x1F); // NOTE(review): an all-ones exponent (0x1F) also takes this path and yields a finite float, not Inf/NaN
+    }
+    else if (Mantissa != 0)     // The value is denormalized
+    {
+        // Normalize the value in the resulting float
+        Exponent = 1;
+        // Shift the mantissa up until bit 0x0400 is set, lowering the exponent each step (it wraps below zero as a uint).
+        do
+        {
+            Exponent--;
+            Mantissa <<= 1;
+        } while ((Mantissa & 0x0400) == 0);
+        // Drop bit 0x0400 again; only the 10 fraction bits are stored.
+        Mantissa &= 0x03FF;
+    }
+    else                        // The value is zero
+    {
+        Exponent = (uint)(-112); // cancels the +112 below, leaving a zero exponent field
+    }
+
+    uint Result = ((Value & 0x8000) << 16) | // Sign
+                      ((Exponent + 112) << 23) | // Exponent
+                      (Mantissa << 13);          // Mantissa
+
+    return asfloat(Result);
+}
+
+float3 half2float(uint3 color_h ) // Converts three half bit patterns to an RGB float triple, one channel at a time.
+{ // The commented-out lines are a disabled vectorized variant.
+    //uint3 sign = color_h & 0x8000;
+    //uint3 expo = color_h & 0x7C00;
+    //uint3 base = color_h & 0x03FF;
+    //return ( expo == 0 ) ? asfloat( ( sign << 16 ) | asuint( float3(base) / 16777216 ) ) //16777216 = 2^24
+    //    : asfloat( ( sign << 16 ) | ( ( ( expo + 0x1C000 ) | base ) << 13 ) ); //0x1C000 = 0x1FC00 - 0x3C00
+    const float3 color_f = float3( half2float1( color_h.x ), half2float1( color_h.y ), half2float1( color_h.z ) ); // live path: scalar helper per channel
+    return color_f;
+}
+
+void block_package( inout uint4 block, int2x3 endPoint[2], uint mode_type, uint partition_index ) // for mode 1 - 10
+{
+    block.xy = 0;
+    block.z &= 0xFFFC0000;
+    
+    //block.z |= (partition_index & 0x1f) << 13;
+    
+    if ( mode_type == candidateModeFlag[0])
+    {
+        /*block.x = candidateModeMemory[0];
+        block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+        block.x |= ( endPoint[1][0].g >> 2 ) & 0x00000004;
+        block.x |= ( endPoint[1][0].b >> 1 ) & 0x00000008;
+        block.x |= endPoint[1][1].b & 0x00000010;
+        block.y |= ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
+        block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
+        block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
+        block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+        block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 );
+        block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
+        block.yz |= ( ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000) ) | ( ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040) );
+        block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/
+
+        block.x |= ((candidateModeMemory[0] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[0] >> 1) & 1) << 1;
+        block.x |= ((endPoint[1][0].g >> 4) & 1) << 2;
+        block.x |= ((endPoint[1][0].b >> 4) & 1) << 3;
+        block.x |= ((endPoint[1][1].b >> 4) & 1) << 4;
+        block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[0][0].r >> 8) & 1) << 13;
+        block.x |= ((endPoint[0][0].r >> 9) & 1) << 14;
+        block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[0][0].g >> 8) & 1) << 23;
+        block.x |= ((endPoint[0][0].g >> 9) & 1) << 24;
+        block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[0][0].b >> 8) & 1) << 1;
+        block.y |= ((endPoint[0][0].b >> 9) & 1) << 2;
+        block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[1][1].g >> 4) & 1) << 8;
+        block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+        block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+        block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+        block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+        block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+        block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+        block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+        block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+        block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+        block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+        block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+        block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+        block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+        block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+        block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+        block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+        block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+        block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+        block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+        block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+        block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+        block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+        block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+        block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+        block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+        block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+        block.z |= ((partition_index >> 0) & 1) << 13;
+        block.z |= ((partition_index >> 1) & 1) << 14;
+        block.z |= ((partition_index >> 2) & 1) << 15;
+        block.z |= ((partition_index >> 3) & 1) << 16;
+        block.z |= ((partition_index >> 4) & 1) << 17;
+    }
+    else if ( mode_type == candidateModeFlag[1])
+    {
+        /*block.x = candidateModeMemory[1];
+        block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00000FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x003F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+        block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 );
+        block.x |= ( ( endPoint[1][0].g >> 3 ) & 0x00000004 ) | ( ( endPoint[1][0].g << 20 ) & 0x01000000 );
+        block.x |= ( endPoint[1][1].g >> 1 ) & 0x00000018;
+        block.x |= ( ( endPoint[1][1].b << 21 ) & 0x00800000 ) | ( ( endPoint[1][1].b << 12 ) & 0x00003000 );
+        block.x |= ( ( endPoint[1][0].b << 17 ) & 0x00400000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
+        block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E);
+        block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+        block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80);
+        block.y |= ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ) | ( ( endPoint[1][1].b >> 3 ) & 0x00000001 );
+        block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/
+
+        block.x |= ((candidateModeMemory[1] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[1] >> 1) & 1) << 1;
+        block.x |= ((endPoint[1][0].g >> 5) & 1) << 2;
+        block.x |= ((endPoint[1][1].g >> 4) & 1) << 3;
+        block.x |= ((endPoint[1][1].g >> 5) & 1) << 4;
+        block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[1][1].b >> 0) & 1) << 12;
+        block.x |= ((endPoint[1][1].b >> 1) & 1) << 13;
+        block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+        block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[1][0].b >> 5) & 1) << 22;
+        block.x |= ((endPoint[1][1].b >> 2) & 1) << 23;
+        block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+        block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[1][1].b >> 3) & 1) << 0;
+        block.y |= ((endPoint[1][1].b >> 5) & 1) << 1;
+        block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+        block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[0][1].r >> 5) & 1) << 8;
+        block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+        block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+        block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+        block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+        block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[0][1].g >> 5) & 1) << 18;
+        block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+        block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+        block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+        block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+        block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[0][1].b >> 5) & 1) << 28;
+        block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+        block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+        block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+        block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+        block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+        block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+        block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+        block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+        block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+        block.z |= ((endPoint[1][0].r >> 5) & 1) << 6;
+        block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+        block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+        block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+        block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+        block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+        block.z |= ((endPoint[1][1].r >> 5) & 1) << 12;
+        block.z |= ((partition_index >> 0) & 1) << 13;
+        block.z |= ((partition_index >> 1) & 1) << 14;
+        block.z |= ((partition_index >> 2) & 1) << 15;
+        block.z |= ((partition_index >> 3) & 1) << 16;
+        block.z |= ((partition_index >> 4) & 1) << 17;
+    }
+    else if ( mode_type == candidateModeFlag[2])
+    {
+        /*block.x = candidateModeMemory[2];
+        block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+        block.y |= ( endPoint[0][0].r >> 2 ) & 0x00000100;
+        block.y |= ( endPoint[0][0].g << 7 ) & 0x00020000;
+        block.y |= ( ( endPoint[0][0].b << 17 ) & 0x08000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
+        block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x07800000 );
+        block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
+        block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+        block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
+        block.yz |= ( ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000) ) | ( ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040) );
+        block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/
+
+        block.x |= ((candidateModeMemory[2] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[2] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[2] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[2] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[2] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[0][0].r >> 8) & 1) << 13;
+        block.x |= ((endPoint[0][0].r >> 9) & 1) << 14;
+        block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[0][0].g >> 8) & 1) << 23;
+        block.x |= ((endPoint[0][0].g >> 9) & 1) << 24;
+        block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[0][0].b >> 8) & 1) << 1;
+        block.y |= ((endPoint[0][0].b >> 9) & 1) << 2;
+        block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[0][0].r >> 10) & 1) << 8;
+        block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+        block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+        block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+        block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+        block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0][0].g >> 10) & 1) << 17;
+        block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+        block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+        block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+        block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+        block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+        block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0][0].b >> 10) & 1) << 27;
+        block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+        block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+        block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+        block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+        block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+        block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+        block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+        block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+        block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+        block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+        block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+        block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+        block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+        block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+        block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+        block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+        block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+        block.z |= ((partition_index >> 0) & 1) << 13;
+        block.z |= ((partition_index >> 1) & 1) << 14;
+        block.z |= ((partition_index >> 2) & 1) << 15;
+        block.z |= ((partition_index >> 3) & 1) << 16;
+        block.z |= ((partition_index >> 4) & 1) << 17;
+    }
+    else if ( mode_type == candidateModeFlag[3])
+    {
+        /*block.x = candidateModeMemory[3];
+        block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+        block.y |= ( endPoint[0][0].r >> 3 ) & 0x00000080;
+        block.y |= ( endPoint[0][0].g << 8 ) & 0x00040000;
+        block.y |= ( ( endPoint[0][0].b << 17 ) & 0x08000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
+        block.y |= ( ( endPoint[0][1].r << 3 ) & 0x00000078 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x07800000 );
+        block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000001E);
+        block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+        block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 );
+        block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000780);
+        block.yz |= ( endPoint[1][1].b << uint2(27, 9) ) & uint2(0x10000000, 0x00001000);
+        block.z |= ( ( endPoint[1][0].g << 7 ) & 0x00000800 );
+        block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
+        block.z |= ( endPoint[1][1].b << 4 ) & 0x00000040;
+        block.z |= ( endPoint[1][1].b << 5 ) & 0x00000020;*/
+
+        block.x |= ((candidateModeMemory[3] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[3] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[3] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[3] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[3] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[0][0].r >> 8) & 1) << 13;
+        block.x |= ((endPoint[0][0].r >> 9) & 1) << 14;
+        block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[0][0].g >> 8) & 1) << 23;
+        block.x |= ((endPoint[0][0].g >> 9) & 1) << 24;
+        block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[0][0].b >> 8) & 1) << 1;
+        block.y |= ((endPoint[0][0].b >> 9) & 1) << 2;
+        block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0][0].r >> 10) & 1) << 7;
+        block.y |= ((endPoint[1][1].g >> 4) & 1) << 8;
+        block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+        block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+        block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+        block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+        block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[0][0].g >> 10) & 1) << 18;
+        block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+        block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+        block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+        block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+        block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0][0].b >> 10) & 1) << 27;
+        block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+        block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+        block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+        block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+        block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+        block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+        block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+        block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+        block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+        block.z |= ((endPoint[1][1].b >> 0) & 1) << 5;
+        block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+        block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+        block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+        block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+        block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+        block.z |= ((endPoint[1][0].g >> 4) & 1) << 11;
+        block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+        block.z |= ((partition_index >> 0) & 1) << 13;
+        block.z |= ((partition_index >> 1) & 1) << 14;
+        block.z |= ((partition_index >> 2) & 1) << 15;
+        block.z |= ((partition_index >> 3) & 1) << 16;
+        block.z |= ((partition_index >> 4) & 1) << 17;
+    }
+    else if ( mode_type == candidateModeFlag[4])
+    {
+        /*block.x = candidateModeMemory[4];
+        block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+        block.y |= ( endPoint[0][0].r >> 3 ) & 0x00000080;
+        block.y |= ( endPoint[0][0].g << 7 ) & 0x00020000;
+        block.y |= ( ( endPoint[0][0].b << 18 ) & 0x10000000 ) | ( ( endPoint[0][0].b >> 7 ) & 0x00000007 );
+        block.y |= ( ( endPoint[0][1].r << 3 ) & 0x00000078 ) | ( ( endPoint[0][1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
+        block.y |= ( ( endPoint[1][0].g << 9 ) & 0x00001E00 ) | ( ( endPoint[1][0].b << 4 ) & 0x00000100 );
+        block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+        block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000780);
+        block.yz |= ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000060);
+        block.z |= ( endPoint[1][0].r << 1 ) & 0x0000001E;
+        block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
+        block.z |= ( ( endPoint[1][1].b << 7 ) & 0x00000800 ) | ( ( endPoint[1][1].b << 9 ) & 0x00001000 );*/
+
+        block.x |= ((candidateModeMemory[4] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[4] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[4] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[4] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[4] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[0][0].r >> 8) & 1) << 13;
+        block.x |= ((endPoint[0][0].r >> 9) & 1) << 14;
+        block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[0][0].g >> 8) & 1) << 23;
+        block.x |= ((endPoint[0][0].g >> 9) & 1) << 24;
+        block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[0][0].b >> 8) & 1) << 1;
+        block.y |= ((endPoint[0][0].b >> 9) & 1) << 2;
+        block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0][0].r >> 10) & 1) << 7;
+        block.y |= ((endPoint[1][0].b >> 4) & 1) << 8;
+        block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+        block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+        block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+        block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+        block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0][0].g >> 10) & 1) << 17;
+        block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+        block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+        block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+        block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+        block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+        block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[0][0].b >> 10) & 1) << 28;
+        block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+        block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+        block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+        block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+        block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+        block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+        block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+        block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+        block.z |= ((endPoint[1][1].b >> 1) & 1) << 5;
+        block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+        block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+        block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+        block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+        block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+        block.z |= ((endPoint[1][1].b >> 4) & 1) << 11;
+        block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+        block.z |= ((partition_index >> 0) & 1) << 13;
+        block.z |= ((partition_index >> 1) & 1) << 14;
+        block.z |= ((partition_index >> 2) & 1) << 15;
+        block.z |= ((partition_index >> 3) & 1) << 16;
+        block.z |= ((partition_index >> 4) & 1) << 17;
+    }
+    else if ( mode_type == candidateModeFlag[5])
+    {
+        /*block.x = candidateModeMemory[5];
+        block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00003FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x00FF8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000);
+        block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000003;
+        block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
+        block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
+        block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
+        block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+        block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 );
+        block.y |= ( ( endPoint[1][1].b << 27 ) & 0x10000000 );
+        block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
+        block.yz |= ( endPoint[1][1].b << uint2(18, 4) ) & uint2(0x00040000, 0x00000040);
+        block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
+        block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 );*/
+
+        block.x |= ((candidateModeMemory[5] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[5] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[5] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[5] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[5] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[0][0].r >> 8) & 1) << 13;
+        block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+        block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[0][0].g >> 8) & 1) << 23;
+        block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+        block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[0][0].b >> 8) & 1) << 1;
+        block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+        block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[1][1].g >> 4) & 1) << 8;
+        block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+        block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+        block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+        block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+        block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+        block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+        block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+        block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+        block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+        block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+        block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+        block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+        block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+        block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+        block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+        block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+        block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+        block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+        block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+        block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+        block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+        block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+        block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+        block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+        block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+        block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+        block.z |= ((partition_index >> 0) & 1) << 13;
+        block.z |= ((partition_index >> 1) & 1) << 14;
+        block.z |= ((partition_index >> 2) & 1) << 15;
+        block.z |= ((partition_index >> 3) & 1) << 16;
+        block.z |= ((partition_index >> 4) & 1) << 17;
+    }
+    else if ( mode_type == candidateModeFlag[6])
+    {
+        /*block.x = candidateModeMemory[6];
+        block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+        block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001;
+        block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
+        block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000);
+        block.x |= ( ( endPoint[1][1].g << 9 ) & 0x00002000 ) | ( ( endPoint[1][1].b << 21 ) & 0x00800000);
+        block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E);
+        block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+        block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80);
+        block.y |= ( ( endPoint[1][1].b >> 2 ) & 0x00000006 );
+        block.y |= ( ( endPoint[1][1].b << 27 ) & 0x10000000 ) | ( ( endPoint[1][1].b << 18 ) & 0x00040000 );
+        block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/
+
+        block.x |= ((candidateModeMemory[6] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[6] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[6] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[6] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[6] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[1][1].g >> 4) & 1) << 13;
+        block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+        block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[1][1].b >> 2) & 1) << 23;
+        block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+        block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[1][1].b >> 3) & 1) << 1;
+        block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+        block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[0][1].r >> 5) & 1) << 8;
+        block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+        block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+        block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+        block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+        block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+        block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+        block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+        block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+        block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+        block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+        block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+        block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+        block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+        block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+        block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+        block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+        block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+        block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+        block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+        block.z |= ((endPoint[1][0].r >> 5) & 1) << 6;
+        block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+        block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+        block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+        block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+        block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+        block.z |= ((endPoint[1][1].r >> 5) & 1) << 12;
+        block.z |= ((partition_index >> 0) & 1) << 13;
+        block.z |= ((partition_index >> 1) & 1) << 14;
+        block.z |= ((partition_index >> 2) & 1) << 15;
+        block.z |= ((partition_index >> 3) & 1) << 16;
+        block.z |= ((partition_index >> 4) & 1) << 17;
+    }
+    else if ( mode_type == candidateModeFlag[7])
+    {
+        /*block.x = candidateModeMemory[7];
+        block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+        block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001;
+        block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x0F800000 );
+        block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
+        block.x |= ( ( endPoint[1][0].g << 18 ) & 0x00800000 );
+        block.x |= ( ( endPoint[1][1].b << 13 ) & 0x00002000 );
+        block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
+        block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
+        block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+        block.y |= ( ( endPoint[1][1].g >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 );
+        block.y |= ( endPoint[1][1].b << 27 ) & 0x10000000;
+        block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
+        block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 ) | ( ( endPoint[1][1].b << 4 ) & 0x00000040 );*/
+
+        block.x |= ((candidateModeMemory[7] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[7] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[7] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[7] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[7] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[1][1].b >> 0) & 1) << 13;
+        block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+        block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[1][0].g >> 5) & 1) << 23;
+        block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+        block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[1][1].g >> 5) & 1) << 1;
+        block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+        block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[1][1].g >> 4) & 1) << 8;
+        block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+        block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+        block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+        block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+        block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[0][1].g >> 5) & 1) << 18;
+        block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+        block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+        block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+        block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+        block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[1][1].b >> 1) & 1) << 28;
+        block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+        block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+        block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+        block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+        block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+        block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+        block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+        block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+        block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+        block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+        block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+        block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+        block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+        block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+        block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+        block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+        block.z |= ((partition_index >> 0) & 1) << 13;
+        block.z |= ((partition_index >> 1) & 1) << 14;
+        block.z |= ((partition_index >> 2) & 1) << 15;
+        block.z |= ((partition_index >> 3) & 1) << 16;
+        block.z |= ((partition_index >> 4) & 1) << 17;
+    }
+    else if ( mode_type == candidateModeFlag[8])
+    {
+        /*block.x = candidateModeMemory[8];
+        block.x |= ( ( endPoint[0][0].r << 5 ) & 0x00001FE0 ) | ( ( endPoint[0][0].g << 15 ) & 0x007F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0xFE000000 );
+        block.y |= ( endPoint[0][0].b >> 7 ) & 0x00000001;
+        block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000000F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0003E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 );
+        block.x |= ( ( endPoint[1][0].g << 20 ) & 0x01000000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
+        block.x |= ( ( endPoint[1][0].b << 18 ) & 0x00800000 );
+        block.x |= ( endPoint[1][1].b << 12 ) & 0x00002000;
+        block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+        block.y |= ( ( endPoint[1][1].g << 4 ) & 0x00000100 ) | ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 );
+        block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000003E);
+        block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00000F80);
+        block.y |= ( endPoint[1][1].b << 18 ) & 0x00040000;
+        block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;
+        block.z |= ( ( endPoint[1][1].b << 9 ) & 0x00001000 ) | ( ( endPoint[1][1].b << 4 ) & 0x00000040 );*/
+
+        block.x |= ((candidateModeMemory[8] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[8] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[8] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[8] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[8] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0][0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0][0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[1][1].b >> 1) & 1) << 13;
+        block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+        block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0][0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0][0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[1][0].b >> 5) & 1) << 23;
+        block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+        block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0][0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0][0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[1][1].b >> 5) & 1) << 1;
+        block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+        block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[1][1].g >> 4) & 1) << 8;
+        block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+        block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+        block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+        block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+        block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[1][1].b >> 0) & 1) << 18;
+        block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+        block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+        block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+        block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+        block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[0][1].b >> 5) & 1) << 28;
+        block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+        block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+        block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+        block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+        block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+        block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+        block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+        block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+        block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+        block.z |= ((endPoint[1][1].b >> 2) & 1) << 6;
+        block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+        block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+        block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+        block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+        block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+        block.z |= ((endPoint[1][1].b >> 3) & 1) << 12;
+        block.z |= ((partition_index >> 0) & 1) << 13;
+        block.z |= ((partition_index >> 1) & 1) << 14;
+        block.z |= ((partition_index >> 2) & 1) << 15;
+        block.z |= ((partition_index >> 3) & 1) << 16;
+        block.z |= ((partition_index >> 4) & 1) << 17;
+    }
+    else if ( mode_type == candidateModeFlag[9])
+    {
+        /*block.x = candidateModeMemory[9];
+        block.x |= ( ( endPoint[0][0].r << 5 ) & 0x000007E0 ) | ( ( endPoint[0][0].g << 15 ) & 0x001F8000 ) | ( ( endPoint[0][0].b << 25 ) & 0x7E000000 );
+        block.y |= ( ( endPoint[0][1].r << 3 ) & 0x000001F8 ) | ( ( endPoint[0][1].g << 13 ) & 0x0007E000 ) | ( ( endPoint[0][1].b << 23 ) & 0x1F800000 );
+        block.x |= ( ( endPoint[1][0].g << 16 ) & 0x00200000 ) | ( ( endPoint[1][0].g << 20 ) & 0x01000000 );
+        block.x |= ( ( endPoint[1][0].b << 17 ) & 0x00400000 ) | ( ( endPoint[1][0].b << 10 ) & 0x00004000 );
+        block.x |= ( ( endPoint[1][1].b << 21 ) & 0x00800000 ) | ( ( endPoint[1][1].b << 12 ) & 0x00003000 );
+        block.x |= ( ( endPoint[1][1].g << 26 ) & 0x80000000 ) | ( ( endPoint[1][1].g << 7 ) & 0x00000800 );
+        block.yz |= ( endPoint[1][0].gr << uint2(9, 1) ) & uint2(0x00001E00, 0x0000007E);
+        block.yz |= ( endPoint[1][1].gr << uint2(19, 7) ) & uint2(0x00780000, 0x00001F80);
+        block.y |= ( endPoint[1][0].b << 29 ) & 0xE0000000;
+        block.y |= ( ( endPoint[1][1].b >> 4 ) & 0x00000002 ) | ( ( endPoint[1][1].b >> 2 ) & 0x00000004 ) | ( ( endPoint[1][1].b >> 3 ) & 0x00000001 );
+        block.z |= ( endPoint[1][0].b >> 3 ) & 0x00000001;*/
+
+        block.x |= ((candidateModeMemory[9] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[9] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[9] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[9] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[9] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0][0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0][0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0][0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0][0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0][0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0][0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[1][1].g >> 4) & 1) << 11;
+        block.x |= ((endPoint[1][1].b >> 0) & 1) << 12;
+        block.x |= ((endPoint[1][1].b >> 1) & 1) << 13;
+        block.x |= ((endPoint[1][0].b >> 4) & 1) << 14;
+        block.x |= ((endPoint[0][0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0][0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0][0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0][0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0][0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0][0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[1][0].g >> 5) & 1) << 21;
+        block.x |= ((endPoint[1][0].b >> 5) & 1) << 22;
+        block.x |= ((endPoint[1][1].b >> 2) & 1) << 23;
+        block.x |= ((endPoint[1][0].g >> 4) & 1) << 24;
+        block.x |= ((endPoint[0][0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0][0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0][0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0][0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0][0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0][0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[1][1].g >> 5) & 1) << 31;
+        block.y |= ((endPoint[1][1].b >> 3) & 1) << 0;
+        block.y |= ((endPoint[1][1].b >> 5) & 1) << 1;
+        block.y |= ((endPoint[1][1].b >> 4) & 1) << 2;
+        block.y |= ((endPoint[0][1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[0][1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[0][1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[0][1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0][1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[0][1].r >> 5) & 1) << 8;
+        block.y |= ((endPoint[1][0].g >> 0) & 1) << 9;
+        block.y |= ((endPoint[1][0].g >> 1) & 1) << 10;
+        block.y |= ((endPoint[1][0].g >> 2) & 1) << 11;
+        block.y |= ((endPoint[1][0].g >> 3) & 1) << 12;
+        block.y |= ((endPoint[0][1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[0][1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[0][1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[0][1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0][1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[0][1].g >> 5) & 1) << 18;
+        block.y |= ((endPoint[1][1].g >> 0) & 1) << 19;
+        block.y |= ((endPoint[1][1].g >> 1) & 1) << 20;
+        block.y |= ((endPoint[1][1].g >> 2) & 1) << 21;
+        block.y |= ((endPoint[1][1].g >> 3) & 1) << 22;
+        block.y |= ((endPoint[0][1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[0][1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[0][1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[0][1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0][1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[0][1].b >> 5) & 1) << 28;
+        block.y |= ((endPoint[1][0].b >> 0) & 1) << 29;
+        block.y |= ((endPoint[1][0].b >> 1) & 1) << 30;
+        block.y |= ((endPoint[1][0].b >> 2) & 1) << 31;
+        block.z |= ((endPoint[1][0].b >> 3) & 1) << 0;
+        block.z |= ((endPoint[1][0].r >> 0) & 1) << 1;
+        block.z |= ((endPoint[1][0].r >> 1) & 1) << 2;
+        block.z |= ((endPoint[1][0].r >> 2) & 1) << 3;
+        block.z |= ((endPoint[1][0].r >> 3) & 1) << 4;
+        block.z |= ((endPoint[1][0].r >> 4) & 1) << 5;
+        block.z |= ((endPoint[1][0].r >> 5) & 1) << 6;
+        block.z |= ((endPoint[1][1].r >> 0) & 1) << 7;
+        block.z |= ((endPoint[1][1].r >> 1) & 1) << 8;
+        block.z |= ((endPoint[1][1].r >> 2) & 1) << 9;
+        block.z |= ((endPoint[1][1].r >> 3) & 1) << 10;
+        block.z |= ((endPoint[1][1].r >> 4) & 1) << 11;
+        block.z |= ((endPoint[1][1].r >> 5) & 1) << 12;
+        block.z |= ((partition_index >> 0) & 1) << 13;
+        block.z |= ((partition_index >> 1) & 1) << 14;
+        block.z |= ((partition_index >> 2) & 1) << 15;
+        block.z |= ((partition_index >> 3) & 1) << 16;
+        block.z |= ((partition_index >> 4) & 1) << 17;
+    }
+}
+void block_package( inout uint4 block, int2x3 endPoint, uint mode_type ) // for mode 11 - 14
+{
+    /*block.x = ( ( endPoint[0].r << 5 ) & 0x00007FE0 ) | ( ( endPoint[0].g << 15 ) & 0x01FF8000 ) | ( ( endPoint[0].b << 25 ) & 0xFE000000 );
+    block.y |= ( endPoint[0].b >> 7 ) & 0x00000007;*/
+
+    block.xy = 0;
+    block.z &= 0xFFFFFFFE;
+
+
+    if ( mode_type == candidateModeFlag[10])
+    {
+       /* block.x |= candidateModeMemory[10];
+        block.y |= ( ( endPoint[1].r << 3 ) & 0x00001FF8 ) | ( ( endPoint[1].g << 13 ) & 0x007FE000 ) | ( ( endPoint[1].b << 23 ) & 0xFF800000 );
+        block.z |= ( endPoint[1].b >> 9 ) & 0x00000001;*/
+
+        block.x |= ((candidateModeMemory[10] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[10] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[10] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[10] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[10] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[0].r >> 8) & 1) << 13;
+        block.x |= ((endPoint[0].r >> 9) & 1) << 14;
+        block.x |= ((endPoint[0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[0].g >> 8) & 1) << 23;
+        block.x |= ((endPoint[0].g >> 9) & 1) << 24;
+        block.x |= ((endPoint[0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[0].b >> 8) & 1) << 1;
+        block.y |= ((endPoint[0].b >> 9) & 1) << 2;
+        block.y |= ((endPoint[1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[1].r >> 5) & 1) << 8;
+        block.y |= ((endPoint[1].r >> 6) & 1) << 9;
+        block.y |= ((endPoint[1].r >> 7) & 1) << 10;
+        block.y |= ((endPoint[1].r >> 8) & 1) << 11;
+        block.y |= ((endPoint[1].r >> 9) & 1) << 12;
+        block.y |= ((endPoint[1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[1].g >> 5) & 1) << 18;
+        block.y |= ((endPoint[1].g >> 6) & 1) << 19;
+        block.y |= ((endPoint[1].g >> 7) & 1) << 20;
+        block.y |= ((endPoint[1].g >> 8) & 1) << 21;
+        block.y |= ((endPoint[1].g >> 9) & 1) << 22;
+        block.y |= ((endPoint[1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[1].b >> 5) & 1) << 28;
+        block.y |= ((endPoint[1].b >> 6) & 1) << 29;
+        block.y |= ((endPoint[1].b >> 7) & 1) << 30;
+        block.y |= ((endPoint[1].b >> 8) & 1) << 31;
+        block.z |= ((endPoint[1].b >> 9) & 1) << 0;
+    }
+    else if (mode_type == candidateModeFlag[11])
+    {
+        /*block.x |= candidateModeMemory[11];
+        block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 ) | ( ( endPoint[0].g << 12 ) & 0x00400000 );
+        block.y |= ( ( endPoint[1].r << 3 ) & 0x00000FF8 ) | ( ( endPoint[1].g << 13 ) & 0x003FE000 ) | ( ( endPoint[1].b << 23 ) & 0xFF800000 );
+        block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;*/
+
+        block.x |= ((candidateModeMemory[11] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[11] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[11] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[11] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[11] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[0].r >> 8) & 1) << 13;
+        block.x |= ((endPoint[0].r >> 9) & 1) << 14;
+        block.x |= ((endPoint[0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[0].g >> 8) & 1) << 23;
+        block.x |= ((endPoint[0].g >> 9) & 1) << 24;
+        block.x |= ((endPoint[0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[0].b >> 8) & 1) << 1;
+        block.y |= ((endPoint[0].b >> 9) & 1) << 2;
+        block.y |= ((endPoint[1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[1].r >> 5) & 1) << 8;
+        block.y |= ((endPoint[1].r >> 6) & 1) << 9;
+        block.y |= ((endPoint[1].r >> 7) & 1) << 10;
+        block.y |= ((endPoint[1].r >> 8) & 1) << 11;
+        block.y |= ((endPoint[0].r >> 10) & 1) << 12;
+        block.y |= ((endPoint[1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[1].g >> 5) & 1) << 18;
+        block.y |= ((endPoint[1].g >> 6) & 1) << 19;
+        block.y |= ((endPoint[1].g >> 7) & 1) << 20;
+        block.y |= ((endPoint[1].g >> 8) & 1) << 21;
+        block.y |= ((endPoint[0].g >> 10) & 1) << 22;
+        block.y |= ((endPoint[1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[1].b >> 5) & 1) << 28;
+        block.y |= ((endPoint[1].b >> 6) & 1) << 29;
+        block.y |= ((endPoint[1].b >> 7) & 1) << 30;
+        block.y |= ((endPoint[1].b >> 8) & 1) << 31;
+        block.z |= ((endPoint[0].b >> 10) & 1) << 0;
+    }
+    else if (mode_type == candidateModeFlag[12])// violate the spec in  [0].low
+    {
+        /*block.x |= candidateModeMemory[12];
+        block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 ) | ( ( endPoint[0].g << 12 ) & 0x00400000 );
+        block.y |= ( ( endPoint[0].r << 0 ) & 0x00000800 ) | ( ( endPoint[0].g << 10 ) & 0x00200000 );
+        block.y |= ( endPoint[0].b << 20 ) & 0x80000000;
+        block.y |= ( ( endPoint[1].r << 3 ) & 0x000007F8 ) | ( ( endPoint[1].g << 13 ) & 0x001FE000 ) | ( ( endPoint[1].b << 23 ) & 0x7F800000 );
+        block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;*/
+
+        block.x |= ((candidateModeMemory[12] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[12] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[12] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[12] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[12] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[0].r >> 8) & 1) << 13;
+        block.x |= ((endPoint[0].r >> 9) & 1) << 14;
+        block.x |= ((endPoint[0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[0].g >> 8) & 1) << 23;
+        block.x |= ((endPoint[0].g >> 9) & 1) << 24;
+        block.x |= ((endPoint[0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[0].b >> 8) & 1) << 1;
+        block.y |= ((endPoint[0].b >> 9) & 1) << 2;
+        block.y |= ((endPoint[1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[1].r >> 4) & 1) << 7;
+        block.y |= ((endPoint[1].r >> 5) & 1) << 8;
+        block.y |= ((endPoint[1].r >> 6) & 1) << 9;
+        block.y |= ((endPoint[1].r >> 7) & 1) << 10;
+        block.y |= ((endPoint[0].r >> 11) & 1) << 11;
+        block.y |= ((endPoint[0].r >> 10) & 1) << 12;
+        block.y |= ((endPoint[1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[1].g >> 4) & 1) << 17;
+        block.y |= ((endPoint[1].g >> 5) & 1) << 18;
+        block.y |= ((endPoint[1].g >> 6) & 1) << 19;
+        block.y |= ((endPoint[1].g >> 7) & 1) << 20;
+        block.y |= ((endPoint[0].g >> 11) & 1) << 21;
+        block.y |= ((endPoint[0].g >> 10) & 1) << 22;
+        block.y |= ((endPoint[1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[1].b >> 4) & 1) << 27;
+        block.y |= ((endPoint[1].b >> 5) & 1) << 28;
+        block.y |= ((endPoint[1].b >> 6) & 1) << 29;
+        block.y |= ((endPoint[1].b >> 7) & 1) << 30;
+        block.y |= ((endPoint[0].b >> 11) & 1) << 31;
+        block.z |= ((endPoint[0].b >> 10) & 1) << 0;
+    }
+    else if (mode_type == candidateModeFlag[13])
+    {
+        /*block.x |= candidateModeMemory[13];
+        block.y |= ( ( endPoint[0].r >> 8 ) & 0x00000080 );
+        block.y |= ( ( endPoint[0].r >> 6 ) & 0x00000100 );
+        block.y |= ( ( endPoint[0].r >> 4 ) & 0x00000200 );
+        block.y |= ( ( endPoint[0].r >> 2 ) & 0x00000400 );
+        block.y |= ( ( endPoint[0].r >> 0 ) & 0x00000800 );
+        block.y |= ( ( endPoint[0].r << 2 ) & 0x00001000 );
+        block.y |= ( ( endPoint[0].g << 2 ) & 0x00020000 );
+        block.y |= ( ( endPoint[0].g << 4 ) & 0x00040000 );
+        block.y |= ( ( endPoint[0].g << 6 ) & 0x00080000 );
+        block.y |= ( ( endPoint[0].g << 8 ) & 0x00100000 );
+        block.y |= ( ( endPoint[0].g << 10 ) & 0x00200000 );
+        block.y |= ( ( endPoint[0].g << 12 ) & 0x00400000 );
+        block.y |= ( ( endPoint[0].b << 12 ) & 0x08000000 );
+        block.y |= ( ( endPoint[0].b << 14 ) & 0x10000000 );
+        block.y |= ( ( endPoint[0].b << 16 ) & 0x20000000 );
+        block.y |= ( ( endPoint[0].b << 18 ) & 0x40000000 );
+        block.y |= ( ( endPoint[0].b << 20 ) & 0x80000000 );
+        block.y |= ( ( endPoint[1].r << 3 ) & 0x00000078 ) | ( ( endPoint[1].g << 13 ) & 0x0001E000 ) | ( ( endPoint[1].b << 23 ) & 0x07800000 );        
+        block.z |= ( endPoint[0].b >> 10 ) & 0x00000001;*/
+
+        block.x |= ((candidateModeMemory[13] >> 0) & 1) << 0;
+        block.x |= ((candidateModeMemory[13] >> 1) & 1) << 1;
+        block.x |= ((candidateModeMemory[13] >> 2) & 1) << 2;
+        block.x |= ((candidateModeMemory[13] >> 3) & 1) << 3;
+        block.x |= ((candidateModeMemory[13] >> 4) & 1) << 4;
+        block.x |= ((endPoint[0].r >> 0) & 1) << 5;
+        block.x |= ((endPoint[0].r >> 1) & 1) << 6;
+        block.x |= ((endPoint[0].r >> 2) & 1) << 7;
+        block.x |= ((endPoint[0].r >> 3) & 1) << 8;
+        block.x |= ((endPoint[0].r >> 4) & 1) << 9;
+        block.x |= ((endPoint[0].r >> 5) & 1) << 10;
+        block.x |= ((endPoint[0].r >> 6) & 1) << 11;
+        block.x |= ((endPoint[0].r >> 7) & 1) << 12;
+        block.x |= ((endPoint[0].r >> 8) & 1) << 13;
+        block.x |= ((endPoint[0].r >> 9) & 1) << 14;
+        block.x |= ((endPoint[0].g >> 0) & 1) << 15;
+        block.x |= ((endPoint[0].g >> 1) & 1) << 16;
+        block.x |= ((endPoint[0].g >> 2) & 1) << 17;
+        block.x |= ((endPoint[0].g >> 3) & 1) << 18;
+        block.x |= ((endPoint[0].g >> 4) & 1) << 19;
+        block.x |= ((endPoint[0].g >> 5) & 1) << 20;
+        block.x |= ((endPoint[0].g >> 6) & 1) << 21;
+        block.x |= ((endPoint[0].g >> 7) & 1) << 22;
+        block.x |= ((endPoint[0].g >> 8) & 1) << 23;
+        block.x |= ((endPoint[0].g >> 9) & 1) << 24;
+        block.x |= ((endPoint[0].b >> 0) & 1) << 25;
+        block.x |= ((endPoint[0].b >> 1) & 1) << 26;
+        block.x |= ((endPoint[0].b >> 2) & 1) << 27;
+        block.x |= ((endPoint[0].b >> 3) & 1) << 28;
+        block.x |= ((endPoint[0].b >> 4) & 1) << 29;
+        block.x |= ((endPoint[0].b >> 5) & 1) << 30;
+        block.x |= ((endPoint[0].b >> 6) & 1) << 31;
+        block.y |= ((endPoint[0].b >> 7) & 1) << 0;
+        block.y |= ((endPoint[0].b >> 8) & 1) << 1;
+        block.y |= ((endPoint[0].b >> 9) & 1) << 2;
+        block.y |= ((endPoint[1].r >> 0) & 1) << 3;
+        block.y |= ((endPoint[1].r >> 1) & 1) << 4;
+        block.y |= ((endPoint[1].r >> 2) & 1) << 5;
+        block.y |= ((endPoint[1].r >> 3) & 1) << 6;
+        block.y |= ((endPoint[0].r >> 15) & 1) << 7;
+        block.y |= ((endPoint[0].r >> 14) & 1) << 8;
+        block.y |= ((endPoint[0].r >> 13) & 1) << 9;
+        block.y |= ((endPoint[0].r >> 12) & 1) << 10;
+        block.y |= ((endPoint[0].r >> 11) & 1) << 11;
+        block.y |= ((endPoint[0].r >> 10) & 1) << 12;
+        block.y |= ((endPoint[1].g >> 0) & 1) << 13;
+        block.y |= ((endPoint[1].g >> 1) & 1) << 14;
+        block.y |= ((endPoint[1].g >> 2) & 1) << 15;
+        block.y |= ((endPoint[1].g >> 3) & 1) << 16;
+        block.y |= ((endPoint[0].g >> 15) & 1) << 17;
+        block.y |= ((endPoint[0].g >> 14) & 1) << 18;
+        block.y |= ((endPoint[0].g >> 13) & 1) << 19;
+        block.y |= ((endPoint[0].g >> 12) & 1) << 20;
+        block.y |= ((endPoint[0].g >> 11) & 1) << 21;
+        block.y |= ((endPoint[0].g >> 10) & 1) << 22;
+        block.y |= ((endPoint[1].b >> 0) & 1) << 23;
+        block.y |= ((endPoint[1].b >> 1) & 1) << 24;
+        block.y |= ((endPoint[1].b >> 2) & 1) << 25;
+        block.y |= ((endPoint[1].b >> 3) & 1) << 26;
+        block.y |= ((endPoint[0].b >> 15) & 1) << 27;
+        block.y |= ((endPoint[0].b >> 14) & 1) << 28;
+        block.y |= ((endPoint[0].b >> 13) & 1) << 29;
+        block.y |= ((endPoint[0].b >> 12) & 1) << 30;
+        block.y |= ((endPoint[0].b >> 11) & 1) << 31;
+        block.z |= ((endPoint[0].b >> 10) & 1) << 0;
+    }
+}
diff --git a/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl
new file mode 100644
index 000000000..6a57c3862
--- /dev/null
+++ b/tests/hlsl/dxsdk/BC6HBC7EncoderCS/Shaders/BC7Encode.hlsl
@@ -0,0 +1,1908 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: BC7Encode.hlsl
+//
+// The Compute Shader for BC7 Encoder
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//#define REF_DEVICE
+
+#define CHAR_LENGTH			8 // NOTE(review): presumably the number of bits per byte - confirm at the use sites
+#define NCHANNELS			4 // NOTE(review): presumably the channel count of an RGBA texel - confirm at the use sites
+#define	BC7_UNORM			98 // same numeric value as DXGI_FORMAT_BC7_UNORM
+#define MAX_UINT			0xFFFFFFFF // largest value of a 32-bit unsigned integer
+#define MIN_UINT			0 // smallest value of an unsigned integer
+
+static const uint candidateSectionBit[64] = // One 16-bit mask per partition 0-63; NOTE(review): presumably one bit per pixel of the 4x4 block selecting its subset - confirm at the use sites
+{
+    0xCCCC, 0x8888, 0xEEEE, 0xECC8,
+    0xC880, 0xFEEC, 0xFEC8, 0xEC80,
+    0xC800, 0xFFEC, 0xFE80, 0xE800,
+    0xFFE8, 0xFF00, 0xFFF0, 0xF000,
+    0xF710, 0x008E, 0x7100, 0x08CE,
+    0x008C, 0x7310, 0x3100, 0x8CCE,
+    0x088C, 0x3110, 0x6666, 0x366C,
+    0x17E8, 0x0FF0, 0x718E, 0x399C,
+    0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 
+    0x3c3c, 0x55aa, 0x9696, 0xa55a, 
+    0x73ce, 0x13c8, 0x324c, 0x3bdc, 
+    0x6996, 0xc33c, 0x9966, 0x660, 
+    0x272, 0x4e4, 0x4e40, 0x2720, 
+    0xc936, 0x936c, 0x39c6, 0x639c, 
+    0x9336, 0x9cc6, 0x817e, 0xe718, 
+    0xccf0, 0xfcc, 0x7744, 0xee22, 
+};
+static const uint candidateSectionBit2[64] = // One 32-bit mask per partition 64-127 (entry i describes partition 64 + i); NOTE(review): presumably two bits per pixel selecting one of three subsets - confirm at the use sites
+{
+    0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
+    0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
+    0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
+    0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
+    0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
+    0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
+    0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
+    0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
+    0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
+    0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
+    0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
+    0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
+    0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
+    0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
+    0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
+    0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
+};
+static const uint2 candidateFixUpIndex1D[128] = // Fix-up pixel indices per partition: entries 0-63 carry one index in the first component, entries 64-127 carry two
+{
+    {15, 0},{15, 0},{15, 0},{15, 0},
+    {15, 0},{15, 0},{15, 0},{15, 0},
+    {15, 0},{15, 0},{15, 0},{15, 0},
+    {15, 0},{15, 0},{15, 0},{15, 0},
+    {15, 0},{ 2, 0},{ 8, 0},{ 2, 0},
+    { 2, 0},{ 8, 0},{ 8, 0},{15, 0},
+    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+    { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+    
+    {15, 0},{15, 0},{ 6, 0},{ 8, 0},
+    { 2, 0},{ 8, 0},{15, 0},{15, 0},
+    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+    { 2, 0},{15, 0},{15, 0},{ 6, 0},
+    { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0},
+    {15, 0},{15, 0},{ 2, 0},{ 2, 0},
+    {15, 0},{15, 0},{15, 0},{15, 0},
+    {15, 0},{ 2, 0},{ 2, 0},{15, 0},
+    // For i < 64 the second component, candidateFixUpIndex1D[i][1], is only a placeholder and should not be used
+    
+    { 3,15},{ 3, 8},{15, 8},{15, 3},
+    { 8,15},{ 3,15},{15, 3},{15, 8},
+    { 8,15},{ 8,15},{ 6,15},{ 6,15},
+    { 6,15},{ 5,15},{ 3,15},{ 3, 8},
+    { 3,15},{ 3, 8},{ 8,15},{15, 3},
+    { 3,15},{ 3, 8},{ 6,15},{10, 8},
+    { 5, 3},{ 8,15},{ 8, 6},{ 6,10},
+    { 8,15},{ 5,15},{15,10},{15, 8},
+    
+    { 8,15},{15, 3},{ 3,15},{ 5,10},
+    { 6,10},{10, 8},{ 8, 9},{15,10},
+    {15, 6},{ 3,15},{15, 8},{ 5,15},
+    {15, 3},{15, 6},{15, 6},{15, 8}, // The spec doesn't mark the first fix-up index for the entries in this row, so 15 is used for them; this seems to be correct
+    { 3,15},{15, 3},{ 5,15},{ 5,15},
+    { 5,15},{ 8,15},{ 5,15},{10,15},
+    { 5,15},{10,15},{ 8,15},{13,15},
+    {15, 3},{12,15},{ 3,15},{ 3, 8},
+};
+static const uint2 candidateFixUpIndex1DOrdered[128] = // Same as candidateFixUpIndex1D, except that for i >= 64 each pair is sorted in ascending order
+{
+    {15, 0},{15, 0},{15, 0},{15, 0},
+    {15, 0},{15, 0},{15, 0},{15, 0},
+    {15, 0},{15, 0},{15, 0},{15, 0},
+    {15, 0},{15, 0},{15, 0},{15, 0},
+    {15, 0},{ 2, 0},{ 8, 0},{ 2, 0},
+    { 2, 0},{ 8, 0},{ 8, 0},{15, 0},
+    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+    { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+    
+    {15, 0},{15, 0},{ 6, 0},{ 8, 0},
+    { 2, 0},{ 8, 0},{15, 0},{15, 0},
+    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
+    { 2, 0},{15, 0},{15, 0},{ 6, 0},
+    { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0},
+    {15, 0},{15, 0},{ 2, 0},{ 2, 0},
+    {15, 0},{15, 0},{15, 0},{15, 0},
+    {15, 0},{ 2, 0},{ 2, 0},{15, 0},
+    // For i < 64 the second component, candidateFixUpIndex1DOrdered[i][1], is only a placeholder and should not be used
+    
+    { 3,15},{ 3, 8},{ 8,15},{ 3,15},
+    { 8,15},{ 3,15},{ 3,15},{ 8,15},
+    { 8,15},{ 8,15},{ 6,15},{ 6,15},
+    { 6,15},{ 5,15},{ 3,15},{ 3, 8},
+    { 3,15},{ 3, 8},{ 8,15},{ 3,15},
+    { 3,15},{ 3, 8},{ 6,15},{ 8,10},
+    { 3, 5},{ 8,15},{ 6, 8},{ 6,10},
+    { 8,15},{ 5,15},{10,15},{ 8,15},
+    
+    { 8,15},{ 3,15},{ 3,15},{ 5,10},
+    { 6,10},{ 8,10},{ 8, 9},{10,15},
+    { 6,15},{ 3,15},{ 8,15},{ 5,15},
+    { 3,15},{ 6,15},{ 6,15},{ 8,15}, // The spec doesn't mark the first fix-up index for the entries in this row, so 15 is used for them; this seems to be correct
+    { 3,15},{ 3,15},{ 5,15},{ 5,15},
+    { 5,15},{ 8,15},{ 5,15},{10,15},
+    { 5,15},{10,15},{ 8,15},{13,15},
+    { 3,15},{12,15},{ 3,15},{ 3, 8},
+};
+//static const uint4x4 candidateRotation[4] = 
+//{
+//    {1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1},
+//    {0,0,0,1},{0,1,0,0},{0,0,1,0},{1,0,0,0},
+//    {1,0,0,0},{0,0,0,1},{0,0,1,0},{0,1,0,0},
+//    {1,0,0,0},{0,1,0,0},{0,0,0,1},{0,0,1,0}
+//};
+//static const uint2 candidateIndexPrec[8] = {{3,0},{3,0},{2,0},{2,0},
+//                                            {2,3}, //color index and alpha index can exchange
+//                                            {2,2},{4,4},{2,2}};
+
+static const uint aWeight[3][16] = { {0,  4,  9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64},   // row 0: 16 weights for a 4-bit index; weights are out of 64 and blend the two endpoints
+                                    {0,  9, 18, 27, 37, 46, 55, 64,  0,  0,  0,  0,  0,  0,  0,  0},   // row 1: 8 weights for a 3-bit index, padded with zeros
+                                    {0, 21, 43, 64,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0} }; // row 2: 4 weights for a 2-bit index, padded with zeros
+
+                                //4 bit index: 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
+static const uint aStep[3][64] = {  { 0, 0, 0, 1, 1, 1, 1, 2, // aStep[row][t]: index chosen for a position t in 0..63 along the endpoint span; rows use the same numbering as aWeight's rows
+                                    2, 2, 2, 2, 3, 3, 3, 3,
+                                    4, 4, 4, 4, 5, 5, 5, 5,
+                                    6, 6, 6, 6, 6, 7, 7, 7,
+                                    7, 8, 8, 8, 8, 9, 9, 9,
+                                    9,10,10,10,10,10,11,11,
+                                   11,11,12,12,12,12,13,13,
+                                   13,13,14,14,14,14,15,15 },
+                                // row 1 - 3 bit index, weights: 0, 9, 18, 27, 37, 46, 55, 64
+                                    { 0,0,0,0,0,1,1,1,
+                                    1,1,1,1,1,1,2,2,
+                                    2,2,2,2,2,2,2,3,
+                                    3,3,3,3,3,3,3,3,
+                                    3,4,4,4,4,4,4,4,
+                                    4,4,5,5,5,5,5,5,
+                                    5,5,5,6,6,6,6,6,
+                                    6,6,6,6,7,7,7,7 },
+                                // row 2 - 2 bit index, weights: 0, 21, 43, 64
+                                    { 0,0,0,0,0,0,0,0,
+                                    0,0,0,1,1,1,1,1,
+                                    1,1,1,1,1,1,1,1,
+                                    1,1,1,1,1,1,1,1,
+                                    1,2,2,2,2,2,2,2,
+                                    2,2,2,2,2,2,2,2,
+                                    2,2,2,2,2,2,3,3,
+                                    3,3,3,3,3,3,3,3 } };
+
+cbuffer cbCS : register( b0 ) // Shader constants bound to constant-buffer register b0
+{
+    uint g_tex_width;           // NOTE(review): presumably the source texture width in pixels - not referenced in the code shown here; confirm
+    uint g_num_block_x;         // number of blocks per row; used to split a linear block id into (block_x, block_y)
+    uint g_format;              // NOTE(review): presumably a DXGI format value (cf. BC7_UNORM) - confirm at the use sites
+    uint g_mode_id;             // NOTE(review): presumably selects which BC7 mode a pass tries - confirm at the use sites
+    uint g_start_block_id;      // id of the first block of this dispatch; added to the block offset derived from the group id
+    uint g_num_total_blocks;    // threads whose block id is >= this value return early (unless REF_DEVICE is defined)
+    float g_alpha_weight;       // weight applied to the alpha term in ComputeError
+};
+
+// Forward declarations of the per-mode helpers; the trailing comment on each line names the BC7 mode it serves
+uint2x4 compress_endpoints0( inout uint2x4 endPoint, uint2 P ); // Mode 0; P holds one p-bit per endpoint
+uint2x4 compress_endpoints1( inout uint2x4 endPoint, uint2 P ); // Mode 1
+uint2x4 compress_endpoints2( inout uint2x4 endPoint ); // Mode 2
+uint2x4 compress_endpoints3( inout uint2x4 endPoint, uint2 P ); // Mode 3
+uint2x4 compress_endpoints7( inout uint2x4 endPoint, uint2 P ); // Mode 7
+uint2x4 compress_endpoints6( inout uint2x4 endPoint, uint2 P ); // Mode 6
+uint2x4 compress_endpoints4( inout uint2x4 endPoint ); // Mode 4
+uint2x4 compress_endpoints5( inout uint2x4 endPoint ); // Mode 5
+
+void block_package0( out uint4 block, uint partition, uint threadBase ); // Mode 0
+void block_package1( out uint4 block, uint partition, uint threadBase ); // Mode 1
+void block_package2( out uint4 block, uint partition, uint threadBase ); // Mode 2
+void block_package3( out uint4 block, uint partition, uint threadBase ); // Mode 3
+void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase ); // Mode 4
+void block_package5( out uint4 block, uint rotation, uint threadBase ); // Mode 5
+void block_package6( out uint4 block, uint threadBase ); // Mode 6
+void block_package7( out uint4 block, uint partition, uint threadBase ); // Mode 7
+
+
+void swap(inout uint4 lhs, inout uint4 rhs) // Exchanges the values of two uint4 vectors in place.
+{
+    const uint4 previousRhs = rhs; // keep rhs before it is overwritten
+    rhs = lhs;
+    lhs = previousRhs;
+}
+void swap(inout uint3 lhs, inout uint3 rhs) // Exchanges the values of two uint3 vectors in place.
+{
+    const uint3 previousRhs = rhs; // keep rhs before it is overwritten
+    rhs = lhs;
+    lhs = previousRhs;
+}
+void swap(inout uint lhs, inout uint rhs) // Exchanges the values of two uint scalars in place.
+{
+    const uint previousRhs = rhs; // keep rhs before it is overwritten
+    rhs = lhs;
+    lhs = previousRhs;
+}
+
+uint ComputeError(in uint4 a, in uint4 b) // Returns dot(a.rgb, b.rgb) plus the g_alpha_weight-scaled product of the two alpha components.
+{
+    return uint( dot( a.rgb, b.rgb ) + ( g_alpha_weight * a.a ) * b.a ); // the sum is a float; it is converted to uint exactly as the implicit return conversion would do
+}
+
+void Ensure_A_Is_Larger( inout uint4 a, inout uint4 b ) // Per component: a receives the larger value, b the smaller.
+{
+    // Component-wise max/min gives the same result as conditionally
+    // swapping x, y, z and w one at a time. Afterwards a - b cannot
+    // wrap around in any component.
+    const uint4 larger  = max( a, b );
+    const uint4 smaller = min( a, b );
+
+    a = larger;
+    b = smaller;
+}
+
+
+Texture2D g_Input : register( t0 ); // source texture; TryMode456CS loads one texel per thread from mip level 0 and scales it by 255
+StructuredBuffer<uint4> g_InBuff : register( t1 ); // NOTE(review): presumably results of an earlier pass - not read in the code shown here; confirm
+
+RWStructuredBuffer<uint4> g_OutBuff : register( u0 ); // NOTE(review): presumably receives one uint4 per block - confirm at the write sites
+
+#define THREAD_GROUP_SIZE	64 // threads per group (numthreads x dimension) and number of entries in shared_temp
+#define BLOCK_SIZE_Y		4 // block height in pixels
+#define BLOCK_SIZE_X		4 // block width in pixels
+#define BLOCK_SIZE			(BLOCK_SIZE_Y * BLOCK_SIZE_X) // pixels per block
+
+struct BufferShared // Per-thread scratch record; the groupshared array shared_temp holds one per thread
+{
+    uint4 pixel;                    // texel scaled to 0..255 per channel (written from g_Input in TryMode456CS)
+    uint error;                     // accumulated error of the candidate this thread evaluated
+    uint mode;                      // BC7 mode of that candidate
+    uint partition;                 // NOTE(review): presumably the partition id for partitioned modes - not written in the code shown here
+    uint index_selector;            // index-selector bit tried for mode 4
+    uint rotation;                  // component rotation for modes 4 and 5; TryMode456CS stores the mode 6 p value here instead
+    uint4 endPoint_low;             // running per-channel minimum during the block-wide reduction
+    uint4 endPoint_high;            // running per-channel maximum during the block-wide reduction
+    uint4 endPoint_low_quantized;   // NOTE(review): presumably the quantized form of endPoint_low - not used in the code shown here
+    uint4 endPoint_high_quantized;  // NOTE(review): presumably the quantized form of endPoint_high - not used in the code shown here
+};
+groupshared BufferShared shared_temp[THREAD_GROUP_SIZE]; // one slot per thread of the group
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void TryMode456CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // mode 4 5 6 all have 1 subset per block, and fix-up index is always index 0
+{
+    // we process 4 BC blocks per thread group
+    const uint MAX_USED_THREAD = 16;                                                // pixels in a BC (block compressed) block
+    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;                      // the number of BC blocks a thread group processes = 64 / 16 = 4
+    uint blockInGroup = GI / MAX_USED_THREAD;                                       // what BC block this thread is on within this thread group
+    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;    // what global BC block this thread is on
+    uint threadBase = blockInGroup * MAX_USED_THREAD;                               // the first id of the pixel in this BC block in this thread group
+    uint threadInBlock = GI - threadBase;                                           // id of the pixel in this BC block
+
+#ifndef REF_DEVICE
+    if (blockID >= g_num_total_blocks)
+    {
+        return;
+    }
+#endif
+
+    uint block_y = blockID / g_num_block_x;
+    uint block_x = blockID - block_y * g_num_block_x;
+    uint base_x = block_x * BLOCK_SIZE_X;
+    uint base_y = block_y * BLOCK_SIZE_Y;
+    
+    if (threadInBlock < 16)
+    {
+        shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);
+
+        shared_temp[GI].endPoint_low = shared_temp[GI].pixel;
+        shared_temp[GI].endPoint_high = shared_temp[GI].pixel;
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    if (threadInBlock < 8)
+    {
+        shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
+        shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 4)
+    {
+        shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
+        shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 2)
+    {
+        shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
+        shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 1)
+    {
+        shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
+        shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    uint2x4 endPoint;
+    endPoint[0] = shared_temp[threadBase].endPoint_low;
+    endPoint[1] = shared_temp[threadBase].endPoint_high;
+
+    uint error = 0xFFFFFFFF;
+    uint mode = 0;
+    uint index_selector = 0;
+    uint rotation = 0;
+
+    uint2 indexPrec;
+    if (threadInBlock < 8) // all threads of threadInBlock < 8 will be working on trying out mode 4, since only mode 4 has index selector bit
+    {
+        if (0 == (threadInBlock & 1)) // thread 0, 2, 4, 6
+        {
+            //2 represents 2bit index precision; 1 represents 3bit index precision
+            index_selector = 0;
+            indexPrec = uint2( 2, 1 );
+        }
+        else                          // thread 1, 3, 5, 7
+        {
+            //2 represents 2bit index precision; 1 represents 3bit index precision
+            index_selector = 1;
+            indexPrec = uint2( 1, 2 );
+        }
+    }
+    else
+    {
+         //2 represents 2bit index precision
+        indexPrec = uint2( 2, 2 );
+    }
+
+    uint4 pixel_r;
+    uint color_index;
+    uint alpha_index;
+    int4 span;
+    int2 span_norm_sqr;
+    int2 dotProduct;
+    if (threadInBlock < 12) // Try mode 4 5 in threads 0..11
+    {
+        // mode 4 5 have component rotation
+        if ((threadInBlock < 2) || (8 == threadInBlock))       // rotation = 0 in thread 0, 1
+        {
+            rotation = 0;
+        }
+        else if ((threadInBlock < 4) || (9 == threadInBlock))  // rotation = 1 in thread 2, 3
+        {
+            endPoint[0].ra = endPoint[0].ar;
+            endPoint[1].ra = endPoint[1].ar;
+
+            rotation = 1;
+        }
+        else if ((threadInBlock < 6) || (10 == threadInBlock)) // rotation = 2 in thread 4, 5
+        {
+            endPoint[0].ga = endPoint[0].ag;
+            endPoint[1].ga = endPoint[1].ag;
+
+            rotation = 2;
+        }
+        else if ((threadInBlock < 8) || (11 == threadInBlock)) // rotation = 3 in thread 6, 7
+        {
+            endPoint[0].ba = endPoint[0].ab;
+            endPoint[1].ba = endPoint[1].ab;
+
+            rotation = 3;
+        }
+
+        if (threadInBlock < 8)  // try mode 4 in threads 0..7
+        {
+            // mode 4 thread distribution
+            // Thread           0	1	2	3	4	5	6	7
+            // Rotation	        0	0	1	1	2	2	3	3
+            // Index selector   0	1	0	1	0	1	0	1
+
+            mode = 4;
+            compress_endpoints4( endPoint );
+        }
+        else                    // try mode 5 in threads 8..11
+        {
+            // mode 5 thread distribution
+            // Thread	 8	9  10  11
+            // Rotation	 0	1   2   3
+
+            mode = 5;
+            compress_endpoints5( endPoint );
+        }
+
+        uint4 pixel = shared_temp[threadBase + 0].pixel;
+        if (1 == rotation)
+        {
+            pixel.ra = pixel.ar;
+        }
+        else if (2 == rotation)
+        {
+            pixel.ga = pixel.ag;
+        }
+        else if (3 == rotation)
+        {
+            pixel.ba = pixel.ab;
+        }
+
+        span = endPoint[1] - endPoint[0];
+        span_norm_sqr = uint2( dot( span.rgb, span.rgb ), span.a * span.a );
+        
+        // in mode 4 5 6, end point 0 must be closer to pixel 0 than end point 1, because of the fix-up index is always index 0
+        // TODO: this shouldn't be necessary here in error calculation
+        /*
+        dotProduct = int2( dot( span.rgb, pixel.rgb - endPoint[0].rgb ), span.a * ( pixel.a - endPoint[0].a ) );
+        if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
+        {
+            span.rgb = -span.rgb;
+            swap(endPoint[0].rgb, endPoint[1].rgb);
+        }
+        if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && uint( dotProduct.y * 63.49999 ) > uint( 32 * span_norm_sqr.y ) )
+        {
+            span.a = -span.a;
+            swap(endPoint[0].a, endPoint[1].a);
+        }
+        */
+	
+        // should be the same as above
+        dotProduct = int2( dot( pixel.rgb - endPoint[0].rgb, pixel.rgb - endPoint[0].rgb ), dot( pixel.rgb - endPoint[1].rgb, pixel.rgb - endPoint[1].rgb ) );
+        if ( dotProduct.x > dotProduct.y )
+        {
+            span.rgb = -span.rgb;
+            swap(endPoint[0].rgb, endPoint[1].rgb);
+        }
+        dotProduct = int2( dot( pixel.a - endPoint[0].a, pixel.a - endPoint[0].a ), dot( pixel.a - endPoint[1].a, pixel.a - endPoint[1].a ) );
+        if ( dotProduct.x > dotProduct.y )
+        {
+            span.a = -span.a;
+            swap(endPoint[0].a, endPoint[1].a);
+        }
+
+        error = 0;
+        for ( uint i = 0; i < 16; i ++ )
+        {
+            pixel = shared_temp[threadBase + i].pixel;
+            if (1 == rotation)
+            {
+                pixel.ra = pixel.ar;
+            }
+            else if (2 == rotation)
+            {
+                pixel.ga = pixel.ag;
+            }
+            else if (3 == rotation)
+            {
+                pixel.ba = pixel.ab;
+            }
+
+            dotProduct.x = dot( span.rgb, pixel.rgb - endPoint[0].rgb );
+            color_index = ( span_norm_sqr.x <= 0 /*endPoint[0] == endPoint[1]*/ || dotProduct.x <= 0 /*pixel == endPoint[0]*/ ) ? 0
+                : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[indexPrec.x][ uint( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] );
+            dotProduct.y = dot( span.a, pixel.a - endPoint[0].a );
+            alpha_index = ( span_norm_sqr.y <= 0 || dotProduct.y <= 0 ) ? 0
+                : ( ( dotProduct.y < span_norm_sqr.y ) ? aStep[indexPrec.y][ uint( dotProduct.y * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );
+
+            // the same color_index and alpha_index should be used for reconstruction, so this should be left commented out
+            /*if (index_selector)
+            {
+                swap(color_index, alpha_index);
+            }*/
+
+            pixel_r.rgb = ( ( 64 - aWeight[indexPrec.x][color_index] ) * endPoint[0].rgb +
+                            aWeight[indexPrec.x][color_index] * endPoint[1].rgb + 
+                            32 ) >> 6;
+            pixel_r.a = ( ( 64 - aWeight[indexPrec.y][alpha_index] ) * endPoint[0].a + 
+                          aWeight[indexPrec.y][alpha_index] * endPoint[1].a + 
+                          32 ) >> 6;
+
+            Ensure_A_Is_Larger( pixel_r, pixel );
+            pixel_r -= pixel;
+            if (1 == rotation)
+            {
+                pixel_r.ra = pixel_r.ar;
+            }
+            else if (2 == rotation)
+            {
+                pixel_r.ga = pixel_r.ag;
+            }
+            else if (3 == rotation)
+            {
+                pixel_r.ba = pixel_r.ab;
+            }
+            error += ComputeError(pixel_r, pixel_r);
+        }
+    }
+    else if (threadInBlock < 16) // Try mode 6 in threads 12..15, since in mode 4 5 6, only mode 6 has p bit
+    {
+        uint p = threadInBlock - 12;
+
+        compress_endpoints6( endPoint, uint2(p >> 0, p >> 1) & 1 );
+
+        uint4 pixel = shared_temp[threadBase + 0].pixel;
+
+        span = endPoint[1] - endPoint[0];
+        span_norm_sqr = dot( span, span );
+        dotProduct = dot( span, pixel - endPoint[0] );
+        if ( span_norm_sqr.x > 0 && dotProduct.x >= 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
+        {
+            span = -span;
+            swap(endPoint[0], endPoint[1]);
+        }
+            
+        error = 0;
+        for ( uint i = 0; i < 16; i ++ )
+        {
+            pixel = shared_temp[threadBase + i].pixel;
+            
+            dotProduct.x = dot( span, pixel - endPoint[0] );
+            color_index = ( span_norm_sqr.x <= 0 || dotProduct.x <= 0 ) ? 0
+                : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[0][ uint( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[0][63] );
+            
+            pixel_r = ( ( 64 - aWeight[0][color_index] ) * endPoint[0]
+                + aWeight[0][color_index] * endPoint[1] + 32 ) >> 6;
+        
+            Ensure_A_Is_Larger( pixel_r, pixel );
+            pixel_r -= pixel;
+            error += ComputeError(pixel_r, pixel_r);
+        }
+
+        mode = 6;
+        rotation = p;    // Borrow rotation for p
+    }
+
+    shared_temp[GI].error = error;
+    shared_temp[GI].mode = mode;
+    shared_temp[GI].index_selector = index_selector;
+    shared_temp[GI].rotation = rotation;
+
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    if (threadInBlock < 8)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 8].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 8].error;
+            shared_temp[GI].mode = shared_temp[GI + 8].mode;
+            shared_temp[GI].index_selector = shared_temp[GI + 8].index_selector;
+            shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 4)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 4].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 4].error;
+            shared_temp[GI].mode = shared_temp[GI + 4].mode;
+            shared_temp[GI].index_selector = shared_temp[GI + 4].index_selector;
+            shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 2)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 2].error;
+            shared_temp[GI].mode = shared_temp[GI + 2].mode;
+            shared_temp[GI].index_selector = shared_temp[GI + 2].index_selector;
+            shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 1)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 1].error;
+            shared_temp[GI].mode = shared_temp[GI + 1].mode;
+            shared_temp[GI].index_selector = shared_temp[GI + 1].index_selector;
+            shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
+        }
+
+        g_OutBuff[blockID] = uint4(shared_temp[GI].error, (shared_temp[GI].index_selector << 31) | shared_temp[GI].mode,
+            0, shared_temp[GI].rotation); // rotation is indeed rotation for mode 4 5. for mode 6, rotation is p bit
+    }
+}
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )] // Purpose: per 4x4 block, try 64 two-subset partitions (one per thread) with every p-bit combination for g_mode_id and keep the lowest-error result.
+void TryMode137CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // mode 1 3 7 all have 2 subsets per block
+{
+    const uint MAX_USED_THREAD = 64; // threads cooperating on one block; each evaluates one partition (partition = threadInBlock below)
+    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // number of blocks handled by one thread group
+    uint blockInGroup = GI / MAX_USED_THREAD; // which of the group's blocks this thread works on
+    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; // linear block index used for g_InBuff / g_OutBuff
+    uint threadBase = blockInGroup * MAX_USED_THREAD; // shared_temp slot of this block's first thread
+    uint threadInBlock = GI - threadBase; // 0..63 within this block's set of threads
+    // Turn the linear block index into the coordinate of the block's first pixel.
+    uint block_y = blockID / g_num_block_x;
+    uint block_x = blockID - block_y * g_num_block_x;
+    uint base_x = block_x * BLOCK_SIZE_X;
+    uint base_y = block_y * BLOCK_SIZE_Y;
+    // The first 16 threads each load one pixel of the 4x4 block, scale it by 255, convert to uint4 and clamp to [0, 255].
+    if (threadInBlock < 16)
+    {
+        shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);
+    }
+    GroupMemoryBarrierWithGroupSync(); // all 16 pixels must be stored before any thread reads them back
+
+    shared_temp[GI].error = 0xFFFFFFFF; // start at the largest 32-bit value; replaced by the measured error below
+
+    uint4 pixel_r;              // reconstructed pixel, then reused to hold its difference from the source pixel
+    uint2x4 endPoint[2];        // endPoint[0..1 for subset id][0..1 for low and high in the subset]
+    uint2x4 endPointBackup[2];  // unquantized bounds, restored at the start of every p trial
+    uint color_index;
+    if (threadInBlock < 64) // always true here, since threadInBlock is below MAX_USED_THREAD (64)
+    {
+        uint partition = threadInBlock;
+        // Begin with inverted bounds (low = MAX_UINT, high = MIN_UINT) so the min/max below tighten them per subset.
+        endPoint[0][0] = MAX_UINT;
+        endPoint[0][1] = MIN_UINT;
+        endPoint[1][0] = MAX_UINT;
+        endPoint[1][1] = MIN_UINT;
+        uint bits = candidateSectionBit[partition]; // bit i gives the subset (0 or 1) that pixel i belongs to
+        for ( uint i = 0; i < 16; i ++ )
+        {
+            uint4 pixel = shared_temp[threadBase + i].pixel;
+            if ( (( bits >> i ) & 0x01) == 1 )
+            {
+                endPoint[1][0] = min( endPoint[1][0], pixel );
+                endPoint[1][1] = max( endPoint[1][1], pixel );
+            }
+            else
+            {
+                endPoint[0][0] = min( endPoint[0][0], pixel );
+                endPoint[0][1] = max( endPoint[0][1], pixel );
+            }
+        }
+        // Save the per-channel min/max of each subset; every p trial starts again from these values.
+        endPointBackup[0] = endPoint[0];
+        endPointBackup[1] = endPoint[1];
+
+        uint max_p; // number of p-bit combinations to try
+        if (1 == g_mode_id)
+        {
+            // in mode 1, there is only one p bit per subset
+            max_p = 4;
+        }
+        else
+        {
+            // in mode 3 7, there are two p bits per subset, one for each end point
+            max_p = 16;
+        }
+
+        uint rotation = 0;      // p value that produced the lowest error so far (stored in the rotation field)
+        uint error = MAX_UINT;  // lowest error over the p values tried so far
+        for ( uint p = 0; p < max_p; p ++ )
+        {
+            endPoint[0] = endPointBackup[0];
+            endPoint[1] = endPointBackup[1];
+
+            for ( i = 0; i < 2; i ++ ) // loop through 2 subsets; i is the variable declared by the first for loop above
+            {
+                if (g_mode_id == 1)
+                {
+                    compress_endpoints1( endPoint[i], (p >> i) & 1 ); // bit i of p is the p bit of subset i
+                }
+                else if (g_mode_id == 3)
+                {
+                    compress_endpoints3( endPoint[i], uint2(p >> (i * 2 + 0), p >> (i * 2 + 1)) & 1 ); // bits 2i and 2i+1 of p
+                }
+                else if (g_mode_id == 7)
+                {
+                    compress_endpoints7( endPoint[i], uint2(p >> (i * 2 + 0), p >> (i * 2 + 1)) & 1 ); // bits 2i and 2i+1 of p
+                }
+            }
+            // Vector from the low to the high endpoint of each subset.
+            int4 span[2];
+            span[0] = endPoint[0][1] - endPoint[0][0];
+            span[1] = endPoint[1][1] - endPoint[1][0];
+
+            if (g_mode_id != 7) // alpha takes part in the projection only when the mode is 7
+            {
+                span[0].w = span[1].w = 0;
+            }
+
+            int span_norm_sqr[2]; // squared length of each span
+            span_norm_sqr[0] = dot( span[0], span[0] );
+            span_norm_sqr[1] = dot( span[1], span[1] );
+
+            // TODO: again, this shouldn't be necessary here in error calculation
+            int dotProduct = dot( span[0], shared_temp[threadBase + 0].pixel - endPoint[0][0] ); // subset 0 is tested against pixel 0
+            if ( span_norm_sqr[0] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[0] ) )
+            { // pixel 0 projects past roughly the midpoint of the span: reverse the subset's endpoints
+                span[0] = -span[0];
+                swap(endPoint[0][0], endPoint[0][1]);
+            }
+            dotProduct = dot( span[1], shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel - endPoint[1][0] ); // subset 1 is tested against the pixel named by candidateFixUpIndex1D
+            if ( span_norm_sqr[1] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[1] ) )
+            { // same reversal for subset 1
+                span[1] = -span[1];
+                swap(endPoint[1][0], endPoint[1][1]);
+            }
+
+            uint step_selector; // first index into the aStep / aWeight tables
+            if (g_mode_id != 1)
+            {
+                step_selector = 2;  // mode 3 7 have 2 bit index
+            }
+            else
+            {
+                step_selector = 1;  // mode 1 has 3 bit index
+            }
+
+            uint p_error = 0; // error of this p value summed over the 16 pixels
+            for ( i = 0; i < 16; i ++ )
+            {
+                if (((bits >> i) & 0x01) == 1) // pixel i is in subset 1
+                {
+                    dotProduct = dot( span[1], shared_temp[threadBase + i].pixel - endPoint[1][0] ); // index is 0 for a zero-length span or non-positive projection, aStep[..][63] at or past the high endpoint
+                    color_index = (span_norm_sqr[1] <= 0 || dotProduct <= 0) ? 0
+                        : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[1])] : aStep[step_selector][63]);
+                }
+                else // pixel i is in subset 0
+                {
+                    dotProduct = dot( span[0], shared_temp[threadBase + i].pixel - endPoint[0][0] );
+                    color_index = (span_norm_sqr[0] <= 0 || dotProduct <= 0) ? 0
+                        : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[0])] : aStep[step_selector][63]);
+                }
+
+                uint subset_index = (bits >> i) & 0x01;
+                // Rebuild the pixel as a weighted blend of the subset's two endpoints, rounded by adding 32 before the shift by 6.
+                pixel_r = ((64 - aWeight[step_selector][color_index]) * endPoint[subset_index][0]
+                    + aWeight[step_selector][color_index] * endPoint[subset_index][1] + 32) >> 6;
+                if (g_mode_id != 7)
+                {
+                    pixel_r.a = 255; // the reconstructed alpha is forced to 255 unless the mode is 7
+                }
+
+                uint4 pixel = shared_temp[threadBase + i].pixel;
+                Ensure_A_Is_Larger( pixel_r, pixel ); // defined outside this chunk; presumably orders the pair so the unsigned subtraction below cannot wrap - TODO confirm
+                pixel_r -= pixel;
+                p_error += ComputeError(pixel_r, pixel_r); // ComputeError is defined outside this chunk; both arguments are the difference vector
+            }
+
+            if (p_error < error) // strict '<': on a tie the earlier p is kept
+            {
+                error = p_error;
+                rotation = p;
+            }
+        }
+
+        shared_temp[GI].error = error;
+        shared_temp[GI].mode = g_mode_id;
+        shared_temp[GI].partition = partition;
+        shared_temp[GI].rotation = rotation; // mode 1 3 7 don't have rotation, we use rotation for p bits
+    }
+    GroupMemoryBarrierWithGroupSync(); // every thread's result must be stored before the reduction reads it
+    // Minimum-error reduction over the block's 64 slots; the stride halves each step (32, 16, 8, 4, 2, 1).
+    if (threadInBlock < 32)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 32].error ) // strict '>': on a tie the lower slot is kept
+        {
+            shared_temp[GI].error = shared_temp[GI + 32].error;
+            shared_temp[GI].mode = shared_temp[GI + 32].mode;
+            shared_temp[GI].partition = shared_temp[GI + 32].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 32].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync(); // compiled only when REF_DEVICE is defined; otherwise no barrier separates the reduction steps - NOTE(review): confirm this is safe on the target hardware
+#endif
+if (threadInBlock < 16)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 16].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 16].error;
+            shared_temp[GI].mode = shared_temp[GI + 16].mode;
+            shared_temp[GI].partition = shared_temp[GI + 16].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 16].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 8)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 8].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 8].error;
+            shared_temp[GI].mode = shared_temp[GI + 8].mode;
+            shared_temp[GI].partition = shared_temp[GI + 8].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 4)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 4].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 4].error;
+            shared_temp[GI].mode = shared_temp[GI + 4].mode;
+            shared_temp[GI].partition = shared_temp[GI + 4].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 2)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 2].error;
+            shared_temp[GI].mode = shared_temp[GI + 2].mode;
+            shared_temp[GI].partition = shared_temp[GI + 2].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 1) // thread 0 finishes the reduction and writes the block's result
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 1].error;
+            shared_temp[GI].mode = shared_temp[GI + 1].mode;
+            shared_temp[GI].partition = shared_temp[GI + 1].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
+        }
+        // Output this pass's best only if it beats the error already recorded in g_InBuff for the block.
+        if (g_InBuff[blockID].x > shared_temp[GI].error)
+        {
+            g_OutBuff[blockID] = uint4(shared_temp[GI].error, shared_temp[GI].mode, shared_temp[GI].partition, shared_temp[GI].rotation); // mode 1 3 7 don't have rotation, we use rotation for p bits
+        }
+        else
+        {
+            g_OutBuff[blockID] = g_InBuff[blockID]; // the recorded result is at least as good: copy it through unchanged
+        }
+    }
+}
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )] // Purpose: per 4x4 block, try the three-subset partitions (one per thread) with every p-bit combination for g_mode_id and keep the lowest-error result.
+void TryMode02CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // mode 0 2 have 3 subsets per block
+{
+    const uint MAX_USED_THREAD = 64; // shared_temp slots reserved per block; only the first num_partitions threads evaluate a partition
+    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD; // number of blocks handled by one thread group
+    uint blockInGroup = GI / MAX_USED_THREAD; // which of the group's blocks this thread works on
+    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup; // linear block index used for g_InBuff / g_OutBuff
+    uint threadBase = blockInGroup * MAX_USED_THREAD; // shared_temp slot of this block's first thread
+    uint threadInBlock = GI - threadBase; // 0..63 within this block's set of threads
+    // Turn the linear block index into the coordinate of the block's first pixel.
+    uint block_y = blockID / g_num_block_x;
+    uint block_x = blockID - block_y * g_num_block_x;
+    uint base_x = block_x * BLOCK_SIZE_X;
+    uint base_y = block_y * BLOCK_SIZE_Y;
+    // The first 16 threads each load one pixel of the 4x4 block, scale it by 255, convert to uint4 and clamp to [0, 255].
+    if (threadInBlock < 16)
+    {
+        shared_temp[GI].pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);
+    }
+    GroupMemoryBarrierWithGroupSync(); // all 16 pixels must be stored before any thread reads them back
+
+    shared_temp[GI].error = 0xFFFFFFFF; // threads at or above num_partitions never overwrite this, so their slots cannot win the reduction
+
+    uint num_partitions; // how many partitions are evaluated: 16 when g_mode_id is 0, otherwise 64
+    if (0 == g_mode_id)
+    {
+        num_partitions = 16;
+    }
+    else
+    {
+        num_partitions = 64;
+    }
+
+    uint4 pixel_r;              // reconstructed pixel, then reused to hold its difference from the source pixel
+    uint2x4 endPoint[3];        // endPoint[0..2 for subset id][0..1 for low and high in the subset]
+    uint2x4 endPointBackup[3];  // unquantized bounds, restored at the start of every p trial
+    uint color_index[16];       // index chosen per pixel; only the current pixel's entry is read back in this function
+    if (threadInBlock < num_partitions)
+    {
+        uint partition = threadInBlock + 64; // offset by 64; candidateFixUpIndex1D is indexed with the offset, candidateSectionBit2 without it
+        // Begin with inverted bounds (low = MAX_UINT, high = MIN_UINT) so the min/max below tighten them per subset.
+        endPoint[0][0] = MAX_UINT;
+        endPoint[0][1] = MIN_UINT;
+        endPoint[1][0] = MAX_UINT;
+        endPoint[1][1] = MIN_UINT;
+        endPoint[2][0] = MAX_UINT;
+        endPoint[2][1] = MIN_UINT;
+        uint bits2 = candidateSectionBit2[partition - 64]; // two bits per pixel: bits 2i and 2i+1 give the subset of pixel i
+        for ( uint i = 0; i < 16; i ++ )
+        {
+            uint4 pixel = shared_temp[threadBase + i].pixel;
+            uint subset_index = ( bits2 >> ( i * 2 ) ) & 0x03;
+            if ( subset_index == 2 )
+            {
+                endPoint[2][0] = min( endPoint[2][0], pixel );
+                endPoint[2][1] = max( endPoint[2][1], pixel );
+            }
+            else if ( subset_index == 1 )
+            {
+                endPoint[1][0] = min( endPoint[1][0], pixel );
+                endPoint[1][1] = max( endPoint[1][1], pixel );
+            }
+            else // any other value is treated as subset 0
+            {
+                endPoint[0][0] = min( endPoint[0][0], pixel );
+                endPoint[0][1] = max( endPoint[0][1], pixel );
+            }
+        }
+        // Save the per-channel min/max of each subset; every p trial starts again from these values.
+        endPointBackup[0] = endPoint[0];
+        endPointBackup[1] = endPoint[1];
+        endPointBackup[2] = endPoint[2];
+
+        uint max_p; // number of p-bit combinations to try
+        if (0 == g_mode_id)
+        {
+            max_p = 64; // changed from 32 to 64: 3 subsets x 2 p bits each = 6 bits of p
+        }
+        else
+        {
+            max_p = 1; // compress_endpoints2 takes no p bits, so a single pass is enough
+        }
+
+        uint rotation = 0;      // p value that produced the lowest error so far (stored in the rotation field)
+        uint error = MAX_UINT;  // lowest error over the p values tried so far
+        for ( uint p = 0; p < max_p; p ++ )
+        {
+            endPoint[0] = endPointBackup[0];
+            endPoint[1] = endPointBackup[1];
+            endPoint[2] = endPointBackup[2];
+
+            for ( i = 0; i < 3; i ++ ) // one pass per subset; i is the variable declared by the first for loop above
+            {
+                if (0 == g_mode_id)
+                {
+                    compress_endpoints0( endPoint[i], uint2(p >> (i * 2 + 0), p >> (i * 2 + 1)) & 1 ); // bits 2i and 2i+1 of p
+                }
+                else
+                {
+                    compress_endpoints2( endPoint[i] );
+                }
+            }
+
+            uint step_selector = 1 + (2 == g_mode_id); // first index into aStep / aWeight: 2 when g_mode_id is 2, otherwise 1
+
+            int4 span[3]; // vector from the low to the high endpoint of each subset
+            span[0] = endPoint[0][1] - endPoint[0][0];
+            span[1] = endPoint[1][1] - endPoint[1][0];
+            span[2] = endPoint[2][1] - endPoint[2][0];
+            span[0].w = span[1].w = span[2].w = 0; // alpha never takes part in the projection in this function
+            int span_norm_sqr[3]; // squared length of each span
+            span_norm_sqr[0] = dot( span[0], span[0] );
+            span_norm_sqr[1] = dot( span[1], span[1] );
+            span_norm_sqr[2] = dot( span[2], span[2] );
+
+            // TODO: again, this shouldn't be necessary here in error calculation
+            uint ci[3] = { 0, candidateFixUpIndex1D[partition].x, candidateFixUpIndex1D[partition].y }; // pixel each subset is tested against
+            for (i = 0; i < 3; i ++)
+            {
+                int dotProduct = dot( span[i], shared_temp[threadBase + ci[i]].pixel - endPoint[i][0] );
+                if ( span_norm_sqr[i] > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr[i] ) )
+                { // the tested pixel projects past roughly the midpoint of the span: reverse the subset's endpoints
+                    span[i] = -span[i];
+                    swap(endPoint[i][0], endPoint[i][1]);
+                }
+            }
+
+            uint p_error = 0; // error of this p value summed over the 16 pixels
+            for ( i = 0; i < 16; i ++ )
+            {
+                uint subset_index = ( bits2 >> ( i * 2 ) ) & 0x03;
+                if ( subset_index == 2 )
+                {
+                    int dotProduct = dot( span[2], shared_temp[threadBase + i].pixel - endPoint[2][0] ); // index is 0 for a zero-length span or non-positive projection, aStep[..][63] at or past the high endpoint
+                    color_index[i] = ( span_norm_sqr[2] <= 0 || dotProduct <= 0 ) ? 0
+                        : ( ( dotProduct < span_norm_sqr[2] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[2] ) ] : aStep[step_selector][63] );
+                }
+                else if ( subset_index == 1 )
+                {
+                    int dotProduct = dot( span[1], shared_temp[threadBase + i].pixel - endPoint[1][0] );
+                    color_index[i] = ( span_norm_sqr[1] <= 0 || dotProduct <= 0 ) ? 0
+                        : ( ( dotProduct < span_norm_sqr[1] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[1] ) ] : aStep[step_selector][63] );
+                }
+                else
+                {
+                    int dotProduct = dot( span[0], shared_temp[threadBase + i].pixel - endPoint[0][0] );
+                    color_index[i] = ( span_norm_sqr[0] <= 0 || dotProduct <= 0 ) ? 0
+                        : ( ( dotProduct < span_norm_sqr[0] ) ? aStep[step_selector][ uint( dotProduct * 63.49999 / span_norm_sqr[0] ) ] : aStep[step_selector][63] );
+                }
+                // Rebuild the pixel as a weighted blend of the subset's two endpoints, rounded by adding 32 before the shift by 6.
+                pixel_r = ( ( 64 - aWeight[step_selector][color_index[i]] ) * endPoint[subset_index][0]
+                    + aWeight[step_selector][color_index[i]] * endPoint[subset_index][1] + 32 ) >> 6;
+                pixel_r.a = 255; // the reconstructed alpha is always forced to 255 in this function
+
+                uint4 pixel = shared_temp[threadBase + i].pixel;
+                Ensure_A_Is_Larger( pixel_r, pixel ); // defined outside this chunk; presumably orders the pair so the unsigned subtraction below cannot wrap - TODO confirm
+                pixel_r -= pixel;
+                p_error += ComputeError(pixel_r, pixel_r); // ComputeError is defined outside this chunk; both arguments are the difference vector
+            }
+
+            if (p_error < error) // strict '<': on a tie the earlier p is kept
+            {
+                error = p_error;
+                rotation = p;    // Borrow rotation for p
+            }
+        }
+
+        shared_temp[GI].error = error;
+        shared_temp[GI].partition = partition;
+        shared_temp[GI].rotation = rotation;
+    }
+    GroupMemoryBarrierWithGroupSync(); // every thread's result must be stored before the reduction reads it
+    // Minimum-error reduction over the block's 64 slots; the stride halves each step (32, 16, 8, 4, 2, 1).
+    if (threadInBlock < 32)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 32].error ) // strict '>': on a tie the lower slot is kept
+        {
+            shared_temp[GI].error = shared_temp[GI + 32].error;
+            shared_temp[GI].partition = shared_temp[GI + 32].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 32].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync(); // compiled only when REF_DEVICE is defined; otherwise no barrier separates the reduction steps - NOTE(review): confirm this is safe on the target hardware
+#endif
+    if (threadInBlock < 16)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 16].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 16].error;
+            shared_temp[GI].partition = shared_temp[GI + 16].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 16].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 8)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 8].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 8].error;
+            shared_temp[GI].partition = shared_temp[GI + 8].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 4)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 4].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 4].error;
+            shared_temp[GI].partition = shared_temp[GI + 4].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 2)
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 2].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 2].error;
+            shared_temp[GI].partition = shared_temp[GI + 2].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
+        }
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+    if (threadInBlock < 1) // thread 0 finishes the reduction and writes the block's result
+    {
+        if ( shared_temp[GI].error > shared_temp[GI + 1].error )
+        {
+            shared_temp[GI].error = shared_temp[GI + 1].error;
+            shared_temp[GI].partition = shared_temp[GI + 1].partition;
+            shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
+        }
+        // Output this pass's best only if it beats the error already recorded in g_InBuff for the block.
+        if (g_InBuff[blockID].x > shared_temp[GI].error)
+        {
+            g_OutBuff[blockID] = uint4(shared_temp[GI].error, g_mode_id, shared_temp[GI].partition, shared_temp[GI].rotation); // rotation is actually p bit for mode 0. for mode 2, rotation is always 0
+        }
+        else
+        {
+            g_OutBuff[blockID] = g_InBuff[blockID]; // the recorded result is at least as good: copy it through unchanged
+        }
+    }
+}
+
+[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
+void EncodeBlockCS(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID)
+{
+    const uint MAX_USED_THREAD = 16;
+    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
+    uint blockInGroup = GI / MAX_USED_THREAD;
+    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
+    uint threadBase = blockInGroup * MAX_USED_THREAD;
+    uint threadInBlock = GI - threadBase;
+
+#ifndef REF_DEVICE
+    if (blockID >= g_num_total_blocks)
+    {
+        return;
+    }
+#endif
+
+    uint block_y = blockID / g_num_block_x;
+    uint block_x = blockID - block_y * g_num_block_x;
+    uint base_x = block_x * BLOCK_SIZE_X;
+    uint base_y = block_y * BLOCK_SIZE_Y;
+
+    uint mode = g_InBuff[blockID].y & 0x7FFFFFFF;
+    uint partition = g_InBuff[blockID].z;
+    uint index_selector = (g_InBuff[blockID].y >> 31) & 1;
+    uint rotation = g_InBuff[blockID].w;
+
+    if (threadInBlock < 16)
+    {
+        uint4 pixel = clamp(uint4(g_Input.Load( uint3( base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0 ) ) * 255), 0, 255);
+
+        if ((4 == mode) || (5 == mode))
+        {
+            if (1 == rotation)
+            {
+                pixel.ra = pixel.ar;
+            }
+            else if (2 == rotation)
+            {
+                pixel.ga = pixel.ag;
+            }
+            else if (3 == rotation)
+            {
+                pixel.ba = pixel.ab;
+            }
+        }
+
+        shared_temp[GI].pixel = pixel;
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    uint bits = candidateSectionBit[partition];
+    uint bits2 = candidateSectionBit2[partition - 64];
+
+    uint2x4 ep;
+    uint2x4 ep_quantized;
+    [unroll]
+    for (int ii = 2; ii >= 0; -- ii)
+    {
+        if (threadInBlock < 16)
+        {
+            uint2x4 ep;
+            ep[0] = MAX_UINT;
+            ep[1] = MIN_UINT;
+
+            uint4 pixel = shared_temp[GI].pixel;
+
+            uint subset_index = ( bits >> threadInBlock ) & 0x01;
+            uint subset_index2 = ( bits2 >> ( threadInBlock * 2 ) ) & 0x03;
+            if (0 == ii)
+            {
+                if ((0 == mode) || (2 == mode))
+                {
+                    if (0 == subset_index2)
+                    {
+                        ep[0] = ep[1] = pixel;
+                    }
+                }
+                else if ((1 == mode) || (3 == mode) || (7 == mode))
+                {
+                    if (0 == subset_index)
+                    {
+                        ep[0] = ep[1] = pixel;
+                    }
+                }
+                else if ((4 == mode) || (5 == mode) || (6 == mode))
+                {
+                    ep[0] = ep[1] = pixel;
+                }
+            }
+            else if (1 == ii)
+            {
+                if ((0 == mode) || (2 == mode))
+                {
+                    if (1 == subset_index2)
+                    {
+                        ep[0] = ep[1] = pixel;
+                    }
+                }
+                else if ((1 == mode) || (3 == mode) || (7 == mode))
+                {
+                    if (1 == subset_index)
+                    {
+                        ep[0] = ep[1] = pixel;
+                    }
+                }
+            }
+            else
+            {
+                if ((0 == mode) || (2 == mode))
+                {
+                    if (2 == subset_index2)
+                    {
+                        ep[0] = ep[1] = pixel;
+                    }
+                }
+            }
+
+            shared_temp[GI].endPoint_low = ep[0];
+            shared_temp[GI].endPoint_high = ep[1];
+        }
+#ifdef REF_DEVICE
+        GroupMemoryBarrierWithGroupSync();
+#endif
+
+        if (threadInBlock < 8)
+        {
+            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
+            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
+        }
+#ifdef REF_DEVICE
+        GroupMemoryBarrierWithGroupSync();
+#endif
+        if (threadInBlock < 4)
+        {
+            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
+            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
+        }
+#ifdef REF_DEVICE
+        GroupMemoryBarrierWithGroupSync();
+#endif
+        if (threadInBlock < 2)
+        {
+            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
+            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
+        }
+#ifdef REF_DEVICE
+        GroupMemoryBarrierWithGroupSync();
+#endif
+        if (threadInBlock < 1)
+        {
+            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
+            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
+        }
+#ifdef REF_DEVICE
+        GroupMemoryBarrierWithGroupSync();
+#endif
+
+        if (ii == (int)threadInBlock)
+        {
+            ep[0] = shared_temp[threadBase].endPoint_low;
+            ep[1] = shared_temp[threadBase].endPoint_high;
+        }
+    }
+
+    if (threadInBlock < 3)
+    {
+        uint2 P;
+        if (1 == mode)
+        {
+            P = (rotation >> threadInBlock) & 1;
+        }
+        else
+        {
+            P = uint2(rotation >> (threadInBlock * 2 + 0), rotation >> (threadInBlock * 2 + 1)) & 1;
+        }
+
+        if (0 == mode)
+        {
+            ep_quantized = compress_endpoints0( ep, P );
+        }
+        else if (1 == mode)
+        {
+            ep_quantized = compress_endpoints1( ep, P );
+        }
+        else if (2 == mode)
+        {
+            ep_quantized = compress_endpoints2( ep );
+        }
+        else if (3 == mode)
+        {
+            ep_quantized = compress_endpoints3( ep, P );
+        }
+        else if (4 == mode)
+        {
+            ep_quantized = compress_endpoints4( ep );
+        }
+        else if (5 == mode)
+        {
+            ep_quantized = compress_endpoints5( ep );
+        }
+        else if (6 == mode)
+        {
+            ep_quantized = compress_endpoints6( ep, P );
+        }
+        else //if (7 == mode)
+        {
+            ep_quantized = compress_endpoints7( ep, P );
+        }
+
+        int4 span = ep[1] - ep[0];
+        if (mode < 4)
+        {
+            span.w = 0;
+        }
+
+        if ((4 == mode) || (5 == mode))
+        {
+            if (0 == threadInBlock)
+            {
+                int2 span_norm_sqr = uint2( dot( span.rgb, span.rgb ), span.a * span.a );
+                int2 dotProduct = int2( dot( span.rgb, shared_temp[threadBase + 0].pixel.rgb - ep[0].rgb ), span.a * ( shared_temp[threadBase + 0].pixel.a - ep[0].a ) );
+                if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && uint( dotProduct.x * 63.49999 ) > uint( 32 * span_norm_sqr.x ) )
+                {
+                    swap(ep[0].rgb, ep[1].rgb);
+                    swap(ep_quantized[0].rgb, ep_quantized[1].rgb);
+                }
+                if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && uint( dotProduct.y * 63.49999 ) > uint( 32 * span_norm_sqr.y ) )
+                {
+                    swap(ep[0].a, ep[1].a);
+                    swap(ep_quantized[0].a, ep_quantized[1].a);		    
+                }
+            }
+        }
+        else //if ((0 == mode) || (2 == mode) || (1 == mode) || (3 == mode) || (7 == mode) || (6 == mode))
+        {
+            int p;
+            if (0 == threadInBlock)
+            {
+                p = 0;
+            }
+            else if (1 == threadInBlock)
+            {
+                p = candidateFixUpIndex1D[partition].x;
+            }
+            else //if (2 == threadInBlock)
+            {
+                p = candidateFixUpIndex1D[partition].y;
+            }
+
+            int span_norm_sqr = dot( span, span );
+            int dotProduct = dot( span, shared_temp[threadBase + p].pixel - ep[0] );
+            if ( span_norm_sqr > 0 && dotProduct > 0 && uint( dotProduct * 63.49999 ) > uint( 32 * span_norm_sqr ) )
+            {
+                swap(ep[0], ep[1]);
+                swap(ep_quantized[0], ep_quantized[1]);		
+            }
+        }
+
+        shared_temp[GI].endPoint_low = ep[0];
+        shared_temp[GI].endPoint_high = ep[1];
+        shared_temp[GI].endPoint_low_quantized = ep_quantized[0];
+        shared_temp[GI].endPoint_high_quantized = ep_quantized[1];
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    if (threadInBlock < 16)
+    {
+        uint color_index = 0;
+        uint alpha_index = 0;
+
+        uint2x4 ep;
+
+        uint2 indexPrec;
+        if ((0 == mode) || (1 == mode))
+        {
+            indexPrec = 1;
+        }
+        else if (6 == mode)
+        {
+            indexPrec = 0;
+        }
+        else if (4 == mode)
+        {
+            if (0 == index_selector)
+            {
+                indexPrec = uint2(2, 1);
+            }
+            else
+            {
+                indexPrec = uint2(1, 2);
+            }
+        }
+        else
+        {
+            indexPrec = 2;
+        }
+
+        int subset_index;
+        if ((0 == mode) || (2 == mode))
+        {
+            subset_index = (bits2 >> (threadInBlock * 2)) & 0x03;
+        }
+        else if ((1 == mode) || (3 == mode) || (7 == mode))
+        {
+            subset_index = (bits >> threadInBlock) & 0x01;
+        }
+        else
+        {
+            subset_index = 0;
+        }
+
+        ep[0] = shared_temp[threadBase + subset_index].endPoint_low;
+        ep[1] = shared_temp[threadBase + subset_index].endPoint_high;
+
+        int4 span = ep[1] - ep[0];
+        if (mode < 4)
+        {
+            span.w = 0;
+        }
+
+        if ((4 == mode) || (5 == mode))
+        {
+            int2 span_norm_sqr;
+            span_norm_sqr.x = dot( span.rgb, span.rgb );
+            span_norm_sqr.y = span.a * span.a;
+            
+            int dotProduct = dot( span.rgb, shared_temp[threadBase + threadInBlock].pixel.rgb - ep[0].rgb );
+            color_index = ( span_norm_sqr.x <= 0 || dotProduct <= 0 ) ? 0
+                    : ( ( dotProduct < span_norm_sqr.x ) ? aStep[indexPrec.x][ uint( dotProduct * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] );
+            dotProduct = dot( span.a, shared_temp[threadBase + threadInBlock].pixel.a - ep[0].a );
+            alpha_index = ( span_norm_sqr.y <= 0 || dotProduct <= 0 ) ? 0
+                    : ( ( dotProduct < span_norm_sqr.y ) ? aStep[indexPrec.y][ uint( dotProduct * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );
+
+            if (index_selector)
+            {
+                swap(color_index, alpha_index);
+            }
+        }
+        else
+        {
+            int span_norm_sqr = dot( span, span );
+
+            int dotProduct = dot( span, shared_temp[threadBase + threadInBlock].pixel - ep[0] );
+            color_index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
+                    : ( ( dotProduct < span_norm_sqr ) ? aStep[indexPrec.x][ uint( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep[indexPrec.x][63] );
+        }
+
+        shared_temp[GI].error = color_index;
+        shared_temp[GI].mode = alpha_index;
+    }
+#ifdef REF_DEVICE
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+    if (0 == threadInBlock)
+    {
+        uint4 block;
+        if (0 == mode)
+        {
+            block_package0( block, partition, threadBase );
+        }
+        else if (1 == mode)
+        {
+            block_package1( block, partition, threadBase );
+        }
+        else if (2 == mode)
+        {
+            block_package2( block, partition, threadBase );
+        }
+        else if (3 == mode)
+        {
+            block_package3( block, partition, threadBase );
+        }
+        else if (4 == mode)
+        {
+            block_package4( block, rotation, index_selector, threadBase );
+        }
+        else if (5 == mode)
+        {
+            block_package5( block, rotation, threadBase );
+        }
+        else if (6 == mode)
+        {
+            block_package6( block, threadBase );
+        }
+        else //if (7 == mode)
+        {
+            block_package7( block, partition, threadBase );
+        }
+
+        g_OutBuff[blockID] = block;
+    }
+}
+
+//uint4 truncate_and_round( uint4 color, uint bits)
+//{
+//    uint precisionMask = ((1 << bits) - 1) << (8 - bits);
+//    uint precisionHalf = (1 << (7-bits));
+//
+//    uint4 truncated = color & precisionMask; 
+//    uint4 rounded = min(255, color + precisionHalf) & precisionMask;
+//    
+//    uint4 truncated_bak = truncated = truncated | (truncated >> bits);
+//    uint4 rounded_bak = rounded = rounded | (rounded >> bits);
+//
+//    uint4 color_bak = color;
+//    
+//    Ensure_A_Is_Larger( rounded, color );
+//    Ensure_A_Is_Larger( truncated, color_bak );
+//
+//    if (dot(rounded - color, rounded - color) < 
+//        dot(truncated - color_bak, truncated - color_bak))
+//    {
+//        return rounded_bak;
+//    }
+//    else
+//    {
+//        return truncated_bak;
+//    }
+//}
+
+uint4 quantize( uint4 color, uint uPrec ) // Rounds each 8-bit channel to uPrec bits; the result sits in the low uPrec bits.
+{
+    uint halfStep = 1 << (7 - uPrec);   // half of one quantization step, added so the shift rounds to nearest
+    return min(255, color + halfStep) >> (8 - uPrec);
+}
+
+uint4 unquantize( uint4 color, uint uPrec ) // Expands a uPrec-bit value to 8 bits: left-align it, then repeat its top bits underneath.
+{
+    uint4 aligned = color << (8 - uPrec);
+    return aligned | (aligned >> uPrec);
+}
+
+uint2x4 compress_endpoints0( inout uint2x4 endPoint, uint2 P )
+{
+    // Quantizes both endpoints' RGB to 5 bits whose LSB is replaced by P[e], forces alpha
+    // to 0xFF, writes the reconstructed 8-bit color back through endPoint, and returns the
+    // quantized values shifted up by 3.
+    uint2x4 result;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        uint3 rgb5 = ( quantize(endPoint[e].rgbb, 5).rgb & 0xFFFFFFFE ) | P[e];
+        result[e] = uint4( rgb5, 0xFF );
+        endPoint[e] = uint4( unquantize(result[e].rgbb, 5).rgb, 0xFF );
+        // The shift applies to the whole row, alpha included.
+        result[e] <<= 3;
+    }
+    return result;
+}
+uint2x4 compress_endpoints1( inout uint2x4 endPoint, uint2 P )
+{
+    // Quantizes both endpoints' RGB to 7 bits whose LSB is replaced by P[e], forces alpha
+    // to 0xFF, writes the reconstructed 8-bit color back through endPoint, and returns the
+    // quantized values shifted up by 1.
+    uint2x4 result;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        uint3 rgb7 = ( quantize(endPoint[e].rgbb, 7).rgb & 0xFFFFFFFE ) | P[e];
+        result[e] = uint4( rgb7, 0xFF );
+        endPoint[e] = uint4( unquantize(result[e].rgbb, 7).rgb, 0xFF );
+        // The shift applies to the whole row, alpha included.
+        result[e] <<= 1;
+    }
+    return result;
+}
+uint2x4 compress_endpoints2( inout uint2x4 endPoint )
+{
+    // Quantizes both endpoints' RGB to 5 bits, forces alpha to 0xFF, writes the
+    // reconstructed 8-bit color back through endPoint, and returns the quantized
+    // values shifted up by 3.
+    uint2x4 result;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        result[e] = uint4( quantize(endPoint[e].rgbb, 5).rgb, 0xFF );
+        endPoint[e] = uint4( unquantize(result[e].rgbb, 5).rgb, 0xFF );
+        // The shift applies to the whole row, alpha included.
+        result[e] <<= 3;
+    }
+    return result;
+}
+uint2x4 compress_endpoints3( inout uint2x4 endPoint, uint2 P )
+{
+    // Keeps RGB at full width but replaces each channel's LSB with P[e] and forces
+    // alpha to 0xFF; endPoint receives the same values that are returned.
+    uint2x4 result;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        uint3 rgb = ( endPoint[e].rgb & 0xFFFFFFFE ) | P[e];
+        result[e] = uint4( rgb, 0xFF );
+        // Nothing was shifted, so the reconstruction equals the returned row.
+        endPoint[e] = result[e];
+    }
+    return result;
+}
+uint2x4 compress_endpoints4( inout uint2x4 endPoint )
+{
+    // Quantizes RGB to 5 bits and alpha to 6 bits, writes the reconstructed 8-bit
+    // values back through endPoint, and returns the quantized values shifted up
+    // (RGB by 3, alpha by 2).
+    uint2x4 result;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        uint3 rgb5 = quantize(endPoint[e].rgbb, 5).rgb;
+        uint  a6   = quantize(endPoint[e].a, 6).r;
+        // Reconstruct from the unshifted quantized values, then shift for the return.
+        endPoint[e] = uint4( unquantize(rgb5.rgbb, 5).rgb, unquantize(a6, 6).r );
+        result[e] = uint4( rgb5 << 3, a6 << 2 );
+    }
+    return result;
+}
+uint2x4 compress_endpoints5( inout uint2x4 endPoint )
+{
+    // Quantizes RGB to 7 bits while alpha passes through untouched; endPoint gets
+    // the reconstructed RGB, and the returned RGB is shifted up by 1.
+    uint2x4 result;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        uint3 rgb7 = quantize(endPoint[e].rgbb, 7).rgb;
+        // Alpha is full precision: copied into the result, left as-is in endPoint.
+        result[e] = uint4( rgb7 << 1, endPoint[e].a );
+        // Reconstruction uses the unshifted 7-bit values.
+        endPoint[e].rgb = unquantize(rgb7.rgbb, 7).rgb;
+    }
+    return result;
+}
+uint2x4 compress_endpoints6( inout uint2x4 endPoint, uint2 P )
+{
+    // Replaces the LSB of all four channels with P[e]; endPoint receives the same
+    // values that are returned.
+    uint2x4 result;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        result[e] = ( endPoint[e] & 0xFFFFFFFE ) | P[e];
+        endPoint[e] = result[e];
+    }
+    return result;
+}
+uint2x4 compress_endpoints7( inout uint2x4 endPoint, uint2 P )
+{
+    // Quantizes all four channels to 6 bits whose LSB is replaced by P[e], writes the
+    // reconstruction back through endPoint, and returns the result shifted up by 2.
+    uint2x4 result;
+    for ( uint e = 0; e < 2; ++e )
+    {
+        result[e] = ( quantize(endPoint[e], 6) & 0xFFFFFFFE ) | P[e];
+        endPoint[e] = unquantize(result[e], 6);
+    }
+    return result << 2;
+}
+// Shorthand for the shared_temp slots read by the block_package* functions; each macro expects a local named threadBase at the point of use.
+#define get_end_point_l(subset) shared_temp[threadBase + subset].endPoint_low_quantized
+#define get_end_point_h(subset) shared_temp[threadBase + subset].endPoint_high_quantized
+#define get_color_index(index) shared_temp[threadBase + index].error
+#define get_alpha_index(index) shared_temp[threadBase + index].mode
+
+void block_package0( out uint4 block, uint partition, uint threadBase )
+{   // Builds the 128-bit block tagged 0x01: endpoints of subsets 0-2 followed by sixteen color indices spaced 3 bits apart.
+    block.x = 0x01 | ( (partition - 64) << 1 ) // bit 0: tag; from bit 1: partition rebased by 64 (presumably three-subset ids start at 64 - confirm upstream)
+            | ( ( get_end_point_l(0).r & 0xF0 ) <<  1 ) | ( ( get_end_point_h(0).r & 0xF0 ) <<  5 ) // red: high nibble of each endpoint byte, 4 bits apiece from bit 5
+            | ( ( get_end_point_l(1).r & 0xF0 ) <<  9 ) | ( ( get_end_point_h(1).r & 0xF0 ) << 13 ) 
+            | ( ( get_end_point_l(2).r & 0xF0 ) << 17 ) | ( ( get_end_point_h(2).r & 0xF0 ) << 21 ) 
+            | ( ( get_end_point_l(0).g & 0xF0 ) << 25 ); // green starts at bit 29; only its low 3 bits fit in x
+    block.y = ( ( get_end_point_l(0).g & 0xF0 ) >>  7 ) | ( ( get_end_point_h(0).g & 0xF0 ) >>  3 ) // the 4th bit of that green field lands in y bit 0
+            | ( ( get_end_point_l(1).g & 0xF0 ) <<  1 ) | ( ( get_end_point_h(1).g & 0xF0 ) <<  5 ) 
+            | ( ( get_end_point_l(2).g & 0xF0 ) <<  9 ) | ( ( get_end_point_h(2).g & 0xF0 ) << 13 ) 
+            | ( ( get_end_point_l(0).b & 0xF0 ) << 17 ) | ( ( get_end_point_h(0).b & 0xF0 ) << 21 )
+            | ( ( get_end_point_l(1).b & 0xF0 ) << 25 ); // this blue field is split across y/z the same way
+    block.z = ( ( get_end_point_l(1).b & 0xF0 ) >>  7 ) | ( ( get_end_point_h(1).b & 0xF0 ) >>  3 ) 
+            | ( ( get_end_point_l(2).b & 0xF0 ) <<  1 ) | ( ( get_end_point_h(2).b & 0xF0 ) <<  5 ) 
+            | ( ( get_end_point_l(0).r & 0x08 ) << 10 ) | ( ( get_end_point_h(0).r & 0x08 ) << 11 ) // bit 3 of each red endpoint byte: one bit per endpoint, z bits 13-18
+            | ( ( get_end_point_l(1).r & 0x08 ) << 12 ) | ( ( get_end_point_h(1).r & 0x08 ) << 13 ) 
+            | ( ( get_end_point_l(2).r & 0x08 ) << 14 ) | ( ( get_end_point_h(2).r & 0x08 ) << 15 )
+            | ( get_color_index(0) << 19 ); // index 0 opens the index stream at z bit 19
+    block.w = 0;
+    uint i = 1;
+    for ( ; i <= min( candidateFixUpIndex1DOrdered[partition][0], 4 ); i ++ ) // indices up to the first fix-up position (index 4 at most) go into z
+    {
+        block.z |= get_color_index(i) << ( i * 3 + 18 ); // for i == 4 the shift is 30, so bit 2 of that index falls off the top of z
+    }
+    if ( candidateFixUpIndex1DOrdered[partition][0] < 4 ) //i = 4
+    {   // NOTE(review): the "i = 4" remark only holds when the first fix-up index is exactly 3 - confirm the table never goes lower.
+        block.z |= get_color_index(4) << 29;
+        i += 1;
+    }
+    else //i = 5
+    {
+        block.w |= ( get_color_index(4) & 0x04 ) >> 2; // bit 2 of index 4, which did not fit in z, becomes w bit 0
+        for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ )
+            block.w |= get_color_index(i) << ( i * 3 - 14 );
+    }
+    for ( ; i <= candidateFixUpIndex1DOrdered[partition][1]; i ++ ) // the shift base drops by one after each fix-up position
+    {
+        block.w |= get_color_index(i) << ( i * 3 - 15 );
+    }
+    for ( ; i < 16; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 3 - 16 );
+    }
+}
+void block_package1( out uint4 block, uint partition, uint threadBase )
+{   // Builds the 128-bit block tagged 0x02: endpoints of subsets 0-1 followed by sixteen color indices spaced 3 bits apart.
+    block.x = 0x02 | ( partition << 2 ) // bits 0-1: tag; partition from bit 2
+            | ( ( get_end_point_l(0).r & 0xFC ) <<  6 ) | ( ( get_end_point_h(0).r & 0xFC ) << 12 ) // red: top 6 bits of each endpoint byte, from bit 8
+            | ( ( get_end_point_l(1).r & 0xFC ) << 18 ) | ( ( get_end_point_h(1).r & 0xFC ) << 24 );
+    block.y = ( ( get_end_point_l(0).g & 0xFC ) >>  2 ) | ( ( get_end_point_h(0).g & 0xFC ) <<  4 ) 
+            | ( ( get_end_point_l(1).g & 0xFC ) << 10 ) | ( ( get_end_point_h(1).g & 0xFC ) << 16 )
+            | ( ( get_end_point_l(0).b & 0xFC ) << 22 ) | ( ( get_end_point_h(0).b & 0xFC ) << 28 ); // last blue field here keeps only its low 2 bits...
+    block.z = ( ( get_end_point_h(0).b & 0xFC ) >>  4 ) | ( ( get_end_point_l(1).b & 0xFC ) <<  2 ) // ...its top 4 bits continue at z bit 0
+            | ( ( get_end_point_h(1).b & 0xFC ) <<  8 ) 
+            | ( ( get_end_point_l(0).r & 0x02 ) << 15 ) | ( ( get_end_point_l(1).r & 0x02 ) << 16 ) // bit 1 of the low endpoint's red byte, per subset: z bits 16 and 17
+            | ( get_color_index(0) << 18 ); // index 0 opens the index stream at z bit 18 and is 2 bits wide
+    if ( candidateFixUpIndex1DOrdered[partition][0] == 15 ) // each branch places the 2-bit entry at the fix-up index; every other index takes 3 bits
+    {
+        block.w = (get_color_index(15) << 30) | (get_color_index(14) << 27) | (get_color_index(13) << 24) | (get_color_index(12) << 21) | (get_color_index(11) << 18) | (get_color_index(10) << 15)
+            | (get_color_index(9) << 12) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
+        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+    else if ( candidateFixUpIndex1DOrdered[partition][0] == 2 )
+    {   // Index 2 is the short entry, so index 5 straddles z bit 31 and w bits 0-1.
+        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
+            | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 2) | (get_color_index(5) >> 1);
+        block.z |= (get_color_index(5) << 31) | (get_color_index(4) << 28) | (get_color_index(3) << 25) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+    else if ( candidateFixUpIndex1DOrdered[partition][0] == 8 )
+    {   // Index 8 is the short entry (w bits 9-10), so index 9 resumes at bit 11.
+        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
+            | (get_color_index(9) << 11) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
+        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+    else //candidateFixUpIndex1DOrdered[partition][0] == 6
+    {   // Index 6 is the short entry: it follows index 5 (w bits 0-2) at bits 3-4, so index 7 resumes at bit 5 and index 8 at bit 8.
+        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
+            | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 3) | get_color_index(5);
+        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
+    }
+}
+void block_package2( out uint4 block, uint partition, uint threadBase )
+{   // Builds the 128-bit block tagged 0x04: endpoints of subsets 0-2 followed by sixteen color indices spaced 2 bits apart.
+    block.x = 0x04 | ( (partition - 64) << 3 ) // bits 0-2: tag; from bit 3: partition rebased by 64 (presumably three-subset ids start at 64 - confirm upstream)
+            | ( ( get_end_point_l(0).r & 0xF8 ) <<  6 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 11 ) // red: top 5 bits of each endpoint byte, from bit 9
+            | ( ( get_end_point_l(1).r & 0xF8 ) << 16 ) | ( ( get_end_point_h(1).r & 0xF8 ) << 21 ) 
+            | ( ( get_end_point_l(2).r & 0xF8 ) << 26 ); // this field starts at bit 29; only its low 3 bits fit in x
+    block.y = ( ( get_end_point_l(2).r & 0xF8 ) >>  6 ) | ( ( get_end_point_h(2).r & 0xF8 ) >>  1 ) // its top 2 bits continue at y bit 0
+            | ( ( get_end_point_l(0).g & 0xF8 ) <<  4 ) | ( ( get_end_point_h(0).g & 0xF8 ) <<  9 ) 
+            | ( ( get_end_point_l(1).g & 0xF8 ) << 14 ) | ( ( get_end_point_h(1).g & 0xF8 ) << 19 ) 
+            | ( ( get_end_point_l(2).g & 0xF8 ) << 24 );
+    block.z = ( ( get_end_point_h(2).g & 0xF8 ) >>  3 ) | ( ( get_end_point_l(0).b & 0xF8 ) <<  2 )
+            | ( ( get_end_point_h(0).b & 0xF8 ) <<  7 )	| ( ( get_end_point_l(1).b & 0xF8 ) << 12 )
+            | ( ( get_end_point_h(1).b & 0xF8 ) << 17 ) | ( ( get_end_point_l(2).b & 0xF8 ) << 22 ) 
+            | ( ( get_end_point_h(2).b & 0xF8 ) << 27 ); // last blue field: low 2 bits in z bits 30-31...
+    block.w = ( ( get_end_point_h(2).b & 0xF8 ) >>  5 ) // ...top 3 bits in w bits 0-2
+            | ( get_color_index(0) << 3 ); // index 0 opens the index stream at w bit 3
+    uint i = 1;
+    for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ ) // the shift base drops by one after each fix-up position
+    {
+        block.w |= get_color_index(i) << ( i * 2 + 2 );
+    }
+    for ( ; i <= candidateFixUpIndex1DOrdered[partition][1]; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 + 1 );
+    }
+    for ( ; i < 16; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 );
+    }
+}
+void block_package3( out uint4 block, uint partition, uint threadBase )
+{   // Builds the 128-bit block tagged 0x08: endpoints of subsets 0-1 followed by sixteen color indices spaced 2 bits apart.
+    block.x = 0x08 | ( partition << 4 ) // bits 0-3: tag; partition from bit 4
+            | ( ( get_end_point_l(0).r & 0xFE ) <<  9 ) | ( ( get_end_point_h(0).r & 0xFE ) << 16 ) // red: top 7 bits of each endpoint byte, from bit 10
+            | ( ( get_end_point_l(1).r & 0xFE ) << 23 ) | ( ( get_end_point_h(1).r & 0xFE ) << 30 ); // last field keeps only its lowest bit (x bit 31)
+    block.y = ( ( get_end_point_h(1).r & 0xFE ) >>  2 ) | ( ( get_end_point_l(0).g & 0xFE ) <<  5 ) // remaining 6 bits of that field continue at y bit 0
+            | ( ( get_end_point_h(0).g & 0xFE ) << 12 ) | ( ( get_end_point_l(1).g & 0xFE ) << 19 )
+            | ( ( get_end_point_h(1).g & 0xFE ) << 26 );
+    block.z = ( ( get_end_point_h(1).g & 0xFE ) >>  6 ) | ( ( get_end_point_l(0).b & 0xFE ) <<  1 )
+            | ( ( get_end_point_h(0).b & 0xFE ) <<  8 ) | ( ( get_end_point_l(1).b & 0xFE ) << 15 )
+            | ( ( get_end_point_h(1).b & 0xFE ) << 22 )
+            | ( ( get_end_point_l(0).r & 0x01 ) << 30 ) | ( ( get_end_point_h(0).r & 0x01 ) << 31 ); // bit 0 of each red endpoint byte: z bits 30-31, then w bits 0-1
+    block.w = ( ( get_end_point_l(1).r & 0x01 ) <<  0 ) | ( ( get_end_point_h(1).r & 0x01 ) <<  1 )
+            | ( get_color_index(0) << 2 ); // index 0 opens the index stream at w bit 2
+    uint i = 1;
+    for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ ) // the shift base drops by one after the fix-up position
+    {
+        block.w |= get_color_index(i) << ( i * 2 + 1 );
+    }
+    for ( ; i < 16; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 );
+    }
+}
+void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase )
+{   // Builds the 128-bit block tagged 0x10: one endpoint pair, then the values of the color-index slot (2 bits apart) and of the alpha-index slot (3 bits apart).
+    block.x = 0x10 | ( (rotation & 3) << 5 ) | ( (index_selector & 1) << 7 ) // bits 0-4: tag; bits 5-6: rotation; bit 7: index selector
+            | ( ( get_end_point_l(0).r & 0xF8 ) <<  5 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 10 ) // color: top 5 bits of each endpoint byte, from bit 8
+            | ( ( get_end_point_l(0).g & 0xF8 ) << 15 ) | ( ( get_end_point_h(0).g & 0xF8 ) << 20 )
+            | ( ( get_end_point_l(0).b & 0xF8 ) << 25 );
+
+    block.y = ( ( get_end_point_l(0).b & 0xF8 ) >>  7 ) | ( ( get_end_point_h(0).b & 0xF8 ) >>  2 )
+            | ( ( get_end_point_l(0).a & 0xFC ) <<  4 ) | ( ( get_end_point_h(0).a & 0xFC ) << 10 ) // alpha: top 6 bits of each endpoint byte
+            | ( (get_color_index(0) & 1) << 18 ) | ( get_color_index(1) << 19 ) | ( get_color_index(2) << 21 ) | ( get_color_index(3) << 23 ) // entry 0 contributes a single bit
+            | ( get_color_index(4) << 25 ) | ( get_color_index(5) << 27 ) | ( get_color_index(6) << 29 ) | ( get_color_index(7) << 31 );
+
+    block.z = ( get_color_index(7) >>  1 ) | ( get_color_index(8) <<  1 ) | ( get_color_index(9) <<  3 ) | ( get_color_index(10)<<  5 ) // entry 7 straddles y bit 31 / z bit 0
+            | ( get_color_index(11)<<  7 ) | ( get_color_index(12)<<  9 ) | ( get_color_index(13)<< 11 ) | ( get_color_index(14)<< 13 )
+            | ( get_color_index(15)<< 15 ) | ( (get_alpha_index(0) & 3) << 17 ) | ( get_alpha_index(1) << 19 ) | ( get_alpha_index(2) << 22 ) // alpha-slot entry 0 contributes two bits
+            | ( get_alpha_index(3) << 25 ) | ( get_alpha_index(4) << 28 ) | ( get_alpha_index(5) << 31 );
+
+    block.w = ( get_alpha_index(5) >>  1 ) | ( get_alpha_index(6) <<  2 ) | ( get_alpha_index(7) <<  5 ) | ( get_alpha_index(8) <<  8 ) // entry 5 straddles z bit 31 / w bits 0-1
+            | ( get_alpha_index(9) << 11 ) | ( get_alpha_index(10)<< 14 ) | ( get_alpha_index(11)<< 17 ) | ( get_alpha_index(12)<< 20 ) 
+            | ( get_alpha_index(13)<< 23 ) | ( get_alpha_index(14)<< 26 ) | ( get_alpha_index(15)<< 29 );
+}
+void block_package5( out uint4 block, uint rotation, uint threadBase )
+{   // Builds the 128-bit block tagged 0x20: one endpoint pair, then the color-index and alpha-index slots, both spaced 2 bits apart.
+    block.x = 0x20 | ( rotation << 6 ) // bits 0-5: tag; rotation from bit 6 (not masked here - assumes the caller passes 0..3, TODO confirm)
+            | ( ( get_end_point_l(0).r & 0xFE ) <<  7 ) | ( ( get_end_point_h(0).r & 0xFE ) << 14 ) // color: top 7 bits of each endpoint byte, from bit 8
+            | ( ( get_end_point_l(0).g & 0xFE ) << 21 ) | ( ( get_end_point_h(0).g & 0xFE ) << 28 );
+    block.y = ( ( get_end_point_h(0).g & 0xFE ) >>  4 ) | ( ( get_end_point_l(0).b & 0xFE ) <<  3 )
+            | ( ( get_end_point_h(0).b & 0xFE ) << 10 )	| ( get_end_point_l(0).a << 18 ) | ( get_end_point_h(0).a << 26 ); // alpha endpoints are stored unmasked
+    block.z = ( get_end_point_h(0).a >>  6 )
+            | ( get_color_index(0) <<  2 ) | ( get_color_index(1) <<  3 ) | ( get_color_index(2) <<  5 ) | ( get_color_index(3) <<  7 ) // entry 0 has a single-bit slot at z bit 2
+            | ( get_color_index(4) <<  9 ) | ( get_color_index(5) << 11 ) | ( get_color_index(6) << 13 ) | ( get_color_index(7) << 15 )
+            | ( get_color_index(8) << 17 ) | ( get_color_index(9) << 19 ) | ( get_color_index(10)<< 21 ) | ( get_color_index(11)<< 23 ) 
+            | ( get_color_index(12)<< 25 ) | ( get_color_index(13)<< 27 ) | ( get_color_index(14)<< 29 ) | ( get_color_index(15)<< 31 );
+    block.w =  ( get_color_index(15)>> 1 ) | ( get_alpha_index(0) <<  1 ) | ( get_alpha_index(1) <<  2 ) | ( get_alpha_index(2) <<  4 ) // color entry 15 straddles z bit 31 / w bit 0; alpha entry 0 has a single-bit slot
+            | ( get_alpha_index(3) <<  6 ) | ( get_alpha_index(4) <<  8 ) | ( get_alpha_index(5) << 10 ) | ( get_alpha_index(6) << 12 )
+            | ( get_alpha_index(7) << 14 ) | ( get_alpha_index(8) << 16 ) | ( get_alpha_index(9) << 18 ) | ( get_alpha_index(10)<< 20 ) 
+            | ( get_alpha_index(11)<< 22 ) | ( get_alpha_index(12)<< 24 ) | ( get_alpha_index(13)<< 26 ) | ( get_alpha_index(14)<< 28 )
+            | ( get_alpha_index(15)<< 30 );
+}
+void block_package6( out uint4 block, uint threadBase )
+{   // Builds the 128-bit block tagged 0x40: one RGBA endpoint pair followed by sixteen color indices spaced 4 bits apart.
+    block.x = 0x40 // bits 0-6: tag
+            | ( ( get_end_point_l(0).r & 0xFE ) <<  6 ) | ( ( get_end_point_h(0).r & 0xFE ) << 13 ) // top 7 bits of each endpoint byte, from bit 7
+            | ( ( get_end_point_l(0).g & 0xFE ) << 20 ) | ( ( get_end_point_h(0).g & 0xFE ) << 27 );
+    block.y = ( ( get_end_point_h(0).g & 0xFE ) >>  5 ) | ( ( get_end_point_l(0).b & 0xFE ) <<  2 )
+            | ( ( get_end_point_h(0).b & 0xFE ) <<  9 )	| ( ( get_end_point_l(0).a & 0xFE ) << 16 )
+            | ( ( get_end_point_h(0).a & 0xFE ) << 23 )
+            | ( get_end_point_l(0).r & 0x01 ) << 31; // << binds tighter than |, so only this bit 0 is shifted: it lands in y bit 31
+    block.z = ( get_end_point_h(0).r & 0x01 ) // bit 0 of the high endpoint's red byte: z bit 0
+            | ( get_color_index(0) <<  1 ) | ( get_color_index(1) <<  4 ) | ( get_color_index(2) <<  8 ) | ( get_color_index(3) << 12 ) // entry 0 has a 3-bit slot at z bits 1-3
+            | ( get_color_index(4) << 16 ) | ( get_color_index(5) << 20 ) | ( get_color_index(6) << 24 ) | ( get_color_index(7) << 28 );
+    block.w = ( get_color_index(8) <<  0 ) | ( get_color_index(9) <<  4 ) | ( get_color_index(10)<<  8 ) | ( get_color_index(11)<< 12 ) 
+            | ( get_color_index(12)<< 16 ) | ( get_color_index(13)<< 20 ) | ( get_color_index(14)<< 24 ) | ( get_color_index(15)<< 28 );
+}
+void block_package7( out uint4 block, uint partition, uint threadBase )
+{   // Builds the 128-bit block tagged 0x80: RGBA endpoints of subsets 0-1 followed by sixteen color indices spaced 2 bits apart.
+    block.x = 0x80 | ( partition << 8 ) // bits 0-7: tag; partition from bit 8
+            | ( ( get_end_point_l(0).r & 0xF8 ) << 11 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 16 ) // top 5 bits of each endpoint byte, from bit 14
+            | ( ( get_end_point_l(1).r & 0xF8 ) << 21 ) | ( ( get_end_point_h(1).r & 0xF8 ) << 26 );
+    block.y = ( ( get_end_point_h(1).r & 0xF8 ) >>  6 ) | ( ( get_end_point_l(0).g & 0xF8 ) >>  1 )
+            | ( ( get_end_point_h(0).g & 0xF8 ) <<  4 ) | ( ( get_end_point_l(1).g & 0xF8 ) <<  9 ) 
+            | ( ( get_end_point_h(1).g & 0xF8 ) << 14 )	| ( ( get_end_point_l(0).b & 0xF8 ) << 19 ) 
+            | ( ( get_end_point_h(0).b & 0xF8 ) << 24 );
+    block.z = ( ( get_end_point_l(1).b & 0xF8 ) >>  3 )	| ( ( get_end_point_h(1).b & 0xF8 ) <<  2 ) 
+            | ( ( get_end_point_l(0).a & 0xF8 ) <<  7 ) | ( ( get_end_point_h(0).a & 0xF8 ) << 12 ) 
+            | ( ( get_end_point_l(1).a & 0xF8 ) << 17 ) | ( ( get_end_point_h(1).a & 0xF8 ) << 22 ) 
+            | ( ( get_end_point_l(0).r & 0x04 ) << 28 ) | ( ( get_end_point_h(0).r & 0x04 ) << 29 ); // bit 2 of each red endpoint byte: z bits 30-31, then w bits 0-1
+    block.w = ( ( get_end_point_l(1).r & 0x04 ) >>  2 ) | ( ( get_end_point_h(1).r & 0x04 ) >>  1 )
+            | ( get_color_index(0) <<  2 ); // index 0 opens the index stream at w bit 2
+    uint i = 1;
+    for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i ++ ) // the shift base drops by one after the fix-up position
+    {
+        block.w |= get_color_index(i) << ( i * 2 + 1 );
+    }
+    for ( ; i < 16; i ++ )
+    {
+        block.w |= get_color_index(i) << ( i * 2 );
+    }
+}
\ No newline at end of file
diff --git a/tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl b/tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl
new file mode 100644
index 000000000..798eea2ff
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicCompute11/BasicCompute11.hlsl
@@ -0,0 +1,72 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//--------------------------------------------------------------------------------------
+// File: BasicCompute11.hlsl
+//
+// This file contains the Compute Shader to perform array A + array B
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#ifdef USE_STRUCTURED_BUFFERS
+
+struct BufType // One element of the structured buffers below; CSMain adds it member by member.
+{
+    int i;
+    float f;
+#ifdef TEST_DOUBLE
+    double d; // present only when TEST_DOUBLE is defined
+#endif    
+};
+
+StructuredBuffer<BufType> Buffer0 : register(t0); // first addend (read-only)
+StructuredBuffer<BufType> Buffer1 : register(t1); // second addend (read-only)
+RWStructuredBuffer<BufType> BufferOut : register(u0); // receives the sums
+
+[numthreads(1, 1, 1)]
+void CSMain( uint3 DTid : SV_DispatchThreadID ) // Structured-buffer variant: element DTid.x of BufferOut gets the member-wise sum of that element in Buffer0 and Buffer1.
+{
+    BufferOut[DTid.x].i = Buffer0[DTid.x].i + Buffer1[DTid.x].i;
+    BufferOut[DTid.x].f = Buffer0[DTid.x].f + Buffer1[DTid.x].f;
+#ifdef TEST_DOUBLE
+    BufferOut[DTid.x].d = Buffer0[DTid.x].d + Buffer1[DTid.x].d; // the double member exists only under TEST_DOUBLE
+#endif 
+}
+
+#else // The following code is for raw buffers
+
+ByteAddressBuffer Buffer0 : register(t0); // first addend, read as raw 32-bit words
+ByteAddressBuffer Buffer1 : register(t1); // second addend, read as raw 32-bit words
+RWByteAddressBuffer BufferOut : register(u0); // receives the sums at the same byte offsets
+
+[numthreads(1, 1, 1)]
+void CSMain( uint3 DTid : SV_DispatchThreadID )
+{
+#ifdef TEST_DOUBLE
+    // 16-byte records: int at +0, float at +4, double at +8 (low dword) and +12 (high dword).
+    const uint recordOffset = DTid.x*16;
+    const int    intSum   = asint( Buffer0.Load( recordOffset ) ) + asint( Buffer1.Load( recordOffset ) );
+    const float  floatSum = asfloat( Buffer0.Load( recordOffset+4 ) ) + asfloat( Buffer1.Load( recordOffset+4 ) );
+    const double lhs = asdouble( Buffer0.Load( recordOffset+8 ), Buffer0.Load( recordOffset+12 ) );
+    const double rhs = asdouble( Buffer1.Load( recordOffset+8 ), Buffer1.Load( recordOffset+12 ) );
+
+    // The stores below write 32-bit words, so the double sum is split into two dwords first.
+    uint sumLo, sumHi;
+    asuint( lhs + rhs, sumLo, sumHi );
+
+    BufferOut.Store( recordOffset, asuint(intSum) );
+    BufferOut.Store( recordOffset+4, asuint(floatSum) );
+    BufferOut.Store( recordOffset+8, sumLo );
+    BufferOut.Store( recordOffset+12, sumHi );
+#else
+    // 8-byte records: int at +0, float at +4.
+    const uint recordOffset = DTid.x*8;
+    const int   intSum   = asint( Buffer0.Load( recordOffset ) ) + asint( Buffer1.Load( recordOffset ) );
+    const float floatSum = asfloat( Buffer0.Load( recordOffset+4 ) ) + asfloat( Buffer1.Load( recordOffset+4 ) );
+
+    BufferOut.Store( recordOffset, asuint(intSum) );
+    BufferOut.Store( recordOffset+4, asuint(floatSum) );
+#endif // TEST_DOUBLE
+}
+
+#endif // USE_STRUCTURED_BUFFERS
diff --git a/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx
new file mode 100644
index 000000000..bd28f862b
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL.fx
@@ -0,0 +1,158 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: BasicHLSL.fx
+//
+// The effect file for the BasicHLSL sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Global variables
+//--------------------------------------------------------------------------------------
+float4 g_MaterialAmbientColor;      // Material's ambient color
+float4 g_MaterialDiffuseColor;      // Material's diffuse color
+int g_nNumLights;                   // NOTE(review): not read by the shaders in this file, which take the light count as a uniform parameter
+
+float3 g_LightDir;               // Light's direction in world space
+float4 g_LightDiffuse;           // Light's diffuse color
+float4 g_LightAmbient;              // Light's ambient color
+
+texture g_MeshTexture;              // Color texture for mesh
+
+float    g_fTime;                   // App's time in seconds (NOTE(review): not read by the shaders in this file)
+float4x4 g_mWorld;                  // World matrix for object
+float4x4 g_mWorldViewProjection;    // World * View * Projection matrix
+
+
+
+//--------------------------------------------------------------------------------------
+// Texture samplers
+//--------------------------------------------------------------------------------------
+sampler MeshTextureSampler = // Samples g_MeshTexture with linear min, mag and mip filtering; used by RenderScenePS.
+sampler_state
+{
+    Texture = <g_MeshTexture>;
+    MipFilter = LINEAR;
+    MinFilter = LINEAR;
+    MagFilter = LINEAR;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex shader output structure
+//--------------------------------------------------------------------------------------
+struct VS_OUTPUT // Returned by RenderSceneVS and taken as the input of RenderScenePS.
+{
+    float4 Position   : POSITION;   // vertex position 
+    float4 Diffuse    : COLOR0;     // vertex diffuse color (note that COLOR0 is clamped from 0..1)
+    float2 TextureUV  : TEXCOORD0;  // vertex texture coords 
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader computes standard transform and lighting
+//--------------------------------------------------------------------------------------
+VS_OUTPUT RenderSceneVS( float4 vPos : POSITION, 
+                         float3 vNormal : NORMAL,
+                         float2 vTexCoord0 : TEXCOORD0,
+                         uniform int nNumLights,
+                         uniform bool bTexture,
+                         uniform bool bAnimate )
+{
+    // bAnimate is supplied by the technique call sites but is not read here.
+    VS_OUTPUT Output;
+
+    // Object space -> homogeneous projection space.
+    Output.Position = mul(vPos, g_mWorldViewProjection);
+
+    // Unit normal in world space, using the 3x3 part of the world matrix.
+    const float3 vNormalWorldSpace = normalize(mul(vNormal, (float3x3)g_mWorld));
+
+    // Contribution of the one directional light; it is the same on every iteration.
+    const float3 vOneLightDiffuse = g_LightDiffuse.rgb * max(0,dot(vNormalWorldSpace, g_LightDir));
+
+    // Added once per requested light.
+    float3 vTotalLightDiffuse = float3(0,0,0);
+    for(int i=0; i<nNumLights; i++ )
+        vTotalLightDiffuse += vOneLightDiffuse;
+
+    // Material-modulated diffuse plus ambient; the float4 colors are narrowed to RGB
+    // explicitly and alpha is fixed at 1.
+    Output.Diffuse = float4( g_MaterialDiffuseColor.rgb * vTotalLightDiffuse +
+                             g_MaterialAmbientColor.rgb * g_LightAmbient.rgb, 1.0f );
+
+    // Texture coordinates pass through when texturing is on; otherwise they are zero.
+    Output.TextureUV = bTexture ? vTexCoord0 : float2(0, 0);
+
+    return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel shader output structure
+//--------------------------------------------------------------------------------------
+struct PS_OUTPUT // Returned by RenderScenePS.
+{
+    float4 RGBColor : COLOR0;  // Pixel color    
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader outputs the pixel's color by modulating the texture's
+//       color with diffuse material color
+//--------------------------------------------------------------------------------------
+PS_OUTPUT RenderScenePS( VS_OUTPUT In,
+                         uniform bool bTexture ) 
+{
+    // Start from the interpolated vertex color...
+    PS_OUTPUT Output;
+    Output.RGBColor = In.Diffuse;
+
+    // ...and modulate it by the mesh texture when texturing is enabled.
+    if( bTexture )
+        Output.RGBColor *= tex2D(MeshTextureSampler, In.TextureUV);
+
+    return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Renders scene to render target
+//--------------------------------------------------------------------------------------
+technique RenderSceneWithTexture1Light // RenderSceneVS specialized with nNumLights = 1, bTexture = true, bAnimate = true
+{
+    pass P0
+    {          
+        VertexShader = compile vs_2_0 RenderSceneVS( 1, true, true );
+        PixelShader  = compile ps_2_0 RenderScenePS( true ); // trivial pixel shader (could use FF instead if desired)
+    }
+}
+
+technique RenderSceneWithTexture2Light // RenderSceneVS specialized with nNumLights = 2, bTexture = true, bAnimate = true
+{
+    pass P0
+    {          
+        VertexShader = compile vs_2_0 RenderSceneVS( 2, true, true );
+        PixelShader  = compile ps_2_0 RenderScenePS( true ); // trivial pixel shader (could use FF instead if desired)
+    }
+}
+
+technique RenderSceneWithTexture3Light // RenderSceneVS specialized with nNumLights = 3, bTexture = true, bAnimate = true
+{
+    pass P0
+    {          
+        VertexShader = compile vs_2_0 RenderSceneVS( 3, true, true );
+        PixelShader  = compile ps_2_0 RenderScenePS( true ); // trivial pixel shader (could use FF instead if desired)
+    }
+}
+
+technique RenderSceneNoTexture // RenderSceneVS specialized with nNumLights = 1, bTexture = false, bAnimate = false; the pixel shader skips the texture lookup
+{
+    pass P0
+    {          
+        VertexShader = compile vs_2_0 RenderSceneVS( 1, false, false );
+        PixelShader  = compile ps_2_0 RenderScenePS( false ); // trivial pixel shader (could use FF instead if desired)
+    }
+}
diff --git a/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl
new file mode 100644
index 000000000..78fff9eeb
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_PS.hlsl
@@ -0,0 +1,51 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: BasicHLSL11_PS.hlsl
+//
+// The pixel shader file for the BasicHLSL11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+	float4		g_vObjectColor			: packoffset( c0 );	// declared at c0 but not read by PSMain in this file
+};
+
+cbuffer cbPerFrame : register( b1 )
+{
+	float3		g_vLightDir				: packoffset( c0 );		// c0.xyz; dotted with the surface normal in PSMain
+	float		g_fAmbient				: packoffset( c0.w );	// shares register c0 with g_vLightDir; lower bound on the lighting term in PSMain
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D	g_txDiffuse : register( t0 );	// sampled in PSMain at the interpolated texture coordinate
+SamplerState g_samLinear : register( s0 );	// sampler used with g_txDiffuse; its state is not defined in this file
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct PS_INPUT
+{
+	float3 vNormal		: NORMAL;		// used as-is (no normalize) in PSMain's dot product with g_vLightDir
+	float2 vTexcoord	: TEXCOORD0;	// lookup coordinate for g_txDiffuse
+};
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PSMain( PS_INPUT Input ) : SV_TARGET
+{
+	// Clamped N.L diffuse term, never allowed to fall below the ambient level.
+	const float fNdotL = saturate( dot( g_vLightDir, Input.vNormal ) );
+	const float fLight = max( fNdotL, g_fAmbient );
+	// Scale every channel of the sampled texel (alpha included) by the light term.
+	const float4 vTexel = g_txDiffuse.Sample( g_samLinear, Input.vTexcoord );
+	return vTexel * fLight;
+}
+
diff --git a/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl
new file mode 100644
index 000000000..cb2c1b950
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicHLSL11/BasicHLSL11_VS.hlsl
@@ -0,0 +1,49 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain
+//--------------------------------------------------------------------------------------
+// File: BasicHLSL11_VS.hlsl
+//
+// The vertex shader file for the BasicHLSL11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+	matrix		g_mWorldViewProjection	: packoffset( c0 );	// starts at c0; transforms the input position in VSMain
+	matrix		g_mWorld				: packoffset( c4 );	// starts at c4; only its float3x3 cast is applied (to the normal) in VSMain
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+	float4 vPosition	: POSITION;		// multiplied by g_mWorldViewProjection in VSMain
+	float3 vNormal		: NORMAL;		// multiplied by the 3x3 part of g_mWorld in VSMain
+	float2 vTexcoord	: TEXCOORD0;	// copied to the output unchanged
+};
+
+struct VS_OUTPUT
+{
+	float3 vNormal		: NORMAL;		// input normal times (float3x3)g_mWorld; not re-normalized in VSMain
+	float2 vTexcoord	: TEXCOORD0;	// pass-through of the input texture coordinate
+	float4 vPosition	: SV_POSITION;	// input position times g_mWorldViewProjection
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+	VS_OUTPUT vsOut;
+	// Texture coordinates are forwarded untouched.
+	vsOut.vTexcoord = Input.vTexcoord;
+	// Vectors are the left operand of mul (row-vector convention).
+	vsOut.vNormal = mul( Input.vNormal, (float3x3)g_mWorld );
+	vsOut.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+	return vsOut;
+}
+
diff --git a/tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx b/tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx
new file mode 100644
index 000000000..1ecc1930a
--- /dev/null
+++ b/tests/hlsl/dxsdk/BasicHLSLFX11/BasicHLSLFX11.fx
@@ -0,0 +1,181 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: BasicHLSL11.fx
+//
+// The effect file for the BasicHLSL sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Global variables
+//--------------------------------------------------------------------------------------
+float4 g_MaterialAmbientColor;      // Material's ambient color
+float4 g_MaterialDiffuseColor;      // Material's diffuse color
+int g_nNumLights;                   // Not read by the shaders in this file; RenderSceneVS takes its light count as a uniform argument
+
+float3 g_LightDir[3];               // Light's direction in world space (one entry per light; techniques use 1 to 3)
+float4 g_LightDiffuse[3];           // Light's diffuse color (indexed in step with g_LightDir)
+float4 g_LightAmbient;              // Light's ambient color
+
+Texture2D g_MeshTexture;            // Color texture for mesh
+
+float    g_fTime;                   // App's time in seconds
+float4x4 g_mWorld;                  // World matrix for object
+float4x4 g_mWorldViewProjection;    // World * View * Projection matrix
+
+//--------------------------------------------------------------------------------------
+// DepthStates
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth // bound by every technique11 pass in this file via SetDepthStencilState
+{
+    DepthEnable = TRUE;
+    DepthWriteMask = ALL;
+    DepthFunc = LESS_EQUAL; // pass when the incoming depth is <= the stored depth
+};
+
+//--------------------------------------------------------------------------------------
+// Texture samplers
+//--------------------------------------------------------------------------------------
+SamplerState MeshTextureSampler // used by RenderScenePS to sample g_MeshTexture
+{
+    Filter = MIN_MAG_MIP_LINEAR; // linear filtering for minification, magnification and between mips
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex shader output structure
+//--------------------------------------------------------------------------------------
+struct VS_OUTPUT
+{
+    float4 Position   : SV_POSITION; // vertex position after mul by g_mWorldViewProjection
+    float4 Diffuse    : COLOR0;      // vertex diffuse color (note that COLOR0 is clamped from 0..1)
+    float2 TextureUV  : TEXCOORD0;   // vertex texture coords (zero when RenderSceneVS is compiled with bTexture=false)
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader computes standard transform and lighting
+//--------------------------------------------------------------------------------------
+VS_OUTPUT RenderSceneVS( float4 vPos : POSITION,
+                         float3 vNormal : NORMAL,
+                         float2 vTexCoord0 : TEXCOORD,
+                         uniform int nNumLights,
+                         uniform bool bTexture,
+                         uniform bool bAnimate )
+{
+    VS_OUTPUT Output;
+
+    // Optionally displace the vertex along its object-space normal by a
+    // time-driven amount; bAnimate is a uniform supplied by each technique.
+    float4 vDisplacedPos = vPos;
+    if( bAnimate )
+        vDisplacedPos += float4(vNormal, 0) * (sin(g_fTime+5.5)+0.5)*5;
+
+    // Object space -> homogeneous projection space
+    Output.Position = mul(vDisplacedPos, g_mWorldViewProjection);
+
+    // Object-space normal -> unit-length world-space normal
+    const float3 vWorldNormal = normalize(mul(vNormal, (float3x3)g_mWorld));
+
+    // Sum the clamped N.L contribution of the first nNumLights directional lights
+    float3 vLightSum = float3(0,0,0);
+    for( int iLight = 0; iLight < nNumLights; ++iLight )
+    {
+        vLightSum += g_LightDiffuse[iLight] * max(0,dot(vWorldNormal, g_LightDir[iLight]));
+    }
+
+    // Material-modulated diffuse plus ambient; alpha is forced opaque
+    Output.Diffuse.rgb = g_MaterialDiffuseColor * vLightSum + 
+                         g_MaterialAmbientColor * g_LightAmbient;   
+    Output.Diffuse.a = 1.0f; 
+
+    // Texture coordinates pass through only when texturing is enabled
+    Output.TextureUV = 0; 
+    if( bTexture ) 
+        Output.TextureUV = vTexCoord0; 
+    return Output;    
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel shader output structure
+//--------------------------------------------------------------------------------------
+struct PS_OUTPUT
+{
+    float4 RGBColor : SV_Target;  // Pixel color written by RenderScenePS
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader outputs the pixel's color by modulating the texture's
+//       color with diffuse material color
+//--------------------------------------------------------------------------------------
+PS_OUTPUT RenderScenePS( VS_OUTPUT In,
+                         uniform bool bTexture ) 
+{ 
+    // Untextured result: pass the interpolated vertex diffuse straight through.
+    PS_OUTPUT Output;
+    Output.RGBColor = In.Diffuse;
+
+    // Textured result: modulate the mesh texture sample by the vertex diffuse.
+    if( bTexture )
+        Output.RGBColor = g_MeshTexture.Sample(MeshTextureSampler, In.TextureUV) * In.Diffuse;
+
+    return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Renders scene to render target using D3D11 Techniques
+//--------------------------------------------------------------------------------------
+technique11 RenderSceneWithTexture1Light
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0_level_9_1, RenderSceneVS( 1, true, true ) ) ); // nNumLights=1, bTexture=true, bAnimate=true
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0_level_9_1, RenderScenePS( true ) ) ); // bTexture=true: sample g_MeshTexture
+
+        SetDepthStencilState( EnableDepth, 0 );
+    }
+}
+
+technique11 RenderSceneWithTexture2Light
+{
+    pass P0
+    {          
+        SetVertexShader( CompileShader( vs_4_0_level_9_1, RenderSceneVS( 2, true, true ) ) ); // nNumLights=2, bTexture=true, bAnimate=true
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0_level_9_1, RenderScenePS( true ) ) ); // bTexture=true: sample g_MeshTexture
+        
+        SetDepthStencilState( EnableDepth, 0 );
+    }
+}
+
+technique11 RenderSceneWithTexture3Light
+{
+    pass P0
+    {          
+        SetVertexShader( CompileShader( vs_4_0_level_9_1, RenderSceneVS( 3, true, true ) ) ); // nNumLights=3, bTexture=true, bAnimate=true
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0_level_9_1, RenderScenePS( true ) ) ); // bTexture=true: sample g_MeshTexture
+
+        SetDepthStencilState( EnableDepth, 0 );
+    }
+}
+
+technique11 RenderSceneNoTexture
+{
+    pass P0
+    {          
+        SetVertexShader( CompileShader( vs_4_0_level_9_1, RenderSceneVS( 1, true, true ) ) ); // NOTE(review): bTexture and bAnimate are both true although the pixel shader below never samples the texture -- confirm this is intended for a "NoTexture" technique
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0_level_9_1, RenderScenePS( false ) ) ); // bTexture=false: outputs the interpolated diffuse only
+
+        SetDepthStencilState( EnableDepth, 0 );
+    }
+}
\ No newline at end of file
diff --git a/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeScene.hlsl b/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeScene.hlsl
new file mode 100644
index 000000000..6a6dca0c4
--- /dev/null
+++ b/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeScene.hlsl
@@ -0,0 +1,506 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: RenderCascadeScene.hlsl
+//
+// This is the main shader file.  This shader is compiled with several different flags 
+// to provide different customizations based on user controls.
+// 
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+
+// This flag uses the derivative information to map the texels in a shadow map to the
+// view space plane of the primitive being rendered.  This depth is then used as the 
+// comparison depth and reduces self-shadowing aliasing.  This technique is expensive
+// and is only valid when objects are planar ( such as a ground plane ).
+#ifndef USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG
+#define USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG 0
+#endif
+
+// This flag enables the shadow to blend between cascades.  This is most useful when 
+// the shadow maps are small and artifacts can be seen between the various cascade layers.
+#ifndef BLEND_BETWEEN_CASCADE_LAYERS_FLAG
+#define BLEND_BETWEEN_CASCADE_LAYERS_FLAG 0
+#endif
+
+// There are two methods for selecting the proper cascade a fragment lies in.  Interval selection
+// compares the depth of the fragment against the frustum's depth partition.
+// Map based selection compares the texture coordinates against the actual cascade maps.
+// Map based selection gives better coverage.  
+// Interval based selection is easier to extend and understand.
+#ifndef SELECT_CASCADE_BY_INTERVAL_FLAG
+#define SELECT_CASCADE_BY_INTERVAL_FLAG 0
+#endif
+
+// The number of cascades (compile-time; bounds the cascade loops and index clamps in PSMain)
+#ifndef CASCADE_COUNT_FLAG
+#define CASCADE_COUNT_FLAG 3
+#endif
+
+
+// Most titles will find that 3-4 cascades with 
+// BLEND_BETWEEN_CASCADE_LAYERS_FLAG, is good for lower end PCs.
+// High end PCs will be able to handle more cascades, and larger blur bands.
+// In some cases such as when large PCF kernels are used, derivative based depth offsets could be used 
+// with larger PCF blur kernels on high end PCs for the ground plane.
+
+cbuffer cbAllShadowData : register( b0 )
+{
+    matrix          m_mWorldViewProjection;
+    matrix          m_mWorld;
+    matrix          m_mWorldView; // only the .z of position * m_mWorldView is kept (VS_OUTPUT::vDepth)
+    matrix          m_mShadow; // applied to the vertex position in VSMain to produce vTexShadow
+    float4          m_vCascadeOffset[8]; // per-cascade offset added after m_vCascadeScale
+    float4          m_vCascadeScale[8]; // per-cascade scale applied to the shadow coordinate
+    int             m_nCascadeLevels; // Number of Cascades (not read by the shaders in this file; CASCADE_COUNT_FLAG is used instead)
+    int             m_iVisualizeCascades; // 1 is to visualize the cascades in different colors. 0 is to just draw the scene
+    int             m_iPCFBlurForLoopStart; // For loop begin value. For a 5x5 kernel this would be -2.
+    int             m_iPCFBlurForLoopEnd; // For loop end value. For a 5x5 kernel this would be 3.
+
+    // For Map based selection scheme, this keeps the pixels inside of the valid range.
+    // When there is no border, these values are 0 and 1 respectively.
+    float           m_fMinBorderPadding;     
+    float           m_fMaxBorderPadding;
+    float           m_fShadowBiasFromGUI;  // A shadow map offset to deal with self shadow artifacts.  
+                                           //These artifacts are aggravated by PCF.
+    float           m_fShadowPartitionSize; // scales and offsets shadow-coordinate x per cascade in ComputeCoordinatesTransform
+    float           m_fCascadeBlendArea; // Amount to overlap when blending between cascades.
+    float           m_fTexelSize; // y step between PCF taps; also the texel step used by the derivative-based depth offset
+    float           m_fNativeTexelSizeInX; // x step between PCF taps
+    float           m_fPaddingForCB3; // Padding variables exist because CBs must be a multiple of 16 bytes.
+    float4          m_fCascadeFrustumsEyeSpaceDepthsFloat[2];  // The values along Z that separate the cascades.
+    float4          m_fCascadeFrustumsEyeSpaceDepthsFloat4[8];  // The values along Z that separate the cascades.  
+                                                          // Wastefully stored in float4 so they are array indexable. 
+    float3          m_vLightDir;
+    float           m_fPaddingCB4;
+
+};
+
+
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D    g_txDiffuse                    : register( t0 ); // surface color, sampled with g_samLinear in PSMain
+Texture2D    g_txShadow                     : register( t5 ); // shadow map, comparison-sampled in CalculatePCFPercentLit
+
+
+SamplerState g_samLinear                    : register( s0 );
+SamplerComparisonState g_samShadow          : register( s5 ); // comparison sampler passed to SampleCmpLevelZero
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 vPosition                        : POSITION;  // source for the clip-space, shadow-space and view-depth outputs
+    float3 vNormal                          : NORMAL;    // multiplied by the 3x3 part of m_mWorld in VSMain
+    float2 vTexcoord                        : TEXCOORD0; // copied to the output unchanged
+};
+
+struct VS_OUTPUT
+{
+    float3 vNormal                          : NORMAL;      // input normal times (float3x3)m_mWorld
+    float2 vTexcoord                        : TEXCOORD0;   // pass-through texture coordinate
+    float4 vTexShadow					    : TEXCOORD1;   // input position times m_mShadow
+    float4 vPosition                        : SV_POSITION; // input position times m_mWorldViewProjection
+    float4 vInterpPos                       : TEXCOORD2;   // untransformed input position
+    float  vDepth                           : TEXCOORD3;   // z of input position times m_mWorldView
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+    VS_OUTPUT vsOut;
+
+    // Attributes forwarded without transformation.
+    vsOut.vTexcoord  = Input.vTexcoord;
+    vsOut.vInterpPos = Input.vPosition;
+
+    // One transform of the input position per consumer.
+    vsOut.vPosition  = mul( Input.vPosition, m_mWorldViewProjection );
+    vsOut.vTexShadow = mul( Input.vPosition, m_mShadow );   // shared by all cascades; per-cascade scale/offset is applied in the pixel shader
+    vsOut.vDepth     = mul( Input.vPosition, m_mWorldView ).z;
+    vsOut.vNormal    = mul( Input.vNormal, (float3x3)m_mWorld );
+    return vsOut;
+}
+
+
+
+static const float4 vCascadeColorsMultiplier[8] = // per-cascade tint; PSMain indexes it by the selected cascade and multiplies it into the final color
+{
+    float4 ( 1.5f, 0.0f, 0.0f, 1.0f ),
+    float4 ( 0.0f, 1.5f, 0.0f, 1.0f ),
+    float4 ( 0.0f, 0.0f, 5.5f, 1.0f ),
+    float4 ( 1.5f, 0.0f, 5.5f, 1.0f ),
+    float4 ( 1.5f, 1.5f, 0.0f, 1.0f ),
+    float4 ( 1.0f, 1.0f, 1.0f, 1.0f ),
+    float4 ( 0.0f, 1.0f, 5.5f, 1.0f ),
+    float4 ( 0.5f, 3.5f, 0.75f, 1.0f )
+};
+
+
+void ComputeCoordinatesTransform( in int iCascadeIndex,
+                                      in float4 InterpolatedPosition ,
+                                      in out float4 vShadowTexCoord , 
+                                      in out float4 vShadowTexCoordViewSpace ) 
+{
+    // InterpolatedPosition is accepted but not used by this function.
+    // With interval selection the cascade's scale/offset is applied here; with
+    // map selection vShadowTexCoord is used exactly as the caller passed it in.
+    if( SELECT_CASCADE_BY_INTERVAL_FLAG ) 
+    {
+        vShadowTexCoord = vShadowTexCoordViewSpace * m_vCascadeScale[iCascadeIndex]
+                        + m_vCascadeOffset[iCascadeIndex];
+    }  
+
+    // Scale x by the partition size, then shift it by iCascadeIndex partitions.
+    vShadowTexCoord.x = vShadowTexCoord.x * m_fShadowPartitionSize + (m_fShadowPartitionSize * (float)iCascadeIndex ); 
+} 
+
+
+//--------------------------------------------------------------------------------------
+// This function calculates the screen space depth for shadow space texels
+//--------------------------------------------------------------------------------------
+void CalculateRightAndUpTexelDepthDeltas ( in float3 vShadowTexDDX,
+                                           in float3 vShadowTexDDY,
+                                           out float fUpTextDepthWeight,
+                                           out float fRightTextDepthWeight
+ ) {
+
+    // The X and Y derivatives of the shadow coordinate form a 2x2 matrix that
+    // maps screen space to shadow space.  Its inverse maps shadow space back
+    // to screen space, so a one-texel step in the shadow map can be expressed
+    // as a screen-space step and, from that, as a change in depth.
+    //
+    // This is not a perfect solution: it assumes the underlying geometry is a
+    // plane.  A more accurate way of finding the actual depth would be a
+    // deferred rendering approach that actually samples the depth.
+    //
+    // Using an offset, or using variance shadow maps, is a better approach to
+    // reducing these artifacts in most cases.
+
+    const float2x2 mScreenToShadow = float2x2( vShadowTexDDX.xy, vShadowTexDDY.xy );
+
+    // Closed-form inverse of a 2x2 matrix: swap the diagonal, negate the
+    // off-diagonal, and scale by 1 / determinant.
+    const float fInvDet = 1.0f / determinant ( mScreenToShadow );
+    const float2x2 mShadowToScreen = float2x2 (
+        mScreenToShadow._22 * fInvDet, mScreenToShadow._12 * -fInvDet, 
+        mScreenToShadow._21 * -fInvDet, mScreenToShadow._11 * fInvDet );
+
+    // One-texel steps to the right and up in the shadow map, transformed by
+    // the shadow-to-screen matrix.
+    const float2 vRightStep = mul( float2( m_fTexelSize, 0.0f ), mShadowToScreen );
+    const float2 vUpStep    = mul( float2( 0.0f, m_fTexelSize ), mShadowToScreen );
+
+    // Depth change per shadow texel: the transformed step weighted by the
+    // derivatives of depth in X and Y (the .z components of the two inputs).
+    fUpTextDepthWeight = 
+        vUpStep.x * vShadowTexDDX.z 
+        + vUpStep.y * vShadowTexDDY.z;
+
+    fRightTextDepthWeight = 
+        vRightStep.x * vShadowTexDDX.z 
+        + vRightStep.y * vShadowTexDDY.z;
+
+}
+
+
+//--------------------------------------------------------------------------------------
+// Use PCF to sample the depth map and return a percent lit value.
+//--------------------------------------------------------------------------------------
+void CalculatePCFPercentLit ( in float4 vShadowTexCoord, 
+                              in float fRightTexelDepthDelta, 
+                              in float fUpTexelDepthDelta, 
+                              in float fBlurRowSize,
+                              out float fPercentLit
+                              ) 
+{
+    // A constant bias is the simplest answer to PCF's depth-bias problems.
+    // Too much bias causes Peter-panning (shadows near an object's base disappear);
+    // too little causes shadow acne (lit surfaces become partially self-shadowed).
+    const float fBiasedDepth = vShadowTexCoord.z - m_fShadowBiasFromGUI;
+
+    // With a fixed kernel size this loop could be unrolled and texture immediate
+    // offsets used instead, which would be a performance improvement.
+    float fLitSum = 0.0f;
+    for( int iTexelX = m_iPCFBlurForLoopStart; iTexelX < m_iPCFBlurForLoopEnd; ++iTexelX ) 
+    {
+        for( int iTexelY = m_iPCFBlurForLoopStart; iTexelY < m_iPCFBlurForLoopEnd; ++iTexelY ) 
+        {
+            float fCompareDepth = fBiasedDepth;
+            if ( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG ) 
+            {
+                // Add the derivative-based depth change for this tap's x/y offset.
+                fCompareDepth += fRightTexelDepthDelta * ( (float) iTexelX ) + fUpTexelDepthDelta * ( (float) iTexelY );
+            }
+
+            // Compare the adjusted pixel depth to the depth read from the map.
+            const float2 vTapUV = float2( 
+                vShadowTexCoord.x + ( ( (float) iTexelX ) * m_fNativeTexelSizeInX ) , 
+                vShadowTexCoord.y + ( ( (float) iTexelY ) * m_fTexelSize ) );
+            fLitSum += g_txShadow.SampleCmpLevelZero( g_samShadow, vTapUV, fCompareDepth );
+        }
+    }
+    fPercentLit = fLitSum / (float)fBlurRowSize;
+}
+
+//--------------------------------------------------------------------------------------
+// Calculate amount to blend between two cascades and the band where blending will occur.
+//--------------------------------------------------------------------------------------
+void CalculateBlendAmountForInterval ( in int iCurrentCascadeIndex, 
+                                       in out float fPixelDepth, 
+                                       in out float fCurrentPixelsBlendBandLocation,
+                                       out float fBlendBetweenCascadesAmount
+                                       ) 
+{
+    // Locate the pixel inside the current cascade's depth interval so the caller can
+    // tell whether it lies in the band that fades into the next cascade (and can
+    // otherwise skip the second PCF loop).
+    // fPixelDepth is in/out: on return it is relative to the start of the interval.
+    float fBlendInterval = m_fCascadeFrustumsEyeSpaceDepthsFloat4[ iCurrentCascadeIndex  ].x;
+    // Interval start: previous cascade's split depth, or 0 for the first cascade (index clamped so cascade 0 never reads element -1).
+    float fIntervalStart = ( iCurrentCascadeIndex > 0 ) ? m_fCascadeFrustumsEyeSpaceDepthsFloat4[ max( 0, iCurrentCascadeIndex - 1 ) ].x : 0.0f;
+    fPixelDepth -= fIntervalStart;
+    fBlendInterval -= fIntervalStart;
+    
+    // The current pixel's blend band location will be used to determine when we need to blend and by how much.
+    fCurrentPixelsBlendBandLocation = fPixelDepth / fBlendInterval;
+    fCurrentPixelsBlendBandLocation = 1.0f - fCurrentPixelsBlendBandLocation;
+    // The fBlendBetweenCascadesAmount is our location in the blend band.
+    fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea;
+}
+
+
+
+//--------------------------------------------------------------------------------------
+// Calculate amount to blend between two cascades and the band where blending will occur.
+//--------------------------------------------------------------------------------------
+void CalculateBlendAmountForMap ( in float4 vShadowMapTextureCoord, 
+                                  in out float fCurrentPixelsBlendBandLocation,
+                                  out float fBlendBetweenCascadesAmount ) 
+{
+    // Blend band location for map based selection: the smallest of x, y,
+    // 1 - x and 1 - y of the shadow map texture coordinate.
+    const float2 vFromLowEdge  = vShadowMapTextureCoord.xy;
+    const float2 vFromHighEdge = float2 ( 1.0f - vShadowMapTextureCoord.x, 1.0f - vShadowMapTextureCoord.y );
+    fCurrentPixelsBlendBandLocation = min( min( vFromLowEdge.x, vFromLowEdge.y ),
+                                           min( vFromHighEdge.x, vFromHighEdge.y ) );
+    fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea;
+}
+
+//--------------------------------------------------------------------------------------
+// Calculate the shadow based on several options and render the scene.
+//--------------------------------------------------------------------------------------
+float4 PSMain( VS_OUTPUT Input ) : SV_TARGET
+{ // Selects a cascade, PCF-samples the shadow map, optionally blends with the next cascade, then applies simple lighting.
+    float4 vDiffuse = g_txDiffuse.Sample( g_samLinear, Input.vTexcoord );
+    
+    float4 vShadowMapTextureCoord = 0.0f;
+    float4 vShadowMapTextureCoord_blend = 0.0f;
+    
+    float4 vVisualizeCascadeColor = float4(0.0f,0.0f,0.0f,1.0f);
+    
+    float fPercentLit = 0.0f;
+    float fPercentLit_blend = 0.0f;
+
+   
+    float fUpTextDepthWeight=0;
+    float fRightTextDepthWeight=0;
+    float fUpTextDepthWeight_blend=0;
+    float fRightTextDepthWeight_blend=0;
+
+    int iBlurRowSize = m_iPCFBlurForLoopEnd - m_iPCFBlurForLoopStart; // taps per kernel row
+    iBlurRowSize *= iBlurRowSize; // squared: total taps in the kernel, used to normalize the PCF sum
+    float fBlurRowSize = (float)iBlurRowSize;
+        
+    int iCascadeFound = 0;
+    int iNextCascadeIndex = 1;
+
+    float fCurrentPixelDepth;
+
+    // The interval based selection technique compares the pixel's depth against the frustum's cascade divisions.
+    fCurrentPixelDepth = Input.vDepth;
+    
+    // This for loop is not necessary when the frustum is uniformly divided and interval based selection is used.
+    // In this case fCurrentPixelDepth could be used as an array lookup into the correct frustum. 
+    int iCurrentCascadeIndex;
+    
+    float4 vShadowMapTextureCoordViewSpace = Input.vTexShadow;
+    if( SELECT_CASCADE_BY_INTERVAL_FLAG ) 
+    {
+        iCurrentCascadeIndex = 0;
+        if ( CASCADE_COUNT_FLAG > 1 ) 
+        {
+            float4 vCurrentPixelDepth = Input.vDepth;
+            float4 fComparison = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsFloat[0]); // component-wise; each split depth the pixel lies beyond contributes 1 to fIndex
+            float4 fComparison2 = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsFloat[1]);
+            float fIndex = dot( 
+                            float4( CASCADE_COUNT_FLAG > 0,
+                                    CASCADE_COUNT_FLAG > 1, 
+                                    CASCADE_COUNT_FLAG > 2, 
+                                    CASCADE_COUNT_FLAG > 3)
+                            , fComparison )
+                         + dot( 
+                            float4(
+                                    CASCADE_COUNT_FLAG > 4,
+                                    CASCADE_COUNT_FLAG > 5,
+                                    CASCADE_COUNT_FLAG > 6,
+                                    CASCADE_COUNT_FLAG > 7)
+                            , fComparison2 ) ;
+                                    
+            fIndex = min( fIndex, CASCADE_COUNT_FLAG - 1 ); // clamp to the last cascade
+            iCurrentCascadeIndex = (int)fIndex;
+        }
+    }
+    
+    if ( !SELECT_CASCADE_BY_INTERVAL_FLAG ) 
+    {
+        iCurrentCascadeIndex = 0;
+        if ( CASCADE_COUNT_FLAG  == 1 ) 
+        {
+            vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[0];
+            vShadowMapTextureCoord += m_vCascadeOffset[0];
+        }
+        if ( CASCADE_COUNT_FLAG > 1 ) {
+            for( int iCascadeIndex = 0; iCascadeIndex < CASCADE_COUNT_FLAG && iCascadeFound == 0; ++iCascadeIndex ) 
+            {
+                vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iCascadeIndex];
+                vShadowMapTextureCoord += m_vCascadeOffset[iCascadeIndex];
+
+                if ( min( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) > m_fMinBorderPadding
+                  && max( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) < m_fMaxBorderPadding )
+                { 
+                    iCurrentCascadeIndex = iCascadeIndex;   
+                    iCascadeFound = 1; // first cascade whose x and y both fall inside the padded range wins
+                }
+            }
+        }
+    }    
+    
+    float4 color = 0;   // not referenced again in this function
+  
+    if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG  ) 
+    {
+        // Repeat tex coord calculations for the next cascade. 
+        // The next cascade index is used for blurring between maps.
+        iNextCascadeIndex = min ( CASCADE_COUNT_FLAG - 1, iCurrentCascadeIndex + 1 ); 
+    }            
+
+    float fBlendBetweenCascadesAmount = 1.0f;
+    float fCurrentPixelsBlendBandLocation = 1.0f;
+    
+    if( SELECT_CASCADE_BY_INTERVAL_FLAG ) 
+    {
+        if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1  ) 
+         {
+            CalculateBlendAmountForInterval ( iCurrentCascadeIndex, fCurrentPixelDepth, 
+                fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount );
+        }   
+    }
+    else 
+    {
+    
+        if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG ) 
+        {
+            CalculateBlendAmountForMap ( vShadowMapTextureCoord, 
+                fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount );
+        }   
+    }
+    
+    float3 vShadowMapTextureCoordDDX;
+    float3 vShadowMapTextureCoordDDY;
+    // The derivatives are used to find the slope of the current plane.
+    // The derivative calculation has to be inside of the loop in order to prevent divergent flow control artifacts.
+    if( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG ) 
+    {
+        vShadowMapTextureCoordDDX = ddx( vShadowMapTextureCoordViewSpace ); // float4 derivative stored in a float3 (w is dropped)
+        vShadowMapTextureCoordDDY = ddy( vShadowMapTextureCoordViewSpace );    
+        
+        vShadowMapTextureCoordDDX *= m_vCascadeScale[iCurrentCascadeIndex];
+        vShadowMapTextureCoordDDY *= m_vCascadeScale[iCurrentCascadeIndex];
+    }    
+    
+    ComputeCoordinatesTransform( iCurrentCascadeIndex, 
+                                 Input.vInterpPos, 
+                                 vShadowMapTextureCoord, 
+                                 vShadowMapTextureCoordViewSpace );    
+                                 
+
+    vVisualizeCascadeColor = vCascadeColorsMultiplier[iCurrentCascadeIndex];
+         
+    if( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG ) 
+    {
+         CalculateRightAndUpTexelDepthDeltas ( vShadowMapTextureCoordDDX, vShadowMapTextureCoordDDY,
+                                              fUpTextDepthWeight, fRightTextDepthWeight );
+    }
+    
+    CalculatePCFPercentLit ( vShadowMapTextureCoord, fRightTextDepthWeight, 
+                            fUpTextDepthWeight, fBlurRowSize, fPercentLit );
+                                             
+    if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1 ) 
+    {
+        if( fCurrentPixelsBlendBandLocation < m_fCascadeBlendArea) 
+        {  // the current pixel is within the blend band.
+    
+            // Repeat tex coord calculations for the next cascade. 
+            // The next cascade index is used for blurring between maps.
+            if( !SELECT_CASCADE_BY_INTERVAL_FLAG ) 
+            {
+                vShadowMapTextureCoord_blend = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iNextCascadeIndex];
+                vShadowMapTextureCoord_blend += m_vCascadeOffset[iNextCascadeIndex];
+            }
+            
+            ComputeCoordinatesTransform( iNextCascadeIndex, Input.vInterpPos, 
+                                             vShadowMapTextureCoord_blend, 
+										     vShadowMapTextureCoordViewSpace );  
+       
+        // We repeat the calculation for the next cascade layer, when blending between maps.
+            if( fCurrentPixelsBlendBandLocation < m_fCascadeBlendArea) // NOTE(review): same test as the enclosing if, and the value is unchanged in between
+            {  // the current pixel is within the blend band.
+                if( USE_DERIVATIVES_FOR_DEPTH_OFFSET_FLAG ) 
+                {
+
+                    CalculateRightAndUpTexelDepthDeltas ( vShadowMapTextureCoordDDX,
+                                                          vShadowMapTextureCoordDDY,
+                                                          fUpTextDepthWeight_blend,
+                                                          fRightTextDepthWeight_blend );
+                }   
+                CalculatePCFPercentLit ( vShadowMapTextureCoord_blend, fRightTextDepthWeight_blend, 
+                                        fUpTextDepthWeight_blend, fBlurRowSize, fPercentLit_blend );
+                fPercentLit = lerp( fPercentLit_blend, fPercentLit, fBlendBetweenCascadesAmount ); 
+                // Blend the two calculated shadows by the blend amount.
+            }   
+        }   
+    }    
+
+    
+    if( !m_iVisualizeCascades ) vVisualizeCascadeColor = float4(1.0f,1.0f,1.0f,1.0f);
+    
+    float3 vLightDir1 = float3( -1.0f, 1.0f, -1.0f ); 
+    float3 vLightDir2 = float3( 1.0f, 1.0f, -1.0f ); 
+    float3 vLightDir3 = float3( 0.0f, -1.0f, 0.0f );
+    float3 vLightDir4 = float3( 1.0f, 1.0f, 1.0f );     
+    // Some ambient-like lighting.
+    float fLighting = 
+                      saturate( dot( vLightDir1 , Input.vNormal ) )*0.05f +
+                      saturate( dot( vLightDir2 , Input.vNormal ) )*0.05f +
+                      saturate( dot( vLightDir3 , Input.vNormal ) )*0.05f +
+                      saturate( dot( vLightDir4 , Input.vNormal ) )*0.05f ;
+    
+    float4 vShadowLighting = fLighting * 0.5f; // fully shadowed level: half of the ambient-like term (scalar widened to float4)
+    fLighting += saturate( dot( m_vLightDir , Input.vNormal ) );
+    fLighting = lerp( vShadowLighting, fLighting, fPercentLit ); // float4 lerp result assigned back to a scalar
+    
+    return fLighting * vVisualizeCascadeColor * vDiffuse;
+
+}
+
diff --git a/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl b/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl
new file mode 100644
index 000000000..3b4d32a0d
--- /dev/null
+++ b/tests/hlsl/dxsdk/CascadedShadowMaps11/RenderCascadeShadow.hlsl
@@ -0,0 +1,53 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -entry VSMainPancake
+//--------------------------------------------------------------------------------------
+// File: RenderCascadeShadow.hlsl
+//
+// The shader file for the RenderCascadeScene sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Globals: per-object constants bound to constant-buffer slot b0.
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+    matrix        g_mWorldViewProjection    : packoffset( c0 );    // applied to the input position by both vertex shaders in this file
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures shared by VSMain and VSMainPancake
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 vPosition    : POSITION;       // vertex position; multiplied by g_mWorldViewProjection
+};
+
+struct VS_OUTPUT
+{
+    float4 vPosition    : SV_POSITION;    // transformed position; the only value these shaders output
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: multiplies the input position by g_mWorldViewProjection.
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+    VS_OUTPUT Output;
+    
+    // There is nothing special here, just transform and write out the depth.
+    Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+
+    return Output;
+}
+
+
+VS_OUTPUT VSMainPancake( VS_INPUT Input )
+{
+    VS_OUTPUT Output;
+    // Same transform as VSMain. The clamp of z to 0 below is commented out, so no pancaking is applied here.
+    Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+	//Output.vPosition.z = max( Output.vPosition.z, 0.0f );
+    return Output;
+}
\ No newline at end of file
diff --git a/tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl b/tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl
new file mode 100644
index 000000000..db7bd5136
--- /dev/null
+++ b/tests/hlsl/dxsdk/ComputeShaderSort11/ComputeShaderSort11.hlsl
@@ -0,0 +1,75 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry BitonicSort -entry MatrixTranspose
+//--------------------------------------------------------------------------------------
+// File: ComputeShaderSort11.hlsl
+//
+// This file contains the compute shaders to perform GPU sorting using DirectX 11.
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#define BITONIC_BLOCK_SIZE 512
+
+#define TRANSPOSE_BLOCK_SIZE 16
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer CB : register( b0 )
+{
+    unsigned int g_iLevel;        // BitonicSort starts its compare distance at g_iLevel >> 1
+    unsigned int g_iLevelMask;    // ANDed with the dispatch thread index in BitonicSort to pick the compare direction
+    unsigned int g_iWidth;        // row stride used when MatrixTranspose reads Input
+    unsigned int g_iHeight;       // row stride used when MatrixTranspose writes Data
+};
+
+//--------------------------------------------------------------------------------------
+// Structured Buffers
+//--------------------------------------------------------------------------------------
+StructuredBuffer<unsigned int> Input : register( t0 );    // read by MatrixTranspose
+RWStructuredBuffer<unsigned int> Data : register( u0 );   // read and rewritten by BitonicSort; written by MatrixTranspose
+
+//--------------------------------------------------------------------------------------
+// Bitonic Sort Compute Shader: each thread handles one element of Data per dispatch.
+//--------------------------------------------------------------------------------------
+groupshared unsigned int shared_data[BITONIC_BLOCK_SIZE];    // one slot per thread in the group
+
+[numthreads(BITONIC_BLOCK_SIZE, 1, 1)]
+void BitonicSort( uint3 Gid : SV_GroupID, 
+                  uint3 DTid : SV_DispatchThreadID, 
+                  uint3 GTid : SV_GroupThreadID, 
+                  uint GI : SV_GroupIndex )
+{
+    // Load this thread's element into group-shared memory, then wait for the whole group.
+    shared_data[GI] = Data[DTid.x];
+    GroupMemoryBarrierWithGroupSync();
+    
+    // Sort the shared data: the compare distance j starts at g_iLevel / 2 and halves each pass.
+    for (unsigned int j = g_iLevel >> 1 ; j > 0 ; j >>= 1)
+    {
+        unsigned int result = ((shared_data[GI & ~j] <= shared_data[GI | j]) == (bool)(g_iLevelMask & DTid.x))? shared_data[GI ^ j] : shared_data[GI];    // take the partner's value (GI ^ j) or keep our own
+        GroupMemoryBarrierWithGroupSync();    // every thread finishes reading before any thread writes
+        shared_data[GI] = result;
+        GroupMemoryBarrierWithGroupSync();    // writes complete before the next pass reads
+    }
+    
+    // Store this thread's element back to the same index it was loaded from.
+    Data[DTid.x] = shared_data[GI];
+}
+
+//--------------------------------------------------------------------------------------
+// Matrix Transpose Compute Shader: copies Input to Data with x and y swapped, one tile per group.
+//--------------------------------------------------------------------------------------
+groupshared unsigned int transpose_shared_data[TRANSPOSE_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE];    // one slot per thread in the group
+
+[numthreads(TRANSPOSE_BLOCK_SIZE, TRANSPOSE_BLOCK_SIZE, 1)]
+void MatrixTranspose( uint3 Gid : SV_GroupID, 
+                      uint3 DTid : SV_DispatchThreadID, 
+                      uint3 GTid : SV_GroupThreadID, 
+                      uint GI : SV_GroupIndex )
+{
+    transpose_shared_data[GI] = Input[DTid.y * g_iWidth + DTid.x];    // stage one element per thread in group-shared memory
+    GroupMemoryBarrierWithGroupSync();
+    uint2 XY = DTid.yx - GTid.yx + GTid.xy;    // swap x/y of the group's origin but keep this thread's offset inside the group
+    Data[XY.y * g_iHeight + XY.x] = transpose_shared_data[GTid.x * TRANSPOSE_BLOCK_SIZE + GTid.y];    // read the staged tile with x/y swapped
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx
new file mode 100644
index 000000000..941e001b3
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02.fx
@@ -0,0 +1,23 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial02.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: returns the input position unchanged.
+//--------------------------------------------------------------------------------------
+float4 VS( float4 Pos : POSITION ) : SV_POSITION
+{
+    return Pos;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: outputs the same constant color for every pixel; Pos is not read.
+//--------------------------------------------------------------------------------------
+float4 PS( float4 Pos : SV_POSITION ) : SV_Target
+{
+    return float4( 1.0f, 1.0f, 0.0f, 1.0f );    // Yellow, with Alpha = 1
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl
new file mode 100644
index 000000000..5a59aadc6
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial02.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl
new file mode 100644
index 000000000..d58459b78
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial02/Tutorial02_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial02.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx
new file mode 100644
index 000000000..941e001b3
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03.fx
@@ -0,0 +1,23 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial03.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: returns the input position unchanged.
+//--------------------------------------------------------------------------------------
+float4 VS( float4 Pos : POSITION ) : SV_POSITION
+{
+    return Pos;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: outputs the same constant color for every pixel; Pos is not read.
+//--------------------------------------------------------------------------------------
+float4 PS( float4 Pos : SV_POSITION ) : SV_Target
+{
+    return float4( 1.0f, 1.0f, 0.0f, 1.0f );    // Yellow, with Alpha = 1
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl
new file mode 100644
index 000000000..29b6e8b2c
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial03.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl
new file mode 100644
index 000000000..db47ead28
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial03/Tutorial03_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial03.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx
new file mode 100644
index 000000000..deb7b585f
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04.fx
@@ -0,0 +1,46 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial04.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables (slot b0): the three matrices VS applies in order.
+//--------------------------------------------------------------------------------------
+cbuffer ConstantBuffer : register( b0 )
+{
+	matrix World;         // first multiply applied to the vertex position
+	matrix View;          // second multiply
+	matrix Projection;    // third multiply
+}
+
+//--------------------------------------------------------------------------------------
+struct VS_OUTPUT
+{
+    float4 Pos : SV_POSITION;    // position after the World, View and Projection multiplies
+    float4 Color : COLOR0;       // vertex color, copied unchanged by VS and returned by PS
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: transforms the position by World, View, Projection and forwards the color.
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VS( float4 Pos : POSITION, float4 Color : COLOR )
+{
+    VS_OUTPUT output = (VS_OUTPUT)0;    // zero-initialize every member
+    output.Pos = mul( Pos, World );
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Color = Color;
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: returns the color received from the vertex shader.
+//--------------------------------------------------------------------------------------
+float4 PS( VS_OUTPUT input ) : SV_Target
+{
+    return input.Color;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl
new file mode 100644
index 000000000..dc627637c
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial04.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl
new file mode 100644
index 000000000..96d0a642c
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial04/Tutorial04_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial04.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx
new file mode 100644
index 000000000..b15c99e49
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05.fx
@@ -0,0 +1,54 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial05.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables (slot b0): the three matrices VS applies in order.
+//--------------------------------------------------------------------------------------
+cbuffer ConstantBuffer : register( b0 )
+{
+	matrix World;         // first multiply applied to the vertex position
+	matrix View;          // second multiply
+	matrix Projection;    // third multiply
+}
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 Pos : POSITION;    // vertex position before any transform
+    float4 Color : COLOR;     // vertex color
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;    // position after the World, View and Projection multiplies
+    float4 Color : COLOR;        // vertex color, copied unchanged by VS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: transforms the position by World, View, Projection and forwards the color.
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;    // zero-initialize every member
+    output.Pos = mul( input.Pos, World );
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Color = input.Color;
+    
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: returns the color received from the vertex shader.
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    return input.Color;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl
new file mode 100644
index 000000000..acc900ff5
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial05.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl
new file mode 100644
index 000000000..726f05979
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial05/Tutorial05_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial05.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx
new file mode 100644
index 000000000..7d839009d
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06.fx
@@ -0,0 +1,76 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS -entry PSSolid
+//--------------------------------------------------------------------------------------
+// File: Tutorial06.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables (slot b0): transform matrices plus data for two lights.
+//--------------------------------------------------------------------------------------
+cbuffer ConstantBuffer : register( b0 )
+{
+	matrix World;             // first multiply applied to the vertex position; also applied to the normal
+	matrix View;              // second multiply
+	matrix Projection;        // third multiply
+	float4 vLightDir[2];      // per-light direction; PS uses only xyz, in a dot product with the normal
+	float4 vLightColor[2];    // per-light color, scaled by that dot product in PS
+	float4 vOutputColor;      // returned as-is by PSSolid
+}
+
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 Pos : POSITION;    // vertex position before any transform
+    float3 Norm : NORMAL;     // vertex normal before the World transform
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;    // position after the World, View and Projection multiplies
+    float3 Norm : TEXCOORD0;     // normal after the World multiply, passed in a texcoord slot
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: transforms the position to clip space and the normal by World (no translation).
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;    // zero-initialize every member
+    output.Pos = mul( input.Pos, World );
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Norm = mul( float4( input.Norm, 0 ), World ).xyz;    // w = 0: a normal is a direction, so World's translation row must not be added
+    
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: sums the saturated N.L contribution of both lights; alpha is forced to 1.
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    float4 finalColor = 0;
+    
+    //do NdotL lighting for 2 lights
+    for(int i=0; i<2; i++)
+    {
+        finalColor += saturate( dot( (float3)vLightDir[i],input.Norm) * vLightColor[i] );    // input.Norm is used as interpolated, without renormalizing
+    }
+    finalColor.a = 1;
+    return finalColor;
+}
+
+
+//--------------------------------------------------------------------------------------
+// PSSolid - render a solid color (vOutputColor from the constant buffer; input is not read)
+//--------------------------------------------------------------------------------------
+float4 PSSolid( PS_INPUT input) : SV_Target
+{
+    return vOutputColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl
new file mode 100644
index 000000000..31ed082e7
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial06.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl
new file mode 100644
index 000000000..a5512efb6
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial06/Tutorial06_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial06.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx
new file mode 100644
index 000000000..0baad7a0c
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07.fx
@@ -0,0 +1,67 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial07.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Texture, Sampler and Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 );       // texture sampled in PS
+SamplerState samLinear : register( s0 );    // sampler used for that lookup
+
+cbuffer cbNeverChanges : register( b0 )
+{
+    matrix View;    // second multiply applied to the vertex position in VS
+};
+
+cbuffer cbChangeOnResize : register( b1 )
+{
+    matrix Projection;    // third multiply applied to the vertex position in VS
+};
+
+cbuffer cbChangesEveryFrame : register( b2 )
+{
+    matrix World;         // first multiply applied to the vertex position in VS
+    float4 vMeshColor;    // multiplied with the texture sample in PS
+};
+
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 Pos : POSITION;     // vertex position before any transform
+    float2 Tex : TEXCOORD0;    // texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;    // position after the World, View and Projection multiplies
+    float2 Tex : TEXCOORD0;      // texture coordinate, copied unchanged by VS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: transforms the position by World, View, Projection and forwards the texcoord.
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;    // zero-initialize every member
+    output.Pos = mul( input.Pos, World );
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Tex = input.Tex;
+    
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: samples txDiffuse at the interpolated texcoord and multiplies by vMeshColor.
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    return txDiffuse.Sample( samLinear, input.Tex ) * vMeshColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl
new file mode 100644
index 000000000..c3c101943
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_PS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PS
+#include "Tutorial07.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl
new file mode 100644
index 000000000..4c287c790
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11Tutorials/Tutorial07/Tutorial07_VS.hlsl
@@ -0,0 +1,3 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS
+#include "Tutorial07.fx"
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx
new file mode 100644
index 000000000..6ff313b97
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial08/Tutorial08.fx
@@ -0,0 +1,56 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial08.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Texture, Sampler and Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 );       // texture sampled in PS
+SamplerState samLinear : register( s0 );    // sampler used for that lookup
+
+cbuffer cbChangesEveryFrame : register( b0 )
+{
+    matrix WorldViewProj;    // single matrix applied to the vertex position in VS
+    matrix World;            // not referenced by VS or PS in this file
+    float4 vMeshColor;       // multiplied with the texture sample in PS
+};
+
+
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 Pos : POSITION;    // vertex position before the transform
+    float2 Tex : TEXCOORD;    // texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;    // position after the WorldViewProj multiply
+    float2 Tex : TEXCOORD0;      // texture coordinate, copied unchanged by VS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: transforms the position by WorldViewProj and forwards the texcoord.
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;    // zero-initialize every member
+    output.Pos = mul( input.Pos, WorldViewProj );
+    output.Tex = input.Tex;
+    
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: samples txDiffuse at the interpolated texcoord and multiplies by vMeshColor.
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    return txDiffuse.Sample( samLinear, input.Tex ) * vMeshColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx
new file mode 100644
index 000000000..04a395588
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial09/Tutorial09.fx
@@ -0,0 +1,69 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial09.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Texture, Sampler and Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 );       // texture sampled in PS
+SamplerState samLinear : register( s0 );    // sampler used for that lookup
+
+cbuffer cbNeverChanges : register( b0 )
+{
+    float3 vLightDir;    // dotted with the world-space normal in VS to get the lighting term
+};
+
+cbuffer cbChangesEveryFrame : register( b1 )
+{
+    matrix WorldViewProj;    // applied to the vertex position in VS
+    matrix World;            // VS uses its float3x3 part to transform the normal
+};
+
+struct VS_INPUT
+{
+    float3 Pos          : POSITION;         //position
+    float3 Norm         : NORMAL;           //normal
+    float2 Tex          : TEXCOORD0;        //texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;    // position after the WorldViewProj multiply
+    float4 Diffuse : COLOR0;     // per-vertex lighting term in rgb, 1 in alpha
+    float2 Tex : TEXCOORD1;      // texture coordinate, copied unchanged by VS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: transforms the position and computes a per-vertex N.L lighting term.
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;    // zero-initialize every member
+    output.Pos = mul( float4(input.Pos,1), WorldViewProj );
+    float3 vNormalWorldSpace = normalize( mul( input.Norm, (float3x3)World ) );    // 3x3 part only, so no translation is applied
+
+    float fLighting = saturate( dot( vNormalWorldSpace, vLightDir ) );
+    output.Diffuse.rgb = fLighting;    // scalar replicated to r, g and b
+    output.Diffuse.a = 1.0f; 
+
+    output.Tex = input.Tex;
+    
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: texture sample times the interpolated vertex lighting; alpha forced to 1.
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    //calculate lighting assuming light color is <1,1,1,1>
+    float4 outputColor = txDiffuse.Sample( samLinear, input.Tex ) * input.Diffuse;
+    outputColor.a = 1;
+    return outputColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx
new file mode 100644
index 000000000..e9bded408
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsDXUT/Tutorial10/Tutorial10.fx
@@ -0,0 +1,73 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VS -profile ps_4_0 -entry PS
+//--------------------------------------------------------------------------------------
+// File: Tutorial10.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Texture, Sampler and Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D txDiffuse : register( t0 );       // texture sampled in PS
+SamplerState samLinear : register( s0 );    // sampler used for that lookup
+
+cbuffer cbNeverChanges : register( b0 )
+{
+    float3 vLightDir;    // dotted with the world-space normal in VS to get the lighting term
+};
+
+cbuffer cbChangesEveryFrame : register( b1 )
+{
+    matrix WorldViewProj;    // applied to the (displaced) vertex position in VS
+    matrix World;            // VS uses its float3x3 part to transform the normal
+    float Puffiness;         // distance each vertex is moved along its normal in VS
+};
+
+struct VS_INPUT
+{
+    float3 Pos          : POSITION;         //position
+    float3 Norm         : NORMAL;           //normal
+    float2 Tex          : TEXCOORD0;        //texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;    // position after the WorldViewProj multiply
+    float4 Diffuse : COLOR0;     // per-vertex lighting term in rgb, 1 in alpha
+    float2 Tex : TEXCOORD1;      // texture coordinate, copied unchanged by VS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: displaces the vertex along its normal, transforms it, and computes N.L lighting.
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;    // zero-initialize every member
+
+    input.Pos += input.Norm * Puffiness;    // displacement happens before the WorldViewProj transform
+
+    output.Pos = mul( float4(input.Pos,1), WorldViewProj );
+    float3 vNormalWorldSpace = normalize( mul( input.Norm, (float3x3)World ) );    // 3x3 part only, so no translation is applied
+
+    float fLighting = saturate( dot( vNormalWorldSpace, vLightDir ) );
+    output.Diffuse.rgb = fLighting;    // scalar replicated to r, g and b
+    output.Diffuse.a = 1.0f; 
+
+    output.Tex = input.Tex;
+    
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: texture sample times the interpolated vertex lighting; alpha forced to 1.
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    //calculate lighting assuming light color is <1,1,1,1>
+    float4 outputColor = txDiffuse.Sample( samLinear, input.Tex ) * input.Diffuse;
+    outputColor.a = 1;
+    return outputColor;
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx
new file mode 100644
index 000000000..a647a9079
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial11/Tutorial11.fx
@@ -0,0 +1,117 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Tutorial11.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Texture, Sampler State and Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse;    // texture sampled in PS
+SamplerState samLinear
+{
+    Filter = MIN_MAG_MIP_LINEAR;    // linear filtering for minification, magnification and mip selection
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+cbuffer cbConstant
+{
+    float3 vLightDir = float3(-0.577,0.577,-0.577);    // default value; each component has magnitude ~1/sqrt(3), so length is ~1
+};
+
+cbuffer cbChangesEveryFrame
+{
+    matrix World;         // first multiply applied to the vertex position; also applied to the normal
+    matrix View;          // applied after the wave offset
+    matrix Projection;    // last multiply
+    float Time;           // added to the sine argument of the wave offset in VS
+};
+
+cbuffer cbUserChanges
+{
+    float Waviness;    // amplitude multiplier of the sine offset applied to x in VS
+};
+
+struct VS_INPUT
+{
+    float3 Pos          : POSITION;        // vertex position before any transform
+    float3 Norm         : NORMAL;          // vertex normal before the World multiply
+    float2 Tex          : TEXCOORD0;       // texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;    // position after World, wave offset, View and Projection
+    float3 Norm : TEXCOORD0;     // normal after the World multiply
+    float2 Tex : TEXCOORD1;      // texture coordinate, copied unchanged by VS
+};
+
+//--------------------------------------------------------------------------------------
+// Depth-stencil and blend states set by the Render technique
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth
+{
+    DepthEnable = TRUE;
+    DepthWriteMask = ALL;
+    DepthFunc = LESS_EQUAL;
+};
+
+BlendState NoBlending
+{
+    AlphaToCoverageEnable = FALSE;
+    BlendEnable[0] = FALSE;    // blending disabled for render target 0
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader: World transform, sine offset on x, then View and Projection.
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;    // zero-initialize every member
+    
+    output.Pos = mul( float4(input.Pos,1), World );
+    
+    output.Pos.x += sin( output.Pos.y*0.1f + Time )*Waviness;    // offset depends on the post-World y and on Time
+    
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Norm = mul( input.Norm, World );    // NOTE(review): float3 times a 4x4 matrix relies on implicit conversion - confirm intended
+    output.Tex = input.Tex;
+    
+    return output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader: per-pixel N.L term times the texture sample; alpha forced to 1.
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    // Calculate lighting assuming light color is <1,1,1,1>
+    float fLighting = saturate( dot( input.Norm, vLightDir ) );    // input.Norm is used as interpolated, without renormalizing
+    float4 outputColor = g_txDiffuse.Sample( samLinear, input.Tex ) * fLighting;
+    outputColor.a = 1;
+    return outputColor;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Technique: one pass that compiles VS and PS for the 4_0 profiles and sets the states above
+//--------------------------------------------------------------------------------------
+technique11 Render
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VS() ) );
+        SetGeometryShader( NULL );    // no geometry shader stage
+        SetPixelShader( CompileShader( ps_4_0, PS() ) );        
+
+        SetDepthStencilState( EnableDepth, 0 );
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx
new file mode 100644
index 000000000..aae7f9a87
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial12/Tutorial12.fx
@@ -0,0 +1,129 @@
+//TEST_IGNORE_FILE:
+//
+// Constant Buffer Variables
+//
+
+Texture2D g_txDiffuse;
+SamplerState samLinear
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+TextureCube g_txEnvMap;
+SamplerState samLinearClamp
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Clamp;
+    AddressV = Clamp;
+};
+
+cbuffer cbConstant
+{
+    float3 vLightDir = float3(-0.577,0.577,-0.577);
+};
+
+cbuffer cbChangesEveryFrame
+{
+    matrix World;
+    matrix View;
+    matrix Projection;
+    float Time;
+};
+
+cbuffer cbUserChanges
+{
+    float Waviness;
+};
+
+struct VS_INPUT
+{
+    float3 Pos          : POSITION;         //position
+    float3 Norm         : NORMAL;           //normal
+    float2 Tex          : TEXCOORD0;        //texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float3 Norm : TEXCOORD0;
+    float2 Tex : TEXCOORD1;
+    float3 ViewR : TEXCOORD2;
+};
+
+//--------------------------------------------------------------------------------------
+// DepthStates
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth
+{
+    DepthEnable = TRUE;
+    DepthWriteMask = ALL;
+    DepthFunc = LESS_EQUAL;
+};
+
+BlendState NoBlending
+{
+    AlphaToCoverageEnable = FALSE;
+    BlendEnable[0] = FALSE;
+};
+
+//
+// Vertex Shader
+//
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+    // Object space -> world space.
+    output.Pos = mul( float4(input.Pos,1), World );
+    // Displace world-space x by a sine of world-space y and Time, scaled by Waviness.
+    output.Pos.x += sin( output.Pos.y*0.1f + Time )*Waviness;
+    // World space -> view space -> projected position.
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    output.Norm = mul( input.Norm, (float3x3)World );
+    output.Tex = input.Tex;
+    // Reflection vector used by PS for the environment-map lookup: the view-space normal is
+    // passed as reflect()'s first (incident) argument and (0,0,-1) as its second (normal) argument.
+    float3 viewNorm = mul( output.Norm, (float3x3)View );
+    output.ViewR = reflect( viewNorm, float3(0,0,-1.0) );
+    
+    return output;
+}
+
+
+//
+// Pixel Shader
+//
+float4 PS( PS_INPUT input) : SV_Target
+{
+    // Diffuse term: N.L clamped to [0,1]; the light color is taken to be <1,1,1,1>.
+    const float nDotL = saturate( dot( input.Norm, vLightDir ) );
+
+    // Surface color from the diffuse texture.
+    const float4 albedo = g_txDiffuse.Sample( samLinear, input.Tex );
+
+    // Environment color fetched along the reflection vector computed in the vertex shader.
+    const float4 envColor = g_txEnvMap.Sample( samLinearClamp, input.ViewR );
+
+    // Lit diffuse plus reflection; alpha is forced to 1.
+    const float3 litColor = albedo.rgb * nDotL + envColor.rgb;
+
+    return float4( litColor, 1 );
+}
+
+//
+// Technique
+//
+technique11 Render
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, PS() ) );
+        
+        SetDepthStencilState( EnableDepth, 0 );
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx
new file mode 100644
index 000000000..a6f09ecc7
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial13/Tutorial13.fx
@@ -0,0 +1,191 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Tutorial13.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse;
+SamplerState samLinear
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+TextureCube g_txEnvMap;
+SamplerState samLinearClamp
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Clamp;
+    AddressV = Clamp;
+};
+
+cbuffer cbConstant
+{
+    float3 vLightDir = float3(-0.577,0.577,-0.577);
+};
+
+cbuffer cbChangesEveryFrame
+{
+    matrix World;
+    matrix View;
+    matrix Projection;
+    float Time;
+};
+
+cbuffer cbUserChanges
+{
+    float Explode;
+};
+
+struct VS_INPUT
+{
+    float3 Pos          : POSITION;         
+    float3 Norm         : NORMAL;           
+    float2 Tex          : TEXCOORD0;        
+};
+
+struct GSPS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float3 Norm : TEXCOORD0;
+    float2 Tex : TEXCOORD1;
+};
+
+//--------------------------------------------------------------------------------------
+// DepthStates
+//--------------------------------------------------------------------------------------
+DepthStencilState EnableDepth
+{
+    DepthEnable = TRUE;
+    DepthWriteMask = ALL;
+    DepthFunc = LESS_EQUAL;
+};
+
+BlendState NoBlending
+{
+    AlphaToCoverageEnable = FALSE;
+    BlendEnable[0] = FALSE;
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+GSPS_INPUT VS( VS_INPUT input )
+{
+    // Only the World transform is applied here; GS applies View and Projection.
+    GSPS_INPUT result = (GSPS_INPUT)0;
+    result.Tex  = input.Tex;
+    result.Norm = mul( input.Norm, (float3x3)World );
+    result.Pos  = mul( float4( input.Pos, 1 ), World );
+    
+    return result;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Geometry Shader
+//--------------------------------------------------------------------------------------
+[maxvertexcount(12)]
+void GS( triangle GSPS_INPUT input[3], inout TriangleStream<GSPS_INPUT> TriStream )
+{
+    GSPS_INPUT output;
+    
+    //
+    // Calculate the face normal from two triangle edges; .xyz is taken explicitly
+    // instead of relying on an implicit float4 -> float3 truncation
+    float3 faceEdgeA = input[1].Pos.xyz - input[0].Pos.xyz;
+    float3 faceEdgeB = input[2].Pos.xyz - input[0].Pos.xyz;
+    float3 faceNormal = normalize( cross(faceEdgeA, faceEdgeB) );
+    float3 ExplodeAmt = faceNormal*Explode;
+    
+    //
+    // Calculate the face center; it is offset by ExplodeAmt here and once more
+    // when it is emitted below, so it ends up further out than the corners
+    float3 centerPos = (input[0].Pos.xyz + input[1].Pos.xyz + input[2].Pos.xyz)/3.0;
+    float2 centerTex = (input[0].Tex + input[1].Tex + input[2].Tex)/3.0;
+    centerPos += ExplodeAmt;
+    
+    //
+    // Output the pyramid: one triangle per edge (corner, next corner, center),
+    // each as its own strip; View and Projection are applied to every vertex here
+    for( int i=0; i<3; i++ )
+    {
+        output.Pos = input[i].Pos + float4(ExplodeAmt,0);
+        output.Pos = mul( output.Pos, View );
+        output.Pos = mul( output.Pos, Projection );
+        output.Norm = input[i].Norm;
+        output.Tex = input[i].Tex;
+        TriStream.Append( output );
+        
+        int iNext = (i+1)%3;
+        output.Pos = input[iNext].Pos + float4(ExplodeAmt,0);
+        output.Pos = mul( output.Pos, View );
+        output.Pos = mul( output.Pos, Projection );
+        output.Norm = input[iNext].Norm;
+        output.Tex = input[iNext].Tex;
+        TriStream.Append( output );
+        
+        output.Pos = float4(centerPos,1) + float4(ExplodeAmt,0);
+        output.Pos = mul( output.Pos, View );
+        output.Pos = mul( output.Pos, Projection );
+        output.Norm = faceNormal;
+        output.Tex = centerTex;
+        TriStream.Append( output );
+        
+        TriStream.RestartStrip();
+    }
+    // Emit the input triangle in reverse vertex order with negated normals (index named k so 'i' is not redeclared)
+    for( int k=2; k>=0; k-- )
+    {
+        output.Pos = input[k].Pos + float4(ExplodeAmt,0);
+        output.Pos = mul( output.Pos, View );
+        output.Pos = mul( output.Pos, Projection );
+        output.Norm = -input[k].Norm;
+        output.Tex = input[k].Tex;
+        TriStream.Append( output );
+    }
+    TriStream.RestartStrip();
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PS( GSPS_INPUT input) : SV_Target
+{
+    // N.L clamped to [0,1]; the light color is taken to be <1,1,1,1>.
+    const float nDotL = saturate( dot( input.Norm, vLightDir ) );
+
+    // Diffuse texture color for this pixel.
+    const float4 texel = g_txDiffuse.Sample( samLinear, input.Tex );
+
+    // Scale the color by the lighting term and
+    // write an alpha of 1.
+    return float4( texel.rgb * nDotL, 1 );
+}
+
+
+//--------------------------------------------------------------------------------------
+// Technique
+//--------------------------------------------------------------------------------------
+technique11 Render
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VS() ) );
+        SetGeometryShader( CompileShader( gs_4_0, GS() ) );
+        SetPixelShader( CompileShader( ps_4_0, PS() ) );
+        
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+        SetDepthStencilState( EnableDepth, 0 );
+    }
+}
+
+
diff --git a/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx
new file mode 100644
index 000000000..b1e45b842
--- /dev/null
+++ b/tests/hlsl/dxsdk/Direct3D11TutorialsFX11/Tutorial14/Tutorial14.fx
@@ -0,0 +1,294 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Tutorial14.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffer Variables
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse;
+SamplerState samLinear
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+cbuffer cbConstant
+{
+    float3 vLightDir = float3(-0.577,0.577,-0.577);
+};
+
+cbuffer cbChangesEveryFrame
+{
+    matrix World;
+    matrix View;
+    matrix Projection;
+};
+
+struct VS_INPUT
+{
+    float3 Pos          : POSITION;         //position
+    float3 Norm         : NORMAL;           //normal
+    float2 Tex          : TEXCOORD0;        //texture coordinate
+};
+
+struct PS_INPUT
+{
+    float4 Pos : SV_POSITION;
+    float3 Norm : TEXCOORD0;
+    float2 Tex : TEXCOORD1;
+};
+
+struct QUADVS_INPUT
+{
+    float4 Pos : POSITION;
+    float2 Tex : TEXCOORD0;
+};
+
+struct QUADVS_OUTPUT
+{
+    float4 Pos : SV_POSITION;              // Transformed position
+    float2 Tex : TEXCOORD0;
+};
+
+//--------------------------------------------------------------------------------------
+// Blending States
+//--------------------------------------------------------------------------------------
+BlendState NoBlending
+{
+    BlendEnable[0] = FALSE;
+};
+
+BlendState SrcAlphaBlendingAdd
+{
+    BlendEnable[0] = TRUE;
+    SrcBlend = SRC_ALPHA;
+    DestBlend = ONE;
+    BlendOp = ADD;
+    SrcBlendAlpha = ZERO;
+    DestBlendAlpha = ZERO;
+    BlendOpAlpha = ADD;
+    RenderTargetWriteMask[0] = 0x0F;
+};
+
+BlendState SrcAlphaBlendingSub
+{
+    BlendEnable[0] = TRUE;
+    SrcBlend = SRC_ALPHA;
+    DestBlend = ONE;
+    BlendOp = SUBTRACT;
+    SrcBlendAlpha = ZERO;
+    DestBlendAlpha = ZERO;
+    BlendOpAlpha = ADD;
+    RenderTargetWriteMask[0] = 0x0F;
+};
+
+BlendState SrcColorBlendingAdd
+{
+    BlendEnable[0] = TRUE;
+    SrcBlend = SRC_COLOR;
+    DestBlend = ONE;
+    BlendOp = ADD;
+    SrcBlendAlpha = ZERO;
+    DestBlendAlpha = ZERO;
+    BlendOpAlpha = ADD;
+    RenderTargetWriteMask[0] = 0x0F;
+};
+
+BlendState SrcColorBlendingSub
+{
+    BlendEnable[0] = TRUE;
+    SrcBlend = SRC_COLOR;
+    DestBlend = ONE;
+    BlendOp = SUBTRACT;
+    SrcBlendAlpha = ZERO;
+    DestBlendAlpha = ZERO;
+    BlendOpAlpha = ADD;
+    RenderTargetWriteMask[0] = 0x0F;
+};
+
+//--------------------------------------------------------------------------------------
+// Depth/Stencil States
+//--------------------------------------------------------------------------------------
+DepthStencilState RenderWithStencilState
+{
+    DepthEnable = false;
+    DepthWriteMask = ZERO;
+    DepthFunc = Less;
+    
+    // Setup stencil states
+    StencilEnable = true;
+    StencilReadMask = 0xFF;
+    StencilWriteMask = 0x00;
+    
+    FrontFaceStencilFunc = Not_Equal;
+    FrontFaceStencilPass = Keep;
+    FrontFaceStencilFail = Zero;
+    
+    BackFaceStencilFunc = Not_Equal;
+    BackFaceStencilPass = Keep;
+    BackFaceStencilFail = Zero;
+};
+
+
+
+//--------------------------------------------------------------------------------------
+// Scene Vertex Shader
+//--------------------------------------------------------------------------------------
+PS_INPUT VS( VS_INPUT input )
+{
+    PS_INPUT output = (PS_INPUT)0;
+    // Object space -> world -> view -> projected position.
+    output.Pos = mul( float4(input.Pos,1), World );
+    output.Pos = mul( output.Pos, View );
+    output.Pos = mul( output.Pos, Projection );
+    // Transform the normal with the upper 3x3 of World; the cast is explicit instead of an implicit truncation.
+    output.Norm = mul( input.Norm, (float3x3)World );
+    output.Tex = input.Tex;
+    return output;
+}
+
+//-----------------------------------------------------------------------------
+// Quad Vertex Shaders
+//-----------------------------------------------------------------------------
+QUADVS_OUTPUT QuadVS( QUADVS_INPUT Input )
+{
+    // World -> View -> Projection on the position; the texture coordinate is passed through.
+    QUADVS_OUTPUT Output;
+    float4 viewPos = mul( mul( Input.Pos, World ), View );
+    Output.Pos = mul( viewPos, Projection );
+    Output.Tex = Input.Tex;
+    return Output;
+}
+
+QUADVS_OUTPUT ScreenQuadVS( QUADVS_INPUT Input )
+{
+    QUADVS_OUTPUT Output;
+    Output.Pos = Input.Pos;    // position is forwarded as-is (no World/View/Projection applied)
+    Output.Tex = Input.Tex;    // texture coordinate is forwarded unchanged
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PS( PS_INPUT input) : SV_Target
+{
+    // Lighting term: clamped dot of the normal and the light direction (light color assumed <1,1,1,1>).
+    const float lightAmount = saturate( dot( input.Norm, vLightDir ) );
+    const float4 diffuseTexel = g_txDiffuse.Sample( samLinear, input.Tex );
+    // Modulate the texel's color channels by the lighting term; alpha is set to 1.
+    return float4( diffuseTexel.rgb * lightAmount, 1 );
+}
+
+//--------------------------------------------------------------------------------------
+// Quad Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 QuadPS( QUADVS_OUTPUT input) : SV_Target
+{
+    return g_txDiffuse.Sample( samLinear, input.Tex );    // unlit: the sampled texel is returned unmodified, alpha included
+}
+
+
+//--------------------------------------------------------------------------------------
+// Scene Techniques
+//--------------------------------------------------------------------------------------
+technique11 RenderScene
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, PS() ) );        
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+// RenderWithStencil - set the depth stencil state inside of the technique
+//--------------------------------------------------------------------------------------
+technique11 RenderWithStencil
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, ScreenQuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );     
+           
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+        SetDepthStencilState( RenderWithStencilState, 0 );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+// Quad Techniques:  Alpha blending state is set inside the technique
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSolid
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );     
+           
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcAlphaAdd
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );   
+             
+        SetBlendState( SrcAlphaBlendingAdd, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcAlphaSub
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );   
+             
+        SetBlendState( SrcAlphaBlendingSub, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcColorAdd
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );   
+             
+        SetBlendState( SrcColorBlendingAdd, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+//--------------------------------------------------------------------------------------
+technique11 RenderQuadSrcColorSub
+{
+    pass P0
+    {
+        SetVertexShader( CompileShader( vs_4_0, QuadVS() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, QuadPS() ) );   
+             
+        SetBlendState( SrcColorBlendingSub, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+    }
+}
+
+
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h
new file mode 100644
index 000000000..b44251829
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_LightPSH.h
@@ -0,0 +1,84 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_LightPSH.h
+//
+// The pixel shader light header file for the DynamicShaderLinkage11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseLight
+{
+   float3 IlluminateAmbient(float3 vNormal);
+   
+   float3 IlluminateDiffuse(float3 vNormal);
+
+   float3 IlluminateSpecular(float3 vNormal, int specularPower );
+   
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cAmbientLight : iBaseLight
+{
+   float3	m_vLightColor;     
+   bool     m_bEnable;
+   
+   float3 IlluminateAmbient(float3 vNormal);
+      
+   float3 IlluminateDiffuse(float3 vNormal)
+   { 
+      return (float3)0;
+   }
+
+   float3 IlluminateSpecular(float3 vNormal, int specularPower )
+   { 
+      return (float3)0;
+   }
+};
+
+class cHemiAmbientLight : cAmbientLight
+{
+   // inherited float3 m_vLightColor is the SkyColor
+   float4   m_vGroundColor;   // blended toward m_vLightColor in IlluminateAmbient
+   float4   m_vDirUp;         // dotted with the surface normal in IlluminateAmbient
+
+   float3 IlluminateAmbient(float3 vNormal);
+   
+};
+
+class cDirectionalLight : cAmbientLight
+{
+   // inherited float3 m_vLightColor is the LightColor
+   float4 m_vLightDir;   // dotted with the normal (diffuse) and added into the half vector (specular)
+   
+   float3 IlluminateDiffuse( float3 vNormal );
+
+   float3 IlluminateSpecular( float3 vNormal, int specularPower );
+
+};
+
+class cOmniLight : cAmbientLight
+{
+   float3	m_vLightPosition;
+   float    radius;   
+   
+   float3 IlluminateDiffuse( float3 vNormal );
+  
+};
+
+class cSpotLight : cAmbientLight
+{
+   float3	m_vLightPosition;
+   float3	m_vLightDir;
+};
+
+class cEnvironmentLight : cAmbientLight
+{
+   float3  IlluminateSpecular( float3 vNormal, int specularPower );  
+};
+
+
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h
new file mode 100644
index 000000000..7f6bc3d22
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_MaterialPSH.h
@@ -0,0 +1,103 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_MaterialPSH.h
+//
+// The pixel shader material header file for the DynamicShaderLinkage11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseMaterial
+{
+   float3 GetAmbientColor(float2 vTexcoord);
+   
+   float3 GetDiffuseColor(float2 vTexcoord);
+
+   int GetSpecularPower();
+
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cBaseMaterial : iBaseMaterial
+{
+   float3	m_vColor;     
+   int      m_iSpecPower;
+   
+   float3 GetAmbientColor(float2 vTexcoord)
+   { 
+      return m_vColor;
+   }
+      
+   float3 GetDiffuseColor(float2 vTexcoord)
+   { 
+      return (float3)m_vColor;
+   }
+
+   int GetSpecularPower()
+   { 
+      return m_iSpecPower;
+   }
+   
+};
+
+class cPlasticMaterial : cBaseMaterial
+{  
+
+};
+
+class cPlasticTexturedMaterial : cPlasticMaterial
+{  
+   float3 GetAmbientColor(float2 vTexcoord);
+
+   float3 GetDiffuseColor(float2 vTexcoord);
+
+};
+
+class cPlasticLightingOnlyMaterial : cBaseMaterial
+{  
+   float3 GetAmbientColor(float2 vTexcoord)
+   { 
+      return (float3)1.0f;
+   }
+      
+   float3 GetDiffuseColor(float2 vTexcoord)
+   { 
+      return (float3)1.0f;
+   }
+
+};
+
+class cRoughMaterial : cBaseMaterial
+{
+   int GetSpecularPower()   // NOTE(review): body matches the inherited cBaseMaterial::GetSpecularPower
+   { 
+      return m_iSpecPower;
+   }
+};
+
+class cRoughTexturedMaterial : cRoughMaterial
+{  
+   float3 GetAmbientColor(float2 vTexcoord);
+
+   float3 GetDiffuseColor(float2 vTexcoord);
+
+};
+
+
+class cRoughLightingOnlyMaterial : cRoughMaterial
+{
+   float3 GetAmbientColor(float2 vTexcoord)
+   { 
+      return (float3)1.0f;
+   }
+      
+   float3 GetDiffuseColor(float2 vTexcoord)
+   { 
+      return (float3)1.0f;
+   }
+
+};
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl
new file mode 100644
index 000000000..c3ee93057
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PS.hlsl
@@ -0,0 +1,84 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_PS.hlsl
+//
+// The pixel shader file for the DynamicShaderLinkage11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Header Includes
+//--------------------------------------------------------------------------------------
+#include "DynamicShaderLinkage11_PSBuffers.h"
+
+// Defines for the default statically permuted settings
+#if defined( STATIC_PERMUTE ) 
+   #define HEMI_AMBIENT //CONST_AMBIENT //HEMI_AMBIENT
+   #define TEXTURE_ENABLE
+   #define SPECULAR_ENABLE
+#endif
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct PS_INPUT
+{
+	float4 vPosition	: SV_POSITION;
+	float3 vNormal		: NORMAL;
+	float2 vTexcoord	: TEXCOORD0;
+	float4 vMatrix	: TEXCOORD1;	
+};
+
+//--------------------------------------------------------------------------------------
+// Abstract Interface Instances for dynamic linkage / permutation
+//--------------------------------------------------------------------------------------
+#if !defined( STATIC_PERMUTE ) 
+    iBaseLight     g_abstractAmbientLighting;
+    iBaseLight     g_abstractDirectLighting;
+    iBaseLight     g_abstractEnvironmentLighting;
+    iBaseMaterial  g_abstractMaterial;
+#else
+//--------------------------------------------------------------------------------------
+// Concrete Instances for STATIC_PERMUTE - static permutation
+//--------------------------------------------------------------------------------------
+    #if defined( HEMI_AMBIENT ) 
+        #define g_abstractAmbientLighting g_hemiAmbientLight
+    #else  
+        // CONST_AMBIENT
+        #define g_abstractAmbientLighting g_ambientLight
+    #endif
+    #define g_abstractDirectLighting g_directionalLight
+    #define g_abstractEnvironmentLighting g_environmentLight
+    #if defined( TEXTURE_ENABLE )
+        #define g_abstractMaterial g_plasticTexturedMaterial
+    #else    
+        #define g_abstractMaterial g_plasticMaterial
+    #endif
+#endif
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PSMain( PS_INPUT Input ) : SV_TARGET
+{
+   // Ambient: the material's ambient color modulated by the ambient light.
+   const float3 vMatAmbient = g_abstractMaterial.GetAmbientColor( Input.vTexcoord );
+   const float3 Ambient = vMatAmbient * g_abstractAmbientLighting.IlluminateAmbient( Input.vNormal );
+
+   // Diffuse: the material's diffuse color modulated by the direct light.
+   const float3 vMatDiffuse = g_abstractMaterial.GetDiffuseColor( Input.vTexcoord );
+   float3 Diffuse = (float3)0.0f;
+   Diffuse += vMatDiffuse * g_abstractDirectLighting.IlluminateDiffuse( Input.vNormal );
+
+   // Specular: direct-light plus environment-light terms, both driven by the material's specular power.
+   float3 Specular = (float3)0.0f;
+   Specular += g_abstractDirectLighting.IlluminateSpecular( Input.vNormal, g_abstractMaterial.GetSpecularPower() );
+   Specular += g_abstractEnvironmentLighting.IlluminateSpecular( Input.vNormal, g_abstractMaterial.GetSpecularPower() );
+
+   // Sum the three terms and clamp to [0,1].
+   const float3 Lighting = saturate( Ambient + Diffuse + Specular );
+
+   return float4( Lighting, 1.0f );
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h
new file mode 100644
index 000000000..e2263b832
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_PSBuffers.h
@@ -0,0 +1,129 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_PSBuffers.h
+//
+// The pixel shader constant buffers, resources and class method definitions for the DynamicShaderLinkage11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "DynamicShaderLinkage11_LightPSH.h"
+#include "DynamicShaderLinkage11_MaterialPSH.h"
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerFrame : register( b0 )
+{
+   cAmbientLight     g_ambientLight;
+   cHemiAmbientLight g_hemiAmbientLight;
+   cDirectionalLight g_directionalLight;
+   cEnvironmentLight g_environmentLight;
+   float4            g_vEyeDir;   
+};
+
+cbuffer cbPerPrimitive : register( b1 )
+{
+   cPlasticMaterial              g_plasticMaterial;
+   cPlasticTexturedMaterial      g_plasticTexturedMaterial;
+   cPlasticLightingOnlyMaterial  g_plasticLightingOnlyMaterial;
+   cRoughMaterial                g_roughMaterial;
+   cRoughTexturedMaterial        g_roughTexturedMaterial;
+   cRoughLightingOnlyMaterial    g_roughLightingOnlyMaterial;
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D	   g_txDiffuse : register( t0 );
+Texture2D	   g_txNormalMap : register( t1 );
+TextureCube	   g_txEnvironmentMap : register( t2 );
+
+SamplerState   g_samLinear : register( s0 );
+
+//--------------------------------------------------------------------------------------
+// Lighting Class Methods
+//--------------------------------------------------------------------------------------
+// Ambient Lighting Class Methods
+float3 cAmbientLight::IlluminateAmbient(float3 vNormal)
+{ 
+   return m_vLightColor * m_bEnable;   // light color gated by the enable flag; vNormal is unused; no float4 temporary to truncate
+}
+
+float3 cHemiAmbientLight::IlluminateAmbient(float3 vNormal)
+{ 
+   // Blend from ground color to sky color by (dot(normal, up) + 1) / 2; float4 members are narrowed explicitly.
+   float theta = (dot( vNormal, m_vDirUp.xyz ) + 1.0f) / 2.0f;
+   return  lerp( m_vGroundColor.rgb, m_vLightColor, theta) * m_bEnable;
+}
+
+// Directional Light class
+float3 cDirectionalLight::IlluminateDiffuse( float3 vNormal ) 
+{
+   float lambert = saturate(dot( vNormal, m_vLightDir.xyz ));   // N.L clamped to [0,1]; explicit .xyz of the float4 direction
+   return ((float3)lambert * m_vLightColor * m_bEnable);
+}
+
+float3 cDirectionalLight::IlluminateSpecular( float3 vNormal, int specularPower ) 
+{
+   // Half-vector specular; the float4 sum is formed exactly as before, only its narrowing to float3 is now explicit.
+   float3 H = ( -normalize(g_vEyeDir) + m_vLightDir ).xyz;
+   float3 halfAngle = normalize( H );
+   float specular = pow( max(0,dot( halfAngle, normalize(vNormal) )), specularPower );
+   return ((float3)specular * m_vLightColor * m_bEnable);
+}
+
+// Omni Light Class
+float3 cOmniLight::IlluminateDiffuse( float3 vNormal ) 
+{
+   return (float3)0.0f; // TODO: omni light diffuse is not implemented; it currently returns zero
+}
+
+// Environment Lighting
+float3 cEnvironmentLight::IlluminateSpecular( float3 vNormal, int specularPower ) 
+{
+   // Compute the reflection vector and a cheap fresnel falloff: (1 - dot(-E, N)) cubed.
+   // specularPower is not used here. Narrowing of float4 values to float3 is written explicitly.
+   float3 N = normalize(vNormal);
+   float3 E = normalize(g_vEyeDir).xyz;
+   float3 R = reflect( E, N );
+   float fresnel = 1 - dot( -E, N );
+   fresnel = (fresnel * fresnel * fresnel );
+
+   // Environment map color along R, scaled by the fresnel term.
+   float3 specular = g_txEnvironmentMap.Sample( g_samLinear, R ).rgb * fresnel;
+
+   return (specular * (float3)m_bEnable);
+}
+
+//--------------------------------------------------------------------------------------
+// Material Class Methods
+//--------------------------------------------------------------------------------------
+// Plastic Material Methods
+float3 cPlasticTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{ 
+   // Material color modulated by the diffuse texture's RGB; the alpha channel is dropped explicitly.
+   float4 vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+   return m_vColor * vDiffuse.rgb;
+}
+   
+float3 cPlasticTexturedMaterial::GetDiffuseColor(float2 vTexcoord)
+{ 
+   // Material color modulated by the diffuse texture's RGB; the alpha channel is dropped explicitly.
+   float4 vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+   return m_vColor * vDiffuse.rgb;
+}
+
+// Rough Material Methods
+float3 cRoughTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{ 
+   // Material color modulated by the diffuse texture's RGB; the alpha channel is dropped explicitly.
+   float4 vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+   return m_vColor * vDiffuse.rgb;
+}
+   
+float3 cRoughTexturedMaterial::GetDiffuseColor(float2 vTexcoord)
+{ 
+   // Material color modulated by the diffuse texture's RGB; the alpha channel is dropped explicitly.
+   float4 vDiffuse = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+   return m_vColor * vDiffuse.rgb;
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl
new file mode 100644
index 000000000..800dbf3b3
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkage11/DynamicShaderLinkage11_VS.hlsl
@@ -0,0 +1,66 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkage11_VS.hlsl
+//
+// The vertex shader file for the DynamicShaderLinkage11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+	float4x4		g_mWorldViewProjection	: packoffset( c0 );	// applied to Input.vPosition in VSMain
+	float4x4		g_mWorld				: packoffset( c4 );	// its upper 3x3 transforms the normal in VSMain
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT	// per-vertex input consumed by VSMain
+{
+	float4 vPosition	: POSITION;
+	float3 vNormal		: NORMAL;	// expanded by R10G10B10A2_UNORM_TO_R32G32B32_FLOAT in VSMain
+	float2 vTexcoord	: TEXCOORD0;	// copied to the output unchanged
+};
+
+struct VS_OUTPUT	// values written by VSMain
+{
+	float4 vPosition	: SV_POSITION;	// Input.vPosition times g_mWorldViewProjection
+	float3 vNormal		: NORMAL;	// expanded normal times the upper 3x3 of g_mWorld
+	float2 vTexcoord0	: TEXCOORD0;
+	float4 vMatrix	    : TEXCOORD1; // DEBUG: receives g_mWorld[0]
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+// Signed vectors were aliased as an unsigned format, so the signed values
+// must be recovered: double the input, then shift anything that reaches 1.0
+// down by 2.0.  The values 1.0 and 2.0 are slightly inaccurate here.
+float3 R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( in float3 vVec )
+{
+    float3 vDoubled = vVec * 2.0f;
+    return vDoubled >= 1.0f ? ( vDoubled - 2.0f ) : vDoubled;
+}
+
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+	VS_OUTPUT Result;
+
+	// Transform the position by the combined world-view-projection matrix.
+	Result.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+
+	// Expand the compressed normal, then transform it by the upper 3x3 of the world matrix.
+	float3 vExpandedNormal = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( Input.vNormal );
+	Result.vNormal = mul( vExpandedNormal, (float3x3)g_mWorld );
+
+	// Texture coordinates pass through unchanged.
+	Result.vTexcoord0 = Input.vTexcoord;
+
+	Result.vMatrix = (float4)g_mWorld[0]; // DEBUG: first row of the world matrix
+	return Result;
+}
+
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx
new file mode 100644
index 000000000..c72b98843
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11.fx
@@ -0,0 +1,192 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11.fx
+//
+// The effect file for the DynamicShaderLinkageFX11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "DynamicShaderLinkageFX11_vs.hlsl" // spelled exactly as the file is named on disk (case matters on some hosts)
+#include "DynamicShaderLinkageFX11_ps.hlsl"
+
+//
+// Settings for static permutations.
+// All of the pre-5.0 targets need static specialization
+// since they don't support late binding.  The below
+// just selects a single specialization but you could
+// create any number of them, each one representing
+// a new shader with the interfaces compiled out
+// due to the compile-time class references.
+//
+
+#define StaticMaterial         g_plasticTexturedMaterial  // concrete instances passed to PSMainUniform
+#define StaticAmbientLight     g_ambientLight             // by the FeatureLevel10 and FeatureLevel10_1
+#define StaticDirectLight      g_directionalLight         // techniques below
+#define StaticEnvironmentLight g_environmentLight
+
+technique11 FeatureLevel10 // vs_4_0 / ps_4_0, interfaces statically specialized via the Static* defines
+{
+    pass
+    {
+        SetRasterizerState(g_rasterizerState[g_fillMode]);
+        SetVertexShader(CompileShader(vs_4_0,
+                                      VSMain()));
+        SetPixelShader(CompileShader(ps_4_0,
+                                     PSMainUniform(StaticAmbientLight,
+                                                   StaticDirectLight,
+                                                   StaticEnvironmentLight,
+                                                   StaticMaterial)));
+    }
+}
+
+technique11 FeatureLevel10_1 // same as FeatureLevel10 but compiled for vs_4_1 / ps_4_1
+{
+    pass
+    {
+        SetRasterizerState(g_rasterizerState[g_fillMode]);
+        SetVertexShader(CompileShader(vs_4_1,
+                                      VSMain()));
+        SetPixelShader(CompileShader(ps_4_1,
+                                     PSMainUniform(StaticAmbientLight,
+                                                   StaticDirectLight,
+                                                   StaticEnvironmentLight,
+                                                   StaticMaterial)));
+    }
+}
+
+//
+// Variables for dynamic shader linkage.
+// There are two variations here for dynamic usage.
+// In the first we use the uniform entry point
+// and pass in global interface variables.  This
+// creates a shader which refers to the global
+// interface variables when running and we can bind
+// concrete instances in our C++ code by using
+// ID3DX11EffectInterfaceVariable::SetClassInstance.
+// This approach works well when you have several
+// independent variations and want to bind them
+// individually in your C++ code, such as the
+// different lighting and material parameters in
+// this sample.
+//
+
+iBaseLight g_abstractAmbientLighting;      // abstract interface variables: passed to PSMainUniform in
+iBaseLight g_abstractDirectLighting;       // FeatureLevel11 and to BindInterfaces in the
+iBaseLight g_abstractEnvironmentLighting;  // FeatureLevel11_g_* techniques
+iBaseMaterial g_abstractMaterial;          // used by the FeatureLevel11 technique only
+    
+technique11 FeatureLevel11 // vs_5_0 / ps_5_0, uniform entry point fed with the abstract interface globals
+{
+    pass
+    {
+        SetRasterizerState(g_rasterizerState[g_fillMode]);
+        SetVertexShader(CompileShader(vs_5_0,
+                                      VSMain()));
+        SetPixelShader(CompileShader(ps_5_0,
+                                     PSMainUniform(g_abstractAmbientLighting,
+                                                   g_abstractDirectLighting,
+                                                   g_abstractEnvironmentLighting,
+                                                   g_abstractMaterial)));
+    }
+}
+
+//
+// In this second variation we use the non-uniform
+// entry point so that we don't have to specify
+// any interfaces when compiling the shader.  We
+// then reuse the compiled shader with different
+// BindInterfaces calls so that all bindings are
+// handled automatically by the effect runtime.
+// Below we have multiple techniques where
+// we've given a concrete binding for the material.
+// Lighting parameters are left as interfaces for
+// binding via effect variables, but could also
+// be specified concretely if the number of variations
+// is manageable.
+// This approach works well for a small number of variations
+// that are known in advance, as you can just list them
+// in your effect and you don't need to do the
+// binding work explicitly in your C++ code.
+//
+
+VertexShader g_NonUniVS = CompileShader(vs_5_0, VSMain());          // compiled once, shared by all FeatureLevel11_g_* techniques
+PixelShader g_NonUniPS = CompileShader(ps_5_0, PSMainNonUniform()); // interfaces are supplied later through BindInterfaces
+
+technique11 FeatureLevel11_g_plasticMaterial // shared shaders; material bound to g_plasticMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_plasticMaterial));
+    }
+}
+
+technique11 FeatureLevel11_g_plasticTexturedMaterial // shared shaders; material bound to g_plasticTexturedMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_plasticTexturedMaterial));
+    }
+}
+
+technique11 FeatureLevel11_g_plasticLightingOnlyMaterial // shared shaders; material bound to g_plasticLightingOnlyMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_plasticLightingOnlyMaterial));
+    }
+}
+
+technique11 FeatureLevel11_g_roughMaterial // shared shaders; material bound to g_roughMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_roughMaterial));
+    }
+}
+
+technique11 FeatureLevel11_g_roughTexturedMaterial // shared shaders; material bound to g_roughTexturedMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_roughTexturedMaterial));
+    }
+}
+
+technique11 FeatureLevel11_g_roughLightingOnlyMaterial // shared shaders; material bound to g_roughLightingOnlyMaterial
+{
+    pass
+    {
+        SetVertexShader(g_NonUniVS);
+        SetPixelShader(BindInterfaces(g_NonUniPS,
+                                      g_abstractAmbientLighting,
+                                      g_abstractDirectLighting,
+                                      g_abstractEnvironmentLighting,
+                                      g_roughLightingOnlyMaterial));
+    }
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h
new file mode 100644
index 000000000..6f9a0f4d8
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_LightPSH.h
@@ -0,0 +1,82 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_LightPSH.h
+//
+// The pixel shader light header file for the DynamicShaderLinkageFX11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseLight   // contract shared by the light classes declared in this header
+{
+   float3 IlluminateAmbient(float3 vNormal);    // ambient contribution for a surface normal
+   
+   float3 IlluminateDiffuse(float3 vNormal);    // diffuse contribution for a surface normal
+
+   float3 IlluminateSpecular(float3 vNormal, int specularPower );   // specular contribution
+   
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cAmbientLight : iBaseLight   // base light: zero diffuse and specular unless a subclass overrides them
+{
+   float3   m_vLightColor;     
+   bool     m_bEnable;
+   
+   float3 IlluminateAmbient(float3 vNormal);   // declared here; the body is defined outside the class
+      
+   float3 IlluminateDiffuse(float3 vNormal)    // default: no diffuse contribution
+   { 
+      return (float3)0;
+   }
+
+   float3 IlluminateSpecular(float3 vNormal, int specularPower )   // default: no specular contribution
+   { 
+      return (float3)0;
+   }
+};
+
+class cHemiAmbientLight : cAmbientLight
+{
+   // inherited float3 m_vLightColor is the SkyColor
+   float4   m_vGroundColor;
+   float4   m_vDirUp;
+
+   float3 IlluminateAmbient(float3 vNormal);   // replaces the base ambient term; body defined outside the class
+   
+};
+
+class cDirectionalLight : cAmbientLight
+{
+   // inherited float3 m_vLightColor is the LightColor
+   float4 m_vLightDir;
+   
+   float3 IlluminateDiffuse( float3 vNormal );   // replaces the zero default from cAmbientLight
+
+   float3 IlluminateSpecular( float3 vNormal, int specularPower );   // replaces the zero default from cAmbientLight
+
+};
+
+class cOmniLight : cAmbientLight
+{
+   float3   m_vLightPosition;
+   float    radius;   
+   
+   float3 IlluminateDiffuse( float3 vNormal );   // declared here; the body is defined outside the class
+  
+};
+
+class cSpotLight : cAmbientLight   // adds data only; declares no Illuminate* methods of its own
+{
+   float3   m_vLightPosition;
+   float3   m_vLightDir;
+};
+
+class cEnvironmentLight : cAmbientLight   // adds no data members; redeclares only the specular term
+{
+   float3  IlluminateSpecular( float3 vNormal, int specularPower );  
+};
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h
new file mode 100644
index 000000000..cd54a283d
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_MaterialPSH.h
@@ -0,0 +1,103 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_MaterialPSH.h
+//
+// The pixel shader material header file for the DynamicShaderLinkageFX11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Interfaces
+//--------------------------------------------------------------------------------------
+interface iBaseMaterial   // contract shared by the material classes declared in this header
+{
+   float3 GetAmbientColor(float2 vTexcoord);   // ambient color at a texture coordinate
+   
+   float3 GetDiffuseColor(float2 vTexcoord);   // diffuse color at a texture coordinate
+
+   int GetSpecularPower();
+
+};
+
+//--------------------------------------------------------------------------------------
+// Classes
+//--------------------------------------------------------------------------------------
+class cBaseMaterial : iBaseMaterial
+{
+   float3   m_vColor;       // flat color returned for both ambient and diffuse
+   int      m_iSpecPower;   // value returned by GetSpecularPower()
+
+   // Untextured ambient color: vTexcoord is ignored.
+   float3 GetAmbientColor(float2 vTexcoord)
+   {
+      return m_vColor;
+   }
+
+   // Untextured diffuse color: identical to the ambient color.
+   float3 GetDiffuseColor(float2 vTexcoord)
+   {
+      return m_vColor;
+   }
+
+   // Specular power exactly as stored in the material.
+   int GetSpecularPower() { return m_iSpecPower; }
+
+};
+
+class cPlasticMaterial : cBaseMaterial   // adds nothing; all members come from cBaseMaterial
+{  
+
+};
+
+class cPlasticTexturedMaterial : cPlasticMaterial
+{  
+   float3 GetAmbientColor(float2 vTexcoord);   // declared here; the body is defined outside the class
+
+   float3 GetDiffuseColor(float2 vTexcoord);   // declared here; the body is defined outside the class
+
+};
+
+class cPlasticLightingOnlyMaterial : cBaseMaterial
+{
+   // Both colors are constant white; m_vColor and vTexcoord are ignored.
+   float3 GetAmbientColor(float2 vTexcoord)
+   {
+      return float3( 1.0f, 1.0f, 1.0f );
+   }
+
+   float3 GetDiffuseColor(float2 vTexcoord)
+   {
+      return float3( 1.0f, 1.0f, 1.0f );
+   }
+};
+
+class cRoughMaterial : cBaseMaterial
+{
+   int GetSpecularPower()   // same body as cBaseMaterial::GetSpecularPower
+   { 
+      return m_iSpecPower;
+   }
+};
+
+class cRoughTexturedMaterial : cRoughMaterial
+{  
+   float3 GetAmbientColor(float2 vTexcoord);   // declared here; the body is defined outside the class
+
+   float3 GetDiffuseColor(float2 vTexcoord);   // declared here; the body is defined outside the class
+
+};
+
+
+class cRoughLightingOnlyMaterial : cRoughMaterial
+{
+   // Both colors are constant white; m_vColor and vTexcoord are ignored.
+   float3 GetAmbientColor(float2 vTexcoord)
+   {
+      return float3( 1.0f, 1.0f, 1.0f );
+   }
+
+   float3 GetDiffuseColor(float2 vTexcoord)
+   {
+      return float3( 1.0f, 1.0f, 1.0f );
+   }
+};
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h
new file mode 100644
index 000000000..3b4c528be
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_PSBuffers.h
@@ -0,0 +1,152 @@
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_PSBuffers.h
+//
+// The pixel shader buffers, resources and class method file for the DynamicShaderLinkageFX11 sample.
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#include "DynamicShaderLinkageFX11_LightPSH.h"
+#include "DynamicShaderLinkageFX11_MaterialPSH.h"
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerFrame : register( b0 )   // concrete light instances plus the eye direction
+{
+   cAmbientLight     g_ambientLight;
+   cHemiAmbientLight g_hemiAmbientLight;
+   cDirectionalLight g_directionalLight;
+   cEnvironmentLight g_environmentLight;
+   float4            g_vEyeDir;          // the lighting methods in this file read only .xyz
+};
+
+cbuffer cbPerPrimitive : register( b1 )   // one concrete instance of each material class
+{
+   cPlasticMaterial              g_plasticMaterial;
+   cPlasticTexturedMaterial      g_plasticTexturedMaterial;
+   cPlasticLightingOnlyMaterial  g_plasticLightingOnlyMaterial;
+   cRoughMaterial                g_roughMaterial;
+   cRoughTexturedMaterial        g_roughTexturedMaterial;
+   cRoughLightingOnlyMaterial    g_roughLightingOnlyMaterial;
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D      g_txDiffuse : register( t0 );        // sampled by the textured material methods below
+Texture2D      g_txNormalMap : register( t1 );      // declared but not sampled in this header
+TextureCube    g_txEnvironmentMap : register( t2 ); // sampled by cEnvironmentLight::IlluminateSpecular
+
+SamplerState   g_samLinear : register( s0 )   // the only sampler used by the methods in this header
+{
+    Filter = MIN_MAG_MIP_LINEAR;   // linear filtering for min, mag and mip
+    AddressU = WRAP;
+    AddressV = WRAP;
+    AddressW = WRAP;
+};
+
+//--------------------------------------------------------------------------------------
+// Rasterization State
+//--------------------------------------------------------------------------------------
+uint g_fillMode = 0;   // index into g_rasterizerState: 0 selects SOLID, 1 selects WIREFRAME
+   
+RasterizerState g_rasterizerState[2]
+{
+{
+    FillMode = SOLID;
+    MultisampleEnable = true;
+},
+{
+    FillMode = WIREFRAME;
+    MultisampleEnable = true;
+}
+};
+
+//--------------------------------------------------------------------------------------
+// Lighting Class Methods
+//--------------------------------------------------------------------------------------
+// Ambient Lighting Class Methods
+float3 cAmbientLight::IlluminateAmbient(float3 vNormal)
+{  // Constant ambient term: vNormal is ignored and m_bEnable gates the light color.
+   return m_bEnable * m_vLightColor;
+}
+
+float3 cHemiAmbientLight::IlluminateAmbient(float3 vNormal)
+{
+   // Blend weight from dot(normal, up): halfway at 0, rising toward the sky color (m_vLightColor).
+   float fSkyWeight = (dot( vNormal, m_vDirUp.xyz ) + 1.0f) / 2.0f;
+   return lerp( m_vGroundColor.xyz, m_vLightColor, fSkyWeight ) * m_bEnable;
+}
+
+// Directional Light class
+float3 cDirectionalLight::IlluminateDiffuse( float3 vNormal )
+{
+   float3 vLambert = (float3)saturate( dot( vNormal, m_vLightDir.xyz ) ); // N.L clamped to [0,1]
+   return vLambert * m_vLightColor * m_bEnable;
+}
+
+float3 cDirectionalLight::IlluminateSpecular( float3 vNormal, int specularPower )
+{
+   // Half vector built from the negated, normalized eye direction plus the light direction.
+   float3 vHalf = normalize( -normalize(g_vEyeDir.xyz) + m_vLightDir.xyz );
+   float  fNdotH = max( 0, dot( vHalf, normalize(vNormal) ) );
+   float  fHighlight = pow( fNdotH, specularPower ); // specularPower is the exponent
+   return ((float3)fHighlight * m_vLightColor * m_bEnable);
+}
+
+// Omni Light Class
+float3 cOmniLight::IlluminateDiffuse( float3 vNormal )
+{
+   return float3( 0.0f, 0.0f, 0.0f ); // TODO: omni light diffuse is not implemented; contributes nothing
+}
+
+// Environment Lighting
+float3 cEnvironmentLight::IlluminateSpecular( float3 vNormal, int specularPower )
+{
+   // Environment-map reflection scaled by a cheap cubic fresnel falloff.
+   // specularPower is not used by this method.
+   float3 vN = normalize(vNormal);
+   float3 vEye = normalize(g_vEyeDir.xyz);
+   float3 vReflect = reflect( vEye, vN );
+
+   // Falloff is (1 - dot(-eye, normal)) cubed.
+   float fFalloff = 1 - dot( -vEye, vN );
+   fFalloff = (fFalloff * fFalloff * fFalloff );
+
+   float3 vEnvColor = g_txEnvironmentMap.Sample( g_samLinear, vReflect ).xyz * fFalloff;
+   return (vEnvColor * (float3)m_bEnable);
+}
+
+//--------------------------------------------------------------------------------------
+// Material Class Methods
+//--------------------------------------------------------------------------------------
+// Plastic Material Methods
+float3 cPlasticTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{
+   // Material color modulated by the .xyz of the diffuse texture sample.
+   float4 vTexel = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+   return m_vColor * vTexel.xyz;
+}
+   
+float3 cPlasticTexturedMaterial::GetDiffuseColor(float2 vTexcoord)
+{
+   // Material color modulated by the .xyz of the diffuse texture sample.
+   float4 vTexel = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+   return m_vColor * vTexel.xyz;
+}
+
+// Rough Material Methods
+float3 cRoughTexturedMaterial::GetAmbientColor(float2 vTexcoord)
+{
+   // Material color modulated by the .xyz of the diffuse texture sample.
+   float4 vTexel = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+   return m_vColor * vTexel.xyz;
+}
+   
+float3 cRoughTexturedMaterial::GetDiffuseColor(float2 vTexcoord)
+{
+   // Material color modulated by the .xyz of the diffuse texture sample.
+   float4 vTexel = g_txDiffuse.Sample( g_samLinear, vTexcoord );
+   return m_vColor * vTexel.xyz;
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl
new file mode 100644
index 000000000..55d206259
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_ps.hlsl
@@ -0,0 +1,113 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_ps.hlsl
+//
+// The pixel shader file for the DynamicShaderLinkageFX11 sample.
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Header Includes
+//--------------------------------------------------------------------------------------
+#include "DynamicShaderLinkageFX11_PSBuffers.h"
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct PS_INPUT   // interpolated input consumed by PSMainWorker
+{
+    float4 vPosition    : SV_POSITION;
+    float3 vNormal      : NORMAL;       // passed to every Illuminate* call
+    float2 vTexcoord    : TEXCOORD0;    // passed to the material color getters
+    float4 vMatrix      : TEXCOORD1;    // not read by PSMainWorker
+};
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+
+// This pixel shader uses several interfaces during its
+// work.  We show three different ways of providing interface
+// bindings for the PS and those have two different
+// entry points so we've separated the base PS code
+// into a worker routine that's called by the entry
+// points.  Normally only one technique would be used
+// and this layering of entry point and worker would
+// not be necessary.
+float4 PSMainWorker( iBaseLight ambientLighting,
+                     iBaseLight directLighting,
+                     iBaseLight environmentLighting,
+                     iBaseMaterial material,
+                     PS_INPUT Input )
+{
+   // Interpolated inputs shared by every term below.
+   float3 vNormal   = Input.vNormal;
+   float2 vTexcoord = Input.vTexcoord;
+
+   // Ambient term: material ambient color modulated by the ambient light.
+   float3 Ambient = material.GetAmbientColor( vTexcoord ) * ambientLighting.IlluminateAmbient( vNormal );
+
+   // Diffuse term: material diffuse color modulated by the direct light.
+   float3 Diffuse = material.GetDiffuseColor( vTexcoord ) * directLighting.IlluminateDiffuse( vNormal );
+
+   // Specular term: direct-light specular plus the environment light's specular.
+   int    iSpecPower = material.GetSpecularPower();
+   float3 Specular   = directLighting.IlluminateSpecular( vNormal, iSpecPower );
+   Specular += environmentLighting.IlluminateSpecular( vNormal, iSpecPower );
+
+   // Sum the terms, clamp to [0,1] and return with alpha fixed at 1.
+   float3 Lighting = saturate( Ambient + Diffuse + Specular );
+   return float4( Lighting, 1.0f );
+}
+
+// One way to provide bindings for shaders in Effects 11 is
+// to use uniform interface parameters.  As with non-interface
+// uniform parameters you must specify a value for these
+// parameters in your CompileShader invocations in the effect.
+// You can provide concrete class instances if you want
+// to statically specialize your shaders, such as for targets
+// that don't support abstract interfaces; or you can provide
+// other interfaces that you bind using effect variables.
+// Both are shown in this sample's technique passes.
+float4 PSMainUniform( uniform iBaseLight ambientLighting,
+                      uniform iBaseLight directLighting,
+                      uniform iBaseLight environmentLighting,
+                      uniform iBaseMaterial material,
+                      PS_INPUT Input ) : SV_Target
+{
+    // Entry point with uniform interface parameters: forward everything
+    // to the shared worker and return its color unchanged.
+    float4 vColor = PSMainWorker( ambientLighting, directLighting,
+                                  environmentLighting, material, Input );
+    return vColor;
+}
+
+// Another way to use Effects 11 with interfaces is
+// to have non-uniform parameters, which then are
+// bound with a BindInterfaces in a technique pass.
+// BindInterfaces gives concrete instances to use
+// with a shader but does not do static specialization,
+// it just saves information for the effect runtime
+// to use when setting up the shader to run.
+// This lets you share a single shader, compiled with
+// interface usage, while still getting the convenience
+// of declaring concrete bindings in the effect and
+// not needing explicit binding in code via effect
+// variable updates.  If you have many different
+// variations it may be simpler to use bindings
+// through effect variables, as then you don't
+// need to list every possible binding set in your
+// techniques.
+float4 PSMainNonUniform( iBaseLight ambientLighting,
+                         iBaseLight directLighting,
+                         iBaseLight environmentLighting,
+                         iBaseMaterial material,
+                         PS_INPUT Input ) : SV_Target
+{
+    // Entry point with non-uniform interface parameters: forward everything
+    // to the shared worker and return its color unchanged.
+    float4 vColor = PSMainWorker( ambientLighting, directLighting,
+                                  environmentLighting, material, Input );
+    return vColor;
+}
diff --git a/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl
new file mode 100644
index 000000000..4791e5786
--- /dev/null
+++ b/tests/hlsl/dxsdk/DynamicShaderLinkageFX11/DynamicShaderLinkageFX11_vs.hlsl
@@ -0,0 +1,65 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: DynamicShaderLinkageFX11_VS.hlsl
+//
+// The vertex shader file for the DynamicShaderLinkageFX11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+    float4x4        g_mWorldViewProjection  : packoffset( c0 );   // applied to Input.vPosition in VSMain
+    float4x4        g_mWorld                : packoffset( c4 );   // its upper 3x3 transforms the normal in VSMain
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT   // per-vertex input consumed by VSMain
+{
+    float4 vPosition    : POSITION;
+    float3 vNormal      : NORMAL;      // expanded by R10G10B10A2_UNORM_TO_R32G32B32_FLOAT in VSMain
+    float2 vTexcoord    : TEXCOORD0;   // copied to the output unchanged
+};
+
+struct VS_OUTPUT   // values written by VSMain
+{
+    float4 vPosition    : SV_POSITION;   // Input.vPosition times g_mWorldViewProjection
+    float3 vNormal      : NORMAL;        // expanded normal times the upper 3x3 of g_mWorld
+    float2 vTexcoord0   : TEXCOORD0;
+    float4 vMatrix      : TEXCOORD1; // DEBUG: receives g_mWorld[0]
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+// Signed vectors were aliased as an unsigned format, so the signed values
+// must be recovered: double the input, then shift anything that reaches 1.0
+// down by 2.0.  The values 1.0 and 2.0 are slightly inaccurate here.
+float3 R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( in float3 vVec )
+{
+    float3 vDoubled = vVec * 2.0f;
+    return vDoubled >= 1.0f ? ( vDoubled - 2.0f ) : vDoubled;
+}
+
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+    VS_OUTPUT Result;
+
+    // Transform the position by the combined world-view-projection matrix.
+    Result.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+
+    // Expand the compressed normal, then transform it by the upper 3x3 of the world matrix.
+    float3 vExpandedNormal = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( Input.vNormal );
+    Result.vNormal = mul( vExpandedNormal, (float3x3)g_mWorld );
+
+    // Texture coordinates pass through unchanged.
+    Result.vTexcoord0 = Input.vTexcoord;
+
+    Result.vMatrix = (float4)g_mWorld[0]; // DEBUG: first row of the world matrix
+    return Result;
+}
diff --git a/tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx b/tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx
new file mode 100644
index 000000000..699df8655
--- /dev/null
+++ b/tests/hlsl/dxsdk/FixedFuncEMUFX11/FixedFuncEMU.fx
@@ -0,0 +1,468 @@
+//TEST_IGNORE_FILE:
+// FixedFuncEMU.fx
+// Copyright (c) 2005 Microsoft Corporation. All rights reserved.
+//
+
+struct VSSceneIn
+{
+    float3 pos          : POSITION;         //vertex position (VSScenemain multiplies it by g_mWorld; VSScreenSpacemain rescales it by the viewport size)
+    float3 norm         : NORMAL;           //vertex normal (multiplied by g_mWorld when lighting is enabled)
+    float2 tex          : TEXTURE0;         //tex coords
+};
+
+struct VSSceneOut
+{
+    float4 pos : SV_Position;               //position
+    float2 tex : TEXTURE0;                  //texture coordinate
+    float3 wPos : TEXTURE1;                 //world space pos
+    float3 wNorm : TEXTURE2;                //world space normal (VSScenemain writes it only when lighting is enabled)
+    float4 colorD : COLOR0;                 //color for gouraud and flat shading
+    float4 colorS : COLOR1;                 //color for specular
+    float  fogDist : FOGDISTANCE;           //distance used for fog calculations (view-space z in VSScenemain)
+    float3 planeDist : SV_ClipDistance0;    //clip distance for 3 planes (one component per g_clipplanes entry)
+};
+
+struct PSSceneIn                            //same members as VSSceneOut except planeDist
+{
+    float4 pos : SV_Position;               //position
+    float2 tex : TEXTURE0;                  //texture coordinate
+    float3 wPos : TEXTURE1;                 //world space pos
+    float3 wNorm : TEXTURE2;                //world space normal
+    float4 colorD : COLOR0;                 //color for gouraud and flat shading
+    float4 colorS : COLOR1;                 //color for specular
+    float  fogDist : FOGDISTANCE;           //distance used for fog calculations
+};
+
+struct Light
+{
+    float4 Position;    //only .xyz is read by CalcLighting
+    float4 Diffuse;
+    float4 Specular;
+    float4 Ambient;     //added once per light, without attenuation
+    float4 Atten;       //x: constant, y: linear, z: quadratic distance coefficient; w is multiplied by 0
+};
+
+#define FOGMODE_NONE    0   //CalcFogFactor keeps the fog coefficient at 1.0
+#define FOGMODE_LINEAR  1   //(g_fogEnd - d)/(g_fogEnd - g_fogStart)
+#define FOGMODE_EXP     2   //exponential falloff in d*g_fogDensity
+#define FOGMODE_EXP2    3   //exponential falloff in (d*g_fogDensity) squared
+#define E 2.71828           //six-digit approximation of Euler's number
+
+cbuffer cbLights
+{
+    float4   g_clipplanes[3];   //dotted with the world position (w = 1) in VSScenemain to get the clip distances
+    Light    g_lights[8];       //CalcLighting accumulates all 8 entries
+};
+
+cbuffer cbPerFrame
+{
+    float4x4 g_mWorld;          //applied to the input position and normal in VSScenemain
+    float4x4 g_mView;           //applied to world-space positions to get the "cameraPos" values
+    float4x4 g_mProj;           //applied to view-space positions to get SV_Position
+    float4x4 g_mInvProj;        //not referenced by the shaders in this file
+    float4x4 g_mLightViewProj;  //applied to the world position in PSScenemain for the projected texture lookup
+};
+
+cbuffer cbPerTechnique
+{
+    bool     g_bEnableLighting = true;      //when false, shaders output white diffuse instead of calling CalcLighting
+    bool     g_bEnableClipping = true;      //when false, VSScenemain writes 1 to every clip distance
+    bool     g_bPointScaleEnable = false;   //when true, GSPointmain derives the sprite size from the distance to the eye
+    float    g_pointScaleA;                 //constant term of the point-scale denominator
+    float    g_pointScaleB;                 //term multiplied by the eye distance
+    float    g_pointScaleC;                 //term multiplied by the squared eye distance
+    float    g_pointSize;
+    
+    //fog params
+    int      g_fogMode = FOGMODE_NONE;      //compared against the FOGMODE_* values in CalcFogFactor
+    float    g_fogStart;                    //read only in the FOGMODE_LINEAR branch
+    float    g_fogEnd;                      //read only in the FOGMODE_LINEAR branch
+    float    g_fogDensity;                  //read in the FOGMODE_EXP and FOGMODE_EXP2 branches
+    float4   g_fogColor;                    //weighted by (1 - fog factor) in PSScenemain
+};
+    
+cbuffer cbPerViewChange
+{
+    //viewport params
+    float    g_viewportHeight;
+    float    g_viewportWidth;
+    float    g_nearPlane;       //GSPointmain divides view-space z by this to scale the sprite offsets
+};
+
+cbuffer cbImmutable
+{
+    float3 g_positions[4] =     //quad corner offsets; GSPointmain appends them in this order as one strip
+    {
+        float3( -0.5, 0.5, 0 ),
+        float3( 0.5, 0.5, 0 ),
+        float3( -0.5, -0.5, 0 ),
+        float3( 0.5, -0.5, 0 ),
+    };
+};
+
+Texture2D g_txDiffuse;      //sampled by PSScenemain and PSAlphaTestmain
+Texture2D g_txProjected;    //projected texture sampled by PSScenemain
+SamplerState g_samLinear    //the only sampler; used for both textures
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Clamp;
+    AddressV = Clamp;
+};
+
+DepthStencilState DisableDepth     //selected by RenderScreenSpaceAlphaTest
+{
+    DepthEnable = FALSE;
+    DepthWriteMask = ZERO;
+};
+
+DepthStencilState EnableDepth      //selected by every technique that runs VSScenemain
+{
+    DepthEnable = TRUE;
+    DepthWriteMask = ALL;
+};
+
+struct ColorsOutput     //return type of CalcLighting
+{
+    float4 Diffuse;     //summed diffuse and ambient terms
+    float4 Specular;    //summed specular terms
+};
+
+ColorsOutput CalcLighting( float3 worldNormal, float3 worldPos, float3 cameraPos )
+{
+    ColorsOutput output = (ColorsOutput)0.0;
+    //Sums the diffuse (+ambient) and specular contribution of all 8 g_lights entries at worldPos.
+    for(int i=0; i<8; i++)
+    {
+        float3 toLight = g_lights[i].Position.xyz - worldPos;
+        float lightDist = length( toLight );
+        float fAtten = 1.0/dot( g_lights[i].Atten, float4(1,lightDist,lightDist*lightDist,0) );
+        float3 lightDir = normalize( toLight );
+        float3 halfAngle = normalize( normalize(-cameraPos) + lightDir );  //NOTE(review): callers pass a view-space position as cameraPos while lightDir is world-space - confirm
+        output.Diffuse += max(0,dot( lightDir, worldNormal ) * g_lights[i].Diffuse * fAtten) + g_lights[i].Ambient;
+        //pow() is undefined (NaN) for a negative base, so clamp N.H to [0,1] before raising it
+        output.Specular += max(0,pow( saturate( dot( halfAngle, worldNormal ) ), 64 ) * g_lights[i].Specular * fAtten );
+    }
+    
+    return output;
+}
+
+//
+// VS for emulating fixed function pipeline
+//
+VSSceneOut VSScenemain(VSSceneIn input)
+{
+    //Transforms the vertex, then fills in fog distance, clip distances and (optionally) gouraud colors.
+    VSSceneOut output = (VSSceneOut)0.0;   //wNorm and colorS keep this zero when lighting is disabled
+    //output our final position in clipspace
+    float4 worldPos = mul( float4( input.pos, 1 ), g_mWorld );
+    float4 cameraPos = mul( worldPos, g_mView ); //Save cameraPos for fog calculations
+    output.pos = mul( cameraPos, g_mProj );
+    
+    //save world pos for later (wPos is a float3, so take xyz explicitly)
+    output.wPos = worldPos.xyz;
+    
+    //save the fog distance for later
+    output.fogDist = cameraPos.z;
+    
+    //find our clipping planes (fixed function clipping is done in world space)
+    if( g_bEnableClipping )
+    {
+        worldPos.w = 1;
+        
+        //calc the distance from the 3 clipping planes
+        output.planeDist.x = dot( worldPos, g_clipplanes[0] );
+        output.planeDist.y = dot( worldPos, g_clipplanes[1] );
+        output.planeDist.z = dot( worldPos, g_clipplanes[2] );
+    }
+    else
+    {
+        output.planeDist.x = 1;     //a constant positive distance on every plane
+        output.planeDist.y = 1;
+        output.planeDist.z = 1;
+    }
+    
+    //do gouraud lighting
+    if( g_bEnableLighting )
+    {
+        float3 worldNormal = normalize( mul( input.norm, (float3x3)g_mWorld ) );
+        output.wNorm = worldNormal;
+        ColorsOutput cOut = CalcLighting( worldNormal, worldPos.xyz, cameraPos.xyz );
+        output.colorD = cOut.Diffuse;
+        output.colorS = cOut.Specular;
+    }
+    else
+    {
+        output.colorD = float4(1,1,1,1);
+    }
+    
+    //propagate texture coordinate
+    output.tex = input.tex;
+    
+    return output;
+}
+
+//
+// VS for rendering in screen space
+//
+PSSceneIn VSScreenSpacemain(VSSceneIn input)
+{
+    PSSceneIn result = (PSSceneIn)0.0;   //members not assigned below stay zero
+
+    //input.pos.xy is in viewport units: rescale it so the viewport spans [-1,1]
+    //and negate y so that input y = 0 lands on output y = +1
+    float2 halfExtent = float2( g_viewportWidth, g_viewportHeight ) / 2.0;
+    float2 scaled = input.pos.xy / halfExtent;
+    result.pos = float4( scaled.x - 1, -scaled.y + 1, input.pos.z, 1 );
+
+    //propagate texture coordinate; diffuse color is opaque white
+    result.tex = input.tex;
+    result.colorD = float4(1,1,1,1);
+
+    return result;
+}
+
+//
+// GS for flat shaded rendering
+//
+
+[maxvertexcount(3)]
+void GSFlatmain( triangle VSSceneOut input[3], inout TriangleStream<VSSceneOut> FlatTriStream )
+{
+    //
+    // Face normal: normalized cross product of the two edges leaving vertex 0
+    //
+    float3 edge01 = input[1].wPos - input[0].wPos;
+    float3 edge02 = input[2].wPos - input[0].wPos;
+    float3 unitNormal = normalize( cross( edge01, edge02 ) );
+
+    //
+    // Face center: mean of the three world-space positions
+    //
+    float3 centroid = (input[0].wPos + input[1].wPos + input[2].wPos)/3.0;
+
+    //
+    // Light the face once at its center; CalcLighting also takes the view-space center
+    //
+    float4 centroidView = mul( float4( centroid, 1 ), g_mView );
+    ColorsOutput faceColors = CalcLighting( unitNormal, centroid, centroidView.xyz );
+
+    //
+    // Re-emit the triangle with all three vertices sharing the face colors
+    //
+    for(int v=0; v<3; v++)
+    {
+        VSSceneOut vert = input[v];
+
+        vert.colorD = faceColors.Diffuse;
+        vert.colorS = faceColors.Specular;
+
+        FlatTriStream.Append( vert );
+    }
+
+    //
+    // Close the strip
+    //
+    FlatTriStream.RestartStrip();
+}
+
+//
+// GS for point rendering
+//
+[maxvertexcount(12)]
+void GSPointmain( triangle VSSceneOut input[3], inout TriangleStream<VSSceneOut> PointTriStream )
+{
+    VSSceneOut output;
+    //Expands each of the 3 input vertices into a screen-aligned quad (one 4-vertex strip each).
+    //
+    // Calculate the point size
+    //
+    //float fSizeX = (g_pointSize/g_viewportWidth)/4.0;
+    float fSizeY = (g_pointSize/g_viewportHeight)/4.0;
+    float fSizeX = fSizeY;
+    
+    for(int i=0; i<3; i++)
+    {
+        output = input[i];
+    
+        //find world pos and camera pos
+        float4 worldPos = float4(input[i].wPos,1);
+        float4 cameraPos = mul( worldPos, g_mView );
+        
+        //find our size
+        if( g_bPointScaleEnable )
+        {   
+            float dEye = length( cameraPos.xyz );
+            fSizeX = fSizeY = g_viewportHeight * g_pointSize * 
+                    sqrt( 1.0f/( g_pointScaleA + g_pointScaleB*dEye + g_pointScaleC*(dEye*dEye) ) );
+        }
+        
+        //do shading
+        if(g_bEnableLighting)
+        {
+            float3 worldNormal = input[i].wNorm;
+            ColorsOutput cOut = CalcLighting( worldNormal, worldPos.xyz, cameraPos.xyz );
+        
+            output.colorD = cOut.Diffuse;
+            output.colorS = cOut.Specular;
+        }
+        else
+        {
+            output.colorD = float4(1,1,1,1);
+        }
+        
+        output.tex = input[i].tex;
+        
+        //
+        // Emit two new triangles (one 4-vertex strip). The projected center and the depth
+        // scale are the same for all four corners, so they are computed once per input vertex.
+        //
+        float4 centerPos = mul( cameraPos, g_mProj );
+        float zoverNear = (cameraPos.z)/g_nearPlane;
+        for(int corner=0; corner<4; corner++)   //own index name so the outer loop's i is not shadowed
+        {
+            float4 posSize = float4( g_positions[corner].x*fSizeX*zoverNear,
+                                     g_positions[corner].y*fSizeY*zoverNear,
+                                     0,
+                                     0 );
+            output.pos = centerPos + posSize;
+            
+            PointTriStream.Append(output);
+        }
+        PointTriStream.RestartStrip();
+    }
+}
+
+//
+// Calculates fog factor based upon distance
+//
+float CalcFogFactor( float d )
+{
+    //Returns the weight of the unfogged color for distance d, clamped to [0,1].
+    float fogCoeff = 1.0;   //value kept when g_fogMode matches none of the branches below
+    if( FOGMODE_LINEAR == g_fogMode )
+    {
+        fogCoeff = (g_fogEnd - d)/(g_fogEnd - g_fogStart);
+    }
+    else if( FOGMODE_EXP == g_fogMode )
+    {
+        fogCoeff = exp( -d*g_fogDensity );   //exp() intrinsic rather than pow() on a six-digit constant
+    }
+    else if( FOGMODE_EXP2 == g_fogMode )
+    {
+        fogCoeff = exp( -d*d*g_fogDensity*g_fogDensity );
+    }
+    
+    return clamp( fogCoeff, 0, 1 );
+}
+
+//
+// PS for rendering with clip planes
+//
+float4 PSScenemain(PSSceneIn input) : SV_Target
+{
+    //lit surface color: diffuse texel modulated by the diffuse term, plus the specular term
+    float4 texel = g_txDiffuse.Sample( g_samLinear, input.tex );
+    float4 litColor = texel * input.colorD + input.colorS;
+
+    //transform the world position by the light's view-projection matrix
+    float4 lightClip = mul( float4(input.wPos,1), g_mLightViewProj );
+
+    //since we don't have texldp, do the w divide by hand and remap xy with *0.5 + 0.5
+    float2 cookieUV = 0.5 * lightClip.xy / lightClip.w + float2( 0.5, 0.5 );
+
+    //the projected texture contributes only where the transformed z is positive
+    float4 cookie = float4(0,0,0,0);
+    if( lightClip.z > 0 )
+        cookie = g_txProjected.Sample( g_samLinear, cookieUV );
+
+    //the projected color is added onto (not multiplied with) the lit color, then fog is blended in
+    float fogFactor = CalcFogFactor( input.fogDist );
+    return fogFactor * (litColor + cookie) + (1.0 - fogFactor)*g_fogColor;
+}
+
+//
+// PS for rendering with alpha test
+//
+float4 PSAlphaTestmain(PSSceneIn input) : SV_Target
+{
+    float4 texel = g_txDiffuse.Sample( g_samLinear, input.tex );
+    float4 shaded = texel * input.colorD;   //modulate the texel by the vertex color
+    if( shaded.a < 0.5 ) discard;           //alpha test: drop fragments whose alpha is under 0.5
+    return shaded;
+}
+
+//
+// RenderSceneGouraud - renders gouraud-shaded primitives
+//
+technique10 RenderSceneGouraud
+{
+    pass p0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) );
+        SetGeometryShader( NULL );  //no geometry stage: the colors written by VSScenemain reach the pixel shader unchanged
+        SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+        
+        SetDepthStencilState( EnableDepth, 0 );
+    }  
+}
+
+//
+// RenderSceneFlat - renders flat-shaded primitives
+//
+technique10 RenderSceneFlat
+{
+    pass p0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) );
+        SetGeometryShader( CompileShader( gs_4_0, GSFlatmain() ) );  //overwrites the vertex colors with one per-face lighting result
+        SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+        
+        SetDepthStencilState( EnableDepth, 0 );
+    }  
+}
+
+//
+// RenderScenePoint - replaces d3dfill_point
+//
+technique10 RenderScenePoint
+{
+    pass p0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) );
+        SetGeometryShader( CompileShader( gs_4_0, GSPointmain() ) );  //replaces each triangle vertex with a 4-vertex quad
+        SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+        
+        SetDepthStencilState( EnableDepth, 0 );
+    }  
+}
+
+//
+// RenderScreenSpaceAlphaTest - shows how to render something in screenspace
+//
+technique10 RenderScreenSpaceAlphaTest
+{
+    pass p0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VSScreenSpacemain() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, PSAlphaTestmain() ) );  //discards fragments whose alpha is under 0.5
+        
+        SetDepthStencilState( DisableDepth, 0 );  //the only technique that selects DisableDepth
+    }  
+}
+
+//
+// RenderTextureOnly - uses the same shaders and depth state as RenderSceneGouraud
+//
+technique10 RenderTextureOnly
+{
+    pass p0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VSScenemain() ) );  //shader and state setup is identical to RenderSceneGouraud
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+        
+        SetDepthStencilState( EnableDepth, 0 );
+    }  
+}
+
diff --git a/tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl b/tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl
new file mode 100644
index 000000000..db7bd5136
--- /dev/null
+++ b/tests/hlsl/dxsdk/FluidCS11/ComputeShaderSort11.hlsl
@@ -0,0 +1,75 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry BitonicSort -entry MatrixTranspose
+//--------------------------------------------------------------------------------------
+// File: ComputeShaderSort11.hlsl
+//
+// This file contains the compute shaders to perform GPU sorting using DirectX 11.
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#define BITONIC_BLOCK_SIZE 512
+
+#define TRANSPOSE_BLOCK_SIZE 16
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer CB : register( b0 )
+{
+    unsigned int g_iLevel;      // BitonicSort starts its compare distance at g_iLevel >> 1
+    unsigned int g_iLevelMask;  // ANDed with the dispatch thread id to choose the sort direction
+    unsigned int g_iWidth;      // row pitch used when MatrixTranspose reads Input
+    unsigned int g_iHeight;     // row pitch used when MatrixTranspose writes Data
+};
+
+//--------------------------------------------------------------------------------------
+// Structured Buffers
+//--------------------------------------------------------------------------------------
+StructuredBuffer<unsigned int> Input : register( t0 );     // read only by MatrixTranspose
+RWStructuredBuffer<unsigned int> Data : register( u0 );    // read and rewritten by BitonicSort; written by MatrixTranspose
+
+//--------------------------------------------------------------------------------------
+// Bitonic Sort Compute Shader
+//--------------------------------------------------------------------------------------
+groupshared unsigned int shared_data[BITONIC_BLOCK_SIZE];    // one slot per thread of the group
+
+[numthreads(BITONIC_BLOCK_SIZE, 1, 1)]
+void BitonicSort( uint3 Gid : SV_GroupID, 
+                  uint3 DTid : SV_DispatchThreadID, 
+                  uint3 GTid : SV_GroupThreadID, 
+                  uint GI : SV_GroupIndex )
+{
+    // Load shared data: each thread copies its own element of Data into group-shared memory
+    shared_data[GI] = Data[DTid.x];
+    GroupMemoryBarrierWithGroupSync();
+    
+    // Sort the shared data: the compare distance j starts at g_iLevel / 2 and halves every pass
+    for (unsigned int j = g_iLevel >> 1 ; j > 0 ; j >>= 1)
+    {   // compare the pair (GI & ~j, GI | j); take the partner's value (GI ^ j) when the pair order equals the direction flag, else keep own
+        unsigned int result = ((shared_data[GI & ~j] <= shared_data[GI | j]) == (bool)(g_iLevelMask & DTid.x))? shared_data[GI ^ j] : shared_data[GI];
+        GroupMemoryBarrierWithGroupSync();    // every thread has read its pair before any slot is overwritten
+        shared_data[GI] = result;
+        GroupMemoryBarrierWithGroupSync();    // every slot is written before the next pass reads
+    }
+    
+    // Store shared data
+    Data[DTid.x] = shared_data[GI];
+}
+
+//--------------------------------------------------------------------------------------
+// Matrix Transpose Compute Shader
+//--------------------------------------------------------------------------------------
+groupshared unsigned int transpose_shared_data[TRANSPOSE_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE];    // one tile, one element per thread
+
+[numthreads(TRANSPOSE_BLOCK_SIZE, TRANSPOSE_BLOCK_SIZE, 1)]
+void MatrixTranspose( uint3 Gid : SV_GroupID, 
+                      uint3 DTid : SV_DispatchThreadID, 
+                      uint3 GTid : SV_GroupThreadID, 
+                      uint GI : SV_GroupIndex )
+{
+    transpose_shared_data[GI] = Input[DTid.y * g_iWidth + DTid.x];    // stage this thread's element of the tile
+    GroupMemoryBarrierWithGroupSync();                                 // whole tile is staged before any thread reads it
+    uint2 XY = DTid.yx - GTid.yx + GTid.xy;                            // (DTid - GTid) with x/y swapped, plus the in-group offset
+    Data[XY.y * g_iHeight + XY.x] = transpose_shared_data[GTid.x * TRANSPOSE_BLOCK_SIZE + GTid.y];    // tile is read back with row and column swapped
+}
diff --git a/tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl b/tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl
new file mode 100644
index 000000000..26e6cdf60
--- /dev/null
+++ b/tests/hlsl/dxsdk/FluidCS11/FluidCS11.hlsl
@@ -0,0 +1,529 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry BuildGridCS -entry ClearGridIndicesCS -entry BuildGridIndicesCS -entry RearrangeParticlesCS -entry DensityCS_Simple -entry DensityCS_Shared -entry DensityCS_Grid -entry ForceCS_Simple -entry ForceCS_Shared -entry ForceCS_Grid -entry IntegrateCS
+//--------------------------------------------------------------------------------------
+// File: FluidCS11.hlsl
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Smoothed Particle Hydrodynamics Algorithm Based Upon:
+// Particle-Based Fluid Simulation for Interactive Applications
+// Matthias Müller
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Optimized Grid Algorithm Based Upon:
+// Broad-Phase Collision Detection with CUDA
+// Scott Le Grand
+//--------------------------------------------------------------------------------------
+
+struct Particle
+{
+    float2 position;    // GridCalculateCell maps this to a grid cell
+    float2 velocity;    // read by the ForceCS_* kernels for the viscosity term
+};
+
+struct ParticleForces
+{
+    float2 acceleration;    // written by the ForceCS_* kernels: summed pressure and viscosity terms divided by the particle density
+};
+
+struct ParticleDensity
+{
+    float density;      // written by the DensityCS_* kernels, read by the ForceCS_* kernels
+};
+
+cbuffer cbSimulationConstants : register( b0 )
+{
+    uint g_iNumParticles;
+    float g_fTimeStep;
+    float g_fSmoothlen;             // smoothing length h: pairs with r^2 >= h^2 are skipped
+    float g_fPressureStiffness;     // factor B in CalculatePressure
+    float g_fRestDensity;           // rho_0 in CalculatePressure
+    float g_fDensityCoef;           // fParticleMass * 315 / (64 * PI * h^9)
+    float g_fGradPressureCoef;      // fParticleMass * -45 / (PI * h^6)
+    float g_fLapViscosityCoef;      // fParticleMass * fViscosity * 45 / (PI * h^6)
+    float g_fWallStiffness;
+
+    float4 g_vGravity;
+    float4 g_vGridDim;              // xy: scale, zw: offset applied to positions in GridCalculateCell
+    float3 g_vPlanes[4];
+};
+
+//--------------------------------------------------------------------------------------
+// Fluid Simulation
+//--------------------------------------------------------------------------------------
+
+#define SIMULATION_BLOCK_SIZE 256
+
+//--------------------------------------------------------------------------------------
+// Structured Buffers
+//--------------------------------------------------------------------------------------
+RWStructuredBuffer<Particle> ParticlesRW : register( u0 );    // note: every RW buffer below is also declared at u0
+StructuredBuffer<Particle> ParticlesRO : register( t0 );
+
+RWStructuredBuffer<ParticleDensity> ParticlesDensityRW : register( u0 );    // written by the DensityCS_* kernels
+StructuredBuffer<ParticleDensity> ParticlesDensityRO : register( t1 );
+
+RWStructuredBuffer<ParticleForces> ParticlesForcesRW : register( u0 );     // written by the ForceCS_* kernels
+StructuredBuffer<ParticleForces> ParticlesForcesRO : register( t2 );
+
+RWStructuredBuffer<unsigned int> GridRW : register( u0 );    // key/value words written by BuildGridCS
+StructuredBuffer<unsigned int> GridRO : register( t3 );
+
+RWStructuredBuffer<uint2> GridIndicesRW : register( u0 );    // per-cell (start, end) pairs
+StructuredBuffer<uint2> GridIndicesRO : register( t4 );
+
+
+//--------------------------------------------------------------------------------------
+// Grid Construction
+//--------------------------------------------------------------------------------------
+
+// For simplicity, this sample uses a 16-bit hash based on the grid cell and
+// a 16-bit particle ID to keep track of the particles while sorting
+// This imposes a limitation of 64K particles and 256x256 grid work
+// You could extend the implementation to support larger scenarios by using a uint2
+
+float2 GridCalculateCell(float2 position)   // position -> (fractional) cell coordinates
+{
+    return min(max(position * g_vGridDim.xy + g_vGridDim.zw, float2(0, 0)), float2(255, 255));   // scale, offset, then limit to 0..255
+}
+
+unsigned int GridConstuctKey(uint2 xy)
+{
+    // Result layout, high to low: [16 unused bits][Y: 8 bits][X: 8 bits]
+    // (the fields only stay separate while X is at most 255)
+    return xy.y * 256 + xy.x;
+}
+
+unsigned int GridConstuctKeyValuePair(uint2 xy, uint value)
+{
+    // Result layout, high to low: [Y: 8 bits][X: 8 bits][VALUE: 16 bits]
+    // Each field is moved into place by multiplying with its power-of-256 weight
+    return xy.y * (256*256*256) + xy.x * (256*256) + value;
+}
+
+unsigned int GridGetKey(unsigned int keyvaluepair)
+{
+    return keyvaluepair / 0x10000u;   // discard the low 16 (value) bits, leaving the key
+}
+
+unsigned int GridGetValue(unsigned int keyvaluepair)
+{
+    return keyvaluepair % 0x10000u;   // keep only the low 16 (value) bits
+}
+
+
+//--------------------------------------------------------------------------------------
+// Build Grid
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void BuildGridCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    // One thread per particle: the dispatch thread index doubles as the particle ID
+    const unsigned int particleIndex = DTid.x;
+
+    // Pack the particle's grid cell and its ID into a single key/value word
+    uint2 cell = (uint2)GridCalculateCell( ParticlesRO[particleIndex].position );
+    GridRW[particleIndex] = GridConstuctKeyValuePair(cell, particleIndex);
+}
+
+
+//--------------------------------------------------------------------------------------
+// Build Grid Indices
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void ClearGridIndicesCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    GridIndicesRW[DTid.x] = (uint2)0;   // start == end == 0: an empty range for this cell
+}
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void BuildGridIndicesCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    // Each thread inspects one grid entry and its two neighbours, wrapping around at both ends
+    const unsigned int self = DTid.x;
+    const unsigned int before = (self == 0) ? (g_iNumParticles - 1) : (self - 1);
+    const unsigned int after = (self + 1 == g_iNumParticles) ? 0 : (self + 1);
+
+    const unsigned int key = GridGetKey( GridRO[self] );
+    const bool opensCell = ( key != GridGetKey( GridRO[before] ) );
+    const bool closesCell = ( key != GridGetKey( GridRO[after] ) );
+
+    // A different key on the left means this is the first entry of its cell: store the range start
+    if (opensCell)
+    {
+        GridIndicesRW[key].x = self;
+    }
+
+    // A different key on the right means this is the last entry: store one past it as the range end
+    if (closesCell) { GridIndicesRW[key].y = self + 1; }
+}
+
+
+//--------------------------------------------------------------------------------------
+// Rearrange Particles
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void RearrangeParticlesCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    // The value field of this slot's grid entry names the particle that is copied into the slot
+    const unsigned int sourceIndex = GridGetValue( GridRO[ DTid.x ] );
+    ParticlesRW[DTid.x] = ParticlesRO[ sourceIndex ];
+}
+
+
+//--------------------------------------------------------------------------------------
+// Density Calculation
+//--------------------------------------------------------------------------------------
+
+float CalculateDensity(float r_sq)
+{
+    // Poly6 kernel: W_poly6(r, h) = 315 / (64 * pi * h^9) * (h^2 - r^2)^3
+    // with the constant factor folded into
+    //   g_fDensityCoef = fParticleMass * 315.0f / (64.0f * PI * fSmoothlen^9)
+    const float falloff = g_fSmoothlen * g_fSmoothlen - r_sq;
+    return g_fDensityCoef * falloff * falloff * falloff;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Simple N^2 Algorithm
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void DensityCS_Simple( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    // Brute force: every thread owns one particle and visits all g_iNumParticles
+    // particles (itself included), summing the kernel value of each one inside the radius.
+    const float radiusSq = g_fSmoothlen * g_fSmoothlen;
+    const float2 selfPos = ParticlesRO[DTid.x].position;
+
+    float sum = 0;
+    for (uint other = 0 ; other < g_iNumParticles ; other++)
+    {
+        float2 offset = ParticlesRO[other].position - selfPos;
+        float distSq = dot(offset, offset);
+
+        // Only particles strictly inside the smoothing radius contribute
+        if (distSq < radiusSq)
+        {
+            sum += CalculateDensity(distSq);
+        }
+    }
+
+    // One density value per particle
+    ParticlesDensityRW[DTid.x].density = sum;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Shared Memory Optimized N^2 Algorithm
+//--------------------------------------------------------------------------------------
+
+groupshared float2 density_shared_pos[SIMULATION_BLOCK_SIZE];    // one tile of neighbor positions, one slot per thread
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void DensityCS_Shared( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const unsigned int P_ID = DTid.x;
+    const float h_sq = g_fSmoothlen * g_fSmoothlen;
+    float2 P_position = ParticlesRO[P_ID].position;
+    
+    float density = 0;
+    
+    // Calculate the density based on all neighbors, one tile of SIMULATION_BLOCK_SIZE particles at a time
+    [loop]
+    for (uint N_block_ID = 0 ; N_block_ID < g_iNumParticles ; N_block_ID += SIMULATION_BLOCK_SIZE)
+    {
+        // Cache a tile of particles onto shared memory to increase IO efficiency (each thread loads one slot)
+        density_shared_pos[GI] = ParticlesRO[N_block_ID + GI].position;
+       
+        GroupMemoryBarrierWithGroupSync();        // the whole tile is loaded before any thread reads it
+
+        for (uint N_tile_ID = 0; N_tile_ID < SIMULATION_BLOCK_SIZE; N_tile_ID++) // NOTE(review): always visits the full tile, even past g_iNumParticles - confirm the count is a multiple of the block size
+        {
+            float2 N_position = density_shared_pos[N_tile_ID];
+            
+            float2 diff = N_position - P_position;
+            float r_sq = dot(diff, diff);
+            if (r_sq < h_sq)
+            {
+                density += CalculateDensity(r_sq);
+            }
+        }        
+        
+        GroupMemoryBarrierWithGroupSync();    // every thread is done with the tile before it is overwritten
+    }
+    
+    ParticlesDensityRW[P_ID].density = density;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Optimized Grid + Sort Algorithm
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void DensityCS_Grid( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const float radiusSq = g_fSmoothlen * g_fSmoothlen;
+    const float2 selfPos = ParticlesRO[DTid.x].position;
+
+    // Visit the particle's own cell and its (up to) 8 neighbours, clipped to the 0..255 grid
+    const int2 home = (int2)GridCalculateCell( selfPos );
+    const int rowFirst = max(home.y - 1, 0), rowLast = min(home.y + 1, 255);
+    const int colFirst = max(home.x - 1, 0), colLast = min(home.x + 1, 255);
+
+    float sum = 0;
+    for (int row = rowFirst ; row <= rowLast ; row++)
+    {
+        for (int col = colFirst ; col <= colLast ; col++)
+        {
+            // GridIndicesRO holds a (start, end) pair of particle slots for each cell key; end is exclusive
+            uint2 range = GridIndicesRO[ GridConstuctKey(uint2(col, row)) ];
+            for (unsigned int other = range.x ; other < range.y ; other++)
+            {
+                float2 offset = ParticlesRO[other].position - selfPos;
+                float distSq = dot(offset, offset);
+                if (distSq < radiusSq)
+                {
+                    sum += CalculateDensity(distSq);
+                }
+            }
+        }
+    }
+
+    // One density value per particle
+    ParticlesDensityRW[DTid.x].density = sum;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Force Calculation
+//--------------------------------------------------------------------------------------
+
+float CalculatePressure(float density)
+{
+    // Pressure = B * ((rho / rho_0)^y - 1) with y = 3, never allowed to go below zero
+    const float relative = density / g_fRestDensity;
+    return g_fPressureStiffness * max(pow(relative, 3) - 1, 0);
+}
+
+float2 CalculateGradPressure(float r, float P_pressure, float N_pressure, float N_density, float2 diff)
+{
+    // Gradient of the spiky kernel:
+    //   W_spiky(r, h) = 15 / (pi * h^6) * (h - r)^3
+    //   GRAD( W_spiky(r, h) ) = -45 / (pi * h^6) * (h - r)^2
+    // with g_fGradPressureCoef = fParticleMass * -45.0f / (PI * fSmoothlen^6)
+    const float reach = g_fSmoothlen - r;
+    const float meanPressure = 0.5f * (N_pressure + P_pressure);
+    return g_fGradPressureCoef * meanPressure / N_density * reach * reach / r * diff;   // callers pass r = length(diff)
+}
+
+float2 CalculateLapVelocity(float r, float2 P_velocity, float2 N_velocity, float N_density)
+{
+    // Laplacian of the viscosity kernel:
+    //   W_viscosity(r, h) = 15 / (2 * pi * h^3) * (-r^3 / (2 * h^3) + r^2 / h^2 + h / (2 * r) - 1)
+    //   LAPLACIAN( W_viscosity(r, h) ) = 45 / (pi * h^6) * (h - r)
+    // with g_fLapViscosityCoef = fParticleMass * fViscosity * 45.0f / (PI * fSmoothlen^6)
+    const float reach = g_fSmoothlen - r;
+    const float2 relativeVelocity = N_velocity - P_velocity;
+    return g_fLapViscosityCoef / N_density * reach * relativeVelocity;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Simple N^2 Algorithm
+//--------------------------------------------------------------------------------------
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void ForceCS_Simple( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const unsigned int P_ID = DTid.x; // Particle ID to operate on
+    
+    float2 P_position = ParticlesRO[P_ID].position;
+    float2 P_velocity = ParticlesRO[P_ID].velocity;
+    float P_density = ParticlesDensityRO[P_ID].density;
+    float P_pressure = CalculatePressure(P_density);
+    
+    const float h_sq = g_fSmoothlen * g_fSmoothlen;
+    
+    float2 acceleration = float2(0, 0);
+
+    // Calculate the acceleration based on all neighbors (brute force over every particle)
+    for (uint N_ID = 0 ; N_ID < g_iNumParticles ; N_ID++)
+    {
+        float2 N_position = ParticlesRO[N_ID].position;
+        
+        float2 diff = N_position - P_position;
+        float r_sq = dot(diff, diff);
+        if (r_sq < h_sq && P_ID != N_ID) // inside the smoothing radius and not the particle itself
+        {
+            float2 N_velocity = ParticlesRO[N_ID].velocity;
+            float N_density = ParticlesDensityRO[N_ID].density;
+            float N_pressure = CalculatePressure(N_density);
+            float r = sqrt(r_sq); // NOTE(review): r is 0 for two distinct particles at the same position, and CalculateGradPressure divides by r
+
+            // Pressure Term
+            acceleration += CalculateGradPressure(r, P_pressure, N_pressure, N_density, diff);
+            
+            // Viscosity Term
+            acceleration += CalculateLapVelocity(r, P_velocity, N_velocity, N_density);
+        }
+    }
+    
+    ParticlesForcesRW[P_ID].acceleration = acceleration / P_density; // NOTE(review): assumes P_density is non-zero - confirm
+}
+
+
+//--------------------------------------------------------------------------------------
+// Shared Memory Optimized N^2 Algorithm
+//--------------------------------------------------------------------------------------
+
+groupshared struct { float2 position; float2 velocity; float density; } force_shared_pos[SIMULATION_BLOCK_SIZE];    // one tile of neighbor data, one slot per thread
+
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void ForceCS_Shared( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const unsigned int P_ID = DTid.x; // Particle ID to operate on
+    
+    float2 P_position = ParticlesRO[P_ID].position;
+    float2 P_velocity = ParticlesRO[P_ID].velocity;
+    float P_density = ParticlesDensityRO[P_ID].density;
+    float P_pressure = CalculatePressure(P_density);
+    
+    const float h_sq = g_fSmoothlen * g_fSmoothlen;
+    
+    float2 acceleration = float2(0, 0);
+
+    // Calculate the acceleration based on all neighbors, one tile of SIMULATION_BLOCK_SIZE particles at a time
+    [loop]
+    for (uint N_block_ID = 0 ; N_block_ID < g_iNumParticles ; N_block_ID += SIMULATION_BLOCK_SIZE)
+    {
+        // Cache a tile of particles onto shared memory to increase IO efficiency (each thread loads one slot)
+        force_shared_pos[GI].position = ParticlesRO[N_block_ID + GI].position;
+        force_shared_pos[GI].velocity = ParticlesRO[N_block_ID + GI].velocity;
+        force_shared_pos[GI].density = ParticlesDensityRO[N_block_ID + GI].density;
+       
+        GroupMemoryBarrierWithGroupSync();        // the whole tile is loaded before any thread reads it
+
+        [loop]
+        for (uint N_tile_ID = 0; N_tile_ID < SIMULATION_BLOCK_SIZE; N_tile_ID++ ) // NOTE(review): always visits the full tile, even past g_iNumParticles - confirm the count is a multiple of the block size
+        {
+            uint N_ID = N_block_ID + N_tile_ID; // global particle index of this tile slot, used only for the self test below
+            float2 N_position = force_shared_pos[N_tile_ID].position;
+            
+            float2 diff = N_position - P_position;
+            float r_sq = dot(diff, diff);
+            if (r_sq < h_sq && P_ID != N_ID) // inside the smoothing radius and not the particle itself
+            {
+                float2 N_velocity = force_shared_pos[N_tile_ID].velocity;
+                float N_density = force_shared_pos[N_tile_ID].density;
+                float N_pressure = CalculatePressure(N_density);
+                float r = sqrt(r_sq); // NOTE(review): r is 0 for two distinct particles at the same position, and CalculateGradPressure divides by r
+
+                // Pressure Term
+                acceleration += CalculateGradPressure(r, P_pressure, N_pressure, N_density, diff);
+                
+                // Viscosity Term
+                acceleration += CalculateLapVelocity(r, P_velocity, N_velocity, N_density);
+            }
+        }        
+        
+        GroupMemoryBarrierWithGroupSync();    // every thread is done with the tile before it is overwritten
+    }
+    
+    ParticlesForcesRW[P_ID].acceleration = acceleration / P_density; // NOTE(review): assumes P_density is non-zero - confirm
+}
+
+
+//--------------------------------------------------------------------------------------
+// Optimized Grid + Sort Algorithm
+//--------------------------------------------------------------------------------------
+// ForceCS_Grid: one thread per particle; sums pressure and viscosity terms only over particles listed in the grid cells around its own cell.
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void ForceCS_Grid( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const unsigned int P_ID = DTid.x; // Particle ID to operate on
+    
+    float2 P_position = ParticlesRO[P_ID].position;
+    float2 P_velocity = ParticlesRO[P_ID].velocity;
+    float P_density = ParticlesDensityRO[P_ID].density;
+    float P_pressure = CalculatePressure(P_density);
+    
+    const float h_sq = g_fSmoothlen * g_fSmoothlen; // squared smoothing length, compared against squared distances below
+    
+    float2 acceleration = float2(0, 0);
+    
+    // Calculate the acceleration based on neighbors from the 8 adjacent cells + current cell
+    int2 G_XY = (int2)GridCalculateCell( P_position );
+    for (int Y = max(G_XY.y - 1, 0) ; Y <= min(G_XY.y + 1, 255) ; Y++) // rows around the particle's cell, clamped to [0, 255]
+    {
+        for (int X = max(G_XY.x - 1, 0) ; X <= min(G_XY.x + 1, 255) ; X++) // columns around the particle's cell, clamped to [0, 255]
+        {
+            unsigned int G_CELL = GridConstuctKey(uint2(X, Y));
+            uint2 G_START_END = GridIndicesRO[G_CELL]; // x = first particle index of the cell, y = one past the last (see loop bounds)
+            for (unsigned int N_ID = G_START_END.x ; N_ID < G_START_END.y ; N_ID++)
+            {
+                float2 N_position = ParticlesRO[N_ID].position;
+                
+                float2 diff = N_position - P_position;
+                float r_sq = dot(diff, diff);
+                if (r_sq < h_sq && P_ID != N_ID) // inside the smoothing radius and not the particle itself
+                {
+                    float2 N_velocity = ParticlesRO[N_ID].velocity;
+                    float N_density = ParticlesDensityRO[N_ID].density;
+                    float N_pressure = CalculatePressure(N_density);
+                    float r = sqrt(r_sq);
+
+                    // Pressure Term
+                    acceleration += CalculateGradPressure(r, P_pressure, N_pressure, N_density, diff);
+                    
+                    // Viscosity Term
+                    acceleration += CalculateLapVelocity(r, P_velocity, N_velocity, N_density);
+                }
+            }
+        }
+    }
+    // Store the accumulated sum divided by this particle's own density
+    ParticlesForcesRW[P_ID].acceleration = acceleration / P_density;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Integration
+//--------------------------------------------------------------------------------------
+// IntegrateCS: one thread per particle; adds wall and gravity terms to the stored acceleration, then advances velocity and position by g_fTimeStep.
+[numthreads(SIMULATION_BLOCK_SIZE, 1, 1)]
+void IntegrateCS( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    const unsigned int P_ID = DTid.x; // Particle ID to operate on
+    
+    float2 position = ParticlesRO[P_ID].position;
+    float2 velocity = ParticlesRO[P_ID].velocity;
+    float2 acceleration = ParticlesForcesRO[P_ID].acceleration;
+    
+    // Apply the forces from the map walls (four planes taken from g_vPlanes)
+    [unroll]
+    for (unsigned int i = 0 ; i < 4 ; i++)
+    {
+        float dist = dot(float3(position, 1), g_vPlanes[i]); // plane i evaluated at the particle: .xy weights position, third component multiplies 1
+        acceleration += min(dist, 0) * -g_fWallStiffness * g_vPlanes[i].xy; // contributes only when dist is negative
+    }
+    
+    // Apply gravity
+    acceleration += g_vGravity.xy;
+    
+    // Integrate: velocity is advanced first, and the updated velocity is then used to advance position
+    velocity += g_fTimeStep * acceleration;
+    position += g_fTimeStep * velocity;
+    
+    // Update
+    ParticlesRW[P_ID].position = position;
+    ParticlesRW[P_ID].velocity = velocity;
+}
diff --git a/tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl b/tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl
new file mode 100644
index 000000000..d7e24b7bc
--- /dev/null
+++ b/tests/hlsl/dxsdk/FluidCS11/FluidRender.hlsl
@@ -0,0 +1,112 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry ParticleVS -profile gs_4_0 -entry ParticleGS -profile ps_4_0 -entry ParticlePS
+//--------------------------------------------------------------------------------------
+// File: FluidRender.hlsl
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Particle Rendering
+//--------------------------------------------------------------------------------------
+// Element layout of ParticlesRO; ParticleVS in this file reads only the position field.
+struct Particle {
+    float2 position;
+    float2 velocity;
+};
+// Element layout of ParticleDensityRO; ParticleVS turns the density value into the particle color.
+struct ParticleDensity {
+    float density;
+};
+// Read-only per-particle inputs; ParticleVS indexes both buffers with SV_VertexID.
+StructuredBuffer<Particle> ParticlesRO : register( t0 );
+StructuredBuffer<ParticleDensity> ParticleDensityRO : register( t1 );
+// Rendering constants; both members are consumed by ParticleGS.
+cbuffer cbRenderConstants : register( b0 )
+{
+    matrix g_mViewProjection; // applied as mul(position, g_mViewProjection)
+    float g_fParticleSize;    // scales the per-corner offsets of each sprite quad
+};
+// Output of ParticleVS and point input of ParticleGS.
+struct VSParticleOut
+{
+    float2 position : POSITION; // copied unchanged from ParticlesRO
+    float4 color : COLOR;       // result of VisualizeNumber on the particle density
+};
+// Per-vertex output of ParticleGS and input of ParticlePS.
+struct GSParticleOut
+{
+    float4 position : SV_Position; // corner position after mul with g_mViewProjection
+    float4 color : COLOR;          // particle color, identical on all four corners
+    float2 texcoord : TEXCOORD;    // corner coordinate from g_texcoords; not read by ParticlePS
+};
+
+
+//--------------------------------------------------------------------------------------
+// Visualization Helper
+//--------------------------------------------------------------------------------------
+// Five-entry color ramp indexed by VisualizeNumber.
+static const float4 Rainbow[5] = {
+    float4(1, 0, 0, 1), // red
+    float4(1, 1, 0, 1), // yellow
+    float4(0, 1, 0, 1), // green
+    float4(0, 1, 1, 1), // cyan
+    float4(0, 0, 1, 1), // blue
+};
+// Maps n to a color by blending the Rainbow entries at floor(4n) and ceil(4n); both indices stay within Rainbow[0..4] only for n in [0, 1].
+float4 VisualizeNumber(float n)
+{
+    return lerp( Rainbow[ floor(n * 4.0f) ], Rainbow[ ceil(n * 4.0f) ], frac(n * 4.0f) );
+}
+// Range overload: rescales n from [lower, upper] to [0, 1], saturates it, and forwards to the one-argument overload.
+float4 VisualizeNumber(float n, float lower, float upper)
+{
+    return VisualizeNumber( saturate( (n - lower) / (upper - lower) ) );
+}
+
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+// ParticleVS: fetches particle ID from the structured buffers and outputs its position plus a density-derived color.
+VSParticleOut ParticleVS(uint ID : SV_VertexID)
+{
+    VSParticleOut Out = (VSParticleOut)0;
+    Out.position = ParticlesRO[ID].position;
+    Out.color = VisualizeNumber(ParticleDensityRO[ID].density, 1000.0f, 2000.0f); // densities 1000..2000 span the whole color ramp
+    return Out;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Particle Geometry Shader
+//--------------------------------------------------------------------------------------
+// Corner offsets and matching texture coordinates for the four vertices ParticleGS emits per particle.
+static const float2 g_positions[4] = { float2(-1, 1), float2(1, 1), float2(-1, -1), float2(1, -1) };
+static const float2 g_texcoords[4] = { float2(0, 1), float2(1, 1), float2(0, 0), float2(1, 0) };
+// ParticleGS: expands each input point into a four-vertex triangle strip centered on the particle position.
+[maxvertexcount(4)]
+void ParticleGS(point VSParticleOut In[1], inout TriangleStream<GSParticleOut> SpriteStream)
+{
+    [unroll]
+    for (int i = 0; i < 4; i++)
+    {
+        GSParticleOut Out = (GSParticleOut)0;
+        float4 position = float4(In[0].position, 0, 1) + g_fParticleSize * float4(g_positions[i], 0, 0); // corner i, offset in the xy plane
+        Out.position = mul(position, g_mViewProjection);
+        Out.color = In[0].color;
+        Out.texcoord = g_texcoords[i];
+        SpriteStream.Append(Out);
+    }
+    SpriteStream.RestartStrip(); // end the strip so the four vertices form one separate quad
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+// ParticlePS: returns the interpolated particle color unchanged; In.texcoord is not used.
+float4 ParticlePS(GSParticleOut In) : SV_Target
+{
+    return In.color;
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl
new file mode 100644
index 000000000..87bad46ed
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/BrightPassAndHorizFilterCS.hlsl
@@ -0,0 +1,64 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//--------------------------------------------------------------------------------------
+// File: BrightPassAndHorizFilterCS.hlsl
+//
+// The CS for bright pass and horizontal blur, used in CS path of 
+// HDRToneMappingCS11 sample
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+static const float  MIDDLE_GRAY = 0.72f; // numerator of the exposure scale MIDDLE_GRAY / (fLum + 0.001f)
+static const float  LUM_WHITE = 1.5f; // divisor in the (1 + color/LUM_WHITE) tone-mapping factor
+static const float  BRIGHT_THRESHOLD = 0.5f; // subtracted from the color before it is clamped at 0
+// Input color texture, luminance buffer (only element 0 is read) and the output buffer.
+Texture2D Input : register( t0 ); 
+StructuredBuffer<float> lum : register( t1 );
+RWStructuredBuffer<float4> Result : register( u0 );
+// Filter parameters used by CSMain.
+cbuffer cb0
+{
+    float4  g_avSampleWeights[15]; // one weight per blur tap (15 = 2 * kernelhalf + 1)
+    uint    g_outputwidth;         // width bound for writes and row pitch of Result
+    float   g_inverse;             // multiplied with lum[0] to form fLum
+    int2    g_inputsize;           // used to clamp the texel coordinate loaded from Input
+}
+// kernelhalf: blur radius in samples; groupthreads: threads per group and size of the shared staging array.
+#define kernelhalf 7
+#define groupthreads 128
+groupshared float4 temp[groupthreads];
+// CSMain: per thread, loads one texel and applies bright pass + tone mapping; interior threads then write a 15-tap horizontal blur.
+[numthreads( groupthreads, 1, 1 )]
+void CSMain( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex )
+{
+    int2 coord = int2( GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x, Gid.y ); // groups advance by groupthreads - 2*kernelhalf columns, so neighbors overlap
+    coord = coord.xy * 8 + int2(4, 3); // scale to an Input texel: times 8 plus a fixed (4, 3) offset
+    coord = clamp( coord, int2(0, 0), int2(g_inputsize.x-1, g_inputsize.y-1) );
+    float4 vColor = Input.Load( int3(coord, 0) );
+
+    float fLum = lum[0]*g_inverse;
+
+    // Bright pass and tone mapping
+    vColor = max( 0.0f, vColor - BRIGHT_THRESHOLD );
+    vColor *= MIDDLE_GRAY / (fLum + 0.001f);
+    vColor *= (1.0f + vColor/LUM_WHITE);
+    vColor /= (1.0f + vColor);
+
+    temp[GI] = vColor;
+    // All threads must have stored their sample before the blur reads neighboring entries
+    GroupMemoryBarrierWithGroupSync();
+
+    // Horizontal blur: only threads with kernelhalf neighbors on both sides and an output column below g_outputwidth write
+    if ( GI >= kernelhalf && 
+         GI < (groupthreads - kernelhalf) && 
+         ( (Gid.x * (groupthreads - 2 * kernelhalf) + GI - kernelhalf) < g_outputwidth) )
+    {
+        float4 vOut = 0;
+        
+        [unroll]
+        for ( int i = -kernelhalf; i <= kernelhalf; ++i )
+            vOut += temp[GI + i] * g_avSampleWeights[i + kernelhalf];
+        // Row Gid.y, column = group start + (GI - kernelhalf); alpha is forced to 1
+        Result[GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x + Gid.y * g_outputwidth] = float4(vOut.rgb, 1.0f);
+    }
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl
new file mode 100644
index 000000000..d2d9611ce
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/DumpToTexture.hlsl
@@ -0,0 +1,29 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PSDump
+//--------------------------------------------------------------------------------------
+// File: DumpToTexture.hlsl
+//
+// The PS for converting CS output buffer to a texture, used in CS path of 
+// HDRToneMappingCS11 sample
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+StructuredBuffer<float4> buffer : register( t0 ); // linear color buffer read by PSDump
+// Pixel shader input; PSDump reads only Pos.
+struct QuadVS_Output
+{
+    float4 Pos : SV_POSITION;              
+    float2 Tex : TEXCOORD0;
+};
+// PSDump uses g_param.x as the row pitch of the buffer; the other components are not read in this file.
+cbuffer cbPS : register( b0 )
+{
+    uint4    g_param;   
+};
+// PSDump: returns the buffer element addressed by this pixel's screen position, with g_param.x as the row pitch.
+float4 PSDump( QuadVS_Output Input ) : SV_TARGET
+{
+    // To calculate the buffer offset, it is natural to use the screen space coordinates,
+    // Input.Pos is the screen space coordinates of the pixel being written (0.5 is subtracted from x and y before indexing)
+    return buffer[ (Input.Pos.x - 0.5) + (Input.Pos.y - 0.5) * g_param.x ];	
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl
new file mode 100644
index 000000000..09c91669a
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/FilterCS.hlsl
@@ -0,0 +1,73 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSVerticalFilter -entry CSHorizFilter
+//--------------------------------------------------------------------------------------
+// File: FilterCS.hlsl
+//
+// The CSs for doing vertical and horizontal blur, used in CS path of 
+// HDRToneMappingCS11 sample
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+StructuredBuffer<float4> InputBuf : register( t0 ); // source of CSVerticalFilter
+Texture2D InputTex : register( t1 ); // source of CSHorizFilter
+RWStructuredBuffer<float4> Result : register( u0 ); // destination of both filters
+// Filter parameters shared by both entry points.
+cbuffer cb0
+{
+    float4  g_avSampleWeights[15]; // one weight per blur tap (15 = 2 * kernelhalf + 1)
+    int2    g_outputsize;          // bounds for writes; .x is also the row pitch of Result
+    int2    g_inputsize;           // clamp limits for reads; .x is also the row pitch of InputBuf
+}
+// kernelhalf: blur radius in samples; groupthreads: threads per group and size of the shared staging array.
+#define kernelhalf 7
+#define groupthreads 128
+groupshared float4 temp[groupthreads];
+// CSVerticalFilter: group (Gid.x, Gid.y) blurs one vertical segment of column Gid.x from InputBuf into Result.
+[numthreads( groupthreads, 1, 1 )]
+void CSVerticalFilter( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex )
+{
+    int offsety = GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.y; // groups advance by groupthreads - 2*kernelhalf rows, so neighbors overlap
+    offsety = clamp( offsety, 0, g_inputsize.y-1 );
+    int offset = Gid.x + offsety * g_inputsize.x; // row-major index into InputBuf
+    temp[GI] = InputBuf[offset];
+    // All threads must have stored their sample before the blur reads neighboring entries
+    GroupMemoryBarrierWithGroupSync();
+
+    // Vertical blur: only threads with kernelhalf neighbors on both sides and an output row below g_outputsize.y write
+    if ( GI >= kernelhalf && 
+         GI < (groupthreads - kernelhalf) && 
+         ( (GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.y) < g_outputsize.y) )
+    {
+        float4 vOut = 0;
+        
+        [unroll]
+        for ( int i = -kernelhalf; i <= kernelhalf; ++i )
+            vOut += temp[GI + i] * g_avSampleWeights[i + kernelhalf];
+        // Column Gid.x, row = group start + (GI - kernelhalf); alpha is forced to 1
+        Result[Gid.x + (GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.y) * g_outputsize.x] = float4(vOut.rgb, 1.0f);
+    }
+}
+// CSHorizFilter: group (Gid.x, Gid.y) blurs one horizontal segment of row Gid.y from InputTex into Result.
+[numthreads( groupthreads, 1, 1 )]
+void CSHorizFilter( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex )
+{
+    int2 coord = int2( GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x, Gid.y ); // groups advance by groupthreads - 2*kernelhalf columns, so neighbors overlap
+    coord = clamp( coord, int2(0, 0), int2(g_inputsize.x-1, g_inputsize.y-1) );
+    temp[GI] = InputTex.Load( int3(coord, 0) );        
+    // All threads must have stored their sample before the blur reads neighboring entries
+    GroupMemoryBarrierWithGroupSync();
+
+    // Horizontal blur: only threads with kernelhalf neighbors on both sides and an output column below g_outputsize.x write
+    if ( GI >= kernelhalf && 
+         GI < (groupthreads - kernelhalf) && 
+         ( (Gid.x * (groupthreads - 2 * kernelhalf) + GI - kernelhalf) < g_outputsize.x) )
+    {
+        float4 vOut = 0;
+        
+        [unroll]
+        for ( int i = -kernelhalf; i <= kernelhalf; ++i )
+            vOut += temp[GI + i] * g_avSampleWeights[i + kernelhalf];
+        // Row Gid.y, column = group start + (GI - kernelhalf); alpha is forced to 1
+        Result[GI - kernelhalf + (groupthreads - kernelhalf * 2) * Gid.x + Gid.y * g_outputsize.x] = float4(vOut.rgb, 1.0f);
+    }
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl
new file mode 100644
index 000000000..a4673c237
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/FinalPass.hlsl
@@ -0,0 +1,79 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry QuadVS -profile ps_4_0 -entry PSFinalPass -entry PSFinalPassForCPUReduction
+//--------------------------------------------------------------------------------------
+// File: FinalPass.hlsl
+//
+// The PSs for doing tone-mapping based on the input luminance, used in CS path of 
+// HDRToneMappingCS11 sample
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+struct QuadVS_Input // vertex input of QuadVS
+{
+    float4 Pos : POSITION;
+    float2 Tex : TEXCOORD0;
+};
+// Output of QuadVS and input of both pixel shaders in this file.
+struct QuadVS_Output
+{
+    float4 Pos : SV_POSITION;              
+    float2 Tex : TEXCOORD0;
+};
+// QuadVS: pass-through vertex shader; copies position and texture coordinate without any transform.
+QuadVS_Output QuadVS( QuadVS_Input Input )
+{
+    QuadVS_Output Output;
+    Output.Pos = Input.Pos;
+    Output.Tex = Input.Tex;
+    return Output;
+}
+// Inputs of the final pass: scene color, luminance buffer (only element 0 is read) and bloom texture.
+Texture2D<float4> tex : register( t0 );
+StructuredBuffer<float> lum : register( t1 );
+Texture2D<float4> bloom : register( t2 );
+// PointSampler is used for tex, LinearSampler for bloom.
+SamplerState PointSampler : register (s0);
+SamplerState LinearSampler : register (s1);
+
+
+static const float  MIDDLE_GRAY = 0.72f; // numerator of the exposure scale MIDDLE_GRAY / (fLum + 0.001f)
+static const float  LUM_WHITE = 1.5f; // divisor in the (1 + color/LUM_WHITE) tone-mapping factor
+// Only g_param.x is read: a multiplier on lum[0] in PSFinalPass, the luminance itself in PSFinalPassForCPUReduction.
+cbuffer cbPS : register( b0 )
+{
+    float4    g_param;   
+};
+// PSFinalPass: tone-maps the scene color using the luminance lum[0] * g_param.x and adds the bloom contribution.
+float4 PSFinalPass( QuadVS_Output Input ) : SV_TARGET
+{
+    float4 vColor = tex.Sample( PointSampler, Input.Tex );
+    float fLum = lum[0]*g_param.x;
+    float3 vBloom = bloom.Sample( LinearSampler, Input.Tex ); // float4 sample implicitly truncated to float3
+
+    // Tone mapping (the float4 right-hand sides below are implicitly truncated to match .rgb)
+    vColor.rgb *= MIDDLE_GRAY / (fLum + 0.001f);
+    vColor.rgb *= (1.0f + vColor/LUM_WHITE);
+    vColor.rgb /= (1.0f + vColor);
+    // Add bloom with a fixed 0.6 weight and force opaque alpha
+    vColor.rgb += 0.6f * vBloom;
+    vColor.a = 1.0f;
+
+    return vColor;
+}
+// PSFinalPassForCPUReduction: same tone mapping and bloom as PSFinalPass, but the luminance is taken directly from g_param.x.
+float4 PSFinalPassForCPUReduction( QuadVS_Output Input ) : SV_TARGET
+{
+    float4 vColor = tex.Sample( PointSampler, Input.Tex );
+    float fLum = g_param.x; // the lum buffer is not read in this variant
+    float3 vBloom = bloom.Sample( LinearSampler, Input.Tex ); // float4 sample implicitly truncated to float3
+
+    // Tone mapping (the float4 right-hand sides below are implicitly truncated to match .rgb)
+    vColor.rgb *= MIDDLE_GRAY / (fLum + 0.001f);
+    vColor.rgb *= (1.0f + vColor/LUM_WHITE);
+    vColor.rgb /= (1.0f + vColor);
+    // Add bloom with a fixed 0.6 weight and force opaque alpha
+    vColor.rgb += 0.6f * vBloom;
+    vColor.a = 1.0f;
+    
+    return vColor;
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl
new file mode 100644
index 000000000..2b18cf0a1
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/PSApproach.hlsl
@@ -0,0 +1,129 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry DownScale2x2_Lum -entry DownScale3x3 -entry FinalPass -entry DownScale3x3_BrightPass -entry Bloom
+//--------------------------------------------------------------------------------------
+// File: PSApproach.hlsl
+//
+// The PSs for doing post-processing, used in PS path of 
+// HDRToneMappingCS11 sample
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+static const float4 LUM_VECTOR = float4(.299, .587, .114, 0); // per-channel weights dotted with a color to get luminance; alpha weight is 0
+static const float  MIDDLE_GRAY = 0.72f; // numerator of the exposure scale MIDDLE_GRAY / (luminance + 0.001f)
+static const float  LUM_WHITE = 1.5f; // divisor in the (1 + color/LUM_WHITE) tone-mapping factor
+static const float  BRIGHT_THRESHOLD = 0.5f; // subtracted from the color before it is clamped at 0
+// Samplers shared by all pixel shaders in this file.
+SamplerState PointSampler : register (s0);
+SamplerState LinearSampler : register (s1);
+// Input of every pixel shader in this file; only Tex is read.
+struct QuadVS_Output
+{
+    float4 Pos : SV_POSITION;              
+    float2 Tex : TEXCOORD0;
+};
+// Input textures; FinalPass reads s0 as scene color, s1 as luminance (sampled at (0,0)) and s2 as bloom.
+Texture2D s0 : register(t0);
+Texture2D s1 : register(t1);
+Texture2D s2 : register(t2);
+// DownScale2x2_Lum: averages the luminance of four point samples at texel offsets (-1..0, -1..0) and replicates it to RGB.
+float4 DownScale2x2_Lum ( QuadVS_Output Input ) : SV_TARGET
+{    
+    float4 vColor = 0.0f;
+    float  fAvg = 0.0f;
+    
+    for( int y = -1; y < 1; y++ )
+    {
+        for( int x = -1; x < 1; x++ )
+        {
+            // Fetch the sample at integer texel offset (x, y) and accumulate its luminance
+            vColor = s0.Sample( PointSampler, Input.Tex, int2(x,y) );                       
+                
+            fAvg += dot( vColor, LUM_VECTOR );
+        }
+    }
+    // Divide the sum of the four samples to complete the average
+    fAvg /= 4;
+    
+    return float4(fAvg, fAvg, fAvg, 1.0f);
+}
+// DownScale3x3: averages the red channel of nine point samples at texel offsets (-1..1, -1..1) and replicates it to RGB.
+float4 DownScale3x3( QuadVS_Output Input ) : SV_TARGET
+{
+    float fAvg = 0.0f; 
+    float4 vColor;
+    
+    for( int y = -1; y <= 1; y++ )
+    {
+        for( int x = -1; x <= 1; x++ )
+        {
+            // Fetch the sample at integer texel offset (x, y); only its red channel is accumulated
+            vColor = s0.Sample( PointSampler, Input.Tex, int2(x,y) );
+                        
+            fAvg += vColor.r; 
+        }
+    }
+    
+    // Divide the sum to complete the average
+    fAvg /= 9;
+    
+    return float4(fAvg, fAvg, fAvg, 1.0f);
+}
+// FinalPass: tone-maps the s0 color using the red channel of s1 at (0,0) as luminance, then adds bloom from s2.
+float4 FinalPass( QuadVS_Output Input ) : SV_TARGET
+{   
+    //float4 vColor = 0;
+    float4 vColor = s0.Sample( PointSampler, Input.Tex );
+    float4 vLum = s1.Sample( PointSampler, float2(0,0) );
+    float3 vBloom = s2.Sample( LinearSampler, Input.Tex );       
+    
+    // Tone mapping (the float4 right-hand sides below are implicitly truncated to match .rgb)
+    vColor.rgb *= MIDDLE_GRAY / (vLum.r + 0.001f);
+    vColor.rgb *= (1.0f + vColor/LUM_WHITE);
+    vColor.rgb /= (1.0f + vColor);
+    // Add bloom with a fixed 0.6 weight and force opaque alpha
+    vColor.rgb += 0.6f * vBloom;
+    vColor.a = 1.0f;    
+    
+    return vColor;
+}
+// DownScale3x3_BrightPass: despite the name, takes a single point sample of s0, then applies the bright-pass threshold and tone mapping.
+float4 DownScale3x3_BrightPass( QuadVS_Output Input ) : SV_TARGET
+{   
+    float3 vColor = 0.0f;
+    float4 vLum = s1.Sample( PointSampler, float2(0, 0) );
+    float  fLum = vLum.r; // luminance is the red channel of s1 at (0,0)
+
+    vColor = s0.Sample( PointSampler, Input.Tex ).rgb;          
+ 
+    // Bright pass and tone mapping
+    vColor = max( 0.0f, vColor - BRIGHT_THRESHOLD );
+    vColor *= MIDDLE_GRAY / (fLum + 0.001f);
+    vColor *= (1.0f + vColor/LUM_WHITE);
+    vColor /= (1.0f + vColor);
+    
+    return float4(vColor, 1.0f);
+}
+// Tap table for Bloom: entry i pairs a texture-coordinate offset with its weight.
+cbuffer cb0
+{
+    float2 g_avSampleOffsets[15];
+    float4 g_avSampleWeights[15];
+}
+// Bloom: weighted sum of 15 point samples of s0 taken at Input.Tex plus each entry of g_avSampleOffsets.
+float4 Bloom( QuadVS_Output Input ) : SV_TARGET
+{    
+    float4 vSample = 0.0f;
+    float4 vColor = 0.0f;
+    float2 vSamplePosition;
+    
+    for( int iSample = 0; iSample < 15; iSample++ )
+    {
+        // Sample from adjacent points
+        vSamplePosition = Input.Tex + g_avSampleOffsets[iSample];
+        vColor = s0.Sample( PointSampler, vSamplePosition);
+        // Accumulate with a per-channel (float4) weight
+        vSample += g_avSampleWeights[iSample]*vColor;
+    }
+    
+    return vSample;
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl
new file mode 100644
index 000000000..027838743
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceTo1DCS.hlsl
@@ -0,0 +1,72 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//-----------------------------------------------------------------------------
+// File: ReduceTo1DCS.hlsl
+//
+// Desc: Reduce an input Texture2D to a buffer
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+Texture2D Input : register( t0 ); 
+RWStructuredBuffer<float> Result : register( u0 );
+// Result receives one partial sum per thread group, at index Gid.y * g_param.x + Gid.x.
+cbuffer cbCS : register( b0 )
+{
+    uint4    g_param;   // (g_param.x, g_param.y) is the x and y dimensions of the Dispatch call
+                        // (g_param.z, g_param.w) is the size of the above Input Texture2D
+};
+
+//#define CS_FULL_PIXEL_REDUCITON // Defining this or not must be the same as in HDRToneMappingCS11.cpp
+// Thread-group shape: blocksize x blocksizeY threads, with one accum slot per thread.
+#define blocksize 8
+#define blocksizeY 8
+#define groupthreads (blocksize*blocksizeY)
+groupshared float accum[groupthreads];
+// Weights dotted with the loaded color to form its luminance; the alpha weight is 0.
+static const float4 LUM_VECTOR = float4(.299, .587, .114, 0);
+// CSMain: each thread loads color from Input and converts it to luminance; each 64-thread group then sums its values into one Result element.
+[numthreads(blocksize,blocksizeY,1)]
+void CSMain( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{    
+    float4 s = 
+#ifdef CS_FULL_PIXEL_REDUCITON
+        Input.Load( uint3(DTid.xy                                                   , 0) )+ 
+        Input.Load( uint3(DTid.xy + uint2(blocksize*g_param.x,                    0), 0) ) +
+        Input.Load( uint3(DTid.xy + uint2(0,                   blocksizeY*g_param.y), 0) ) + 
+        Input.Load( uint3(DTid.xy + uint2(blocksize*g_param.x, blocksizeY*g_param.y), 0) );
+#else
+        Input.Load( uint3((float)DTid.x/81.0f*g_param.z, (float)DTid.y/81.0f*g_param.w, 0) ); // NOTE(review): 81 presumably matches the host-side thread count per axis - confirm in HDRToneMappingCS11.cpp
+#endif
+        // Each thread stores the luminance of what it loaded
+    accum[GI] = dot( s, LUM_VECTOR );
+
+    // Parallel reduction algorithm follows: each step halves the active threads, with a barrier before every step
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 32 )
+        accum[GI] += accum[32+GI];
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 16 )
+        accum[GI] += accum[16+GI];
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 8 )
+        accum[GI] += accum[8+GI];
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 4 )
+        accum[GI] += accum[4+GI];
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 2 )
+        accum[GI] += accum[2+GI];
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 1 )
+        accum[GI] += accum[1+GI];
+    // No barrier is needed here: thread 0 produced accum[0] itself in the step above
+    if ( GI == 0 )
+    {                
+        Result[Gid.y*g_param.x+Gid.x] = accum[0];
+    }
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl
new file mode 100644
index 000000000..cf506283e
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/ReduceToSingleCS.hlsl
@@ -0,0 +1,63 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//-----------------------------------------------------------------------------
+// File: ReduceToSingleCS.hlsl
+//
+// Desc: Reduce an input buffer by a factor of groupthreads
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+
+StructuredBuffer<float> Input : register( t0 ); // values to be summed; elements at or beyond g_param.x are treated as 0
+RWStructuredBuffer<float> Result : register( u0 ); // receives one sum per thread group, at index Gid.x
+// Only g_param.x is read by CSMain in this file.
+cbuffer cbCS : register( b0 )
+{
+    uint4    g_param;   // g_param.x is the actual elements contained in Input
+                        // g_param.y is the x dimension of the Dispatch call
+};
+// groupthreads: threads per group and number of accum slots (one per thread).
+#define groupthreads 128
+groupshared float accum[groupthreads];
+// CSMain: each 128-thread group sums up to 128 consecutive Input elements and writes the total to Result[Gid.x].
+[numthreads(groupthreads,1,1)]
+void CSMain( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    if ( DTid.x < g_param.x )
+        accum[GI] = Input[DTid.x];
+    else
+        accum[GI] = 0; // threads past the end of Input contribute nothing to the sum
+
+    // Parallel reduction algorithm follows: each step halves the active threads, with a barrier before every step
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 64 )
+        accum[GI] += accum[64+GI];  
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 32 )    
+        accum[GI] += accum[32+GI];
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 16 )
+        accum[GI] += accum[16+GI];
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 8 ) 
+        accum[GI] += accum[8+GI];
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 4 )
+        accum[GI] += accum[4+GI];
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 2 )
+        accum[GI] += accum[2+GI];
+
+    GroupMemoryBarrierWithGroupSync();
+    if ( GI < 1 )
+        accum[GI] += accum[1+GI];
+    // No barrier is needed here: thread 0 produced accum[0] itself in the step above
+    if ( GI == 0 )
+    {        
+        Result[Gid.x] = accum[0];
+    }
+}
diff --git a/tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl b/tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl
new file mode 100644
index 000000000..2728665e2
--- /dev/null
+++ b/tests/hlsl/dxsdk/HDRToneMappingCS11/skybox11.hlsl
@@ -0,0 +1,44 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry SkyboxVS -profile ps_4_0 -entry SkyboxPS
+//-----------------------------------------------------------------------------
+// File: SkyBox11.hlsl
+//
+// Desc: 
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+// Per-object constants and the cube map resources used by SkyboxVS / SkyboxPS.
+cbuffer cbPerObject : register( b0 )
+{
+    row_major matrix    g_mWorldViewProjection	: packoffset( c0 );
+}
+// SkyboxPS samples g_EnvironmentTexture through g_sam.
+TextureCube	g_EnvironmentTexture : register( t0 );
+SamplerState g_sam : register( s0 );
+// Vertex input of SkyboxVS: position only.
+struct SkyboxVS_Input
+{
+    float4 Pos : POSITION;
+};
+// Output of SkyboxVS and input of SkyboxPS.
+struct SkyboxVS_Output
+{
+    float4 Pos : SV_POSITION; // input position passed through unchanged
+    float3 Tex : TEXCOORD0;   // direction used to sample the cube map
+};
+// SkyboxVS: passes the position through and derives the cube-map lookup direction from the position times g_mWorldViewProjection.
+SkyboxVS_Output SkyboxVS( SkyboxVS_Input Input )
+{
+    SkyboxVS_Output Output;
+    
+    Output.Pos = Input.Pos;
+    Output.Tex = normalize( mul(Input.Pos, g_mWorldViewProjection) ); // normalized as a float4, then implicitly truncated to float3
+    
+    return Output;
+}
+// SkyboxPS: returns the cube map color looked up along the interpolated direction Input.Tex.
+float4 SkyboxPS( SkyboxVS_Output Input ) : SV_TARGET
+{
+    float4 color = g_EnvironmentTexture.Sample( g_sam, Input.Tex );
+    return color;
+}
diff --git a/tests/hlsl/dxsdk/InstancingFX11/Instancing.fx b/tests/hlsl/dxsdk/InstancingFX11/Instancing.fx
new file mode 100644
index 000000000..3c8d45078
--- /dev/null
+++ b/tests/hlsl/dxsdk/InstancingFX11/Instancing.fx
@@ -0,0 +1,591 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: Instancing.fx
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Input and output structures 
+//--------------------------------------------------------------------------------------
+struct VSInstIn
+{
+    float3 pos : POSITION;
+    float3 norm : NORMAL;
+    float2 tex : TEXTURE0;
+    row_major float4x4 mTransform : mTransform;
+};
+
+struct VSSceneIn
+{
+    float3 pos : POSITION;
+    float3 norm : NORMAL;
+    float2 tex : TEXTURE0;
+};
+
+struct VSGrassIn
+{
+    float3 pos : POSITION;
+    float3 norm : NORMAL;
+    float2 tex : TEXTURE0;
+    row_major float4x4 mTransform : mTransform;
+    uint VertexID : SV_VertexID;
+};
+
+struct VSGrassOut
+{
+    float3 pos : POSITION;
+    float3 norm : NORMAL;
+    float2 tex : TEXTURE0;
+    uint VertexID : VERTID;
+};
+
+struct VSQuadIn
+{
+    float3 pos : POSITION;
+    float2 tex : TEXTURE0;
+    row_major float4x4 mTransform : mTransform;
+    float fOcc : fOcc;
+    uint InstanceId : SV_InstanceID;
+};
+
+struct PSSceneIn
+{
+    float4 pos : SV_Position;
+    float2 tex : TEXTURE0;
+    float4 color : COLOR0;
+};
+
+struct PSQuadIn
+{
+    float4 pos : SV_Position;
+    float3 tex : TEXTURE0;
+    float4 color : COLOR0;
+};
+
+//--------------------------------------------------------------------------------------
+// Constant buffers 
+//--------------------------------------------------------------------------------------
+cbuffer crarely
+{
+    float4x4 g_mTreeMatrices[50];
+    uint g_iNumTrees;
+};
+
+cbuffer ceveryframe
+{
+    float4x4 g_mWorldViewProj;
+    float4x4 g_mWorldView;
+};
+
+cbuffer cmultipleperframe
+{
+    float g_GrassWidth;
+    float g_GrassHeight;
+    uint g_iGrassCoverage;
+};
+
+cbuffer cusercontrolled
+{
+    float g_GrassMessiness;
+};
+
+struct light_struct
+{
+    float4 direction;
+    float4 color;
+};
+
+cbuffer cimmutable
+{
+    light_struct g_lights[4] = { 		// the 4 directional lights summed in CalcLighting
+                    { float4(0.620275,  0.683659, 0.384537, 1),  float4(0.75, 0.599, 0.405, 1) },		//sun
+                    { float4(0.063288, -0.987444, 0.144735, 1),  float4(0.192, 0.273, 0.275, 1) },		//bottom
+                    { float4(0.23007,   0.785579, -0.574422, 1),  float4(0.300, 0.292, 0.223, 1) },		//highlight
+                    { float4(-0.620275,  -0.683659, -0.384537, 1),  float4(0.0, 0.0, 0.1, 1) }			//blue rim-light
+                    };
+    // ambient term added to the summed light color in CalcLighting
+    float4 g_ambient = float4(0.4945,0.465,0.5,1);
+    // divisor applied to the vertex height for the dimming in VSInstmain
+    float g_occDimHeight = 2400.0;	//scalar that tells us how much to darken the tree near the top
+};
+
+cbuffer cgrassblade
+{
+    float3 g_positions[6] =		// blade template vertices, scaled and emitted in order by OutputGrassBlade
+    {
+        float3( -1, 0, 0 ),
+        float3( -1, 2, 0 ),
+        float3( 1, 0, 0 ),
+        float3( 1, 2, 0 ),
+        // entries 4 and 5 repeat entries 0 and 1
+        float3( -1, 0, 0 ),
+        float3( -1, 2, 0 ),
+    };
+    float2 g_texcoords[6] = 	// texture coordinates paired index-for-index with g_positions
+    { 
+        float2(0,1), 
+        float2(0,0),
+        float2(1,1),
+        float2(1,0),
+        // entries 4 and 5 repeat entries 0 and 1
+        float2(0,1),
+        float2(0,0),
+    };
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D g_txDiffuse;
+Texture2DArray g_tx2dArray;
+SamplerState g_samLinear
+{
+    Filter = ANISOTROPIC;
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+Texture1D g_txRandom;
+SamplerState g_samPoint
+{
+    Filter = MIN_MAG_MIP_POINT;
+    AddressU = Wrap;
+    AddressV = Wrap;
+};
+
+//--------------------------------------------------------------------------------------
+// State structures
+//--------------------------------------------------------------------------------------
+BlendState QuadAlphaBlendState
+{
+    AlphaToCoverageEnable = TRUE;
+    RenderTargetWriteMask[0] = 0x0F;
+};
+
+RasterizerState EnableMSAA
+{
+    CullMode = BACK;
+    MultisampleEnable = TRUE;
+};
+
+DepthStencilState DisableDepthTestWrite
+{
+    DepthEnable = FALSE;
+    DepthWriteMask = ZERO;
+};
+
+DepthStencilState EnableDepthTestWrite
+{
+    DepthEnable = TRUE;
+    DepthWriteMask = ALL;
+};
+
+BlendState NoBlending
+{
+    AlphaToCoverageEnable = FALSE;
+    BlendEnable[0] = FALSE;
+};
+
+//--------------------------------------------------------------------------------------
+// Sky vertex shader
+//--------------------------------------------------------------------------------------
+PSSceneIn VSSkymain(VSSceneIn input)
+{
+    PSSceneIn result;
+
+    //
+    // Project the vertex with the combined world-view-projection matrix
+    //
+    result.pos = mul(float4(input.pos, 1), g_mWorldViewProj);
+
+    //
+    // Texture coordinates pass through unchanged
+    //
+    result.tex = input.tex;
+
+    // Constant white vertex color (no lighting is computed here)
+    result.color = float4(1,1,1,1);
+
+    return result;
+}
+
+//--------------------------------------------------------------------------------------
+// CalcLighting helper function.  Calculates lighting from 4 light sources, adds ambient
+// and attenuates for depth.  Used by all techniques for lighting.
+//--------------------------------------------------------------------------------------
+float4 CalcLighting( float3 norm, float depth )
+{
+    // Sum the clamped N.L term of each of the 4 directional lights
+    float4 litColor = float4(0,0,0,0);
+    [unroll] for( int iLight=0; iLight<4; ++iLight )
+    {
+        float fNDotL = saturate( dot(g_lights[iLight].direction,norm) );
+        litColor += fNDotL*g_lights[iLight].color;
+    }
+
+    // Depth-based fade factor and the color that is blended in with depth
+    float fDepthFade = depth / 10000.0;
+    float4 fadeColor = float4(0.15, 0.2, 0.3, 0);
+
+    // Lights plus ambient, dimmed with depth, then the depth color added on top
+    return (1-fDepthFade*0.23)*(litColor + g_ambient) + fadeColor*fDepthFade;
+}
+
+//--------------------------------------------------------------------------------------
+// Instancing vertex shader.  Positions the vertices based upon the matrix stored
+// in the second vertex stream.
+//--------------------------------------------------------------------------------------
+PSSceneIn VSInstmain(VSInstIn input)
+{
+    PSSceneIn result;
+
+    //
+    // Apply the per-instance transform to the vertex position
+    //
+    float4 vInstPos = mul(float4(input.pos, 1), input.mTransform);
+
+    //
+    // This position is only needed for its z, which feeds the depth attenuation
+    //
+    float4 vViewPos = mul(vInstPos, g_mWorldView );
+
+    //
+    // Project the instanced position with the world-view-projection matrix
+    //
+    result.pos = mul(vInstPos, g_mWorldViewProj);
+
+    //
+    // Texture coordinates pass through unchanged
+    //
+    result.tex = input.tex;
+
+    //
+    // Transform the normal by the upper-left 3x3 of the instance matrix and light it
+    //
+    float3 vInstNorm = mul(input.norm,(float3x3)input.mTransform);
+    float4 vLit = CalcLighting( vInstNorm, vViewPos.z );
+
+    // Dim the color by how far up the tree we are.
+    // This is a nice way to fake occlusion of the branches by the leaves.
+    //
+    result.color = vLit * (1.0f - saturate(input.pos.y/g_occDimHeight));
+    return result;
+}
+
+//--------------------------------------------------------------------------------------
+// Quad (leaf) vertex shader.  Instances the quad over multiple leaf positions and 
+// multiple trees.  This demonstrates how to do double instancing.
+//--------------------------------------------------------------------------------------
+PSQuadIn VSQuadmain(VSQuadIn input)
+{
+    // Split the instance id: the quotient selects the leaf, the remainder the tree
+    uint iLeaf = input.InstanceId/g_iNumTrees;
+    int iTree = input.InstanceId - iLeaf*g_iNumTrees;
+
+    //
+    // Leaf transform first, then the matrix of the tree the leaf belongs to
+    //
+    float4 vLeafPos = mul( float4(input.pos, 1), input.mTransform  );
+    float4 vTreePos = mul(vLeafPos, g_mTreeMatrices[iTree] );
+    float4 vViewPos = mul(vTreePos, g_mWorldView );
+
+    PSQuadIn result;
+
+    // third texture coordinate cycles through 3 values with the leaf index
+    result.tex = float3(input.tex, float(iLeaf%3) );
+
+    //
+    // Project the placed leaf vertex
+    //
+    result.pos = mul(vTreePos, g_mWorldViewProj);
+
+    // occlusion value goes into rgb; the z of vViewPos is packed into alpha
+    result.color = float4(input.fOcc,input.fOcc,input.fOcc,vViewPos.z);
+    return result;
+}
+
+//--------------------------------------------------------------------------------------
+// Grass vertex shader.  Basically a passthrough except for instancing the island base
+// mesh.
+//--------------------------------------------------------------------------------------
+VSGrassOut VSGrassmain(VSGrassIn input)
+{
+    VSGrassOut result;
+    // attributes that are forwarded untouched
+    result.tex = input.tex;
+    result.VertexID = input.VertexID;
+    // position and normal go through the instance matrix (pos is narrowed to float3)
+    result.pos = mul(float4(input.pos, 1), input.mTransform);
+    result.norm = mul(input.norm, (float3x3)input.mTransform);
+    return result;
+}
+
+//--------------------------------------------------------------------------------------
+// Quad (leaf) GS.  Calculates the normal and lighting for the leaf.
+//--------------------------------------------------------------------------------------
+[maxvertexcount(3)]
+void GSQuadmain(triangle PSQuadIn input[3], inout TriangleStream<PSQuadIn> QuadStream)
+{
+    PSQuadIn output;
+
+    //
+    // Two edge vectors of the triangle, both starting at vertex 0
+    //
+    float4 faceNormalA = input[1].pos.xyzw - input[0].pos.xyzw;
+    float4 faceNormalB = input[2].pos.xyzw - input[0].pos.xyzw;
+
+    //
+    // Cross product of the two edges (the float4 operands are narrowed to float3)
+    //
+    float3 faceNormal = cross(faceNormalA, faceNormalB);
+
+    //
+    // Normalize face normal
+    //  
+    faceNormal = normalize(faceNormal);
+
+    //
+    // Light the face; the alpha of vertex 0 is used as the depth argument
+    //
+    float4 color1 = CalcLighting( faceNormal, input[0].color.a );
+    color1 *= input[0].color;
+
+    //
+    // Make sure we always have an alpha of 1
+    //  
+    color1.a = 1.0;
+
+    //
+    // Emit the triangle again, every vertex sharing the single face color
+    //
+    for(int i=0; i<3; i++)
+    {
+        output.pos = input[i].pos;
+        output.color = color1;
+        output.tex = input[i].tex;  
+        QuadStream.Append(output);
+    }
+    QuadStream.RestartStrip();
+}
+
+//--------------------------------------------------------------------------------------
+// RandomDir helper.  Samples a random dir out of our 1d random texture.  In this case
+// we use a texture because the offset could be anywhere.  If we were sampling linearly
+// then we would probably just use a buffer and load from that.
+//--------------------------------------------------------------------------------------
+float3 RandomDir(float fOffset)
+{
+    // Divide the offset by 300 to form the 1D lookup coordinate; fetch mip 0 (xyz kept)
+    return g_txRandom.SampleLevel( g_samPoint, (fOffset) / 300.0, 0 );
+}
+
+//--------------------------------------------------------------------------------------
+// Helper to determine if a point is within a triangle
+//--------------------------------------------------------------------------------------
+bool IsInTriangle( float3 P, float3 A, float3 B, float3 C )
+{
+    // For each edge, cross the edge with the vector from its start point to P
+    float3 vCrossAB = cross( B-A, P-A );
+    float3 vCrossBC = cross( C-B, P-B );
+    float3 vCrossCA = cross( A-C, P-C );
+
+    // Compare neighbouring cross products: a positive dot product means the
+    // two of them point into the same half-space
+    bool bFirstPairAgrees = dot( vCrossAB, vCrossBC ) > 0;
+    bool bSecondPairAgrees = dot( vCrossBC, vCrossCA ) > 0;
+
+    // Only two pairs are tested (AB/BC and BC/CA); both have to agree
+    // for P to be reported as inside
+    return bFirstPairAgrees && bSecondPairAgrees;
+}
+
+//--------------------------------------------------------------------------------------
+// Gets a random orientation matrix based upon the RandomDir function
+//--------------------------------------------------------------------------------------
+float4x4 GetRandomOrientation( float3 Pos, float3 Norm, float fRandOffset )
+{
+    // Random direction -> axis perpendicular to it and Norm -> axis perpendicular to that and Norm
+    float3 vRandom = RandomDir(fRandOffset);
+    float3 vBitangent = normalize( cross( vRandom, Norm ) );
+    float3 vTangent = normalize( cross( vBitangent, Norm ) );
+    // Rows are tangent, normal, bitangent and position
+    float4x4 mOrientation = { float4( vTangent, 0 ),
+                              float4( Norm, 0 ),
+                              float4( vBitangent, 0 ),
+                              float4( Pos, 1 ) };
+    return mOrientation;
+}
+
+//--------------------------------------------------------------------------------------
+// Generates an actual grass blade
+//--------------------------------------------------------------------------------------
+void OutputGrassBlade( VSGrassOut midPoint, inout TriangleStream<PSQuadIn> GrassStream, int iGrassTex )
+{
+    PSQuadIn output;
+    // Blade frame: orientation looked up with the vertex id as offset, positioned at midPoint.pos
+    float4x4 mWorld = GetRandomOrientation( midPoint.pos, midPoint.norm, (float)midPoint.VertexID );
+    float4 ViewPos = mul( midPoint.pos, g_mWorldView ); // NOTE(review): float3 multiplied by a float4x4 - confirm how the compiler resolves the size mismatch
+    // One lighting value, computed from the incoming normal, is shared by the whole blade
+    float3 grassNorm = midPoint.norm;
+    float4 color1 = CalcLighting( grassNorm, ViewPos.z );
+    // Emit the 6 template vertices, scaled in x by the grass width and in y by the grass height
+    for(int v=0; v<6; v++)
+    {
+        float3 pos = g_positions[v];
+        pos.x *= g_GrassWidth;
+        pos.y *= g_GrassHeight;
+        // template vertex -> placed by mWorld -> projected by g_mWorldViewProj
+        output.pos = mul( float4(pos,1), mWorld );
+        output.pos = mul( output.pos, g_mWorldViewProj );
+        output.tex = float3( g_texcoords[v], iGrassTex );
+        output.color = color1;
+    
+        GrassStream.Append( output );
+    }
+    
+    GrassStream.RestartStrip();
+}
+
+//--------------------------------------------------------------------------------------
+// Midpoint of the three vertices A,B,C
+//--------------------------------------------------------------------------------------
+VSGrassOut CalcMidPoint( VSGrassOut A, VSGrassOut B, VSGrassOut C )
+{
+    // Position, normal and texcoord are averaged (the normal is not re-normalized)
+    VSGrassOut Centroid;
+    Centroid.pos = (A.pos + B.pos + C.pos)/3.0f;
+    Centroid.norm = (A.norm + B.norm + C.norm)/3.0f;
+    Centroid.tex = (A.tex + B.tex + C.tex)/3.0f;
+    // The id is the plain sum of the three ids, not an average
+    Centroid.VertexID = A.VertexID + B.VertexID + C.VertexID;
+    return Centroid;
+}
+
+//--------------------------------------------------------------------------------------
+// The actual grass geometry shader.  This generates grass blades based upon an input
+// mesh (the tops of the islands) and a coverage texture.  Each of the texture's channels
+// determines how much of each of the 4 types of grass to place at a particular spot.
+//--------------------------------------------------------------------------------------
+[maxvertexcount(90)] // NOTE(review): each blade appends 6 vertices, so 90 leaves room for 15 blades per triangle - confirm the intended cap
+void GSGrassmain(triangle VSGrassOut input[3], inout TriangleStream<PSQuadIn> GrassStream )
+{
+    VSGrassOut MidPoint = CalcMidPoint( input[0], input[1], input[2] );
+    // Coverage is read from array slice 4 at the midpoint's texcoord; one channel per grass type
+    float4 CoverageMask = g_tx2dArray.SampleLevel( g_samPoint, float3(MidPoint.tex,4), 0 );
+    float cm[4];
+    cm[0] = CoverageMask.r;
+    cm[1] = CoverageMask.g;
+    cm[2] = CoverageMask.b;
+    cm[3] = CoverageMask.a;
+    // For each grass type g, the blade count scales with that type's coverage channel
+    for(int g=0; g<4; g++)
+    {
+        float MaxBlades = float(g_iGrassCoverage)*cm[g];
+        for(float i=0; i<MaxBlades; i++)
+        {	
+            float randOffset = g*5 + (i+1); // differs per blade; used below as the random lookup offset
+            float3 Tan = RandomDir( MidPoint.pos.x + randOffset );
+            float3 Len = normalize( RandomDir( MidPoint.pos.z + randOffset ) );
+            float3 Shift = Len.x*g_GrassMessiness*normalize( cross( Tan, MidPoint.norm ) ); // offset perpendicular to the normal
+            VSGrassOut grassPoint = MidPoint;
+            grassPoint.VertexID += randOffset; // float offset is converted for the uint id
+            grassPoint.pos += Shift; 
+                
+            //uncomment this to make the grass strictly conform to the mesh
+            //if( IsInTriangle( grassPoint.pos, input[0].pos, input[1].pos, input[2].pos ) )
+            {
+                OutputGrassBlade( grassPoint, GrassStream, g );
+            }
+        }
+    }
+}
+
+//--------------------------------------------------------------------------------------
+// PS for non-leaf or grass items.
+//--------------------------------------------------------------------------------------
+float4 PSScenemain(PSSceneIn input) : SV_Target
+{
+    // Diffuse texel modulated by the interpolated vertex color
+    return g_txDiffuse.Sample( g_samLinear, input.tex ) * input.color;
+}
+
+//--------------------------------------------------------------------------------------
+// PS for leaves and grass
+//--------------------------------------------------------------------------------------
+float4 PSQuadmain(PSQuadIn input) : SV_Target
+{
+    // Only rgb is tinted by the vertex color; alpha stays exactly as sampled
+    float4 texel = g_tx2dArray.Sample( g_samLinear, input.tex );
+    return float4( texel.xyz * input.color.xyz, texel.w );
+}
+
+//--------------------------------------------------------------------------------------
+// Render instanced meshes with vertex lighting
+//--------------------------------------------------------------------------------------
+technique10 RenderInstancedVertLighting
+{
+    pass p0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VSInstmain() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+        
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+        SetDepthStencilState( EnableDepthTestWrite, 0 );
+        SetRasterizerState( EnableMSAA );
+    }  
+}
+
+//--------------------------------------------------------------------------------------
+// Skybox
+//--------------------------------------------------------------------------------------
+technique10 RenderSkybox
+{
+    pass p0
+    {
+        SetVertexShader( CompileShader( vs_4_0, VSSkymain() ) );
+        SetGeometryShader( NULL );
+        SetPixelShader( CompileShader( ps_4_0, PSScenemain() ) );
+        
+        SetBlendState( NoBlending, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+        SetDepthStencilState( DisableDepthTestWrite, 0 );
+        SetRasterizerState( EnableMSAA );
+    }  
+}
+
+//--------------------------------------------------------------------------------------
+// Render leaves
+//--------------------------------------------------------------------------------------
+technique10 RenderQuad
+{
+    pass p0
+    {
+        
+        SetVertexShader( CompileShader( vs_4_0, VSQuadmain() ) );
+        SetGeometryShader( CompileShader( gs_4_0, GSQuadmain() ) );
+        SetPixelShader( CompileShader( ps_4_0, PSQuadmain() ) );
+        
+        SetBlendState( QuadAlphaBlendState, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+        SetDepthStencilState( EnableDepthTestWrite, 0 );
+        SetRasterizerState( EnableMSAA );
+    }  
+}
+
+//--------------------------------------------------------------------------------------
+// Render grass
+//--------------------------------------------------------------------------------------
+technique10 RenderGrass
+{
+    pass p0
+    {
+        
+        SetVertexShader( CompileShader( vs_4_0, VSGrassmain() ) );
+        SetGeometryShader( CompileShader( gs_4_0, GSGrassmain() ) );
+        SetPixelShader( CompileShader( ps_4_0, PSQuadmain() ) );
+        
+        SetBlendState( QuadAlphaBlendState, float4( 0.0f, 0.0f, 0.0f, 0.0f ), 0xFFFFFFFF );
+        SetDepthStencilState( EnableDepthTestWrite, 0 );
+        SetRasterizerState( EnableMSAA );
+    }  
+}
diff --git a/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_PS.hlsl b/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_PS.hlsl
new file mode 100644
index 000000000..dbeb87f33
--- /dev/null
+++ b/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_PS.hlsl
@@ -0,0 +1,202 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: MultithreadedRendering11_PS.hlsl
+//
+// The pixel shader file for the MultithreadedRendering11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// Various debug options
+//#define NO_DIFFUSE_MAP
+//#define NO_NORMAL_MAP
+//#define NO_AMBIENT
+//#define NO_DYNAMIC_LIGHTING
+//#define NO_SHADOW_MAP
+
+#define SHADOW_DEPTH_BIAS 0.0005f
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+static const int g_iNumLights = 4;
+static const int g_iNumShadows = 1; // by convention, the first n lights cast shadows
+
+cbuffer cbPerObject : register( b0 )
+{
+	float4		g_vObjectColor			: packoffset( c0 );
+};
+
+cbuffer cbPerLight : register( b1 )
+{
+    struct LightDataStruct
+    {
+	    matrix		m_mLightViewProj;
+	    float4		m_vLightPos;
+	    float4		m_vLightDir;
+	    float4		m_vLightColor;
+	    float4		m_vFalloffs;    // x = dist end, y = dist range, z = cos angle end, w = cos range
+	} g_LightData[g_iNumLights]         : packoffset( c0 );
+};
+
+cbuffer cbPerScene : register( b2 )
+{
+	float4		g_vMirrorPlane			: packoffset( c0 );
+	float4  	g_vAmbientColor			: packoffset( c1 );
+	float4		g_vTintColor			: packoffset( c2 );
+};
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D	        g_txDiffuse                : register( t0 );
+Texture2D	        g_txNormal                 : register( t1 );
+Texture2D           g_txShadow[g_iNumShadows]  : register( t2 );
+
+SamplerState        g_samPointClamp : register( s0 );
+SamplerState        g_samLinearWrap : register( s1 );
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct PS_INPUT
+{
+	float3 vNormal		: NORMAL;
+	float3 vTangent		: TANGENT;
+	float2 vTexcoord	: TEXCOORD0;
+	float4 vPosWorld	: TEXCOORD1;
+};
+
+//--------------------------------------------------------------------------------------
+// Sample normal map, convert to signed, apply tangent-to-world space transform
+//--------------------------------------------------------------------------------------
+float3 CalcPerPixelNormal( float2 vTexcoord, float3 vVertNormal, float3 vVertTangent )
+{
+    // Fetch the normal-map texel (xyz kept) and remap it with 2x - 1
+    float3 vSampled = g_txNormal.Sample( g_samLinearWrap, vTexcoord );
+    float3 vTangentSpaceNormal = 2.0f * vSampled - 1.0f;
+
+    // Frame rows: unit tangent, unit cross( tangent, normal ), unit normal
+    float3 vN = normalize( vVertNormal );
+    float3 vT = normalize( vVertTangent );
+    float3 vB = normalize( cross( vT, vN ) );
+    float3x3 mFrame = float3x3( vT, vB, vN );
+
+    return mul( vTangentSpaceNormal, mFrame );
+}
+
+//--------------------------------------------------------------------------------------
+// Test how much pixel is in shadow, using 2x2 percentage-closer filtering
+//--------------------------------------------------------------------------------------
+float4 CalcUnshadowedAmountPCF2x2( int iShadow, float4 vPosWorld )
+{
+    matrix mLightViewProj = g_LightData[iShadow].m_mLightViewProj;
+    Texture2D txShadow =    g_txShadow[iShadow]; 
+
+    // Compute pixel position in light space
+    float4 vLightSpacePos = mul( vPosWorld, mLightViewProj ); 
+    vLightSpacePos.xyz /= vLightSpacePos.w;
+    
+    // Translate from surface coords to texture coords
+    // Could fold these into the matrix
+    float2 vShadowTexCoord = 0.5f * vLightSpacePos + 0.5f; // only xy survive the float4 -> float2 narrowing
+    vShadowTexCoord.y = 1.0f - vShadowTexCoord.y;
+    
+    // Depth bias to avoid pixel self-shadowing
+    float vLightSpaceDepth = vLightSpacePos.z - SHADOW_DEPTH_BIAS;
+    
+    // Find sub-pixel weights
+    float2 vShadowMapDims = float2( 2048.0f, 2048.0f ); // need to keep in sync with .cpp file
+    float4 vSubPixelCoords;
+    vSubPixelCoords.xy = frac( vShadowMapDims * vShadowTexCoord );
+    vSubPixelCoords.zw = 1.0f - vSubPixelCoords; // zw = 1 - xy (the right-hand side is narrowed to its first two components)
+    float4 vBilinearWeights = vSubPixelCoords.zxzx * vSubPixelCoords.wwyy; // (1-fx)(1-fy), fx(1-fy), (1-fx)fy, fx*fy
+
+    // 2x2 percentage closer filtering
+    float2 vTexelUnits = 1.0f / vShadowMapDims;
+    float4 vShadowDepths; // each component takes the first channel of its sample
+    vShadowDepths.x = txShadow.Sample( g_samPointClamp, vShadowTexCoord );
+    vShadowDepths.y = txShadow.Sample( g_samPointClamp, vShadowTexCoord + float2( vTexelUnits.x, 0.0f ) );
+    vShadowDepths.z = txShadow.Sample( g_samPointClamp, vShadowTexCoord + float2( 0.0f, vTexelUnits.y ) );
+    vShadowDepths.w = txShadow.Sample( g_samPointClamp, vShadowTexCoord + vTexelUnits );
+    
+    // What weighted fraction of the 4 samples are nearer to the light than this pixel?
+    float4 vShadowTests = ( vShadowDepths >= vLightSpaceDepth ) ? 1.0f : 0.0f;
+    return dot( vBilinearWeights, vShadowTests ); // scalar result is replicated into the float4 return value
+}
+
+//--------------------------------------------------------------------------------------
+// Diffuse lighting calculation, with angle and distance falloff
+//--------------------------------------------------------------------------------------
+float4 CalcLightingColor( int iLight, float3 vPosWorld, float3 vPerPixelNormal )
+{
+    // Vector from the light to the shaded point, its length, and its unit form
+    float3 vToPixel = vPosWorld - g_LightData[iLight].m_vLightPos.xyz;
+    float fRange = length( vToPixel );
+    float3 vToPixelDir = vToPixel / fRange;
+
+    // m_vFalloffs: x = dist end, y = dist range, z = cos angle end, w = cos range
+    float4 vFall = g_LightData[iLight].m_vFalloffs;
+
+    // Distance term: 0 at vFall.x, rising to 1 at vFall.x - vFall.y
+    float fDistTerm = saturate( ( vFall.x - fRange ) / vFall.y );
+
+    // Cone term: 0 when the cosine against the light direction is vFall.z, 1 at vFall.z + vFall.w
+    float fCosToAxis = dot( vToPixelDir, g_LightData[iLight].m_vLightDir.xyz );
+    float fConeTerm = saturate( ( fCosToAxis - vFall.z ) / vFall.w );
+
+    // Diffuse term; negated because vToPixelDir points from the light to the pixel
+    float fLambert = saturate( -dot( vToPixelDir, vPerPixelNormal ) );
+
+    // Multiplication order: color, diffuse, distance, cone
+    float4 vColor = g_LightData[iLight].m_vLightColor;
+
+    return vColor * fLambert * fDistTerm * fConeTerm;
+}
+
+//--------------------------------------------------------------------------------------
+// Pixel Shader
+//--------------------------------------------------------------------------------------
+float4 PSMain( PS_INPUT Input ) : SV_TARGET
+{
+    // Manual clip test, so that objects which are behind the mirror 
+    // don't show up in the mirror.
+    clip( dot( g_vMirrorPlane.xyz, Input.vPosWorld.xyz ) + g_vMirrorPlane.w );
+    // Diffuse color: constant 0.5 when NO_DIFFUSE_MAP is defined, otherwise the diffuse texel
+#ifdef NO_DIFFUSE_MAP
+	float4 vDiffuse = 0.5f;
+#else   // #ifdef NO_DIFFUSE_MAP
+	float4 vDiffuse = g_txDiffuse.Sample( g_samLinearWrap, Input.vTexcoord );
+#endif  // #ifdef NO_DIFFUSE_MAP #else
+	
+	// Compute per-pixel normal
+#ifdef NO_NORMAL_MAP
+	float3 vPerPixelNormal = Input.vNormal;
+#else   // #ifdef NO_NORMAL_MAP
+	float3 vPerPixelNormal = CalcPerPixelNormal( Input.vTexcoord, Input.vNormal, Input.vTangent );
+#endif  // #ifdef NO_NORMAL_MAP #else
+
+    // Compute lighting contribution, starting from the ambient color (or 0 with NO_AMBIENT)
+#ifdef NO_AMBIENT
+	float4 vTotalLightingColor = 0.0f;
+#else   // #ifdef NO_AMBIENT
+	float4 vTotalLightingColor = g_vAmbientColor;
+#endif  // #ifdef NO_AMBIENT #else
+    // Add every light; only lights with index < g_iNumShadows get the shadow-map test
+#ifndef NO_DYNAMIC_LIGHTING
+	for ( int iLight = 0; iLight < g_iNumLights; ++iLight )
+	{
+        float4 vLightingColor = CalcLightingColor( iLight, Input.vPosWorld, vPerPixelNormal ); // float4 vPosWorld is narrowed to the float3 parameter
+#ifndef NO_SHADOW_MAP
+	    if ( iLight < g_iNumShadows && any( vLightingColor.xyz ) > 0.0f ) // Don't bother checking shadow map if the pixel is unlit
+	    {
+            vLightingColor *= CalcUnshadowedAmountPCF2x2( iLight, Input.vPosWorld );
+	    }
+#endif  // #ifndef NO_SHADOW_MAP
+	    vTotalLightingColor += vLightingColor;
+	}
+#endif  // #ifndef NO_DYNAMIC_LIGHTING
+    // Modulate the diffuse color by the tint, the object color and the accumulated lighting
+	return vDiffuse * g_vTintColor * g_vObjectColor * vTotalLightingColor;
+}
diff --git a/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl b/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl
new file mode 100644
index 000000000..0d8d32ffa
--- /dev/null
+++ b/tests/hlsl/dxsdk/MultithreadedRendering11/MultithreadedRendering11_VS.hlsl
@@ -0,0 +1,75 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain
+//--------------------------------------------------------------------------------------
+// File: MultithreadedRendering11_VS.hlsl
+//
+// The vertex shader file for the MultithreadedRendering11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// Various debug options
+//#define UNCOMPRESSED_VERTEX_DATA  // The sdkmesh file contained uncompressed vertex data
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+	matrix		g_mWorld	: packoffset( c0 );
+};
+cbuffer cbPerScene : register( b1 )
+{
+	matrix		g_mViewProj	: packoffset( c0 );
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+	float4 vPosition	: POSITION;
+	float3 vNormal		: NORMAL;
+	float2 vTexcoord	: TEXCOORD0;
+	float3 vTangent		: TANGENT;
+};
+
+struct VS_OUTPUT
+{
+	float3 vNormal		: NORMAL;
+	float3 vTangent		: TANGENT;
+	float2 vTexcoord	: TEXCOORD0;
+	float4 vPosWorld	: TEXCOORD1;
+	float4 vPosition	: SV_POSITION;
+};
+
+// We aliased signed vectors as an unsigned format. 
+// Need to recover signed values.  The values 1.0 and 2.0
+// are slightly inaccurate here.
+float3 R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( in float3 vVec )
+{
+    float3 vDoubled = 2.0f * vVec; // per component: values that reach 1 after doubling wrap down by 2
+    return ( vDoubled >= 1.0f ) ? ( vDoubled - 2.0f ) : vDoubled;
+}
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input )
+{
+    float3 vNormal = Input.vNormal;
+    float3 vTangent = Input.vTangent;
+#ifndef UNCOMPRESSED_VERTEX_DATA
+    // Decode the packed normal and tangent back to signed values
+    vNormal = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( vNormal );
+    vTangent = R10G10B10A2_UNORM_TO_R32G32B32_FLOAT( vTangent );
+#endif  // #ifndef UNCOMPRESSED_VERTEX_DATA
+    // World position first; the SV_POSITION output is derived from it
+    VS_OUTPUT Result;
+    Result.vPosWorld = mul( Input.vPosition, g_mWorld );
+    Result.vPosition = mul( Result.vPosWorld, g_mViewProj );
+    Result.vNormal = mul( vNormal, (float3x3)g_mWorld );
+    Result.vTangent = mul( vTangent, (float3x3)g_mWorld );
+    Result.vTexcoord = Input.vTexcoord;
+    return Result;
+}
+
diff --git a/tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl b/tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl
new file mode 100644
index 000000000..0a694450c
--- /dev/null
+++ b/tests/hlsl/dxsdk/NBodyGravityCS11/NBodyGravityCS11.hlsl
@@ -0,0 +1,103 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_4_0 -entry CSMain
+//--------------------------------------------------------------------------------------
+// File: NBodyGravityCS11.hlsl
+//
+// Demonstrates how to use Compute Shader to do n-body gravity computation
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+static float softeningSquared = 0.0012500000*0.0012500000;    // added to the squared distance in bodyBodyInteraction
+static float g_fG = 6.67300e-11f * 10000.0f;                   // only used to derive g_fParticleMass below
+static float g_fParticleMass = g_fG*10000.0f * 10000.0f;       // CSMain passes this as "mass"; it already includes g_fG
+
+#define blocksize 128
+groupshared float4 sharedPos[blocksize];
+
+// Body to body interaction: adds to ai the acceleration that the body at bj causes at position bi
+void bodyBodyInteraction(inout float3 ai, float4 bj, float4 bi, float mass, int particles ) 
+{
+    // Vector from bi to bj
+    const float3 delta = bj.xyz - bi.xyz;
+    // Squared distance plus the softening term
+    const float softenedDistSqr = dot(delta, delta) + softeningSquared;
+
+    // Reciprocal distance, then its cube
+    const float rcpDist = 1.0f / sqrt(softenedDistSqr);
+    const float rcpDistCubed = rcpDist * rcpDist * rcpDist;
+
+    // "particles" scales the contribution; CSMain passes 1, or a negated count to cancel phantom bodies
+    ai += delta * ( mass * rcpDistCubed * particles );
+}
+
+cbuffer cbCS : register( b0 )
+{
+    uint4   g_param;    // pcbCS->param[0] = MAX_PARTICLES;
+                        // pcbCS->param[1] = dimx;              
+    float4  g_paramf;   // pcbCS->paramf[0] = 0.1f;
+                        // pcbCS->paramf[1] = 1; 
+};
+
+struct PosVelo
+{
+    float4 pos;
+    float4 velo;
+};
+
+StructuredBuffer<PosVelo> oldPosVelo;
+RWStructuredBuffer<PosVelo> newPosVelo;
+
+[numthreads(blocksize, 1, 1)]
+void CSMain( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+    // Each thread of the CS updates one of the particles: DTid.x selects the particle, and GI is
+    // the slot this thread fills when the group loads a tile of positions into sharedPos.
+    float4 pos = oldPosVelo[DTid.x].pos;
+    float4 vel = oldPosVelo[DTid.x].velo;
+    float3 accel = 0;
+    float mass = g_fParticleMass;
+
+    // Update current particle using all other particles, g_param.y tiles of blocksize particles each
+    [loop]
+    for (uint tile = 0; tile < g_param.y; tile++)
+    {
+        // Cache a tile of particles into shared memory to increase IO efficiency
+        sharedPos[GI] = oldPosVelo[tile * blocksize + GI].pos;
+       
+        GroupMemoryBarrierWithGroupSync();        // tile fully written before any thread reads it
+
+        [unroll]
+        for (uint counter = 0; counter < blocksize; counter+=8 ) 
+        {
+            bodyBodyInteraction(accel, sharedPos[counter], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+1], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+2], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+3], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+4], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+5], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+6], pos, mass, 1);
+            bodyBodyInteraction(accel, sharedPos[counter+7], pos, mass, 1);
+        }
+        
+        GroupMemoryBarrierWithGroupSync();        // all reads done before the next iteration overwrites the tile
+    }  
+
+    // g_param.x is the number of our particles, however this number might not be an exact multiple of the tile size.
+    // In such cases, out of bound reads occur in the process above, which means there will be 
+    // tooManyParticles "phantom" particles generating false gravity at position (0, 0, 0), so we have to subtract them here.
+    // NOTE, out of bound reads always return 0 in CS.  The count is negated as a signed value to cancel their contribution.
+    const uint tooManyParticles = g_param.y * blocksize - g_param.x;
+    bodyBodyInteraction(accel, float4(0, 0, 0, 0), pos, mass, -(int)tooManyParticles);
+
+    // Update the velocity and position of current particle using the acceleration computed above
+    vel.xyz += accel.xyz * g_paramf.x;      //deltaTime;
+    vel.xyz *= g_paramf.y;                  //damping;
+    pos.xyz += vel.xyz * g_paramf.x;        //deltaTime;    
+
+    if ( DTid.x < g_param.x )               // threads past the last real particle write nothing
+    {
+        newPosVelo[DTid.x].pos = pos;
+        newPosVelo[DTid.x].velo = float4(vel.xyz, length(accel));   // w carries the acceleration magnitude
+    }
+}
diff --git a/tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl b/tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl
new file mode 100644
index 000000000..ea56e20e9
--- /dev/null
+++ b/tests/hlsl/dxsdk/NBodyGravityCS11/ParticleDraw.hlsl
@@ -0,0 +1,128 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSParticleDraw -profile gs_4_0 -entry GSParticleDraw -profile ps_4_0 -entry PSParticleDraw
+//--------------------------------------------------------------------------------------
+// File: ParticleDraw.hlsl
+//
+// Shaders for rendering the particle as point sprite
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+struct VSParticleIn
+{
+    float4  color   : COLOR;
+    uint    id      : SV_VERTEXID;
+};
+
+struct VSParticleDrawOut
+{
+    float3 pos			: POSITION;
+    float4 color		: COLOR;
+};
+
+struct GSParticleDrawOut
+{
+    float2 tex			: TEXCOORD0;
+    float4 color		: COLOR;
+    float4 pos			: SV_POSITION;
+};
+
+struct PSParticleDrawIn
+{
+    float2 tex			: TEXCOORD0;
+    float4 color		: COLOR;
+};
+
+struct PosVelo
+{
+    float4 pos;
+    float4 velo;
+};
+
+Texture2D		            g_txDiffuse;
+StructuredBuffer<PosVelo>   g_bufPosVelo;
+
+
+SamplerState g_samLinear
+{
+    Filter = MIN_MAG_MIP_LINEAR;
+    AddressU = Clamp;
+    AddressV = Clamp;
+};
+
+cbuffer cb0
+{
+    row_major float4x4 g_mWorldViewProj;
+    row_major float4x4 g_mInvView;    
+};
+
+cbuffer cb1
+{
+    static float g_fParticleRad = 10.0f;   // scales g_positions in GSParticleDraw. NOTE(review): declared static with an initializer, so presumably not an app-settable cbuffer member - confirm
+};
+
+cbuffer cbImmutable
+{
+    static float3 g_positions[4] =          // quad corner offsets in the xy plane, indexed by strip vertex
+    {
+        float3( -1, 1, 0 ),
+        float3( 1, 1, 0 ),
+        float3( -1, -1, 0 ),
+        float3( 1, -1, 0 ),
+    };
+    // Texture coordinates; entry i is used together with g_positions[i] in GSParticleDraw
+    static float2 g_texcoords[4] = 
+    { 
+        float2(0,0), 
+        float2(1,0),
+        float2(0,1),
+        float2(1,1),
+    };
+};
+
+// Vertex shader for drawing the point-sprite particles.  Fetches the particle selected by
+// SV_VERTEXID from g_bufPosVelo and blends from a fixed red towards the input color,
+// using velo.w / 9 as the blend factor.
+VSParticleDrawOut VSParticleDraw(VSParticleIn input)
+{
+    VSParticleDrawOut output;
+    
+    output.pos = g_bufPosVelo[input.id].pos.xyz;    // explicit .xyz: the stored position is a float4, the output a float3
+    
+    float mag = g_bufPosVelo[input.id].velo.w/9;
+    output.color = lerp( float4(1,0.1,0.1,1), input.color, mag );
+    
+    return output;
+}
+
+//
+// GS for rendering point sprite particles.  Expands each input point into a 4-vertex strip (2 tris).
+//
+[maxvertexcount(4)]
+void GSParticleDraw(point VSParticleDrawOut input[1], inout TriangleStream<GSParticleDrawOut> SpriteStream)
+{
+    const VSParticleDrawOut particle = input[0];
+
+    // One strip vertex per quad corner, in the order given by g_positions / g_texcoords
+    for(int corner=0; corner<4; corner++)
+    {
+        // Corner offset scaled by the particle radius, rotated by the 3x3 part of g_mInvView
+        const float3 cornerOffset = mul( g_positions[corner] * g_fParticleRad, (float3x3)g_mInvView );
+        const float3 cornerPos = cornerOffset + particle.pos;
+
+        GSParticleDrawOut vertex;
+        vertex.pos = mul( float4(cornerPos,1.0), g_mWorldViewProj );
+        vertex.color = particle.color;
+        vertex.tex = g_texcoords[corner];
+        SpriteStream.Append(vertex);
+    }
+    SpriteStream.RestartStrip();
+}
+
+//
+// PS for drawing particles: samples g_txDiffuse at the sprite texcoord and multiplies by the interpolated color
+//
+float4 PSParticleDraw(PSParticleDrawIn input) : SV_Target
+{   
+    return g_txDiffuse.Sample( g_samLinear, input.tex ) * input.color;
+}
\ No newline at end of file
diff --git a/tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl b/tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl
new file mode 100644
index 000000000..dfc98b217
--- /dev/null
+++ b/tests/hlsl/dxsdk/OIT11/OIT_CS.hlsl
@@ -0,0 +1,277 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile cs_5_0 -entry CreatePrefixSum_Pass0_CS -entry CreatePrefixSum_Pass1_CS -entry SortAndRenderCS
+//-----------------------------------------------------------------------------
+// File: OIT_CS.hlsl
+//
+// Desc: Compute shaders used in the Order Independent Transparency sample.
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+// TODO: use structured buffers
+RWBuffer<float>     deepBufferDepth     : register( u0 );
+RWBuffer<uint>      deepBufferColorUINT : register( u1 );
+RWTexture2D<float4> frameBuffer         : register( u2 );
+RWBuffer<uint>      prefixSum           : register( u3 );
+
+Texture2D<uint> fragmentCount : register ( t0 );
+
+cbuffer CB : register( b0 )
+{
+    uint g_nFrameWidth      : packoffset( c0.x );
+    uint g_nFrameHeight     : packoffset( c0.y );
+    uint g_nPassSize        : packoffset( c0.z );
+    uint g_nReserved        : packoffset( c0.w );
+}
+
+#define blocksize 1
+#define groupthreads (blocksize*blocksize)
+groupshared float accum[groupthreads];      // not referenced by any of the entry points in this file
+
+// First pass of the prefix sum creation algorithm.  Converts a 2D buffer to a 1D buffer,
+// and sums every other value with the previous value.
+[numthreads(1,1,1)]
+void CreatePrefixSum_Pass0_CS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    const int nBin = nGid.y*g_nFrameWidth + nGid.x;
+    const int nNextBin = nBin + 1;
+    // Only even bins do work; each one also fills in the odd bin that follows it
+    if( nBin%2 != 0 )
+        return;
+
+    prefixSum[nBin] = fragmentCount[nGid.xy];
+
+    // Add the Fragment count to the next bin, provided that bin is still inside the frame
+    if( nNextBin < g_nFrameWidth * g_nFrameHeight )
+    {
+        const int2 nextUV = int2( nNextBin % g_nFrameWidth, nNextBin / g_nFrameWidth );
+        prefixSum[ nNextBin ] = prefixSum[ nBin ] + fragmentCount[ nextUV ];
+    }
+}
+
+// Second and following passes.  Each pass distributes the sum of the first half of the group
+// to the second half of the group.  There are n/groupsize groups in each pass.
+// Each pass increases the group size until it is the size of the buffer.
+// The resulting buffer holds the prefix sum of all preceding values in each
+// position 
+[numthreads(1,1,1)]
+void CreatePrefixSum_Pass1_CS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    const uint nGroupStart = nGid.x*g_nPassSize;                // first element owned by this group
+    const uint nSecondHalf = nGroupStart + g_nPassSize/2;       // first element of the group's second half
+    const int nFirstHalfSum = prefixSum[nSecondHalf - 1];       // running total at the end of the first half
+    for(int i = nSecondHalf; i < nGroupStart + g_nPassSize && i < g_nFrameWidth*g_nFrameHeight; i++)
+    {
+        prefixSum[i] = prefixSum[i] + nFirstHalfSum;
+    }
+}
+
+#if 1
+
+// Sort one pixel's fragments by depth with a bitonic sort (at most 32 of them), then blend them back to front into frameBuffer.
+groupshared int nIndex[32];
+#define NUM_THREADS 8
+[numthreads(1,1,1)]
+void SortAndRenderCS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    uint nThreadNum = nGid.y * g_nFrameWidth + nGid.x;
+    
+    // Start of this pixel's fragments in the deep buffers.  Pixel 0 has no predecessor, so it starts at 0
+    const uint nFirstFragment = ( nThreadNum == 0 ) ? 0 : prefixSum[nThreadNum-1];
+
+    uint N = min( fragmentCount[nDTid.xy], 32u );                      // fDepth / nIndex hold 32 entries
+    
+    uint N2 = min( (uint)( 1 << (int)(ceil(log2(N))) ), 32u );         // N rounded up to a power of two, same bound
+
+    float fDepth[32];
+    for(int i = 0; i < N; i++)
+    {
+        nIndex[i] = i;
+        fDepth[i] = deepBufferDepth[ nFirstFragment + i ];
+    }
+    for(int i = N; i < N2; i++)
+    {
+        nIndex[i] = i;
+        fDepth[i] = 1.1f;                                              // padding depth for the unused slots up to N2
+    }
+    
+    uint idx = blocksize*nGTid.y + nGTid.x;
+
+    // Bitonic sort of nIndex by fDepth; the compare/exchange steps run serially in this thread
+    for( int k = 2; k <= N2; k = 2*k )
+    {
+        for( int j = k>>1; j > 0 ; j = j>>1 ) 
+        {
+            for( int i = 0; i < N2; i++ ) 
+            {
+//                GroupMemoryBarrierWithGroupSync();
+                //i = idx;
+
+                float di = fDepth[ nIndex[ i ] ];
+                int ixj = i^j;
+                if ( ( ixj ) > i )
+                {
+                    float dixj = fDepth[ nIndex[ ixj ] ];
+                    if ( ( i&k ) == 0 && di > dixj )
+                    { 
+                        int temp = nIndex[ i ];
+                        nIndex[ i ] = nIndex[ ixj ];
+                        nIndex[ ixj ] = temp;
+                    }
+                    if ( ( i&k ) != 0 && di < dixj )
+                    {
+                        int temp = nIndex[ i ];
+                        nIndex[ i ] = nIndex[ ixj ];
+                        nIndex[ ixj ] = temp;
+                    }
+                }
+            }
+        }
+    }
+
+    // Output the final result to the frame buffer
+    if( idx == 0 )
+    {
+
+     /*   
+        // Debug
+        uint color[8];
+        for(int i = 0; i < 8; i++)
+        {
+            color[i] = deepBufferColorUINT[prefixSum[nThreadNum-1] + i];
+        }
+
+        for(int i = 0; i < 8; i++)
+        {
+            deepBufferDepth[nThreadNum*8+i] = fDepth[i];//fDepth[nIndex[i]];
+            deepBufferColorUINT[nThreadNum*8+i] = color[nIndex[i]];
+        }
+     */     
+   
+        // Accumulate fragments into final result, walking the sorted order from its last entry to its first
+        float4 result = 0.0f;
+        for( int x = N-1; x >= 0; x-- )
+        {
+            uint bufferValue = deepBufferColorUINT[ nFirstFragment + nIndex[ x ] ];
+            float4 color;                                              // unpack 8 bits per channel, r in the low byte
+            color.r = ( ( bufferValue >> 0  & 0xFF )) / 255.0f;
+            color.g = ( bufferValue >> 8  & 0xFF ) / 255.0f;
+            color.b = ( bufferValue >> 16 & 0xFF ) / 255.0f;
+            color.a = ( bufferValue >> 24 & 0xFF ) / 255.0f;
+            result = lerp( result, color, color.a );
+        }
+        result.a = 1.0f;
+        frameBuffer[ nGid.xy ] = result;
+    }
+}
+
+#else
+[numthreads(1,1,1)]
+void SortAndRenderCS( uint3 nGid : SV_GroupID, uint3 nDTid : SV_DispatchThreadID, uint3 nGTid : SV_GroupThreadID )
+{
+    uint nThreadNum = nDTid.y * g_nFrameWidth + nDTid.x;
+    float d0 = deepBufferDepth[nThreadNum*8];
+    float d1 = deepBufferDepth[nThreadNum*8+1];
+    float d2 = deepBufferDepth[nThreadNum*8+2];
+    // Alternative, compiled-out variant: exactly 3 fragments per pixel, stored at a stride of 8 per pixel
+    uint s0 = deepBufferColorUINT[nThreadNum*8 + 0]; 
+    uint s1 = deepBufferColorUINT[nThreadNum*8 + 1];
+    uint s2 = deepBufferColorUINT[nThreadNum*8 + 2];
+    // r0..r2 / rd0..rd2 receive the colors / depths ordered by ascending depth
+    uint r0, r1, r2;
+    float rd0, rd1, rd2;
+    if( d0 < d1 && d0 < d2 )        // d0 is the smallest depth
+    {
+        r0 = s0;
+        rd0 = d0;
+        if( d1 < d2 )
+        {
+           r1 = s1;
+           r2 = s2;
+           
+           rd1 = d1;
+           rd2 = d2;
+        }
+        else
+        {
+            r1 = s2;
+            r2 = s1;
+            
+            rd1 = d2;
+            rd2 = d1;
+        } 
+    }
+    else if( d1 < d2 )              // d1 is the smallest depth
+    {
+        r0 = s1;
+        rd0 = d1;
+        if( d0 < d2 )
+        {
+          r1 = s0;
+          r2 = s2;
+          
+          rd1 = d0;
+          rd2 = d2;
+        }
+        else
+        {
+          r1 = s2;
+          r2 = s0;
+          
+          rd1 = d2;
+          rd2 = d0;
+        }
+    }
+    else                            // otherwise d2 goes first
+    {
+        r0 = s2;
+        rd0 = d2;
+        if( d1 < d0 )
+        {
+          r1 = s1;
+          r2 = s0;
+          
+          rd1 = d1;
+          rd2 = d0;
+        }
+        else
+        {
+          r1 = s0;
+          r2 = s1;
+          
+          rd1 = d0;
+          rd2 = d1;
+        }
+    }
+    // Write the sorted depths and colors back over the first three slots of this pixel
+    deepBufferDepth[nThreadNum*8] = rd0;
+    deepBufferDepth[nThreadNum*8+1] = rd1;
+    deepBufferDepth[nThreadNum*8+2] = rd2;
+
+    deepBufferColorUINT[nThreadNum*8] = r0;
+    deepBufferColorUINT[nThreadNum*8+1] = r1;
+    deepBufferColorUINT[nThreadNum*8+2] = r2;
+
+    // convert the color to floats (8 bits per channel, r in the low byte)
+    float4 color[3];
+    color[0].r = (r0 >> 0  & 0xFF) / 255.0f;
+    color[0].g = (r0 >> 8  & 0xFF) / 255.0f;
+    color[0].b = (r0 >> 16 & 0xFF) / 255.0f;
+    color[0].a = (r0 >> 24 & 0xFF) / 255.0f;
+    
+    color[1].r = (r1 >> 0  & 0xFF) / 255.0f;
+    color[1].g = (r1 >> 8  & 0xFF) / 255.0f;
+    color[1].b = (r1 >> 16 & 0xFF) / 255.0f;
+    color[1].a = (r1 >> 24 & 0xFF) / 255.0f;
+    
+    color[2].r = (r2 >> 0  & 0xFF) / 255.0f;
+    color[2].g = (r2 >> 8  & 0xFF) / 255.0f;
+    color[2].b = (r2 >> 16 & 0xFF) / 255.0f;
+    color[2].a = (r2 >> 24 & 0xFF) / 255.0f;
+    // Blend starting from color[2] (largest depth) and ending with color[0] (smallest depth)
+    float4 result = lerp(lerp(lerp(0, color[2], color[2].a), color[1], color[1].a), color[0], color[0].a);
+    result.a = 1.0f;
+    
+    frameBuffer[nDTid.xy] = result;
+}
+
+#endif
\ No newline at end of file
diff --git a/tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl b/tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl
new file mode 100644
index 000000000..1fdb31622
--- /dev/null
+++ b/tests/hlsl/dxsdk/OIT11/OIT_PS.hlsl
@@ -0,0 +1,56 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile ps_4_0 -entry FragmentCountPS -entry FillDeepBufferPS
+//-----------------------------------------------------------------------------
+// File: OIT_PS.hlsl
+//
+// Desc: Pixel shaders used in the Order Independent Transparency sample.
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+//TODO: Use structured buffers
+RWTexture2D<uint> fragmentCount     : register( u1 );
+RWBuffer<float>   deepBufferDepth   : register( u2 );
+RWBuffer<uint4>   deepBufferColor   : register( u3 );
+RWBuffer<uint>    prefixSum         : register( u4 );
+
+cbuffer CB : register( b0 )
+{
+    uint g_nFrameWidth      : packoffset( c0.x );
+    uint g_nFrameHeight     : packoffset( c0.y );
+    uint g_nReserved0       : packoffset( c0.z );
+    uint g_nReserved1       : packoffset( c0.w );
+}
+
+struct SceneVS_Output
+{
+    float4 pos   : SV_POSITION;
+    float4 color : COLOR0;
+};
+
+void FragmentCountPS( SceneVS_Output input)
+{
+    // Count one more fragment for the pixel at input.pos.xy.  Increments need to be done atomically
+    InterlockedAdd(fragmentCount[input.pos.xy], 1);
+}
+
+void FillDeepBufferPS( SceneVS_Output input )
+{
+    // Row-major linear index of this pixel
+    const uint2 nPixel = uint2( input.pos.x, input.pos.y );
+    const uint nPixelIndex = nPixel.y*g_nFrameWidth + nPixel.x;
+
+    // Atomically allocate space in the deep buffer; nSlot receives the count from before the add
+    uint nSlot;
+    InterlockedAdd(fragmentCount[input.pos.xy], 1, nSlot);
+
+    // This pixel's fragments start at prefixSum[nPixelIndex-1].
+    // Pixel 0 has no predecessor, so its fragments start at 0.
+    uint nDeepBufferPos = nSlot;
+    if( nPixelIndex != 0 )
+        nDeepBufferPos += prefixSum[nPixelIndex-1];
+
+    // Store fragment data into the allocated space
+    deepBufferDepth[nDeepBufferPos] = input.pos.z;
+    deepBufferColor[nDeepBufferPos] = clamp(input.color, 0, 1)*255;
+}
+
diff --git a/tests/hlsl/dxsdk/OIT11/SceneVS.hlsl b/tests/hlsl/dxsdk/OIT11/SceneVS.hlsl
new file mode 100644
index 000000000..2f985d1d1
--- /dev/null
+++ b/tests/hlsl/dxsdk/OIT11/SceneVS.hlsl
@@ -0,0 +1,36 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry SceneVS
+//-----------------------------------------------------------------------------
+// File: SceneVS.hlsl
+//
+// Desc: Vertex shader for the scene.
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-----------------------------------------------------------------------------
+
+
+cbuffer cbPerObject : register( b0 )
+{
+    row_major matrix    g_mWorldViewProjection	: packoffset( c0 );
+}
+
+struct SceneVS_Input
+{
+    float4 pos   : POSITION;
+    float4 color : COLOR;
+};
+
+struct SceneVS_Output
+{
+    float4 pos   : SV_POSITION;
+    float4 color : COLOR0;
+};
+
+SceneVS_Output SceneVS( SceneVS_Input input )
+{
+    SceneVS_Output vertex;
+    // Transform the position by g_mWorldViewProjection; the color is passed through unchanged
+    vertex.pos   = mul(input.pos, g_mWorldViewProjection );
+    vertex.color = input.color;
+
+    return vertex;
+}
diff --git a/tests/hlsl/dxsdk/README.md b/tests/hlsl/dxsdk/README.md
new file mode 100644
index 000000000..dd0c0fb6b
--- /dev/null
+++ b/tests/hlsl/dxsdk/README.md
@@ -0,0 +1,5 @@
+DirectX SDK Sample Shaders
+==========================
+
+This directory contains shaders that have shipped as part of the DirectX SDK.
+The license terms for these shaders are specified at the top of the source files.
\ No newline at end of file
diff --git a/tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl b/tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl
new file mode 100644
index 000000000..7b7a1489c
--- /dev/null
+++ b/tests/hlsl/dxsdk/SimpleBezier11/SimpleBezier11.hlsl
@@ -0,0 +1,230 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry BezierVS -profile hs_5_0 -entry BezierHS -profile ds_5_0 -entry BezierDS -profile ps_4_0 -entry BezierPS -entry SolidColorPS
+//--------------------------------------------------------------------------------------
+// File: SimpleBezier11.hlsl
+//
+// This sample shows a simple implementation of the DirectX 11 Hardware Tessellator
+// for rendering a Bezier Patch.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+// This allows us to compile the shader with a #define to choose
+// the different partition modes for the hull shader.
+// See the hull shader: [partitioning(BEZIER_HS_PARTITION)]
+// This sample demonstrates "integer", "fractional_even", and "fractional_odd"
+#ifndef BEZIER_HS_PARTITION
+#define BEZIER_HS_PARTITION "integer"
+#endif // BEZIER_HS_PARTITION
+
+// The input patch size.  In this sample, it is 16 control points.
+// This value should match the call to IASetPrimitiveTopology()
+#define INPUT_PATCH_SIZE 16
+
+// The output patch size.  In this sample, it is also 16 control points.
+#define OUTPUT_PATCH_SIZE 16
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerFrame : register( b0 )
+{
+    matrix g_mViewProjection;
+    float3 g_vCameraPosWorld;
+    float  g_fTessellationFactor;
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex shader section
+//--------------------------------------------------------------------------------------
+struct VS_CONTROL_POINT_INPUT
+{
+    float3 vPosition        : POSITION;
+};
+
+struct VS_CONTROL_POINT_OUTPUT
+{
+    float3 vPosition        : POSITION;
+};
+
+// This simple vertex shader passes the control points straight through to the
+// hull shader.  In a more complex scene, you might transform the control points
+// or perform skinning at this step.
+
+// The input to the vertex shader comes from the vertex buffer.
+
+// The output from the vertex shader will go into the hull shader.
+
+VS_CONTROL_POINT_OUTPUT BezierVS( VS_CONTROL_POINT_INPUT Input )
+{
+    // Pass-through: the control point position is forwarded unmodified
+    VS_CONTROL_POINT_OUTPUT ControlPoint;
+    ControlPoint.vPosition = Input.vPosition;
+
+    return ControlPoint;
+}
+
+//--------------------------------------------------------------------------------------
+// Constant data function for the BezierHS.  This is executed once per patch.
+//--------------------------------------------------------------------------------------
+struct HS_CONSTANT_DATA_OUTPUT
+{
+    float Edges[4]             : SV_TessFactor;
+    float Inside[2]            : SV_InsideTessFactor;
+};
+
+struct HS_OUTPUT
+{
+    float3 vPosition           : BEZIERPOS;
+};
+
+// This constant hull shader is executed once per patch.  For the simple Mobius strip
+// model, it will be executed 4 times.  In this sample, we set the tessellation factor
+// via SV_TessFactor and SV_InsideTessFactor for each patch.  In a more complex scene,
+// you might calculate a variable tessellation factor based on the camera's distance.
+
+HS_CONSTANT_DATA_OUTPUT BezierConstantHS( InputPatch<VS_CONTROL_POINT_OUTPUT, INPUT_PATCH_SIZE> ip,
+                                          uint PatchID : SV_PrimitiveID )
+{    
+    HS_CONSTANT_DATA_OUTPUT Factors;    // all four edges and both inside factors get g_fTessellationFactor
+    Factors.Edges[0]  = g_fTessellationFactor;
+    Factors.Edges[1]  = g_fTessellationFactor;
+    Factors.Edges[2]  = g_fTessellationFactor;
+    Factors.Edges[3]  = g_fTessellationFactor;
+    Factors.Inside[0] = g_fTessellationFactor;
+    Factors.Inside[1] = g_fTessellationFactor;
+    return Factors;
+}
+
+// The hull shader is called once per output control point, which is specified with
+// outputcontrolpoints.  For this sample, we take the control points from the vertex
+// shader and pass them directly off to the domain shader.  In a more complex scene,
+// you might perform a basis conversion from the input control points into a Bezier
+// patch, such as the SubD11 Sample.
+
+// The input to the hull shader comes from the vertex shader
+
+// The output from the hull shader will go to the domain shader.
+// The tessellation factor, topology, and partition mode will go to the fixed function
+// tessellator stage to calculate the UVW and domain points.
+
+[domain("quad")]
+[partitioning(BEZIER_HS_PARTITION)]                 // "integer" unless BEZIER_HS_PARTITION is defined by the build
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(OUTPUT_PATCH_SIZE)]            // one invocation per output control point
+[patchconstantfunc("BezierConstantHS")]             // supplies the tessellation factors for the patch
+HS_OUTPUT BezierHS( InputPatch<VS_CONTROL_POINT_OUTPUT, INPUT_PATCH_SIZE> p, 
+                    uint i : SV_OutputControlPointID,
+                    uint PatchID : SV_PrimitiveID )
+{
+    HS_OUTPUT Output;
+    Output.vPosition = p[i].vPosition;              // control point i is copied through unchanged
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Bezier evaluation domain shader section
+//--------------------------------------------------------------------------------------
+struct DS_OUTPUT
+{
+    float4 vPosition        : SV_POSITION;
+    float3 vWorldPos        : WORLDPOS;
+    float3 vNormal            : NORMAL;
+};
+
+//--------------------------------------------------------------------------------------
+float4 BernsteinBasis(float t)
+{
+    // The four cubic Bernstein weights at t, written with s = 1 - t
+    const float s = 1.0f - t;
+    return float4( s * s * s,
+                   3.0f * t * s * s,
+                   3.0f * t * t * s,
+                   t * t * t );
+}
+
+//--------------------------------------------------------------------------------------
+float4 dBernsteinBasis(float t)
+{
+    // Derivatives with respect to t of the four cubic Bernstein weights, with s = 1 - t
+    const float s = 1.0f - t;
+    return float4( -3 * s * s,
+                   3 * s * s - 6 * t * s,
+                   6 * t * s - 3 * t * t,
+                   3 * t * t );
+}
+
+//--------------------------------------------------------------------------------------
+float3 EvaluateBezier( const OutputPatch<HS_OUTPUT, OUTPUT_PATCH_SIZE> bezpatch,
+                       float4 BasisU,
+                       float4 BasisV )
+{
+    // 4x4 control net: row r is points 4r..4r+3, weighted across by BasisU and then by component r of BasisV
+    float3 Sum = BasisV.x * ( bezpatch[0].vPosition * BasisU.x + bezpatch[1].vPosition * BasisU.y + bezpatch[2].vPosition * BasisU.z + bezpatch[3].vPosition * BasisU.w );
+    Sum += BasisV.y * ( bezpatch[4].vPosition * BasisU.x + bezpatch[5].vPosition * BasisU.y + bezpatch[6].vPosition * BasisU.z + bezpatch[7].vPosition * BasisU.w );
+    Sum += BasisV.z * ( bezpatch[8].vPosition * BasisU.x + bezpatch[9].vPosition * BasisU.y + bezpatch[10].vPosition * BasisU.z + bezpatch[11].vPosition * BasisU.w );
+    Sum += BasisV.w * ( bezpatch[12].vPosition * BasisU.x + bezpatch[13].vPosition * BasisU.y + bezpatch[14].vPosition * BasisU.z + bezpatch[15].vPosition * BasisU.w );
+
+    return Sum;
+}
+
+// The domain shader is run once per vertex and calculates the final vertex's position
+// and attributes.  It receives the UVW from the fixed function tessellator and the
+// control point outputs from the hull shader.  Since we are using the DirectX 11
+// Tessellation pipeline, it is the domain shader's responsibility to calculate the
+// final SV_POSITION for each vertex.  In this sample, we evaluate the vertex's
+// position using a Bernstein polynomial and the normal is calculated as the cross
+// product of the U and V derivatives.
+
+// The input SV_DomainLocation to the domain shader comes from fixed function
+// tessellator.  And the OutputPatch comes from the hull shader.  From these, you
+// must calculate the final vertex position, color, texcoords, and other attributes.
+
+// The output from the domain shader will be a vertex that will go to the video card's
+// rasterization pipeline and get drawn to the screen.
+
+[domain("quad")]
+DS_OUTPUT BezierDS( HS_CONSTANT_DATA_OUTPUT input, 
+                    float2 UV : SV_DomainLocation,
+                    const OutputPatch<HS_OUTPUT, OUTPUT_PATCH_SIZE> bezpatch )
+{
+    float4 BasisU = BernsteinBasis( UV.x );         // basis weights along U
+    float4 BasisV = BernsteinBasis( UV.y );         // basis weights along V
+    float4 dBasisU = dBernsteinBasis( UV.x );       // derivative weights along U
+    float4 dBasisV = dBernsteinBasis( UV.y );       // derivative weights along V
+
+    float3 WorldPos = EvaluateBezier( bezpatch, BasisU, BasisV );       // surface point
+    float3 Tangent = EvaluateBezier( bezpatch, dBasisU, BasisV );       // partial derivative in U
+    float3 BiTangent = EvaluateBezier( bezpatch, BasisU, dBasisV );     // partial derivative in V
+    float3 Norm = normalize( cross( Tangent, BiTangent ) );             // normal from the two derivatives
+
+    DS_OUTPUT Output;
+    Output.vPosition = mul( float4(WorldPos,1), g_mViewProjection );
+    Output.vWorldPos = WorldPos;                    // untransformed position, used by BezierPS
+    Output.vNormal = Norm;
+
+    return Output;    
+}
+
+//--------------------------------------------------------------------------------------
+// Smooth shading pixel shader section
+//--------------------------------------------------------------------------------------
+
+// The pixel shader works the same as it would in a normal graphics pipeline.
+// In this sample, it performs very simple N dot L lighting.
+
+float4 BezierPS( DS_OUTPUT Input ) : SV_TARGET
+{
+    const float3 vSurfaceNormal = normalize(Input.vNormal);
+    const float3 vFromCamera = normalize(Input.vWorldPos - g_vCameraPosWorld);  // the "light" direction is camera -> surface
+    return abs(dot(vSurfaceNormal, vFromCamera)) * float4(1, 0, 0, 1);
+}
+
+//--------------------------------------------------------------------------------------
+// Solid color shading pixel shader (used for wireframe overlay)
+//--------------------------------------------------------------------------------------
+float4 SolidColorPS( DS_OUTPUT Input ) : SV_TARGET
+{
+    // Return a solid green color; Input is not read
+    return float4( 0, 1, 0, 1 );
+}
diff --git a/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx
new file mode 100644
index 000000000..00883ce70
--- /dev/null
+++ b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.fx
@@ -0,0 +1,112 @@
+//TEST_IGNORE_FILE:
+//--------------------------------------------------------------------------------------
+// File: SimpleSample.fx
+//
+// The effect file for the SimpleSample sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Global variables
+//--------------------------------------------------------------------------------------
+float4 g_MaterialAmbientColor;      // Material's ambient color (added to the lit color in RenderSceneVS)
+float4 g_MaterialDiffuseColor;      // Material's diffuse color
+float3 g_LightDir;                  // Light's direction in world space (dotted with the world-space normal)
+float4 g_LightDiffuse;              // Light's diffuse color
+texture g_MeshTexture;              // Color texture for mesh, read through MeshTextureSampler
+
+float    g_fTime;                   // App's time in seconds (not referenced by the shaders in this file)
+float4x4 g_mWorld;                  // World matrix for object (only its 3x3 part is used here, for normals)
+float4x4 g_mWorldViewProjection;    // World * View * Projection matrix
+
+
+
+//--------------------------------------------------------------------------------------
+// Texture samplers
+//--------------------------------------------------------------------------------------
+sampler MeshTextureSampler = 
+sampler_state
+{
+    Texture = <g_MeshTexture>;      // texture read by tex2D lookups through this sampler
+    MipFilter = LINEAR;             // linear filtering between mip levels
+    MinFilter = LINEAR;             // linear filtering when minifying
+    MagFilter = LINEAR;             // linear filtering when magnifying
+};
+
+
+//--------------------------------------------------------------------------------------
+// Vertex shader output structure
+//--------------------------------------------------------------------------------------
+struct VS_OUTPUT
+{
+    float4 Position   : POSITION;   // vertex position in homogeneous projection space
+    float4 Diffuse    : COLOR0;     // vertex diffuse color (note that COLOR0 is clamped from 0..1)
+    float2 TextureUV  : TEXCOORD0;  // vertex texture coords, copied through from the input
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader computes standard transform and lighting
+//--------------------------------------------------------------------------------------
+VS_OUTPUT RenderSceneVS( float4 vPos : POSITION, 
+                         float3 vNormal : NORMAL,
+                         float2 vTexCoord0 : TEXCOORD0 )
+{
+    VS_OUTPUT Output;
+    
+    // Transform the position from object space to homogeneous projection space
+    Output.Position = mul(vPos, g_mWorldViewProjection);
+    
+    // Transform the normal from object space to world space (upper-left 3x3 of g_mWorld only)
+    float3 vNormalWorldSpace = normalize(mul(vNormal, (float3x3)g_mWorld));
+
+    // Diffuse term, clamped so surfaces facing away from the light get ambient only
+    float fNdotL = max(0, dot(vNormalWorldSpace, g_LightDir));
+
+    // Diffuse + ambient; explicit .rgb swizzles avoid an implicit float4 -> float3 truncation
+    Output.Diffuse.rgb = g_MaterialDiffuseColor.rgb * g_LightDiffuse.rgb * fNdotL + g_MaterialAmbientColor.rgb;
+    Output.Diffuse.a = 1.0f; 
+    
+    // Just copy the texture coordinate through
+    Output.TextureUV = vTexCoord0; 
+    
+    return Output;    
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel shader output structure
+//--------------------------------------------------------------------------------------
+struct PS_OUTPUT
+{
+    float4 RGBColor : COLOR0;  // Pixel color (texture color modulated by the vertex diffuse color)
+};
+
+
+//--------------------------------------------------------------------------------------
+// This shader outputs the pixel's color by modulating the texture's
+// color with diffuse material color
+//--------------------------------------------------------------------------------------
+PS_OUTPUT RenderScenePS( VS_OUTPUT In ) 
+{ 
+    // Fetch the mesh texel at the interpolated texture coordinate
+    float4 vTexel = tex2D(MeshTextureSampler, In.TextureUV);
+    // Modulate the texel by the interpolated vertex color
+    PS_OUTPUT Output;
+    Output.RGBColor = vTexel * In.Diffuse;
+    return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Renders scene 
+//--------------------------------------------------------------------------------------
+technique RenderScene
+{
+    pass P0
+    {          
+        VertexShader = compile vs_2_0 RenderSceneVS();   // transform and per-vertex lighting
+        PixelShader  = compile ps_2_0 RenderScenePS();   // texture lookup modulated by vertex color
+    }
+}
diff --git a/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl
new file mode 100644
index 000000000..12f368f86
--- /dev/null
+++ b/tests/hlsl/dxsdk/SimpleSample11/SimpleSample.hlsl
@@ -0,0 +1,86 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry RenderSceneVS -profile ps_4_0 -entry RenderScenePS
+//--------------------------------------------------------------------------------------
+// File: SimpleSample.hlsl
+//
+// The HLSL file for the SimpleSample sample for the Direct3D 11 device
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+    matrix  g_mWorldViewProjection  : packoffset( c0 );    // object -> projection space; starts at c0
+    matrix  g_mWorld                : packoffset( c4 );    // object -> world space; its 3x3 part transforms normals
+    float4  g_MaterialAmbientColor  : packoffset( c8 );    // added to the lit color in RenderSceneVS
+    float4  g_MaterialDiffuseColor  : packoffset( c9 );    // multiplied with the light's diffuse color
+}
+
+cbuffer cbPerFrame : register( b1 )
+{
+    float3              g_vLightDir             : packoffset( c0 );    // light direction, dotted with the world-space normal
+    float               g_fTime                 : packoffset( c0.w );  // packed into the w slot of c0 next to g_vLightDir; not read by the shaders in this file
+    float4              g_LightDiffuse          : packoffset( c1 );    // light's diffuse color
+};
+
+//-----------------------------------------------------------------------------------------
+// Textures and Samplers
+//-----------------------------------------------------------------------------------------
+Texture2D    g_txDiffuse : register( t0 );    // mesh color texture sampled in RenderScenePS
+SamplerState g_samLinear : register( s0 );    // sampler used with g_txDiffuse
+
+//--------------------------------------------------------------------------------------
+// shader input/output structure
+//--------------------------------------------------------------------------------------
+struct VS_INPUT
+{
+    float4 Position     : POSITION; // vertex position in object space
+    float3 Normal       : NORMAL;   // this normal comes in per-vertex, in object space
+    float2 TextureUV    : TEXCOORD0;// vertex texture coords 
+};
+
+struct VS_OUTPUT
+{
+    float4 Position     : SV_POSITION; // vertex position in homogeneous projection space
+    float4 Diffuse      : COLOR0;      // vertex diffuse color (note that COLOR0 is clamped from 0..1)
+    float2 TextureUV    : TEXCOORD0;   // vertex texture coords, copied through from VS_INPUT
+};
+
+//--------------------------------------------------------------------------------------
+// This shader computes standard transform and lighting
+//--------------------------------------------------------------------------------------
+VS_OUTPUT RenderSceneVS( VS_INPUT input )
+{
+    VS_OUTPUT Output;
+    
+    // Transform the position from object space to homogeneous projection space
+    Output.Position = mul( input.Position, g_mWorldViewProjection );
+    
+    // Transform the normal from object space to world space (upper-left 3x3 of g_mWorld only)
+    float3 vNormalWorldSpace = normalize(mul(input.Normal, (float3x3)g_mWorld));
+
+    // Diffuse term, clamped so surfaces facing away from the light get ambient only
+    float fNdotL = max(0, dot(vNormalWorldSpace, g_vLightDir));
+    // Diffuse + ambient; explicit .rgb swizzles avoid an implicit float4 -> float3 truncation
+    Output.Diffuse.rgb = g_MaterialDiffuseColor.rgb * g_LightDiffuse.rgb * fNdotL + g_MaterialAmbientColor.rgb;
+    Output.Diffuse.a = 1.0f; 
+    
+    // Just copy the texture coordinate through
+    Output.TextureUV = input.TextureUV; 
+    
+    return Output;    
+}
+
+//--------------------------------------------------------------------------------------
+// This shader outputs the pixel's color by modulating the texture's
+// color with diffuse material color
+//--------------------------------------------------------------------------------------
+float4 RenderScenePS( VS_OUTPUT In ) : SV_TARGET
+{
+    // Fetch the mesh texel, then modulate it by the interpolated vertex color
+    const float4 vTexel = g_txDiffuse.Sample( g_samLinear, In.TextureUV );
+    return vTexel * In.Diffuse;
+}
diff --git a/tests/hlsl/dxsdk/SubD11/SubD11.hlsl b/tests/hlsl/dxsdk/SubD11/SubD11.hlsl
new file mode 100644
index 000000000..c4ebf9620
--- /dev/null
+++ b/tests/hlsl/dxsdk/SubD11/SubD11.hlsl
@@ -0,0 +1,1238 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry PatchSkinningVS -entry MeshSkinningVS -profile hs_5_0 -entry SubDToBezierHS -entry SubDToBezierHS4444 -profile ds_5_0 -entry BezierEvalDS -profile ps_4_0 -entry SmoothPS -entry SolidColorPS
+//--------------------------------------------------------------------------------------
+// File: SubD11.hlsl
+//
+// This file contains functions to convert from a Catmull-Clark subdivision
+// representation to a bicubic patch representation.
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//Work-around for an optimization rule problem in the June 2010 HLSL Compiler (9.29.952.3111)
+//see http://support.microsoft.com/kb/2448404
+#if D3DX_VERSION == 0xa2b
+#pragma ruledisable 0x0802405f
+#endif
+
+//--------------------------------------------------------------------------------------
+// A sample extraordinary SubD quad is represented by the following diagram:
+//
+//                        15              Valences:
+//                       /  \               Vertex 0: 5
+//                      /    14             Vertex 1: 4
+//          17---------16   /  \            Vertex 2: 5
+//          | \         |  /    \           Vertex 3: 3
+//          |  \        | /      13
+//          |   \       |/      /         Prefixes:
+//          |    3------2------12           Vertex 0: 9
+//          |    |      |      |            Vertex 1: 12
+//          |    |      |      |            Vertex 2: 16
+//          4----0------1------11           Vertex 3: 18
+//         /    /|      |      |
+//        /    / |      |      |
+//       5    /  8------9------10
+//        \  /  /
+//         6   /
+//          \ /
+//           7
+//
+// Where the quad bounded by vertices 0,1,2,3 represents the actual subd surface of interest
+// The 1-ring neighborhood of the quad is represented by vertices 4 through 17.  The counter-
+// clockwise winding of this 1-ring neighborhood is important, especially when it comes to computing
+// the corner vertices of the bicubic patch that we will use to approximate the subd quad (0,1,2,3).
+// 
+// The resulting bicubic patch fits within the subd quad (0,1,2,3) and has the following control
+// point layout:
+//
+//     12--13--14--15
+//      8---9--10--11
+//      4---5---6---7
+//      0---1---2---3
+//
+// The inner 4 control points of the bicubic patch are a combination of only the vertices (0,1,2,3)
+// of the subd quad.  However, the corner control points for the bicubic patch (0,3,15,12) are actually
+// a much more complex weighting of the subd patch and the 1-ring neighborhood.  In the example above
+// the bicubic control point 0 is actually a weighted combination of subd points 0,1,2,3 and 1-ring
+// neighborhood points 17, 4, 5, 6, 7, 8, and 9.  We can see that the 1-ring neighborhood is simply
+// walked from the prefix value of the previous corner (corner 3 in this case) to the
+// prefix value for the current corner.  We add one more vertex on either side of the prefix values
+// and we have all the data necessary to calculate the value for the corner points.
+//
+// The edge control points of the bicubic patch (1,2,13,14,4,8,7,11) are also combinations of their 
+// neighbors, but fortunately each one is only a combination of 6 values and no walk is required.
+//--------------------------------------------------------------------------------------
+
+#define MOD4(x) ((x)&3)         // keeps the low two bits, i.e. x modulo 4 for unsigned x
+#ifndef MAX_POINTS
+#define MAX_POINTS 32           // InputPatch size used by the SubD helpers; may be predefined by the build
+#endif
+#define MAX_BONE_MATRICES 80    // number of matrices in g_mConstBoneWorld
+                        
+//--------------------------------------------------------------------------------------
+// Textures
+//--------------------------------------------------------------------------------------
+Texture2D       g_txHeight : register( t0 );           // Height and Bump texture (t0 is also claimed by g_ValencePrefixBuffer)
+Texture2D       g_txDiffuse : register( t1 );          // Diffuse texture
+Texture2D       g_txSpecular : register( t2 );         // Specular texture
+
+//--------------------------------------------------------------------------------------
+// Samplers
+//--------------------------------------------------------------------------------------
+SamplerState g_samLinear : register( s0 );
+SamplerState g_samPoint : register( s0 );   // NOTE(review): same slot as g_samLinear; presumably never used together in one shader - confirm
+
+//--------------------------------------------------------------------------------------
+// Constant Buffers
+//--------------------------------------------------------------------------------------
+cbuffer cbTangentStencilConstants : register( b0 )
+{
+    float g_TanM[1024]; // Tangent patch stencils precomputed by the application; TANM reads them as 64 floats per valence
+    float g_fCi[16];    // Valence coefficients precomputed by the application
+};
+
+cbuffer cbPerMesh : register( b1 )
+{
+    matrix g_mConstBoneWorld[MAX_BONE_MATRICES];    // bone matrices, indexed by the per-vertex BONES indices in the skinning shaders
+};
+
+cbuffer cbPerFrame : register( b2 )
+{
+    matrix g_mViewProjection;        // presumably view * projection for world-space positions - confirm at the use sites
+    float3 g_vCameraPosWorld;        // camera position; world space per its name
+    float  g_fTessellationFactor;    // presumably the tessellation factor emitted by the hull shaders - confirm
+    float  g_fDisplacementHeight;    // presumably scales the height sampled from g_txHeight - confirm
+    float3 g_vSolidColor;            // presumably the color returned by SolidColorPS - confirm
+};
+
+cbuffer cbPerSubset : register( b3 )
+{
+    int g_iPatchStartIndex;    // presumably the index of the subset's first patch in g_ValencePrefixBuffer - confirm
+}
+
+//--------------------------------------------------------------------------------------
+Buffer<uint4>  g_ValencePrefixBuffer : register( t0 );   // NOTE(review): t0 is also used by g_txHeight; presumably bound in different shader stages - confirm
+
+//--------------------------------------------------------------------------------------
+struct VS_CONTROL_POINT_OUTPUT
+{
+    float3 vPosition		: WORLDPOS;    // skinned control point position written by PatchSkinningVS
+    float2 vUV				: TEXCOORD0;   // texture coordinate copied from the vertex input
+    float3 vTangent			: TANGENT;     // skinned tangent written by PatchSkinningVS
+};
+
+struct BEZIER_CONTROL_POINT
+{
+    float3 vPosition	: BEZIERPOS;    // one control point of the 16-point bicubic patch (see ComputeTanPatch)
+};
+
+struct PS_INPUT    // same members, order and semantics as the leading members of VS_MESH_POINT_OUTPUT
+{
+    float3 vWorldPos        : POSITION;     // surface position; world space per its name
+    float3 vNormal			: NORMAL;       // surface normal
+    float2 vUV				: TEXCOORD;     // texture coordinate
+    float3 vTangent			: TANGENT;      // surface tangent
+    float3 vBiTangent		: BITANGENT;    // surface bitangent
+};
+
+//--------------------------------------------------------------------------------------
+// SubD to Bezier helper functions
+//--------------------------------------------------------------------------------------
+// Fetches entry a of the g_TanM stencil row selected by Val[v] (64 floats per row); needs an array named Val in scope
+#define TANM(a,v) ( g_TanM[ Val[v]*64 + (a) ] )
+
+//--------------------------------------------------------------------------------------
+float3 ComputeInteriorVertex( uint index, 
+                              uint Val[4], 
+                              const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip )
+{
+    // Interior point belonging to quad corner `index`: that corner is weighted by its
+    // valence, its two edge neighbours by 2 and the opposite corner by 1, and the sum is
+    // divided by (valence + 5).  Any index outside 0..3 yields the zero vector.
+    float3 vInterior = float3(0,0,0);
+    switch( index )
+    {
+    case 0: vInterior = (ip[0].vPosition*Val[0] + ip[1].vPosition*2 + ip[2].vPosition + ip[3].vPosition*2) / (5+Val[0]); break;
+    case 1: vInterior = (ip[0].vPosition*2 + ip[1].vPosition*Val[1] + ip[2].vPosition*2 + ip[3].vPosition) / (5+Val[1]); break;
+    case 2: vInterior = (ip[0].vPosition + ip[1].vPosition*2 + ip[2].vPosition*Val[2] + ip[3].vPosition*2) / (5+Val[2]); break;
+    case 3: vInterior = (ip[0].vPosition*2 + ip[1].vPosition + ip[2].vPosition*2 + ip[3].vPosition*Val[3]) / (5+Val[3]); break;
+    }
+
+    return vInterior;
+}
+
+//--------------------------------------------------------------------------------------
+// Computes the corner vertices of the output UV patch.  The corner vertices are
+// a weighted combination of all points that are "connected" to that corner by an edge.
+// The interior 4 points of the original subd quad are easy to get.  The points in the
+// 1-ring neighborhood around the interior quad are not.
+//
+// Because the valence of that corner could be any number between 3 and 16, we need to
+// walk around the subd patch vertices connected to that point.  This is where the
+// Pref (prefix) values come into play.  Each corner has a prefix value that is the index
+// of the last value around the 1-ring neighborhood that should be used in calculating
+// the coefficient of that corner.  The walk goes from the prefix value of the previous
+// corner to the prefix value of the current corner.
+//--------------------------------------------------------------------------------------
+void ComputeCornerVertex( uint index, 
+                          out float3 CornerB, // Corner for the Bezier patch
+                          out float3 CornerU, // Corner for the tangent patch
+                          out float3 CornerV, // Corner for the bitangent patch
+                          const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip, 
+                          const in uint Val[4], 
+                          const in uint Pref[4] )
+{
+    const float fOWt = 1;   // position weight of the alternate walk vertices and of the diagonal quad corner (index+2)
+    const float fEWt = 4;   // position weight of the remaining walk vertices and of the adjacent quad corners (index+1, index+3)
+
+    // Figure out where to start the walk by using the previous corner's prefix value
+    uint PrefIm1 = 0;
+    uint uStart = 4;        // corner 0 has no previous corner: its walk starts at patch vertex 4
+    if( index )
+    {
+        PrefIm1 = Pref[index-1];
+        uStart = PrefIm1;
+    }
+    
+    // Setup the walk indices
+    uint uTIndexStart = 2 - (index&1);
+    uint uTIndex = uTIndexStart;
+
+    // Calculate the N*N weight for the final value
+    CornerB = (Val[index]*Val[index])*ip[index].vPosition; // n^2 part
+
+    // Zero out the corners (float3 literals match the out parameters; no implicit truncation)
+    CornerU = float3(0,0,0);
+    CornerV = float3(0,0,0);
+    
+    const uint uV = Val[index]  + ( ( index & 1 ) ? 1 : -1 );   // stencil offset used for CornerV; always applied modulo Val[index]
+        
+    // Start the walk with the uStart prefix (the prefix of the corner before us)
+    CornerB += ip[uStart].vPosition * fEWt;
+    CornerU += ip[uStart].vPosition * TANM( uTIndex * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index);
+
+    // Gather all vertices between the previous corner's prefix and our own prefix
+    // We'll do two at a time, since they always come in twos
+    while(uStart < Pref[index]-1) 
+    {
+        ++uStart;
+        CornerB += ip[uStart].vPosition * fOWt;
+        CornerU += ip[uStart].vPosition * TANM( uTIndex * 2 + 1, index );
+        CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+        ++uTIndex;
+        ++uStart;
+        CornerB += ip[uStart].vPosition * fEWt;
+        CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+        CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex+uV)%Val[index]) * 2, index );
+    }
+    ++uStart;
+
+    // Add in the last guy and make sure to wrap to the beginning if we're the last corner
+    if (index == 3)
+        uStart = 4; 
+    CornerB += ip[uStart].vPosition * fOWt;
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    // Add in the guy before the prefix as well
+    if (index)
+        uStart = PrefIm1-1;
+    else
+        uStart = Pref[3]-1;     // corner 0 wraps around to the end of corner 3's range
+    uTIndex = uTIndexStart-1;
+
+    CornerB += ip[uStart].vPosition * fOWt;
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    // We're done with the walk now.  Now we need to add the contributions of the original subd quad.
+    CornerB += ip[MOD4(index+1)].vPosition * fEWt;
+    CornerB += ip[MOD4(index+2)].vPosition * fOWt;
+    CornerB += ip[MOD4(index+3)].vPosition * fEWt;
+    
+    uTIndex = 0 + (index&1)*(Val[index]-1);
+    uStart = MOD4(index+1);
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+    
+    uStart = MOD4(index+2);
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    uStart = MOD4(index+3);
+    uTIndex = (uTIndex+1)%Val[index];
+
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+
+    // Normalize the corner weights (only CornerB is scaled; CornerU and CornerV are left as accumulated)
+    CornerB *= 1.0f / ( Val[index] * Val[index] + 5 * Val[index] ); // normalize
+
+    // fixup signs from directional derivatives...
+    if( !((index - 1) & 2) ) // 1 and 2
+        CornerU *= -1;
+
+    if( index >= 2 ) // 2 and 3
+        CornerV *= -1;
+}
+
+void ComputeCornerVertex4444( uint index, 
+                          out float3 CornerB, // Corner for the Bezier patch
+                          out float3 CornerU, // Corner for the tangent patch
+                          out float3 CornerV, // Corner for the bitangent patch
+                          const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip, 
+                          const in uint Val[4], 
+                          const in uint Pref[4] )
+{
+    // NOTE(review): this body follows ComputeCornerVertex statement for statement; presumably kept
+    const float fOWt = 1;   // position weight of the alternate walk vertices and of the diagonal quad corner (index+2)
+    const float fEWt = 4;   // position weight of the remaining walk vertices and of the adjacent quad corners (index+1, index+3)
+    // separate for the SubDToBezierHS4444 entry point - confirm before merging the two.
+    // Figure out where to start the walk by using the previous corner's prefix value
+    uint PrefIm1 = 0;
+    uint uStart = 4;        // corner 0 has no previous corner: its walk starts at patch vertex 4
+    if( index )
+    {
+        PrefIm1 = Pref[index-1];
+        uStart = PrefIm1;
+    }
+    // Setup the walk indices
+    uint uTIndexStart = 2 - (index&1);
+    uint uTIndex = uTIndexStart;
+
+    // Calculate the N*N weight for the final value
+    CornerB = (Val[index]*Val[index])*ip[index].vPosition; // n^2 part
+
+    // Zero out the corners (float3 literals match the out parameters; no implicit truncation)
+    CornerU = float3(0,0,0);
+    CornerV = float3(0,0,0);
+    
+    const uint uV = Val[index]  + ( ( index & 1 ) ? 1 : -1 );   // stencil offset used for CornerV; always applied modulo Val[index]
+        
+    // Start the walk with the uStart prefix (the prefix of the corner before us)
+    CornerB += ip[uStart].vPosition * fEWt;
+    CornerU += ip[uStart].vPosition * TANM( uTIndex * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index);
+
+    // Gather all vertices between the previous corner's prefix and our own prefix
+    // We'll do two at a time, since they always come in twos
+    while(uStart < Pref[index]-1) 
+    {
+        ++uStart;
+        CornerB += ip[uStart].vPosition * fOWt;
+        CornerU += ip[uStart].vPosition * TANM( uTIndex * 2 + 1, index );
+        CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+        ++uTIndex;
+        ++uStart;
+        CornerB += ip[uStart].vPosition * fEWt;
+        CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+        CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex+uV)%Val[index]) * 2, index );
+    }
+    ++uStart;
+
+    // Add in the last guy and make sure to wrap to the beginning if we're the last corner
+    if (index == 3)
+        uStart = 4; 
+    CornerB += ip[uStart].vPosition * fOWt;
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    // Add in the guy before the prefix as well
+    if (index)
+        uStart = PrefIm1-1;
+    else
+        uStart = Pref[3]-1;     // corner 0 wraps around to the end of corner 3's range
+    uTIndex = uTIndexStart-1;
+
+    CornerB += ip[uStart].vPosition * fOWt;
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    // We're done with the walk now.  Now we need to add the contributions of the original subd quad.
+    CornerB += ip[MOD4(index+1)].vPosition * fEWt;
+    CornerB += ip[MOD4(index+2)].vPosition * fOWt;
+    CornerB += ip[MOD4(index+3)].vPosition * fEWt;
+    
+    uTIndex = 0 + (index&1)*(Val[index]-1);
+    uStart = MOD4(index+1);
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+    
+    uStart = MOD4(index+2);
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2 + 1, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2 + 1, index );
+
+    uStart = MOD4(index+3);
+    uTIndex = (uTIndex+1)%Val[index];
+
+    CornerU += ip[uStart].vPosition * TANM( ( uTIndex % Val[index] ) * 2, index );
+    CornerV += ip[uStart].vPosition * TANM( ( ( uTIndex + uV ) % Val[index] ) * 2, index );
+
+    // Normalize the corner weights (only CornerB is scaled; CornerU and CornerV are left as accumulated)
+    CornerB *= 1.0f / ( Val[index] * Val[index] + 5 * Val[index] ); // normalize
+
+    // fixup signs from directional derivatives...
+    if( !((index - 1) & 2) ) // 1 and 2
+        CornerU *= -1;
+
+    if( index >= 2 ) // 2 and 3
+        CornerV *= -1;
+}
+
+//--------------------------------------------------------------------------------------
+// Computes the edge vertices of the output bicubic patch.  The edge vertices
+// (1,2,4,7,8,11,13,14) are a weighted (by valence) combination of 6 interior and 1-ring
+// neighborhood points.  However, we don't have to do the walk on this one since we
+// don't need all of the neighbor points attached to this vertex.
+//--------------------------------------------------------------------------------------
+float3 ComputeEdgeVertex( in uint index /* 0-7 */, 
+                          const in InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip, 
+                          const in uint Val[4], 
+                          const in uint Pref[4] )
+{
+    // Every edge point is divided by 2n+10, n being the valence of the quad corner whose
+    // own weight is scaled by that valence.  Two edge points share each corner, so four
+    // divisors cover all eight cases.
+    const float fDenom0 = 2 * Val[0] + 10;   // cases 0 and 4
+    const float fDenom1 = 2 * Val[1] + 10;   // cases 1 and 6
+    const float fDenom2 = 2 * Val[2] + 10;   // cases 3 and 7
+    const float fDenom3 = 2 * Val[3] + 10;   // cases 2 and 5
+
+    // Each case blends the quad corners ip[0..3] with two 1-ring points.  NOTE(review): the
+    // bezier control point numbers noted per case are presumed from the layout diagram - confirm.
+    switch( index )
+    {
+    // Horizontal
+    case 0: // bezier control point 1
+        return (Val[0]*2*ip[0].vPosition + 4*ip[1].vPosition + ip[2].vPosition + ip[3].vPosition*2 +
+              2*ip[Pref[0]-1].vPosition + ip[Pref[0]].vPosition) / fDenom0;
+
+    case 1: // bezier control point 2
+        return (4*ip[0].vPosition + Val[1]*2*ip[1].vPosition + ip[2].vPosition*2 + ip[3].vPosition +
+              ip[Pref[0]-1].vPosition + 2*ip[Pref[0]].vPosition) / fDenom1;
+
+    case 2: // bezier control point 13
+        return (2*ip[0].vPosition + ip[1].vPosition + 4*ip[2].vPosition + ip[3].vPosition*2*Val[3] +
+               2*ip[Pref[2]].vPosition + ip[Pref[2]-1].vPosition) / fDenom3;
+
+    case 3: // bezier control point 14
+        return (ip[0].vPosition + 2*ip[1].vPosition + Val[2]*2*ip[2].vPosition + ip[3].vPosition*4 +
+               ip[Pref[2]].vPosition + 2*ip[Pref[2]-1].vPosition) / fDenom2;
+
+    // Vertical
+    case 4: // bezier control point 4
+        return (Val[0]*2*ip[0].vPosition + 2*ip[1].vPosition + ip[2].vPosition + ip[3].vPosition*4 +
+              2*ip[4].vPosition + ip[Pref[3]-1].vPosition) / fDenom0;
+
+    case 5: // bezier control point 8
+        return (4*ip[0].vPosition + ip[1].vPosition + 2*ip[2].vPosition + ip[3].vPosition*2*Val[3] +
+              ip[4].vPosition + 2*ip[Pref[3]-1].vPosition) / fDenom3;
+
+    case 6: // bezier control point 7
+        return (2*ip[0].vPosition + Val[1]*2*ip[1].vPosition + 4*ip[2].vPosition + ip[3].vPosition +
+              2*ip[Pref[1]-1].vPosition + ip[Pref[1]].vPosition) / fDenom1;
+
+    case 7: // bezier control point 11
+        return (ip[0].vPosition + 4*ip[1].vPosition + Val[2]*2*ip[2].vPosition + 2*ip[3].vPosition +
+               ip[Pref[1]-1].vPosition + 2*ip[Pref[1]].vPosition) / fDenom2;
+    }
+
+    // Any index outside 0-7 produces the zero vector
+    return float3(0,0,0);
+}
+
+//--------------------------------------------------------------------------------------
+// Helper function
+//--------------------------------------------------------------------------------------
+void BezierRaise(inout float3 pQ[3], out float3 pC[4])
+{
+    // Raises the quadratic Bezier pQ (3 control points) to the equivalent cubic pC (4 control
+    // points): the end points carry over, the inner two are 1:2 and 2:1 blends.  pQ is only read.
+    const float fThird = 1.0f / 3.0f;
+    pC[0] = pQ[0];
+    pC[1] = fThird * ( pQ[0] + 2.0f * pQ[1] );
+    pC[2] = fThird * ( pQ[1] * 2.0f + pQ[2] );
+    pC[3] = pQ[2];
+}
+
+//--------------------------------------------------------------------------------------
+// Computes the tangent patch from the input bezier patch
+//--------------------------------------------------------------------------------------
+void ComputeTanPatch( const OutputPatch<BEZIER_CONTROL_POINT, 16> bezpatch,  // 16 Bezier control points, read at index a*cX + b*cY
+                      inout float3 vOut[16],        // tangent control points; the four entries a,b in {0,3} are not written here
+                      in float fCWts[4],            // per-corner scalar weights (presumably taken from g_fCi - confirm)
+                      in float3 vCorner[4],         // corner vectors used when building the two inner rows
+                      in float3 vCornerLocal[4],    // corner vectors used as the end points of the two boundary rows
+                      in const uint cX,             // index stride along a row
+                      in const uint cY)             // index stride between rows
+{
+    float3 vQuad[3];    // quadratic control points of the row being built (reused for several rows)
+    float3 vQuadB[3];   // quadratic control points of the "second" inner row (row 1)
+    float3 vCubic[4];   // result of raising a quadratic to a cubic
+
+    // boundary edges are really simple... (row 0 first; only its two middle entries are written)
+    vQuad[0] = vCornerLocal[0];
+    vQuad[2] = vCornerLocal[1];
+    vQuad[1] = 3.0f*(bezpatch[2*cX+0*cY].vPosition-bezpatch[1*cX+0*cY].vPosition);
+
+    BezierRaise(vQuad,vCubic);
+    vOut[1*cX + 0*cY] = vCubic[1];
+    vOut[2*cX + 0*cY] = vCubic[2];
+
+    vQuad[0] = vCornerLocal[2];     // same again for row 3
+    vQuad[2] = vCornerLocal[3];
+    vQuad[1] = 3.0f*(bezpatch[2*cX+3*cY].vPosition-bezpatch[1*cX+3*cY].vPosition);
+
+    BezierRaise(vQuad,vCubic);
+    vOut[1*cX + 3*cY] = vCubic[1];
+    vOut[2*cX + 3*cY] = vCubic[2];
+
+    // two internal edges - this is where work happens... (vQuad is row 2, vQuadB is row 1)
+    float3 vA,vB,vC,vD,vE;
+    float fC0,fC1;
+    vQuad[1] = 3.0f*(bezpatch[2*cX+2*cY].vPosition-bezpatch[1*cX+2*cY].vPosition);
+    // also do "second" scan line
+    vQuadB[1] = 3.0f*(bezpatch[2*cX+1*cY].vPosition-bezpatch[1*cX+1*cY].vPosition);
+
+    vD = 3.0f*(bezpatch[1*cX + 2*cY].vPosition - bezpatch[0*cX + 2*cY].vPosition);
+    vE = 3.0f*(bezpatch[1*cX + 1*cY].vPosition - bezpatch[0*cX + 1*cY].vPosition); // used later...
+
+    fC0 = fCWts[3];     // start of strip: weights of corners 3 and 0
+    fC1 = fCWts[0];
+
+    // sign flip
+    vA = -vCorner[3];
+    vB = 3.0f*(bezpatch[0*cX + 1*cY].vPosition - bezpatch[0*cX + 2*cY].vPosition);
+    vC = -vCorner[0];
+
+    vQuad[0] = 1.0f/3.0f*(2.0f*fC0*vB - fC1*vA) + vD;
+    vQuadB[0] = 1.0f/3.0f*(fC0*vC - 2.0f*fC1*vB) + vE;
+
+    // do end of strip - same as before, but stuff is switched around...
+    vC = vCorner[2];
+    vB = 3.0f*(bezpatch[3*cX + 2*cY].vPosition - bezpatch[3*cX + 1*cY].vPosition);
+    vA = vCorner[1];
+
+    vD = 3.0f*(bezpatch[2*cX + 1*cY].vPosition - bezpatch[3*cX + 1*cY].vPosition);
+    vE = 3.0f*(bezpatch[2*cX + 2*cY].vPosition - bezpatch[3*cX + 2*cY].vPosition);
+    
+    fC0 = fCWts[1];     // end of strip: weights of corners 1 and 2
+    fC1 = fCWts[2];
+ 
+    vQuadB[2] = 1.0f/3.0f*(2.0f*fC0*vB - fC1*vA) + vD;
+    vQuad[2] = 1.0f/3.0f*(fC0*vC - 2.0f*fC1*vB) + vE;
+
+    vQuadB[2] *= -1.0f;     // both end-of-strip values are negated
+    vQuad[2] *= -1.0f;
+
+    BezierRaise(vQuad,vCubic);      // row 2: all four entries are written
+
+    vOut[0*cX + 2*cY] = vCubic[0];
+    vOut[1*cX + 2*cY] = vCubic[1];
+    vOut[2*cX + 2*cY] = vCubic[2];
+    vOut[3*cX + 2*cY] = vCubic[3];
+
+    BezierRaise(vQuadB,vCubic);     // row 1: all four entries are written
+
+    vOut[0*cX + 1*cY] = vCubic[0];
+    vOut[1*cX + 1*cY] = vCubic[1];
+    vOut[2*cX + 1*cY] = vCubic[2];
+    vOut[3*cX + 1*cY] = vCubic[3];
+}
+
+//--------------------------------------------------------------------------------------
+// Skinning vertex shader Section
+//--------------------------------------------------------------------------------------
+struct VS_CONTROL_POINT_INPUT
+{
+    float3 vPosition		: POSITION;    // control point position before skinning
+    float2 vUV				: TEXCOORD0;   // texture coordinate, passed through by PatchSkinningVS
+    float3 vTangent			: TANGENT;     // tangent before skinning
+    uint4  vBones			: BONES;       // four indices into g_mConstBoneWorld
+    float4 vWeights			: WEIGHTS;     // blend weight for each of the four bones
+};
+
+VS_CONTROL_POINT_OUTPUT PatchSkinningVS( VS_CONTROL_POINT_INPUT Input )
+{
+    VS_CONTROL_POINT_OUTPUT Output;
+    
+    // Position: weighted sum of the point transformed by each of its four bone matrices
+    float4 vInputPos = float4( Input.vPosition, 1 );
+    float4 vWorldPos = float4( 0, 0, 0, 0 );
+    vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.x ] ) * Input.vWeights.x;
+    vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.y ] ) * Input.vWeights.y;
+    vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.z ] ) * Input.vWeights.z;
+    vWorldPos += mul( vInputPos, g_mConstBoneWorld[ Input.vBones.w ] ) * Input.vWeights.w;
+    
+    // Tangent: same blend using only the 3x3 part of each bone matrix; the result is not re-normalized
+    float3 vWorldTan = float3( 0, 0, 0 );
+    vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.x ] ) * Input.vWeights.x;
+    vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.y ] ) * Input.vWeights.y;
+    vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.z ] ) * Input.vWeights.z;
+    vWorldTan += mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones.w ] ) * Input.vWeights.w;
+    
+    Output.vPosition = vWorldPos.xyz;   // explicit swizzle: w is dropped on purpose, not by implicit truncation
+    Output.vUV = Input.vUV;
+    Output.vTangent = vWorldTan;
+    return Output;
+}
+
+struct VS_MESH_POINT_INPUT      // vertex input of MeshSkinningVS
+{
+    float3 vPosition		: POSITION;   // position before skinning
+    float2 vUV				: TEXCOORD0;  // texture coordinate, passed through unchanged
+    float3 vNormal			: NORMAL;     // normal before skinning
+    float3 vTangent			: TANGENT;    // tangent before skinning
+    uint4  vBones			: BONES;      // four indices into g_mConstBoneWorld
+    float4 vWeights			: WEIGHTS;    // blend weight for the matching vBones entry
+};
+
+struct VS_MESH_POINT_OUTPUT     // output of MeshSkinningVS
+{
+    float3 vWorldPos        : POSITION;   // xyz of the skinned position
+    float3 vNormal			: NORMAL;     // skinned normal (not renormalized by the shader)
+    float2 vUV				: TEXCOORD;   // copied from the input
+    float3 vTangent			: TANGENT;    // skinned tangent (not renormalized by the shader)
+    float3 vBiTangent		: BITANGENT;  // cross( skinned normal, skinned tangent )
+    
+    float4 vPosition        : SV_POSITION; // skinned position multiplied by g_mViewProjection
+};
+
+VS_MESH_POINT_OUTPUT MeshSkinningVS( VS_MESH_POINT_INPUT Input )
+{
+    // Skin position, tangent and normal with the four weighted bone matrices.
+    const float4 vLocalPos = float4( Input.vPosition, 1 );
+    float4 vSkinnedPos = (float4)0;
+    float3 vSkinnedTan = (float3)0;
+    float3 vSkinnedNrm = (float3)0;
+
+    vSkinnedPos = vSkinnedPos + mul( vLocalPos, g_mConstBoneWorld[ Input.vBones[0] ] ) * Input.vWeights[0];
+    vSkinnedPos = vSkinnedPos + mul( vLocalPos, g_mConstBoneWorld[ Input.vBones[1] ] ) * Input.vWeights[1];
+    vSkinnedPos = vSkinnedPos + mul( vLocalPos, g_mConstBoneWorld[ Input.vBones[2] ] ) * Input.vWeights[2];
+    vSkinnedPos = vSkinnedPos + mul( vLocalPos, g_mConstBoneWorld[ Input.vBones[3] ] ) * Input.vWeights[3];
+
+    // Directions use only the upper-left 3x3 of each bone matrix.
+    vSkinnedTan = vSkinnedTan + mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones[0] ] ) * Input.vWeights[0];
+    vSkinnedTan = vSkinnedTan + mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones[1] ] ) * Input.vWeights[1];
+    vSkinnedTan = vSkinnedTan + mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones[2] ] ) * Input.vWeights[2];
+    vSkinnedTan = vSkinnedTan + mul( Input.vTangent, (float3x3)g_mConstBoneWorld[ Input.vBones[3] ] ) * Input.vWeights[3];
+
+    vSkinnedNrm = vSkinnedNrm + mul( Input.vNormal, (float3x3)g_mConstBoneWorld[ Input.vBones[0] ] ) * Input.vWeights[0];
+    vSkinnedNrm = vSkinnedNrm + mul( Input.vNormal, (float3x3)g_mConstBoneWorld[ Input.vBones[1] ] ) * Input.vWeights[1];
+    vSkinnedNrm = vSkinnedNrm + mul( Input.vNormal, (float3x3)g_mConstBoneWorld[ Input.vBones[2] ] ) * Input.vWeights[2];
+    vSkinnedNrm = vSkinnedNrm + mul( Input.vNormal, (float3x3)g_mConstBoneWorld[ Input.vBones[3] ] ) * Input.vWeights[3];
+
+    VS_MESH_POINT_OUTPUT Output;
+    Output.vUV = Input.vUV;
+    Output.vNormal = vSkinnedNrm;
+    Output.vTangent = vSkinnedTan;
+    Output.vBiTangent = cross( vSkinnedNrm, vSkinnedTan );
+    Output.vWorldPos = vSkinnedPos.xyz;
+    Output.vPosition = mul( float4( vSkinnedPos.xyz, 1 ), g_mViewProjection );
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// SubD to Bezier hull shader Section
+//--------------------------------------------------------------------------------------
+struct HS_CONSTANT_DATA_OUTPUT  // per-patch constants written by the SubDToBezierConstantsHS* functions
+{
+    float Edges[4]			: SV_TessFactor;        // edge tessellation factors
+    float Inside[2]			: SV_InsideTessFactor;  // interior tessellation factors
+    
+    float3 vTangent[4]		: TANGENT;     // vTangent of input control points 0..3
+    float2 vUV[4]			: TEXCOORD;    // vUV of input control points 0..3
+    float3 vTanUCorner[4]	: TANUCORNER;  // CornerU result of the corner-vertex computation, corners 0..3
+    float3 vTanVCorner[4]	: TANVCORNER;  // CornerV result of the corner-vertex computation, corners 0..3
+    float4 vCWts			: TANWEIGHTS;  // g_fCi[ valence - 3 ] for corners 0..3
+};
+
+//--------------------------------------------------------------------------------------
+// Load per-patch valence and prefix data
+//--------------------------------------------------------------------------------------
+void LoadValenceAndPrefixData( in uint PatchID, out uint Val[4], out uint Prefixes[4] )
+{
+    // A patch owns two consecutive uint4 records in g_ValencePrefixBuffer:
+    // the four valences at ( index * 2 ) and the four prefixes at ( index * 2 + 1 ).
+    const uint uPatch = PatchID + g_iPatchStartIndex;
+    const uint4 vValences = g_ValencePrefixBuffer.Load( uPatch * 2 );
+    const uint4 vPrefixes = g_ValencePrefixBuffer.Load( uPatch * 2 + 1 );
+
+    // Unpack the vectors component by component into the output arrays.
+    [unroll]
+    for( uint c = 0; c < 4; ++c )
+    {
+        Val[c] = vValences[c];
+        Prefixes[c] = vPrefixes[c];
+    }
+}
+
+
+//--------------------------------------------------------------------------------------
+// Constant data function for the SubDToBezierHS.  This is executed once per patch.
+//--------------------------------------------------------------------------------------
+HS_CONSTANT_DATA_OUTPUT SubDToBezierConstantsHS( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
+                                                 uint PatchID : SV_PrimitiveID )
+{
+    HS_CONSTANT_DATA_OUTPUT Output;
+    // Every edge and both interior directions share the global tessellation factor.
+    float TessAmount = g_fTessellationFactor;
+
+    Output.Edges[0] = Output.Edges[1] = Output.Edges[2] = Output.Edges[3] = TessAmount;
+    Output.Inside[0] = Output.Inside[1] = TessAmount;
+    // Forward the tangents and UVs of input control points 0..3 with the patch constants.
+    Output.vTangent[0] = ip[0].vTangent;
+    Output.vTangent[1] = ip[1].vTangent;
+    Output.vTangent[2] = ip[2].vTangent;
+    Output.vTangent[3] = ip[3].vTangent;
+    
+    Output.vUV[0] = ip[0].vUV;
+    Output.vUV[1] = ip[1].vUV;
+    Output.vUV[2] = ip[2].vUV;
+    Output.vUV[3] = ip[3].vUV;
+    
+    // Compute part of our tangent patch here
+    uint Val[4];
+    uint Prefixes[4];
+    LoadValenceAndPrefixData( PatchID, Val, Prefixes );
+
+    [unroll]
+    for( int i=0; i<4; i++ )
+    {
+        float3 CornerB, CornerU, CornerV;   // CornerB is computed but not stored here
+        ComputeCornerVertex( i, CornerB, CornerU, CornerV, ip, Val, Prefixes );
+        Output.vTanUCorner[i] = CornerU;
+        Output.vTanVCorner[i] = CornerV;
+    }
+    // Tangent weights are looked up per corner by valence (table index = valence - 3).
+    // NOTE(review): Val is unsigned, so a valence below 3 would wrap the index - confirm the data guarantees >= 3.
+    Output.vCWts.x = g_fCi[ Val[0]-3 ];
+    Output.vCWts.y = g_fCi[ Val[1]-3 ];
+    Output.vCWts.z = g_fCi[ Val[2]-3 ];
+    Output.vCWts.w = g_fCi[ Val[3]-3 ];
+    
+    return Output;
+}
+
+HS_CONSTANT_DATA_OUTPUT SubDToBezierConstantsHS4444( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
+                                                 uint PatchID : SV_PrimitiveID )
+{
+    HS_CONSTANT_DATA_OUTPUT Output;
+    // Every edge and both interior directions share the global tessellation factor.
+    float TessAmount = g_fTessellationFactor;
+
+    Output.Edges[0] = Output.Edges[1] = Output.Edges[2] = Output.Edges[3] = TessAmount;
+    Output.Inside[0] = Output.Inside[1] = TessAmount;
+    // Forward the tangents and UVs of input control points 0..3 with the patch constants.
+    Output.vTangent[0] = ip[0].vTangent;
+    Output.vTangent[1] = ip[1].vTangent;
+    Output.vTangent[2] = ip[2].vTangent;
+    Output.vTangent[3] = ip[3].vTangent;
+    
+    Output.vUV[0] = ip[0].vUV;
+    Output.vUV[1] = ip[1].vUV;
+    Output.vUV[2] = ip[2].vUV;
+    Output.vUV[3] = ip[3].vUV;
+    
+    // Compute part of our tangent patch here; valences and prefixes are fixed, so PatchID is not read.
+    static const uint Val[4] = (uint[4])uint4(4,4,4,4);
+    static const uint Prefixes[4] = (uint[4])uint4(7,10,13,16);
+
+    [unroll]
+    for( int i=0; i<4; i++ )
+    {
+        float3 CornerB, CornerU, CornerV;   // CornerB is computed but not stored here
+        ComputeCornerVertex4444( i, CornerB, CornerU, CornerV, ip, Val, Prefixes );
+        Output.vTanUCorner[i] = CornerU;
+        Output.vTanVCorner[i] = CornerV;
+    }
+    // Tangent weights are looked up by valence (table index = valence - 3).
+    // Every valence is 4 in this variant, so all four components read g_fCi[1].
+    Output.vCWts.x = g_fCi[ Val[0]-3 ];
+    Output.vCWts.y = g_fCi[ Val[1]-3 ];
+    Output.vCWts.z = g_fCi[ Val[2]-3 ];
+    Output.vCWts.w = g_fCi[ Val[3]-3 ];
+    
+    return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// HS for SubDToBezier.  This outputcontrolpoints(16) specifies that we will produce
+// 16 control points.  Therefore this function will be invoked 16x, one for each output
+// control point.
+//
+// !! PERFORMANCE NOTE: This hull shader is written for maximum readability, and its
+// performance is not expected to be optimal on D3D11 hardware.  The switch statement
+// below that determines the codepath for each patch control point generates sub-optimal
+// code for parallel execution on the GPU.  A future implementation of this hull shader
+// will combine the 16 codepaths and 3 variants (corner, edge, interior) into one shared
+// codepath; this change is expected to increase performance at the expense of readability.
+//--------------------------------------------------------------------------------------
+[domain("quad")]
+[partitioning("integer")]
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(16)]
+[patchconstantfunc("SubDToBezierConstantsHS")]
+BEZIER_CONTROL_POINT SubDToBezierHS( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> p, 
+                                     uint i : SV_OutputControlPointID,
+                                     uint PatchID : SV_PrimitiveID )
+{
+    // Valences and prefixes are loaded from a buffer
+    uint Val[4];
+    uint Prefixes[4];
+    LoadValenceAndPrefixData( PatchID, Val, Prefixes );
+    
+    float3 CornerB = float3(0,0,0);   // corner position; the only corner result used below
+    float3 CornerU = float3(0,0,0);   // CornerU / CornerV are filled by ComputeCornerVertex but not read here
+    float3 CornerV = float3(0,0,0);
+    
+    BEZIER_CONTROL_POINT Output;
+    Output.vPosition = float3(0,0,0);  // default; every i in 0..15 is overwritten by a case below
+    // i addresses the 4x4 control grid the way EvaluateBezier reads it: i = 4 * row + column.
+    // !! PERFORMANCE NOTE: As mentioned above, this switch statement generates
+    // inefficient code for the sake of readability.
+    switch( i )
+    {
+    // Interior vertices (the inner 2x2 of the grid: 5, 6, 10, 9)
+    case 5:
+        Output.vPosition = ComputeInteriorVertex( 0, Val, p );
+        break;
+    case 6:
+        Output.vPosition = ComputeInteriorVertex( 1, Val, p );
+        break;
+    case 10:
+        Output.vPosition = ComputeInteriorVertex( 2, Val, p );
+        break;
+    case 9:
+        Output.vPosition = ComputeInteriorVertex( 3, Val, p );
+        break;
+        
+    // Corner vertices (grid corners 0, 3, 15, 12 map to corner indices 0..3)
+    case 0:
+        ComputeCornerVertex( 0, CornerB, CornerU, CornerV, p, Val, Prefixes );
+        Output.vPosition = CornerB;
+        break;
+    case 3:
+        ComputeCornerVertex( 1, CornerB, CornerU, CornerV, p, Val, Prefixes );
+        Output.vPosition = CornerB;
+        break;
+    case 15:
+        ComputeCornerVertex( 2, CornerB, CornerU, CornerV, p, Val, Prefixes );
+        Output.vPosition = CornerB;
+        break;
+    case 12:
+        ComputeCornerVertex( 3, CornerB, CornerU, CornerV, p, Val, Prefixes );
+        Output.vPosition = CornerB;
+        break;
+        
+    // Edge vertices (the remaining eight border points map to edge indices 0..7)
+    case 1:
+        Output.vPosition = ComputeEdgeVertex( 0, p, Val, Prefixes );
+        break;
+    case 2:
+        Output.vPosition = ComputeEdgeVertex( 1, p, Val, Prefixes );
+        break;
+    case 13:
+        Output.vPosition = ComputeEdgeVertex( 2, p, Val, Prefixes );
+        break;
+    case 14:
+        Output.vPosition = ComputeEdgeVertex( 3, p, Val, Prefixes );
+        break;
+    case 4:
+        Output.vPosition = ComputeEdgeVertex( 4, p, Val, Prefixes );
+        break;
+    case 8:
+        Output.vPosition = ComputeEdgeVertex( 5, p, Val, Prefixes );
+        break;
+    case 7:
+        Output.vPosition = ComputeEdgeVertex( 6, p, Val, Prefixes );
+        break;
+    case 11:
+        Output.vPosition = ComputeEdgeVertex( 7, p, Val, Prefixes );
+        break;
+    }
+    
+    return Output;
+}
+
+//--------------------------------------------------------------------------------------
+// Specialised version for Regular (4,4,4,4) patches, this is much simpler and has less
+// branching compared to the general one above
+//--------------------------------------------------------------------------------------
+[domain("quad")]
+[partitioning("integer")]
+[outputtopology("triangle_cw")]
+[outputcontrolpoints(16)]
+[patchconstantfunc("SubDToBezierConstantsHS4444")]
+BEZIER_CONTROL_POINT SubDToBezierHS4444( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> p, 
+                                     uint i : SV_OutputControlPointID,
+                                     uint PatchID : SV_PrimitiveID )
+{
+    // Valences and prefixes are Constant for this case (4,4,4,4); PatchID is therefore not read
+    static const uint Val[4] = (uint[4])uint4(4,4,4,4);
+    static const uint Prefixes[4] = (uint[4])uint4(7,10,13,16);
+    
+    float3 CornerB = float3(0,0,0);   // corner position; the only corner result used below
+    float3 CornerU = float3(0,0,0);   // CornerU / CornerV are filled by ComputeCornerVertex4444 but not read here
+    float3 CornerV = float3(0,0,0);
+    
+    BEZIER_CONTROL_POINT Output;
+    Output.vPosition = float3(0,0,0);  // default; every i in 0..15 is overwritten by a case below
+    // i addresses the 4x4 control grid the way EvaluateBezier reads it: i = 4 * row + column.
+    // !! PERFORMANCE NOTE: As mentioned above, this switch statement generates
+    // inefficient code for the sake of readability.
+    switch( i )
+    {
+    // Interior vertices (the inner 2x2 of the grid: 5, 6, 10, 9)
+    case 5:
+        Output.vPosition = ComputeInteriorVertex( 0, Val, p );
+        break;
+    case 6:
+        Output.vPosition = ComputeInteriorVertex( 1, Val, p );
+        break;
+    case 10:
+        Output.vPosition = ComputeInteriorVertex( 2, Val, p );
+        break;
+    case 9:
+        Output.vPosition = ComputeInteriorVertex( 3, Val, p );
+        break;
+        
+    // Corner vertices (grid corners 0, 3, 15, 12 map to corner indices 0..3)
+    case 0:
+        ComputeCornerVertex4444( 0, CornerB, CornerU, CornerV, p, Val, Prefixes );
+        Output.vPosition = CornerB;
+        break;
+    case 3:
+        ComputeCornerVertex4444( 1, CornerB, CornerU, CornerV, p, Val, Prefixes );
+        Output.vPosition = CornerB;
+        break;
+    case 15:
+        ComputeCornerVertex4444( 2, CornerB, CornerU, CornerV, p, Val, Prefixes );
+        Output.vPosition = CornerB;
+        break;
+    case 12:
+        ComputeCornerVertex4444( 3, CornerB, CornerU, CornerV, p, Val, Prefixes );
+        Output.vPosition = CornerB;
+        break;
+        
+    // Edge vertices (the remaining eight border points map to edge indices 0..7)
+    case 1:
+        Output.vPosition = ComputeEdgeVertex( 0, p, Val, Prefixes );
+        break;
+    case 2:
+        Output.vPosition = ComputeEdgeVertex( 1, p, Val, Prefixes );
+        break;
+    case 13:
+        Output.vPosition = ComputeEdgeVertex( 2, p, Val, Prefixes );
+        break;
+    case 14:
+        Output.vPosition = ComputeEdgeVertex( 3, p, Val, Prefixes );
+        break;
+    case 4:
+        Output.vPosition = ComputeEdgeVertex( 4, p, Val, Prefixes );
+        break;
+    case 8:
+        Output.vPosition = ComputeEdgeVertex( 5, p, Val, Prefixes );
+        break;
+    case 7:
+        Output.vPosition = ComputeEdgeVertex( 6, p, Val, Prefixes );
+        break;
+    case 11:
+        Output.vPosition = ComputeEdgeVertex( 7, p, Val, Prefixes );
+        break;
+    }
+    
+    return Output;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Bezier evaluation domain shader section
+//--------------------------------------------------------------------------------------
+struct DS_OUTPUT                // output of BezierEvalDS
+{
+    float3 vWorldPos        : POSITION;   // evaluated patch position, after optional displacement
+    float3 vNormal			: NORMAL;     // normalized cross product of the two tangent-patch values
+    float2 vUV				: TEXCOORD;   // bilinearly interpolated texture coordinate
+    float3 vTangent			: TANGENT;    // bilinearly interpolated texture-space tangent
+    float3 vBiTangent		: BITANGENT;  // cross( vNormal, vTangent )
+    
+    float4 vPosition		: SV_POSITION; // vWorldPos multiplied by g_mViewProjection
+};
+
+//--------------------------------------------------------------------------------------
+float4 BernsteinBasis(float t)
+{
+    // Cubic Bernstein polynomials B0..B3 evaluated at t, packed as (x, y, z, w).
+    const float s = 1.0f - t;
+    const float b0 = s * s * s;
+    const float b1 = 3.0f * t * s * s;
+    const float b2 = 3.0f * t * t * s;
+    return float4( b0, b1, b2, t * t * t );
+}
+
+//--------------------------------------------------------------------------------------
+float4 dBernsteinBasis(float t)
+{
+    // First derivatives (with respect to t) of the four cubic Bernstein polynomials.
+    const float s = 1.0f - t;
+    const float d0 = -3 * s * s;
+    const float d1 = 3 * s * s - 6 * t * s;
+    const float d2 = 6 * t * s - 3 * t * t;
+    return float4( d0, d1, d2, 3 * t * t );
+}
+
+//--------------------------------------------------------------------------------------
+float3 EvaluateBezier( const OutputPatch<BEZIER_CONTROL_POINT, 16> bezpatch,
+                       float4 BasisU,
+                       float4 BasisV )
+{
+    // Collapse each row of four control points along U, then blend the four rows along V.
+    const float3 vRow0 = bezpatch[0].vPosition * BasisU.x + bezpatch[1].vPosition * BasisU.y + bezpatch[2].vPosition * BasisU.z + bezpatch[3].vPosition * BasisU.w;
+    const float3 vRow1 = bezpatch[4].vPosition * BasisU.x + bezpatch[5].vPosition * BasisU.y + bezpatch[6].vPosition * BasisU.z + bezpatch[7].vPosition * BasisU.w;
+    const float3 vRow2 = bezpatch[8].vPosition * BasisU.x + bezpatch[9].vPosition * BasisU.y + bezpatch[10].vPosition * BasisU.z + bezpatch[11].vPosition * BasisU.w;
+    const float3 vRow3 = bezpatch[12].vPosition * BasisU.x + bezpatch[13].vPosition * BasisU.y + bezpatch[14].vPosition * BasisU.z + bezpatch[15].vPosition * BasisU.w;
+
+    return BasisV.x * vRow0 + BasisV.y * vRow1 + BasisV.z * vRow2 + BasisV.w * vRow3;
+}
+
+//--------------------------------------------------------------------------------------
+float3 EvaluateBezierTan( const float3 bezpatch[16],
+                          float4 BasisU,
+                          float4 BasisV )
+{
+    // Same evaluation as EvaluateBezier, but over a plain array of 16 float3 control values.
+    const float3 vRow0 = bezpatch[0] * BasisU.x + bezpatch[1] * BasisU.y + bezpatch[2] * BasisU.z + bezpatch[3] * BasisU.w;
+    const float3 vRow1 = bezpatch[4] * BasisU.x + bezpatch[5] * BasisU.y + bezpatch[6] * BasisU.z + bezpatch[7] * BasisU.w;
+    const float3 vRow2 = bezpatch[8] * BasisU.x + bezpatch[9] * BasisU.y + bezpatch[10] * BasisU.z + bezpatch[11] * BasisU.w;
+    const float3 vRow3 = bezpatch[12] * BasisU.x + bezpatch[13] * BasisU.y + bezpatch[14] * BasisU.z + bezpatch[15] * BasisU.w;
+
+    return BasisV.x * vRow0 + BasisV.y * vRow1 + BasisV.z * vRow2 + BasisV.w * vRow3;
+}
+
+//--------------------------------------------------------------------------------------
+// Compute two full tangent patches (TanU and TanV) from the tangent corner data
+// created in the HS constant data function.
+//--------------------------------------------------------------------------------------
+void CreatTangentPatches( in HS_CONSTANT_DATA_OUTPUT input, 
+                        const OutputPatch<BEZIER_CONTROL_POINT, 16> bezpatch,
+                        out float3 TanU[16], 
+                        out float3 TanV[16] )
+{    
+    TanV[0]  = input.vTanVCorner[0];   // seed grid corners 0, 3, 15, 12 of both patches
+    TanV[3]  = input.vTanVCorner[1];   // from patch-constant corners 0..3
+    TanV[15] = input.vTanVCorner[2];
+    TanV[12] = input.vTanVCorner[3];
+    
+    TanU[0]  = input.vTanUCorner[0];
+    TanU[3]  = input.vTanUCorner[1];
+    TanU[15] = input.vTanUCorner[2];
+    TanU[12] = input.vTanUCorner[3];
+    
+    float fCWts[4];                    // corner weights in patch-constant order for the first call
+    fCWts[0] = input.vCWts.x;
+    fCWts[1] = input.vCWts.y;
+    fCWts[2] = input.vCWts.z;
+    fCWts[3] = input.vCWts.w;
+
+    float3 vCorner[4];
+    float3 vCornerLocal[4];
+    
+    vCorner[0] = TanV[0];              // first call: the V corner tangents go in vCorner,
+    vCorner[1] = TanV[3];              // the U corner tangents in vCornerLocal
+    vCorner[2] = TanV[15];
+    vCorner[3] = TanV[12];
+    vCornerLocal[0] = TanU[0];
+    vCornerLocal[1] = TanU[3];
+    vCornerLocal[2] = TanU[12];        // note: 12 before 15 here, unlike vCorner above
+    vCornerLocal[3] = TanU[15];
+
+    ComputeTanPatch( bezpatch, TanU, fCWts, vCorner, vCornerLocal, 1, 4 );  // presumably fills the rest of TanU; last two args are index strides
+
+    fCWts[3] = input.vCWts.y;          // swap weights 1 and 3 for the second call
+    fCWts[1] = input.vCWts.w;
+
+    vCorner[0] = TanU[0];              // second call: roles of U and V are exchanged
+    vCorner[3] = TanU[3];
+    vCorner[2] = TanU[15];
+    vCorner[1] = TanU[12];
+    vCornerLocal[0] = TanV[0];
+    vCornerLocal[1] = TanV[12];
+    vCornerLocal[2] = TanV[3];
+    vCornerLocal[3] = TanV[15];
+
+    ComputeTanPatch( bezpatch, TanV, fCWts, vCorner, vCornerLocal, 4, 1 );  // same routine with the two strides exchanged
+}
+
+//--------------------------------------------------------------------------------------
+// For each input UV (from the Tessellator), evaluate the Bezier patch at this position.
+//--------------------------------------------------------------------------------------
+[domain("quad")]
+DS_OUTPUT BezierEvalDS( HS_CONSTANT_DATA_OUTPUT input, 
+                        float2 UV : SV_DomainLocation,
+                        const OutputPatch<BEZIER_CONTROL_POINT, 16> bezpatch )
+{
+    float4 BasisU = BernsteinBasis( UV.x );
+    float4 BasisV = BernsteinBasis( UV.y );
+    
+    float3 WorldPos = EvaluateBezier( bezpatch, BasisU, BasisV );
+    
+    float3 TanU[16];
+    float3 TanV[16];
+    CreatTangentPatches( input, bezpatch, TanU, TanV );   // rebuilt for every domain point
+    float3 Tangent = EvaluateBezierTan( TanU, BasisU, BasisV );
+    float3 BiTangent = EvaluateBezierTan( TanV, BasisU, BasisV );
+    
+    // To see what the patch looks like without using the tangent patches to fix the normals, uncomment this section
+    /*
+    float4 dBasisU = dBernsteinBasis( UV.x );
+    float4 dBasisV = dBernsteinBasis( UV.y );
+    Tangent = EvaluateBezier( bezpatch, dBasisU, BasisV );
+    BiTangent = EvaluateBezier( bezpatch, BasisU, dBasisV );
+    */
+    
+    float3 Norm = normalize( cross( Tangent, BiTangent ) );
+
+    DS_OUTPUT Output;
+    Output.vNormal = Norm;
+    
+    // Evaluate the tangent vectors through bilinear interpolation.
+    // These tangents are the texture-space tangents.  They should not be confused with the parametric
+    // tangents that we use to get the normals for the bicubic patch.
+    float3 TextureTanU0 = input.vTangent[0];
+    float3 TextureTanU1 = input.vTangent[1];
+    float3 TextureTanU2 = input.vTangent[2];
+    float3 TextureTanU3 = input.vTangent[3];
+    
+    float3 UVbottom = lerp( TextureTanU0, TextureTanU1, UV.x );   // corners 0 -> 1 along U
+    float3 UVtop = lerp( TextureTanU3, TextureTanU2, UV.x );      // corners 3 -> 2 along U
+    float3 Tan = lerp( UVbottom, UVtop, UV.y );
+
+    Output.vTangent = Tan;
+
+    // This is an optimization.  We assume that the UV mapping of the mesh will result in a "relatively" orthogonal
+    // tangent basis.  If we assume this, then we can avoid fetching and bilerping the BiTangent along with the tangent.
+    Output.vBiTangent = cross( Norm, Tan );
+
+    // bilerp the texture coordinates    
+    float2 tex0 = input.vUV[0];
+    float2 tex1 = input.vUV[1];
+    float2 tex2 = input.vUV[2];
+    float2 tex3 = input.vUV[3];
+        
+    float2 bottom = lerp( tex0, tex1, UV.x );
+    float2 top = lerp( tex3, tex2, UV.x );
+    float2 TexUV = lerp( bottom, top, UV.y );
+    Output.vUV = TexUV;
+    
+    if( g_fDisplacementHeight > 0 )
+    {
+        // On this sample displacement can go into or out of the mesh.  This is why we bias the height amount.
+        float height = g_fDisplacementHeight * ( g_txHeight.SampleLevel( g_samPoint, TexUV, 0 ).a * 2 - 1 );  // alpha remapped from [0,1] to [-1,1]
+        float3 WorldPosMiddle = Norm * height;   // offset along the patch normal
+        WorldPos += WorldPosMiddle;
+    }
+    
+    Output.vPosition = mul( float4(WorldPos,1), g_mViewProjection );
+    Output.vWorldPos = WorldPos;
+    
+    return Output;    
+}
+
+//--------------------------------------------------------------------------------------
+// Smooth shading pixel shader section
+//--------------------------------------------------------------------------------------
+
+float3 safe_normalize( float3 vInput )
+{
+    // Scale vInput to unit length. When its squared length is not positive the
+    // vector is handed back untouched, so rsqrt never sees zero.
+    const float fLenSq = dot( vInput, vInput );
+    if( !( fLenSq > 0 ) )
+        return vInput;
+    return vInput * rsqrt( fLenSq );
+}
+
+static const float g_fSpecularExponent = 32.0f;   // exponent of the specular pow() in ComputeDirectionalLight
+static const float g_fSpecularIntensity = 0.6f;   // scales the g_txSpecular sample in SmoothPS
+static const float g_fNormalMapIntensity = 1.5f;  // scales the xy of the biased normal-map sample in SmoothPS
+
+float2 ComputeDirectionalLight( float3 vWorldPos, float3 vWorldNormal, float3 vDirLightDir )
+{
+    // Returns (diffuse, specular) for one directional light; the light is applied along -vDirLightDir.
+    const float fNdotL = saturate( dot( vWorldNormal, -vDirLightDir ) );
+    const float fDiffuse = pow( fNdotL, 2 );
+
+    // Specular uses the normalized sum of the direction to the camera and -vDirLightDir.
+    const float3 vToEye = normalize( g_vCameraPosWorld - vWorldPos );
+    const float3 vHalf = normalize( vToEye - vDirLightDir );
+    const float fSpecular = pow( saturate( dot( vHalf, vWorldNormal ) ), g_fSpecularExponent );
+    return float2( fDiffuse, fSpecular );
+}
+
+float3 ColorGamma( float3 Input )
+{
+    return pow( Input, float3( 2.2f, 2.2f, 2.2f ) );  // raise each channel to the power 2.2
+}
+
+float4 SmoothPS( PS_INPUT Input ) : SV_TARGET   // normal-mapped shading with four fixed directional lights
+{
+    float4 vNormalMapSampleRaw = g_txHeight.Sample( g_samLinear, Input.vUV );
+    float3 vNormalMapSampleBiased = ( vNormalMapSampleRaw.xyz * 2 ) - 1;   // remap [0,1] to [-1,1]
+    vNormalMapSampleBiased.xy *= g_fNormalMapIntensity;
+    float3 vNormalMapSample = normalize( vNormalMapSampleBiased );
+    
+    float3 vNormal = safe_normalize( Input.vNormal ) * vNormalMapSample.z;   // combine the sample with the interpolated basis
+    vNormal += safe_normalize( Input.vTangent ) * vNormalMapSample.x;
+    vNormal += safe_normalize( Input.vBiTangent ) * vNormalMapSample.y;      // NOTE(review): vNormal is not renormalized before lighting
+                     
+    //float3 vColor = float3( 1, 1, 1 );
+    float3 vColor = g_txDiffuse.Sample( g_samLinear, Input.vUV ).rgb;
+    float vSpecular = g_txSpecular.Sample( g_samLinear, Input.vUV ).r * g_fSpecularIntensity;
+    
+    const float3 DirLightDirections[4] =
+    {
+        // key light
+        normalize( float3( -63.345150, -58.043934, 27.785097 ) ),
+        // fill light
+        normalize( float3( 23.652107, -17.391443, 54.972504 ) ),
+        // back light 1
+        normalize( float3( 20.470509, -22.939510, -33.929531 ) ),
+        // back light 2
+        normalize( float3( -31.003685, 24.242104, -41.352859 ) ),
+    };
+    
+    const float3 DirLightColors[4] = 
+    {
+        // key light
+        ColorGamma( float3( 1.0f, 0.964f, 0.706f ) * 1.0f ),
+        // fill light
+        ColorGamma( float3( 0.446f, 0.641f, 1.0f ) * 1.0f ),
+        // back light 1
+        ColorGamma( float3( 1.0f, 0.862f, 0.419f ) * 1.0f ),
+        // back light 2
+        ColorGamma( float3( 0.405f, 0.630f, 1.0f ) * 1.0f ),
+    };
+        
+    float3 fLightColor = 0;   // accumulated diffuse + specular over all four lights
+    for( int i = 0; i < 4; ++i )
+    {
+        float2 LightDiffuseSpecular = ComputeDirectionalLight( Input.vWorldPos, vNormal, DirLightDirections[i] );
+        fLightColor += DirLightColors[i] * vColor * LightDiffuseSpecular.x;      // diffuse, tinted by the diffuse texture
+        fLightColor += DirLightColors[i] * LightDiffuseSpecular.y * vSpecular;   // specular, scaled by the specular texture
+    }
+    
+    return float4( fLightColor, 1 );
+}
+
+//--------------------------------------------------------------------------------------
+// Solid color shading pixel shader (used for wireframe overlay)
+//--------------------------------------------------------------------------------------
+float4 SolidColorPS( PS_INPUT Input ) : SV_TARGET   // Input is not read
+{
+    return float4( g_vSolidColor, 1 );   // constant colour g_vSolidColor with alpha 1
+}
diff --git a/tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl b/tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl
new file mode 100644
index 000000000..c4401f010
--- /dev/null
+++ b/tests/hlsl/dxsdk/VarianceShadows11/2DQuadShaders.hlsl
@@ -0,0 +1,211 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSBlurX -entry PSBlurY
+//--------------------------------------------------------------------------------------
+// File: 2DQuadShaders.hlsl
+//
+// Quad vertex shader and separable blur pixel shaders for the VarianceShadows11 sample.  
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#ifndef SEPERABLE_BLUR_KERNEL_SIZE
+#define SEPERABLE_BLUR_KERNEL_SIZE 3
+#endif
+// Tap offsets cover [BLUR_KERNEL_BEGIN, BLUR_KERNEL_END); for the default size 3 that is -1, 0, 1 (assuming the division truncates toward zero).
+static const int BLUR_KERNEL_BEGIN = SEPERABLE_BLUR_KERNEL_SIZE / -2; 
+static const int BLUR_KERNEL_END = SEPERABLE_BLUR_KERNEL_SIZE / 2 + 1;  // NOTE(review): an even size gives size + 1 taps while the sum is still divided by size
+static const float FLOAT_BLUR_KERNEL_SIZE = (float)SEPERABLE_BLUR_KERNEL_SIZE;  // divisor applied to the tap sum in PSBlurX / PSBlurY
+
+cbuffer cbblurVS : register( b2)
+{
+	int2		g_iWidthHeight			: packoffset( c0 );    // scales Tex into ITex in VSMain
+	int		    g_iKernelStart  		: packoffset( c0.z );  // referenced only by the commented-out blur variants in this file
+	int		    g_iKernelEnd	        : packoffset( c0.w );  // referenced only by the commented-out blur variants in this file
+};
+
+//--------------------------------------------------------------------------------------
+// defines
+//--------------------------------------------------------------------------------------
+
+Texture2DArray g_txShadow : register( t5 );   // blur source; the pixel shaders below sample array slice 0
+SamplerState g_samShadow : register( s5 );    // sampler used for every g_txShadow fetch in this file
+
+//--------------------------------------------------------------------------------------
+// Input/Output structures
+//--------------------------------------------------------------------------------------
+
+struct PSIn
+{
+    float4      Pos	    : SV_Position;		//Position
+    float2      Tex	    : TEXCOORD;		    //Texture coordinate
+    float2      ITex    : TEXCOORD2;        //Tex multiplied by g_iWidthHeight (written by VSMain)
+};
+
+struct VSIn
+{
+    uint Pos	: SV_VertexID ;   // vertex index; VSMain derives the quad corner from it
+};
+
+
+PSIn VSMain(VSIn inn)
+{
+    // The vertex ID picks a corner of the quad: ID / 2 gives the column, ID % 2 the row.
+    const uint uColumn = inn.Pos / 2;
+    const uint uRow = inn.Pos % 2;
+
+    PSIn output;
+    output.Pos = float4( -1.0f + uColumn * 2.0f, -1.0f + uRow * 2.0f, .5, 1 );
+    // Tex.y is flipped relative to Pos.y; ITex is Tex scaled by g_iWidthHeight.
+    output.Tex = float2( uColumn, 1.0f - uRow );
+    output.ITex.x = (float)(g_iWidthHeight.x * output.Tex.x);
+    output.ITex.y = (float)(g_iWidthHeight.y * output.Tex.y);
+    return output;
+}
+
+//float PSDepth
+
+//------------------------------------------------------------------------------
+// Logarithmic filtering
+//------------------------------------------------------------------------------
+
+float log_conv ( float x0, float X, float y0, float Y )
+{
+    return X + log( x0 + y0 * exp( Y - X ) );  // algebraically log( x0 * exp(X) + y0 * exp(Y) ), factored around X
+}
+
+
+//--------------------------------------------------------------------------------------
+// Pixel shader for the X pass of the separable blur: averages taps of g_txShadow offset along X
+//--------------------------------------------------------------------------------------
+float2 PSBlurX(PSIn input) : SV_Target   // only the [unroll] loop near the end is live; the two blocks before it are disabled variants
+{	
+/*
+	float2 centerDistance;
+	if ( input.Tex.x  < .5 ) centerDistance.x = (1.0 - input.Tex.x);
+	else centerDistance.x = input.Tex.x;
+	if ( input.Tex.y  < .5 ) centerDistance.y = (1.0 - input.Tex.y);
+	else centerDistance.y = input.Tex.y;
+	if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y;
+	centerDistance.x -= .2;
+	centerDistance.x *= (1.0f / .8);
+
+    float store_samples[8];
+    int ind = 0;
+    for (int x = g_iKernelStart; x < g_iKernelEnd; ++x) {
+        store_samples[ind] = g_txShadow.Load( int3(input.ITex.x+(float)x * centerDistance.x , input.ITex.y, 0) ).r;
+        ind++;
+    }
+    const float c = (1.f/5.f);    
+    
+    float accum;
+    accum = log_conv( c, store_samples[0], c, store_samples[1] );    
+    
+    ind = 0;
+    for (x = g_iKernelStart - 2; x < g_iKernelEnd; ++x) {
+        ind++;
+        accum += log_conv( 1.0f, accum, c, store_samples[ind] );
+    }
+    float2 rt;
+    rt.x = accum;
+    return rt;
+    */
+    /*
+    float2 dep = 0;
+	float2 centerDistance;
+	if ( input.Tex.x  < .5 ) centerDistance.x = (1.0 - input.Tex.x);
+	else centerDistance.x = input.Tex.x;
+	if ( input.Tex.y  < .5 ) centerDistance.y = (1.0 - input.Tex.y);
+	else centerDistance.y = input.Tex.y;
+	if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y;
+	centerDistance.x -= .2;
+	centerDistance.x *= ( 1.0f / 0.8f );
+
+    for (int x = g_iKernelStart; x < g_iKernelEnd; ++x) {
+        dep += g_txShadow.Load( int3(input.ITex.x+(float)x * centerDistance.x , input.ITex.y, 0) ).rg;
+    }
+    dep /= (g_iKernelEnd - g_iKernelStart);
+    return dep;
+  */  
+  
+    float2 dep=0;   // running sum of the .rg channels over all taps
+    [unroll]for ( int x = BLUR_KERNEL_BEGIN; x < BLUR_KERNEL_END; ++x ) {
+        dep += g_txShadow.Sample( g_samShadow,  float3( input.Tex.x, input.Tex.y, 0 ), int2( x,0 ) ).rg;  // array slice 0, texel offset x along X
+    }
+    dep /= FLOAT_BLUR_KERNEL_SIZE;   // unweighted average of the taps (for the default kernel size)
+    return dep;  
+    
+//    return g_txShadow.Sample(g_samShadow,  float3(input.Tex.x, input.Tex.y, 0) ).rg;
+    
+}
+
+//--------------------------------------------------------------------------------------
+// Pixel shader for the Y pass of the separable blur: averages taps of g_txShadow offset along Y
+//--------------------------------------------------------------------------------------
+float2 PSBlurY(PSIn input) : SV_Target   // only the [unroll] loop near the end is live; the two blocks before it are disabled variants
+{	
+/*
+	float2 centerDistance;
+	if ( input.Tex.x  < .5 ) centerDistance.x = (1.0 - input.Tex.x);
+	else centerDistance.x = input.Tex.x;
+	if ( input.Tex.y  < .5 ) centerDistance.y = (1.0 - input.Tex.y);
+	else centerDistance.y = input.Tex.y;
+	if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y;
+	centerDistance.x -= .2;
+	centerDistance.x *= (1.0f / .8);
+	
+    float store_samples[8];
+    int ind = 0;
+    for (int y = g_iKernelStart; y < g_iKernelEnd; ++y) {
+        store_samples[ind] = g_txShadow.Load( int3(input.ITex.x, input.ITex.y+(float)y * centerDistance.x, 0) ).r;
+    }
+    const float c = (1.f/5.f);    
+    
+    float accum;
+    accum = log_conv( c, store_samples[0], c, store_samples[1] );    
+    
+    ind = 0;
+    for (y = g_iKernelStart; y < g_iKernelEnd; ++y) {
+        ind++;
+        accum += log_conv( 1.0f, accum, c, store_samples[ind] );
+    }
+    float2 rt;
+    rt.x = accum;
+    return rt;
+    */
+    
+    
+    /*    
+    float2 dep = 0;
+
+	float2 centerDistance;
+	if ( input.Tex.x  < .5 ) centerDistance.x = (1.0 - input.Tex.x);
+	else centerDistance.x = input.Tex.x;
+	if ( input.Tex.y  < .5 ) centerDistance.y = (1.0 - input.Tex.y);
+	else centerDistance.y = input.Tex.y;
+	if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y;
+	centerDistance.x -= 0;
+	centerDistance.x *= (1.0f / 1.0f);
+	
+	if (centerDistance.x < centerDistance.y) centerDistance.x = centerDistance.y;
+    for (int y = g_iKernelStart; y < g_iKernelEnd; ++y) {
+        dep += g_txShadow.Load( int3(input.ITex.x, input.ITex.y+(float)y * centerDistance.x, 0) ).rg;
+    }
+    
+    
+    dep /= (g_iKernelEnd - g_iKernelStart);
+    return dep;
+    
+    */
+    
+    
+    float2 dep=0;   // running sum of the .rg channels over all taps
+    [unroll]for ( int y = BLUR_KERNEL_BEGIN; y < BLUR_KERNEL_END; ++y ) {
+        dep += g_txShadow.Sample( g_samShadow,  float3( input.Tex.x, input.Tex.y, 0 ), int2( 0,y ) ).rg;  // array slice 0, texel offset y along Y
+    }
+    dep /= FLOAT_BLUR_KERNEL_SIZE;   // unweighted average of the taps (for the default kernel size)
+    return dep;  
+    
+    //return g_txShadow.Sample(g_samShadow,  float3(input.Tex.x, input.Tex.y, 0) ).rg;
+}
+
+
+
diff --git a/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl
new file mode 100644
index 000000000..0b2e43b5c
--- /dev/null
+++ b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceScene.hlsl
@@ -0,0 +1,412 @@
+//TEST_IGNORE_FILE: Currently failing due to Spire compiler issues.
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSMain
+//--------------------------------------------------------------------------------------
+// File: RenderVarianceScene.hlsl
+//
+// This is the main shader file.  This shader is compiled with several different flags 
+// to provide different customizations based on user controls.
+// 
+// 
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+
+// This flag enables the shadow to blend between cascades.  This is most useful when
+// the shadow maps are small and artifacts can be seen between the various cascade layers.
+#ifndef BLEND_BETWEEN_CASCADE_LAYERS_FLAG
+#define BLEND_BETWEEN_CASCADE_LAYERS_FLAG 0
+#endif
+
+// There are two methods for selecting the proper cascade a fragment lies in.  Interval selection
+// compares the depth of the fragment against the frustum's depth partition.
+// Map based selection compares the texture coordinates against the actual cascade maps.
+// Map based selection gives better coverage.  
+// Interval based selection is easier to extend and understand.
+#ifndef SELECT_CASCADE_BY_INTERVAL_FLAG
+#define SELECT_CASCADE_BY_INTERVAL_FLAG 0
+#endif
+
+// The number of cascades (the per-cascade arrays in this file hold 8 entries).
+#ifndef CASCADE_COUNT_FLAG
+#define CASCADE_COUNT_FLAG 3
+#endif
+
+
+// Most titles will find that 3-4 cascades with 
+// BLEND_BETWEEN_CASCADE_LAYERS_FLAG, is good for lower end PCs.
+
+cbuffer cbAllShadowData : register( b0 )
+{
+    matrix          m_mWorldViewProjection; // VSMain: object position -> SV_POSITION.
+    matrix          m_mWorld; // VSMain: only the upper 3x3 is used, to transform the normal.
+    matrix          m_mWorldView; // VSMain: the z of position * this matrix becomes the pixel depth.
+    matrix          m_mShadow; // VSMain: object position -> shadow view-space coordinate.
+    float4          m_vCascadeOffset[8]; // Per-cascade offset, added after the scale below.
+    float4          m_vCascadeScale[8]; // Per-cascade scale applied to the shadow view-space coordinate.
+    int             m_nCascadeLevels; // Number of Cascades
+    int             m_iVisualizeCascades; // 1 is to visualize the cascades in different colors. 0 is to just draw the scene
+
+    // For Map based selection scheme, this keeps the pixels inside of the valid range.
+    // When there is no border, these values are 0 and 1 respectively.
+    float           m_fMinBorderPadding;     
+    float           m_fMaxBorderPadding;
+                                          
+    float           m_fCascadeBlendArea; // Amount to overlap when blending between cascades.
+    float           m_fTexelSize; // Padding variables exist because CBs must be a multiple of 16 bytes.
+    float           m_fNativeTexelSizeInX; // Not referenced by the shaders in this file.
+    float4          m_fCascadeFrustumsEyeSpaceDepthsData[2];  // The values along Z that separate the cascades.
+    // This code creates an array based pointer that points towards the vectorized input data.
+    // This is the only way to index arbitrary arrays of data.
+    // If the array is used at run time, the compiler will generate code that uses logic to index the correct component.
+
+    static float    m_fCascadeFrustumsEyeSpaceDepths[8] = (float[8])m_fCascadeFrustumsEyeSpaceDepthsData;
+    
+    float3          m_vLightDir; // Dotted with the surface normal for the main light term in PSMain.
+    float           m_fPaddingCB4; // Presumably padding after the float3 above (never read in this file).
+
+};
+
+
+
+//--------------------------------------------------------------------------------------
+// Textures and Samplers
+//--------------------------------------------------------------------------------------
+Texture2D           g_txDiffuse             : register( t0 ); // Surface color, sampled in PSMain.
+Texture2DArray      g_txShadow              : register( t5 ); // Shadow maps, one array slice per cascade; .xy read as (depth, depth^2).
+
+SamplerState g_samLinear                    : register( s0 ); // Used with g_txDiffuse.
+SamplerState g_samShadow                    : register( s5 ); // Used with g_txShadow in CalculateVarianceShadow.
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT // Per-vertex attributes consumed by VSMain.
+{
+    float4 vPosition                        : POSITION; // Untransformed vertex position.
+    float3 vNormal                          : NORMAL; // Untransformed vertex normal.
+    float2 vTexcoord                        : TEXCOORD0; // Diffuse texture coordinate, passed through unchanged.
+};
+
+struct VS_OUTPUT // Written by VSMain, interpolated, and read by PSMain.
+{
+    float3 vNormal                          : NORMAL; // Normal multiplied by the upper 3x3 of m_mWorld.
+    float2 vTexcoord                        : COLOR0; // Diffuse texture coordinate (note: carried in the COLOR0 semantic).
+    float4 vTexShadow						: TEXCOORD1; // Position multiplied by m_mShadow.
+    float4 vPosition                        : SV_POSITION; // Position multiplied by m_mWorldViewProjection.
+    float4 vInterpPos                       : TEXCOORD2; // Copy of the untransformed input position.
+    float  vDepth                           : TEXCOORD3; // z of position multiplied by m_mWorldView.
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input ) // Fills every VS_OUTPUT field from the input vertex and the cbAllShadowData matrices.
+{
+    VS_OUTPUT Output;
+
+    Output.vPosition = mul( Input.vPosition, m_mWorldViewProjection ); // Position for the rasterizer.
+    Output.vNormal = mul( Input.vNormal, (float3x3)m_mWorld ); // The float3x3 cast drops the translation row/column.
+    Output.vTexcoord = Input.vTexcoord;
+    Output.vInterpPos = Input.vPosition;   // Untransformed position, forwarded as-is.
+    Output.vDepth = mul( Input.vPosition, m_mWorldView ).z ; // Depth used by interval based cascade selection in PSMain.
+       
+    // Transform the position into the shadow view space shared by all cascades; PSMain applies the per-cascade scale and offset.
+    Output.vTexShadow = mul( Input.vPosition, m_mShadow );
+        
+    return Output;
+}
+
+
+
+static const float4 vCascadeColorsMultiplier[8] = // Tint per cascade index; PSMain substitutes white when m_iVisualizeCascades is 0.
+{
+    float4 ( 1.5f, 0.0f, 0.0f, 1.0f ),
+    float4 ( 0.0f, 1.5f, 0.0f, 1.0f ),
+    float4 ( 0.0f, 0.0f, 5.5f, 1.0f ),
+    float4 ( 1.5f, 0.0f, 5.5f, 1.0f ),
+    float4 ( 1.5f, 1.5f, 0.0f, 1.0f ),
+    float4 ( 1.0f, 1.0f, 1.0f, 1.0f ),
+    float4 ( 0.0f, 1.0f, 5.5f, 1.0f ),
+    float4 ( 0.5f, 3.5f, 0.75f, 1.0f )
+};
+
+
+void ComputeCoordinatesTransform( in int iCascadeIndex, // Cascade whose scale/offset is applied; also stored in .z of the result.
+                                  in float4 InterpolatedPosition, // Not read by this function.
+                                  in out float4 vShadowTexCoord, // Result: xy = map coordinate, z = cascade index, w = depth.
+                                  in out float4 vShadowTexCoordViewSpace ) // Shadow view-space coordinate; only read here.
+{
+    // Interval based selection has not applied the cascade scale/offset yet, so do it here; map based selection already did so in PSMain.
+    if( SELECT_CASCADE_BY_INTERVAL_FLAG ) 
+    {
+        vShadowTexCoord = vShadowTexCoordViewSpace * m_vCascadeScale[iCascadeIndex];
+        vShadowTexCoord += m_vCascadeOffset[iCascadeIndex];
+    }  
+    vShadowTexCoord.w = vShadowTexCoord.z; // We put the z value in w so that we can index the texture array with Z.
+    vShadowTexCoord.z = iCascadeIndex;
+    
+} 
+
+//--------------------------------------------------------------------------------------
+// Sample the two depth moments from the shadow map array and return a percent lit value.
+//--------------------------------------------------------------------------------------
+void CalculateVarianceShadow ( in float4 vShadowTexCoord, in float4 vShadowMapTextureCoordViewSpace, int iCascade, out float fPercentLit ) 
+{
+    fPercentLit = 0.0f;
+    // NOTE(review): the original remark here described unrolling a sampling loop, but this function
+    // contains no loop; a single SampleGrad below fetches both moments.
+	        
+    float2 mapDepth = 0;
+
+
+    // In order to pull the derivative out of divergent flow control we calculate the 
+    // derivative off of the view space coordinates and then scale the derivative.
+    
+    float3 vShadowTexCoordDDX = // The float4 result of ddx is implicitly truncated to float3.
+		ddx(vShadowMapTextureCoordViewSpace );
+    vShadowTexCoordDDX *= m_vCascadeScale[iCascade].xyz; 
+    float3 vShadowTexCoordDDY = // The float4 result of ddy is implicitly truncated to float3.
+		ddy(vShadowMapTextureCoordViewSpace );
+    vShadowTexCoordDDY *= m_vCascadeScale[iCascade].xyz; 
+    
+    mapDepth += g_txShadow.SampleGrad( g_samShadow, vShadowTexCoord.xyz, 
+									   vShadowTexCoordDDX,
+									   vShadowTexCoordDDY);
+    // The sample instruction uses gradients for some filters.
+		        
+    float  fAvgZ  = mapDepth.x; // Filtered z
+    float  fAvgZ2 = mapDepth.y; // Filtered z-squared
+    
+    if ( vShadowTexCoord.w <= fAvgZ ) // We put the z value in w so that we can index the texture array with Z.
+    {
+        fPercentLit = 1; // The fragment is not behind the mean occluder depth: fully lit.
+	}
+	else 
+	{
+	    float variance = ( fAvgZ2 ) - ( fAvgZ * fAvgZ ); // E[z^2] - E[z]^2
+        variance       = min( 1.0f, max( 0.0f, variance + 0.00001f ) ); // Small bias, then clamp to [0, 1].
+    
+        float mean     = fAvgZ;
+        float d        = vShadowTexCoord.w - mean; // We put the z value in w so that we can index the texture array with Z.
+        float p_max    = variance / ( variance + d*d ); // One-sided Chebyshev upper bound on the lit fraction.
+
+        // To combat light-bleeding, experiment with raising p_max to some power
+        // (Try values from 0.1 to 100.0, if you like.)
+        fPercentLit = pow( p_max, 4 );
+	    
+	}
+    
+}
+
+//--------------------------------------------------------------------------------------
+// Calculate amount to blend between two cascades and the band where blending will occur.
+//--------------------------------------------------------------------------------------
+void CalculateBlendAmountForInterval ( in int iNextCascadeIndex, 
+                                       in out float fPixelDepth, 
+                                       in out float fCurrentPixelsBlendBandLocation,
+                                       out float fBlendBetweenCascadesAmount
+                                       ) 
+{
+
+    // We need to calculate the band of the current shadow map where it will fade into the next cascade.
+    // PSMain uses the band location to decide whether the second shadow lookup is needed.
+    // NOTE(review): indexes [ iNextCascadeIndex - 1 ], so this assumes iNextCascadeIndex >= 1 -- confirm against callers.
+    float fBlendInterval = m_fCascadeFrustumsEyeSpaceDepths[ iNextCascadeIndex - 1 ];
+    if( iNextCascadeIndex > 1 ) 
+    {
+        fPixelDepth -= m_fCascadeFrustumsEyeSpaceDepths[ iNextCascadeIndex-2 ]; // Make the depth relative to the start of the current cascade.
+        fBlendInterval -= m_fCascadeFrustumsEyeSpaceDepths[ iNextCascadeIndex-2 ]; // Length of the current cascade's depth interval.
+    } 
+    // The current pixel's blend band location will be used to determine when we need to blend and by how much.
+    fCurrentPixelsBlendBandLocation = fPixelDepth / fBlendInterval;
+    fCurrentPixelsBlendBandLocation = 1.0f - fCurrentPixelsBlendBandLocation;
+    // The fBlendBetweenCascadesAmount is our location in the blend band.
+    fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea;
+}
+
+
+//--------------------------------------------------------------------------------------
+// Calculate amount to blend between two cascades and the band where blending will occur.
+//--------------------------------------------------------------------------------------
+void CalculateBlendAmountForMap ( in float4 vShadowMapTextureCoord, 
+                                  in out float fCurrentPixelsBlendBandLocation,
+                                  out float fBlendBetweenCascadesAmount ) 
+{
+    // Calculate the blend band for the map based selection: the smallest of x, y, 1-x and 1-y of the map coordinate.
+    float2 distanceToOne = float2 ( 1.0f - vShadowMapTextureCoord.x, 1.0f - vShadowMapTextureCoord.y );
+    fCurrentPixelsBlendBandLocation = min( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y );
+    float fCurrentPixelsBlendBandLocation2 = min( distanceToOne.x, distanceToOne.y );
+    fCurrentPixelsBlendBandLocation = 
+        min( fCurrentPixelsBlendBandLocation, fCurrentPixelsBlendBandLocation2 );
+    fBlendBetweenCascadesAmount = fCurrentPixelsBlendBandLocation / m_fCascadeBlendArea;
+}
+
+//--------------------------------------------------------------------------------------
+// Calculate the shadow based on several options and render the scene.
+//--------------------------------------------------------------------------------------
+
+float4 PSMain( VS_OUTPUT Input ) : SV_TARGET
+{
+    float4 vDiffuse = g_txDiffuse.Sample( g_samLinear, Input.vTexcoord );
+    
+    
+    float4 vShadowMapTextureCoordViewSpace = 0.0f;
+    float4 vShadowMapTextureCoord = 0.0f;
+    float4 vShadowMapTextureCoord_blend = 0.0f;
+    
+    float4 vVisualizeCascadeColor = float4(0.0f,0.0f,0.0f,1.0f);
+    
+    float fPercentLit = 0.0f;
+    float fPercentLit_blend = 0.0f;
+
+    int iCascadeFound = 0;
+    int iCurrentCascadeIndex=1; // Overwritten with 0 by whichever selection branch below is compiled in.
+    int iNextCascadeIndex = 0;
+
+    float fCurrentPixelDepth;
+
+    // The interval based selection technique compares the pixel's depth against the frustum's cascade divisions.
+    fCurrentPixelDepth = Input.vDepth;
+    
+    // This for loop is not necessary when the frustum is uniformly divided and interval based selection is used.
+    // In this case fCurrentPixelDepth could be used as an array lookup into the correct frustum. 
+    vShadowMapTextureCoordViewSpace = Input.vTexShadow;
+    
+    
+    if( SELECT_CASCADE_BY_INTERVAL_FLAG ) 
+    {
+        iCurrentCascadeIndex = 0;
+        if (CASCADE_COUNT_FLAG > 1 ) 
+        {
+            float4 vCurrentPixelDepth = Input.vDepth; // Scalar depth replicated into all four components.
+            float4 fComparison = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsData[0]); // 1 per split (0-3) the pixel lies beyond.
+            float4 fComparison2 = ( vCurrentPixelDepth > m_fCascadeFrustumsEyeSpaceDepthsData[1]); // Same for splits 4-7.
+            float fIndex = dot( // Counts the exceeded splits, masked to the cascades that are enabled.
+                            float4( CASCADE_COUNT_FLAG > 0,
+                                    CASCADE_COUNT_FLAG > 1, 
+                                    CASCADE_COUNT_FLAG > 2, 
+                                    CASCADE_COUNT_FLAG > 3)
+                            , fComparison )
+                         + dot( 
+                            float4(
+                                    CASCADE_COUNT_FLAG > 4,
+                                    CASCADE_COUNT_FLAG > 5,
+                                    CASCADE_COUNT_FLAG > 6,
+                                    CASCADE_COUNT_FLAG > 7)
+                            , fComparison2 ) ;
+                                    
+            fIndex = min( fIndex, CASCADE_COUNT_FLAG - 1 ); // Clamp to the last valid cascade.
+            iCurrentCascadeIndex = (int)fIndex;
+        }
+    }
+    
+    if ( !SELECT_CASCADE_BY_INTERVAL_FLAG ) 
+    {
+        iCurrentCascadeIndex = 0;
+        if ( CASCADE_COUNT_FLAG == 1 ) 
+        {
+            vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[0];
+            vShadowMapTextureCoord += m_vCascadeOffset[0];
+        }
+        if ( CASCADE_COUNT_FLAG > 1 ) {
+            for( int iCascadeIndex = 0; iCascadeIndex < CASCADE_COUNT_FLAG && iCascadeFound == 0; ++iCascadeIndex ) 
+            {
+                vShadowMapTextureCoord = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iCascadeIndex];
+                vShadowMapTextureCoord += m_vCascadeOffset[iCascadeIndex];
+
+                if ( min( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) > m_fMinBorderPadding
+                  && max( vShadowMapTextureCoord.x, vShadowMapTextureCoord.y ) < m_fMaxBorderPadding )
+                { 
+                    iCurrentCascadeIndex = iCascadeIndex;   // First cascade whose padded map contains the pixel.
+                    iCascadeFound = 1; 
+                }
+            }
+        }
+    }    
+    // Found the correct map.  NOTE(review): if no cascade passes the border test above, the index stays 0 while vShadowMapTextureCoord holds the last cascade's transform -- confirm intended.
+    vVisualizeCascadeColor = vCascadeColorsMultiplier[iCurrentCascadeIndex];
+    
+    ComputeCoordinatesTransform( iCurrentCascadeIndex, Input.vInterpPos, vShadowMapTextureCoord, vShadowMapTextureCoordViewSpace  );    
+                                             
+    if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG && CASCADE_COUNT_FLAG > 1 ) 
+    {
+        // Repeat tex coord calculations for the next cascade. 
+        // The next cascade index is used for blurring between maps.
+        iNextCascadeIndex = min ( CASCADE_COUNT_FLAG - 1, iCurrentCascadeIndex + 1 ); 
+        if( !SELECT_CASCADE_BY_INTERVAL_FLAG ) 
+        {
+            vShadowMapTextureCoord_blend = vShadowMapTextureCoordViewSpace * m_vCascadeScale[iNextCascadeIndex];
+            vShadowMapTextureCoord_blend += m_vCascadeOffset[iNextCascadeIndex];
+        }
+        ComputeCoordinatesTransform( iNextCascadeIndex, Input.vInterpPos, vShadowMapTextureCoord_blend, vShadowMapTextureCoordViewSpace );  
+    }            
+    float fBlendBetweenCascadesAmount = 1.0f;
+    float fCurrentPixelsBlendBandLocation = 1.0f;
+    
+    if( SELECT_CASCADE_BY_INTERVAL_FLAG ) 
+    {
+        if( CASCADE_COUNT_FLAG > 1 && BLEND_BETWEEN_CASCADE_LAYERS_FLAG ) 
+        {
+            CalculateBlendAmountForInterval ( iNextCascadeIndex, fCurrentPixelDepth, 
+                fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount );
+            
+        }   
+    }
+    else 
+    {
+        if( CASCADE_COUNT_FLAG > 1 && BLEND_BETWEEN_CASCADE_LAYERS_FLAG ) 
+        {
+            CalculateBlendAmountForMap ( vShadowMapTextureCoord, 
+                fCurrentPixelsBlendBandLocation, fBlendBetweenCascadesAmount );
+        }   
+    }
+    
+    // Because the Z coordinate specifies the texture array,
+    // the derivative will be 0 when there is no divergence
+    //float fDivergence = abs( ddy( vShadowMapTextureCoord.z ) ) +  abs( ddx( vShadowMapTextureCoord.z ) );
+    CalculateVarianceShadow ( vShadowMapTextureCoord, vShadowMapTextureCoordViewSpace, 
+								iCurrentCascadeIndex, fPercentLit);
+								
+    // We repeat the calculation for the next cascade layer, when blending between maps.
+    if( BLEND_BETWEEN_CASCADE_LAYERS_FLAG  && CASCADE_COUNT_FLAG > 1 ) 
+    {
+        if( fCurrentPixelsBlendBandLocation < m_fCascadeBlendArea ) 
+        {  // the current pixel is within the blend band.
+
+			// Because the Z coordinate specifies the texture array,
+			// the derivative will be 0 when there is no divergence
+			float fDivergence = abs( ddy( vShadowMapTextureCoord_blend.z ) ) +  
+				abs( ddx( vShadowMapTextureCoord_blend.z) ); // NOTE(review): fDivergence is never read afterwards.
+            CalculateVarianceShadow ( vShadowMapTextureCoord_blend, vShadowMapTextureCoordViewSpace, 
+										iNextCascadeIndex, fPercentLit_blend );
+
+            // Blend the two calculated shadows by the blend amount.
+            fPercentLit = lerp( fPercentLit_blend, fPercentLit, fBlendBetweenCascadesAmount ); 
+
+        }   
+    }    
+  
+    if( !m_iVisualizeCascades ) vVisualizeCascadeColor = float4( 1.0f, 1.0f, 1.0f, 1.0f );
+    
+    float3 vLightDir1 = float3( -1.0f, 1.0f, -1.0f ); 
+    float3 vLightDir2 = float3( 1.0f, 1.0f, -1.0f ); 
+    float3 vLightDir3 = float3( 0.0f, -1.0f, 0.0f );
+    float3 vLightDir4 = float3( 1.0f, 1.0f, 1.0f );     
+    // Some ambient-like lighting.
+    float fLighting = 
+                      saturate( dot( vLightDir1 , Input.vNormal ) )*0.05f +
+                      saturate( dot( vLightDir2 , Input.vNormal ) )*0.05f +
+                      saturate( dot( vLightDir3 , Input.vNormal ) )*0.05f +
+                      saturate( dot( vLightDir4 , Input.vNormal ) )*0.05f ;
+    
+    float4 vShadowLighting = fLighting * 0.5f; // Fully shadowed level: half of the ambient-like term.
+    fLighting += saturate( dot( m_vLightDir , Input.vNormal ) ); // Add the main light.
+    fLighting = lerp( vShadowLighting, fLighting, fPercentLit ); // The float4 lerp result is implicitly truncated to a scalar.
+    
+    return fLighting * vVisualizeCascadeColor * vDiffuse;
+
+}
+
diff --git a/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl
new file mode 100644
index 000000000..9837bf299
--- /dev/null
+++ b/tests/hlsl/dxsdk/VarianceShadows11/RenderVarianceShadow.hlsl
@@ -0,0 +1,45 @@
+//TEST:COMPARE_HLSL: -target dxbc-assembly -profile vs_4_0 -entry VSMain -profile ps_4_0 -entry PSMain
+
+
+//--------------------------------------------------------------------------------------
+// Globals
+//--------------------------------------------------------------------------------------
+cbuffer cbPerObject : register( b0 )
+{
+	matrix		g_mWorldViewProjection	: packoffset( c0 ); // Only constant in this pass; applied to the vertex position in VSMain.
+};
+
+//--------------------------------------------------------------------------------------
+// Input / Output structures
+//--------------------------------------------------------------------------------------
+struct VS_INPUT // This pass reads only the vertex position.
+{
+	float4 vPosition	: POSITION;
+};
+
+struct VS_OUTPUT // PSMain reads the z component of this position as the depth to store.
+{
+	float4 vPosition	: SV_POSITION;
+};
+
+//--------------------------------------------------------------------------------------
+// Vertex Shader
+//--------------------------------------------------------------------------------------
+VS_OUTPUT VSMain( VS_INPUT Input ) // Transforms the vertex position by g_mWorldViewProjection; nothing else is output.
+{
+	VS_OUTPUT Output;
+	
+	
+	Output.vPosition = mul( Input.vPosition, g_mWorldViewProjection );
+
+	return Output;
+}
+
+
+float2 PSMain (VS_OUTPUT Input) : SV_TARGET 
+{
+    float2 rt; // Two-channel output: (depth, depth squared).
+    rt.x = Input.vPosition.z; // z of the SV_POSITION input.
+    rt.y = rt.x * rt.x; // Presumably the second moment read as "filtered z-squared" by the variance scene shader -- confirm.
+    return rt;
+}
\ No newline at end of file
diff --git a/tests/hlsl/simple/compute-numthreads.hlsl b/tests/hlsl/simple/compute-numthreads.hlsl
new file mode 100644
index 000000000..3843c401f
--- /dev/null
+++ b/tests/hlsl/simple/compute-numthreads.hlsl
@@ -0,0 +1,11 @@
+//TEST:COMPARE_HLSL: -no-checking -target dxbc-assembly -profile cs_5_0 -entry main
+
+// Confirm that we properly pass along the `numthreads` attribute on an entry point.
+
+RWStructuredBuffer<float> b; // Read and written by main below.
+
+[numthreads(32,1,1)] // The attribute under test: 32 x 1 x 1 threads per group.
+void main(uint3 tid : SV_DispatchThreadID)
+{
+	b[tid.x] = b[tid.x + 1] + 1.0f; // Stores the next element's value plus one.  NOTE(review): no barrier is visible, so the read of b[tid.x + 1] can race with another thread's write; presumably irrelevant because the test only compares compiled output -- confirm.
+}
\ No newline at end of file
-- 
cgit v1.2.3