59 files changed, 1984 insertions, 832 deletions
diff --git a/build/visual-studio/slang/slang.vcxproj b/build/visual-studio/slang/slang.vcxproj
index 9971333d7..e97d6a2b1 100644
--- a/build/visual-studio/slang/slang.vcxproj
+++ b/build/visual-studio/slang/slang.vcxproj
@@ -407,6 +407,7 @@ IF EXIST ..\..\..\external\slang-glslang\bin\windows-aarch64\release\slang-glsla
     <ClInclude Include="..\..\..\source\slang\slang-ir-missing-return.h" />
     <ClInclude Include="..\..\..\source\slang\slang-ir-optix-entry-point-uniforms.h" />
     <ClInclude Include="..\..\..\source\slang\slang-ir-peephole.h" />
+    <ClInclude Include="..\..\..\source\slang\slang-ir-propagate-func-properties.h" />
     <ClInclude Include="..\..\..\source\slang\slang-ir-redundancy-removal.h" />
     <ClInclude Include="..\..\..\source\slang\slang-ir-remove-unused-generic-param.h" />
     <ClInclude Include="..\..\..\source\slang\slang-ir-restructure-scoping.h" />
@@ -591,6 +592,7 @@ IF EXIST ..\..\..\external\slang-glslang\bin\windows-aarch64\release\slang-glsla
     <ClCompile Include="..\..\..\source\slang\slang-ir-missing-return.cpp" />
     <ClCompile Include="..\..\..\source\slang\slang-ir-optix-entry-point-uniforms.cpp" />
     <ClCompile Include="..\..\..\source\slang\slang-ir-peephole.cpp" />
+    <ClCompile Include="..\..\..\source\slang\slang-ir-propagate-func-properties.cpp" />
     <ClCompile Include="..\..\..\source\slang\slang-ir-redundancy-removal.cpp" />
     <ClCompile Include="..\..\..\source\slang\slang-ir-remove-unused-generic-param.cpp" />
     <ClCompile Include="..\..\..\source\slang\slang-ir-restructure-scoping.cpp" />
diff --git a/build/visual-studio/slang/slang.vcxproj.filters b/build/visual-studio/slang/slang.vcxproj.filters
index 839182de5..64267db4b 100644
--- a/build/visual-studio/slang/slang.vcxproj.filters
+++ b/build/visual-studio/slang/slang.vcxproj.filters
@@ -327,6 +327,9 @@
     <ClInclude Include="..\..\..\source\slang\slang-ir-peephole.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\..\source\slang\slang-ir-propagate-func-properties.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
     <ClInclude Include="..\..\..\source\slang\slang-ir-redundancy-removal.h">
       <Filter>Header Files</Filter>
     </ClInclude>
@@ -875,6 +878,9 @@
     <ClCompile Include="..\..\..\source\slang\slang-ir-peephole.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\source\slang\slang-ir-propagate-func-properties.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\source\slang\slang-ir-redundancy-removal.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 6357d58bd..9da33c755 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -2525,21 +2525,25 @@ int __SyntaxError();
 __generic<T>
 __target_intrinsic(cuda, "sizeof($G0)")
 __target_intrinsic(cpp, "sizeof($G0)")
+[__readNone]
 int __sizeOf();
 
 __generic<T>
 __target_intrinsic(cuda, "sizeof($T0)")
 __target_intrinsic(cpp, "sizeof($T0)")
+[__readNone]
 int __sizeOf(T v);
 
 __generic<T>
 __target_intrinsic(cuda, "SLANG_ALIGN_OF($G0)")
 __target_intrinsic(cpp, "SLANG_ALIGN_OF($G0)")
+[__readNone]
 int __alignOf();
 
 __generic<T>
 __target_intrinsic(cuda, "SLANG_ALIGN_OF($T0)")
 __target_intrinsic(cpp, "SLANG_ALIGN_OF($T0)")
+[__readNone]
 int __alignOf(T v);
 
 // It would be nice to have offsetof equivalent, but it's not clear how that would work in terms of the Slang language.
@@ -2547,6 +2551,7 @@ int __alignOf(T v);
 __generic<T,F>
 __target_intrinsic(cuda, "int(((char*)&($1)) - ((char*)&($0)))")
-__target_intrinsic(cpp, "int(((char*)&($1)) - ((char*)&($0))")
+__target_intrinsic(cpp, "int(((char*)&($1)) - ((char*)&($0)))")
+[__readNone]
 int __offsetOf(in T t, in F field);
 
 /// Mark beginning of "interlocked" operations in a fragment shader.
@@ -2960,6 +2965,9 @@ attribute_syntax [builtin] : BuiltinAttribute;
 __attributeTarget(DeclBase)
 attribute_syntax [__requiresNVAPI] : RequiresNVAPIAttribute;
 
+__attributeTarget(DeclBase)
+attribute_syntax [__AlwaysFoldIntoUseSiteAttribute] : AlwaysFoldIntoUseSiteAttribute;
+
 __attributeTarget(FunctionDeclBase)
 attribute_syntax [noinline] : NoInlineAttribute;
 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 7e75d06b3..37cdc205e 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -778,6 +778,7 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_abs($0)")
 __target_intrinsic(cpp, "$P_abs($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
+[__readNone]
 T abs(T x);
 /*{
     // Note: this simple definition may not be appropriate for floating-point inputs
@@ -788,6 +789,7 @@ __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
+[__readNone]
 vector<T, N> abs(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, abs, x);
@@ -795,6 +797,7 @@ vector<T, N> abs(vector<T, N> x)
 
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> abs(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, abs, x);
@@ -806,12 +809,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_abs($0)")
 __target_intrinsic(cpp, "$P_abs($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
+[__readNone]
 T abs(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
+[__readNone]
 vector<T, N> abs(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, abs, x);
@@ -819,6 +824,7 @@ vector<T, N> abs(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> abs(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, abs, x);
@@ -832,12 +838,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_acos($0)")
 __target_intrinsic(cpp, "$P_acos($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Acos _0")
+[__readNone]
 T acos(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Acos _0")
+[__readNone]
 vector<T, N> acos(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, acos, x);
@@ -845,6 +853,7 @@ vector<T, N> acos(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> acos(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, acos, x);
@@ -855,11 +864,13 @@ __generic<T : __BuiltinType>
 __target_intrinsic(cpp, "bool($0)")
 __target_intrinsic(cuda, "bool($0)")
 __target_intrinsic(glsl, "bool($0)")
+[__readNone]
 bool all(T x);
 
 __generic<T : __BuiltinType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "all(bvec$N0($0))")
+[__readNone]
 bool all(vector<T,N> x)
 {
     bool result = true;
@@ -870,6 +881,7 @@ bool all(vector<T,N> x)
 
 __generic<T : __BuiltinType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 bool all(matrix<T,N,M> x)
 {
     bool result = true;
@@ -894,11 +906,13 @@ __generic<T : __BuiltinType>
 __target_intrinsic(cpp, "bool($0)")
 __target_intrinsic(cuda, "bool($0)")
 __target_intrinsic(glsl, "bool($0)")
+[__readNone]
 bool any(T x);
 
 __generic<T : __BuiltinType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "any(bvec$N0($0))")
+[__readNone]
 bool any(vector<T, N> x)
 {
     bool result = false;
@@ -909,6 +923,7 @@ bool any(vector<T, N> x)
 
 __generic<T : __BuiltinType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 bool any(matrix<T, N, M> x)
 {
     bool result = false;
@@ -926,6 +941,7 @@ __target_intrinsic(cpp, "$P_asdouble($0, $1)")
 __target_intrinsic(cuda, "$P_asdouble($0, $1)")
 __target_intrinsic(spirv_direct, "%v = OpCompositeConstruct _type(uint2) resultId _0 _1; OpExtInst resultType resultId glsl450 59 %v")
 __glsl_extension(GL_ARB_gpu_shader5)
+[__readNone]
 double asdouble(uint lowbits, uint highbits);
 
 // Reinterpret bits as a float (HLSL SM 4.0)
@@ -935,6 +951,7 @@ __target_intrinsic(glsl, "intBitsToFloat")
 __target_intrinsic(cpp, "$P_asfloat($0)")
 __target_intrinsic(cuda, "$P_asfloat($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 float asfloat(int x);
 
 __target_intrinsic(hlsl)
@@ -942,12 +959,14 @@ __target_intrinsic(glsl, "uintBitsToFloat")
 __target_intrinsic(cpp, "$P_asfloat($0)")
 __target_intrinsic(cuda, "$P_asfloat($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 float asfloat(uint x);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "intBitsToFloat")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<float, N> asfloat(vector< int, N> x)
 {
     VECTOR_MAP_UNARY(float, N, asfloat, x);
@@ -957,6 +976,7 @@ __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "uintBitsToFloat")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<float,N> asfloat(vector<uint,N> x)
 {
     VECTOR_MAP_UNARY(float, N, asfloat, x);
@@ -964,6 +984,7 @@ vector<float,N> asfloat(vector<uint,N> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<float,N,M> asfloat(matrix< int,N,M> x)
 {
     MATRIX_MAP_UNARY(float, N, M, asfloat, x);
@@ -971,6 +992,7 @@ matrix<float,N,M> asfloat(matrix< int,N,M> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<float,N,M> asfloat(matrix<uint,N,M> x)
 {
     MATRIX_MAP_UNARY(float, N, M, asfloat, x);
@@ -978,16 +1000,19 @@ matrix<float,N,M> asfloat(matrix<uint,N,M> x)
 
 // No op
 [__unsafeForceInlineEarly]
+[__readNone]
 float asfloat(float x)
 { return x; }
 
 __generic<let N : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 vector<float,N> asfloat(vector<float,N> x)
 { return x; }
 
 __generic<let N : int, let M : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 matrix<float,N,M> asfloat(matrix<float,N,M> x)
 { return x; }
 
@@ -998,12 +1023,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_asin($0)")
 __target_intrinsic(cpp, "$P_asin($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Asin _0")
+[__readNone]
 T asin(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Asin _0")
+[__readNone]
 vector<T, N> asin(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T,N,asin,x);
@@ -1011,6 +1038,7 @@ vector<T, N> asin(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> asin(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T,N,M,asin,x);
@@ -1023,6 +1051,7 @@ __target_intrinsic(glsl, "floatBitsToInt")
 __target_intrinsic(cpp, "$P_asint($0)")
 __target_intrinsic(cuda, "$P_asint($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 int asint(float x);
 
 __target_intrinsic(hlsl)
@@ -1030,12 +1059,14 @@ __target_intrinsic(glsl, "int($0)")
 __target_intrinsic(cpp, "$P_asint($0)")
 __target_intrinsic(cuda, "$P_asint($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 int asint(uint x);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "floatBitsToInt")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<int, N> asint(vector<float, N> x)
 {
     VECTOR_MAP_UNARY(int, N, asint, x);
@@ -1045,6 +1076,7 @@ __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "ivec$N0($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<int, N> asint(vector<uint, N> x)
 {
     VECTOR_MAP_UNARY(int, N, asint, x);
@@ -1052,6 +1084,7 @@ vector<int, N> asint(vector<uint, N> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<int, N, M> asint(matrix<float, N, M> x)
 {
     MATRIX_MAP_UNARY(int, N, M, asint, x);
@@ -1059,6 +1092,7 @@ matrix<int, N, M> asint(matrix<float, N, M> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<int, N, M> asint(matrix<uint, N, M> x)
 {
     MATRIX_MAP_UNARY(int, N, M, asint, x);
@@ -1066,16 +1100,19 @@ matrix<int, N, M> asint(matrix<uint, N, M> x)
 
 // No op
 [__unsafeForceInlineEarly]
+[__readNone]
 int asint(int x)
 { return x; }
 
 __generic<let N : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 vector<int,N> asint(vector<int,N> x)
 { return x; }
 
 __generic<let N : int, let M : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 matrix<int,N,M> asint(matrix<int,N,M> x)
 { return x; }
 
@@ -1086,6 +1123,7 @@ __target_intrinsic(glsl, "{ uvec2 v = unpackDouble2x32($0); $1 = v.x; $2 = v.y;
 __glsl_extension(GL_ARB_gpu_shader5)
 __target_intrinsic(cpp, "$P_asuint($0, $1, $2)")
 __target_intrinsic(cuda, "$P_asuint($0, $1, $2)")
+[__readNone]
 void asuint(double value, out uint lowbits, out uint highbits);
 
 // Reinterpret bits as a uint (HLSL SM 4.0)
@@ -1095,6 +1133,7 @@ __target_intrinsic(glsl, "floatBitsToUint")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
 __target_intrinsic(cpp, "$P_asuint($0)")
 __target_intrinsic(cuda, "$P_asuint($0)")
+[__readNone]
 uint asuint(float x);
 
 __target_intrinsic(hlsl)
@@ -1102,12 +1141,14 @@ __target_intrinsic(glsl, "uint($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
 __target_intrinsic(cpp, "$P_asuint($0)")
 __target_intrinsic(cuda, "$P_asuint($0)")
+[__readNone]
 uint asuint(int x);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "floatBitsToUint")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<uint,N> asuint(vector<float,N> x)
 {
     VECTOR_MAP_UNARY(uint, N, asuint, x);
@@ -1117,6 +1158,7 @@ __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "uvec$N0($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<uint, N> asuint(vector<int, N> x)
 {
     VECTOR_MAP_UNARY(uint, N, asuint, x);
@@ -1124,6 +1166,7 @@ vector<uint, N> asuint(vector<int, N> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<uint,N,M> asuint(matrix<float,N,M> x)
 {
     MATRIX_MAP_UNARY(uint, N, M, asuint, x);
@@ -1131,22 +1174,26 @@ matrix<uint,N,M> asuint(matrix<float,N,M> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<uint, N, M> asuint(matrix<int, N, M> x)
 {
     MATRIX_MAP_UNARY(uint, N, M, asuint, x);
 }
 
 [__unsafeForceInlineEarly]
+[__readNone]
 uint asuint(uint x)
 { return x; }
 
 __generic<let N : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 vector<uint,N> asuint(vector<uint,N> x)
 { return x; }
 
 __generic<let N : int, let M : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 matrix<uint,N,M> asuint(matrix<uint,N,M> x)
 { return x; }
 
@@ -1159,38 +1206,41 @@ matrix<uint,N,M> asuint(matrix<uint,N,M> x)
 
 // Identity cases:
 
-[__unsafeForceInlineEarly] float16_t asfloat16(float16_t value) { return value; }
-[__unsafeForceInlineEarly] vector<float16_t,N> asfloat16<let N : int>(vector<float16_t,N> value) { return value; }
-[__unsafeForceInlineEarly] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] float16_t asfloat16(float16_t value) { return value; }
+[__unsafeForceInlineEarly][__readNone] vector<float16_t,N> asfloat16<let N : int>(vector<float16_t,N> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return value; }
 
-[__unsafeForceInlineEarly] int16_t asint16(int16_t value) { return value; }
-[__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<int16_t,N> value) { return value; }
-[__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] int16_t asint16(int16_t value) { return value; }
+[__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<int16_t,N> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
 
-[__unsafeForceInlineEarly] uint16_t asuint16(uint16_t value) { return value; }
-[__unsafeForceInlineEarly] vector<uint16_t,N> asuint16<let N : int>(vector<uint16_t,N> value) { return value; }
-[__unsafeForceInlineEarly] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(uint16_t value) { return value; }
+[__unsafeForceInlineEarly][__readNone] vector<uint16_t,N> asuint16<let N : int>(vector<uint16_t,N> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
 
 // Signed<->unsigned cases:
 
-[__unsafeForceInlineEarly] int16_t asint16(uint16_t value) { return value; }
-[__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<uint16_t,N> value) { return value; }
-[__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] int16_t asint16(uint16_t value) { return value; }
+[__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<uint16_t,N> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
 
-[__unsafeForceInlineEarly] uint16_t asuint16(int16_t value) { return value; }
-[__unsafeForceInlineEarly] vector<uint16_t,N> asuint16<let N : int>(vector<int16_t,N> value) { return value; }
-[__unsafeForceInlineEarly] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(int16_t value) { return value; }
+[__unsafeForceInlineEarly][__readNone] vector<uint16_t,N> asuint16<let N : int>(vector<int16_t,N> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
 
 // Float->unsigned cases:
 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "uint16_t(packHalf2x16(vec2($0, 0.0)))")
 __target_intrinsic(cuda, "__half_as_ushort")
+[__readNone]
 uint16_t asuint16(float16_t value);
 
+[__readNone]
 vector<uint16_t,N> asuint16<let N : int>(vector<float16_t,N> value)
 { VECTOR_MAP_UNARY(uint16_t, N, asuint16, value); }
 
+[__readNone]
 matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<float16_t,R,C> value)
 { MATRIX_MAP_UNARY(uint16_t, R, C, asuint16, value); }
 
@@ -1199,11 +1249,14 @@ matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<float16_t,R,C> va
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "float16_t(unpackHalf2x16($0).x)")
 __target_intrinsic(cuda, "__ushort_as_half")
+[__readNone]
 float16_t asfloat16(uint16_t value);
 
+[__readNone]
 vector<float16_t,N> asfloat16<let N : int>(vector<uint16_t,N> value)
 { VECTOR_MAP_UNARY(float16_t, N, asfloat16, value); }
 
+[__readNone]
 matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<uint16_t,R,C> value)
 { MATRIX_MAP_UNARY(float16_t, R, C, asfloat16, value); }
 
@@ -1211,16 +1264,17 @@ matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<uint16_t,R,C> v
 
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "__half_as_short")
-[__unsafeForceInlineEarly] int16_t asint16(float16_t value) { return asuint16(value); }
-__target_intrinsic(hlsl) [__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<float16_t,N> value) { return asuint16(value); }
-__target_intrinsic(hlsl) [__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return asuint16(value); }
+[__unsafeForceInlineEarly][__readNone] int16_t asint16(float16_t value) { return asuint16(value); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<float16_t,N> value) { return asuint16(value); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return asuint16(value); }
 
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "__short_as_half")
-[__unsafeForceInlineEarly] float16_t asfloat16(int16_t value) { return asfloat16(asuint16(value)); }
+[__unsafeForceInlineEarly][__readNone] float16_t asfloat16(int16_t value) { return asfloat16(asuint16(value)); }
 
-__target_intrinsic(hlsl) [__unsafeForceInlineEarly] vector<float16_t,N> asfloat16<let N : int>(vector<int16_t,N> value) { return asfloat16(asuint16(value)); }
-__target_intrinsic(hlsl) [__unsafeForceInlineEarly] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return asfloat16(asuint16(value)); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] vector<float16_t,N> asfloat16<let N : int>(vector<int16_t,N> value) { return asfloat16(asuint16(value)); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return asfloat16(asuint16(value)); }
 
 // Inverse tangent (HLSL SM 1.0)
 __generic<T : __BuiltinFloatingPointType>
@@ -1229,12 +1283,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_atan($0)")
 __target_intrinsic(cpp, "$P_atan($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan _0")
+[__readNone]
 T atan(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan _0")
+[__readNone]
 vector<T, N> atan(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, atan, x);
@@ -1242,6 +1298,7 @@ vector<T, N> atan(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> atan(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, atan, x);
@@ -1253,12 +1310,14 @@ __target_intrinsic(glsl,"atan($0,$1)")
 __target_intrinsic(cuda, "$P_atan2($0, $1)")
 __target_intrinsic(cpp, "$P_atan2($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan2 _0 _1")
+[__readNone]
 T atan2(T y, T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"atan($0,$1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan2 _0 _1")
+[__readNone]
 vector<T, N> atan2(vector<T, N> y, vector<T, N> x)
 {
     VECTOR_MAP_BINARY(T, N, atan2, y, x);
@@ -1266,6 +1325,7 @@ vector<T, N> atan2(vector<T, N> y, vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> atan2(matrix<T,N,M> y, matrix<T,N,M> x)
 {
     MATRIX_MAP_BINARY(T, N, M, atan2, y, x);
@@ -1278,12 +1338,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_ceil($0)")
 __target_intrinsic(cpp, "$P_ceil($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ceil _0")
+[__readNone]
 T ceil(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ceil _0")
+[__readNone]
 vector<T, N> ceil(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ceil, x);
@@ -1291,6 +1353,7 @@ vector<T, N> ceil(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ceil(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ceil, x);
@@ -1305,6 +1368,7 @@ __generic<T : __BuiltinIntegerType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
+[__readNone]
 T clamp(T x, T minBound, T maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1314,6 +1378,7 @@ __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
+[__readNone]
 vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1321,6 +1386,7 @@ vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
 
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> clamp(matrix<T,N,M> x, matrix<T,N,M> minBound, matrix<T,N,M> maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1330,6 +1396,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
+[__readNone]
 T clamp(T x, T minBound, T maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1339,6 +1406,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
+[__readNone]
 vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1346,6 +1414,7 @@ vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> clamp(matrix<T,N,M> x, matrix<T,N,M> minBound, matrix<T,N,M> maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1354,6 +1423,7 @@ matrix<T,N,M> clamp(matrix<T,N,M> x, matrix<T,N,M> minBound, matrix<T,N,M> maxBo
 // Clip (discard) fragment conditionally
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
+// Deliberately NOT [__readNone]: `discard` is an observable side effect, so a call to clip must never be treated as pure and eliminated.
 void clip(T x)
 {
     if(x < T(0)) discard;
@@ -1361,6 +1431,7 @@ void clip(T x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+// Deliberately NOT [__readNone]: `discard` is an observable side effect, so a call to clip must never be treated as pure and eliminated.
 void clip(vector<T,N> x)
 {
     if(any(x < T(0))) discard;
@@ -1368,6 +1439,7 @@ void clip(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+// Deliberately NOT [__readNone]: `discard` is an observable side effect, so a call to clip must never be treated as pure and eliminated.
 void clip(matrix<T,N,M> x)
 {
     if(any(x < T(0))) discard;
@@ -1380,12 +1452,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_cos($0)")
 __target_intrinsic(cpp, "$P_cos($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cos _0")
+[__readNone]
 T cos(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cos _0")
+[__readNone]
 vector<T, N> cos(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T,N, cos, x);
@@ -1393,6 +1467,7 @@ vector<T, N> cos(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> cos(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, cos, x);
@@ -1405,12 +1480,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_cosh($0)")
 __target_intrinsic(cpp, "$P_cosh($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cosh _0")
+[__readNone]
 T cosh(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cosh _0")
+[__readNone]
 vector<T,N> cosh(vector<T,N> x)
 {
     VECTOR_MAP_UNARY(T,N, cosh, x);
@@ -1418,6 +1495,7 @@ vector<T,N> cosh(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> cosh(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, cosh, x);
@@ -1428,6 +1506,7 @@ __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "bitCount")
 __target_intrinsic(cuda, "$P_countbits($0)")
 __target_intrinsic(cpp, "$P_countbits($0)")
+[__readNone]
 uint countbits(uint value);
 
 // Cross product
@@ -1436,6 +1515,7 @@ __generic<T : __BuiltinArithmeticType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cross _0 _1")
+[__readNone]
 vector<T,3> cross(vector<T,3> left, vector<T,3> right)
 {
     return vector<T,3>(
@@ -1446,6 +1526,7 @@ vector<T,3> cross(vector<T,3> left, vector<T,3> right)
 
 // Convert encoded color
 __target_intrinsic(hlsl)
+[__readNone]
 int4 D3DCOLORtoUBYTE4(float4 color)
 {
     let scaled = color.zyxw * 255.001999f;
@@ -1455,11 +1536,13 @@ int4 D3DCOLORtoUBYTE4(float4 color)
 // Partial-difference derivatives
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(glsl, dFdx)
+[__readNone]
 T ddx(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, dFdx)
+[__readNone]
 vector<T, N> ddx(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddx, x);
@@ -1467,6 +1550,7 @@ vector<T, N> ddx(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddx(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddx, x);
@@ -1476,12 +1560,14 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdxCoarse)
+[__readNone]
 T ddx_coarse(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdxCoarse)
+[__readNone]
 vector<T, N> ddx_coarse(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddx_coarse, x);
@@ -1489,6 +1575,7 @@ vector<T, N> ddx_coarse(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddx_coarse(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddx_coarse, x);
@@ -1498,12 +1585,14 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdxFine)
+[__readNone]
 T ddx_fine(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdxFine)
+[__readNone]
 vector<T, N> ddx_fine(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddx_fine, x);
@@ -1511,6 +1600,7 @@ vector<T, N> ddx_fine(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddx_fine(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddx_fine, x);
@@ -1519,11 +1609,13 @@ matrix<T, N, M> ddx_fine(matrix<T, N, M> x)
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, dFdy)
+[__readNone]
 T ddy(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, dFdy)
+[__readNone]
 vector<T, N> ddy(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddy, x);
@@ -1531,6 +1623,7 @@ vector<T, N> ddy(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddy(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddy, x);
@@ -1539,12 +1632,14 @@ matrix<T, N, M> ddy(matrix<T, N, M> x)
 __generic<T : __BuiltinFloatingPointType>
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdyCoarse)
+[__readNone]
 T ddy_coarse(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdyCoarse)
+[__readNone]
 vector<T, N> ddy_coarse(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddy_coarse, x);
@@ -1552,6 +1647,7 @@ vector<T, N> ddy_coarse(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddy_coarse(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddy_coarse, x);
@@ -1561,12 +1657,14 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdyFine)
+[__readNone]
 T ddy_fine(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdyFine)
+[__readNone]
 vector<T, N> ddy_fine(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddy_fine, x);
@@ -1574,6 +1672,7 @@ vector<T, N> ddy_fine(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddy_fine(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddy_fine, x);
@@ -1586,6 +1685,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Degrees _0")
+[__readNone]
 T degrees(T x)
 {
     return x * (T(180) / T.getPi());
@@ -1595,6 +1695,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Degrees _0")
+[__readNone]
 vector<T, N> degrees(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, degrees, x);
@@ -1602,6 +1703,7 @@ vector<T, N> degrees(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> degrees(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, degrees, x);
@@ -1613,6 +1715,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Determinant _0")
+[__readNone]
 T determinant(matrix<T,N,N> m);
 
 // Barrier for device memory
@@ -1630,6 +1733,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Distance _0 _1")
+[__readNone]
 T distance(vector<T, N> x, vector<T, N> y)
 {
     return length(x - y);
@@ -1640,6 +1744,7 @@ T distance(vector<T, N> x, vector<T, N> y)
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 T dot(vector<T, N> x, vector<T, N> y)
 {
     T result = T(0);
@@ -1650,6 +1755,7 @@ T dot(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 T dot(vector<T, N> x, vector<T, N> y)
 {
     T result = T(0);
@@ -1682,15 +1788,18 @@ RWStructuredBuffer<T> __getEquivalentStructuredBuffer<T>(RWByteAddressBuffer b);
 __generic<T : __BuiltinArithmeticType>
 __target_intrinsic(glsl, interpolateAtCentroid)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtCentroid _0")
+[__readNone]
 T EvaluateAttributeAtCentroid(T x);
 
 __generic<T : __BuiltinArithmeticType, let N : int>
 __target_intrinsic(glsl, interpolateAtCentroid)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtCentroid _0")
+[__readNone] // Vector overload, declaration only: GLSL emits interpolateAtCentroid, spirv_direct emits the glsl450 InterpolateAtCentroid ext inst on x.
 vector<T,N> EvaluateAttributeAtCentroid(vector<T,N> x);
 
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
 __target_intrinsic(glsl, interpolateAtCentroid)
+[__readNone]
 matrix<T,N,M> EvaluateAttributeAtCentroid(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, EvaluateAttributeAtCentroid, x);
@@ -1699,15 +1808,18 @@ matrix<T,N,M> EvaluateAttributeAtCentroid(matrix<T,N,M> x)
 __generic<T : __BuiltinArithmeticType>
 __target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtSample _0 _1")
+[__readNone]
 T EvaluateAttributeAtSample(T x, uint sampleindex);
 
 __generic<T : __BuiltinArithmeticType, let N : int>
 __target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtSample _0 _1")
+[__readNone] // Vector overload, declaration only: GLSL emits interpolateAtSample with sampleindex cast to int; spirv_direct emits glsl450 InterpolateAtSample.
 vector<T,N> EvaluateAttributeAtSample(vector<T,N> x, uint sampleindex);
 
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
 __target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
+[__readNone]
 matrix<T,N,M> EvaluateAttributeAtSample(matrix<T,N,M> x, uint sampleindex)
 {
     matrix<T,N,M> result;
@@ -1721,15 +1833,18 @@ matrix<T,N,M> EvaluateAttributeAtSample(matrix<T,N,M> x, uint sampleindex)
 __generic<T : __BuiltinArithmeticType>
 __target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
 __target_intrinsic(spirv_direct, "%foffset = OpConvertSToF _type(float2) resultId _1; %offsetdiv16 = 136 _type(float2) resultId %foffset const(float2, 16.0, 16.0); OpExtInst resultType resultId glsl450 78 _0 %offsetdiv16")
+[__readNone]
 T EvaluateAttributeSnapped(T x, int2 offset);
 
 __generic<T : __BuiltinArithmeticType, let N : int>
 __target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
 __target_intrinsic(spirv_direct, "%foffset = OpConvertSToF _type(float2) resultId _1; %offsetdiv16 = 136 _type(float2) resultId %foffset const(float2, 16.0, 16.0); OpExtInst resultType resultId glsl450 78 _0 %offsetdiv16")
+[__readNone] // Vector overload, declaration only: GLSL emits interpolateAtOffset with the int2 offset converted to vec2 and divided by 16.0f. NOTE(review): spirv_direct string uses raw opcode numbers (136, 78) - confirm they match.
 vector<T,N> EvaluateAttributeSnapped(vector<T,N> x, int2 offset);
 
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
 __target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
+[__readNone]
 matrix<T,N,M> EvaluateAttributeSnapped(matrix<T,N,M> x, int2 offset)
 {
     matrix<T,N,M> result;
@@ -1748,12 +1863,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_exp($0)")
 __target_intrinsic(cpp, "$P_exp($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp _0")
+[__readNone]
 T exp(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp _0")
+[__readNone]
 vector<T, N> exp(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, exp, x);
@@ -1761,6 +1878,7 @@ vector<T, N> exp(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> exp(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, exp, x);
@@ -1774,12 +1892,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_exp2($0)")
 __target_intrinsic(cpp, "$P_exp2($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp2 _0")
+[__readNone]
 T exp2(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp2 _0")
+[__readNone]
 vector<T,N> exp2(vector<T,N> x)
 {
     VECTOR_MAP_UNARY(T, N, exp2, x);
@@ -1787,6 +1907,7 @@ vector<T,N> exp2(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> exp2(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, exp2, x);
@@ -1799,10 +1920,12 @@ __glsl_version(420)
 __target_intrinsic(hlsl)
 __cuda_sm_version(6.0)
 __target_intrinsic(cuda, "__half2float(__ushort_as_half($0))")
+[__readNone]
 float f16tof32(uint value);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<float, N> f16tof32(vector<uint, N> value)
 {
     VECTOR_MAP_UNARY(float, N, f16tof32, value);
@@ -1816,10 +1939,12 @@ __glsl_version(420)
 __target_intrinsic(hlsl)
 __cuda_sm_version(6.0)
 __target_intrinsic(cuda, "__half_as_ushort(__float2half($0))")
+[__readNone]
 uint f32tof16(float value);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<uint, N> f32tof16(vector<float, N> value)
 {
     VECTOR_MAP_UNARY(uint, N, f32tof16, value);
@@ -1833,11 +1958,13 @@ vector<uint, N> f32tof16(vector<float, N> value)
 __target_intrinsic(glsl, "unpackHalf2x16($0).x")
 __target_intrinsic(cuda, "__half2float")
 __glsl_version(420)
+[__readNone]
 float f16tof32(float16_t value);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "__half2float")
+[__readNone]
 vector<float, N> f16tof32(vector<float16_t, N> value)
 {
     VECTOR_MAP_UNARY(float, N, f16tof32, value);
@@ -1847,10 +1974,12 @@ vector<float, N> f16tof32(vector<float16_t, N> value)
 __target_intrinsic(glsl, "packHalf2x16(vec2($0,0.0))")
 __glsl_version(420)
 __target_intrinsic(cuda, "__float2half")
+[__readNone]
 float16_t f32tof16_(float value);
 
 __generic<let N : int>
 __target_intrinsic(cuda, "__float2half")
+[__readNone]
 vector<float16_t, N> f32tof16_(vector<float, N> value)
 {
     VECTOR_MAP_UNARY(uint, N, f32tof16, value);
@@ -1862,6 +1991,7 @@ vector<float16_t, N> f32tof16_(vector<float, N> value)
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 vector<T,N> faceforward(vector<T,N> n, vector<T,N> i, vector<T,N> ng)
 {
     return dot(ng, i) < T(0.0f) ? n : -n;
@@ -1873,12 +2003,14 @@ __target_intrinsic(glsl,"findMSB")
 __target_intrinsic(cuda, "$P_firstbithigh($0)")
 __target_intrinsic(cpp, "$P_firstbithigh($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindSMsb _0")
+[__readNone]
 int firstbithigh(int value);
 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findMSB")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindSMsb _0")
 __generic<let N : int>
+[__readNone]
 vector<int, N> firstbithigh(vector<int, N> value)
 {
     VECTOR_MAP_UNARY(int, N, firstbithigh, value);
@@ -1889,12 +2021,14 @@ __target_intrinsic(glsl,"findMSB")
 __target_intrinsic(cuda, "$P_firstbithigh($0)")
 __target_intrinsic(cpp, "$P_firstbithigh($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindUMsb _0")
+[__readNone]
 uint firstbithigh(uint value);
 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findMSB")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindUMsb _0")
 __generic<let N : int>
+[__readNone]
 vector<uint,N> firstbithigh(vector<uint,N> value)
 {
     VECTOR_MAP_UNARY(uint, N, firstbithigh, value);
@@ -1906,12 +2040,14 @@ __target_intrinsic(glsl,"findLSB")
 __target_intrinsic(cuda, "$P_firstbitlow($0)")
 __target_intrinsic(cpp, "$P_firstbitlow($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
+[__readNone]
 int firstbitlow(int value);
 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findLSB")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
 __generic<let N : int>
+[__readNone]
 vector<int,N> firstbitlow(vector<int,N> value)
 {
     VECTOR_MAP_UNARY(int, N, firstbitlow, value);
@@ -1922,12 +2058,14 @@ __target_intrinsic(glsl,"findLSB")
 __target_intrinsic(cuda, "$P_firstbitlow($0)")
 __target_intrinsic(cpp, "$P_firstbitlow($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
+[__readNone]
 uint firstbitlow(uint value);
 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findLSB")
 __generic<let N : int>
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
+[__readNone]
 vector<uint,N> firstbitlow(vector<uint,N> value)
 {
     VECTOR_MAP_UNARY(uint, N, firstbitlow, value);
@@ -1941,12 +2079,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_floor($0)")
 __target_intrinsic(cpp, "$P_floor($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Floor _0")
+[__readNone]
 T floor(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Floor _0")
+[__readNone]
 vector<T, N> floor(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, floor, x);
@@ -1954,6 +2094,7 @@ vector<T, N> floor(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> floor(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, floor, x);
@@ -1965,12 +2106,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_fma($0, $1, $2)")
 __target_intrinsic(cpp, "$P_fma($0, $1, $2)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
+[__readNone]
 double fma(double a, double b, double c);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
+[__readNone]
 vector<double, N> fma(vector<double, N> a, vector<double, N> b, vector<double, N> c)
 {
     VECTOR_MAP_TRINARY(double, N, fma, a, b, c);
@@ -1978,6 +2121,7 @@ vector<double, N> fma(vector<double, N> a, vector<double, N> b, vector<double, N
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<double, N, M> fma(matrix<double, N, M> a, matrix<double, N, M> b, matrix<double, N, M> c)
 {
     MATRIX_MAP_TRINARY(double, N, M, fma, a, b, c);
@@ -1988,6 +2132,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "$P_fmod($0, $1)")
 __target_intrinsic(cpp, "$P_fmod($0, $1)")
+[__readNone]
 T fmod(T x, T y)
 {
     return x - y * trunc(x/y);
@@ -1995,6 +2140,7 @@ T fmod(T x, T y)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<T, N> fmod(vector<T, N> x, vector<T, N> y)
 {
     VECTOR_MAP_BINARY(T, N, fmod, x, y);
@@ -2002,6 +2148,7 @@ vector<T, N> fmod(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> fmod(matrix<T, N, M> x, matrix<T, N, M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, fmod, x, y);
@@ -2014,18 +2161,21 @@ __target_intrinsic(glsl, fract)
 __target_intrinsic(cuda, "$P_frac($0)")
 __target_intrinsic(cpp, "$P_frac($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fract _0")
+[__readNone]
 T frac(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, fract)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fract _0")
+[__readNone] // Vector frac: GLSL name is fract, spirv_direct uses glsl450 Fract; the body expands VECTOR_MAP_UNARY with the scalar frac (presumably a per-element fallback - confirm against the macro definition).
 vector<T, N> frac(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, frac, x);
 }
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
+[__readNone]
 matrix<T, N, M> frac(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, frac, x);
@@ -2036,12 +2186,14 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Frexp _0 _1")
+[__readNone]
 T frexp(T x, out T exp);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Frexp _0 _1")
+[__readNone]
 vector<T, N> frexp(vector<T, N> x, out vector<T, N> exp)
 {
     VECTOR_MAP_BINARY(T, N, frexp, x, exp);
@@ -2049,6 +2201,7 @@ vector<T, N> frexp(vector<T, N> x, out vector<T, N> exp)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> frexp(matrix<T, N, M> x, out matrix<T, N, M> exp)
 {
     MATRIX_MAP_BINARY(T, N, M, frexp, x, exp);
@@ -2056,11 +2209,13 @@ matrix<T, N, M> frexp(matrix<T, N, M> x, out matrix<T, N, M> exp)
 
 // Texture filter width
 __generic<T : __BuiltinFloatingPointType>
+[__readNone] // Scalar overload: declaration only, with no __target_intrinsic attached here (the vector overload that follows names hlsl and glsl).
 T fwidth(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 vector<T, N> fwidth(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, fwidth, x);
@@ -2068,6 +2223,7 @@ vector<T, N> fwidth(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> fwidth(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, fwidth, x);
@@ -2141,9 +2297,11 @@ matrix<T,N,M> GetAttributeAtVertex(matrix<T,N,M> attribute, uint vertexIndex);
 
 
 // Get number of samples in render target
+[__readNone] // Declaration only; no __target_intrinsic is attached here. NOTE(review): confirm how each backend resolves this call.
 uint GetRenderTargetSampleCount();
 
 // Get position of given sample
+[__readNone] // Declaration only; takes a sample Index and returns a float2. No __target_intrinsic is attached here - NOTE(review): confirm backend mapping.
 float2 GetRenderTargetSamplePosition(int Index);
 
 // Group memory barrier
@@ -2284,6 +2442,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "$P_isfinite($0)")
 __target_intrinsic(cpp, "$P_isfinite($0)")
+[__readNone]
 bool isfinite(T x)
 {
     return !(isinf(x) || isnan(x));
@@ -2291,6 +2450,7 @@ bool isfinite(T x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<bool, N> isfinite(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(bool, N, isfinite, x);
@@ -2298,6 +2458,7 @@ vector<bool, N> isfinite(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<bool, N, M> isfinite(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(bool, N, M, isfinite, x);
@@ -2309,11 +2470,13 @@ __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_isinf($0)")
 __target_intrinsic(cpp, "$P_isinf($0)")
+[__readNone]
 bool isinf(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 vector<bool, N> isinf(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(bool, N, isinf, x);
@@ -2321,6 +2484,7 @@ vector<bool, N> isinf(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<bool, N, M> isinf(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(bool, N, M, isinf, x);
@@ -2332,11 +2496,13 @@ __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_isnan($0)")
 __target_intrinsic(cpp, "$P_isnan($0)")
+[__readNone]
 bool isnan(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 vector<bool, N> isnan(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(bool, N, isnan, x);
@@ -2344,6 +2510,7 @@ vector<bool, N> isnan(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<bool, N, M> isnan(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(bool, N, M, isnan, x);
@@ -2354,6 +2521,7 @@ matrix<bool, N, M> isnan(matrix<T, N, M> x)
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ldexp _0 _1")
+[__readNone]
 T ldexp(T x, T exp)
 {
     return x * exp2(exp);
@@ -2362,6 +2530,7 @@ T ldexp(T x, T exp)
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ldexp _0 _1")
+[__readNone]
 vector<T, N> ldexp(vector<T, N> x, vector<T, N> exp)
 {
     return x * exp2(exp);
@@ -2369,6 +2538,7 @@ vector<T, N> ldexp(vector<T, N> x, vector<T, N> exp)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ldexp(matrix<T, N, M> x, matrix<T, N, M> exp)
 {
     MATRIX_MAP_BINARY(T, N, M, ldexp, x, exp);
@@ -2379,6 +2549,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Length _0")
+[__readNone]
 T length(vector<T, N> x)
 {
     return sqrt(dot(x, x));
@@ -2389,6 +2560,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, mix)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FMix _0 _1 _2")
+[__readNone]
 T lerp(T x, T y, T s)
 {
     return x * (T(1.0f) - s) + y * s;
@@ -2398,6 +2570,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, mix)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FMix _0 _1 _2")
+[__readNone]
 vector<T, N> lerp(vector<T, N> x, vector<T, N> y, vector<T, N> s)
 {
     return x * (T(1.0f) - s) + y * s;
@@ -2405,6 +2578,7 @@ vector<T, N> lerp(vector<T, N> x, vector<T, N> y, vector<T, N> s)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> lerp(matrix<T,N,M> x, matrix<T,N,M> y, matrix<T,N,M> s)
 {
     MATRIX_MAP_TRINARY(T, N, M, lerp, x, y, s);
@@ -2412,6 +2586,7 @@ matrix<T,N,M> lerp(matrix<T,N,M> x, matrix<T,N,M> y, matrix<T,N,M> s)
 
 // Legacy lighting function (obsolete)
 __target_intrinsic(hlsl)
+[__readNone]
 float4 lit(float n_dot_l, float n_dot_h, float m)
 {
     let ambient = 1.0f;
@@ -2427,12 +2602,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_log($0)")
 __target_intrinsic(cpp, "$P_log($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log _0")
+[__readNone]
 T log(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log _0")
+[__readNone]
 vector<T, N> log(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, log, x);
@@ -2440,6 +2617,7 @@ vector<T, N> log(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> log(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, log, x);
@@ -2452,12 +2630,14 @@ __target_intrinsic(glsl, "(log( $0 ) * $S0( 0.43429448190325182765112891891661)
 __target_intrinsic(cuda, "$P_log10($0)")
 __target_intrinsic(cpp, "$P_log10($0)")
 __target_intrinsic(spirv_direct, "%baseElog = OpExtInst resultType resultId glsl450 Log _0; OpFMul resultType resultId _0 %baseElog const(_p,0.43429448190325182765112891891661)")
+[__readNone]
 T log10(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "(log( $0 ) * $S0(0.43429448190325182765112891891661) )" )
 __target_intrinsic(spirv_direct, "%baseElog = OpExtInst resultType resultId glsl450 Log _0; OpVectorTimesScalar resultType resultId _0 %baseElog const(_p,0.43429448190325182765112891891661)")
+[__readNone]
 vector<T,N> log10(vector<T,N> x)
 {
     VECTOR_MAP_UNARY(T, N, log10, x);
@@ -2465,6 +2645,7 @@ vector<T,N> log10(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> log10(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, log10, x);
@@ -2477,12 +2658,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_log2($0)")
 __target_intrinsic(cpp, "$P_log2($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log2 _0")
+[__readNone]
 T log2(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log2 _0")
+[__readNone]
 vector<T,N> log2(vector<T,N> x)
 {
     VECTOR_MAP_UNARY(T, N, log2, x);
@@ -2490,6 +2673,7 @@ vector<T,N> log2(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> log2(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, log2, x);
@@ -2503,12 +2687,14 @@ __target_intrinsic(glsl, fma)
 __target_intrinsic(cuda, "$P_fma($0, $1, $2)")
 __target_intrinsic(cpp, "$P_fma($0, $1, $2)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
+[__readNone]
 T mad(T mvalue, T avalue, T bvalue);
 
 __generic<T : __BuiltinArithmeticType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, fma)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
+[__readNone]
 vector<T, N> mad(vector<T, N> mvalue, vector<T, N> avalue, vector<T, N> bvalue)
 {
     VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue);
@@ -2516,6 +2702,7 @@ vector<T, N> mad(vector<T, N> mvalue, vector<T, N> avalue, vector<T, N> bvalue)
 
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> mad(matrix<T, N, M> mvalue, matrix<T, N, M> avalue, matrix<T, N, M> bvalue)
 {
     MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue);
@@ -2528,6 +2715,7 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_max($0, $1)")
 __target_intrinsic(cpp, "$P_max($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0")
+[__readNone]
 T max(T x, T y);
 // Note: a stdlib implementation of `max` (or `min`) will require splitting
 // floating-point and integer cases apart, because the floating-point
@@ -2538,6 +2726,7 @@ __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0")
+[__readNone]
 vector<T, N> max(vector<T, N> x, vector<T, N> y)
 {
     VECTOR_MAP_BINARY(T, N, max, x, y);
@@ -2545,6 +2734,7 @@ vector<T, N> max(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> max(matrix<T, N, M> x, matrix<T, N, M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, max, x, y);
@@ -2556,12 +2746,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_max($0, $1)")
 __target_intrinsic(cpp, "$P_max($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0")
+[__readNone]
 T max(T x, T y);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0")
+[__readNone]
 vector<T, N> max(vector<T, N> x, vector<T, N> y)
 {
     VECTOR_MAP_BINARY(T, N, max, x, y);
@@ -2569,6 +2761,7 @@ vector<T, N> max(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> max(matrix<T, N, M> x, matrix<T, N, M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, max, x, y);
@@ -2581,12 +2774,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_min($0, $1)")
 __target_intrinsic(cpp, "$P_min($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0")
+[__readNone]
 T min(T x, T y);
 
 __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0")
+[__readNone]
 vector<T,N> min(vector<T,N> x, vector<T,N> y)
 {
     VECTOR_MAP_BINARY(T, N, min, x, y);
@@ -2594,6 +2789,7 @@ vector<T,N> min(vector<T,N> x, vector<T,N> y)
 
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> min(matrix<T,N,M> x, matrix<T,N,M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, min, x, y);
@@ -2605,12 +2801,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_min($0, $1)")
 __target_intrinsic(cpp, "$P_min($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0")
+[__readNone]
 T min(T x, T y);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0")
+[__readNone]
 vector<T,N> min(vector<T,N> x, vector<T,N> y)
 {
     VECTOR_MAP_BINARY(T, N, min, x, y);
@@ -2618,6 +2816,7 @@ vector<T,N> min(vector<T,N> x, vector<T,N> y)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> min(matrix<T,N,M> x, matrix<T,N,M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, min, x, y);
@@ -2625,11 +2824,13 @@ matrix<T,N,M> min(matrix<T,N,M> x, matrix<T,N,M> y)
 
 // split into integer and fractional parts (both with same sign)
 __generic<T : __BuiltinFloatingPointType>
+[__readNone] // Scalar overload: declaration only, no __target_intrinsic here; one part is returned and the other written to `ip`. NOTE(review): confirm [__readNone] is intended on a function with an `out` parameter.
 T modf(T x, out T ip);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 vector<T,N> modf(vector<T,N> x, out vector<T,N> ip)
 {
     VECTOR_MAP_BINARY(T, N, modf, x, ip);
@@ -2637,6 +2838,7 @@ vector<T,N> modf(vector<T,N> x, out vector<T,N> ip)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> modf(matrix<T,N,M> x, out matrix<T,N,M> ip)
 {
     MATRIX_MAP_BINARY(T, N, M, modf, x, ip);
@@ -2644,6 +2846,7 @@ matrix<T,N,M> modf(matrix<T,N,M> x, out matrix<T,N,M> ip)
 
 // msad4 (whatever that is)
 __target_intrinsic(hlsl)
+[__readNone]
 uint4 msad4(uint reference, uint2 source, uint4 accum)
 {
     int4 bytesRef = (reference >> uint4(24, 16, 8, 0)) & 0xFF;
@@ -2665,36 +2868,43 @@ uint4 msad4(uint reference, uint2 source, uint4 accum)
 // scalar-scalar
 __generic<T : __BuiltinArithmeticType>
 __intrinsic_op($(kIROp_Mul))
+[__readNone] // Scalar * scalar overload: no Slang body; declared with __intrinsic_op on kIROp_Mul.
 T mul(T x, T y);
 
 // scalar-vector and vector-scalar
 __generic<T : __BuiltinArithmeticType, let N : int>
 __intrinsic_op($(kIROp_Mul))
+[__readNone] // Vector * scalar overload: no Slang body; declared with __intrinsic_op on kIROp_Mul.
 vector<T, N> mul(vector<T, N> x, T y);
 
 __generic<T : __BuiltinArithmeticType, let N : int>
 __intrinsic_op($(kIROp_Mul))
+[__readNone] // Scalar * vector overload: no Slang body; declared with __intrinsic_op on kIROp_Mul.
 vector<T, N> mul(T x, vector<T, N> y);
 
 // scalar-matrix and matrix-scalar
 __generic<T : __BuiltinArithmeticType, let N : int, let M :int>
 __intrinsic_op($(kIROp_Mul))
+[__readNone] // Matrix * scalar overload: no Slang body; declared with __intrinsic_op on kIROp_Mul.
 matrix<T, N, M> mul(matrix<T, N, M> x, T y);
 
 __generic<T : __BuiltinArithmeticType, let N : int, let M :int>
 __intrinsic_op($(kIROp_Mul))
+[__readNone] // Scalar * matrix overload: no Slang body; declared with __intrinsic_op on kIROp_Mul.
 matrix<T, N, M> mul(T x, matrix<T, N, M> y);
 
 // vector-vector (dot product)
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "dot")
+[__readNone] // Floating-point vector * vector overload: the body forwards to dot(x, y); the GLSL intrinsic name is "dot".
 T mul(vector<T, N> x, vector<T, N> y)
 {
     return dot(x, y);
 }
 __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 T mul(vector<T, N> x, vector<T, N> y)
 {
     return dot(x, y);
@@ -2704,6 +2914,7 @@ T mul(vector<T, N> x, vector<T, N> y)
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 {
     vector<T,M> result;
@@ -2721,6 +2932,7 @@ vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 {
     vector<T,M> result;
@@ -2738,6 +2950,7 @@ vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 __generic<T : __BuiltinLogicalType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 {
     vector<T,M> result;
@@ -2757,6 +2970,7 @@ vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 {
     vector<T,N> result;
@@ -2774,6 +2988,7 @@ vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 {
     vector<T,N> result;
@@ -2791,6 +3006,7 @@ vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 __generic<T : __BuiltinLogicalType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 {
     vector<T,N> result;
@@ -2810,6 +3026,7 @@ vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 __generic<T : __BuiltinFloatingPointType, let R : int, let N : int, let C : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 {
     matrix<T,R,C> result;
@@ -2828,6 +3045,7 @@ matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 __generic<T : __BuiltinIntegerType, let R : int, let N : int, let C : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 {
     matrix<T,R,C> result;
@@ -2846,6 +3064,7 @@ matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 __generic<T : __BuiltinLogicalType, let R : int, let N : int, let C : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 {
     matrix<T,R,C> result;
@@ -2864,11 +3083,13 @@ matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 
 // noise (deprecated)
 
+[__readNone]
 float noise(float x)
 {
     return 0;
 }
 
+[__readNone]
 __generic<let N : int> float noise(vector<float, N> x)
 {
     return 0;
@@ -2915,6 +3136,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Normalize _0")
+[__readNone]
 vector<T,N> normalize(vector<T,N> x)
 {
     return x / length(x);
@@ -2927,12 +3149,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_pow($0, $1)")
 __target_intrinsic(cpp, "$P_pow($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Pow _0 _1")
+[__readNone]
 T pow(T x, T y);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Pow _0 _1")
+[__readNone]
 vector<T, N> pow(vector<T, N> x, vector<T, N> y)
 {
     VECTOR_MAP_BINARY(T, N, pow, x, y);
@@ -2940,6 +3164,7 @@ vector<T, N> pow(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> pow(matrix<T,N,M> x, matrix<T,N,M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, pow, x, y);
@@ -3087,6 +3312,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Radians _0")
+[__readNone]
 T radians(T x)
 {
     return x * (T.getPi() / T(180.0f));
@@ -3096,6 +3322,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Radians _0")
+[__readNone]
 vector<T, N> radians(vector<T, N> x)
 {
     return x * (T.getPi() / T(180.0f));
@@ -3103,6 +3330,7 @@ vector<T, N> radians(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> radians(matrix<T, N, M> x)
 {
     return x * (T.getPi() / T(180.0f));
@@ -3111,6 +3339,7 @@ matrix<T, N, M> radians(matrix<T, N, M> x)
 // Approximate reciprocal
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
+[__readNone]
 T rcp(T x)
 {
     return T(1.0) / x;
@@ -3118,6 +3347,7 @@ T rcp(T x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<T, N> rcp(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, rcp, x);
@@ -3125,6 +3355,7 @@ vector<T, N> rcp(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> rcp(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, rcp, x);
@@ -3135,6 +3366,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Reflect _0 _1")
+[__readNone]
 vector<T,N> reflect(vector<T,N> i, vector<T,N> n)
 {
     return i - T(2) * dot(n,i) * n;
@@ -3145,6 +3377,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Refract _0 _1 _2")
+[__readNone]
 vector<T,N> refract(vector<T,N> i, vector<T,N> n, T eta)
 {
     let dotNI = dot(n,i);
@@ -3158,10 +3391,12 @@ __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "bitfieldReverse")
 __target_intrinsic(cuda, "$P_reversebits($0)")
 __target_intrinsic(cpp, "$P_reversebits($0)")
+[__readNone]
 uint reversebits(uint value);
 
 __target_intrinsic(glsl, "bitfieldReverse")
 __generic<let N : int>
+[__readNone]
 vector<uint, N> reversebits(vector<uint, N> value)
 {
     VECTOR_MAP_UNARY(uint, N, reversebits, value);
@@ -3174,12 +3409,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_round($0)")
 __target_intrinsic(cpp, "$P_round($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Round _0")
+[__readNone]
 T round(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Round _0")
+[__readNone]
 vector<T, N> round(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, round, x);
@@ -3187,6 +3424,7 @@ vector<T, N> round(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> round(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, round, x);
@@ -3199,6 +3437,7 @@ __target_intrinsic(glsl, "inversesqrt($0)")
 __target_intrinsic(cuda, "$P_rsqrt($0)")
 __target_intrinsic(cpp, "$P_rsqrt($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InverseSqrt _0")
+[__readNone]
 T rsqrt(T x)
 {
     return T(1.0) / sqrt(x);
@@ -3208,6 +3447,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "inversesqrt($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InverseSqrt _0")
+[__readNone]
 vector<T, N> rsqrt(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, rsqrt, x);
@@ -3215,6 +3455,7 @@ vector<T, N> rsqrt(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> rsqrt(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, rsqrt, x);
@@ -3224,6 +3465,7 @@ matrix<T, N, M> rsqrt(matrix<T, N, M> x)
 
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
+[__readNone]
 T saturate(T x)
 {
     return clamp<T>(x, T(0), T(1));
@@ -3231,6 +3473,7 @@ T saturate(T x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<T,N> saturate(vector<T,N> x)
 {
     return clamp<T,N>(x,
@@ -3240,6 +3483,7 @@ vector<T,N> saturate(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> saturate(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, saturate, x);
@@ -3252,12 +3496,14 @@ __target_intrinsic(glsl, "int(sign($0))")
 __target_intrinsic(cuda, "$P_sign($0)")
 __target_intrinsic(cpp, "$P_sign($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FSign, SSign) _0")
+[__readNone]
 int sign(T x);
 
 __generic<T : __BuiltinSignedArithmeticType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "ivec$N0(sign($0))")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FSign, SSign) _0")
+[__readNone]
 vector<int, N> sign(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(int, N, sign, x);
@@ -3265,6 +3511,7 @@ vector<int, N> sign(vector<T, N> x)
 
 __generic<T : __BuiltinSignedArithmeticType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<int, N, M> sign(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(int, N, M, sign, x);
@@ -3279,12 +3526,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_sin($0)")
 __target_intrinsic(cpp, "$P_sin($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sin _0")
+[__readNone]
 T sin(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sin _0")
+[__readNone]
 vector<T, N> sin(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, sin, x);
@@ -3292,6 +3541,7 @@ vector<T, N> sin(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> sin(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, sin, x);
@@ -3301,6 +3551,7 @@ matrix<T, N, M> sin(matrix<T, N, M> x)
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "$P_sincos($0, $1, $2)")
+[__readNone]
 void sincos(T x, out T s, out T c)
 {
     s = sin(x);
@@ -3309,6 +3560,7 @@ void sincos(T x, out T s, out T c)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 void sincos(vector<T,N> x, out vector<T,N> s, out vector<T,N> c)
 {
     s = sin(x);
@@ -3317,6 +3569,7 @@ void sincos(vector<T,N> x, out vector<T,N> s, out vector<T,N> c)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 void sincos(matrix<T,N,M> x, out matrix<T,N,M> s, out matrix<T,N,M> c)
 {
     s = sin(x);
@@ -3330,12 +3583,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_sinh($0)")
 __target_intrinsic(cpp, "$P_sinh($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sinh _0")
+[__readNone]
 T sinh(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sinh _0")
+[__readNone]
 vector<T, N> sinh(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, sinh, x);
@@ -3343,6 +3598,7 @@ vector<T, N> sinh(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> sinh(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, sinh, x);
@@ -3353,6 +3609,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 SmoothStep _0 _1 _2")
+[__readNone]
 T smoothstep(T min, T max, T x)
 {
     let t = saturate((x - min) / (max - min));
@@ -3363,6 +3620,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 SmoothStep _0 _1 _2")
+[__readNone]
 vector<T, N> smoothstep(vector<T, N> min, vector<T, N> max, vector<T, N> x)
 {
     VECTOR_MAP_TRINARY(T, N, smoothstep, min, max, x);
@@ -3370,6 +3628,7 @@ vector<T, N> smoothstep(vector<T, N> min, vector<T, N> max, vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> smoothstep(matrix<T, N, M> min, matrix<T, N, M> max, matrix<T, N, M> x)
 {
     MATRIX_MAP_TRINARY(T, N, M, smoothstep, min, max, x);
@@ -3382,12 +3641,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_sqrt($0)")
 __target_intrinsic(cpp, "$P_sqrt($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sqrt _0")
+[__readNone]
 T sqrt(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sqrt _0")
+[__readNone]
 vector<T, N> sqrt(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, sqrt, x);
@@ -3395,6 +3656,7 @@ vector<T, N> sqrt(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> sqrt(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, sqrt, x);
@@ -3405,6 +3667,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Step _0 _1")
+[__readNone]
 T step(T y, T x)
 {
     return x < y ? T(0.0f) : T(1.0f);
@@ -3414,6 +3677,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Step _0 _1")
+[__readNone]
 vector<T,N> step(vector<T,N> y, vector<T,N> x)
 {
     VECTOR_MAP_BINARY(T, N, step, y, x);
@@ -3421,6 +3685,7 @@ vector<T,N> step(vector<T,N> y, vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> step(matrix<T, N, M> y, matrix<T, N, M> x)
 {
     MATRIX_MAP_BINARY(T, N, M, step, y, x);
@@ -3433,12 +3698,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_tan($0)")
 __target_intrinsic(cpp, "$P_tan($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tan _0")
+[__readNone]
 T tan(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tan _0")
+[__readNone]
 vector<T, N> tan(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, tan, x);
@@ -3446,6 +3713,7 @@ vector<T, N> tan(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> tan(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, tan, x);
@@ -3458,12 +3726,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_tanh($0)")
 __target_intrinsic(cpp, "$P_tanh($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tanh _0")
+[__readNone]
 T tanh(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tanh _0")
+[__readNone]
 vector<T,N> tanh(vector<T,N> x)
 {
     VECTOR_MAP_UNARY(T, N, tanh, x);
@@ -3471,6 +3741,7 @@ vector<T,N> tanh(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> tanh(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, tanh, x);
@@ -3480,6 +3751,7 @@ matrix<T,N,M> tanh(matrix<T,N,M> x)
 __generic<T : __BuiltinType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 matrix<T, M, N> transpose(matrix<T, N, M> x)
 {
     matrix<T,M,N> result;
@@ -3496,12 +3768,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_trunc($0)")
 __target_intrinsic(cpp, "$P_trunc($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Trunc _0")
+[__readNone]
 T trunc(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Trunc _0")
+[__readNone]
 vector<T, N> trunc(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, trunc, x);
@@ -3509,6 +3783,7 @@ vector<T, N> trunc(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> trunc(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, trunc, x);
@@ -4779,6 +5054,7 @@ void __executeCallable(uint shaderIndex, int payloadLocation);
 __generic<Payload>
 __target_intrinsic(__glslRayTracing, "$XC")
 [__readNone]
+[__AlwaysFoldIntoUseSiteAttribute]
 int __callablePayloadLocation(__ref Payload payload);
 
 // Now we provide a hard-coded definition of `CallShader()` for GLSL-based
@@ -4834,6 +5110,7 @@ void __traceRay(
 __generic<Payload>
 __target_intrinsic(__glslRayTracing, "$XP")
 [__readNone]
+[__AlwaysFoldIntoUseSiteAttribute]
 int __rayPayloadLocation(__ref Payload payload);
 
 __generic<payload_t>
@@ -5677,6 +5954,7 @@ Ref<T> __hitObjectAttributes<T>()
 __generic<Attributes>
 __target_intrinsic(__glslRayTracing, "$XH")
 [__readNone]
+[__AlwaysFoldIntoUseSiteAttribute]
 int __hitObjectAttributesLocation(__ref Attributes attributes);
 
     /// Immutable data type representing a ray hit or a miss. Can be used to invoke hit or miss shading,
diff --git a/source/slang/slang-ast-modifier.h b/source/slang/slang-ast-modifier.h
index 99e221b1e..6ac464784 100644
--- a/source/slang/slang-ast-modifier.h
+++ b/source/slang/slang-ast-modifier.h
@@ -1083,6 +1083,14 @@ class RequiresNVAPIAttribute : public Attribute
     SLANG_AST_CLASS(RequiresNVAPIAttribute)
 };
 
+
+    /// The `[__AlwaysFoldIntoUseSite]` attribute marks a function whose calls should
+    /// always be folded into their use sites when source code is emitted.
+class AlwaysFoldIntoUseSiteAttribute : public Attribute
+{
+    SLANG_AST_CLASS(AlwaysFoldIntoUseSiteAttribute)
+};
+
     /// The `[ForwardDifferentiable]` attribute indicates that a function can be forward-differentiated.
 class ForwardDifferentiableAttribute : public DifferentiableAttribute
 {
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index c664449e5..7840dc450 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -1244,14 +1244,24 @@ bool CLikeSourceEmitter::shouldFoldInstIntoUseSites(IRInst* inst)
             return true;
     }
 
+    // Always fold if inst is a call into an [__AlwaysFoldIntoUseSite] function.
+    if (auto call = as<IRCall>(inst))
+    {
+        auto callee = call->getCallee();
+        if (getResolvedInstForDecorations(callee)->findDecoration<IRAlwaysFoldIntoUseSiteDecoration>())
+        {
+            return true;
+        }
+    }
+
     // Having dealt with all of the cases where we *must* fold things
     // above, we can now deal with the more general cases where we
     // *should not* fold things.
-
     // Don't fold something with no users:
     if(!inst->hasUses())
         return false;
 
+
     // Don't fold something that has multiple users:
     if(inst->hasMoreThanOneUse())
         return false;
diff --git a/source/slang/slang-emit-c-like.h b/source/slang/slang-emit-c-like.h
index ff229c38b..1cd2045c7 100644
--- a/source/slang/slang-emit-c-like.h
+++ b/source/slang/slang-emit-c-like.h
@@ -326,7 +326,7 @@ public:
 
     void emitSimpleValue(IRInst* inst) { emitSimpleValueImpl(inst); }
     
-    bool shouldFoldInstIntoUseSites(IRInst* inst);
+    virtual bool shouldFoldInstIntoUseSites(IRInst* inst);
 
     void emitOperand(IRInst* inst, EmitOpInfo const& outerPrec) { emitOperandImpl(inst, outerPrec); }
 
diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp
index ba6b26ec6..795ec74b0 100644
--- a/source/slang/slang-emit-cpp.cpp
+++ b/source/slang/slang-emit-cpp.cpp
@@ -1557,6 +1557,46 @@ void CPPSourceEmitter::emitGlobalInstImpl(IRInst* inst)
     }
 }
 
+bool CPPSourceEmitter::shouldFoldInstIntoUseSites(IRInst* inst)
+{
+    // Start from the generic C-like policy; this override can only veto a fold, never add one.
+    if (!Super::shouldFoldInstIntoUseSites(inst))
+        return false;
+
+    // The extra restrictions below apply to vector- and matrix-typed values only.
+    auto valueType = inst->getDataType();
+    if (!as<IRVectorType>(valueType) && !as<IRMatrixType>(valueType))
+        return true;
+
+    // Recognizes the reshape and cast opcodes.
+    auto isReshapeOrCast = [](IRInst* candidate) -> bool
+    {
+        switch (candidate->getOp())
+        {
+        case kIROp_MatrixReshape:
+        case kIROp_VectorReshape:
+        case kIROp_IntCast:
+        case kIROp_FloatCast:
+        case kIROp_CastIntToFloat:
+        case kIROp_CastFloatToInt:
+            return true;
+        default:
+            return false;
+        }
+    };
+
+    // A value consumed by a reshape/cast must not be folded, because the implementation
+    // of the cast will have multiple references to it.
+    for (auto use = inst->firstUse; use; use = use->nextUse)
+    {
+        if (isReshapeOrCast(use->getUser()))
+            return false;
+    }
+
+    // A vector/matrix value that is itself produced by a reshape/cast is not folded either.
+    return !isReshapeOrCast(inst);
+}
+
 static bool _isExported(IRInst* inst)
 {
     for (auto decoration : inst->getDecorations())
diff --git a/source/slang/slang-emit-cpp.h b/source/slang/slang-emit-cpp.h
index 92780e0a4..71c382f87 100644
--- a/source/slang/slang-emit-cpp.h
+++ b/source/slang/slang-emit-cpp.h
@@ -71,6 +71,7 @@ protected:
     virtual void emitFuncDecorationsImpl(IRFunc* func) SLANG_OVERRIDE;
     virtual void emitVarDecorationsImpl(IRInst* var) SLANG_OVERRIDE;
     virtual void emitGlobalInstImpl(IRInst* inst) SLANG_OVERRIDE;
+    virtual bool shouldFoldInstIntoUseSites(IRInst* inst) SLANG_OVERRIDE;
 
     const UnownedStringSlice* getVectorElementNames(BaseType elemType, Index elemCount);
     
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index e2f00bf88..a25fae5ae 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -891,8 +891,8 @@ Result linkAndOptimizeIR(
         }
     }
 
-    // Run a final round of DCE to clean up unused things after phi-elimination.
-    eliminateDeadCode(irModule);
+    // Run a final round of simplifications to clean up unused things after phi-elimination.
+    simplifyNonSSAIR(irModule);
 
     // We include one final step to (optionally) dump the IR and validate
     // it after all of the optimization passes are complete. This should
diff --git a/source/slang/slang-ir-autodiff-unzip.cpp b/source/slang/slang-ir-autodiff-unzip.cpp
index 096751836..a05fe7044 100644
--- a/source/slang/slang-ir-autodiff-unzip.cpp
+++ b/source/slang/slang-ir-autodiff-unzip.cpp
@@ -559,6 +559,7 @@ IRFunc* DiffUnzipPass::extractPrimalFunc(
             {
                 if (inst->getOp() == kIROp_Call)
                 {
+                    // The primal calls should be marked as no side effect so they can be DCE'd if possible.
                     builder.addSimpleDecoration<IRNoSideEffectDecoration>(inst);
                 }
             }
diff --git a/source/slang/slang-ir-autodiff.h b/source/slang/slang-ir-autodiff.h
index fa01d50ae..a4eb94461 100644
--- a/source/slang/slang-ir-autodiff.h
+++ b/source/slang/slang-ir-autodiff.h
@@ -212,20 +212,12 @@ struct DifferentiableTypeConformanceContext
     IRInst* getZeroMethodForType(IRBuilder* builder, IRType* origType)
     {
         auto result = lookUpInterfaceMethod(builder, origType, sharedContext->zeroMethodStructKey);
-        if (result && !result->findDecoration<IRNoSideEffectDecoration>())
-        {
-            builder->addDecoration(result, kIROp_NoSideEffectDecoration);
-        }
         return result;
     }
 
     IRInst* getAddMethodForType(IRBuilder* builder, IRType* origType)
     {
         auto result = lookUpInterfaceMethod(builder, origType, sharedContext->addMethodStructKey);
-        if (result && !result->findDecoration<IRNoSideEffectDecoration>())
-        {
-            builder->addDecoration(result, kIROp_NoSideEffectDecoration);
-        }
         return result;
     }
 };
diff --git a/source/slang/slang-ir-dce.cpp b/source/slang/slang-ir-dce.cpp
index 58c9b23f1..e5c9b1fdb 100644
--- a/source/slang/slang-ir-dce.cpp
+++ b/source/slang/slang-ir-dce.cpp
@@ -24,6 +24,11 @@ struct DeadCodeEliminationContext
     // These uses will be replaced with `undefInst`.
     IRInst* undefInst = nullptr;
 
+    // Track if we have removed any phi parameters.
+    // If so we need to rerun dce pass because after removing them
+    // there could be new DCE opportunities.
+    bool phiRemoved = false;
+
     // Our overall process is going to be to determine
     // which instructions in the module are "live"
     // and then eliminate anything that wasn't found to
@@ -98,104 +103,115 @@ struct DeadCodeEliminationContext
 
     bool processInst(IRInst* root)
     {
-        // First of all, we know that the root instruction
-        // should be considered as live, because otherwise
-        // we'd end up eliminating it, so that is a
-        // good place to start.
-        //
-        markInstAsLive(root);
-
-        // Ensure there is a global undef inst that is always alive.
-        // This undef inst will be used to fill in weak-referencing uses
-        // whose used value is marked as dead and eliminated.
-        // We always make sure this undef inst is available to prevent
-        // infiniate oscilating loops.
-        markInstAsLive(getUndefInst());
-
-        // Marking the module as live should have
-        // seeded our work list, so we can now start
-        // processing entries off of our work list
-        // until it goes dry.
-        //
-        while (workList.getCount())
+        bool result = false;
+        for (;;)
         {
-            auto inst = workList.getLast();
-            workList.removeLast();
+            liveInsts.Clear();
+            workList.clear();
 
-            if (!isChildInstOf(inst, root))
-                continue;
-
-            // At this point we know that `inst` is live,
-            // and we want to start considering which other
-            // instructions must be live because of that
-            // knowlege.
-            //
-            // A first easy case is that the parent (if any)
-            // of a live instruction had better be live, or
-            // else we might delete the parent, and
-            // the child with it.
+            // First of all, we know that the root instruction
+            // should be considered as live, because otherwise
+            // we'd end up eliminating it, so that is a
+            // good place to start.
             //
-            markInstAsLive(inst->getParent());
-
-            // Next the type of a live instruction, and all
-            // of its operands must also be live, or else
-            // we won't be able to compute its value.
+            markInstAsLive(root);
+
+            // Ensure there is a global undef inst that is always alive.
+            // This undef inst will be used to fill in weak-referencing uses
+            // whose used value is marked as dead and eliminated.
+            // We always make sure this undef inst is available to prevent
+            // infinite oscillating loops.
+            markInstAsLive(getUndefInst());
+
+            // Marking the module as live should have
+            // seeded our work list, so we can now start
+            // processing entries off of our work list
+            // until it goes dry.
             //
-            markInstAsLive(inst->getFullType());
-            UInt operandCount = inst->getOperandCount();
-            for (UInt ii = 0; ii < operandCount; ++ii)
+            while (workList.getCount())
             {
-                // There are some type of operands that needs to be treated as
-                // "weak" references -- they can never hold things alive, and
-                // whenever we delete the referenced value, these operands needs
-                // to be replaced with `undef`.
-                if (!isWeakReferenceOperand(inst, ii))
-                    markInstAsLive(inst->getOperand(ii));
-            }
+                auto inst = workList.getLast();
+                workList.removeLast();
+
+                if (!isChildInstOf(inst, root))
+                    continue;
+
+                // At this point we know that `inst` is live,
+                // and we want to start considering which other
+                // instructions must be live because of that
+                // knowledge.
+                //
+                // A first easy case is that the parent (if any)
+                // of a live instruction had better be live, or
+                // else we might delete the parent, and
+                // the child with it.
+                //
+                markInstAsLive(inst->getParent());
+
+                // Next the type of a live instruction, and all
+                // of its operands must also be live, or else
+                // we won't be able to compute its value.
+                //
+                markInstAsLive(inst->getFullType());
+                UInt operandCount = inst->getOperandCount();
+                for (UInt ii = 0; ii < operandCount; ++ii)
+                {
+                    // There are some types of operands that need to be treated as
+                    // "weak" references -- they can never hold things alive, and
+                    // whenever we delete the referenced value, these operands need
+                    // to be replaced with `undef`.
+                    if (!isWeakReferenceOperand(inst, ii))
+                        markInstAsLive(inst->getOperand(ii));
+                }
 
-            // Finally, we need to consider the children
-            // and decorations of the instruction.
-            //
-            // Note that just because an instruction is
-            // live doesn't mean its children must be, or
-            // else we'd never eliminate *anything* (we
-            // marked the whole module as live, and everything
-            // is a transitive child of the module).
-            //
-            // Decorations, in contrast, are always live if their
-            // parents are (because we don't want to silently drop
-            // decorations). It is still important to *mark*
-            // decorations as live, because they have operands,
-            // and those operands need to be marked as live.
-            // We will fold decorations into the same loop
-            // as children for simplicity.
-            //
-            // To keep the code here simple, we'll defer the
-            // decision of whether a child (or decoration)
-            // should be live when its parent is to a subroutine.
-            //
-            for (auto child : inst->getDecorationsAndChildren())
-            {
-                if (shouldInstBeLiveIfParentIsLive(child))
+                // Finally, we need to consider the children
+                // and decorations of the instruction.
+                //
+                // Note that just because an instruction is
+                // live doesn't mean its children must be, or
+                // else we'd never eliminate *anything* (we
+                // marked the whole module as live, and everything
+                // is a transitive child of the module).
+                //
+                // Decorations, in contrast, are always live if their
+                // parents are (because we don't want to silently drop
+                // decorations). It is still important to *mark*
+                // decorations as live, because they have operands,
+                // and those operands need to be marked as live.
+                // We will fold decorations into the same loop
+                // as children for simplicity.
+                //
+                // To keep the code here simple, we'll defer the
+                // decision of whether a child (or decoration)
+                // should be live when its parent is to a subroutine.
+                //
+                for (auto child : inst->getDecorationsAndChildren())
                 {
-                    // In this case, we know `inst` is live and
-                    // its `child` should be live if its parent is,
-                    // so the `child` must be live too.
-                    //
-                    markInstAsLive(child);
+                    if (shouldInstBeLiveIfParentIsLive(child))
+                    {
+                        // In this case, we know `inst` is live and
+                        // its `child` should be live if its parent is,
+                        // so the `child` must be live too.
+                        //
+                        markInstAsLive(child);
+                    }
                 }
             }
-        }
 
-        // If our work list runs dry, that means we've reached a steady
-        // state where everything that is transitively relevant to
-        // the "outputs" of the module has been marked as live.
-        //
-        // Now we can simply walk through all of our instructions
-        // recursively and eliminate those that are "dead" by
-        // virtue of not having been found live.
-        //
-        return eliminateDeadInstsRec(root);
+            // If our work list runs dry, that means we've reached a steady
+            // state where everything that is transitively relevant to
+            // the "outputs" of the module has been marked as live.
+            //
+            // Now we can simply walk through all of our instructions
+            // recursively and eliminate those that are "dead" by
+            // virtue of not having been found live.
+            //
+            phiRemoved = false;
+            result |= eliminateDeadInstsRec(root);
+            if (!phiRemoved)
+                break;
+        }
+        return result;
     }
 
     // Given the basic infrastructrure above, let's
@@ -207,6 +223,25 @@ struct DeadCodeEliminationContext
         return processInst(module->getModuleInst());
     }
 
+    void removePhiArgs(IRInst* phiParam)
+    {
+        // Find the position of `phiParam` within its block's parameter list.
+        auto ownerBlock = cast<IRBlock>(phiParam->getParent());
+        UInt argIndex = 0;
+        for (auto param = ownerBlock->getFirstParam(); param && param != phiParam; param = param->getNextParam())
+            ++argIndex;
+
+        // Drop the argument at that position from the branch ending each predecessor.
+        for (auto pred : ownerBlock->getPredecessors())
+        {
+            auto branch = as<IRUnconditionalBranch>(pred->getTerminator());
+            SLANG_ASSERT(argIndex < branch->getArgCount());
+            branch->removeArgument(argIndex);
+        }
+        // Record that a phi was removed; the code that resets this flag checks it afterwards.
+        phiRemoved = true;
+    }
+
     bool eliminateDeadInstsRec(IRInst* inst)
     {
         bool changed = false;
@@ -226,6 +261,12 @@ struct DeadCodeEliminationContext
             {
                 inst->replaceUsesWith(getUndefInst());
             }
+
+            if (inst->getOp() == kIROp_Param)
+            {
+                // For Phi parameters, we need to update all branch arguments.
+                removePhiArgs(inst);
+            }
             inst->removeAndDeallocate();
             changed = true;
         }
@@ -261,6 +302,16 @@ struct DeadCodeEliminationContext
     }
 };
 
+bool isFirstBlock(IRInst* inst)
+{
+    // True only when `inst` is a block that has a parent and is that parent's first block.
+    auto block = as<IRBlock>(inst);
+    auto parent = block ? block->getParent() : nullptr;
+    if (!parent)
+        return false;
+    return parent->getFirstBlock() == block;
+}
+
 bool shouldInstBeLiveIfParentIsLive(IRInst* inst, IRDeadCodeEliminationOptions options)
 {
     // The main source of confusion/complexity here is that
@@ -275,7 +326,31 @@ bool shouldInstBeLiveIfParentIsLive(IRInst* inst, IRDeadCodeEliminationOptions o
     // when it is executed, then we should keep it around.
     //
     if (inst->mightHaveSideEffects())
-        return true;
+    {
+        // If the inst has side effect, we should keep it alive.
+        // An exception is if we have a call to a pure function
+        // that writes its output to a local variable, but we
+        // don't have any uses of that local variable.
+        auto call = as<IRCall>(inst);
+        if (!call)
+            return true;
+        if (!getResolvedInstForDecorations(call->getCallee())->findDecoration<IRReadNoneDecoration>())
+            return true;
+        auto parentFunc = getParentFunc(inst);
+        if (!parentFunc)
+            return true;
+        for (UInt i = 0; i < call->getArgCount(); i++)
+        {
+            auto arg = call->getArg(i);
+            if (getParentFunc(arg) != parentFunc)
+                return true;
+            if (arg->getOp() != kIROp_Var)
+                return true;
+            if (arg->hasMoreThanOneUse())
+                return true;
+        }
+        return false;
+    }
     //
     // The `mightHaveSideEffects` query is conservative, and will
     // return `true` as its default mode, so once we are past that
@@ -352,17 +427,10 @@ bool shouldInstBeLiveIfParentIsLive(IRInst* inst, IRDeadCodeEliminationOptions o
     switch (inst->getOp())
     {
         // Function parameters obviously shouldn't get eliminated,
-        // even if nothing references them, and block parameters
-        // (phi nodes) will be considered live when their block is,
-        // just so that we don't have to deal with any complications
-        // around re-writing the relevant inter-block argument passing.
-        //
-        // TODO: A smarter DCE pass could deal with this case more
-        // carefully, or we could improve the interprocedural SCCP
-        // pass to deal with block parameters instead.
+        // even if nothing references them.
         //
     case kIROp_Param:
-        return true;
+        return isFirstBlock(inst->getParent());
 
         // IR struct types and witness tables are currently kludged
         // so that they have child instructions that represent their
diff --git a/source/slang/slang-ir-glsl-legalize.cpp b/source/slang/slang-ir-glsl-legalize.cpp
index e111a548b..9c16f40ac 100644
--- a/source/slang/slang-ir-glsl-legalize.cpp
+++ b/source/slang/slang-ir-glsl-legalize.cpp
@@ -2027,8 +2027,8 @@ void legalizeMeshOutputParam(
 
             IRBuilderInsertLocScope locScope{builder};
             builder->setInsertBefore(p);
-            auto e = builder->emitElementAddress(meshOutputBlockType, blockParam, p->getIndex());
-            auto a = builder->emitFieldAddress(builtin.type, e, builtin.key);
+            auto e = builder->emitElementAddress(builder->getPtrType(meshOutputBlockType), blockParam, p->getIndex());
+            auto a = builder->emitFieldAddress(builder->getPtrType(builtin.type), e, builtin.key);
 
             p->replaceUsesWith(a);
         });
diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h
index 4dea3985a..4b1037240 100644
--- a/source/slang/slang-ir-inst-defs.h
+++ b/source/slang/slang-ir-inst-defs.h
@@ -728,6 +728,9 @@ INST(HighLevelDeclDecoration,               highLevelDecl,          1, 0)
         /// Applie to an IR function and signals that inlining should not be performed unless unavoidable.
     INST(NoInlineDecoration, noInline, 0, 0)
 
+        /// A call to the decorated function should always be folded into its use site.
+    INST(AlwaysFoldIntoUseSiteDecoration, alwaysFold, 0, 0)
+
     INST(PayloadDecoration, payload, 0, 0)
 
     /* Mesh Shader outputs */
diff --git a/source/slang/slang-ir-insts.h b/source/slang/slang-ir-insts.h
index fe20f17f5..f2e4e05d3 100644
--- a/source/slang/slang-ir-insts.h
+++ b/source/slang/slang-ir-insts.h
@@ -325,6 +325,7 @@ IR_SIMPLE_DECORATION(HLSLExportDecoration)
 IR_SIMPLE_DECORATION(KeepAliveDecoration)
 IR_SIMPLE_DECORATION(RequiresNVAPIDecoration)
 IR_SIMPLE_DECORATION(NoInlineDecoration)
+IR_SIMPLE_DECORATION(AlwaysFoldIntoUseSiteDecoration)
 
 struct IRNVAPIMagicDecoration : IRDecoration
 {
@@ -1925,7 +1926,7 @@ struct IRUnconditionalBranch : IRTerminatorInst
     UInt getArgCount();
     IRUse* getArgs();
     IRInst* getArg(UInt index);
-
+    void removeArgument(UInt index);
     IR_PARENT_ISA(UnconditionalBranch);
 };
 
@@ -1968,20 +1969,6 @@ struct IRConditionalBranch : IRTerminatorInst
     IRBlock* getFalseBlock() { return (IRBlock*)falseBlock.get(); }
 };
 
-// A conditional branch that represent the test inside a loop
-struct IRLoopTest : IRConditionalBranch
-{
-};
-
-// A conditional branch that represents a one-sided `if`:
-//
-//     if( <condition> ) { <trueBlock> }
-//     <falseBlock>
-struct IRIf : IRConditionalBranch
-{
-    IRBlock* getAfterBlock() { return getFalseBlock(); }
-};
-
 // A conditional branch that represents a two-sided `if`:
 //
 //     if( <condition> ) { <trueBlock> }
@@ -3361,6 +3348,7 @@ public:
     IRInst* emitBitOr(IRType* type, IRInst* left, IRInst* right);
     IRInst* emitBitNot(IRType* type, IRInst* value);
     IRInst* emitNeg(IRType* type, IRInst* value);
+    IRInst* emitNot(IRType* type, IRInst* value);
 
     IRInst* emitAdd(IRType* type, IRInst* left, IRInst* right);
     IRInst* emitSub(IRType* type, IRInst* left, IRInst* right);
diff --git a/source/slang/slang-ir-loop-unroll.cpp b/source/slang/slang-ir-loop-unroll.cpp
index 79b00f60a..2f689ebde 100644
--- a/source/slang/slang-ir-loop-unroll.cpp
+++ b/source/slang/slang-ir-loop-unroll.cpp
@@ -47,7 +47,7 @@ static bool _eliminateDeadBlocks(List<IRBlock*>& blocks, IRBlock* unreachableBlo
     return changed;
 }
 
-List<IRBlock*> _collectBlocksInLoop(Dictionary<IRBlock*, int>& blockOrdering, IRLoop* loopInst)
+List<IRBlock*> _collectBlocksInLoop(IRDominatorTree* dom, IRLoop* loopInst)
 {
     List<IRBlock*> loopBlocks;
     HashSet<IRBlock*> loopBlocksSet;
@@ -58,7 +58,6 @@ List<IRBlock*> _collectBlocksInLoop(Dictionary<IRBlock*, int>& blockOrdering, IR
     };
     auto firstBlock = as<IRBlock>(loopInst->block.get());
     auto breakBlock = as<IRBlock>(loopInst->breakBlock.get());
-    auto breakBlockOrdering = blockOrdering[breakBlock].GetValue();
 
     addBlock(firstBlock);
     for (Index i = 0; i < loopBlocks.getCount(); i++)
@@ -68,18 +67,19 @@ List<IRBlock*> _collectBlocksInLoop(Dictionary<IRBlock*, int>& blockOrdering, IR
         {
             if (succ == breakBlock)
                 continue;
-            auto successorOrdering = blockOrdering[block].GetValue();
-            // The target must be post-dominated by the break block in order to be considered
-            // the body of the loop.
-            // Since we don't support arbitrary goto or multi-level continue, the simple
-            // ordering comparison is sufficient to serve as a post-dominance check.
-            if (successorOrdering < breakBlockOrdering)
+            if (dom->dominates(firstBlock, succ) && !dom->dominates(breakBlock, succ))
                 addBlock(succ);
         }
     }
     return loopBlocks;
 }
 
+List<IRBlock*> collectBlocksInLoop(IRGlobalValueWithCode* func, IRLoop* loopInst)
+{
+    // Compute the dominator tree of `func` and delegate to the tree-based collector.
+    return _collectBlocksInLoop(computeDominatorTree(func), loopInst);
+}
+
 static int _getLoopMaxIterationsToUnroll(IRLoop* loopInst)
 {
     static constexpr int kMaxIterationsToAttempt = 100;
@@ -483,15 +483,7 @@ bool unrollLoopsInFunc(
         // Remove any continue jumps from the loop.
         eliminateContinueBlocks(module, loop);
 
-        auto postOrderReverseCFG = getPostorderOnReverseCFG(func);
-        Dictionary<IRBlock*, int> blockOrdering;
-        
-        for (Index i = 0; i < postOrderReverseCFG.getCount(); i++)
-        {
-            blockOrdering[postOrderReverseCFG[i]] = (int)i;
-        }
-
-        auto blocks = _collectBlocksInLoop(blockOrdering, loop);
+        auto blocks = collectBlocksInLoop(func, loop);
         auto loopLoc = loop->sourceLoc;
         if (!_unrollLoop(module, loop, blocks))
         {
diff --git a/source/slang/slang-ir-loop-unroll.h b/source/slang/slang-ir-loop-unroll.h
index d9c31e6be..6f7a41192 100644
--- a/source/slang/slang-ir-loop-unroll.h
+++ b/source/slang/slang-ir-loop-unroll.h
@@ -1,18 +1,22 @@
 // slang-ir-loop-unroll.h
 #pragma once
 
+#include "../core/slang-list.h"
+
 namespace Slang
 {
     struct IRLoop;
     struct IRGlobalValueWithCode;
     class DiagnosticSink;
     struct IRModule;
+    struct IRBlock;
 
     // Return true if successfull, false if errors occurred.
     bool unrollLoopsInFunc(IRModule* module, IRGlobalValueWithCode* func, DiagnosticSink* sink);
 
     bool unrollLoopsInModule(IRModule* module, DiagnosticSink* sink);
 
+    List<IRBlock*> collectBlocksInLoop(IRGlobalValueWithCode* func, IRLoop* loop);
 
     // Turn a loop with continue block into a loop with only back jumps and breaks.
     // Each iteration will be wrapped in a breakable region, where everything before `continue`
diff --git a/source/slang/slang-ir-propagate-func-properties.cpp b/source/slang/slang-ir-propagate-func-properties.cpp
new file mode 100644
index 000000000..f98a77fc7
--- /dev/null
+++ b/source/slang/slang-ir-propagate-func-properties.cpp
@@ -0,0 +1,186 @@
+#include "slang-ir-propagate-func-properties.h"
+
+#include "slang-ir.h"
+#include "slang-ir-insts.h"
+#include "slang-ir-util.h"
+
+
+namespace Slang
+{
+bool propagateFuncProperties(IRModule* module)
+{
+    // Fixed-point pass: adds ReadNoneDecoration to every function whose body passes the checks below; returns true if any was added.
+    bool result = false;
+    List<IRFunc*> workList;
+    HashSet<IRFunc*> workListSet;
+
+    auto addToWorkList = [&](IRFunc* f) // enqueue `f` unless it is already in the set for this round
+    {
+        if (workListSet.Add(f))
+            workList.add(f);
+    };
+    auto addCallersToWorkList = [&](IRFunc* f) // enqueue the functions containing users of `f`
+    {
+        if (auto g = findOuterGeneric(f))
+        {
+            // Generic function: its users are reached through `specialize` insts of the outer generic.
+            for (auto use = g->firstUse; use; use = use->nextUse)
+            {
+                if (use->getUser()->getOp() == kIROp_Specialize)
+                {
+                    auto specialize = use->getUser();
+                    for (auto iuse = specialize->firstUse; iuse; iuse = iuse->nextUse)
+                    {
+                        if (auto userFunc = getParentFunc(iuse->getUser()))
+                            addToWorkList(userFunc);
+                    }
+                }
+            }
+            return;
+        }
+        for (auto use = f->firstUse; use; use = use->nextUse)
+        {
+            if (use->getUser()->getOp() == kIROp_Call)
+            {
+                if (auto userFunc = getParentFunc(use->getUser()))
+                    addToWorkList(userFunc);
+            }
+        }
+    };
+    for (;;)
+    {
+        bool changed = false;
+        workList.clear();
+        workListSet.Clear();
+
+        // Seed the work list with the callers of functions that already carry ReadNoneDecoration.
+        for (auto inst : module->getGlobalInsts())
+        {
+            auto genericInst = as<IRGeneric>(inst);
+            if (genericInst)
+            {
+                inst = findGenericReturnVal(genericInst);
+            }
+            if (auto func = as<IRFunc>(inst))
+            {
+                if (func->findDecoration<IRReadNoneDecoration>())
+                {
+                    addCallersToWorkList(func);
+                }
+            }
+        }
+
+        // Add remaining functions to work list.
+        for (auto inst : module->getGlobalInsts())
+        {
+            auto genericInst = as<IRGeneric>(inst);
+            if (genericInst)
+            {
+                inst = findGenericReturnVal(genericInst);
+            }
+            if (auto func = as<IRFunc>(inst))
+            {
+                addToWorkList(func);
+            }
+        }
+
+        IRBuilder builder(module);
+
+        for (Index i = 0; i < workList.getCount(); i++)
+        {
+            auto f = workList[i];
+            bool hasSideEffectCall = false; // set as soon as anything in `f` disqualifies it
+            if (f->findDecoration<IRReadNoneDecoration>())
+                continue;
+            // Never propagate to functions without a body.
+            if (f->getFirstBlock() == nullptr)
+                continue;
+            if (f->findDecoration<IRTargetIntrinsicDecoration>())
+                continue;
+            for (auto block : f->getBlocks())
+            {
+                for (auto inst : block->getChildren())
+                {
+                    // Side-effecting insts are tolerated only when they are one of the opcodes listed here.
+                    if (inst->mightHaveSideEffects())
+                    {
+                        switch (inst->getOp())
+                        {
+                        case kIROp_ifElse:
+                        case kIROp_unconditionalBranch:
+                        case kIROp_Switch:
+                        case kIROp_Return:
+                        case kIROp_loop:
+                        case kIROp_Store:
+                        case kIROp_Call:
+                        case kIROp_Param:
+                        case kIROp_Unreachable:
+                            break;
+                        default:
+                            // Unrecognized side-effecting inst (e.g. bufferStore, discard): only `f` is disqualified.
+                            hasSideEffectCall = true;
+                            break;
+                        }
+                    }
+
+                    if (auto call = as<IRCall>(inst))
+                    {
+                        auto callee = getResolvedInstForDecorations(call->getCallee());
+                        switch (callee->getOp())
+                        {
+                        default:
+                            // We are calling an unknown function, so we have to assume
+                            // there are side effects in the call.
+                            hasSideEffectCall = true;
+                            break;
+                        case kIROp_Func:
+                            if (!callee->findDecoration<IRReadNoneDecoration>())
+                            {
+                                hasSideEffectCall = true;
+                                break;
+                            }
+                        }
+                    }
+                    // An operand defined outside `f` disqualifies `f` unless it is a constant, a type, or a listed opcode.
+                    for (UInt o = 0; o < inst->getOperandCount(); o++)
+                    {
+                        auto operand = inst->getOperand(o);
+                        if (getParentFunc(operand) == f)
+                            continue;
+                        if (as<IRConstant>(operand))
+                            continue;
+                        if (as<IRType>(operand))
+                            continue;
+                        switch (operand->getOp())
+                        {
+                        case kIROp_Specialize:
+                        case kIROp_LookupWitness:
+                        case kIROp_StructKey:
+                        case kIROp_WitnessTable:
+                        case kIROp_WitnessTableEntry:
+                        case kIROp_undefined:
+                        case kIROp_Func:
+                            continue;
+                        default:
+                            break;
+                        }
+                        hasSideEffectCall = true;
+                        break;
+                    }
+                }
+                if (hasSideEffectCall)
+                    break;
+            }
+            if (!hasSideEffectCall)
+            {
+                builder.addDecoration(f, kIROp_ReadNoneDecoration);
+                addCallersToWorkList(f);
+                changed = true;
+            }
+        }
+        result |= changed;
+        if (!changed)
+            break;
+    }
+    return result;
+}
+}
diff --git a/source/slang/slang-ir-propagate-func-properties.h b/source/slang/slang-ir-propagate-func-properties.h
new file mode 100644
index 000000000..6df2de18e
--- /dev/null
+++ b/source/slang/slang-ir-propagate-func-properties.h
@@ -0,0 +1,7 @@
+#pragma once
+
+namespace Slang
+{
+struct IRModule;
+bool propagateFuncProperties(IRModule* module);
+}
diff --git a/source/slang/slang-ir-redundancy-removal.cpp b/source/slang/slang-ir-redundancy-removal.cpp
index f3996fc01..2a2047de9 100644
--- a/source/slang/slang-ir-redundancy-removal.cpp
+++ b/source/slang/slang-ir-redundancy-removal.cpp
@@ -8,10 +8,118 @@ namespace Slang
 struct RedundancyRemovalContext
 {
     RefPtr<IRDominatorTree> dom;
-    bool removeRedundancyInBlock(DeduplicateContext& deduplicateContext, IRBlock* block)
+    bool isMovableInst(IRInst* inst) // opcode filter used by removeRedundancyInBlock for deduplication and hoisting
+    {
+        switch (inst->getOp())
+        {
+        case kIROp_Add:
+        case kIROp_Sub:
+        case kIROp_Mul:
+        case kIROp_Div:
+        case kIROp_FRem:
+        case kIROp_IRem:
+        case kIROp_Lsh:
+        case kIROp_Rsh:
+        case kIROp_And:
+        case kIROp_Or:
+        case kIROp_Not:
+        case kIROp_FieldExtract:
+        case kIROp_FieldAddress:
+        case kIROp_GetElement:
+        case kIROp_GetElementPtr:
+        case kIROp_UpdateElement:
+        case kIROp_OptionalHasValue:
+        case kIROp_GetOptionalValue:
+        case kIROp_MakeOptionalValue:
+        case kIROp_MakeTuple:
+        case kIROp_GetTupleElement:
+        case kIROp_MakeStruct:
+        case kIROp_MakeArray:
+        case kIROp_MakeArrayFromElement:
+        case kIROp_MakeVector:
+        case kIROp_MakeMatrix:
+        case kIROp_MakeMatrixFromScalar:
+        case kIROp_MakeVectorFromScalar:
+        case kIROp_swizzle:
+        case kIROp_MatrixReshape:
+        case kIROp_MakeString:
+        case kIROp_MakeResultError:
+        case kIROp_MakeResultValue:
+        case kIROp_GetResultError:
+        case kIROp_GetResultValue:
+        case kIROp_CastFloatToInt:
+        case kIROp_CastIntToFloat:
+        case kIROp_CastIntToPtr:
+        case kIROp_CastPtrToBool:
+        case kIROp_CastPtrToInt:
+        case kIROp_BitAnd:
+        case kIROp_BitNot:
+        case kIROp_BitOr:
+        case kIROp_BitXor:
+        case kIROp_BitCast:
+        case kIROp_Reinterpret:
+        case kIROp_Greater:
+        case kIROp_Less:
+        case kIROp_Geq:
+        case kIROp_Leq:
+        case kIROp_Neq:
+        case kIROp_Eql:
+            return true; // every opcode listed above is accepted unconditionally
+        case kIROp_Call:
+            return isPureFunctionalCall(as<IRCall>(inst)); // calls are accepted only when this helper says so
+        default:
+            return false; // any opcode not listed is rejected
+        }
+    }
+
+    bool tryHoistInstToOuterMostLoop(IRGlobalValueWithCode* func, IRInst* inst) // returns true if `inst` was moved
+    {
+        bool changed = false;
+        for (auto parentBlock = dom->getImmediateDominator(as<IRBlock>(inst->getParent())); // walk up the dominator chain
+             parentBlock;
+             parentBlock = dom->getImmediateDominator(parentBlock))
+        {
+            auto terminatorInst = parentBlock->getTerminator();
+            if (terminatorInst->getOp() == kIROp_loop) // only blocks ending in a `loop` terminator are candidates
+            {
+                // Consider hoisting the inst into this block.
+                // This is only possible if all operands of the inst are dominating `parentBlock`.
+                bool canHoist = true;
+                for (UInt i = 0; i < inst->getOperandCount(); i++)
+                {
+                    auto operand = inst->getOperand(i);
+                    if (getParentFunc(operand) != func)
+                    {
+                        // An operand that does not belong to `func` (e.g. a global value) never blocks hoisting.
+                        continue;
+                    }
+                    auto operandParent = as<IRBlock>(operand->getParent());
+                    if (!operandParent) // operand of `func` whose parent is not a block: do not hoist
+                    {
+                        canHoist = false;
+                        break;
+                    }
+                    canHoist = dom->dominates(operandParent, parentBlock);
+                    if (!canHoist)
+                        break;
+                }
+                if (!canHoist)
+                    break; // stop the whole walk; no further candidate is examined
+
+                // Move inst to the end of parentBlock, just before its `loop` terminator.
+                inst->insertBefore(terminatorInst);
+                changed = true;
+
+                // Keep walking: a dominating block further up may also end in a `loop` terminator.
+            }
+        }
+        return changed;
+    }
+
+    bool removeRedundancyInBlock(DeduplicateContext& deduplicateContext, IRGlobalValueWithCode* func, IRBlock* block)
     {
         bool result = false;
-        for (auto instP : block->getChildren())
+        for (auto instP : block->getModifiableChildren())
         {
             auto resultInst = deduplicateContext.deduplicate(instP, [&](IRInst* inst)
                 {
@@ -20,75 +128,25 @@ struct RedundancyRemovalContext
                         return false;
                     if (dom->isUnreachable(parentBlock))
                         return false;
-
-                    switch (inst->getOp())
-                    {
-                    case kIROp_Add:
-                    case kIROp_Sub:
-                    case kIROp_Mul:
-                    case kIROp_Div:
-                    case kIROp_Module:
-                    case kIROp_Lsh:
-                    case kIROp_Rsh:
-                    case kIROp_And:
-                    case kIROp_Or:
-                    case kIROp_Not:
-                    case kIROp_FieldExtract:
-                    case kIROp_FieldAddress:
-                    case kIROp_GetElement:
-                    case kIROp_GetElementPtr:
-                    case kIROp_UpdateElement:
-                    case kIROp_OptionalHasValue:
-                    case kIROp_GetOptionalValue:
-                    case kIROp_MakeOptionalValue:
-                    case kIROp_MakeTuple:
-                    case kIROp_GetTupleElement:
-                    case kIROp_MakeStruct:
-                    case kIROp_MakeArray:
-                    case kIROp_MakeArrayFromElement:
-                    case kIROp_MakeVector:
-                    case kIROp_MakeMatrix:
-                    case kIROp_MakeMatrixFromScalar:
-                    case kIROp_MakeVectorFromScalar:
-                    case kIROp_swizzle:
-                    case kIROp_MatrixReshape:
-                    case kIROp_MakeString:
-                    case kIROp_MakeResultError:
-                    case kIROp_MakeResultValue:
-                    case kIROp_GetResultError:
-                    case kIROp_GetResultValue:
-                    case kIROp_CastFloatToInt:
-                    case kIROp_CastIntToFloat:
-                    case kIROp_CastIntToPtr:
-                    case kIROp_CastPtrToBool:
-                    case kIROp_CastPtrToInt:
-                    case kIROp_BitAnd:
-                    case kIROp_BitNot:
-                    case kIROp_BitOr:
-                    case kIROp_BitXor:
-                    case kIROp_BitCast:
-                    case kIROp_Reinterpret:
-                    case kIROp_Greater:
-                    case kIROp_Less:
-                    case kIROp_Geq:
-                    case kIROp_Leq:
-                    case kIROp_Neq:
-                    case kIROp_Eql:
-                        return true;
-                    case kIROp_Call:
-                        return isPureFunctionalCall(as<IRCall>(inst));
-                    default:
-                        return false;
-                    }
+                    return isMovableInst(inst);
                 });
             if (resultInst != instP)
+            {
+                instP->replaceUsesWith(resultInst);
                 result = true;
+            }
+            else if (isMovableInst(resultInst))
+            {
+                // This inst is unique, we should consider hoisting it
+                // if it is inside a loop.
+                result |= tryHoistInstToOuterMostLoop(func, resultInst);
+            }
         }
         for (auto child : dom->getImmediatelyDominatedBlocks(block))
         {
             DeduplicateContext subContext;
             subContext.deduplicateMap = deduplicateContext.deduplicateMap;
-            result |= removeRedundancyInBlock(subContext, child);
+            result |= removeRedundancyInBlock(subContext, func, child);
         }
         return result;
     }
@@ -122,7 +180,142 @@ bool removeRedundancyInFunc(IRGlobalValueWithCode* func)
     RedundancyRemovalContext context;
     context.dom = computeDominatorTree(func);
     DeduplicateContext deduplicateCtx;
-    return context.removeRedundancyInBlock(deduplicateCtx, root);
+    return context.removeRedundancyInBlock(deduplicateCtx, func, root);
+}
+
+static IRInst* _getRootVar(IRInst* inst)
+{
+    // Peel FieldAddress / GetElementPtr layers by stepping to operand 0 until
+    // an inst with any other opcode is reached; that inst is the result.
+    // A null input, or a null operand along the way, yields null.
+    IRInst* current = inst;
+    while (current)
+    {
+        const auto op = current->getOp();
+        const bool isAccessStep = op == kIROp_FieldAddress || op == kIROp_GetElementPtr;
+        if (!isAccessStep)
+            break;
+        current = current->getOperand(0);
+    }
+    return current;
+}
+
+bool tryRemoveRedundantStore(IRGlobalValueWithCode* func, IRStore* store)
+{
+    // Quick, conservative check with two independent ways to show the store is redundant:
+    // (1) its root local variable has no non-store use along this store's access chain, or
+    // (2) a later store to the same pointer is reached (in this block or through
+    //     unconditional branches) before any inst that may use the address.
+    bool hasAddrUse = false;
+    bool hasOverridingStore = false;
+
+    // Only stores whose root address is a child inst of `func` are candidates (globals are never removed).
+    auto rootVar = _getRootVar(store->getPtr());
+    if (!isChildInstOf(rootVar, func))
+        return false;
+
+    // A store can be removed if it stores into a local variable
+    // that has no other uses than store.
+    if (auto varInst = as<IRVar>(rootVar))
+    {
+        bool hasNonStoreUse = false;
+        // If nothing along the access chain has a non-store use, we can safely remove the store.
+        HashSet<IRInst*> knownAccessChain;
+        for (auto accessChain = store->getPtr(); accessChain;)
+        {
+            knownAccessChain.Add(accessChain);
+            for (auto use = accessChain->firstUse; use; use = use->nextUse)
+            {
+                if (as<IRDecoration>(use->getUser()))
+                    continue;
+                if (knownAccessChain.Contains(use->getUser())) // the next link of the chain we already walked
+                    continue;
+                if (use->getUser()->getOp() == kIROp_Store &&
+                    use == use->getUser()->getOperands()) // the use is the first operand slot of that store
+                {
+                    continue;
+                }
+                hasNonStoreUse = true;
+                break;
+            }
+            if (hasNonStoreUse)
+                break;
+            switch (accessChain->getOp())
+            {
+            case kIROp_GetElementPtr:
+            case kIROp_FieldAddress:
+                accessChain = accessChain->getOperand(0); // step towards the root and check its uses too
+                continue;
+            default:
+                break;
+            }
+            break;
+        }
+        if (!hasNonStoreUse)
+        {
+            store->removeAndDeallocate();
+            return true;
+        }
+    }
+
+    // A store can be removed if there are subsequent stores to the same variable,
+    // and there are no insts in between the stores that can read the variable.
+
+    HashSet<IRBlock*> visitedBlocks;
+    for (auto next = store->getNextInst(); next;)
+    {
+        if (auto nextStore = as<IRStore>(next))
+        {
+            if (nextStore->getPtr() == store->getPtr()) // same pointer inst, not merely an aliasing address
+            {
+                hasOverridingStore = true;
+                break;
+            }
+        }
+
+        // If we see any inst that reads or modifies the address before seeing
+        // an overriding store, don't remove the store.
+        // We can make the test more accurate by collecting all addresses related to
+        // the target address first, and only bail out if any of the related addresses
+        // are involved.
+        switch (next->getOp())
+        {
+        case kIROp_Load:
+            if (canAddressesPotentiallyAlias(func, next->getOperand(0), store->getPtr()))
+            {
+                hasAddrUse = true;
+            }
+            break;
+        default:
+            if (canInstHaveSideEffectAtAddress(func, next, store->getPtr()))
+            {
+                hasAddrUse = true;
+            }
+            break;
+        }
+        if (hasAddrUse)
+            break;
+
+        // If we are at the end of the current block and see an unconditional branch,
+        // we can follow the path and check the subsequent block.
+        if (auto branch = as<IRUnconditionalBranch>(next))
+        {
+            auto nextBlock = branch->getTargetBlock();
+            if (visitedBlocks.Add(nextBlock)) // a target already recorded here is not followed again
+            {
+                next = nextBlock->getFirstInst();
+                continue;
+            }
+        }
+        next = next->getNextInst();
+    }
+
+    if (!hasAddrUse && hasOverridingStore)
+    {
+        store->removeAndDeallocate();
+        return true;
+    }
+    return false;
+}
 
 bool eliminateRedundantLoadStore(IRGlobalValueWithCode* func)
@@ -158,57 +351,7 @@ bool eliminateRedundantLoadStore(IRGlobalValueWithCode* func)
             }
             else if (auto store = as<IRStore>(inst))
             {
-                // We perform a quick and conservative check:
-                // A store is redundant if it is followed by another store to the same address in
-                // the same basic block, and there are no instructions that may use any addresses
-                // related to this address.
-                bool hasAddrUse = false;
-                bool hasOverridingStore = false;
-
-                // Stores to global variables will never get removed.
-                if (!isChildInstOf(store->getPtr(), func))
-                    hasAddrUse = true;
-
-                for (auto next = store->getNextInst(); next; next = next->getNextInst())
-                {
-                    if (auto nextStore = as<IRStore>(next))
-                    {
-                        if (nextStore->getPtr() == store->getPtr())
-                        {
-                            hasOverridingStore = true;
-                            break;
-                        }
-                    }
-
-                    // If we see any insts that have reads or modifies the address before seeing
-                    // an overriding store, don't remove the store.
-                    // We can make the test more accurate by collecting all addresses related to
-                    // the target address first, and only bail out if any of the related addresses
-                    // are involved.
-                    switch (next->getOp())
-                    {
-                    case kIROp_Load:
-                        if (canAddressesPotentiallyAlias(func, next->getOperand(0), store->getPtr()))
-                        {
-                            hasAddrUse = true;
-                        }
-                        break;
-                    default:
-                        if (canInstHaveSideEffectAtAddress(func, next, store->getPtr()))
-                        {
-                            hasAddrUse = true;
-                        }
-                        break;
-                    }
-                    if (hasAddrUse)
-                        break;
-                }
-
-                if (!hasAddrUse && hasOverridingStore)
-                {
-                    store->removeAndDeallocate();
-                    changed = true;
-                }
+                changed |= tryRemoveRedundantStore(func, store);
             }
             inst = nextInst;
         }
diff --git a/source/slang/slang-ir-sccp.cpp b/source/slang/slang-ir-sccp.cpp
index d05527e59..691bd7ff0 100644
--- a/source/slang/slang-ir-sccp.cpp
+++ b/source/slang/slang-ir-sccp.cpp
@@ -1439,7 +1439,9 @@ struct SCCPContext
                 inst->replaceUsesWith(constantVal);
                 if( !inst->mightHaveSideEffects() )
                 {
-                    instsToRemove.add(inst);
+                    // Don't delete phi parameters, they will be cleaned up in CFG simplification.
+                    if (inst->getOp() != kIROp_Param)
+                        instsToRemove.add(inst);
                 }
             }
         }
diff --git a/source/slang/slang-ir-simplify-cfg.cpp b/source/slang/slang-ir-simplify-cfg.cpp
index 7e9e105e1..b814442fa 100644
--- a/source/slang/slang-ir-simplify-cfg.cpp
+++ b/source/slang/slang-ir-simplify-cfg.cpp
@@ -4,6 +4,8 @@
 #include "slang-ir.h"
 #include "slang-ir-dominators.h"
 #include "slang-ir-restructure.h"
+#include "slang-ir-util.h"
+#include "slang-ir-loop-unroll.h"
 
 namespace Slang
 {
@@ -31,8 +33,7 @@ static BreakableRegion* findBreakableRegion(Region* region)
 // it is needed and hasn't been generated yet.
 static bool isTrivialSingleIterationLoop(
     IRGlobalValueWithCode* func,
-    IRLoop* loop,
-    CFGSimplificationContext& inoutContext)
+    IRLoop* loop)
 {
     auto targetBlock = loop->getTargetBlock();
     if (targetBlock->getPredecessors().getCount() != 1) return false;
@@ -52,14 +53,14 @@ static bool isTrivialSingleIterationLoop(
     // 
     // We need to verify this is a trivial loop by checking if there is any multi-level breaks
     // that skips out of this loop.
-
-    if (!inoutContext.domTree)
-        inoutContext.domTree = computeDominatorTree(func);
-    if (!inoutContext.regionTree)
-        inoutContext.regionTree = generateRegionTreeForFunc(func, nullptr);
+    CFGSimplificationContext context;
+    if (!context.domTree)
+        context.domTree = computeDominatorTree(func);
+    if (!context.regionTree)
+        context.regionTree = generateRegionTreeForFunc(func, nullptr);
 
     SimpleRegion* targetBlockRegion = nullptr;
-    if (!inoutContext.regionTree->mapBlockToRegion.TryGetValue(targetBlock, targetBlockRegion))
+    if (!context.regionTree->mapBlockToRegion.TryGetValue(targetBlock, targetBlockRegion))
         return false;
     BreakableRegion* loopBreakableRegion = findBreakableRegion(targetBlockRegion);
     LoopRegion* loopRegion = as<LoopRegion>(loopBreakableRegion);
@@ -67,18 +68,18 @@ static bool isTrivialSingleIterationLoop(
         return false;
     for (auto block : func->getBlocks())
     {
-        if (!inoutContext.domTree->dominates(loop->getTargetBlock(), block))
+        if (!context.domTree->dominates(loop->getTargetBlock(), block))
             continue;
-        if (inoutContext.domTree->dominates(loop->getBreakBlock(), block))
+        if (context.domTree->dominates(loop->getBreakBlock(), block))
             continue;
         SimpleRegion* region = nullptr;
-        if (!inoutContext.regionTree->mapBlockToRegion.TryGetValue(block, region))
+        if (!context.regionTree->mapBlockToRegion.TryGetValue(block, region))
             return false;
 
         for (auto branchTarget : block->getSuccessors())
         {
             SimpleRegion* targetRegion = nullptr;
-            if (!inoutContext.regionTree->mapBlockToRegion.TryGetValue(branchTarget, targetRegion))
+            if (!context.regionTree->mapBlockToRegion.TryGetValue(branchTarget, targetRegion))
                 return false;
             // If multi-level break out that skips over this loop exists, then this is not a trivial loop.
             if (targetRegion->isDescendentOf(loopRegion))
@@ -96,6 +97,104 @@ static bool isTrivialSingleIterationLoop(
     return true;
 }
 
+static bool doesLoopHasSideEffect(IRGlobalValueWithCode* func, IRLoop* loopInst)
+{ // Returns true unless every inst in the loop is shown to have no effect visible outside the loop's blocks.
+    auto blocks = collectBlocksInLoop(func, loopInst);
+    HashSet<IRBlock*> loopBlocks; // Same blocks as `blocks`, kept as a set for membership tests.
+    for (auto b : blocks)
+        loopBlocks.Add(b);
+    auto addressHasOutOfLoopUses = [&](IRInst* addr)
+    {
+        // The entire access chain of `addr` must have no uses outside the loop.
+        // The root variable must be a local var.
+        for (auto chainNode = addr; chainNode;)
+        {
+            if (getParentFunc(chainNode) != func)
+                return true; // The node does not belong to `func`.
+            for (auto use = chainNode->firstUse; use; use = use->nextUse)
+            {
+                if (!loopBlocks.Contains(as<IRBlock>(use->getUser()->getParent())))
+                    return true; // Some user lives in a block that is not part of the loop.
+            }
+            switch (chainNode->getOp())
+            {
+            case kIROp_GetElementPtr:
+            case kIROp_FieldAddress:
+                chainNode = chainNode->getOperand(0); // Step to the base address and check it as well.
+                continue;
+            case kIROp_Var:
+                break; // Reached the root variable: leave the switch, then the walk.
+            default:
+                return true; // Any other address producer is treated conservatively.
+            }
+            break;
+        }
+        return false;
+    };
+
+    for (auto b : blocks)
+    {
+        for (auto inst : b->getChildren())
+        {
+            // Is this inst used anywhere outside the loop? If so the loop has side effect.
+            for (auto use = inst->firstUse; use; use = use->nextUse)
+            {
+                if (!loopBlocks.Contains(as<IRBlock>(use->getUser()->getParent())))
+                    return true;
+            }
+
+            // The inst can't possibly have side effect? Skip it.
+            if (!inst->mightHaveSideEffects())
+                continue;
+
+            // This inst might have side effect, try to prove that the
+            // side effect does not leak beyond the scope of the loop.
+            if (auto call = as<IRCall>(inst))
+            {
+                auto callee = getResolvedInstForDecorations(call->getCallee());
+                if (!callee || !callee->findDecoration<IRReadNoneDecoration>())
+                    return true; // Unknown callee, or callee not marked ReadNone.
+                // The callee is marked ReadNone; check whether any non-value-typed
+                // argument (a potential output address) is used outside the loop.
+                for (UInt i = 0; i < call->getArgCount(); i++)
+                {
+                    auto arg = call->getArg(i);
+                    if (!isValueType(arg->getDataType()))
+                    {
+                        if (addressHasOutOfLoopUses(arg))
+                            return true;
+                    }
+                }
+            }
+            else if (auto store = as<IRStore>(inst))
+            {
+                if (addressHasOutOfLoopUses(store->getPtr()))
+                    return true; // The stored-to address is observable outside the loop.
+            }
+            else if (auto branch = as<IRUnconditionalBranch>(inst))
+            {
+                if (loopBlocks.Contains(branch->getTargetBlock()))
+                    continue; // A jump that stays inside the loop.
+                // Branching out of the loop with some argument is considered
+                // having a side effect.
+                if (branch->getArgCount() != 0)
+                    return true;
+            }
+            else if (as<IRIfElse>(inst) || as<IRSwitch>(inst) || as<IRLoop>(inst))
+            {
+                // We are starting a sub control flow.
+                // This is considered side effect free.
+            }
+            else
+            {
+                // For all other insts, we assume it has a global side effect.
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
 static bool removeDeadBlocks(IRGlobalValueWithCode* func)
 {
     bool changed = false;
@@ -142,15 +241,327 @@ static bool removeDeadBlocks(IRGlobalValueWithCode* func)
     return changed;
 }
 
+// Returns true if `branchBlock` is the after block of `condBranch` itself, or if its first
+// ordinary inst is a plain `unconditionalBranch` (not a `loop`) that targets the after block.
+static bool isTrivialIfElseBranch(IRIfElse* condBranch, IRBlock* branchBlock)
+{
+    const auto mergeBlock = condBranch->getAfterBlock();
+
+    // An edge that goes straight to the after block is trivial.
+    if (branchBlock == mergeBlock)
+        return true;
+
+    // Otherwise the first ordinary inst of the block must already be the jump.
+    auto jump = as<IRUnconditionalBranch>(branchBlock->getFirstOrdinaryInst());
+    if (!jump)
+        return false;
+
+    // IRUnconditionalBranch can also be a `loop` inst; only the plain opcode qualifies.
+    if (jump->getOp() != kIROp_unconditionalBranch)
+        return false;
+    return jump->getTargetBlock() == mergeBlock;
+}
+
+static bool arePhiArgsEquivalentInBranches(IRIfElse* ifElse)
+{
+    // Decide whether both sides of `ifElse` deliver identical phi arguments to the
+    // after block.
+    //
+    // When either side targets the after block directly the answer is trivially yes:
+    // a block with phi parameters can't be the target of a conditional branch, so the
+    // arg count must be 0 in that case.
+    auto thenBlock = ifElse->getTrueBlock();
+    auto elseBlock = ifElse->getFalseBlock();
+    auto mergeBlock = ifElse->getAfterBlock();
+    if (thenBlock == mergeBlock)
+        return true;
+    if (elseBlock == mergeBlock)
+        return true;
+
+    // Both sides go through an intermediate block; each must end in an unconditional branch.
+    auto thenJump = as<IRUnconditionalBranch>(thenBlock->getTerminator());
+    auto elseJump = as<IRUnconditionalBranch>(elseBlock->getTerminator());
+    if (!thenJump || !elseJump)
+        return false;
+
+    // A differing arg count should never happen; answer conservatively if it does.
+    const UInt argCount = thenJump->getArgCount();
+    if (argCount != elseJump->getArgCount())
+        return false;
+
+    // Any pair of differing arguments makes the if-else non-trivial.
+    for (UInt argIndex = 0; argIndex < argCount; argIndex++)
+    {
+        if (thenJump->getArg(argIndex) != elseJump->getArg(argIndex))
+            return false;
+    }
+    return true;
+}
+
+static bool isTrivialIfElse(IRIfElse* condBranch, bool& isTrueBranchTrivial, bool& isFalseBranchTrivial)
+{
+    // Classify each side first. Both out-flags are written even when the overall answer
+    // is false.
+    isTrueBranchTrivial = isTrivialIfElseBranch(condBranch, condBranch->getTrueBlock());
+    isFalseBranchTrivial = isTrivialIfElseBranch(condBranch, condBranch->getFalseBlock());
+    // The if-else as a whole is trivial only when both sides are trivial and both
+    // sides hand identical phi arguments to the after block.
+    const bool bothSidesTrivial = isTrueBranchTrivial && isFalseBranchTrivial;
+    return bothSidesTrivial && arePhiArgsEquivalentInBranches(condBranch);
+}
+
+#if 0 // Compiled out; its call in trySimplifyIfElse is commented out too.
+static bool tryMoveFalseBranchToTrueBranch(IRBuilder& builder, IRIfElse* ifElseInst)
+{
+    auto falseBlock = ifElseInst->getFalseBlock();
+    if (falseBlock == ifElseInst->getAfterBlock())
+        return false; // There is no separate false block to move.
+    if (auto termInst = as<IRUnconditionalBranch>(falseBlock->getTerminator()))
+    {
+        // We can't fold a branch with arguments into the ifElse.
+        if (termInst->getArgCount() != 0)
+            return false;
+    }
+    ifElseInst->trueBlock.set(falseBlock); // The old false block becomes the true target...
+    ifElseInst->falseBlock.set(ifElseInst->getAfterBlock()); // ...and the false edge goes to the after block.
+    builder.setInsertBefore(ifElseInst);
+    auto newCondition = builder.emitNot(builder.getBoolType(), ifElseInst->getCondition());
+    ifElseInst->condition.set(newCondition); // Swapped targets are paired with the negated condition.
+    return true;
+}
+#endif
+
+static bool tryEliminateFalseBranch(IRIfElse* ifElseInst)
+{
+    // Point the false edge of `ifElseInst` directly at its after block; returns true if modified.
+    const auto elseBlock = ifElseInst->getFalseBlock();
+    const auto mergeBlock = ifElseInst->getAfterBlock();
+    if (elseBlock == mergeBlock)
+        return false; // The false edge already goes to the after block.
+    // A branch that carries phi arguments can't be folded into the ifElse.
+    auto elseJump = as<IRUnconditionalBranch>(elseBlock->getTerminator());
+    if (elseJump && elseJump->getArgCount() != 0)
+        return false;
+    ifElseInst->falseBlock.set(mergeBlock);
+    return true;
+}
+
+static bool trySimplifyIfElse(IRBuilder& builder, IRIfElse* ifElseInst)
+{
+    bool isTrueBranchTrivial = false;
+    bool isFalseBranchTrivial = false;
+    if (isTrivialIfElse(ifElseInst, isTrueBranchTrivial, isFalseBranchTrivial))
+    {
+        // Both branches are trivial jumps into the after block with equal phi args: replace the
+        // conditional branch with one jump forwarding them. A direct edge to afterBlock carries none.
+        List<IRInst*> args;
+        auto trueBlock = ifElseInst->getTrueBlock();
+        if (trueBlock != ifElseInst->getAfterBlock()) // else getTerminator() would be afterBlock's own, unrelated terminator
+        {
+            auto termInst = as<IRUnconditionalBranch>(trueBlock->getTerminator());
+            if (!termInst)
+                return false;
+            for (UInt i = 0; i < termInst->getArgCount(); i++)
+                args.add(termInst->getArg(i));
+        }
+        builder.setInsertBefore(ifElseInst);
+        builder.emitBranch(ifElseInst->getAfterBlock(), (Int)args.getCount(), args.getBuffer());
+        ifElseInst->removeAndDeallocate();
+        return true;
+    }
+    else if (isTrueBranchTrivial)
+    {
+        // TODO: moving the false branch to the true side (inverting the condition) is disabled for now: auto-diff can't handle loops whose body is on the false side.
+        //return tryMoveFalseBranchToTrueBranch(builder, ifElseInst);
+    }
+    else if (isFalseBranchTrivial)
+    {
+        return tryEliminateFalseBranch(ifElseInst); // Empty false branch: retarget it to afterBlock.
+    }
+    return false;
+}
+
+static bool isTrueLit(IRInst* lit)
+{
+    // Holds only for a bool literal inst whose value is `true`.
+    auto literal = as<IRBoolLit>(lit);
+    return literal && literal->getValue();
+}
+static bool isFalseLit(IRInst* lit)
+{
+    // Holds only for a bool literal inst whose value is `false`; non-literals give false.
+    auto literal = as<IRBoolLit>(lit);
+    return literal && !literal->getValue();
+}
+
+static bool simplifyBoolPhiParam(IRIfElse* ifElse, Array<IRBlock*, 2>& preds, IRParam* param, UInt paramIndex)
+{
+    // A bool phi whose incoming values are the literals true/false from the two sides of one
+    // `if-else` equals that condition or its negation. `preds` is {true-side, false-side} (see caller).
+
+    if (!param->getDataType() || param->getDataType()->getOp() != kIROp_BoolType)
+        return false; // Only bool-typed params are handled.
+
+    auto branch0 = as<IRUnconditionalBranch>(preds[0]->getTerminator());
+    if (!branch0)
+        return false;
+    if (branch0->getArgCount() <= paramIndex)
+        return false; // No argument at this index; leave the param alone.
+    auto branch1 = as<IRUnconditionalBranch>(preds[1]->getTerminator());
+    if (!branch1)
+        return false;
+    if (branch1->getArgCount() <= paramIndex)
+        return false;
+
+    IRInst* replacement = nullptr;
+    if (isTrueLit(branch0->getArg(paramIndex)) && isFalseLit(branch1->getArg(paramIndex)))
+    {
+        replacement = ifElse->getCondition(); // true from preds[0], false from preds[1]: the condition itself.
+    }
+    else if (isFalseLit(branch0->getArg(paramIndex)) && isTrueLit(branch1->getArg(paramIndex)))
+    {
+        IRBuilder builder(param);
+        setInsertBeforeOrdinaryInst(&builder, param);
+        replacement = builder.emitNot(builder.getBoolType(), ifElse->getCondition()); // Opposite literals: the negated condition.
+    }
+    if (replacement)
+    {
+        param->replaceUsesWith(replacement);
+        param->removeAndDeallocate();
+        branch0->removeArgument(paramIndex); // Drop the now-unused argument from both incoming branches.
+        branch1->removeArgument(paramIndex);
+        return true;
+    }
+    return false;
+}
+
+static bool simplifyBoolPhiParams(IRBlock* block)
+{ // Runs simplifyBoolPhiParam on every param of `block`; returns true if any call reported a change.
+    if (!block)
+        return false;
+
+    if (block->getPredecessors().getCount() != 2)
+        return false; // Only the two-predecessor shape is handled.
+
+    Array<IRBlock*, 2> preds;
+    for (auto pred : block->getPredecessors())
+        preds.add(pred);
+
+    IRBlock* ifElseBlock = nullptr; // The single block that both predecessors must come from.
+    if (preds[0]->getPredecessors().getCount() != 1)
+        return false;
+    ifElseBlock = *(preds[0]->getPredecessors().begin());
+    if (preds[1]->getPredecessors().getCount() != 1)
+        return false;
+    auto p = *(preds[1]->getPredecessors().begin());
+    if (p != ifElseBlock)
+        return false; // The two predecessors do not share the same parent block.
+
+    auto ifElse = as<IRIfElse>(ifElseBlock->getTerminator());
+    if (!ifElse)
+        return false;
+
+    if (ifElse->getTrueBlock() == preds[1])
+    {
+        Swap(preds[0], preds[1]); // Normalize so preds[0] is the true side.
+    }
+    SLANG_ASSERT(ifElse->getTrueBlock() == preds[0] && ifElse->getFalseBlock() == preds[1]);
+
+    List<IRParam*> params; // Snapshot of the params, since the loop below may remove some.
+    for (auto param : block->getParams())
+        params.add(param);
+    bool changed = false;
+    for (Index i = params.getCount() - 1; i >= 0; i--) // Last to first: a removal never shifts indices still to be visited.
+    {
+        changed |= simplifyBoolPhiParam(ifElse, preds, params[i], (UInt)i);
+    }
+    return changed;
+}
+
+static bool removeTrivialPhiParams(IRBlock* block)
+{
+    // We can remove a phi parameter if:
+    // 1. all arguments to a parameter are the same (not really a phi).
+    // 2. the arguments to the parameter are always the same as arguments to another existing parameter (duplicate phi).
+
+    bool changed = false;
+    List<IRParam*> params;
+    struct ParamState
+    {
+        bool areKnownValueSame = true; // Every incoming arg seen so far equals `knownValue`.
+        IRInst* knownValue = nullptr; // Arg passed by the first predecessor visited.
+        OrderedHashSet<UInt> sameAsParamSet; // Lower-indexed params that got the same arg from every predecessor.
+    };
+    List<ParamState> args;
+    List<IRUnconditionalBranch*> termInsts;
+    for (auto param : block->getParams())
+    {
+        params.add(param);
+        args.add(ParamState());
+    }
+
+    if (!params.getCount())
+        return false;
+
+    for (UInt i = 1; i < (UInt)args.getCount(); i++) // Start by assuming every lower-indexed param is a duplicate candidate.
+        for (UInt j = 0; j < i; j++)
+            args[i].sameAsParamSet.Add(j);
+
+    for (auto pred : block->getPredecessors())
+    {
+        auto termInst = as<IRUnconditionalBranch>(pred->getTerminator());
+        if (!termInst)
+            return false; // A predecessor that does not end in an unconditional branch: give up.
+        SLANG_ASSERT(termInst->getArgCount() == (UInt)args.getCount());
+        termInsts.add(termInst);
+        for (UInt i = 0; i < termInst->getArgCount(); i++)
+        {
+            if (args[i].areKnownValueSame)
+            {
+                if (args[i].knownValue == nullptr)
+                    args[i].knownValue = termInst->getArg(i);
+                else if (args[i].knownValue != termInst->getArg(i))
+                    args[i].areKnownValueSame = false;
+            }
+            for (UInt j = 0; j < i; j++)
+            {
+                if (termInst->getArg(i) != termInst->getArg(j))
+                {
+                    args[i].sameAsParamSet.Remove(j); // This predecessor passes different values to i and j.
+                }
+            }
+        }
+    }
+    for (Index i = args.getCount() - 1; i >= 0; i--) // Last to first, so removals never shift indices still to be visited.
+    {
+        IRInst* targetVal = nullptr;
+        if (args[i].areKnownValueSame)
+        {
+            targetVal = args[i].knownValue;
+        }
+        else if (args[i].sameAsParamSet.Count())
+        {
+            auto targetParamId = *args[i].sameAsParamSet.begin();
+            targetVal = params[targetParamId];
+        }
+        if (targetVal)
+        {
+            params[i]->replaceUsesWith(targetVal); // For a duplicate phi this is the surviving param, not `knownValue`.
+            params[i]->removeAndDeallocate();
+            for (auto termInst : termInsts)
+                termInst->removeArgument((UInt)i);
+            changed = true;
+        }
+    }
+    return changed;
+}
+
 static bool processFunc(IRGlobalValueWithCode* func)
 {
     auto firstBlock = func->getFirstBlock();
     if (!firstBlock)
         return false;
 
-    // Lazily generated region tree.
-    CFGSimplificationContext simplificationContext;
-
     IRBuilder builder(func->getModule());
 
     bool changed = false;
@@ -165,6 +576,14 @@ static bool processFunc(IRGlobalValueWithCode* func)
             workList.fastRemoveAt(0);
             while (block)
             {
+                // If all arguments to a phi parameter are known to be the same,
+                // we can safely replace the phi parameter with the argument.
+                if (block != func->getFirstBlock())
+                {
+                    changed |= simplifyBoolPhiParams(block);
+                    changed |= removeTrivialPhiParams(block);
+                }
+
                 if (auto loop = as<IRLoop>(block->getTerminator()))
                 {
                     // If continue block is unreachable, remove it.
@@ -179,7 +598,7 @@ static bool processFunc(IRGlobalValueWithCode* func)
                     // break at the end of the loop, we can remove the header and turn it into
                     // a normal branch.
                     auto targetBlock = loop->getTargetBlock();
-                    if (isTrivialSingleIterationLoop(func, loop, simplificationContext))
+                    if (isTrivialSingleIterationLoop(func, loop))
                     {
                         builder.setInsertBefore(loop);
                         List<IRInst*> args;
@@ -189,7 +608,22 @@ static bool processFunc(IRGlobalValueWithCode* func)
                         }
                         builder.emitBranch(targetBlock, args.getCount(), args.getBuffer());
                         loop->removeAndDeallocate();
+                        changed = true;
                     }
+                    else if (!doesLoopHasSideEffect(func, loop))
+                    {
+                        // The loop isn't computing anything useful outside the loop.
+                        // We can delete the entire loop.
+                        builder.setInsertBefore(loop);
+                        SLANG_ASSERT(loop->getBreakBlock()->getFirstParam() == nullptr);
+                        builder.emitBranch(loop->getBreakBlock());
+                        loop->removeAndDeallocate();
+                        changed = true;
+                    }
+                }
+                else if (auto condBranch = as<IRIfElse>(block->getTerminator()))
+                {
+                    changed |= trySimplifyIfElse(builder, condBranch);
                 }
 
                 // If `block` does not end with an unconditional branch, bail.
@@ -225,6 +659,7 @@ static bool processFunc(IRGlobalValueWithCode* func)
                 branch->removeAndDeallocate();
                 assert(!successor->hasUses());
                 successor->removeAndDeallocate();
+                break;
             }
             for (auto successor : block->getSuccessors())
             {
diff --git a/source/slang/slang-ir-specialize-function-call.cpp b/source/slang/slang-ir-specialize-function-call.cpp
index 894d46cce..a2ebbc0cf 100644
--- a/source/slang/slang-ir-specialize-function-call.cpp
+++ b/source/slang/slang-ir-specialize-function-call.cpp
@@ -822,6 +822,12 @@ struct FunctionParameterSpecializationContext
                 {
                     decoration->removeAndDeallocate();
                 }
+                else if (as<IRReadNoneDecoration>(decoration))
+                {
+                    // After specialization, the function may no longer be side effect free
+                    // because the parameter we substituted in may be a global param.
+                    decoration->removeAndDeallocate();
+                }
             }
         }
 
diff --git a/source/slang/slang-ir-ssa-simplification.cpp b/source/slang/slang-ir-ssa-simplification.cpp
index f06fafcb3..beaaae065 100644
--- a/source/slang/slang-ir-ssa-simplification.cpp
+++ b/source/slang/slang-ir-ssa-simplification.cpp
@@ -10,6 +10,7 @@
 #include "slang-ir-deduplicate-generic-children.h"
 #include "slang-ir-remove-unused-generic-param.h"
 #include "slang-ir-redundancy-removal.h"
+#include "slang-ir-propagate-func-properties.h"
 
 namespace Slang
 {
@@ -29,6 +30,7 @@ namespace Slang
             changed |= peepholeOptimize(module);
             changed |= removeRedundancy(module);
             changed |= simplifyCFG(module);
+            changed |= propagateFuncProperties(module);
 
             // Note: we disregard the `changed` state from dead code elimination pass since
             // SCCP pass could be generating temporarily evaluated constant values and never actually use them.
@@ -41,6 +43,28 @@ namespace Slang
         }
     }
 
+    void simplifyNonSSAIR(IRModule* module)
+    {
+        // Repeat the pass pipeline until a full round changes nothing,
+        // or until the iteration budget is used up.
+        const int kMaxIterations = 8;
+        bool changed = true;
+
+        for (int iterationCounter = 0; changed && iterationCounter < kMaxIterations; iterationCounter++)
+        {
+            const bool peepholeChanged = peepholeOptimize(module);
+            const bool redundancyChanged = removeRedundancy(module);
+            const bool cfgChanged = simplifyCFG(module);
+            changed = peepholeChanged || redundancyChanged || cfgChanged;
+
+            // The result of dead code elimination is deliberately not folded into
+            // `changed`. NOTE(review): the stated rationale is that DCE always reports a
+            // change after SCCP leaves unused constants; no SCCP pass runs in this loop - confirm.
+            eliminateDeadCode(module);
+        }
+    }
+
+
     void simplifyFunc(IRGlobalValueWithCode* func)
     {
         bool changed = true;
diff --git a/source/slang/slang-ir-ssa-simplification.h b/source/slang/slang-ir-ssa-simplification.h
index ee8343003..39504e102 100644
--- a/source/slang/slang-ir-ssa-simplification.h
+++ b/source/slang/slang-ir-ssa-simplification.h
@@ -10,5 +10,8 @@ namespace Slang
     // until no more changes are possible.
     void simplifyIR(IRModule* module);
 
+    // Run simplifications on IR that is out of SSA form.
+    void simplifyNonSSAIR(IRModule* module);
+
     void simplifyFunc(IRGlobalValueWithCode* func);
 }
diff --git a/source/slang/slang-ir-util.cpp b/source/slang/slang-ir-util.cpp
index 3db036a8d..339521f41 100644
--- a/source/slang/slang-ir-util.cpp
+++ b/source/slang/slang-ir-util.cpp
@@ -157,6 +157,32 @@ IRInst* maybeSpecializeWithGeneric(IRBuilder& builder, IRInst* genericToSpecaili
     return genericToSpecailize;
 }
 
+bool isValueType(IRInst* dataType)
+{ // Returns true for basic types and for the type opcodes listed below; false for every other opcode.
+    dataType = getResolvedInstForDecorations(unwrapAttributedType(dataType)); // Classify the unwrapped, resolved type inst.
+    if (as<IRBasicType>(dataType))
+        return true;
+    switch (dataType->getOp()) // NOTE(review): `dataType` is dereferenced unchecked - assumes it is non-null here.
+    {
+    case kIROp_StructType:
+    case kIROp_InterfaceType:
+    case kIROp_ClassType:
+    case kIROp_VectorType:
+    case kIROp_MatrixType:
+    case kIROp_TupleType:
+    case kIROp_ResultType:
+    case kIROp_OptionalType:
+    case kIROp_DifferentialPairType:
+    case kIROp_DynamicType:
+    case kIROp_AnyValueType:
+    case kIROp_ArrayType:
+    case kIROp_FuncType:
+        return true;
+    default:
+        return false; // Anything not listed is not classified as a value type.
+    }
+}
+
 IRInst* hoistValueFromGeneric(IRBuilder& inBuilder, IRInst* value, IRInst*& outSpecializedVal, bool replaceExistingValue)
 {
     auto outerGeneric = as<IRGeneric>(findOuterGeneric(value));
@@ -402,8 +428,7 @@ bool canInstHaveSideEffectAtAddress(IRGlobalValueWithCode* func, IRInst* inst, I
             {
                 auto callee = call->getCallee();
                 if (callee &&
-                    callee->findDecoration<IRReadNoneDecoration>() &&
-                    callee->findDecoration<IRNoSideEffectDecoration>())
+                    callee->findDecoration<IRReadNoneDecoration>())
                 {
                     // An exception is if the callee is side-effect free and is not reading from
                     // memory.
@@ -423,6 +448,32 @@ bool canInstHaveSideEffectAtAddress(IRGlobalValueWithCode* func, IRInst* inst, I
                     if (canAddressesPotentiallyAlias(func, call->getArg(i), addr))
                         return true;
                 }
+                else if (!isValueType(call->getArg(i)->getDataType()))
+                {
+                    // This is some unknown handle type, we assume it can have any side effects.
+                    return true;
+                }
+            }
+        }
+        break;
+    case kIROp_unconditionalBranch:
+    case kIROp_loop:
+        {
+            auto branch = as<IRUnconditionalBranch>(inst);
+            // If any pointer typed argument of the branch inst may overlap addr, return true.
+            for (UInt i = 0; i < branch->getArgCount(); i++)
+            {
+                SLANG_RELEASE_ASSERT(branch->getArg(i)->getDataType());
+                if (isPtrLikeOrHandleType(branch->getArg(i)->getDataType()))
+                {
+                    if (canAddressesPotentiallyAlias(func, branch->getArg(i), addr))
+                        return true;
+                }
+                else if (!isValueType(branch->getArg(i)->getDataType()))
+                {
+                    // This is some unknown handle type, we assume it can have any side effects.
+                    return true;
+                }
             }
         }
         break;
@@ -434,6 +485,11 @@ bool canInstHaveSideEffectAtAddress(IRGlobalValueWithCode* func, IRInst* inst, I
             if (isPtrLikeOrHandleType(inst->getOperand(0)->getDataType()) &&
                 canAddressesPotentiallyAlias(func, inst->getOperand(0), addr))
                 return true;
+            else if (!isValueType(inst->getOperand(0)->getDataType()))
+            {
+                // This is some unknown handle type, we assume it can have any side effects.
+                return true;
+            }
         }
         break;
     default:
@@ -520,20 +576,17 @@ bool isPureFunctionalCall(IRCall* call)
     auto callee = getResolvedInstForDecorations(call->getCallee());
     if (callee->findDecoration<IRReadNoneDecoration>())
     {
-        return true;
-    }
-    if (callee->findDecoration<IRNoSideEffectDecoration>())
-    {
         // If the function has no side effect and is not writing to any outputs,
         // we can safely treat the call as a normal inst.
         bool hasOutArg = false;
         for (UInt i = 0; i < call->getArgCount(); i++)
         {
-            if (as<IRPtrTypeBase>(call->getArg(i)->getDataType()))
-            {
-                hasOutArg = true;
-                break;
-            }
+            if (isValueType(call->getArg(i)->getDataType()))
+                continue;
+            // If the argument type is not a known value type,
+            // assume it is a pointer or handle through which side effect can take place.
+            hasOutArg = true;
+            break;
         }
         return !hasOutArg;
     }
diff --git a/source/slang/slang-ir-util.h b/source/slang/slang-ir-util.h
index 8a12ab895..62156cad6 100644
--- a/source/slang/slang-ir-util.h
+++ b/source/slang/slang-ir-util.h
@@ -83,6 +83,9 @@ inline bool isScalarIntegerType(IRType* type)
     return getTypeStyle(type->getOp()) == kIROp_IntType;
 }
 
+// No side effect can take place through a value of a "Value" type.
+bool isValueType(IRInst* type);
+
 inline bool isChildInstOf(IRInst* inst, IRInst* parent)
 {
     while (inst)
diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp
index accefc0c9..fd211d05c 100644
--- a/source/slang/slang-ir.cpp
+++ b/source/slang/slang-ir.cpp
@@ -43,7 +43,10 @@ namespace Slang
             case kIROp_PreciseDecoration: 
             case kIROp_PublicDecoration: 
             case kIROp_HLSLExportDecoration: 
-            case kIROp_ReadNoneDecoration: 
+            case kIROp_ReadNoneDecoration:
+            case kIROp_NoSideEffectDecoration:
+            case kIROp_ForwardDifferentiableDecoration:
+            case kIROp_BackwardDifferentiableDecoration:
             case kIROp_RequiresNVAPIDecoration: 
             case kIROp_TriangleAdjInputPrimitiveTypeDecoration:
             case kIROp_TriangleInputPrimitiveTypeDecoration:
@@ -695,6 +698,21 @@ namespace Slang
         }
     }
 
+    void IRUnconditionalBranch::removeArgument(UInt index)
+    {
+        // Branch argument `index` is stored after the opcode's leading operands:
+        // operand slot `1 + index` for a plain branch, `3 + index` for a loop.
+        const auto branchOp = getOp();
+        if (branchOp == kIROp_unconditionalBranch)
+            removeOperand(1 + index);
+        else if (branchOp == kIROp_loop)
+            removeOperand(3 + index);
+        else
+        {
+            SLANG_UNEXPECTED("unhandled unconditional branch opcode");
+        }
+    }
+
     IRInst* IRUnconditionalBranch::getArg(UInt index)
     {
         return getArgs()[index].usedValue;
@@ -5109,6 +5127,17 @@ namespace Slang
         return inst;
     }
 
+    IRInst* IRBuilder::emitNot(IRType* type, IRInst* value)
+    {
+        // Create a `kIROp_Not` instruction whose result type is `type`
+        // and whose single operand is `value`.
+        auto notInst = createInst<IRInst>(this, kIROp_Not, type, value);
+
+        // Register the new instruction with this builder before returning it.
+        addInst(notInst);
+        return notInst;
+    }
+
     IRInst* IRBuilder::emitAdd(IRType* type, IRInst* left, IRInst* right)
     {
         auto inst = createInst<IRInst>(
@@ -6792,6 +6821,17 @@ namespace Slang
         }
     }
 
+    void IRInst::removeOperand(Index index)
+    {
+        // A bad index would silently drop the last operand (or index slot -1 when empty).
+        if (index < 0 || index >= (Index)operandCount)
+            SLANG_UNEXPECTED("operand index out of range");
+        for (Index i = index; i + 1 < (Index)operandCount; i++)
+            getOperands()[i].set(getOperand(i + 1)); // shift each later operand down one slot
+        getOperands()[operandCount - 1].clear();     // release the now-duplicated tail slot
+        operandCount--;
+    }
+
     // Remove this instruction from its parent block,
     // and then destroy it (it had better have no uses!)
     void IRInst::removeAndDeallocate()
@@ -6879,6 +6919,8 @@ namespace Slang
                 // common subexpression elimination, etc.
                 //
                 auto call = cast<IRCall>(this);
+                // If the call has been marked as no-side-effect, we
+                // will treat it so, by-passing all other checks.
                 if (call->findDecoration<IRNoSideEffectDecoration>())
                     return false;
                 return !isPureFunctionalCall(call);
@@ -6894,6 +6936,7 @@ namespace Slang
         case kIROp_Func:
         case kIROp_Generic:
         case kIROp_Var:
+        case kIROp_Param:
         case kIROp_GlobalVar: // Note: the IRGlobalVar represents the *address*, so only a load/store would have side effects
         case kIROp_GlobalConstant:
         case kIROp_GlobalParam:
@@ -7003,12 +7046,6 @@ namespace Slang
         case kIROp_BackwardDifferentiatePropagate:
             return false;
         }
-
-        // Check if the calle has been marked with a catch-all no-side-effect decoration.
-        if (findDecoration<IRNoSideEffectDecoration>())
-        {
-            return false;
-        }
         return true;
     }
 
diff --git a/source/slang/slang-ir.h b/source/slang/slang-ir.h
index 63b7c4ef9..e22ea8a36 100644
--- a/source/slang/slang-ir.h
+++ b/source/slang/slang-ir.h
@@ -744,6 +744,11 @@ struct IRInst
     // for those values.
     void removeArguments();
 
+    // Remove operand `index` from operand list.
+    // For example, if the inst is `op(a,b,c)`, calling `inst->removeOperand(1)` results in
+    // `op(a,c)`.
+    void removeOperand(Index index);
+
         /// Transfer any decorations of this instruction to the `target` instruction.
     void transferDecorationsTo(IRInst* target);
 
diff --git a/source/slang/slang-lower-to-ir.cpp b/source/slang/slang-lower-to-ir.cpp
index 681871b6c..d09c35eea 100644
--- a/source/slang/slang-lower-to-ir.cpp
+++ b/source/slang/slang-lower-to-ir.cpp
@@ -8304,6 +8304,11 @@ struct DeclLoweringVisitor : DeclVisitor<DeclLoweringVisitor, LoweredValInfo>
             getBuilder()->addSimpleDecoration<IRRequiresNVAPIDecoration>(irFunc);
         }
 
+        if (decl->findModifier<AlwaysFoldIntoUseSiteAttribute>())
+        {
+            getBuilder()->addSimpleDecoration<IRAlwaysFoldIntoUseSiteDecoration>(irFunc);
+        }
+
         if (decl->findModifier<NoInlineAttribute>())
         {
             getBuilder()->addSimpleDecoration<IRNoInlineDecoration>(irFunc);
diff --git a/tests/bugs/sample-grad-clamp-lod.slang.glsl b/tests/bugs/sample-grad-clamp-lod.slang.glsl
index b91fb8668..a49983599 100644
--- a/tests/bugs/sample-grad-clamp-lod.slang.glsl
+++ b/tests/bugs/sample-grad-clamp-lod.slang.glsl
@@ -20,10 +20,10 @@ rayPayloadInEXT ShadowRay_0 _S1;
 
 void main()
 {
-    vec4 val_0 = (textureGradOffsetClampARB(sampler2DArray(t2D_0,samplerState_0), (vec3(_S1.hitDistance_0 * 0.20000000000000001110, _S1.hitDistance_0 * 0.29999999999999998890, 0.20000000000000001110)), (vec2(float(0), float(0))), (vec2(float(0), float(0))), (ivec2(0)), (0.50000000000000000000)));
+    const vec2 _S2 = vec2(0.0, 0.0);
 
-    float _S2 = dot(val_0, val_0);
+    vec4 val_0 = (textureGradOffsetClampARB(sampler2DArray(t2D_0,samplerState_0), (vec3(_S1.hitDistance_0 * 0.20000000298023223877, _S1.hitDistance_0 * 0.30000001192092895508, 0.20000000298023223877)), (_S2), (_S2), (ivec2(0)), (0.5)));
 
-    _S1.hitDistance_0 = _S2;
+    _S1.hitDistance_0 = dot(val_0, val_0);
     return;
 }
diff --git a/tests/bugs/vk-structured-buffer-load.hlsl.glsl b/tests/bugs/vk-structured-buffer-load.hlsl.glsl
index 7f3ec40a2..1d056944a 100644
--- a/tests/bugs/vk-structured-buffer-load.hlsl.glsl
+++ b/tests/bugs/vk-structured-buffer-load.hlsl.glsl
@@ -1,15 +1,10 @@
-// vk-structured-buffer-load.hlsl.glsl
-//TEST_IGNORE_FILE:
-
 #version 460
 #extension GL_NV_ray_tracing : require
 layout(row_major) uniform;
 layout(row_major) buffer;
-
 layout(std430, binding = 1) readonly buffer _S1 {
     float _data[];
 } gParamBlock_sbuf_0;
-
 float rcp_0(float x_0)
 {
     float _S2 = 1.0 / x_0;
@@ -36,37 +31,21 @@ void main()
     _S3.PackedHitInfoA_0.x = HitT_0;
 
     float offsfloat_0 = ((gParamBlock_sbuf_0)._data[(0)]);
-
     uint use_rcp_0 = 0U | uint(HitT_0 > 0.0);
-
     if(use_rcp_0 != 0U)
     {
-
-        float _S5 = rcp_0(offsfloat_0);
-
-        _S3.PackedHitInfoA_0.y = _S5;
-
+        _S3.PackedHitInfoA_0.y = rcp_0(offsfloat_0);
     }
     else
     {
-
         if(use_rcp_0 > 0U&&offsfloat_0 == 0.0)
         {
-
-            float _S6 = (inversesqrt((offsfloat_0 + 1.0)));
-
-            _S3.PackedHitInfoA_0.y = _S6;
-
+            _S3.PackedHitInfoA_0.y = (inversesqrt((offsfloat_0 + 1.0)));
         }
         else
         {
-            float _S7 = (inversesqrt((offsfloat_0)));
-
-            _S3.PackedHitInfoA_0.y = _S7;
-
+            _S3.PackedHitInfoA_0.y = (inversesqrt((offsfloat_0)));
         }
-
     }
-
     return;
 }
diff --git a/tests/cross-compile/array-of-buffers.slang.glsl b/tests/cross-compile/array-of-buffers.slang.glsl
index 1f436fad0..21961afd1 100644
--- a/tests/cross-compile/array-of-buffers.slang.glsl
+++ b/tests/cross-compile/array-of-buffers.slang.glsl
@@ -1,8 +1,6 @@
-//TEST_IGNORE_FILE:
 #version 450
 layout(row_major) uniform;
 layout(row_major) buffer;
-
 struct SLANG_ParameterGroup_C_0
 {
     uint index_0;
@@ -13,7 +11,6 @@ layout(std140) uniform _S1
 {
     SLANG_ParameterGroup_C_0 _data;
 } C_0;
-
 struct S_0
 {
     vec4 f_0;
@@ -24,31 +21,26 @@ layout(std140) uniform _S2
 {
     S_0 _data;
 } cb_0[3];
-
 layout(std430, binding = 2) readonly buffer _S3 {
     S_0 _data[];
 } sb1_0[4];
-
 layout(std430, binding = 3) buffer _S4 {
     vec4 _data[];
 } sb2_0[5];
-
 layout(std430, binding = 4) readonly buffer _S5
 {
     uint _data[];
 } bb_0[6];
-
 layout(location = 0)
 out vec4 _S6;
 
 void main()
 {
-    S_0 _S7 = ((sb1_0[C_0._data.index_0])._data[(C_0._data.index_0)]);
-    vec4 _S8 = cb_0[C_0._data.index_0]._data.f_0 + _S7.f_0;
-    vec4 _S9 = _S8 + ((sb2_0[C_0._data.index_0])._data[(C_0._data.index_0)]);
-    uint _S10 = ((bb_0[C_0._data.index_0])._data[(int(C_0._data.index_0 * 4U))/4]);
-
-    _S6 = _S9 + vec4(float(_S10));
-
+    vec4 _S7 = cb_0[C_0._data.index_0]._data.f_0;
+    S_0 _S8 = ((sb1_0[C_0._data.index_0])._data[(C_0._data.index_0)]);
+    vec4 _S9 = _S7 + _S8.f_0;
+    vec4 _S10 = _S9 + ((sb2_0[C_0._data.index_0])._data[(C_0._data.index_0)]);
+    uint _S11 = ((bb_0[C_0._data.index_0])._data[(int(C_0._data.index_0 * 4U))/4]);
+    _S6 = _S10 + vec4(float(_S11));
     return;
 }
diff --git a/tests/cross-compile/array-of-buffers.slang.hlsl b/tests/cross-compile/array-of-buffers.slang.hlsl
index 501b9c6db..960957789 100644
--- a/tests/cross-compile/array-of-buffers.slang.hlsl
+++ b/tests/cross-compile/array-of-buffers.slang.hlsl
@@ -1,5 +1,3 @@
-//TEST_IGNORE_FILE:
-
 #pragma pack_matrix(column_major)
 #ifdef SLANG_HLSL_ENABLE_NVAPI
 #include "nvHLSLExtns.h"
@@ -15,24 +13,24 @@ cbuffer C_0 : register(b0)
 {
     SLANG_ParameterGroup_C_0 C_0;
 }
-
 struct S_0
 {
     float4 f_0;
 };
 
 ConstantBuffer<S_0 >  cb_0[int(3)] : register(b1);
+
 StructuredBuffer<S_0 >  sb1_0[int(4)] : register(t0);
+
 RWStructuredBuffer<float4 >  sb2_0[int(5)] : register(u0);
-ByteAddressBuffer  bb_0[int(6)] : register(t4);
 
+ByteAddressBuffer  bb_0[int(6)] : register(t4);
 float4 main() : SV_TARGET
 {
-    S_0 _S1 = sb1_0[C_0.index_0][C_0.index_0];
-
-    float4 _S2 = cb_0[C_0.index_0].f_0 + _S1.f_0;
-    float4 _S3 = _S2 + sb2_0[C_0.index_0][C_0.index_0];
-    uint _S4 = bb_0[C_0.index_0].Load(int(C_0.index_0 * 4U));
-
-    return _S3 + (float4)float(_S4);
+    float4 _S1 = cb_0[C_0.index_0].f_0;
+    S_0 _S2 = sb1_0[C_0.index_0][C_0.index_0];
+    float4 _S3 = _S1 + _S2.f_0;
+    float4 _S4 = _S3 + sb2_0[C_0.index_0][C_0.index_0];
+    uint _S5 = bb_0[C_0.index_0].Load(int(C_0.index_0 * 4U));
+    return _S4 + (float4)float(_S5);
 }
diff --git a/tests/cross-compile/glsl-generic-in.slang.glsl b/tests/cross-compile/glsl-generic-in.slang.glsl
index c8d9b1bd1..4bf0598d0 100644
--- a/tests/cross-compile/glsl-generic-in.slang.glsl
+++ b/tests/cross-compile/glsl-generic-in.slang.glsl
@@ -1,8 +1,6 @@
-//TEST_IGNORE_FILE:
 #version 450
 layout(row_major) uniform;
 layout(row_major) buffer;
-
 struct F_0
 {
     vec4 v0_0;
@@ -14,10 +12,9 @@ float F_get_0(F_0 this_0)
     return this_0.v0_0.x + this_0.v1_0.x;
 }
 
-
 float E_get_0()
 {
-    return 1.00000000000000000000;
+    return 1.0;
 }
 
 layout(location = 0)
@@ -33,14 +30,11 @@ struct VOut_0
 {
     vec4 projPos_0;
 };
-
 void main()
 {
     F_0 _S4 = { _S2, _S3 };
     VOut_0 vout_0;
-    float _S5 = F_get_0(_S4);
-    float _S6 = E_get_0();
-    vout_0.projPos_0 = vec4(_S1, _S5 + _S6);
+    vout_0.projPos_0 = vec4(_S1, F_get_0(_S4) + E_get_0());
     gl_Position = vout_0.projPos_0;
     return;
-}
-\ No newline at end of file
+}
diff --git a/tests/cross-compile/half-conversion.slang.glsl b/tests/cross-compile/half-conversion.slang.glsl
index 58d20b4fc..fb51809b4 100644
--- a/tests/cross-compile/half-conversion.slang.glsl
+++ b/tests/cross-compile/half-conversion.slang.glsl
@@ -1,8 +1,6 @@
-//half-conversion.slang.glsl
-//TEST_IGNORE_FILE:
-
 #version 450
-
+layout(row_major) uniform;
+layout(row_major) buffer;
 struct SLANG_ParameterGroup_C_0
 {
     uvec4 u_0;
@@ -13,29 +11,30 @@ layout(std140) uniform _S1
 {
     SLANG_ParameterGroup_C_0 _data;
 } C_0;
-
 vec4 f16tof32_0(uvec4 value_0)
 {
     vec4 result_0;
-    int i_0;
-    i_0 = 0;
+    int i_0 = 0;
     for(;;)
     {
-        if(i_0 < 4) {} else break;
-
-        float _S2 = (unpackHalf2x16((value_0[i_0])).x);
-        result_0[i_0] = _S2;
-        i_0 = i_0 + int(1);
+        if(i_0 < 4)
+        {
+        }
+        else
+        {
+            break;
+        }
+        result_0[i_0] = (unpackHalf2x16((value_0[i_0])).x);
+        i_0 = i_0 + 1;
     }
     return result_0;
 }
 
 layout(location = 0)
-out vec4 _S3;
+out vec4 _S2;
 
 void main()
 {
-    vec4 _S4 = f16tof32_0(C_0._data.u_0);
-    _S3 = _S4;
+    _S2 = f16tof32_0(C_0._data.u_0);
     return;
 }
diff --git a/tests/cross-compile/sign.slang.glsl b/tests/cross-compile/sign.slang.glsl
index 7a3a37c51..44c015967 100644
--- a/tests/cross-compile/sign.slang.glsl
+++ b/tests/cross-compile/sign.slang.glsl
@@ -1,17 +1,12 @@
-//TEST_IGNORE_FILE:
 #version 450
 layout(row_major) uniform;
 layout(row_major) buffer;
 
-#line 8 0
 layout(location = 0)
 out vec4 _S1;
 
-
-#line 8
 void main()
 {
-    ivec4 _S2 = ivec4(sign(vec4(1.50000000000000000000, 1.00000000000000000000, -1.50000000000000000000, -1.00000000000000000000)));
-    _S1 = vec4(_S2);
+    _S1 = vec4((ivec4(sign((vec4(1.5, 1.0, -1.5, -1.0))))));
     return;
-}
-\ No newline at end of file
+}
diff --git a/tests/diagnostics/interfaces/anyvalue-size-validation.slang b/tests/diagnostics/interfaces/anyvalue-size-validation.slang
index 6c33b72a9..1ebf7f4c3 100644
--- a/tests/diagnostics/interfaces/anyvalue-size-validation.slang
+++ b/tests/diagnostics/interfaces/anyvalue-size-validation.slang
@@ -21,9 +21,11 @@ T test<T:IInterface>(T s)
     return s;
 }
 
+RWStructuredBuffer<uint> output;
+
 [numthreads(4, 1, 1)]
 void main()
 {
     S s;
-    test(s);
+    output[0] = test(s).a;
 }
 \ No newline at end of file
diff --git a/tests/experimental/liveness/liveness-2.slang.expected b/tests/experimental/liveness/liveness-2.slang.expected
index c742fa1fc..16883c1fd 100644
--- a/tests/experimental/liveness/liveness-2.slang.expected
+++ b/tests/experimental/liveness/liveness-2.slang.expected
@@ -51,9 +51,8 @@ layout(local_size_x = 4, local_size_y = 1, local_size_z = 1) in;
 void main()
 {
     int index_0 = int(gl_GlobalInvocationID.x);
-    uint _S4 = uint(index_0);
-    int _S5 = calcThing_0(index_0);
-    ((outputBuffer_0)._data[(_S4)]) = _S5;
+    int _S4 = calcThing_0(index_0);
+    ((outputBuffer_0)._data[(uint(index_0))]) = _S4;
     return;
 }
 
diff --git a/tests/experimental/liveness/liveness-3.slang.expected b/tests/experimental/liveness/liveness-3.slang.expected
index 4dff6b37a..cb093a640 100644
--- a/tests/experimental/liveness/liveness-3.slang.expected
+++ b/tests/experimental/liveness/liveness-3.slang.expected
@@ -53,12 +53,15 @@ int calcThing_0(int offset_0)
         idx_0[0] = 0;
         idx_0[1] = 0;
         idx_0[2] = 0;
+        int _S2 = (k_0 + 7) % 5;
+        bool _S3 = _S2 == 4;
+        int k_1 = k_0 + 1;
         int i_0;
         livenessStart_1(i_0, 0);
         i_0 = 0;
-        int _S2;
-        livenessStart_1(_S2, 0);
-        _S2 = _S1;
+        int _S4;
+        livenessStart_1(_S4, 0);
+        _S4 = _S1;
         for(;;)
         {
             if(i_0 < 17)
@@ -70,74 +73,74 @@ int calcThing_0(int offset_0)
             }
             int modRange_0 = i_0 % 3;
             another_0[i_0 & 1] = another_0[i_0 & 1] + modRange_0;
-            int _S3 = i_0 % 3;
-            int _S4;
-            if(_S3 != 0)
+            int _S5 = i_0 % 3;
+            int _S6;
+            if(modRange_0 != 0)
             {
-                int _S5 = _S2;
-                livenessEnd_0(_S2, 0);
-                int _S6 = _S5 + 1;
-                livenessStart_1(_S4, 0);
-                _S4 = _S6;
+                int _S7 = _S4;
+                livenessEnd_0(_S4, 0);
+                int _S8 = _S7 + 1;
+                livenessStart_1(_S6, 0);
+                _S6 = _S8;
             }
             else
             {
-                int _S7 = _S2;
-                livenessEnd_0(_S2, 0);
-                livenessStart_1(_S4, 0);
-                _S4 = _S7;
+                int _S9 = _S4;
+                livenessEnd_0(_S4, 0);
+                livenessStart_1(_S6, 0);
+                _S6 = _S9;
             }
-            idx_0[modRange_0] = idx_0[modRange_0] + (_S4 + i_0);
+            idx_0[modRange_0] = idx_0[modRange_0] + (_S6 + i_0);
             i_0 = i_0 + 1;
-            livenessStart_1(_S2, 0);
-            int _S8 = _S4;
-            livenessEnd_0(_S4, 0);
-            _S2 = _S8;
+            livenessStart_1(_S4, 0);
+            int _S10 = _S6;
+            livenessEnd_0(_S6, 0);
+            _S4 = _S10;
         }
         livenessEnd_0(i_0, 0);
         livenessEnd_0(_S1, 0);
-        int _S9 = (k_0 + 7) % 5;
-        if(_S9 == 4)
+        livenessEnd_0(k_0, 0);
+        if(_S3)
         {
-            livenessEnd_0(_S2, 0);
+            livenessEnd_0(_S4, 0);
             livenessEnd_1(idx_0, 0);
-            livenessEnd_0(k_0, 0);
             livenessEnd_2(another_0, 0);
             return total_0;
         }
-        int _S10 = idx_0[0] + idx_0[1];
-        int _S11 = idx_0[2];
+        int _S11 = idx_0[0] + idx_0[1];
+        int _S12 = idx_0[2];
         livenessEnd_1(idx_0, 0);
-        int _S12 = _S10 + _S11;
-        int _S13 = total_0;
+        int _S13 = _S11 + _S12;
+        int _S14 = total_0;
         livenessEnd_0(total_0, 0);
-        int total_1 = _S13 + _S12;
-        k_0 = k_0 + 1;
+        int total_1 = _S14 + _S13;
+        livenessStart_1(k_0, 0);
+        k_0 = k_1;
         livenessStart_1(_S1, 0);
-        int _S14 = _S2;
-        livenessEnd_0(_S2, 0);
-        _S1 = _S14;
+        int _S15 = _S4;
+        livenessEnd_0(_S4, 0);
+        _S1 = _S15;
         livenessStart_1(total_0, 0);
         total_0 = total_1;
     }
     livenessEnd_0(_S1, 0);
     livenessEnd_0(k_0, 0);
     livenessEnd_2(another_0, 0);
-    int _S15 = total_0;
+    int _S16 = total_0;
     livenessEnd_0(total_0, 0);
-    return - _S15;
+    return - _S16;
 }
 
-layout(std430, binding = 0) buffer _S16 {
+layout(std430, binding = 0) buffer _S17 {
     int _data[];
 } outputBuffer_0;
 layout(local_size_x = 4, local_size_y = 1, local_size_z = 1) in;
 void main()
 {
     int index_0 = int(gl_GlobalInvocationID.x);
-    uint _S17 = uint(index_0);
-    int _S18 = calcThing_0(index_0);
-    ((outputBuffer_0)._data[(_S17)]) = _S18;
+    uint _S18 = uint(index_0);
+    int _S19 = calcThing_0(index_0);
+    ((outputBuffer_0)._data[(_S18)]) = _S19;
     return;
 }
 
diff --git a/tests/experimental/liveness/liveness-4.slang.expected b/tests/experimental/liveness/liveness-4.slang.expected
index cd97f8057..efc2e3846 100644
--- a/tests/experimental/liveness/liveness-4.slang.expected
+++ b/tests/experimental/liveness/liveness-4.slang.expected
@@ -36,6 +36,9 @@ int calcThing_0(int offset_0)
         {
             break;
         }
+        int _S1 = (k_0 + 7) % 5;
+        bool _S2 = _S1 == 4;
+        int k_1 = k_0 + 1;
         int i_0;
         livenessStart_1(i_0, 0);
         i_0 = 0;
@@ -52,30 +55,30 @@ int calcThing_0(int offset_0)
             i_0 = i_0 + 1;
         }
         livenessEnd_0(i_0, 0);
-        int _S1 = (k_0 + 7) % 5;
-        if(_S1 == 4)
+        livenessEnd_0(k_0, 0);
+        if(_S2)
         {
-            livenessEnd_0(k_0, 0);
             livenessEnd_1(another_0, 0);
             return 1;
         }
-        k_0 = k_0 + 1;
+        livenessStart_1(k_0, 0);
+        k_0 = k_1;
     }
     livenessEnd_0(k_0, 0);
     livenessEnd_1(another_0, 0);
     return -2;
 }
 
-layout(std430, binding = 0) buffer _S2 {
+layout(std430, binding = 0) buffer _S3 {
     int _data[];
 } outputBuffer_0;
 layout(local_size_x = 4, local_size_y = 1, local_size_z = 1) in;
 void main()
 {
     int index_0 = int(gl_GlobalInvocationID.x);
-    uint _S3 = uint(index_0);
-    int _S4 = calcThing_0(index_0);
-    ((outputBuffer_0)._data[(_S3)]) = _S4;
+    uint _S4 = uint(index_0);
+    int _S5 = calcThing_0(index_0);
+    ((outputBuffer_0)._data[(_S4)]) = _S5;
     return;
 }
 
diff --git a/tests/experimental/liveness/liveness-5.slang.expected b/tests/experimental/liveness/liveness-5.slang.expected
index 3693d3fde..e9fe9d652 100644
--- a/tests/experimental/liveness/liveness-5.slang.expected
+++ b/tests/experimental/liveness/liveness-5.slang.expected
@@ -39,6 +39,9 @@ int calcThing_0(int offset_0)
         {
             break;
         }
+        int _S1 = (k_0 + 7) % 5;
+        bool _S2 = _S1 == 4;
+        int k_1 = k_0 + 1;
         int i_0;
         livenessStart_1(i_0, 0);
         i_0 = 0;
@@ -55,17 +58,17 @@ int calcThing_0(int offset_0)
             i_0 = i_0 + 1;
         }
         livenessEnd_0(i_0, 0);
-        int _S1 = total_0;
+        livenessEnd_0(k_0, 0);
+        int _S3 = total_0;
         livenessEnd_0(total_0, 0);
-        int total_1 = _S1 + another_0[k_0 & 1];
-        int _S2 = (k_0 + 7) % 5;
-        if(_S2 == 4)
+        int total_1 = _S3 + another_0[k_0 & 1];
+        if(_S2)
         {
-            livenessEnd_0(k_0, 0);
             livenessEnd_1(another_0, 0);
             return 1;
         }
-        k_0 = k_0 + 1;
+        livenessStart_1(k_0, 0);
+        k_0 = k_1;
         livenessStart_1(total_0, 0);
         total_0 = total_1;
     }
@@ -81,16 +84,16 @@ int calcThing_0(int offset_0)
     return total_0;
 }
 
-layout(std430, binding = 0) buffer _S3 {
+layout(std430, binding = 0) buffer _S4 {
     int _data[];
 } outputBuffer_0;
 layout(local_size_x = 4, local_size_y = 1, local_size_z = 1) in;
 void main()
 {
     int index_0 = int(gl_GlobalInvocationID.x);
-    uint _S4 = uint(index_0);
-    int _S5 = calcThing_0(index_0);
-    ((outputBuffer_0)._data[(_S4)]) = _S5;
+    uint _S5 = uint(index_0);
+    int _S6 = calcThing_0(index_0);
+    ((outputBuffer_0)._data[(_S5)]) = _S6;
     return;
 }
 
diff --git a/tests/experimental/liveness/liveness-6.slang.expected b/tests/experimental/liveness/liveness-6.slang.expected
index 9c3bae815..b661c09bf 100644
--- a/tests/experimental/liveness/liveness-6.slang.expected
+++ b/tests/experimental/liveness/liveness-6.slang.expected
@@ -43,6 +43,10 @@ int calcThing_0(int offset_0)
         livenessStart_0(arr_0, 0);
         arr_0[0] = 2;
         arr_0[1] = 3;
+        int _S1 = k_0 & 1;
+        int _S2 = (k_0 + 7) % 5;
+        bool _S3 = _S2 == 4;
+        int k_1 = k_0 + 1;
         int i_0;
         livenessStart_1(i_0, 0);
         i_0 = 0;
@@ -56,25 +60,24 @@ int calcThing_0(int offset_0)
                 break;
             }
             another_0[i_0 & 1] = another_0[i_0 & 1] + (k_0 + i_0);
-            arr_0[k_0 & 1] = arr_0[k_0 & 1] + i_0;
+            arr_0[_S1] = arr_0[_S1] + i_0;
             i_0 = i_0 + 1;
         }
         livenessEnd_0(i_0, 0);
-        int _S1 = k_0 & 1;
-        int _S2 = total_0;
+        livenessEnd_0(k_0, 0);
+        int _S4 = total_0;
         livenessEnd_0(total_0, 0);
-        int total_1 = _S2 + another_0[_S1];
-        int _S3 = arr_0[_S1];
+        int total_1 = _S4 + another_0[_S1];
+        int _S5 = arr_0[_S1];
         livenessEnd_1(arr_0, 0);
-        int total_2 = total_1 + _S3;
-        int _S4 = (k_0 + 7) % 5;
-        if(_S4 == 4)
+        int total_2 = total_1 + _S5;
+        if(_S3)
         {
-            livenessEnd_0(k_0, 0);
             livenessEnd_1(another_0, 0);
             return 1;
         }
-        k_0 = k_0 + 1;
+        livenessStart_1(k_0, 0);
+        k_0 = k_1;
         livenessStart_1(total_0, 0);
         total_0 = total_2;
     }
@@ -90,16 +93,16 @@ int calcThing_0(int offset_0)
     return total_0;
 }
 
-layout(std430, binding = 0) buffer _S5 {
+layout(std430, binding = 0) buffer _S6 {
     int _data[];
 } outputBuffer_0;
 layout(local_size_x = 4, local_size_y = 1, local_size_z = 1) in;
 void main()
 {
     int index_0 = int(gl_GlobalInvocationID.x);
-    uint _S6 = uint(index_0);
-    int _S7 = calcThing_0(index_0);
-    ((outputBuffer_0)._data[(_S6)]) = _S7;
+    uint _S7 = uint(index_0);
+    int _S8 = calcThing_0(index_0);
+    ((outputBuffer_0)._data[(_S7)]) = _S8;
     return;
 }
 
diff --git a/tests/experimental/liveness/liveness.slang.expected b/tests/experimental/liveness/liveness.slang.expected
index 4a81b8855..06809ffc3 100644
--- a/tests/experimental/liveness/liveness.slang.expected
+++ b/tests/experimental/liveness/liveness.slang.expected
@@ -21,6 +21,7 @@ void livenessEnd_1(spirv_by_reference int _0, spirv_literal int _1);
 int someSlowFunc_0(int a_0)
 {
     uint _S1 = uint(a_0);
+    int _S2 = a_0 * 20;
     uint v_0;
     livenessStart_0(v_0, 0);
     v_0 = _S1;
@@ -29,20 +30,20 @@ int someSlowFunc_0(int a_0)
     i_0 = 0;
     for(;;)
     {
-        if(i_0 < a_0 * 20)
+        if(i_0 < _S2)
         {
         }
         else
         {
             break;
         }
-        uint _S2 = v_0 >> 1;
-        uint _S3 = v_0;
+        uint _S3 = v_0 >> 1;
+        uint _S4 = v_0;
         livenessEnd_0(v_0, 0);
-        uint _S4 = (_S2 | _S3 << 31) * uint(i_0);
+        uint _S5 = (_S3 | _S4 << 31) * uint(i_0);
         int i_1 = i_0 + 1;
         livenessStart_0(v_0, 0);
-        v_0 = _S4;
+        v_0 = _S5;
         i_0 = i_1;
     }
     livenessEnd_1(i_0, 0);
@@ -58,12 +59,12 @@ struct SomeStruct_0
 
 SomeStruct_0 makeSomeStruct_0()
 {
-    const int  _S5[100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-    SomeStruct_0 s_0 = { 0, 0, _S5 };
+    const int  _S6[100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    SomeStruct_0 s_0 = { 0, 0, _S6 };
     return s_0;
 }
 
-layout(std430, binding = 1) buffer _S6 {
+layout(std430, binding = 1) buffer _S7 {
     int _data[];
 } anotherBuffer_0;
 int doThing_0(SomeStruct_0 s_1)
@@ -73,11 +74,12 @@ int doThing_0(SomeStruct_0 s_1)
 
 int somethingElse_0(inout SomeStruct_0 s_2)
 {
-    s_2.x_0 = s_2.x_0 + 1;
-    return s_2.x_0;
+    int _S8 = s_2.x_0 + 1;
+    s_2.x_0 = _S8;
+    return _S8;
 }
 
-layout(std430, binding = 0) buffer _S7 {
+layout(std430, binding = 0) buffer _S9 {
     int _data[];
 } outputBuffer_0;
 spirv_instruction(id = 256)
@@ -90,6 +92,12 @@ layout(local_size_x = 4, local_size_y = 1, local_size_z = 1) in;
 void main()
 {
     int index_0 = int(gl_GlobalInvocationID.x);
+    const int  _S10[100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    int _S11 = index_0 & 7;
+    SomeStruct_0 _S12 = makeSomeStruct_0();
+    int v_1 = someSlowFunc_0(index_0);
+    bool _S13 = (v_1 & 256) != 0;
+    int _S14 = v_1 & 3;
     int i_2;
     livenessStart_1(i_2, 0);
     i_2 = 0;
@@ -105,23 +113,20 @@ void main()
         {
             break;
         }
-        int v_1 = someSlowFunc_0(index_0);
         SomeStruct_0 s_3;
         livenessStart_2(s_3, 0);
         SomeStruct_0 t_0;
         livenessStart_2(t_0, 0);
-        SomeStruct_0 _S8 = makeSomeStruct_0();
-        t_0 = _S8;
-        const int  _S9[100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+        t_0 = _S12;
         SomeStruct_0 u_0;
-        if((v_1 & 256) != 0)
+        if(_S13)
         {
-            s_3.x_0 = ((anotherBuffer_0)._data[(uint(v_1 & 3))]);
-            t_0.x_0 = ((anotherBuffer_0)._data[(uint(v_1 & 3))]);
+            s_3.x_0 = ((anotherBuffer_0)._data[(uint(_S14))]);
+            t_0.x_0 = ((anotherBuffer_0)._data[(uint(_S14))]);
             livenessStart_2(u_0, 0);
             u_0.a_1 = 0;
             u_0.x_0 = 0;
-            u_0.c_0 = _S9;
+            u_0.c_0 = _S10;
         }
         else
         {
@@ -129,37 +134,35 @@ void main()
             livenessStart_2(x_1, 0);
             x_1.a_1 = 0;
             x_1.x_0 = 0;
-            x_1.c_0 = _S9;
-            x_1.x_0 = ((anotherBuffer_0)._data[(uint(v_1 & 3))]) + 1;
-            SomeStruct_0 _S10 = x_1;
+            x_1.c_0 = _S10;
+            x_1.x_0 = ((anotherBuffer_0)._data[(uint(_S14))]) + 1;
+            SomeStruct_0 _S15 = x_1;
             livenessEnd_2(x_1, 0);
             livenessStart_2(u_0, 0);
-            u_0 = _S10;
+            u_0 = _S15;
         }
-        s_3.c_0[index_0 & 7] = s_3.c_0[index_0 & 7] + 1;
-        int _S11 = s_3.x_0 + t_0.x_0;
-        SomeStruct_0 _S12 = u_0;
+        s_3.c_0[_S11] = s_3.c_0[_S11] + 1;
+        int _S16 = s_3.x_0 + t_0.x_0;
+        SomeStruct_0 _S17 = u_0;
         livenessEnd_2(u_0, 0);
-        int _S13 = _S11 + _S12.x_0;
-        int _S14 = doThing_0(t_0);
-        int _S15 = _S13 + _S14;
-        int _S16 = somethingElse_0(t_0);
+        int _S18 = _S16 + _S17.x_0 + doThing_0(t_0);
+        int _S19 = somethingElse_0(t_0);
         livenessEnd_2(t_0, 0);
-        int _S17 = _S15 + _S16;
-        int _S18 = s_3.c_0[2];
+        int _S20 = _S18 + _S19;
+        int _S21 = s_3.c_0[2];
         livenessEnd_2(s_3, 0);
-        int _S19 = _S17 + _S18;
-        int _S20 = res_0;
+        int _S22 = _S20 + _S21;
+        int _S23 = res_0;
         livenessEnd_1(res_0, 0);
-        int res_1 = _S20 + _S19;
+        int res_1 = _S23 + _S22;
         i_2 = i_2 + 1;
         livenessStart_1(res_0, 0);
         res_0 = res_1;
     }
     livenessEnd_1(i_2, 0);
-    int _S21 = res_0;
+    int _S24 = res_0;
     livenessEnd_1(res_0, 0);
-    ((outputBuffer_0)._data[(uint(index_0))]) = _S21;
+    ((outputBuffer_0)._data[(uint(index_0))]) = _S24;
     return;
 }
 
diff --git a/tests/hlsl-intrinsic/shader-execution-reordering/hit-object-make-hit.slang.1.expected b/tests/hlsl-intrinsic/shader-execution-reordering/hit-object-make-hit.slang.1.expected
index 09e389c32..09c026980 100644
--- a/tests/hlsl-intrinsic/shader-execution-reordering/hit-object-make-hit.slang.1.expected
+++ b/tests/hlsl-intrinsic/shader-execution-reordering/hit-object-make-hit.slang.1.expected
@@ -90,15 +90,23 @@ void main()
     ray_2.TMin_0 = 0.00999999977648258209;
     ray_2.Direction_0 = vec3(0.0, 1.0, 0.0);
     ray_2.TMax_0 = 10000.0;
-    RayDesc_0 _S10 = ray_2;
+    int _S10 = idx_0 * 2;
+    int _S11 = idx_0 * 3;
+    RayDesc_0 _S12 = ray_2;
     hitObjectNV hitObj_0;
-    hitObjectRecordHitWithIndexNV(hitObj_0, scene_0, int(uint(idx_0)), int(uint(idx_0 * 2)), int(uint(idx_0 * 3)), 0U, 0U, _S10.Origin_0, _S10.TMin_0, _S10.Direction_0, _S10.TMax_0, (0));
+    int _S13 = int(uint(idx_0));
+    int _S14 = int(uint(_S10));
+    int _S15 = int(uint(_S11));
+    hitObjectRecordHitWithIndexNV(hitObj_0, scene_0, _S13, _S14, _S15, 0U, 0U, _S12.Origin_0, _S12.TMin_0, _S12.Direction_0, _S12.TMax_0, (0));
     uint r_3 = calcValue_0(hitObj_0);
-    RayDesc_0 _S11 = ray_2;
+    RayDesc_0 _S16 = ray_2;
     hitObjectNV hitObj_1;
-    hitObjectRecordHitNV(hitObj_1, scene_0, int(uint(idx_0)), int(uint(idx_0 * 3)), int(uint(idx_0 * 2)), 0U, 0U, 4U, _S11.Origin_0, _S11.TMin_0, _S11.Direction_0, _S11.TMax_0, (0));
-    uint _S12 = calcValue_0(hitObj_1);
-    uint r_4 = r_3 + _S12;
+    int _S17 = int(uint(idx_0));
+    int _S18 = int(uint(_S11));
+    int _S19 = int(uint(_S10));
+    hitObjectRecordHitNV(hitObj_1, scene_0, _S17, _S18, _S19, 0U, 0U, 4U, _S16.Origin_0, _S16.TMin_0, _S16.Direction_0, _S16.TMax_0, (0));
+    uint _S20 = calcValue_0(hitObj_1);
+    uint r_4 = r_3 + _S20;
     ((outputBuffer_0)._data[(uint(idx_0))]) = r_4;
     return;
 }
diff --git a/tests/hlsl-intrinsic/vector-float.slang b/tests/hlsl-intrinsic/vector-float.slang
index b9cc6b9c8..de49bae73 100644
--- a/tests/hlsl-intrinsic/vector-float.slang
+++ b/tests/hlsl-intrinsic/vector-float.slang
@@ -13,6 +13,32 @@ typedef vector<Float, 3> FloatVector;
 typedef vector<int, 3> IntVector;
 typedef vector<uint, 3> UIntVector;
 
+void subf(inout FloatVector ft, FloatVector f, int idx, Float vf)
+{
+
+    ft += log(f + 10.0);
+    ft += log2(f * 3 + 2);
+
+    {
+        float v[] = { 1, 10, 100, 1000 };
+        ft += IntVector(log10(FloatVector(v[idx] + vf) + 0.5f));
+    }
+
+    ft += abs(f * 4 - 2.0f);
+
+    ft += min(0.5, f);
+    ft += max(f, 0.75);
+
+    ft += pow(0.5, f);
+
+    ft += smoothstep(0.2, 0.7, f);
+    ft += lerp(-100, 100, f);
+
+    ft += clamp(f, 0.1, 0.3);
+
+    ft += step(f, 0.5);
+}
+
 [numthreads(4, 1, 1)]
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
@@ -84,30 +110,8 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     ft += floor(f * 10 - 7.01);
 
     ft += trunc(f * 7);
-   
-    ft += log(f + 10.0);
-    ft += log2(f * 3 + 2);
-
-
-    {
-        float v[] = { 1, 10, 100, 1000 };
-        ft += IntVector(log10(FloatVector(v[idx] + vf) + 0.5f));
-    }
-
-       
-    ft += abs(f * 4 - 2.0f);
-    
-    ft += min(0.5, f);
-    ft += max(f, 0.75);
 
-    ft += pow(0.5, f);
-
-    ft += smoothstep(0.2, 0.7, f);
-    ft += lerp(-100, 100, f);
-
-    ft += clamp(f, 0.1, 0.3);
-
-    ft += step(f, 0.5);
+    subf(ft, f, idx, vf);
 
     {
         IntVector vi = asint(f - f) + idx;
diff --git a/tests/ir/loop-dce.slang b/tests/ir/loop-dce.slang
new file mode 100644
index 000000000..f89c1aa38
--- /dev/null
+++ b/tests/ir/loop-dce.slang
@@ -0,0 +1,40 @@
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+__target_intrinsic(hlsl, "@")
+__target_intrinsic(glsl, "@")
+__target_intrinsic(cpp, "@")
+__target_intrinsic(cuda, "@")
+[__readNone]
+int produceSyntaxError() { return 0; }
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID: SV_DispatchThreadID)
+{
+    int sum = 0;
+    int array[100];
+    // Next, this loop will be removed because there is no use of `array`.
+    for (int i = 0; i < 100; i++)
+    {
+        // This loop must be removed, or we will fail downstream compilation.
+        array[i] = i + produceSyntaxError();
+    }
+
+    // First, this loop will be removed because there is no use of `sum`.
+    for (int i = 0; i < 100; i++)
+    {
+        // This loop must be removed, or we will fail downstream compilation.
+        if (i < 50)
+        {
+            sum += array[i] + produceSyntaxError();
+        }
+        else
+        {
+            sum += i * 2 + produceSyntaxError();
+        }
+    }
+    outputBuffer[0] = 1;
+}
diff --git a/tests/ir/loop-dce.slang.expected.txt b/tests/ir/loop-dce.slang.expected.txt
new file mode 100644
index 000000000..968ac3ef0
--- /dev/null
+++ b/tests/ir/loop-dce.slang.expected.txt
@@ -0,0 +1,4 @@
+1
+0
+0
+0
diff --git a/tests/nv-extensions/nv-ray-tracing-motion-blur.slang.glsl b/tests/nv-extensions/nv-ray-tracing-motion-blur.slang.glsl
index 724a0a241..bae5f361d 100644
--- a/tests/nv-extensions/nv-ray-tracing-motion-blur.slang.glsl
+++ b/tests/nv-extensions/nv-ray-tracing-motion-blur.slang.glsl
@@ -3,47 +3,38 @@
 #extension GL_NV_ray_tracing_motion_blur : require
 layout(row_major) uniform;
 layout(row_major) buffer;
-
 struct ReflectionRay_0
 {
     float color_0;
 };
 
-
 layout(location = 0)
 rayPayloadEXT
 ReflectionRay_0 p_0;
 
-
 struct ShadowRay_0
 {
     float hitDistance_0;
 };
 
-
 layout(location = 1)
 rayPayloadEXT
 ShadowRay_0 p_1;
 
-
 layout(binding = 0)
 uniform texture2D samplerPosition_0;
 
-
 layout(binding = 2)
 uniform sampler sampler_0;
 
-
 layout(binding = 1)
 uniform texture2D samplerNormal_0;
-
 struct Light_0
 {
     vec4 position_0;
     vec4 color_1;
 };
 
-
 struct Uniforms_0
 {
     Light_0 light_0;
@@ -52,13 +43,11 @@ struct Uniforms_0
     mat4x4 model_0;
 };
 
-
 layout(binding = 3)
 layout(std140) uniform _S1
 {
     Uniforms_0 _data;
 } ubo_0;
-
 struct RayDesc_0
 {
     vec3 Origin_0;
@@ -67,115 +56,76 @@ struct RayDesc_0
     float TMax_0;
 };
 
-
 void TraceMotionRay_0(accelerationStructureEXT AccelerationStructure_0, uint RayFlags_0, uint InstanceInclusionMask_0, uint RayContributionToHitGroupIndex_0, uint MultiplierForGeometryContributionToHitGroupIndex_0, uint MissShaderIndex_0, RayDesc_0 Ray_0, float CurrentTime_0, inout ShadowRay_0 Payload_0)
 {
-
     p_1 = Payload_0;
     traceRayMotionNV(AccelerationStructure_0, RayFlags_0, InstanceInclusionMask_0, RayContributionToHitGroupIndex_0, MultiplierForGeometryContributionToHitGroupIndex_0, MissShaderIndex_0, Ray_0.Origin_0, Ray_0.TMin_0, Ray_0.Direction_0, Ray_0.TMax_0, CurrentTime_0, (1));
-
     Payload_0 = p_1;
     return;
 }
 
-
 layout(binding = 5)
 uniform accelerationStructureEXT as_0;
 
-
 float saturate_0(float x_0)
 {
-    float _S2 = clamp(x_0, 0.0, 1.0);
-
-    return _S2;
+    return clamp(x_0, 0.0, 1.0);
 }
 
-
 void TraceRay_0(accelerationStructureEXT AccelerationStructure_1, uint RayFlags_1, uint InstanceInclusionMask_1, uint RayContributionToHitGroupIndex_1, uint MultiplierForGeometryContributionToHitGroupIndex_1, uint MissShaderIndex_1, RayDesc_0 Ray_1, inout ReflectionRay_0 Payload_1)
 {
-
     p_0 = Payload_1;
     traceRayEXT(AccelerationStructure_1, RayFlags_1, InstanceInclusionMask_1, RayContributionToHitGroupIndex_1, MultiplierForGeometryContributionToHitGroupIndex_1, MissShaderIndex_1, Ray_1.Origin_0, Ray_1.TMin_0, Ray_1.Direction_0, Ray_1.TMax_0, (0));
-
     Payload_1 = p_0;
     return;
 }
 
-
 layout(rgba32f)
 layout(binding = 4)
 uniform image2D outputImage_0;
 
-
 void main()
 {
-    uvec3 _S3 = ((gl_LaunchIDEXT));
-
-    ivec2 launchID_0 = ivec2(_S3.xy);
-    uvec3 _S4 = ((gl_LaunchSizeEXT));
-
-    ivec2 launchSize_0 = ivec2(_S4.xy);
-
-
-    float _S5 = (float(launchID_0.x) + 0.5) / float(launchSize_0.x);
-    float _S6 = (float(launchID_0.y) + 0.5) / float(launchSize_0.y);
-
-    vec2 inUV_0 = vec2(_S5, _S6);
-
-    vec4 _S7 = (texture(sampler2D(samplerPosition_0,sampler_0), (inUV_0)));
-
-    vec3 P_0 = _S7.xyz;
-    vec4 _S8 = (texture(sampler2D(samplerNormal_0,sampler_0), (inUV_0)));
-
-    vec3 N_0 = _S8.xyz * 2.0 - 1.0;
-
+    uvec3 _S2 = ((gl_LaunchIDEXT));
+    ivec2 launchID_0 = ivec2(_S2.xy);
+    uvec3 _S3 = ((gl_LaunchSizeEXT));
+    ivec2 launchSize_0 = ivec2(_S3.xy);
+
+    float _S4 = (float(launchID_0.x) + 0.5) / float(launchSize_0.x);
+    float _S5 = (float(launchID_0.y) + 0.5) / float(launchSize_0.y);
+    vec2 inUV_0 = vec2(_S4, _S5);
+    vec4 _S6 = (texture(sampler2D(samplerPosition_0,sampler_0), (inUV_0)));
+    vec3 P_0 = _S6.xyz;
+    vec4 _S7 = (texture(sampler2D(samplerNormal_0,sampler_0), (inUV_0)));
+    vec3 N_0 = _S7.xyz * 2.0 - 1.0;
 
     vec3 lightDelta_0 = ubo_0._data.light_0.position_0.xyz - P_0;
     float lightDist_0 = length(lightDelta_0);
     vec3 L_0 = normalize(lightDelta_0);
-    float _S9 = 1.0 / (lightDist_0 * lightDist_0);
-
+    float _S8 = 1.0 / (lightDist_0 * lightDist_0);
     RayDesc_0 ray_0;
     ray_0.Origin_0 = P_0;
     ray_0.TMin_0 = 0.00000099999999747524;
     ray_0.Direction_0 = lightDelta_0;
     ray_0.TMax_0 = lightDist_0;
 
-
     ShadowRay_0 shadowRay_0;
     shadowRay_0.hitDistance_0 = 0.0;
 
-
-
     TraceMotionRay_0(as_0, 1U, 255U, 0U, 0U, 2U, ray_0, 1.0, shadowRay_0);
-
     float atten_0;
-
     if(shadowRay_0.hitDistance_0 < lightDist_0)
     {
-
         atten_0 = 0.0;
-
     }
     else
     {
-
-        atten_0 = _S9;
-
+        atten_0 = _S8;
     }
-
-    vec3 _S10 = ubo_0._data.light_0.color_1.xyz;
-
-    float _S11 = dot(N_0, L_0);
-
-    float _S12 = saturate_0(_S11);
-
-    vec3 color_2 = _S10 * _S12 * atten_0;
-
+    vec3 color_2 = ubo_0._data.light_0.color_1.xyz * saturate_0(dot(N_0, L_0)) * atten_0;
 
     ReflectionRay_0 reflectionRay_0;
     TraceRay_0(as_0, 1U, 255U, 0U, 0U, 2U, ray_0, reflectionRay_0);
-
     imageStore((outputImage_0), ivec2((uvec2(launchID_0))), vec4(color_2 + reflectionRay_0.color_0, 1.0));
     return;
-}
-\ No newline at end of file
+}
diff --git a/tests/pipeline/rasterization/fragment-shader-interlock.slang.glsl b/tests/pipeline/rasterization/fragment-shader-interlock.slang.glsl
index 84eba46f0..7f53576e9 100644
--- a/tests/pipeline/rasterization/fragment-shader-interlock.slang.glsl
+++ b/tests/pipeline/rasterization/fragment-shader-interlock.slang.glsl
@@ -1,10 +1,7 @@
-//TEST_IGNORE_FILE:
-
 #version 450
 #extension GL_ARB_fragment_shader_interlock : require
 layout(row_major) uniform;
 layout(row_major) buffer;
-
 layout(rgba32f)
 layout(binding = 0)
 uniform image2D entryPointParams_texture_0;
@@ -17,15 +14,12 @@ out vec4 _S2;
 
 void main()
 {
-    beginInvocationInterlockARB();
-
-    vec2 _S3 = _S1.xy;
-
-    vec4 _S4 = (imageLoad((entryPointParams_texture_0), ivec2((uvec2(_S3)))));
-    imageStore((entryPointParams_texture_0), ivec2((uvec2(_S3))), _S4 + _S1);
 
+    beginInvocationInterlockARB();
+    uvec2 _S3 = uvec2(_S1.xy);
+    vec4 _S4 = (imageLoad((entryPointParams_texture_0), ivec2((_S3))));
+    imageStore((entryPointParams_texture_0), ivec2((_S3)), _S4 + _S1);
     endInvocationInterlockARB();
-
     _S2 = _S4;
     return;
 }
diff --git a/tests/pipeline/rasterization/mesh/passing-outputs.slang.glsl b/tests/pipeline/rasterization/mesh/passing-outputs.slang.glsl
index 31c2f0db2..1102a838e 100644
--- a/tests/pipeline/rasterization/mesh/passing-outputs.slang.glsl
+++ b/tests/pipeline/rasterization/mesh/passing-outputs.slang.glsl
@@ -160,16 +160,10 @@ void main()
         d_0(gl_LocalInvocationIndex);
         e_0(gl_LocalInvocationIndex);
     }
-    else
-    {
-    }
     if(gl_LocalInvocationIndex < 1U)
     {
         gl_PrimitiveTriangleIndicesEXT[gl_LocalInvocationIndex] = uvec3(0U, 1U, 2U);
     }
-    else
-    {
-    }
     return;
 }
 
diff --git a/tests/pipeline/ray-tracing/acceleration-structure-in-compute.slang.glsl b/tests/pipeline/ray-tracing/acceleration-structure-in-compute.slang.glsl
index f95321039..83797d2d5 100644
--- a/tests/pipeline/ray-tracing/acceleration-structure-in-compute.slang.glsl
+++ b/tests/pipeline/ray-tracing/acceleration-structure-in-compute.slang.glsl
@@ -1,6 +1,7 @@
 #version 460
 #extension GL_EXT_ray_tracing : require
-
+layout(row_major) uniform;
+layout(row_major) buffer;
 int helper_0(accelerationStructureEXT a_0, int b_0)
 {
     return b_0;
diff --git a/tests/pipeline/ray-tracing/trace-ray-inline.slang.glsl b/tests/pipeline/ray-tracing/trace-ray-inline.slang.glsl
index 0364d2513..1c2bc8090 100644
--- a/tests/pipeline/ray-tracing/trace-ray-inline.slang.glsl
+++ b/tests/pipeline/ray-tracing/trace-ray-inline.slang.glsl
@@ -3,7 +3,6 @@
 #extension GL_EXT_ray_query : require
 layout(row_major) uniform;
 layout(row_major) buffer;
-
 struct SLANG_ParameterGroup_C_0
 {
     vec3 origin_0;
@@ -20,7 +19,6 @@ layout(std140) uniform _S1
 {
     SLANG_ParameterGroup_C_0 _data;
 } C_0;
-
 layout(binding = 0)
 uniform accelerationStructureEXT myAccelerationStructure_0;
 
@@ -70,40 +68,32 @@ void myMiss_0(inout MyRayPayload_0 payload_4)
 layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 void main()
 {
+    MyProceduralHitAttrs_0 committedProceduralAttrs_0;
 
     rayQueryEXT query_0;
 
     MyRayPayload_0 payload_5;
     payload_5.value_1 = -1;
     rayQueryInitializeEXT((query_0), (myAccelerationStructure_0), (C_0._data.rayFlags_0 | 512), (C_0._data.instanceMask_0), (C_0._data.origin_0), (C_0._data.tMin_0), (C_0._data.direction_0), (C_0._data.tMax_0));
-
-    MyProceduralHitAttrs_0 committedProceduralAttrs_0;
-
     for(;;)
     {
-
         bool _S2 = rayQueryProceedEXT(query_0);
-
         if(!_S2)
         {
             break;
         }
         uint _S3 = (rayQueryGetIntersectionTypeEXT((query_0), false));
-
         switch(_S3)
         {
         case 1U:
             {
                 MyProceduralHitAttrs_0 candidateProceduralAttrs_0;
-
                 candidateProceduralAttrs_0.value_0 = 0;
                 float tHit_1 = 0.0;
                 bool _S4 = myProceduralIntersection_0(tHit_1, candidateProceduralAttrs_0);
-
                 if(_S4)
                 {
                     bool _S5 = myProceduralAnyHit_0(payload_5);
-
                     if(_S5)
                     {
                         rayQueryGenerateIntersectionEXT(query_0, tHit_1);
@@ -112,35 +102,22 @@ void main()
                         {
                             rayQueryTerminateEXT(query_0);
                         }
-                        else
-                        {
-                        }
-
                         committedProceduralAttrs_0 = _S6;
-
                     }
                     else
                     {
-
                         committedProceduralAttrs_0 = committedProceduralAttrs_0;
-
                     }
-
                 }
                 else
                 {
-
                     committedProceduralAttrs_0 = committedProceduralAttrs_0;
-
                 }
-
                 break;
             }
         case 0U:
             {
-
                 bool _S7 = myTriangleAnyHit_0(payload_5);
-
                 if(_S7)
                 {
                     rayQueryConfirmIntersectionEXT(query_0);
@@ -148,12 +125,6 @@ void main()
                     {
                         rayQueryTerminateEXT(query_0);
                     }
-                    else
-                    {
-                    }
-                }
-                else
-                {
                 }
                 break;
             }
@@ -162,13 +133,8 @@ void main()
                 break;
             }
         }
-
-        committedProceduralAttrs_0 = committedProceduralAttrs_0;
-
     }
-
     uint _S8 = (rayQueryGetIntersectionTypeEXT((query_0), true));
-
     switch(_S8)
     {
     case 1U:
diff --git a/tests/pipeline/ray-tracing/trace-ray-inline.slang.hlsl b/tests/pipeline/ray-tracing/trace-ray-inline.slang.hlsl
index 97d972328..b0c798b2d 100644
--- a/tests/pipeline/ray-tracing/trace-ray-inline.slang.hlsl
+++ b/tests/pipeline/ray-tracing/trace-ray-inline.slang.hlsl
@@ -1,11 +1,14 @@
-// trace-ray-inline.slang.hlsl
-//TEST_IGNORE_FILE:
+#pragma pack_matrix(column_major)
+#ifdef SLANG_HLSL_ENABLE_NVAPI
+#include "nvHLSLExtns.h"
+#endif
+#pragma warning(disable: 3557)
 
 struct SLANG_ParameterGroup_C_0
 {
-    vector<float,3> origin_0;
+    float3 origin_0;
     float tMin_0;
-    vector<float,3> direction_0;
+    float3 direction_0;
     float tMax_0;
     uint rayFlags_0;
     uint instanceMask_0;
@@ -16,7 +19,6 @@ cbuffer C_0 : register(b0)
 {
     SLANG_ParameterGroup_C_0 C_0;
 }
-
 RaytracingAccelerationStructure myAccelerationStructure_0 : register(t0);
 
 struct MyProceduralHitAttrs_0
@@ -62,170 +64,92 @@ void myMiss_0(inout MyRayPayload_0 payload_4)
     return;
 }
 
-
-[shader("compute")]
-[numthreads(1, 1, 1)]
-void main(vector<uint,3> tid_0 : SV_DISPATCHTHREADID)
+[shader("compute")][numthreads(1, 1, 1)]
+void main(uint3 tid_0 : SV_DISPATCHTHREADID)
 {
-    MyRayPayload_0 payload_5;
     MyProceduralHitAttrs_0 committedProceduralAttrs_0;
-    MyProceduralHitAttrs_0 committedProceduralAttrs_1;
-    MyRayPayload_0 payload_6;
-    MyProceduralHitAttrs_0 committedProceduralAttrs_2;
-    MyRayPayload_0 payload_7;
-    MyProceduralHitAttrs_0 committedProceduralAttrs_3;
 
     RayQuery<int(512) > query_0;
 
-    MyRayPayload_0 _S1 = { int(-1) };
+    MyRayPayload_0 payload_5;
+    payload_5.value_1 = int(-1);
     RayDesc ray_0 = { C_0.origin_0, C_0.tMin_0, C_0.direction_0, C_0.tMax_0 };
     query_0.TraceRayInline(myAccelerationStructure_0, C_0.rayFlags_0, C_0.instanceMask_0, ray_0);
-
-    MyProceduralHitAttrs_0 _S2;
-
-    payload_5 = _S1;
-    committedProceduralAttrs_0 = _S2;
     for(;;)
     {
-        bool _S3 = query_0.Proceed();
-
-        if(!_S3)
+        bool _S1 = query_0.Proceed();
+        if(!_S1)
         {
             break;
         }
-        uint _S4 = query_0.CandidateType();
-
-        switch(_S4)
+        uint _S2 = query_0.CandidateType();
+        switch(_S2)
         {
-        case (uint) int(1):
+        case 1U:
             {
-                MyProceduralHitAttrs_0 candidateProceduralAttrs_0 = { int(0) };
-
-                float _S5;
-
-                _S5 = 0.00000000000000000000;
-
-                MyProceduralHitAttrs_0 _S6;
-
-                _S6 = candidateProceduralAttrs_0;
-
-                bool _S7 = myProceduralIntersection_0(_S5, _S6);
-
-                float tHit_1 = _S5;
-
-                MyProceduralHitAttrs_0 candidateProceduralAttrs_1 = _S6;
-
-                if(_S7)
+                MyProceduralHitAttrs_0 candidateProceduralAttrs_0;
+                candidateProceduralAttrs_0.value_0 = int(0);
+                float tHit_1 = 0.0;
+                bool _S3 = myProceduralIntersection_0(tHit_1, candidateProceduralAttrs_0);
+                if(_S3)
                 {
-                    MyRayPayload_0 _S8;
-
-                    _S8 = payload_5;
-
-                    bool _S9 = myProceduralAnyHit_0(_S8);
-
-                    MyRayPayload_0 _S10 = _S8;
-
-                    if(_S9)
+                    bool _S4 = myProceduralAnyHit_0(payload_5);
+                    if(_S4)
                     {
                         query_0.CommitProceduralPrimitiveHit(tHit_1);
-
-                        if((bool) C_0.shouldStopAtFirstHit_0)
+                        MyProceduralHitAttrs_0 _S5 = candidateProceduralAttrs_0;
+                        if(C_0.shouldStopAtFirstHit_0 != 0U)
                         {
-
                             query_0.Abort();
                         }
-                        else
-                        {
-                        }
-
-                        committedProceduralAttrs_1 = candidateProceduralAttrs_1;
+                        committedProceduralAttrs_0 = _S5;
                     }
                     else
                     {
-                        committedProceduralAttrs_1 = committedProceduralAttrs_0;
+                        committedProceduralAttrs_0 = committedProceduralAttrs_0;
                     }
-
-                    payload_6 = _S10;
-                    committedProceduralAttrs_2 = committedProceduralAttrs_1;
                 }
                 else
                 {
-                    payload_6 = payload_5;
-                    committedProceduralAttrs_2 = committedProceduralAttrs_0;
+                    committedProceduralAttrs_0 = committedProceduralAttrs_0;
                 }
-
-                payload_7 = payload_6;
-                committedProceduralAttrs_3 = committedProceduralAttrs_2;
                 break;
             }
-        case (uint) int(0):
+        case 0U:
             {
-                MyRayPayload_0 _S11;
-                _S11 = payload_5;
-
-                bool _S12 = myTriangleAnyHit_0(_S11);
-                MyRayPayload_0 _S13 = _S11;
-
-                if(_S12)
+                bool _S6 = myTriangleAnyHit_0(payload_5);
+                if(_S6)
                 {
                     query_0.CommitNonOpaqueTriangleHit();
-                    if((bool) C_0.shouldStopAtFirstHit_0)
+                    if(C_0.shouldStopAtFirstHit_0 != 0U)
                     {
                         query_0.Abort();
                     }
-                    else
-                    {
-                    }
-                }
-                else
-                {
                 }
-
-                payload_7 = _S13;
-                committedProceduralAttrs_3 = committedProceduralAttrs_0;
                 break;
             }
         default:
             {
-                payload_7 = payload_5;
-                committedProceduralAttrs_3 = committedProceduralAttrs_0;
                 break;
             }
         }
-
-        payload_5 = payload_7;
-        committedProceduralAttrs_0 = committedProceduralAttrs_3;
     }
-
-    uint _S14 = query_0.CommittedStatus();
-
-    switch(_S14)
+    uint _S7 = query_0.CommittedStatus();
+    switch(_S7)
     {
-    case (uint) int(1):
+    case 1U:
         {
-            MyRayPayload_0 _S15;
-
-            _S15 = payload_5;
-
-            myTriangleClosestHit_0(_S15);
+            myTriangleClosestHit_0(payload_5);
             break;
         }
-    case (uint) int(2):
+    case 2U:
         {
-
-            MyRayPayload_0 _S16;
-            _S16 = payload_5;
-
-            myProceduralClosestHit_0(_S16, committedProceduralAttrs_0);
+            myProceduralClosestHit_0(payload_5, committedProceduralAttrs_0);
             break;
         }
-    case (uint) int(0):
+    case 0U:
         {
-            MyRayPayload_0 _S17;
-
-            _S17 = payload_5;
-
-            myMiss_0(_S17);
+            myMiss_0(payload_5);
             break;
         }
     default:
@@ -233,6 +157,5 @@ void main(vector<uint,3> tid_0 : SV_DISPATCHTHREADID)
             break;
         }
     }
-
     return;
 }
diff --git a/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang.glsl b/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang.glsl
index 139d55518..fca1fc1fa 100644
--- a/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang.glsl
+++ b/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang.glsl
@@ -2,7 +2,6 @@
 #extension GL_EXT_shader_atomic_float : require
 layout(row_major) uniform;
 layout(row_major) buffer;
-
 layout(std430, binding = 1) buffer _S1 {
     float _data[];
 } anotherBuffer_0;
@@ -11,7 +10,7 @@ layout(std430, binding = 0) buffer _S2 {
 } _S3;
 void RWByteAddressBuffer_InterlockedAddF32_0(uint _S4, float _S5, out float _S6)
 {
-    uint _S7 = _S4 / uint(4);
+    uint _S7 = _S4 / 4U;
     float _S8 = (atomicAdd((((_S3)._data[(_S7)])), (_S5)));
     _S6 = _S8;
     return;
@@ -19,7 +18,7 @@ void RWByteAddressBuffer_InterlockedAddF32_0(uint _S4, float _S5, out float _S6)
 
 void RWByteAddressBuffer_InterlockedAddF32_1(uint _S9, float _S10)
 {
-    uint _S11 = _S9 / uint(4);
+    uint _S11 = _S9 / 4U;
     float _S12 = (atomicAdd((((_S3)._data[(_S11)])), (_S10)));
     return;
 }
@@ -28,11 +27,11 @@ layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
 void main()
 {
     uint tid_0 = gl_GlobalInvocationID.x;
-    int idx_0 = int(tid_0 & uint(3) ^ tid_0 >> 2);
+    uint _S13 = tid_0 >> 2;
+    int idx_0 = int(tid_0 & 3U ^ _S13);
     float delta_0 = ((anotherBuffer_0)._data[(uint(idx_0 & 3))]);
-    float previousValue_0;
-    previousValue_0 = float(0);
-    RWByteAddressBuffer_InterlockedAddF32_0(uint(idx_0 << 2), 1.00000000000000000000, previousValue_0);
-    RWByteAddressBuffer_InterlockedAddF32_1(uint(int(tid_0 >> 2) << 2), delta_0);
+    float previousValue_0 = 0.0;
+    RWByteAddressBuffer_InterlockedAddF32_0(uint(idx_0 << 2), 1.0, previousValue_0);
+    RWByteAddressBuffer_InterlockedAddF32_1(uint(int(_S13) << 2), delta_0);
     return;
 }
diff --git a/tests/vkray/anyhit.slang.glsl b/tests/vkray/anyhit.slang.glsl
index eb39299c5..345dd6624 100644
--- a/tests/vkray/anyhit.slang.glsl
+++ b/tests/vkray/anyhit.slang.glsl
@@ -1,16 +1,8 @@
 // anyhit.slang.glsl
 #version 460
-
-#if USE_NV_RT
-#extension GL_NV_ray_tracing : require
-#define hitAttributeEXT hitAttributeNV
-#define rayPayloadInEXT rayPayloadInNV
-#define terminateRayEXT terminateRayNV
-#define ignoreIntersectionEXT ignoreIntersectionNV
-#else
 #extension GL_EXT_ray_tracing : require
-#endif
-
+layout(row_major) uniform;
+layout(row_major) buffer;
 struct Params_0
 {
     int mode_0;
@@ -47,16 +39,15 @@ void main()
         float val_0 = textureLod(
             sampler2D(gParams_alphaMap_0, gParams_sampler_0),
             _S2.normal_0.xy,
-            float(0)).x;
-
+            (0.0)).x;
 
-        if(val_0 > float(0))
+        if(val_0 > 0.0)
         {
-            terminateRayEXT;
+            terminateRayEXT;;
         }
         else
         {
-            ignoreIntersectionEXT;
+            ignoreIntersectionEXT;;
         }
     }
 
diff --git a/tests/vkray/callable-caller.slang.glsl b/tests/vkray/callable-caller.slang.glsl
index 0b7a9677b..11049074f 100644
--- a/tests/vkray/callable-caller.slang.glsl
+++ b/tests/vkray/callable-caller.slang.glsl
@@ -1,18 +1,7 @@
-//TEST_IGNORE_FILE:
 #version 460
 #extension GL_NV_ray_tracing : require
 layout(row_major) uniform;
 layout(row_major) buffer;
-struct SLANG_ParameterGroup_C_0
-{
-    uint shaderIndex_0;
-};
-
-layout(binding = 0)
-layout(std140) uniform _S1
-{
-    SLANG_ParameterGroup_C_0 _data;
-} C_0;
 struct MaterialPayload_0
 {
     vec4 albedo_0;
@@ -23,6 +12,16 @@ layout(location = 0)
 callableDataNV
 MaterialPayload_0 p_0;
 
+struct SLANG_ParameterGroup_C_0
+{
+    uint shaderIndex_0;
+};
+
+layout(binding = 0)
+layout(std140) uniform _S1
+{
+    SLANG_ParameterGroup_C_0 _data;
+} C_0;
 void CallShader_0(uint shaderIndex_1, inout MaterialPayload_0 payload_0)
 {
     p_0 = payload_0;
@@ -38,7 +37,7 @@ uniform image2D gImage_0;
 void main()
 {
     MaterialPayload_0 payload_1;
-    payload_1.albedo_0 = vec4(0);
+    payload_1.albedo_0 = vec4(0.0);
     uvec3 _S2 = ((gl_LaunchIDNV));
     vec2 _S3 = vec2(_S2.xy);
     uvec3 _S4 = ((gl_LaunchSizeNV));
diff --git a/tests/vkray/raygen.slang.glsl b/tests/vkray/raygen.slang.glsl
index e34f1f6e0..f86f67e82 100644
--- a/tests/vkray/raygen.slang.glsl
+++ b/tests/vkray/raygen.slang.glsl
@@ -76,8 +76,7 @@ uniform accelerationStructureEXT as_0;
 
 float saturate_0(float x_0)
 {
-    float _S2 = clamp(x_0, 0.0, 1.0);
-    return _S2;
+    return clamp(x_0, 0.0, 1.0);
 }
 
 layout(rgba32f)
@@ -86,24 +85,24 @@ uniform image2D outputImage_0;
 
 void main()
 {
-    uvec3 _S3 = ((gl_LaunchIDEXT));
-    float _S4 = float(_S3.x) + 0.5;
-    uvec3 _S5 = ((gl_LaunchSizeEXT));
-    float _S6 = _S4 / float(_S5.x);
-    uvec3 _S7 = ((gl_LaunchIDEXT));
-    float _S8 = float(_S7.y) + 0.5;
-    uvec3 _S9 = ((gl_LaunchSizeEXT));
-    float _S10 = _S8 / float(_S9.y);
-    vec2 inUV_0 = vec2(_S6, _S10);
-    vec4 _S11 = (texture(sampler2D(samplerPosition_0,sampler_0), (inUV_0)));
-    vec3 P_0 = _S11.xyz;
-    vec4 _S12 = (texture(sampler2D(samplerNormal_0,sampler_0), (inUV_0)));
-    vec3 N_0 = _S12.xyz * 2.0 - 1.0;
+    uvec3 _S2 = ((gl_LaunchIDEXT));
+    float _S3 = float(_S2.x) + 0.5;
+    uvec3 _S4 = ((gl_LaunchSizeEXT));
+    float _S5 = _S3 / float(_S4.x);
+    uvec3 _S6 = ((gl_LaunchIDEXT));
+    float _S7 = float(_S6.y) + 0.5;
+    uvec3 _S8 = ((gl_LaunchSizeEXT));
+    float _S9 = _S7 / float(_S8.y);
+    vec2 inUV_0 = vec2(_S5, _S9);
+    vec4 _S10 = (texture(sampler2D(samplerPosition_0,sampler_0), (inUV_0)));
+    vec3 P_0 = _S10.xyz;
+    vec4 _S11 = (texture(sampler2D(samplerNormal_0,sampler_0), (inUV_0)));
+    vec3 N_0 = _S11.xyz * 2.0 - 1.0;
 
     vec3 lightDelta_0 = ubo_0._data.light_0.position_0.xyz - P_0;
     float lightDist_0 = length(lightDelta_0);
     vec3 L_0 = normalize(lightDelta_0);
-    float _S13 = 1.0 / (lightDist_0 * lightDist_0);
+    float _S12 = 1.0 / (lightDist_0 * lightDist_0);
     RayDesc_0 ray_0;
     ray_0.Origin_0 = P_0;
     ray_0.TMin_0 = 0.00000099999999747524;
@@ -120,17 +119,14 @@ void main()
     }
     else
     {
-        atten_0 = _S13;
+        atten_0 = _S12;
     }
-    vec3 _S14 = ubo_0._data.light_0.color_1.xyz;
-    float _S15 = dot(N_0, L_0);
-    float _S16 = saturate_0(_S15);
-    vec3 color_2 = _S14 * _S16 * atten_0;
+    vec3 color_2 = ubo_0._data.light_0.color_1.xyz * saturate_0(dot(N_0, L_0)) * atten_0;
 
     ReflectionRay_0 reflectionRay_0;
     TraceRay_1(as_0, 1U, 255U, 0U, 0U, 2U, ray_0, reflectionRay_0);
     vec3 color_3 = color_2 + reflectionRay_0.color_0;
-    uvec3 _S17 = ((gl_LaunchIDEXT));
-    imageStore((outputImage_0), ivec2((uvec2(ivec2(_S17.xy)))), vec4(color_3, 1.0));
+    uvec3 _S13 = ((gl_LaunchIDEXT));
+    imageStore((outputImage_0), ivec2((uvec2(ivec2(_S13.xy)))), vec4(color_3, 1.0));
     return;
 }