More control flow simplifications. (#2673)

* More control flow and Phi param simplifications. * Fix. * Fix gcc error. * Fix. * More IR cleanup. * Fix bug in phi param dce + ifelse simplify. * Propagate and DCE side-effect-free functions. * Enhance CFG simplification to remove loops with no side effects. * Fix. * Fixes. * Fix tests. Add [__AlwaysFoldIntoUseSite] for rayPayloadLocation. * More cleanup. * Fixes. * Fix. --------- Co-authored-by: Yong He <yhe@nvidia.com>
author: Yong He <yonghe@outlook.com> 2023-02-24 10:01:47 -0800
committer: GitHub <noreply@github.com> 2023-02-24 10:01:47 -0800
commit: bd6306cdaa4a49344658bd026721b6532e103d09 (patch)
tree: bb7f666d426e6cfc7777a3ccac0a1d628588eb39 /source
parent: e8c08e7ecb1124f115a1d1042277776193122b57 (diff)
29 files changed, 1616 insertions, 314 deletions
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 6357d58bd..9da33c755 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -2525,21 +2525,25 @@ int __SyntaxError();
 __generic<T>
 __target_intrinsic(cuda, "sizeof($G0)")
 __target_intrinsic(cpp, "sizeof($G0)")
+[__readNone]
 int __sizeOf();
 
 __generic<T>
 __target_intrinsic(cuda, "sizeof($T0)")
 __target_intrinsic(cpp, "sizeof($T0)")
+[__readNone]
 int __sizeOf(T v);
 
 __generic<T>
 __target_intrinsic(cuda, "SLANG_ALIGN_OF($G0)")
 __target_intrinsic(cpp, "SLANG_ALIGN_OF($G0)")
+[__readNone]
 int __alignOf();
 
 __generic<T>
 __target_intrinsic(cuda, "SLANG_ALIGN_OF($T0)")
 __target_intrinsic(cpp, "SLANG_ALIGN_OF($T0)")
+[__readNone]
 int __alignOf(T v);
 
 // It would be nice to have offsetof equivalent, but it's not clear how that would work in terms of the Slang language.
@@ -2547,6 +2551,7 @@ int __alignOf(T v);
 __generic<T,F>
 __target_intrinsic(cuda, "int(((char*)&($1)) - ((char*)&($0)))")
 __target_intrinsic(cpp, "int(((char*)&($1)) - ((char*)&($0))")
+[__readNone]
 int __offsetOf(in T t, in F field);
 
 /// Mark beginning of "interlocked" operations in a fragment shader.
@@ -2960,6 +2965,9 @@ attribute_syntax [builtin] : BuiltinAttribute;
 __attributeTarget(DeclBase)
 attribute_syntax [__requiresNVAPI] : RequiresNVAPIAttribute;
 
+__attributeTarget(DeclBase)
+attribute_syntax [__AlwaysFoldIntoUseSiteAttribute] : AlwaysFoldIntoUseSiteAttribute;
+
 __attributeTarget(FunctionDeclBase)
 attribute_syntax [noinline] : NoInlineAttribute;
 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 7e75d06b3..37cdc205e 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -778,6 +778,7 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_abs($0)")
 __target_intrinsic(cpp, "$P_abs($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
+[__readNone]
 T abs(T x);
 /*{
     // Note: this simple definition may not be appropriate for floating-point inputs
@@ -788,6 +789,7 @@ __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
+[__readNone]
 vector<T, N> abs(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, abs, x);
@@ -795,6 +797,7 @@ vector<T, N> abs(vector<T, N> x)
 
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> abs(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, abs, x);
@@ -806,12 +809,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_abs($0)")
 __target_intrinsic(cpp, "$P_abs($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
+[__readNone]
 T abs(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
+[__readNone]
 vector<T, N> abs(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, abs, x);
@@ -819,6 +824,7 @@ vector<T, N> abs(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> abs(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, abs, x);
@@ -832,12 +838,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_acos($0)")
 __target_intrinsic(cpp, "$P_acos($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Acos _0")
+[__readNone]
 T acos(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Acos _0")
+[__readNone]
 vector<T, N> acos(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, acos, x);
@@ -845,6 +853,7 @@ vector<T, N> acos(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> acos(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, acos, x);
@@ -855,11 +864,13 @@ __generic<T : __BuiltinType>
 __target_intrinsic(cpp, "bool($0)")
 __target_intrinsic(cuda, "bool($0)")
 __target_intrinsic(glsl, "bool($0)")
+[__readNone]
 bool all(T x);
 
 __generic<T : __BuiltinType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "all(bvec$N0($0))")
+[__readNone]
 bool all(vector<T,N> x)
 {
     bool result = true;
@@ -870,6 +881,7 @@ bool all(vector<T,N> x)
 
 __generic<T : __BuiltinType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 bool all(matrix<T,N,M> x)
 {
     bool result = true;
@@ -894,11 +906,13 @@ __generic<T : __BuiltinType>
 __target_intrinsic(cpp, "bool($0)")
 __target_intrinsic(cuda, "bool($0)")
 __target_intrinsic(glsl, "bool($0)")
+[__readNone]
 bool any(T x);
 
 __generic<T : __BuiltinType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "any(bvec$N0($0))")
+[__readNone]
 bool any(vector<T, N> x)
 {
     bool result = false;
@@ -909,6 +923,7 @@ bool any(vector<T, N> x)
 
 __generic<T : __BuiltinType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 bool any(matrix<T, N, M> x)
 {
     bool result = false;
@@ -926,6 +941,7 @@ __target_intrinsic(cpp, "$P_asdouble($0, $1)")
 __target_intrinsic(cuda, "$P_asdouble($0, $1)")
 __target_intrinsic(spirv_direct, "%v = OpCompositeConstruct _type(uint2) resultId _0 _1; OpExtInst resultType resultId glsl450 59 %v")
 __glsl_extension(GL_ARB_gpu_shader5)
+[__readNone]
 double asdouble(uint lowbits, uint highbits);
 
 // Reinterpret bits as a float (HLSL SM 4.0)
@@ -935,6 +951,7 @@ __target_intrinsic(glsl, "intBitsToFloat")
 __target_intrinsic(cpp, "$P_asfloat($0)")
 __target_intrinsic(cuda, "$P_asfloat($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 float asfloat(int x);
 
 __target_intrinsic(hlsl)
@@ -942,12 +959,14 @@ __target_intrinsic(glsl, "uintBitsToFloat")
 __target_intrinsic(cpp, "$P_asfloat($0)")
 __target_intrinsic(cuda, "$P_asfloat($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 float asfloat(uint x);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "intBitsToFloat")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<float, N> asfloat(vector< int, N> x)
 {
     VECTOR_MAP_UNARY(float, N, asfloat, x);
@@ -957,6 +976,7 @@ __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "uintBitsToFloat")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<float,N> asfloat(vector<uint,N> x)
 {
     VECTOR_MAP_UNARY(float, N, asfloat, x);
@@ -964,6 +984,7 @@ vector<float,N> asfloat(vector<uint,N> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<float,N,M> asfloat(matrix< int,N,M> x)
 {
     MATRIX_MAP_UNARY(float, N, M, asfloat, x);
@@ -971,6 +992,7 @@ matrix<float,N,M> asfloat(matrix< int,N,M> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<float,N,M> asfloat(matrix<uint,N,M> x)
 {
     MATRIX_MAP_UNARY(float, N, M, asfloat, x);
@@ -978,16 +1000,19 @@ matrix<float,N,M> asfloat(matrix<uint,N,M> x)
 
 // No op
 [__unsafeForceInlineEarly]
+[__readNone]
 float asfloat(float x)
 { return x; }
 
 __generic<let N : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 vector<float,N> asfloat(vector<float,N> x)
 { return x; }
 
 __generic<let N : int, let M : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 matrix<float,N,M> asfloat(matrix<float,N,M> x)
 { return x; }
 
@@ -998,12 +1023,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_asin($0)")
 __target_intrinsic(cpp, "$P_asin($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Asin _0")
+[__readNone]
 T asin(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Asin _0")
+[__readNone]
 vector<T, N> asin(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T,N,asin,x);
@@ -1011,6 +1038,7 @@ vector<T, N> asin(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> asin(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T,N,M,asin,x);
@@ -1023,6 +1051,7 @@ __target_intrinsic(glsl, "floatBitsToInt")
 __target_intrinsic(cpp, "$P_asint($0)")
 __target_intrinsic(cuda, "$P_asint($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 int asint(float x);
 
 __target_intrinsic(hlsl)
@@ -1030,12 +1059,14 @@ __target_intrinsic(glsl, "int($0)")
 __target_intrinsic(cpp, "$P_asint($0)")
 __target_intrinsic(cuda, "$P_asint($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 int asint(uint x);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "floatBitsToInt")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<int, N> asint(vector<float, N> x)
 {
     VECTOR_MAP_UNARY(int, N, asint, x);
@@ -1045,6 +1076,7 @@ __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "ivec$N0($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<int, N> asint(vector<uint, N> x)
 {
     VECTOR_MAP_UNARY(int, N, asint, x);
@@ -1052,6 +1084,7 @@ vector<int, N> asint(vector<uint, N> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<int, N, M> asint(matrix<float, N, M> x)
 {
     MATRIX_MAP_UNARY(int, N, M, asint, x);
@@ -1059,6 +1092,7 @@ matrix<int, N, M> asint(matrix<float, N, M> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<int, N, M> asint(matrix<uint, N, M> x)
 {
     MATRIX_MAP_UNARY(int, N, M, asint, x);
@@ -1066,16 +1100,19 @@ matrix<int, N, M> asint(matrix<uint, N, M> x)
 
 // No op
 [__unsafeForceInlineEarly]
+[__readNone]
 int asint(int x)
 { return x; }
 
 __generic<let N : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 vector<int,N> asint(vector<int,N> x)
 { return x; }
 
 __generic<let N : int, let M : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 matrix<int,N,M> asint(matrix<int,N,M> x)
 { return x; }
 
@@ -1086,6 +1123,7 @@ __target_intrinsic(glsl, "{ uvec2 v = unpackDouble2x32($0); $1 = v.x; $2 = v.y;
 __glsl_extension(GL_ARB_gpu_shader5)
 __target_intrinsic(cpp, "$P_asuint($0, $1, $2)")
 __target_intrinsic(cuda, "$P_asuint($0, $1, $2)")
+[__readNone]
 void asuint(double value, out uint lowbits, out uint highbits);
 
 // Reinterpret bits as a uint (HLSL SM 4.0)
@@ -1095,6 +1133,7 @@ __target_intrinsic(glsl, "floatBitsToUint")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
 __target_intrinsic(cpp, "$P_asuint($0)")
 __target_intrinsic(cuda, "$P_asuint($0)")
+[__readNone]
 uint asuint(float x);
 
 __target_intrinsic(hlsl)
@@ -1102,12 +1141,14 @@ __target_intrinsic(glsl, "uint($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
 __target_intrinsic(cpp, "$P_asuint($0)")
 __target_intrinsic(cuda, "$P_asuint($0)")
+[__readNone]
 uint asuint(int x);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "floatBitsToUint")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<uint,N> asuint(vector<float,N> x)
 {
     VECTOR_MAP_UNARY(uint, N, asuint, x);
@@ -1117,6 +1158,7 @@ __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "uvec$N0($0)")
 __target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
+[__readNone]
 vector<uint, N> asuint(vector<int, N> x)
 {
     VECTOR_MAP_UNARY(uint, N, asuint, x);
@@ -1124,6 +1166,7 @@ vector<uint, N> asuint(vector<int, N> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<uint,N,M> asuint(matrix<float,N,M> x)
 {
     MATRIX_MAP_UNARY(uint, N, M, asuint, x);
@@ -1131,22 +1174,26 @@ matrix<uint,N,M> asuint(matrix<float,N,M> x)
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<uint, N, M> asuint(matrix<int, N, M> x)
 {
     MATRIX_MAP_UNARY(uint, N, M, asuint, x);
 }
 
 [__unsafeForceInlineEarly]
+[__readNone]
 uint asuint(uint x)
 { return x; }
 
 __generic<let N : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 vector<uint,N> asuint(vector<uint,N> x)
 { return x; }
 
 __generic<let N : int, let M : int>
 [__unsafeForceInlineEarly]
+[__readNone]
 matrix<uint,N,M> asuint(matrix<uint,N,M> x)
 { return x; }
 
@@ -1159,38 +1206,41 @@ matrix<uint,N,M> asuint(matrix<uint,N,M> x)
 
 // Identity cases:
 
-[__unsafeForceInlineEarly] float16_t asfloat16(float16_t value) { return value; }
-[__unsafeForceInlineEarly] vector<float16_t,N> asfloat16<let N : int>(vector<float16_t,N> value) { return value; }
-[__unsafeForceInlineEarly] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] float16_t asfloat16(float16_t value) { return value; }
+[__unsafeForceInlineEarly][__readNone] vector<float16_t,N> asfloat16<let N : int>(vector<float16_t,N> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return value; }
 
-[__unsafeForceInlineEarly] int16_t asint16(int16_t value) { return value; }
-[__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<int16_t,N> value) { return value; }
-[__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] int16_t asint16(int16_t value) { return value; }
+[__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<int16_t,N> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
 
-[__unsafeForceInlineEarly] uint16_t asuint16(uint16_t value) { return value; }
-[__unsafeForceInlineEarly] vector<uint16_t,N> asuint16<let N : int>(vector<uint16_t,N> value) { return value; }
-[__unsafeForceInlineEarly] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(uint16_t value) { return value; }
+[__unsafeForceInlineEarly][__readNone] vector<uint16_t,N> asuint16<let N : int>(vector<uint16_t,N> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
 
 // Signed<->unsigned cases:
 
-[__unsafeForceInlineEarly] int16_t asint16(uint16_t value) { return value; }
-[__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<uint16_t,N> value) { return value; }
-[__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] int16_t asint16(uint16_t value) { return value; }
+[__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<uint16_t,N> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }
 
-[__unsafeForceInlineEarly] uint16_t asuint16(int16_t value) { return value; }
-[__unsafeForceInlineEarly] vector<uint16_t,N> asuint16<let N : int>(vector<int16_t,N> value) { return value; }
-[__unsafeForceInlineEarly] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(int16_t value) { return value; }
+[__unsafeForceInlineEarly][__readNone] vector<uint16_t,N> asuint16<let N : int>(vector<int16_t,N> value) { return value; }
+[__unsafeForceInlineEarly][__readNone] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }
 
 // Float->unsigned cases:
 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "uint16_t(packHalf2x16(vec2($0, 0.0)))")
 __target_intrinsic(cuda, "__half_as_ushort")
+[__readNone]
 uint16_t asuint16(float16_t value);
 
+[__readNone]
 vector<uint16_t,N> asuint16<let N : int>(vector<float16_t,N> value)
 { VECTOR_MAP_UNARY(uint16_t, N, asuint16, value); }
 
+[__readNone]
 matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<float16_t,R,C> value)
 { MATRIX_MAP_UNARY(uint16_t, R, C, asuint16, value); }
 
@@ -1199,11 +1249,14 @@ matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<float16_t,R,C> va
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "float16_t(unpackHalf2x16($0).x)")
 __target_intrinsic(cuda, "__ushort_as_half")
+[__readNone]
 float16_t asfloat16(uint16_t value);
 
+[__readNone]
 vector<float16_t,N> asfloat16<let N : int>(vector<uint16_t,N> value)
 { VECTOR_MAP_UNARY(float16_t, N, asfloat16, value); }
 
+[__readNone]
 matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<uint16_t,R,C> value)
 { MATRIX_MAP_UNARY(float16_t, R, C, asfloat16, value); }
 
@@ -1211,16 +1264,17 @@ matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<uint16_t,R,C> v
 
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "__half_as_short")
-[__unsafeForceInlineEarly] int16_t asint16(float16_t value) { return asuint16(value); }
-__target_intrinsic(hlsl) [__unsafeForceInlineEarly] vector<int16_t,N> asint16<let N : int>(vector<float16_t,N> value) { return asuint16(value); }
-__target_intrinsic(hlsl) [__unsafeForceInlineEarly] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return asuint16(value); }
+[__unsafeForceInlineEarly][__readNone] int16_t asint16(float16_t value) { return asuint16(value); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<float16_t,N> value) { return asuint16(value); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return asuint16(value); }
 
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "__short_as_half")
+[__readNone]
 [__unsafeForceInlineEarly] float16_t asfloat16(int16_t value) { return asfloat16(asuint16(value)); }
 
-__target_intrinsic(hlsl) [__unsafeForceInlineEarly] vector<float16_t,N> asfloat16<let N : int>(vector<int16_t,N> value) { return asfloat16(asuint16(value)); }
-__target_intrinsic(hlsl) [__unsafeForceInlineEarly] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return asfloat16(asuint16(value)); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] vector<float16_t,N> asfloat16<let N : int>(vector<int16_t,N> value) { return asfloat16(asuint16(value)); }
+__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return asfloat16(asuint16(value)); }
 
 // Inverse tangent (HLSL SM 1.0)
 __generic<T : __BuiltinFloatingPointType>
@@ -1229,12 +1283,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_atan($0)")
 __target_intrinsic(cpp, "$P_atan($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan _0")
+[__readNone]
 T atan(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan _0")
+[__readNone]
 vector<T, N> atan(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, atan, x);
@@ -1242,6 +1298,7 @@ vector<T, N> atan(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> atan(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, atan, x);
@@ -1253,12 +1310,14 @@ __target_intrinsic(glsl,"atan($0,$1)")
 __target_intrinsic(cuda, "$P_atan2($0, $1)")
 __target_intrinsic(cpp, "$P_atan2($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan2 _0 _1")
+[__readNone]
 T atan2(T y, T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"atan($0,$1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan2 _0 _1")
+[__readNone]
 vector<T, N> atan2(vector<T, N> y, vector<T, N> x)
 {
     VECTOR_MAP_BINARY(T, N, atan2, y, x);
@@ -1266,6 +1325,7 @@ vector<T, N> atan2(vector<T, N> y, vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> atan2(matrix<T,N,M> y, matrix<T,N,M> x)
 {
     MATRIX_MAP_BINARY(T, N, M, atan2, y, x);
@@ -1278,12 +1338,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_ceil($0)")
 __target_intrinsic(cpp, "$P_ceil($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ceil _0")
+[__readNone]
 T ceil(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ceil _0")
+[__readNone]
 vector<T, N> ceil(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ceil, x);
@@ -1291,6 +1353,7 @@ vector<T, N> ceil(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ceil(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ceil, x);
@@ -1305,6 +1368,7 @@ __generic<T : __BuiltinIntegerType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
+[__readNone]
 T clamp(T x, T minBound, T maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1314,6 +1378,7 @@ __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
+[__readNone]
 vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1321,6 +1386,7 @@ vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
 
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> clamp(matrix<T,N,M> x, matrix<T,N,M> minBound, matrix<T,N,M> maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1330,6 +1396,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
+[__readNone]
 T clamp(T x, T minBound, T maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1339,6 +1406,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
+[__readNone]
 vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1346,6 +1414,7 @@ vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> clamp(matrix<T,N,M> x, matrix<T,N,M> minBound, matrix<T,N,M> maxBound)
 {
     return min(max(x, minBound), maxBound);
@@ -1354,6 +1423,7 @@ matrix<T,N,M> clamp(matrix<T,N,M> x, matrix<T,N,M> minBound, matrix<T,N,M> maxBo
 // Clip (discard) fragment conditionally
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
+[__readNone]
 void clip(T x)
 {
     if(x < T(0)) discard;
@@ -1361,6 +1431,7 @@ void clip(T x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 void clip(vector<T,N> x)
 {
     if(any(x < T(0))) discard;
@@ -1368,6 +1439,7 @@ void clip(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 void clip(matrix<T,N,M> x)
 {
     if(any(x < T(0))) discard;
@@ -1380,12 +1452,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_cos($0)")
 __target_intrinsic(cpp, "$P_cos($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cos _0")
+[__readNone]
 T cos(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cos _0")
+[__readNone]
 vector<T, N> cos(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T,N, cos, x);
@@ -1393,6 +1467,7 @@ vector<T, N> cos(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> cos(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, cos, x);
@@ -1405,12 +1480,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_cosh($0)")
 __target_intrinsic(cpp, "$P_cosh($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cosh _0")
+[__readNone]
 T cosh(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cosh _0")
+[__readNone]
 vector<T,N> cosh(vector<T,N> x)
 {
     VECTOR_MAP_UNARY(T,N, cosh, x);
@@ -1418,6 +1495,7 @@ vector<T,N> cosh(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> cosh(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, cosh, x);
@@ -1428,6 +1506,7 @@ __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "bitCount")
 __target_intrinsic(cuda, "$P_countbits($0)")
 __target_intrinsic(cpp, "$P_countbits($0)")
+[__readNone]
 uint countbits(uint value);
 
 // Cross product
@@ -1436,6 +1515,7 @@ __generic<T : __BuiltinArithmeticType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cross _0 _1")
+[__readNone]
 vector<T,3> cross(vector<T,3> left, vector<T,3> right)
 {
     return vector<T,3>(
@@ -1446,6 +1526,7 @@ vector<T,3> cross(vector<T,3> left, vector<T,3> right)
 
 // Convert encoded color
 __target_intrinsic(hlsl)
+[__readNone]
 int4 D3DCOLORtoUBYTE4(float4 color)
 {
     let scaled = color.zyxw * 255.001999f;
@@ -1455,11 +1536,13 @@ int4 D3DCOLORtoUBYTE4(float4 color)
 // Partial-difference derivatives
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(glsl, dFdx)
+[__readNone]
 T ddx(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, dFdx)
+[__readNone]
 vector<T, N> ddx(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddx, x);
@@ -1467,6 +1550,7 @@ vector<T, N> ddx(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddx(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddx, x);
@@ -1476,12 +1560,14 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdxCoarse)
+[__readNone]
 T ddx_coarse(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdxCoarse)
+[__readNone]
 vector<T, N> ddx_coarse(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddx_coarse, x);
@@ -1489,6 +1575,7 @@ vector<T, N> ddx_coarse(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddx_coarse(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddx_coarse, x);
@@ -1498,12 +1585,14 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdxFine)
+[__readNone]
 T ddx_fine(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdxFine)
+[__readNone]
 vector<T, N> ddx_fine(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddx_fine, x);
@@ -1511,6 +1600,7 @@ vector<T, N> ddx_fine(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddx_fine(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddx_fine, x);
@@ -1519,11 +1609,13 @@ matrix<T, N, M> ddx_fine(matrix<T, N, M> x)
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, dFdy)
+[__readNone]
 T ddy(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, dFdy)
+[__readNone]
 vector<T, N> ddy(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddy, x);
@@ -1531,6 +1623,7 @@ vector<T, N> ddy(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddy(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddy, x);
@@ -1539,12 +1632,14 @@ matrix<T, N, M> ddy(matrix<T, N, M> x)
 __generic<T : __BuiltinFloatingPointType>
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdyCoarse)
+[__readNone]
 T ddy_coarse(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdyCoarse)
+[__readNone]
 vector<T, N> ddy_coarse(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddy_coarse, x);
@@ -1552,6 +1647,7 @@ vector<T, N> ddy_coarse(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddy_coarse(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddy_coarse, x);
@@ -1561,12 +1657,14 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdyFine)
+[__readNone]
 T ddy_fine(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __glsl_extension(GL_ARB_derivative_control)
 __target_intrinsic(glsl, dFdyFine)
+[__readNone]
 vector<T, N> ddy_fine(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, ddy_fine, x);
@@ -1574,6 +1672,7 @@ vector<T, N> ddy_fine(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ddy_fine(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, ddy_fine, x);
@@ -1586,6 +1685,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Degrees _0")
+[__readNone]
 T degrees(T x)
 {
     return x * (T(180) / T.getPi());
@@ -1595,6 +1695,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Degrees _0")
+[__readNone]
 vector<T, N> degrees(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, degrees, x);
@@ -1602,6 +1703,7 @@ vector<T, N> degrees(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> degrees(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, degrees, x);
@@ -1613,6 +1715,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Determinant _0")
+[__readNone]
 T determinant(matrix<T,N,N> m);
 
 // Barrier for device memory
@@ -1630,6 +1733,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Distance _0 _1")
+[__readNone]
 T distance(vector<T, N> x, vector<T, N> y)
 {
     return length(x - y);
@@ -1640,6 +1744,7 @@ T distance(vector<T, N> x, vector<T, N> y)
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 T dot(vector<T, N> x, vector<T, N> y)
 {
     T result = T(0);
@@ -1650,6 +1755,7 @@ T dot(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 T dot(vector<T, N> x, vector<T, N> y)
 {
     T result = T(0);
@@ -1682,15 +1788,18 @@ RWStructuredBuffer<T> __getEquivalentStructuredBuffer<T>(RWByteAddressBuffer b);
 // EvaluateAttributeAtCentroid, scalar overload (declaration only).
 // Target mappings visible here: GLSL `interpolateAtCentroid`, and the
 // GLSL.std.450 `InterpolateAtCentroid` extended instruction for direct SPIR-V.
 // NOTE(review): [__readNone] is added by this patch; presumably it marks the
 // function as side-effect-free - confirm.
 __generic<T : __BuiltinArithmeticType>
 __target_intrinsic(glsl, interpolateAtCentroid)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtCentroid _0")
+[__readNone]
 T EvaluateAttributeAtCentroid(T x);
 
 // Vector overload (declaration only); same target mappings as the scalar form.
 __generic<T : __BuiltinArithmeticType, let N : int>
 __target_intrinsic(glsl, interpolateAtCentroid)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtCentroid _0")
+[__readNone]
 vector<T,N> EvaluateAttributeAtCentroid(vector<T,N> x);
 
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
 __target_intrinsic(glsl, interpolateAtCentroid)
+[__readNone]
 matrix<T,N,M> EvaluateAttributeAtCentroid(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, EvaluateAttributeAtCentroid, x);
@@ -1699,15 +1808,18 @@ matrix<T,N,M> EvaluateAttributeAtCentroid(matrix<T,N,M> x)
 // EvaluateAttributeAtSample, scalar overload (declaration only).
 // For GLSL the call is rewritten to `interpolateAtSample`, with the unsigned
 // `sampleindex` argument cast to `int`; for direct SPIR-V it uses the
 // GLSL.std.450 `InterpolateAtSample` extended instruction.
 // NOTE(review): [__readNone] is added by this patch; presumably it marks the
 // function as side-effect-free - confirm.
 __generic<T : __BuiltinArithmeticType>
 __target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtSample _0 _1")
+[__readNone]
 T EvaluateAttributeAtSample(T x, uint sampleindex);
 
 // Vector overload (declaration only); same target mappings as the scalar form.
 __generic<T : __BuiltinArithmeticType, let N : int>
 __target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtSample _0 _1")
+[__readNone]
 vector<T,N> EvaluateAttributeAtSample(vector<T,N> x, uint sampleindex);
 
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
 __target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
+[__readNone]
 matrix<T,N,M> EvaluateAttributeAtSample(matrix<T,N,M> x, uint sampleindex)
 {
     matrix<T,N,M> result;
@@ -1721,15 +1833,18 @@ matrix<T,N,M> EvaluateAttributeAtSample(matrix<T,N,M> x, uint sampleindex)
 // EvaluateAttributeSnapped, scalar overload (declaration only).
 // For GLSL the call is rewritten to `interpolateAtOffset`, with the integer
 // `offset` converted to vec2 and divided by 16. The direct SPIR-V template
 // performs the same convert-and-divide before the extended instruction.
 // NOTE(review): [__readNone] is added by this patch; presumably it marks the
 // function as side-effect-free - confirm.
 __generic<T : __BuiltinArithmeticType>
 __target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
 __target_intrinsic(spirv_direct, "%foffset = OpConvertSToF _type(float2) resultId _1; %offsetdiv16 = 136 _type(float2) resultId %foffset const(float2, 16.0, 16.0); OpExtInst resultType resultId glsl450 78 _0 %offsetdiv16")
+[__readNone]
 T EvaluateAttributeSnapped(T x, int2 offset);
 
 // Vector overload (declaration only); same target mappings as the scalar form.
 __generic<T : __BuiltinArithmeticType, let N : int>
 __target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
 __target_intrinsic(spirv_direct, "%foffset = OpConvertSToF _type(float2) resultId _1; %offsetdiv16 = 136 _type(float2) resultId %foffset const(float2, 16.0, 16.0); OpExtInst resultType resultId glsl450 78 _0 %offsetdiv16")
+[__readNone]
 vector<T,N> EvaluateAttributeSnapped(vector<T,N> x, int2 offset);
 
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
 __target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
+[__readNone]
 matrix<T,N,M> EvaluateAttributeSnapped(matrix<T,N,M> x, int2 offset)
 {
     matrix<T,N,M> result;
@@ -1748,12 +1863,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_exp($0)")
 __target_intrinsic(cpp, "$P_exp($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp _0")
+[__readNone]
 T exp(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp _0")
+[__readNone]
 vector<T, N> exp(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, exp, x);
@@ -1761,6 +1878,7 @@ vector<T, N> exp(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> exp(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, exp, x);
@@ -1774,12 +1892,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_exp2($0)")
 __target_intrinsic(cpp, "$P_exp2($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp2 _0")
+[__readNone]
 T exp2(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp2 _0")
+[__readNone]
 vector<T,N> exp2(vector<T,N> x)
 {
     VECTOR_MAP_UNARY(T, N, exp2, x);
@@ -1787,6 +1907,7 @@ vector<T,N> exp2(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> exp2(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, exp2, x);
@@ -1799,10 +1920,12 @@ __glsl_version(420)
 __target_intrinsic(hlsl)
 __cuda_sm_version(6.0)
 __target_intrinsic(cuda, "__half2float(__ushort_as_half($0))")
+[__readNone]
 float f16tof32(uint value);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<float, N> f16tof32(vector<uint, N> value)
 {
     VECTOR_MAP_UNARY(float, N, f16tof32, value);
@@ -1816,10 +1939,12 @@ __glsl_version(420)
 __target_intrinsic(hlsl)
 __cuda_sm_version(6.0)
 __target_intrinsic(cuda, "__half_as_ushort(__float2half($0))")
+[__readNone]
 uint f32tof16(float value);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<uint, N> f32tof16(vector<float, N> value)
 {
     VECTOR_MAP_UNARY(uint, N, f32tof16, value);
@@ -1833,11 +1958,13 @@ vector<uint, N> f32tof16(vector<float, N> value)
 __target_intrinsic(glsl, "unpackHalf2x16($0).x")
 __target_intrinsic(cuda, "__half2float")
 __glsl_version(420)
+[__readNone]
 float f16tof32(float16_t value);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "__half2float")
+[__readNone]
 vector<float, N> f16tof32(vector<float16_t, N> value)
 {
     VECTOR_MAP_UNARY(float, N, f16tof32, value);
@@ -1847,10 +1974,12 @@ vector<float, N> f16tof32(vector<float16_t, N> value)
 __target_intrinsic(glsl, "packHalf2x16(vec2($0,0.0))")
 __glsl_version(420)
 __target_intrinsic(cuda, "__float2half")
+[__readNone]
 float16_t f32tof16_(float value);
 
 __generic<let N : int>
 __target_intrinsic(cuda, "__float2half")
+[__readNone]
 vector<float16_t, N> f32tof16_(vector<float, N> value)
 {
     VECTOR_MAP_UNARY(uint, N, f32tof16, value);
@@ -1862,6 +1991,7 @@ vector<float16_t, N> f32tof16_(vector<float, N> value)
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 vector<T,N> faceforward(vector<T,N> n, vector<T,N> i, vector<T,N> ng)
 {
     return dot(ng, i) < T(0.0f) ? n : -n;
@@ -1873,12 +2003,14 @@ __target_intrinsic(glsl,"findMSB")
 __target_intrinsic(cuda, "$P_firstbithigh($0)")
 __target_intrinsic(cpp, "$P_firstbithigh($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindSMsb _0")
+[__readNone]
 int firstbithigh(int value);
 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findMSB")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindSMsb _0")
 __generic<let N : int>
+[__readNone]
 vector<int, N> firstbithigh(vector<int, N> value)
 {
     VECTOR_MAP_UNARY(int, N, firstbithigh, value);
@@ -1889,12 +2021,14 @@ __target_intrinsic(glsl,"findMSB")
 __target_intrinsic(cuda, "$P_firstbithigh($0)")
 __target_intrinsic(cpp, "$P_firstbithigh($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindUMsb _0")
+[__readNone]
 uint firstbithigh(uint value);
 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findMSB")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindUMsb _0")
 __generic<let N : int>
+[__readNone]
 vector<uint,N> firstbithigh(vector<uint,N> value)
 {
     VECTOR_MAP_UNARY(uint, N, firstbithigh, value);
@@ -1906,12 +2040,14 @@ __target_intrinsic(glsl,"findLSB")
 __target_intrinsic(cuda, "$P_firstbitlow($0)")
 __target_intrinsic(cpp, "$P_firstbitlow($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
+[__readNone]
 int firstbitlow(int value);
 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findLSB")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
 __generic<let N : int>
+[__readNone]
 vector<int,N> firstbitlow(vector<int,N> value)
 {
     VECTOR_MAP_UNARY(int, N, firstbitlow, value);
@@ -1922,12 +2058,14 @@ __target_intrinsic(glsl,"findLSB")
 __target_intrinsic(cuda, "$P_firstbitlow($0)")
 __target_intrinsic(cpp, "$P_firstbitlow($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
+[__readNone]
 uint firstbitlow(uint value);
 
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl,"findLSB")
 __generic<let N : int>
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
+[__readNone]
 vector<uint,N> firstbitlow(vector<uint,N> value)
 {
     VECTOR_MAP_UNARY(uint, N, firstbitlow, value);
@@ -1941,12 +2079,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_floor($0)")
 __target_intrinsic(cpp, "$P_floor($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Floor _0")
+[__readNone]
 T floor(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Floor _0")
+[__readNone]
 vector<T, N> floor(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, floor, x);
@@ -1954,6 +2094,7 @@ vector<T, N> floor(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> floor(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, floor, x);
@@ -1965,12 +2106,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_fma($0, $1, $2)")
 __target_intrinsic(cpp, "$P_fma($0, $1, $2)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
+[__readNone]
 double fma(double a, double b, double c);
 
 __generic<let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
+[__readNone]
 vector<double, N> fma(vector<double, N> a, vector<double, N> b, vector<double, N> c)
 {
     VECTOR_MAP_TRINARY(double, N, fma, a, b, c);
@@ -1978,6 +2121,7 @@ vector<double, N> fma(vector<double, N> a, vector<double, N> b, vector<double, N
 
 __generic<let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<double, N, M> fma(matrix<double, N, M> a, matrix<double, N, M> b, matrix<double, N, M> c)
 {
     MATRIX_MAP_TRINARY(double, N, M, fma, a, b, c);
@@ -1988,6 +2132,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "$P_fmod($0, $1)")
 __target_intrinsic(cpp, "$P_fmod($0, $1)")
+[__readNone]
 T fmod(T x, T y)
 {
     return x - y * trunc(x/y);
@@ -1995,6 +2140,7 @@ T fmod(T x, T y)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<T, N> fmod(vector<T, N> x, vector<T, N> y)
 {
     VECTOR_MAP_BINARY(T, N, fmod, x, y);
@@ -2002,6 +2148,7 @@ vector<T, N> fmod(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> fmod(matrix<T, N, M> x, matrix<T, N, M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, fmod, x, y);
@@ -2014,18 +2161,21 @@ __target_intrinsic(glsl, fract)
 __target_intrinsic(cuda, "$P_frac($0)")
 __target_intrinsic(cpp, "$P_frac($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fract _0")
+[__readNone]
 T frac(T x);
 
 // Vector overload of frac for built-in floating-point element types.
 // HLSL uses its own intrinsic, GLSL maps to `fract`, and direct SPIR-V uses
 // the GLSL.std.450 `Fract` extended instruction; the body below is the
 // fallback definition for targets without a mapping.
 // NOTE(review): [__readNone] is added by this patch; presumably it marks the
 // function as side-effect-free - confirm.
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, fract)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fract _0")
+[__readNone]
 vector<T, N> frac(vector<T, N> x)
 {
     // Presumably expands to a per-component application of scalar frac
     // (macro defined elsewhere in the file) - confirm.
     VECTOR_MAP_UNARY(T, N, frac, x);
 }
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
+[__readNone]
 matrix<T, N, M> frac(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, frac, x);
@@ -2036,12 +2186,14 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Frexp _0 _1")
+[__readNone]
 T frexp(T x, out T exp);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Frexp _0 _1")
+[__readNone]
 vector<T, N> frexp(vector<T, N> x, out vector<T, N> exp)
 {
     VECTOR_MAP_BINARY(T, N, frexp, x, exp);
@@ -2049,6 +2201,7 @@ vector<T, N> frexp(vector<T, N> x, out vector<T, N> exp)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> frexp(matrix<T, N, M> x, out matrix<T, N, M> exp)
 {
     MATRIX_MAP_BINARY(T, N, M, frexp, x, exp);
@@ -2056,11 +2209,13 @@ matrix<T, N, M> frexp(matrix<T, N, M> x, out matrix<T, N, M> exp)
 
 // Texture filter width
 // Scalar overload, declaration only. No __target_intrinsic attribute is
 // attached to this overload in the lines shown.
 // NOTE(review): [__readNone] is added by this patch; presumably it marks the
 // function as side-effect-free - confirm.
 __generic<T : __BuiltinFloatingPointType>
+[__readNone]
 T fwidth(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 vector<T, N> fwidth(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, fwidth, x);
@@ -2068,6 +2223,7 @@ vector<T, N> fwidth(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> fwidth(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, fwidth, x);
@@ -2141,9 +2297,11 @@ matrix<T,N,M> GetAttributeAtVertex(matrix<T,N,M> attribute, uint vertexIndex);
 
 
 // Get number of samples in render target
 // Declaration only; takes no arguments and returns the count as `uint`.
 // NOTE(review): [__readNone] is added by this patch; presumably it marks the
 // function as side-effect-free - confirm that is intended for a query of
 // pipeline state.
+[__readNone]
 uint GetRenderTargetSampleCount();
 
 // Get position of given sample
 // Declaration only; takes the sample index as `int` and returns a `float2`.
 // NOTE(review): [__readNone] is added by this patch; presumably it marks the
 // function as side-effect-free - confirm.
+[__readNone]
 float2 GetRenderTargetSamplePosition(int Index);
 
 // Group memory barrier
@@ -2284,6 +2442,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "$P_isfinite($0)")
 __target_intrinsic(cpp, "$P_isfinite($0)")
+[__readNone]
 bool isfinite(T x)
 {
     return !(isinf(x) || isnan(x));
@@ -2291,6 +2450,7 @@ bool isfinite(T x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<bool, N> isfinite(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(bool, N, isfinite, x);
@@ -2298,6 +2458,7 @@ vector<bool, N> isfinite(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<bool, N, M> isfinite(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(bool, N, M, isfinite, x);
@@ -2309,11 +2470,13 @@ __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_isinf($0)")
 __target_intrinsic(cpp, "$P_isinf($0)")
+[__readNone]
 bool isinf(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 vector<bool, N> isinf(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(bool, N, isinf, x);
@@ -2321,6 +2484,7 @@ vector<bool, N> isinf(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<bool, N, M> isinf(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(bool, N, M, isinf, x);
@@ -2332,11 +2496,13 @@ __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_isnan($0)")
 __target_intrinsic(cpp, "$P_isnan($0)")
+[__readNone]
 bool isnan(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 vector<bool, N> isnan(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(bool, N, isnan, x);
@@ -2344,6 +2510,7 @@ vector<bool, N> isnan(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<bool, N, M> isnan(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(bool, N, M, isnan, x);
@@ -2354,6 +2521,7 @@ matrix<bool, N, M> isnan(matrix<T, N, M> x)
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ldexp _0 _1")
+[__readNone]
 T ldexp(T x, T exp)
 {
     return x * exp2(exp);
@@ -2362,6 +2530,7 @@ T ldexp(T x, T exp)
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ldexp _0 _1")
+[__readNone]
 vector<T, N> ldexp(vector<T, N> x, vector<T, N> exp)
 {
     return x * exp2(exp);
@@ -2369,6 +2538,7 @@ vector<T, N> ldexp(vector<T, N> x, vector<T, N> exp)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> ldexp(matrix<T, N, M> x, matrix<T, N, M> exp)
 {
     MATRIX_MAP_BINARY(T, N, M, ldexp, x, exp);
@@ -2379,6 +2549,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Length _0")
+[__readNone]
 T length(vector<T, N> x)
 {
     return sqrt(dot(x, x));
@@ -2389,6 +2560,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, mix)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FMix _0 _1 _2")
+[__readNone]
 T lerp(T x, T y, T s)
 {
     return x * (T(1.0f) - s) + y * s;
@@ -2398,6 +2570,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, mix)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FMix _0 _1 _2")
+[__readNone]
 vector<T, N> lerp(vector<T, N> x, vector<T, N> y, vector<T, N> s)
 {
     return x * (T(1.0f) - s) + y * s;
@@ -2405,6 +2578,7 @@ vector<T, N> lerp(vector<T, N> x, vector<T, N> y, vector<T, N> s)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> lerp(matrix<T,N,M> x, matrix<T,N,M> y, matrix<T,N,M> s)
 {
     MATRIX_MAP_TRINARY(T, N, M, lerp, x, y, s);
@@ -2412,6 +2586,7 @@ matrix<T,N,M> lerp(matrix<T,N,M> x, matrix<T,N,M> y, matrix<T,N,M> s)
 
 // Legacy lighting function (obsolete)
 __target_intrinsic(hlsl)
+[__readNone]
 float4 lit(float n_dot_l, float n_dot_h, float m)
 {
     let ambient = 1.0f;
@@ -2427,12 +2602,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_log($0)")
 __target_intrinsic(cpp, "$P_log($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log _0")
+[__readNone]
 T log(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log _0")
+[__readNone]
 vector<T, N> log(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, log, x);
@@ -2440,6 +2617,7 @@ vector<T, N> log(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> log(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, log, x);
@@ -2452,12 +2630,14 @@ __target_intrinsic(glsl, "(log( $0 ) * $S0( 0.43429448190325182765112891891661)
 __target_intrinsic(cuda, "$P_log10($0)")
 __target_intrinsic(cpp, "$P_log10($0)")
 __target_intrinsic(spirv_direct, "%baseElog = OpExtInst resultType resultId glsl450 Log _0; OpFMul resultType resultId _0 %baseElog const(_p,0.43429448190325182765112891891661)")
+[__readNone]
 T log10(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "(log( $0 ) * $S0(0.43429448190325182765112891891661) )" )
 __target_intrinsic(spirv_direct, "%baseElog = OpExtInst resultType resultId glsl450 Log _0; OpVectorTimesScalar resultType resultId _0 %baseElog const(_p,0.43429448190325182765112891891661)")
+[__readNone]
 vector<T,N> log10(vector<T,N> x)
 {
     VECTOR_MAP_UNARY(T, N, log10, x);
@@ -2465,6 +2645,7 @@ vector<T,N> log10(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> log10(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, log10, x);
@@ -2477,12 +2658,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_log2($0)")
 __target_intrinsic(cpp, "$P_log2($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log2 _0")
+[__readNone]
 T log2(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log2 _0")
+[__readNone]
 vector<T,N> log2(vector<T,N> x)
 {
     VECTOR_MAP_UNARY(T, N, log2, x);
@@ -2490,6 +2673,7 @@ vector<T,N> log2(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> log2(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, log2, x);
@@ -2503,12 +2687,14 @@ __target_intrinsic(glsl, fma)
 __target_intrinsic(cuda, "$P_fma($0, $1, $2)")
 __target_intrinsic(cpp, "$P_fma($0, $1, $2)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
+[__readNone]
 T mad(T mvalue, T avalue, T bvalue);
 
 __generic<T : __BuiltinArithmeticType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, fma)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
+[__readNone]
 vector<T, N> mad(vector<T, N> mvalue, vector<T, N> avalue, vector<T, N> bvalue)
 {
     VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue);
@@ -2516,6 +2702,7 @@ vector<T, N> mad(vector<T, N> mvalue, vector<T, N> avalue, vector<T, N> bvalue)
 
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> mad(matrix<T, N, M> mvalue, matrix<T, N, M> avalue, matrix<T, N, M> bvalue)
 {
     MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue);
@@ -2528,6 +2715,7 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_max($0, $1)")
 __target_intrinsic(cpp, "$P_max($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0")
+[__readNone]
 T max(T x, T y);
 // Note: a stdlib implementation of `max` (or `min`) will require splitting
 // floating-point and integer cases apart, because the floating-point
@@ -2538,6 +2726,7 @@ __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0")
+[__readNone]
 vector<T, N> max(vector<T, N> x, vector<T, N> y)
 {
     VECTOR_MAP_BINARY(T, N, max, x, y);
@@ -2545,6 +2734,7 @@ vector<T, N> max(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> max(matrix<T, N, M> x, matrix<T, N, M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, max, x, y);
@@ -2556,12 +2746,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_max($0, $1)")
 __target_intrinsic(cpp, "$P_max($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0")
+[__readNone]
 T max(T x, T y);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0")
+[__readNone]
 vector<T, N> max(vector<T, N> x, vector<T, N> y)
 {
     VECTOR_MAP_BINARY(T, N, max, x, y);
@@ -2569,6 +2761,7 @@ vector<T, N> max(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> max(matrix<T, N, M> x, matrix<T, N, M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, max, x, y);
@@ -2581,12 +2774,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_min($0, $1)")
 __target_intrinsic(cpp, "$P_min($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0")
+[__readNone]
 T min(T x, T y);
 
 __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0")
+[__readNone]
 vector<T,N> min(vector<T,N> x, vector<T,N> y)
 {
     VECTOR_MAP_BINARY(T, N, min, x, y);
@@ -2594,6 +2789,7 @@ vector<T,N> min(vector<T,N> x, vector<T,N> y)
 
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> min(matrix<T,N,M> x, matrix<T,N,M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, min, x, y);
@@ -2605,12 +2801,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_min($0, $1)")
 __target_intrinsic(cpp, "$P_min($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0")
+[__readNone]
 T min(T x, T y);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0")
+[__readNone]
 vector<T,N> min(vector<T,N> x, vector<T,N> y)
 {
     VECTOR_MAP_BINARY(T, N, min, x, y);
@@ -2618,6 +2816,7 @@ vector<T,N> min(vector<T,N> x, vector<T,N> y)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> min(matrix<T,N,M> x, matrix<T,N,M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, min, x, y);
@@ -2625,11 +2824,13 @@ matrix<T,N,M> min(matrix<T,N,M> x, matrix<T,N,M> y)
 
 // split into integer and fractional parts (both with same sign)
 // Scalar overload, declaration only. One part is returned and the other is
 // written through the `out` parameter `ip` (presumably the integer part, as
 // in HLSL modf - confirm).
 // NOTE(review): this function writes through an `out` parameter yet is
 // marked [__readNone] by this patch; confirm the attribute's semantics allow
 // that, so calls whose only used result is `ip` are not eliminated.
 __generic<T : __BuiltinFloatingPointType>
+[__readNone]
 T modf(T x, out T ip);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 vector<T,N> modf(vector<T,N> x, out vector<T,N> ip)
 {
     VECTOR_MAP_BINARY(T, N, modf, x, ip);
@@ -2637,6 +2838,7 @@ vector<T,N> modf(vector<T,N> x, out vector<T,N> ip)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> modf(matrix<T,N,M> x, out matrix<T,N,M> ip)
 {
     MATRIX_MAP_BINARY(T, N, M, modf, x, ip);
@@ -2644,6 +2846,7 @@ matrix<T,N,M> modf(matrix<T,N,M> x, out matrix<T,N,M> ip)
 
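 // msad4: masked sum of absolute differences between the bytes of a 4-byte reference and byte-aligned windows of an 8-byte source, accumulated into four sums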
 __target_intrinsic(hlsl)
+[__readNone]
 uint4 msad4(uint reference, uint2 source, uint4 accum)
 {
     int4 bytesRef = (reference >> uint4(24, 16, 8, 0)) & 0xFF;
@@ -2665,36 +2868,43 @@ uint4 msad4(uint reference, uint2 source, uint4 accum)
 // The overloads in this group are declarations bound directly to the IR
 // multiply op via __intrinsic_op($(kIROp_Mul)); none has a body.
 // NOTE(review): [__readNone] is added to each by this patch; presumably it
 // marks the function as side-effect-free - confirm.
 // scalar-scalar
 __generic<T : __BuiltinArithmeticType>
 __intrinsic_op($(kIROp_Mul))
+[__readNone]
 T mul(T x, T y);
 
 // scalar-vector and vector-scalar
 __generic<T : __BuiltinArithmeticType, let N : int>
 __intrinsic_op($(kIROp_Mul))
+[__readNone]
 vector<T, N> mul(vector<T, N> x, T y);
 
 __generic<T : __BuiltinArithmeticType, let N : int>
 __intrinsic_op($(kIROp_Mul))
+[__readNone]
 vector<T, N> mul(T x, vector<T, N> y);
 
 // scalar-matrix and matrix-scalar
 __generic<T : __BuiltinArithmeticType, let N : int, let M :int>
 __intrinsic_op($(kIROp_Mul))
+[__readNone]
 matrix<T, N, M> mul(matrix<T, N, M> x, T y);
 
 __generic<T : __BuiltinArithmeticType, let N : int, let M :int>
 __intrinsic_op($(kIROp_Mul))
+[__readNone]
 matrix<T, N, M> mul(T x, matrix<T, N, M> y);
 
 // vector-vector (dot product)
 // Floating-point overload: HLSL uses its own `mul`, GLSL is emitted as
 // `dot`, and other targets use the body below, which forwards to dot(x, y).
 // NOTE(review): [__readNone] is added by this patch; presumably it marks the
 // function as side-effect-free - confirm.
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "dot")
+[__readNone]
 T mul(vector<T, N> x, vector<T, N> y)
 {
     return dot(x, y);
 }
 __generic<T : __BuiltinIntegerType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 T mul(vector<T, N> x, vector<T, N> y)
 {
     return dot(x, y);
@@ -2704,6 +2914,7 @@ T mul(vector<T, N> x, vector<T, N> y)
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 {
     vector<T,M> result;
@@ -2721,6 +2932,7 @@ vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 {
     vector<T,M> result;
@@ -2738,6 +2950,7 @@ vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 __generic<T : __BuiltinLogicalType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 {
     vector<T,M> result;
@@ -2757,6 +2970,7 @@ vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 {
     vector<T,N> result;
@@ -2774,6 +2988,7 @@ vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 {
     vector<T,N> result;
@@ -2791,6 +3006,7 @@ vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 __generic<T : __BuiltinLogicalType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 {
     vector<T,N> result;
@@ -2810,6 +3026,7 @@ vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
 __generic<T : __BuiltinFloatingPointType, let R : int, let N : int, let C : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 {
     matrix<T,R,C> result;
@@ -2828,6 +3045,7 @@ matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 __generic<T : __BuiltinIntegerType, let R : int, let N : int, let C : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 {
     matrix<T,R,C> result;
@@ -2846,6 +3064,7 @@ matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 __generic<T : __BuiltinLogicalType, let R : int, let N : int, let C : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "($1 * $0)")
+[__readNone]
 matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 {
     matrix<T,R,C> result;
@@ -2864,11 +3083,13 @@ matrix<T,R,C> mul(matrix<T,R,N> right, matrix<T,N,C> left)
 
 // noise (deprecated)
 
+[__readNone]
 float noise(float x)
 {
     return 0;
 }
 
+[__readNone]
 __generic<let N : int> float noise(vector<float, N> x)
 {
     return 0;
@@ -2915,6 +3136,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Normalize _0")
+[__readNone]
 vector<T,N> normalize(vector<T,N> x)
 {
     return x / length(x);
@@ -2927,12 +3149,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_pow($0, $1)")
 __target_intrinsic(cpp, "$P_pow($0, $1)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Pow _0 _1")
+[__readNone]
 T pow(T x, T y);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Pow _0 _1")
+[__readNone]
 vector<T, N> pow(vector<T, N> x, vector<T, N> y)
 {
     VECTOR_MAP_BINARY(T, N, pow, x, y);
@@ -2940,6 +3164,7 @@ vector<T, N> pow(vector<T, N> x, vector<T, N> y)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> pow(matrix<T,N,M> x, matrix<T,N,M> y)
 {
     MATRIX_MAP_BINARY(T, N, M, pow, x, y);
@@ -3087,6 +3312,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Radians _0")
+[__readNone]
 T radians(T x)
 {
     return x * (T.getPi() / T(180.0f));
@@ -3096,6 +3322,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Radians _0")
+[__readNone]
 vector<T, N> radians(vector<T, N> x)
 {
     return x * (T.getPi() / T(180.0f));
@@ -3103,6 +3330,7 @@ vector<T, N> radians(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> radians(matrix<T, N, M> x)
 {
     return x * (T.getPi() / T(180.0f));
@@ -3111,6 +3339,7 @@ matrix<T, N, M> radians(matrix<T, N, M> x)
 // Approximate reciprocal
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
+[__readNone]
 T rcp(T x)
 {
     return T(1.0) / x;
@@ -3118,6 +3347,7 @@ T rcp(T x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<T, N> rcp(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, rcp, x);
@@ -3125,6 +3355,7 @@ vector<T, N> rcp(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> rcp(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, rcp, x);
@@ -3135,6 +3366,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Reflect _0 _1")
+[__readNone]
 vector<T,N> reflect(vector<T,N> i, vector<T,N> n)
 {
     return i - T(2) * dot(n,i) * n;
@@ -3145,6 +3377,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Refract _0 _1 _2")
+[__readNone]
 vector<T,N> refract(vector<T,N> i, vector<T,N> n, T eta)
 {
     let dotNI = dot(n,i);
@@ -3158,10 +3391,12 @@ __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "bitfieldReverse")
 __target_intrinsic(cuda, "$P_reversebits($0)")
 __target_intrinsic(cpp, "$P_reversebits($0)")
+[__readNone]
 uint reversebits(uint value);
 
 __target_intrinsic(glsl, "bitfieldReverse")
 __generic<let N : int>
+[__readNone]
 vector<uint, N> reversebits(vector<uint, N> value)
 {
     VECTOR_MAP_UNARY(uint, N, reversebits, value);
@@ -3174,12 +3409,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_round($0)")
 __target_intrinsic(cpp, "$P_round($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Round _0")
+[__readNone]
 T round(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Round _0")
+[__readNone]
 vector<T, N> round(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, round, x);
@@ -3187,6 +3424,7 @@ vector<T, N> round(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> round(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, round, x);
@@ -3199,6 +3437,7 @@ __target_intrinsic(glsl, "inversesqrt($0)")
 __target_intrinsic(cuda, "$P_rsqrt($0)")
 __target_intrinsic(cpp, "$P_rsqrt($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InverseSqrt _0")
+[__readNone]
 T rsqrt(T x)
 {
     return T(1.0) / sqrt(x);
@@ -3208,6 +3447,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "inversesqrt($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InverseSqrt _0")
+[__readNone]
 vector<T, N> rsqrt(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, rsqrt, x);
@@ -3215,6 +3455,7 @@ vector<T, N> rsqrt(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> rsqrt(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, rsqrt, x);
@@ -3224,6 +3465,7 @@ matrix<T, N, M> rsqrt(matrix<T, N, M> x)
 
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
+[__readNone]
 T saturate(T x)
 {
     return clamp<T>(x, T(0), T(1));
@@ -3231,6 +3473,7 @@ T saturate(T x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 vector<T,N> saturate(vector<T,N> x)
 {
     return clamp<T,N>(x,
@@ -3240,6 +3483,7 @@ vector<T,N> saturate(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> saturate(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, saturate, x);
@@ -3252,12 +3496,14 @@ __target_intrinsic(glsl, "int(sign($0))")
 __target_intrinsic(cuda, "$P_sign($0)")
 __target_intrinsic(cpp, "$P_sign($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FSign, SSign) _0")
+[__readNone]
 int sign(T x);
 
 __generic<T : __BuiltinSignedArithmeticType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl, "ivec$N0(sign($0))")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FSign, SSign) _0")
+[__readNone]
 vector<int, N> sign(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(int, N, sign, x);
@@ -3265,6 +3511,7 @@ vector<int, N> sign(vector<T, N> x)
 
 __generic<T : __BuiltinSignedArithmeticType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<int, N, M> sign(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(int, N, M, sign, x);
@@ -3279,12 +3526,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_sin($0)")
 __target_intrinsic(cpp, "$P_sin($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sin _0")
+[__readNone]
 T sin(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sin _0")
+[__readNone]
 vector<T, N> sin(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, sin, x);
@@ -3292,6 +3541,7 @@ vector<T, N> sin(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> sin(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, sin, x);
@@ -3301,6 +3551,7 @@ matrix<T, N, M> sin(matrix<T, N, M> x)
 __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(cuda, "$P_sincos($0, $1, $2)")
+[__readNone]
 void sincos(T x, out T s, out T c)
 {
     s = sin(x);
@@ -3309,6 +3560,7 @@ void sincos(T x, out T s, out T c)
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
+[__readNone]
 void sincos(vector<T,N> x, out vector<T,N> s, out vector<T,N> c)
 {
     s = sin(x);
@@ -3317,6 +3569,7 @@ void sincos(vector<T,N> x, out vector<T,N> s, out vector<T,N> c)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 void sincos(matrix<T,N,M> x, out matrix<T,N,M> s, out matrix<T,N,M> c)
 {
     s = sin(x);
@@ -3330,12 +3583,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_sinh($0)")
 __target_intrinsic(cpp, "$P_sinh($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sinh _0")
+[__readNone]
 T sinh(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sinh _0")
+[__readNone]
 vector<T, N> sinh(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, sinh, x);
@@ -3343,6 +3598,7 @@ vector<T, N> sinh(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> sinh(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, sinh, x);
@@ -3353,6 +3609,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 SmoothStep _0 _1 _2")
+[__readNone]
 T smoothstep(T min, T max, T x)
 {
     let t = saturate((x - min) / (max - min));
@@ -3363,6 +3620,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 SmoothStep _0 _1 _2")
+[__readNone]
 vector<T, N> smoothstep(vector<T, N> min, vector<T, N> max, vector<T, N> x)
 {
     VECTOR_MAP_TRINARY(T, N, smoothstep, min, max, x);
@@ -3370,6 +3628,7 @@ vector<T, N> smoothstep(vector<T, N> min, vector<T, N> max, vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> smoothstep(matrix<T, N, M> min, matrix<T, N, M> max, matrix<T, N, M> x)
 {
     MATRIX_MAP_TRINARY(T, N, M, smoothstep, min, max, x);
@@ -3382,12 +3641,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_sqrt($0)")
 __target_intrinsic(cpp, "$P_sqrt($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sqrt _0")
+[__readNone]
 T sqrt(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sqrt _0")
+[__readNone]
 vector<T, N> sqrt(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, sqrt, x);
@@ -3395,6 +3656,7 @@ vector<T, N> sqrt(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> sqrt(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, sqrt, x);
@@ -3405,6 +3667,7 @@ __generic<T : __BuiltinFloatingPointType>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Step _0 _1")
+[__readNone]
 T step(T y, T x)
 {
     return x < y ? T(0.0f) : T(1.0f);
@@ -3414,6 +3677,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Step _0 _1")
+[__readNone]
 vector<T,N> step(vector<T,N> y, vector<T,N> x)
 {
     VECTOR_MAP_BINARY(T, N, step, y, x);
@@ -3421,6 +3685,7 @@ vector<T,N> step(vector<T,N> y, vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> step(matrix<T, N, M> y, matrix<T, N, M> x)
 {
     MATRIX_MAP_BINARY(T, N, M, step, y, x);
@@ -3433,12 +3698,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_tan($0)")
 __target_intrinsic(cpp, "$P_tan($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tan _0")
+[__readNone]
 T tan(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tan _0")
+[__readNone]
 vector<T, N> tan(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, tan, x);
@@ -3446,6 +3713,7 @@ vector<T, N> tan(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> tan(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, tan, x);
@@ -3458,12 +3726,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_tanh($0)")
 __target_intrinsic(cpp, "$P_tanh($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tanh _0")
+[__readNone]
 T tanh(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tanh _0")
+[__readNone]
 vector<T,N> tanh(vector<T,N> x)
 {
     VECTOR_MAP_UNARY(T, N, tanh, x);
@@ -3471,6 +3741,7 @@ vector<T,N> tanh(vector<T,N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T,N,M> tanh(matrix<T,N,M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, tanh, x);
@@ -3480,6 +3751,7 @@ matrix<T,N,M> tanh(matrix<T,N,M> x)
 __generic<T : __BuiltinType, let N : int, let M : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
+[__readNone]
 matrix<T, M, N> transpose(matrix<T, N, M> x)
 {
     matrix<T,M,N> result;
@@ -3496,12 +3768,14 @@ __target_intrinsic(glsl)
 __target_intrinsic(cuda, "$P_trunc($0)")
 __target_intrinsic(cpp, "$P_trunc($0)")
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Trunc _0")
+[__readNone]
 T trunc(T x);
 
 __generic<T : __BuiltinFloatingPointType, let N : int>
 __target_intrinsic(hlsl)
 __target_intrinsic(glsl)
 __target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Trunc _0")
+[__readNone]
 vector<T, N> trunc(vector<T, N> x)
 {
     VECTOR_MAP_UNARY(T, N, trunc, x);
@@ -3509,6 +3783,7 @@ vector<T, N> trunc(vector<T, N> x)
 
 __generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
 __target_intrinsic(hlsl)
+[__readNone]
 matrix<T, N, M> trunc(matrix<T, N, M> x)
 {
     MATRIX_MAP_UNARY(T, N, M, trunc, x);
@@ -4779,6 +5054,7 @@ void __executeCallable(uint shaderIndex, int payloadLocation);
 __generic<Payload>
 __target_intrinsic(__glslRayTracing, "$XC")
 [__readNone]
+[__AlwaysFoldIntoUseSiteAttribute]
 int __callablePayloadLocation(__ref Payload payload);
 
 // Now we provide a hard-coded definition of `CallShader()` for GLSL-based
@@ -4834,6 +5110,7 @@ void __traceRay(
 __generic<Payload>
 __target_intrinsic(__glslRayTracing, "$XP")
 [__readNone]
+[__AlwaysFoldIntoUseSiteAttribute]
 int __rayPayloadLocation(__ref Payload payload);
 
 __generic<payload_t>
@@ -5677,6 +5954,7 @@ Ref<T> __hitObjectAttributes<T>()
 __generic<Attributes>
 __target_intrinsic(__glslRayTracing, "$XH")
 [__readNone]
+[__AlwaysFoldIntoUseSiteAttribute]
 int __hitObjectAttributesLocation(__ref Attributes attributes);
 
     /// Immutable data type representing a ray hit or a miss. Can be used to invoke hit or miss shading,
diff --git a/source/slang/slang-ast-modifier.h b/source/slang/slang-ast-modifier.h
index 99e221b1e..6ac464784 100644
--- a/source/slang/slang-ast-modifier.h
+++ b/source/slang/slang-ast-modifier.h
@@ -1083,6 +1083,14 @@ class RequiresNVAPIAttribute : public Attribute
     SLANG_AST_CLASS(RequiresNVAPIAttribute)
 };
 
+
+    /// `[__AlwaysFoldIntoUseSite]`: calls to a function carrying this attribute
+    /// are always folded into their use sites during source emit.
+class AlwaysFoldIntoUseSiteAttribute : public Attribute
+{
+    SLANG_AST_CLASS(AlwaysFoldIntoUseSiteAttribute)
+};
+
     /// The `[ForwardDifferentiable]` attribute indicates that a function can be forward-differentiated.
 class ForwardDifferentiableAttribute : public DifferentiableAttribute
 {
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index c664449e5..7840dc450 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -1244,14 +1244,24 @@ bool CLikeSourceEmitter::shouldFoldInstIntoUseSites(IRInst* inst)
             return true;
     }
 
+    // Always fold if inst is a call into an [__AlwaysFoldIntoUseSite] function.
+    if (auto call = as<IRCall>(inst))
+    {
+        auto callee = call->getCallee();
+        if (getResolvedInstForDecorations(callee)->findDecoration<IRAlwaysFoldIntoUseSiteDecoration>())
+        {
+            return true;
+        }
+    }
+
     // Having dealt with all of the cases where we *must* fold things
     // above, we can now deal with the more general cases where we
     // *should not* fold things.
-
     // Don't fold something with no users:
     if(!inst->hasUses())
         return false;
 
+
     // Don't fold something that has multiple users:
     if(inst->hasMoreThanOneUse())
         return false;
diff --git a/source/slang/slang-emit-c-like.h b/source/slang/slang-emit-c-like.h
index ff229c38b..1cd2045c7 100644
--- a/source/slang/slang-emit-c-like.h
+++ b/source/slang/slang-emit-c-like.h
@@ -326,7 +326,7 @@ public:
 
     void emitSimpleValue(IRInst* inst) { emitSimpleValueImpl(inst); }
     
-    bool shouldFoldInstIntoUseSites(IRInst* inst);
+    virtual bool shouldFoldInstIntoUseSites(IRInst* inst);
 
     void emitOperand(IRInst* inst, EmitOpInfo const& outerPrec) { emitOperandImpl(inst, outerPrec); }
 
diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp
index ba6b26ec6..795ec74b0 100644
--- a/source/slang/slang-emit-cpp.cpp
+++ b/source/slang/slang-emit-cpp.cpp
@@ -1557,6 +1557,46 @@ void CPPSourceEmitter::emitGlobalInstImpl(IRInst* inst)
     }
 }
 
+bool CPPSourceEmitter::shouldFoldInstIntoUseSites(IRInst* inst)
+{
+    // The base emitter decides first; this override only ever vetoes a fold.
+    if (!Super::shouldFoldInstIntoUseSites(inst))
+        return false;
+
+    // Only vector- and matrix-typed values get the extra checks below.
+    auto dataType = inst->getDataType();
+    if (!as<IRVectorType>(dataType) && !as<IRMatrixType>(dataType))
+        return true;
+
+    // A reshape/cast of a vector refers to its operand several times in the
+    // emitted implementation, so these ops are excluded from folding.
+    auto isReshapeOrCast = [](IROp op)
+    {
+        switch (op)
+        {
+        case kIROp_MatrixReshape:
+        case kIROp_VectorReshape:
+        case kIROp_IntCast:
+        case kIROp_FloatCast:
+        case kIROp_CastIntToFloat:
+        case kIROp_CastFloatToInt:
+            return true;
+        default:
+            return false;
+        }
+    };
+
+    // Reject the fold when any user of the value is a reshape/cast ...
+    for (auto use = inst->firstUse; use; use = use->nextUse)
+    {
+        if (isReshapeOrCast(use->getUser()->getOp()))
+            return false;
+    }
+
+    // ... or when the value is itself produced by one.
+    return !isReshapeOrCast(inst->getOp());
+}
+
 static bool _isExported(IRInst* inst)
 {
     for (auto decoration : inst->getDecorations())
diff --git a/source/slang/slang-emit-cpp.h b/source/slang/slang-emit-cpp.h
index 92780e0a4..71c382f87 100644
--- a/source/slang/slang-emit-cpp.h
+++ b/source/slang/slang-emit-cpp.h
@@ -71,6 +71,7 @@ protected:
     virtual void emitFuncDecorationsImpl(IRFunc* func) SLANG_OVERRIDE;
     virtual void emitVarDecorationsImpl(IRInst* var) SLANG_OVERRIDE;
     virtual void emitGlobalInstImpl(IRInst* inst) SLANG_OVERRIDE;
+    virtual bool shouldFoldInstIntoUseSites(IRInst* inst) SLANG_OVERRIDE;
 
     const UnownedStringSlice* getVectorElementNames(BaseType elemType, Index elemCount);
     
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index e2f00bf88..a25fae5ae 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -891,8 +891,8 @@ Result linkAndOptimizeIR(
         }
     }
 
-    // Run a final round of DCE to clean up unused things after phi-elimination.
-    eliminateDeadCode(irModule);
+    // Run a final round of simplifications to clean up unused things after phi-elimination.
+    simplifyNonSSAIR(irModule);
 
     // We include one final step to (optionally) dump the IR and validate
     // it after all of the optimization passes are complete. This should
diff --git a/source/slang/slang-ir-autodiff-unzip.cpp b/source/slang/slang-ir-autodiff-unzip.cpp
index 096751836..a05fe7044 100644
--- a/source/slang/slang-ir-autodiff-unzip.cpp
+++ b/source/slang/slang-ir-autodiff-unzip.cpp
@@ -559,6 +559,7 @@ IRFunc* DiffUnzipPass::extractPrimalFunc(
             {
                 if (inst->getOp() == kIROp_Call)
                 {
+                    // The primal calls should be marked as no side effect so they can be DCE'd if possible.
                     builder.addSimpleDecoration<IRNoSideEffectDecoration>(inst);
                 }
             }
diff --git a/source/slang/slang-ir-autodiff.h b/source/slang/slang-ir-autodiff.h
index fa01d50ae..a4eb94461 100644
--- a/source/slang/slang-ir-autodiff.h
+++ b/source/slang/slang-ir-autodiff.h
@@ -212,20 +212,12 @@ struct DifferentiableTypeConformanceContext
     IRInst* getZeroMethodForType(IRBuilder* builder, IRType* origType)
     {
         auto result = lookUpInterfaceMethod(builder, origType, sharedContext->zeroMethodStructKey);
-        if (result && !result->findDecoration<IRNoSideEffectDecoration>())
-        {
-            builder->addDecoration(result, kIROp_NoSideEffectDecoration);
-        }
         return result;
     }
 
     IRInst* getAddMethodForType(IRBuilder* builder, IRType* origType)
     {
         auto result = lookUpInterfaceMethod(builder, origType, sharedContext->addMethodStructKey);
-        if (result && !result->findDecoration<IRNoSideEffectDecoration>())
-        {
-            builder->addDecoration(result, kIROp_NoSideEffectDecoration);
-        }
         return result;
     }
 };
diff --git a/source/slang/slang-ir-dce.cpp b/source/slang/slang-ir-dce.cpp
index 58c9b23f1..e5c9b1fdb 100644
--- a/source/slang/slang-ir-dce.cpp
+++ b/source/slang/slang-ir-dce.cpp
@@ -24,6 +24,11 @@ struct DeadCodeEliminationContext
     // These uses will be replaced with `undefInst`.
     IRInst* undefInst = nullptr;
 
+    // Track if we have removed any phi parameters.
+    // If so we need to rerun dce pass because after removing them
+    // there could be new DCE opportunities.
+    bool phiRemoved = false;
+
     // Our overall process is going to be to determine
     // which instructions in the module are "live"
     // and then eliminate anything that wasn't found to
@@ -98,104 +103,115 @@ struct DeadCodeEliminationContext
 
     bool processInst(IRInst* root)
     {
-        // First of all, we know that the root instruction
-        // should be considered as live, because otherwise
-        // we'd end up eliminating it, so that is a
-        // good place to start.
-        //
-        markInstAsLive(root);
-
-        // Ensure there is a global undef inst that is always alive.
-        // This undef inst will be used to fill in weak-referencing uses
-        // whose used value is marked as dead and eliminated.
-        // We always make sure this undef inst is available to prevent
-        // infiniate oscilating loops.
-        markInstAsLive(getUndefInst());
-
-        // Marking the module as live should have
-        // seeded our work list, so we can now start
-        // processing entries off of our work list
-        // until it goes dry.
-        //
-        while (workList.getCount())
+        bool result = false;
+        for (;;)
         {
-            auto inst = workList.getLast();
-            workList.removeLast();
+            liveInsts.Clear();
+            workList.clear();
 
-            if (!isChildInstOf(inst, root))
-                continue;
-
-            // At this point we know that `inst` is live,
-            // and we want to start considering which other
-            // instructions must be live because of that
-            // knowlege.
-            //
-            // A first easy case is that the parent (if any)
-            // of a live instruction had better be live, or
-            // else we might delete the parent, and
-            // the child with it.
+            // First of all, we know that the root instruction
+            // should be considered as live, because otherwise
+            // we'd end up eliminating it, so that is a
+            // good place to start.
             //
-            markInstAsLive(inst->getParent());
-
-            // Next the type of a live instruction, and all
-            // of its operands must also be live, or else
-            // we won't be able to compute its value.
+            markInstAsLive(root);
+
+            // Ensure there is a global undef inst that is always alive.
+            // This undef inst will be used to fill in weak-referencing uses
+            // whose used value is marked as dead and eliminated.
+            // We always make sure this undef inst is available to prevent
+            // infinite oscillating loops.
+            markInstAsLive(getUndefInst());
+
+            // Marking the module as live should have
+            // seeded our work list, so we can now start
+            // processing entries off of our work list
+            // until it goes dry.
             //
-            markInstAsLive(inst->getFullType());
-            UInt operandCount = inst->getOperandCount();
-            for (UInt ii = 0; ii < operandCount; ++ii)
+            while (workList.getCount())
             {
-                // There are some type of operands that needs to be treated as
-                // "weak" references -- they can never hold things alive, and
-                // whenever we delete the referenced value, these operands needs
-                // to be replaced with `undef`.
-                if (!isWeakReferenceOperand(inst, ii))
-                    markInstAsLive(inst->getOperand(ii));
-            }
+                auto inst = workList.getLast();
+                workList.removeLast();
+
+                if (!isChildInstOf(inst, root))
+                    continue;
+
+                // At this point we know that `inst` is live,
+                // and we want to start considering which other
+                // instructions must be live because of that
+                // knowledge.
+                //
+                // A first easy case is that the parent (if any)
+                // of a live instruction had better be live, or
+                // else we might delete the parent, and
+                // the child with it.
+                //
+                markInstAsLive(inst->getParent());
+
+                // Next the type of a live instruction, and all
+                // of its operands must also be live, or else
+                // we won't be able to compute its value.
+                //
+                markInstAsLive(inst->getFullType());
+                UInt operandCount = inst->getOperandCount();
+                for (UInt ii = 0; ii < operandCount; ++ii)
+                {
+                    // There are some types of operands that need to be treated as
+                    // "weak" references -- they can never hold things alive, and
+                    // whenever we delete the referenced value, these operands need
+                    // to be replaced with `undef`.
+                    if (!isWeakReferenceOperand(inst, ii))
+                        markInstAsLive(inst->getOperand(ii));
+                }
 
-            // Finally, we need to consider the children
-            // and decorations of the instruction.
-            //
-            // Note that just because an instruction is
-            // live doesn't mean its children must be, or
-            // else we'd never eliminate *anything* (we
-            // marked the whole module as live, and everything
-            // is a transitive child of the module).
-            //
-            // Decorations, in contrast, are always live if their
-            // parents are (because we don't want to silently drop
-            // decorations). It is still important to *mark*
-            // decorations as live, because they have operands,
-            // and those operands need to be marked as live.
-            // We will fold decorations into the same loop
-            // as children for simplicity.
-            //
-            // To keep the code here simple, we'll defer the
-            // decision of whether a child (or decoration)
-            // should be live when its parent is to a subroutine.
-            //
-            for (auto child : inst->getDecorationsAndChildren())
-            {
-                if (shouldInstBeLiveIfParentIsLive(child))
+                // Finally, we need to consider the children
+                // and decorations of the instruction.
+                //
+                // Note that just because an instruction is
+                // live doesn't mean its children must be, or
+                // else we'd never eliminate *anything* (we
+                // marked the whole module as live, and everything
+                // is a transitive child of the module).
+                //
+                // Decorations, in contrast, are always live if their
+                // parents are (because we don't want to silently drop
+                // decorations). It is still important to *mark*
+                // decorations as live, because they have operands,
+                // and those operands need to be marked as live.
+                // We will fold decorations into the same loop
+                // as children for simplicity.
+                //
+                // To keep the code here simple, we'll defer the
+                // decision of whether a child (or decoration)
+                // should be live when its parent is to a subroutine.
+                //
+                for (auto child : inst->getDecorationsAndChildren())
                 {
-                    // In this case, we know `inst` is live and
-                    // its `child` should be live if its parent is,
-                    // so the `child` must be live too.
-                    //
-                    markInstAsLive(child);
+                    if (shouldInstBeLiveIfParentIsLive(child))
+                    {
+                        // In this case, we know `inst` is live and
+                        // its `child` should be live if its parent is,
+                        // so the `child` must be live too.
+                        //
+                        markInstAsLive(child);
+                    }
                 }
             }
-        }
 
-        // If our work list runs dry, that means we've reached a steady
-        // state where everything that is transitively relevant to
-        // the "outputs" of the module has been marked as live.
-        //
-        // Now we can simply walk through all of our instructions
-        // recursively and eliminate those that are "dead" by
-        // virtue of not having been found live.
-        //
-        return eliminateDeadInstsRec(root);
+            // If our work list runs dry, that means we've reached a steady
+            // state where everything that is transitively relevant to
+            // the "outputs" of the module has been marked as live.
+            //
+            // Now we can simply walk through all of our instructions
+            // recursively and eliminate those that are "dead" by
+            // virtue of not having been found live.
+            //
+            phiRemoved = false;
+            result |= eliminateDeadInstsRec(root);
+            if (!phiRemoved)
+                break;
+        }
+        return result;
     }
 
     // Given the basic infrastructrure above, let's
@@ -207,6 +223,25 @@ struct DeadCodeEliminationContext
         return processInst(module->getModuleInst());
     }
 
+    void removePhiArgs(IRInst* phiParam) // Removes, from each predecessor's branch, the argument that feeds `phiParam`.
+    {
+        auto block = cast<IRBlock>(phiParam->getParent());
+        UInt paramIndex = 0; // Ends up as the position of `phiParam` in the block's parameter list.
+        for (auto p = block->getFirstParam(); p; p = p->getNextParam())
+        {
+            if (p == phiParam)
+                break;
+            paramIndex++;
+        }
+        for (auto predBlock : block->getPredecessors())
+        {
+            auto termInst = as<IRUnconditionalBranch>(predBlock->getTerminator()); // NOTE(review): used unchecked below; presumably every predecessor of a block with params ends in an unconditional branch -- confirm.
+            SLANG_ASSERT(paramIndex < termInst->getArgCount());
+            termInst->removeArgument(paramIndex);
+        }
+        phiRemoved = true; // Read by the DCE loop right after eliminateDeadInstsRec to decide whether to iterate again.
+    }
+
     bool eliminateDeadInstsRec(IRInst* inst)
     {
         bool changed = false;
@@ -226,6 +261,12 @@ struct DeadCodeEliminationContext
             {
                 inst->replaceUsesWith(getUndefInst());
             }
+
+            if (inst->getOp() == kIROp_Param)
+            {
+                // For Phi parameters, we need to update all branch arguments.
+                removePhiArgs(inst);
+            }
             inst->removeAndDeallocate();
             changed = true;
         }
@@ -261,6 +302,16 @@ struct DeadCodeEliminationContext
     }
 };
 
+bool isFirstBlock(IRInst* inst) // True iff `inst` is an IRBlock and is the first block of its parent.
+{
+    auto block = as<IRBlock>(inst);
+    if (!block)
+        return false; // Not a block at all.
+    if (!block->getParent())
+        return false; // No parent, so there is nothing to be the first block of.
+    return block->getParent()->getFirstBlock() == block;
+}
+
 bool shouldInstBeLiveIfParentIsLive(IRInst* inst, IRDeadCodeEliminationOptions options)
 {
     // The main source of confusion/complexity here is that
@@ -275,7 +326,31 @@ bool shouldInstBeLiveIfParentIsLive(IRInst* inst, IRDeadCodeEliminationOptions o
     // when it is executed, then we should keep it around.
     //
     if (inst->mightHaveSideEffects())
-        return true;
+    {
+        // If the inst has side effect, we should keep it alive.
+        // An exception is if we have a call to a pure function
+        // that writes its output to a local variable, but we
+        // don't have any uses of that local variable.
+        auto call = as<IRCall>(inst);
+        if (!call)
+            return true;
+        if (!getResolvedInstForDecorations(call->getCallee())->findDecoration<IRReadNoneDecoration>())
+            return true;
+        auto parentFunc = getParentFunc(inst);
+        if (!parentFunc)
+            return true;
+        for (UInt i = 0; i < call->getArgCount(); i++)
+        {
+            auto arg = call->getArg(i);
+            if (getParentFunc(arg) != parentFunc)
+                return true;
+            if (arg->getOp() != kIROp_Var)
+                return true;
+            if (arg->hasMoreThanOneUse())
+                return true;
+        }
+        return false;
+    }
     //
     // The `mightHaveSideEffects` query is conservative, and will
     // return `true` as its default mode, so once we are past that
@@ -352,17 +427,10 @@ bool shouldInstBeLiveIfParentIsLive(IRInst* inst, IRDeadCodeEliminationOptions o
     switch (inst->getOp())
     {
         // Function parameters obviously shouldn't get eliminated,
-        // even if nothing references them, and block parameters
-        // (phi nodes) will be considered live when their block is,
-        // just so that we don't have to deal with any complications
-        // around re-writing the relevant inter-block argument passing.
-        //
-        // TODO: A smarter DCE pass could deal with this case more
-        // carefully, or we could improve the interprocedural SCCP
-        // pass to deal with block parameters instead.
+        // even if nothing references them.
         //
     case kIROp_Param:
-        return true;
+        return isFirstBlock(inst->getParent());
 
         // IR struct types and witness tables are currently kludged
         // so that they have child instructions that represent their
diff --git a/source/slang/slang-ir-glsl-legalize.cpp b/source/slang/slang-ir-glsl-legalize.cpp
index e111a548b..9c16f40ac 100644
--- a/source/slang/slang-ir-glsl-legalize.cpp
+++ b/source/slang/slang-ir-glsl-legalize.cpp
@@ -2027,8 +2027,8 @@ void legalizeMeshOutputParam(
 
             IRBuilderInsertLocScope locScope{builder};
             builder->setInsertBefore(p);
-            auto e = builder->emitElementAddress(meshOutputBlockType, blockParam, p->getIndex());
-            auto a = builder->emitFieldAddress(builtin.type, e, builtin.key);
+            auto e = builder->emitElementAddress(builder->getPtrType(meshOutputBlockType), blockParam, p->getIndex());
+            auto a = builder->emitFieldAddress(builder->getPtrType(builtin.type), e, builtin.key);
 
             p->replaceUsesWith(a);
         });
diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h
index 4dea3985a..4b1037240 100644
--- a/source/slang/slang-ir-inst-defs.h
+++ b/source/slang/slang-ir-inst-defs.h
@@ -728,6 +728,9 @@ INST(HighLevelDeclDecoration,               highLevelDecl,          1, 0)
         /// Applie to an IR function and signals that inlining should not be performed unless unavoidable.
     INST(NoInlineDecoration, noInline, 0, 0)
 
+        /// A call to the decorated function should always be folded into its use site.
+    INST(AlwaysFoldIntoUseSiteDecoration, alwaysFold, 0, 0)
+
     INST(PayloadDecoration, payload, 0, 0)
 
     /* Mesh Shader outputs */
diff --git a/source/slang/slang-ir-insts.h b/source/slang/slang-ir-insts.h
index fe20f17f5..f2e4e05d3 100644
--- a/source/slang/slang-ir-insts.h
+++ b/source/slang/slang-ir-insts.h
@@ -325,6 +325,7 @@ IR_SIMPLE_DECORATION(HLSLExportDecoration)
 IR_SIMPLE_DECORATION(KeepAliveDecoration)
 IR_SIMPLE_DECORATION(RequiresNVAPIDecoration)
 IR_SIMPLE_DECORATION(NoInlineDecoration)
+IR_SIMPLE_DECORATION(AlwaysFoldIntoUseSiteDecoration)
 
 struct IRNVAPIMagicDecoration : IRDecoration
 {
@@ -1925,7 +1926,7 @@ struct IRUnconditionalBranch : IRTerminatorInst
     UInt getArgCount();
     IRUse* getArgs();
     IRInst* getArg(UInt index);
-
+    void removeArgument(UInt index);
     IR_PARENT_ISA(UnconditionalBranch);
 };
 
@@ -1968,20 +1969,6 @@ struct IRConditionalBranch : IRTerminatorInst
     IRBlock* getFalseBlock() { return (IRBlock*)falseBlock.get(); }
 };
 
-// A conditional branch that represent the test inside a loop
-struct IRLoopTest : IRConditionalBranch
-{
-};
-
-// A conditional branch that represents a one-sided `if`:
-//
-//     if( <condition> ) { <trueBlock> }
-//     <falseBlock>
-struct IRIf : IRConditionalBranch
-{
-    IRBlock* getAfterBlock() { return getFalseBlock(); }
-};
-
 // A conditional branch that represents a two-sided `if`:
 //
 //     if( <condition> ) { <trueBlock> }
@@ -3361,6 +3348,7 @@ public:
     IRInst* emitBitOr(IRType* type, IRInst* left, IRInst* right);
     IRInst* emitBitNot(IRType* type, IRInst* value);
     IRInst* emitNeg(IRType* type, IRInst* value);
+    IRInst* emitNot(IRType* type, IRInst* value);
 
     IRInst* emitAdd(IRType* type, IRInst* left, IRInst* right);
     IRInst* emitSub(IRType* type, IRInst* left, IRInst* right);
diff --git a/source/slang/slang-ir-loop-unroll.cpp b/source/slang/slang-ir-loop-unroll.cpp
index 79b00f60a..2f689ebde 100644
--- a/source/slang/slang-ir-loop-unroll.cpp
+++ b/source/slang/slang-ir-loop-unroll.cpp
@@ -47,7 +47,7 @@ static bool _eliminateDeadBlocks(List<IRBlock*>& blocks, IRBlock* unreachableBlo
     return changed;
 }
 
-List<IRBlock*> _collectBlocksInLoop(Dictionary<IRBlock*, int>& blockOrdering, IRLoop* loopInst)
+List<IRBlock*> _collectBlocksInLoop(IRDominatorTree* dom, IRLoop* loopInst)
 {
     List<IRBlock*> loopBlocks;
     HashSet<IRBlock*> loopBlocksSet;
@@ -58,7 +58,6 @@ List<IRBlock*> _collectBlocksInLoop(Dictionary<IRBlock*, int>& blockOrdering, IR
     };
     auto firstBlock = as<IRBlock>(loopInst->block.get());
     auto breakBlock = as<IRBlock>(loopInst->breakBlock.get());
-    auto breakBlockOrdering = blockOrdering[breakBlock].GetValue();
 
     addBlock(firstBlock);
     for (Index i = 0; i < loopBlocks.getCount(); i++)
@@ -68,18 +67,19 @@ List<IRBlock*> _collectBlocksInLoop(Dictionary<IRBlock*, int>& blockOrdering, IR
         {
             if (succ == breakBlock)
                 continue;
-            auto successorOrdering = blockOrdering[block].GetValue();
-            // The target must be post-dominated by the break block in order to be considered
-            // the body of the loop.
-            // Since we don't support arbitrary goto or multi-level continue, the simple
-            // ordering comparison is sufficient to serve as a post-dominance check.
-            if (successorOrdering < breakBlockOrdering)
+            if (dom->dominates(firstBlock, succ) && !dom->dominates(breakBlock, succ))
                 addBlock(succ);
         }
     }
     return loopBlocks;
 }
 
+List<IRBlock*> collectBlocksInLoop(IRGlobalValueWithCode* func,  IRLoop* loopInst) // Convenience wrapper: builds the dominator tree for `func`, then defers to _collectBlocksInLoop.
+{
+    auto dom = computeDominatorTree(func); // Computed afresh on every call; nothing is cached here.
+    return _collectBlocksInLoop(dom, loopInst);
+}
+
 static int _getLoopMaxIterationsToUnroll(IRLoop* loopInst)
 {
     static constexpr int kMaxIterationsToAttempt = 100;
@@ -483,15 +483,7 @@ bool unrollLoopsInFunc(
         // Remove any continue jumps from the loop.
         eliminateContinueBlocks(module, loop);
 
-        auto postOrderReverseCFG = getPostorderOnReverseCFG(func);
-        Dictionary<IRBlock*, int> blockOrdering;
-        
-        for (Index i = 0; i < postOrderReverseCFG.getCount(); i++)
-        {
-            blockOrdering[postOrderReverseCFG[i]] = (int)i;
-        }
-
-        auto blocks = _collectBlocksInLoop(blockOrdering, loop);
+        auto blocks = collectBlocksInLoop(func, loop);
         auto loopLoc = loop->sourceLoc;
         if (!_unrollLoop(module, loop, blocks))
         {
diff --git a/source/slang/slang-ir-loop-unroll.h b/source/slang/slang-ir-loop-unroll.h
index d9c31e6be..6f7a41192 100644
--- a/source/slang/slang-ir-loop-unroll.h
+++ b/source/slang/slang-ir-loop-unroll.h
@@ -1,18 +1,22 @@
 // slang-ir-loop-unroll.h
 #pragma once
 
+#include "../core/slang-list.h"
+
 namespace Slang
 {
     struct IRLoop;
     struct IRGlobalValueWithCode;
     class DiagnosticSink;
     struct IRModule;
+    struct IRBlock;
 
     // Return true if successfull, false if errors occurred.
     bool unrollLoopsInFunc(IRModule* module, IRGlobalValueWithCode* func, DiagnosticSink* sink);
 
     bool unrollLoopsInModule(IRModule* module, DiagnosticSink* sink);
 
+    List<IRBlock*> collectBlocksInLoop(IRGlobalValueWithCode* func, IRLoop* loop);
 
     // Turn a loop with continue block into a loop with only back jumps and breaks.
     // Each iteration will be wrapped in a breakable region, where everything before `continue`
diff --git a/source/slang/slang-ir-propagate-func-properties.cpp b/source/slang/slang-ir-propagate-func-properties.cpp
new file mode 100644
index 000000000..f98a77fc7
--- /dev/null
+++ b/source/slang/slang-ir-propagate-func-properties.cpp
@@ -0,0 +1,197 @@
+#include "slang-ir-propagate-func-properties.h"
+
+#include "slang-ir.h"
+#include "slang-ir-insts.h"
+#include "slang-ir-util.h"
+
+
+namespace Slang
+{
+// Propagates the read-none property (IRReadNoneDecoration) through `module`.
+//
+// A function that has a body and is not a target intrinsic receives the decoration
+// when every side-effecting inst in it is one of the understood ops listed below,
+// every call resolves to an IRFunc that is already decorated read-none, and every
+// operand is either local to the function or one of the tolerated global kinds.
+// The scan repeats until a full pass decorates no new function.
+//
+// Returns true if at least one function was newly decorated.
+bool propagateFuncProperties(IRModule* module)
+{
+    bool result = false;
+    List<IRFunc*> workList;
+    HashSet<IRFunc*> workListSet;
+
+    auto addToWorkList = [&](IRFunc* f)
+    {
+        if (workListSet.Add(f))
+            workList.add(f);
+    };
+    auto addCallersToWorkList = [&](IRFunc* f)
+    {
+        if (auto g = findOuterGeneric(f))
+        {
+            for (auto use = g->firstUse; use; use = use->nextUse)
+            {
+                if (use->getUser()->getOp() == kIROp_Specialize)
+                {
+                    auto specialize = use->getUser();
+                    for (auto iuse = specialize->firstUse; iuse; iuse = iuse->nextUse)
+                    {
+                        if (auto userFunc = getParentFunc(iuse->getUser()))
+                            addToWorkList(userFunc);
+                    }
+                }
+            }
+            return;
+        }
+        for (auto use = f->firstUse; use; use = use->nextUse)
+        {
+            if (use->getUser()->getOp() == kIROp_Call)
+            {
+                if (auto userFunc = getParentFunc(use->getUser()))
+                    addToWorkList(userFunc);
+            }
+        }
+    };
+    for (;;)
+    {
+        bool changed = false;
+        workList.clear();
+        workListSet.Clear();
+
+        // Add side effect free functions and their transitive callers to work list.
+        for (auto inst : module->getGlobalInsts())
+        {
+            auto genericInst = as<IRGeneric>(inst);
+            if (genericInst)
+            {
+                inst = findGenericReturnVal(genericInst);
+            }
+            if (auto func = as<IRFunc>(inst))
+            {
+                if (func->findDecoration<IRReadNoneDecoration>())
+                {
+                    addCallersToWorkList(func);
+                }
+            }
+        }
+
+        // Add remaining functions to work list.
+        for (auto inst : module->getGlobalInsts())
+        {
+            auto genericInst = as<IRGeneric>(inst);
+            if (genericInst)
+            {
+                inst = findGenericReturnVal(genericInst);
+            }
+            if (auto func = as<IRFunc>(inst))
+            {
+                addToWorkList(func);
+            }
+        }
+
+        IRBuilder builder(module);
+
+        for (Index i = 0; i < workList.getCount(); i++)
+        {
+            auto f = workList[i];
+            bool hasSideEffectCall = false;
+            if (f->findDecoration<IRReadNoneDecoration>())
+                continue;
+            // Never propagate to functions without a body.
+            if (f->getFirstBlock() == nullptr)
+                continue;
+            if (f->findDecoration<IRTargetIntrinsicDecoration>())
+                continue;
+            for (auto block : f->getBlocks())
+            {
+                for (auto inst : block->getChildren())
+                {
+                    // Does this inst have side effects, and if so is it an op this pass can analyze?
+                    if (inst->mightHaveSideEffects())
+                    {
+                        switch (inst->getOp())
+                        {
+                        case kIROp_ifElse:
+                        case kIROp_unconditionalBranch:
+                        case kIROp_Switch:
+                        case kIROp_Return:
+                        case kIROp_loop:
+                        case kIROp_Store:
+                        case kIROp_Call:
+                        case kIROp_Param:
+                        case kIROp_Unreachable:
+                            break;
+                        default:
+                            // An inst with side effects that this pass does not understand
+                            // (e.g. bufferStore, discard). Only `f` is disqualified; keep
+                            // going so the rest of the work list is still processed.
+                            hasSideEffectCall = true;
+                            break;
+                        }
+                    }
+
+                    if (auto call = as<IRCall>(inst))
+                    {
+                        auto callee = getResolvedInstForDecorations(call->getCallee());
+                        switch (callee->getOp())
+                        {
+                        default:
+                            // We are calling an unknown function, so we have to assume
+                            // there are side effects in the call.
+                            hasSideEffectCall = true;
+                            break;
+                        case kIROp_Func:
+                            if (!callee->findDecoration<IRReadNoneDecoration>())
+                            {
+                                hasSideEffectCall = true;
+                                break;
+                            }
+                        }
+                    }
+                    
+                    // Are any operands defined in global scope?
+                    for (UInt o = 0; o < inst->getOperandCount(); o++)
+                    {
+                        auto operand = inst->getOperand(o);
+                        if (getParentFunc(operand) == f)
+                            continue;
+                        if (as<IRConstant>(operand))
+                            continue;
+                        if (as<IRType>(operand))
+                            continue;
+                        switch (operand->getOp())
+                        {
+                        case kIROp_Specialize:
+                        case kIROp_LookupWitness:
+                        case kIROp_StructKey:
+                        case kIROp_WitnessTable:
+                        case kIROp_WitnessTableEntry:
+                        case kIROp_undefined:
+                        case kIROp_Func:
+                            continue;
+                        default:
+                            break;
+                        }
+                        hasSideEffectCall = true;
+                        break;
+                    }
+                }
+                if (hasSideEffectCall)
+                    break;
+            }
+            if (!hasSideEffectCall)
+            {
+                builder.addDecoration(f, kIROp_ReadNoneDecoration);
+                addCallersToWorkList(f);
+                changed = true;
+            }
+        }
+        result |= changed;
+        if (!changed)
+            break;
+    }
+    return result;
+}
+}
diff --git a/source/slang/slang-ir-propagate-func-properties.h b/source/slang/slang-ir-propagate-func-properties.h
new file mode 100644
index 000000000..6df2de18e
--- /dev/null
+++ b/source/slang/slang-ir-propagate-func-properties.h
@@ -0,0 +1,7 @@
+#pragma once
+
+namespace Slang
+{
+struct IRModule;
+bool propagateFuncProperties(IRModule* module);
+}
diff --git a/source/slang/slang-ir-redundancy-removal.cpp b/source/slang/slang-ir-redundancy-removal.cpp
index f3996fc01..2a2047de9 100644
--- a/source/slang/slang-ir-redundancy-removal.cpp
+++ b/source/slang/slang-ir-redundancy-removal.cpp
@@ -8,10 +8,118 @@ namespace Slang
 struct RedundancyRemovalContext
 {
     RefPtr<IRDominatorTree> dom;
-    bool removeRedundancyInBlock(DeduplicateContext& deduplicateContext, IRBlock* block)
+    bool isMovableInst(IRInst* inst) // Opcode whitelist consulted by this pass both for deduplication and for loop hoisting.
+    {
+        switch (inst->getOp())
+        {
+        case kIROp_Add:
+        case kIROp_Sub:
+        case kIROp_Mul:
+        case kIROp_Div:
+        case kIROp_FRem:
+        case kIROp_IRem:
+        case kIROp_Lsh:
+        case kIROp_Rsh:
+        case kIROp_And:
+        case kIROp_Or:
+        case kIROp_Not:
+        case kIROp_FieldExtract:
+        case kIROp_FieldAddress:
+        case kIROp_GetElement:
+        case kIROp_GetElementPtr:
+        case kIROp_UpdateElement:
+        case kIROp_OptionalHasValue:
+        case kIROp_GetOptionalValue:
+        case kIROp_MakeOptionalValue:
+        case kIROp_MakeTuple:
+        case kIROp_GetTupleElement:
+        case kIROp_MakeStruct:
+        case kIROp_MakeArray:
+        case kIROp_MakeArrayFromElement:
+        case kIROp_MakeVector:
+        case kIROp_MakeMatrix:
+        case kIROp_MakeMatrixFromScalar:
+        case kIROp_MakeVectorFromScalar:
+        case kIROp_swizzle:
+        case kIROp_MatrixReshape:
+        case kIROp_MakeString:
+        case kIROp_MakeResultError:
+        case kIROp_MakeResultValue:
+        case kIROp_GetResultError:
+        case kIROp_GetResultValue:
+        case kIROp_CastFloatToInt:
+        case kIROp_CastIntToFloat:
+        case kIROp_CastIntToPtr:
+        case kIROp_CastPtrToBool:
+        case kIROp_CastPtrToInt:
+        case kIROp_BitAnd:
+        case kIROp_BitNot:
+        case kIROp_BitOr:
+        case kIROp_BitXor:
+        case kIROp_BitCast:
+        case kIROp_Reinterpret:
+        case kIROp_Greater:
+        case kIROp_Less:
+        case kIROp_Geq:
+        case kIROp_Leq:
+        case kIROp_Neq:
+        case kIROp_Eql:
+            return true; // Every op listed above is accepted unconditionally.
+        case kIROp_Call:
+            return isPureFunctionalCall(as<IRCall>(inst)); // Calls qualify only when isPureFunctionalCall accepts them.
+        default:
+            return false; // Any op not listed is never deduplicated or hoisted.
+        }
+    }
+
+    bool tryHoistInstToOuterMostLoop(IRGlobalValueWithCode* func, IRInst* inst) // Moves `inst` in front of enclosing `loop` terminators as far out as its operands allow; returns true if it moved.
+    {
+        bool changed = false;
+        for (auto parentBlock = dom->getImmediateDominator(as<IRBlock>(inst->getParent())); // Walk up the dominator tree from the inst's block.
+             parentBlock;
+             parentBlock = dom->getImmediateDominator(parentBlock))
+        {
+            auto terminatorInst = parentBlock->getTerminator();
+            if (terminatorInst->getOp() == kIROp_loop) // NOTE(review): any dominating block ending in `loop` is a candidate; there is no check that `inst` lies inside that loop's body -- confirm this is intended.
+            {
+                // Consider hoisting the inst into this block.
+                // This is only possible if all operands of the inst are dominating `parentBlock`.
+                bool canHoist = true;
+                for (UInt i = 0; i < inst->getOperandCount(); i++)
+                {
+                    auto operand = inst->getOperand(i);
+                    if (getParentFunc(operand) != func)
+                    {
+                        // Global value won't prevent hoisting.
+                        continue;
+                    }
+                    auto operandParent = as<IRBlock>(operand->getParent());
+                    if (!operandParent)
+                    {
+                        canHoist = false; // Operand belongs to `func` but its parent is not a block: give up.
+                        break;
+                    }
+                    canHoist = dom->dominates(operandParent, parentBlock);
+                    if (!canHoist)
+                        break;
+                }
+                if (!canHoist)
+                    break; // Stop the whole walk: no further-out candidate is tried once one fails.
+
+                // Move inst to parentBlock.
+                inst->insertBefore(terminatorInst);
+                changed = true;
+
+                // Continue to consider outer hoisting positions.
+            }
+        }
+        return changed;
+    }
+
+    bool removeRedundancyInBlock(DeduplicateContext& deduplicateContext, IRGlobalValueWithCode* func, IRBlock* block)
     {
         bool result = false;
-        for (auto instP : block->getChildren())
+        for (auto instP : block->getModifiableChildren())
         {
             auto resultInst = deduplicateContext.deduplicate(instP, [&](IRInst* inst)
                 {
@@ -20,75 +128,25 @@ struct RedundancyRemovalContext
                         return false;
                     if (dom->isUnreachable(parentBlock))
                         return false;
-
-                    switch (inst->getOp())
-                    {
-                    case kIROp_Add:
-                    case kIROp_Sub:
-                    case kIROp_Mul:
-                    case kIROp_Div:
-                    case kIROp_Module:
-                    case kIROp_Lsh:
-                    case kIROp_Rsh:
-                    case kIROp_And:
-                    case kIROp_Or:
-                    case kIROp_Not:
-                    case kIROp_FieldExtract:
-                    case kIROp_FieldAddress:
-                    case kIROp_GetElement:
-                    case kIROp_GetElementPtr:
-                    case kIROp_UpdateElement:
-                    case kIROp_OptionalHasValue:
-                    case kIROp_GetOptionalValue:
-                    case kIROp_MakeOptionalValue:
-                    case kIROp_MakeTuple:
-                    case kIROp_GetTupleElement:
-                    case kIROp_MakeStruct:
-                    case kIROp_MakeArray:
-                    case kIROp_MakeArrayFromElement:
-                    case kIROp_MakeVector:
-                    case kIROp_MakeMatrix:
-                    case kIROp_MakeMatrixFromScalar:
-                    case kIROp_MakeVectorFromScalar:
-                    case kIROp_swizzle:
-                    case kIROp_MatrixReshape:
-                    case kIROp_MakeString:
-                    case kIROp_MakeResultError:
-                    case kIROp_MakeResultValue:
-                    case kIROp_GetResultError:
-                    case kIROp_GetResultValue:
-                    case kIROp_CastFloatToInt:
-                    case kIROp_CastIntToFloat:
-                    case kIROp_CastIntToPtr:
-                    case kIROp_CastPtrToBool:
-                    case kIROp_CastPtrToInt:
-                    case kIROp_BitAnd:
-                    case kIROp_BitNot:
-                    case kIROp_BitOr:
-                    case kIROp_BitXor:
-                    case kIROp_BitCast:
-                    case kIROp_Reinterpret:
-                    case kIROp_Greater:
-                    case kIROp_Less:
-                    case kIROp_Geq:
-                    case kIROp_Leq:
-                    case kIROp_Neq:
-                    case kIROp_Eql:
-                        return true;
-                    case kIROp_Call:
-                        return isPureFunctionalCall(as<IRCall>(inst));
-                    default:
-                        return false;
-                    }
+                    return isMovableInst(inst);
                 });
             if (resultInst != instP)
+            {
+                instP->replaceUsesWith(resultInst);
                 result = true;
+            }
+            else if (isMovableInst(resultInst))
+            {
+                // This inst is unique, we should consider hoisting it
+                // if it is inside a loop.
+                result |= tryHoistInstToOuterMostLoop(func, resultInst);
+            }
         }
         for (auto child : dom->getImmediatelyDominatedBlocks(block))
         {
             DeduplicateContext subContext;
             subContext.deduplicateMap = deduplicateContext.deduplicateMap;
-            result |= removeRedundancyInBlock(subContext, child);
+            result |= removeRedundancyInBlock(subContext, func, child);
         }
         return result;
     }
@@ -122,7 +180,142 @@ bool removeRedundancyInFunc(IRGlobalValueWithCode* func)
     RedundancyRemovalContext context;
     context.dom = computeDominatorTree(func);
     DeduplicateContext deduplicateCtx;
-    return context.removeRedundancyInBlock(deduplicateCtx, root);
+    return context.removeRedundancyInBlock(deduplicateCtx, func, root);
+}
+
+static IRInst* _getRootVar(IRInst* inst) // Strips FieldAddress / GetElementPtr wrappers and returns the inst at the base of the chain.
+{
+    while (inst)
+    {
+        switch (inst->getOp())
+        {
+        case kIROp_FieldAddress:
+        case kIROp_GetElementPtr:
+            inst = inst->getOperand(0); // Step to operand 0 of the access inst and keep walking.
+            break;
+        default:
+            return inst; // Anything else ends the walk (not necessarily an IRVar).
+        }
+    }
+    return inst; // Reached only when `inst` became null.
+}
+
+bool tryRemoveRedundantStore(IRGlobalValueWithCode* func, IRStore* store)
+{
+    // Quick, conservative check. The store is removed when either (a) it writes into a
+    // local var whose access chain has no use other than stores, or (b) a later store to the
+    // same pointer is reached (following unconditional branches) before any inst that may
+    // read or modify the address. Returns true iff `store` was removed.
+    bool hasAddrUse = false;
+    bool hasOverridingStore = false;
+
+    // Stores to global variables will never get removed.
+    auto rootVar = _getRootVar(store->getPtr());
+    if (!isChildInstOf(rootVar, func))
+        return false;
+
+    // A store can be removed if it stores into a local variable
+    // that has no other uses than store.
+    if (auto varInst = as<IRVar>(rootVar))
+    {
+        bool hasNonStoreUse = false;
+        // If no pointer along the access chain has a non-store use, the store can be removed.
+        HashSet<IRInst*> knownAccessChain;
+        for (auto accessChain = store->getPtr(); accessChain;)
+        {
+            knownAccessChain.Add(accessChain);
+            for (auto use = accessChain->firstUse; use; use = use->nextUse)
+            {
+                if (as<IRDecoration>(use->getUser()))
+                    continue;
+                if (knownAccessChain.Contains(use->getUser()))
+                    continue; // The user is a link of the chain we already walked through.
+                if (use->getUser()->getOp() == kIROp_Store && 
+                    use == use->getUser()->getOperands()) // presumably: the use is the store's first operand, i.e. its destination -- confirm.
+                {
+                    continue;
+                }
+                hasNonStoreUse = true;
+                break;
+            }
+            if (hasNonStoreUse)
+                break;
+            switch (accessChain->getOp())
+            {
+            case kIROp_GetElementPtr:
+            case kIROp_FieldAddress:
+                accessChain = accessChain->getOperand(0);
+                continue; // Continues the enclosing `for`: examine the next pointer up the chain.
+            default:
+                break;
+            }
+            break; // Reached the base of the chain.
+        }
+        if (!hasNonStoreUse)
+        {
+            store->removeAndDeallocate();
+            return true;
+        }
+    }
+
+    // A store can be removed if there are subsequent stores to the same variable,
+    // and there are no insts in between the stores that can read the variable.
+
+    HashSet<IRBlock*> visitedBlocks; // Blocks entered by following a branch; the store's own block is not pre-inserted.
+    for (auto next = store->getNextInst(); next;)
+    {
+        if (auto nextStore = as<IRStore>(next))
+        {
+            if (nextStore->getPtr() == store->getPtr())
+            {
+                hasOverridingStore = true;
+                break;
+            }
+        }
+
+        // If we see any inst that may read or modify the address before seeing
+        // an overriding store, don't remove the store.
+        // We can make the test more accurate by collecting all addresses related to
+        // the target address first, and only bail out if any of the related addresses
+        // are involved.
+        switch (next->getOp())
+        {
+        case kIROp_Load:
+            if (canAddressesPotentiallyAlias(func, next->getOperand(0), store->getPtr()))
+            {
+                hasAddrUse = true;
+            }
+            break;
+        default:
+            if (canInstHaveSideEffectAtAddress(func, next, store->getPtr()))
+            {
+                hasAddrUse = true;
+            }
+            break;
+        }
+        if (hasAddrUse)
+            break;
+
+        // If we are at the end of the current block and see an unconditional branch,
+        // we can follow the path and check the subsequent block.
+        if (auto branch = as<IRUnconditionalBranch>(next))
+        {
+            auto nextBlock = branch->getTargetBlock();
+            if (visitedBlocks.Add(nextBlock)) // Each target block is entered at most once.
+            {
+                next = nextBlock->getFirstInst();
+                continue;
+            }
+        }
+        next = next->getNextInst(); // Advance; the scan ends when this yields null.
+    }
+
+    if (!hasAddrUse && hasOverridingStore)
+    {
+        store->removeAndDeallocate();
+        return true;
+    }
+    return false;
 }
 
 bool eliminateRedundantLoadStore(IRGlobalValueWithCode* func)
@@ -158,57 +351,7 @@ bool eliminateRedundantLoadStore(IRGlobalValueWithCode* func)
             }
             else if (auto store = as<IRStore>(inst))
             {
-                // We perform a quick and conservative check:
-                // A store is redundant if it is followed by another store to the same address in
-                // the same basic block, and there are no instructions that may use any addresses
-                // related to this address.
-                bool hasAddrUse = false;
-                bool hasOverridingStore = false;
-
-                // Stores to global variables will never get removed.
-                if (!isChildInstOf(store->getPtr(), func))
-                    hasAddrUse = true;
-
-                for (auto next = store->getNextInst(); next; next = next->getNextInst())
-                {
-                    if (auto nextStore = as<IRStore>(next))
-                    {
-                        if (nextStore->getPtr() == store->getPtr())
-                        {
-                            hasOverridingStore = true;
-                            break;
-                        }
-                    }
-
-                    // If we see any insts that have reads or modifies the address before seeing
-                    // an overriding store, don't remove the store.
-                    // We can make the test more accurate by collecting all addresses related to
-                    // the target address first, and only bail out if any of the related addresses
-                    // are involved.
-                    switch (next->getOp())
-                    {
-                    case kIROp_Load:
-                        if (canAddressesPotentiallyAlias(func, next->getOperand(0), store->getPtr()))
-                        {
-                            hasAddrUse = true;
-                        }
-                        break;
-                    default:
-                        if (canInstHaveSideEffectAtAddress(func, next, store->getPtr()))
-                        {
-                            hasAddrUse = true;
-                        }
-                        break;
-                    }
-                    if (hasAddrUse)
-                        break;
-                }
-
-                if (!hasAddrUse && hasOverridingStore)
-                {
-                    store->removeAndDeallocate();
-                    changed = true;
-                }
+                changed |= tryRemoveRedundantStore(func, store);
             }
             inst = nextInst;
         }
diff --git a/source/slang/slang-ir-sccp.cpp b/source/slang/slang-ir-sccp.cpp
index d05527e59..691bd7ff0 100644
--- a/source/slang/slang-ir-sccp.cpp
+++ b/source/slang/slang-ir-sccp.cpp
@@ -1439,7 +1439,9 @@ struct SCCPContext
                 inst->replaceUsesWith(constantVal);
                 if( !inst->mightHaveSideEffects() )
                 {
-                    instsToRemove.add(inst);
+                    // Don't delete phi parameters, they will be cleaned up in CFG simplification.
+                    if (inst->getOp() != kIROp_Param)
+                        instsToRemove.add(inst);
                 }
             }
         }
diff --git a/source/slang/slang-ir-simplify-cfg.cpp b/source/slang/slang-ir-simplify-cfg.cpp
index 7e9e105e1..b814442fa 100644
--- a/source/slang/slang-ir-simplify-cfg.cpp
+++ b/source/slang/slang-ir-simplify-cfg.cpp
@@ -4,6 +4,8 @@
 #include "slang-ir.h"
 #include "slang-ir-dominators.h"
 #include "slang-ir-restructure.h"
+#include "slang-ir-util.h"
+#include "slang-ir-loop-unroll.h"
 
 namespace Slang
 {
@@ -31,8 +33,7 @@ static BreakableRegion* findBreakableRegion(Region* region)
 // it is needed and hasn't been generated yet.
 static bool isTrivialSingleIterationLoop(
     IRGlobalValueWithCode* func,
-    IRLoop* loop,
-    CFGSimplificationContext& inoutContext)
+    IRLoop* loop)
 {
     auto targetBlock = loop->getTargetBlock();
     if (targetBlock->getPredecessors().getCount() != 1) return false;
@@ -52,14 +53,14 @@ static bool isTrivialSingleIterationLoop(
     // 
     // We need to verify this is a trivial loop by checking if there is any multi-level breaks
     // that skips out of this loop.
-
-    if (!inoutContext.domTree)
-        inoutContext.domTree = computeDominatorTree(func);
-    if (!inoutContext.regionTree)
-        inoutContext.regionTree = generateRegionTreeForFunc(func, nullptr);
+    CFGSimplificationContext context;
+    if (!context.domTree)
+        context.domTree = computeDominatorTree(func);
+    if (!context.regionTree)
+        context.regionTree = generateRegionTreeForFunc(func, nullptr);
 
     SimpleRegion* targetBlockRegion = nullptr;
-    if (!inoutContext.regionTree->mapBlockToRegion.TryGetValue(targetBlock, targetBlockRegion))
+    if (!context.regionTree->mapBlockToRegion.TryGetValue(targetBlock, targetBlockRegion))
         return false;
     BreakableRegion* loopBreakableRegion = findBreakableRegion(targetBlockRegion);
     LoopRegion* loopRegion = as<LoopRegion>(loopBreakableRegion);
@@ -67,18 +68,18 @@ static bool isTrivialSingleIterationLoop(
         return false;
     for (auto block : func->getBlocks())
     {
-        if (!inoutContext.domTree->dominates(loop->getTargetBlock(), block))
+        if (!context.domTree->dominates(loop->getTargetBlock(), block))
             continue;
-        if (inoutContext.domTree->dominates(loop->getBreakBlock(), block))
+        if (context.domTree->dominates(loop->getBreakBlock(), block))
             continue;
         SimpleRegion* region = nullptr;
-        if (!inoutContext.regionTree->mapBlockToRegion.TryGetValue(block, region))
+        if (!context.regionTree->mapBlockToRegion.TryGetValue(block, region))
             return false;
 
         for (auto branchTarget : block->getSuccessors())
         {
             SimpleRegion* targetRegion = nullptr;
-            if (!inoutContext.regionTree->mapBlockToRegion.TryGetValue(branchTarget, targetRegion))
+            if (!context.regionTree->mapBlockToRegion.TryGetValue(branchTarget, targetRegion))
                 return false;
             // If multi-level break out that skips over this loop exists, then this is not a trivial loop.
             if (targetRegion->isDescendentOf(loopRegion))
@@ -96,6 +97,104 @@ static bool isTrivialSingleIterationLoop(
     return true;
 }
 
+// Conservatively decides whether `loopInst` does anything that is visible outside of the
+// loop. Returns false only if, for the blocks returned by `collectBlocksInLoop`, every inst
+// has all of its uses inside those blocks and every inst that might have side effects can
+// be shown to touch only addresses that are not used outside the loop.
+static bool doesLoopHasSideEffect(IRGlobalValueWithCode* func, IRLoop* loopInst)
+{
+    auto blocks = collectBlocksInLoop(func, loopInst);
+    // Set form of `blocks`, used for membership tests below.
+    HashSet<IRBlock*> loopBlocks;
+    for (auto b : blocks)
+        loopBlocks.Add(b);
+    // Returns true if `addr`, or any address it is derived from, may be observed outside
+    // the loop.
+    auto addressHasOutOfLoopUses = [&](IRInst* addr)
+    {
+        // The entire access chain of `addr` must have no uses outside the loop.
+        // The root variable must be a local var.
+        for (auto chainNode = addr; chainNode;)
+        {
+            // Anything that does not belong to `func` is treated as externally visible.
+            if (getParentFunc(chainNode) != func)
+                return true;
+            for (auto use = chainNode->firstUse; use; use = use->nextUse)
+            {
+                // A user whose parent is not one of the loop blocks counts as an outside use.
+                if (!loopBlocks.Contains(as<IRBlock>(use->getUser()->getParent())))
+                    return true;
+            }
+            switch (chainNode->getOp())
+            {
+            case kIROp_GetElementPtr:
+            case kIROp_FieldAddress:
+                // Step to the base address (operand 0) and check it as well.
+                chainNode = chainNode->getOperand(0);
+                continue;
+            case kIROp_Var:
+                // Reached the root variable; the chain is fully checked.
+                break;
+            default:
+                // Unknown kind of address: assume it can be observed elsewhere.
+                return true;
+            }
+            break;
+        }
+        return false;
+    };
+
+    for (auto b : blocks)
+    {
+        for (auto inst : b->getChildren())
+        {
+            // Is this inst used anywhere outside the loop? If so the loop has side effect.
+            for (auto use = inst->firstUse; use; use = use->nextUse)
+            {
+                if (!loopBlocks.Contains(as<IRBlock>(use->getUser()->getParent())))
+                    return true;
+            }
+
+            // The inst can't possibly have side effect? Skip it.
+            if (!inst->mightHaveSideEffects())
+                continue;
+
+            // This inst might have side effect, try to prove that the
+            // side effect does not leak beyond the scope of the loop.
+            if (auto call = as<IRCall>(inst))
+            {
+                // Only calls whose resolved callee carries IRReadNoneDecoration are considered.
+                auto callee = getResolvedInstForDecorations(call->getCallee());
+                if (!callee || !callee->findDecoration<IRReadNoneDecoration>())
+                    return true;
+                // We are calling a pure function, check whether any argument that is not a
+                // value type (i.e. a potential output address) is used outside the loop.
+                for (UInt i = 0; i < call->getArgCount(); i++)
+                {
+                    auto arg = call->getArg(i);
+                    if (!isValueType(arg->getDataType()))
+                    {
+                        if (addressHasOutOfLoopUses(arg))
+                            return true;
+                    }
+                }
+            }
+            else if (auto store = as<IRStore>(inst))
+            {
+                // A store is harmless only if its destination is never seen outside the loop.
+                if (addressHasOutOfLoopUses(store->getPtr()))
+                    return true;
+            }
+            else if (auto branch = as<IRUnconditionalBranch>(inst))
+            {
+                // Branches that stay inside the loop are fine.
+                if (loopBlocks.Contains(branch->getTargetBlock()))
+                    continue;
+                // Branching out of the loop with some argument is considered
+                // having a side effect.
+                if (branch->getArgCount() != 0)
+                    return true;
+            }
+            else if (as<IRIfElse>(inst) || as<IRSwitch>(inst) || as<IRLoop>(inst))
+            {
+                // We are starting a sub control flow.
+                // This is considered side effect free.
+            }
+            else
+            {
+                // For all other insts, we assume it has a global side effect.
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
 static bool removeDeadBlocks(IRGlobalValueWithCode* func)
 {
     bool changed = false;
@@ -142,15 +241,327 @@ static bool removeDeadBlocks(IRGlobalValueWithCode* func)
     return changed;
 }
 
+// Returns true if `branchBlock` contributes nothing to the `if-else`: it is either the
+// after block itself, or a block whose first ordinary inst is a plain unconditional
+// branch to the after block.
+static bool isTrivialIfElseBranch(IRIfElse* condBranch, IRBlock* branchBlock)
+{
+    auto afterBlock = condBranch->getAfterBlock();
+    if (branchBlock == afterBlock)
+        return true;
+
+    auto jump = as<IRUnconditionalBranch>(branchBlock->getFirstOrdinaryInst());
+    if (!jump)
+        return false;
+
+    // The opcode test excludes other kinds of unconditional branch insts.
+    return jump->getTargetBlock() == afterBlock && jump->getOp() == kIROp_unconditionalBranch;
+}
+
+// Checks whether both sides of `ifElse` hand the same phi arguments to the after block.
+static bool arePhiArgsEquivalentInBranches(IRIfElse* ifElse)
+{
+    auto trueBlock = ifElse->getTrueBlock();
+    auto falseBlock = ifElse->getFalseBlock();
+    auto mergeBlock = ifElse->getAfterBlock();
+
+    // When a side targets the after block directly there is nothing to compare: a block
+    // with phi parameters can't be used as a target of a conditional branch, so the
+    // argument count must be 0.
+    if (trueBlock == mergeBlock || falseBlock == mergeBlock)
+        return true;
+
+    auto trueJump = as<IRUnconditionalBranch>(trueBlock->getTerminator());
+    auto falseJump = as<IRUnconditionalBranch>(falseBlock->getTerminator());
+    if (!trueJump || !falseJump)
+        return false;
+
+    // Mismatched counts should never happen; report "not equivalent" to be safe.
+    const UInt argCount = trueJump->getArgCount();
+    if (argCount != falseJump->getArgCount())
+        return false;
+
+    // Advance while the arguments agree; equivalence means reaching the end.
+    UInt index = 0;
+    while (index < argCount && trueJump->getArg(index) == falseJump->getArg(index))
+        index++;
+    return index == argCount;
+}
+
+// Classifies both sides of `condBranch`. The two out-parameters report whether each side
+// is a trivial jump to the after block; the return value is true only when both sides are
+// trivial and pass equivalent phi arguments.
+static bool isTrivialIfElse(IRIfElse* condBranch, bool& isTrueBranchTrivial, bool& isFalseBranchTrivial)
+{
+    isTrueBranchTrivial = isTrivialIfElseBranch(condBranch, condBranch->getTrueBlock());
+    isFalseBranchTrivial = isTrivialIfElseBranch(condBranch, condBranch->getFalseBlock());
+    const bool bothTrivial = isTrueBranchTrivial && isFalseBranchTrivial;
+    return bothTrivial && arePhiArgsEquivalentInBranches(condBranch);
+}
+
+#if 0
+// Currently compiled out. Makes the false block the new true target, points the false
+// target at the after block, and replaces the condition with its negation.
+// Returns false without changing anything if the false target already is the after block,
+// or if the false block ends in an unconditional branch that carries arguments.
+static bool tryMoveFalseBranchToTrueBranch(IRBuilder& builder, IRIfElse* ifElseInst)
+{
+    auto falseBlock = ifElseInst->getFalseBlock();
+    if (falseBlock == ifElseInst->getAfterBlock())
+        return false;
+    if (auto termInst = as<IRUnconditionalBranch>(falseBlock->getTerminator()))
+    {
+        // We can't fold a branch with arguments into the ifElse.
+        if (termInst->getArgCount() != 0)
+            return false;
+    }
+    ifElseInst->trueBlock.set(falseBlock);
+    ifElseInst->falseBlock.set(ifElseInst->getAfterBlock());
+    // The negated condition is emitted right before the `if-else` inst itself.
+    builder.setInsertBefore(ifElseInst);
+    auto newCondition = builder.emitNot(builder.getBoolType(), ifElseInst->getCondition());
+    ifElseInst->condition.set(newCondition);
+    return true;
+}
+#endif
+
+// Redirects the false target of `ifElseInst` to the after block. Returns false, changing
+// nothing, when the false target already is the after block or when the false block ends
+// in an unconditional branch that carries arguments.
+static bool tryEliminateFalseBranch(IRIfElse* ifElseInst)
+{
+    auto elseBlock = ifElseInst->getFalseBlock();
+    auto afterBlock = ifElseInst->getAfterBlock();
+    if (elseBlock == afterBlock)
+        return false;
+
+    // We can't fold a branch with arguments into the ifElse.
+    auto elseJump = as<IRUnconditionalBranch>(elseBlock->getTerminator());
+    if (elseJump && elseJump->getArgCount() != 0)
+        return false;
+
+    ifElseInst->falseBlock.set(afterBlock);
+    return true;
+}
+
+// Simplifies `ifElseInst` when one or both of its sides are trivial; returns true on change.
+static bool trySimplifyIfElse(IRBuilder& builder, IRIfElse* ifElseInst)
+{
+    bool isTrueBranchTrivial = false;
+    bool isFalseBranchTrivial = false;
+    if (isTrivialIfElse(ifElseInst, isTrueBranchTrivial, isFalseBranchTrivial))
+    {
+        // Both sides reach the after block with identical arguments: replace the `if-else`
+        // by a single branch. Read the arguments from a side that is a separate trivial
+        // block; if a side IS the after block, its terminator is unrelated to this jump.
+        auto afterBlock = ifElseInst->getAfterBlock();
+        auto sourceBlock = ifElseInst->getTrueBlock();
+        if (sourceBlock == afterBlock)
+            sourceBlock = ifElseInst->getFalseBlock();
+        List<IRInst*> args;
+        if (sourceBlock != afterBlock)
+        {
+            auto termInst = as<IRUnconditionalBranch>(sourceBlock->getTerminator());
+            if (!termInst)
+                return false;
+            for (UInt i = 0; i < termInst->getArgCount(); i++)
+                args.add(termInst->getArg(i));
+        }
+        builder.setInsertBefore(ifElseInst);
+        builder.emitBranch(afterBlock, (Int)args.getCount(), args.getBuffer());
+        ifElseInst->removeAndDeallocate();
+        return true;
+    }
+    // A trivial false side is retargeted at the after block. A trivial true side is left
+    // alone. TODO: moving the false branch to the true side and inverting the condition is
+    // disabled since the auto-diff pass can't handle loops whose body is on the false side.
+    if (!isTrueBranchTrivial && isFalseBranchTrivial)
+        return tryEliminateFalseBranch(ifElseInst);
+    return false;
+}
+
+// True only if `lit` is a bool literal whose value is true.
+static bool isTrueLit(IRInst* lit)
+{
+    auto boolLit = as<IRBoolLit>(lit);
+    return boolLit ? boolLit->getValue() : false;
+}
+// True only if `lit` is a bool literal whose value is false.
+static bool isFalseLit(IRInst* lit)
+{
+    auto boolLit = as<IRBoolLit>(lit);
+    return boolLit ? !boolLit->getValue() : false;
+}
+
+// Replaces the bool phi parameter `param` (at position `paramIndex`) with an expression
+// of the condition of `ifElse` when the two predecessors pass opposite bool literals:
+// (true, false) becomes the condition itself, (false, true) becomes its negation.
+// On success the parameter and the matching branch arguments are removed and true is
+// returned. The mapping relies on preds[0] being the true-side predecessor.
+static bool simplifyBoolPhiParam(IRIfElse* ifElse, Array<IRBlock*, 2>& preds, IRParam* param, UInt paramIndex)
+{
+    auto paramType = param->getDataType();
+    if (!paramType || paramType->getOp() != kIROp_BoolType)
+        return false;
+
+    // Both predecessors must end in an unconditional branch that supplies this parameter.
+    IRUnconditionalBranch* jumps[2];
+    for (int side = 0; side < 2; side++)
+    {
+        jumps[side] = as<IRUnconditionalBranch>(preds[side]->getTerminator());
+        if (!jumps[side] || jumps[side]->getArgCount() <= paramIndex)
+            return false;
+    }
+    auto firstArg = jumps[0]->getArg(paramIndex);
+    auto secondArg = jumps[1]->getArg(paramIndex);
+
+    IRInst* replacement = nullptr;
+    if (isTrueLit(firstArg) && isFalseLit(secondArg))
+    {
+        replacement = ifElse->getCondition();
+    }
+    else if (isFalseLit(firstArg) && isTrueLit(secondArg))
+    {
+        // The negation is emitted in the block that owns the parameter.
+        IRBuilder builder(param);
+        setInsertBeforeOrdinaryInst(&builder, param);
+        replacement = builder.emitNot(builder.getBoolType(), ifElse->getCondition());
+    }
+    if (!replacement)
+        return false;
+
+    param->replaceUsesWith(replacement);
+    param->removeAndDeallocate();
+    jumps[0]->removeArgument(paramIndex);
+    jumps[1]->removeArgument(paramIndex);
+    return true;
+}
+
+// Tries to replace bool phi parameters of `block` with the condition of the `if-else`
+// that feeds it. Applies only when `block` has exactly two predecessors, each of which has
+// a single predecessor, both being the same block whose terminator is an `IRIfElse`.
+// Returns true if any parameter was simplified.
+static bool simplifyBoolPhiParams(IRBlock* block)
+{
+    if (!block)
+        return false;
+
+    if (block->getPredecessors().getCount() != 2)
+        return false;
+
+    Array<IRBlock*, 2> preds;
+    for (auto pred : block->getPredecessors())
+        preds.add(pred);
+
+    // Both predecessors must be reached only from one common block.
+    IRBlock* ifElseBlock = nullptr;
+    if (preds[0]->getPredecessors().getCount() != 1)
+        return false;
+    ifElseBlock = *(preds[0]->getPredecessors().begin());
+    if (preds[1]->getPredecessors().getCount() != 1)
+        return false;
+    auto p = *(preds[1]->getPredecessors().begin());
+    if (p != ifElseBlock)
+        return false;
+
+    auto ifElse = as<IRIfElse>(ifElseBlock->getTerminator());
+    if (!ifElse)
+        return false;
+
+    // Normalize the order so that preds[0] is the true side and preds[1] the false side.
+    if (ifElse->getTrueBlock() == preds[1])
+    {
+        Swap(preds[0], preds[1]);
+    }
+    SLANG_ASSERT(ifElse->getTrueBlock() == preds[0] && ifElse->getFalseBlock() == preds[1]);
+
+    // Snapshot the parameters first, since simplification removes them from the block.
+    List<IRParam*> params;
+    for (auto param : block->getParams())
+        params.add(param);
+    bool changed = false;
+    // Go from the last parameter to the first so that removing one does not change the
+    // index of any parameter that is still to be visited.
+    for (Index i = params.getCount() - 1; i >= 0; i--)
+    {
+        changed |= simplifyBoolPhiParam(ifElse, preds, params[i], (UInt)i);
+    }
+    return changed;
+}
+
+// Removes phi parameters of `block` that do not need to be parameters. A parameter can go if:
+// 1. every predecessor passes the same argument for it (not really a phi), or
+// 2. in every predecessor its argument equals the argument passed for an earlier
+//    parameter (duplicate phi).
+// Uses of a removed parameter are redirected to the common argument (case 1) or to the
+// earlier parameter (case 2). Returns true if any parameter was removed; returns false
+// without changes if a predecessor does not end in an unconditional branch.
+static bool removeTrivialPhiParams(IRBlock* block)
+{
+    bool changed = false;
+    List<IRParam*> params;
+    struct ParamState
+    {
+        // True while every predecessor seen so far passed `knownValue` for this parameter.
+        bool areKnownValueSame = true;
+        // Argument passed by the first predecessor visited.
+        IRInst* knownValue = nullptr;
+        // Indices of earlier parameters that got the same argument in every predecessor so far.
+        OrderedHashSet<UInt> sameAsParamSet;
+    };
+    List<ParamState> args;
+    List<IRUnconditionalBranch*> termInsts;
+    for (auto param : block->getParams())
+    {
+        params.add(param);
+        args.add(ParamState());
+    }
+
+    if (!params.getCount())
+        return false;
+
+    // Start by assuming each parameter duplicates all earlier ones; the predecessor scan
+    // below removes every candidate that is refuted.
+    for (UInt i = 1; i < (UInt)args.getCount(); i++)
+        for (UInt j = 0; j < i; j++)
+            args[i].sameAsParamSet.Add(j);
+
+    for (auto pred : block->getPredecessors())
+    {
+        auto termInst = as<IRUnconditionalBranch>(pred->getTerminator());
+        if (!termInst)
+            return false;
+        // A count mismatch should never happen. Bail out rather than index `args` out of
+        // range in case the assert is compiled out; nothing has been modified yet.
+        SLANG_ASSERT(termInst->getArgCount() == (UInt)args.getCount());
+        if (termInst->getArgCount() != (UInt)args.getCount())
+            return false;
+        termInsts.add(termInst);
+        for (UInt i = 0; i < termInst->getArgCount(); i++)
+        {
+            if (args[i].areKnownValueSame)
+            {
+                if (args[i].knownValue == nullptr)
+                    args[i].knownValue = termInst->getArg(i);
+                else if (args[i].knownValue != termInst->getArg(i))
+                    args[i].areKnownValueSame = false;
+            }
+            for (UInt j = 0; j < i; j++)
+            {
+                if (termInst->getArg(i) != termInst->getArg(j))
+                {
+                    args[i].sameAsParamSet.Remove(j);
+                }
+            }
+        }
+    }
+    // Walk from the last parameter down so that removals do not shift the indices still
+    // to be processed, and so a duplicate is redirected to an earlier parameter that has
+    // not been visited yet.
+    for (Index i = args.getCount() - 1; i >= 0; i--)
+    {
+        IRInst* targetVal = nullptr;
+        if (args[i].areKnownValueSame)
+        {
+            targetVal = args[i].knownValue;
+        }
+        else if (args[i].sameAsParamSet.Count())
+        {
+            auto targetParamId = *args[i].sameAsParamSet.begin();
+            targetVal = params[targetParamId];
+        }
+        // Skip the degenerate case where the only incoming value is the parameter itself.
+        if (targetVal && targetVal != params[i])
+        {
+            // Redirect to `targetVal`, not `knownValue`: for a duplicate phi, `knownValue`
+            // is only the first predecessor's argument and is not valid on other edges.
+            params[i]->replaceUsesWith(targetVal);
+            params[i]->removeAndDeallocate();
+            for (auto termInst : termInsts)
+                termInst->removeArgument((UInt)i);
+            changed = true;
+        }
+    }
+    return changed;
+}
+
 static bool processFunc(IRGlobalValueWithCode* func)
 {
     auto firstBlock = func->getFirstBlock();
     if (!firstBlock)
         return false;
 
-    // Lazily generated region tree.
-    CFGSimplificationContext simplificationContext;
-
     IRBuilder builder(func->getModule());
 
     bool changed = false;
@@ -165,6 +576,14 @@ static bool processFunc(IRGlobalValueWithCode* func)
             workList.fastRemoveAt(0);
             while (block)
             {
+                // If all arguments to a phi parameter are the known to be the same,
+                // we can safely replace the phi parameter with the argument.
+                if (block != func->getFirstBlock())
+                {
+                    changed |= simplifyBoolPhiParams(block);
+                    changed |= removeTrivialPhiParams(block);
+                }
+
                 if (auto loop = as<IRLoop>(block->getTerminator()))
                 {
                     // If continue block is unreachable, remove it.
@@ -179,7 +598,7 @@ static bool processFunc(IRGlobalValueWithCode* func)
                     // break at the end of the loop, we can remove the header and turn it into
                     // a normal branch.
                     auto targetBlock = loop->getTargetBlock();
-                    if (isTrivialSingleIterationLoop(func, loop, simplificationContext))
+                    if (isTrivialSingleIterationLoop(func, loop))
                     {
                         builder.setInsertBefore(loop);
                         List<IRInst*> args;
@@ -189,7 +608,22 @@ static bool processFunc(IRGlobalValueWithCode* func)
                         }
                         builder.emitBranch(targetBlock, args.getCount(), args.getBuffer());
                         loop->removeAndDeallocate();
+                        changed = true;
                     }
+                    else if (!doesLoopHasSideEffect(func, loop))
+                    {
+                        // The loop isn't computing anything useful outside the loop.
+                        // We can delete the entire loop.
+                        builder.setInsertBefore(loop);
+                        SLANG_ASSERT(loop->getBreakBlock()->getFirstParam() == nullptr);
+                        builder.emitBranch(loop->getBreakBlock());
+                        loop->removeAndDeallocate();
+                        changed = true;
+                    }
+                }
+                else if (auto condBranch = as<IRIfElse>(block->getTerminator()))
+                {
+                    changed |= trySimplifyIfElse(builder, condBranch);
                 }
 
                 // If `block` does not end with an unconditional branch, bail.
@@ -225,6 +659,7 @@ static bool processFunc(IRGlobalValueWithCode* func)
                 branch->removeAndDeallocate();
                 assert(!successor->hasUses());
                 successor->removeAndDeallocate();
+                break;
             }
             for (auto successor : block->getSuccessors())
             {
diff --git a/source/slang/slang-ir-specialize-function-call.cpp b/source/slang/slang-ir-specialize-function-call.cpp
index 894d46cce..a2ebbc0cf 100644
--- a/source/slang/slang-ir-specialize-function-call.cpp
+++ b/source/slang/slang-ir-specialize-function-call.cpp
@@ -822,6 +822,12 @@ struct FunctionParameterSpecializationContext
                 {
                     decoration->removeAndDeallocate();
                 }
+                else if (as<IRReadNoneDecoration>(decoration))
+                {
+                    // After specialization, the function may no longer be side effect free
+                    // because the parameter we substituted in maybe a global param. 
+                    decoration->removeAndDeallocate();
+                }
             }
         }
 
diff --git a/source/slang/slang-ir-ssa-simplification.cpp b/source/slang/slang-ir-ssa-simplification.cpp
index f06fafcb3..beaaae065 100644
--- a/source/slang/slang-ir-ssa-simplification.cpp
+++ b/source/slang/slang-ir-ssa-simplification.cpp
@@ -10,6 +10,7 @@
 #include "slang-ir-deduplicate-generic-children.h"
 #include "slang-ir-remove-unused-generic-param.h"
 #include "slang-ir-redundancy-removal.h"
+#include "slang-ir-propagate-func-properties.h"
 
 namespace Slang
 {
@@ -29,6 +30,7 @@ namespace Slang
             changed |= peepholeOptimize(module);
             changed |= removeRedundancy(module);
             changed |= simplifyCFG(module);
+            changed |= propagateFuncProperties(module);
 
             // Note: we disregard the `changed` state from dead code elimination pass since
             // SCCP pass could be generating temporarily evaluated constant values and never actually use them.
@@ -41,6 +43,28 @@ namespace Slang
         }
     }
 
+    void simplifyNonSSAIR(IRModule* module)
+    {
+        // Repeat the pass sequence until a full round reports no change, for at most
+        // `kMaxIterations` rounds.
+        const int kMaxIterations = 8;
+        for (int round = 0; round < kMaxIterations; round++)
+        {
+            bool changed = false;
+            changed |= peepholeOptimize(module);
+            changed |= removeRedundancy(module);
+            changed |= simplifyCFG(module);
+
+            // Note: the result of dead code elimination is deliberately not folded into
+            // `changed`. Earlier passes can leave behind newly generated constant values
+            // that are never used; DCE always removes those and always returns true here.
+            eliminateDeadCode(module);
+
+            if (!changed)
+                break;
+        }
+    }
+
+
     void simplifyFunc(IRGlobalValueWithCode* func)
     {
         bool changed = true;
diff --git a/source/slang/slang-ir-ssa-simplification.h b/source/slang/slang-ir-ssa-simplification.h
index ee8343003..39504e102 100644
--- a/source/slang/slang-ir-ssa-simplification.h
+++ b/source/slang/slang-ir-ssa-simplification.h
@@ -10,5 +10,8 @@ namespace Slang
     // until no more changes are possible.
     void simplifyIR(IRModule* module);
 
+    // Run simplifications on IR that is out of SSA form.
+    void simplifyNonSSAIR(IRModule* module);
+
     void simplifyFunc(IRGlobalValueWithCode* func);
 }
diff --git a/source/slang/slang-ir-util.cpp b/source/slang/slang-ir-util.cpp
index 3db036a8d..339521f41 100644
--- a/source/slang/slang-ir-util.cpp
+++ b/source/slang/slang-ir-util.cpp
@@ -157,6 +157,32 @@ IRInst* maybeSpecializeWithGeneric(IRBuilder& builder, IRInst* genericToSpecaili
     return genericToSpecailize;
 }
 
+// Reports whether `dataType`, after unwrapping attributed types and resolving it for
+// decorations, is a basic type or one of the type opcodes listed below.
+bool isValueType(IRInst* dataType)
+{
+    auto resolvedType = getResolvedInstForDecorations(unwrapAttributedType(dataType));
+    if (as<IRBasicType>(resolvedType))
+        return true;
+
+    bool isKnownValueType = false;
+    switch (resolvedType->getOp())
+    {
+    case kIROp_StructType:
+    case kIROp_InterfaceType:
+    case kIROp_ClassType:
+    case kIROp_VectorType:
+    case kIROp_MatrixType:
+    case kIROp_TupleType:
+    case kIROp_ResultType:
+    case kIROp_OptionalType:
+    case kIROp_DifferentialPairType:
+    case kIROp_DynamicType:
+    case kIROp_AnyValueType:
+    case kIROp_ArrayType:
+    case kIROp_FuncType:
+        isKnownValueType = true;
+        break;
+    default:
+        // Anything else is not a known value type.
+        break;
+    }
+    return isKnownValueType;
+}
+
 IRInst* hoistValueFromGeneric(IRBuilder& inBuilder, IRInst* value, IRInst*& outSpecializedVal, bool replaceExistingValue)
 {
     auto outerGeneric = as<IRGeneric>(findOuterGeneric(value));
@@ -402,8 +428,7 @@ bool canInstHaveSideEffectAtAddress(IRGlobalValueWithCode* func, IRInst* inst, I
             {
                 auto callee = call->getCallee();
                 if (callee &&
-                    callee->findDecoration<IRReadNoneDecoration>() &&
-                    callee->findDecoration<IRNoSideEffectDecoration>())
+                    callee->findDecoration<IRReadNoneDecoration>())
                 {
                     // An exception is if the callee is side-effect free and is not reading from
                     // memory.
@@ -423,6 +448,32 @@ bool canInstHaveSideEffectAtAddress(IRGlobalValueWithCode* func, IRInst* inst, I
                     if (canAddressesPotentiallyAlias(func, call->getArg(i), addr))
                         return true;
                 }
+                else if (!isValueType(call->getArg(i)->getDataType()))
+                {
+                    // This is some unknown handle type, we assume it can have any side effects.
+                    return true;
+                }
+            }
+        }
+        break;
+    case kIROp_unconditionalBranch:
+    case kIROp_loop:
+        {
+            auto branch = as<IRUnconditionalBranch>(inst);
+            // If any pointer typed argument of the branch inst may overlap addr, return true.
+            for (UInt i = 0; i < branch->getArgCount(); i++)
+            {
+                SLANG_RELEASE_ASSERT(branch->getArg(i)->getDataType());
+                if (isPtrLikeOrHandleType(branch->getArg(i)->getDataType()))
+                {
+                    if (canAddressesPotentiallyAlias(func, branch->getArg(i), addr))
+                        return true;
+                }
+                else if (!isValueType(branch->getArg(i)->getDataType()))
+                {
+                    // This is some unknown handle type, we assume it can have any side effects.
+                    return true;
+                }
             }
         }
         break;
@@ -434,6 +485,11 @@ bool canInstHaveSideEffectAtAddress(IRGlobalValueWithCode* func, IRInst* inst, I
             if (isPtrLikeOrHandleType(inst->getOperand(0)->getDataType()) &&
                 canAddressesPotentiallyAlias(func, inst->getOperand(0), addr))
                 return true;
+            else if (!isValueType(inst->getOperand(0)->getDataType()))
+            {
+                // This is some unknown handle type, we assume it can have any side effects.
+                return true;
+            }
         }
         break;
     default:
@@ -520,20 +576,17 @@ bool isPureFunctionalCall(IRCall* call)
     auto callee = getResolvedInstForDecorations(call->getCallee());
     if (callee->findDecoration<IRReadNoneDecoration>())
     {
-        return true;
-    }
-    if (callee->findDecoration<IRNoSideEffectDecoration>())
-    {
         // If the function has no side effect and is not writing to any outputs,
         // we can safely treat the call as a normal inst.
         bool hasOutArg = false;
         for (UInt i = 0; i < call->getArgCount(); i++)
         {
-            if (as<IRPtrTypeBase>(call->getArg(i)->getDataType()))
-            {
-                hasOutArg = true;
-                break;
-            }
+            if (isValueType(call->getArg(i)->getDataType()))
+                continue;
+            // If the argument type is not a known value type,
+            // assume it is a pointer or handle through which side effect can take place.
+            hasOutArg = true;
+            break;
         }
         return !hasOutArg;
     }
diff --git a/source/slang/slang-ir-util.h b/source/slang/slang-ir-util.h
index 8a12ab895..62156cad6 100644
--- a/source/slang/slang-ir-util.h
+++ b/source/slang/slang-ir-util.h
@@ -83,6 +83,9 @@ inline bool isScalarIntegerType(IRType* type)
     return getTypeStyle(type->getOp()) == kIROp_IntType;
 }
 
+// No side effect can take place through a value of a "Value" type.
+bool isValueType(IRInst* type);
+
 inline bool isChildInstOf(IRInst* inst, IRInst* parent)
 {
     while (inst)
diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp
index accefc0c9..fd211d05c 100644
--- a/source/slang/slang-ir.cpp
+++ b/source/slang/slang-ir.cpp
@@ -43,7 +43,10 @@ namespace Slang
             case kIROp_PreciseDecoration: 
             case kIROp_PublicDecoration: 
             case kIROp_HLSLExportDecoration: 
-            case kIROp_ReadNoneDecoration: 
+            case kIROp_ReadNoneDecoration:
+            case kIROp_NoSideEffectDecoration:
+            case kIROp_ForwardDifferentiableDecoration:
+            case kIROp_BackwardDifferentiableDecoration:
             case kIROp_RequiresNVAPIDecoration: 
             case kIROp_TriangleAdjInputPrimitiveTypeDecoration:
             case kIROp_TriangleInputPrimitiveTypeDecoration:
@@ -695,6 +698,21 @@ namespace Slang
         }
     }
 
+    void IRUnconditionalBranch::removeArgument(UInt index)
+    {
+        // Argument `index` sits at operand slot `1 + index` for a plain
+        // branch and at `3 + index` for a loop; any other opcode is an error.
+        const auto opcode = getOp();
+        if (opcode == kIROp_unconditionalBranch)
+            removeOperand(1 + index);
+        else if (opcode == kIROp_loop)
+            removeOperand(3 + index);
+        else
+        {
+            SLANG_UNEXPECTED("unhandled unconditional branch opcode");
+        }
+    }
+
     IRInst* IRUnconditionalBranch::getArg(UInt index)
     {
         return getArgs()[index].usedValue;
@@ -5109,6 +5127,17 @@ namespace Slang
         return inst;
     }
 
+    IRInst* IRBuilder::emitNot(IRType* type, IRInst* value)
+    {
+        // Build a `kIROp_Not` instruction whose result type is `type` and
+        // whose single operand is `value`, pass it to addInst, and return
+        // the new instruction to the caller.
+        auto* notInst = createInst<IRInst>(
+            this, kIROp_Not, type, value);
+        addInst(notInst);
+        return notInst;
+    }
+
     IRInst* IRBuilder::emitAdd(IRType* type, IRInst* left, IRInst* right)
     {
         auto inst = createInst<IRInst>(
@@ -6792,6 +6821,17 @@ namespace Slang
         }
     }
 
+    void IRInst::removeOperand(Index index)
+    {
+        // Reject bad indices (else the last operand is silently dropped), then shift down and clear the tail.
+        if (index < 0 || index >= (Index)operandCount)
+            SLANG_UNEXPECTED("operand index out of range");
+        for (Index i = index; i < (Index)operandCount - 1; i++)
+            getOperands()[i].set(getOperand(i + 1));
+        getOperands()[operandCount - 1].clear();
+        operandCount--;
+    }
+
     // Remove this instruction from its parent block,
     // and then destroy it (it had better have no uses!)
     void IRInst::removeAndDeallocate()
@@ -6879,6 +6919,8 @@ namespace Slang
                 // common subexpression elimination, etc.
                 //
                 auto call = cast<IRCall>(this);
+                // If the call has been marked as no-side-effect, we
+                // will treat it so, by-passing all other checks.
                 if (call->findDecoration<IRNoSideEffectDecoration>())
                     return false;
                 return !isPureFunctionalCall(call);
@@ -6894,6 +6936,7 @@ namespace Slang
         case kIROp_Func:
         case kIROp_Generic:
         case kIROp_Var:
+        case kIROp_Param:
         case kIROp_GlobalVar: // Note: the IRGlobalVar represents the *address*, so only a load/store would have side effects
         case kIROp_GlobalConstant:
         case kIROp_GlobalParam:
@@ -7003,12 +7046,6 @@ namespace Slang
         case kIROp_BackwardDifferentiatePropagate:
             return false;
         }
-
-        // Check if the calle has been marked with a catch-all no-side-effect decoration.
-        if (findDecoration<IRNoSideEffectDecoration>())
-        {
-            return false;
-        }
         return true;
     }
 
diff --git a/source/slang/slang-ir.h b/source/slang/slang-ir.h
index 63b7c4ef9..e22ea8a36 100644
--- a/source/slang/slang-ir.h
+++ b/source/slang/slang-ir.h
@@ -744,6 +744,11 @@ struct IRInst
     // for those values.
     void removeArguments();
 
+    // Remove operand `index` from the operand list.
+    // For example, if the inst is `op(a,b,c)`, calling `inst->removeOperand(1)` will result
+    // in `op(a,c)`.
+    void removeOperand(Index index);
+
         /// Transfer any decorations of this instruction to the `target` instruction.
     void transferDecorationsTo(IRInst* target);
 
diff --git a/source/slang/slang-lower-to-ir.cpp b/source/slang/slang-lower-to-ir.cpp
index 681871b6c..d09c35eea 100644
--- a/source/slang/slang-lower-to-ir.cpp
+++ b/source/slang/slang-lower-to-ir.cpp
@@ -8304,6 +8304,11 @@ struct DeclLoweringVisitor : DeclVisitor<DeclLoweringVisitor, LoweredValInfo>
             getBuilder()->addSimpleDecoration<IRRequiresNVAPIDecoration>(irFunc);
         }
 
+        if (decl->findModifier<AlwaysFoldIntoUseSiteAttribute>())
+        {
+            getBuilder()->addSimpleDecoration<IRAlwaysFoldIntoUseSiteDecoration>(irFunc);
+        }
+
         if (decl->findModifier<NoInlineAttribute>())
         {
             getBuilder()->addSimpleDecoration<IRNoInlineDecoration>(irFunc);
author	Yong He <yonghe@outlook.com>	2023-02-24 10:01:47 -0800
committer	GitHub <noreply@github.com>	2023-02-24 10:01:47 -0800
commit	bd6306cdaa4a49344658bd026721b6532e103d09 (patch)
tree	bb7f666d426e6cfc7777a3ccac0a1d628588eb39 /source
parent	e8c08e7ecb1124f115a1d1042277776193122b57 (diff)