From 71439f700b845e8d8336041c6d6824f01b7c9067 Mon Sep 17 00:00:00 2001
From: Yong He <yonghe@outlook.com>
Date: Mon, 5 Feb 2024 16:15:21 -0800
Subject: Add documentation on capability system. (#3549)

Fixes #3454.
---
 docs/_config.yml                                 |   2 +-
 docs/scripts/Program.cs                          |   6 +
 docs/user-guide/00-introduction.md               |   1 +
 docs/user-guide/01-get-started.md                |   1 +
 docs/user-guide/02-conventional-features.md      |   1 +
 docs/user-guide/03-convenience-features.md       |   1 +
 docs/user-guide/04-modules-and-access-control.md |   1 +
 docs/user-guide/05-capabilities.md               | 143 +++++
 docs/user-guide/05-interfaces-generics.md        | 736 ----------------------
 docs/user-guide/06-compiling.md                  | 497 ---------------
 docs/user-guide/06-interfaces-generics.md        | 737 ++++++++++++++++++++++
 docs/user-guide/07-autodiff.md                   | 758 +++++++++++++++++++++++
 docs/user-guide/07-targets.md                    | 366 -----------
 docs/user-guide/08-autodiff.md                   | 757 ----------------------
 docs/user-guide/08-compiling.md                  | 498 +++++++++++++++
 docs/user-guide/09-targets.md                    | 367 +++++++++++
 docs/user-guide/a1-02-slangpy.md                 |   4 +-
 docs/user-guide/toc.html                         | 160 ++---
 18 files changed, 2603 insertions(+), 2433 deletions(-)
 create mode 100644 docs/user-guide/05-capabilities.md
 delete mode 100644 docs/user-guide/05-interfaces-generics.md
 delete mode 100644 docs/user-guide/06-compiling.md
 create mode 100644 docs/user-guide/06-interfaces-generics.md
 create mode 100644 docs/user-guide/07-autodiff.md
 delete mode 100644 docs/user-guide/07-targets.md
 delete mode 100644 docs/user-guide/08-autodiff.md
 create mode 100644 docs/user-guide/08-compiling.md
 create mode 100644 docs/user-guide/09-targets.md

(limited to 'docs')

diff --git a/docs/_config.yml b/docs/_config.yml
index 259a24e4d..72d781d70 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -1 +1 @@
-theme: jekyll-theme-tactile
\ No newline at end of file
+theme: jekyll-theme-tactile
diff --git a/docs/scripts/Program.cs b/docs/scripts/Program.cs
index d7f4ef623..d543f399e 100644
--- a/docs/scripts/Program.cs
+++ b/docs/scripts/Program.cs
@@ -164,6 +164,12 @@ namespace toc
                         node.sections.Add(sectionStr);
                         node.sectionShortTitles.Add(maybeGetShortTitle(sectionStr, content, i));
                     }
+                    if (content[i].StartsWith("permalink:"))
+                    {
+                        var prefixLength = ("permalink:").Length;
+                        var permaPath = content[i].Substring(prefixLength, content[i].Length - prefixLength).Trim();
+                        node.fileID = Path.GetFileName(permaPath);
+                    }
                 }
                 if (node.title == null)
                 {
diff --git a/docs/user-guide/00-introduction.md b/docs/user-guide/00-introduction.md
index a77a63a8f..07b860d9c 100644
--- a/docs/user-guide/00-introduction.md
+++ b/docs/user-guide/00-introduction.md
@@ -1,5 +1,6 @@
 ---
 layout: user-guide
+permalink: /user-guide/introduction
 ---
 
 Introduction
diff --git a/docs/user-guide/01-get-started.md b/docs/user-guide/01-get-started.md
index 27ac956d0..a7cf2ddf1 100644
--- a/docs/user-guide/01-get-started.md
+++ b/docs/user-guide/01-get-started.md
@@ -1,5 +1,6 @@
 ---
 layout: user-guide
+permalink: /user-guide/get-started
 ---
 
 # Getting Started with Slang
diff --git a/docs/user-guide/02-conventional-features.md b/docs/user-guide/02-conventional-features.md
index 0f7efd2b9..4faa6607b 100644
--- a/docs/user-guide/02-conventional-features.md
+++ b/docs/user-guide/02-conventional-features.md
@@ -1,5 +1,6 @@
 ---
 layout: user-guide
+permalink: /user-guide/conventional-features
 ---
 
 Conventional Language Features
diff --git a/docs/user-guide/03-convenience-features.md b/docs/user-guide/03-convenience-features.md
index 2aea1fdff..948400731 100644
--- a/docs/user-guide/03-convenience-features.md
+++ b/docs/user-guide/03-convenience-features.md
@@ -1,5 +1,6 @@
 ---
 layout: user-guide
+permalink: /user-guide/convenience-features
 ---
 
 # Basic Convenience Features
diff --git a/docs/user-guide/04-modules-and-access-control.md b/docs/user-guide/04-modules-and-access-control.md
index 68e2d06c7..e1976effb 100644
--- a/docs/user-guide/04-modules-and-access-control.md
+++ b/docs/user-guide/04-modules-and-access-control.md
@@ -1,5 +1,6 @@
 ---
 layout: user-guide
+permalink: /user-guide/modules
 ---
 
 Modules and Access Control
diff --git a/docs/user-guide/05-capabilities.md b/docs/user-guide/05-capabilities.md
new file mode 100644
index 000000000..c20a43c75
--- /dev/null
+++ b/docs/user-guide/05-capabilities.md
@@ -0,0 +1,143 @@
+---
+layout: user-guide
+permalink: /user-guide/capabilities
+---
+
+# Capabilities
+
+One of the biggest challenges in maintaining cross-platform shader code is to manage the differences in hardware capabilities across different GPUs, graphics APIs, and shader stages.
+Each graphics API or shader stage may expose operations that are not available on other platforms. Instead of restricting Slang's features to the lowest common denominator of different platforms,
+Slang exposes operations from all target platforms to allow the user to take maximum advantage of a specific target.
+
+A consequence of this approach is that the user is now responsible for maintaining compatibility of their code. For example, if the user writes code that uses a Vulkan extension currently not
+available on D3D/HLSL, they will get an error when attempting to compile that code to D3D.
+
+To help the user maintain compatibility of their shader code on the platforms that matter to their applications, Slang's type system can now infer and enforce capability requirements
+to provide assurance that the shader code will be compatible with the specific set of platforms before compiling for that platform.
+
+For example, `Texture2D.SampleCmp` is available on D3D and Vulkan, but not available on CUDA. If the user intends to write cross-platform code that targets CUDA, they will
+receive a type-checking error when attempting to use `SampleCmp` before the code generation stage of compilation. When using Slang's intellisense plugin, the programmer should
+get a diagnostic message directly in their code editor.
+
+As another example, `discard` is a statement that is only meaningful when used in fragment shaders. If a vertex shader contains a `discard` statement or calls a function that contains
+a `discard` statement, it shall be a type-check error.
+
+## Capability Atoms and Capability Requirements
+
+Slang models code generation targets, shader stages, API extensions and hardware features as distinct capability atoms. For example, `GLSL_460` is a capability atom that stands for the GLSL 460 code generation target,
+`compute` is an atom that represents the compute shader stage, `_sm_6_7` is an atom representing the shader model 6.7 feature set in D3D, `SPV_KHR_ray_tracing` is an atom representing the `SPV_KHR_ray_tracing` SPIR-V extension, and `spvShaderClockKHR` is an atom for the `ShaderClockKHR` SPIRV capability. For a complete list of capabilities supported by the Slang compiler, check the [capability definition file](https://github.com/shader-slang/slang/blob/master/source/slang/slang-capabilities.capdef).
+
+A capability **requirement** can be a single capability atom, a conjunction of capability atoms, or a disjunction of conjunctions of capability atoms. A function can declare its
+capability requirement with the following syntax:
+
+```csharp
+[require(spvShaderClockKHR)]
+[require(glsl, GL_EXT_shader_realtime_clock)]
+[require(hlsl_nvapi)]
+uint2 getClock() {...}
+```
+
+Each `[require]` attribute declares a conjunction of capability atoms, and all `[require]` attributes form the final requirement of the `getClock()` function as a disjunction of capabilities:
+```
+(spvShaderClockKHR | glsl + GL_EXT_shader_realtime_clock | hlsl_nvapi)
+```
+
+A capability can __imply__ other capabilities. Here `spvShaderClockKHR` is a capability that implies `SPV_KHR_shader_clock`, which represents the SPIRV `SPV_KHR_shader_clock` extension, and the `SPV_KHR_shader_clock` capability implies `spirv_1_0`, which stands for the spirv code generation target.
+
+When evaluating capability requirements, Slang will expand all implications. Therefore the final capability requirement for `getClock` is:
+```
+  spirv_1_0 + SPV_KHR_shader_clock + spvShaderClockKHR
+| glsl + _GL_EXT_shader_realtime_clock
+| hlsl + hlsl_nvapi
+```
+Which means the function can be called from locations where the `spvShaderClockKHR` capability is available (when targeting SPIRV), or where the `GL_EXT_shader_realtime_clock` extension is available when targeting GLSL,
+or where `nvapi` is available when targeting HLSL.
+
+## Conflicting Capabilities
+
+Certain groups of capabilities are mutually exclusive such that only one capability in the group is allowed to exist. For example, all stage capabilities are mutually exclusive: a requirement for both `fragment` and `vertex` is impossible to satisfy. Currently, capabilities that model different code generation targets (e.g. `hlsl`, `glsl`) or different shader stages (`vertex`, `fragment`, etc.) are mutually exclusive within
+their corresponding group.
+
+If two capability requirements contain different atoms that are conflicting with each other, these two requirements are considered __incompatible__.
+For example, requirement `spvShaderClockKHR + fragment` and requirement `spvShaderClockKHR + vertex` are incompatible, because `fragment` conflicts with `vertex`.
+
+## Requirements in Parent Scope
+
+The capability requirement of a decl is always joined with the requirements declared in its parents.
+For example:
+```csharp
+[require(spvShaderClockKHR)]
+struct MyType
+{
+    [require(spvShaderClockKHR)]
+    void method() { ... }
+}
+```
+`MyType.method` has requirement `spvShaderClockKHR + spvShaderClockKHR`.
+
+## Inferrence of Capability Requirements
+
+By default, Slang will infer the capability requirements of a function given its definition, as long as the function has `internal` or `private` visibility. For example, given:
+```csharp
+void myFunc()
+{
+    if (getClock().x % 1000 == 0)
+        discard;
+}
+```
+Slang will automatically deduce that `myFunc` has capability
+```
+  spirv_1_0 + SPV_KHR_shader_clock + spvShaderClockKHR + fragment
+| glsl + _GL_EXT_shader_realtime_clock + fragment
+| hlsl + hlsl_nvapi + fragment
+```
+since the `discard` statement requires the `fragment` capability.
+
+## Inferrence on target_switch
+
+A `__target_switch` statement will introduce disjunctions in its inferred capability requirement. For example:
+```csharp
+void myFunc()
+{
+    __target_switch
+    {
+    case spirv: ...;
+    case hlsl: ...;
+    }
+}
+```
+The capability requirement of `myFunc` is `(spirv | hlsl)`, meaning that the function can be called from a context where either `spirv` or `hlsl` capability
+is available.
+
+## Capability Aliases
+
+To make it easy to specify capabilities on different platforms, Slang also defines many aliases that can be used in `[require]` attributes.
+For example, Slang declares:
+```
+alias sm_6_6 = _sm_6_6
+             | glsl_spirv_1_5 + sm_6_5
+                + GL_EXT_shader_atomic_int64 + atomicfloat2
+             | spirv_1_5 + sm_6_5
+                + GL_EXT_shader_atomic_int64 + atomicfloat2
+                + SPV_EXT_descriptor_indexing
+             | cuda
+             | cpp;
+```
+So user code can write `[require(sm_6_6)]` to mean that the function requires shader model 6.6 on D3D or an equivalent set of GLSL/SPIRV extensions when targeting GLSL or SPIRV.
+Note that in the above definition, `GL_EXT_shader_atomic_int64` is also an alias that is defined as:
+```
+alias GL_EXT_shader_atomic_int64 = _GL_EXT_shader_atomic_int64 | spvInt64Atomics;
+```
+Where `_GL_EXT_shader_atomic_int64` is the atom that represents the true `GL_EXT_shader_atomic_int64` GLSL extension.
+The `GL_EXT_shader_atomic_int64` alias is defined as a disjunction of `_GL_EXT_shader_atomic_int64` and the `Int64Atomics` SPIRV capability so that
+it can be used in the contexts of both the GLSL and SPIRV targets.
+
+When aliases are used in a `[require]` attribute, the compiler will expand the alias to evaluate the capability set, and remove all incompatible conjunctions.
+For example, `[require(hlsl, sm_6_6)]` will be evaluated to `(hlsl+_sm_6_6)` because all other conjunctions in `sm_6_6` are incompatible with `hlsl`.
+
+## Validation of Capability Requirements
+
+Slang requires all public methods and interface methods to have explicit capability requirements declarations. Omitting capability declaration on a public method means that the method does not require any
+specific capability. Functions with explicit requirement declarations will be verified by the compiler to ensure that they do not use any capability beyond what is declared.
+
+Slang recommends but does not require explicit declaration of capability requirements for entrypoints. If explicit capability requirements are declared on an entrypoint, they will be used to validate the entrypoint the same way as other public methods, providing assurance that the function will work on all intended targets. If an entrypoint does not define explicit capability requirements, Slang will infer the requirements, and only issue a compiler error when the inferred capability is incompatible with the current code generation target.
diff --git a/docs/user-guide/05-interfaces-generics.md b/docs/user-guide/05-interfaces-generics.md
deleted file mode 100644
index 94a08714e..000000000
--- a/docs/user-guide/05-interfaces-generics.md
+++ /dev/null
@@ -1,736 +0,0 @@
----
-layout: user-guide
----
-
-Interfaces and Generics
-===========================
-
-This chapter covers two interrelated Slang language features: interfaces and generics. We will talk about what they are, how do they relate to similar features in other languages, how are they parsed and translated by the compiler, and show examples on how these features simplifies and modularizes shader code.
-
-Interfaces
-----------
-
-Interfaces are used to define the methods and services a type should provide. You can define a interface as the following example:
-```csharp
-interface IFoo
-{
-    int myMethod(float arg);
-}
-```
-
-Slang's syntax for defining interfaces are similar to `interface`s in C# and `protocol`s in Swift. In this example, the `IFoo` interface establishes a contract that any type conforming to this interface must provide a method named `myMethod` that accepts a `float` argument and returns an `int` value.
-
-A `struct` type may declare its conformance to an `interface` via the following syntax:
-```csharp
-struct MyType : IFoo
-{
-    int myMethod(float arg)
-    {
-        return (int)arg + 1;
-    }
-}
-```
-By declaring the conformance to `IFoo`, the definition of `MyType` must include a method named `myMethod` with a matching signature to that defined in the `IFoo` interface to satisfy the declared conformance. If a type misses any methods required by the interface, the Slang compiler will generate an error message.
-
-A `struct` type may declare multiple interface conformances:
-```csharp
-interface IBar { uint myMethod2(uint2 x); }
-
-struct MyType : IFoo, IBar
-{
-    int myMethod(float arg) {...}
-    uint myMethod2(uint2 x) {...}
-}
-```
-In this case, the definition of `MyType` must satisfy the requirements from both the `IFoo` and `IBar` interfaces by providing both the `myMethod` and `myMethod2` methods.
-
-Generics
----------------------
-
-Generics can be used to eliminate duplicate code for shared logic that operates on different types. The following example shows how to define a generic method in Slang.
-
-```csharp
-int myGenericMethod<T: IFoo>(T arg)
-{
-    return arg.myMethod(1.0);
-}
-```
-
-The above listing defines a generic method named `myGenericMethod`, which accepts an argument that can be of any type `T` as long as `T` conforms to the `IFoo` interface. The `T` here is called a _generic type parameter_, and it is associated with an _type constraint_ that any type represented by `T` must conform to the interface `IFoo`.
-
-The following listing shows how to invoke a generic method:
-```csharp
-MyType obj;
-int a = myGenericMethod<MyType>(obj); // OK, explicit type argument
-int b = myGenericMethod(obj); // OK, automatic type deduction
-```
-
-You may explicitly specify the concrete type to used for the generic type argument, by providing the types in angular brackets after the method name, or leave it to the compiler to automatically deduce the type from the argument list.
-
-> #### Note ####
-> Slang currently does not support partial type argument list deduction.
-> For example if you have a generic method that accepts two type arguments:
-> ```
-> void g<T:IFoo, U:IBar>(T a, U b) {...}
-> ```
-> You may either call this method with no explicit type arguments:
-> ```
-> MyType a, b;
-> g(a, b);
-> ```
-> Or with explicit arguments for both generic type parameters:
-> ```
-> g<MyType, MyType>(a,b);
-> ```
-> If you only provide first type argument, Slang will generate an error:
-> ```
-> g<MyType>(a,b); // error, does not work today.
-> ```
-> We plan to support such use in a future version.
-
-
-Note that it is important to associate a generic type parameter with a type constraint. In the above example, although the definition of `myGenericMethod` is agnostic of the concrete type `T` will stand for, knowing that `T` conforms to `IFoo` allows the compiler to type-check and pre-compile `myGenericMethod` without needing to substitute `T` with any concrete types first. Similar to languages like C#, Rust, Swift and Java, leaving out the type constraint declaration on type parameter `T` will result in a compile error at the line calling `arg.myMethod` since the compiler cannot verify that `arg` has a member named `myMethod` without any knowledge on `T`. This is a major difference of Slang's generics compared to _templates_ in C++. 
-
-While C++ templates are a powerful language mechanism, Slang has followed the path of many other modern programming languages to adopt the more structural and restricted generics feature instead. This enables the Slang compiler to perform type checking early to give more readable error messages, and to speed-up compilation by reusing a lot of work for different instantiations of `myGenericMethod`.
-
-
-Supported Constructs in Interface Definitions
------------------------------------------------------
-
-Slang supports many other constructs in addition to ordinary methods as a part of an interface definition.
-
-### Properties
-
-```csharp
-interface IFoo
-{
-    property int count {get; set;}
-}
-```
-The above listing declares that any conforming type must define a property named `count` with both a `getter` and a `setter` method.
-
-### Generic Methods
-
-```csharp
-interface IFoo
-{
-    int compute<T:IBar>(T val);
-}
-```
-The above listing declares that any conforming type must define a generic method named `compute` that has one generic type parameter conforming to the `IBar` interface.
-
-### Static Methods
-
-```csharp
-interface IFoo
-{
-    static int compute(int val);
-};
-```
-
-The above listing declares that any conforming type must define a static method named `compute`. This allows the following generic method to pass type-checking:
-```csharp
-void f<T:IFoo>()
-{
-    T.compute(5); // OK, T has a static method `compute`.
-}
-```
-
-### Static Constants
-
-You can define static constant requirements in an interface. The constants can be accessed in places where a compile-time constant is needed.
-```csharp
-interface IMyValue
-{
-    static const int value;
-}
-struct MyObject2 : IMyValue
-{
-    static const int value = 2;
-}
-struct GetValuePlus1<T:IMyValue>
-{
-    static const int value = T.value + 1;
-}
-
-static const int result = GetValuePlus1<MyObject2>.value;  // result == 3
-```
-
-### `This` Type
-
-You may use a special keyword `This` in interface definitions to refer to the type that is conforming to the interface. The following examples demonstrate a use of `This` type:
-```csharp
-interface IComparable
-{
-    int comparesTo(This other);
-}
-struct MyObject : IComparable
-{
-    int val;
-    int comparesTo(MyObject other)
-    {
-        return val < other.val ? -1 : 1;
-    }
-}
-```
-In this example, the `IComparable` interface declares that any conforming type must provide a `comparesTo` method that performs a comparison between an object to another object of the same type. The `MyObject` type satisfies this requirement by providing a `comparesTo` method that accepts a `MyObject` typed argument, since in the scope of `MyObject`, `This` type is equivalent to `MyObject`.
-
-### Initializers
-
-Consider a generic method that wants to create and initialize a new instance of generic type `T`:
-```csharp
-void f<T:IFoo>()
-{
-    T obj = /*a newly initialized T*/
-}
-```
-One way to implement this is to introduce a static method requirement in `IFoo`:
-```csharp
-interface IFoo
-{
-    static This create();
-}
-```
-With this interface definition, we can define `f` as following:
-```csharp
-void f<T:IFoo>()
-{
-    T obj = T.create();
-}
-```
-
-This solution works just fine, but it would be nicer if you can just write:
-```csharp
-T obj = T();
-```
-Or simply
-```csharp
-T obj;
-```
-And let the compiler invoke the default initializer defined in the type.
-To enable this, you can include an initializer requirement in the interface definition:
-```csharp
-interface IFoo
-{
-    __init();
-}
-```
-
-Initializers with parameters are supported as well. For example:
-```csharp
-interface IFoo
-{
-    __init(int a, int b);
-}
-void g<T:IFoo>()
-{
-    T obj = {1, 2}; // OK, invoking the initializer on T.
-}
-```
-
-Associated Types
--------------------------
-
-When writing code using interfaces and generics, there are some situations where the an interface method needs to return an object whose type is implementation-dependent. For example, consider the following `IFloatContainer` interface that represents a container of `float` values:
-```csharp
-// Represents a container of float values.
-interface IFloatContainer
-{
-    // Returns the number of elements in this container.
-    uint getCount();
-    // Returns an iterator representing the start of the container.
-    Iterator begin();
-    // Returns an iterator representing the end of the container.
-    Iterator end();
-    // Return the element at the location represented by `iter`.
-    float getElementAt(Iterator iter);
-}
-```
-An implementation of the `IFloatContainer` interface may use different types of iterators. For example, an implementation that is simply an array of `float`s can expose `Iterator` as a simple integer index:
-```csharp
-struct ArrayFloatContainer : IFloatContainer
-{
-    float content[10];
-    uint getCount() { return 10; }
-    uint begin() { return 0; }
-    uint end() { return 10; }
-    float getElementAt(uint iter) { return content[iter]; }
-}
-```
-On the other hand, an implementation that uses multiple buffers as the backing storage may use a more complex type to locate an element:
-```csharp
-// Exposes values in two `StructuredBuffer`s as a single container.
-struct MultiArrayFloatContainer : IFloatContainer
-{
-    StructuredBuffer<float> firstBuffer;
-    StructuredBuffer<float> secondBuffer;
-    uint getCount() { return getBufferSize(firstBuffer) + getBufferSize(secondBuffer); }
-
-    // `uint2.x` indicates which buffer, `uint2.y` indicates the index within the buffer.
-    uint2 begin() { return uint2(0,0); }
-    uint2 end() { return uint2 (1, getBufferSize(secondBuffer)); }
-    float getElementAt(uint2 iter)
-    {
-        if (iter.x == 0) return firstBuffer[iter.y];
-        else return secondBuffer[iter.y];
-    }
-}
-```
-
-Ideally, a generic function that wishes to enumerate values in a `IFloatContainer` shouldn't need to care about the implementation details on what the concrete type of `Iterator` is, and we would like to be able to write the following:
-```csharp
-float sum<T:IFloatContainer>(T container)
-{
-    float result = 0.0f;
-    for (T.Iterator iter = container.begin(); iter != container.end(); iter=iter.next())
-    {
-        float val = container.getElementAt(iter);
-        result += val;
-    }
-    return result;
-}
-```
-Here the `sum` function simply wants to access all the elements and sum them up. The details of what the `Iterator` type actually is does not matter to the definition of `sum`.
-
-The problem is that the `IFloatContainer` interface definition requires methods like `begin()`, `end()` and `getElementAt()` to refer to a iterator type that is implementation dependent. How should the signature of these methods be defined in the interface? The answer is to use _associated types_.
-
-In addition to constructs listed in the previous section, Slang also supports defining associated types in an `interface` definition. An associated type can be defined as following.
-```csharp
-// The interface for an iterator type.
-interface IIterator
-{
-    // An iterator needs to know how to move to the next element.
-    This next();
-}
-
-interface IFloatContainer
-{
-    // Requires an implementation to define a typed named `Iterator` that
-    // conforms to the `IIterator` interface.
-    associatedtype Iterator : IIterator;
-
-    // Returns the number of elements in this container.
-    uint getCount();
-    // Returns an iterator representing the start of the container.
-    Iterator begin();
-    // Returns an iterator representing the end of the container.
-    Iterator end();
-    // Return the element at the location represented by `iter`.
-    float getElementAt(Iterator iter);
-};
-```
-
-This `associatedtype` definition in `IFloatContainer` requires that all types conforming to this interface must also define a type in its scope named `Iterator`, and this iterator type must conform to the `IIterator` interface. An implementation to the `IFloatContainer` interface by using either a `typedef` declaration or a `struct` definition inside its scope to satisfy the associated type requirement. For example, the `ArrayFloatContainer` can be implemented as following:
-```csharp
-struct ArrayIterator : IIterator
-{
-    uint index;
-    __init(int x) { index = x; }
-    ArrayIterator next()
-    {
-        return ArrayIterator(index + 1);
-    }
-}
-struct ArrayFloatContainer : IFloatContainer
-{
-    float content[10];
-
-    // Specify that the associated `Iterator` type is `ArrayIterator`.
-    typedef ArrayIterator Iterator;
-
-    Iterator getCount() { return 10; }
-    Iterator begin() { return ArrayIterator(0); }
-    Iterator end() { return ArrayIterator(10); }
-    float getElementAt(Iterator iter) { return content[iter.index]; }
-}
-```
-
-Alternatively, you may also define the `Iterator` type directly inside a `struct` implementation, as in the following definition for `MultiArrayFloatContainer`:
-```csharp
-// Exposes values in two `StructuredBuffer`s as a single container.
-struct MultiArrayFloatContainer : IFloatContainer
-{
-    // Represents an iterator of this container
-    struct Iterator : IIterator
-    {
-        // `index.x` indicates which buffer the element is located in.
-        // `index.y` indicates which the index of the element inside the buffer.
-        uint2 index;
-
-        // We also need to keep a size of the first buffer so we know when to
-        // switch to the second buffer.
-        uint firstBufferSize;
-
-        // Implementation of IIterator.next()
-        Iterator next()
-        {
-            Iterator result;
-            result.index.x = index.x;
-            result.index.y = index.y + 1;
-            // If we are at the end of the first buffer,
-            // move to the head of the second buffer
-            if (result.index.x == 0 && result.index.y == firstBufferSize)
-            {
-                result.index = uint2(1, 0);
-            }
-            return result;
-        }
-    }
-
-    StructuredBuffer<float> firstBuffer;
-    StructuredBuffer<float> secondBuffer;
-    uint getCount() { return getBufferSize(firstBuffer) + getBufferSize(secondBuffer); }
-
-    Iterator begin()
-    {
-        Iterator iter;
-        iter.index = uint2(0, 0);
-        iter.firstBufferSize = getBufferSize(firstBuffer);
-        return iter;
-    }
-    Iterator end()
-    {
-        Iterator iter;
-        iter.index = uint2(1, getBufferSize(secondBuffer));
-        iter.firstBufferSize = 0;
-        return iter;
-    }
-    float getElementAt(Iterator iter)
-    {
-        if (ite.indexr.x == 0) return firstBuffer[iter.index.y];
-        else return secondBuffer[iter.index.y];
-    }
-}
-```
-
-In summary, an `asssociatedtype` requirement in an interface is similar to other types of requirements: a method requirement means that an implementation must provide a method matching the interface signature, while an `associatedtype` requirement means that an implementation must provide a type in its scope with the matching name and interface constraint. In general, when defining an interface that is producing and consuming an object whose actual type is implementation-dependent, the type of this object can often be modeled as an associated type in the interface.
-
-### Comparison to the C++ Approach
-Readers who are familiar with C++ could easily relate the `Iterator` example in previous subsection to the implementation of STL. In C++, the `sum` function can be easily written with templates:
-```C++
-template<typename TContainer>
-float sum(const TContainer& container)
-{
-    float result = 0.0f;
-    // Assumes `TContainer` has a type `Iterator` that supports `operator++`.
-    for (TContainer::Iterator iter = container.begin(); iter != container.end(); ++iter)
-    {
-        result += container.getElementAt(iter);
-    }
-    return result;
-}
-```
-
-A C++ programmer can implement `ArrayFloatContainer` as following:
-```C++
-struct ArrayFloatContainer
-{
-    float content[10];
-
-    typedef uint32_t Iterator;
-
-    Iterator getCount() { return 10; }
-    Iterator begin() { return 0; }
-    Iterator end() { return 10; }
-    float getElementAt(Iterator iter) { return content[iter]; }
-};
-```
-Because C++ does not require a template function to define _constraints_ on the templated type, there are no interfaces or inheritances involved in the definition of `ArrayFloatContainer`. However `ArrayFloatContainer` still needs to define what its `Iterator` type is, so the `sum` function can be successfully specialized with an `ArrayFloatContainer`.
-
-Note that the biggest difference between C++ templates and generics is that templates are not type-checked prior to specialization, and therefore the code that consumes a templated type (`TContainer` in this example) can simply assume `container` has a method named `getElementAt`, and the `TContainer` scope provides a type definition for `TContainer::Iterator`. Compiler error only arises when the programmer is attempting to specialize the `sum` function with a type that does not meet these assumptions. Contrarily, Slang requires all possible uses of a generic type be declared through an interface. By stating that `TContainer:IContainer` in the generics declaration, the Slang compiler can verify that `container.getElementAt` is calling a valid function. Similarily, the interface also tells the compiler that `TContainer.Iterator` is a valid type and enables the compiler to fully type check the `sum` function without specializing it first.
-
-### Similarity to Swift and Rust
-
-Slang's `associatedtype` shares the same semantic meaning with `associatedtype` in a Swift `protocol` or `type` in a Rust `trait`, except that Slang currently does not support the more general `where` clause in these languages. C# does not have an equivalent to `associatedtype`, and programmers need to resort to generic interfaces to achieve similar goals.
-
-Generic Value Parameters
--------------------------------
-
-So far we have demonstrated generics with _type parameters_. Additionally, Slang also supports generic _value_ parameters.
-The following listing shows an example of generic value parameters.
-```csharp
-struct Array<T, let N : int>
-{
-    T arrayContent[N];
-}
-```
-In this example, the `Array` type has a generic type parameter, `T`, that is used as the element type of the `arrayContent` array, and a generic value parameter `N` of integer type.
-
-Note that the builtin `vector<float, N>` type also has a generic value parameter `N`.
-
-> #### Note ####
-> The only types allowed for generic value parameters are `int`, `uint` and `bool`. `float` and
-> other types cannot be used in a generic value parameter. Computations in a type
-> expression are supported as long as they can be evaluated at compile time. For example,
-> `vector<float, 1+1>` is allowed and considered equivalent to `vector<float, 2>`.
-
-
-Interface-typed Values
--------------------------------
-
-So far we have been using interfaces as constraints to generic type parameters. For example, the following listing defines a generic function with a type parameter `TTransform` constrained by interface `ITransform`:
-
-```csharp
-interface ITransform
-{
-    int compute(MyObject obj);
-}
-
-// Defining a generic method:
-int apply<TTransform : ITransform>(TTransform transform, MyObject object)
-{
-    return transform.compute(object);
-}
-```
-
-While Slang's syntax for defining generic methods bears similarity to generics in C#/Java and templates in C++ and should be easy for users who are familiar with these languages, codebases that make heavy use of generics can quickly become verbose and difficult to read. To reduce the amount of boilerplate, Slang supports an alternate way to define the `apply` method by using the interface type `ITransform` as parameter type directly:
-
-```csharp
-// A method that is equivalent to `apply` but uses simpler syntax:
-int apply_simple(ITransform transform, MyObject object)
-{
-    return transform.compute(object);
-}
-```
-
-Instead of defining a generic type parameter `TTransform` and a method parameter `transform` that has `TTransform` type, you can simply define the same `apply` function like a normal method, with a `transform` parameter whose type is an interface. From the Slang compiler's view, `apply` and `apply_simple` will be compiled to the same target code.
-
-In addition to parameters, Slang allows variables, and function return values to have an interface type as well:
-```csharp
-ITransform test(ITransform arg)
-{
-    ITransform v = arg;
-    return v;
-}
-```
-
-### Restrictions and Caveats
-
-The Slang compiler always attempts to determine the actual type of an interface-typed value at compile time and specialize the code with the actual type. As long as the compiler can successfully determine the actual type, code that uses interface-typed values is equivalent to code written in the generics syntax. However, when interface types are used in function return values, the compiler will not be able to trivially propagate type information. For example:
-```csharp
-ITransform getTransform(int x)
-{
-    if (x == 0)
-    {
-        Type1Transform rs = {};
-        return rs;
-    }
-    else
-    {
-        Type2Transform rs = {};
-        return rs;
-    }
-}
-```
-In this example, the actual type of the return value is dependent on the value of `x`, which may not be known at compile time. This means that the concrete type of the return value at invocation sites of `getTransform` may not be statically determinable. When the Slang compiler cannot infer the concrete type of an interface-type value, it will generate code that performs a dynamic dispatch based on the concrete type of the value at runtime, which may introduce performance overhead. Note that this behavior applies to function return values in the form of `out` parameters as well:
-
-```csharp
-void getTransform(int x, out ITransform transform)
-{
-    if (x == 0)
-    {
-        Type1Transform rs = {};
-        transform = rs;
-    }
-    else
-    {
-        Type2Transform rs = {};
-        transform = rs;
-    }
-}
-```
-This `getTransform` definition can also result in dynamic dispatch code since the type of `transform` may not be statically determinable.
-
-When the compiler is generating dynamic dispatch code for interface-typed values, it requires the concrete type of the interface-typed value to be free of any opaque-typed fields (e.g. resources and buffer types). A compiler error will be generated upon such attempts:
-```csharp
-struct MyTransform : ITransform
-{
-    StructuredBuffer<int> buffer;
-    int compute(MyObject obj)
-    {
-        return buffer[0];
-    }
-}
-
-ITransform getTransform(int x)
-{
-    MyTransform rs;
-    // Error: cannot use an opaque value as an interface-typed return value.
-    return rs;
-}
-```
-
-Assigning different values to a mutable interface-typed variable also undermines the compiler's ability to statically determine the type of the variable, and is not supported by the Slang compiler today:
-```csharp
-void test(int x)
-{
-    ITransform t = Type1Transform();
-    // Do something ...
-    // Assign a different type of transform to `t`:
-    // (Not supported by Slang today)
-    t = Type2Transform();
-    // Do something else...
-}
-```
-
-In general, if the use of interface-typed values is restricted to function parameters only, then all code that involves interface-typed values will be compiled the same way as if the code is written using standard generics syntax.
-
-
-Extending a Type with Additional Interface Conformances
------------------------------
-In the previous chapter, we introduced the `extension` feature that lets you define new members to an existing type in a separate location outside the original definition of the type. 
-
-`extensions` can be used to make an existing type conform to additional interfaces. Suppose we have an interface `IFoo` and a type `MyObject` that implements the interface:
-
-```csharp
-interface IFoo
-{
-    int foo();
-};
-
-struct MyObject : IFoo
-{
-    int foo() { return 0; }
-}
-```
-
-Now we introduce another interface, `IBar`:
-```csharp
-interface IBar
-{
-    float bar();
-}
-```
-
-We can define an `extension` to make `MyObject` conform to `IBar` as well:
-```csharp
-extension MyObject : IBar
-{
-    float bar() { return 1.0f; }
-}
-```
-
-With this extension, we can use `MyObject` in places that expect an `IBar` as well:
-```csharp
-void use(IBar b)
-{
-    b.bar();
-}
-
-void test()
-{
-    MyObject obj;
-    use(obj); // OK, `MyObject` is extended to conform to `IBar`.
-}
-```
-
-You may define more than one interface conformance in a single `extension`:
-```csharp
-interface IBar2
-{
-    float bar2();
-}
-extension MyObject : IBar, IBar2
-{
-    float bar() { return 1.0f; }
-    float bar2() { return 2.0f; }
-}
-```
-
-`is` and `as` Operator
-----------------------------
-
-You can use the `is` operator to test if an interface-typed value is of a specific concrete type, and use the `as` operator to downcast the value into a specific type.
-The `as` operator returns an `Optional<T>` that is not `none` if the downcast succeeds.
-
-```csharp
-interface IFoo
-{
-    int foo();
-}
-struct MyImpl : IFoo
-{
-    int foo() { return 0; }
-}
-void test(IFoo foo)
-{
-    bool t = foo is MyImpl; // true
-    Optional<MyImpl> optV = foo as MyImpl;
-    if (t == (optV != none))
-        printf("success");
-    else
-        printf("fail");
-}
-void main()
-{
-    MyImpl v;
-    test(v);
-}
-// Result:
-// "success"
-```
-
-
-Extensions to Interfaces
------------------------------
-
-In addition to extending ordinary types, you can define extensions on interfaces as well:
-```csharp
-// An example interface.
-interface IFoo
-{
-    int foo();
-}
-
-// Extending `IFoo` with a new method requirement
-// with a default implementation.
-extension IFoo
-{
-    int bar() { return 0; }
-}
-
-int use(IFoo foo)
-{
-    // With the extension, all uses of `IFoo` typed values
-    // can assume there is a `bar` method.
-    return foo.bar();
-}
-```
-
-Although the syntax of above listing suggests that we are extending an interface with additional requirements, this interpretation does not make logical sense in many ways. Consider a type `MyType` that exists before the extension is defined:
-```csharp
-struct MyType : IFoo
-{
-    int foo() { return 0; }
-}
-```
-
-If we extend the `IFoo` with new requirements, the existing `MyType` definition would become invalid since `MyType` no longer provides implementations to all interface requirements. Instead, what an `extension` on an interface `IFoo` means is that for all types that conform to the `IFoo` interface and do not have a `bar` method defined, add a `bar` method defined in this extension to that type so that all `IFoo` typed values have a `bar` method defined. If a type already defines a matching `bar` method, then the existing method will always override the default method provided in the extension:
-
-```csharp
-interface IFoo
-{
-    int foo();
-}
-struct MyFoo1 : IFoo
-{
-    int foo() { return 0; }
-}
-extension IFoo
-{
-    int bar() { return 0; }
-}
-struct MyFoo2 : IFoo
-{
-    int foo() { return 0; }
-    int bar() { return 1; }
-}
-void test()
-{
-    MyFoo1 f1;
-    MyFoo2 f2;
-    int a = f1.bar(); // a == 0, calling the method in the extension.
-    int b = f2.bar(); // b == 1, calling the existing method in `MyFoo2`.
-}
-```
-This feature is similar to extension traits in Rust.
diff --git a/docs/user-guide/06-compiling.md b/docs/user-guide/06-compiling.md
deleted file mode 100644
index daeefc0e6..000000000
--- a/docs/user-guide/06-compiling.md
+++ /dev/null
@@ -1,497 +0,0 @@
----
-layout: user-guide
----
-
-Compiling Code with Slang
-=========================
-
-This chapter presents the ways that the Slang system supports compiling and composing shader code.
-We will start with a discussion of the mental model that Slang uses for compilation.
-Next we will cover the command-line Slang compiler, `slangc`, and how to use it to perform offline compilation.
-Finally we will discuss the Slang compilation API, which can be used to integrate Slang compilation into an application at runtime, or to build custom tools that implement application-specific compilation policy.
-
-Concepts
---------
-
-For simple scenarios it may be enough to think of a shader compiler as a box where source code goes in and compiled kernels come out.
-Most real-time graphics applications end up needing more control over shader compilation, and/or more information about the results of compilation.
-In order to make use of the services provided by the Slang compilation system, it is useful to start with a clear model of the concepts that are involved in compilation.
-
-### Source Units
-
-At the finest granularity, code is fed to the compiler in _source units_ which are most often stored as files on disk or strings of text in memory.
-The compilation model largely does not care whether source units have been authored by human programmers or automatically assembled by other tools.
-
-If multiple source units are specified as part of the same compile, they will be preprocessed and parsed independently.
-However, a source unit might contain `#include` directives, so that the preprocessed text of that source unit includes the content of other files.
-Note that the `#include`d files do not become additional source units; they are just part of the text of a source unit that was fed to the compiler.
-
-### Translation Units and Modules
-
-Source units (such as files) are grouped into _translation units_, and each translation unit will produce a single _module_ when compiled.
-
-While the source units are all preprocessed and parsed independently, semantic checking is applied to a translation unit as a whole.
-One source file in a translation unit may freely refer to declarations in another source file of the same translation unit without any need for forward declarations. For example:
-
-```hlsl
-// A.slang
-
-float getFactor() { return 10.0; }
-```
-
-```hlsl
-// B.slang
-
-float scaleValue(float value)
-{
-    return value * getFactor();
-}
-```
-
-In this example, the `scaleValue()` function in `B.slang` can freely refer to the `getFactor()` function in `A.slang` because they are part of the same translation unit.
-
-It is allowed, and indeed common, for a translation unit to contain only a single source unit.
-For example, when adapting an existing codebase with many `.hlsl` files, it is appropriate to compile each `.hlsl` file as its own translation unit.
-A modernized codebase might decide to compile multiple `.slang` files in a single directory as a single translation unit.
-
-The result of compiling a translation unit is a module in Slang's internal intermediate representation (IR).
-
-### Entry Points
-
-A translation unit / module may contain zero or more entry points.
-Slang supports two models for identifying entry points when compiling.
-
-#### Entry Point Attributes
-
-By default, the compiler will scan a translation unit for function declarations marked with the `[shader(...)]` attribute; each such function will be identified as an entry point in the module.
-Developers are encouraged to use this model because it directly documents intention and makes source code less dependent on external compiler configuration options.
-
-#### Explicit Entry Point Options
-
-For compatibility with existing code, the Slang compiler also supports explicit specification of entry point functions using configuration options external to shader source code.
-When these options are used the compiler will *ignore* all `[shader(...)]` attributes and only use the explicitly-specified entry points instead.
-
-### Shader Parameters
-
-A translation unit / module may contain zero or more global shader parameters.
-Similarly, each entry point may define zero or more entry-point `uniform` shader parameters.
-
-The shader parameters of a module or entry point are significant because they describe the interface between host application code and GPU code.
-It is important that both the application and generated GPU kernel code agree on how parameters are laid out in memory and/or how they are assigned to particular API-defined registers, locations, or other "slots."
-
-### Targets
-
-Within the Slang system a _target_ represents a particular platform and set of capabilities that output code can be generated for.
-A target includes information such as:
-
-* The _format_ that code should be generated in: SPIR-V, DXIL, etc.
-
-* A _profile_ that specifies a general feature/capability level for the target: D3D Shader Model 5.1, GLSL version 4.60, etc.
-
-* Optional _capabilities_ that should be assumed available on the target: for example, specific Vulkan GLSL extensions
-
-* Options that impact code generation: floating-point strictness, level of debug information to generate, etc.
-
-Slang supports compiling for multiple targets in the same compilation session.
-When using multiple targets at a time, it is important to understand the distinction between the _front-end_ of the compiler, and the _back-end_:
-
-* The compiler front-end comprises preprocessing, parsing, and semantic checking. The front-end runs once for each translation unit and its results are shared across all targets.
-
-* The compiler back-end generates output code, and thus runs once per target.
-
-> #### Note ####
-> Because front-end actions, including preprocessing, only run once, across all targets, the Slang compiler does not automatically provide any target-specific preprocessor `#define`s that can be used for preprocessor conditionals.
-> Applications that need target-specific `#define`s should always compile for one target at a time, and set up their per-target preprocessor state manually.
-
-### Layout
-
-While the front-end of the compiler determines what the shader parameters of a module or entry point are, the _layout_ for those parameters is dependent on a particular compilation target.
-A `Texture2D` might consume a `t` register for Direct3D, a `binding` for Vulkan, or just plain bytes for CUDA.
-
-The details of layout in Slang will come in a later chapter.
-For the purposes of the compilation model it is important to note that the layout computed for shader parameters depends on:
-
-* What modules and entry points are being used together; these define which parameters are relevant.
-
-* Some well-defined ordering of those parameters; this defines which parameters should be laid out before which others.
-
-* The rules and constraints that the target imposes on layout.
-
-An important design choice in Slang is to give the user of the compiler control over these choices.
-
-### Composition
-
-The user of the Slang compiler communicates the modules and entry points that will be used together, as well as their relative order, using a system for _composition_.
-
-A _component type_ is a unit of shader code composition; both modules and entry points are examples of component types.
-A _composite_ component type is formed from a list of other component types (for example, one module and two entry points) and can be used to define a unit of shader code that is meant to be used together.
-
-Once a programmer has formed a composite of all the code they intend to use together, they can query the layout of the shader parameters in that composite, or request kernel code generation for its entry points.
-
-### Kernels
-
-A _kernel_ is generated code for an entry point.
-The same entry point can be used to generate many different kernels.
-First, an entry point can be compiled for different targets, resulting in different kernels in the appropriate format for each target.
-Second, different compositions of shader code can result in different layouts, which leads to different kernels being required.
-
-Command-Line Compilation with `slangc`
---------------------------------------
-
-The `slangc` tool, included in binary distributions of Slang, is a command-line compiler that can handle most simple compilation tasks.
-`slangc` is intended to be usable as a replacement for tools like `fxc` and `dxc`, and covers most of the same use cases.
-
-### Example
-
-Here we will repeat the example used in the [Getting Started](01-get-started.md) chapter.
-Given the following Slang code:
-
-```hlsl
-// hello-world.slang
-StructuredBuffer<float> buffer0;
-StructuredBuffer<float> buffer1;
-RWStructuredBuffer<float> result;
-
-[shader("compute")]
-[numthreads(1,1,1)]
-void computeMain(uint3 threadId : SV_DispatchThreadID)
-{
-    uint index = threadId.x;
-    result[index] = buffer0[index] + buffer1[index];
-}
-```
-
-we can compile the `computeMain()` entry point to SPIR-V using the following command line:
-
-```bat
-slangc hello-world.slang -entry computeMain -target spirv -o hello-world.spv
-```
-
-### Source Files and Translation Units
-
-The `hello-world.slang` argument here is specifying an input file.
-Each input file specified on the command line will be a distinct source unit during compilation.
-Slang supports multiple file-name extensions for input files, but the most common ones will be `.hlsl` for existing HLSL code, and `.slang` for files written specifically for Slang.
-
-If multiple source files are passed to `slangc`, they will be grouped into translation units using the following rules:
-
-* If there are any `.slang` files, then all of them will be grouped into a single translation unit
-
-* Each `.hlsl` file will be grouped into a distinct translation unit of its own
-
-### Entry Points
-
-When using `slangc`, you will typically want to identify which entry point(s) you intend to compile.
-The `-entry computeMain` option selects an entry point to be compiled to output code in this invocation of `slangc`.
-
-Because the `computeMain()` entry point in this example has a `[shader(...)]` attribute, the compiler is able to deduce that it should be compiled for the `compute` stage.
-In code that does not use `[shader(...)]` attributes, a `-entry` option should be followed by a `-stage` option to specify the stage of the entry point:
-
-```bat
-slangc hello-world.slang -entry computeMain -stage compute -o hello-world.spv
-```
-
-### Targets
-
-Our example uses the option `-target spirv` to introduce a compilation target; in this case, code will be generated as SPIR-V.
-The argument of a `-target` option specifies the format to use for the target; common values are `dxbc`, `dxil`, and `spirv`.
-
-Additional options for a target can be specified after the `-target` option.
-For example, a `-profile` option can be used to specify a profile that should be used.
-Slang provides two main kinds of profiles for use with `slangc`:
-
-* Direct3D "Shader Model" profiles have names like `sm_5_1` and `sm_6_3`
-
-* GLSL versions can be used as profile with names like `glsl_430` and `glsl_460`
-
-### Kernels
-
-A `-o` option indicates that kernel code should be written to a file on disk.
-In our example, the SPIR-V kernel code for the `computeMain()` entry point will be written to the file `hello-world.spv`.
-
-### Working with Multiples
-
-It is possible to use `slangc` with multiple input files, entry points, or targets.
-In these cases, the ordering of arguments on the command line becomes significant.
-
-When an option modifies or relates to another command-line argument, it implicitly applies to the most recent relevant argument.
-For example:
-
-* If there are multiple input files, then an `-entry` option applies to the preceding input file
-
-* If there are multiple entry points, then a `-stage` option applies to the preceding `-entry` option
-
-* If there are multiple targets, then a `-profile` option applies to the preceding `-target` option
-
-Kernel `-o` options are the most complicated case, because they depend on both a target and entry point.
-A `-o` option applies to the preceding entry point, and the compiler will try to apply it to a matching target based on its file extension.
-For example, a `.spv` output file will be matched to a `-target spirv`.
-
-The compiler makes a best effort to support complicated cases with multiple files, entry points, and targets.
-Users with very complicated compilation requirements will probably be better off using multiple `slangc` invocations or migrating to the compilation API.
-
-### Additional Options
-
-The main other options are:
-
-* `-D<name>` or `-D<name>=<value>` can be used to introduce preprocessor macros.
-
-* `-I<path>` or `-I <path>` can be used to introduce a _search path_ to be used when resolving `#include` directives and `import` declarations.
-
-* `-g` can be used to enable inclusion of debug information in output files (where possible and implemented)
-
-* `-O<level>` can be used to control optimization levels when the Slang compiler invokes downstream code generator
-
-### Convenience Features
-
-The `slangc` compiler provides a few conveniences for command-line compilation:
-
-* Most options can appear out of order when they are unambiguous. For example, if there is only a single translation unit a `-entry` option can appear before or after any file.
-
-* A `-target` option can be left out if it can be inferred from the only `-o` option present. For example, `-o hello-world.spv` already implies `-target spirv`.
-
-* If a `-o` option is left out then kernel code will be written to the standard output. This output can be piped to a file, or can be printed to a console. In the latter case, the compiler will automatically disassemble binary formats for printing.
-
-### Limitations
-
-The `slangc` tool is meant to serve the needs of many developers, including those who are currently using `fxc`, `dxc`, or similar tools.
-However, some applications will benefit from deeper integration of the Slang compiler into application-specific code and workflows.
-Notable features that Slang supports which cannot be accessed from `slangc` include:
-
-* Slang can provide _reflection_ information about shader parameters and their layouts for particular targets; this information is not currently output by `slangc`.
-
-* Slang allows applications to control the way that shader modules and entry points are composed (which in turn influences their layout); `slangc` currently implements a single default policy for how to generate a composition of shader code.
-
-Applications that need more control over compilation are encouraged to use the C++ compilation API described in the next section.
-
-Using the Compilation API
--------------------------
-
-The C++ API provided by Slang is meant to provide more complete control over compilation for applications that need it.
-The additional level of control means that some tasks require more individual steps than they would when using a one-size-fits-all tool like `slangc`.
-
-### "COM-lite" Components
-
-Many parts of the Slang C++ API use interfaces that follow the design of COM (the Component Object Model).
-Some key Slang interfaces are binary-compatible with existing COM interfaces.
-However, the Slang API does not depend on any runtime aspects of the COM system, even on Windows; the Slang system can be seen as a "COM-lite" API.
-
-The `ISlangUnknown` interface is equivalent to (and binary-compatible with) the standard COM `IUnknown`.
-Application code is expected to correctly maintain the reference counts of `ISlangUnknown` objects returned from API calls; the `Slang::ComPtr<T>` "smart pointer" type is provided as an optional convenience for applications that want to use it.
-
-Many Slang API calls return `SlangResult` values; this type is equivalent to (and binary-compatible with) the standard COM `HRESULT` type.
-As a matter of convention, Slang API calls return a zero value (`SLANG_OK`) on success, and a negative value on errors.
-
-### Creating a Global Session
-
-A Slang _global session_ uses the interface `slang::IGlobalSession` and it represents a connection from an application to a particular implementation of the Slang API.
-A global session is created using the function `slang::createGlobalSession()`:
-
-```c++
-Slang::ComPtr<IGlobalSession> globalSession;
-slang::createGlobalSession(globalSession.writeRef());
-```
-
-When a global session is created, the Slang system will load its internal representation of the _standard library_ that the compiler provides to user code.
-The standard library can take a significant amount of time to load, so applications are advised to use a single global session if possible, rather than creating and then disposing of one for each compile.
-
-> #### Note ####
-> Currently, the global session type is *not* thread-safe.
-> Applications that wish to compile on multiple threads will need to ensure that each concurrent thread compiles with a distinct global session.
-
-### Creating a Session
-
-A _session_ uses the interface `slang::ISession`, and represents a scope for compilation with a consistent set of compiler options.
-In particular, all compilation with a single session will share:
-
-* A list of enabled compilation targets (with their options)
-
-* A list of search paths (for `#include` and `import`)
-
-* A list of pre-defined macros
-
-In addition, a session provides a scope for the loading and re-use of modules.
-If two pieces of code compiled in a session both `import`  the same module, then that module will only be loaded and compiled once.
-
-To create a session, use the `IGlobalSession::createSession()` method:
-
-```c++
-SessionDesc sessionDesc;
-/* ... fill in `sessionDesc` ... */
-Slang::ComPtr<ISession> session;
-globalSession->createSession(sessionDesc, session.writeRef());
-```
-
-#### Targets
-
-The `SessionDesc::targets` array can be used to describe the list of targets that the application wants to support in a session.
-Often, this will consist of a single target.
-
-Each target is described with a `TargetDesc` which includes options to control code generation for the target.
-The most important fields of the `TargetDesc` are the `format` and `profile`; most others can be left at their default values.
-
-The `format` field should be set to one of the values from the `SlangCompileTarget` enumeration.
-For example:
-
-```c++
-TargetDesc targetDesc;
-targetDesc.format = SLANG_FORMAT_SPIRV;
-```
-
-The `profile` field must be set with the ID of one of the profiles supported by the Slang compiler.
-The exact numeric value of the different profiles is not currently stable across compiler versions, so applications should look up a chosen profile using `IGlobalSession::findProfile`.
-For example:
-
-```c++
-targetDesc.profile = globalSession->findProfile("glsl_450");
-```
-
-Once the chosen `TargetDesc`s have been initialized, they can be attached to the `SessionDesc`:
-
-```c++
-sessionDesc.targets = &targetDesc;
-sessionDesc.targetCount = 1;
-```
-
-#### Search Paths
-
-The search paths on a session provide the paths where the compiler will look when trying to resolve a `#include` directive or `import` declaration.
-The search paths can be set in the `SessionDesc` as an array of `const char*`:
-
-```c++
-const char* searchPaths[] = { "myapp/shaders/" };
-sessionDesc.searchPaths = searchPaths;
-sessionDesc.searchPathCount = 1;
-```
-
-#### Pre-Defined Macros
-
-The pre-defined macros in a session will be visible at the start of each source unit that is compiled, including source units loaded via `import`.
-Each pre-defined macro is described with a `PreprocessorMacroDesc`, which has `name` and `value` fields:
-
-```c++
-PreprocessorMacroDesc fancyFlag = { "ENABLE_FANCY_FEATURE", "1" };
-sessionDesc.preprocessorMacros = &fancyFlag;
-sessionDesc.preprocessorMacroCount = 1;
-```
-
-### Loading a Module
-
-The simplest way to load code into a session is with `ISession::loadModule()`:
-
-```c++
-Slang::ComPtr<IModule> module = session->loadModule("MyShaders");
-```
-
-Executing `loadModule("MyShaders")` in host C++ code is similar to using `import MyShaders` in Slang code.
-The session will search for a matching module (usually in a file called `MyShaders.slang`) and will load and compile it (if it hasn't been done already).
-
-Note that `loadModule()` does not provide any ways to customize the compiler configuration for that specific module.
-The preprocessor environment, search paths, and targets will always be those specified for the session.
-
-### Capturing Diagnostic Output
-
-Compilers produce various kinds of _diagnostic_ output when compiling code.
-This includes not only error messages when compilation fails, but also warnings and other helpful messages that may be produced even for successful compiles.
-
-Many operations in Slang, such as `ISession::loadModule()` can optionally produce a _blob_ of diagnostic output.
-For example:
-
-```c++
-Slang::ComPtr<IBlob> diagnostics;
-Slang::ComPtr<IModule> module = session->loadModule("MyShaders", diagnostics.writeRef());
-```
-
-In this example, if any diagnostic messages were produced when loading `MyShaders`, then the `diagnostics` pointer will be set to a blob that contains the textual content of those diagnostics.
-
-The content of a blob can be accessed with `getBufferPointer()`, and the size of the content can be accessed with `getBufferSize()`.
-Diagnostic blobs produced by the Slang compiler are always null-terminated, so that they can be used with C-style string APIs:
-
-```c++
-if(diagnostics)
-{
-    fprintf(stderr, "%s\n", (const char*) diagnostics->getBufferPointer());
-}
-```
-
-> #### Note ####
-> The `slang::IBlob` interface is binary-compatible with the `ID3D10Blob` and `ID3DBlob` interfaces used by some Direct3D compilation APIs.
-
-### Entry Points
-
-When using `loadModule()` applications should ensure that entry points in their shader code are always marked with appropriate `[shader(...)]` attributes.
-For example, if `MyShaders.slang` contained:
-
-```hlsl
-[shader("compute")]
-void myComputeMain(...) { ... }
-```
-
-then the Slang system will automatically detect and validate this entry point as part of a `loadModule("MyShaders")` call.
-
-After a module has been loaded, the application can look up entry points in that module using `IModule::findEntryPointByName()`:
-
-```c++
-Slang::ComPtr<IEntryPoint> computeEntryPoint;
-module->findEntryPointByName("myComputeMain", computeEntryPoint.writeRef());
-```
-
-### Composition
-
-An application might load any number of modules with `loadModule()`, and those modules might contain any number of entry points.
-Before GPU kernel code can be generated it is first necessary to decide which pieces of GPU code will be used together.
-
-Both `slang::IModule` and `slang::IEntryPoint` inherit from `slang::IComponentType`, because both can be used as components when composing a shader program.
-A composition can be created with `ISession::createCompositeComponentType()`:
-
-```c++
-IComponentType* components[] = { module, entryPoint };
-Slang::ComPtr<IComponentType> program;
-session->createCompositeComponentType(components, 2, program.writeRef());
-```
-
-As discussed earlier in this chapter, the composition operation serves two important purposes.
-First, it establishes which code is part of a compiled shader program and which is not.
-Second, it established an ordering for the code in a program, which can be used for layout.
-
-### Layout and Reflection
-
-Some applications need to perform reflection on shader parameters and their layout, whether at runtime or as part of an offline compilation tool.
-The Slang API allows layout to be queried on any `IComponentType` using `getLayout()`:
-
-```c++
-slang::ProgramLayout* layout = program->getLayout();
-```
-
-> #### Note ####
-> In  the current Slang API, the `ProgramLayout` type is not reference-counted.
-> Currently, the lifetime of a `ProgramLayout` is tied to the `IComponentType` that returned it.
-> An application must ensure that it retains the given `IComponentType` for as long as it uses the `ProgramLayout`.
-
-Note that because both `IModule` and `IEntryPoint` inherit from `IComponentType`, they can also be queried for their layouts individually.
-The layout for a module comprises just its global-scope parameters.
-The layout for an entry point comprises just its entry-point parameters (both `uniform` and varying).
-
-The details of how Slang computes layout, what guarantees it makes, and how to inspect the reflection information will be discussed in a later chapter.
-
-Because the layout computed for shader parameters may depend on the compilation target, the `getLayout()` method actually takes a `targetIndex` parameter that is the zero-based index of the target for which layout information is being queried.
-This parameter defaults to zero as a convenience for the common case where applications use only a single compilation target at runtime.
-
-### Kernel Code
-
-Given a composed `IComponentType`, an application can extract kernel code for one of its entry points using `IComponentType::getEntryPointCode()`:
-
-```c++
-int entryPointIndex = 0; // only one entry point
-int targetIndex = 0; // only one target
-Slang::ComPtr<IBlob> kernelBlob;
-program->getEntryPointCode(
-    entryPointIndex,
-    targetIndex,
-    kernelBlob.writeRef(),
-    diagnostics.writeRef());
-```
-
-Any diagnostic messages related to back-end code generation (for example, if the chosen entry point requires features not available on the chosen target) will be written to `diagnostics`.
-The `kernelBlob` output is a `slang::IBlob` that can be used to access the generated code (whether binary or textual).
-In many cases `kernelBlob->getBufferPointer()` can be passed directly to the appropriate graphics API to load kernel code onto a GPU.
diff --git a/docs/user-guide/06-interfaces-generics.md b/docs/user-guide/06-interfaces-generics.md
new file mode 100644
index 000000000..59a0352fa
--- /dev/null
+++ b/docs/user-guide/06-interfaces-generics.md
@@ -0,0 +1,737 @@
+---
+layout: user-guide
+permalink: /user-guide/interfaces-generics
+---
+
+Interfaces and Generics
+===========================
+
+This chapter covers two interrelated Slang language features: interfaces and generics. We will talk about what they are, how they relate to similar features in other languages, how they are parsed and translated by the compiler, and show examples of how these features simplify and modularize shader code.
+
+Interfaces
+----------
+
+Interfaces are used to define the methods and services a type should provide. You can define an interface as shown in the following example:
+```csharp
+interface IFoo
+{
+    int myMethod(float arg);
+}
+```
+
+Slang's syntax for defining interfaces is similar to `interface`s in C# and `protocol`s in Swift. In this example, the `IFoo` interface establishes a contract that any type conforming to this interface must provide a method named `myMethod` that accepts a `float` argument and returns an `int` value.
+
+A `struct` type may declare its conformance to an `interface` via the following syntax:
+```csharp
+struct MyType : IFoo
+{
+    int myMethod(float arg)
+    {
+        return (int)arg + 1;
+    }
+}
+```
+By declaring the conformance to `IFoo`, the definition of `MyType` must include a method named `myMethod` with a matching signature to that defined in the `IFoo` interface to satisfy the declared conformance. If a type misses any methods required by the interface, the Slang compiler will generate an error message.
+
+A `struct` type may declare multiple interface conformances:
+```csharp
+interface IBar { uint myMethod2(uint2 x); }
+
+struct MyType : IFoo, IBar
+{
+    int myMethod(float arg) {...}
+    uint myMethod2(uint2 x) {...}
+}
+```
+In this case, the definition of `MyType` must satisfy the requirements from both the `IFoo` and `IBar` interfaces by providing both the `myMethod` and `myMethod2` methods.
+
+Generics
+---------------------
+
+Generics can be used to eliminate duplicate code for shared logic that operates on different types. The following example shows how to define a generic method in Slang.
+
+```csharp
+int myGenericMethod<T: IFoo>(T arg)
+{
+    return arg.myMethod(1.0);
+}
+```
+
+The above listing defines a generic method named `myGenericMethod`, which accepts an argument that can be of any type `T` as long as `T` conforms to the `IFoo` interface. The `T` here is called a _generic type parameter_, and it is associated with a _type constraint_ that any type represented by `T` must conform to the interface `IFoo`.
+
+The following listing shows how to invoke a generic method:
+```csharp
+MyType obj;
+int a = myGenericMethod<MyType>(obj); // OK, explicit type argument
+int b = myGenericMethod(obj); // OK, automatic type deduction
+```
+
+You may explicitly specify the concrete type to use for the generic type argument, by providing the types in angle brackets after the method name, or leave it to the compiler to automatically deduce the type from the argument list.
+
+> #### Note ####
+> Slang currently does not support partial type argument list deduction.
+> For example if you have a generic method that accepts two type arguments:
+> ```
+> void g<T:IFoo, U:IBar>(T a, U b) {...}
+> ```
+> You may either call this method with no explicit type arguments:
+> ```
+> MyType a, b;
+> g(a, b);
+> ```
+> Or with explicit arguments for both generic type parameters:
+> ```
+> g<MyType, MyType>(a,b);
+> ```
+> If you only provide first type argument, Slang will generate an error:
+> ```
+> g<MyType>(a,b); // error, does not work today.
+> ```
+> We plan to support such use in a future version.
+
+
+Note that it is important to associate a generic type parameter with a type constraint. In the above example, although the definition of `myGenericMethod` is agnostic of the concrete type that `T` will stand for, knowing that `T` conforms to `IFoo` allows the compiler to type-check and pre-compile `myGenericMethod` without needing to substitute `T` with any concrete types first. Similar to languages like C#, Rust, Swift and Java, leaving out the type constraint declaration on type parameter `T` will result in a compile error at the line calling `arg.myMethod` since the compiler cannot verify that `arg` has a member named `myMethod` without any knowledge of `T`. This is a major difference between Slang's generics and _templates_ in C++.
+
+While C++ templates are a powerful language mechanism, Slang has followed the path of many other modern programming languages to adopt the more structural and restricted generics feature instead. This enables the Slang compiler to perform type checking early to give more readable error messages, and to speed-up compilation by reusing a lot of work for different instantiations of `myGenericMethod`.
+
+
+Supported Constructs in Interface Definitions
+-----------------------------------------------------
+
+Slang supports many other constructs in addition to ordinary methods as a part of an interface definition.
+
+### Properties
+
+```csharp
+interface IFoo
+{
+    property int count {get; set;}
+}
+```
+The above listing declares that any conforming type must define a property named `count` with both a `getter` and a `setter` method.
+
+### Generic Methods
+
+```csharp
+interface IFoo
+{
+    int compute<T:IBar>(T val);
+}
+```
+The above listing declares that any conforming type must define a generic method named `compute` that has one generic type parameter conforming to the `IBar` interface.
+
+### Static Methods
+
+```csharp
+interface IFoo
+{
+    static int compute(int val);
+};
+```
+
+The above listing declares that any conforming type must define a static method named `compute`. This allows the following generic method to pass type-checking:
+```csharp
+void f<T:IFoo>()
+{
+    T.compute(5); // OK, T has a static method `compute`.
+}
+```
+
+### Static Constants
+
+You can define static constant requirements in an interface. The constants can be accessed in places where a compile-time constant is needed.
+```csharp
+interface IMyValue
+{
+    static const int value;
+}
+struct MyObject2 : IMyValue
+{
+    static const int value = 2;
+}
+struct GetValuePlus1<T:IMyValue>
+{
+    static const int value = T.value + 1;
+}
+
+static const int result = GetValuePlus1<MyObject2>.value;  // result == 3
+```
+
+### `This` Type
+
+You may use a special keyword `This` in interface definitions to refer to the type that is conforming to the interface. The following examples demonstrate a use of `This` type:
+```csharp
+interface IComparable
+{
+    int comparesTo(This other);
+}
+struct MyObject : IComparable
+{
+    int val;
+    int comparesTo(MyObject other)
+    {
+        return val < other.val ? -1 : 1;
+    }
+}
+```
+In this example, the `IComparable` interface declares that any conforming type must provide a `comparesTo` method that performs a comparison between an object to another object of the same type. The `MyObject` type satisfies this requirement by providing a `comparesTo` method that accepts a `MyObject` typed argument, since in the scope of `MyObject`, `This` type is equivalent to `MyObject`.
+
+### Initializers
+
+Consider a generic method that wants to create and initialize a new instance of generic type `T`:
+```csharp
+void f<T:IFoo>()
+{
+    T obj = /*a newly initialized T*/
+}
+```
+One way to implement this is to introduce a static method requirement in `IFoo`:
+```csharp
+interface IFoo
+{
+    static This create();
+}
+```
+With this interface definition, we can define `f` as following:
+```csharp
+void f<T:IFoo>()
+{
+    T obj = T.create();
+}
+```
+
+This solution works just fine, but it would be nicer if you can just write:
+```csharp
+T obj = T();
+```
+Or simply
+```csharp
+T obj;
+```
+And let the compiler invoke the default initializer defined in the type.
+To enable this, you can include an initializer requirement in the interface definition:
+```csharp
+interface IFoo
+{
+    __init();
+}
+```
+
+Initializers with parameters are supported as well. For example:
+```csharp
+interface IFoo
+{
+    __init(int a, int b);
+}
+void g<T:IFoo>()
+{
+    T obj = {1, 2}; // OK, invoking the initializer on T.
+}
+```
+
+Associated Types
+-------------------------
+
+When writing code using interfaces and generics, there are some situations where an interface method needs to return an object whose type is implementation-dependent. For example, consider the following `IFloatContainer` interface that represents a container of `float` values:
+```csharp
+// Represents a container of float values.
+interface IFloatContainer
+{
+    // Returns the number of elements in this container.
+    uint getCount();
+    // Returns an iterator representing the start of the container.
+    Iterator begin();
+    // Returns an iterator representing the end of the container.
+    Iterator end();
+    // Return the element at the location represented by `iter`.
+    float getElementAt(Iterator iter);
+}
+```
+An implementation of the `IFloatContainer` interface may use different types of iterators. For example, an implementation that is simply an array of `float`s can expose `Iterator` as a simple integer index:
+```csharp
+struct ArrayFloatContainer : IFloatContainer
+{
+    float content[10];
+    uint getCount() { return 10; }
+    uint begin() { return 0; }
+    uint end() { return 10; }
+    float getElementAt(uint iter) { return content[iter]; }
+}
+```
+On the other hand, an implementation that uses multiple buffers as the backing storage may use a more complex type to locate an element:
+```csharp
+// Exposes values in two `StructuredBuffer`s as a single container.
+struct MultiArrayFloatContainer : IFloatContainer
+{
+    StructuredBuffer<float> firstBuffer;
+    StructuredBuffer<float> secondBuffer;
+    uint getCount() { return getBufferSize(firstBuffer) + getBufferSize(secondBuffer); }
+
+    // `uint2.x` indicates which buffer, `uint2.y` indicates the index within the buffer.
+    uint2 begin() { return uint2(0,0); }
+    uint2 end() { return uint2 (1, getBufferSize(secondBuffer)); }
+    float getElementAt(uint2 iter)
+    {
+        if (iter.x == 0) return firstBuffer[iter.y];
+        else return secondBuffer[iter.y];
+    }
+}
+```
+
+Ideally, a generic function that wishes to enumerate values in a `IFloatContainer` shouldn't need to care about the implementation details on what the concrete type of `Iterator` is, and we would like to be able to write the following:
+```csharp
+float sum<T:IFloatContainer>(T container)
+{
+    float result = 0.0f;
+    for (T.Iterator iter = container.begin(); iter != container.end(); iter=iter.next())
+    {
+        float val = container.getElementAt(iter);
+        result += val;
+    }
+    return result;
+}
+```
+Here the `sum` function simply wants to access all the elements and sum them up. The details of what the `Iterator` type actually is does not matter to the definition of `sum`.
+
+The problem is that the `IFloatContainer` interface definition requires methods like `begin()`, `end()` and `getElementAt()` to refer to an iterator type that is implementation-dependent. How should the signature of these methods be defined in the interface? The answer is to use _associated types_.
+
+In addition to constructs listed in the previous section, Slang also supports defining associated types in an `interface` definition. An associated type can be defined as following.
+```csharp
+// The interface for an iterator type.
+interface IIterator
+{
+    // An iterator needs to know how to move to the next element.
+    This next();
+}
+
+interface IFloatContainer
+{
+    // Requires an implementation to define a type named `Iterator` that
+    // conforms to the `IIterator` interface.
+    associatedtype Iterator : IIterator;
+
+    // Returns the number of elements in this container.
+    uint getCount();
+    // Returns an iterator representing the start of the container.
+    Iterator begin();
+    // Returns an iterator representing the end of the container.
+    Iterator end();
+    // Return the element at the location represented by `iter`.
+    float getElementAt(Iterator iter);
+};
+```
+
+This `associatedtype` definition in `IFloatContainer` requires that all types conforming to this interface must also define a type in its scope named `Iterator`, and this iterator type must conform to the `IIterator` interface. An implementation of the `IFloatContainer` interface can use either a `typedef` declaration or a `struct` definition inside its scope to satisfy the associated type requirement. For example, the `ArrayFloatContainer` can be implemented as following:
+```csharp
+struct ArrayIterator : IIterator
+{
+    uint index;
+    __init(int x) { index = x; }
+    ArrayIterator next()
+    {
+        return ArrayIterator(index + 1);
+    }
+}
+struct ArrayFloatContainer : IFloatContainer
+{
+    float content[10];
+
+    // Specify that the associated `Iterator` type is `ArrayIterator`.
+    typedef ArrayIterator Iterator;
+
+    uint getCount() { return 10; }
+    Iterator begin() { return ArrayIterator(0); }
+    Iterator end() { return ArrayIterator(10); }
+    float getElementAt(Iterator iter) { return content[iter.index]; }
+}
+```
+
+Alternatively, you may also define the `Iterator` type directly inside a `struct` implementation, as in the following definition for `MultiArrayFloatContainer`:
+```csharp
+// Exposes values in two `StructuredBuffer`s as a single container.
+struct MultiArrayFloatContainer : IFloatContainer
+{
+    // Represents an iterator of this container
+    struct Iterator : IIterator
+    {
+        // `index.x` indicates which buffer the element is located in.
+        // `index.y` indicates the index of the element inside the buffer.
+        uint2 index;
+
+        // We also need to keep a size of the first buffer so we know when to
+        // switch to the second buffer.
+        uint firstBufferSize;
+
+        // Implementation of IIterator.next()
+        Iterator next()
+        {
+            Iterator result;
+            result.index.x = index.x;
+            result.index.y = index.y + 1;
+            // If we are at the end of the first buffer,
+            // move to the head of the second buffer
+            if (result.index.x == 0 && result.index.y == firstBufferSize)
+            {
+                result.index = uint2(1, 0);
+            }
+            return result;
+        }
+    }
+
+    StructuredBuffer<float> firstBuffer;
+    StructuredBuffer<float> secondBuffer;
+    uint getCount() { return getBufferSize(firstBuffer) + getBufferSize(secondBuffer); }
+
+    Iterator begin()
+    {
+        Iterator iter;
+        iter.index = uint2(0, 0);
+        iter.firstBufferSize = getBufferSize(firstBuffer);
+        return iter;
+    }
+    Iterator end()
+    {
+        Iterator iter;
+        iter.index = uint2(1, getBufferSize(secondBuffer));
+        iter.firstBufferSize = 0;
+        return iter;
+    }
+    float getElementAt(Iterator iter)
+    {
+        if (iter.index.x == 0) return firstBuffer[iter.index.y];
+        else return secondBuffer[iter.index.y];
+    }
+}
+```
+
+In summary, an `associatedtype` requirement in an interface is similar to other types of requirements: a method requirement means that an implementation must provide a method matching the interface signature, while an `associatedtype` requirement means that an implementation must provide a type in its scope with the matching name and interface constraint. In general, when defining an interface that is producing and consuming an object whose actual type is implementation-dependent, the type of this object can often be modeled as an associated type in the interface.
+
+### Comparison to the C++ Approach
+Readers who are familiar with C++ could easily relate the `Iterator` example in previous subsection to the implementation of STL. In C++, the `sum` function can be easily written with templates:
+```C++
+template<typename TContainer>
+float sum(const TContainer& container)
+{
+    float result = 0.0f;
+    // Assumes `TContainer` has a type `Iterator` that supports `operator++`.
+    for (TContainer::Iterator iter = container.begin(); iter != container.end(); ++iter)
+    {
+        result += container.getElementAt(iter);
+    }
+    return result;
+}
+```
+
+A C++ programmer can implement `ArrayFloatContainer` as following:
+```C++
+struct ArrayFloatContainer
+{
+    float content[10];
+
+    typedef uint32_t Iterator;
+
+    Iterator getCount() { return 10; }
+    Iterator begin() { return 0; }
+    Iterator end() { return 10; }
+    float getElementAt(Iterator iter) { return content[iter]; }
+};
+```
+Because C++ does not require a template function to define _constraints_ on the templated type, there are no interfaces or inheritances involved in the definition of `ArrayFloatContainer`. However `ArrayFloatContainer` still needs to define what its `Iterator` type is, so the `sum` function can be successfully specialized with an `ArrayFloatContainer`.
+
+Note that the biggest difference between C++ templates and generics is that templates are not type-checked prior to specialization, and therefore the code that consumes a templated type (`TContainer` in this example) can simply assume `container` has a method named `getElementAt`, and the `TContainer` scope provides a type definition for `TContainer::Iterator`. A compiler error only arises when the programmer attempts to specialize the `sum` function with a type that does not meet these assumptions. In contrast, Slang requires all possible uses of a generic type be declared through an interface. By stating `TContainer : IFloatContainer` in the generic declaration, the Slang compiler can verify that `container.getElementAt` is calling a valid function. Similarly, the interface also tells the compiler that `TContainer.Iterator` is a valid type and enables the compiler to fully type check the `sum` function without specializing it first.
+
+### Similarity to Swift and Rust
+
+Slang's `associatedtype` shares the same semantic meaning with `associatedtype` in a Swift `protocol` or `type` in a Rust `trait`, except that Slang currently does not support the more general `where` clause in these languages. C# does not have an equivalent to `associatedtype`, and programmers need to resort to generic interfaces to achieve similar goals.
+
+Generic Value Parameters
+-------------------------------
+
+So far we have demonstrated generics with _type parameters_. Additionally, Slang also supports generic _value_ parameters.
+The following listing shows an example of generic value parameters.
+```csharp
+struct Array<T, let N : int>
+{
+    T arrayContent[N];
+}
+```
+In this example, the `Array` type has a generic type parameter, `T`, that is used as the element type of the `arrayContent` array, and a generic value parameter `N` of integer type.
+
+Note that the builtin `vector<float, N>` type also has a generic value parameter `N`.
+
+> #### Note ####
+> The only types allowed for generic value parameters are `int`, `uint` and `bool`. `float` and
+> other types cannot be used in a generic value parameter. Computations in a type
+> expression are supported as long as they can be evaluated at compile time. For example,
+> `vector<float, 1+1>` is allowed and considered equivalent to `vector<float, 2>`.
+
+
+Interface-typed Values
+-------------------------------
+
+So far we have been using interfaces as constraints to generic type parameters. For example, the following listing defines a generic function with a type parameter `TTransform` constrained by interface `ITransform`:
+
+```csharp
+interface ITransform
+{
+    int compute(MyObject obj);
+}
+
+// Defining a generic method:
+int apply<TTransform : ITransform>(TTransform transform, MyObject object)
+{
+    return transform.compute(object);
+}
+```
+
+While Slang's syntax for defining generic methods bears similarity to generics in C#/Java and templates in C++ and should be easy to users who are familiar with these languages, codebases that make heavy use of generics can quickly become verbose and difficult to read. To reduce the amount of boilerplate, Slang supports an alternate way to define the `apply` method by using the interface type `ITransform` as parameter type directly:
+
+```csharp
+// A method that is equivalent to `apply` but uses simpler syntax:
+int apply_simple(ITransform transform, MyObject object)
+{
+    return transform.compute(object);
+}
+```
+
+Instead of defining a generic type parameter `TTransform` and a method parameter `transform` that has `TTransform` type, you can simply define the same `apply` function like a normal method, with a `transform` parameter whose type is an interface. From the Slang compiler's view, `apply` and `apply_simple` will be compiled to the same target code.
+
+In addition to parameters, Slang allows variables, and function return values to have an interface type as well:
+```csharp
+ITransform test(ITransform arg)
+{
+    ITransform v = arg;
+    return v;
+}
+```
+
+### Restrictions and Caveats
+
+The Slang compiler always attempts to determine the actual type of an interface-typed value at compile time and specialize the code with the actual type. As long as the compiler can successfully determine the actual type, code that uses interface-typed values are equivalent to code written in the generics syntax. However, when interface types are used in function return values, the compiler will not be able to trivially propagate type information. For example:
+```csharp
+ITransform getTransform(int x)
+{
+    if (x == 0)
+    {
+        Type1Transform rs = {};
+        return rs;
+    }
+    else
+    {
+        Type2Transform rs = {};
+        return rs;
+    }
+}
+```
+In this example, the actual type of the return value is dependent on the value of `x`, which may not be known at compile time. This means that the concrete type of the return value at invocation sites of `getTransform` may not be statically determinable. When the Slang compiler cannot infer the concrete type of an interface-type value, it will generate code that performs a dynamic dispatch based on the concrete type of the value at runtime, which may introduce performance overhead. Note that this behavior applies to function return values in the form of `out` parameters as well:
+
+```csharp
+void getTransform(int x, out ITransform transform)
+{
+    if (x == 0)
+    {
+        Type1Transform rs = {};
+        transform = rs;
+    }
+    else
+    {
+        Type2Transform rs = {};
+        transform = rs;
+    }
+}
+```
+This `getTransform` definition can also result in dynamic dispatch code since the type of `transform` may not be statically determinable.
+
+When the compiler is generating dynamic dispatch code for interface-typed values, it requires the concrete type of the interface-typed value to be free of any opaque-typed fields (e.g. resources and buffer types). A compiler error will be generated upon such attempts:
+```csharp
+struct MyTransform : ITransform
+{
+    StructuredBuffer<int> buffer;
+    int compute(MyObject obj)
+    {
+        return buffer[0];
+    }
+}
+
+ITransform getTransform(int x)
+{
+    MyTransform rs;
+    // Error: cannot use an opaque value as an interface-typed return value.
+    return rs;
+}
+```
+
+Assigning different values to a mutable interface-typed variable also undermines the compiler's ability to statically determine the type of the variable, and is not supported by the Slang compiler today:
+```csharp
+void test(int x)
+{
+    ITransform t = Type1Transform();
+    // Do something ...
+    // Assign a different type of transform to `t`:
+    // (Not supported by Slang today)
+    t = Type2Transform();
+    // Do something else...
+}
+```
+
+In general, if the use of interface-typed values is restricted to function parameters only, then all code that involves interface-typed values will be compiled the same way as if the code were written using standard generics syntax.
+
+
+Extending a Type with Additional Interface Conformances
+-----------------------------
+In the previous chapter, we introduced the `extension` feature that lets you define new members to an existing type in a separate location outside the original definition of the type. 
+
+`extensions` can be used to make an existing type conform to additional interfaces. Suppose we have an interface `IFoo` and a type `MyObject` that implements the interface:
+
+```csharp
+interface IFoo
+{
+    int foo();
+};
+
+struct MyObject : IFoo
+{
+    int foo() { return 0; }
+}
+```
+
+Now we introduce another interface, `IBar`:
+```csharp
+interface IBar
+{
+    float bar();
+}
+```
+
+We can define an `extension` to make `MyObject` conform to `IBar` as well:
+```csharp
+extension MyObject : IBar
+{
+    float bar() { return 1.0f; }
+}
+```
+
+With this extension, we can use `MyObject` in places that expect an `IBar` as well:
+```csharp
+void use(IBar b)
+{
+    b.bar();
+}
+
+void test()
+{
+    MyObject obj;
+    use(obj); // OK, `MyObject` is extended to conform to `IBar`.
+}
+```
+
+You may define more than one interface conformance in a single `extension`:
+```csharp
+interface IBar2
+{
+    float bar2();
+}
+extension MyObject : IBar, IBar2
+{
+    float bar() { return 1.0f; }
+    float bar2() { return 2.0f; }
+}
+```
+
+`is` and `as` Operator
+----------------------------
+
+You can use `is` operator to test if an interface-typed value is of a specific concrete type, and use `as` operator to downcast the value into a specific type.
+The `as` operator returns an `Optional<T>` that is not `none` if the downcast succeeds.
+
+```csharp
+interface IFoo
+{
+    int foo();
+}
+struct MyImpl : IFoo
+{
+    int foo() { return 0; }
+}
+void test(IFoo foo)
+{
+    bool t = foo is MyImpl; // true
+    Optional<MyImpl> optV = foo as MyImpl;
+    if (t == (optV != none))
+        printf("success");
+    else
+        printf("fail");
+}
+void main()
+{
+    MyImpl v;
+    test(v);
+}
+// Result:
+// "success"
+```
+
+
+Extensions to Interfaces
+-----------------------------
+
+In addition to extending ordinary types, you can define extensions on interfaces as well:
+```csharp
+// An example interface.
+interface IFoo
+{
+    int foo();
+}
+
+// Extending `IFoo` with a new method requirement
+// with a default implementation.
+extension IFoo
+{
+    int bar() { return 0; }
+}
+
+int use(IFoo foo)
+{
+    // With the extension, all uses of `IFoo` typed values
+    // can assume there is a `bar` method.
+    return foo.bar();
+}
+```
+
+Although the syntax of above listing suggests that we are extending an interface with additional requirements, this interpretation does not make logical sense in many ways. Consider a type `MyType` that exists before the extension is defined:
+```csharp
+struct MyType : IFoo
+{
+    int foo() { return 0; }
+}
+```
+
+If we extended `IFoo` with new requirements, the existing `MyType` definition would become invalid since `MyType` would no longer provide implementations for all interface requirements. Instead, what an `extension` on an interface `IFoo` means is that for all types that conform to the `IFoo` interface and do not have a `bar` method defined, the `bar` method defined in this extension is added to that type so that all `IFoo` typed values have a `bar` method defined. If a type already defines a matching `bar` method, then the existing method will always override the default method provided in the extension:
+
+```csharp
+interface IFoo
+{
+    int foo();
+}
+struct MyFoo1 : IFoo
+{
+    int foo() { return 0; }
+}
+extension IFoo
+{
+    int bar() { return 0; }
+}
+struct MyFoo2 : IFoo
+{
+    int foo() { return 0; }
+    int bar() { return 1; }
+}
+void test()
+{
+    MyFoo1 f1;
+    MyFoo2 f2;
+    int a = f1.bar(); // a == 0, calling the method in the extension.
+    int b = f2.bar(); // b == 1, calling the existing method in `MyFoo2`.
+}
+```
+This feature is similar to extension traits in Rust.
diff --git a/docs/user-guide/07-autodiff.md b/docs/user-guide/07-autodiff.md
new file mode 100644
index 000000000..b3a25358c
--- /dev/null
+++ b/docs/user-guide/07-autodiff.md
@@ -0,0 +1,758 @@
+---
+layout: user-guide
+permalink: /user-guide/autodiff
+---
+
+# Automatic Differentiation
+
+Neural networks and other machine learning techniques are becoming an increasingly popular way to solve many difficult problems in modern visual computing systems. However, to take advantage of these techniques, developers often need to reimplement many existing system components in a differentiable form to allow computing the derivatives of a function, or to propagate the derivative of a result backwards to each parameter. Slang provides built-in auto differentiation features to support developers adding differentiability to their existing code with as little effort as possible. In this chapter, we provide an overview of the auto differentiation features, followed by a detailed description on the new syntax and rules.
+
+## Using Automatic Differentiation in Slang
+
+In this section, we walk through the steps to compute forward-derivative from input, and backward propagate the derivative from output to input.
+
+### Forward Differentiation
+
+Suppose the user has already written a function that computes some mathematic term:
+
+```csharp
+float myFunc(float a, float x)
+{
+    return a * x * x;
+}
+```
+
+The user can make this function *forward-differentiable* by adding a `[ForwardDifferentiable]` attribute:
+```csharp
+[ForwardDifferentiable]
+float myFunc(float a, float x)
+{
+    return a * x * x;
+}
+```
+
+This allows the function to be used in the `fwd_diff` operator, which is a higher order operation that takes in a forward-differentiable function and returns the forward-derivative of the function.
+
+The expression `fwd_diff(myFunc)` will have the following signature:
+```csharp
+DifferentialPair<float> myFunc_fwd_derivative(DifferentialPair<float> a, DifferentialPair<float> x);
+```
+
+Where `DifferentialPair<T>` is a built-in type that encodes both the primal(original) value and the derivative value of a term.
+To use this function to compute the derivative of `myFunc` with regard to `x`, the user can call the forward-derivative function by supplying the derivative value of `x` with `1.0` and the derivative value of `a` with `0.0`, as in the following code:
+
+```csharp
+float a = 2.0;
+float x = 3.0;
+// Compute derivative with regard to `x`:
+let result = fwd_diff(myFunc)(diffPair(a, 0.0), diffPair(x, 1.0));
+// Print the derivative.
+printf("%f", result.d);
+
+// Output: 12.0
+```
+
+In the example code above, `diffPair()` is a built-in function to construct a value of `DifferentialPair<T>` with a primal value and a derivative value. The primal value and derivative value stored in a `DifferentialPair` can be accessed with the `.p` and `.d` properties.
+
+### Backward Propagation
+
+The forward derivative function allows the user to compute the derivative of a function with regard to a specific combination of input parameters at a time. In many cases, we need to know how each parameter affects the output. Instead of calling the forward derivative function once for each parameter, it is more efficient to call the *backward propagation* function that propagates the derivative of outputs to each input parameter.
+
+To allow the compiler to generate the backward propagation function, we simply mark our function with the `[Differentiable]` or `[BackwardDifferentiable]` attribute:
+```csharp
+[Differentiable]
+float myFunc(float a, float x)
+{
+    return a * x * x;
+}
+```
+
+> #### Note:
+> When a function is marked as `[Differentiable]`, it is implied that the function is both `[ForwardDifferentiable]` and `[BackwardDifferentiable]` and can be used in the `fwd_diff` operator.
+
+
+The `bwd_diff` operator applies to a backward differentiable function and returns the backward propagation function. In this case, `bwd_diff(myFunc)` will have the following signature:
+
+```csharp
+void myFunc_backProp(inout DifferentialPair<float> a, inout DifferentialPair<float> x, float dResult);
+```
+
+Where `a` is an `inout DifferentialPair` where the initial value of `a` is passed into the function as primal value (in the `.p` property), and the propagated derivative of `a` is returned via the `.d` property of the `DifferentialPair`. The same rules apply to `x`.
+
+The additional `dResult` parameter is the derivative of the return value to be propagated to the input parameters. Note that in a backward propagation function, an input will become an `inout DifferentialPair` where the `.d` property of the pair is intended for receiving the propagation result, and the return value will become an input parameter that represents the source of backward propagation.
+
+The backward propagation function can be called as in the following code:
+```csharp
+var a = diffPair(2.0); // constructs DifferentialPair{2.0, 0.0}
+var x = diffPair(3.0); // constructs DifferentialPair{3.0, 0.0}
+
+bwd_diff(myFunc)(a, x, 1.0);
+
+// a.d is now 9.0
+// x.d is now 12.0
+```
+
+This completes the walkthrough of automatic differentiation features. The following sections will cover each perspective of the auto differentiation feature in more detail.
+
+## Mathematic Concepts and Terminologies
+
+This section briefly reviews the mathematic theories behind differentiable programming with the intention to clarify the concepts and terminologies that will be used in the rest of this documentation. We assume the reader is already familiar with the basic theories behind neural network training, in particular the back-propagation algorithm.
+
+A differentiable system can be represented as a composition of differentiable functions (kernels) with learnable parameters, where each differentiable function has the form:
+
+$$\mathbf{w}_{i+1} = f_i(\mathbf{w}_i) $$
+
+Where $$f_i$$ represents a differentiable function (kernel) in the system, $$\mathbf{w}_i$$ represents a collection of learnable parameters defined in function $$f_i$$, and $$\mathbf{w}_{i+1}$$ is the output of $$f_i$$. We will use $$\omega$$ to denote a specific parameter in $$\mathbf{w}$$.
+
+In a composed system, the value of $$\mathbf{w}$$ used to evaluate $$f_i$$ may come from an *upstream* function
+
+$$ \mathbf{w}_i = f_{i-1}(\mathbf{w}_{i-1}) $$
+
+Similarly, the value computed by $$f_i$$ may be used as argument to a *downstream* function
+
+$$ h = f_{i+1}(\mathbf{w}_{i+1}) = f_{i+1}(f_{i}(\mathbf{w}_{i}))$$
+
+The entire system composed from differentiable functions can be noted as
+
+$$Y = f_1 \circ f_2 \circ \cdots \circ f_n(\mathbf{w}_0)$$
+
+Where $$\mathbf{w}_0$$ is the first layer of parameters.
+
+### Forward Propagation of Derivatives
+When developing and training such a system, we often need to evaluate the partial derivative of a differentiable function with regard to some parameter $$\omega$$. The simplest way to obtain a partial derivative is to call a forward derivative propagation function, which is defined by:
+
+$$ \mathbb{F}[f_i] = f_i'(\mathbf{w}_i, \mathbf{w}_i') = \sum_{\omega_i\in\mathbf{w}_i} \frac{\partial f}{\partial \omega_i} \omega_i' $$
+
+Where $$\omega' \in \mathbf{w}'$$ represents the partial derivative of $$\omega_i$$ with regard to some upstream parameter $$\omega_{i-1}$$ that is used to compute $$\omega_i$$, i.e. $$\omega'=\frac{\partial \omega_{i}}{\partial \omega_{i-1}}$$.
+
+Given this definition, $$\mathbb{F}[f]$$ can be used as a forward propagation function that is able to compute $$\frac{\partial f_i}{\partial \omega_0}$$ from $$\frac{\partial \omega_{i-1}}{\partial \omega_0}$$.
+
+### Backward Propagation of Derivatives
+When using the backpropagation algorithm to train a neural network, we are more interested in figuring out the partial derivative of the final system output with regard to a parameter $$\omega_i$$ in $$f_i$$. To do so, we generally utilize the backward derivative propagation function
+
+$$\mathbb{B}[f_i] = f_i^{-1}(\frac{\partial Y}{\partial f_i}) = \frac{\partial Y}{\partial \mathbf{w}_i}$$
+
+Where the backward propagation function $$\mathbb{B}[f_i]$$ takes as input the partial derivative of the final system output $$Y$$ with regard to the output of $$f_i$$ (i.e. $$\mathbf{w}_{i+1}$$), and computes the partial derivative of the final system output with regard to the input of $$f_i$$ (i.e. $$\mathbf{w}_i$$).
+
+The higher order operators $$\mathbb{F}$$ and $$\mathbb{B}$$ represent the operations that convert an original or primal function $$f$$ to its forward or backward derivative propagation function. Slang's automatic differentiation feature provides built-in support for these operators to automatically generate the derivative propagation functions from a user defined primal function. The remaining documentation will discuss this feature from a programming language perspective.
+
+## Differentiable Types
+Slang will only generate differentiation code for values that have a *differentiable* type. A type is differentiable if it conforms to the built-in `IDifferentiable` interface. The definition of the `IDifferentiable` interface is:
+```csharp
+interface IDifferentiable
+{
+    associatedtype Differential : IDifferentiable
+        where Differential.Differential == Differential;
+
+    static Differential dzero();
+
+    static Differential dadd(Differential, Differential);
+
+    static Differential dmul(This, Differential);
+}
+```
+As defined by the `IDifferentiable` interface, a differentiable type must have a `Differential` associated type that stores the derivative of the value. A further requirement is that the type of the second-order derivative must be the same `Differential` type. In other words, given a type `T`, `T.Differential` can be different from `T`, but `T.Differential.Differential` must be equal to `T.Differential`.
+
+In addition, a differentiable type must define the `zero` value of its derivative, and how to add and multiply derivative values.
+
+### Builtin Differentiable Types
+The following built-in types are differentiable: 
+- Scalars: `float`, `double` and `half`.
+- Vector/Matrix: `vector` and `matrix` of `float`, `double` and `half` types.
+- Arrays: `T[n]` is differentiable if `T` is differentiable.
+
+### User Defined Differentiable Types
+
+The user can make any `struct` types differentiable by implementing the `IDifferentiable` interface on the type. The requirements from `IDifferentiable` interface can be fulfilled automatically or manually.
+
+#### Automatic Fulfillment of `IDifferentiable` Requirements
+Assume the user has defined the following type:
+
+```csharp
+struct MyRay
+{
+    float3 origin;
+    float3 dir;
+    int nonDifferentiablePayload;
+}
+```
+
+The type can be made differentiable by adding `IDifferentiable` conformance:
+```csharp
+struct MyRay : IDifferentiable
+{
+    float3 origin;
+    float3 dir;
+    int nonDifferentiablePayload;
+}
+```
+
+Note that this code does not provide any explicit implementation of the `IDifferentiable` requirements. In this case the compiler will automatically synthesize all the requirements. This should provide the desired behavior most of the time. The procedure for synthesizing the interface implementation is as follows:
+1. A new type is generated that stores the `Differential` of all differentiable fields. This new type itself will conform to the `IDifferentiable` interface, and it will be used to satisfy the `Differential` associated type requirement.
+2. Each differentiable field will be associated to its corresponding field in the newly synthesized `Differential` type.
+3. The `zero` value of the differential type is made from the `zero` value of each field in the differential type.
+4. The `dadd` and `dmul` methods simply perform `dadd` and `dmul` operations on each field.
+5. If the synthesized `Differential` type contains exactly the same fields as the original type, and the type of each field is the same as the original field type, then the original type itself will be used as the `Differential` type instead of creating a new type to satisfy the `Differential` associated type requirement. This means that all such synthesized `Differential` types use themselves to meet their own `IDifferentiable` requirements.
+
+#### Manual Fulfillment of `IDifferentiable` Requirements
+
+In rare cases where more control is desired, the user can manually provide the implementation. To do so, we will first define the `Differential` type for `MyRay`, and use it to fulfill the `Differential` requirement in `MyRay`:
+
+```csharp
+struct MyRayDifferential
+{
+    float3 d_origin;
+    float3 d_dir;
+}
+
+struct MyRay : IDifferentiable
+{
+    // Specify that `MyRay.Differential` is `MyRayDifferential`.
+    typealias Differential = MyRayDifferential;
+
+    // Specify that the derivative for `origin` will be stored in `MyRayDifferential.d_origin`.
+    [DerivativeMember(MyRayDifferential.d_origin)]
+    float3 origin;
+
+    // Specify that the derivative for `dir` will be stored in `MyRayDifferential.d_dir`.
+    [DerivativeMember(MyRayDifferential.d_dir)]
+    float3 dir;
+
+    // This is a non-differentiable field so we don't put any attributes on it.
+    int nonDifferentiablePayload;
+
+    // Define zero derivative.
+    static MyRayDifferential dzero()
+    {
+        return {float3(0.0), float3(0.0)};
+    }
+
+    // Define the add operation of two derivatives.
+    static MyRayDifferential dadd(MyRayDifferential v1, MyRayDifferential v2)
+    {
+        MyRayDifferential result;
+        result.d_origin = v1.d_origin + v2.d_origin;
+        result.d_dir = v1.d_dir + v2.d_dir;
+        return result;
+    }
+
+    // Define the multiply operation of a primal value and a derivative value.
+    static MyRayDifferential dmul(MyRay p, MyRayDifferential d)
+    {
+        MyRayDifferential result;
+        result.d_origin = p.origin * d.d_origin;
+        result.d_dir = p.dir * d.d_dir;
+        return result;
+    }
+}
+```
+
+Note that for each struct field that is differentiable, we need to use the `[DerivativeMember]` attribute to associate it with the corresponding field in the `Differential` type, so the compiler knows how to access the derivative for the field.
+
+However, there is still a missing piece in the above code: we also need to make `MyRayDifferential` conform to `IDifferentiable` because it is required that the `Differential` of a type must itself be differentiable. Again we can use automatic fulfillment by simply adding `IDifferentiable` conformance to `MyRayDifferential`:
+```csharp
+struct MyRayDifferential : IDifferentiable
+{
+    float3 d_origin;
+    float3 d_dir;
+}
+```
+In this case, since all fields in `MyRayDifferential` are differentiable, and the `Differential` of each field is the same as the original type of each field (i.e. `float3.Differential == float3` as defined in built-in library), the compiler will automatically use the type itself as its own `Differential`, making `MyRayDifferential` suitable for use as `Differential` of `MyRay`.
+
+We can also choose to manually implement `IDifferentiable` interface for `MyRayDifferential` as in the following code:
+
+```csharp
+struct MyRayDifferential : IDifferentiable
+{
+    typealias Differential = MyRayDifferential;
+
+    [DerivativeMember(MyRayDifferential.d_origin)]
+    float3 d_origin;
+
+    [DerivativeMember(MyRayDifferential.d_dir)]
+    float3 d_dir;
+
+    static MyRayDifferential dzero()
+    {
+        return {float3(0.0), float3(0.0)};
+    }
+
+    static MyRayDifferential dadd(MyRayDifferential v1, MyRayDifferential v2)
+    {
+        MyRayDifferential result;
+        result.d_origin = v1.d_origin + v2.d_origin;
+        result.d_dir = v1.d_dir + v2.d_dir;
+        return result;
+    }
+
+    static MyRayDifferential dmul(MyRayDifferential p, MyRayDifferential d)
+    {
+        MyRayDifferential result;
+        result.d_origin = p.d_origin * d.d_origin;
+        result.d_dir = p.d_dir * d.d_dir;
+        return result;
+    }
+}
+```
+In this specific case, the automatically generated `IDifferentiable` implementation will be exactly the same as the manually written code listed above.
+
+
+## Forward Derivative Propagation Function
+
+Functions in Slang can be marked as forward-differentiable or backward-differentiable. The `fwd_diff` operator can be used on a forward-differentiable function to obtain the forward derivative propagation function. Likewise, the `bwd_diff` operator can be used on a backward-differentiable function to obtain the backward derivative propagation function. This and the next sections cover the semantics of forward and backward propagation functions, and different ways to make a function forward and backward differentiable. 
+
+A forward derivative propagation function computes the derivative of the result value with regard to a specific set of input parameters. 
+Given an original function, the signature of its forward propagation function is determined using the following rules:
+- If the return type `R` is differentiable, the forward propagation function will return `DifferentialPair<R>` that consists of both the computed original result value and the (partial) derivative of the result value. Otherwise, the return type is kept unmodified as `R`.
+- If a parameter has type `T` that is differentiable, it will be translated into a `DifferentialPair<T>` parameter in the derivative function, where the differential component of the `DifferentialPair` holds the initial derivatives of each parameter with regard to their upstream parameters.
+- All parameter directions are unchanged. For example, an `out` parameter in the original function will remain an `out` parameter in the derivative function.
+
+For example, given original function:
+```csharp
+R original(T0 p0, inout T1 p1, T2 p2);
+```
+Where `R`, `T0`, and `T1` are differentiable and `T2` is non-differentiable, the forward derivative function will have the following signature:
+```csharp
+DifferentialPair<R> derivative(DifferentialPair<T0> p0, inout DifferentialPair<T1> p1, T2 p2);
+```
+
+This forward propagation function takes the initial primal value of `p0` in `p0.p`, and the partial derivative of `p0` with regard to some upstream parameter in `p0.d`. It takes the initial primal and derivative values of `p1` and updates `p1` to hold the newly computed value and propagated derivative. Since `p2` is not differentiable, it remains unchanged.
+
+`DifferentialPair<T>` is a built-in type that carries both the original and derivative value of a term. It is defined as follows:
+```csharp
+struct DifferentialPair<T : IDifferentiable> : IDifferentiable
+{
+    typealias Differential = DifferentialPair<T.Differential>;
+    property T p {get;}
+    property T.Differential d {get;}
+    static Differential dzero();
+    static Differential dadd(Differential a, Differential b);
+    static Differential dmul(This a, Differential b);
+}
+```
+
+### Automatic Implementation of Forward Derivative Functions
+
+A function can be made forward-differentiable with a `[ForwardDifferentiable]` attribute. This attribute will cause the compiler to automatically implement the forward propagation function. The syntax for using `[ForwardDifferentiable]` is:
+
+```csharp
+[ForwardDifferentiable]
+R original(T0 p0, inout T1 p1, T2 p2);
+```
+
+Once the function is made forward-differentiable, the forward propagation function can then be called with the `fwd_diff` operator:
+```csharp
+DifferentialPair<R> result = fwd_diff(original)(...);
+```
+
+### User Defined Forward Derivative Functions
+As an alternative to compiler-implemented forward derivatives, the user can choose to manually provide a derivative implementation to make an existing function forward-differentiable. The `[ForwardDerivative(derivative_func)]` attribute is used to associate a function with its forward derivative propagation implementation. The syntax for using `[ForwardDerivative]` attribute is:
+```csharp
+DifferentialPair<R> derivative(DifferentialPair<T0> p0, inout DifferentialPair<T1> p1, T2 p2)
+{
+    ....
+}
+
+[ForwardDerivative(derivative)]
+R original(T0 p0, inout T1 p1, T2 p2);
+```
+If `derivative` is defined in a different scope from `original`, such as in a different namespace or `struct` type, a fully qualified name is required. For example:
+```csharp
+struct MyType
+{
+    // Implementing derivative function in a different name scope.
+    static DifferentialPair<R> derivative(DifferentialPair<T0> p0, inout DifferentialPair<T1> p1, T2 p2)
+    {
+        ....
+    }
+}
+
+// Use fully qualified name in the attribute.
+[ForwardDerivative(MyType.derivative)]
+R original(T0 p0, inout T1 p1, T2 p2);
+```
+
+Sometimes the derivative function needs to be defined in a different module from the original function, or the derivative function cannot be made visible from the original function. In this case, we can use the `[ForwardDerivativeOf(originalFunc)]` attribute to inform the compiler that `originalFunc` should be treated as a forward-differentiable function, and the current function is the derivative implementation of `originalFunc`. The following code will have the same effect of associating `derivative` as the forward-derivative implementation of `original`:
+
+```csharp
+R original(T0 p0, inout T1 p1, T2 p2);
+
+[ForwardDerivativeOf(original)]
+DifferentialPair<R> derivative(DifferentialPair<T0> p0, inout DifferentialPair<T1> p1, T2 p2)
+{
+    ....
+}
+```
+
+## Backward Derivative Propagation Function
+
+A backward derivative propagation function propagates the derivative of the function output to all the input parameters simultaneously.
+
+Given an original function `f`, the general rule for determining the signature of its backward propagation function is that a differentiable output `o` becomes an input parameter holding the partial derivative of a downstream output with regard to the differentiable output (i.e. $$\partial y/\partial o$$); an input differentiable parameter `i` in the original function will become an output in the backward propagation function, holding the propagated partial derivative $$\partial y/\partial i$$; and any non-differentiable outputs are dropped from the backward propagation function. This means that the backward propagation function never returns any values computed in the original function.
+
+More specifically, the signature of its backward propagation function is determined using the following rules:
+- A backward propagation function always returns `void`.
+- A differentiable `in` parameter of type `T` will become an `inout DifferentialPair<T>` parameter, where the original value part of the differential pair contains the original value of the parameter to pass into the back-prop function. The original value will not be overwritten by the backward propagation function. The propagated derivative will be written to the derivative part of the differential pair after the backward propagation function returns. The initial derivative value of the pair is ignored as input.
+- A differentiable `out` parameter of type `T` will become an `in T.Differential` parameter, carrying the partial derivative of some downstream term with regard to the return value.
+- A differentiable `inout` parameter of type `T` will become an `inout DifferentialPair<T>` parameter, where the original value of the argument, along with the downstream partial derivative with regard to the argument is passed as input to the backward propagation function as the original and derivative part of the pair. The propagated derivative with regard to this input parameter will be written back and replace the derivative part of the pair. The primal value part of the parameter will *not* be updated.
+- A differentiable return value of type `R` will become an additional `in R.Differential` parameter at the end of the backward propagation function parameter list, carrying the result derivative of a downstream term with regard to the return value of the original function.
+- A non-differentiable return value of type `NDR` will be dropped.
+- A non-differentiable `in` parameter of type `ND` will remain unchanged in the backward propagation function.
+- A non-differentiable `out` parameter of type `ND` will be removed from the parameter list of the backward propagation function.
+- A non-differentiable `inout` parameter of type `ND` will become an `in ND` parameter.
+
+For example consider the following original function:
+```csharp
+struct T : IDifferentiable {...}
+struct R : IDifferentiable {...}
+struct ND {} // Non differentiable
+
+[Differentiable]
+R original(T p0, out T p1, inout T p2, ND p3, out ND p4, inout ND p5);
+```
+The signature of its backward propagation function is:
+```csharp
+void back_prop(
+    inout DifferentialPair<T> p0,
+    T.Differential p1,
+    inout DifferentialPair<T> p2,
+    ND p3,
+    ND p5,
+    R.Differential dResult);
+```
+Note that although `p2` is still `inout` in the backward propagation function, the backward propagation function will only write propagated derivative to `p2.d` and will not modify `p2.p`.
+
+### Automatically Implemented Backward Propagation Functions
+
+A function can be made backward-differentiable with a `[Differentiable]` or `[BackwardDifferentiable]` attribute. This attribute will cause the compiler to automatically implement the backward propagation function. The syntax for using `[Differentiable]` is:
+
+```csharp
+[Differentiable]
+R original(T0 p0, inout T1 p1, T2 p2);
+```
+
+Once the function is made backward-differentiable, the backward propagation function can then be called with the `bwd_diff` operator:
+```csharp
+bwd_diff(original)(...);
+```
+
+### User Defined Backward Propagation Functions
+Similar to user-defined forward derivative functions, the `[BackwardDerivative]` and `[BackwardDerivativeOf]` attributes can be used to supply a function with user defined backward propagation function.
+
+The syntax for using `[BackwardDerivative]` attribute is:
+```csharp
+void back_prop(
+    inout DifferentialPair<T> p0,
+    T1.Differential p1,
+    inout DifferentialPair<T> p2,
+    ND p3,
+    ND p5,
+    R.Differential dResult)
+{
+    ...
+}
+
+[BackwardDerivative(back_prop)]
+R original(T0 p0, inout T1 p1, T2 p2);
+```
+
+Similarly, the `[BackwardDerivativeOf]` attribute can be used on the back-prop function in case it is not convenient to modify the definition of the original function, or the back-prop function can't be made visible from the original function:
+
+```csharp
+R original(T0 p0, inout T1 p1, T2 p2);
+
+[BackwardDerivativeOf(original)]
+void back_prop(
+    inout DifferentialPair<T> p0,
+    T1.Differential p1,
+    inout DifferentialPair<T> p2,
+    ND p3,
+    ND p5,
+    R.Differential dResult)
+{
+    ...
+}
+```
+
+## Builtin Differentiable Functions
+
+The following built-in functions are backward differentiable and both their forward-derivative and backward-propagation functions are already defined in the built-in library:
+
+- Arithmetic functions: `abs`, `max`, `min`, `sqrt`, `rcp`, `rsqrt`, `fma`, `mad`, `fmod`, `frac`, `radians`, `degrees`
+- Interpolation and clamping functions: `lerp`, `smoothstep`, `clamp`, `saturate`
+- Trigonometric functions: `sin`, `cos`, `sincos`, `tan`, `asin`, `acos`, `atan`, `atan2`
+- Hyperbolic functions: `sinh`, `cosh`, `tanh`
+- Exponential and logarithmic functions: `exp`, `exp2`, `pow`, `log`, `log2`, `log10`
+- Vector functions: `dot`, `cross`, `length`, `distance`, `normalize`, `reflect`, `refract`
+- Matrix transforms: `mul(matrix, vector)`, `mul(vector, matrix)`, `mul(matrix, matrix)`
+- Matrix operations: `transpose`, `determinant`
+- Legacy blending and lighting intrinsics: `dst`, `lit`
+
+## Primal Substitute Functions
+
+Sometimes it is desirable to replace a function with another when generating forward or backward derivative propagation code. For example, the following code shows a function that computes the integral of some term by sampling and we want to use a different sampling strategy when computing the derivatives.
+```csharp
+float myTerm(float x)
+{
+     return someComplexComputation(x);
+}
+
+float getSample(float a, float b) { ... }
+
+[Differentiable]
+float computeIntegralOverMyTerm(float x, float a, float b)
+{
+     float sum = 0.0;
+     for (int i = 0; i < SAMPLE_COUNT; i++)
+     {
+          let s = no_diff getSample(a, b);
+          let y = myTerm(s);
+          sum += y * ((b-a)/SAMPLE_COUNT);
+     }
+     return sum;
+}
+```
+
+In this code, the `getSample` function returns a random sample in the range of `[a,b]`. Assume we have another sampling function `getSampleForDerivativeComputation(a,b)` that we wish to use instead in derivative computation, we can do so by marking it as a primal-substitute of `getSample`, as in the following code:
+```csharp
+[PrimalSubstituteOf(getSample)]
+float getSampleForDerivativeComputation(float a, float b)
+{
+     ...
+}
+```
+
+Here, the `[PrimalSubstituteOf(getSample)]` attribute marks the `getSampleForDerivativeComputation` function as the substitute for `getSample` in derivative propagation functions. When a function has a primal substitute, the compiler will treat all calls to that function as if it is a call to the substitute function when generating derivative code. Note that this only applies to compiler generated derivative function and does not affect user provided derivative functions. If a user provided derivative function calls `getSample`, it will not be replaced by `getSampleForDerivativeComputation` by the compiler.
+
+Similar to `[ForwardDerivative]` and `[ForwardDerivativeOf]` attributes, the `[PrimalSubstitute(substFunc)]` attribute works the other way around: it specifies the primal substitute function of the function being marked.
+
+Primal substitute can be used as another way to make a function differentiable. A function is considered differentiable if it has a primal substitute that is differentiable. The following code illustrates this mechanism.
+```csharp
+float myFunc(float x) {...}
+
+[PrimalSubstituteOf(myFunc)]
+[Differentiable]
+float myFuncSubst(float x) {...}
+
+// myFunc is now considered backward differentiable.
+```
+
+The following example shows in more detail how primal substitute affects derivative computation.
+```csharp
+float myFunc(float x) { return x*x; }
+
+[PrimalSubstituteOf(myFunc)]
+[ForwardDifferentiable]
+float myFuncSubst(float x) { return x*x*x; }
+
+[ForwardDifferentiable]
+float caller(float x) { return myFunc(x); }
+
+let a = caller(4.0); // a == 16.0 (calling myFunc)
+let b = fwd_diff(caller)(diffPair(4.0, 1.0)).p; // b == 64.0 (calling myFuncSubst)
+let c = fwd_diff(caller)(diffPair(4.0, 1.0)).d; // c == 48.0 (calling derivative of myFuncSubst)
+```
+
+In case that a function has both custom defined derivatives and a differentiable primal substitute, the primal substitute overrides the custom defined derivative on the original function. All calls to the original function will be translated into calls to the primal substitute first, and differentiation step follows after. This means that the derivatives of the primal substitute function will be used instead of the derivatives defined on the original function.
+
+## Working with Mixed Differentiable and Non-Differentiable Code
+
+Introducing differentiability to an existing system often involves dealing with code that mixes differentiable and non-differentiable logic.
+Slang provides type checking and code analysis features to allow users to clarify the intention and guard against unexpected behaviors involving when to propagate derivatives through operations.
+
+### Excluding Parameters from Differentiation
+
+Sometimes we do not wish a parameter to be considered differentiable even though it has a differentiable type. We can use the `no_diff` modifier on the parameter to inform the compiler to treat the parameter as non-differentiable and skip generating differentiation code for the parameter. The syntax is:
+
+```csharp
+// Only differentiate this function with regard to `x`.
+float myFunc(no_diff float a, float x);
+```
+
+The forward derivative and backward propagation functions of `myFunc` should have the following signature:
+```csharp
+DifferentialPair<float> fwd_derivative(float a, DifferentialPair<float> x);
+void back_prop(float a, inout DifferentialPair<float> x, float dResult);
+```
+
+In addition, the `no_diff` modifier can also be used on the return type to indicate the return value should be considered non-differentiable. For example, the function
+```csharp
+no_diff float myFunc(no_diff float a, float x, out float y);
+```
+will have the following forward derivative and backward propagation function signatures:
+
+```csharp
+float fwd_derivative(float a, DifferentialPair<float> x);
+void back_prop(float a, inout DifferentialPair<float> x, float d_y);
+```
+
+By default, the implicit `this` parameter will be treated as differentiable if the enclosing type of the member method is differentiable. If you wish to exclude the `this` parameter from differentiation, use the `[NoDiffThis]` attribute on the method:
+```csharp
+struct MyDifferentiableType : IDifferentiable
+{
+    [NoDiffThis]   // Make `this` parameter `no_diff`.
+    float compute(float x) { ... }
+}
+```
+
+### Excluding Struct Members from Differentiation
+
+When using automatic `IDifferentiable` conformance synthesis for a `struct` type, Slang will by default treat all struct members that have a differentiable type as differentiable, and thus include a corresponding field in the generated `Differential` type for the struct.
+For example, given the following definition
+```csharp
+struct MyType : IDifferentiable
+{
+    float member1;
+    float2 member2;
+}
+```
+Slang will generate:
+```csharp
+struct MyType.Differential : IDifferentiable
+{
+    float member1;  // derivative for MyType.member1
+    float2 member2; // derivative for MyType.member2
+}
+```
+If the user does not want a certain member to be treated as differentiable even though it has a differentiable type, a `no_diff` modifier can be used on the struct member to exclude it from differentiation.
+For example, the following code excludes `member1` from differentiation:
+```csharp
+struct MyType : IDifferentiable
+{
+    no_diff float member1;  // excluded from differentiation
+    float2 member2;
+}
+```
+The generated `Differential` in this case will be:
+```csharp
+struct MyType.Differential : IDifferentiable
+{
+    float2 member2;
+}
+```
+
+### Assigning Differentiable Values into a Non-Differentiable Location
+
+When a value with derivatives is being assigned to a location that is not differentiable, such as a struct member that is marked as `no_diff`, the derivative info is discarded and any derivative propagation is stopped at the assignment site.
+This may lead to unexpected results. For example:
+```csharp
+struct MyType : IDifferentiable
+{
+    no_diff float member;
+    float someOtherMember;
+}
+[ForwardDifferentiable]
+float f(float x)
+{
+    MyType t;
+    t.member = x * x; // Error: assigning value with derivative into a non-differentiable location.
+    return t.member;
+}
+...
+let result = fwd_diff(f)(diffPair(3.0, 1.0)).d; // result == 0.0
+```
+In this case, we are assigning the value `x*x`, which carries a derivative, into a non-differentiable location `MyType.member`, thus throwing away any derivative info. When `f` returns `t.member`, there will be no derivative associated with it, so the function will not propagate the derivative through. This code is most likely not intending to discard the derivative through the assignment. To help avoid this kind of unintentional behavior, Slang will treat any assignments of a value with derivative info into a non-differentiable location as a compile-time error. To eliminate this error, the user should either make `t.member` differentiable, or force the assignment by clarifying the intention to discard any derivatives using the built-in `detach` method.
+The following code will compile, and the derivatives will be discarded:
+```csharp
+[ForwardDifferentiable]
+float f(float x)
+{
+    MyType t;
+    // OK: the code has expressed clearly the intention to discard the derivative and perform the assignment.
+    t.member = detach(x * x);
+    return t.member;
+}
+```
+
+### Calling Non-Differentiable Functions from a Differentiable Function
+Calling a non-differentiable function from a differentiable function is allowed. However, derivatives will not be propagated through the call. The user is required to clarify the intention by prefixing the call with the `no_diff` keyword. An un-clarified call to a non-differentiable function will result in a compile-time error.
+
+For example, consider the following code:
+```csharp
+float g(float x)
+{
+    return 2*x;
+}
+
+[ForwardDifferentiable]
+float f(float x)
+{
+    // Error: implicit call to non-differentiable function g.
+    return g(x) + x * x;
+}
+```
+The derivative will not propagate through the call to `g` in `f`. As a result, `fwd_diff(f)(diffPair(1.0, 1.0))` will return
+`{3.0, 2.0}` instead of `{3.0, 4.0}` as the derivative from `2*x` is lost through the non-differentiable call. To prevent unintended error, it is treated as a compile-time error to call `g` from `f`. If such a non-differentiable call is intended, a `no_diff` prefix is required in the call:
+```csharp
+[ForwardDifferentiable]
+float f(float x)
+{
+    // OK. The intention to call a non-differentiable function is clarified.
+    return no_diff g(x) + x * x;
+}
+```
+
+However, the `no_diff` keyword is not required in a call if a non-differentiable function does not take any differentiable parameters, or if the result of the differentiable function is not dependent on the derivative being propagated through the call.
+
+### Treat Non-Differentiable Functions as Differentiable
+Slang allows functions to be marked with a `[TreatAsDifferentiable]` attribute for them to be considered as differentiable functions by the type-system. When a function is marked as `[TreatAsDifferentiable]`, the compiler will not generate derivative propagation code from the original function body or perform any additional checking on the function definition. Instead, it will generate trivial forward and backward propagation functions that return 0.
+
+This feature can be useful if the user marked an `interface` method as forward or backward differentiable, but only wishes to provide non-trivial derivative propagation functions for a subset of types that implement the interface. For other types that do not actually need differentiation, the user can simply put `[TreatAsDifferentiable]` on the method implementations for them to satisfy the interface requirement.
+
+See the following code for an example of `[TreatAsDifferentiable]`:
+```csharp
+interface IFoo
+{
+    [Differentiable]
+    float f(float v);
+}
+
+struct B : IFoo
+{
+    [TreatAsDifferentiable]
+    float f(float v)
+    {
+        return v * v;
+    }
+}
+
+[Differentiable]
+float use(IFoo o, float x)
+{
+    return o.f(x);
+}
+
+// Test:
+B obj;
+float result = fwd_diff(use)(obj, diffPair(2.0, 1.0)).d;
+// result == 0.0, since `[TreatAsDifferentiable]` causes a trivial derivative implementation
+// to be generated regardless of the original code.
+```
+
+## Higher Order Differentiation
+
+Slang supports generating higher order forward and backward derivative propagation functions. It is allowed to use `fwd_diff` and `bwd_diff` operators inside a forward or backward differentiable function, or to nest `fwd_diff` and `bwd_diff` operators. For example, `fwd_diff(fwd_diff(sin))` will have the following signature:
+
+```csharp
+DifferentialPair<DifferentialPair<float>> sin_diff2(DifferentialPair<DifferentialPair<float>> x);
+```
+
+The input parameter `x` contains four fields: `x.p.p`, `x.p.d`, `x.d.p`, `x.d.d`, where `x.p.p` specifies the original input value, both `x.p.d` and `x.d.p` store the first order derivative of `x`, and `x.d.d` stores the second order derivative of `x`. Calling `fwd_diff(fwd_diff(sin))` with `diffPair(diffPair(pi/2, 1.0), diffPair(1.0, 0.0))` will result in `{ { 1.0, 0.0 }, { 0.0, -1.0 } }`.
+
+User defined higher-order derivative functions can be specified by using `[ForwardDerivative]` or `[BackwardDerivative]` attribute on the derivative function, or by using `[ForwardDerivativeOf]` or `[BackwardDerivativeOf]` attribute on the higher-order derivative function.
+
+## Interactions with Generics and Interfaces
+
+Automatic differentiation for generic functions is supported. The forward-derivative and backward propagation functions of a generic function are also generic functions with the same set of generic parameters and constraints. Using `[ForwardDerivative]`, `[ForwardDerivativeOf]`, `[BackwardDerivative]` or `[BackwardDerivativeOf]` attributes to associate a derivative function with a different set of generic parameters or constraints is a compile-time error.
+
+An interface method requirement can be marked as `[ForwardDifferentiable]` or `[Differentiable]`, so that it may be called in a forward or backward differentiable function and have the derivatives propagate through the call. This works regardless of whether the call can be specialized or has to go through dynamic dispatch. However, calls to interface methods are only differentiable once. Higher order differentiation through interface method calls is not supported.
+
+## Restrictions of Automatic Differentiation
+
+The compiler can generate forward derivative and backward propagation implementations for most uses of array and struct types, including arbitrary read and write access at dynamic array indices, and supports uses of all types of control flows, mutable parameters, generics and interfaces. This covers the set of operations that is sufficient for a lot of functions. However, the user needs to be aware of the following restrictions when using automatic differentiation:
+
+- All operations on global resources, global variables and shader parameters, including texture reads or atomic writes, are treated as non-differentiable operations.
+- If a differentiable function contains calls that cause side-effects such as updates to global memory, there will not be a guarantee on how many times the side-effect will occur during the resulting derivative function or back-propagation function.
+- Loops: Loops must use the attribute `[MaxIters(<count>)]` to specify a maximum number of iterations. This will be used by the compiler to allocate space to store intermediate data. If the actual number of iterations exceeds the provided maximum, the behavior is undefined. You can always mark a loop with the `[ForceUnroll]` attribute to instruct the Slang compiler to unroll the loop before generating derivative propagation functions. Unrolled loops will be treated the same way as ordinary code and are not subject to any additional restrictions.
+
+The above restrictions do not apply if a user-defined derivative or backward propagation function is provided.
diff --git a/docs/user-guide/07-targets.md b/docs/user-guide/07-targets.md
deleted file mode 100644
index 68f18cc9e..000000000
--- a/docs/user-guide/07-targets.md
+++ /dev/null
@@ -1,366 +0,0 @@
----
-layout: user-guide
----
-
-Supported Compilation Targets
-============================
-
-This chapter provides a brief overview of the compilation targets supported by Slang, and their different capabilities.
-
-Background and Terminology
---------------------------
-
-### Code Formats
-
-When Slang compiles for a target platform one of the most important distinctions is the _format_ of code for that platform.
-For a native CPU target, the format is typically the executable machine-code format for the processor family (for example, x86-64).
-In contrast, GPUs are typically programmed through APIs that abstract over multiple GPU processor families and versions.
-GPU APIs usually define an _intermediate language_ that sits between a high-level-language compiler like Slang and GPU-specific compilers that live in drivers for the API.
-
-### Pipelines and Stages
-
-GPU code execution occurs in the context of a _pipeline_.
-A pipeline comprises one or more _stages_ and dataflow connections between them.
-Some stages are _programmable_ and run a user-defined _kernel_ that has been compiled from a language like Slang, while others are _fixed-function_ and can only be configured, rather than programmed, by the user.
-Slang supports three different pipelines.
-
-#### Rasterization
-
-The _rasterization_ pipeline is the original GPU rendering pipeline.
-On current GPUs, the simplest rasterization pipelines have two programmable stages: a `vertex` stage and a `fragment` stage.
-The rasterization pipeline is named after its most important fixed-function stage: the rasterizer, which determines the pixels covered by a geometric primitive, and emits _fragments_ covering those pixels, to be shaded.
-
-#### Compute
-
-The _compute_ pipeline is a simple pipeline with only one stage: a programmable `compute` stage.
-As a result of being a single-stage pipeline the compute pipeline doesn't need to deal with many issues around inter-stage dataflow that other pipelines do.
-
-#### Ray Tracing
-
-A _ray tracing_ pipeline has multiple stages pertaining to the life cycle of a ray being traced through a scene of geometric primitives.
-These can include an `intersection` stage to compute whether a ray intersects a geometry primitive, a `miss` stage that runs when a ray does not intersect any geometric object in a scene, etc.
-
-Note that some platforms support types and operations related to ray tracing that can run outside of the context of a dedicated ray tracing pipeline.
-Just as applications can do computation outside of the dedicated compute pipeline, the use of ray tracing does not necessarily mean that a ray tracing pipeline is being used.
-
-### Shader Parameter Bindings
-
-The kernels that execute within a pipeline typically has access to four different kinds of data:
-
-* _Varying inputs_ coming from the system or from a preceding pipeline stage
-
-* _Varying outputs_ which will be passed along to the system or to a following pipeline stage
-
-* _Temporaries_ which are scratch memory or registers used by each invocation of the kernel and then dismissed on exit.
-
-* _Shader parameters_ (sometimes also called _uniform parameters_), which provide access to data from outside the pipeline dataflow
-
-The first three of these kinds of data are largely handled by the implementation of a pipeline.
-In contrast, an application programmer typically needs to manually prepare shader parameters, using the appropriate mechanisms and rules for each target platform.
-
-On platforms that provide a CPU-like "flat" memory model with a single virtual address space, and where any kind of data can be stored at any address, passing shader parameters can be almost trivial.
-Current graphics APIs provide far more complicated and less uniform mechanisms for passing shader parameters.
-
-A high-level language compiler like Slang handles the task of _binding_ each user-defined shader parameter to one or more of the parameter-passing resources defined by a target platform.
-For example, the Slang compiler might bindg a global `Texture2D` parameter called `gDiffuse` to the `t1` register defined by the Direct3D 11 API.
-
-An application is responsible for passing the argument data for a parameter using the using the corresponding platform-specific resource it was bound to.
-For example, an application should set the texture they want to use for `gDiffuse` to the `t1` register using Direct3D 11 API calls.
-
-#### Slots
-
-Historically, most graphics APIs have used a model where shader parameters are passed using a number of API-defined _slots_.
-Each slot can store a single argument value of an allowed type.
-Depending on the platform slots might be called "registers," "locations," "bindings," "texture units," or other similar names.
-
-Slots almost exclusively use opaque types: textures, buffers, etc.
-On platforms that use slots for passing shader parameters, value of ordinary types like `float` or `int` need to be stored into a buffer, and then that buffer is passed via an appropriate slot.
-
-Although many graphics APIs use slots as an abstraction, the details vary greatly across APIs.
-Different APIs define different kinds of slots, and the types of arguments that may be stored in those slots vary.
-For example, one API might use two different kinds of slots for textures and buffers, while another uses a single kind of slot for both.
-On some APIs each pipeline stage gets is own dedicated slots, while on others slots are shared across all stages in a pipeline.
-
-#### Blocks
-
-Newer graphics APIs typically provide a system for grouping related shader parameters into re-usable _blocks_.
-Blocks might be referred to as "descriptor tables," "descriptor sets," or "argument buffers."
-Each block comprises one or more slots (often called "descriptors") that can be used to bind textures, buffers, etc.
-
-Blocks are in turn set into appropriate slots provided by a pipeline.
-Because a block can contain many different slots for textures or buffers, switching a pipeline argument from one block to another can effectively swap out a large number of shader parameters in one operation.
-Thus, while blocks introduce a level of indirection to parameter setting, then can also enable greater efficiency when parameters are grouped into blocks according to frequency of change.
-
-#### Root Constants
-
-Most recent graphics APIs also allow for a small amount of ordinary data (meaning types like `float` and `int` but not opaque types like buffers or textures) to be passed to the pipeline as _root constants_ (also called "push constants").
-
-Using root constants can eliminate some overheads from passing parameters of ordinary types via buffers.
-Passing a single `float` using a root constant rather than a buffer obviously eliminates a level of indirection.
-More importantly, though, using a root constant can avoid application code having to allocate and manage the lifetime of a buffer in a concurrent CPU/GPU program.
-
-Direct3D 11
------------
-
-Direct3D 11 (D3D11) is a older graphics API, but remains popular because it is much simpler to learn and use than some more recent APIs.
-In this section we will give an overview of the relevant features of D3D11 when used as a target platform for Slang.
-Subsequent sections about other APIs may describe them by comparison to D3D11.
-
-D3D11 kernels must be compiled to the DirectX Bytecode (DXBC) intermediate language.
-A DXBC binary includes a hash/checksum computed using an undocumented algorithm, and the runtime API rejects kernels without a valid checksum.
-The only supported way to generate DXBC is by compiling HLSL using the fxc compiler.
-
-### Pipelines
-
-D3D11 exposes two pipelines: rasterization and compute.
-
-The D3D11 rasterization pipeline can include up to five programmable stages, although most of them are optional:
-
-* The `vertex` stage (VS) transforms vertex data loaded from memory
-
-* The optional `hull` stage (HS) typically sets up and computes desired tessellation levels for a higher-order primitive
-
-* The optional `domain` stage (DS) evaluates a higher-order surface at domain locations chosen by a fixed-function tessellator
-
-* The optional `geometry` stage  (GS) receives as input a primitive and can produce zero or more new primitives as output
-
-* The optional `fragment` stage transforms fragments produced by the fixed-function rasterizer, determining the values for those fragments that will be merged with values in zero or more render targets. The fragment stage is sometimes called a "pixel" stage (PS), even when it does not process pixels.
-
-### Parameter Passing
-
-Shader parameters are passed to each D3D11 stage via slots.
-Each stage has its own slots of the following types:
-
-* **Constant buffers** are used for passing relatively small (4KB or less) amounts of data that will be read by GPU code. Constant bufers are passed via `b` registers.
-
-* **Shader resource views** (SRVs) include most textures, buffers, and other opaque resource types thare are read or sampled by GPU code. SRVs use `t` registers.
-
-* **Unordered access views** (UAVs) include textures, buffers, and other opaque resource types used for write or read-write operations in GPU code. UAVs use `u` registers.
-
-* **Samplers** are used to pass opaque texture-sampling stage, and use `s` registers.
-
-In addition, the D3D11 pipeline provides _vertex buffer_ slots and a single _index buffer_ slot to be used as the source vertex and index data that defines primitives.
-User-defined varying vertex shader inputs are bound to _vertex attribute_ slots (referred to as "input elements" in D3D11) which define how data from vertex buffers should be fetched to provide values for vertex attributes.
-
-The D3D11 rasterization pipeline also provides a mechanism for specifying _render target views_ (RTVs) and _depth-stencil views_ (DSVs) that provide the backing storage for the pixels in a framebuffer.
-User-defined fragment shader varying outputs (with `SV_Target` binding semantics) are bound to RTV slots.
-
-One notable detail of the D3D11 API is that the slots for fragment-stage UAVs and RTVs overlap.
-For example, a fragment kernel cannot use both `u0` and `SV_Target0` at once.
-
-Direct3D 12
------------
-
-Direct3D 12 (D3D12) is the current major version of the Direct2D API.
-
-D3D12 kernels must be compiled to the DirectX Intermediate Language (DXIL).
-DXIL is a layered encoding based off of LLVM bitcode; it introduces additional formatting rules and constraints which are loosely documented.
-A DXIL binary may be signed, and the runtime API only accepts appropriately signed binaries (unless a developer mode is enabled on the host machine).
-A DXIL validator `dxil.dll` is included in SDK releases, and this validator can sign binaries that pass validation.
-While DXIL can in principle be generated from multiple compiler front-ends, support for other compilers is not prioritized.
-
-### Pipelines
-
-D3D12 includes rasterization and compute pipelines similar to those in D3D11.
-Revisions to D3D12 have added additional stages to the rasterization pipeline, as well as a ray-tracing pipeline.
-
-#### Mesh Shaders
-
-> #### Note ###
-> The Slang system does not currently support mesh shaders.
-
-The D3D12 rasterization pipeline provides alternative geometry processing stages that may be used as an alternative to the `vertex`, `hull`, `domain`, and `geometry` stages:
-
-* The `mesh` stage runs groups of threads which are responsible cooperating to produce both the vertex and index data for a _meshlet_ a bounded-size chunk of geometry.
-
-* The optional `amplification` stage precedes the mesh stage and is responsible for determining how many mesh shader invocations should be run.
-
-Compared to the D3D11 pipeline without tesselllation (hull and domain shaders), a mesh shader is kind of like a combined/generalized vertex and geometry shader.
-
-Compared to the D3D11 pipeline with tessellation, an amplification shader is kind of like a combined/generalized vertex and hull shader, while a mesh shader is kind of like a combined/generalized domain and geometry shader.
-
-#### Ray Tracing
-
-The DirectX Ray Tracing (DXR) feature added a ray tracing pipeline to D3D12.
-The D3D12 ray tracing pipeline exposes the following programmable stages:
-
-* The ray generation (`raygeneration`) stage is similar to a compute stage, but can trace zero or more rays and make use of the results of those traces.
-
-* The `intersection` stage runs kernels to compute whether a ray intersects a user-defined primitive type. The system also includes a default intersector that handles triangle meshes.
-
-* The so-called any-hit (`anyhit`) stage runs on _candidate_ hits where a ray has intersected some geometry, but the hit must be either accepted or rejected by application logic. Note that the any-hit stage does not necessarily run on *all* hits, because configuration options on both scene geometry and rays can lead to these checks being bypassed.
-
-* The closest-hit (`closesthit`) stage runs a single _accepted_ hit for a ray; under typical circumstances this will be the closest hit to the origin of the ray. A typical closest-hit shader might compute the apparent color of a surface, similar to a typical fragment shader.
-
-* The `miss` stage runs for rays that do not find or accept any hits in a scene. A typical miss shader might return a background color or sample an environment map.
-
-* The `callable` stage allows user-defined kernels to be invoked like subroutines in the context of the ray tracing pipeline.
-
-Compared to existing rasterization and compute pipelines, an important difference in the design of the D3D12 ray tracing pipeline is that multiple kernels can be loaded into the pipeline for each of the programming stages.
-The specific closest-hit, miss, or other kernel that runs for a given hit or ray is determined by indexing into an appropriate _shader table_, which is effectively an array of kernels.
-The indexing into a shader table can depend on many factors including the type of ray, the type of geometry hit, etc.
-
-Note that DXR version 1.1 adds ray tracing types and operations that can be used outside of the dedicated ray tracing pipeline.
-These new mechanisms have less visible impact for a programmer using or integrating Slang.
-
-
-### Parameter Passing
-
-The mechanisms for parameter passing in D3D12 differ greatly from D3D11.
-Most opaque types (texture, resources, samplers) must be set into blocks (D3D12 calls blocks "descriptor tables").
-Each pipeline supports a fixed amount of storage for "root parameters," and allows those root parameters to be configured as root constants, slots for blocks, or slots for a limited number of opaque types (primarily just flat buffers).
-
-Shader parameters are still grouped and bound to registers as in D3D11; for example, a `Texture2D` parameter is considered as an SRV and uses a `t` register.
-D3D12 additionally associates binds shader parameters to "spaces" which are expressed similarly to registers (e.g., `space2`), but represent an orthogonal "axis" of binding.
-
-While shader parameters are bound registers and spaces, those registers and spaces do not directly correspond to slots provided by the D3D12 API the way registers do in D3D11.
-Instead, the configuration of the root parameters and the correspondence of registers/spaces to root parameters, blocks, and/or slots are defined by a _pipeline layout_ that D3D12 calls a "root signature."
-
-Unlike D3D11, all of the stages in a D3D12 pipeline share the same root parameters.
-A D3D12 pipeline layout can specify that certain root parameters or certain slots within blocks will only be accessed by a subset of stages, and can map the *same* register/space pair to different parameters/blocks/slots as long as this is done for disjoint subset of stages.
-
-#### Ray Tracing Specifics
-
-The D3D12 ray tracing pipeline adds a new mechanism for passing shader parameters.
-In addition to allowing shader parameters to be passed to the entire pipeline via root parameters, each shader table entry provides storage space for passing argument data specific to that entry.
-
-Similar to the use of a pipline layout (root signature) to configure the use of root parameters, each kernel used within shader entries must be configured with a "local root signature" that defines how the storage space in the shader table entry is to be used.
-Shader parameters are still bound to registers and spaces as for non-ray-tracing code, and the local root signature simply allows those same registers/spaces to be associated with locations in a shader table entry.
-
-One important detail is that some shader table entries are associated with a kernel for a single stage (e.g., a single miss shader), while other shader table entries are associated with a "hit group" consisting of up to one each of an intersection, any-hit, and closest-hit kernel.
-Because multiple kernels in a hit group share the same shader table entry, they also share the configured slots in that entry for binding root constants, blocks, etc.
-
-Vulkan
-------
-
-Vulkan is a cross-platform GPU API for graphics and compute with a detailed specification produced by a multi-vendor standards body.
-In contrast with OpenGL, Vulkan focuses on providing explicit control over as many aspects of GPU work as possible.
-In contrast with OpenCL, Vulkan focuses first and foremost on the needs of real-time graphics developers.
-
-Vulkan requires kernels to be compiled to the SPIR-V intermediate language.
-SPIR-V is a simple and extensible binary program format with a detailed specification; it is largely unrelated to earlier "SPIR" formats that were LLVM-based and loosely specified.
-The SPIR-V format does not require signing or hashing, and is explicitly designed to allow many different tools to produce and manipulate the format.
-Drivers that consume SPIR-V are expected to perform validation at load time.
-Some choices in the SPIR-V encoding are heavily influenced by specific design choices in the GLSL language, and may require non-GLSL compilers to transform code to match GLSL idioms.
-
-### Pipelines
-
-Vulkan includes rasterization, compute, and ray tracing pipelines with the same set of stages as described for D3D12 above.
-
-### Parameter Passing
-
-Like D3D12, Vulkan uses blocks (called "descriptor sets") to organize groups of bindings for opaque types (textures, buffers, samplers).
-Similar to D3D12, a Vulkan pipeline supports a limited number of slots for passing blocks to the pipeline, and these slots are shared across all stages.
-Vulkan also supports a limited number of bytes reserved for passing root constants (called "push constants").
-Vulkan uses pipeline layouts to describe configurations of usage for blocks and root constants.
-
-High-level-language shader parameters are bound to a combination of a "binding" and a "set" for Vulkan, which are superficially similar to the registers and spaces of D3D12.
-Unlike D3D12, however, bindings and sets in Vulkan directly correspond to the API-provided parameter-passing mechanism.
-The set index of a parameter indicates the zero-based index of a slot where a block must be passed, and the binding index is the zero-based index of a particular opaque value set into the block.
-A shader parameter that will be passed using root constants (rather than via blocks) must be bound to a root-constant offset as part of compilation.
-
-Unlike D3D12, where SRVs, UAVs, etc. use distinct classes of registers, all opaque-type shader parameters use the same index space of bindings.
-That is, a buffer and a texture both using `binding=2` in `set=3` for Vulkan will alias the same slot in the same block.
-
-The Vulkan ray tracing pipeline also uses a shader table, and also forms hit groups similar to D3D12.
-Unlike D3D12, each shader table entry in Vulkan can only be used to pass ordinary values (akin to root constants), and cannot be configured for binding of opaque types or blocks.
-
-OpenGL
-------
-
-> #### Note ####
-> Slang has only limited support for compiling code for OpenGL.
-
-OpenGL has existed for many years, and predates programmable GPU pipelines of the kind this chapter discusses; we will focus solely on use of OpenGL as an API for programmable GPU pipelines.
-
-OpenGL is a cross-platform GPU API for graphics and compute with a detailed specification produced by a multi-vendor standard body.
-In contrast with Vulkan, OpenGL provides many convenience and safety features that can simplify GPU programming.
-
-OpenGL allows kernels to be loaded as SPIR-V binaries, vendor-specific binaries, or using GLSL source code.
-Loading shaders as GLSL source code is the most widely supported of these options, such that GLSL is the _de facto_ intermediate language of OpenGL.
-
-### Pipelines
-
-OpenGL supports rasterization and compute pipelines with the same stages as described for D3D11.
-The OpenGL rasterization pipeline also supports the same mesh shader stages that are supported by D3D12.
-
-### Parameter Passing
-
-OpenGL uses slots for binding.
-There are distinct kinds of slots for buffers and textures/images, and each set of slots is shared by all pipeline stages.
-
-High-level-language shader parameters are bound to a "binding" index for OpenGL.
-The binding index of a parameter is the zero-based index of the slot (of the appropriate kind) that must be used to pass an argument value.
-
-Note that while OpenGL and Vulkan both use binding indices for shader parameters like textures, the semantics of those are different because OpenGL uses distinct slots for passing buffers and textures.
-For OpenGL it is legal to have a texture that uses `binding=2` and a buffer that uses `binding=2` in the same kernel, because those are indices of distinct kinds of slots, while this scenario would typically be invalid for Vulkan.
-
-CUDA and OptiX
---------------
-
-> #### Note ####
-> Slang support for OptiX is a work in progress.
-
-CUDA C/C++ is a language for expressing heterogeneous CPU and GPU code with a simple interface to invoking GPU compute work.
-OptiX is a ray tracing API that uses CUDA C++ as the language for expressing shader code.
-We focus here on OptiX version 7 and up.
-
-CUDA and OptiX allow kernels to be loaded as GPU-specific binaries, or using the PTX intermediate language.
-
-
-### Pipelines
-
-CUDA supports a compute pipeline that is similar to D3D12 or Vulkan, with additional features.
-
-OptiX introduced the style of ray tracing pipeline adopted by D3D12 and Vulkan, and thus uses the same basic stages.
-
-The CUDA system does not currently expose a rasterization pipeline.
-
-### Parameter Passing
-
-Unlike most of the GPU APIs discussed so far, CUDA supports a "flat" memory model with a single virtual address space for all GPU data.
-Textures, buffers, etc. are not opaque types, but can instead sit in the same memory as ordinary data like `float`s or `int`s.
-
-With a flat memory model, a distinct notion of a slot or block is not needed.
-A slot is just an ordinary memory location that happens to be used to store a value of texture, buffer, or other resource type.
-A block is just an ordinary memory buffer that happens to be filled with values of texture/buffer/etc. type.
-
-CUDA provides two parameter-passing mechanisms for the compute pipeline.
-First, when invoking a compute kernel, the application passes a limited number of bytes of parameter data that act as root constants.
-Second, each loaded module of GPU code may contain pre-allocated "constant memory" storage which can be initialized from the host and then read by GPU code.
-Because types like blocks or textures are not special in CUDA, either of these mechanisms can be utilized to pass any kind of data including references to pointer-based data structures stored in the GPU virtual address space.
-The use of "slots" or "blocks" or "root constants" is a matter of application policy instead of API mechanism.
-
-OptiX supports use of constant memory storage for ray tracing pipelines, where all the stages in a ray tracing pipeline share that storage.
-OptiX uses a shader table for managing kernels and hit groups, and allows kernels to access the bytes of their shader table entry via a pointer.
-Similar to the compute pipeline, application code can layer many different policies on top of these mechanisms.
-
-CPU Compute
------------
-
-> #### Note ####
-> Slang's support for CPU compute is functional, but not feature- or performance-complete.
-> Backwards-incompatible changes to this target may come in future versions of Slang.
-
-For the purposes of Slang, different CPU-based host platforms are largely the same.
-All support binary code in a native machine-code format.
-All CPU platforms Slang supports use a flat memory model with a single virtual address space, where any data type can be stored at any virtual address.
-
-Note that this section considers CPU-based platforms only as targets for kernel compilation; using a CPU as a target for scalar "host" code is an advanced topic beyond the scope of this document.
-
-### Pipelines
-
-Slang's CPU compute target supports only a compute pipeline.
-
-### Parameter Passing
-
-Because CPU targets support flexible pointer-based addressing and large low-latency caches, a compute kernel can simply be passed a small fixed number of pointers and be relied upon to load parameter values of any types via indirection through those pointers.
-
-Summary
--------
-
-This chapter has reviewed the main target platforms supported by the Slang compiler and runtime system.
-A key point to take away is that there is great variation in the capabilities of these systems.
-Even superficially similar graphics APIs have complicated differences in their parameter-passing mechanisms that must be accounted for by application programmers and GPU compilers.
-
-In the next chapter, we will discuss how the Slang compiler adapts to the different capabilities and rules of these platforms when laying out shader parameters in memory and then binding those parameters to the mechanisms defined by each platform.
diff --git a/docs/user-guide/08-autodiff.md b/docs/user-guide/08-autodiff.md
deleted file mode 100644
index a2a06b64b..000000000
--- a/docs/user-guide/08-autodiff.md
+++ /dev/null
@@ -1,757 +0,0 @@
----
-layout: user-guide
----
-
-# Automatic Differentiation
-
-Neural networks and other machine learning techniques are becoming an increasingly popular way to solve many difficult problems in modern visual computing systems. However, to take advantage of these techniques, developers often need to reimplement many existing system components in a differentiable form to allow computing the derivatives of a function, or to propagate the derivative of a result backwards to each parameter. Slang provides built-in auto differentiation features to support developers adding differentiability to their existing code with as little effort as possible. In this chapter, we provide an overview of the auto differentiation features, followed by a detailed description on the new syntax and rules.
-
-## Using Automatic Differentiation in Slang
-
-In this section, we walk through the steps to compute forward-derivative from input, and backward propagate the derivative from output to input.
-
-### Forward Differentiation
-
-Suppose the user has already written a function that computes some mathematical term:
-
-```csharp
-float myFunc(float a, float x)
-{
-    return a * x * x;
-}
-```
-
-The user can make this function *forward-differentiable* by adding a `[ForwardDifferentiable]` attribute:
-```csharp
-[ForwardDifferentiable]
-float myFunc(float a, float x)
-{
-    return a * x * x;
-}
-```
-
-This allows the function to be used in the `fwd_diff` operator, which is a higher order operation that takes in a forward-differentiable function and returns the forward-derivative of the function.
-
-The expression `fwd_diff(myFunc)` will have the following signature:
-```csharp
-DifferentialPair<float> myFunc_fwd_derivative(DifferentialPair<float> a, DifferentialPair<float> x);
-```
-
-Where `DifferentialPair<T>` is a built-in type that encodes both the primal(original) value and the derivative value of a term.
-To use this function to compute the derivative of `myFunc` with regard to `x`, the user can call the forward-derivative function by supplying the derivative value of `x` with `1.0` and the derivative value of `a` with `0.0`, as in the following code:
-
-```csharp
-float a = 2.0;
-float x = 3.0;
-// Compute derivative with regard to `x`:
-let result = fwd_diff(myFunc)(diffPair(a, 0.0), diffPair(x, 1.0));
-// Print the derivative.
-printf("%f", result.d);
-
-// Output: 12.0
-```
-
-In the example code above, `diffPair()` is a built-in function to construct a value of `DifferentialPair<T>` with a primal value and a derivative value. The primal value and derivative value stored in a `DifferentialPair` can be accessed with the `.p` and `.d` properties.
-
-### Backward Propagation
-
-The forward derivative function allows the user to compute the derivative of a function with regard to a specific combination of input parameters at a time. In many cases, we need to know how each parameter affects the output. Instead of calling the forward derivative function once for each parameter, it is more efficient to call the *backward propagation* function that propagates the derivative of outputs to each input parameter.
-
-To allow the compiler to generate the backward propagation function, we simply mark our function with the `[Differentiable]` or `[BackwardDifferentiable]` attribute:
-```csharp
-[Differentiable]
-float myFunc(float a, float x)
-{
-    return a * x * x;
-}
-```
-
-> #### Note:
-> When a function is marked as `[Differentiable]`, it is implied that the function is both `[ForwardDifferentiable]` and `[BackwardDifferentiable]` and can be used in the `fwd_diff` operator.
-
-
-The `bwd_diff` operator applies to a backward differentiable function and returns the backward propagation function. In this case, `bwd_diff(myFunc)` will have the following signature:
-
-```csharp
-void myFunc_backProp(inout DifferentialPair<float> a, inout DifferentialPair<float> x, float dResult);
-```
-
-Where `a` is an `inout DifferentialPair` where the initial value of `a` is passed into the function as primal value (in the `.p` property), and the propagated derivative of `a` is returned via the `.d` property of the `DifferentialPair`. The same rules apply to `x`.
-
-The additional `dResult` parameter is the derivative of the return value to be propagated to the input parameters. Note that in a backward propagation function, an input will become a `inout DifferentialPair` where the `.d` property of the pair is intended for receiving the propagation result, and the return value will become an input parameter that represents the source of backward propagation.
-
-The backward propagation function can be called as in the following code:
-```csharp
-var a = diffPair(2.0); // constructs DifferentialPair{2.0, 0.0}
-var x = diffPair(3.0); // constructs DifferentialPair{3.0, 0.0}
-
-bwd_diff(myFunc)(a, x, 1.0);
-
-// a.d is now 9.0
-// x.d is now 12.0
-```
-
-This completes the walkthrough of automatic differentiation features. The following sections will cover each perspective of the auto differentiation feature in more detail.
-
-## Mathematic Concepts and Terminologies
-
-This section briefly reviews the mathematic theories behind differentiable programming with the intention to clarify the concepts and terminologies that will be used in the rest of this documentation. We assume the reader is already familiar with the basic theories behind neural network training, in particular the back-propagation algorithm.
-
-A differentiable system can be represented as a composition of differentiable functions (kernels) with learnable parameters, where each differentiable function has the form:
-
-$$\mathbf{w}_{i+1} = f_i(\mathbf{w}_i) $$
-
-Where $$f_i$$ represents a differentiable function (kernel) in the system, $$\mathbf{w}$$ represents a collection of learnable parameters defined in function $$f_i$$, and $$\mathbf{w}_{i+1}$$ is the output of $$f_i$$. We will use $$\omega$$ to denote a specific parameter in $$\mathbf{w}$$.
-
-In a composed system, the value of $$\mathbf{w}$$ used to evaluate $$f_i$$ may come from an *upstream* function
-
-$$ \mathbf{w}_i = f_{i-1}(\mathbf{w}_{i-1}) $$
-
-Similarly, the value computed by $$f_i$$ may be used as argument to a *downstream* function
-
-$$ h = f_{i+1}(\mathbf{w}_{i+1}) = f_{i+1}(f_{i}(\mathbf{w}_{i}))$$
-
-The entire system composed from differentiable functions can be noted as
-
-$$Y = f_1 \circ f_2 \circ \cdots \circ f_n(\mathbf{w}_0)$$
-
-Where $$\mathbf{w}_0$$ is the first layer of parameters.
-
-### Forward Propagation of Derivatives
-When developing and training such a system, we often need to evaluate the partial derivative of a differentiable function with regard to some parameter $$\omega$$. The simplest way to obtain a partial derivative is to call a forward derivative propagation function, which is defined by:
-
-$$ \mathbb{F}[f_i] = f_i'(\mathbf{w}_i, \mathbf{w}_i') = \sum_{\omega_i\in\mathbf{w}_i} \frac{\partial f}{\partial \omega_i} \omega_i' $$
-
-Where $$\omega' \in \mathbf{w}'$$ represents the partial derivative of $$\omega_i$$ with regard to some upstream parameter $$\omega_{i-1}$$ that is used to compute $$\omega_i$$, i.e. $$\omega'=\frac{\partial \omega_{i}}{\partial \omega_{i-1}}$$.
-
-Given this definition, $$\mathbb{F}[f]$$ can be used as a forward propagation function that is able to compute $$\frac{\partial f_i}{\partial \omega_0}$$ from $$\frac{\partial \omega_{i-1}}{\partial \omega_0}$$.
-
-### Backward Propagation of Derivatives
-When using the backpropagation algorithm to train a neural network, we are more interested in figuring out the partial derivative of the final system output with regard to a parameter $$\omega_i$$ in $$f_i$$. To do so, we generally utilize the backward derivative propagation function
-
-$$\mathbb{B}[f_i] = f_i^{-1}(\frac{\partial Y}{\partial f_i}) = \frac{\partial Y}{\partial \mathbf{w}_i}$$
-
-Where the backward propagation function $$\mathbb{B}[f_i]$$ takes as input the partial derivative of the final system output $$Y$$ with regard to the output of $$f_i$$ (i.e. $$\mathbf{w}_{i+1}$$), and computes the partial derivative of the final system output with regard to the input of $$f_i$$ (i.e. $$\mathbf{w}_i$$).
-
-The higher order operators $$\mathbb{F}$$ and $$\mathbb{B}$$ represent the operations that convert an original or primal function $$f$$ to its forward or backward derivative propagation function. Slang's automatic differentiation feature provides built-in support for these operators to automatically generate the derivative propagation functions from a user defined primal function. The remaining documentation will discuss this feature from a programming language perspective.
-
-## Differentiable Types
-Slang will only generate differentiation code for values that have a *differentiable* type. A type is differentiable if it conforms to the built-in `IDifferentiable` interface. The definition of the `IDifferentiable` interface is:
-```csharp
-interface IDifferentiable
-{
-    associatedtype Differential : IDifferentiable
-        where Differential.Differential == Differential;
-
-    static Differential dzero();
-
-    static Differential dadd(Differential, Differential);
-
-    static Differential dmul(This, Differential);
-}
-```
-As defined by the `IDifferentiable` interface, a differentiable type must have a `Differential` associated type that stores the derivative of the value. A further requirement is that the type of the second-order derivative must be the same `Differential` type. In other words, given a type `T`, `T.Differential` can be different from `T`, but `T.Differential.Differential` must be equal to `T.Differential`.
-
-In addition, a differentiable type must define the `zero` value of its derivative, and how to add and multiply derivative values.
-
-### Builtin Differentiable Types
-The following built-in types are differentiable: 
-- Scalars: `float`, `double` and `half`.
-- Vector/Matrix: `vector` and `matrix` of `float`, `double` and `half` types.
-- Arrays: `T[n]` is differentiable if `T` is differentiable.
-
-### User Defined Differentiable Types
-
-The user can make any `struct` types differentiable by implementing the `IDifferentiable` interface on the type. The requirements from `IDifferentiable` interface can be fulfilled automatically or manually.
-
-#### Automatic Fulfillment of `IDifferentiable` Requirements
-Assume the user has defined the following type:
-
-```csharp
-struct MyRay
-{
-    float3 origin;
-    float3 dir;
-    int nonDifferentiablePayload;
-}
-```
-
-The type can be made differentiable by adding `IDifferentiable` conformance:
-```csharp
-struct MyRay : IDifferentiable
-{
-    float3 origin;
-    float3 dir;
-    int nonDifferentiablePayload;
-}
-```
-
-Note that this code does not provide any explicit implementation of the `IDifferentiable` requirements. In this case the compiler will automatically synthesize all the requirements. This should provide the desired behavior most of the time. The procedure for synthesizing the interface implementation is as follows:
-1. A new type is generated that stores the `Differential` of all differentiable fields. This new type itself will conform to the `IDifferentiable` interface, and it will be used to satisfy the `Differential` associated type requirement.
-2. Each differential field will be associated to its corresponding field in the newly synthesized `Differential` type.
-3. The `zero` value of the differential type is made from the `zero` value of each field in the differential type.
-4. The `dadd` and `dmul` methods simply perform `dadd` and `dmul` operations on each field.
-5. If the synthesized `Differential` type contains exactly the same fields as the original type, and the type of each field is the same as the original field type, then the original type itself will be used as the `Differential` type instead of creating a new type to satisfy the `Differential` associated type requirement. This means that every such synthesized `Differential` type uses itself to meet its own `IDifferentiable` requirements.
-
-#### Manual Fulfillment of `IDifferentiable` Requirements
-
-In rare cases where more control is desired, the user can manually provide the implementation. To do so, we will first define the `Differential` type for `MyRay`, and use it to fulfill the `Differential` requirement in `MyRay`:
-
-```csharp
-struct MyRayDifferential
-{
-    float3 d_origin;
-    float3 d_dir;
-}
-
-struct MyRay : IDifferentiable
-{
-    // Specify that `MyRay.Differential` is `MyRayDifferential`.
-    typealias Differential = MyRayDifferential;
-
-    // Specify that the derivative for `origin` will be stored in `MyRayDifferential.d_origin`.
-    [DerivativeMember(MyRayDifferential.d_origin)]
-    float3 origin;
-
-    // Specify that the derivative for `dir` will be stored in `MyRayDifferential.d_dir`.
-    [DerivativeMember(MyRayDifferential.d_dir)]
-    float3 dir;
-
-    // This is a non-differentiable field so we don't put any attributes on it.
-    int nonDifferentiablePayload;
-
-    // Define zero derivative.
-    static MyRayDifferential dzero()
-    {
-        return {float3(0.0), float3(0.0)};
-    }
-
-    // Define the add operation of two derivatives.
-    static MyRayDifferential dadd(MyRayDifferential v1, MyRayDifferential v2)
-    {
-        MyRayDifferential result;
-        result.d_origin = v1.d_origin + v2.d_origin;
-        result.d_dir = v1.d_dir + v2.d_dir;
-        return result;
-    }
-
-    // Define the multiply operation of a primal value and a derivative value.
-    static MyRayDifferential dmul(MyRay p, MyRayDifferential d)
-    {
-        MyRayDifferential result;
-        result.d_origin = p.origin * d.d_origin;
-        result.d_dir = p.dir * d.d_dir;
-        return result;
-    }
-}
-```
-
-Note that for each struct field that is differentiable, we need to use the `[DerivativeMember]` attribute to associate it with the corresponding field in the `Differential` type, so the compiler knows how to access the derivative for the field.
-
-However, there is still a missing piece in the above code: we also need to make `MyRayDifferential` conform to `IDifferentiable` because it is required that the `Differential` of a type must itself be differentiable. Again we can use automatic fulfillment by simply adding `IDifferentiable` conformance to `MyRayDifferential`:
-```csharp
-struct MyRayDifferential : IDifferentiable
-{
-    float3 d_origin;
-    float3 d_dir;
-}
-```
-In this case, since all fields in `MyRayDifferential` are differentiable, and the `Differential` of each field is the same as the original type of each field (i.e. `float3.Differential == float3` as defined in built-in library), the compiler will automatically use the type itself as its own `Differential`, making `MyRayDifferential` suitable for use as `Differential` of `MyRay`.
-
-We can also choose to manually implement `IDifferentiable` interface for `MyRayDifferential` as in the following code:
-
-```csharp
-struct MyRayDifferential : IDifferentiable
-{
-    typealias Differential = MyRayDifferential;
-
-    [DerivativeMember(MyRayDifferential.d_origin)]
-    float3 d_origin;
-
-    [DerivativeMember(MyRayDifferential.d_dir)]
-    float3 d_dir;
-
-    static MyRayDifferential dzero()
-    {
-        return {float3(0.0), float3(0.0)};
-    }
-
-    static MyRayDifferential dadd(MyRayDifferential v1, MyRayDifferential v2)
-    {
-        MyRayDifferential result;
-        result.d_origin = v1.d_origin + v2.d_origin;
-        result.d_dir = v1.d_dir + v2.d_dir;
-        return result;
-    }
-
-    static MyRayDifferential dmul(MyRayDifferential p, MyRayDifferential d)
-    {
-        MyRayDifferential result;
-        result.d_origin = p.d_origin * d.d_origin;
-        result.d_dir = p.d_dir * d.d_dir;
-        return result;
-    }
-}
-```
-In this specific case, the automatically generated `IDifferentiable` implementation will be exactly the same as the manually written code listed above.
-
-
-## Forward Derivative Propagation Function
-
-Functions in Slang can be marked as forward-differentiable or backward-differentiable. The `fwd_diff` operator can be used on a forward-differentiable function to obtain the forward derivative propagation function. Likewise, the `bwd_diff` operator can be used on a backward-differentiable function to obtain the backward derivative propagation function. This and the next sections cover the semantics of forward and backward propagation functions, and different ways to make a function forward and backward differentiable. 
-
-A forward derivative propagation function computes the derivative of the result value with regard to a specific set of input parameters. 
-Given an original function, the signature of its forward propagation function is determined using the following rules:
-- If the return type `R` is differentiable, the forward propagation function will return `DifferentialPair<R>` that consists of both the computed original result value and the (partial) derivative of the result value. Otherwise, the return type is kept unmodified as `R`.
-- If a parameter has type `T` that is differentiable, it will be translated into a `DifferentialPair<T>` parameter in the derivative function, where the differential component of the `DifferentialPair` holds the initial derivatives of each parameter with regard to their upstream parameters.
-- All parameter directions are unchanged. For example, an `out` parameter in the original function will remain an `out` parameter in the derivative function.
-
-For example, given original function:
-```csharp
-R original(T0 p0, inout T1 p1, T2 p2);
-```
-Where `R`, `T0`, and `T1` are differentiable and `T2` is non-differentiable, the forward derivative function will have the following signature:
-```csharp
-DifferentialPair<R> derivative(DifferentialPair<T0> p0, inout DifferentialPair<T1> p1, T2 p2);
-```
-
-This forward propagation function takes the initial primal value of `p0` in `p0.p`, and the partial derivative of `p0` with regard to some upstream parameter in `p0.d`. It takes the initial primal and derivative values of `p1` and updates `p1` to hold the newly computed value and propagated derivative. Since `p2` is not differentiable, it remains unchanged.
-
-`DifferentialPair<T>` is a built-in type that carries both the original and derivative value of a term. It is defined as follows:
-```csharp
-struct DifferentialPair<T : IDifferentiable> : IDifferentiable
-{
-    typealias Differential = DifferentialPair<T.Differential>;
-    property T p {get;}
-    property T.Differential d {get;}
-    static Differential dzero();
-    static Differential dadd(Differential a, Differential b);
-    static Differential dmul(This a, Differential b);
-}
-```
-
-### Automatic Implementation of Forward Derivative Functions
-
-A function can be made forward-differentiable with a `[ForwardDifferentiable]` attribute. This attribute will cause the compiler to automatically implement the forward propagation function. The syntax for using `[ForwardDifferentiable]` is:
-
-```csharp
-[ForwardDifferentiable]
-R original(T0 p0, inout T1 p1, T2 p2);
-```
-
-Once the function is made forward-differentiable, the forward propagation function can then be called with the `fwd_diff` operator:
-```csharp
-DifferentialPair<R> result = fwd_diff(original)(...);
-```
-
-### User Defined Forward Derivative Functions
-As an alternative to compiler-implemented forward derivatives, the user can choose to manually provide a derivative implementation to make an existing function forward-differentiable. The `[ForwardDerivative(derivative_func)]` attribute is used to associate a function with its forward derivative propagation implementation. The syntax for using `[ForwardDerivative]` attribute is:
-```csharp
-DifferentialPair<R> derivative(DifferentialPair<T0> p0, inout DifferentialPair<T1> p1, T2 p2)
-{
-    ....
-}
-
-[ForwardDerivative(derivative)]
-R original(T0 p0, inout T1 p1, T2 p2);
-```
-If `derivative` is defined in a different scope from `original`, such as in a different namespace or `struct` type, a fully qualified name is required. For example:
-```csharp
-struct MyType
-{
-    // Implementing derivative function in a different name scope.
-    static DifferentialPair<R> derivative(DifferentialPair<T0> p0, inout DifferentialPair<T1> p1, T2 p2)
-    {
-        ....
-    }
-}
-
-// Use fully qualified name in the attribute.
-[ForwardDerivative(MyType.derivative)]
-R original(T0 p0, inout T1 p1, T2 p2);
-```
-
-Sometimes the derivative function needs to be defined in a different module from the original function, or the derivative function cannot be made visible from the original function. In this case, we can use the `[ForwardDerivativeOf(originalFunc)]` attribute to inform the compiler that `originalFunc` should be treated as a forward-differentiable function, and the current function is the derivative implementation of `originalFunc`. The following code will have the same effect of associating `derivative` as the forward-derivative implementation of `original`:
-
-```csharp
-R original(T0 p0, inout T1 p1, T2 p2);
-
-[ForwardDerivativeOf(original)]
-DifferentialPair<R> derivative(DifferentialPair<T0> p0, inout DifferentialPair<T1> p1, T2 p2)
-{
-    ....
-}
-```
-
-## Backward Derivative Propagation Function
-
-A backward derivative propagation function propagates the derivative of the function output to all the input parameters simultaneously.
-
-Given an original function `f`, the general rule for determining the signature of its backward propagation function is that a differentiable output `o` becomes an input parameter holding the partial derivative of a downstream output with regard to the differentiable output (i.e. $$\partial y/\partial o$$); an input differentiable parameter `i` in the original function will become an output in the backward propagation function, holding the propagated partial derivative $$\partial y/\partial i$$; and any non-differentiable outputs are dropped from the backward propagation function. This means that the backward propagation function never returns any values computed in the original function.
-
-More specifically, the signature of its backward propagation function is determined using the following rules:
-- A backward propagation function always returns `void`.
-- A differentiable `in` parameter of type `T` will become an `inout DifferentialPair<T>` parameter, where the original value part of the differential pair contains the original value of the parameter to pass into the back-prop function. The original value will not be overwritten by the backward propagation function. The propagated derivative will be written to the derivative part of the differential pair after the backward propagation function returns. The initial derivative value of the pair is ignored as input.
-- A differentiable `out` parameter of type `T` will become an `in T.Differential` parameter, carrying the partial derivative of some downstream term with regard to the return value.
-- A differentiable `inout` parameter of type `T` will become an `inout DifferentialPair<T>` parameter, where the original value of the argument, along with the downstream partial derivative with regard to the argument is passed as input to the backward propagation function as the original and derivative part of the pair. The propagated derivative with regard to this input parameter will be written back and replace the derivative part of the pair. The primal value part of the parameter will *not* be updated.
-- A differentiable return value of type `R` will become an additional `in R.Differential` parameter at the end of the backward propagation function parameter list, carrying the result derivative of a downstream term with regard to the return value of the original function.
-- A non-differentiable return value of type `NDR` will be dropped.
-- A non-differentiable `in` parameter of type `ND` will remain unchanged in the backward propagation function.
-- A non-differentiable `out` parameter of type `ND` will be removed from the parameter list of the backward propagation function.
-- A non-differentiable `inout` parameter of type `ND` will become an `in ND` parameter.
-
-For example consider the following original function:
-```csharp
-struct T : IDifferentiable {...}
-struct R : IDifferentiable {...}
-struct ND {} // Non differentiable
-
-[Differentiable]
-R original(T p0, out T p1, inout T p2, ND p3, out ND p4, inout ND p5);
-```
-The signature of its backward propagation function is:
-```csharp
-void back_prop(
-    inout DifferentialPair<T> p0,
-    T.Differential p1,
-    inout DifferentialPair<T> p2,
-    ND p3,
-    ND p5,
-    R.Differential dResult);
-```
-Note that although `p2` is still `inout` in the backward propagation function, the backward propagation function will only write propagated derivative to `p2.d` and will not modify `p2.p`.
-
-### Automatically Implemented Backward Propagation Functions
-
-A function can be made backward-differentiable with a `[Differentiable]` or `[BackwardDifferentiable]` attribute. This attribute will cause the compiler to automatically implement the backward propagation function. The syntax for using `[Differentiable]` is:
-
-```csharp
-[Differentiable]
-R original(T0 p0, inout T1 p1, T2 p2);
-```
-
-Once the function is made backward-differentiable, the backward propagation function can then be called with the `bwd_diff` operator:
-```csharp
-bwd_diff(original)(...);
-```
-
-### User Defined Backward Propagation Functions
-Similar to user-defined forward derivative functions, the `[BackwardDerivative]` and `[BackwardDerivativeOf]` attributes can be used to supply a function with a user-defined backward propagation function.
-
-The syntax for using `[BackwardDerivative]` attribute is:
-```csharp
-void back_prop(
-    inout DifferentialPair<T> p0,
-    T1.Differential p1,
-    inout DifferentialPair<T> p2,
-    ND p3,
-    ND p5,
-    R.Differential dResult)
-{
-    ...
-}
-
-[BackwardDerivative(back_prop)]
-R original(T0 p0, inout T1 p1, T2 p2);
-```
-
-Similarly, the `[BackwardDerivativeOf]` attribute can be used on the back-prop function in case it is not convenient to modify the definition of the original function, or the back-prop function can't be made visible from the original function:
-
-```csharp
-R original(T0 p0, inout T1 p1, T2 p2);
-
-[BackwardDerivativeOf(original)]
-void back_prop(
-    inout DifferentialPair<T> p0,
-    T1.Differential p1,
-    inout DifferentialPair<T> p2,
-    ND p3,
-    ND p5,
-    R.Differential dResult)
-{
-    ...
-}
-```
-
-## Builtin Differentiable Functions
-
-The following built-in functions are backward differentiable and both their forward-derivative and backward-propagation functions are already defined in the built-in library:
-
-- Arithmetic functions: `abs`, `max`, `min`, `sqrt`, `rcp`, `rsqrt`, `fma`, `mad`, `fmod`, `frac`, `radians`, `degrees`
-- Interpolation and clamping functions: `lerp`, `smoothstep`, `clamp`, `saturate`
-- Trigonometric functions: `sin`, `cos`, `sincos`, `tan`, `asin`, `acos`, `atan`, `atan2`
-- Hyperbolic functions: `sinh`, `cosh`, `tanh`
-- Exponential and logarithmic functions: `exp`, `exp2`, `pow`, `log`, `log2`, `log10`
-- Vector functions: `dot`, `cross`, `length`, `distance`, `normalize`, `reflect`, `refract`
-- Matrix transforms: `mul(matrix, vector)`, `mul(vector, matrix)`, `mul(matrix, matrix)`
-- Matrix operations: `transpose`, `determinant`
-- Legacy blending and lighting intrinsics: `dst`, `lit`
-
-## Primal Substitute Functions
-
-Sometimes it is desirable to replace a function with another when generating forward or backward derivative propagation code. For example, the following code shows a function that computes the integral of some term by sampling and we want to use a different sampling stragegy when computing the derivatives.
-```csharp
-float myTerm(float x)
-{
-     return someComplexComputation(x);
-}
-
-float getSample(float a, float b) { ... }
-
-[Differentiable]
-float computeIntegralOverMyTerm(float x, float a, float b)
-{
-     float sum = 0.0;
-     for (int i = 0; i < SAMPLE_COUNT; i++)
-     {
-          let s = no_diff getSample(a, b);
-          let y = myTerm(s);
-          sum += y * ((b-a)/SAMPLE_COUNT);
-     }
-     return sum;
-}
-```
-
-In this code, the `getSample` function returns a random sample in the range of `[a,b]`. Assume we have another sampling function `getSampleForDerivativeComputation(a,b)` that we wish to use instead in derivative computation, we can do so by marking it as a primal-substitute of `getSample`, as in the following code:
-```csharp
-[PrimalSubstituteOf(getSample)]
-float getSampleForDerivativeComputation(float a, float b)
-{
-     ...
-}
-```
-
-Here, the `[PrimalSubstituteOf(getSample)]` attributes marks the `getSampleForDerivativeComputation` function as the substitute for `getSample` in derivative propagation functions. When a function has a primal substitute, the compiler will treat all calls to that function as if it is a call to the substitute function when generating derivative code. Note that this only applies to compiler generated derivative function and does not affect user provided derivative functions. If a user provided derivative function calls `getSample`, it will not be replaced by `getSampleForDerivativeComputation` by the compiler.
-
-Similar to `[ForwardDerivative]` and `[ForwardDerivativeOf]` attributes, The `[PrimalSubsitute(substFunc)]` attribute works the other way around: it specifies the primal substitute function of the function being marked.
-
-Primal substitute can be used as another way to make a function differentiable. A function is considered differentiable if it has a primal substitute that is differentiable. The following code illustrates this mechanism.
-```csharp
-float myFunc(float x) {...}
-
-[PrimalSubstituteOf(myFunc)]
-[Differentiable]
-float myFuncSubst(float x) {...}
-
-// myFunc is now considered backward differentiable.
-```
-
-The following example shows in more detail on how primal substitute affects derivative computation.
-```csharp
-float myFunc(float x) { return x*x; }
-
-[PrimalSubstituteOf(myFunc)]
-[ForwardDifferentiable]
-float myFuncSubst(float x) { return x*x*x; }
-
-[ForwardDifferentiable]
-float caller(float x) { return myFunc(x); }
-
-let a = caller(4.0); // a == 16.0 (calling myFunc)
-let b = fwd_diff(caller)(diffPair(4.0, 1.0)).p; // b == 64.0 (calling myFuncSubst)
-let c = fwd_diff(caller)(diffPair(4.0, 1.0)).d; // c == 48.0 (calling derivative of myFuncSubst)
-```
-
-In case that a function has both custom defined derivatives and a differentiable primal substitute, the primal substitute overrides the custom defined derivative on the original function. All calls to the original function will be translated into calls to the primal substitute first, and differentiation step follows after. This means that the derivatives of the primal substitute function will be used instead of the derivatives defined on the original function.
-
-## Working with Mixed Differentiable and Non-Differentiable Code
-
-Introducing differentiability to an existing system often involves dealing with code that mixes differentiable and non-differentiable logic.
-Slang provides type checking and code analysis features to allow users to clarify the intention and guard against unexpected behaviors involving when to propagate derivatives through operations.
-
-### Excluding Parameters from Differentiation
-
-Sometimes we do not wish a parameter to be considered differentiable despite it has a differentiable type. We can use the `no_diff` modifier on the parameter to inform the compiler to treat the parameter as non-differentiable and skip generating differentiation code for the parameter. The syntax is:
-
-```csharp
-// Only differentiate this function with regard to `x`.
-float myFunc(no_diff float a, float x);
-```
-
-The forward derivative and backward propagation functions of `myFunc` should have the following signature:
-```csharp
-DifferentialPair<float> fwd_derivative(float a, DifferentialPair<float> x);
-void back_prop(float a, inout DifferentialPair<float> x, float dResult);
-```
-
-In addition, the `no_diff` modifier can also be used on the return type to indicate the return value should be considered non-differentiable. For example, the function
-```csharp
-no_diff float myFunc(no_diff float a, float x, out float y);
-```
-Will have the following forward derivative and backward propagation function signatures:
-
-```csharp
-float fwd_derivative(float a, DifferentialPair<float> x);
-void back_prop(float a, inout DifferentialPair<float> x, float d_y);
-```
-
-By default, the implicit `this` parameter will be treated as differentiable if the enclosing type of the member method is differentiable. If you wish to exclude `this` parameter from differentiation, use `[NoDiffThis]` attribute on the method:
-```csharp
-struct MyDifferentiableType : IDifferentiable
-{
-    [NoDiffThis]   // Make `this` parameter `no_diff`.
-    float compute(float x) { ... }
-}
-```
-
-### Excluding Struct Members from Differentiation
-
-When using automatic `IDifferentiable` conformance synthesis for a `struct` type, Slang will by-default treat all struct members that have a differentiable type as differentiable, and thus include a corresponding field in the generated `Differential` type for the struct.
-For example, given the following definition
-```csharp
-struct MyType : IDifferentiable
-{
-    float member1;
-    float2 member2;
-}
-```
-Slang will generate:
-```csharp
-struct MyType.Differential : IDifferentiable
-{
-    float member1;  // derivative for MyType.member1
-    float2 member2; // derivative for MyType.member2
-}
-```
-If the user does not want a certain member to be treated as differentiable despite it has a differentiable type, a `no_diff` modifier can be used on the struct member to exclude it from differentiation.
-For example, the following code excludes `member1` from differentiation:
-```csharp
-struct MyType : IDifferentiable
-{
-    no_diff float member1;  // excluded from differentiation
-    float2 member2;
-}
-```
-The generated `Differential` in this case will be:
-```csharp
-struct MyType.Differential : IDifferentiable
-{
-    float2 member2;
-}
-```
-
-### Assigning Differentiable Values into a Non-Differentiable Location
-
-When a value with derivatives is being assigned to a location that is not differentiable, such as a struct member that is marked as `no_diff`, the derivative info is discarded and any derivative propagation is stopped at the assignment site.
-This may lead to unexpected results. For example:
-```csharp
-struct MyType : IDifferentiable
-{
-    no_diff float member;
-    float someOtherMemther;
-}
-[ForwardDifferentiable]
-float f(float x)
-{
-    MyType t;
-    t.member = x * x; // Error: assigning value with derivative into a non-differentiable location.
-    return t.member;
-}
-...
-let result = fwd_diff(f)(diffPair(3.0, 1.0)).d; // result == 0.0
-```
-In this case, we are assigning the value `x*x`, which carries a derivative, into a non-differentiable location `MyType.member`, thus throwing away any derivative info. When `f` returns `t.member`, there will be no derivative associated with it, so the function will not propagate the derivative through. This code is most likely not intending to discard the derivative through the assignment. To help avoid this kind of unintentional behavior, Slang will treat any assignments of a value with derivative info into a non-differentiable location as a compile-time error. To eliminate this error, the user should either make `t.member` differentiable, or to force the assignment by clarifying the intention to discard any derivatives using the built-in `detach` method.
-The following code will compile, and the derivatives will be discarded:
-```csharp
-[ForwardDifferentiable]
-float f(float x)
-{
-    MyType t;
-    // OK: the code has expressed clearly the intention to discard the derivative and perform the assignment.
-    t.member = detach(x * x);
-    return t.member;
-}
-```
-
-### Calling Non-Differentiable Functions from a Differentiable Function
-Calling non-differentiable function from a differentiable function is allowed. However, derivatives will not be propagated through the call. The user is required to clarify the intention by prefixing the call with the `no_diff` keyword. An un-clarified call to non-differentiable function will result in a compile-time error.
-
-For example, consider the following code:
-```csharp
-float g(float x)
-{
-    return 2*x;
-}
-
-[ForwardDifferentiable]
-float f(float x)
-{
-    // Error: implicit call to non-differentiable function g.
-    return g(x) + x * x;
-}
-```
-The derivative will not propagate through the call to `g` in `f`. As a result, `fwd_diff(f)(diffPair(1.0, 1.0))` will return
-`{3.0, 2.0}` instead of `{3.0, 4.0}` as the derivative from `2*x` is lost through the non-differentiable call. To prevent unintended error, it is treated as a compile-time error to call `g` from `f`. If such a non-differentiable call is intended, a `no_diff` prefix is required in the call:
-```csharp
-[ForwardDifferentiable]
-float f(float x)
-{
-    // OK. The intention to call a non-differentiable function is clarified.
-    return no_diff g(x) + x * x;
-}
-```
-
-However, the `no_diff` keyword is not required in a call if a non-differentiable function does not take any differentiable parameters, or if the result of the differentiable function is not dependent on the derivative being propagated through the call.
-
-### Treat Non-Differentiable Functions as Differentiable
-Slang allows functions to be marked with a `[TreatAsDifferentiable]` attribute for them to be considered as differentiable functions by the type-system. When a function is marked as `[TreatAsDifferentiable]`, the compiler will not generate derivative propagation code from the original function body or perform any additional checking on the function definition. Instead, it will generate trivial forward and backward propagation functions that returns 0.
-
-This feature can be useful if the user marked an `interface` method as forward or backward differentiable, but only wish to provide non-trivial derivative propagation functions for a subset of types that implement the interface. For other types that does not actually need differentiation, the user can simply put `[TreatAsDifferentiable]` on the method implementations for them to satisfy the interface requirement.
-
-See the following code for an example of `[TreatAsDifferentiable]`:
-```csharp
-interface IFoo
-{
-    [Differentiable]
-    float f(float v);
-}
-
-struct B : IFoo
-{
-    [TreatAsDifferentiable]
-    float f(float v)
-    {
-        return v * v;
-    }
-}
-
-[Differentiable]
-float use(IFoo o, float x)
-{
-    return o.f(x);
-}
-
-// Test:
-B obj;
-float result = fwd_diff(use)(obj, diffPair(2.0, 1.0)).d;
-// result == 0.0, since `[TreatAsDifferentiable]` causes a trivial derivative implementation
-// being generated regardless of the original code.
-```
-
-## Higher Order Differentiation
-
-Slang supports generating higher order forward and backward derivative propagation functions. It is allowed to use `fwd_diff` and `bwd_diff` operators inside a forward or backward differentiable function, or to nest `fwd_diff` and `bwd_diff` operators. For example, `fwd_diff(fwd_diff(sin))` will have the following signature:
-
-```csharp
-DifferentialPair<DifferentialPair<float>> sin_diff2(DifferentialPair<DifferentialPair<float>> x);
-```
-
-The input parameter `x` contains four fields: `x.p.p`, `x.p.d,`, `x.d.p`, `x.d.d`, where `x.p.p` specifies the original input value, both `x.p.d` and `x.d.p` store the first order derivative if `x`, and `x.d.d` stores the second order derivative of `x`. Calling `fwd_diff(fwd_diff(sin))` with `diffPair(diffPair(pi/2, 1.0), DiffPair(1.0, 0.0))` will result `{ { 1.0, 0.0 }, { 0.0, -1.0 } }`.
-
-User defined higher-order derivative functions can be specified by using `[ForwardDerivative]` or `[BackwardDerivative]` attribute on the derivative function, or by using `[ForwardDerivativeOf]` or `[BackwardDerivativeOf]` attribute on the higher-order derivative function.
-
-## Interactions with Generics and Interfaces
-
-Automatic differentiation for generic functions is supported. The forward-derivative and backward propagation functions of a generic function is also a generic function with the same set of generic parameters and constraints. Using `[ForwardDerivative]`, `[ForwardDerivativeOf]`, `[BackwardDerivative]` or `[BackwardDerivativeOf]` attributes to associate a derivative function with different set of generic parameters or constraints is a compile-time error.
-
-An interface method requirement can be marked as `[ForwardDifferentiable]` or `[Differentiable]`, so they may be called in a forward or backward differentiable function and have the derivatives propagate through the call. This works regardless of whether the call can be specialized or has to go through dynamic dispatch. However, calls to interface methods are only differentiable once. Higher order differentiation through interface method calls are not supported.
-
-## Restrictions of Automatic Differentiation
-
-The compiler can generate forward derivative and backward propagation implementations for most uses of array and struct types, including arbitrary read and write access at dynamic array indices, and supports uses of all types of control flows, mutable parameters, generics and interfaces. This covers the set of operations that is sufficient for a lot of functions. However, the user needs to be aware of the following restrictions when using automatic differentiation:
-
-- All operations to global resources, global variables and shader parameters, including texture reads or atomic writes, are treating as a non-differentiable operation.
-- If a differentiable function contains calls that cause side-effects such as updates to global memory, there will not be a guarantee on how many times the side-effect will occur during the resulting derivative function or back-propagation function.
-- Loops: Loops must use the attribute `[MaxIters(<count>)]` to specify a maximum number of iterations. This will be used by compiler to allocate space to store intermediate data. If the actual number of iterations exceeds the provided maximum, the behavior is undefined. You can always mark a loop with the `[ForceUnroll]` attribute to instruct the Slang compiler to unroll the loop before generating derivative propagation functions. Unrolled loops will be treated the same way as ordinary code and are not subject to any additional restrictions.
-
-The above restrictions do not apply if a user-defined derivative or backward propagation function is provided.
diff --git a/docs/user-guide/08-compiling.md b/docs/user-guide/08-compiling.md
new file mode 100644
index 000000000..861da962e
--- /dev/null
+++ b/docs/user-guide/08-compiling.md
@@ -0,0 +1,498 @@
+---
+layout: user-guide
+permalink: /user-guide/compiling
+---
+
+Compiling Code with Slang
+=========================
+
+This chapter presents the ways that the Slang system supports compiling and composing shader code.
+We will start with a discussion of the mental model that Slang uses for compilation.
+Next we will cover the command-line Slang compiler, `slangc`, and how to use it to perform offline compilation.
+Finally we will discuss the Slang compilation API, which can be used to integrate Slang compilation into an application at runtime, or to build custom tools that implement application-specific compilation policy.
+
+Concepts
+--------
+
+For simple scenarios it may be enough to think of a shader compiler as a box where source code goes in and compiled kernels come out.
+Most real-time graphics applications end up needing more control over shader compilation, and/or more information about the results of compilation.
+In order to make use of the services provided by the Slang compilation system, it is useful to start with a clear model of the concepts that are involved in compilation.
+
+### Source Units
+
+At the finest granularity, code is fed to the compiler in _source units_ which are most often stored as files on disk or strings of text in memory.
+The compilation model largely does not care whether source units have been authored by human programmers or automatically assembled by other tools.
+
+If multiple source units are specified as part of the same compile, they will be preprocessed and parsed independently.
+However, a source unit might contain `#include` directives, so that the preprocessed text of that source unit includes the content of other files.
+Note that the `#include`d files do not become additional source units; they are just part of the text of a source unit that was fed to the compiler.
+
+### Translation Units and Modules
+
+Source units (such as files) are grouped into _translation units_, and each translation unit will produce a single _module_ when compiled.
+
+While the source units are all preprocessed and parsed independently, semantic checking is applied to a translation unit as a whole.
+One source file in a translation unit may freely refer to declarations in another source file of the same translation unit without any need for forward declarations. For example:
+
+```hlsl
+// A.slang
+
+float getFactor() { return 10.0; }
+```
+
+```hlsl
+// B.slang
+
+float scaleValue(float value)
+{
+    return value * getFactor();
+}
+```
+
+In this example, the `scaleValue()` function in `B.slang` can freely refer to the `getFactor()` function in `A.slang` because they are part of the same translation unit.
+
+It is allowed, and indeed common, for a translation unit to contain only a single source unit.
+For example, when adapting an existing codebase with many `.hlsl` files, it is appropriate to compile each `.hlsl` file as its own translation unit.
+A modernized codebase might decide to compile multiple `.slang` files in a single directory as a single translation unit.
+
+The result of compiling a translation unit is a module in Slang's internal intermediate representation (IR).
+
+### Entry Points
+
+A translation unit / module may contain zero or more entry points.
+Slang supports two models for identifying entry points when compiling.
+
+#### Entry Point Attributes
+
+By default, the compiler will scan a translation unit for function declarations marked with the `[shader(...)]` attribute; each such function will be identified as an entry point in the module.
+Developers are encouraged to use this model because it directly documents intention and makes source code less dependent on external compiler configuration options.
+
+#### Explicit Entry Point Options
+
+For compatibility with existing code, the Slang compiler also supports explicit specification of entry point functions using configuration options external to shader source code.
+When these options are used the compiler will *ignore* all `[shader(...)]` attributes and only use the explicitly-specified entry points instead.
+
+### Shader Parameters
+
+A translation unit / module may contain zero or more global shader parameters.
+Similarly, each entry point may define zero or more entry-point `uniform` shader parameters.
+
+The shader parameters of a module or entry point are significant because they describe the interface between host application code and GPU code.
+It is important that both the application and generated GPU kernel code agree on how parameters are laid out in memory and/or how they are assigned to particular API-defined registers, locations, or other "slots."
+
+### Targets
+
+Within the Slang system a _target_ represents a particular platform and set of capabilities that output code can be generated for.
+A target includes information such as:
+
+* The _format_ that code should be generated in: SPIR-V, DXIL, etc.
+
+* A _profile_ that specifies a general feature/capability level for the target: D3D Shader Model 5.1, GLSL version 4.60, etc.
+
+* Optional _capabilities_ that should be assumed available on the target: for example, specific Vulkan GLSL extensions
+
+* Options that impact code generation: floating-point strictness, level of debug information to generate, etc.
+
+Slang supports compiling for multiple targets in the same compilation session.
+When using multiple targets at a time, it is important to understand the distinction between the _front-end_ of the compiler, and the _back-end_:
+
+* The compiler front-end comprises preprocessing, parsing, and semantic checking. The front-end runs once for each translation unit and its results are shared across all targets.
+
+* The compiler back-end generates output code, and thus runs once per target.
+
+> #### Note ####
+> Because front-end actions, including preprocessing, only run once, across all targets, the Slang compiler does not automatically provide any target-specific preprocessor `#define`s that can be used for preprocessor conditionals.
+> Applications that need target-specific `#define`s should always compile for one target at a time, and set up their per-target preprocessor state manually.
+
+### Layout
+
+While the front-end of the compiler determines what the shader parameters of a module or entry point are, the _layout_ for those parameters is dependent on a particular compilation target.
+A `Texture2D` might consume a `t` register for Direct3D, a `binding` for Vulkan, or just plain bytes for CUDA.
+
+The details of layout in Slang will come in a later chapter.
+For the purposes of the compilation model it is important to note that the layout computed for shader parameters depends on:
+
+* What modules and entry points are being used together; these define which parameters are relevant.
+
+* Some well-defined ordering of those parameters; this defines which parameters should be laid out before which others.
+
+* The rules and constraints that the target imposes on layout.
+
+An important design choice in Slang is to give the user of the compiler control over these choices.
+
+### Composition
+
+The user of the Slang compiler communicates the modules and entry points that will be used together, as well as their relative order, using a system for _composition_.
+
+A _component type_ is a unit of shader code composition; both modules and entry points are examples of component types.
+A _composite_ component type is formed from a list of other component types (for example, one module and two entry points) and can be used to define a unit of shader code that is meant to be used together.
+
+Once a programmer has formed a composite of all the code they intend to use together, they can query the layout of the shader parameters in that composite, or request kernel code generation for its entry points.
+
+### Kernels
+
+A _kernel_ is generated code for an entry point.
+The same entry point can be used to generate many different kernels.
+First, an entry point can be compiled for different targets, resulting in different kernels in the appropriate format for each target.
+Second, different compositions of shader code can result in different layouts, which leads to different kernels being required.
+
+Command-Line Compilation with `slangc`
+--------------------------------------
+
+The `slangc` tool, included in binary distributions of Slang, is a command-line compiler that can handle most simple compilation tasks.
+`slangc` is intended to be usable as a replacement for tools like `fxc` and `dxc`, and covers most of the same use cases.
+
+### Example
+
+Here we will repeat the example used in the [Getting Started](01-get-started.md) chapter.
+Given the following Slang code:
+
+```hlsl
+// hello-world.slang
+StructuredBuffer<float> buffer0;
+StructuredBuffer<float> buffer1;
+RWStructuredBuffer<float> result;
+
+[shader("compute")]
+[numthreads(1,1,1)]
+void computeMain(uint3 threadId : SV_DispatchThreadID)
+{
+    uint index = threadId.x;
+    result[index] = buffer0[index] + buffer1[index];
+}
+```
+
+we can compile the `computeMain()` entry point to SPIR-V using the following command line:
+
+```bat
+slangc hello-world.slang -entry computeMain -target spirv -o hello-world.spv
+```
+
+### Source Files and Translation Units
+
+The `hello-world.slang` argument here is specifying an input file.
+Each input file specified on the command line will be a distinct source unit during compilation.
+Slang supports multiple file-name extensions for input files, but the most common ones will be `.hlsl` for existing HLSL code, and `.slang` for files written specifically for Slang.
+
+If multiple source files are passed to `slangc`, they will be grouped into translation units using the following rules:
+
+* If there are any `.slang` files, then all of them will be grouped into a single translation unit
+
+* Each `.hlsl` file will be grouped into a distinct translation unit of its own
+
+### Entry Points
+
+When using `slangc`, you will typically want to identify which entry point(s) you intend to compile.
+The `-entry computeMain` option selects an entry point to be compiled to output code in this invocation of `slangc`.
+
+Because the `computeMain()` entry point in this example has a `[shader(...)]` attribute, the compiler is able to deduce that it should be compiled for the `compute` stage.
+In code that does not use `[shader(...)]` attributes, a `-entry` option should be followed by a `-stage` option to specify the stage of the entry point:
+
+```bat
+slangc hello-world.slang -entry computeMain -stage compute -o hello-world.spv
+```
+
+### Targets
+
+Our example uses the option `-target spirv` to introduce a compilation target; in this case, code will be generated as SPIR-V.
+The argument of a `-target` option specifies the format to use for the target; common values are `dxbc`, `dxil`, and `spirv`.
+
+Additional options for a target can be specified after the `-target` option.
+For example, a `-profile` option can be used to specify a profile that should be used.
+Slang provides two main kinds of profiles for use with `slangc`:
+
+* Direct3D "Shader Model" profiles have names like `sm_5_1` and `sm_6_3`
+
+* GLSL versions can be used as profiles with names like `glsl_430` and `glsl_460`
+
+### Kernels
+
+A `-o` option indicates that kernel code should be written to a file on disk.
+In our example, the SPIR-V kernel code for the `computeMain()` entry point will be written to the file `hello-world.spv`.
+
+### Working with Multiples
+
+It is possible to use `slangc` with multiple input files, entry points, or targets.
+In these cases, the ordering of arguments on the command line becomes significant.
+
+When an option modifies or relates to another command-line argument, it implicitly applies to the most recent relevant argument.
+For example:
+
+* If there are multiple input files, then an `-entry` option applies to the preceding input file
+
+* If there are multiple entry points, then a `-stage` option applies to the preceding `-entry` option
+
+* If there are multiple targets, then a `-profile` option applies to the preceding `-target` option
+
+Kernel `-o` options are the most complicated case, because they depend on both a target and entry point.
+A `-o` option applies to the preceding entry point, and the compiler will try to apply it to a matching target based on its file extension.
+For example, a `.spv` output file will be matched to a `-target spirv`.
+
+The compiler makes a best effort to support complicated cases with multiple files, entry points, and targets.
+Users with very complicated compilation requirements will probably be better off using multiple `slangc` invocations or migrating to the compilation API.
+
+### Additional Options
+
+The main other options are:
+
+* `-D<name>` or `-D<name>=<value>` can be used to introduce preprocessor macros.
+
+* `-I<path>` or `-I <path>` can be used to introduce a _search path_ to be used when resolving `#include` directives and `import` declarations.
+
+* `-g` can be used to enable inclusion of debug information in output files (where possible and implemented)
+
+* `-O<level>` can be used to control optimization levels when the Slang compiler invokes a downstream code generator
+
+### Convenience Features
+
+The `slangc` compiler provides a few conveniences for command-line compilation:
+
+* Most options can appear out of order when they are unambiguous. For example, if there is only a single translation unit a `-entry` option can appear before or after any file.
+
+* A `-target` option can be left out if it can be inferred from the only `-o` option present. For example, `-o hello-world.spv` already implies `-target spirv`.
+
+* If a `-o` option is left out then kernel code will be written to the standard output. This output can be piped to a file, or can be printed to a console. In the latter case, the compiler will automatically disassemble binary formats for printing.
+
+### Limitations
+
+The `slangc` tool is meant to serve the needs of many developers, including those who are currently using `fxc`, `dxc`, or similar tools.
+However, some applications will benefit from deeper integration of the Slang compiler into application-specific code and workflows.
+Notable features that Slang supports which cannot be accessed from `slangc` include:
+
+* Slang can provide _reflection_ information about shader parameters and their layouts for particular targets; this information is not currently output by `slangc`.
+
+* Slang allows applications to control the way that shader modules and entry points are composed (which in turn influences their layout); `slangc` currently implements a single default policy for how to generate a composition of shader code.
+
+Applications that need more control over compilation are encouraged to use the C++ compilation API described in the next section.
+
+Using the Compilation API
+-------------------------
+
+The C++ API provided by Slang is meant to provide more complete control over compilation for applications that need it.
+The additional level of control means that some tasks require more individual steps than they would when using a one-size-fits-all tool like `slangc`.
+
+### "COM-lite" Components
+
+Many parts of the Slang C++ API use interfaces that follow the design of COM (the Component Object Model).
+Some key Slang interfaces are binary-compatible with existing COM interfaces.
+However, the Slang API does not depend on any runtime aspects of the COM system, even on Windows; the Slang system can be seen as a "COM-lite" API.
+
+The `ISlangUnknown` interface is equivalent to (and binary-compatible with) the standard COM `IUnknown`.
+Application code is expected to correctly maintain the reference counts of `ISlangUnknown` objects returned from API calls; the `Slang::ComPtr<T>` "smart pointer" type is provided as an optional convenience for applications that want to use it.
+
+Many Slang API calls return `SlangResult` values; this type is equivalent to (and binary-compatible with) the standard COM `HRESULT` type.
+As a matter of convention, Slang API calls return a zero value (`SLANG_OK`) on success, and a negative value on errors.
+
+### Creating a Global Session
+
+A Slang _global session_ uses the interface `slang::IGlobalSession` and it represents a connection from an application to a particular implementation of the Slang API.
+A global session is created using the function `slang::createGlobalSession()`:
+
+```c++
+Slang::ComPtr<IGlobalSession> globalSession;
+slang::createGlobalSession(globalSession.writeRef());
+```
+
+When a global session is created, the Slang system will load its internal representation of the _standard library_ that the compiler provides to user code.
+The standard library can take a significant amount of time to load, so applications are advised to use a single global session if possible, rather than creating and then disposing of one for each compile.
+
+> #### Note ####
+> Currently, the global session type is *not* thread-safe.
+> Applications that wish to compile on multiple threads will need to ensure that each concurrent thread compiles with a distinct global session.
+
+### Creating a Session
+
+A _session_ uses the interface `slang::ISession`, and represents a scope for compilation with a consistent set of compiler options.
+In particular, all compilation with a single session will share:
+
+* A list of enabled compilation targets (with their options)
+
+* A list of search paths (for `#include` and `import`)
+
+* A list of pre-defined macros
+
+In addition, a session provides a scope for the loading and re-use of modules.
+If two pieces of code compiled in a session both `import`  the same module, then that module will only be loaded and compiled once.
+
+To create a session, use the `IGlobalSession::createSession()` method:
+
+```c++
+SessionDesc sessionDesc;
+/* ... fill in `sessionDesc` ... */
+Slang::ComPtr<ISession> session;
+globalSession->createSession(sessionDesc, session.writeRef());
+```
+
+#### Targets
+
+The `SessionDesc::targets` array can be used to describe the list of targets that the application wants to support in a session.
+Often, this will consist of a single target.
+
+Each target is described with a `TargetDesc` which includes options to control code generation for the target.
+The most important fields of the `TargetDesc` are the `format` and `profile`; most others can be left at their default values.
+
+The `format` field should be set to one of the values from the `SlangCompileTarget` enumeration.
+For example:
+
+```c++
+TargetDesc targetDesc;
+targetDesc.format = SLANG_FORMAT_SPIRV;
+```
+
+The `profile` field must be set with the ID of one of the profiles supported by the Slang compiler.
+The exact numeric value of the different profiles is not currently stable across compiler versions, so applications should look up a chosen profile using `IGlobalSession::findProfile`.
+For example:
+
+```c++
+targetDesc.profile = globalSession->findProfile("glsl_450");
+```
+
+Once the chosen `TargetDesc`s have been initialized, they can be attached to the `SessionDesc`:
+
+```c++
+sessionDesc.targets = &targetDesc;
+sessionDesc.targetCount = 1;
+```
+
+#### Search Paths
+
+The search paths on a session provide the paths where the compiler will look when trying to resolve a `#include` directive or `import` declaration.
+The search paths can be set in the `SessionDesc` as an array of `const char*`:
+
+```c++
+const char* searchPaths[] = { "myapp/shaders/" };
+sessionDesc.searchPaths = searchPaths;
+sessionDesc.searchPathCount = 1;
+```
+
+#### Pre-Defined Macros
+
+The pre-defined macros in a session will be visible at the start of each source unit that is compiled, including source units loaded via `import`.
+Each pre-defined macro is described with a `PreprocessorMacroDesc`, which has `name` and `value` fields:
+
+```c++
+PreprocessorMacroDesc fancyFlag = { "ENABLE_FANCY_FEATURE", "1" };
+sessionDesc.preprocessorMacros = &fancyFlag;
+sessionDesc.preprocessorMacroCount = 1;
+```
+
+### Loading a Module
+
+The simplest way to load code into a session is with `ISession::loadModule()`:
+
+```c++
+Slang::ComPtr<IModule> module = session->loadModule("MyShaders");
+```
+
+Executing `loadModule("MyShaders")` in host C++ code is similar to using `import MyShaders` in Slang code.
+The session will search for a matching module (usually in a file called `MyShaders.slang`) and will load and compile it (if it hasn't been done already).
+
+Note that `loadModule()` does not provide any ways to customize the compiler configuration for that specific module.
+The preprocessor environment, search paths, and targets will always be those specified for the session.
+
+### Capturing Diagnostic Output
+
+Compilers produce various kinds of _diagnostic_ output when compiling code.
+This includes not only error messages when compilation fails, but also warnings and other helpful messages that may be produced even for successful compiles.
+
+Many operations in Slang, such as `ISession::loadModule()`, can optionally produce a _blob_ of diagnostic output.
+For example:
+
+```c++
+Slang::ComPtr<IBlob> diagnostics;
+Slang::ComPtr<IModule> module = session->loadModule("MyShaders", diagnostics.writeRef());
+```
+
+In this example, if any diagnostic messages were produced when loading `MyShaders`, then the `diagnostics` pointer will be set to a blob that contains the textual content of those diagnostics.
+
+The content of a blob can be accessed with `getBufferPointer()`, and the size of the content can be accessed with `getBufferSize()`.
+Diagnostic blobs produced by the Slang compiler are always null-terminated, so that they can be used with C-style string APIs:
+
+```c++
+if(diagnostics)
+{
+    fprintf(stderr, "%s\n", (const char*) diagnostics->getBufferPointer());
+}
+```
+
+> #### Note ####
+> The `slang::IBlob` interface is binary-compatible with the `ID3D10Blob` and `ID3DBlob` interfaces used by some Direct3D compilation APIs.
+
+### Entry Points
+
+When using `loadModule()` applications should ensure that entry points in their shader code are always marked with appropriate `[shader(...)]` attributes.
+For example, if `MyShaders.slang` contained:
+
+```hlsl
+[shader("compute")]
+void myComputeMain(...) { ... }
+```
+
+then the Slang system will automatically detect and validate this entry point as part of a `loadModule("MyShaders")` call.
+
+After a module has been loaded, the application can look up entry points in that module using `IModule::findEntryPointByName()`:
+
+```c++
+Slang::ComPtr<IEntryPoint> computeEntryPoint;
+module->findEntryPointByName("myComputeMain", computeEntryPoint.writeRef());
+```
+
+### Composition
+
+An application might load any number of modules with `loadModule()`, and those modules might contain any number of entry points.
+Before GPU kernel code can be generated it is first necessary to decide which pieces of GPU code will be used together.
+
+Both `slang::IModule` and `slang::IEntryPoint` inherit from `slang::IComponentType`, because both can be used as components when composing a shader program.
+A composition can be created with `ISession::createCompositeComponentType()`:
+
+```c++
+IComponentType* components[] = { module, entryPoint };
+Slang::ComPtr<IComponentType> program;
+session->createCompositeComponentType(components, 2, program.writeRef());
+```
+
+As discussed earlier in this chapter, the composition operation serves two important purposes.
+First, it establishes which code is part of a compiled shader program and which is not.
+Second, it establishes an ordering for the code in a program, which can be used for layout.
+
+### Layout and Reflection
+
+Some applications need to perform reflection on shader parameters and their layout, whether at runtime or as part of an offline compilation tool.
+The Slang API allows layout to be queried on any `IComponentType` using `getLayout()`:
+
+```c++
+slang::ProgramLayout* layout = program->getLayout();
+```
+
+> #### Note ####
+> In the current Slang API, the `ProgramLayout` type is not reference-counted.
+> Currently, the lifetime of a `ProgramLayout` is tied to the `IComponentType` that returned it.
+> An application must ensure that it retains the given `IComponentType` for as long as it uses the `ProgramLayout`.
+
+Note that because both `IModule` and `IEntryPoint` inherit from `IComponentType`, they can also be queried for their layouts individually.
+The layout for a module comprises just its global-scope parameters.
+The layout for an entry point comprises just its entry-point parameters (both `uniform` and varying).
+
+The details of how Slang computes layout, what guarantees it makes, and how to inspect the reflection information will be discussed in a later chapter.
+
+Because the layout computed for shader parameters may depend on the compilation target, the `getLayout()` method actually takes a `targetIndex` parameter that is the zero-based index of the target for which layout information is being queried.
+This parameter defaults to zero as a convenience for the common case where applications use only a single compilation target at runtime.
+
+### Kernel Code
+
+Given a composed `IComponentType`, an application can extract kernel code for one of its entry points using `IComponentType::getEntryPointCode()`:
+
+```c++
+int entryPointIndex = 0; // only one entry point
+int targetIndex = 0; // only one target
+Slang::ComPtr<IBlob> kernelBlob;
+program->getEntryPointCode(
+    entryPointIndex,
+    targetIndex,
+    kernelBlob.writeRef(),
+    diagnostics.writeRef());
+```
+
+Any diagnostic messages related to back-end code generation (for example, if the chosen entry point requires features not available on the chosen target) will be written to `diagnostics`.
+The `kernelBlob` output is a `slang::IBlob` that can be used to access the generated code (whether binary or textual).
+In many cases `kernelBlob->getBufferPointer()` can be passed directly to the appropriate graphics API to load kernel code onto a GPU.
diff --git a/docs/user-guide/09-targets.md b/docs/user-guide/09-targets.md
new file mode 100644
index 000000000..d6dfd4cf6
--- /dev/null
+++ b/docs/user-guide/09-targets.md
@@ -0,0 +1,367 @@
+---
+layout: user-guide
+permalink: /user-guide/targets
+---
+
+Supported Compilation Targets
+============================
+
+This chapter provides a brief overview of the compilation targets supported by Slang, and their different capabilities.
+
+Background and Terminology
+--------------------------
+
+### Code Formats
+
+When Slang compiles for a target platform one of the most important distinctions is the _format_ of code for that platform.
+For a native CPU target, the format is typically the executable machine-code format for the processor family (for example, x86-64).
+In contrast, GPUs are typically programmed through APIs that abstract over multiple GPU processor families and versions.
+GPU APIs usually define an _intermediate language_ that sits between a high-level-language compiler like Slang and GPU-specific compilers that live in drivers for the API.
+
+### Pipelines and Stages
+
+GPU code execution occurs in the context of a _pipeline_.
+A pipeline comprises one or more _stages_ and dataflow connections between them.
+Some stages are _programmable_ and run a user-defined _kernel_ that has been compiled from a language like Slang, while others are _fixed-function_ and can only be configured, rather than programmed, by the user.
+Slang supports three different pipelines.
+
+#### Rasterization
+
+The _rasterization_ pipeline is the original GPU rendering pipeline.
+On current GPUs, the simplest rasterization pipelines have two programmable stages: a `vertex` stage and a `fragment` stage.
+The rasterization pipeline is named after its most important fixed-function stage: the rasterizer, which determines the pixels covered by a geometric primitive, and emits _fragments_ covering those pixels, to be shaded.
+
+#### Compute
+
+The _compute_ pipeline is a simple pipeline with only one stage: a programmable `compute` stage.
+As a result of being a single-stage pipeline the compute pipeline doesn't need to deal with many issues around inter-stage dataflow that other pipelines do.
+
+#### Ray Tracing
+
+A _ray tracing_ pipeline has multiple stages pertaining to the life cycle of a ray being traced through a scene of geometric primitives.
+These can include an `intersection` stage to compute whether a ray intersects a geometry primitive, a `miss` stage that runs when a ray does not intersect any geometric object in a scene, etc.
+
+Note that some platforms support types and operations related to ray tracing that can run outside of the context of a dedicated ray tracing pipeline.
+Just as applications can do computation outside of the dedicated compute pipeline, the use of ray tracing does not necessarily mean that a ray tracing pipeline is being used.
+
+### Shader Parameter Bindings
+
+The kernels that execute within a pipeline typically have access to four different kinds of data:
+
+* _Varying inputs_ coming from the system or from a preceding pipeline stage
+
+* _Varying outputs_ which will be passed along to the system or to a following pipeline stage
+
+* _Temporaries_ which are scratch memory or registers used by each invocation of the kernel and then dismissed on exit.
+
+* _Shader parameters_ (sometimes also called _uniform parameters_), which provide access to data from outside the pipeline dataflow
+
+The first three of these kinds of data are largely handled by the implementation of a pipeline.
+In contrast, an application programmer typically needs to manually prepare shader parameters, using the appropriate mechanisms and rules for each target platform.
+
+On platforms that provide a CPU-like "flat" memory model with a single virtual address space, and where any kind of data can be stored at any address, passing shader parameters can be almost trivial.
+Current graphics APIs provide far more complicated and less uniform mechanisms for passing shader parameters.
+
+A high-level language compiler like Slang handles the task of _binding_ each user-defined shader parameter to one or more of the parameter-passing resources defined by a target platform.
+For example, the Slang compiler might bind a global `Texture2D` parameter called `gDiffuse` to the `t1` register defined by the Direct3D 11 API.
+
+An application is responsible for passing the argument data for a parameter using the corresponding platform-specific resource it was bound to.
+For example, an application should set the texture they want to use for `gDiffuse` to the `t1` register using Direct3D 11 API calls.
+
+#### Slots
+
+Historically, most graphics APIs have used a model where shader parameters are passed using a number of API-defined _slots_.
+Each slot can store a single argument value of an allowed type.
+Depending on the platform slots might be called "registers," "locations," "bindings," "texture units," or other similar names.
+
+Slots almost exclusively use opaque types: textures, buffers, etc.
+On platforms that use slots for passing shader parameters, values of ordinary types like `float` or `int` need to be stored into a buffer, and then that buffer is passed via an appropriate slot.
+
+Although many graphics APIs use slots as an abstraction, the details vary greatly across APIs.
+Different APIs define different kinds of slots, and the types of arguments that may be stored in those slots vary.
+For example, one API might use two different kinds of slots for textures and buffers, while another uses a single kind of slot for both.
+On some APIs each pipeline stage gets its own dedicated slots, while on others slots are shared across all stages in a pipeline.
+
+#### Blocks
+
+Newer graphics APIs typically provide a system for grouping related shader parameters into re-usable _blocks_.
+Blocks might be referred to as "descriptor tables," "descriptor sets," or "argument buffers."
+Each block comprises one or more slots (often called "descriptors") that can be used to bind textures, buffers, etc.
+
+Blocks are in turn set into appropriate slots provided by a pipeline.
+Because a block can contain many different slots for textures or buffers, switching a pipeline argument from one block to another can effectively swap out a large number of shader parameters in one operation.
+Thus, while blocks introduce a level of indirection to parameter setting, they can also enable greater efficiency when parameters are grouped into blocks according to frequency of change.
+
+#### Root Constants
+
+Most recent graphics APIs also allow for a small amount of ordinary data (meaning types like `float` and `int` but not opaque types like buffers or textures) to be passed to the pipeline as _root constants_ (also called "push constants").
+
+Using root constants can eliminate some overheads from passing parameters of ordinary types via buffers.
+Passing a single `float` using a root constant rather than a buffer obviously eliminates a level of indirection.
+More importantly, though, using a root constant can avoid application code having to allocate and manage the lifetime of a buffer in a concurrent CPU/GPU program.
+
+Direct3D 11
+-----------
+
+Direct3D 11 (D3D11) is an older graphics API, but remains popular because it is much simpler to learn and use than some more recent APIs.
+In this section we will give an overview of the relevant features of D3D11 when used as a target platform for Slang.
+Subsequent sections about other APIs may describe them by comparison to D3D11.
+
+D3D11 kernels must be compiled to the DirectX Bytecode (DXBC) intermediate language.
+A DXBC binary includes a hash/checksum computed using an undocumented algorithm, and the runtime API rejects kernels without a valid checksum.
+The only supported way to generate DXBC is by compiling HLSL using the fxc compiler.
+
+### Pipelines
+
+D3D11 exposes two pipelines: rasterization and compute.
+
+The D3D11 rasterization pipeline can include up to five programmable stages, although most of them are optional:
+
+* The `vertex` stage (VS) transforms vertex data loaded from memory
+
+* The optional `hull` stage (HS) typically sets up and computes desired tessellation levels for a higher-order primitive
+
+* The optional `domain` stage (DS) evaluates a higher-order surface at domain locations chosen by a fixed-function tessellator
+
+* The optional `geometry` stage  (GS) receives as input a primitive and can produce zero or more new primitives as output
+
+* The optional `fragment` stage transforms fragments produced by the fixed-function rasterizer, determining the values for those fragments that will be merged with values in zero or more render targets. The fragment stage is sometimes called a "pixel" stage (PS), even when it does not process pixels.
+
+### Parameter Passing
+
+Shader parameters are passed to each D3D11 stage via slots.
+Each stage has its own slots of the following types:
+
+* **Constant buffers** are used for passing relatively small (4KB or less) amounts of data that will be read by GPU code. Constant buffers are passed via `b` registers.
+
+* **Shader resource views** (SRVs) include most textures, buffers, and other opaque resource types that are read or sampled by GPU code. SRVs use `t` registers.
+
+* **Unordered access views** (UAVs) include textures, buffers, and other opaque resource types used for write or read-write operations in GPU code. UAVs use `u` registers.
+
+* **Samplers** are used to pass opaque texture-sampling state, and use `s` registers.
+
+In addition, the D3D11 pipeline provides _vertex buffer_ slots and a single _index buffer_ slot to be used as the source vertex and index data that defines primitives.
+User-defined varying vertex shader inputs are bound to _vertex attribute_ slots (referred to as "input elements" in D3D11) which define how data from vertex buffers should be fetched to provide values for vertex attributes.
+
+The D3D11 rasterization pipeline also provides a mechanism for specifying _render target views_ (RTVs) and _depth-stencil views_ (DSVs) that provide the backing storage for the pixels in a framebuffer.
+User-defined fragment shader varying outputs (with `SV_Target` binding semantics) are bound to RTV slots.
+
+One notable detail of the D3D11 API is that the slots for fragment-stage UAVs and RTVs overlap.
+For example, a fragment kernel cannot use both `u0` and `SV_Target0` at once.
+
+Direct3D 12
+-----------
+
+Direct3D 12 (D3D12) is the current major version of the Direct3D API.
+
+D3D12 kernels must be compiled to the DirectX Intermediate Language (DXIL).
+DXIL is a layered encoding based off of LLVM bitcode; it introduces additional formatting rules and constraints which are loosely documented.
+A DXIL binary may be signed, and the runtime API only accepts appropriately signed binaries (unless a developer mode is enabled on the host machine).
+A DXIL validator `dxil.dll` is included in SDK releases, and this validator can sign binaries that pass validation.
+While DXIL can in principle be generated from multiple compiler front-ends, support for other compilers is not prioritized.
+
+### Pipelines
+
+D3D12 includes rasterization and compute pipelines similar to those in D3D11.
+Revisions to D3D12 have added additional stages to the rasterization pipeline, as well as a ray-tracing pipeline.
+
+#### Mesh Shaders
+
+> #### Note ####
+> The Slang system does not currently support mesh shaders.
+
+The D3D12 rasterization pipeline provides alternative geometry processing stages that may be used as an alternative to the `vertex`, `hull`, `domain`, and `geometry` stages:
+
+* The `mesh` stage runs groups of threads which are responsible for cooperating to produce both the vertex and index data for a _meshlet_, a bounded-size chunk of geometry.
+
+* The optional `amplification` stage precedes the mesh stage and is responsible for determining how many mesh shader invocations should be run.
+
+Compared to the D3D11 pipeline without tessellation (hull and domain shaders), a mesh shader is kind of like a combined/generalized vertex and geometry shader.
+
+Compared to the D3D11 pipeline with tessellation, an amplification shader is kind of like a combined/generalized vertex and hull shader, while a mesh shader is kind of like a combined/generalized domain and geometry shader.
+
+#### Ray Tracing
+
+The DirectX Ray Tracing (DXR) feature added a ray tracing pipeline to D3D12.
+The D3D12 ray tracing pipeline exposes the following programmable stages:
+
+* The ray generation (`raygeneration`) stage is similar to a compute stage, but can trace zero or more rays and make use of the results of those traces.
+
+* The `intersection` stage runs kernels to compute whether a ray intersects a user-defined primitive type. The system also includes a default intersector that handles triangle meshes.
+
+* The so-called any-hit (`anyhit`) stage runs on _candidate_ hits where a ray has intersected some geometry, but the hit must be either accepted or rejected by application logic. Note that the any-hit stage does not necessarily run on *all* hits, because configuration options on both scene geometry and rays can lead to these checks being bypassed.
+
+* The closest-hit (`closesthit`) stage runs on a single _accepted_ hit for a ray; under typical circumstances this will be the closest hit to the origin of the ray. A typical closest-hit shader might compute the apparent color of a surface, similar to a typical fragment shader.
+
+* The `miss` stage runs for rays that do not find or accept any hits in a scene. A typical miss shader might return a background color or sample an environment map.
+
+* The `callable` stage allows user-defined kernels to be invoked like subroutines in the context of the ray tracing pipeline.
+
+Compared to existing rasterization and compute pipelines, an important difference in the design of the D3D12 ray tracing pipeline is that multiple kernels can be loaded into the pipeline for each of the programmable stages.
+The specific closest-hit, miss, or other kernel that runs for a given hit or ray is determined by indexing into an appropriate _shader table_, which is effectively an array of kernels.
+The indexing into a shader table can depend on many factors including the type of ray, the type of geometry hit, etc.
+
+Note that DXR version 1.1 adds ray tracing types and operations that can be used outside of the dedicated ray tracing pipeline.
+These new mechanisms have less visible impact for a programmer using or integrating Slang.
+
+
+### Parameter Passing
+
+The mechanisms for parameter passing in D3D12 differ greatly from D3D11.
+Most opaque types (texture, resources, samplers) must be set into blocks (D3D12 calls blocks "descriptor tables").
+Each pipeline supports a fixed amount of storage for "root parameters," and allows those root parameters to be configured as root constants, slots for blocks, or slots for a limited number of opaque types (primarily just flat buffers).
+
+Shader parameters are still grouped and bound to registers as in D3D11; for example, a `Texture2D` parameter is considered as an SRV and uses a `t` register.
+D3D12 additionally binds shader parameters to "spaces" which are expressed similarly to registers (e.g., `space2`), but represent an orthogonal "axis" of binding.
+
+While shader parameters are bound to registers and spaces, those registers and spaces do not directly correspond to slots provided by the D3D12 API the way registers do in D3D11.
+Instead, the configuration of the root parameters and the correspondence of registers/spaces to root parameters, blocks, and/or slots are defined by a _pipeline layout_ that D3D12 calls a "root signature."
+
+Unlike D3D11, all of the stages in a D3D12 pipeline share the same root parameters.
+A D3D12 pipeline layout can specify that certain root parameters or certain slots within blocks will only be accessed by a subset of stages, and can map the *same* register/space pair to different parameters/blocks/slots as long as this is done for disjoint subsets of stages.
+
+#### Ray Tracing Specifics
+
+The D3D12 ray tracing pipeline adds a new mechanism for passing shader parameters.
+In addition to allowing shader parameters to be passed to the entire pipeline via root parameters, each shader table entry provides storage space for passing argument data specific to that entry.
+
+Similar to the use of a pipeline layout (root signature) to configure the use of root parameters, each kernel used within shader entries must be configured with a "local root signature" that defines how the storage space in the shader table entry is to be used.
+Shader parameters are still bound to registers and spaces as for non-ray-tracing code, and the local root signature simply allows those same registers/spaces to be associated with locations in a shader table entry.
+
+One important detail is that some shader table entries are associated with a kernel for a single stage (e.g., a single miss shader), while other shader table entries are associated with a "hit group" consisting of up to one each of an intersection, any-hit, and closest-hit kernel.
+Because multiple kernels in a hit group share the same shader table entry, they also share the configured slots in that entry for binding root constants, blocks, etc.
+
+Vulkan
+------
+
+Vulkan is a cross-platform GPU API for graphics and compute with a detailed specification produced by a multi-vendor standards body.
+In contrast with OpenGL, Vulkan focuses on providing explicit control over as many aspects of GPU work as possible.
+In contrast with OpenCL, Vulkan focuses first and foremost on the needs of real-time graphics developers.
+
+Vulkan requires kernels to be compiled to the SPIR-V intermediate language.
+SPIR-V is a simple and extensible binary program format with a detailed specification; it is largely unrelated to earlier "SPIR" formats that were LLVM-based and loosely specified.
+The SPIR-V format does not require signing or hashing, and is explicitly designed to allow many different tools to produce and manipulate the format.
+Drivers that consume SPIR-V are expected to perform validation at load time.
+Some choices in the SPIR-V encoding are heavily influenced by specific design choices in the GLSL language, and may require non-GLSL compilers to transform code to match GLSL idioms.
+
+### Pipelines
+
+Vulkan includes rasterization, compute, and ray tracing pipelines with the same set of stages as described for D3D12 above.
+
+### Parameter Passing
+
+Like D3D12, Vulkan uses blocks (called "descriptor sets") to organize groups of bindings for opaque types (textures, buffers, samplers).
+Similar to D3D12, a Vulkan pipeline supports a limited number of slots for passing blocks to the pipeline, and these slots are shared across all stages.
+Vulkan also supports a limited number of bytes reserved for passing root constants (called "push constants").
+Vulkan uses pipeline layouts to describe configurations of usage for blocks and root constants.
+
+High-level-language shader parameters are bound to a combination of a "binding" and a "set" for Vulkan, which are superficially similar to the registers and spaces of D3D12.
+Unlike D3D12, however, bindings and sets in Vulkan directly correspond to the API-provided parameter-passing mechanism.
+The set index of a parameter indicates the zero-based index of a slot where a block must be passed, and the binding index is the zero-based index of a particular opaque value set into the block.
+A shader parameter that will be passed using root constants (rather than via blocks) must be bound to a root-constant offset as part of compilation.
+
+Unlike D3D12, where SRVs, UAVs, etc. use distinct classes of registers, all opaque-type shader parameters use the same index space of bindings.
+That is, a buffer and a texture both using `binding=2` in `set=3` for Vulkan will alias the same slot in the same block.
+
+The Vulkan ray tracing pipeline also uses a shader table, and also forms hit groups similar to D3D12.
+Unlike D3D12, each shader table entry in Vulkan can only be used to pass ordinary values (akin to root constants), and cannot be configured for binding of opaque types or blocks.
+
+OpenGL
+------
+
+> #### Note ####
+> Slang has only limited support for compiling code for OpenGL.
+
+OpenGL has existed for many years, and predates programmable GPU pipelines of the kind this chapter discusses; we will focus solely on use of OpenGL as an API for programmable GPU pipelines.
+
+OpenGL is a cross-platform GPU API for graphics and compute with a detailed specification produced by a multi-vendor standards body.
+In contrast with Vulkan, OpenGL provides many convenience and safety features that can simplify GPU programming.
+
+OpenGL allows kernels to be loaded as SPIR-V binaries, vendor-specific binaries, or using GLSL source code.
+Loading shaders as GLSL source code is the most widely supported of these options, such that GLSL is the _de facto_ intermediate language of OpenGL.
+
+### Pipelines
+
+OpenGL supports rasterization and compute pipelines with the same stages as described for D3D11.
+The OpenGL rasterization pipeline also supports the same mesh shader stages that are supported by D3D12.
+
+### Parameter Passing
+
+OpenGL uses slots for binding.
+There are distinct kinds of slots for buffers and textures/images, and each set of slots is shared by all pipeline stages.
+
+High-level-language shader parameters are bound to a "binding" index for OpenGL.
+The binding index of a parameter is the zero-based index of the slot (of the appropriate kind) that must be used to pass an argument value.
+
+Note that while OpenGL and Vulkan both use binding indices for shader parameters like textures, the semantics of those are different because OpenGL uses distinct slots for passing buffers and textures.
+For OpenGL it is legal to have a texture that uses `binding=2` and a buffer that uses `binding=2` in the same kernel, because those are indices of distinct kinds of slots, while this scenario would typically be invalid for Vulkan.
+
+CUDA and OptiX
+--------------
+
+> #### Note ####
+> Slang support for OptiX is a work in progress.
+
+CUDA C/C++ is a language for expressing heterogeneous CPU and GPU code with a simple interface to invoking GPU compute work.
+OptiX is a ray tracing API that uses CUDA C++ as the language for expressing shader code.
+We focus here on OptiX version 7 and up.
+
+CUDA and OptiX allow kernels to be loaded as GPU-specific binaries, or using the PTX intermediate language.
+
+
+### Pipelines
+
+CUDA supports a compute pipeline that is similar to D3D12 or Vulkan, with additional features.
+
+OptiX introduced the style of ray tracing pipeline adopted by D3D12 and Vulkan, and thus uses the same basic stages.
+
+The CUDA system does not currently expose a rasterization pipeline.
+
+### Parameter Passing
+
+Unlike most of the GPU APIs discussed so far, CUDA supports a "flat" memory model with a single virtual address space for all GPU data.
+Textures, buffers, etc. are not opaque types, but can instead sit in the same memory as ordinary data like `float`s or `int`s.
+
+With a flat memory model, a distinct notion of a slot or block is not needed.
+A slot is just an ordinary memory location that happens to be used to store a value of texture, buffer, or other resource type.
+A block is just an ordinary memory buffer that happens to be filled with values of texture/buffer/etc. type.
+
+CUDA provides two parameter-passing mechanisms for the compute pipeline.
+First, when invoking a compute kernel, the application passes a limited number of bytes of parameter data that act as root constants.
+Second, each loaded module of GPU code may contain pre-allocated "constant memory" storage which can be initialized from the host and then read by GPU code.
+Because types like blocks or textures are not special in CUDA, either of these mechanisms can be utilized to pass any kind of data including references to pointer-based data structures stored in the GPU virtual address space.
+The use of "slots" or "blocks" or "root constants" is a matter of application policy instead of API mechanism.
+
+OptiX supports use of constant memory storage for ray tracing pipelines, where all the stages in a ray tracing pipeline share that storage.
+OptiX uses a shader table for managing kernels and hit groups, and allows kernels to access the bytes of their shader table entry via a pointer.
+Similar to the compute pipeline, application code can layer many different policies on top of these mechanisms.
+
+CPU Compute
+-----------
+
+> #### Note ####
+> Slang's support for CPU compute is functional, but not feature- or performance-complete.
+> Backwards-incompatible changes to this target may come in future versions of Slang.
+
+For the purposes of Slang, different CPU-based host platforms are largely the same.
+All support binary code in a native machine-code format.
+All CPU platforms Slang supports use a flat memory model with a single virtual address space, where any data type can be stored at any virtual address.
+
+Note that this section considers CPU-based platforms only as targets for kernel compilation; using a CPU as a target for scalar "host" code is an advanced topic beyond the scope of this document.
+
+### Pipelines
+
+Slang's CPU compute target supports only a compute pipeline.
+
+### Parameter Passing
+
+Because CPU targets support flexible pointer-based addressing and large low-latency caches, a compute kernel can simply be passed a small fixed number of pointers and be relied upon to load parameter values of any types via indirection through those pointers.
+
+Summary
+-------
+
+This chapter has reviewed the main target platforms supported by the Slang compiler and runtime system.
+A key point to take away is that there is great variation in the capabilities of these systems.
+Even superficially similar graphics APIs have complicated differences in their parameter-passing mechanisms that must be accounted for by application programmers and GPU compilers.
+
+In the next chapter, we will discuss how the Slang compiler adapts to the different capabilities and rules of these platforms when laying out shader parameters in memory and then binding those parameters to the mechanisms defined by each platform.
diff --git a/docs/user-guide/a1-02-slangpy.md b/docs/user-guide/a1-02-slangpy.md
index bebdb06f4..bee514c55 100644
--- a/docs/user-guide/a1-02-slangpy.md
+++ b/docs/user-guide/a1-02-slangpy.md
@@ -153,7 +153,7 @@ print(output_grad)
 
 `slangpy` also binds the forward-mode version of your kernel (propagate derivatives of inputs to the output) which can be invoked the same way using `module.square.fwd()`
 
-You can refer to [this documentation](08-autodiff.md) for a detailed reference of Slang's automatic differentiation feature.
+You can refer to [this documentation](autodiff) for a detailed reference of Slang's automatic differentiation feature.
 
 ### Wrapping your kernels as pytorch functions
 
@@ -451,7 +451,7 @@ in the `inputGradToPropagateTo` tensor. Therefore, after running `boxFilter_bwd`
 back propagated derivative values.
 
 Again, to understand all the details of the automatic differentiation system, please refer to the 
-[Automatic Differentiation](08-autodiff.md) chapter for a detailed explanation.
+[Automatic Differentiation](autodiff) chapter for a detailed explanation.
 
 ## Manually binding kernels
 `[AutoPyBindCUDA]` works for most use cases, but in certain situations, it may be necessary to write the *host* function by hand. The host function can also be written in Slang, and `slangpy` handles its compilation to C++.
diff --git a/docs/user-guide/toc.html b/docs/user-guide/toc.html
index dceca681f..9e8c6f57d 100644
--- a/docs/user-guide/toc.html
+++ b/docs/user-guide/toc.html
@@ -1,104 +1,116 @@
 <ul class="toc_root_list"><li data-link="index"><span>Slang User's Guide</span>
 <ul class="toc_list">
-<li data-link="00-introduction"><span>Introduction</span>
+<li data-link="introduction"><span>Introduction</span>
 <ul class="toc_list">
-<li data-link="00-introduction#why-use-slang"><span>Why use Slang?</span></li>
-<li data-link="00-introduction#who-is-slang-for"><span>Who is Slang for?</span></li>
-<li data-link="00-introduction#who-is-this-guide-for"><span>Who is this guide for?</span></li>
-<li data-link="00-introduction#goals-and-non-goals"><span>Goals and Non-Goals</span></li>
+<li data-link="introduction#why-use-slang"><span>Why use Slang?</span></li>
+<li data-link="introduction#who-is-slang-for"><span>Who is Slang for?</span></li>
+<li data-link="introduction#who-is-this-guide-for"><span>Who is this guide for?</span></li>
+<li data-link="introduction#goals-and-non-goals"><span>Goals and Non-Goals</span></li>
 </ul>
 </li>
-<li data-link="01-get-started"><span>Getting Started with Slang</span>
+<li data-link="get-started"><span>Getting Started with Slang</span>
 <ul class="toc_list">
-<li data-link="01-get-started#installation"><span>Installation</span></li>
-<li data-link="01-get-started#your-first-slang-shader"><span>Your first Slang shader</span></li>
-<li data-link="01-get-started#the-full-example"><span>The full example</span></li>
+<li data-link="get-started#installation"><span>Installation</span></li>
+<li data-link="get-started#your-first-slang-shader"><span>Your first Slang shader</span></li>
+<li data-link="get-started#the-full-example"><span>The full example</span></li>
 </ul>
 </li>
-<li data-link="02-conventional-features"><span>Conventional Language Features</span>
+<li data-link="conventional-features"><span>Conventional Language Features</span>
 <ul class="toc_list">
-<li data-link="02-conventional-features#types"><span>Types</span></li>
-<li data-link="02-conventional-features#expressions"><span>Expressions</span></li>
-<li data-link="02-conventional-features#statements"><span>Statements</span></li>
-<li data-link="02-conventional-features#functions"><span>Functions</span></li>
-<li data-link="02-conventional-features#preprocessor"><span>Preprocessor</span></li>
-<li data-link="02-conventional-features#attributes"><span>Attributes</span></li>
-<li data-link="02-conventional-features#global-variables-and-shader-parameters"><span>Global Variables and Shader Parameters</span></li>
-<li data-link="02-conventional-features#shader-entry-points"><span>Shader Entry Points</span></li>
+<li data-link="conventional-features#types"><span>Types</span></li>
+<li data-link="conventional-features#expressions"><span>Expressions</span></li>
+<li data-link="conventional-features#statements"><span>Statements</span></li>
+<li data-link="conventional-features#functions"><span>Functions</span></li>
+<li data-link="conventional-features#preprocessor"><span>Preprocessor</span></li>
+<li data-link="conventional-features#attributes"><span>Attributes</span></li>
+<li data-link="conventional-features#global-variables-and-shader-parameters"><span>Global Variables and Shader Parameters</span></li>
+<li data-link="conventional-features#shader-entry-points"><span>Shader Entry Points</span></li>
+<li data-link="conventional-features#mixed-shader-entry-points"><span>Mixed Shader Entry Points</span></li>
 </ul>
 </li>
-<li data-link="03-convenience-features"><span>Basic Convenience Features</span>
+<li data-link="convenience-features"><span>Basic Convenience Features</span>
 <ul class="toc_list">
-<li data-link="03-convenience-features#type-inference-in-variable-definitions"><span>Type Inference in Variable Definitions</span></li>
-<li data-link="03-convenience-features#immutable-values"><span>Immutable Values</span></li>
-<li data-link="03-convenience-features#namespaces"><span>Namespaces</span></li>
-<li data-link="03-convenience-features#member-functions"><span>Member functions</span></li>
-<li data-link="03-convenience-features#properties"><span>Properties</span></li>
-<li data-link="03-convenience-features#initializers"><span>Initializers</span></li>
-<li data-link="03-convenience-features#operator-overloading"><span>Operator Overloading</span></li>
-<li data-link="03-convenience-features#subscript-operator"><span>Subscript Operator</span></li>
-<li data-link="03-convenience-features#optionalt-type"><span>`Optional&lt;T&gt;` type</span></li>
-<li data-link="03-convenience-features#reinterprett-operation"><span>`reinterpret&lt;T&gt;` operation</span></li>
-<li data-link="03-convenience-features#struct-inheritance-limited"><span>`struct` inheritance (limited)</span></li>
-<li data-link="03-convenience-features#extensions"><span>Extensions</span></li>
-<li data-link="03-convenience-features#multi-level-break"><span>Multi-level break</span></li>
-<li data-link="03-convenience-features#force-inlining"><span>Force inlining</span></li>
-<li data-link="03-convenience-features#special-scoping-syntax"><span>Special Scoping Syntax</span></li>
+<li data-link="convenience-features#type-inference-in-variable-definitions"><span>Type Inference in Variable Definitions</span></li>
+<li data-link="convenience-features#immutable-values"><span>Immutable Values</span></li>
+<li data-link="convenience-features#namespaces"><span>Namespaces</span></li>
+<li data-link="convenience-features#member-functions"><span>Member functions</span></li>
+<li data-link="convenience-features#properties"><span>Properties</span></li>
+<li data-link="convenience-features#initializers"><span>Initializers</span></li>
+<li data-link="convenience-features#operator-overloading"><span>Operator Overloading</span></li>
+<li data-link="convenience-features#subscript-operator"><span>Subscript Operator</span></li>
+<li data-link="convenience-features#optionalt-type"><span>`Optional&lt;T&gt;` type</span></li>
+<li data-link="convenience-features#reinterprett-operation"><span>`reinterpret&lt;T&gt;` operation</span></li>
+<li data-link="convenience-features#struct-inheritance-limited"><span>`struct` inheritance (limited)</span></li>
+<li data-link="convenience-features#extensions"><span>Extensions</span></li>
+<li data-link="convenience-features#multi-level-break"><span>Multi-level break</span></li>
+<li data-link="convenience-features#force-inlining"><span>Force inlining</span></li>
+<li data-link="convenience-features#special-scoping-syntax"><span>Special Scoping Syntax</span></li>
 </ul>
 </li>
-<li data-link="04-modules-and-access-control"><span>Modules and Access Control</span>
+<li data-link="modules"><span>Modules and Access Control</span>
 <ul class="toc_list">
-<li data-link="04-modules-and-access-control#defining-a-module"><span>Defining a Module</span></li>
-<li data-link="04-modules-and-access-control#importing-a-module"><span>Importing a Module</span></li>
-<li data-link="04-modules-and-access-control#access-control"><span>Access Control</span></li>
-<li data-link="04-modules-and-access-control#legacy-modules"><span>Legacy Modules</span></li>
+<li data-link="modules#defining-a-module"><span>Defining a Module</span></li>
+<li data-link="modules#importing-a-module"><span>Importing a Module</span></li>
+<li data-link="modules#access-control"><span>Access Control</span></li>
+<li data-link="modules#legacy-modules"><span>Legacy Modules</span></li>
 </ul>
 </li>
-<li data-link="05-interfaces-generics"><span>Interfaces and Generics</span>
+<li data-link="capabilities"><span>Capabilities</span>
 <ul class="toc_list">
-<li data-link="05-interfaces-generics#interfaces"><span>Interfaces</span></li>
-<li data-link="05-interfaces-generics#generics"><span>Generics</span></li>
-<li data-link="05-interfaces-generics#supported-constructs-in-interface-definitions"><span>Supported Constructs in Interface Definitions</span></li>
-<li data-link="05-interfaces-generics#associated-types"><span>Associated Types</span></li>
-<li data-link="05-interfaces-generics#generic-value-parameters"><span>Generic Value Parameters</span></li>
-<li data-link="05-interfaces-generics#interface-typed-values"><span>Interface-typed Values</span></li>
-<li data-link="05-interfaces-generics#extending-a-type-with-additional-interface-conformances"><span>Extending a Type with Additional Interface Conformances</span></li>
-<li data-link="05-interfaces-generics#is-and-as-operator"><span>`is` and `as` Operator</span></li>
-<li data-link="05-interfaces-generics#extensions-to-interfaces"><span>Extensions to Interfaces</span></li>
+<li data-link="capabilities#capability-atoms-and-capability-requirements"><span>Capability Atoms and Capability Requirements</span></li>
+<li data-link="capabilities#conflicting-capabilities"><span>Conflicting Capabilities</span></li>
+<li data-link="capabilities#requirements-in-parent-scope"><span>Requirements in Parent Scope</span></li>
+<li data-link="capabilities#inferrence-of-capability-requirements"><span>Inferrence of Capability Requirements</span></li>
+<li data-link="capabilities#inferrence-on-target-switch"><span>Inferrence on target_switch</span></li>
+<li data-link="capabilities#capability-aliases"><span>Capability Aliases</span></li>
+<li data-link="capabilities#validation-of-capability-requirements"><span>Validation of Capability Requirements</span></li>
 </ul>
 </li>
-<li data-link="06-compiling"><span>Compiling Code with Slang</span>
+<li data-link="interfaces-generics"><span>Interfaces and Generics</span>
 <ul class="toc_list">
-<li data-link="06-compiling#concepts"><span>Concepts</span></li>
-<li data-link="06-compiling#command-line-compilation-with-slangc"><span>Command-Line Compilation with `slangc`</span></li>
-<li data-link="06-compiling#using-the-compilation-api"><span>Using the Compilation API</span></li>
+<li data-link="interfaces-generics#interfaces"><span>Interfaces</span></li>
+<li data-link="interfaces-generics#generics"><span>Generics</span></li>
+<li data-link="interfaces-generics#supported-constructs-in-interface-definitions"><span>Supported Constructs in Interface Definitions</span></li>
+<li data-link="interfaces-generics#associated-types"><span>Associated Types</span></li>
+<li data-link="interfaces-generics#generic-value-parameters"><span>Generic Value Parameters</span></li>
+<li data-link="interfaces-generics#interface-typed-values"><span>Interface-typed Values</span></li>
+<li data-link="interfaces-generics#extending-a-type-with-additional-interface-conformances"><span>Extending a Type with Additional Interface Conformances</span></li>
+<li data-link="interfaces-generics#is-and-as-operator"><span>`is` and `as` Operator</span></li>
+<li data-link="interfaces-generics#extensions-to-interfaces"><span>Extensions to Interfaces</span></li>
 </ul>
 </li>
-<li data-link="07-targets"><span>Supported Compilation Targets</span>
+<li data-link="autodiff"><span>Automatic Differentiation</span>
 <ul class="toc_list">
-<li data-link="07-targets#background-and-terminology"><span>Background and Terminology</span></li>
-<li data-link="07-targets#direct3d-11"><span>Direct3D 11</span></li>
-<li data-link="07-targets#direct3d-12"><span>Direct3D 12</span></li>
-<li data-link="07-targets#vulkan"><span>Vulkan</span></li>
-<li data-link="07-targets#opengl"><span>OpenGL</span></li>
-<li data-link="07-targets#cuda-and-optix"><span>CUDA and OptiX</span></li>
-<li data-link="07-targets#cpu-compute"><span>CPU Compute</span></li>
-<li data-link="07-targets#summary"><span>Summary</span></li>
+<li data-link="autodiff#using-automatic-differentiation-in-slang"><span>Using Automatic Differentiation in Slang</span></li>
+<li data-link="autodiff#mathematic-concepts-and-terminologies"><span>Mathematic Concepts and Terminologies</span></li>
+<li data-link="autodiff#differentiable-types"><span>Differentiable Types</span></li>
+<li data-link="autodiff#forward-derivative-propagation-function"><span>Forward Derivative Propagation Function</span></li>
+<li data-link="autodiff#backward-derivative-propagation-function"><span>Backward Derivative Propagation Function</span></li>
+<li data-link="autodiff#builtin-differentiable-functions"><span>Builtin Differentiable Functions</span></li>
+<li data-link="autodiff#primal-substitute-functions"><span>Primal Substitute Functions</span></li>
+<li data-link="autodiff#working-with-mixed-differentiable-and-non-differentiable-code"><span>Working with Mixed Differentiable and Non-Differentiable Code</span></li>
+<li data-link="autodiff#higher-order-differentiation"><span>Higher Order Differentiation</span></li>
+<li data-link="autodiff#interactions-with-generics-and-interfaces"><span>Interactions with Generics and Interfaces</span></li>
+<li data-link="autodiff#restrictions-of-automatic-differentiation"><span>Restrictions of Automatic Differentiation</span></li>
 </ul>
 </li>
-<li data-link="08-autodiff"><span>Automatic Differentiation</span>
+<li data-link="compiling"><span>Compiling Code with Slang</span>
 <ul class="toc_list">
-<li data-link="08-autodiff#using-automatic-differentiation-in-slang"><span>Using Automatic Differentiation in Slang</span></li>
-<li data-link="08-autodiff#mathematic-concepts-and-terminologies"><span>Mathematic Concepts and Terminologies</span></li>
-<li data-link="08-autodiff#differentiable-types"><span>Differentiable Types</span></li>
-<li data-link="08-autodiff#forward-derivative-propagation-function"><span>Forward Derivative Propagation Function</span></li>
-<li data-link="08-autodiff#backward-derivative-propagation-function"><span>Backward Derivative Propagation Function</span></li>
-<li data-link="08-autodiff#builtin-differentiable-functions"><span>Builtin Differentiable Functions</span></li>
-<li data-link="08-autodiff#primal-substitute-functions"><span>Primal Substitute Functions</span></li>
-<li data-link="08-autodiff#working-with-mixed-differentiable-and-non-differentiable-code"><span>Working with Mixed Differentiable and Non-Differentiable Code</span></li>
-<li data-link="08-autodiff#higher-order-differentiation"><span>Higher Order Differentiation</span></li>
-<li data-link="08-autodiff#interactions-with-generics-and-interfaces"><span>Interactions with Generics and Interfaces</span></li>
-<li data-link="08-autodiff#restrictions-of-automatic-differentiation"><span>Restrictions of Automatic Differentiation</span></li>
+<li data-link="compiling#concepts"><span>Concepts</span></li>
+<li data-link="compiling#command-line-compilation-with-slangc"><span>Command-Line Compilation with `slangc`</span></li>
+<li data-link="compiling#using-the-compilation-api"><span>Using the Compilation API</span></li>
+</ul>
+</li>
+<li data-link="targets"><span>Supported Compilation Targets</span>
+<ul class="toc_list">
+<li data-link="targets#background-and-terminology"><span>Background and Terminology</span></li>
+<li data-link="targets#direct3d-11"><span>Direct3D 11</span></li>
+<li data-link="targets#direct3d-12"><span>Direct3D 12</span></li>
+<li data-link="targets#vulkan"><span>Vulkan</span></li>
+<li data-link="targets#opengl"><span>OpenGL</span></li>
+<li data-link="targets#cuda-and-optix"><span>CUDA and OptiX</span></li>
+<li data-link="targets#cpu-compute"><span>CPU Compute</span></li>
+<li data-link="targets#summary"><span>Summary</span></li>
 </ul>
 </li>
 <li data-link="a1-special-topics"><span>Special Topics</span>
-- 
cgit v1.2.3