1 files changed, 48 insertions, 24 deletions
diff --git a/glitter.cginc b/glitter.cginc
index 908c1ab..a9bbf8b 100644
--- a/glitter.cginc
+++ b/glitter.cginc
@@ -171,13 +171,15 @@ float3 disk_to_ndf_ggx(float2 v_disk, float alpha) {
 }
 
 // Algorithm 1 from Kemppinen et. al.
-float D_Kemppinen(float3 h, float alpha, float glint_alpha, float2 uv,
-    float2x2 uv_J, float N, float filter_size, out float3 micro_normal) {
+float D_Kemppinen(float3 h, float alpha, float glint_alpha, int angular_cells,
+    float2 uv, float2x2 uv_J, float N, float filter_size,
+    out float3 micro_normal) {
   float res = sqrt(N);
   float2 x_s = uv;
   float3 x_a_and_d = ndf_to_disk_ggx(h, alpha);
   float2 x_a = x_a_and_d.xy;
   float d = x_a_and_d.z;
+  int angular_sample_count = clamp(angular_cells, 1, 4);
 
   float lambda = QueryLod(res * uv_J, filter_size);
 
@@ -185,7 +187,8 @@ float D_Kemppinen(float3 h, float alpha, float glint_alpha, float2 uv,
   float best_weight = 0;
   float2 best_g_a = x_a;
 
-  [loop]
+  [unroll]
+  //[loop]
   for (float m = 0; m < 2; m += 1) {
     float l = floor(lambda) + m;
 
@@ -200,29 +203,48 @@ float D_Kemppinen(float3 h, float alpha, float glint_alpha, float2 uv,
 
     float2 base_i_a = floor(x_a * res_a) + 0.5;
     float2 i_a = clamp(base_i_a, 0.5, res_a - 0.5);
+    float2 angular_frac = frac(x_a * res_a) - 0.5;
+    float2 angular_step = lerp(float2(-1.0, -1.0), float2(1.0, 1.0),
+        step(0.0, angular_frac));
 
     float2 base_i_s = floor(x_s * res_s) + 0.5;
     float2 i_s = clamp(base_i_s, 0.5, res_s - 0.5);
 
-    float2 g_s = (i_s + Rand2D(i_s, i_a, l, 1u) - .5) / res_s;
-    float2 g_a = (i_a + Rand2D(i_s, i_a, l, 2u) - .5) / res_a;
-
-    float r = Rand1D(i_s, i_a, l, 4u);
-    float roulette = smoothstep(max(.0, r-.1), min(1.0, r+.1), w_lambda);
-
-    float w = roulette * normal(sigma_a, x_a - g_a)
-      * normal(sigma_s, x_s - g_s) / N;
-    // This is hacky nonsense intended to improve the 1-sampling case. Original
-    // code is commented out below.
-    D_filter += w < 1 ? sqrt(w) * 2 : w;
-    //D_filter += w;
-    if (w > best_weight) {
-      best_weight = w;
-      best_g_a = g_a;
+    // This unroll actually does substantially improve benchmarks.
+    [unroll]
+    for (int angular_index = 0; angular_index < 4; ++angular_index) {
+      if (angular_index >= angular_sample_count) {
+        break;
+      }
+
+      float2 angular_offset = 0.0;
+      if (angular_index == 1) {
+        angular_offset = float2(angular_step.x, 0.0);
+      } else if (angular_index == 2) {
+        angular_offset = float2(0.0, angular_step.y);
+      } else if (angular_index == 3) {
+        angular_offset = angular_step;
+      }
+
+      float2 i_a_neighbor = clamp(i_a + angular_offset, 0.5, res_a - 0.5);
+      float2 g_s = (i_s + Rand2D(i_s, i_a_neighbor, l, 1u) - .5) / res_s;
+      float2 g_a = (i_a_neighbor + Rand2D(i_s, i_a_neighbor, l, 2u) - .5) / res_a;
+
+      float r = Rand1D(i_s, i_a_neighbor, l, 4u);
+      float roulette = smoothstep(max(.0, r-.1), min(1.0, r+.1), w_lambda);
+
+      float w = roulette * normal(sigma_a, x_a - g_a)
+        * normal(sigma_s, x_s - g_s) / N;
+      // This is hacky nonsense intended to improve the 1-sampling case. Original
+      // code is commented out below.
+      D_filter += w < 1 ? sqrt(w) * 2 : w;
+      //D_filter += w;
+      if (w > best_weight) {
+        best_weight = w;
+        best_g_a = g_a;
+      }
     }
     D_filter += w_lambda * compensation(x_a, sigma_a, res_a);
-    // This is also hacked in.
-    D_filter += w_lambda * compensation(x_s, sigma_s, res_s);
   }
 
   micro_normal = normalize(disk_to_ndf_ggx(best_g_a, alpha));
@@ -240,10 +262,10 @@ struct LightGlitter {
 
 // Glitter data getter to be run from lighting code.
 LightGlitter GetGlitterLighting(
-    float glitter_amount, float glitter_roughness, float2 uv, float3x3 tbn, float roughness,
+    float glitter_amount, float glitter_roughness, int glitter_angular_cells,
+    float glitter_filter_size, float2 uv, float3x3 tbn, float roughness,
     float3 normal, float3 V, float3 direct_H, float3 indirect_dir) {
   LightGlitter g;
-  const float glitter_filter_size = 0.7f;
   float2x2 uv_J = uv_ellipsoid(transpose(float2x2(ddx(uv), ddy(uv))));
   float N = 8.0e5f * pow(10.0f, glitter_amount * 6.0f - 2.0f);
 
@@ -251,14 +273,16 @@ LightGlitter GetGlitterLighting(
   float3 direct_H_tangent = mul(direct_H, transpose(tbn));
   float3 direct_micro_normal;  // unused
   g.direct_D = D_Kemppinen(direct_H_tangent, roughness, glitter_roughness,
-      uv, uv_J, N, glitter_filter_size, direct_micro_normal);
+      glitter_angular_cells, uv, uv_J, N, glitter_filter_size,
+      direct_micro_normal);
 
   // Indirect
   float3 indirect_H = normalize(V + indirect_dir);
   float3 indirect_H_tangent = mul(indirect_H, transpose(tbn));
   float3 indirect_micro_normal;  // unused, but required by D_Kemppinen
   g.indirect_D = D_Kemppinen(indirect_H_tangent, roughness, glitter_roughness,
-      uv, uv_J, N, glitter_filter_size, indirect_micro_normal);
+      glitter_angular_cells, uv, uv_J, N, glitter_filter_size,
+      indirect_micro_normal);
   g.indirect_NoL = max(1e-4, dot(normal, indirect_dir));
   g.indirect_LoH = max(1e-4, dot(indirect_dir, indirect_H));