#ifndef __GLITTER_INC
#define __GLITTER_INC

/*
 * This is an implementation of Kemppinen et al.'s "Evaluating and Sampling
 * Glinty NDFs in Constant Time".
 * It is ported from: https://www.shadertoy.com/view/tcdGDl
 * Since no license terms are listed in the shader body, it is protected by
 * the default Shadertoy license (per https://www.shadertoy.com/terms),
 * which is the Creative Commons Attribution-NonCommercial-ShareAlike 3.0
 * Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/deed.en
 *
 * I have made changes to this code. They are:
 *   1. Syntax changes required to translate GLSL to HLSL.
 *   2. Stylistic preferences, like using "1" or "1.0" instead of "1.".
 *   3. The `GetGlitterLighting` function, which populates data required for
 *      indirect glitter. The original paper only discusses analytic lighting.
 *
 * @article{KPT:2025:Glinty,
 *   title = {Evaluating and Sampling Glinty NDFs in Constant Time},
 *   author = {Kemppinen, Pauli and Paulin, Loïs and Thonat,
 *       Théo and Thiery, Jean-Marc and Lehtinen, Jaakko and Boubekeur,
 *       Tamy},
 *   year = {2025},
 *   journal = {ACM Transactions on Graphics (Proc. SIGGRAPH Asia 2025)},
 *   volume = {44},
 *   number = {6},
 *   articleno = {255},
 * }
 */

// Pi, written with far more digits than a 32-bit float can represent.
#define PI 3.1415926535897932384626433832795028841971
// Scale factor 1 / 2^32. Multiplying a 32-bit unsigned value by it remaps
// [0, UINT_MAX] to [0, 1].
#define UINT_TO_UNIT (1.0 / 4294967296.0)

// Lambert azimuthal equal-area projection of `v` onto the plane.
// Returns v.xy divided by sqrt(1 + v.z). No guard is applied for v.z == -1,
// where the divisor is zero.
float2 lambert(float3 v) {
  float scale = sqrt(1 + v.z);
  float2 projected = v.xy / scale;
  return projected;
}

// Maps a microfacet normal `v` (squished according to the roughness
// parameter `alpha`) to a point in the unit square.
//
// Returns:
//   .xy - the Lambert projection of the un-squished, normalized direction,
//         remapped from [-1, 1] to [0, 1].
//   .z  - 1 / (alpha^2 * |s|^4), where s = (v.xy / alpha, v.z); used by the
//         callers as the Jacobian determinant of the mapping.
float3 ndf_to_disk_ggx(float3 v, float alpha) {
  // Undo the alpha squish so the direction lies on a hemisphere again.
  float3 stretched = float3(v.xy / alpha, v.z);
  float len_sq = dot(stretched, stretched);

  // Equal-area projection onto the disk, then shift/scale into [0, 1].
  float2 projected = lambert(normalize(stretched));
  float2 unit_square = projected * 0.5 + 0.5;

  float jacobian = 1.0 / (alpha * alpha * len_sq * len_sq);
  return float3(unit_square, jacobian);
}

// Computes (M^T M)^-1 for a 2x2 matrix M.
//
// The columns of M are divided by det(M); their dot products then give the
// entries of the inverse directly. The result is symmetric. No guard is
// applied for a singular M (det == 0).
float2x2 inv_quadratic(float2x2 M) {
  float det = determinant(M);

  // Columns of M, pre-scaled by 1 / det.
  float2 col0 = float2(M[0][0], M[1][0]) / det;
  float2 col1 = float2(M[0][1], M[1][1]) / det;

  float upper_left = dot(col1, col1);
  float off_diag = -dot(col0, col1);
  float lower_right = dot(col0, col0);

  return float2x2(upper_left, off_diag,
                  off_diag, lower_right);
}

// Builds a 2x2 matrix whose columns are the scaled principal axes derived
// from the UV Jacobian `uv_J`.
//
// Q = inv_quadratic(transpose(uv_J)) is decomposed into its two eigenvalues
// (half-trace -/+ sqrt(half-trace^2 - det)) and matching eigenvectors. Each
// unit eigenvector is scaled by 1 / sqrt(eigenvalue) and stored as a column.
// Degenerate inputs (zero eigenvalue or zero-length eigenvector) are not
// guarded against.
float2x2 uv_ellipsoid(float2x2 uv_J) {
  float2x2 Q = inv_quadratic(transpose(uv_J));
  float q00 = Q[0][0];
  float q01 = Q[0][1];
  float q10 = Q[1][0];
  float q11 = Q[1][1];

  // Eigenvalues of the 2x2 matrix via the trace/determinant formula. The
  // max() keeps rounding error from producing a negative discriminant.
  float half_trace = 0.5 * (q00 + q11);
  float disc = sqrt(max(0.0, half_trace * half_trace - determinant(Q)));
  float lambda_lo = half_trace - disc;
  float lambda_hi = half_trace + disc;

  float2 radii = 1.0 / sqrt(float2(lambda_lo, lambda_hi));
  float2 axis_lo = normalize(float2(lambda_lo - q11, q10)) * radii.x;
  float2 axis_hi = normalize(float2(q01, lambda_hi - q00)) * radii.y;

  // Store the axes as columns.
  return float2x2(axis_lo.x, axis_hi.x,
                  axis_lo.y, axis_hi.y);
}

// Computes a level-of-detail value from the UV Jacobian `uv_J`.
//
// Takes the longer of the two column vectors of `uv_J`, scales it by
// `filter_size`, and returns its log2 plus an offset of 2^filter_size.
float QueryLod(float2x2 uv_J, float filter_size) {
  float2 col0 = float2(uv_J[0][0], uv_J[1][0]);
  float2 col1 = float2(uv_J[0][1], uv_J[1][1]);
  float longest = max(length(col0), length(col1));
  return log2(longest * filter_size) + pow(2.0, filter_size);
}

// Inverse of a 2x2 matrix: the adjugate divided by the determinant.
// No guard is applied for a singular matrix (determinant == 0).
float2x2 inverse(float2x2 m) {
  float a = m[0][0];
  float b = m[0][1];
  float c = m[1][0];
  float d = m[1][1];

  float det = (a * d) - (b * c);
  float2x2 adjugate = float2x2(d, -b,
                               -c, a);
  return adjugate / det;
}

// Density of a zero-mean bivariate normal distribution with covariance `cov`,
// evaluated at offset `x`:
//   exp(-0.5 * x^T cov^-1 x) / (2 * pi * sqrt(det(cov)))
float normal(float2x2 cov, float2 x) {
  float mahalanobis_sq = dot(x, mul(inverse(cov), x));
  float normalization = sqrt(determinant(cov)) * 2.0 * PI;
  return exp(-.5 * mahalanobis_sq) / normalization;
}

// Integer hash that scrambles a pair of 32-bit values.
//
// Applies one multiply-add step to both lanes, then two identical mixing
// rounds: each lane is fed into the other through a multiply, followed by an
// xor with the lanes' own upper 16 bits. All arithmetic wraps modulo 2^32.
uint2 shuffle(uint2 v) {
  const uint kMul = 1664525u;
  const uint kAdd = 1013904223u;

  v = v * kMul + kAdd;

  [unroll]
  for (uint iter = 0u; iter < 2u; ++iter) {
    // Order matters: y is updated with the already-updated x.
    v.x += v.y * kMul;
    v.y += v.x * kMul;
    v ^= (v >> 16u);
  }
  return v;
}

// Hashes `v` with shuffle() and scales both 32-bit results by UINT_TO_UNIT
// to produce a pair of floats in the unit interval.
float2 rand(uint2 v) {
  uint2 hashed = shuffle(v);
  float2 as_float = float2(hashed);
  return as_float * UINT_TO_UNIT;
}

// Produces two pseudo-random floats keyed on the bit patterns of `x`, `y` and
// `l`, plus the integer stream index `i`.
//
// The bits of `x` have their 16-bit halves swapped, then are xor-ed with the
// bits of `y`, the bits of `l` (applied to both lanes) and `i * 0x124`. The
// combined seed is passed to rand().
float2 Rand2D(float2 x, float2 y, float l, uint i) {
  uint2 x_bits = asuint(x);
  uint2 y_bits = asuint(y);
  uint l_bits = asuint(l);

  uint2 x_swapped = (x_bits >> 16) | (x_bits << 16);
  uint2 seed = x_swapped ^ y_bits ^ l_bits ^ (i * 0x124u);

  // Author's note: the variant below is broken, but looks cool.
  //return hash22_fast(asfloat(seed));
  return rand(seed);
}

// Scalar variant of Rand2D(): returns only the first of the two values.
float Rand1D(float2 x, float2 y, float l, uint i) {
  float2 pair = Rand2D(x, y, l, i);
  return pair.x;
}

// Approximation of the error function using a truncated Bürmann series, see
// https://en.wikipedia.org/wiki/Error_function
//
//   erf(x) ~= 2/sqrt(pi) * sign(x) * sqrt(1 - e^(-x^2))
//             * (sqrt(pi)/2 + 31/200 * e^(-x^2) - 341/8000 * e^(-2x^2))
float erf(float x) {
  float gauss = exp(-x*x);
  float envelope = sign(x) * 2.0 * sqrt((1.0 - gauss) / PI);
  float series =
    sqrt(PI) * 0.5 + 31.0/200.0 * gauss - 341.0/8000.0 * gauss * gauss;
  return envelope * series;
}

// Cumulative distribution function of a 1D normal distribution with mean `mu`
// and standard deviation `sigma`, evaluated at `x`. Built on the approximate
// erf() above. `sigma` is not checked for zero.
float cdf(float x, float mu, float sigma) {
  float z = (x-mu)/(sigma*sqrt(2.0));
  return 0.5 + 0.5 * erf(z);
}

// Probability mass of a 1D normal distribution (mean `mu`, standard deviation
// `stdev`) over the interval [x - size, x + size], after clipping that
// interval to [lower_limit, upper_limit]. `size` is therefore a half-width.
float integrate_interval(float x, float size, float mu, float stdev,
    float lower_limit, float upper_limit) {
  float hi = min(x+size, upper_limit);
  float lo = max(x-size, lower_limit);
  float mass_below_hi = cdf(hi, mu, stdev);
  float mass_below_lo = cdf(lo, mu, stdev);
  return mass_below_hi - mass_below_lo;
}

// Probability mass of a 2D normal distribution over an axis-aligned box
// centered at `x` with half-extents `size`, clipped to
// [lower_limit, upper_limit].
//
// The two axes are treated as independent: only the diagonal entries of the
// covariance `sigma` are used (as per-axis variances) and the off-diagonal
// entries are ignored.
float integrate_box(float2 x, float2 size, float2 mu, float2x2 sigma,
    float2 lower_limit, float2 upper_limit) {
  float stdev_x = sqrt(sigma[0][0]);
  float stdev_y = sqrt(sigma[1][1]);

  float mass_x = integrate_interval(x.x, size.x, mu.x,
      stdev_x, lower_limit.x, upper_limit.x);
  float mass_y = integrate_interval(x.y, size.y, mu.y,
      stdev_y, lower_limit.y, upper_limit.y);
  return mass_x * mass_y;
}

// Returns the Gaussian mass (mean `x`, covariance `sigma`) inside the unit
// square minus the mass inside a box around the grid cell that contains `x`.
//
// The grid has `res` cells per axis. The subtracted box is centered on the
// cell center and has half-extent 1 / res, clipped to the unit square.
float compensation(float2 x, float2x2 sigma, float res) {
  const float2 unit_lo = float2(0.0, 0.0);
  const float2 unit_hi = float2(1.0, 1.0);

  // Box centered at 0.5 with half-extent 0.5, i.e. the whole unit square.
  float mass_in_unit_square = integrate_box(
      float2(0.5, 0.5), float2(0.5, 0.5), x, sigma, unit_lo, unit_hi);

  float2 cell_center = (floor(x * res) + 0.5) / res;
  float cell_half_extent = 1.0 / res;
  float mass_in_cell = integrate_box(
      cell_center, float2(cell_half_extent, cell_half_extent), x, sigma,
      unit_lo, unit_hi);

  return mass_in_unit_square - mass_in_cell;
}

// Maps a point in the unit square back to a unit-length microfacet normal
// squished by `alpha`; the reverse direction of ndf_to_disk_ggx().
//
// `v_disk` is remapped from [0, 1] to [-1, 1], lifted onto the hemisphere by
// inverting the Lambert projection (z = 1 - r^2, xy = p * sqrt(2 - r^2)),
// then its xy components are scaled by alpha and the result is normalized.
// The 1e-6 floors keep both square roots away from zero.
float3 disk_to_ndf_ggx(float2 v_disk, float alpha) {
  float2 centered = v_disk * 2.0f - 1.0f;
  float radius_sq = saturate(dot(centered, centered));

  float xy_scale = sqrt(max(1e-6f, 2.0f - radius_sq));
  float3 on_hemisphere = float3(centered * xy_scale, 1.0f - radius_sq);

  float alpha_sq = alpha * alpha;
  float xy_len_sq = dot(on_hemisphere.xy, on_hemisphere.xy);
  float z_sq = on_hemisphere.z * on_hemisphere.z;
  float len = sqrt(max(1e-6f, alpha_sq * xy_len_sq + z_sq));

  float3 squished = float3(alpha * on_hemisphere.xy, on_hemisphere.z);
  return squished / len;
}

// Algorithm 1 from Kemppinen et al., with local modifications that are
// marked as hacks below.
//
// Parameters:
//   h            - half vector; passed to ndf_to_disk_ggx() together with
//                  `alpha` to obtain the query position in the unit square
//                  and the Jacobian determinant `d`.
//   alpha        - roughness used for the disk mapping in both directions.
//   glint_alpha  - its square, multiplied by `d`, is the isotropic variance
//                  of the angular-domain Gaussian.
//   uv           - surface position used as the spatial query point.
//   uv_J         - 2x2 UV footprint matrix; drives both the LOD selection
//                  and the spatial-domain Gaussian covariance.
//   N            - glint count; sqrt(N) is the base grid resolution and each
//                  glint weight is divided by N.
//   filter_size  - scales the footprint for LOD selection and the covariance.
//   micro_normal - out: normalized result of disk_to_ndf_ggx() for the disk
//                  position of the highest-weighted glint found. Falls back to
//                  the query position itself if no glint has positive weight.
//
// Returns the accumulated filter value multiplied by d / PI.
float D_Kemppinen(float3 h, float alpha, float glint_alpha, float2 uv,
    float2x2 uv_J, float N, float filter_size, out float3 micro_normal) {
  float res = sqrt(N);
  float2 x_s = uv;
  float3 x_a_and_d = ndf_to_disk_ggx(h, alpha);
  float2 x_a = x_a_and_d.xy;
  float d = x_a_and_d.z;

  float lambda = QueryLod(res * uv_J, filter_size);

  // The two covariance matrices do not depend on the LOD level, so they are
  // computed once here instead of once per loop iteration.
  float2x2 uv_J2 = filter_size * uv_J;
  float2x2 sigma_s = mul(uv_J2, transpose(uv_J2));
  float2x2 sigma_a = d * pow(glint_alpha, 2) * float2x2(1, 0, 0, 1);

  float D_filter = 0;
  float best_weight = 0;
  float2 best_g_a = x_a;

  // Visit the two integer LOD levels that bracket `lambda`.
  [loop]
  for (float m = 0; m < 2; m += 1) {
    float l = floor(lambda) + m;

    // Linear blend weight of this level, and the grid resolutions of the
    // spatial (res_s) and angular (res_a) domains at this level.
    float w_lambda = 1.0 - abs(lambda - l);
    float res_s = res * pow(2, -l);
    float res_a = pow(2, l);

    // Centers (in cell units) of the cells containing the query point,
    // clamped to the first/last cell of each grid.
    float2 base_i_a = floor(x_a * res_a) + 0.5;
    float2 i_a = clamp(base_i_a, 0.5, res_a - 0.5);

    float2 base_i_s = floor(x_s * res_s) + 0.5;
    float2 i_s = clamp(base_i_s, 0.5, res_s - 0.5);

    // Pseudo-random glint position inside each cell, keyed on the cell pair
    // and the level.
    float2 g_s = (i_s + Rand2D(i_s, i_a, l, 1u) - .5) / res_s;
    float2 g_a = (i_a + Rand2D(i_s, i_a, l, 2u) - .5) / res_a;

    // Per-cell random threshold that switches the glint on or off as the
    // level weight changes, smoothed over a +/- 0.1 band.
    float r = Rand1D(i_s, i_a, l, 4u);
    float roulette = smoothstep(max(.0, r-.1), min(1.0, r+.1), w_lambda);

    float w = roulette * normal(sigma_a, x_a - g_a)
      * normal(sigma_s, x_s - g_s) / N;
    // This is hacky nonsense intended to improve the 1-sampling case. The
    // unmodified accumulation is simply `D_filter += w`.
    D_filter += w < 1 ? sqrt(w) * 2 : w;
    if (w > best_weight) {
      best_weight = w;
      best_g_a = g_a;
    }
    D_filter += w_lambda * compensation(x_a, sigma_a, res_a);
    // This is also hacked in.
    D_filter += w_lambda * compensation(x_s, sigma_s, res_s);
  }

  micro_normal = normalize(disk_to_ndf_ggx(best_g_a, alpha));
  return D_filter * d / PI;
}

#if defined(_GLITTER)
// Glitter terms produced by GetGlitterLighting() for use by lighting code.
struct LightGlitter {
  // D_Kemppinen() result for the caller-supplied direct half vector.
  float direct_D;

  // D_Kemppinen() result for the half vector between V and indirect_dir.
  float indirect_D;
  // dot(normal, indirect_dir), floored at 1e-4.
  float indirect_NoL;
  // dot(indirect_dir, indirect half vector), floored at 1e-4.
  float indirect_LoH;
};

// Glitter data getter to be run from lighting code.
//
// Evaluates D_Kemppinen() twice: once for the supplied direct half vector and
// once for the half vector between `V` and `indirect_dir`. It also returns
// clamped N.L and L.H terms for the indirect direction.
//
// Uses ddx()/ddy() on `uv`, so it must run where screen-space derivatives
// are available.
//
// glitter_amount maps exponentially to the glint count:
//   N = 8e5 * 10^(6 * glitter_amount - 2)
// so 0 gives 8e3 and 1 gives 8e9.
LightGlitter GetGlitterLighting(
    float glitter_amount, float glitter_roughness, float2 uv, float3x3 tbn, float roughness,
    float3 normal, float3 V, float3 direct_H, float3 indirect_dir) {
  const float glitter_filter_size = 0.7f;

  // UV footprint from the screen-space derivatives of uv.
  float2x2 uv_derivs = transpose(float2x2(ddx(uv), ddy(uv)));
  float2x2 uv_J = uv_ellipsoid(uv_derivs);
  float N = 8.0e5f * pow(10.0f, glitter_amount * 6.0f - 2.0f);

  float3x3 tbn_t = transpose(tbn);
  float3 indirect_H = normalize(V + indirect_dir);

  // Both half vectors are multiplied by the transposed TBN before evaluation.
  float3 direct_H_tangent = mul(direct_H, tbn_t);
  float3 indirect_H_tangent = mul(indirect_H, tbn_t);

  // D_Kemppinen requires an out parameter; its value is not used here.
  float3 discarded_micro_normal;

  LightGlitter g;

  // Direct
  g.direct_D = D_Kemppinen(direct_H_tangent, roughness, glitter_roughness,
      uv, uv_J, N, glitter_filter_size, discarded_micro_normal);

  // Indirect
  g.indirect_D = D_Kemppinen(indirect_H_tangent, roughness, glitter_roughness,
      uv, uv_J, N, glitter_filter_size, discarded_micro_normal);
  g.indirect_NoL = max(1e-4, dot(normal, indirect_dir));
  g.indirect_LoH = max(1e-4, dot(indirect_dir, indirect_H));

  return g;
}
#endif  // _GLITTER

#endif  // __GLITTER_INC