#!/usr/bin/env python3

# To run with uv:
# uv run -w OpenEXR -w numba ./make_dfg_lut.py

import argparse
import math
import numpy as np
import OpenEXR
import Imath
import numba
import random
import concurrent.futures
import os
from functools import partial


@numba.njit(cache=True)
def rcp(a):
    """Return the reciprocal of ``a``, i.e. ``1.0 / a``."""
    reciprocal = 1.0 / a
    return reciprocal


@numba.njit(cache=True)
def lerp(a, b, t):
    """Blend linearly between ``a`` and ``b``: returns ``a + (b - a) * t``."""
    span = b - a
    return a + span * t


@numba.njit(cache=True)
def saturate(a):
    """Clamp ``a`` to the closed interval [0, 1]."""
    if a < 0.0:
        clamped = 0.0
    elif a > 1.0:
        clamped = 1.0
    else:
        # Already inside the range: pass the value through untouched.
        clamped = a
    return clamped


# Standard BRDF components.
@numba.njit(cache=True)
def F_Schlick(LoH, f0, f90=1.0):
    """Schlick Fresnel: move from ``f0`` toward ``f90`` by ``(1 - LoH)^5``.

    Args:
        LoH: cosine term fed into the fifth-power falloff.
        f0: value returned when ``LoH`` is 1.
        f90: value returned when ``LoH`` is 0 (defaults to 1.0).
    """
    grazing = 1.0 - LoH
    grazing_sq = grazing * grazing
    # (1 - LoH)^5 built from two squarings and one multiply.
    weight = grazing_sq * grazing_sq * grazing
    return f0 + (f90 - f0) * weight


@numba.njit(cache=True)
def D_GGX(roughness, NoH):
    """Evaluate the GGX (Trowbridge-Reitz) normal distribution function.

    Computes ``a^2 / (pi * ((NoH^2 * (a^2 - 1) + 1)^2 + 1e-6))`` with
    ``a = roughness``.

    Args:
        roughness: GGX alpha; it is squared inside this function.
        NoH: cosine between the surface normal and the half vector.

    Returns:
        The normalized distribution value. The 1/pi factor is required for
        ``D * NoH`` to be the probability density of GGX half-vector
        sampling, which is how this function is used when reweighting
        samples for another lobe.
    """
    r2 = roughness * roughness
    # Algebraically identical to NoH^4 * (r2 + 1/NoH^2 - 1)^2, but written
    # without the division by NoH^2 so NoH == 0 cannot divide by zero.
    base = NoH * NoH * (r2 - 1.0) + 1.0
    # The epsilon keeps the result finite when base collapses toward zero.
    return r2 / (math.pi * (base * base + 1e-6))


@numba.njit(cache=True)
def G_GGXSmith(roughness, NoL, NoV):
    """Approximate Smith shadowing-masking term for GGX.

    Returns ``1 / (2 * lerp(2 * NoL * NoV, NoL + NoV, roughness) + 1e-6)``.
    NOTE(review): this has the shape of a visibility term (the
    1 / (4 * NoL * NoV) factor appears to be folded in) - confirm against
    the shader that consumes the LUT.
    """
    blended = lerp(2.0 * NoL * NoV, NoL + NoV, roughness)
    # The epsilon guards against a zero denominator at grazing angles.
    return 1.0 / (2.0 * blended + 1e-6)


# Cloth BRDF components.
@numba.njit(cache=True)
def D_Cloth(roughness, NoH):
    """Cloth normal distribution: ``(2 + 1/r) * sin^(1/r)(theta_h) / (2 * pi)``.

    Args:
        roughness: distribution roughness ``r``; values below 1e-4 yield 0.
        NoH: cosine between the surface normal and the half vector.
    """
    if roughness < 1e-4:
        return 0.0

    inv_roughness = rcp(roughness)
    sin_sq = 1.0 - NoH * NoH
    # sin^(1/r) expressed through sin^2, hence the halved exponent.
    lobe = pow(sin_sq, inv_roughness * 0.5)
    return (2.0 + inv_roughness) * lobe / (2.0 * math.pi)


@numba.njit(cache=True)
def G_Cloth_L(x, a, b, c, d, e):
    """Evaluate the fitted curve ``a / (1 + b * x^c) + d * x + e``."""
    rational = a / (1.0 + b * pow(x, c))
    return rational + d * x + e


@numba.njit(cache=True)
def Lambda_Cloth_Raw(roughness, cos_theta):
    """Cloth Lambda term obtained by exponentiating a fitted curve.

    Two coefficient sets are evaluated with ``G_Cloth_L`` and their results
    are blended by ``1 - (1 - roughness)^2``. Below ``cos_theta = 0.5`` the
    curve is used directly; from 0.5 upward it is continued as
    ``2 * L(0.5) - L(1 - cos_theta)``.
    """
    # Coefficients selected when the blend weight is 0 (roughness 0) ...
    lo_a, lo_b, lo_c, lo_d, lo_e = 25.3245, 3.32435, 0.16801, -1.27393, -4.85967
    # ... and when the blend weight is 1 (roughness 1).
    hi_a, hi_b, hi_c, hi_d, hi_e = 21.5473, 3.82987, 0.19823, -1.97760, -4.32054

    smoothness = 1.0 - roughness
    blend = 1.0 - smoothness * smoothness

    if cos_theta < 0.5:
        direct = lerp(
            G_Cloth_L(cos_theta, lo_a, lo_b, lo_c, lo_d, lo_e),
            G_Cloth_L(cos_theta, hi_a, hi_b, hi_c, hi_d, hi_e),
            blend,
        )
        return math.exp(direct)

    # Curve value at the 0.5 switch point.
    at_half = lerp(
        G_Cloth_L(0.5, lo_a, lo_b, lo_c, lo_d, lo_e),
        G_Cloth_L(0.5, hi_a, hi_b, hi_c, hi_d, hi_e),
        blend,
    )

    mirrored = 1.0 - cos_theta
    at_mirrored = lerp(
        G_Cloth_L(mirrored, lo_a, lo_b, lo_c, lo_d, lo_e),
        G_Cloth_L(mirrored, hi_a, hi_b, hi_c, hi_d, hi_e),
        blend,
    )

    return math.exp(2.0 * at_half - at_mirrored)


@numba.njit(cache=True)
def Lambda_Cloth_Softened(roughness, cos_theta):
    """Raise the raw cloth Lambda to ``1 + 2 * (1 - cos_theta)^8``.

    The exponent is 1 at ``cos_theta = 1`` and grows to 3 at
    ``cos_theta = 0``.
    """
    base = Lambda_Cloth_Raw(roughness, cos_theta)
    exponent = 1.0 + 2.0 * pow(1.0 - cos_theta, 8.0)
    return pow(base, exponent)


@numba.njit(cache=True)
def V_Cloth_Outgoing(roughness, NoL, NoV):
    """Cloth visibility term: height-correlated Smith G2 / (4 * NoL * NoV).

    The softened Lambda is applied to ``NoL`` and the raw Lambda to ``NoV``.
    """
    shadowing = 1.0 + Lambda_Cloth_Softened(roughness, NoL) + Lambda_Cloth_Raw(roughness, NoV)
    # The epsilon keeps the reciprocal finite when NoL or NoV reaches zero.
    denominator = shadowing * 4.0 * NoL * NoV + 1e-6
    return 1.0 / denominator


@numba.njit(cache=True)
def V_Cloth_Incoming(roughness, NoL, NoV):
    """Cloth visibility term ``1 / ((1 + lambda_l + lambda_v) * 4 * NoL * NoV)``.

    The softened Lambda is applied to the ``NoL`` parameter and the raw
    Lambda to the ``NoV`` parameter; which physical direction each one
    represents is decided by the argument order at the call site.
    """
    softened = Lambda_Cloth_Softened(roughness, NoL)
    raw = Lambda_Cloth_Raw(roughness, NoV)
    # The epsilon keeps the reciprocal finite when either cosine is zero.
    return rcp((1.0 + softened + raw) * 4.0 * NoL * NoV + 1e-6)


@numba.njit(cache=True)
def integrate_brdf_jitted(roughness, NoV, num_samples):
    """Monte Carlo integrate the GGX and cloth terms for one LUT entry.

    The view vector lies in the XZ plane of a frame whose normal is +Z.
    Half vectors are importance sampled from the GGX distribution; the
    standard and the cloth estimators both reuse those samples.

    Args:
        roughness: GGX alpha used for sampling and for every BRDF term.
        NoV: cosine between the normal and the view vector.
        num_samples: number of random sample pairs drawn. Rejected samples
            still count toward the average.

    Returns:
        ``(std_scale, std_bias, cloth_out, cloth_in)``, each divided by
        ``num_samples`` (R: GGX scale, G: GGX bias, B: cloth outgoing
        albedo, A: cloth incoming albedo).
    """
    V_x = math.sqrt(1.0 - NoV * NoV)
    V_y = 0.0
    V_z = NoV

    # Loop invariants, computed once instead of per sample.
    a2_minus_one = roughness * roughness - 1.0
    two_pi = 2.0 * math.pi
    NoV_proxy = saturate(V_z)
    cloth_enabled = roughness >= 1e-4

    std_scale, std_bias, cloth_out, cloth_in = 0.0, 0.0, 0.0, 0.0

    for _ in range(num_samples):
        # Draw order matters for reproducing the same random stream.
        e1 = random.random()
        e2 = random.random()

        # Importance sample a GGX half vector.
        phi = two_pi * e1
        cos_theta = math.sqrt((1.0 - e2) / (1.0 + a2_minus_one * e2))
        sin_theta = math.sqrt(1.0 - cos_theta * cos_theta)

        H_x = math.cos(phi) * sin_theta
        H_y = math.sin(phi) * sin_theta
        H_z = cos_theta

        VoH = H_x * V_x + H_y * V_y + H_z * V_z
        if VoH <= 0:
            continue

        # Only the z component of the reflected direction is needed.
        L_z = 2.0 * VoH * H_z - V_z

        NoL = saturate(L_z)
        if not (NoL > 0):
            continue
        NoH = saturate(H_z)

        # --- Standard BRDF ---
        # D cancels between numerator and PDF.
        G = G_GGXSmith(roughness, NoL, NoV_proxy)
        Fc_term = pow(1.0 - VoH, 5.0)

        # PDF = D_GGX * NoH / (4 * VoH), so (D * G * NoL) / PDF simplifies to:
        common_term = (G * NoL * 4.0 * VoH) / max(NoH, 1e-5)

        std_scale += common_term * (1.0 - Fc_term)
        std_bias += common_term * Fc_term

        # --- Cloth BRDF ---
        # Same GGX importance samples, reweighted for cloth D and V.
        if cloth_enabled:
            D_c = D_Cloth(roughness, NoH)
            pdf_ggx = D_GGX(roughness, NoH) * NoH / (4.0 * VoH + 1e-6)
            V_out = V_Cloth_Outgoing(roughness, NoL, NoV_proxy)
            # Arguments are swapped on purpose: the fixed direction takes
            # the NoL slot for the incoming-albedo estimate.
            V_in = V_Cloth_Incoming(roughness, NoV_proxy, NoL)
            cloth_out += (D_c * V_out * NoL) / (pdf_ggx + 1e-6)
            cloth_in += (D_c * V_in * NoL) / (pdf_ggx + 1e-6)

    inv_n = 1.0 / num_samples
    return std_scale * inv_n, std_bias * inv_n, cloth_out * inv_n, cloth_in * inv_n


def calculate_pixel(coords, resolution, num_samples):
    """Compute the four LUT channels for a single pixel.

    Args:
        coords: ``(x, y)`` pixel indices. ``x`` selects NoV and ``y``
            selects perceptual roughness, both sampled at pixel centers.
        resolution: side length of the square LUT.
        num_samples: sample count forwarded to the integrator.

    Returns:
        ``(x, y, std_scale, std_bias, cloth_out, cloth_in)``.
    """
    x, y = coords
    NoV = saturate((x + 0.5) / resolution)

    # Near-zero NoV is not integrated; the pixel is emitted as all zeros.
    if NoV < 1e-4:
        return x, y, 0.0, 0.0, 0.0, 0.0

    perceptual_roughness = saturate((y + 0.5) / resolution)
    # Square the perceptual value and keep it away from zero.
    roughness = max(perceptual_roughness * perceptual_roughness, 1e-4)

    # R: GGX scale, G: GGX bias, B: cloth outgoing albedo, A: cloth incoming albedo
    red, green, blue, alpha = integrate_brdf_jitted(roughness, NoV, num_samples)
    return x, y, red, green, blue, alpha


def generate_exr(resolution, output_filename, num_samples, num_workers):
    """Compute the LUT in worker processes and write it as a float EXR.

    Every pixel is submitted as its own task. A pixel whose task raises is
    reported and left at zero; processing continues with the rest.

    Args:
        resolution: side length of the square image.
        output_filename: path of the EXR file to create.
        num_samples: samples per pixel passed to ``calculate_pixel``.
        num_workers: ``max_workers`` for the process pool.

    Raises:
        RuntimeError: if the EXR cannot be written; the underlying error is
            chained as ``__cause__``.
    """
    print(f"Generating {resolution}x{resolution} EXR '{output_filename}' (R=GGX scale, G=GGX bias, B=cloth out, A=cloth in) ({num_samples} samples/pixel) using {num_workers} workers.")
    header = OpenEXR.Header(resolution, resolution)
    pt = Imath.PixelType(Imath.PixelType.FLOAT)
    header['channels'] = {
        'R': Imath.Channel(pt),
        'G': Imath.Channel(pt),
        'B': Imath.Channel(pt),
        'A': Imath.Channel(pt),
    }

    pixel_data = np.zeros((resolution, resolution, 4), dtype=np.float32)

    coords_to_process = [(x, y) for y in range(resolution) for x in range(resolution)]
    worker_func = partial(calculate_pixel, resolution=resolution, num_samples=num_samples)

    processed_count = 0
    failed_count = 0
    total_pixels = len(coords_to_process)
    print("Starting pixel processing...")

    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(worker_func, coord): coord for coord in coords_to_process}

        for future in concurrent.futures.as_completed(futures):
            try:
                x, y, r, g, b, a = future.result()
            except Exception as exc:
                # Best effort: report the pixel and keep going.
                coord = futures[future]
                failed_count += 1
                print(f'\nPixel at {coord} generated an exception: {exc}')
            else:
                pixel_data[y, x] = (r, g, b, a)

            processed_count += 1
            print(f"  ...processed {processed_count}/{total_pixels} pixels ({processed_count/total_pixels:.1%})", end='\r')

    if failed_count:
        # Make it visible that the output contains zero-filled pixels.
        print(f"\nWarning: {failed_count} pixel(s) failed and were left at zero.")

    print(f"\nProcessing complete. Writing to {output_filename}...")
    try:
        # Vertically flip to match UV coordinates (0,0 at bottom-left).
        pixel_data = np.flipud(pixel_data)

        exr_file = OpenEXR.OutputFile(output_filename, header)
        try:
            # ravel() copies each channel plane into contiguous row-major order.
            channel_bytes = {
                name: pixel_data[:, :, index].ravel().tobytes()
                for index, name in enumerate('RGBA')
            }
            exr_file.writePixels(channel_bytes)
        finally:
            # Release the file handle even if writing fails part-way.
            exr_file.close()
        print(f"Successfully generated {output_filename}")
    except Exception as e:
        raise RuntimeError(f"Failed to write EXR file '{output_filename}': {e}") from e

def main():
    """Parse command-line options and generate the packed DFG LUT.

    Returns:
        Exit status: 0 on success, 1 when an option is invalid or
        generation fails.
    """
    parser = argparse.ArgumentParser(description='Generate packed DFG LUT (R=GGX scale, G=GGX bias, B=cloth out, A=cloth in).')
    parser.add_argument('-r', '--resolution', type=int, default=512,
                        help='Resolution of the square EXR image (default: 512)')
    parser.add_argument('-s', '--samples', type=int, default=8192,
                        help='Number of samples per pixel for integration (default: 8192)')
    parser.add_argument('-o', '--output', default='dfg.exr',
                        help='Output filename (default: dfg.exr)')
    parser.add_argument('-j', '--workers', type=int, default=os.cpu_count(),
                        help=f'Number of worker processes (default: {os.cpu_count()})')

    args = parser.parse_args()

    if args.resolution <= 0:
        print("Error: Resolution must be a positive integer")
        return 1

    # The integrator divides by the sample count, so zero would fail for
    # every pixel; reject it once here instead.
    if args.samples <= 0:
        print("Error: Samples must be a positive integer")
        return 1

    # os.cpu_count() may return None, in which case the executor picks its
    # own default; only an explicit non-positive value is an error.
    if args.workers is not None and args.workers <= 0:
        print("Error: Workers must be a positive integer")
        return 1

    try:
        generate_exr(args.resolution, args.output, args.samples, args.workers)
    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0

if __name__ == '__main__':
    # exit() is a helper injected by the site module for interactive use and
    # is not guaranteed to exist; SystemExit carries the status reliably.
    raise SystemExit(main())