summaryrefslogtreecommitdiffstats
path: root/gpu_fft.cc
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2025-07-27 23:16:51 -0700
committeryum <yum.food.vr@gmail.com>2025-07-27 23:16:51 -0700
commitf8a7d3fda6abef2923b0fa8f7021b9d0646ed8d5 (patch)
tree017b0f7504064327928d62998cf5e68194be7e70 /gpu_fft.cc
parentd90ab65bfc740ea7657ac8fca201bdfa8ecc7a22 (diff)
Fix twiddle factors in shader
Diffstat (limited to 'gpu_fft.cc')
-rw-r--r--gpu_fft.cc245
1 files changed, 199 insertions, 46 deletions
diff --git a/gpu_fft.cc b/gpu_fft.cc
index f5dd97e..5b3f4ab 100644
--- a/gpu_fft.cc
+++ b/gpu_fft.cc
@@ -15,6 +15,13 @@
#include <random>
#include <vector>
+struct float2 {
+ float x, y;
+};
+#define GPU_FFT_RADIX16
+#define GPU_FFT_RADIX16_N256
+#include "fft_twiddle_tables.cginc"
+
// This is a reference Cooley-Tukey FFT implementation. It's just here to check
// for correctness.
std::vector<std::complex<float>> fft1d_naive(
@@ -110,8 +117,32 @@ unsigned int reverse_digits(unsigned int n, unsigned int num_digits, unsigned in
// Compute twiddle factor W_N^k = exp(-2*pi*i*k/N) for forward FFT
// or exp(+2*pi*i*k/N) for inverse FFT
gpu_complex twiddle_factor(int k, int N, bool inverse = false) {
- float angle = (inverse ? 2.0f : -2.0f) * std::numbers::pi * k / N;
- return {std::cos(angle), std::sin(angle)};
+ // Use double precision for angle computation, then cast to float
+ // This matches how the precomputed tables were generated
+ double angle = (inverse ? 2.0 : -2.0) * std::numbers::pi * k / N;
+ return {(float)std::cos(angle), (float)std::sin(angle)};
+}
+
+// Helper to compute radix^power using integer arithmetic
+int int_pow(int base, int exp) {
+ int result = 1;
+ for (int i = 0; i < exp; ++i) {
+ result *= base;
+ }
+ return result;
+}
+
+// Apply 2D bit reversal to the output
+void apply_2d_bit_reversal(const int n, const int radix, const stage_texture& in, stage_texture& out) {
+ const int num_digits = std::log2(n) / std::log2(radix);
+
+ for (int y = 0; y < n; ++y) {
+ for (int x = 0; x < n; ++x) {
+ const int rev_x = reverse_digits(x, num_digits, radix);
+ const int rev_y = reverse_digits(y, num_digits, radix);
+ out[y][x] = in[rev_y][rev_x];
+ }
+ }
}
// Main shader function - simplified for GPU/HLSL conversion
@@ -183,15 +214,42 @@ void evaluate_stage(
}
}
-// Apply 2D bit reversal to the output
-void apply_2d_bit_reversal(const int n, const int radix, const stage_texture& in, stage_texture& out) {
- const int num_digits = std::log2(n) / std::log2(radix);
+// Evaluate all stages - unified function
+void evaluate_stages(
+ const int n,
+ const int radix,
+ const bool inverse,
+ std::vector<stage_texture>& textures,
+ const std::vector<std::vector<gpu_complex>>& dft_matrix,
+ const std::vector<std::vector<gpu_complex>>& stage_twiddles_array) {
- for (int y = 0; y < n; ++y) {
- for (int x = 0; x < n; ++x) {
- const int rev_x = reverse_digits(x, num_digits, radix);
- const int rev_y = reverse_digits(y, num_digits, radix);
- out[y][x] = in[rev_y][rev_x];
+ const int num_stages_per_dim = std::log2(n) / std::log2(radix);
+ const int num_stages = num_stages_per_dim * 2;
+
+ for (int stage = 0; stage < num_stages; ++stage) {
+ int current_stage = (stage < num_stages_per_dim) ? stage : (stage - num_stages_per_dim);
+ int span = n / int_pow(radix, current_stage + 1);
+ int butterfly_size = span * radix;
+
+ const std::vector<gpu_complex>& stage_twiddles = stage_twiddles_array[current_stage];
+
+ ShaderUniforms uniforms = {n, radix, stage, num_stages_per_dim, span, butterfly_size,
+ inverse, dft_matrix, stage_twiddles};
+ evaluate_stage(uniforms, textures[stage], textures[stage+1]);
+ }
+
+ // Apply bit reversal once at the end
+ stage_texture temp = textures[num_stages];
+ apply_2d_bit_reversal(n, radix, temp, textures[num_stages]);
+
+ // For inverse FFT, normalize by 1/(n*n)
+ if (inverse) {
+ float norm_factor = 1.0f / (n * n);
+ for (int y = 0; y < n; ++y) {
+ for (int x = 0; x < n; ++x) {
+ textures[num_stages][y][x].first *= norm_factor;
+ textures[num_stages][y][x].second *= norm_factor;
+ }
}
}
}
@@ -208,15 +266,6 @@ std::vector<std::vector<gpu_complex>> compute_twiddle_factors(int radix, bool in
return twiddle_factors;
}
-// Helper to compute radix^power using integer arithmetic
-int int_pow(int base, int exp) {
- int result = 1;
- for (int i = 0; i < exp; ++i) {
- result *= base;
- }
- return result;
-}
-
// Precompute stage twiddle factors
std::vector<gpu_complex> compute_stage_twiddles(int butterfly_size, bool inverse = false) {
std::vector<gpu_complex> stage_twiddles(butterfly_size);
@@ -226,46 +275,142 @@ std::vector<gpu_complex> compute_stage_twiddles(int butterfly_size, bool inverse
return stage_twiddles;
}
-// Evaluate all stages.
-void evaluate_stages(
+// Convert float2 arrays to gpu_complex vectors
+std::vector<std::vector<gpu_complex>> convert_dft_matrix(bool inverse = false) {
+ std::vector<std::vector<gpu_complex>> result(16, std::vector<gpu_complex>(16));
+ for (int i = 0; i < 16; ++i) {
+ for (int j = 0; j < 16; ++j) {
+ if (inverse) {
+ result[i][j] = {IDFT_MATRIX[i][j].x, IDFT_MATRIX[i][j].y};
+ } else {
+ result[i][j] = {DFT_MATRIX[i][j].x, DFT_MATRIX[i][j].y};
+ }
+ }
+ }
+ return result;
+}
+
+std::vector<gpu_complex> convert_stage_twiddles(int stage, bool inverse = false) {
+ if (stage == 0) { // butterfly_size = 256
+ std::vector<gpu_complex> result(256);
+ for (int i = 0; i < 256; ++i) {
+ if (inverse) {
+ result[i] = {STAGE0_TWIDDLES_INV[i].x, STAGE0_TWIDDLES_INV[i].y};
+ } else {
+ result[i] = {STAGE0_TWIDDLES[i].x, STAGE0_TWIDDLES[i].y};
+ }
+ }
+ return result;
+ } else { // stage == 1, butterfly_size = 16
+ std::vector<gpu_complex> result(16);
+ for (int i = 0; i < 16; ++i) {
+ if (inverse) {
+ result[i] = {STAGE1_TWIDDLES_INV[i].x, STAGE1_TWIDDLES_INV[i].y};
+ } else {
+ result[i] = {STAGE1_TWIDDLES[i].x, STAGE1_TWIDDLES[i].y};
+ }
+ }
+ return result;
+ }
+}
+
+// Wrapper for computed twiddles
+void evaluate_stages_computed(
const int n,
const int radix,
const bool inverse,
std::vector<stage_texture>& textures) {
- const std::vector<std::vector<gpu_complex>> twiddle_factors =
+
+ const std::vector<std::vector<gpu_complex>> dft_matrix =
compute_twiddle_factors(radix, inverse);
+ // Precompute all stage twiddles
const int num_stages_per_dim = std::log2(n) / std::log2(radix);
- const int num_stages = num_stages_per_dim * 2;
+ std::vector<std::vector<gpu_complex>> stage_twiddles_array(num_stages_per_dim);
- for (int stage = 0; stage < num_stages; ++stage) {
- // Compute span and butterfly_size for this stage
- int current_stage = (stage < num_stages_per_dim) ? stage : (stage - num_stages_per_dim);
- int span = n / int_pow(radix, current_stage + 1);
+ for (int stage = 0; stage < num_stages_per_dim; ++stage) {
+ int span = n / int_pow(radix, stage + 1);
int butterfly_size = span * radix;
+ stage_twiddles_array[stage] = compute_stage_twiddles(butterfly_size, inverse);
+ }
- // Precompute stage twiddle factors
- std::vector<gpu_complex> stage_twiddles = compute_stage_twiddles(butterfly_size, inverse);
+ evaluate_stages(n, radix, inverse, textures, dft_matrix, stage_twiddles_array);
+}
- ShaderUniforms uniforms = {n, radix, stage, num_stages_per_dim, span, butterfly_size,
- inverse, twiddle_factors, stage_twiddles};
- evaluate_stage(uniforms, textures[stage], textures[stage+1]);
+// Wrapper for precomputed twiddles
+void evaluate_stages_precomputed(
+ const int n,
+ const int radix,
+ const bool inverse,
+ std::vector<stage_texture>& textures,
+ const std::vector<std::vector<gpu_complex>>& precomputed_dft_matrix) {
+
+ // Convert precomputed stage twiddles
+ const int num_stages_per_dim = std::log2(n) / std::log2(radix);
+ std::vector<std::vector<gpu_complex>> stage_twiddles_array(num_stages_per_dim);
+
+ for (int stage = 0; stage < num_stages_per_dim; ++stage) {
+ stage_twiddles_array[stage] = convert_stage_twiddles(stage, inverse);
}
- // Apply bit reversal once at the end
- stage_texture temp = textures[num_stages];
- apply_2d_bit_reversal(n, radix, temp, textures[num_stages]);
+ evaluate_stages(n, radix, inverse, textures, precomputed_dft_matrix, stage_twiddles_array);
+}
- // For inverse FFT, normalize by 1/(n*n)
- if (inverse) {
- float norm_factor = 1.0f / (n * n);
- for (int y = 0; y < n; ++y) {
- for (int x = 0; x < n; ++x) {
- textures[num_stages][y][x].first *= norm_factor;
- textures[num_stages][y][x].second *= norm_factor;
+// Verify FFT results match between computed and precomputed tables
+bool verify_fft_with_tables(std::mt19937& rng) {
+ const int n = 256;
+ const int radix = 16;
+ const int NUM_STAGES = (std::log2(n) / std::log2(radix)) * 2;
+
+ // Initialize test data
+ const std::vector<std::vector<gpu_complex>> black_texture(n,
+ std::vector<gpu_complex>(n, {0, 0}));
+ std::vector<stage_texture> textures_computed(NUM_STAGES + 1, black_texture);
+ std::vector<stage_texture> textures_precomputed(NUM_STAGES + 1, black_texture);
+
+ // Fill with identical random data
+ std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+ for (int y = 0; y < n; ++y) {
+ for (int x = 0; x < n; ++x) {
+ float real = dist(rng);
+ float imag = dist(rng);
+ textures_computed[0][y][x] = {real, imag};
+ textures_precomputed[0][y][x] = {real, imag};
+ }
+ }
+
+ // Run FFT with computed tables
+ evaluate_stages_computed(n, radix, false, textures_computed);
+
+ // Run FFT with precomputed tables from cginc
+ auto dft_matrix = convert_dft_matrix(false);
+ evaluate_stages_precomputed(n, radix, false, textures_precomputed, dft_matrix);
+
+ // Compare results
+ float max_error = 0.0f;
+ int mismatch_count = 0;
+ for (int y = 0; y < n; ++y) {
+ for (int x = 0; x < n; ++x) {
+ float err_real = std::abs(textures_computed[NUM_STAGES][y][x].first -
+ textures_precomputed[NUM_STAGES][y][x].first);
+ float err_imag = std::abs(textures_computed[NUM_STAGES][y][x].second -
+ textures_precomputed[NUM_STAGES][y][x].second);
+ max_error = std::max(max_error, std::max(err_real, err_imag));
+ if (err_real > 1e-6f || err_imag > 1e-6f) {
+ mismatch_count++;
}
}
}
+
+ std::cout << "FFT max error between computed and precomputed tables: "
+ << std::scientific << max_error << std::fixed << std::endl;
+
+ if (mismatch_count > 0) {
+ std::cout << "ERROR: " << mismatch_count << " pixels have error > 1e-6" << std::endl;
+ return false;
+ }
+
+ return true;
}
bool check_result(
@@ -361,7 +506,7 @@ bool evaluateAlgorithm(const int n, const int radix, const bool inverse, std::mt
// Evaluate the GPU algorithm.
auto start = std::chrono::high_resolution_clock::now();
- evaluate_stages(n, radix, inverse, textures);
+ evaluate_stages_computed(n, radix, inverse, textures);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
@@ -418,13 +563,13 @@ bool testFFTInverse(const int n, const int radix, std::mt19937& rng) {
stage_texture original_input = textures_fft[0];
// Perform forward FFT
- evaluate_stages(n, radix, false, textures_fft);
+ evaluate_stages_computed(n, radix, false, textures_fft);
// Copy FFT result as input to inverse FFT
textures_ifft[0] = textures_fft[NUM_STAGES];
// Perform inverse FFT
- evaluate_stages(n, radix, true, textures_ifft);
+ evaluate_stages_computed(n, radix, true, textures_ifft);
// Check if inverse FFT gives back the original input
float max_error = 0.0f;
@@ -436,7 +581,7 @@ bool testFFTInverse(const int n, const int radix, std::mt19937& rng) {
}
}
- const float epsilon = 1e-3; // Tolerance for round-trip error
+ const float epsilon = 1e-5; // Tolerance for round-trip error
bool success = (max_error < epsilon);
if (!success) {
@@ -465,6 +610,14 @@ bool testFFTInverse(const int n, const int radix, std::mt19937& rng) {
int main() {
std::mt19937 rng(std::random_device{}());
+ // Verify FFT results match with precomputed tables
+ std::cout << "Verifying FFT with precomputed tables from fft_twiddle_tables.cginc..." << std::endl;
+ if (!verify_fft_with_tables(rng)) {
+ std::cout << "ERROR: FFT results do not match between computed and precomputed tables!" << std::endl;
+ return 1;
+ }
+ std::cout << "FFT verification passed!\n" << std::endl;
+
// First run the original forward FFT tests
std::cout << "Testing forward FFT correctness against reference implementation..." << std::endl;
for (int log_radix = 1; log_radix < 5; ++log_radix) {