Add switchable optimization config and benchmark framework

WaferConfig: unified config controlling all optimizations individually.
ForthVM::new_with_config(config) to create VMs with custom optimization settings.

All 8 switchable optimizations: peephole, constant_fold, strength_reduce, dce, tail_call, inline (IR passes) + stack_to_local_promotion (codegen).

Benchmark framework (crates/core/tests/benchmark_report.rs):
- 7 Forth benchmarks: Fibonacci, Factorial, SumRecurse, NestedLoops, GCD, MemFill, Collatz
- Correctness verification across all configs (runs in CI)
- Full report with 128 optimization combinations (cargo test --ignored)
- Measures execution time, compilation time, WASM module bytes
- CONSOLIDATE impact comparison

Key findings from benchmark report:
- Inlining: -77% exec time on Fibonacci, -92% on Collatz
- Stack-to-local promotion: -5.5% WASM module size
- CONSOLIDATE: -72% exec time on Fibonacci (call_indirect -> direct call)
- All optimizations combined: best overall performance
2026-04-02 12:24:57 +02:00
parent 759142ea75
commit dea3a32c33
6 changed files with 614 additions and 15 deletions
@@ -0,0 +1,496 @@
+//! Optimization benchmark and verification framework for WAFER.
+//!
+//! Run correctness tests:   `cargo test -p wafer-core --test benchmark_report`
+//! Run full benchmark:      `cargo test -p wafer-core --test benchmark_report -- --nocapture --ignored`
+
+use std::time::Instant;
+use wafer_core::config::WaferConfig;
+use wafer_core::outer::ForthVM;
+
+// -----------------------------------------------------------------------
+// Benchmark definitions
+// -----------------------------------------------------------------------
+
+/// One Forth micro-benchmark: its word definitions plus the snippets used to time and
+/// verify it.
+struct Benchmark {
+    /// Label printed in the report; the combination matrix also selects its subset of
+    /// benchmarks by substring match on this label.
+    name: &'static str,
+    /// Forth source evaluated line by line on a fresh VM before running or verifying.
+    define: &'static str,
+    /// Forth snippet that is timed: evaluated once as a warm-up, then `iterations` times.
+    run: &'static str,
+    /// Forth snippet evaluated by the correctness test.
+    verify: &'static str,
+    /// Data stack contents expected after evaluating `verify`.
+    expected: Vec<i32>,
+    /// Number of timed evaluations of `run`.
+    iterations: u32,
+}
+
+/// Returns the benchmark suite shared by the correctness test and the timing report.
+///
+/// Each entry pairs a timed `run` snippet (which leaves the data stack balanced) with a
+/// `verify` snippet whose resulting data stack must equal `expected`.
+fn benchmarks() -> Vec<Benchmark> {
+    vec![
+        Benchmark {
+            name: "Fibonacci(25)",
+            define: ": FIB ( n -- n ) DUP 2 < IF EXIT THEN DUP 1- RECURSE SWAP 2 - RECURSE + ;",
+            run: "25 FIB DROP",
+            verify: "25 FIB",
+            expected: vec![75025],
+            iterations: 10,
+        },
+        Benchmark {
+            name: "Factorial(12)",
+            define: ": FACT ( n -- n! ) 1 SWAP 1+ 1 ?DO I * LOOP ;",
+            run: "12 FACT DROP",
+            verify: "12 FACT",
+            expected: vec![479001600],
+            iterations: 1000,
+        },
+        Benchmark {
+            name: "SumRecurse(5000)",
+            define: concat!(
+                ": SUMREC ( n -- sum ) ",
+                "DUP 0= IF EXIT THEN ",
+                "DUP 1- RECURSE + ;"
+            ),
+            run: "5000 SUMREC DROP",
+            verify: "100 SUMREC",
+            expected: vec![5050],
+            iterations: 100,
+        },
+        Benchmark {
+            name: "NestedLoops(80)",
+            // The inner loop uses ?DO rather than DO: on the first outer pass its bounds are
+            // `0 0`. Plain DO always enters the body, and in standard Forth the loop then only
+            // ends once the index wraps all the way around to the limit; ?DO skips it instead.
+            // The body discards `I J +`, so the accumulator pushed up front stays 0.
+            define: ": NESTED ( n -- sum ) 0 SWAP 0 DO I 0 ?DO I J + DROP LOOP LOOP ;",
+            run: "80 NESTED DROP",
+            verify: "5 NESTED",
+            expected: vec![0],
+            iterations: 10,
+        },
+        Benchmark {
+            name: "GCD-bench(500)",
+            define: concat!(
+                ": GCD ( a b -- gcd ) BEGIN DUP WHILE TUCK MOD REPEAT DROP ; ",
+                ": GCD-BENCH ( n -- ) 0 DO 10000 I 1+ GCD DROP LOOP ;"
+            ),
+            run: "500 GCD-BENCH",
+            verify: "48 36 GCD",
+            expected: vec![12],
+            iterations: 10,
+        },
+        Benchmark {
+            name: "MemFill(1000)",
+            // MBUF holds the start address of a 1000-cell buffer allotted right after it.
+            define: concat!(
+                "VARIABLE MBUF ",
+                "1000 CELLS ALLOT ",
+                "HERE 1000 CELLS - MBUF ! ",
+                ": MFILL ( n -- ) 0 DO I I * MBUF @ I CELLS + ! LOOP ; ",
+                ": MSUM ( n -- sum ) 0 SWAP 0 DO MBUF @ I CELLS + @ + LOOP ;"
+            ),
+            run: "1000 MFILL 1000 MSUM DROP",
+            verify: "10 MFILL 10 MSUM",
+            expected: vec![285],
+            iterations: 100,
+        },
+        Benchmark {
+            // NOTE(review): the label says "1M" but `run` passes 10000 to COLLATZ-BENCH;
+            // confirm what "1M" refers to before relying on the label.
+            name: "Collatz(1M)",
+            define: concat!(
+                ": COLLATZ ( n -- steps ) ",
+                "0 SWAP BEGIN DUP 1 > WHILE ",
+                "DUP 1 AND IF 3 * 1+ ELSE 2 / THEN ",
+                "SWAP 1+ SWAP ",
+                "REPEAT DROP ; ",
+                ": COLLATZ-BENCH ( n -- ) 0 DO I 1+ COLLATZ DROP LOOP ;"
+            ),
+            run: "10000 COLLATZ-BENCH",
+            verify: "27 COLLATZ",
+            expected: vec![111],
+            iterations: 5,
+        },
+    ]
+}
+
+// -----------------------------------------------------------------------
+// Configurations
+// -----------------------------------------------------------------------
+
+/// Named configurations for the per-optimization comparison: the all-off baseline, one
+/// entry per single switch, all IR passes together, and `WaferConfig::all()`.
+///
+/// The `"none"` entry must stay first: the report uses index 0 as its baseline.
+fn individual_configs() -> Vec<(&'static str, WaferConfig)> {
+    // Builds a config by applying `tweak` on top of the all-off baseline.
+    fn from_none(tweak: impl FnOnce(&mut WaferConfig)) -> WaferConfig {
+        let mut cfg = WaferConfig::none();
+        tweak(&mut cfg);
+        cfg
+    }
+
+    vec![
+        ("none", WaferConfig::none()),
+        ("peephole", from_none(|cfg| cfg.opt.peephole = true)),
+        ("constant_fold", from_none(|cfg| cfg.opt.constant_fold = true)),
+        ("strength_reduce", from_none(|cfg| cfg.opt.strength_reduce = true)),
+        ("dce", from_none(|cfg| cfg.opt.dce = true)),
+        ("tail_call", from_none(|cfg| cfg.opt.tail_call = true)),
+        ("inline", from_none(|cfg| cfg.opt.inline = true)),
+        (
+            "promotion",
+            from_none(|cfg| cfg.codegen.stack_to_local_promotion = true),
+        ),
+        (
+            "all_ir",
+            from_none(|cfg| {
+                cfg.opt.peephole = true;
+                cfg.opt.constant_fold = true;
+                cfg.opt.strength_reduce = true;
+                cfg.opt.dce = true;
+                cfg.opt.tail_call = true;
+                cfg.opt.inline = true;
+            }),
+        ),
+        ("all", WaferConfig::all()),
+    ]
+}
+
+/// Enumerates every on/off combination of the six IR passes (a 6-bit mask) crossed with
+/// stack-to-local promotion: 64 x 2 = 128 named configurations.
+///
+/// Names have the form `ir=<6-bit mask>` with a `+P` suffix when promotion is enabled. For
+/// each mask the promotion-off entry comes before the promotion-on entry.
+fn combination_configs() -> Vec<(String, WaferConfig)> {
+    (0..64u32)
+        .flat_map(|mask| [false, true].map(|promote| (mask, promote)))
+        .map(|(mask, promote)| {
+            let bit_set = |n: u32| (mask & (1 << n)) != 0;
+
+            // `|=` only ever switches a flag on, leaving the baseline value otherwise.
+            let mut cfg = WaferConfig::none();
+            cfg.opt.peephole |= bit_set(0);
+            cfg.opt.constant_fold |= bit_set(1);
+            cfg.opt.strength_reduce |= bit_set(2);
+            cfg.opt.dce |= bit_set(3);
+            cfg.opt.tail_call |= bit_set(4);
+            cfg.opt.inline |= bit_set(5);
+            cfg.codegen.stack_to_local_promotion |= promote;
+
+            let suffix = if promote { "+P" } else { "" };
+            (format!("ir={:06b}{}", mask, suffix), cfg)
+        })
+        .collect()
+}
+
+// -----------------------------------------------------------------------
+// Measurement
+// -----------------------------------------------------------------------
+
+/// Measurements from one benchmark run under one configuration.
+struct BenchResult {
+    /// Microseconds spent creating the VM and evaluating the benchmark's definitions.
+    compile_time_us: u64,
+    /// Median wall-clock time of the timed `run` evaluations, in microseconds.
+    exec_time_us: u64,
+    /// Value reported by the VM's `total_module_bytes()` after the run.
+    module_bytes: u64,
+}
+
+/// Compiles `bench` on a fresh VM built from `config` and times its `run` snippet.
+///
+/// The reported compile time covers VM creation plus evaluation of the benchmark's
+/// definitions. `run` is evaluated once as an untimed warm-up and then `bench.iterations`
+/// times; the execution time is the median of the timed runs (the upper middle element for
+/// even counts, zero when `iterations` is 0).
+///
+/// Forth-level failures do not abort the measurement, so a long report still completes,
+/// but they are reported on stderr: timings taken from a failing snippet describe the
+/// error path rather than the benchmark.
+///
+/// # Panics
+///
+/// Panics if the VM cannot be created.
+fn run_benchmark(config: &WaferConfig, bench: &Benchmark) -> BenchResult {
+    // Saturate instead of silently truncating the u128 microsecond count.
+    let micros = |d: std::time::Duration| u64::try_from(d.as_micros()).unwrap_or(u64::MAX);
+
+    // Compile
+    let compile_start = Instant::now();
+    let mut vm = ForthVM::new_with_config(config.clone()).expect("VM creation failed");
+    for line in bench.define.lines().map(str::trim).filter(|l| !l.is_empty()) {
+        if let Err(e) = vm.evaluate(line) {
+            eprintln!("warning: bench '{}': define failed: {e}", bench.name);
+        }
+    }
+    vm.take_output();
+    let compile_time = compile_start.elapsed();
+
+    // Warm up
+    if let Err(e) = vm.evaluate(bench.run) {
+        eprintln!("warning: bench '{}': warm-up run failed: {e}", bench.name);
+    }
+    vm.take_output();
+
+    // Measure
+    let mut failed_runs = 0u32;
+    let mut times = Vec::with_capacity(bench.iterations as usize);
+    for _ in 0..bench.iterations {
+        let start = Instant::now();
+        let failed = vm.evaluate(bench.run).is_err();
+        times.push(start.elapsed());
+        if failed {
+            failed_runs += 1;
+        }
+        vm.take_output();
+    }
+    if failed_runs > 0 {
+        // Reported after the loop so the warning never lands inside a timed window.
+        eprintln!(
+            "warning: bench '{}': {failed_runs} of {} timed runs failed",
+            bench.name, bench.iterations
+        );
+    }
+    times.sort();
+    // `get` keeps a zero-iteration benchmark from panicking on an empty sample set.
+    let median = times.get(times.len() / 2).copied().unwrap_or_default();
+
+    BenchResult {
+        compile_time_us: micros(compile_time),
+        exec_time_us: micros(median),
+        module_bytes: vm.total_module_bytes(),
+    }
+}
+
+// -----------------------------------------------------------------------
+// Correctness test (runs in CI)
+// -----------------------------------------------------------------------
+
+/// Checks that every benchmark's `verify` snippet leaves exactly the expected data stack
+/// under each configuration from `individual_configs`, using a fresh VM per pair.
+#[test]
+fn correctness_all_configs() {
+    let benches = benchmarks();
+
+    for (cfg_name, config) in &individual_configs() {
+        for bench in &benches {
+            let mut vm = ForthVM::new_with_config(config.clone()).expect("VM creation failed");
+
+            // Feed the definitions to the VM one non-blank line at a time.
+            let definitions = bench.define.lines().map(str::trim).filter(|l| !l.is_empty());
+            for source in definitions {
+                vm.evaluate(source).unwrap_or_else(|e| {
+                    panic!(
+                        "Config '{cfg_name}', bench '{}': define failed: {e}",
+                        bench.name
+                    )
+                });
+            }
+            vm.take_output();
+
+            vm.evaluate(bench.verify).unwrap_or_else(|e| {
+                panic!(
+                    "Config '{cfg_name}', bench '{}': verify failed: {e}",
+                    bench.name
+                )
+            });
+            vm.take_output();
+
+            let actual = vm.data_stack();
+            assert_eq!(
+                actual, bench.expected,
+                "Config '{cfg_name}', bench '{}': expected {:?}, got {:?}",
+                bench.name, bench.expected, actual
+            );
+        }
+    }
+}
+
+// -----------------------------------------------------------------------
+// Benchmark report (run with --nocapture --ignored)
+// -----------------------------------------------------------------------
+
+/// Formats how `value` differs from `base` as a signed percentage with one decimal
+/// (for example `-12.5%`), or `N/A` when `base` is zero and no ratio can be formed.
+fn percent_change(value: u64, base: u64) -> String {
+    if base > 0 {
+        format!(
+            "{:+.1}%",
+            ((value as f64 - base as f64) / base as f64) * 100.0
+        )
+    } else {
+        "N/A".to_string()
+    }
+}
+
+/// Prints the full optimization report to stdout.
+///
+/// Ignored by default; run it with
+/// `cargo test -p wafer-core --test benchmark_report -- --nocapture --ignored`.
+///
+/// * Phase 1 runs every benchmark under each configuration from `individual_configs` and
+///   compares it with the all-off baseline.
+/// * Phase 2 runs the Fibonacci and GCD benchmarks under every configuration from
+///   `combination_configs` and tracks the fastest one per benchmark.
+/// * Phase 3 compares `WaferConfig::all()` with and without evaluating `CONSOLIDATE` after
+///   the definitions.
+///
+/// Failures while defining words or evaluating `CONSOLIDATE` in Phase 3 are reported on
+/// stderr, because the "all+CONSOLIDATE" row would otherwise be mislabeled.
+#[test]
+#[ignore]
+fn optimization_report() {
+    let configs = individual_configs();
+    let benches = benchmarks();
+
+    let sep = "=".repeat(90);
+    let thin_sep = "-".repeat(90);
+    println!("\n{sep}");
+    println!("  WAFER Optimization Benchmark Report");
+    println!("{sep}\n");
+
+    // ---- Phase 1: Individual optimization impact ----
+    println!("Phase 1: Individual Optimization Impact");
+    println!("{thin_sep}");
+    println!(
+        "{:<18} {:<18} {:>10} {:>10} {:>10} {:>10} {:>10}",
+        "Config", "Benchmark", "Compile", "Exec", "Bytes", "Exec %", "Bytes %"
+    );
+    println!(
+        "{:<18} {:<18} {:>10} {:>10} {:>10} {:>10} {:>10}",
+        "", "", "(us)", "(us)", "", "vs none", "vs none"
+    );
+    println!("{thin_sep}");
+
+    // Baseline: each benchmark measured once on the first ("none") configuration.
+    let baseline_results: Vec<BenchResult> = benches
+        .iter()
+        .map(|bench| run_benchmark(&configs[0].1, bench))
+        .collect();
+
+    // Print all configs
+    for (cfg_name, config) in &configs {
+        for (bench, baseline) in benches.iter().zip(&baseline_results) {
+            // The "none" rows reuse the baseline measurement instead of re-running it.
+            let measured;
+            let result = if *cfg_name == "none" {
+                baseline
+            } else {
+                measured = run_benchmark(config, bench);
+                &measured
+            };
+
+            println!(
+                "{:<18} {:<18} {:>10} {:>10} {:>10} {:>10} {:>10}",
+                cfg_name,
+                bench.name,
+                result.compile_time_us,
+                result.exec_time_us,
+                result.module_bytes,
+                percent_change(result.exec_time_us, baseline.exec_time_us),
+                percent_change(result.module_bytes, baseline.module_bytes),
+            );
+        }
+    }
+
+    // ---- Phase 2: Combination matrix (subset of benchmarks for speed) ----
+    println!("\n{sep}");
+    println!("Phase 2: Combination Matrix (Fibonacci + GCD only)");
+    println!("{sep}");
+
+    let combo_configs = combination_configs();
+    let combo_bench_indices: Vec<usize> = benches
+        .iter()
+        .enumerate()
+        .filter(|(_, b)| b.name.contains("Fibonacci") || b.name.contains("GCD"))
+        .map(|(i, _)| i)
+        .collect();
+
+    println!(
+        "{:<18} {:<18} {:>10} {:>10} {:>10}",
+        "Config", "Benchmark", "Exec(us)", "Exec %", "Bytes"
+    );
+    println!("{thin_sep}");
+
+    // Fastest (config name, exec time) seen so far per selected benchmark, seeded with
+    // the baseline so a combination only wins by being strictly faster.
+    let mut best_exec: Vec<(String, u64)> = combo_bench_indices
+        .iter()
+        .map(|&i| ("none".to_string(), baseline_results[i].exec_time_us))
+        .collect();
+
+    for (cfg_name, config) in &combo_configs {
+        for (ci, &bench_idx) in combo_bench_indices.iter().enumerate() {
+            let bench = &benches[bench_idx];
+            let result = run_benchmark(config, bench);
+            let exec_pct = percent_change(
+                result.exec_time_us,
+                baseline_results[bench_idx].exec_time_us,
+            );
+
+            println!(
+                "{:<18} {:<18} {:>10} {:>10} {:>10}",
+                cfg_name, bench.name, result.exec_time_us, exec_pct, result.module_bytes,
+            );
+
+            if result.exec_time_us < best_exec[ci].1 {
+                best_exec[ci] = (cfg_name.clone(), result.exec_time_us);
+            }
+        }
+    }
+
+    // ---- Phase 3: CONSOLIDATE comparison ----
+    println!("\n{sep}");
+    println!("Phase 3: CONSOLIDATE Impact");
+    println!("{sep}");
+    println!(
+        "{:<18} {:<18} {:>10} {:>10} {:>10}",
+        "Mode", "Benchmark", "Exec(us)", "vs all", "Bytes"
+    );
+    println!("{thin_sep}");
+
+    let all_config = WaferConfig::all();
+    for bench in &benches {
+        // Without CONSOLIDATE
+        let result_all = run_benchmark(&all_config, bench);
+
+        // With CONSOLIDATE
+        let mut vm_consol =
+            ForthVM::new_with_config(all_config.clone()).expect("VM creation failed");
+        for line in bench.define.lines().map(str::trim).filter(|l| !l.is_empty()) {
+            if let Err(e) = vm_consol.evaluate(line) {
+                eprintln!("warning: bench '{}': define failed: {e}", bench.name);
+            }
+        }
+        vm_consol.take_output();
+        // A failure here means the row below measures a VM that never consolidated.
+        if let Err(e) = vm_consol.evaluate("CONSOLIDATE") {
+            eprintln!("warning: bench '{}': CONSOLIDATE failed: {e}", bench.name);
+        }
+        vm_consol.take_output();
+
+        // Warm up
+        let _ = vm_consol.evaluate(bench.run);
+        vm_consol.take_output();
+
+        let mut times = Vec::with_capacity(bench.iterations as usize);
+        for _ in 0..bench.iterations {
+            let start = Instant::now();
+            let _ = vm_consol.evaluate(bench.run);
+            times.push(start.elapsed());
+            vm_consol.take_output();
+        }
+        times.sort();
+        // `get` keeps a zero-iteration benchmark from panicking on an empty sample set.
+        let consol_exec = times
+            .get(times.len() / 2)
+            .map_or(0, |d| d.as_micros() as u64);
+        let consol_bytes = vm_consol.total_module_bytes();
+
+        let exec_pct = percent_change(consol_exec, result_all.exec_time_us);
+
+        println!(
+            "{:<18} {:<18} {:>10} {:>10} {:>10}",
+            "all", bench.name, result_all.exec_time_us, "+0.0%", result_all.module_bytes,
+        );
+        println!(
+            "{:<18} {:<18} {:>10} {:>10} {:>10}",
+            "all+CONSOLIDATE", bench.name, consol_exec, exec_pct, consol_bytes,
+        );
+    }
+
+    // ---- Summary ----
+    println!("\n{sep}");
+    println!("  Summary");
+    println!("{sep}");
+    for (ci, &bench_idx) in combo_bench_indices.iter().enumerate() {
+        let bench = &benches[bench_idx];
+        let base = baseline_results[bench_idx].exec_time_us;
+        // Unsigned "how much faster" figure, so the operands are reversed compared with
+        // the signed deltas printed in the tables above.
+        let improvement = if base > 0 {
+            format!(
+                "{:.1}%",
+                ((base as f64 - best_exec[ci].1 as f64) / base as f64) * 100.0
+            )
+        } else {
+            "N/A".to_string()
+        };
+        println!(
+            "  {}: best config '{}' ({} us, {} faster than none)",
+            bench.name, best_exec[ci].0, best_exec[ci].1, improvement
+        );
+    }
+    println!();
+    println!("  Recommendation: Use WaferConfig::all() for best overall performance.");
+    println!("  CONSOLIDATE provides additional speedup for compute-heavy words.");
+    println!("{sep}\n");
+}