Add switchable optimization config and benchmark framework

WaferConfig: unified config controlling all optimizations individually.
ForthVM::new_with_config(config) to create VMs with custom optimization settings.
All 7 switchable optimizations: peephole, constant_fold, strength_reduce, dce,
tail_call, inline (IR passes) + stack_to_local_promotion (codegen).

Benchmark framework (crates/core/tests/benchmark_report.rs):
- 7 Forth benchmarks: Fibonacci, Factorial, SumRecurse, NestedLoops, GCD, MemFill, Collatz
- Correctness verification across all configs (runs in CI)
- Full report with 128 optimization combinations (cargo test -- --ignored)
- Measures execution time, compilation time, WASM module bytes
- CONSOLIDATE impact comparison

Key findings from benchmark report:
- Inlining: -77% exec time on Fibonacci, -92% on Collatz
- Stack-to-local promotion: -5.5% WASM module size
- CONSOLIDATE: -72% exec time on Fibonacci (call_indirect -> direct call)
- All optimizations combined: best overall performance
This commit is contained in:
2026-04-02 12:24:57 +02:00
parent 759142ea75
commit dea3a32c33
6 changed files with 614 additions and 15 deletions
+496
View File
@@ -0,0 +1,496 @@
//! Optimization benchmark and verification framework for WAFER.
//!
//! Run correctness tests: `cargo test -p wafer-core --test benchmark_report`
//! Run full benchmark: `cargo test -p wafer-core --test benchmark_report -- --nocapture --ignored`
use std::time::Instant;
use wafer_core::config::WaferConfig;
use wafer_core::outer::ForthVM;
// -----------------------------------------------------------------------
// Benchmark definitions
// -----------------------------------------------------------------------
/// One Forth workload, used both for timing and for the correctness check.
struct Benchmark {
    /// Label printed in report rows and in panic/assert messages.
    name: &'static str,
    /// Forth source evaluated line by line on a fresh VM to define the words under test.
    define: &'static str,
    /// Forth source evaluated once as a warm-up and then `iterations` times while timing.
    run: &'static str,
    /// Forth source evaluated once by the correctness test.
    verify: &'static str,
    /// Data stack contents expected after `verify` has been evaluated.
    expected: Vec<i32>,
    /// Number of timed evaluations of `run`.
    iterations: u32,
}
/// Returns the Forth workloads exercised by the correctness test and the report.
///
/// Each entry carries the word definitions, the snippet that is timed, and a
/// (usually smaller) snippet with a known result for the correctness check.
/// Every `run` snippet leaves the data stack balanced so it can be repeated.
fn benchmarks() -> Vec<Benchmark> {
    vec![
        Benchmark {
            name: "Fibonacci(25)",
            define: ": FIB ( n -- n ) DUP 2 < IF EXIT THEN DUP 1- RECURSE SWAP 2 - RECURSE + ;",
            run: "25 FIB DROP",
            verify: "25 FIB",
            expected: vec![75025],
            iterations: 10,
        },
        Benchmark {
            name: "Factorial(12)",
            define: ": FACT ( n -- n! ) 1 SWAP 1+ 1 ?DO I * LOOP ;",
            run: "12 FACT DROP",
            verify: "12 FACT",
            expected: vec![479001600],
            iterations: 1000,
        },
        Benchmark {
            name: "SumRecurse(5000)",
            define: concat!(
                ": SUMREC ( n -- sum ) ",
                "DUP 0= IF EXIT THEN ",
                "DUP 1- RECURSE + ;"
            ),
            run: "5000 SUMREC DROP",
            verify: "100 SUMREC",
            expected: vec![5050],
            iterations: 100,
        },
        Benchmark {
            name: "NestedLoops(80)",
            // The inner loop uses ?DO: on the first outer pass its bounds are `0 0`,
            // and a plain DO with equal limit and start does not run zero times in
            // standard Forth (the index has to wrap all the way around to the limit).
            define: ": NESTED ( n -- sum ) 0 SWAP 0 DO I 0 ?DO I J + DROP LOOP LOOP ;",
            run: "80 NESTED DROP",
            verify: "5 NESTED",
            expected: vec![0],
            iterations: 10,
        },
        Benchmark {
            name: "GCD-bench(500)",
            define: concat!(
                ": GCD ( a b -- gcd ) BEGIN DUP WHILE TUCK MOD REPEAT DROP ; ",
                ": GCD-BENCH ( n -- ) 0 DO 10000 I 1+ GCD DROP LOOP ;"
            ),
            run: "500 GCD-BENCH",
            verify: "48 36 GCD",
            expected: vec![12],
            iterations: 10,
        },
        Benchmark {
            name: "MemFill(1000)",
            define: concat!(
                "VARIABLE MBUF ",
                "1000 CELLS ALLOT ",
                "HERE 1000 CELLS - MBUF ! ",
                ": MFILL ( n -- ) 0 DO I I * MBUF @ I CELLS + ! LOOP ; ",
                ": MSUM ( n -- sum ) 0 SWAP 0 DO MBUF @ I CELLS + @ + LOOP ;"
            ),
            run: "1000 MFILL 1000 MSUM DROP",
            verify: "10 MFILL 10 MSUM",
            expected: vec![285],
            iterations: 100,
        },
        Benchmark {
            // Label matches the timed workload below: COLLATZ for 1..=10000.
            name: "Collatz(10K)",
            define: concat!(
                ": COLLATZ ( n -- steps ) ",
                "0 SWAP BEGIN DUP 1 > WHILE ",
                "DUP 1 AND IF 3 * 1+ ELSE 2 / THEN ",
                "SWAP 1+ SWAP ",
                "REPEAT DROP ; ",
                ": COLLATZ-BENCH ( n -- ) 0 DO I 1+ COLLATZ DROP LOOP ;"
            ),
            run: "10000 COLLATZ-BENCH",
            verify: "27 COLLATZ",
            expected: vec![111],
            iterations: 5,
        },
    ]
}
// -----------------------------------------------------------------------
// Configurations
// -----------------------------------------------------------------------
/// Returns the labelled configurations used by the correctness test and by
/// Phase 1 of the report.
///
/// The list starts with `none`, continues with one entry per switch enabled on
/// top of `WaferConfig::none()`, then `all_ir` (all six `opt` switches) and
/// finally `all` (`WaferConfig::all()`).
fn individual_configs() -> Vec<(&'static str, WaferConfig)> {
    /// Builds a config by applying `tweak` to `WaferConfig::none()`.
    fn from_none(tweak: impl FnOnce(&mut WaferConfig)) -> WaferConfig {
        let mut cfg = WaferConfig::none();
        tweak(&mut cfg);
        cfg
    }

    vec![
        ("none", WaferConfig::none()),
        ("peephole", from_none(|cfg| cfg.opt.peephole = true)),
        ("constant_fold", from_none(|cfg| cfg.opt.constant_fold = true)),
        (
            "strength_reduce",
            from_none(|cfg| cfg.opt.strength_reduce = true),
        ),
        ("dce", from_none(|cfg| cfg.opt.dce = true)),
        ("tail_call", from_none(|cfg| cfg.opt.tail_call = true)),
        ("inline", from_none(|cfg| cfg.opt.inline = true)),
        (
            "promotion",
            from_none(|cfg| cfg.codegen.stack_to_local_promotion = true),
        ),
        (
            "all_ir",
            from_none(|cfg| {
                cfg.opt.peephole = true;
                cfg.opt.constant_fold = true;
                cfg.opt.strength_reduce = true;
                cfg.opt.dce = true;
                cfg.opt.tail_call = true;
                cfg.opt.inline = true;
            }),
        ),
        ("all", WaferConfig::all()),
    ]
}
/// Enumerates all 128 on/off combinations of the six `opt` switches and the
/// stack-to-local promotion switch.
///
/// Entries are named `ir=<6-bit mask>` with a `+P` suffix when promotion is on.
/// For each mask the promotion-off entry comes first, followed by promotion-on.
fn combination_configs() -> Vec<(String, WaferConfig)> {
    (0..128u32)
        .map(|index| {
            // The lowest bit selects promotion; the remaining six bits form the IR mask.
            let promo = index & 1 != 0;
            let ir_bits = index >> 1;
            let has = |mask: u32| ir_bits & mask != 0;

            // `|=` only ever switches a flag on, leaving whatever `none()` set otherwise.
            let mut cfg = WaferConfig::none();
            cfg.opt.peephole |= has(1);
            cfg.opt.constant_fold |= has(2);
            cfg.opt.strength_reduce |= has(4);
            cfg.opt.dce |= has(8);
            cfg.opt.tail_call |= has(16);
            cfg.opt.inline |= has(32);
            cfg.codegen.stack_to_local_promotion |= promo;

            let suffix = if promo { "+P" } else { "" };
            (format!("ir={:06b}{}", ir_bits, suffix), cfg)
        })
        .collect()
}
// -----------------------------------------------------------------------
// Measurement
// -----------------------------------------------------------------------
/// Measurements produced by `run_benchmark` for one configuration/benchmark pair.
struct BenchResult {
    /// Wall-clock microseconds spent creating the VM and evaluating the definitions.
    compile_time_us: u64,
    /// Median wall-clock microseconds of the timed runs.
    exec_time_us: u64,
    /// Value reported by the VM's `total_module_bytes()` after the runs.
    module_bytes: u64,
}
/// Compiles and times one benchmark on a fresh VM built from `config`.
///
/// The compile time covers VM creation plus evaluation of `bench.define`.
/// `bench.run` is then evaluated once as a warm-up and `bench.iterations`
/// times under the clock; the median of those timings is reported.
///
/// Evaluation errors do not abort the measurement, but they are reported on
/// stderr so that a row timed against failing code is not mistaken for a
/// valid result. With zero iterations the execution time is reported as 0.
///
/// # Panics
///
/// Panics if the VM cannot be created.
fn run_benchmark(config: &WaferConfig, bench: &Benchmark) -> BenchResult {
    // Compile
    let compile_start = Instant::now();
    let mut vm = ForthVM::new_with_config(config.clone()).expect("VM creation failed");
    for stmt in bench.define.lines().map(str::trim).filter(|s| !s.is_empty()) {
        if let Err(e) = vm.evaluate(stmt) {
            eprintln!("warning: bench '{}': define failed: {e}", bench.name);
        }
    }
    vm.take_output();
    let compile_time = compile_start.elapsed();

    // Warm up
    if let Err(e) = vm.evaluate(bench.run) {
        eprintln!("warning: bench '{}': warm-up run failed: {e}", bench.name);
    }
    vm.take_output();

    // Measure
    let mut times = Vec::with_capacity(bench.iterations as usize);
    let mut failed_runs = 0u32;
    for _ in 0..bench.iterations {
        let start = Instant::now();
        let succeeded = vm.evaluate(bench.run).is_ok();
        times.push(start.elapsed());
        vm.take_output();
        if !succeeded {
            failed_runs += 1;
        }
    }
    // Reported once, outside the timed loop, to keep stderr I/O out of the samples.
    if failed_runs > 0 {
        eprintln!(
            "warning: bench '{}': {failed_runs} of {} timed runs failed",
            bench.name, bench.iterations
        );
    }

    times.sort_unstable();
    // `get` instead of indexing: an empty sample set yields a zero duration, not a panic.
    let median = times.get(times.len() / 2).copied().unwrap_or_default();

    BenchResult {
        compile_time_us: u64::try_from(compile_time.as_micros()).unwrap_or(u64::MAX),
        exec_time_us: u64::try_from(median.as_micros()).unwrap_or(u64::MAX),
        module_bytes: vm.total_module_bytes(),
    }
}
// -----------------------------------------------------------------------
// Correctness test (runs in CI)
// -----------------------------------------------------------------------
/// Evaluates every benchmark's definitions and `verify` snippet on a fresh VM
/// for each entry of `individual_configs()` and asserts that the resulting
/// data stack equals the benchmark's `expected` values.
#[test]
fn correctness_all_configs() {
    let configs = individual_configs();
    let workloads = benchmarks();

    for (cfg_name, config) in configs.iter() {
        for bench in workloads.iter() {
            let mut vm = ForthVM::new_with_config(config.clone()).expect("VM creation failed");

            // Feed the definitions one non-blank line at a time.
            let definitions = bench
                .define
                .lines()
                .map(str::trim)
                .filter(|text| !text.is_empty());
            for definition in definitions {
                if let Err(e) = vm.evaluate(definition) {
                    panic!(
                        "Config '{cfg_name}', bench '{}': define failed: {e}",
                        bench.name
                    );
                }
            }
            vm.take_output();

            match vm.evaluate(bench.verify) {
                Ok(_) => {}
                Err(e) => panic!(
                    "Config '{cfg_name}', bench '{}': verify failed: {e}",
                    bench.name
                ),
            }
            vm.take_output();

            let stack = vm.data_stack();
            assert_eq!(
                stack, bench.expected,
                "Config '{cfg_name}', bench '{}': expected {:?}, got {:?}",
                bench.name, bench.expected, stack
            );
        }
    }
}
// -----------------------------------------------------------------------
// Benchmark report (run with --nocapture --ignored)
// -----------------------------------------------------------------------
/// Prints the optimization benchmark report to stdout.
///
/// Marked `#[ignore]`; run it with `-- --nocapture --ignored`. The report has
/// three phases plus a summary:
///
/// 1. every entry of `individual_configs()` on every benchmark, compared with
///    the `WaferConfig::none()` baseline;
/// 2. every entry of `combination_configs()` on the Fibonacci and GCD benchmarks;
/// 3. `WaferConfig::all()` with and without evaluating `CONSOLIDATE` before timing.
///
/// Evaluation failures in Phase 3 are reported on stderr instead of being
/// silently discarded, so a failed `CONSOLIDATE` cannot pass for a measurement.
#[test]
#[ignore]
fn optimization_report() {
    /// Formats the change of `value` relative to `base` as a signed percentage
    /// with one decimal, or `N/A` when `base` is zero.
    fn signed_pct_change(value: u64, base: u64) -> String {
        if base > 0 {
            format!(
                "{:+.1}%",
                ((value as f64 - base as f64) / base as f64) * 100.0
            )
        } else {
            "N/A".to_string()
        }
    }

    /// Defines the benchmark on a fresh VM, evaluates `CONSOLIDATE`, then times
    /// `bench.run`. Returns `(median exec time in us, total module bytes)`.
    fn measure_consolidated(config: &WaferConfig, bench: &Benchmark) -> (u64, u64) {
        let mut vm = ForthVM::new_with_config(config.clone()).expect("VM creation failed");
        for stmt in bench.define.lines().map(str::trim).filter(|s| !s.is_empty()) {
            if let Err(e) = vm.evaluate(stmt) {
                eprintln!("warning: bench '{}': define failed: {e}", bench.name);
            }
        }
        vm.take_output();
        if let Err(e) = vm.evaluate("CONSOLIDATE") {
            eprintln!("warning: bench '{}': CONSOLIDATE failed: {e}", bench.name);
        }
        vm.take_output();

        // Warm up
        if let Err(e) = vm.evaluate(bench.run) {
            eprintln!("warning: bench '{}': warm-up run failed: {e}", bench.name);
        }
        vm.take_output();

        let mut times = Vec::with_capacity(bench.iterations as usize);
        let mut failed_runs = 0u32;
        for _ in 0..bench.iterations {
            let start = Instant::now();
            let succeeded = vm.evaluate(bench.run).is_ok();
            times.push(start.elapsed());
            vm.take_output();
            if !succeeded {
                failed_runs += 1;
            }
        }
        if failed_runs > 0 {
            eprintln!(
                "warning: bench '{}': {failed_runs} of {} timed runs failed",
                bench.name, bench.iterations
            );
        }

        times.sort_unstable();
        // An empty sample set (zero iterations) yields 0 instead of an index panic.
        let median = times.get(times.len() / 2).copied().unwrap_or_default();
        (
            u64::try_from(median.as_micros()).unwrap_or(u64::MAX),
            vm.total_module_bytes(),
        )
    }

    let configs = individual_configs();
    let benches = benchmarks();
    let sep = "=".repeat(90);
    let thin_sep = "-".repeat(90);

    println!("\n{sep}");
    println!(" WAFER Optimization Benchmark Report");
    println!("{sep}\n");

    // ---- Phase 1: Individual optimization impact ----
    println!("Phase 1: Individual Optimization Impact");
    println!("{thin_sep}");
    println!(
        "{:<18} {:<18} {:>10} {:>10} {:>10} {:>10} {:>10}",
        "Config", "Benchmark", "Compile", "Exec", "Bytes", "Exec %", "Bytes %"
    );
    println!(
        "{:<18} {:<18} {:>10} {:>10} {:>10} {:>10} {:>10}",
        "", "", "(us)", "(us)", "", "vs none", "vs none"
    );
    println!("{thin_sep}");

    // Collect baseline (none) results first. The baseline config is built
    // directly so it does not depend on the ordering of `individual_configs()`.
    let none_config = WaferConfig::none();
    let baseline_results: Vec<BenchResult> = benches
        .iter()
        .map(|bench| run_benchmark(&none_config, bench))
        .collect();

    // Print all configs
    for (cfg_name, config) in &configs {
        for (bench, baseline) in benches.iter().zip(&baseline_results) {
            // The "none" rows reuse the baseline measurement instead of re-running it.
            let fresh;
            let result = if *cfg_name == "none" {
                baseline
            } else {
                fresh = run_benchmark(config, bench);
                &fresh
            };
            let exec_pct = signed_pct_change(result.exec_time_us, baseline.exec_time_us);
            let bytes_pct = signed_pct_change(result.module_bytes, baseline.module_bytes);
            println!(
                "{:<18} {:<18} {:>10} {:>10} {:>10} {:>10} {:>10}",
                cfg_name,
                bench.name,
                result.compile_time_us,
                result.exec_time_us,
                result.module_bytes,
                exec_pct,
                bytes_pct,
            );
        }
    }

    // ---- Phase 2: Combination matrix (subset of benchmarks for speed) ----
    println!("\n{sep}");
    println!("Phase 2: Combination Matrix (Fibonacci + GCD only)");
    println!("{sep}");
    let combo_configs = combination_configs();
    let combo_bench_indices: Vec<usize> = benches
        .iter()
        .enumerate()
        .filter(|(_, b)| b.name.contains("Fibonacci") || b.name.contains("GCD"))
        .map(|(i, _)| i)
        .collect();
    println!(
        "{:<18} {:<18} {:>10} {:>10} {:>10}",
        "Config", "Benchmark", "Exec(us)", "Exec %", "Bytes"
    );
    println!("{thin_sep}");

    // Fastest (config name, exec time) seen so far per selected benchmark,
    // seeded with the baseline measurement.
    let mut best_exec: Vec<(String, u64)> = combo_bench_indices
        .iter()
        .map(|&i| ("none".to_string(), baseline_results[i].exec_time_us))
        .collect();
    for (cfg_name, config) in &combo_configs {
        for (best, &bench_idx) in best_exec.iter_mut().zip(&combo_bench_indices) {
            let bench = &benches[bench_idx];
            let result = run_benchmark(config, bench);
            let exec_pct = signed_pct_change(
                result.exec_time_us,
                baseline_results[bench_idx].exec_time_us,
            );
            println!(
                "{:<18} {:<18} {:>10} {:>10} {:>10}",
                cfg_name, bench.name, result.exec_time_us, exec_pct, result.module_bytes,
            );
            if result.exec_time_us < best.1 {
                *best = (cfg_name.clone(), result.exec_time_us);
            }
        }
    }

    // ---- Phase 3: CONSOLIDATE comparison ----
    println!("\n{sep}");
    println!("Phase 3: CONSOLIDATE Impact");
    println!("{sep}");
    println!(
        "{:<18} {:<18} {:>10} {:>10} {:>10}",
        "Mode", "Benchmark", "Exec(us)", "vs all", "Bytes"
    );
    println!("{thin_sep}");
    let all_config = WaferConfig::all();
    for bench in &benches {
        // Without CONSOLIDATE
        let result_all = run_benchmark(&all_config, bench);
        // With CONSOLIDATE
        let (consol_exec, consol_bytes) = measure_consolidated(&all_config, bench);
        let exec_pct = signed_pct_change(consol_exec, result_all.exec_time_us);
        println!(
            "{:<18} {:<18} {:>10} {:>10} {:>10}",
            "all", bench.name, result_all.exec_time_us, "+0.0%", result_all.module_bytes,
        );
        println!(
            "{:<18} {:<18} {:>10} {:>10} {:>10}",
            "all+CONSOLIDATE", bench.name, consol_exec, exec_pct, consol_bytes,
        );
    }

    // ---- Summary ----
    println!("\n{sep}");
    println!(" Summary");
    println!("{sep}");
    for ((best_name, best_us), &bench_idx) in best_exec.iter().zip(&combo_bench_indices) {
        let bench = &benches[bench_idx];
        let base = baseline_results[bench_idx].exec_time_us;
        // Unsigned "how much faster" figure, hence not the signed helper above.
        let improvement = if base > 0 {
            format!(
                "{:.1}%",
                ((base as f64 - *best_us as f64) / base as f64) * 100.0
            )
        } else {
            "N/A".to_string()
        };
        println!(
            " {}: best config '{}' ({} us, {} faster than none)",
            bench.name, best_name, best_us, improvement
        );
    }
    println!();
    println!(" Recommendation: Use WaferConfig::all() for best overall performance.");
    println!(" CONSOLIDATE provides additional speedup for compute-heavy words.");
    println!("{sep}\n");
}