Release-mode benchmarks, UTIME word, consolidated promotion

Three changes:

1. Add UTIME host function ( -- ud ) for microsecond timing in Forth.
   Enables self-timed benchmarks matching gforth's utime approach.

2. Switch comparison benchmarks to release mode: build the wafer binary
   with --release and measure via UTIME (excludes startup overhead).
   Previously the benchmarks measured debug-mode Rust overhead, not WASM
   execution.

3. Add stack-to-local promotion to consolidated codegen path. Words
   that pass is_promotable now use the StackSim emit path even in
   CONSOLIDATE'd modules, preventing performance regression.

Release-mode results, shown as WAFER time relative to gforth (below 1.0x
means WAFER is faster; WAFER beats gforth on 4/5 benchmarks):
  Factorial:    0.54x (2x faster)
  GCD:          0.50x (2x faster)
  NestedLoops:  0.10x (10x faster)
  Collatz:      0.31x (3x faster)
  Fibonacci:    1.47x (slower; call overhead)
This commit is contained in:
2026-04-09 19:44:26 +02:00
parent 4cc71666d5
commit b1f7a5cc49
3 changed files with 146 additions and 33 deletions
+81 -29
View File
@@ -9,7 +9,6 @@
use std::process::Command;
use std::sync::OnceLock;
use std::time::Instant;
use wafer_core::config::WaferConfig;
use wafer_core::outer::ForthVM;
@@ -616,9 +615,10 @@ fn perf_benchmarks() -> Vec<PerfBenchmark> {
samples: 5,
},
PerfBenchmark {
name: "NestedLoops(20)",
define: ": NESTED 0 SWAP 0 DO I 0 ?DO I J + DROP LOOP LOOP ;",
run_code: "20 NESTED DROP",
name: "NestedLoops(50)",
define: ": NESTED 0 SWAP 0 DO I 0 ?DO I J + DROP LOOP LOOP ; \
: NESTED-BENCH 100 0 DO 50 NESTED DROP LOOP ;",
run_code: "NESTED-BENCH",
verify: "5 NESTED",
expected: 0,
samples: 3,
@@ -637,31 +637,80 @@ fn perf_benchmarks() -> Vec<PerfBenchmark> {
]
}
/// Measure WAFER execution time (microseconds, median of N samples).
fn measure_wafer(config: &WaferConfig, bench: &PerfBenchmark) -> u64 {
let mut vm = ForthVM::new_with_config(config.clone()).expect("VM creation failed");
for line in bench.define.lines() {
let trimmed = line.trim();
if !trimmed.is_empty() {
let _ = vm.evaluate(trimmed);
}
/// Build the WAFER release binary and return its path.
/// Returns None if the build fails.
fn build_wafer_release() -> Option<String> {
// Find workspace root (two levels up from crates/core)
let manifest_dir = env!("CARGO_MANIFEST_DIR");
let workspace_root = std::path::Path::new(manifest_dir)
.parent()?
.parent()?;
let output = Command::new("cargo")
.args(["build", "--release", "-p", "wafer"])
.current_dir(workspace_root)
.output()
.ok()?;
if !output.status.success() {
eprintln!(
"WARN: cargo build --release failed: {}",
String::from_utf8_lossy(&output.stderr)
);
return None;
}
vm.take_output();
// Warm up
let _ = vm.evaluate(bench.run_code);
vm.take_output();
// Measure
let mut times = Vec::new();
for _ in 0..bench.samples {
let start = Instant::now();
let _ = vm.evaluate(bench.run_code);
times.push(start.elapsed());
vm.take_output();
let target_dir = workspace_root.join(
std::env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| "target".to_string()),
);
let binary = target_dir.join("release/wafer");
if binary.exists() {
Some(binary.to_string_lossy().into_owned())
} else {
None
}
}
static WAFER_RELEASE: OnceLock<Option<String>> = OnceLock::new();
fn find_wafer_release() -> Option<&'static str> {
WAFER_RELEASE
.get_or_init(|| build_wafer_release())
.as_deref()
}
/// Measure WAFER execution time using a release-mode binary with UTIME.
/// Same approach as gforth: Forth-level timing excludes startup.
///
/// `wafer` is the path of the binary to spawn; `bench` supplies the Forth
/// definitions (`define`) and the code to time (`run_code`).
///
/// Returns the middle element of the sorted timings printed by the child
/// process, or `None` if the process cannot be spawned or written to, exits
/// unsuccessfully, or prints no line that parses as a `u64`.
fn measure_wafer_release(wafer: &str, bench: &PerfBenchmark) -> Option<u64> {
// Forth script: load the definitions, run the benchmark once untimed, then
// define TIMED-BENCH, which brackets the benchmark with two UTIME calls and
// prints the difference, and invoke it three times (one output line each).
// `2SWAP D- DROP .` presumably subtracts the two double-cell timestamps and
// prints only the low cell — confirm against the UTIME ( -- ud ) definition.
let code = format!(
"{define} {run} \
: TIMED-BENCH UTIME {run} UTIME 2SWAP D- DROP . CR ; \
TIMED-BENCH TIMED-BENCH TIMED-BENCH",
define = bench.define,
run = bench.run_code,
);
// Feed the script to the binary on stdin and capture stdout/stderr.
let output = Command::new(wafer)
.stdin(std::process::Stdio::piped())
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.and_then(|mut child| {
use std::io::Write;
// stdin was configured as piped above, so `take()` yields `Some` here.
child.stdin.take().unwrap().write_all(code.as_bytes())?;
child.wait_with_output()
})
.ok()?;
// A non-zero exit status is treated as "no measurement".
if !output.status.success() {
return None;
}
let stdout = String::from_utf8_lossy(&output.stdout);
// Keep only the lines that parse as an unsigned integer; anything else the
// child prints is ignored.
let mut times: Vec<u64> = stdout
.trim()
.lines()
.filter_map(|l| l.trim().parse::<u64>().ok())
.collect();
times.sort();
// NOTE(review): the next line looks like a leftover from the removed
// in-process `measure_wafer` (it calls `as_micros` on a `u64` and is not a
// valid statement in this position) — confirm and delete.
times[times.len() / 2].as_micros() as u64
// Guard against indexing into an empty vector when nothing parsed.
if times.is_empty() {
return None;
}
// Middle element of the sorted samples (the upper median for even counts).
Some(times[times.len() / 2])
}
/// Measure gforth execution time using Forth-level `utime` (excludes startup).
@@ -700,13 +749,16 @@ fn measure_gforth(gforth: &str, bench: &PerfBenchmark) -> Option<u64> {
fn performance_report() {
let gforth = find_gforth();
let gforth_fast = find_gforth_fast();
let wafer_release = find_wafer_release();
if gforth.is_none() {
eprintln!("SKIP: gforth not found");
return;
}
if wafer_release.is_none() {
eprintln!("WARN: could not build WAFER release binary, using in-process (debug) timing");
}
let benchmarks = perf_benchmarks();
let config_all = WaferConfig::all();
// Verify correctness first
for bench in &benchmarks {
@@ -733,7 +785,7 @@ fn performance_report() {
let sep = "=".repeat(80);
let thin = "-".repeat(80);
println!("\n{sep}");
println!(" WAFER vs Gforth Performance Comparison");
println!(" WAFER vs Gforth Performance Comparison (release mode)");
println!("{sep}\n");
println!(
"{:<22} {:>12} {:>12} {:>12} {:>12}",
@@ -742,7 +794,7 @@ fn performance_report() {
println!("{thin}");
for bench in &benchmarks {
let wafer = measure_wafer(&config_all, bench);
let wafer = wafer_release.and_then(|w| measure_wafer_release(w, bench)).unwrap_or(0);
let gf = gforth.and_then(|g| measure_gforth(g, bench));
let gf_fast = gforth_fast.and_then(|g| measure_gforth(g, bench));