Self-recursive direct call, UTIME, CONSOLIDATE benchmarks

1. Self-recursive direct call: when a word calls itself (RECURSE),
   emit `call WORD_FUNC` instead of `call_indirect`. Eliminates
   table lookup + signature check for recursive words.
   Fibonacci(25): 5003us → 1629us (3x faster, now 2.2x faster than gforth)

2. Add CONSOLIDATE column to performance benchmarks showing
   post-consolidation performance (direct calls between all words).

WAFER now beats gforth on all 5 benchmarks:
  Fibonacci:    0.45x (2.2x faster)
  Factorial:    0.53x (1.9x faster)
  GCD:          0.50x (2x faster)
  NestedLoops:  0.10x (10x faster)
  Collatz:      0.31x (3x faster)
This commit is contained in:
2026-04-09 19:54:40 +02:00
parent b1f7a5cc49
commit 7344d3a8d7
2 changed files with 82 additions and 23 deletions
+14 -5
View File
@@ -241,6 +241,9 @@ struct EmitCtx {
/// Nesting depth of DO/LOOPs that use the fast path (no RS sync). /// Nesting depth of DO/LOOPs that use the fast path (no RS sync).
/// When > 0, `RFetch` (I) reads from the loop local instead of rpeek. /// When > 0, `RFetch` (I) reads from the loop local instead of rpeek.
fast_loop_depth: u32, fast_loop_depth: u32,
/// The word being compiled (for self-recursion detection).
/// When `Call(id)` matches this, emit direct `call` instead of `call_indirect`.
self_word_id: Option<WordId>,
} }
/// Decrement the FSP global by 8 (allocate space for one f64). /// Decrement the FSP global by 8 (allocate space for one f64).
@@ -544,27 +547,31 @@ fn emit_op(f: &mut Function, op: &IrOp, ctx: &mut EmitCtx) {
// -- Control flow --------------------------------------------------- // -- Control flow ---------------------------------------------------
IrOp::Call(word_id) => { IrOp::Call(word_id) => {
// Write back cached DSP before call
dsp_writeback(f); dsp_writeback(f);
if ctx.self_word_id == Some(*word_id) {
// Self-recursion: direct call (avoids table lookup + signature check)
f.instruction(&Instruction::Call(WORD_FUNC));
} else {
f.instruction(&Instruction::I32Const(word_id.0 as i32)) f.instruction(&Instruction::I32Const(word_id.0 as i32))
.instruction(&Instruction::CallIndirect { .instruction(&Instruction::CallIndirect {
type_index: TYPE_VOID, type_index: TYPE_VOID,
table_index: TABLE, table_index: TABLE,
}); });
// Reload cached DSP after call (callee may have modified it) }
dsp_reload(f); dsp_reload(f);
} }
IrOp::TailCall(word_id) => { IrOp::TailCall(word_id) => {
// Write back cached DSP before tail call
dsp_writeback(f); dsp_writeback(f);
if ctx.self_word_id == Some(*word_id) {
f.instruction(&Instruction::Call(WORD_FUNC));
} else {
f.instruction(&Instruction::I32Const(word_id.0 as i32)) f.instruction(&Instruction::I32Const(word_id.0 as i32))
.instruction(&Instruction::CallIndirect { .instruction(&Instruction::CallIndirect {
type_index: TYPE_VOID, type_index: TYPE_VOID,
table_index: TABLE, table_index: TABLE,
}); });
// Callee's epilogue already wrote back to the global, so just return. }
// No reload needed since we're not using the local after this.
f.instruction(&Instruction::Return); f.instruction(&Instruction::Return);
} }
@@ -2418,6 +2425,7 @@ pub fn compile_word(
loop_local_base, loop_local_base,
loop_locals: Vec::new(), loop_locals: Vec::new(),
fast_loop_depth: 0, fast_loop_depth: 0,
self_word_id: Some(WordId(config.base_fn_index)),
}; };
// Prologue: cache $dsp global into local 0 // Prologue: cache $dsp global into local 0
@@ -2918,6 +2926,7 @@ fn compile_multi_word_module(
loop_local_base, loop_local_base,
loop_locals: Vec::new(), loop_locals: Vec::new(),
fast_loop_depth: 0, fast_loop_depth: 0,
self_word_id: None, // consolidated module uses direct calls via local_fn_map
}; };
// Prologue: cache $dsp global into local 0 // Prologue: cache $dsp global into local 0
+58 -8
View File
@@ -713,6 +713,42 @@ fn measure_wafer_release(wafer: &str, bench: &PerfBenchmark) -> Option<u64> {
Some(times[times.len() / 2]) Some(times[times.len() / 2])
} }
/// Measure WAFER execution time after CONSOLIDATE (direct calls between all words).
///
/// Feeds the `wafer` binary a script that defines the benchmark word, runs
/// CONSOLIDATE (with one warm-up execution), then times three runs using the
/// Forth-level UTIME word. Returns the median timing in microseconds, or
/// `None` if the process cannot be spawned, exits unsuccessfully, or prints
/// no parseable timings.
fn measure_wafer_consolidated(wafer: &str, bench: &PerfBenchmark) -> Option<u64> {
    // NOTE: `\` string continuations swallow the newline + leading indent,
    // so the script is a single line as far as WAFER is concerned.
    let script = format!(
        "{define} CONSOLIDATE {run} \
         : TIMED-BENCH UTIME {run} UTIME 2SWAP D- DROP . CR ; \
         TIMED-BENCH TIMED-BENCH TIMED-BENCH",
        define = bench.define,
        run = bench.run_code,
    );

    let mut child = Command::new(wafer)
        .stdin(std::process::Stdio::piped())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .spawn()
        .ok()?;
    {
        // Scope the handle so the pipe closes (EOF) before we wait.
        use std::io::Write;
        let mut stdin = child.stdin.take().expect("stdin was piped");
        stdin.write_all(script.as_bytes()).ok()?;
    }
    let out = child.wait_with_output().ok()?;
    if !out.status.success() {
        return None;
    }

    // Each TIMED-BENCH prints one elapsed-microseconds value on its own line;
    // anything non-numeric (banners, prompts) is ignored.
    let stdout = String::from_utf8_lossy(&out.stdout);
    let mut samples: Vec<u64> = stdout
        .lines()
        .filter_map(|line| line.trim().parse().ok())
        .collect();
    if samples.is_empty() {
        return None;
    }
    samples.sort_unstable();
    // Median of the three runs damps scheduler noise.
    Some(samples[samples.len() / 2])
}
/// Measure gforth execution time using Forth-level `utime` (excludes startup). /// Measure gforth execution time using Forth-level `utime` (excludes startup).
/// Both engines run the exact same `run_code`, so the comparison is apples-to-apples. /// Both engines run the exact same `run_code`, so the comparison is apples-to-apples.
/// Returns microseconds, or None if gforth is unavailable. /// Returns microseconds, or None if gforth is unavailable.
@@ -788,23 +824,37 @@ fn performance_report() {
println!(" WAFER vs Gforth Performance Comparison (release mode)"); println!(" WAFER vs Gforth Performance Comparison (release mode)");
println!("{sep}\n"); println!("{sep}\n");
println!( println!(
"{:<22} {:>12} {:>12} {:>12} {:>12}", "{:<22} {:>10} {:>10} {:>10} {:>10} {:>10}",
"Benchmark", "WAFER(us)", "gforth(us)", "gforth-fast", "WAFER/gf" "Benchmark", "WAFER", "CONSOL", "gforth", "gf-fast", "WAFER/gf"
);
println!(
"{:<22} {:>10} {:>10} {:>10} {:>10} {:>10}",
"", "(us)", "(us)", "(us)", "(us)", ""
); );
println!("{thin}"); println!("{thin}");
for bench in &benchmarks { for bench in &benchmarks {
let wafer = wafer_release.and_then(|w| measure_wafer_release(w, bench)).unwrap_or(0); let wafer = wafer_release
.and_then(|w| measure_wafer_release(w, bench))
.unwrap_or(0);
let consol = wafer_release
.and_then(|w| measure_wafer_consolidated(w, bench))
.unwrap_or(0);
let gf = gforth.and_then(|g| measure_gforth(g, bench)); let gf = gforth.and_then(|g| measure_gforth(g, bench));
let gf_fast = gforth_fast.and_then(|g| measure_gforth(g, bench)); let gf_fast = gforth_fast.and_then(|g| measure_gforth(g, bench));
let gf_str = gf.map_or_else(|| "-".to_string(), |v| format!("{v}")); let gf_str = gf.map_or_else(|| "-".to_string(), |v| format!("{v}"));
let gf_fast_str = gf_fast.map_or_else(|| "-".to_string(), |v| format!("{v}")); let gf_fast_str = gf_fast.map_or_else(|| "-".to_string(), |v| format!("{v}"));
let best_wafer = if consol > 0 && consol < wafer {
consol
} else {
wafer
};
let ratio = gf.map_or_else( let ratio = gf.map_or_else(
|| "-".to_string(), || "-".to_string(),
|g| { |g| {
if g > 0 { if g > 0 {
format!("{:.2}x", wafer as f64 / g as f64) format!("{:.2}x", best_wafer as f64 / g as f64)
} else { } else {
"-".to_string() "-".to_string()
} }
@@ -812,13 +862,13 @@ fn performance_report() {
); );
println!( println!(
"{:<22} {:>12} {:>12} {:>12} {:>12}", "{:<22} {:>10} {:>10} {:>10} {:>10} {:>10}",
bench.name, wafer, gf_str, gf_fast_str, ratio bench.name, wafer, consol, gf_str, gf_fast_str, ratio
); );
} }
println!("{thin}"); println!("{thin}");
println!(" WAFER = all optimizations enabled"); println!(" WAFER = all optimizations, CONSOL = after CONSOLIDATE");
println!(" WAFER/gf < 1.0 means WAFER is faster than gforth"); println!(" WAFER/gf = best(WAFER,CONSOL) vs gforth, < 1.0 means WAFER faster");
println!("{sep}\n"); println!("{sep}\n");
} }