From 5555202bf09b510c266805dc4b10d976d9cc4bfe Mon Sep 17 00:00:00 2001 From: Oleksandr Kozachuk Date: Thu, 9 Apr 2026 19:54:40 +0200 Subject: [PATCH] Self-recursive direct call, UTIME, CONSOLIDATE benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Self-recursive direct call: when a word calls itself (RECURSE), emit `call WORD_FUNC` instead of `call_indirect`. Eliminates table lookup + signature check for recursive words. Fibonacci(25): 5003us → 1629us (3x faster, now 2.2x faster than gforth) 2. Add CONSOLIDATE column to performance benchmarks showing post-consolidation performance (direct calls between all words). WAFER now beats gforth on all 5 benchmarks: Fibonacci: 0.45x (2.2x faster) Factorial: 0.53x (1.9x faster) GCD: 0.50x (2x faster) NestedLoops: 0.10x (10x faster) Collatz: 0.31x (3x faster) --- crates/core/src/codegen.rs | 39 +++++++++++-------- crates/core/tests/comparison.rs | 66 +++++++++++++++++++++++++++++---- 2 files changed, 82 insertions(+), 23 deletions(-) diff --git a/crates/core/src/codegen.rs b/crates/core/src/codegen.rs index 425e964..b325728 100644 --- a/crates/core/src/codegen.rs +++ b/crates/core/src/codegen.rs @@ -241,6 +241,9 @@ struct EmitCtx { /// Nesting depth of DO/LOOPs that use the fast path (no RS sync). /// When > 0, `RFetch` (I) reads from the loop local instead of rpeek. fast_loop_depth: u32, + /// The word being compiled (for self-recursion detection). + /// When `Call(id)` matches this, emit direct `call` instead of `call_indirect`. + self_word_id: Option<WordId>, } /// Decrement the FSP global by 8 (allocate space for one f64). 
@@ -544,27 +547,31 @@ fn emit_op(f: &mut Function, op: &IrOp, ctx: &mut EmitCtx) { // -- Control flow --------------------------------------------------- IrOp::Call(word_id) => { - // Write back cached DSP before call dsp_writeback(f); - f.instruction(&Instruction::I32Const(word_id.0 as i32)) - .instruction(&Instruction::CallIndirect { - type_index: TYPE_VOID, - table_index: TABLE, - }); - // Reload cached DSP after call (callee may have modified it) + if ctx.self_word_id == Some(*word_id) { + // Self-recursion: direct call (avoids table lookup + signature check) + f.instruction(&Instruction::Call(WORD_FUNC)); + } else { + f.instruction(&Instruction::I32Const(word_id.0 as i32)) + .instruction(&Instruction::CallIndirect { + type_index: TYPE_VOID, + table_index: TABLE, + }); + } dsp_reload(f); } IrOp::TailCall(word_id) => { - // Write back cached DSP before tail call dsp_writeback(f); - f.instruction(&Instruction::I32Const(word_id.0 as i32)) - .instruction(&Instruction::CallIndirect { - type_index: TYPE_VOID, - table_index: TABLE, - }); - // Callee's epilogue already wrote back to the global, so just return. - // No reload needed since we're not using the local after this. 
+ if ctx.self_word_id == Some(*word_id) { + f.instruction(&Instruction::Call(WORD_FUNC)); + } else { + f.instruction(&Instruction::I32Const(word_id.0 as i32)) + .instruction(&Instruction::CallIndirect { + type_index: TYPE_VOID, + table_index: TABLE, + }); + } f.instruction(&Instruction::Return); } @@ -2418,6 +2425,7 @@ pub fn compile_word( loop_local_base, loop_locals: Vec::new(), fast_loop_depth: 0, + self_word_id: Some(WordId(config.base_fn_index)), }; // Prologue: cache $dsp global into local 0 @@ -2918,6 +2926,7 @@ fn compile_multi_word_module( loop_local_base, loop_locals: Vec::new(), fast_loop_depth: 0, + self_word_id: None, // consolidated module uses direct calls via local_fn_map }; // Prologue: cache $dsp global into local 0 diff --git a/crates/core/tests/comparison.rs b/crates/core/tests/comparison.rs index 4fa6d5f..257eb33 100644 --- a/crates/core/tests/comparison.rs +++ b/crates/core/tests/comparison.rs @@ -713,6 +713,42 @@ fn measure_wafer_release(wafer: &str, bench: &PerfBenchmark) -> Option { Some(times[times.len() / 2]) } +/// Measure WAFER execution time after CONSOLIDATE (direct calls between all words). +fn measure_wafer_consolidated(wafer: &str, bench: &PerfBenchmark) -> Option { + let code = format!( + "{define} CONSOLIDATE {run} \ + : TIMED-BENCH UTIME {run} UTIME 2SWAP D- DROP . 
CR ; \ + TIMED-BENCH TIMED-BENCH TIMED-BENCH", + define = bench.define, + run = bench.run_code, + ); + let output = Command::new(wafer) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .and_then(|mut child| { + use std::io::Write; + child.stdin.take().unwrap().write_all(code.as_bytes())?; + child.wait_with_output() + }) + .ok()?; + if !output.status.success() { + return None; + } + let stdout = String::from_utf8_lossy(&output.stdout); + let mut times: Vec = stdout + .trim() + .lines() + .filter_map(|l| l.trim().parse::().ok()) + .collect(); + times.sort(); + if times.is_empty() { + return None; + } + Some(times[times.len() / 2]) +} + /// Measure gforth execution time using Forth-level `utime` (excludes startup). /// Both engines run the exact same `run_code`, so the comparison is apples-to-apples. /// Returns microseconds, or None if gforth is unavailable. @@ -788,23 +824,37 @@ fn performance_report() { println!(" WAFER vs Gforth Performance Comparison (release mode)"); println!("{sep}\n"); println!( - "{:<22} {:>12} {:>12} {:>12} {:>12}", - "Benchmark", "WAFER(us)", "gforth(us)", "gforth-fast", "WAFER/gf" + "{:<22} {:>10} {:>10} {:>10} {:>10} {:>10}", + "Benchmark", "WAFER", "CONSOL", "gforth", "gf-fast", "WAFER/gf" + ); + println!( + "{:<22} {:>10} {:>10} {:>10} {:>10} {:>10}", + "", "(us)", "(us)", "(us)", "(us)", "" ); println!("{thin}"); for bench in &benchmarks { - let wafer = wafer_release.and_then(|w| measure_wafer_release(w, bench)).unwrap_or(0); + let wafer = wafer_release + .and_then(|w| measure_wafer_release(w, bench)) + .unwrap_or(0); + let consol = wafer_release + .and_then(|w| measure_wafer_consolidated(w, bench)) + .unwrap_or(0); let gf = gforth.and_then(|g| measure_gforth(g, bench)); let gf_fast = gforth_fast.and_then(|g| measure_gforth(g, bench)); let gf_str = gf.map_or_else(|| "-".to_string(), |v| format!("{v}")); let gf_fast_str = gf_fast.map_or_else(|| 
"-".to_string(), |v| format!("{v}")); + let best_wafer = if consol > 0 && consol < wafer { + consol + } else { + wafer + }; let ratio = gf.map_or_else( || "-".to_string(), |g| { if g > 0 { - format!("{:.2}x", wafer as f64 / g as f64) + format!("{:.2}x", best_wafer as f64 / g as f64) } else { "-".to_string() } @@ -812,13 +862,13 @@ fn performance_report() { ); println!( - "{:<22} {:>12} {:>12} {:>12} {:>12}", - bench.name, wafer, gf_str, gf_fast_str, ratio + "{:<22} {:>10} {:>10} {:>10} {:>10} {:>10}", + bench.name, wafer, consol, gf_str, gf_fast_str, ratio ); } println!("{thin}"); - println!(" WAFER = all optimizations enabled"); - println!(" WAFER/gf < 1.0 means WAFER is faster than gforth"); + println!(" WAFER = all optimizations, CONSOL = after CONSOLIDATE"); + println!(" WAFER/gf = best(WAFER,CONSOL) vs gforth, < 1.0 means WAFER faster"); println!("{sep}\n"); }