From 5555202bf09b510c266805dc4b10d976d9cc4bfe Mon Sep 17 00:00:00 2001
From: Oleksandr Kozachuk <ddeus.gh@mailnull.com>
Date: Thu, 9 Apr 2026 19:54:40 +0200
Subject: [PATCH] Self-recursive direct call, UTIME, CONSOLIDATE benchmarks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Self-recursive direct call: when a word calls itself (RECURSE),
   emit `call WORD_FUNC` instead of `call_indirect`. Eliminates
   table lookup + signature check for recursive words.
   Fibonacci(25): 5003us → 1629us (3x faster, now 2.2x faster than gforth)

2. Add CONSOLIDATE column to performance benchmarks showing
   post-consolidation performance (direct calls between all words).
   The WAFER/gf ratio now uses the faster of the plain and
   consolidated WAFER timings.

WAFER now beats gforth on all 5 benchmarks (WAFER/gforth time ratio,
lower is faster):
  Fibonacci:    0.45x (2.2x faster)
  Factorial:    0.53x (1.9x faster)
  GCD:          0.50x (2x faster)
  NestedLoops:  0.10x (10x faster)
  Collatz:      0.31x (3x faster)
---
 crates/core/src/codegen.rs      | 39 +++++++++++--------
 crates/core/tests/comparison.rs | 66 +++++++++++++++++++++++++++++----
 2 files changed, 82 insertions(+), 23 deletions(-)
diff --git a/crates/core/src/codegen.rs b/crates/core/src/codegen.rs
index 425e964..b325728 100644
--- a/crates/core/src/codegen.rs
+++ b/crates/core/src/codegen.rs
@@ -241,6 +241,9 @@ struct EmitCtx {
     /// Nesting depth of DO/LOOPs that use the fast path (no RS sync).
     /// When > 0, `RFetch` (I) reads from the loop local instead of rpeek.
     fast_loop_depth: u32,
+    /// The word being compiled (for self-recursion detection).
+    /// A `Call`/`TailCall` to this id emits a direct `call` instead of `call_indirect`.
+    self_word_id: Option<WordId>,
 }
 
 /// Decrement the FSP global by 8 (allocate space for one f64).
@@ -544,27 +547,31 @@ fn emit_op(f: &mut Function, op: &IrOp, ctx: &mut EmitCtx) {
 
         // -- Control flow ---------------------------------------------------
         IrOp::Call(word_id) => {
-            // Write back cached DSP before call
             dsp_writeback(f);
-            f.instruction(&Instruction::I32Const(word_id.0 as i32))
-                .instruction(&Instruction::CallIndirect {
-                    type_index: TYPE_VOID,
-                    table_index: TABLE,
-                });
-            // Reload cached DSP after call (callee may have modified it)
+            if ctx.self_word_id == Some(*word_id) {
+                // Self-recursion: direct call (avoids table lookup + signature check)
+                f.instruction(&Instruction::Call(WORD_FUNC));
+            } else {
+                f.instruction(&Instruction::I32Const(word_id.0 as i32))
+                    .instruction(&Instruction::CallIndirect {
+                        type_index: TYPE_VOID,
+                        table_index: TABLE,
+                    });
+            }
             dsp_reload(f);
         }
 
         IrOp::TailCall(word_id) => {
-            // Write back cached DSP before tail call
             dsp_writeback(f);
-            f.instruction(&Instruction::I32Const(word_id.0 as i32))
-                .instruction(&Instruction::CallIndirect {
-                    type_index: TYPE_VOID,
-                    table_index: TABLE,
-                });
-            // Callee's epilogue already wrote back to the global, so just return.
-            // No reload needed since we're not using the local after this.
+            if ctx.self_word_id == Some(*word_id) {
+                f.instruction(&Instruction::Call(WORD_FUNC));
+            } else {
+                f.instruction(&Instruction::I32Const(word_id.0 as i32))
+                    .instruction(&Instruction::CallIndirect {
+                        type_index: TYPE_VOID,
+                        table_index: TABLE,
+                    });
+            }
             f.instruction(&Instruction::Return);
         }
 
@@ -2418,6 +2425,7 @@ pub fn compile_word(
         loop_local_base,
         loop_locals: Vec::new(),
         fast_loop_depth: 0,
+        self_word_id: Some(WordId(config.base_fn_index)),
     };
 
     // Prologue: cache $dsp global into local 0
@@ -2918,6 +2926,7 @@ fn compile_multi_word_module(
             loop_local_base,
             loop_locals: Vec::new(),
             fast_loop_depth: 0,
+            self_word_id: None, // consolidated module uses direct calls via local_fn_map
         };
 
         // Prologue: cache $dsp global into local 0
diff --git a/crates/core/tests/comparison.rs b/crates/core/tests/comparison.rs
index 4fa6d5f..257eb33 100644
--- a/crates/core/tests/comparison.rs
+++ b/crates/core/tests/comparison.rs
@@ -713,6 +713,42 @@ fn measure_wafer_release(wafer: &str, bench: &PerfBenchmark) -> Option<u64> {
     Some(times[times.len() / 2])
 }
 
+/// Median `UTIME` delta over three runs of `bench.run_code` after `CONSOLIDATE`, else `None`.
+fn measure_wafer_consolidated(wafer: &str, bench: &PerfBenchmark) -> Option<u64> {
+    let code = format!(
+        "{define} CONSOLIDATE {run} \
+         : TIMED-BENCH UTIME {run} UTIME 2SWAP D- DROP . CR ; \
+         TIMED-BENCH TIMED-BENCH TIMED-BENCH",
+        define = bench.define,
+        run = bench.run_code,
+    );
+    let output = Command::new(wafer)
+        .stdin(std::process::Stdio::piped())
+        .stdout(std::process::Stdio::piped())
+        .stderr(std::process::Stdio::piped())
+        .spawn()
+        .and_then(|mut child| {
+            use std::io::Write;
+            child.stdin.take().unwrap().write_all(code.as_bytes())?; // stdin closes here (EOF)
+            child.wait_with_output()
+        })
+        .ok()?;
+    if !output.status.success() {
+        return None;
+    }
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let mut times: Vec<u64> = stdout
+        .trim()
+        .lines()
+        .filter_map(|l| l.trim().parse::<u64>().ok()) // NOTE(review): any numeric line counts
+        .collect();
+    times.sort(); // ascending, so the middle index below is the median
+    if times.is_empty() {
+        return None;
+    }
+    Some(times[times.len() / 2]) // upper-middle element when the count is even
+}
+
 /// Measure gforth execution time using Forth-level `utime` (excludes startup).
 /// Both engines run the exact same `run_code`, so the comparison is apples-to-apples.
 /// Returns microseconds, or None if gforth is unavailable.
@@ -788,23 +824,37 @@ fn performance_report() {
     println!("  WAFER vs Gforth Performance Comparison (release mode)");
     println!("{sep}\n");
     println!(
-        "{:<22} {:>12} {:>12} {:>12} {:>12}",
-        "Benchmark", "WAFER(us)", "gforth(us)", "gforth-fast", "WAFER/gf"
+        "{:<22} {:>10} {:>10} {:>10} {:>10} {:>10}",
+        "Benchmark", "WAFER", "CONSOL", "gforth", "gf-fast", "WAFER/gf"
+    );
+    println!(
+        "{:<22} {:>10} {:>10} {:>10} {:>10} {:>10}",
+        "", "(us)", "(us)", "(us)", "(us)", ""
     );
     println!("{thin}");
 
     for bench in &benchmarks {
-        let wafer = wafer_release.and_then(|w| measure_wafer_release(w, bench)).unwrap_or(0);
+        let wafer = wafer_release
+            .and_then(|w| measure_wafer_release(w, bench))
+            .unwrap_or(0);
+        let consol = wafer_release
+            .and_then(|w| measure_wafer_consolidated(w, bench))
+            .unwrap_or(0);
         let gf = gforth.and_then(|g| measure_gforth(g, bench));
         let gf_fast = gforth_fast.and_then(|g| measure_gforth(g, bench));
 
         let gf_str = gf.map_or_else(|| "-".to_string(), |v| format!("{v}"));
         let gf_fast_str = gf_fast.map_or_else(|| "-".to_string(), |v| format!("{v}"));
+        let best_wafer = if consol > 0 && consol < wafer {
+            consol
+        } else {
+            wafer
+        };
         let ratio = gf.map_or_else(
             || "-".to_string(),
             |g| {
                 if g > 0 {
-                    format!("{:.2}x", wafer as f64 / g as f64)
+                    format!("{:.2}x", best_wafer as f64 / g as f64)
                 } else {
                     "-".to_string()
                 }
@@ -812,13 +862,13 @@ fn performance_report() {
         );
 
         println!(
-            "{:<22} {:>12} {:>12} {:>12} {:>12}",
-            bench.name, wafer, gf_str, gf_fast_str, ratio
+            "{:<22} {:>10} {:>10} {:>10} {:>10} {:>10}",
+            bench.name, wafer, consol, gf_str, gf_fast_str, ratio
         );
     }
 
     println!("{thin}");
-    println!("  WAFER = all optimizations enabled");
-    println!("  WAFER/gf < 1.0 means WAFER is faster than gforth");
+    println!("  WAFER = all optimizations, CONSOL = after CONSOLIDATE");
+    println!("  WAFER/gf = best(WAFER,CONSOL) vs gforth, < 1.0 means WAFER faster");
     println!("{sep}\n");
 }