Self-recursive direct call, UTIME, CONSOLIDATE benchmarks

1. Self-recursive direct call: when a word calls itself (RECURSE),
   emit `call WORD_FUNC` instead of `call_indirect`. This eliminates
   the table lookup and signature check for recursive words.
   Fibonacci(25): 5003us → 1629us (3x faster, now 2.2x faster than gforth)

2. Add CONSOLIDATE column to performance benchmarks showing
   post-consolidation performance (direct calls between all words).

WAFER now beats gforth on all 5 benchmarks (ratio = WAFER time / gforth time; lower is better):
  Fibonacci:    0.45x (2.2x faster)
  Factorial:    0.53x (1.9x faster)
  GCD:          0.50x (2x faster)
  NestedLoops:  0.10x (10x faster)
  Collatz:      0.31x (3x faster)
This commit is contained in:
2026-04-09 19:54:40 +02:00
parent b1f7a5cc49
commit 7344d3a8d7
2 changed files with 82 additions and 23 deletions
+24 -15
View File
@@ -241,6 +241,9 @@ struct EmitCtx {
/// Nesting depth of DO/LOOPs that use the fast path (no RS sync).
/// When > 0, `RFetch` (I) reads from the loop local instead of rpeek.
fast_loop_depth: u32,
/// The word being compiled (for self-recursion detection).
/// When `Call(id)` matches this, emit direct `call` instead of `call_indirect`.
self_word_id: Option<WordId>,
}
/// Decrement the FSP global by 8 (allocate space for one f64).
@@ -544,27 +547,31 @@ fn emit_op(f: &mut Function, op: &IrOp, ctx: &mut EmitCtx) {
// -- Control flow ---------------------------------------------------
IrOp::Call(word_id) => {
// Write back cached DSP before call
dsp_writeback(f);
f.instruction(&Instruction::I32Const(word_id.0 as i32))
.instruction(&Instruction::CallIndirect {
type_index: TYPE_VOID,
table_index: TABLE,
});
// Reload cached DSP after call (callee may have modified it)
if ctx.self_word_id == Some(*word_id) {
// Self-recursion: direct call (avoids table lookup + signature check)
f.instruction(&Instruction::Call(WORD_FUNC));
} else {
f.instruction(&Instruction::I32Const(word_id.0 as i32))
.instruction(&Instruction::CallIndirect {
type_index: TYPE_VOID,
table_index: TABLE,
});
}
dsp_reload(f);
}
IrOp::TailCall(word_id) => {
// Write back cached DSP before tail call
dsp_writeback(f);
f.instruction(&Instruction::I32Const(word_id.0 as i32))
.instruction(&Instruction::CallIndirect {
type_index: TYPE_VOID,
table_index: TABLE,
});
// Callee's epilogue already wrote back to the global, so just return.
// No reload needed since we're not using the local after this.
if ctx.self_word_id == Some(*word_id) {
f.instruction(&Instruction::Call(WORD_FUNC));
} else {
f.instruction(&Instruction::I32Const(word_id.0 as i32))
.instruction(&Instruction::CallIndirect {
type_index: TYPE_VOID,
table_index: TABLE,
});
}
f.instruction(&Instruction::Return);
}
@@ -2418,6 +2425,7 @@ pub fn compile_word(
loop_local_base,
loop_locals: Vec::new(),
fast_loop_depth: 0,
self_word_id: Some(WordId(config.base_fn_index)),
};
// Prologue: cache $dsp global into local 0
@@ -2918,6 +2926,7 @@ fn compile_multi_word_module(
loop_local_base,
loop_locals: Vec::new(),
fast_loop_depth: 0,
self_word_id: None, // consolidated module uses direct calls via local_fn_map
};
// Prologue: cache $dsp global into local 0