Optimize DO/LOOP: index/limit in WASM locals, J as IR primitive

Two-path DO/LOOP codegen based on static analysis of the loop body:

- Fast path (no calls, no >R/R> in body): index and limit live purely
  in WASM locals with zero return stack traffic per iteration. RFetch (I)
  and LoopJ (J) resolve to local.get instead of memory access.

- Slow path (body has calls or explicit RS ops): locals still used for
  loop control, but synced to return stack for LEAVE/UNLOOP compatibility.

Also converts J from a host function (WASM→Rust roundtrip per call) to
an IR primitive (IrOp::LoopJ) that compiles to local.get of the outer
loop's index local.

Performance impact (vs gforth, all opts enabled):
- Factorial: 1.02x → 0.94x (now faster than gforth)
- NestedLoops: 717x → 543x (24% faster, still bottlenecked by data stack)
- Fibonacci, GCD, Collatz: unchanged (don't use DO/LOOP)
This commit is contained in:
2026-04-09 17:13:31 +02:00
parent 1e2ede58ac
commit 4feeaeb0ba
3 changed files with 319 additions and 156 deletions
+2 -40
View File
@@ -2255,8 +2255,8 @@ impl ForthVM {
// -- Priority 1: Loop support --
// I -- push loop index (top of return stack)
self.register_primitive("I", false, vec![IrOp::RFetch])?;
// J -- outer loop counter (third item on return stack)
self.register_j()?;
// J -- outer loop counter
self.register_primitive("J", false, vec![IrOp::LoopJ])?;
// UNLOOP -- remove loop parameters from return stack
self.register_primitive(
"UNLOOP",
@@ -2515,44 +2515,6 @@ impl ForthVM {
// Priority 1: Loop support host functions
// -----------------------------------------------------------------------
/// Register J (outer loop counter) as a host function.
/// During nested DO loops the return stack looks like:
/// ... `outer_limit` `outer_index` `inner_limit` `inner_index` (`inner_index` on top)
/// J reads the outer index = rsp + 8 (skip inner index and inner limit).
fn register_j(&mut self) -> anyhow::Result<()> {
let memory = self.memory;
let dsp = self.dsp;
let rsp = self.rsp;
let func = Func::new(
&mut self.store,
FuncType::new(&self.engine, [], []),
move |mut caller, _params, _results| {
let rsp_val = rsp.get(&mut caller).unwrap_i32() as u32;
// rsp points to inner_index, rsp+4 = inner_limit, rsp+8 = outer_index
let addr = (rsp_val + 8) as usize;
let data = memory.data(&caller);
let b: [u8; 4] = data[addr..addr + 4].try_into().unwrap();
let value = i32::from_le_bytes(b);
// Push onto data stack
let sp = dsp.get(&mut caller).unwrap_i32() as u32;
let mem_len = memory.data(&caller).len() as u32;
if sp < CELL_SIZE || sp > mem_len {
return Err(wasmtime::Error::msg("data stack overflow in J"));
}
let new_sp = sp - CELL_SIZE;
let data = memory.data_mut(&mut caller);
let bytes = value.to_le_bytes();
data[new_sp as usize..new_sp as usize + 4].copy_from_slice(&bytes);
dsp.set(&mut caller, Val::I32(new_sp as i32))?;
Ok(())
},
);
self.register_host_primitive("J", false, func)?;
Ok(())
}
/// Register LEAVE as a host function.
/// Sets the loop index equal to the limit and sets the leave flag
/// so the loop exits on the next +LOOP/LOOP check.