Release-mode benchmarks, UTIME word, consolidated promotion
Three changes:

1. Add UTIME host function ( -- ud ) for microsecond timing in Forth. This enables self-timed benchmarks matching gforth's utime approach.
2. Switch comparison benchmarks to release mode: build the wafer binary with --release and measure via UTIME (which excludes startup overhead). Previously the benchmarks measured debug-mode Rust overhead, not WASM execution.
3. Add stack-to-local promotion to the consolidated codegen path. Words that pass is_promotable now use the StackSim emit path even in CONSOLIDATE'd modules, preventing a performance regression.

Release-mode results (WAFER beats gforth on 4/5 benchmarks):

- Factorial: 0.54x (2x faster)
- GCD: 0.50x (2x faster)
- NestedLoops: 0.10x (10x faster)
- Collatz: 0.31x (3x faster)
- Fibonacci: 1.47x (call overhead)
This commit is contained in:
@@ -2884,11 +2884,18 @@ fn compile_multi_word_module(
|
||||
// -- Code section: emit each function body --
|
||||
let mut code = CodeSection::new();
|
||||
for (_word_id, body) in words {
|
||||
let promoted = is_promotable(body);
|
||||
let scratch_count = count_scratch_locals(body);
|
||||
let forth_local_count = count_forth_locals(body);
|
||||
let loop_depth = count_loop_depth(body);
|
||||
let loop_local_count = loop_depth * 2;
|
||||
let num_locals = 1 + scratch_count + forth_local_count + loop_local_count;
|
||||
let num_locals = if promoted {
|
||||
let (preload, _) = compute_stack_needs(body);
|
||||
let promoted_count = count_promoted_locals(body, preload);
|
||||
1 + promoted_count + forth_local_count + loop_local_count
|
||||
} else {
|
||||
1 + scratch_count + forth_local_count + loop_local_count
|
||||
};
|
||||
let has_floats = needs_f64_locals(body);
|
||||
let num_f64: u32 = if has_floats { 2 } else { 0 };
|
||||
let mut locals_decl = vec![(num_locals, ValType::I32)];
|
||||
@@ -2896,7 +2903,13 @@ fn compile_multi_word_module(
|
||||
locals_decl.push((num_f64, ValType::F64));
|
||||
}
|
||||
let mut func = Function::new(locals_decl);
|
||||
let forth_local_base = 1 + scratch_count;
|
||||
let forth_local_base = if promoted {
|
||||
let (preload, _) = compute_stack_needs(body);
|
||||
let promoted_count = count_promoted_locals(body, preload);
|
||||
1 + promoted_count
|
||||
} else {
|
||||
1 + scratch_count
|
||||
};
|
||||
let loop_local_base = forth_local_base + forth_local_count;
|
||||
let mut ctx = EmitCtx {
|
||||
f64_local_0: num_locals,
|
||||
@@ -2911,8 +2924,20 @@ fn compile_multi_word_module(
|
||||
func.instruction(&Instruction::GlobalGet(DSP))
|
||||
.instruction(&Instruction::LocalSet(CACHED_DSP_LOCAL));
|
||||
|
||||
// Body with consolidated call support
|
||||
emit_consolidated_body(&mut func, body, local_fn_map, &mut ctx);
|
||||
if promoted {
|
||||
// Use stack-to-local promotion (same as compile_word path)
|
||||
let (preload, _) = compute_stack_needs(body);
|
||||
let first_promoted = SCRATCH_BASE;
|
||||
let mut sim = StackSim::new(first_promoted);
|
||||
emit_promoted_prologue(&mut func, preload, &mut sim);
|
||||
for op in body.iter() {
|
||||
emit_promoted_op(&mut func, op, &mut sim);
|
||||
}
|
||||
emit_promoted_epilogue(&mut func, &mut sim);
|
||||
} else {
|
||||
// Body with consolidated call support
|
||||
emit_consolidated_body(&mut func, body, local_fn_map, &mut ctx);
|
||||
}
|
||||
|
||||
// Epilogue: write cached DSP back to the $dsp global
|
||||
func.instruction(&Instruction::LocalGet(CACHED_DSP_LOCAL))
|
||||
|
||||
@@ -2403,6 +2403,9 @@ impl ForthVM {
|
||||
// UNUSED
|
||||
self.register_unused()?;
|
||||
|
||||
// UTIME ( -- ud ) microseconds since epoch as double-cell
|
||||
self.register_utime()?;
|
||||
|
||||
// HOLDS
|
||||
// HOLDS: defined in boot.fth
|
||||
|
||||
@@ -5125,6 +5128,39 @@ impl ForthVM {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// UTIME ( -- ud ) push microseconds since epoch as a double-cell value.
|
||||
fn register_utime(&mut self) -> anyhow::Result<()> {
|
||||
let memory = self.memory;
|
||||
let dsp = self.dsp;
|
||||
|
||||
let func = Func::new(
|
||||
&mut self.store,
|
||||
FuncType::new(&self.engine, [], []),
|
||||
move |mut caller, _params, _results| {
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
let us = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_micros() as u64;
|
||||
let lo = us as i32;
|
||||
let hi = (us >> 32) as i32;
|
||||
// Push double: lo first (deeper), then hi on top
|
||||
let sp = dsp.get(&mut caller).unwrap_i32() as u32;
|
||||
let new_sp = sp - 2 * CELL_SIZE;
|
||||
let data = memory.data_mut(&mut caller);
|
||||
data[new_sp as usize..new_sp as usize + 4]
|
||||
.copy_from_slice(&hi.to_le_bytes());
|
||||
data[new_sp as usize + 4..new_sp as usize + 8]
|
||||
.copy_from_slice(&lo.to_le_bytes());
|
||||
dsp.set(&mut caller, Val::I32(new_sp as i32))?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
self.register_host_primitive("UTIME", false, func)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// PARSE ( char "ccc<char>" -- c-addr u ) as inline host function.
|
||||
fn register_parse_host(&mut self) -> anyhow::Result<()> {
|
||||
let memory = self.memory;
|
||||
|
||||
Reference in New Issue
Block a user