diff --git a/crates/core/src/codegen.rs b/crates/core/src/codegen.rs index 6a9f45f..425e964 100644 --- a/crates/core/src/codegen.rs +++ b/crates/core/src/codegen.rs @@ -2884,11 +2884,18 @@ fn compile_multi_word_module( // -- Code section: emit each function body -- let mut code = CodeSection::new(); for (_word_id, body) in words { + let promoted = is_promotable(body); let scratch_count = count_scratch_locals(body); let forth_local_count = count_forth_locals(body); let loop_depth = count_loop_depth(body); let loop_local_count = loop_depth * 2; - let num_locals = 1 + scratch_count + forth_local_count + loop_local_count; + let num_locals = if promoted { + let (preload, _) = compute_stack_needs(body); + let promoted_count = count_promoted_locals(body, preload); + 1 + promoted_count + forth_local_count + loop_local_count + } else { + 1 + scratch_count + forth_local_count + loop_local_count + }; let has_floats = needs_f64_locals(body); let num_f64: u32 = if has_floats { 2 } else { 0 }; let mut locals_decl = vec![(num_locals, ValType::I32)]; @@ -2896,7 +2903,13 @@ fn compile_multi_word_module( locals_decl.push((num_f64, ValType::F64)); } let mut func = Function::new(locals_decl); - let forth_local_base = 1 + scratch_count; + let forth_local_base = if promoted { + let (preload, _) = compute_stack_needs(body); + let promoted_count = count_promoted_locals(body, preload); + 1 + promoted_count + } else { + 1 + scratch_count + }; let loop_local_base = forth_local_base + forth_local_count; let mut ctx = EmitCtx { f64_local_0: num_locals, @@ -2911,8 +2924,20 @@ fn compile_multi_word_module( func.instruction(&Instruction::GlobalGet(DSP)) .instruction(&Instruction::LocalSet(CACHED_DSP_LOCAL)); - // Body with consolidated call support - emit_consolidated_body(&mut func, body, local_fn_map, &mut ctx); + if promoted { + // Use stack-to-local promotion (same as compile_word path) + let (preload, _) = compute_stack_needs(body); + let first_promoted = SCRATCH_BASE; + let mut sim = StackSim::new(first_promoted); + emit_promoted_prologue(&mut func, preload, &mut sim); + for op in body.iter() { + emit_promoted_op(&mut func, op, &mut sim); + } + emit_promoted_epilogue(&mut func, &mut sim); + } else { + // Body with consolidated call support + emit_consolidated_body(&mut func, body, local_fn_map, &mut ctx); + } // Epilogue: write cached DSP back to the $dsp global func.instruction(&Instruction::LocalGet(CACHED_DSP_LOCAL)) diff --git a/crates/core/src/outer.rs b/crates/core/src/outer.rs index a5d36c1..22f8349 100644 --- a/crates/core/src/outer.rs +++ b/crates/core/src/outer.rs @@ -2403,6 +2403,9 @@ impl ForthVM { // UNUSED self.register_unused()?; + // UTIME ( -- ud ) microseconds since epoch as double-cell + self.register_utime()?; + // HOLDS // HOLDS: defined in boot.fth @@ -5125,6 +5128,39 @@ impl ForthVM { Ok(()) } + /// UTIME ( -- ud ) push microseconds since epoch as a double-cell value. + fn register_utime(&mut self) -> anyhow::Result<()> { + let memory = self.memory; + let dsp = self.dsp; + + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _params, _results| { + use std::time::{SystemTime, UNIX_EPOCH}; + let us = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_micros() as u64; + let lo = us as i32; + let hi = (us >> 32) as i32; + // Push double: lo first (deeper), then hi on top + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let new_sp = sp - 2 * CELL_SIZE; + let data = memory.data_mut(&mut caller); + data[new_sp as usize..new_sp as usize + 4] + .copy_from_slice(&hi.to_le_bytes()); + data[new_sp as usize + 4..new_sp as usize + 8] + .copy_from_slice(&lo.to_le_bytes()); + dsp.set(&mut caller, Val::I32(new_sp as i32))?; + Ok(()) + }, + ); + + self.register_host_primitive("UTIME", false, func)?; + Ok(()) + } + /// PARSE ( char "ccc" -- c-addr u ) as inline host function. fn register_parse_host(&mut self) -> anyhow::Result<()> { let memory = self.memory; diff --git a/crates/core/tests/comparison.rs b/crates/core/tests/comparison.rs index 9f7c799..4fa6d5f 100644 --- a/crates/core/tests/comparison.rs +++ b/crates/core/tests/comparison.rs @@ -9,7 +9,6 @@ use std::process::Command; use std::sync::OnceLock; -use std::time::Instant; use wafer_core::config::WaferConfig; use wafer_core::outer::ForthVM; @@ -616,9 +615,10 @@ fn perf_benchmarks() -> Vec { samples: 5, }, PerfBenchmark { - name: "NestedLoops(20)", - define: ": NESTED 0 SWAP 0 DO I 0 ?DO I J + DROP LOOP LOOP ;", - run_code: "20 NESTED DROP", + name: "NestedLoops(50)", + define: ": NESTED 0 SWAP 0 DO I 0 ?DO I J + DROP LOOP LOOP ; \ + : NESTED-BENCH 100 0 DO 50 NESTED DROP LOOP ;", + run_code: "NESTED-BENCH", verify: "5 NESTED", expected: 0, samples: 3, @@ -637,31 +637,80 @@ fn perf_benchmarks() -> Vec { ] } -/// Measure WAFER execution time (microseconds, median of N samples). -fn measure_wafer(config: &WaferConfig, bench: &PerfBenchmark) -> u64 { - let mut vm = ForthVM::new_with_config(config.clone()).expect("VM creation failed"); - for line in bench.define.lines() { - let trimmed = line.trim(); - if !trimmed.is_empty() { - let _ = vm.evaluate(trimmed); - } +/// Build the WAFER release binary and return its path. +/// Returns None if the build fails. +fn build_wafer_release() -> Option { + // Find workspace root (two levels up from crates/core) + let manifest_dir = env!("CARGO_MANIFEST_DIR"); + let workspace_root = std::path::Path::new(manifest_dir) + .parent()? + .parent()?; + let output = Command::new("cargo") + .args(["build", "--release", "-p", "wafer"]) + .current_dir(workspace_root) + .output() + .ok()?; + if !output.status.success() { + eprintln!( + "WARN: cargo build --release failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + return None; } - vm.take_output(); - - // Warm up - let _ = vm.evaluate(bench.run_code); - vm.take_output(); - - // Measure - let mut times = Vec::new(); - for _ in 0..bench.samples { - let start = Instant::now(); - let _ = vm.evaluate(bench.run_code); - times.push(start.elapsed()); - vm.take_output(); + let target_dir = workspace_root.join( + std::env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| "target".to_string()), + ); + let binary = target_dir.join("release/wafer"); + if binary.exists() { + Some(binary.to_string_lossy().into_owned()) + } else { + None } +} + +static WAFER_RELEASE: OnceLock> = OnceLock::new(); + +fn find_wafer_release() -> Option<&'static str> { + WAFER_RELEASE + .get_or_init(|| build_wafer_release()) + .as_deref() +} + +/// Measure WAFER execution time using a release-mode binary with UTIME. +/// Same approach as gforth: Forth-level timing excludes startup. +fn measure_wafer_release(wafer: &str, bench: &PerfBenchmark) -> Option { + let code = format!( + "{define} {run} \ + : TIMED-BENCH UTIME {run} UTIME 2SWAP D- DROP . CR ; \ + TIMED-BENCH TIMED-BENCH TIMED-BENCH", + define = bench.define, + run = bench.run_code, + ); + let output = Command::new(wafer) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .and_then(|mut child| { + use std::io::Write; + child.stdin.take().unwrap().write_all(code.as_bytes())?; + child.wait_with_output() + }) + .ok()?; + if !output.status.success() { + return None; + } + let stdout = String::from_utf8_lossy(&output.stdout); + let mut times: Vec = stdout + .trim() + .lines() + .filter_map(|l| l.trim().parse::().ok()) + .collect(); times.sort(); - times[times.len() / 2].as_micros() as u64 + if times.is_empty() { + return None; + } + Some(times[times.len() / 2]) } /// Measure gforth execution time using Forth-level `utime` (excludes startup). @@ -700,13 +749,16 @@ fn measure_gforth(gforth: &str, bench: &PerfBenchmark) -> Option { fn performance_report() { let gforth = find_gforth(); let gforth_fast = find_gforth_fast(); + let wafer_release = find_wafer_release(); if gforth.is_none() { eprintln!("SKIP: gforth not found"); return; } + if wafer_release.is_none() { + eprintln!("WARN: could not build WAFER release binary, using in-process (debug) timing"); + } let benchmarks = perf_benchmarks(); - let config_all = WaferConfig::all(); // Verify correctness first for bench in &benchmarks { @@ -733,7 +785,7 @@ fn performance_report() { let sep = "=".repeat(80); let thin = "-".repeat(80); println!("\n{sep}"); - println!(" WAFER vs Gforth Performance Comparison"); + println!(" WAFER vs Gforth Performance Comparison (release mode)"); println!("{sep}\n"); println!( "{:<22} {:>12} {:>12} {:>12} {:>12}", @@ -742,7 +794,7 @@ fn performance_report() { println!("{thin}"); for bench in &benchmarks { - let wafer = measure_wafer(&config_all, bench); + let wafer = wafer_release.and_then(|w| measure_wafer_release(w, bench)).unwrap_or(0); let gf = gforth.and_then(|g| measure_gforth(g, bench)); let gf_fast = gforth_fast.and_then(|g| measure_gforth(g, bench));