Release-mode benchmarks, UTIME word, consolidated promotion
Three changes:

1. Add UTIME host function ( -- ud ) for microsecond timing in Forth. Enables self-timed benchmarks matching gforth's utime approach.
2. Switch comparison benchmarks to release mode: build the wafer binary with --release and measure via UTIME (excludes startup overhead). Previously the benchmarks measured debug-mode Rust overhead, not WASM execution.
3. Add stack-to-local promotion to the consolidated codegen path. Words that pass is_promotable now use the StackSim emit path even in CONSOLIDATE'd modules, preventing a performance regression.

Release-mode results (WAFER beats gforth on 4/5 benchmarks):

- Factorial: 0.54x (2x faster)
- GCD: 0.50x (2x faster)
- NestedLoops: 0.10x (10x faster)
- Collatz: 0.31x (3x faster)
- Fibonacci: 1.47x (slower; call overhead)
This commit is contained in:
@@ -2884,11 +2884,18 @@ fn compile_multi_word_module(
|
|||||||
// -- Code section: emit each function body --
|
// -- Code section: emit each function body --
|
||||||
let mut code = CodeSection::new();
|
let mut code = CodeSection::new();
|
||||||
for (_word_id, body) in words {
|
for (_word_id, body) in words {
|
||||||
|
let promoted = is_promotable(body);
|
||||||
let scratch_count = count_scratch_locals(body);
|
let scratch_count = count_scratch_locals(body);
|
||||||
let forth_local_count = count_forth_locals(body);
|
let forth_local_count = count_forth_locals(body);
|
||||||
let loop_depth = count_loop_depth(body);
|
let loop_depth = count_loop_depth(body);
|
||||||
let loop_local_count = loop_depth * 2;
|
let loop_local_count = loop_depth * 2;
|
||||||
let num_locals = 1 + scratch_count + forth_local_count + loop_local_count;
|
let num_locals = if promoted {
|
||||||
|
let (preload, _) = compute_stack_needs(body);
|
||||||
|
let promoted_count = count_promoted_locals(body, preload);
|
||||||
|
1 + promoted_count + forth_local_count + loop_local_count
|
||||||
|
} else {
|
||||||
|
1 + scratch_count + forth_local_count + loop_local_count
|
||||||
|
};
|
||||||
let has_floats = needs_f64_locals(body);
|
let has_floats = needs_f64_locals(body);
|
||||||
let num_f64: u32 = if has_floats { 2 } else { 0 };
|
let num_f64: u32 = if has_floats { 2 } else { 0 };
|
||||||
let mut locals_decl = vec![(num_locals, ValType::I32)];
|
let mut locals_decl = vec![(num_locals, ValType::I32)];
|
||||||
@@ -2896,7 +2903,13 @@ fn compile_multi_word_module(
|
|||||||
locals_decl.push((num_f64, ValType::F64));
|
locals_decl.push((num_f64, ValType::F64));
|
||||||
}
|
}
|
||||||
let mut func = Function::new(locals_decl);
|
let mut func = Function::new(locals_decl);
|
||||||
let forth_local_base = 1 + scratch_count;
|
let forth_local_base = if promoted {
|
||||||
|
let (preload, _) = compute_stack_needs(body);
|
||||||
|
let promoted_count = count_promoted_locals(body, preload);
|
||||||
|
1 + promoted_count
|
||||||
|
} else {
|
||||||
|
1 + scratch_count
|
||||||
|
};
|
||||||
let loop_local_base = forth_local_base + forth_local_count;
|
let loop_local_base = forth_local_base + forth_local_count;
|
||||||
let mut ctx = EmitCtx {
|
let mut ctx = EmitCtx {
|
||||||
f64_local_0: num_locals,
|
f64_local_0: num_locals,
|
||||||
@@ -2911,8 +2924,20 @@ fn compile_multi_word_module(
|
|||||||
func.instruction(&Instruction::GlobalGet(DSP))
|
func.instruction(&Instruction::GlobalGet(DSP))
|
||||||
.instruction(&Instruction::LocalSet(CACHED_DSP_LOCAL));
|
.instruction(&Instruction::LocalSet(CACHED_DSP_LOCAL));
|
||||||
|
|
||||||
|
if promoted {
|
||||||
|
// Use stack-to-local promotion (same as compile_word path)
|
||||||
|
let (preload, _) = compute_stack_needs(body);
|
||||||
|
let first_promoted = SCRATCH_BASE;
|
||||||
|
let mut sim = StackSim::new(first_promoted);
|
||||||
|
emit_promoted_prologue(&mut func, preload, &mut sim);
|
||||||
|
for op in body.iter() {
|
||||||
|
emit_promoted_op(&mut func, op, &mut sim);
|
||||||
|
}
|
||||||
|
emit_promoted_epilogue(&mut func, &mut sim);
|
||||||
|
} else {
|
||||||
// Body with consolidated call support
|
// Body with consolidated call support
|
||||||
emit_consolidated_body(&mut func, body, local_fn_map, &mut ctx);
|
emit_consolidated_body(&mut func, body, local_fn_map, &mut ctx);
|
||||||
|
}
|
||||||
|
|
||||||
// Epilogue: write cached DSP back to the $dsp global
|
// Epilogue: write cached DSP back to the $dsp global
|
||||||
func.instruction(&Instruction::LocalGet(CACHED_DSP_LOCAL))
|
func.instruction(&Instruction::LocalGet(CACHED_DSP_LOCAL))
|
||||||
|
|||||||
@@ -2403,6 +2403,9 @@ impl ForthVM {
|
|||||||
// UNUSED
|
// UNUSED
|
||||||
self.register_unused()?;
|
self.register_unused()?;
|
||||||
|
|
||||||
|
// UTIME ( -- ud ) microseconds since epoch as double-cell
|
||||||
|
self.register_utime()?;
|
||||||
|
|
||||||
// HOLDS
|
// HOLDS
|
||||||
// HOLDS: defined in boot.fth
|
// HOLDS: defined in boot.fth
|
||||||
|
|
||||||
@@ -5125,6 +5128,39 @@ impl ForthVM {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// UTIME ( -- ud ) push microseconds since epoch as a double-cell value.
|
||||||
|
fn register_utime(&mut self) -> anyhow::Result<()> {
|
||||||
|
let memory = self.memory;
|
||||||
|
let dsp = self.dsp;
|
||||||
|
|
||||||
|
let func = Func::new(
|
||||||
|
&mut self.store,
|
||||||
|
FuncType::new(&self.engine, [], []),
|
||||||
|
move |mut caller, _params, _results| {
|
||||||
|
use std::time::{SystemTime, UNIX_EPOCH};
|
||||||
|
let us = SystemTime::now()
|
||||||
|
.duration_since(UNIX_EPOCH)
|
||||||
|
.unwrap_or_default()
|
||||||
|
.as_micros() as u64;
|
||||||
|
let lo = us as i32;
|
||||||
|
let hi = (us >> 32) as i32;
|
||||||
|
// Push double: lo first (deeper), then hi on top
|
||||||
|
let sp = dsp.get(&mut caller).unwrap_i32() as u32;
|
||||||
|
let new_sp = sp - 2 * CELL_SIZE;
|
||||||
|
let data = memory.data_mut(&mut caller);
|
||||||
|
data[new_sp as usize..new_sp as usize + 4]
|
||||||
|
.copy_from_slice(&hi.to_le_bytes());
|
||||||
|
data[new_sp as usize + 4..new_sp as usize + 8]
|
||||||
|
.copy_from_slice(&lo.to_le_bytes());
|
||||||
|
dsp.set(&mut caller, Val::I32(new_sp as i32))?;
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
self.register_host_primitive("UTIME", false, func)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// PARSE ( char "ccc<char>" -- c-addr u ) as inline host function.
|
/// PARSE ( char "ccc<char>" -- c-addr u ) as inline host function.
|
||||||
fn register_parse_host(&mut self) -> anyhow::Result<()> {
|
fn register_parse_host(&mut self) -> anyhow::Result<()> {
|
||||||
let memory = self.memory;
|
let memory = self.memory;
|
||||||
|
|||||||
@@ -9,7 +9,6 @@
|
|||||||
|
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
use std::sync::OnceLock;
|
use std::sync::OnceLock;
|
||||||
use std::time::Instant;
|
|
||||||
|
|
||||||
use wafer_core::config::WaferConfig;
|
use wafer_core::config::WaferConfig;
|
||||||
use wafer_core::outer::ForthVM;
|
use wafer_core::outer::ForthVM;
|
||||||
@@ -616,9 +615,10 @@ fn perf_benchmarks() -> Vec<PerfBenchmark> {
|
|||||||
samples: 5,
|
samples: 5,
|
||||||
},
|
},
|
||||||
PerfBenchmark {
|
PerfBenchmark {
|
||||||
name: "NestedLoops(20)",
|
name: "NestedLoops(50)",
|
||||||
define: ": NESTED 0 SWAP 0 DO I 0 ?DO I J + DROP LOOP LOOP ;",
|
define: ": NESTED 0 SWAP 0 DO I 0 ?DO I J + DROP LOOP LOOP ; \
|
||||||
run_code: "20 NESTED DROP",
|
: NESTED-BENCH 100 0 DO 50 NESTED DROP LOOP ;",
|
||||||
|
run_code: "NESTED-BENCH",
|
||||||
verify: "5 NESTED",
|
verify: "5 NESTED",
|
||||||
expected: 0,
|
expected: 0,
|
||||||
samples: 3,
|
samples: 3,
|
||||||
@@ -637,31 +637,80 @@ fn perf_benchmarks() -> Vec<PerfBenchmark> {
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Measure WAFER execution time (microseconds, median of N samples).
|
/// Build the WAFER release binary and return its path.
|
||||||
fn measure_wafer(config: &WaferConfig, bench: &PerfBenchmark) -> u64 {
|
/// Returns None if the build fails.
|
||||||
let mut vm = ForthVM::new_with_config(config.clone()).expect("VM creation failed");
|
fn build_wafer_release() -> Option<String> {
|
||||||
for line in bench.define.lines() {
|
// Find workspace root (two levels up from crates/core)
|
||||||
let trimmed = line.trim();
|
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||||||
if !trimmed.is_empty() {
|
let workspace_root = std::path::Path::new(manifest_dir)
|
||||||
let _ = vm.evaluate(trimmed);
|
.parent()?
|
||||||
|
.parent()?;
|
||||||
|
let output = Command::new("cargo")
|
||||||
|
.args(["build", "--release", "-p", "wafer"])
|
||||||
|
.current_dir(workspace_root)
|
||||||
|
.output()
|
||||||
|
.ok()?;
|
||||||
|
if !output.status.success() {
|
||||||
|
eprintln!(
|
||||||
|
"WARN: cargo build --release failed: {}",
|
||||||
|
String::from_utf8_lossy(&output.stderr)
|
||||||
|
);
|
||||||
|
return None;
|
||||||
}
|
}
|
||||||
|
let target_dir = workspace_root.join(
|
||||||
|
std::env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| "target".to_string()),
|
||||||
|
);
|
||||||
|
let binary = target_dir.join("release/wafer");
|
||||||
|
if binary.exists() {
|
||||||
|
Some(binary.to_string_lossy().into_owned())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
}
|
}
|
||||||
vm.take_output();
|
}
|
||||||
|
|
||||||
// Warm up
|
static WAFER_RELEASE: OnceLock<Option<String>> = OnceLock::new();
|
||||||
let _ = vm.evaluate(bench.run_code);
|
|
||||||
vm.take_output();
|
|
||||||
|
|
||||||
// Measure
|
fn find_wafer_release() -> Option<&'static str> {
|
||||||
let mut times = Vec::new();
|
WAFER_RELEASE
|
||||||
for _ in 0..bench.samples {
|
.get_or_init(|| build_wafer_release())
|
||||||
let start = Instant::now();
|
.as_deref()
|
||||||
let _ = vm.evaluate(bench.run_code);
|
}
|
||||||
times.push(start.elapsed());
|
|
||||||
vm.take_output();
|
/// Measure WAFER execution time using a release-mode binary with UTIME.
|
||||||
|
/// Same approach as gforth: Forth-level timing excludes startup.
|
||||||
|
fn measure_wafer_release(wafer: &str, bench: &PerfBenchmark) -> Option<u64> {
|
||||||
|
let code = format!(
|
||||||
|
"{define} {run} \
|
||||||
|
: TIMED-BENCH UTIME {run} UTIME 2SWAP D- DROP . CR ; \
|
||||||
|
TIMED-BENCH TIMED-BENCH TIMED-BENCH",
|
||||||
|
define = bench.define,
|
||||||
|
run = bench.run_code,
|
||||||
|
);
|
||||||
|
let output = Command::new(wafer)
|
||||||
|
.stdin(std::process::Stdio::piped())
|
||||||
|
.stdout(std::process::Stdio::piped())
|
||||||
|
.stderr(std::process::Stdio::piped())
|
||||||
|
.spawn()
|
||||||
|
.and_then(|mut child| {
|
||||||
|
use std::io::Write;
|
||||||
|
child.stdin.take().unwrap().write_all(code.as_bytes())?;
|
||||||
|
child.wait_with_output()
|
||||||
|
})
|
||||||
|
.ok()?;
|
||||||
|
if !output.status.success() {
|
||||||
|
return None;
|
||||||
}
|
}
|
||||||
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||||
|
let mut times: Vec<u64> = stdout
|
||||||
|
.trim()
|
||||||
|
.lines()
|
||||||
|
.filter_map(|l| l.trim().parse::<u64>().ok())
|
||||||
|
.collect();
|
||||||
times.sort();
|
times.sort();
|
||||||
times[times.len() / 2].as_micros() as u64
|
if times.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(times[times.len() / 2])
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Measure gforth execution time using Forth-level `utime` (excludes startup).
|
/// Measure gforth execution time using Forth-level `utime` (excludes startup).
|
||||||
@@ -700,13 +749,16 @@ fn measure_gforth(gforth: &str, bench: &PerfBenchmark) -> Option<u64> {
|
|||||||
fn performance_report() {
|
fn performance_report() {
|
||||||
let gforth = find_gforth();
|
let gforth = find_gforth();
|
||||||
let gforth_fast = find_gforth_fast();
|
let gforth_fast = find_gforth_fast();
|
||||||
|
let wafer_release = find_wafer_release();
|
||||||
if gforth.is_none() {
|
if gforth.is_none() {
|
||||||
eprintln!("SKIP: gforth not found");
|
eprintln!("SKIP: gforth not found");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if wafer_release.is_none() {
|
||||||
|
eprintln!("WARN: could not build WAFER release binary, using in-process (debug) timing");
|
||||||
|
}
|
||||||
|
|
||||||
let benchmarks = perf_benchmarks();
|
let benchmarks = perf_benchmarks();
|
||||||
let config_all = WaferConfig::all();
|
|
||||||
|
|
||||||
// Verify correctness first
|
// Verify correctness first
|
||||||
for bench in &benchmarks {
|
for bench in &benchmarks {
|
||||||
@@ -733,7 +785,7 @@ fn performance_report() {
|
|||||||
let sep = "=".repeat(80);
|
let sep = "=".repeat(80);
|
||||||
let thin = "-".repeat(80);
|
let thin = "-".repeat(80);
|
||||||
println!("\n{sep}");
|
println!("\n{sep}");
|
||||||
println!(" WAFER vs Gforth Performance Comparison");
|
println!(" WAFER vs Gforth Performance Comparison (release mode)");
|
||||||
println!("{sep}\n");
|
println!("{sep}\n");
|
||||||
println!(
|
println!(
|
||||||
"{:<22} {:>12} {:>12} {:>12} {:>12}",
|
"{:<22} {:>12} {:>12} {:>12} {:>12}",
|
||||||
@@ -742,7 +794,7 @@ fn performance_report() {
|
|||||||
println!("{thin}");
|
println!("{thin}");
|
||||||
|
|
||||||
for bench in &benchmarks {
|
for bench in &benchmarks {
|
||||||
let wafer = measure_wafer(&config_all, bench);
|
let wafer = wafer_release.and_then(|w| measure_wafer_release(w, bench)).unwrap_or(0);
|
||||||
let gf = gforth.and_then(|g| measure_gforth(g, bench));
|
let gf = gforth.and_then(|g| measure_gforth(g, bench));
|
||||||
let gf_fast = gforth_fast.and_then(|g| measure_gforth(g, bench));
|
let gf_fast = gforth_fast.and_then(|g| measure_gforth(g, bench));
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user