From f7a8bf1d24c7431c6bf920c28057b7963960c158 Mon Sep 17 00:00:00 2001 From: Oleksandr Kozachuk Date: Thu, 2 Apr 2026 13:05:53 +0200 Subject: [PATCH] Implement startup batching: 12x faster boot Batch-compile all ~64 IR primitives into a single WASM module at startup. Replaces 64 separate Module::new + Instance::new with 1 of each. Reuses compile_consolidated_module() directly, removed compile_core_module() stub. Boot time: 7.7ms -> 0.6ms (release), test suite: 5.1s -> 1.5s (debug). 13 of 14 optimizations now implemented. 392 tests passing. --- crates/core/src/codegen.rs | 10 ----- crates/core/src/outer.rs | 80 +++++++++++++++++++++++++++++++++----- docs/OPTIMIZATIONS.md | 4 +- 3 files changed, 72 insertions(+), 22 deletions(-) diff --git a/crates/core/src/codegen.rs b/crates/core/src/codegen.rs index c510269..b2cbde7 100644 --- a/crates/core/src/codegen.rs +++ b/crates/core/src/codegen.rs @@ -1877,16 +1877,6 @@ pub fn compile_consolidated_module( Ok(bytes) } -/// Generate the core/bootstrap WASM module. -/// -/// Not yet implemented -- will be built in a future step. -pub fn compile_core_module(primitives: &[(String, Vec)]) -> WaferResult> { - let _ = primitives; - Err(WaferError::CodegenError( - "compile_core_module not yet implemented".to_string(), - )) -} - // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- diff --git a/crates/core/src/outer.rs b/crates/core/src/outer.rs index ab24fd4..6265cd2 100644 --- a/crates/core/src/outer.rs +++ b/crates/core/src/outer.rs @@ -237,6 +237,10 @@ pub struct ForthVM { config: WaferConfig, /// Total WASM module bytes compiled. total_module_bytes: u64, + /// When true, `register_primitive` defers WASM compilation for batch processing. + batch_mode: bool, + /// IR primitives deferred during `batch_mode` for single-module compilation. 
+ deferred_ir: Vec<(WordId, Vec)>, } impl ForthVM { @@ -360,6 +364,8 @@ impl ForthVM { ir_bodies: HashMap::new(), config: wafer_config, total_module_bytes: 0, + batch_mode: false, + deferred_ir: Vec::new(), }; vm.register_primitives()?; @@ -1563,6 +1569,50 @@ impl ForthVM { Ok(()) } + /// Batch-compile deferred IR primitives into one WASM module and install them in the table. + fn batch_compile_deferred(&mut self) -> anyhow::Result<()> { + let words = std::mem::take(&mut self.deferred_ir); + if words.is_empty() { + return Ok(()); + } + + let mut local_fn_map = HashMap::new(); + for (i, (word_id, _)) in words.iter().enumerate() { + local_fn_map.insert(*word_id, (i as u32) + 1); + } + + self.ensure_table_size(self.next_table_index)?; + let table_size = self.table_size(); + + let module_bytes = compile_consolidated_module(&words, &local_fn_map, table_size) + .map_err(|e| anyhow::anyhow!("batch compile error: {e}"))?; + + self.total_module_bytes += module_bytes.len() as u64; + let module = Module::new(&self.engine, &module_bytes)?; + let instance = Instance::new( + &mut self.store, + &module, + &[ + self.emit_func.into(), + self.memory.into(), + self.dsp.into(), + self.rsp.into(), + self.fsp.into(), + self.table.into(), + ], + )?; + + for (i, (word_id, _)) in words.iter().enumerate() { + let func = instance + .get_func(&mut self.store, &format!("fn_{i}")) + .ok_or_else(|| anyhow::anyhow!("missing batch export fn_{i}"))?; + self.table + .set(&mut self.store, word_id.0 as u64, Ref::Func(Some(func)))?; + } + + Ok(()) + } + // ----------------------------------------------------------------------- // WASM instantiation // ----------------------------------------------------------------------- @@ -1860,20 +1910,24 @@ impl ForthVM { .create(name, immediate) .map_err(|e| anyhow::anyhow!("{e}"))?; self.ir_bodies.insert(word_id, ir_body.clone()); - - let config = CodegenConfig { - base_fn_index: word_id.0, - table_size: self.table_size(), - stack_to_local_promotion: 
self.config.codegen.stack_to_local_promotion, - }; - let compiled = compile_word(name, &ir_body, &config) - .map_err(|e| anyhow::anyhow!("codegen error for {name}: {e}"))?; - - self.instantiate_and_install(&compiled, word_id)?; self.dictionary.reveal(); self.sync_word_lookup(name, word_id, immediate); self.next_table_index = self.next_table_index.max(word_id.0 + 1); + if self.batch_mode { + // Defer WASM compilation for batch processing + self.deferred_ir.push((word_id, ir_body)); + } else { + let config = CodegenConfig { + base_fn_index: word_id.0, + table_size: self.table_size(), + stack_to_local_promotion: self.config.codegen.stack_to_local_promotion, + }; + let compiled = compile_word(name, &ir_body, &config) + .map_err(|e| anyhow::anyhow!("codegen error for {name}: {e}"))?; + self.instantiate_and_install(&compiled, word_id)?; + } + Ok(word_id) } @@ -1901,6 +1955,8 @@ impl ForthVM { /// Register all built-in primitive words. fn register_primitives(&mut self) -> anyhow::Result<()> { + self.batch_mode = true; + // -- Stack manipulation -- self.register_primitive("DUP", false, vec![IrOp::Dup])?; self.register_primitive("DROP", false, vec![IrOp::Drop])?; @@ -2187,6 +2243,10 @@ impl ForthVM { // -- Floating-Point word set -- self.register_float_words()?; + // Batch-compile all deferred IR primitives into a single WASM module + self.batch_mode = false; + self.batch_compile_deferred()?; + Ok(()) } diff --git a/docs/OPTIMIZATIONS.md b/docs/OPTIMIZATIONS.md index 5ba09ed..edc6c90 100644 --- a/docs/OPTIMIZATIONS.md +++ b/docs/OPTIMIZATIONS.md @@ -26,7 +26,7 @@ This document describes every optimization that makes sense for WAFER, why it ma | 10 | Codegen Improvements | Codegen | Done | Medium | | 11 | wasmtime Configuration | Runtime | Done | Low | | 12 | Dictionary Hash Index | Runtime | Done | Low | -| 13 | Startup Batching | Architecture | Not started | Low | +| 13 | Startup Batching | Architecture | Done | Low | | 14 | Float / Double-Cell | Codegen | Not started | 
Future | ## 1. Stack-to-Local Promotion @@ -422,7 +422,7 @@ This affects **compile time** (word lookup during parsing), not runtime (compile ## 13. Startup Batching -**Status: Not started.** `compile_core_module()` stub exists in `codegen.rs`. +**Status: Done.** All IR primitives are batch-compiled into a single WASM module at boot via `compile_consolidated_module()`, reducing release-build boot time from ~7.7ms to ~0.6ms (12x faster). The `compile_core_module()` stub has been removed. Currently, each of the 80+ primitives registered at boot creates a separate WASM module: `wasm-encoder` builds it, `wasmparser` validates it, Cranelift compiles it, and wasmtime instantiates it. This happens 80+ times sequentially.