Implement startup batching: 12x faster boot

Batch-compile all ~64 IR primitives into a single WASM module at startup.
Replaces 64 separate Module::new + Instance::new with 1 of each.
Reuses compile_consolidated_module() directly and removes the compile_core_module() stub.

Boot time: 7.7ms -> 0.6ms (release), test suite: 5.1s -> 1.5s (debug).
13 of 14 optimizations now implemented. 392 tests passing.
This commit is contained in:
2026-04-02 13:05:53 +02:00
parent 8c53afa28a
commit f7a8bf1d24
3 changed files with 72 additions and 22 deletions
-10
View File
@@ -1877,16 +1877,6 @@ pub fn compile_consolidated_module(
Ok(bytes) Ok(bytes)
} }
/// Placeholder for generating the core/bootstrap WASM module.
///
/// The generator has not been written yet: the `primitives` argument is
/// ignored and the call always fails.
///
/// # Errors
///
/// Always returns `WaferError::CodegenError` with a "not yet implemented"
/// message.
pub fn compile_core_module(primitives: &[(String, Vec<IrOp>)]) -> WaferResult<Vec<u8>> {
    // Explicitly discard the argument; nothing consumes it yet.
    let _ = primitives;
    let message = String::from("compile_core_module not yet implemented");
    Err(WaferError::CodegenError(message))
}
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Tests // Tests
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
+70 -10
View File
@@ -237,6 +237,10 @@ pub struct ForthVM {
config: WaferConfig, config: WaferConfig,
/// Total WASM module bytes compiled. /// Total WASM module bytes compiled.
total_module_bytes: u64, total_module_bytes: u64,
/// When true, `register_primitive` defers WASM compilation for batch processing.
batch_mode: bool,
/// IR primitives deferred during `batch_mode` for single-module compilation.
deferred_ir: Vec<(WordId, Vec<IrOp>)>,
} }
impl ForthVM { impl ForthVM {
@@ -360,6 +364,8 @@ impl ForthVM {
ir_bodies: HashMap::new(), ir_bodies: HashMap::new(),
config: wafer_config, config: wafer_config,
total_module_bytes: 0, total_module_bytes: 0,
batch_mode: false,
deferred_ir: Vec::new(),
}; };
vm.register_primitives()?; vm.register_primitives()?;
@@ -1563,6 +1569,50 @@ impl ForthVM {
Ok(()) Ok(())
} }
/// Batch-compile all deferred IR primitives into a single WASM module.
fn batch_compile_deferred(&mut self) -> anyhow::Result<()> {
let words = std::mem::take(&mut self.deferred_ir);
if words.is_empty() {
return Ok(());
}
let mut local_fn_map = HashMap::new();
for (i, (word_id, _)) in words.iter().enumerate() {
local_fn_map.insert(*word_id, (i as u32) + 1);
}
self.ensure_table_size(self.next_table_index)?;
let table_size = self.table_size();
let module_bytes = compile_consolidated_module(&words, &local_fn_map, table_size)
.map_err(|e| anyhow::anyhow!("batch compile error: {e}"))?;
self.total_module_bytes += module_bytes.len() as u64;
let module = Module::new(&self.engine, &module_bytes)?;
let instance = Instance::new(
&mut self.store,
&module,
&[
self.emit_func.into(),
self.memory.into(),
self.dsp.into(),
self.rsp.into(),
self.fsp.into(),
self.table.into(),
],
)?;
for (i, (word_id, _)) in words.iter().enumerate() {
let func = instance
.get_func(&mut self.store, &format!("fn_{i}"))
.ok_or_else(|| anyhow::anyhow!("missing batch export fn_{i}"))?;
self.table
.set(&mut self.store, word_id.0 as u64, Ref::Func(Some(func)))?;
}
Ok(())
}
// ----------------------------------------------------------------------- // -----------------------------------------------------------------------
// WASM instantiation // WASM instantiation
// ----------------------------------------------------------------------- // -----------------------------------------------------------------------
@@ -1860,20 +1910,24 @@ impl ForthVM {
.create(name, immediate) .create(name, immediate)
.map_err(|e| anyhow::anyhow!("{e}"))?; .map_err(|e| anyhow::anyhow!("{e}"))?;
self.ir_bodies.insert(word_id, ir_body.clone()); self.ir_bodies.insert(word_id, ir_body.clone());
let config = CodegenConfig {
base_fn_index: word_id.0,
table_size: self.table_size(),
stack_to_local_promotion: self.config.codegen.stack_to_local_promotion,
};
let compiled = compile_word(name, &ir_body, &config)
.map_err(|e| anyhow::anyhow!("codegen error for {name}: {e}"))?;
self.instantiate_and_install(&compiled, word_id)?;
self.dictionary.reveal(); self.dictionary.reveal();
self.sync_word_lookup(name, word_id, immediate); self.sync_word_lookup(name, word_id, immediate);
self.next_table_index = self.next_table_index.max(word_id.0 + 1); self.next_table_index = self.next_table_index.max(word_id.0 + 1);
if self.batch_mode {
// Defer WASM compilation for batch processing
self.deferred_ir.push((word_id, ir_body));
} else {
let config = CodegenConfig {
base_fn_index: word_id.0,
table_size: self.table_size(),
stack_to_local_promotion: self.config.codegen.stack_to_local_promotion,
};
let compiled = compile_word(name, &ir_body, &config)
.map_err(|e| anyhow::anyhow!("codegen error for {name}: {e}"))?;
self.instantiate_and_install(&compiled, word_id)?;
}
Ok(word_id) Ok(word_id)
} }
@@ -1901,6 +1955,8 @@ impl ForthVM {
/// Register all built-in primitive words. /// Register all built-in primitive words.
fn register_primitives(&mut self) -> anyhow::Result<()> { fn register_primitives(&mut self) -> anyhow::Result<()> {
self.batch_mode = true;
// -- Stack manipulation -- // -- Stack manipulation --
self.register_primitive("DUP", false, vec![IrOp::Dup])?; self.register_primitive("DUP", false, vec![IrOp::Dup])?;
self.register_primitive("DROP", false, vec![IrOp::Drop])?; self.register_primitive("DROP", false, vec![IrOp::Drop])?;
@@ -2187,6 +2243,10 @@ impl ForthVM {
// -- Floating-Point word set -- // -- Floating-Point word set --
self.register_float_words()?; self.register_float_words()?;
// Batch-compile all deferred IR primitives into a single WASM module
self.batch_mode = false;
self.batch_compile_deferred()?;
Ok(()) Ok(())
} }
+2 -2
View File
@@ -26,7 +26,7 @@ This document describes every optimization that makes sense for WAFER, why it ma
| 10 | Codegen Improvements | Codegen | Done | Medium | | 10 | Codegen Improvements | Codegen | Done | Medium |
| 11 | wasmtime Configuration | Runtime | Done | Low | | 11 | wasmtime Configuration | Runtime | Done | Low |
| 12 | Dictionary Hash Index | Runtime | Done | Low | | 12 | Dictionary Hash Index | Runtime | Done | Low |
| 13 | Startup Batching | Architecture | Not started | Low | | 13 | Startup Batching | Architecture | Done | Low |
| 14 | Float / Double-Cell | Codegen | Not started | Future | | 14 | Float / Double-Cell | Codegen | Not started | Future |
## 1. Stack-to-Local Promotion ## 1. Stack-to-Local Promotion
@@ -422,7 +422,7 @@ This affects **compile time** (word lookup during parsing), not runtime (compile
## 13. Startup Batching ## 13. Startup Batching
**Status: Not started.** `compile_core_module()` stub exists in `codegen.rs`. **Status: Done.** All IR primitives batch-compiled into a single WASM module at boot via `compile_consolidated_module()`. Reduces boot from ~7.7ms to ~0.6ms (12x faster). The `compile_core_module()` stub has been removed.
Currently, each of the 80+ primitives registered at boot creates a separate WASM module: `wasm-encoder` builds it, `wasmparser` validates it, Cranelift compiles it, and wasmtime instantiates it. This happens 80+ times sequentially. Currently, each of the 80+ primitives registered at boot creates a separate WASM module: `wasm-encoder` builds it, `wasmparser` validates it, Cranelift compiles it, and wasmtime instantiates it. This happens 80+ times sequentially.