Implement stack-to-local promotion and consolidation recompiler

Stack-to-local promotion (Phase 1: straight-line code):
- Words with no control flow or calls use WASM locals instead of the memory stack
- Stack manipulation (Swap, Rot, Nip, Tuck, Dup, Drop) emits ZERO instructions
- ~7x instruction reduction for arithmetic-heavy words like DUP *
- Pre-loads consumed items from memory, writes results back at exit

Consolidation recompiler (CONSOLIDATE word):
- Recompiles all IR-based words into a single WASM module
- Direct call instructions instead of call_indirect through function table
- Cranelift can inline and optimize across word boundaries
- All control flow variants support consolidated calls

342 unit tests + 11 compliance, all passing.
This commit is contained in:
2026-04-01 22:56:00 +02:00
parent fcd063d83d
commit 4f59ffa19e
4 changed files with 694 additions and 32 deletions
+368
View File
@@ -7,6 +7,7 @@
//! remains a global.
use std::borrow::Cow;
use std::collections::HashMap;
use wasm_encoder::{
BlockType, CodeSection, ConstExpr, ElementSection, Elements, EntityType, ExportKind,
@@ -14,6 +15,7 @@ use wasm_encoder::{
MemoryType, Module, RefType, TableType, TypeSection, ValType,
};
use crate::dictionary::WordId;
use crate::error::{WaferError, WaferResult};
use crate::ir::IrOp;
use crate::memory::CELL_SIZE;
@@ -954,6 +956,372 @@ pub fn compile_word(
})
}
// ---------------------------------------------------------------------------
// Consolidated module generation
// ---------------------------------------------------------------------------
/// Lower a sequence of IR operations into `f`, in order.
///
/// Each operation is routed through [`emit_consolidated_op`], so any
/// `Call`/`TailCall` whose target has an entry in `local_fn_map` is emitted
/// as a direct call instead of going through the function table.
fn emit_consolidated_body(f: &mut Function, ops: &[IrOp], local_fn_map: &HashMap<WordId, u32>) {
    ops.iter()
        .for_each(|op| emit_consolidated_op(f, op, local_fn_map));
}
/// Lower a single IR operation with consolidated-call support.
///
/// * `Call` / `TailCall`: a direct `call` is emitted when the target word has
///   an entry in `local_fn_map`; otherwise the word id is pushed as an `i32`
///   and the call goes through `call_indirect` on the function table.
/// * Structured control flow (`If`, the `Begin*` loops, `DoLoop`): nested
///   bodies are lowered recursively via [`emit_consolidated_body`], so calls
///   inside them receive the same treatment.
/// * Every other operation is forwarded unchanged to `emit_op`.
fn emit_consolidated_op(f: &mut Function, op: &IrOp, local_fn_map: &HashMap<WordId, u32>) {
    match op {
        IrOp::Call(word_id) => {
            emit_consolidated_call_target(f, word_id, local_fn_map);
            // Presumably refreshes the cached data-stack pointer after the
            // callee ran -- see `dsp_reload` for the exact contract.
            dsp_reload(f);
        }
        IrOp::TailCall(word_id) => {
            // Lowered as call + return; nothing is reloaded because the
            // function exits immediately afterwards.
            emit_consolidated_call_target(f, word_id, local_fn_map);
            f.instruction(&Instruction::Return);
        }

        // Control flow with nested bodies -- recurse so inner calls are
        // consolidated too.
        IrOp::If {
            then_body,
            else_body,
        } => {
            // `pop` supplies the condition consumed by `if`.
            pop(f);
            f.instruction(&Instruction::If(BlockType::Empty));
            emit_consolidated_body(f, then_body, local_fn_map);
            if let Some(else_ops) = else_body {
                f.instruction(&Instruction::Else);
                emit_consolidated_body(f, else_ops, local_fn_map);
            }
            f.instruction(&Instruction::End);
        }
        IrOp::DoLoop { body, is_plus_loop } => {
            emit_consolidated_do_loop(f, body, *is_plus_loop, local_fn_map);
        }
        IrOp::BeginUntil { body } => {
            f.instruction(&Instruction::Loop(BlockType::Empty));
            emit_consolidated_body(f, body, local_fn_map);
            // Flag == 0 -> branch back to the loop header (depth 0).
            emit_consolidated_branch_if_zero(f, 0);
            f.instruction(&Instruction::End);
        }
        IrOp::BeginAgain { body } => {
            f.instruction(&Instruction::Loop(BlockType::Empty));
            emit_consolidated_body(f, body, local_fn_map);
            f.instruction(&Instruction::Br(0));
            f.instruction(&Instruction::End);
        }
        IrOp::BeginWhileRepeat { test, body } => {
            // Outer block is the exit target (depth 1 from inside the loop).
            f.instruction(&Instruction::Block(BlockType::Empty))
                .instruction(&Instruction::Loop(BlockType::Empty));
            emit_consolidated_body(f, test, local_fn_map);
            emit_consolidated_branch_if_zero(f, 1);
            emit_consolidated_body(f, body, local_fn_map);
            f.instruction(&Instruction::Br(0));
            f.instruction(&Instruction::End);
            f.instruction(&Instruction::End);
        }
        IrOp::BeginDoubleWhileRepeat {
            outer_test,
            inner_test,
            body,
            after_repeat,
            else_body,
        } => {
            // Label nesting, outermost first: $end, $else, $after, $begin.
            f.instruction(&Instruction::Block(BlockType::Empty)) // $end
                .instruction(&Instruction::Block(BlockType::Empty)) // $else
                .instruction(&Instruction::Block(BlockType::Empty)) // $after
                .instruction(&Instruction::Loop(BlockType::Empty)); // $begin

            emit_consolidated_body(f, outer_test, local_fn_map);
            emit_consolidated_branch_if_zero(f, 2); // to $else

            emit_consolidated_body(f, inner_test, local_fn_map);
            emit_consolidated_branch_if_zero(f, 1); // to $after

            emit_consolidated_body(f, body, local_fn_map);
            f.instruction(&Instruction::Br(0)) // back to $begin
                .instruction(&Instruction::End) // end loop
                .instruction(&Instruction::End); // end $after block

            emit_consolidated_body(f, after_repeat, local_fn_map);
            if else_body.is_some() {
                // Skip the else part: jump to the end of $end.
                f.instruction(&Instruction::Br(1));
            }
            f.instruction(&Instruction::End); // end $else block

            if let Some(else_ops) = else_body {
                emit_consolidated_body(f, else_ops, local_fn_map);
            }
            f.instruction(&Instruction::End); // end $end block
        }

        // All other ops have no nested bodies with calls -- delegate to emit_op
        other => emit_op(f, other),
    }
}

/// Emit the shared front half of `Call`/`TailCall`: `dsp_writeback` followed
/// by either a direct `call` (target present in `local_fn_map`) or an
/// `i32.const <word id>` + `call_indirect` through the function table.
fn emit_consolidated_call_target(
    f: &mut Function,
    word_id: &WordId,
    local_fn_map: &HashMap<WordId, u32>,
) {
    dsp_writeback(f);
    match local_fn_map.get(word_id) {
        Some(&fn_idx) => {
            f.instruction(&Instruction::Call(fn_idx));
        }
        None => {
            // Fall back to an indirect call for words outside this module
            // (e.g. host functions).
            f.instruction(&Instruction::I32Const(word_id.0 as i32))
                .instruction(&Instruction::CallIndirect {
                    type_index: TYPE_VOID,
                    table_index: TABLE,
                });
        }
    }
}

/// Emit `pop`, `i32.eqz`, `br_if depth`: branch to the label at `depth` when
/// the popped flag is zero.
fn emit_consolidated_branch_if_zero(f: &mut Function, depth: u32) {
    pop(f);
    f.instruction(&Instruction::I32Eqz)
        .instruction(&Instruction::BrIf(depth));
}
/// Emit a DO...LOOP / DO...+LOOP with consolidated call support for the body.
fn emit_consolidated_do_loop(
f: &mut Function,
body: &[IrOp],
is_plus_loop: bool,
local_fn_map: &HashMap<WordId, u32>,
) {
// DO ( limit index -- )
pop_to(f, SCRATCH_BASE); // index
pop_to(f, SCRATCH_BASE + 1); // limit
// Push limit then index to return stack
f.instruction(&Instruction::LocalGet(SCRATCH_BASE + 1));
rpush_via_local(f, SCRATCH_BASE + 2);
f.instruction(&Instruction::LocalGet(SCRATCH_BASE));
rpush_via_local(f, SCRATCH_BASE + 2);
f.instruction(&Instruction::Block(BlockType::Empty));
f.instruction(&Instruction::Loop(BlockType::Empty));
emit_consolidated_body(f, body, local_fn_map);
// Pop current index from return stack into scratch local
rpop(f);
if is_plus_loop {
f.instruction(&Instruction::LocalSet(SCRATCH_BASE));
pop_to(f, SCRATCH_BASE + 2); // step from data stack
rpeek(f);
f.instruction(&Instruction::LocalSet(SCRATCH_BASE + 1));
f.instruction(&Instruction::LocalGet(SCRATCH_BASE))
.instruction(&Instruction::LocalGet(SCRATCH_BASE + 1))
.instruction(&Instruction::I32Sub)
.instruction(&Instruction::LocalSet(SCRATCH_BASE + 3));
f.instruction(&Instruction::LocalGet(SCRATCH_BASE))
.instruction(&Instruction::LocalGet(SCRATCH_BASE + 2))
.instruction(&Instruction::I32Add)
.instruction(&Instruction::LocalSet(SCRATCH_BASE));
f.instruction(&Instruction::LocalGet(SCRATCH_BASE));
rpush_via_local(f, SCRATCH_BASE + 2);
f.instruction(&Instruction::LocalGet(SCRATCH_BASE + 3))
.instruction(&Instruction::LocalGet(SCRATCH_BASE))
.instruction(&Instruction::LocalGet(SCRATCH_BASE + 1))
.instruction(&Instruction::I32Sub)
.instruction(&Instruction::I32Xor)
.instruction(&Instruction::I32Const(0))
.instruction(&Instruction::I32LtS)
.instruction(&Instruction::BrIf(1))
.instruction(&Instruction::Br(0))
.instruction(&Instruction::End)
.instruction(&Instruction::End);
} else {
f.instruction(&Instruction::I32Const(1))
.instruction(&Instruction::I32Add)
.instruction(&Instruction::LocalSet(SCRATCH_BASE));
rpeek(f);
f.instruction(&Instruction::LocalSet(SCRATCH_BASE + 1));
f.instruction(&Instruction::LocalGet(SCRATCH_BASE));
rpush_via_local(f, SCRATCH_BASE + 2);
f.instruction(&Instruction::LocalGet(SCRATCH_BASE))
.instruction(&Instruction::LocalGet(SCRATCH_BASE + 1))
.instruction(&Instruction::I32GeS)
.instruction(&Instruction::BrIf(1))
.instruction(&Instruction::Br(0))
.instruction(&Instruction::End)
.instruction(&Instruction::End);
}
// Clean up: pop index and limit from return stack
rpop(f);
f.instruction(&Instruction::Drop);
rpop(f);
f.instruction(&Instruction::Drop);
}
/// Compile all given words into a single consolidated WASM module.
///
/// Each word becomes a function in the module. Calls between words within the
/// module use direct `call` instructions instead of `call_indirect` through the
/// function table, enabling Cranelift to inline and optimize across word
/// boundaries.
///
/// # Arguments
///
/// * `words` - Words to consolidate, sorted by `WordId`. Each entry is
/// `(WordId, Vec<IrOp>)` containing the word's IR body.
/// * `local_fn_map` - Maps each `WordId` in the module to its WASM function
/// index (imported functions come first, so defined functions start at 1).
/// * `table_size` - Current function table size, used for table import minimum.
pub fn compile_consolidated_module(
words: &[(WordId, Vec<IrOp>)],
local_fn_map: &HashMap<WordId, u32>,
table_size: u32,
) -> WaferResult<Vec<u8>> {
let mut module = Module::new();
// -- Type section --
let mut types = TypeSection::new();
types.ty().function([], []); // type 0: () -> ()
types.ty().function([ValType::I32], []); // type 1: (i32) -> ()
module.section(&types);
// -- Import section (same as single-word modules) --
let mut imports = ImportSection::new();
imports.import("env", "emit", EntityType::Function(TYPE_I32));
imports.import(
"env",
"memory",
EntityType::Memory(MemoryType {
minimum: 1,
maximum: None,
memory64: false,
shared: false,
page_size_log2: None,
}),
);
imports.import(
"env",
"dsp",
EntityType::Global(GlobalType {
val_type: ValType::I32,
mutable: true,
shared: false,
}),
);
imports.import(
"env",
"rsp",
EntityType::Global(GlobalType {
val_type: ValType::I32,
mutable: true,
shared: false,
}),
);
imports.import(
"env",
"fsp",
EntityType::Global(GlobalType {
val_type: ValType::I32,
mutable: true,
shared: false,
}),
);
imports.import(
"env",
"table",
EntityType::Table(TableType {
element_type: RefType::FUNCREF,
minimum: table_size as u64,
maximum: None,
table64: false,
shared: false,
}),
);
module.section(&imports);
// -- Function section: N functions, all type void --
let mut functions = FunctionSection::new();
for _ in words {
functions.function(TYPE_VOID);
}
module.section(&functions);
// -- Export section: export each function as "fn_0", "fn_1", etc. --
let mut exports = ExportSection::new();
for (i, _) in words.iter().enumerate() {
let name = format!("fn_{i}");
// +1 because emit is imported function index 0
exports.export(&name, ExportKind::Func, (i as u32) + 1);
}
module.section(&exports);
// -- Element section: place each function in the table at its WordId slot --
// Use a single element section with one active segment per word.
let mut elements = ElementSection::new();
for (i, (word_id, _)) in words.iter().enumerate() {
let offset = ConstExpr::i32_const(word_id.0 as i32);
let fn_idx = (i as u32) + 1; // +1 for the emit import
let indices = [fn_idx];
elements.active(
Some(TABLE),
&offset,
Elements::Functions(Cow::Borrowed(&indices)),
);
}
module.section(&elements);
// -- Code section: emit each function body --
let mut code = CodeSection::new();
for (_word_id, body) in words {
let num_locals = 1 + count_scratch_locals(body);
let mut func = Function::new(vec![(num_locals, ValType::I32)]);
// Prologue: cache $dsp global into local 0
func.instruction(&Instruction::GlobalGet(DSP))
.instruction(&Instruction::LocalSet(CACHED_DSP_LOCAL));
// Body with consolidated call support
emit_consolidated_body(&mut func, body, local_fn_map);
// Epilogue: write cached DSP back to the $dsp global
func.instruction(&Instruction::LocalGet(CACHED_DSP_LOCAL))
.instruction(&Instruction::GlobalSet(DSP));
func.instruction(&Instruction::End);
code.function(&func);
}
module.section(&code);
let bytes = module.finish();
// Validate
wasmparser::validate(&bytes).map_err(|e| {
WaferError::ValidationError(format!("Consolidated WASM failed validation: {e}"))
})?;
Ok(bytes)
}
/// Generate the core/bootstrap WASM module.
///
/// Not yet implemented -- will be built in a future step.