Add optimization docs, workspace lints, and pre-commit hooks

- Add docs/OPTIMIZATIONS.md: catalog of 14 optimization passes with
  status tracking and implementation roadmap
- Configure workspace-level clippy and rustc lints in Cargo.toml
- Add clippy.toml and deny.toml for clippy thresholds and dependency
  auditing (licenses, advisories, bans)
- Set up pre-commit hook: cargo fmt, dprint, clippy, cargo deny,
  cargo machete
- Update Justfile with deny/machete targets, dprint in fmt checks
This commit is contained in:
2026-03-30 23:01:35 +02:00
parent 7507b1f164
commit 193ad7ec5a
8 changed files with 171 additions and 101 deletions
+32
View File
@@ -8,6 +8,38 @@ edition = "2024"
license = "MIT OR Apache-2.0" license = "MIT OR Apache-2.0"
repository = "https://github.com/ok2/wafer" repository = "https://github.com/ok2/wafer"
# Workspace-wide rustc lint levels. Member crates inherit them by declaring
# `[lints] workspace = true` in their own manifest.
[workspace.lints.rust]
# Hard errors: no `unsafe` code, and no ignored `#[must_use]` results.
unsafe_code = "deny"
unused_must_use = "deny"
# Allow-by-default rustc lints, raised to warnings here. The Justfile's clippy
# recipe passes `-D warnings`, so these fail that recipe.
elided_lifetimes_in_paths = "warn"
trivial_numeric_casts = "warn"
unreachable_pub = "warn"
unused_qualifications = "warn"
# Workspace-wide Clippy lint levels. Lints are enabled one by one rather than
# by turning on a whole lint group.
[workspace.lints.clippy]
# Pedantic group (selective)
cloned_instead_of_copied = "warn"
explicit_iter_loop = "warn"
flat_map_option = "warn"
implicit_clone = "warn"
inconsistent_struct_constructor = "warn"
inefficient_to_string = "warn"
manual_let_else = "warn"
map_unwrap_or = "warn"
redundant_closure_for_method_calls = "warn"
semicolon_if_nothing_returned = "warn"
uninlined_format_args = "warn"
unnested_or_patterns = "warn"
unused_self = "warn"
# Pedantic group, continued: docs, match arms, loops and pointer casts.
# (These are pedantic lints, not members of Clippy's correctness/suspicious groups.)
doc_markdown = "warn"
match_wildcard_for_single_variants = "warn"
needless_continue = "warn"
ref_as_ptr = "warn"
# Nursery (stable enough to use)
needless_collect = "warn"
or_fun_call = "warn"
[workspace.dependencies] [workspace.dependencies]
wasm-encoder = "0.228" wasm-encoder = "0.228"
wasmparser = "0.228" wasmparser = "0.228"
+13 -3
View File
@@ -17,13 +17,15 @@ compliance:
clippy: clippy:
cargo clippy --workspace -- -D warnings cargo clippy --workspace -- -D warnings
# Check formatting # Check formatting (Rust + Markdown)
fmt: fmt:
cargo fmt --all --check cargo fmt --all --check
dprint check
# Format code # Format code (Rust + Markdown)
fmt-fix: fmt-fix:
cargo fmt --all cargo fmt --all
dprint fmt
# Run the REPL # Run the REPL
repl: repl:
@@ -37,8 +39,16 @@ run file:
bench: bench:
cargo bench --workspace cargo bench --workspace
# Check dependency licenses and advisories
deny:
cargo deny check
# Detect unused dependencies
machete:
cargo machete --skip-target-dir
# Full CI check (what CI runs) # Full CI check (what CI runs)
ci: fmt clippy test ci: fmt clippy deny test
# Check compilation without running # Check compilation without running
check: check:
+3
View File
@@ -0,0 +1,3 @@
# Clippy thresholds; each is set above Clippy's documented default.
# Maximum lines in a function before `too_many_lines` fires (default 100).
# NOTE(review): `too_many_lines` is a pedantic lint and is not among the lints
# enabled in the workspace lint table -- confirm it is enabled somewhere,
# otherwise this setting has no effect.
too-many-lines-threshold = 150
# Complexity score above which `type_complexity` fires (default 250).
type-complexity-threshold = 300
# Maximum number of parameters a function may have before
# `too_many_arguments` fires (default 7).
too-many-arguments-threshold = 8
+7 -1
View File
@@ -5,8 +5,14 @@ version.workspace = true
edition.workspace = true edition.workspace = true
license.workspace = true license.workspace = true
# cargo-machete configuration: dependencies listed in `ignored` are excluded
# from its unused-dependency report.
[package.metadata.cargo-machete]
ignored = ["wasmtime", "wasmtime-wasi"]
# Inherit the lint levels declared under `[workspace.lints]` in the root manifest.
[lints]
workspace = true
[dependencies] [dependencies]
wafer-core = { path = "../core" } wafer-core = { path = "../core", version = "0.1.0" }
wasmtime = { workspace = true } wasmtime = { workspace = true }
wasmtime-wasi = { workspace = true } wasmtime-wasi = { workspace = true }
anyhow = { workspace = true } anyhow = { workspace = true }
+3
View File
@@ -5,6 +5,9 @@ version.workspace = true
edition.workspace = true edition.workspace = true
license.workspace = true license.workspace = true
# Inherit the lint levels declared under `[workspace.lints]` in the root manifest.
[lints]
workspace = true
[dependencies] [dependencies]
wasm-encoder = { workspace = true } wasm-encoder = { workspace = true }
wasmparser = { workspace = true } wasmparser = { workspace = true }
+7 -1
View File
@@ -5,5 +5,11 @@ version.workspace = true
edition.workspace = true edition.workspace = true
license.workspace = true license.workspace = true
# cargo-machete configuration: dependencies listed in `ignored` are excluded
# from its unused-dependency report.
[package.metadata.cargo-machete]
ignored = ["wafer-core"]
# Inherit the lint levels declared under `[workspace.lints]` in the root manifest.
[lints]
workspace = true
[dependencies] [dependencies]
wafer-core = { path = "../core" } wafer-core = { path = "../core", version = "0.1.0" }
+35
View File
@@ -0,0 +1,35 @@
# cargo-deny advisory check. Every ignored advisory carries a reason; remove
# an entry as soon as its reason no longer applies.
[advisories]
ignore = [
# wasmtime v31 known issues -- will resolve when upgrading wasmtime
{ id = "RUSTSEC-2025-0046", reason = "wasmtime v31: fd_renumber panic" },
{ id = "RUSTSEC-2025-0118", reason = "wasmtime v31: shared memory unsoundness" },
{ id = "RUSTSEC-2026-0006", reason = "wasmtime v31: f64.copysign segfault" },
{ id = "RUSTSEC-2026-0020", reason = "wasmtime v31: WASI resource exhaustion" },
{ id = "RUSTSEC-2026-0021", reason = "wasmtime v31: fields instance panic" },
# Unmaintained transitive deps from wasmtime/rustyline
{ id = "RUSTSEC-2025-0057", reason = "fxhash: transitive dep, no alternative" },
{ id = "RUSTSEC-2024-0436", reason = "paste: transitive dep, no alternative" },
]
# cargo-deny license check: SPDX license expressions accepted for crates in
# the dependency graph.
[licenses]
allow = [
"MIT",
"Apache-2.0",
"Apache-2.0 WITH LLVM-exception",
"BSD-2-Clause",
"BSD-3-Clause",
"BSL-1.0",
"Unicode-3.0",
"Zlib",
]
# Minimum match confidence (0.0-1.0) for license text to be accepted as a
# known license.
confidence-threshold = 0.8
# cargo-deny bans check.
[bans]
# Report, but do not fail on, several versions of the same crate in the graph.
multiple-versions = "warn"
# Reject wildcard (`*`) version requirements.
# NOTE(review): path dependencies without a `version` presumably count as
# wildcards here, which would explain the `version = "0.1.0"` on the
# `wafer-core` path dependencies -- confirm against the cargo-deny docs.
wildcards = "deny"
# cargo-deny sources check: crates may only come from the crates.io index;
# no git sources are allowed.
[sources]
unknown-registry = "deny"
unknown-git = "deny"
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
allow-git = []
+71 -96
View File
@@ -12,24 +12,22 @@ This document describes every optimization that makes sense for WAFER, why it ma
## Status Summary ## Status Summary
| # | Optimization | Level | Status | Impact | | # | Optimization | Level | Status | Impact |
|----|---------------------------|--------------|---------------------|----------| | -- | ------------------------ | ------------ | --------------- | ------- |
| 1 | Stack-to-Local Promotion | Codegen | Not implemented | Highest | | 1 | Stack-to-Local Promotion | Codegen | Not implemented | Highest |
| 2 | Peephole Optimization | IR pass | Not implemented | High | | 2 | Peephole Optimization | IR pass | Not implemented | High |
| 3 | Constant Folding | IR pass | Not implemented | High | | 3 | Constant Folding | IR pass | Not implemented | High |
| 4 | Inlining | IR pass | Not implemented | High | | 4 | Inlining | IR pass | Not implemented | High |
| 5 | Strength Reduction | IR pass | Not implemented | Medium | | 5 | Strength Reduction | IR pass | Not implemented | Medium |
| 6 | Dead Code Elimination | IR pass | Not implemented | Medium | | 6 | Dead Code Elimination | IR pass | Not implemented | Medium |
| 7 | Tail Call Optimization | IR + Codegen | Partial | Medium | | 7 | Tail Call Optimization | IR + Codegen | Partial | Medium |
| 8 | Consolidation | Architecture | Not implemented | High | | 8 | Consolidation | Architecture | Not implemented | High |
| 9 | Compound IR Operations | IR + Codegen | Not implemented | Medium | | 9 | Compound IR Operations | IR + Codegen | Not implemented | Medium |
| 10 | Codegen Improvements | Codegen | Not implemented | Medium | | 10 | Codegen Improvements | Codegen | Not implemented | Medium |
| 11 | wasmtime Configuration | Runtime | Not implemented | Low | | 11 | wasmtime Configuration | Runtime | Not implemented | Low |
| 12 | Dictionary Hash Index | Runtime | Not implemented | Low | | 12 | Dictionary Hash Index | Runtime | Not implemented | Low |
| 13 | Startup Batching | Architecture | Not implemented | Low | | 13 | Startup Batching | Architecture | Not implemented | Low |
| 14 | Float / Double-Cell | Codegen | Not implemented | Future | | 14 | Float / Double-Cell | Codegen | Not implemented | Future |
---
## 1. Stack-to-Local Promotion ## 1. Stack-to-Local Promotion
@@ -105,8 +103,6 @@ When the compiler can statically determine the types and lifetimes of values on
- Inference pass: new code in `optimizer.rs` or a dedicated `promote.rs` - Inference pass: new code in `optimizer.rs` or a dedicated `promote.rs`
- Codegen integration: `crates/core/src/codegen.rs` `emit_op()` needs a second code path - Codegen integration: `crates/core/src/codegen.rs` `emit_op()` needs a second code path
---
## 2. Peephole Optimization ## 2. Peephole Optimization
**Status: Not implemented.** **Status: Not implemented.**
@@ -115,29 +111,27 @@ A peephole optimizer scans adjacent IR operations and replaces recognized patter
### Patterns ### Patterns
| Pattern | Replacement | Savings | | Pattern | Replacement | Savings |
|---------|-------------|---------| | -------------------- | ---------------------------------- | -------------------------- |
| `PushI32(n), Drop` | *(remove both)* | 1 push + 1 pop | | `PushI32(n), Drop` | _(remove both)_ | 1 push + 1 pop |
| `Dup, Drop` | *(remove both)* | 1 peek+push + 1 pop | | `Dup, Drop` | _(remove both)_ | 1 peek+push + 1 pop |
| `Swap, Swap` | *(remove both)* | 2x(2 pops + 2 pushes) | | `Swap, Swap` | _(remove both)_ | 2x(2 pops + 2 pushes) |
| `Swap, Drop` | `Nip` | 1 pop | | `Swap, Drop` | `Nip` | 1 pop |
| `Over, Over` | `TwoDup` (new) | 1 peek+push | | `Over, Over` | `TwoDup` (new) | 1 peek+push |
| `Drop, Drop` | `TwoDrop` (new) | 1 dsp adjustment | | `Drop, Drop` | `TwoDrop` (new) | 1 dsp adjustment |
| `PushI32(0), Add` | *(remove both)* | 1 push + 1 pop + add | | `PushI32(0), Add` | _(remove both)_ | 1 push + 1 pop + add |
| `PushI32(0), Or` | *(remove both)* | same | | `PushI32(0), Or` | _(remove both)_ | same |
| `PushI32(-1), And` | *(remove both)* | same | | `PushI32(-1), And` | _(remove both)_ | same |
| `PushI32(1), Add` | `Inc` (new or codegen special) | avoids pushing constant | | `PushI32(1), Add` | `Inc` (new or codegen special) | avoids pushing constant |
| `PushI32(1), Sub` | `Dec` (new or codegen special) | same | | `PushI32(1), Sub` | `Dec` (new or codegen special) | same |
| `ZeroEq, ZeroEq` | *(remove both)* for boolean inputs | 2 comparisons | | `ZeroEq, ZeroEq` | _(remove both)_ for boolean inputs | 2 comparisons |
| `DivMod, Swap, Drop` | `Div` (new or codegen special) | avoids computing remainder | | `DivMod, Swap, Drop` | `Div` (new or codegen special) | avoids computing remainder |
| `DivMod, Drop` | `Mod` (new or codegen special) | avoids computing quotient | | `DivMod, Drop` | `Mod` (new or codegen special) | avoids computing quotient |
### Implementation ### Implementation
A single function `fn peephole(ops: Vec<IrOp>) -> Vec<IrOp>` that makes repeated passes until no more patterns match. Recurse into control flow bodies (If/DoLoop/Begin*). A single function `fn peephole(ops: Vec<IrOp>) -> Vec<IrOp>` that makes repeated passes until no more patterns match. Recurse into control flow bodies (If/DoLoop/Begin*).
---
## 3. Constant Folding ## 3. Constant Folding
**Status: Not implemented.** **Status: Not implemented.**
@@ -167,8 +161,6 @@ Constant folding composes with inlining: after inlining a word, new folding oppo
A function `fn constant_fold(ops: Vec<IrOp>) -> Vec<IrOp>` that simulates a compile-time stack of known constants and replaces foldable sequences. Must handle all arithmetic, comparison, logic, and unary operations in `IrOp`. A function `fn constant_fold(ops: Vec<IrOp>) -> Vec<IrOp>` that simulates a compile-time stack of known constants and replaces foldable sequences. Must handle all arithmetic, comparison, logic, and unary operations in `IrOp`.
---
## 4. Inlining ## 4. Inlining
**Status: Not implemented.** **Status: Not implemented.**
@@ -187,21 +179,25 @@ Every call in WAFER is `call_indirect` through a function table. This is slower
``` ```
Before inlining, MAIN's IR: Before inlining, MAIN's IR:
``` ```
PushI32(5), Call(SQUARE), PushI32(3), Call(SQUARE), Add PushI32(5), Call(SQUARE), PushI32(3), Call(SQUARE), Add
``` ```
After inlining SQUARE: After inlining SQUARE:
``` ```
PushI32(5), Dup, Mul, PushI32(3), Dup, Mul, Add PushI32(5), Dup, Mul, PushI32(3), Dup, Mul, Add
``` ```
After constant folding: After constant folding:
``` ```
PushI32(25), PushI32(9), Add PushI32(25), PushI32(9), Add
``` ```
After more folding: After more folding:
``` ```
PushI32(34) PushI32(34)
``` ```
@@ -214,8 +210,6 @@ PushI32(34)
- Do not inline words with side effects that depend on call context (rare) - Do not inline words with side effects that depend on call context (rare)
- Re-run peephole and constant folding after inlining - Re-run peephole and constant folding after inlining
---
## 5. Strength Reduction ## 5. Strength Reduction
**Status: Not implemented.** **Status: Not implemented.**
@@ -224,19 +218,17 @@ Replace expensive operations with cheaper equivalents when one operand is a know
### Patterns ### Patterns
| Pattern | Replacement | Why | | Pattern | Replacement | Why |
|---------|-------------|-----| | ---------------------- | ------------------------------- | ------------------------------ |
| `PushI32(2^n), Mul` | `PushI32(n), Lshift` | shift is 1 cycle vs multiply | | `PushI32(2^n), Mul` | `PushI32(n), Lshift` | shift is 1 cycle vs multiply |
| `PushI32(2^n), DivMod` | `PushI32(n), Rshift` (unsigned) | shift vs divide | | `PushI32(2^n), DivMod` | `PushI32(n), Rshift` (unsigned) | shift vs divide |
| `PushI32(1), Lshift` | `Dup, Add` | add is often faster than shift | | `PushI32(1), Lshift` | `Dup, Add` | add is often faster than shift |
| `PushI32(0), Gt` | `ZeroGt` (if added) | avoids pushing constant | | `PushI32(0), Gt` | `ZeroGt` (if added) | avoids pushing constant |
| `PushI32(0), Eq` | `ZeroEq` | already exists as IR op | | `PushI32(0), Eq` | `ZeroEq` | already exists as IR op |
| `PushI32(0), Lt` | `ZeroLt` | already exists as IR op | | `PushI32(0), Lt` | `ZeroLt` | already exists as IR op |
The most common case is `CELLS` which is defined as `PushI32(4), Mul`. Strength reduction turns this into `PushI32(2), Lshift`. The most common case is `CELLS` which is defined as `PushI32(4), Mul`. Strength reduction turns this into `PushI32(2), Lshift`.
---
## 6. Dead Code Elimination ## 6. Dead Code Elimination
**Status: Not implemented.** **Status: Not implemented.**
@@ -252,8 +244,6 @@ Remove IR operations that can never execute or whose results are never used.
DCE should run after constant folding, since folding can create new constant conditionals. DCE should run after constant folding, since folding can create new constant conditionals.
---
## 7. Tail Call Optimization ## 7. Tail Call Optimization
**Status: Partial.** `IrOp::TailCall(WordId)` exists in `ir.rs` and codegen handles it in `codegen.rs`, but the compiler never generates it. **Status: Partial.** `IrOp::TailCall(WordId)` exists in `ir.rs` and codegen handles it in `codegen.rs`, but the compiler never generates it.
@@ -261,6 +251,7 @@ DCE should run after constant folding, since folding can create new constant con
### What Exists ### What Exists
The codegen for `TailCall` emits: The codegen for `TailCall` emits:
```wasm ```wasm
i32.const <word_id> i32.const <word_id>
call_indirect (type $void) (table 0) call_indirect (type $void) (table 0)
@@ -279,8 +270,6 @@ The compiler (`outer.rs`) needs to detect tail position: when the last operation
Detection rule: if the last IR op in a word body (or in a branch of an `If`) is `Call(id)`, and there are no pending return-stack items (`>R` without matching `R>`), replace with `TailCall(id)`. Detection rule: if the last IR op in a word body (or in a branch of an `If`) is `Call(id)`, and there are no pending return-stack items (`>R` without matching `R>`), replace with `TailCall(id)`.
---
## 8. Consolidation ## 8. Consolidation
**Status: Not implemented.** Stub exists at `crates/core/src/consolidate.rs`. **Status: Not implemented.** Stub exists at `crates/core/src/consolidate.rs`.
@@ -304,12 +293,10 @@ After interactive development, `CONSOLIDATE` recompiles all defined words into a
### Two Modes ### Two Modes
| Mode | When | Properties | | Mode | When | Properties |
|------|------|------------| | ------------- | ----------------------- | ------------------------------------------------ |
| JIT (current) | Interactive development | Per-word modules, `call_indirect`, fast redefine | | JIT (current) | Interactive development | Per-word modules, `call_indirect`, fast redefine |
| Consolidated | After `CONSOLIDATE` | Single module, direct `call`, no redefine | | Consolidated | After `CONSOLIDATE` | Single module, direct `call`, no redefine |
---
## 9. Compound IR Operations ## 9. Compound IR Operations
@@ -353,8 +340,6 @@ Instead of two separate `dsp += 4`, emit one `dsp += 8`.
These can be added as new `IrOp` variants recognized by peephole and emitted by codegen with specialized WASM sequences. These can be added as new `IrOp` variants recognized by peephole and emitted by codegen with specialized WASM sequences.
---
## 10. Codegen Improvements ## 10. Codegen Improvements
**Status: Not implemented.** **Status: Not implemented.**
@@ -407,26 +392,22 @@ i32.add ;; result on wasm stack
`DO...LOOP` currently stores the loop index and limit on the return stack (in memory). Keep them in WASM locals for the duration of the loop body. This makes `I` (read loop index) a simple `local.get` instead of a memory load from the return stack. `DO...LOOP` currently stores the loop index and limit on the return stack (in memory). Keep them in WASM locals for the duration of the loop body. This makes `I` (read loop index) a simple `local.get` instead of a memory load from the return stack.
---
## 11. wasmtime Configuration ## 11. wasmtime Configuration
**Status: Not implemented.** Currently using `Engine::default()`. **Status: Not implemented.** Currently using `Engine::default()`.
### Available Knobs ### Available Knobs
| Setting | Current | Recommended | Effect | | Setting | Current | Recommended | Effect |
|---------|---------|-------------|--------| | ---------------------------------------- | --------------- | ----------- | -------------------------------------- |
| `Config::cranelift_opt_level` | Speed (default) | Speed | Already optimal for JIT | | `Config::cranelift_opt_level` | Speed (default) | Speed | Already optimal for JIT |
| `Config::cranelift_nan_canonicalization` | false (default) | false | Already off by default; keep it off (no floats yet) | | `Config::cranelift_nan_canonicalization` | false (default) | false | Already off by default; keep it off (no floats yet) |
| `Config::parallel_compilation` | true | true | Already optimal | | `Config::parallel_compilation` | true | true | Already optimal |
| Module caching | none | file-based | Cache compiled modules across sessions | | Module caching | none | file-based | Cache compiled modules across sessions |
| Epoch interruption | none | enable | Protect against infinite loops | | Epoch interruption | none | enable | Protect against infinite loops |
Module caching is the most impactful: `wasmtime::Config::cache_config_load_default()` enables disk-based caching of compiled WASM, so restarting WAFER with the same definitions does not re-invoke Cranelift. Module caching is the most impactful: `wasmtime::Config::cache_config_load_default()` enables disk-based caching of compiled WASM, so restarting WAFER with the same definitions does not re-invoke Cranelift.
---
## 12. Dictionary Hash Index ## 12. Dictionary Hash Index
**Status: Not implemented.** **Status: Not implemented.**
@@ -439,8 +420,6 @@ Maintain a `HashMap<String, (u32, WordId, bool)>` alongside the linked list. Upd
This affects **compile time** (word lookup during parsing), not runtime (compiled code uses function table indices directly). This affects **compile time** (word lookup during parsing), not runtime (compiled code uses function table indices directly).
---
## 13. Startup Batching ## 13. Startup Batching
**Status: Not implemented.** `compile_core_module()` stub exists in `codegen.rs`. **Status: Not implemented.** `compile_core_module()` stub exists in `codegen.rs`.
@@ -451,35 +430,31 @@ Currently, each of the 80+ primitives registered at boot creates a separate WASM
Batch all IR-based primitives into a single WASM module with multiple exported functions. One `Module::new()` + one `Instance::new()` replaces 80+ pairs. This is a subset of what Consolidation (section 8) achieves, but scoped to primitives only and simpler to implement. Batch all IR-based primitives into a single WASM module with multiple exported functions. One `Module::new()` + one `Instance::new()` replaces 80+ pairs. This is a subset of what Consolidation (section 8) achieves, but scoped to primitives only and simpler to implement.
---
## 14. Float and Double-Cell Stack ## 14. Float and Double-Cell Stack
**Status: Not implemented.** `PushI64` and `PushF64` exist as IR ops but are stubs in codegen. **Status: Not implemented.** `PushI64` and `PushF64` exist as IR ops but are stubs in codegen.
The float stack lives in its own memory region (0x2540--0x2D40). Float operations will have the same memory-based overhead as integer operations, but worse: `f64` values are 8 bytes, doubling the memory traffic per push/pop. Stack-to-local promotion (section 1) is even more impactful for floats because WASM has native `f64` locals and operand stack support. The float stack lives in its own memory region (0x2540--0x2D40). Float operations will have the same memory-based overhead as integer operations, but worse: `f64` values are 8 bytes, doubling the memory traffic per push/pop. Stack-to-local promotion (section 1) is even more impactful for floats because WASM has native `f64` locals and operand stack support.
---
## Suggested Implementation Order ## Suggested Implementation Order
Ordered by effort-to-impact ratio (cheapest wins first): Ordered by effort-to-impact ratio (cheapest wins first):
| Priority | Optimization | Effort | Unlocks | | Priority | Optimization | Effort | Unlocks |
|----------|-------------|--------|---------| | -------- | -------------------------------------------------- | ------- | ----------------------------------------------- |
| 1 | Peephole optimization | Low | Immediate code size reduction | | 1 | Peephole optimization | Low | Immediate code size reduction |
| 2 | Constant folding | Low | Composes with peephole | | 2 | Constant folding | Low | Composes with peephole |
| 3 | Tail call detection | Low | Recursive word optimization | | 3 | Tail call detection | Low | Recursive word optimization |
| 4 | Dictionary hash index | Low | Faster compilation | | 4 | Dictionary hash index | Low | Faster compilation |
| 5 | wasmtime config tuning | Trivial | Caching, interruption | | 5 | wasmtime config tuning | Trivial | Caching, interruption |
| 6 | Codegen improvements (global caching, loop locals) | Medium | ~30% fewer instructions | | 6 | Codegen improvements (global caching, loop locals) | Medium | ~30% fewer instructions |
| 7 | Inlining | Medium | Unlocks cross-word folding and peephole | | 7 | Inlining | Medium | Unlocks cross-word folding and peephole |
| 8 | Strength reduction | Low | Best after inlining exists | | 8 | Strength reduction | Low | Best after inlining exists |
| 9 | Dead code elimination | Low | Best after constant folding exists | | 9 | Dead code elimination | Low | Best after constant folding exists |
| 10 | Compound IR operations | Medium | Cumulative gains | | 10 | Compound IR operations | Medium | Cumulative gains |
| 11 | Stack-to-local promotion | High | The single biggest speedup (~7x for arithmetic) | | 11 | Stack-to-local promotion | High | The single biggest speedup (~7x for arithmetic) |
| 12 | Startup batching | Medium | Faster boot | | 12 | Startup batching | Medium | Faster boot |
| 13 | Consolidation | High | Direct calls, cross-word optimization | | 13 | Consolidation | High | Direct calls, cross-word optimization |
| 14 | Float/double-cell | Medium | Depends on stack-to-local | | 14 | Float/double-cell | Medium | Depends on stack-to-local |
Stack-to-local promotion has the highest impact but also the highest implementation cost. The passes before it (peephole, folding, inlining) are simpler and their benefits multiply when stack-to-local promotion is eventually added. Consolidation is last because it requires storing IR bodies and restructuring the module generation -- it benefits most from having all other passes working first. Stack-to-local promotion has the highest impact but also the highest implementation cost. The passes before it (peephole, folding, inlining) are simpler and their benefits multiply when stack-to-local promotion is eventually added. Consolidation is last because it requires storing IR bodies and restructuring the module generation -- it benefits most from having all other passes working first.