Add optimization docs, workspace lints, and pre-commit hooks
- Add docs/OPTIMIZATIONS.md: catalog of 14 optimization passes with status tracking and implementation roadmap
- Configure workspace-level clippy and rustc lints in Cargo.toml
- Add clippy.toml and deny.toml for clippy thresholds and dependency auditing (licenses, advisories, bans)
- Set up pre-commit hook: cargo fmt, dprint, clippy, cargo deny, cargo machete
- Update Justfile with deny/machete targets, dprint in fmt checks
This commit is contained in:
+32
@@ -8,6 +8,38 @@ edition = "2024"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/ok2/wafer"
|
||||
|
||||
[workspace.lints.rust]
|
||||
unsafe_code = "deny"
|
||||
unused_must_use = "deny"
|
||||
elided_lifetimes_in_paths = "warn"
|
||||
trivial_numeric_casts = "warn"
|
||||
unreachable_pub = "warn"
|
||||
unused_qualifications = "warn"
|
||||
|
||||
[workspace.lints.clippy]
|
||||
# Pedantic group (selective)
|
||||
cloned_instead_of_copied = "warn"
|
||||
explicit_iter_loop = "warn"
|
||||
flat_map_option = "warn"
|
||||
implicit_clone = "warn"
|
||||
inconsistent_struct_constructor = "warn"
|
||||
inefficient_to_string = "warn"
|
||||
manual_let_else = "warn"
|
||||
map_unwrap_or = "warn"
|
||||
redundant_closure_for_method_calls = "warn"
|
||||
semicolon_if_nothing_returned = "warn"
|
||||
uninlined_format_args = "warn"
|
||||
unnested_or_patterns = "warn"
|
||||
unused_self = "warn"
|
||||
# Pedantic group (continued): docs, matching, control flow, pointer casts
|
||||
doc_markdown = "warn"
|
||||
match_wildcard_for_single_variants = "warn"
|
||||
needless_continue = "warn"
|
||||
ref_as_ptr = "warn"
|
||||
# Nursery (stable enough to use)
|
||||
needless_collect = "warn"
|
||||
or_fun_call = "warn"
|
||||
|
||||
[workspace.dependencies]
|
||||
wasm-encoder = "0.228"
|
||||
wasmparser = "0.228"
|
||||
|
||||
@@ -17,13 +17,15 @@ compliance:
|
||||
clippy:
|
||||
cargo clippy --workspace -- -D warnings
|
||||
|
||||
# Check formatting
|
||||
# Check formatting (Rust + Markdown)
|
||||
fmt:
|
||||
cargo fmt --all --check
|
||||
dprint check
|
||||
|
||||
# Format code
|
||||
# Format code (Rust + Markdown)
|
||||
fmt-fix:
|
||||
cargo fmt --all
|
||||
dprint fmt
|
||||
|
||||
# Run the REPL
|
||||
repl:
|
||||
@@ -37,8 +39,16 @@ run file:
|
||||
bench:
|
||||
cargo bench --workspace
|
||||
|
||||
# Check dependency licenses and advisories
|
||||
deny:
|
||||
cargo deny check
|
||||
|
||||
# Detect unused dependencies
|
||||
machete:
|
||||
cargo machete --skip-target-dir
|
||||
|
||||
# Full CI check (what CI runs)
|
||||
ci: fmt clippy test
|
||||
ci: fmt clippy deny test
|
||||
|
||||
# Check compilation without running
|
||||
check:
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
too-many-lines-threshold = 150
|
||||
type-complexity-threshold = 300
|
||||
too-many-arguments-threshold = 8
|
||||
@@ -5,8 +5,14 @@ version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["wasmtime", "wasmtime-wasi"]
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
wafer-core = { path = "../core" }
|
||||
wafer-core = { path = "../core", version = "0.1.0" }
|
||||
wasmtime = { workspace = true }
|
||||
wasmtime-wasi = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
|
||||
@@ -5,6 +5,9 @@ version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
wasm-encoder = { workspace = true }
|
||||
wasmparser = { workspace = true }
|
||||
|
||||
@@ -5,5 +5,11 @@ version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["wafer-core"]
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
wafer-core = { path = "../core" }
|
||||
wafer-core = { path = "../core", version = "0.1.0" }
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
[advisories]
|
||||
ignore = [
|
||||
# wasmtime v31 known issues -- will resolve when upgrading wasmtime
|
||||
{ id = "RUSTSEC-2025-0046", reason = "wasmtime v31: fd_renumber panic" },
|
||||
{ id = "RUSTSEC-2025-0118", reason = "wasmtime v31: shared memory unsoundness" },
|
||||
{ id = "RUSTSEC-2026-0006", reason = "wasmtime v31: f64.copysign segfault" },
|
||||
{ id = "RUSTSEC-2026-0020", reason = "wasmtime v31: WASI resource exhaustion" },
|
||||
{ id = "RUSTSEC-2026-0021", reason = "wasmtime v31: fields instance panic" },
|
||||
# Unmaintained transitive deps from wasmtime/rustyline
|
||||
{ id = "RUSTSEC-2025-0057", reason = "fxhash: transitive dep, no alternative" },
|
||||
{ id = "RUSTSEC-2024-0436", reason = "paste: transitive dep, no alternative" },
|
||||
]
|
||||
|
||||
[licenses]
|
||||
allow = [
|
||||
"MIT",
|
||||
"Apache-2.0",
|
||||
"Apache-2.0 WITH LLVM-exception",
|
||||
"BSD-2-Clause",
|
||||
"BSD-3-Clause",
|
||||
"BSL-1.0",
|
||||
"Unicode-3.0",
|
||||
"Zlib",
|
||||
]
|
||||
confidence-threshold = 0.8
|
||||
|
||||
[bans]
|
||||
multiple-versions = "warn"
|
||||
wildcards = "deny"
|
||||
|
||||
[sources]
|
||||
unknown-registry = "deny"
|
||||
unknown-git = "deny"
|
||||
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
|
||||
allow-git = []
|
||||
+18
-43
@@ -13,7 +13,7 @@ This document describes every optimization that makes sense for WAFER, why it ma
|
||||
## Status Summary
|
||||
|
||||
| # | Optimization | Level | Status | Impact |
|
||||
|----|---------------------------|--------------|---------------------|----------|
|
||||
| -- | ------------------------ | ------------ | --------------- | ------- |
|
||||
| 1 | Stack-to-Local Promotion | Codegen | Not implemented | Highest |
|
||||
| 2 | Peephole Optimization | IR pass | Not implemented | High |
|
||||
| 3 | Constant Folding | IR pass | Not implemented | High |
|
||||
@@ -29,8 +29,6 @@ This document describes every optimization that makes sense for WAFER, why it ma
|
||||
| 13 | Startup Batching | Architecture | Not implemented | Low |
|
||||
| 14 | Float / Double-Cell | Codegen | Not implemented | Future |
|
||||
|
||||
---
|
||||
|
||||
## 1. Stack-to-Local Promotion
|
||||
|
||||
**Status: Not implemented.** Type infrastructure exists (`crates/core/src/types.rs`) but is not wired into codegen.
|
||||
@@ -105,8 +103,6 @@ When the compiler can statically determine the types and lifetimes of values on
|
||||
- Inference pass: new code in `optimizer.rs` or a dedicated `promote.rs`
|
||||
- Codegen integration: `crates/core/src/codegen.rs` `emit_op()` needs a second code path
|
||||
|
||||
---
|
||||
|
||||
## 2. Peephole Optimization
|
||||
|
||||
**Status: Not implemented.**
|
||||
@@ -116,19 +112,19 @@ A peephole optimizer scans adjacent IR operations and replaces recognized patter
|
||||
### Patterns
|
||||
|
||||
| Pattern | Replacement | Savings |
|
||||
|---------|-------------|---------|
|
||||
| `PushI32(n), Drop` | *(remove both)* | 1 push + 1 pop |
|
||||
| `Dup, Drop` | *(remove both)* | 1 peek+push + 1 pop |
|
||||
| `Swap, Swap` | *(remove both)* | 2x(2 pops + 2 pushes) |
|
||||
| -------------------- | ---------------------------------- | -------------------------- |
|
||||
| `PushI32(n), Drop` | _(remove both)_ | 1 push + 1 pop |
|
||||
| `Dup, Drop` | _(remove both)_ | 1 peek+push + 1 pop |
|
||||
| `Swap, Swap` | _(remove both)_ | 2x(2 pops + 2 pushes) |
|
||||
| `Swap, Drop` | `Nip` | 1 pop |
|
||||
| `Over, Over` | `TwoDup` (new) | 1 peek+push |
|
||||
| `Drop, Drop` | `TwoDrop` (new) | 1 dsp adjustment |
|
||||
| `PushI32(0), Add` | *(remove both)* | 1 push + 1 pop + add |
|
||||
| `PushI32(0), Or` | *(remove both)* | same |
|
||||
| `PushI32(-1), And` | *(remove both)* | same |
|
||||
| `PushI32(0), Add` | _(remove both)_ | 1 push + 1 pop + add |
|
||||
| `PushI32(0), Or` | _(remove both)_ | same |
|
||||
| `PushI32(-1), And` | _(remove both)_ | same |
|
||||
| `PushI32(1), Add` | `Inc` (new or codegen special) | avoids pushing constant |
|
||||
| `PushI32(1), Sub` | `Dec` (new or codegen special) | same |
|
||||
| `ZeroEq, ZeroEq` | *(remove both)* for boolean inputs | 2 comparisons |
|
||||
| `ZeroEq, ZeroEq` | _(remove both)_ for boolean inputs | 2 comparisons |
|
||||
| `DivMod, Swap, Drop` | `Div` (new or codegen special) | avoids computing remainder |
|
||||
| `DivMod, Drop` | `Mod` (new or codegen special) | avoids computing quotient |
|
||||
|
||||
@@ -136,8 +132,6 @@ A peephole optimizer scans adjacent IR operations and replaces recognized patter
|
||||
|
||||
A single function `fn peephole(ops: Vec<IrOp>) -> Vec<IrOp>` that makes repeated passes until no more patterns match. Recurse into control flow bodies (If/DoLoop/Begin*).
|
||||
|
||||
---
|
||||
|
||||
## 3. Constant Folding
|
||||
|
||||
**Status: Not implemented.**
|
||||
@@ -167,8 +161,6 @@ Constant folding composes with inlining: after inlining a word, new folding oppo
|
||||
|
||||
A function `fn constant_fold(ops: Vec<IrOp>) -> Vec<IrOp>` that simulates a compile-time stack of known constants and replaces foldable sequences. Must handle all arithmetic, comparison, logic, and unary operations in `IrOp`.
|
||||
|
||||
---
|
||||
|
||||
## 4. Inlining
|
||||
|
||||
**Status: Not implemented.**
|
||||
@@ -187,21 +179,25 @@ Every call in WAFER is `call_indirect` through a function table. This is slower
|
||||
```
|
||||
|
||||
Before inlining, MAIN's IR:
|
||||
|
||||
```
|
||||
PushI32(5), Call(SQUARE), PushI32(3), Call(SQUARE), Add
|
||||
```
|
||||
|
||||
After inlining SQUARE:
|
||||
|
||||
```
|
||||
PushI32(5), Dup, Mul, PushI32(3), Dup, Mul, Add
|
||||
```
|
||||
|
||||
After constant folding:
|
||||
|
||||
```
|
||||
PushI32(25), PushI32(9), Add
|
||||
```
|
||||
|
||||
After more folding:
|
||||
|
||||
```
|
||||
PushI32(34)
|
||||
```
|
||||
@@ -214,8 +210,6 @@ PushI32(34)
|
||||
- Do not inline words with side effects that depend on call context (rare)
|
||||
- Re-run peephole and constant folding after inlining
|
||||
|
||||
---
|
||||
|
||||
## 5. Strength Reduction
|
||||
|
||||
**Status: Not implemented.**
|
||||
@@ -225,7 +219,7 @@ Replace expensive operations with cheaper equivalents when one operand is a know
|
||||
### Patterns
|
||||
|
||||
| Pattern | Replacement | Why |
|
||||
|---------|-------------|-----|
|
||||
| ---------------------- | ------------------------------- | ------------------------------ |
|
||||
| `PushI32(2^n), Mul` | `PushI32(n), Lshift` | shift is 1 cycle vs multiply |
|
||||
| `PushI32(2^n), DivMod` | `PushI32(n), Rshift` (unsigned) | shift vs divide |
|
||||
| `PushI32(1), Lshift` | `Dup, Add` | add is often faster than shift |
|
||||
@@ -235,8 +229,6 @@ Replace expensive operations with cheaper equivalents when one operand is a know
|
||||
|
||||
The most common case is `CELLS`, which is defined as `PushI32(4), Mul`. Strength reduction turns this into `PushI32(2), Lshift`.
|
||||
|
||||
---
|
||||
|
||||
## 6. Dead Code Elimination
|
||||
|
||||
**Status: Not implemented.**
|
||||
@@ -252,8 +244,6 @@ Remove IR operations that can never execute or whose results are never used.
|
||||
|
||||
DCE should run after constant folding, since folding can create new constant conditionals.
|
||||
|
||||
---
|
||||
|
||||
## 7. Tail Call Optimization
|
||||
|
||||
**Status: Partial.** `IrOp::TailCall(WordId)` exists in `ir.rs` and codegen handles it in `codegen.rs`, but the compiler never generates it.
|
||||
@@ -261,6 +251,7 @@ DCE should run after constant folding, since folding can create new constant con
|
||||
### What Exists
|
||||
|
||||
The codegen for `TailCall` emits:
|
||||
|
||||
```wasm
|
||||
i32.const <word_id>
|
||||
call_indirect (type $void) (table 0)
|
||||
@@ -279,8 +270,6 @@ The compiler (`outer.rs`) needs to detect tail position: when the last operation
|
||||
|
||||
Detection rule: if the last IR op in a word body (or in a branch of an `If`) is `Call(id)`, and there are no pending return-stack items (`>R` without matching `R>`), replace with `TailCall(id)`.
|
||||
|
||||
---
|
||||
|
||||
## 8. Consolidation
|
||||
|
||||
**Status: Not implemented.** Stub exists at `crates/core/src/consolidate.rs`.
|
||||
@@ -305,12 +294,10 @@ After interactive development, `CONSOLIDATE` recompiles all defined words into a
|
||||
### Two Modes
|
||||
|
||||
| Mode | When | Properties |
|
||||
|------|------|------------|
|
||||
| ------------- | ----------------------- | ------------------------------------------------ |
|
||||
| JIT (current) | Interactive development | Per-word modules, `call_indirect`, fast redefine |
|
||||
| Consolidated | After `CONSOLIDATE` | Single module, direct `call`, no redefine |
|
||||
|
||||
---
|
||||
|
||||
## 9. Compound IR Operations
|
||||
|
||||
**Status: Not implemented.**
|
||||
@@ -353,8 +340,6 @@ Instead of two separate `dsp += 4`, emit one `dsp += 8`.
|
||||
|
||||
These can be added as new `IrOp` variants recognized by peephole and emitted by codegen with specialized WASM sequences.
|
||||
|
||||
---
|
||||
|
||||
## 10. Codegen Improvements
|
||||
|
||||
**Status: Not implemented.**
|
||||
@@ -407,8 +392,6 @@ i32.add ;; result on wasm stack
|
||||
|
||||
`DO...LOOP` currently stores the loop index and limit on the return stack (in memory). Keep them in WASM locals for the duration of the loop body. This makes `I` (read loop index) a simple `local.get` instead of a memory load from the return stack.
|
||||
|
||||
---
|
||||
|
||||
## 11. wasmtime Configuration
|
||||
|
||||
**Status: Not implemented.** Currently using `Engine::default()`.
|
||||
@@ -416,7 +399,7 @@ i32.add ;; result on wasm stack
|
||||
### Available Knobs
|
||||
|
||||
| Setting | Current | Recommended | Effect |
|
||||
|---------|---------|-------------|--------|
|
||||
| ---------------------------------------- | --------------- | ----------- | -------------------------------------- |
|
||||
| `Config::cranelift_opt_level` | Speed (default) | Speed | Already optimal for JIT |
|
||||
| `Config::cranelift_nan_canonicalization` | false (default) | false       | Already the default; skips NaN fixup   |
|
||||
| `Config::parallel_compilation` | true | true | Already optimal |
|
||||
@@ -425,8 +408,6 @@ i32.add ;; result on wasm stack
|
||||
|
||||
Module caching is the most impactful: `wasmtime::Config::cache_config_load_default()` enables disk-based caching of compiled WASM, so restarting WAFER with the same definitions does not re-invoke Cranelift.
|
||||
|
||||
---
|
||||
|
||||
## 12. Dictionary Hash Index
|
||||
|
||||
**Status: Not implemented.**
|
||||
@@ -439,8 +420,6 @@ Maintain a `HashMap<String, (u32, WordId, bool)>` alongside the linked list. Upd
|
||||
|
||||
This affects **compile time** (word lookup during parsing), not runtime (compiled code uses function table indices directly).
|
||||
|
||||
---
|
||||
|
||||
## 13. Startup Batching
|
||||
|
||||
**Status: Not implemented.** `compile_core_module()` stub exists in `codegen.rs`.
|
||||
@@ -451,22 +430,18 @@ Currently, each of the 80+ primitives registered at boot creates a separate WASM
|
||||
|
||||
Batch all IR-based primitives into a single WASM module with multiple exported functions. One `Module::new()` + one `Instance::new()` replaces 80+ pairs. This is a subset of what Consolidation (section 8) achieves, but scoped to primitives only and simpler to implement.
|
||||
|
||||
---
|
||||
|
||||
## 14. Float and Double-Cell Stack
|
||||
|
||||
**Status: Not implemented.** `PushI64` and `PushF64` exist as IR ops but are stubs in codegen.
|
||||
|
||||
The float stack lives in its own memory region (0x2540--0x2D40). Float operations will have the same memory-based overhead as integer operations, but worse: `f64` values are 8 bytes, doubling the memory traffic per push/pop. Stack-to-local promotion (section 1) is even more impactful for floats because WASM has native `f64` locals and operand stack support.
|
||||
|
||||
---
|
||||
|
||||
## Suggested Implementation Order
|
||||
|
||||
Ordered by effort-to-impact ratio (cheapest wins first):
|
||||
|
||||
| Priority | Optimization | Effort | Unlocks |
|
||||
|----------|-------------|--------|---------|
|
||||
| -------- | -------------------------------------------------- | ------- | ----------------------------------------------- |
|
||||
| 1 | Peephole optimization | Low | Immediate code size reduction |
|
||||
| 2 | Constant folding | Low | Composes with peephole |
|
||||
| 3 | Tail call detection | Low | Recursive word optimization |
|
||||
|
||||
Reference in New Issue
Block a user