From 8bfdd966ea911123a826d72e6bcc98ef1f7821d9 Mon Sep 17 00:00:00 2001 From: Oleksandr Kozachuk Date: Mon, 30 Mar 2026 23:01:35 +0200 Subject: [PATCH] Add optimization docs, workspace lints, and pre-commit hooks - Add docs/OPTIMIZATIONS.md: catalog of 14 optimization passes with status tracking and implementation roadmap - Configure workspace-level clippy and rustc lints in Cargo.toml - Add clippy.toml and deny.toml for clippy thresholds and dependency auditing (licenses, advisories, bans) - Set up pre-commit hook: cargo fmt, dprint, clippy, cargo deny, cargo machete - Update Justfile with deny/machete targets, dprint in fmt checks --- Cargo.toml | 32 ++++++++ Justfile | 16 +++- clippy.toml | 3 + crates/cli/Cargo.toml | 8 +- crates/core/Cargo.toml | 3 + crates/web/Cargo.toml | 8 +- deny.toml | 35 +++++++++ docs/OPTIMIZATIONS.md | 167 ++++++++++++++++++----------------------- 8 files changed, 171 insertions(+), 101 deletions(-) create mode 100644 clippy.toml create mode 100644 deny.toml diff --git a/Cargo.toml b/Cargo.toml index 433fc11..646182d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,38 @@ edition = "2024" license = "MIT OR Apache-2.0" repository = "https://github.com/ok2/wafer" +[workspace.lints.rust] +unsafe_code = "deny" +unused_must_use = "deny" +elided_lifetimes_in_paths = "warn" +trivial_numeric_casts = "warn" +unreachable_pub = "warn" +unused_qualifications = "warn" + +[workspace.lints.clippy] +# Pedantic group (selective) +cloned_instead_of_copied = "warn" +explicit_iter_loop = "warn" +flat_map_option = "warn" +implicit_clone = "warn" +inconsistent_struct_constructor = "warn" +inefficient_to_string = "warn" +manual_let_else = "warn" +map_unwrap_or = "warn" +redundant_closure_for_method_calls = "warn" +semicolon_if_nothing_returned = "warn" +uninlined_format_args = "warn" +unnested_or_patterns = "warn" +unused_self = "warn" +# Correctness & suspicious +doc_markdown = "warn" +match_wildcard_for_single_variants = "warn" +needless_continue = "warn" +ref_as_ptr = "warn" +# Nursery (stable enough to use) +needless_collect = "warn" +or_fun_call = "warn" + [workspace.dependencies] wasm-encoder = "0.228" wasmparser = "0.228" diff --git a/Justfile b/Justfile index cf20230..e6eab14 100644 --- a/Justfile +++ b/Justfile @@ -17,13 +17,15 @@ compliance: clippy: cargo clippy --workspace -- -D warnings -# Check formatting +# Check formatting (Rust + Markdown) fmt: cargo fmt --all --check + dprint check -# Format code +# Format code (Rust + Markdown) fmt-fix: cargo fmt --all + dprint fmt # Run the REPL repl: @@ -37,8 +39,16 @@ run file: bench: cargo bench --workspace +# Check dependency licenses and advisories +deny: + cargo deny check + +# Detect unused dependencies +machete: + cargo machete --skip-target-dir + # Full CI check (what CI runs) -ci: fmt clippy test +ci: fmt clippy deny test # Check compilation without running check: diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000..071cb95 --- /dev/null +++ b/clippy.toml @@ -0,0 +1,3 @@ +too-many-lines-threshold = 150 +type-complexity-threshold = 300 +too-many-arguments-threshold = 8 diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 6c95bdf..7989589 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -5,8 +5,14 @@ version.workspace = true edition.workspace = true license.workspace = true +[package.metadata.cargo-machete] +ignored = ["wasmtime", "wasmtime-wasi"] + +[lints] +workspace = true + [dependencies] -wafer-core = { path = "../core" } +wafer-core = { path = "../core", version = "0.1.0" } wasmtime = { workspace = true } wasmtime-wasi = { workspace = true } anyhow = { workspace = true } diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index f339d32..7550a4e 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -5,6 +5,9 @@ version.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] wasm-encoder = { workspace = true } wasmparser = { workspace = true } diff --git a/crates/web/Cargo.toml b/crates/web/Cargo.toml index f5b4b5d..5e6c7b2 100644 --- a/crates/web/Cargo.toml +++ b/crates/web/Cargo.toml @@ -5,5 +5,11 @@ version.workspace = true edition.workspace = true license.workspace = true +[package.metadata.cargo-machete] +ignored = ["wafer-core"] + +[lints] +workspace = true + [dependencies] -wafer-core = { path = "../core" } +wafer-core = { path = "../core", version = "0.1.0" } diff --git a/deny.toml b/deny.toml new file mode 100644 index 0000000..ca0bd44 --- /dev/null +++ b/deny.toml @@ -0,0 +1,35 @@ +[advisories] +ignore = [ + # wasmtime v31 known issues -- will resolve when upgrading wasmtime + { id = "RUSTSEC-2025-0046", reason = "wasmtime v31: fd_renumber panic" }, + { id = "RUSTSEC-2025-0118", reason = "wasmtime v31: shared memory unsoundness" }, + { id = "RUSTSEC-2026-0006", reason = "wasmtime v31: f64.copysign segfault" }, + { id = "RUSTSEC-2026-0020", reason = "wasmtime v31: WASI resource exhaustion" }, + { id = "RUSTSEC-2026-0021", reason = "wasmtime v31: fields instance panic" }, + # Unmaintained transitive deps from wasmtime/rustyline + { id = "RUSTSEC-2025-0057", reason = "fxhash: transitive dep, no alternative" }, + { id = "RUSTSEC-2024-0436", reason = "paste: transitive dep, no alternative" }, +] + +[licenses] +allow = [ + "MIT", + "Apache-2.0", + "Apache-2.0 WITH LLVM-exception", + "BSD-2-Clause", + "BSD-3-Clause", + "BSL-1.0", + "Unicode-3.0", + "Zlib", +] +confidence-threshold = 0.8 + +[bans] +multiple-versions = "warn" +wildcards = "deny" + +[sources] +unknown-registry = "deny" +unknown-git = "deny" +allow-registry = ["https://github.com/rust-lang/crates.io-index"] +allow-git = [] diff --git a/docs/OPTIMIZATIONS.md b/docs/OPTIMIZATIONS.md index 5e06a77..001f2c7 100644 --- a/docs/OPTIMIZATIONS.md +++ b/docs/OPTIMIZATIONS.md @@ -12,24 +12,22 @@ This document describes every optimization that makes sense for WAFER, why it ma ## Status Summary -| # | Optimization | Level | Status | Impact | -|----|---------------------------|--------------|---------------------|----------| -| 1 | Stack-to-Local Promotion | Codegen | Not implemented | Highest | -| 2 | Peephole Optimization | IR pass | Not implemented | High | -| 3 | Constant Folding | IR pass | Not implemented | High | -| 4 | Inlining | IR pass | Not implemented | High | -| 5 | Strength Reduction | IR pass | Not implemented | Medium | -| 6 | Dead Code Elimination | IR pass | Not implemented | Medium | -| 7 | Tail Call Optimization | IR + Codegen | Partial | Medium | -| 8 | Consolidation | Architecture | Not implemented | High | -| 9 | Compound IR Operations | IR + Codegen | Not implemented | Medium | -| 10 | Codegen Improvements | Codegen | Not implemented | Medium | -| 11 | wasmtime Configuration | Runtime | Not implemented | Low | -| 12 | Dictionary Hash Index | Runtime | Not implemented | Low | -| 13 | Startup Batching | Architecture | Not implemented | Low | -| 14 | Float / Double-Cell | Codegen | Not implemented | Future | - ---- +| # | Optimization | Level | Status | Impact | +| -- | ------------------------ | ------------ | --------------- | ------- | +| 1 | Stack-to-Local Promotion | Codegen | Not implemented | Highest | +| 2 | Peephole Optimization | IR pass | Not implemented | High | +| 3 | Constant Folding | IR pass | Not implemented | High | +| 4 | Inlining | IR pass | Not implemented | High | +| 5 | Strength Reduction | IR pass | Not implemented | Medium | +| 6 | Dead Code Elimination | IR pass | Not implemented | Medium | +| 7 | Tail Call Optimization | IR + Codegen | Partial | Medium | +| 8 | Consolidation | Architecture | Not implemented | High | +| 9 | Compound IR Operations | IR + Codegen | Not implemented | Medium | +| 10 | Codegen Improvements | Codegen | Not implemented | Medium | +| 11 | wasmtime Configuration | Runtime | Not implemented | Low | +| 12 | Dictionary Hash Index | Runtime | Not implemented | Low | +| 13 | Startup Batching | Architecture | Not implemented | Low | +| 14 | Float / Double-Cell | Codegen | Not implemented | Future | ## 1. Stack-to-Local Promotion @@ -105,8 +103,6 @@ When the compiler can statically determine the types and lifetimes of values on - Inference pass: new code in `optimizer.rs` or a dedicated `promote.rs` - Codegen integration: `crates/core/src/codegen.rs` `emit_op()` needs a second code path ---- - ## 2. Peephole Optimization **Status: Not implemented.** @@ -115,29 +111,27 @@ A peephole optimizer scans adjacent IR operations and replaces recognized patter ### Patterns -| Pattern | Replacement | Savings | -|---------|-------------|---------| -| `PushI32(n), Drop` | *(remove both)* | 1 push + 1 pop | -| `Dup, Drop` | *(remove both)* | 1 peek+push + 1 pop | -| `Swap, Swap` | *(remove both)* | 2x(2 pops + 2 pushes) | -| `Swap, Drop` | `Nip` | 1 pop | -| `Over, Over` | `TwoDup` (new) | 1 peek+push | -| `Drop, Drop` | `TwoDrop` (new) | 1 dsp adjustment | -| `PushI32(0), Add` | *(remove both)* | 1 push + 1 pop + add | -| `PushI32(0), Or` | *(remove both)* | same | -| `PushI32(-1), And` | *(remove both)* | same | -| `PushI32(1), Add` | `Inc` (new or codegen special) | avoids pushing constant | -| `PushI32(1), Sub` | `Dec` (new or codegen special) | same | -| `ZeroEq, ZeroEq` | *(remove both)* for boolean inputs | 2 comparisons | -| `DivMod, Swap, Drop` | `Div` (new or codegen special) | avoids computing remainder | -| `DivMod, Drop` | `Mod` (new or codegen special) | avoids computing quotient | +| Pattern | Replacement | Savings | +| -------------------- | ---------------------------------- | -------------------------- | +| `PushI32(n), Drop` | _(remove both)_ | 1 push + 1 pop | +| `Dup, Drop` | _(remove both)_ | 1 peek+push + 1 pop | +| `Swap, Swap` | _(remove both)_ | 2x(2 pops + 2 pushes) | +| `Swap, Drop` | `Nip` | 1 pop | +| `Over, Over` | `TwoDup` (new) | 1 peek+push | +| `Drop, Drop` | `TwoDrop` (new) | 1 dsp adjustment | +| `PushI32(0), Add` | _(remove both)_ | 1 push + 1 pop + add | +| `PushI32(0), Or` | _(remove both)_ | same | +| `PushI32(-1), And` | _(remove both)_ | same | +| `PushI32(1), Add` | `Inc` (new or codegen special) | avoids pushing constant | +| `PushI32(1), Sub` | `Dec` (new or codegen special) | same | +| `ZeroEq, ZeroEq` | _(remove both)_ for boolean inputs | 2 comparisons | +| `DivMod, Swap, Drop` | `Div` (new or codegen special) | avoids computing remainder | +| `DivMod, Drop` | `Mod` (new or codegen special) | avoids computing quotient | ### Implementation A single function `fn peephole(ops: Vec) -> Vec` that makes repeated passes until no more patterns match. Recurse into control flow bodies (If/DoLoop/Begin*). ---- - ## 3. Constant Folding **Status: Not implemented.** @@ -167,8 +161,6 @@ Constant folding composes with inlining: after inlining a word, new folding oppo A function `fn constant_fold(ops: Vec) -> Vec` that simulates a compile-time stack of known constants and replaces foldable sequences. Must handle all arithmetic, comparison, logic, and unary operations in `IrOp`. ---- - ## 4. Inlining **Status: Not implemented.** @@ -187,21 +179,25 @@ Every call in WAFER is `call_indirect` through a function table. This is slower ``` Before inlining, MAIN's IR: + ``` PushI32(5), Call(SQUARE), PushI32(3), Call(SQUARE), Add ``` After inlining SQUARE: + ``` PushI32(5), Dup, Mul, PushI32(3), Dup, Mul, Add ``` After constant folding: + ``` PushI32(25), PushI32(9), Add ``` After more folding: + ``` PushI32(34) ``` @@ -214,8 +210,6 @@ PushI32(34) - Do not inline words with side effects that depend on call context (rare) - Re-run peephole and constant folding after inlining ---- - ## 5. Strength Reduction **Status: Not implemented.** @@ -224,19 +218,17 @@ Replace expensive operations with cheaper equivalents when one operand is a know ### Patterns -| Pattern | Replacement | Why | -|---------|-------------|-----| -| `PushI32(2^n), Mul` | `PushI32(n), Lshift` | shift is 1 cycle vs multiply | -| `PushI32(2^n), DivMod` | `PushI32(n), Rshift` (unsigned) | shift vs divide | -| `PushI32(1), Lshift` | `Dup, Add` | add is often faster than shift | -| `PushI32(0), Gt` | `ZeroGt` (if added) | avoids pushing constant | -| `PushI32(0), Eq` | `ZeroEq` | already exists as IR op | -| `PushI32(0), Lt` | `ZeroLt` | already exists as IR op | +| Pattern | Replacement | Why | +| ---------------------- | ------------------------------- | ------------------------------ | +| `PushI32(2^n), Mul` | `PushI32(n), Lshift` | shift is 1 cycle vs multiply | +| `PushI32(2^n), DivMod` | `PushI32(n), Rshift` (unsigned) | shift vs divide | +| `PushI32(1), Lshift` | `Dup, Add` | add is often faster than shift | +| `PushI32(0), Gt` | `ZeroGt` (if added) | avoids pushing constant | +| `PushI32(0), Eq` | `ZeroEq` | already exists as IR op | +| `PushI32(0), Lt` | `ZeroLt` | already exists as IR op | The most common case is `CELLS` which is defined as `PushI32(4), Mul`. Strength reduction turns this into `PushI32(2), Lshift`. ---- - ## 6. Dead Code Elimination **Status: Not implemented.** @@ -252,8 +244,6 @@ Remove IR operations that can never execute or whose results are never used. DCE should run after constant folding, since folding can create new constant conditionals. ---- - ## 7. Tail Call Optimization **Status: Partial.** `IrOp::TailCall(WordId)` exists in `ir.rs` and codegen handles it in `codegen.rs`, but the compiler never generates it. @@ -261,6 +251,7 @@ DCE should run after constant folding, since folding can create new constant con ### What Exists The codegen for `TailCall` emits: + ```wasm i32.const call_indirect (type $void) (table 0) @@ -279,8 +270,6 @@ The compiler (`outer.rs`) needs to detect tail position: when the last operation Detection rule: if the last IR op in a word body (or in a branch of an `If`) is `Call(id)`, and there are no pending return-stack items (`>R` without matching `R>`), replace with `TailCall(id)`. ---- - ## 8. Consolidation **Status: Not implemented.** Stub exists at `crates/core/src/consolidate.rs`. @@ -304,12 +293,10 @@ After interactive development, `CONSOLIDATE` recompiles all defined words into a ### Two Modes -| Mode | When | Properties | -|------|------|------------| +| Mode | When | Properties | +| ------------- | ----------------------- | ------------------------------------------------ | | JIT (current) | Interactive development | Per-word modules, `call_indirect`, fast redefine | -| Consolidated | After `CONSOLIDATE` | Single module, direct `call`, no redefine | - ---- +| Consolidated | After `CONSOLIDATE` | Single module, direct `call`, no redefine | ## 9. Compound IR Operations @@ -353,8 +340,6 @@ Instead of two separate `dsp += 4`, emit one `dsp += 8`. These can be added as new `IrOp` variants recognized by peephole and emitted by codegen with specialized WASM sequences. ---- - ## 10. Codegen Improvements **Status: Not implemented.** @@ -407,26 +392,22 @@ i32.add ;; result on wasm stack `DO...LOOP` currently stores the loop index and limit on the return stack (in memory). Keep them in WASM locals for the duration of the loop body. This makes `I` (read loop index) a simple `local.get` instead of a memory load from the return stack. ---- - ## 11. wasmtime Configuration **Status: Not implemented.** Currently using `Engine::default()`. ### Available Knobs -| Setting | Current | Recommended | Effect | -|---------|---------|-------------|--------| -| `Config::cranelift_opt_level` | Speed (default) | Speed | Already optimal for JIT | -| `Config::cranelift_nan_canonicalization` | true | false | Skip NaN fixup (no floats yet) | -| `Config::parallel_compilation` | true | true | Already optimal | -| Module caching | none | file-based | Cache compiled modules across sessions | -| Epoch interruption | none | enable | Protect against infinite loops | +| Setting | Current | Recommended | Effect | +| ---------------------------------------- | --------------- | ----------- | -------------------------------------- | +| `Config::cranelift_opt_level` | Speed (default) | Speed | Already optimal for JIT | +| `Config::cranelift_nan_canonicalization` | true | false | Skip NaN fixup (no floats yet) | +| `Config::parallel_compilation` | true | true | Already optimal | +| Module caching | none | file-based | Cache compiled modules across sessions | +| Epoch interruption | none | enable | Protect against infinite loops | Module caching is the most impactful: `wasmtime::Config::cache_config_load_default()` enables disk-based caching of compiled WASM, so restarting WAFER with the same definitions does not re-invoke Cranelift. ---- - ## 12. Dictionary Hash Index **Status: Not implemented.** @@ -439,8 +420,6 @@ Maintain a `HashMap` alongside the linked list. Upd This affects **compile time** (word lookup during parsing), not runtime (compiled code uses function table indices directly). ---- - ## 13. Startup Batching **Status: Not implemented.** `compile_core_module()` stub exists in `codegen.rs`. @@ -451,35 +430,31 @@ Currently, each of the 80+ primitives registered at boot creates a separate WASM Batch all IR-based primitives into a single WASM module with multiple exported functions. One `Module::new()` + one `Instance::new()` replaces 80+ pairs. This is a subset of what Consolidation (section 8) achieves, but scoped to primitives only and simpler to implement. ---- - ## 14. Float and Double-Cell Stack **Status: Not implemented.** `PushI64` and `PushF64` exist as IR ops but are stubs in codegen. The float stack lives in its own memory region (0x2540--0x2D40). Float operations will have the same memory-based overhead as integer operations, but worse: `f64` values are 8 bytes, doubling the memory traffic per push/pop. Stack-to-local promotion (section 1) is even more impactful for floats because WASM has native `f64` locals and operand stack support. ---- - ## Suggested Implementation Order Ordered by effort-to-impact ratio (cheapest wins first): -| Priority | Optimization | Effort | Unlocks | -|----------|-------------|--------|---------| -| 1 | Peephole optimization | Low | Immediate code size reduction | -| 2 | Constant folding | Low | Composes with peephole | -| 3 | Tail call detection | Low | Recursive word optimization | -| 4 | Dictionary hash index | Low | Faster compilation | -| 5 | wasmtime config tuning | Trivial | Caching, interruption | -| 6 | Codegen improvements (global caching, loop locals) | Medium | ~30% fewer instructions | -| 7 | Inlining | Medium | Unlocks cross-word folding and peephole | -| 8 | Strength reduction | Low | Best after inlining exists | -| 9 | Dead code elimination | Low | Best after constant folding exists | -| 10 | Compound IR operations | Medium | Cumulative gains | -| 11 | Stack-to-local promotion | High | The single biggest speedup (~7x for arithmetic) | -| 12 | Startup batching | Medium | Faster boot | -| 13 | Consolidation | High | Direct calls, cross-word optimization | -| 14 | Float/double-cell | Medium | Depends on stack-to-local | +| Priority | Optimization | Effort | Unlocks | +| -------- | -------------------------------------------------- | ------- | ----------------------------------------------- | +| 1 | Peephole optimization | Low | Immediate code size reduction | +| 2 | Constant folding | Low | Composes with peephole | +| 3 | Tail call detection | Low | Recursive word optimization | +| 4 | Dictionary hash index | Low | Faster compilation | +| 5 | wasmtime config tuning | Trivial | Caching, interruption | +| 6 | Codegen improvements (global caching, loop locals) | Medium | ~30% fewer instructions | +| 7 | Inlining | Medium | Unlocks cross-word folding and peephole | +| 8 | Strength reduction | Low | Best after inlining exists | +| 9 | Dead code elimination | Low | Best after constant folding exists | +| 10 | Compound IR operations | Medium | Cumulative gains | +| 11 | Stack-to-local promotion | High | The single biggest speedup (~7x for arithmetic) | +| 12 | Startup batching | Medium | Faster boot | +| 13 | Consolidation | High | Direct calls, cross-word optimization | +| 14 | Float/double-cell | Medium | Depends on stack-to-local | Stack-to-local promotion has the highest impact but also the highest implementation cost. The passes before it (peephole, folding, inlining) are simpler and their benefits multiply when stack-to-local promotion is eventually added. Consolidation is last because it requires storing IR bodies and restructuring the module generation -- it benefits most from having all other passes working first.