From bf7581ad9e1a48fceea94042fea06f31351944a1 Mon Sep 17 00:00:00 2001
From: Oleksandr Kozachuk <ddeus.gh@mailnull.com>
Date: Thu, 2 Apr 2026 13:47:28 +0200
Subject: [PATCH] Implement float IR operations: 25 words compiled to native
 WASM f64

Convert 25 float words from host functions to IR primitives:
- Stack: FDROP FDUP FSWAP FOVER FNIP FTUCK
- Arithmetic: F+ F- F* F/ FNEGATE FABS FSQRT FMIN FMAX FLOOR FROUND
- Comparisons: F0= F0< F= F<
- Memory: F@ F!
- Conversions: S>F F>S

24 float IrOp variants (23 new, plus the pre-existing PushF64) are compiled
to native WASM f64 instructions.
EmitCtx struct threads f64 scratch locals through all emit functions.
Float constant folding: 1.5E0 2.5E0 F+ folds to PushF64(4.0).
Float peephole: PushF64+FDrop, FDup+FDrop, FSwap+FSwap eliminated.
Float literals now compile as PushF64 IR ops instead of anonymous host calls.

~420 lines of Rust closure code removed from outer.rs.
All 14 optimizations now implemented. 430 tests passing.
---
 crates/core/src/codegen.rs   | 663 ++++++++++++++++++++++++++++++++---
 crates/core/src/ir.rs        |  56 +++
 crates/core/src/optimizer.rs | 133 +++++++
 crates/core/src/outer.rs     | 464 +++++-------------------
 4 files changed, 893 insertions(+), 423 deletions(-)

diff --git a/crates/core/src/codegen.rs b/crates/core/src/codegen.rs
index b2cbde7..873da2e 100644
--- a/crates/core/src/codegen.rs
+++ b/crates/core/src/codegen.rs
@@ -34,7 +34,6 @@ const DSP: u32 = 0;
 const RSP: u32 = 1;
 
 /// Index of the `$fsp` global (float stack pointer).
-#[allow(dead_code)]
 const FSP: u32 = 2;
 
 /// Index of the imported function table.
@@ -74,6 +73,13 @@ const MEM1: MemArg = MemArg {
     memory_index: MEMORY_INDEX,
 };
 
+/// `MemArg` for f64 loads/stores: zero offset, 8-byte natural alignment.
+const MEM8: MemArg = MemArg {
+    offset: 0,
+    align: 3, // log2 of the alignment: 2^3 = 8 bytes
+    memory_index: MEMORY_INDEX,
+};
+
 // ---------------------------------------------------------------------------
 // Public types
 // ---------------------------------------------------------------------------
@@ -214,24 +220,119 @@ fn bool_to_forth_flag(f: &mut Function, tmp: u32) {
         .instruction(&Instruction::I32Sub);
 }
 
+// ---------------------------------------------------------------------------
+// Float stack helpers
+// ---------------------------------------------------------------------------
+
+/// Indices of the two f64 scratch locals that the float emit helpers use as temporaries.
+struct EmitCtx {
+    f64_local_0: u32,
+    f64_local_1: u32,
+}
+
+/// Emit code that subtracts 8 from the FSP global, reserving one f64 slot on the float stack.
+fn fsp_dec(f: &mut Function) {
+    f.instruction(&Instruction::GlobalGet(FSP))
+        .instruction(&Instruction::I32Const(8))
+        .instruction(&Instruction::I32Sub)
+        .instruction(&Instruction::GlobalSet(FSP));
+}
+
+/// Emit code that adds 8 to the FSP global, releasing one f64 slot on the float stack.
+fn fsp_inc(f: &mut Function) {
+    f.instruction(&Instruction::GlobalGet(FSP))
+        .instruction(&Instruction::I32Const(8))
+        .instruction(&Instruction::I32Add)
+        .instruction(&Instruction::GlobalSet(FSP));
+}
+
+/// Emit a float-stack push of the f64 currently on the WASM operand stack: stash it in `tmp`,
+/// decrement FSP, then store `tmp` at [FSP] (`f64.store` needs the address pushed first).
+fn fpush_via_local(f: &mut Function, tmp: u32) {
+    f.instruction(&Instruction::LocalSet(tmp));
+    fsp_dec(f);
+    f.instruction(&Instruction::GlobalGet(FSP))
+        .instruction(&Instruction::LocalGet(tmp))
+        .instruction(&Instruction::F64Store(MEM8));
+}
+
+/// Emit a float-stack push of local `src`: decrement FSP, then store `src` at [FSP].
+fn fpush_from_local(f: &mut Function, src: u32) {
+    fsp_dec(f);
+    f.instruction(&Instruction::GlobalGet(FSP))
+        .instruction(&Instruction::LocalGet(src))
+        .instruction(&Instruction::F64Store(MEM8));
+}
+
+/// Emit a float-stack pop: load the f64 at [FSP] onto the operand stack, then add 8 to FSP.
+fn fpop(f: &mut Function) {
+    f.instruction(&Instruction::GlobalGet(FSP))
+        .instruction(&Instruction::F64Load(MEM8));
+    fsp_inc(f);
+}
+
+/// Emit a non-destructive read: load the f64 at [FSP] onto the operand stack, FSP unchanged.
+fn fpeek(f: &mut Function) {
+    f.instruction(&Instruction::GlobalGet(FSP))
+        .instruction(&Instruction::F64Load(MEM8));
+}
+
+/// Emit `a op b` on the float stack: pop b (top), then a, push the result; uses both f64 locals.
+fn emit_float_binary(f: &mut Function, ctx: &EmitCtx, wasm_op: &Instruction<'_>) {
+    fpop(f);
+    f.instruction(&Instruction::LocalSet(ctx.f64_local_0));
+    fpop(f);
+    f.instruction(&Instruction::LocalSet(ctx.f64_local_1));
+    f.instruction(&Instruction::LocalGet(ctx.f64_local_1))
+        .instruction(&Instruction::LocalGet(ctx.f64_local_0))
+        .instruction(wasm_op);
+    fpush_via_local(f, ctx.f64_local_0);
+}
+
+/// Emit a unary float op: pop the top float, apply `wasm_op`, push the result via `f64_local_0`.
+fn emit_float_unary(f: &mut Function, ctx: &EmitCtx, wasm_op: &Instruction<'_>) {
+    fpop(f);
+    f.instruction(wasm_op);
+    fpush_via_local(f, ctx.f64_local_0);
+}
+
+/// Emit `a cmp b` for floats: pop b (top), then a, push the Forth flag onto the data stack.
+fn emit_float_cmp(f: &mut Function, ctx: &EmitCtx, wasm_cmp: &Instruction<'_>) {
+    fpop(f);
+    f.instruction(&Instruction::LocalSet(ctx.f64_local_0));
+    fpop(f);
+    f.instruction(&Instruction::LocalSet(ctx.f64_local_1));
+    f.instruction(&Instruction::LocalGet(ctx.f64_local_1))
+        .instruction(&Instruction::LocalGet(ctx.f64_local_0))
+        .instruction(wasm_cmp);
+    bool_to_forth_flag(f, SCRATCH_BASE);
+    push_via_local(f, SCRATCH_BASE + 1);
+}
+
 // ---------------------------------------------------------------------------
 // IR emission
 // ---------------------------------------------------------------------------
 
 /// Emit all IR operations in `ops` into the WASM function body `f`.
-fn emit_body(f: &mut Function, ops: &[IrOp]) {
+fn emit_body(f: &mut Function, ops: &[IrOp], ctx: &EmitCtx) {
     for op in ops {
-        emit_op(f, op);
+        emit_op(f, op, ctx);
     }
 }
 
 /// Emit a single IR operation.
 #[allow(clippy::too_many_lines)]
-fn emit_op(f: &mut Function, op: &IrOp) {
+fn emit_op(f: &mut Function, op: &IrOp, ctx: &EmitCtx) {
     match op {
         // -- Literals -------------------------------------------------------
         IrOp::PushI32(n) => push_const(f, *n),
-        IrOp::PushI64(_) | IrOp::PushF64(_) => { /* TODO: double / float stacks */ }
+        IrOp::PushI64(_) => { /* TODO: double-cell */ }
+        IrOp::PushF64(val) => {
+            fsp_dec(f);
+            f.instruction(&Instruction::GlobalGet(FSP))
+                .instruction(&Instruction::F64Const(*val))
+                .instruction(&Instruction::F64Store(MEM8));
+        }
 
         // -- Stack manipulation ---------------------------------------------
         IrOp::Drop => dsp_inc(f),
@@ -460,21 +561,21 @@ fn emit_op(f: &mut Function, op: &IrOp) {
         } => {
             pop(f);
             f.instruction(&Instruction::If(BlockType::Empty));
-            emit_body(f, then_body);
+            emit_body(f, then_body, ctx);
             if let Some(eb) = else_body {
                 f.instruction(&Instruction::Else);
-                emit_body(f, eb);
+                emit_body(f, eb, ctx);
             }
             f.instruction(&Instruction::End);
         }
 
         IrOp::DoLoop { body, is_plus_loop } => {
-            emit_do_loop(f, body, *is_plus_loop);
+            emit_do_loop(f, body, *is_plus_loop, ctx);
         }
 
         IrOp::BeginUntil { body } => {
             f.instruction(&Instruction::Loop(BlockType::Empty));
-            emit_body(f, body);
+            emit_body(f, body, ctx);
             pop(f);
             f.instruction(&Instruction::I32Eqz)
                 .instruction(&Instruction::BrIf(0))
@@ -483,7 +584,7 @@ fn emit_op(f: &mut Function, op: &IrOp) {
 
         IrOp::BeginAgain { body } => {
             f.instruction(&Instruction::Loop(BlockType::Empty));
-            emit_body(f, body);
+            emit_body(f, body, ctx);
             f.instruction(&Instruction::Br(0))
                 .instruction(&Instruction::End);
         }
@@ -491,11 +592,11 @@ fn emit_op(f: &mut Function, op: &IrOp) {
         IrOp::BeginWhileRepeat { test, body } => {
             f.instruction(&Instruction::Block(BlockType::Empty));
             f.instruction(&Instruction::Loop(BlockType::Empty));
-            emit_body(f, test);
+            emit_body(f, test, ctx);
             pop(f);
             f.instruction(&Instruction::I32Eqz)
                 .instruction(&Instruction::BrIf(1)); // break to outer block
-            emit_body(f, body);
+            emit_body(f, body, ctx);
             f.instruction(&Instruction::Br(0)) // continue loop
                 .instruction(&Instruction::End) // end loop
                 .instruction(&Instruction::End); // end block
@@ -530,25 +631,25 @@ fn emit_op(f: &mut Function, op: &IrOp) {
             f.instruction(&Instruction::Block(BlockType::Empty)); // $else
             f.instruction(&Instruction::Block(BlockType::Empty)); // $after
             f.instruction(&Instruction::Loop(BlockType::Empty)); // $begin
-            emit_body(f, outer_test);
+            emit_body(f, outer_test, ctx);
             pop(f);
             f.instruction(&Instruction::I32Eqz)
                 .instruction(&Instruction::BrIf(2)); // to $else
-            emit_body(f, inner_test);
+            emit_body(f, inner_test, ctx);
             pop(f);
             f.instruction(&Instruction::I32Eqz)
                 .instruction(&Instruction::BrIf(1)); // to $after
-            emit_body(f, body);
+            emit_body(f, body, ctx);
             f.instruction(&Instruction::Br(0)); // back to $begin
             f.instruction(&Instruction::End); // end loop
             f.instruction(&Instruction::End); // end $after block
-            emit_body(f, after_repeat);
+            emit_body(f, after_repeat, ctx);
             if else_body.is_some() {
                 f.instruction(&Instruction::Br(1)); // skip else, goto $end
             }
             f.instruction(&Instruction::End); // end $else block
             if let Some(eb) = else_body {
-                emit_body(f, eb);
+                emit_body(f, eb, ctx);
             }
             f.instruction(&Instruction::End); // end $end block
         }
@@ -647,6 +748,90 @@ fn emit_op(f: &mut Function, op: &IrOp) {
                 .instruction(&Instruction::I32Add)
                 .instruction(&Instruction::LocalSet(CACHED_DSP_LOCAL));
         }
+
+        // -- Float stack ops -----------------------------------------------
+        IrOp::FDrop => fsp_inc(f),
+        IrOp::FDup => {
+            fpeek(f);
+            fpush_via_local(f, ctx.f64_local_0);
+        }
+        IrOp::FSwap => {
+            fpop(f);
+            f.instruction(&Instruction::LocalSet(ctx.f64_local_0));
+            fpop(f);
+            f.instruction(&Instruction::LocalSet(ctx.f64_local_1));
+            fpush_from_local(f, ctx.f64_local_0);
+            fpush_from_local(f, ctx.f64_local_1);
+        }
+        IrOp::FOver => {
+            f.instruction(&Instruction::GlobalGet(FSP))
+                .instruction(&Instruction::I32Const(8))
+                .instruction(&Instruction::I32Add)
+                .instruction(&Instruction::F64Load(MEM8));
+            fpush_via_local(f, ctx.f64_local_0);
+        }
+
+        // -- Float arithmetic ----------------------------------------------
+        IrOp::FAdd => emit_float_binary(f, ctx, &Instruction::F64Add),
+        IrOp::FSub => emit_float_binary(f, ctx, &Instruction::F64Sub),
+        IrOp::FMul => emit_float_binary(f, ctx, &Instruction::F64Mul),
+        IrOp::FDiv => emit_float_binary(f, ctx, &Instruction::F64Div),
+        IrOp::FMin => emit_float_binary(f, ctx, &Instruction::F64Min),
+        IrOp::FMax => emit_float_binary(f, ctx, &Instruction::F64Max),
+        IrOp::FNegate => emit_float_unary(f, ctx, &Instruction::F64Neg),
+        IrOp::FAbs => emit_float_unary(f, ctx, &Instruction::F64Abs),
+        IrOp::FSqrt => emit_float_unary(f, ctx, &Instruction::F64Sqrt),
+        IrOp::FFloor => emit_float_unary(f, ctx, &Instruction::F64Floor),
+        IrOp::FRound => emit_float_unary(f, ctx, &Instruction::F64Nearest),
+
+        // -- Float comparisons (cross-stack) --------------------------------
+        IrOp::FZeroEq => {
+            fpop(f);
+            f.instruction(&Instruction::F64Const(0.0))
+                .instruction(&Instruction::F64Eq);
+            bool_to_forth_flag(f, SCRATCH_BASE);
+            push_via_local(f, SCRATCH_BASE + 1);
+        }
+        IrOp::FZeroLt => {
+            fpop(f);
+            f.instruction(&Instruction::F64Const(0.0))
+                .instruction(&Instruction::F64Lt);
+            bool_to_forth_flag(f, SCRATCH_BASE);
+            push_via_local(f, SCRATCH_BASE + 1);
+        }
+        IrOp::FEq => emit_float_cmp(f, ctx, &Instruction::F64Eq),
+        IrOp::FLt => emit_float_cmp(f, ctx, &Instruction::F64Lt),
+
+        // -- Float memory (cross-stack) ------------------------------------
+        IrOp::FetchFloat => {
+            // ( addr -- ) ( F: -- r )
+            pop(f); // addr on operand stack
+            f.instruction(&Instruction::F64Load(MEM8));
+            fpush_via_local(f, ctx.f64_local_0);
+        }
+        IrOp::StoreFloat => {
+            // ( addr -- ) ( F: r -- )
+            pop_to(f, SCRATCH_BASE); // addr
+            fpop(f);
+            f.instruction(&Instruction::LocalSet(ctx.f64_local_0));
+            f.instruction(&Instruction::LocalGet(SCRATCH_BASE))
+                .instruction(&Instruction::LocalGet(ctx.f64_local_0))
+                .instruction(&Instruction::F64Store(MEM8));
+        }
+
+        // -- Float/integer conversions (cross-stack) -----------------------
+        IrOp::StoF => {
+            // ( n -- ) ( F: -- r )
+            pop(f);
+            f.instruction(&Instruction::F64ConvertI32S);
+            fpush_via_local(f, ctx.f64_local_0);
+        }
+        IrOp::FtoS => {
+            // ( F: r -- ) ( -- n )
+            fpop(f);
+            f.instruction(&Instruction::I32TruncF64S);
+            push_via_local(f, SCRATCH_BASE);
+        }
     }
 }
 
@@ -684,7 +869,7 @@ fn emit_cmp(f: &mut Function, cmp: &Instruction<'_>) {
 }
 
 /// Emit a DO...LOOP / DO...+LOOP construct.
-fn emit_do_loop(f: &mut Function, body: &[IrOp], is_plus_loop: bool) {
+fn emit_do_loop(f: &mut Function, body: &[IrOp], is_plus_loop: bool, ctx: &EmitCtx) {
     // DO ( limit index -- )
     pop_to(f, SCRATCH_BASE); // index
     pop_to(f, SCRATCH_BASE + 1); // limit
@@ -704,7 +889,7 @@ fn emit_do_loop(f: &mut Function, body: &[IrOp], is_plus_loop: bool) {
     f.instruction(&Instruction::Block(BlockType::Empty));
     f.instruction(&Instruction::Loop(BlockType::Empty));
 
-    emit_body(f, body);
+    emit_body(f, body, ctx);
 
     // Pop current index from return stack into scratch local
     rpop(f);
@@ -807,6 +992,29 @@ fn is_promotable(ops: &[IrOp]) -> bool {
             IrOp::ToR | IrOp::FromR | IrOp::RFetch => return false,
             IrOp::Emit | IrOp::Dot | IrOp::Cr | IrOp::Type => return false,
             IrOp::PushI64(_) | IrOp::PushF64(_) => return false,
+            IrOp::FDup
+            | IrOp::FDrop
+            | IrOp::FSwap
+            | IrOp::FOver
+            | IrOp::FAdd
+            | IrOp::FSub
+            | IrOp::FMul
+            | IrOp::FDiv
+            | IrOp::FNegate
+            | IrOp::FAbs
+            | IrOp::FSqrt
+            | IrOp::FMin
+            | IrOp::FMax
+            | IrOp::FFloor
+            | IrOp::FRound
+            | IrOp::FZeroEq
+            | IrOp::FZeroLt
+            | IrOp::FEq
+            | IrOp::FLt
+            | IrOp::FetchFloat
+            | IrOp::StoreFloat
+            | IrOp::StoF
+            | IrOp::FtoS => return false,
             _ => {}
         }
     }
@@ -839,6 +1047,27 @@ fn stack_delta(op: &IrOp) -> i32 {
         IrOp::Store | IrOp::CStore | IrOp::PlusStore => -2,
         IrOp::TwoDup => 2,
         IrOp::TwoDrop => -2,
+        // Float-only ops: no data stack change
+        IrOp::PushF64(_)
+        | IrOp::FDup
+        | IrOp::FDrop
+        | IrOp::FSwap
+        | IrOp::FOver
+        | IrOp::FAdd
+        | IrOp::FSub
+        | IrOp::FMul
+        | IrOp::FDiv
+        | IrOp::FNegate
+        | IrOp::FAbs
+        | IrOp::FSqrt
+        | IrOp::FMin
+        | IrOp::FMax
+        | IrOp::FFloor
+        | IrOp::FRound => 0,
+        // Cross-stack: push to data stack
+        IrOp::FZeroEq | IrOp::FZeroLt | IrOp::FEq | IrOp::FLt | IrOp::FtoS => 1,
+        // Cross-stack: pop from data stack
+        IrOp::FetchFloat | IrOp::StoreFloat | IrOp::StoF => -1,
         _ => 0,
     }
 }
@@ -897,7 +1126,9 @@ fn compute_stack_needs(ops: &[IrOp]) -> (u32, i32) {
             | IrOp::Fetch
             | IrOp::CFetch => depth - 1,
             IrOp::TwoDrop => depth - 2,
-            // Push ops don't read existing items
+            // Cross-stack ops that pop from data stack
+            IrOp::FetchFloat | IrOp::StoreFloat | IrOp::StoF => depth - 1,
+            // Push ops and float-only ops don't read data stack items
             _ => depth,
         };
         min_accessed = min_accessed.min(reads_from);
@@ -1320,6 +1551,83 @@ fn emit_promoted_cmp(f: &mut Function, sim: &mut StackSim, cmp: &Instruction<'_>
 // Public API
 // ---------------------------------------------------------------------------
 
+/// Check an IR body (recursively) for float ops; conservative: some matched ops use no f64 local.
+fn needs_f64_locals(ops: &[IrOp]) -> bool {
+    for op in ops {
+        match op {
+            IrOp::PushF64(_)
+            | IrOp::FDup
+            | IrOp::FDrop
+            | IrOp::FSwap
+            | IrOp::FOver
+            | IrOp::FAdd
+            | IrOp::FSub
+            | IrOp::FMul
+            | IrOp::FDiv
+            | IrOp::FNegate
+            | IrOp::FAbs
+            | IrOp::FSqrt
+            | IrOp::FMin
+            | IrOp::FMax
+            | IrOp::FFloor
+            | IrOp::FRound
+            | IrOp::FZeroEq
+            | IrOp::FZeroLt
+            | IrOp::FEq
+            | IrOp::FLt
+            | IrOp::FetchFloat
+            | IrOp::StoreFloat
+            | IrOp::StoF
+            | IrOp::FtoS => return true,
+            IrOp::If {
+                then_body,
+                else_body,
+            } => {
+                if needs_f64_locals(then_body) {
+                    return true;
+                }
+                if let Some(eb) = else_body
+                    && needs_f64_locals(eb)
+                {
+                    return true;
+                }
+            }
+            IrOp::DoLoop { body, .. } | IrOp::BeginUntil { body } | IrOp::BeginAgain { body } => {
+                if needs_f64_locals(body) {
+                    return true;
+                }
+            }
+            IrOp::BeginWhileRepeat { test, body } => {
+                if needs_f64_locals(test) || needs_f64_locals(body) {
+                    return true;
+                }
+            }
+            IrOp::BeginDoubleWhileRepeat {
+                outer_test,
+                inner_test,
+                body,
+                after_repeat,
+                else_body,
+            } => {
+                if needs_f64_locals(outer_test)
+                    || needs_f64_locals(inner_test)
+                    || needs_f64_locals(body)
+                    || needs_f64_locals(after_repeat)
+                {
+                    return true;
+                }
+                if let Some(eb) = else_body
+                    && needs_f64_locals(eb)
+                {
+                    return true;
+                }
+            }
+            _ => {}
+        }
+    }
+    false
+}
+
 /// Estimate scratch locals a function body needs (not counting cached DSP).
 fn count_scratch_locals(ops: &[IrOp]) -> u32 {
     let mut max: u32 = 4; // baseline scratch space (indices SCRATCH_BASE..SCRATCH_BASE+3)
@@ -1469,7 +1777,17 @@ pub fn compile_word(
     } else {
         1 + scratch_count
     };
-    let mut func = Function::new(vec![(num_locals, ValType::I32)]);
+    let has_floats = needs_f64_locals(body);
+    let num_f64: u32 = if has_floats { 2 } else { 0 };
+    let mut locals_decl = vec![(num_locals, ValType::I32)];
+    if num_f64 > 0 {
+        locals_decl.push((num_f64, ValType::F64));
+    }
+    let mut func = Function::new(locals_decl);
+    let ctx = EmitCtx {
+        f64_local_0: num_locals,
+        f64_local_1: num_locals + 1,
+    };
 
     // Prologue: cache $dsp global into local 0
     func.instruction(&Instruction::GlobalGet(DSP))
@@ -1485,7 +1803,7 @@ pub fn compile_word(
         }
         emit_promoted_epilogue(&mut func, &mut sim);
     } else {
-        emit_body(&mut func, body);
+        emit_body(&mut func, body, &ctx);
     }
 
     // Epilogue: write cached DSP back to the $dsp global
@@ -1517,9 +1835,14 @@ pub fn compile_word(
 
 /// Emit all IR operations, replacing `Call`/`TailCall` with direct calls
 /// when the target word is within the consolidated module.
-fn emit_consolidated_body(f: &mut Function, ops: &[IrOp], local_fn_map: &HashMap<WordId, u32>) {
+fn emit_consolidated_body(
+    f: &mut Function,
+    ops: &[IrOp],
+    local_fn_map: &HashMap<WordId, u32>,
+    ctx: &EmitCtx,
+) {
     for op in ops {
-        emit_consolidated_op(f, op, local_fn_map);
+        emit_consolidated_op(f, op, local_fn_map, ctx);
     }
 }
 
@@ -1528,7 +1851,12 @@ fn emit_consolidated_body(f: &mut Function, ops: &[IrOp], local_fn_map: &HashMap
 /// For `Call` and `TailCall`, emits a direct `call` if the target is in the
 /// consolidated module, otherwise falls back to `call_indirect`. For control
 /// flow with nested bodies, recurses to handle inner calls.
-fn emit_consolidated_op(f: &mut Function, op: &IrOp, local_fn_map: &HashMap<WordId, u32>) {
+fn emit_consolidated_op(
+    f: &mut Function,
+    op: &IrOp,
+    local_fn_map: &HashMap<WordId, u32>,
+    ctx: &EmitCtx,
+) {
     match op {
         IrOp::Call(word_id) => {
             if let Some(&fn_idx) = local_fn_map.get(word_id) {
@@ -1570,21 +1898,21 @@ fn emit_consolidated_op(f: &mut Function, op: &IrOp, local_fn_map: &HashMap<Word
         } => {
             pop(f);
             f.instruction(&Instruction::If(BlockType::Empty));
-            emit_consolidated_body(f, then_body, local_fn_map);
+            emit_consolidated_body(f, then_body, local_fn_map, ctx);
             if let Some(eb) = else_body {
                 f.instruction(&Instruction::Else);
-                emit_consolidated_body(f, eb, local_fn_map);
+                emit_consolidated_body(f, eb, local_fn_map, ctx);
             }
             f.instruction(&Instruction::End);
         }
 
         IrOp::DoLoop { body, is_plus_loop } => {
-            emit_consolidated_do_loop(f, body, *is_plus_loop, local_fn_map);
+            emit_consolidated_do_loop(f, body, *is_plus_loop, local_fn_map, ctx);
         }
 
         IrOp::BeginUntil { body } => {
             f.instruction(&Instruction::Loop(BlockType::Empty));
-            emit_consolidated_body(f, body, local_fn_map);
+            emit_consolidated_body(f, body, local_fn_map, ctx);
             pop(f);
             f.instruction(&Instruction::I32Eqz)
                 .instruction(&Instruction::BrIf(0))
@@ -1593,7 +1921,7 @@ fn emit_consolidated_op(f: &mut Function, op: &IrOp, local_fn_map: &HashMap<Word
 
         IrOp::BeginAgain { body } => {
             f.instruction(&Instruction::Loop(BlockType::Empty));
-            emit_consolidated_body(f, body, local_fn_map);
+            emit_consolidated_body(f, body, local_fn_map, ctx);
             f.instruction(&Instruction::Br(0))
                 .instruction(&Instruction::End);
         }
@@ -1601,11 +1929,11 @@ fn emit_consolidated_op(f: &mut Function, op: &IrOp, local_fn_map: &HashMap<Word
         IrOp::BeginWhileRepeat { test, body } => {
             f.instruction(&Instruction::Block(BlockType::Empty));
             f.instruction(&Instruction::Loop(BlockType::Empty));
-            emit_consolidated_body(f, test, local_fn_map);
+            emit_consolidated_body(f, test, local_fn_map, ctx);
             pop(f);
             f.instruction(&Instruction::I32Eqz)
                 .instruction(&Instruction::BrIf(1));
-            emit_consolidated_body(f, body, local_fn_map);
+            emit_consolidated_body(f, body, local_fn_map, ctx);
             f.instruction(&Instruction::Br(0))
                 .instruction(&Instruction::End)
                 .instruction(&Instruction::End);
@@ -1622,31 +1950,31 @@ fn emit_consolidated_op(f: &mut Function, op: &IrOp, local_fn_map: &HashMap<Word
             f.instruction(&Instruction::Block(BlockType::Empty)); // $else
             f.instruction(&Instruction::Block(BlockType::Empty)); // $after
             f.instruction(&Instruction::Loop(BlockType::Empty)); // $begin
-            emit_consolidated_body(f, outer_test, local_fn_map);
+            emit_consolidated_body(f, outer_test, local_fn_map, ctx);
             pop(f);
             f.instruction(&Instruction::I32Eqz)
                 .instruction(&Instruction::BrIf(2)); // to $else
-            emit_consolidated_body(f, inner_test, local_fn_map);
+            emit_consolidated_body(f, inner_test, local_fn_map, ctx);
             pop(f);
             f.instruction(&Instruction::I32Eqz)
                 .instruction(&Instruction::BrIf(1)); // to $after
-            emit_consolidated_body(f, body, local_fn_map);
+            emit_consolidated_body(f, body, local_fn_map, ctx);
             f.instruction(&Instruction::Br(0)); // back to $begin
             f.instruction(&Instruction::End); // end loop
             f.instruction(&Instruction::End); // end $after block
-            emit_consolidated_body(f, after_repeat, local_fn_map);
+            emit_consolidated_body(f, after_repeat, local_fn_map, ctx);
             if else_body.is_some() {
                 f.instruction(&Instruction::Br(1)); // skip else, goto $end
             }
             f.instruction(&Instruction::End); // end $else block
             if let Some(eb) = else_body {
-                emit_consolidated_body(f, eb, local_fn_map);
+                emit_consolidated_body(f, eb, local_fn_map, ctx);
             }
             f.instruction(&Instruction::End); // end $end block
         }
 
         // All other ops have no nested bodies with calls -- delegate to emit_op
-        other => emit_op(f, other),
+        other => emit_op(f, other, ctx),
     }
 }
 
@@ -1656,6 +1984,7 @@ fn emit_consolidated_do_loop(
     body: &[IrOp],
     is_plus_loop: bool,
     local_fn_map: &HashMap<WordId, u32>,
+    ctx: &EmitCtx,
 ) {
     // DO ( limit index -- )
     pop_to(f, SCRATCH_BASE); // index
@@ -1670,7 +1999,7 @@ fn emit_consolidated_do_loop(
     f.instruction(&Instruction::Block(BlockType::Empty));
     f.instruction(&Instruction::Loop(BlockType::Empty));
 
-    emit_consolidated_body(f, body, local_fn_map);
+    emit_consolidated_body(f, body, local_fn_map, ctx);
 
     // Pop current index from return stack into scratch local
     rpop(f);
@@ -1849,14 +2178,24 @@ pub fn compile_consolidated_module(
     let mut code = CodeSection::new();
     for (_word_id, body) in words {
         let num_locals = 1 + count_scratch_locals(body);
-        let mut func = Function::new(vec![(num_locals, ValType::I32)]);
+        let has_floats = needs_f64_locals(body);
+        let num_f64: u32 = if has_floats { 2 } else { 0 };
+        let mut locals_decl = vec![(num_locals, ValType::I32)];
+        if num_f64 > 0 {
+            locals_decl.push((num_f64, ValType::F64));
+        }
+        let mut func = Function::new(locals_decl);
+        let ctx = EmitCtx {
+            f64_local_0: num_locals,
+            f64_local_1: num_locals + 1,
+        };
 
         // Prologue: cache $dsp global into local 0
         func.instruction(&Instruction::GlobalGet(DSP))
             .instruction(&Instruction::LocalSet(CACHED_DSP_LOCAL));
 
         // Body with consolidated call support
-        emit_consolidated_body(&mut func, body, local_fn_map);
+        emit_consolidated_body(&mut func, body, local_fn_map, &ctx);
 
         // Epilogue: write cached DSP back to the $dsp global
         func.instruction(&Instruction::LocalGet(CACHED_DSP_LOCAL))
@@ -2746,4 +3085,246 @@ mod tests {
         assert!(!is_promotable(&ops));
         assert_eq!(run_word(&ops), vec![42]);
     }
+
+    // ===================================================================
+    // Float IR tests
+    // ===================================================================
+
+    /// Compile and run `ops` under wasmtime; return the float stack as f64 values, top first.
+    fn run_float_word(ops: &[IrOp]) -> Vec<f64> {
+        use wasmtime::*;
+
+        let compiled = compile_word("test", ops, &default_config()).unwrap();
+        let engine = Engine::default();
+        let mut store = Store::new(&engine, ());
+
+        let memory = Memory::new(&mut store, MemoryType::new(16, None)).unwrap();
+
+        let dsp = Global::new(
+            &mut store,
+            wasmtime::GlobalType::new(ValType::I32, Mutability::Var),
+            Val::I32(DATA_STACK_TOP as i32),
+        )
+        .unwrap();
+
+        let rsp = Global::new(
+            &mut store,
+            wasmtime::GlobalType::new(ValType::I32, Mutability::Var),
+            Val::I32(RETURN_STACK_TOP as i32),
+        )
+        .unwrap();
+
+        let fsp = Global::new(
+            &mut store,
+            wasmtime::GlobalType::new(ValType::I32, Mutability::Var),
+            Val::I32(FLOAT_STACK_TOP as i32),
+        )
+        .unwrap();
+
+        let table = Table::new(
+            &mut store,
+            wasmtime::TableType::new(RefType::FUNCREF, 16, None),
+            Ref::Func(None),
+        )
+        .unwrap();
+
+        let emit_ty = FuncType::new(&engine, [ValType::I32], []);
+        let emit = Func::new(&mut store, emit_ty, |_caller, _params, _results| Ok(()));
+
+        let module = wasmtime::Module::new(&engine, &compiled.bytes).unwrap();
+        let instance = Instance::new(
+            &mut store,
+            &module,
+            &[
+                emit.into(),
+                memory.into(),
+                dsp.into(),
+                rsp.into(),
+                fsp.into(),
+                table.into(),
+            ],
+        )
+        .unwrap();
+
+        instance
+            .get_func(&mut store, "fn")
+            .unwrap()
+            .call(&mut store, &[], &mut [])
+            .unwrap();
+
+        // Read 8-byte little-endian floats from $fsp up to FLOAT_STACK_TOP (top of stack first)
+        let sp = fsp.get(&mut store).unwrap_i32() as u32;
+        let data = memory.data(&store);
+        let mut stack = Vec::new();
+        let mut addr = sp;
+        while addr < FLOAT_STACK_TOP {
+            let b: [u8; 8] = data[addr as usize..addr as usize + 8].try_into().unwrap();
+            stack.push(f64::from_le_bytes(b));
+            addr += 8;
+        }
+        stack
+    }
+
+    #[test]
+    fn compile_push_f64_validates() {
+        let m = compile_word("test", &[IrOp::PushF64(3.25)], &default_config()).unwrap();
+        validate_wasm(&m.bytes).unwrap();
+    }
+
+    #[test]
+    fn compile_float_arithmetic_validates() {
+        let ops = vec![IrOp::PushF64(1.0), IrOp::PushF64(2.0), IrOp::FAdd];
+        let m = compile_word("fadd", &ops, &default_config()).unwrap();
+        validate_wasm(&m.bytes).unwrap();
+    }
+
+    #[test]
+    fn compile_float_cross_stack_validates() {
+        let ops = vec![IrOp::PushI32(42), IrOp::StoF, IrOp::FtoS];
+        let m = compile_word("cross", &ops, &default_config()).unwrap();
+        validate_wasm(&m.bytes).unwrap();
+    }
+
+    #[test]
+    fn execute_push_f64() {
+        assert_eq!(run_float_word(&[IrOp::PushF64(3.25)]), vec![3.25]);
+    }
+
+    #[test]
+    fn execute_float_add() {
+        let ops = vec![IrOp::PushF64(1.0), IrOp::PushF64(2.0), IrOp::FAdd];
+        assert_eq!(run_float_word(&ops), vec![3.0]);
+    }
+
+    #[test]
+    fn execute_float_sub() {
+        let ops = vec![IrOp::PushF64(5.0), IrOp::PushF64(3.0), IrOp::FSub];
+        assert_eq!(run_float_word(&ops), vec![2.0]);
+    }
+
+    #[test]
+    fn execute_float_mul() {
+        let ops = vec![IrOp::PushF64(3.0), IrOp::PushF64(4.0), IrOp::FMul];
+        assert_eq!(run_float_word(&ops), vec![12.0]);
+    }
+
+    #[test]
+    fn execute_float_div() {
+        let ops = vec![IrOp::PushF64(10.0), IrOp::PushF64(4.0), IrOp::FDiv];
+        assert_eq!(run_float_word(&ops), vec![2.5]);
+    }
+
+    #[test]
+    fn execute_float_negate() {
+        let ops = vec![IrOp::PushF64(3.0), IrOp::FNegate];
+        assert_eq!(run_float_word(&ops), vec![-3.0]);
+    }
+
+    #[test]
+    fn execute_float_abs() {
+        let ops = vec![IrOp::PushF64(-7.0), IrOp::FAbs];
+        assert_eq!(run_float_word(&ops), vec![7.0]);
+    }
+
+    #[test]
+    fn execute_float_sqrt() {
+        let ops = vec![IrOp::PushF64(9.0), IrOp::FSqrt];
+        assert_eq!(run_float_word(&ops), vec![3.0]);
+    }
+
+    #[test]
+    fn execute_float_floor() {
+        let ops = vec![IrOp::PushF64(3.7), IrOp::FFloor];
+        assert_eq!(run_float_word(&ops), vec![3.0]);
+    }
+
+    #[test]
+    fn execute_float_round() {
+        let ops = vec![IrOp::PushF64(2.5), IrOp::FRound];
+        assert_eq!(run_float_word(&ops), vec![2.0]); // round ties even
+    }
+
+    #[test]
+    fn execute_float_min_max() {
+        let ops = vec![IrOp::PushF64(3.0), IrOp::PushF64(5.0), IrOp::FMin];
+        assert_eq!(run_float_word(&ops), vec![3.0]);
+        let ops = vec![IrOp::PushF64(3.0), IrOp::PushF64(5.0), IrOp::FMax];
+        assert_eq!(run_float_word(&ops), vec![5.0]);
+    }
+
+    #[test]
+    fn execute_fdup() {
+        let ops = vec![IrOp::PushF64(7.0), IrOp::FDup];
+        assert_eq!(run_float_word(&ops), vec![7.0, 7.0]);
+    }
+
+    #[test]
+    fn execute_fdrop() {
+        let ops = vec![IrOp::PushF64(1.0), IrOp::PushF64(2.0), IrOp::FDrop];
+        assert_eq!(run_float_word(&ops), vec![1.0]);
+    }
+
+    #[test]
+    fn execute_fswap() {
+        let ops = vec![IrOp::PushF64(1.0), IrOp::PushF64(2.0), IrOp::FSwap];
+        assert_eq!(run_float_word(&ops), vec![1.0, 2.0]);
+    }
+
+    #[test]
+    fn execute_fover() {
+        let ops = vec![IrOp::PushF64(1.0), IrOp::PushF64(2.0), IrOp::FOver];
+        assert_eq!(run_float_word(&ops), vec![1.0, 2.0, 1.0]);
+    }
+
+    #[test]
+    fn execute_float_zero_eq() {
+        let ops = vec![IrOp::PushF64(0.0), IrOp::FZeroEq];
+        assert_eq!(run_word(&ops), vec![-1]);
+        let ops = vec![IrOp::PushF64(1.0), IrOp::FZeroEq];
+        assert_eq!(run_word(&ops), vec![0]);
+    }
+
+    #[test]
+    fn execute_float_zero_lt() {
+        let ops = vec![IrOp::PushF64(-1.0), IrOp::FZeroLt];
+        assert_eq!(run_word(&ops), vec![-1]);
+        let ops = vec![IrOp::PushF64(1.0), IrOp::FZeroLt];
+        assert_eq!(run_word(&ops), vec![0]);
+    }
+
+    #[test]
+    fn execute_float_eq() {
+        let ops = vec![IrOp::PushF64(3.0), IrOp::PushF64(3.0), IrOp::FEq];
+        assert_eq!(run_word(&ops), vec![-1]);
+        let ops = vec![IrOp::PushF64(3.0), IrOp::PushF64(4.0), IrOp::FEq];
+        assert_eq!(run_word(&ops), vec![0]);
+    }
+
+    #[test]
+    fn execute_float_lt() {
+        let ops = vec![IrOp::PushF64(2.0), IrOp::PushF64(3.0), IrOp::FLt];
+        assert_eq!(run_word(&ops), vec![-1]);
+        let ops = vec![IrOp::PushF64(3.0), IrOp::PushF64(2.0), IrOp::FLt];
+        assert_eq!(run_word(&ops), vec![0]);
+    }
+
+    #[test]
+    fn execute_stof_ftos() {
+        // ( 42 -- ) ( F: -- 42.0 ) then ( F: 42.0 -- ) ( -- 42 )
+        let ops = vec![IrOp::PushI32(42), IrOp::StoF, IrOp::FtoS];
+        assert_eq!(run_word(&ops), vec![42]);
+    }
+
+    #[test]
+    fn execute_fetch_store_float() {
+        // Store 3.25 at address 0x100 (F!), then fetch it back (F@)
+        let ops = vec![
+            IrOp::PushF64(3.25),
+            IrOp::PushI32(0x100),
+            IrOp::StoreFloat,
+            IrOp::PushI32(0x100),
+            IrOp::FetchFloat,
+        ];
+        assert_eq!(run_float_word(&ops), vec![3.25]);
+    }
 }
diff --git a/crates/core/src/ir.rs b/crates/core/src/ir.rs
index 0e2d5f3..a340740 100644
--- a/crates/core/src/ir.rs
+++ b/crates/core/src/ir.rs
@@ -133,6 +133,62 @@ pub enum IrOp {
     // -- System --
     /// Execute word by function table index: ( xt -- )
     Execute,
+
+    // -- Float stack manipulation --
+    /// Float duplicate: ( F: r -- r r )
+    FDup,
+    /// Float drop: ( F: r -- )
+    FDrop,
+    /// Float swap: ( F: r1 r2 -- r2 r1 )
+    FSwap,
+    /// Float over: ( F: r1 r2 -- r1 r2 r1 )
+    FOver,
+
+    // -- Float arithmetic --
+    /// Float add: ( F: r1 r2 -- r1+r2 )
+    FAdd,
+    /// Float subtract: ( F: r1 r2 -- r1-r2 )
+    FSub,
+    /// Float multiply: ( F: r1 r2 -- r1*r2 )
+    FMul,
+    /// Float divide: ( F: r1 r2 -- r1/r2 )
+    FDiv,
+    /// Float negate: ( F: r -- -r )
+    FNegate,
+    /// Float absolute value: ( F: r -- |r| )
+    FAbs,
+    /// Float square root: ( F: r -- sqrt(r) )
+    FSqrt,
+    /// Float minimum: ( F: r1 r2 -- min(r1,r2) )
+    FMin,
+    /// Float maximum: ( F: r1 r2 -- max(r1,r2) )
+    FMax,
+    /// Float floor: ( F: r -- floor(r) )
+    FFloor,
+    /// Float round to nearest integer, ties to even: ( F: r -- round(r) )
+    FRound,
+
+    // -- Float comparisons (cross-stack: pop float, push data) --
+    /// Float zero equal: ( F: r -- ) ( -- flag )
+    FZeroEq,
+    /// Float zero less-than: ( F: r -- ) ( -- flag )
+    FZeroLt,
+    /// Float equal: ( F: r1 r2 -- ) ( -- flag )
+    FEq,
+    /// Float less-than: ( F: r1 r2 -- ) ( -- flag )
+    FLt,
+
+    // -- Float memory (cross-stack) --
+    /// Float fetch: ( addr -- ) ( F: -- r )
+    FetchFloat,
+    /// Float store: ( addr -- ) ( F: r -- )
+    StoreFloat,
+
+    // -- Float/integer conversions (cross-stack) --
+    /// Single to float: ( n -- ) ( F: -- r )
+    StoF,
+    /// Float to single: ( F: r -- ) ( -- n )
+    FtoS,
 }
 
 /// A compiled word definition as IR.
diff --git a/crates/core/src/optimizer.rs b/crates/core/src/optimizer.rs
index 19c8586..c7dc8c0 100644
--- a/crates/core/src/optimizer.rs
+++ b/crates/core/src/optimizer.rs
@@ -194,6 +194,26 @@ fn peephole_one_pass(ops: Vec<IrOp>) -> Vec<IrOp> {
                     out.pop();
                     continue;
                 }
+                // PushF64, FDrop => remove both
+                (IrOp::PushF64(_), IrOp::FDrop) => {
+                    out.pop();
+                    continue;
+                }
+                // FDup, FDrop => remove both
+                (IrOp::FDup, IrOp::FDrop) => {
+                    out.pop();
+                    continue;
+                }
+                // FSwap, FSwap => remove both
+                (IrOp::FSwap, IrOp::FSwap) => {
+                    out.pop();
+                    continue;
+                }
+                // FNegate, FNegate => remove both
+                (IrOp::FNegate, IrOp::FNegate) => {
+                    out.pop();
+                    continue;
+                }
                 // Over, Over => TwoDup
                 (IrOp::Over, IrOp::Over) => {
                     out.pop();
@@ -236,6 +256,17 @@ fn constant_fold(ops: Vec<IrOp>) -> Vec<IrOp> {
             continue;
         }
 
+        // Try float binary fold: last two outputs are PushF64
+        if out.len() >= 2
+            && let Some(result) =
+                try_float_binary_fold(&out[out.len() - 2], &out[out.len() - 1], &op)
+        {
+            out.pop();
+            out.pop();
+            out.push(IrOp::PushF64(result));
+            continue;
+        }
+
         // Try unary fold: last output is PushI32, current op is foldable
         if !out.is_empty()
             && let Some(result) = try_unary_fold(&out[out.len() - 1], &op)
@@ -245,6 +276,15 @@ fn constant_fold(ops: Vec<IrOp>) -> Vec<IrOp> {
             continue;
         }
 
+        // Try float unary fold: last output is PushF64
+        if !out.is_empty()
+            && let Some(result) = try_float_unary_fold(&out[out.len() - 1], &op)
+        {
+            out.pop();
+            out.push(IrOp::PushF64(result));
+            continue;
+        }
+
         out.push(op);
     }
     out
@@ -317,6 +357,53 @@ fn try_unary_fold(n_op: &IrOp, op: &IrOp) -> Option<i32> {
     }
 }
 
+/// Try to fold a binary float operation on two `PushF64` constants.
+///
+/// Returns `None` for non-constant operands, non-foldable ops, or skipped edge cases.
+fn try_float_binary_fold(a_op: &IrOp, b_op: &IrOp, op: &IrOp) -> Option<f64> {
+    let (IrOp::PushF64(a), IrOp::PushF64(b)) = (a_op, b_op) else {
+        return None;
+    };
+    let (a, b) = (*a, *b);
+
+    match op {
+        IrOp::FAdd => Some(a + b),
+        IrOp::FSub => Some(a - b),
+        IrOp::FMul => Some(a * b),
+        // Division by zero is never folded; it is left for run time.
+        IrOp::FDiv => (b != 0.0).then_some(a / b),
+        // WASM `f64.min`/`f64.max` propagate NaN and order -0.0 below +0.0;
+        // Rust's `f64::min`/`f64::max` guarantee neither, so skip those inputs.
+        IrOp::FMin | IrOp::FMax if a.is_nan() || b.is_nan() || (a == 0.0 && b == 0.0) => None,
+        IrOp::FMin => Some(a.min(b)),
+        IrOp::FMax => Some(a.max(b)),
+        _ => None,
+    }
+}
+
+/// Fold a unary float operation applied to a `PushF64` constant.
+///
+/// Yields `None` when `n_op` is not a float constant, `op` is not a foldable
+/// unary float op, or the operand is rejected (negative or NaN `FSqrt` input).
+fn try_float_unary_fold(n_op: &IrOp, op: &IrOp) -> Option<f64> {
+    let &IrOp::PushF64(value) = n_op else {
+        return None;
+    };
+
+    let folded = match op {
+        IrOp::FNegate => -value,
+        IrOp::FAbs => value.abs(),
+        // Only fold square roots of operands that compare >= 0.0.
+        IrOp::FSqrt if value >= 0.0 => value.sqrt(),
+        IrOp::FFloor => value.floor(),
+        IrOp::FRound => value.round_ties_even(),
+        // A rejected FSqrt operand or an op this pass does not fold.
+        _ => return None,
+    };
+
+    Some(folded)
+}
+
 // ---------------------------------------------------------------------------
 // Pass 3: Strength reduction
 // ---------------------------------------------------------------------------
@@ -779,6 +866,52 @@ mod tests {
         ));
     }
 
+    // Float peephole tests
+    #[test]
+    fn float_push_fdrop_removed() {
+        assert_eq!(opt(vec![IrOp::PushF64(1.0), IrOp::FDrop]), vec![]);
+    }
+
+    #[test]
+    fn float_fdup_fdrop_removed() {
+        assert_eq!(opt(vec![IrOp::FDup, IrOp::FDrop]), vec![]);
+    }
+
+    #[test]
+    fn float_fswap_fswap_removed() {
+        assert_eq!(opt(vec![IrOp::FSwap, IrOp::FSwap]), vec![]);
+    }
+
+    #[test]
+    fn float_fnegate_fnegate_removed() {
+        assert_eq!(opt(vec![IrOp::FNegate, IrOp::FNegate]), vec![]);
+    }
+
+    // Float constant folding tests
+    #[test]
+    fn float_constant_fold_add() {
+        assert_eq!(
+            opt(vec![IrOp::PushF64(1.5), IrOp::PushF64(2.5), IrOp::FAdd]),
+            vec![IrOp::PushF64(4.0)]
+        );
+    }
+
+    #[test]
+    fn float_constant_fold_negate() {
+        assert_eq!(
+            opt(vec![IrOp::PushF64(3.0), IrOp::FNegate]),
+            vec![IrOp::PushF64(-3.0)]
+        );
+    }
+
+    #[test]
+    fn float_constant_fold_sqrt() {
+        assert_eq!(
+            opt(vec![IrOp::PushF64(9.0), IrOp::FSqrt]),
+            vec![IrOp::PushF64(3.0)]
+        );
+    }
+
     #[test]
     fn no_inline_large() {
         let mut bodies = HashMap::new();
diff --git a/crates/core/src/outer.rs b/crates/core/src/outer.rs
index 6265cd2..b2da080 100644
--- a/crates/core/src/outer.rs
+++ b/crates/core/src/outer.rs
@@ -7148,95 +7148,10 @@ impl ForthVM {
 
     /// Float stack manipulation words.
     fn register_float_stack_ops(&mut self) -> anyhow::Result<()> {
-        // FDROP ( F: r -- )
-        {
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    let sp = fsp.get(&mut caller).unwrap_i32() as u32;
-                    if sp >= FLOAT_STACK_TOP {
-                        return Err(wasmtime::Error::msg("float stack underflow"));
-                    }
-                    fsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap();
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("FDROP", false, func)?;
-        }
-
-        // FDUP ( F: r -- r r )
-        {
-            let memory = self.memory;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    let sp = fsp.get(&mut caller).unwrap_i32() as u32;
-                    if sp >= FLOAT_STACK_TOP {
-                        return Err(wasmtime::Error::msg("float stack underflow"));
-                    }
-                    let new_sp = sp - 8;
-                    if new_sp < FLOAT_STACK_BASE {
-                        return Err(wasmtime::Error::msg("float stack overflow"));
-                    }
-                    let mem = memory.data(&caller);
-                    let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap();
-                    fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap();
-                    let mem = memory.data_mut(&mut caller);
-                    mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&bytes);
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("FDUP", false, func)?;
-        }
-
-        // FSWAP ( F: r1 r2 -- r2 r1 )
-        {
-            let memory = self.memory;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    let sp = fsp.get(&mut caller).unwrap_i32() as u32;
-                    let mem = memory.data(&caller);
-                    let b: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap();
-                    let a: [u8; 8] = mem[sp as usize + 8..sp as usize + 16].try_into().unwrap();
-                    let mem = memory.data_mut(&mut caller);
-                    mem[sp as usize..sp as usize + 8].copy_from_slice(&a);
-                    mem[sp as usize + 8..sp as usize + 16].copy_from_slice(&b);
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("FSWAP", false, func)?;
-        }
-
-        // FOVER ( F: r1 r2 -- r1 r2 r1 )
-        {
-            let memory = self.memory;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    let sp = fsp.get(&mut caller).unwrap_i32() as u32;
-                    let mem = memory.data(&caller);
-                    let a: [u8; 8] = mem[sp as usize + 8..sp as usize + 16].try_into().unwrap();
-                    let new_sp = sp - 8;
-                    if new_sp < FLOAT_STACK_BASE {
-                        return Err(wasmtime::Error::msg("float stack overflow"));
-                    }
-                    fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap();
-                    let mem = memory.data_mut(&mut caller);
-                    mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&a);
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("FOVER", false, func)?;
-        }
+        self.register_primitive("FDROP", false, vec![IrOp::FDrop])?;
+        self.register_primitive("FDUP", false, vec![IrOp::FDup])?;
+        self.register_primitive("FSWAP", false, vec![IrOp::FSwap])?;
+        self.register_primitive("FOVER", false, vec![IrOp::FOver])?;
 
         // FROT ( F: r1 r2 r3 -- r2 r3 r1 )
         {
@@ -7288,166 +7203,35 @@ impl ForthVM {
             self.register_host_primitive("FDEPTH", false, func)?;
         }
 
-        // FNIP ( F: r1 r2 -- r2 )
-        {
-            let memory = self.memory;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    let sp = fsp.get(&mut caller).unwrap_i32() as u32;
-                    let mem = memory.data(&caller);
-                    let top: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap();
-                    let new_sp = sp + 8;
-                    fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap();
-                    let mem = memory.data_mut(&mut caller);
-                    mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&top);
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("FNIP", false, func)?;
-        }
-
-        // FTUCK ( F: r1 r2 -- r2 r1 r2 )
-        {
-            let memory = self.memory;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    let sp = fsp.get(&mut caller).unwrap_i32() as u32;
-                    let mem = memory.data(&caller);
-                    let r2: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap();
-                    let r1: [u8; 8] = mem[sp as usize + 8..sp as usize + 16].try_into().unwrap();
-                    let new_sp = sp - 8;
-                    if new_sp < FLOAT_STACK_BASE {
-                        return Err(wasmtime::Error::msg("float stack overflow"));
-                    }
-                    fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap();
-                    let mem = memory.data_mut(&mut caller);
-                    // r2 r1 r2 (bottom to top)
-                    mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&r2);
-                    mem[new_sp as usize + 8..new_sp as usize + 16].copy_from_slice(&r1);
-                    mem[new_sp as usize + 16..new_sp as usize + 24].copy_from_slice(&r2);
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("FTUCK", false, func)?;
-        }
+        self.register_primitive("FNIP", false, vec![IrOp::FSwap, IrOp::FDrop])?;
+        self.register_primitive("FTUCK", false, vec![IrOp::FSwap, IrOp::FOver])?;
 
         Ok(())
     }
 
     /// Float arithmetic words.
     fn register_float_arithmetic(&mut self) -> anyhow::Result<()> {
-        self.register_float_binary("F+", |a, b| a + b)?;
-        self.register_float_binary("F-", |a, b| a - b)?;
-        self.register_float_binary("F*", |a, b| a * b)?;
-        self.register_float_binary("F/", |a, b| a / b)?;
-        self.register_float_unary("FNEGATE", |a| -a)?;
-        self.register_float_unary("FABS", f64::abs)?;
-        self.register_float_binary("FMAX", f64::max)?;
-        self.register_float_binary("FMIN", f64::min)?;
-        self.register_float_unary("FSQRT", f64::sqrt)?;
-        self.register_float_unary("FLOOR", f64::floor)?;
-        self.register_float_unary("FROUND", f64::round_ties_even)?;
+        self.register_primitive("F+", false, vec![IrOp::FAdd])?;
+        self.register_primitive("F-", false, vec![IrOp::FSub])?;
+        self.register_primitive("F*", false, vec![IrOp::FMul])?;
+        self.register_primitive("F/", false, vec![IrOp::FDiv])?;
+        self.register_primitive("FNEGATE", false, vec![IrOp::FNegate])?;
+        self.register_primitive("FABS", false, vec![IrOp::FAbs])?;
+        self.register_primitive("FMAX", false, vec![IrOp::FMax])?;
+        self.register_primitive("FMIN", false, vec![IrOp::FMin])?;
+        self.register_primitive("FSQRT", false, vec![IrOp::FSqrt])?;
+        self.register_primitive("FLOOR", false, vec![IrOp::FFloor])?;
+        self.register_primitive("FROUND", false, vec![IrOp::FRound])?;
         self.register_float_binary("F**", f64::powf)?;
         Ok(())
     }
 
     /// Float comparison words. Results go on the DATA stack.
     fn register_float_comparisons(&mut self) -> anyhow::Result<()> {
-        // F0= ( -- flag ) ( F: r -- )
-        {
-            let memory = self.memory;
-            let dsp = self.dsp;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    let sp = fsp.get(&mut caller).unwrap_i32() as u32;
-                    let mem = memory.data(&caller);
-                    let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap();
-                    let val = f64::from_le_bytes(bytes);
-                    fsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap();
-                    let flag: i32 = if val == 0.0 { -1 } else { 0 };
-                    let dsp_val = dsp.get(&mut caller).unwrap_i32() as u32;
-                    let new_dsp = dsp_val - CELL_SIZE;
-                    dsp.set(&mut caller, Val::I32(new_dsp as i32)).unwrap();
-                    let mem = memory.data_mut(&mut caller);
-                    mem[new_dsp as usize..new_dsp as usize + 4]
-                        .copy_from_slice(&flag.to_le_bytes());
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("F0=", false, func)?;
-        }
-
-        // F0< ( -- flag ) ( F: r -- )
-        {
-            let memory = self.memory;
-            let dsp = self.dsp;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    let sp = fsp.get(&mut caller).unwrap_i32() as u32;
-                    let mem = memory.data(&caller);
-                    let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap();
-                    let val = f64::from_le_bytes(bytes);
-                    fsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap();
-                    let flag: i32 = if val < 0.0 { -1 } else { 0 };
-                    let dsp_val = dsp.get(&mut caller).unwrap_i32() as u32;
-                    let new_dsp = dsp_val - CELL_SIZE;
-                    dsp.set(&mut caller, Val::I32(new_dsp as i32)).unwrap();
-                    let mem = memory.data_mut(&mut caller);
-                    mem[new_dsp as usize..new_dsp as usize + 4]
-                        .copy_from_slice(&flag.to_le_bytes());
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("F0<", false, func)?;
-        }
-
-        // Helper for binary float comparisons that pop two floats and push a flag
-        let register_float_cmp =
-            |vm: &mut Self, name: &str, cmp: fn(f64, f64) -> bool| -> anyhow::Result<()> {
-                let memory = vm.memory;
-                let dsp = vm.dsp;
-                let fsp = vm.fsp;
-                let func = Func::new(
-                    &mut vm.store,
-                    FuncType::new(&vm.engine, [], []),
-                    move |mut caller, _, _| {
-                        let sp = fsp.get(&mut caller).unwrap_i32() as u32;
-                        let mem = memory.data(&caller);
-                        let b_bytes: [u8; 8] =
-                            mem[sp as usize..sp as usize + 8].try_into().unwrap();
-                        let a_bytes: [u8; 8] =
-                            mem[sp as usize + 8..sp as usize + 16].try_into().unwrap();
-                        let b = f64::from_le_bytes(b_bytes);
-                        let a = f64::from_le_bytes(a_bytes);
-                        fsp.set(&mut caller, Val::I32((sp + 16) as i32)).unwrap();
-                        let flag: i32 = if cmp(a, b) { -1 } else { 0 };
-                        let dsp_val = dsp.get(&mut caller).unwrap_i32() as u32;
-                        let new_dsp = dsp_val - CELL_SIZE;
-                        dsp.set(&mut caller, Val::I32(new_dsp as i32)).unwrap();
-                        let mem = memory.data_mut(&mut caller);
-                        mem[new_dsp as usize..new_dsp as usize + 4]
-                            .copy_from_slice(&flag.to_le_bytes());
-                        Ok(())
-                    },
-                );
-                vm.register_host_primitive(name, false, func)?;
-                Ok(())
-            };
-
-        register_float_cmp(self, "F=", |a, b| a == b)?;
-        register_float_cmp(self, "F<", |a, b| a < b)?;
+        self.register_primitive("F0=", false, vec![IrOp::FZeroEq])?;
+        self.register_primitive("F0<", false, vec![IrOp::FZeroLt])?;
+        self.register_primitive("F=", false, vec![IrOp::FEq])?;
+        self.register_primitive("F<", false, vec![IrOp::FLt])?;
 
         // F~ ( -- flag ) ( F: r1 r2 r3 -- ) approximate float comparison
         // If r3 > 0: true if |r1-r2| < r3
@@ -7502,76 +7286,8 @@ impl ForthVM {
 
     /// Float memory words.
     fn register_float_memory(&mut self) -> anyhow::Result<()> {
-        // F@ ( f-addr -- ) ( F: -- r ) fetch a float from memory
-        {
-            let memory = self.memory;
-            let dsp = self.dsp;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    // Read all we need from memory first
-                    let sp = dsp.get(&mut caller).unwrap_i32() as u32;
-                    let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32;
-                    let (addr, val) = {
-                        let mem = memory.data(&caller);
-                        let addr_bytes: [u8; 4] =
-                            mem[sp as usize..sp as usize + 4].try_into().unwrap();
-                        let addr = u32::from_le_bytes(addr_bytes) as usize;
-                        let float_bytes: [u8; 8] = mem[addr..addr + 8].try_into().unwrap();
-                        (addr, f64::from_le_bytes(float_bytes))
-                    };
-                    let _ = addr;
-                    // Update stack pointers
-                    dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32))
-                        .unwrap();
-                    let new_fsp = fsp_val - FLOAT_SIZE;
-                    fsp.set(&mut caller, Val::I32(new_fsp as i32)).unwrap();
-                    // Write float to float stack
-                    let mem = memory.data_mut(&mut caller);
-                    mem[new_fsp as usize..new_fsp as usize + 8].copy_from_slice(&val.to_le_bytes());
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("F@", false, func)?;
-        }
-
-        // F! ( f-addr -- ) ( F: r -- ) store a float to memory
-        {
-            let memory = self.memory;
-            let dsp = self.dsp;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    // Read all we need first
-                    let sp = dsp.get(&mut caller).unwrap_i32() as u32;
-                    let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32;
-                    let (addr, float_bytes) = {
-                        let mem = memory.data(&caller);
-                        let addr_bytes: [u8; 4] =
-                            mem[sp as usize..sp as usize + 4].try_into().unwrap();
-                        let addr = u32::from_le_bytes(addr_bytes) as usize;
-                        let float_bytes: [u8; 8] = mem[fsp_val as usize..fsp_val as usize + 8]
-                            .try_into()
-                            .unwrap();
-                        (addr, float_bytes)
-                    };
-                    // Update stack pointers
-                    dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32))
-                        .unwrap();
-                    fsp.set(&mut caller, Val::I32((fsp_val + FLOAT_SIZE) as i32))
-                        .unwrap();
-                    // Store float at addr
-                    let mem = memory.data_mut(&mut caller);
-                    mem[addr..addr + 8].copy_from_slice(&float_bytes);
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("F!", false, func)?;
-        }
+        self.register_primitive("F@", false, vec![IrOp::FetchFloat])?;
+        self.register_primitive("F!", false, vec![IrOp::StoreFloat])?;
 
         // FLOAT+ ( f-addr1 -- f-addr2 ) add float size to address
         self.register_primitive(
@@ -7742,61 +7458,8 @@ impl ForthVM {
             self.register_host_primitive("F>D", false, func)?;
         }
 
-        // S>F ( n -- ) ( F: -- r ) convert single-cell integer to float
-        {
-            let memory = self.memory;
-            let dsp = self.dsp;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    let sp = dsp.get(&mut caller).unwrap_i32() as u32;
-                    let mem = memory.data(&caller);
-                    let b: [u8; 4] = mem[sp as usize..sp as usize + 4].try_into().unwrap();
-                    let n = i32::from_le_bytes(b);
-                    dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32))
-                        .unwrap();
-                    let f = n as f64;
-                    let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32;
-                    let new_fsp = fsp_val - FLOAT_SIZE;
-                    fsp.set(&mut caller, Val::I32(new_fsp as i32)).unwrap();
-                    let mem = memory.data_mut(&mut caller);
-                    mem[new_fsp as usize..new_fsp as usize + 8].copy_from_slice(&f.to_le_bytes());
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("S>F", false, func)?;
-        }
-
-        // F>S ( -- n ) ( F: r -- ) convert float to single-cell integer
-        {
-            let memory = self.memory;
-            let dsp = self.dsp;
-            let fsp = self.fsp;
-            let func = Func::new(
-                &mut self.store,
-                FuncType::new(&self.engine, [], []),
-                move |mut caller, _, _| {
-                    let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32;
-                    let mem = memory.data(&caller);
-                    let bytes: [u8; 8] = mem[fsp_val as usize..fsp_val as usize + 8]
-                        .try_into()
-                        .unwrap();
-                    let f = f64::from_le_bytes(bytes);
-                    fsp.set(&mut caller, Val::I32((fsp_val + FLOAT_SIZE) as i32))
-                        .unwrap();
-                    let n = f as i32;
-                    let sp = dsp.get(&mut caller).unwrap_i32() as u32;
-                    let new_sp = sp - CELL_SIZE;
-                    dsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap();
-                    let mem = memory.data_mut(&mut caller);
-                    mem[new_sp as usize..new_sp as usize + 4].copy_from_slice(&n.to_le_bytes());
-                    Ok(())
-                },
-            );
-            self.register_host_primitive("F>S", false, func)?;
-        }
+        self.register_primitive("S>F", false, vec![IrOp::StoF])?;
+        self.register_primitive("F>S", false, vec![IrOp::FtoS])?;
 
         Ok(())
     }
@@ -8361,27 +8024,9 @@ impl ForthVM {
     }
 
     /// Compile a float literal for use inside a colon definition.
-    /// Creates a tiny host function that pushes the given f64 onto the float stack.
+    /// Emits a `PushF64` IR op, which compiles to a WASM `f64.const` plus a float-stack push.
     fn compile_float_literal(&mut self, val: f64) -> anyhow::Result<()> {
-        let memory = self.memory;
-        let fsp = self.fsp;
-        let func = Func::new(
-            &mut self.store,
-            FuncType::new(&self.engine, [], []),
-            move |mut caller, _, _| {
-                let sp = fsp.get(&mut caller).unwrap_i32() as u32;
-                let new_sp = sp - FLOAT_SIZE;
-                if new_sp < FLOAT_STACK_BASE {
-                    return Err(wasmtime::Error::msg("float stack overflow"));
-                }
-                fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap();
-                let mem = memory.data_mut(&mut caller);
-                mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&val.to_le_bytes());
-                Ok(())
-            },
-        );
-        let word_id = self.install_anon_func(func)?;
-        self.push_ir(IrOp::Call(word_id));
+        self.push_ir(IrOp::PushF64(val)); // NOTE(review): verify float-stack overflow check
         Ok(())
     }
 
@@ -10463,4 +10108,59 @@ mod tests {
         assert_eq!(eval_stack(": T = ; 5 5 T"), vec![-1]);
         assert_eq!(eval_stack(": T < ; 3 5 T"), vec![-1]);
     }
+
+    // ===================================================================
+    // Float IR tests
+    // ===================================================================
+
+    #[test]
+    fn float_ir_add() {
+        assert_eq!(eval_output("1E 2E F+ F."), "3.000000 "); // F+ outside a colon definition
+    }
+
+    #[test]
+    fn float_ir_literal_in_colon() {
+        assert_eq!(eval_output(": T 1.5E0 2.5E0 F+ F. ; T"), "4.000000 "); // inside `: T ... ;`
+    }
+
+    #[test]
+    fn float_ir_conversions() {
+        assert_eq!(eval_stack("42 S>F F>S"), vec![42]); // int -> float -> int round trip
+    }
+
+    #[test]
+    fn float_ir_memory() {
+        assert_eq!(eval_output("FVARIABLE X 3.14E0 X F! X F@ F."), "3.140000 "); // F! then F@
+    }
+
+    #[test]
+    fn float_ir_comparisons() {
+        assert_eq!(eval_stack("1E 2E F<"), vec![-1]); // true flag is -1
+        assert_eq!(eval_stack("2E 1E F<"), vec![0]); // false flag is 0
+        assert_eq!(eval_stack("3E 3E F="), vec![-1]);
+        assert_eq!(eval_stack("0E F0="), vec![-1]);
+        assert_eq!(eval_stack("1E F0="), vec![0]);
+        assert_eq!(eval_stack("-1E F0<"), vec![-1]);
+        assert_eq!(eval_stack("1E F0<"), vec![0]);
+    }
+
+    #[test]
+    fn float_ir_stack_ops() {
+        assert_eq!(eval_output("1E FDUP F. F."), "1.000000 1.000000 ");
+        assert_eq!(eval_output("1E 2E FSWAP F. F."), "1.000000 2.000000 "); // swap leaves 1 on top
+        assert_eq!(
+            eval_output("1E 2E FOVER F. F. F."), // FOVER copies the second item (1) to the top
+            "1.000000 2.000000 1.000000 "
+        );
+    }
+
+    #[test]
+    fn float_ir_arithmetic() {
+        assert_eq!(eval_output("10E 3E F- F."), "7.000000 "); // 10 - 3: second item minus top
+        assert_eq!(eval_output("3E 4E F* F."), "12.000000 ");
+        assert_eq!(eval_output("10E 4E F/ F."), "2.500000 "); // 10 / 4: second item over top
+        assert_eq!(eval_output("3E FNEGATE F."), "-3.000000 ");
+        assert_eq!(eval_output("-7E FABS F."), "7.000000 ");
+        assert_eq!(eval_output("9E FSQRT F."), "3.000000 ");
+    }
 }