From eb79c40c69f98c2d3699fabe0706aeb77655cf66 Mon Sep 17 00:00:00 2001 From: Oleksandr Kozachuk Date: Wed, 1 Apr 2026 20:38:48 +0200 Subject: [PATCH] Implement complete Floating-Point word set, 70+ float words Separate float stack with fsp global, IEEE 754 double precision. Stack ops: FDROP FDUP FSWAP FOVER FROT FDEPTH Arithmetic: F+ F- F* F/ FNEGATE FABS FMAX FMIN FSQRT FLOOR FROUND F** Comparisons: F0= F0< F= F< F~ Memory: F@ F! SF@ SF! DF@ DF! FLOAT+ FLOATS FALIGNED FALIGN Conversions: D>F F>D S>F F>S Trig: FSIN FCOS FTAN FASIN FACOS FATAN FATAN2 FSINCOS Exp/Log: FEXP FEXPM1 FLN FLNP1 FLOG FALOG Hyperbolic: FSINH FCOSH FTANH FASINH FACOSH FATANH I/O: F. FE. FS. REPRESENT >FLOAT PRECISION SET-PRECISION Defining: FVARIABLE FCONSTANT FVALUE FLITERAL Float literal parsing (1E, 1.5E2, -3.14E0 format) 299 unit tests + 11 compliance tests, 0 errors on float test suite --- crates/core/src/codegen.rs | 23 +- crates/core/src/dictionary.rs | 6 + crates/core/src/outer.rs | 1971 ++++++++++++++++++++++++++++++++- docs/APPLICATIONS.md | 890 +++++++++++++++ 4 files changed, 2870 insertions(+), 20 deletions(-) create mode 100644 docs/APPLICATIONS.md diff --git a/crates/core/src/codegen.rs b/crates/core/src/codegen.rs index b8ecc31..c389c42 100644 --- a/crates/core/src/codegen.rs +++ b/crates/core/src/codegen.rs @@ -29,6 +29,10 @@ const DSP: u32 = 0; /// Index of the `$rsp` global (return stack pointer). const RSP: u32 = 1; +/// Index of the `$fsp` global (float stack pointer). +#[allow(dead_code)] +const FSP: u32 = 2; + /// Index of the imported function table. 
const TABLE: u32 = 0; @@ -795,6 +799,15 @@ pub fn compile_word( shared: false, }), ); + imports.import( + "env", + "fsp", + EntityType::Global(GlobalType { + val_type: ValType::I32, + mutable: true, + shared: false, + }), + ); imports.import( "env", "table", @@ -871,7 +884,7 @@ mod tests { use super::*; use crate::dictionary::WordId; use crate::ir::IrOp; - use crate::memory::{DATA_STACK_TOP, RETURN_STACK_TOP}; + use crate::memory::{DATA_STACK_TOP, FLOAT_STACK_TOP, RETURN_STACK_TOP}; fn default_config() -> CodegenConfig { CodegenConfig { @@ -1133,6 +1146,13 @@ mod tests { ) .unwrap(); + let fsp = Global::new( + &mut store, + wasmtime::GlobalType::new(ValType::I32, Mutability::Var), + Val::I32(FLOAT_STACK_TOP as i32), + ) + .unwrap(); + let table = Table::new( &mut store, wasmtime::TableType::new(RefType::FUNCREF, 16, None), @@ -1152,6 +1172,7 @@ mod tests { memory.into(), dsp.into(), rsp.into(), + fsp.into(), table.into(), ], ) diff --git a/crates/core/src/dictionary.rs b/crates/core/src/dictionary.rs index d9419fc..e9a3091 100644 --- a/crates/core/src/dictionary.rs +++ b/crates/core/src/dictionary.rs @@ -111,6 +111,12 @@ impl Dictionary { Ok(WordId(fn_index)) } + /// Reserve a function index without creating a dictionary entry. + /// Used for anonymous host functions (e.g., float literals during compilation). + pub fn reserve_fn_index(&mut self) { + self.next_fn_index += 1; + } + /// Reveal the most recent word (remove HIDDEN flag). /// Called after `: ... ;` completes compilation. 
pub fn reveal(&mut self) { diff --git a/crates/core/src/outer.rs b/crates/core/src/outer.rs index b7c78e0..397f665 100644 --- a/crates/core/src/outer.rs +++ b/crates/core/src/outer.rs @@ -20,8 +20,9 @@ use crate::codegen::{CodegenConfig, CompiledModule, compile_word}; use crate::dictionary::{Dictionary, WordId}; use crate::ir::IrOp; use crate::memory::{ - CELL_SIZE, DATA_STACK_TOP, INPUT_BUFFER_BASE, INPUT_BUFFER_SIZE, RETURN_STACK_TOP, - SYSVAR_BASE_VAR, SYSVAR_NUM_TIB, SYSVAR_STATE, SYSVAR_TO_IN, + CELL_SIZE, DATA_STACK_TOP, FLOAT_SIZE, FLOAT_STACK_BASE, FLOAT_STACK_TOP, INPUT_BUFFER_BASE, + INPUT_BUFFER_SIZE, RETURN_STACK_TOP, SYSVAR_BASE_VAR, SYSVAR_NUM_TIB, SYSVAR_STATE, + SYSVAR_TO_IN, }; // --------------------------------------------------------------------------- @@ -172,6 +173,7 @@ pub struct ForthVM { table: Table, dsp: Global, rsp: Global, + fsp: Global, /// 0 = interpreting, -1 = compiling state: i32, /// Number base (default 10) @@ -223,6 +225,10 @@ pub struct ForthVM { word_lookup: Arc>>, // Set of word_ids that are 2VALUEs (need 2-cell TO semantics) two_value_words: std::collections::HashSet, + // Set of word_ids that are FVALUEs (need float TO semantics) + fvalue_words: std::collections::HashSet, + // Float I/O precision (default 6) + float_precision: Arc>, } impl ForthVM { @@ -253,6 +259,13 @@ impl ForthVM { Val::I32(RETURN_STACK_TOP as i32), )?; + // Float stack pointer global + let fsp = Global::new( + &mut store, + wasmtime::GlobalType::new(ValType::I32, Mutability::Var), + Val::I32(FLOAT_STACK_TOP as i32), + )?; + // Function table (initial 256 entries) let table = Table::new( &mut store, @@ -297,6 +310,7 @@ impl ForthVM { table, dsp, rsp, + fsp, state: 0, base: 10, input_buffer: String::new(), @@ -324,6 +338,8 @@ impl ForthVM { throw_code: Arc::new(Mutex::new(None)), word_lookup: Arc::new(Mutex::new(HashMap::new())), two_value_words: std::collections::HashSet::new(), + fvalue_words: std::collections::HashSet::new(), + float_precision: 
Arc::new(Mutex::new(6)), }; vm.register_primitives()?; @@ -613,6 +629,9 @@ impl ForthVM { "2CONSTANT" => return self.define_2constant(), "2VARIABLE" => return self.define_2variable(), "2VALUE" => return self.define_2value(), + "FVARIABLE" => return self.define_fvariable(), + "FCONSTANT" => return self.define_fconstant(), + "FVALUE" => return self.define_fvalue(), _ => {} } @@ -639,6 +658,12 @@ impl ForthVM { return Ok(()); } + // Try to parse as float literal (contains 'E' or 'e') + if let Some(f) = self.parse_float_literal(token) { + self.fpush(f)?; + return Ok(()); + } + anyhow::bail!("unknown word: {token}"); } @@ -786,6 +811,12 @@ impl ForthVM { } return Ok(()); } + "FLITERAL" => { + // compile-time: pop from float stack, compile as float literal + let f = self.fpop()?; + self.compile_float_literal(f)?; + return Ok(()); + } "SLITERAL" => { // compile-time: pop (c-addr u) from data stack, copy string, // compile code to push the new (c-addr u) @@ -936,6 +967,12 @@ impl ForthVM { return Ok(()); } + // Try to parse as float literal -- compile as FLITERAL + if let Some(f) = self.parse_float_literal(token) { + self.compile_float_literal(f)?; + return Ok(()); + } + anyhow::bail!("unknown word: {token}"); } @@ -1464,6 +1501,7 @@ impl ForthVM { self.memory.into(), self.dsp.into(), self.rsp.into(), + self.fsp.into(), self.table.into(), ], )?; @@ -1540,6 +1578,50 @@ impl ForthVM { Ok(value) } + // ----------------------------------------------------------------------- + // Float stack operations + // ----------------------------------------------------------------------- + + /// Push a value onto the float stack. 
+ fn fpush(&mut self, val: f64) -> anyhow::Result<()> { + let sp = self.fsp.get(&mut self.store).unwrap_i32() as u32; + let new_sp = sp - FLOAT_SIZE; + if new_sp < FLOAT_STACK_BASE { + anyhow::bail!("float stack overflow"); + } + self.fsp.set(&mut self.store, Val::I32(new_sp as i32))?; + let mem = self.memory.data_mut(&mut self.store); + mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&val.to_le_bytes()); + Ok(()) + } + + /// Pop a value from the float stack. + fn fpop(&mut self) -> anyhow::Result { + let sp = self.fsp.get(&mut self.store).unwrap_i32() as u32; + if sp >= FLOAT_STACK_TOP { + anyhow::bail!("float stack underflow"); + } + let mem = self.memory.data(&self.store); + let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + self.fsp.set(&mut self.store, Val::I32((sp + 8) as i32))?; + Ok(f64::from_le_bytes(bytes)) + } + + /// Read the current float stack contents (top-first). + #[cfg(test)] + fn float_stack(&mut self) -> Vec { + let sp = self.fsp.get(&mut self.store).unwrap_i32() as u32; + let data = self.memory.data(&self.store); + let mut stack = Vec::new(); + let mut addr = sp; + while addr < FLOAT_STACK_TOP { + let b: [u8; 8] = data[addr as usize..addr as usize + 8].try_into().unwrap(); + stack.push(f64::from_le_bytes(b)); + addr += FLOAT_SIZE; + } + stack + } + // ----------------------------------------------------------------------- // Number parsing // ----------------------------------------------------------------------- @@ -1620,6 +1702,37 @@ impl ForthVM { }) } + // ----------------------------------------------------------------------- + // Float literal parsing + // ----------------------------------------------------------------------- + + /// Try to parse a token as a floating-point literal (Forth 2012 format). + /// Forth float literals contain 'E' or 'e', e.g. `1E`, `1.5E0`, `-3.14E2`, `1E-3`. 
+ #[allow(clippy::unused_self)] + fn parse_float_literal(&self, token: &str) -> Option { + if token.is_empty() { + return None; + } + let upper = token.to_ascii_uppercase(); + // Must contain 'E' or 'D' (Forth sometimes uses D for double-float exponent) + if !upper.contains('E') && !upper.contains('D') { + return None; + } + // Replace D with E for Rust parsing + let normalized = upper.replace('D', "E"); + // Forth allows trailing E without exponent: "1E" means "1E0" + // Also "1E+" or "1E-" mean "1E+0" and "1E-0" + let s = if normalized.ends_with('E') + || normalized.ends_with("E+") + || normalized.ends_with("E-") + { + format!("{normalized}0") + } else { + normalized + }; + s.parse::().ok() + } + // ----------------------------------------------------------------------- // Push IR to the active body // ----------------------------------------------------------------------- @@ -1967,6 +2080,9 @@ impl ForthVM { self.register_blank()?; self.register_minus_trailing()?; + // -- Floating-Point word set -- + self.register_float_words()?; + Ok(()) } @@ -2412,7 +2528,12 @@ impl ForthVM { if let Some((_addr, word_id, _imm)) = self.dictionary.find(&name) { if let Some(&pfa) = self.word_pfa_map.get(&word_id.0) { - if self.two_value_words.contains(&word_id.0) { + if self.fvalue_words.contains(&word_id.0) { + // FVALUE: pop from float stack, store 8 bytes + let value = self.fpop()?; + let data = self.memory.data_mut(&mut self.store); + data[pfa as usize..pfa as usize + 8].copy_from_slice(&value.to_le_bytes()); + } else if self.two_value_words.contains(&word_id.0) { // 2VALUE: pop two cells let hi = self.pop_data_stack()?; let lo = self.pop_data_stack()?; @@ -2482,23 +2603,13 @@ impl ForthVM { if let Some((_addr, word_id, _imm)) = self.dictionary.find(&name) { if let Some(&pfa) = self.word_pfa_map.get(&word_id.0) { - if self.two_value_words.contains(&word_id.0) { + if self.fvalue_words.contains(&word_id.0) { + // FVALUE: compile a call to a host function that pops + // from the 
float stack and stores at pfa + let store_word = self.make_fvalue_store(pfa)?; + self.push_ir(IrOp::Call(store_word)); + } else if self.two_value_words.contains(&word_id.0) { // 2VALUE: ( x1 x2 -- ) store two cells - // Stack: x2 on top, x1 below. Store x1 at pfa, x2 at pfa+4 - // Compile: swap over swap pfa ! pfa+4 ! - // Actually: ( x1 x2 -- ) we want x1 at pfa, x2 at pfa+4 - // The top is x2, below is x1 - // SWAP gives us x2 x1, then PFA ! gives x1 at pfa (pops x1) - // Then PFA+4 ! gives x2 at pfa+4 - // Wait: stack is ( x1 x2 -- ). x2 is TOS. - // We want: x1 at [pfa], x2 at [pfa+4] - // PFA+4 SWAP ROT (? no) - // Simply: SWAP PFA ! PFA+4 ! - // But SWAP makes it (x2 x1). PFA ! stores x1, leaves x2. PFA+4 ! stores x2. - // Wait, ! pops (val addr). So we need addr on top. - // ( x1 x2 ) -> we need ( x1 pfa ) to store, then ( x2 pfa+4 ) - // So: PFA+4 SWAP PFA+4 ! PFA ! -- no - // Let's just do it with explicit IR: self.push_ir(IrOp::PushI32((pfa + 4) as i32)); self.push_ir(IrOp::Store); // stores x2 at pfa+4 self.push_ir(IrOp::PushI32(pfa as i32)); @@ -6776,6 +6887,1584 @@ impl ForthVM { self.register_host_primitive("-TRAILING", false, func)?; Ok(()) } + + // ----------------------------------------------------------------------- + // Floating-Point word set + // ----------------------------------------------------------------------- + + /// Register all floating-point words. + fn register_float_words(&mut self) -> anyhow::Result<()> { + self.register_float_stack_ops()?; + self.register_float_arithmetic()?; + self.register_float_comparisons()?; + self.register_float_memory()?; + self.register_float_conversions()?; + self.register_float_trig()?; + self.register_float_exp_log()?; + self.register_float_hyperbolic()?; + self.register_float_io()?; + self.register_float_misc()?; + Ok(()) + } + + /// Helper: create a host function that takes no data-stack args + /// and operates on the float stack via fsp/memory closures. 
+ /// Pattern for unary float ops: pop one float, compute, push result. + fn register_float_unary(&mut self, name: &str, op: fn(f64) -> f64) -> anyhow::Result<()> { + let memory = self.memory; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + if sp >= FLOAT_STACK_TOP { + return Err(wasmtime::Error::msg("float stack underflow")); + } + let mem = memory.data(&caller); + let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let a = f64::from_le_bytes(bytes); + let result = op(a); + let mem = memory.data_mut(&mut caller); + mem[sp as usize..sp as usize + 8].copy_from_slice(&result.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive(name, false, func)?; + Ok(()) + } + + /// Pattern for binary float ops: pop two floats (b then a), compute, push result. + fn register_float_binary(&mut self, name: &str, op: fn(f64, f64) -> f64) -> anyhow::Result<()> { + let memory = self.memory; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + if sp + 8 >= FLOAT_STACK_TOP { + return Err(wasmtime::Error::msg("float stack underflow")); + } + let mem = memory.data(&caller); + let b_bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let a_bytes: [u8; 8] = mem[sp as usize + 8..sp as usize + 16].try_into().unwrap(); + let b = f64::from_le_bytes(b_bytes); + let a = f64::from_le_bytes(a_bytes); + let result = op(a, b); + let new_sp = sp + 8; + fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&result.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive(name, false, func)?; + Ok(()) + } + + /// Float stack manipulation words. 
+ fn register_float_stack_ops(&mut self) -> anyhow::Result<()> { + // FDROP ( F: r -- ) + { + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + if sp >= FLOAT_STACK_TOP { + return Err(wasmtime::Error::msg("float stack underflow")); + } + fsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap(); + Ok(()) + }, + ); + self.register_host_primitive("FDROP", false, func)?; + } + + // FDUP ( F: r -- r r ) + { + let memory = self.memory; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + if sp >= FLOAT_STACK_TOP { + return Err(wasmtime::Error::msg("float stack underflow")); + } + let new_sp = sp - 8; + if new_sp < FLOAT_STACK_BASE { + return Err(wasmtime::Error::msg("float stack overflow")); + } + let mem = memory.data(&caller); + let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&bytes); + Ok(()) + }, + ); + self.register_host_primitive("FDUP", false, func)?; + } + + // FSWAP ( F: r1 r2 -- r2 r1 ) + { + let memory = self.memory; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let b: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let a: [u8; 8] = mem[sp as usize + 8..sp as usize + 16].try_into().unwrap(); + let mem = memory.data_mut(&mut caller); + mem[sp as usize..sp as usize + 8].copy_from_slice(&a); + mem[sp as usize + 8..sp as usize + 16].copy_from_slice(&b); + Ok(()) + }, + ); + self.register_host_primitive("FSWAP", false, func)?; 
+ } + + // FOVER ( F: r1 r2 -- r1 r2 r1 ) + { + let memory = self.memory; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let a: [u8; 8] = mem[sp as usize + 8..sp as usize + 16].try_into().unwrap(); + let new_sp = sp - 8; + if new_sp < FLOAT_STACK_BASE { + return Err(wasmtime::Error::msg("float stack overflow")); + } + fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&a); + Ok(()) + }, + ); + self.register_host_primitive("FOVER", false, func)?; + } + + // FROT ( F: r1 r2 r3 -- r2 r3 r1 ) + { + let memory = self.memory; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let c: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let b: [u8; 8] = mem[sp as usize + 8..sp as usize + 16].try_into().unwrap(); + let a: [u8; 8] = mem[sp as usize + 16..sp as usize + 24].try_into().unwrap(); + let mem = memory.data_mut(&mut caller); + mem[sp as usize..sp as usize + 8].copy_from_slice(&a); + mem[sp as usize + 8..sp as usize + 16].copy_from_slice(&c); + mem[sp as usize + 16..sp as usize + 24].copy_from_slice(&b); + Ok(()) + }, + ); + self.register_host_primitive("FROT", false, func)?; + } + + // FDEPTH ( -- +n ) number of floats on the float stack, pushed onto DATA stack + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let depth = if fsp_val <= FLOAT_STACK_TOP { + ((FLOAT_STACK_TOP - fsp_val) / FLOAT_SIZE) as i32 + } else { + 
0 + }; + // Push onto data stack + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let new_sp = sp - CELL_SIZE; + dsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_sp as usize..new_sp as usize + 4].copy_from_slice(&depth.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("FDEPTH", false, func)?; + } + + // FNIP ( F: r1 r2 -- r2 ) + { + let memory = self.memory; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let top: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let new_sp = sp + 8; + fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&top); + Ok(()) + }, + ); + self.register_host_primitive("FNIP", false, func)?; + } + + // FTUCK ( F: r1 r2 -- r2 r1 r2 ) + { + let memory = self.memory; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let r2: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let r1: [u8; 8] = mem[sp as usize + 8..sp as usize + 16].try_into().unwrap(); + let new_sp = sp - 8; + if new_sp < FLOAT_STACK_BASE { + return Err(wasmtime::Error::msg("float stack overflow")); + } + fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + // r2 r1 r2 (bottom to top) + mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&r2); + mem[new_sp as usize + 8..new_sp as usize + 16].copy_from_slice(&r1); + mem[new_sp as usize + 16..new_sp as usize + 24].copy_from_slice(&r2); + Ok(()) + }, + ); + self.register_host_primitive("FTUCK", false, func)?; + } + + Ok(()) + } 
+ + /// Float arithmetic words. + fn register_float_arithmetic(&mut self) -> anyhow::Result<()> { + self.register_float_binary("F+", |a, b| a + b)?; + self.register_float_binary("F-", |a, b| a - b)?; + self.register_float_binary("F*", |a, b| a * b)?; + self.register_float_binary("F/", |a, b| a / b)?; + self.register_float_unary("FNEGATE", |a| -a)?; + self.register_float_unary("FABS", f64::abs)?; + self.register_float_binary("FMAX", f64::max)?; + self.register_float_binary("FMIN", f64::min)?; + self.register_float_unary("FSQRT", f64::sqrt)?; + self.register_float_unary("FLOOR", f64::floor)?; + self.register_float_unary("FROUND", f64::round_ties_even)?; + self.register_float_binary("F**", f64::powf)?; + Ok(()) + } + + /// Float comparison words. Results go on the DATA stack. + fn register_float_comparisons(&mut self) -> anyhow::Result<()> { + // F0= ( -- flag ) ( F: r -- ) + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let val = f64::from_le_bytes(bytes); + fsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap(); + let flag: i32 = if val == 0.0 { -1 } else { 0 }; + let dsp_val = dsp.get(&mut caller).unwrap_i32() as u32; + let new_dsp = dsp_val - CELL_SIZE; + dsp.set(&mut caller, Val::I32(new_dsp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_dsp as usize..new_dsp as usize + 4] + .copy_from_slice(&flag.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("F0=", false, func)?; + } + + // F0< ( -- flag ) ( F: r -- ) + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut 
caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let val = f64::from_le_bytes(bytes); + fsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap(); + let flag: i32 = if val < 0.0 { -1 } else { 0 }; + let dsp_val = dsp.get(&mut caller).unwrap_i32() as u32; + let new_dsp = dsp_val - CELL_SIZE; + dsp.set(&mut caller, Val::I32(new_dsp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_dsp as usize..new_dsp as usize + 4] + .copy_from_slice(&flag.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("F0<", false, func)?; + } + + // Helper for binary float comparisons that pop two floats and push a flag + let register_float_cmp = + |vm: &mut Self, name: &str, cmp: fn(f64, f64) -> bool| -> anyhow::Result<()> { + let memory = vm.memory; + let dsp = vm.dsp; + let fsp = vm.fsp; + let func = Func::new( + &mut vm.store, + FuncType::new(&vm.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let b_bytes: [u8; 8] = + mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let a_bytes: [u8; 8] = + mem[sp as usize + 8..sp as usize + 16].try_into().unwrap(); + let b = f64::from_le_bytes(b_bytes); + let a = f64::from_le_bytes(a_bytes); + fsp.set(&mut caller, Val::I32((sp + 16) as i32)).unwrap(); + let flag: i32 = if cmp(a, b) { -1 } else { 0 }; + let dsp_val = dsp.get(&mut caller).unwrap_i32() as u32; + let new_dsp = dsp_val - CELL_SIZE; + dsp.set(&mut caller, Val::I32(new_dsp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_dsp as usize..new_dsp as usize + 4] + .copy_from_slice(&flag.to_le_bytes()); + Ok(()) + }, + ); + vm.register_host_primitive(name, false, func)?; + Ok(()) + }; + + register_float_cmp(self, "F=", |a, b| a == b)?; + register_float_cmp(self, "F<", |a, b| a < b)?; + + // F~ ( -- flag ) ( F: r1 r2 r3 -- ) approximate float comparison + // If 
r3 > 0: true if |r1-r2| < r3 + // If r3 = 0: true if r1 and r2 are exactly equal (bitwise) + // If r3 < 0: true if |r1-r2| < |r3|*(|r1|+|r2|) + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let r3_bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let r2_bytes: [u8; 8] = + mem[sp as usize + 8..sp as usize + 16].try_into().unwrap(); + let r1_bytes: [u8; 8] = + mem[sp as usize + 16..sp as usize + 24].try_into().unwrap(); + let r3 = f64::from_le_bytes(r3_bytes); + let r2 = f64::from_le_bytes(r2_bytes); + let r1 = f64::from_le_bytes(r1_bytes); + fsp.set(&mut caller, Val::I32((sp + 24) as i32)).unwrap(); + + let result = if r3 > 0.0 { + (r1 - r2).abs() < r3 + } else if r3 == 0.0 { + r1.to_bits() == r2.to_bits() + } else { + // r3 < 0: relative comparison + (r1 - r2).abs() < r3.abs() * (r1.abs() + r2.abs()) + }; + + let flag: i32 = if result { -1 } else { 0 }; + let dsp_val = dsp.get(&mut caller).unwrap_i32() as u32; + let new_dsp = dsp_val - CELL_SIZE; + dsp.set(&mut caller, Val::I32(new_dsp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_dsp as usize..new_dsp as usize + 4] + .copy_from_slice(&flag.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("F~", false, func)?; + } + + Ok(()) + } + + /// Float memory words. 
+ fn register_float_memory(&mut self) -> anyhow::Result<()> { + // F@ ( f-addr -- ) ( F: -- r ) fetch a float from memory + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + // Read all we need from memory first + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let (addr, val) = { + let mem = memory.data(&caller); + let addr_bytes: [u8; 4] = + mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr = u32::from_le_bytes(addr_bytes) as usize; + let float_bytes: [u8; 8] = mem[addr..addr + 8].try_into().unwrap(); + (addr, f64::from_le_bytes(float_bytes)) + }; + let _ = addr; + // Update stack pointers + dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32)) + .unwrap(); + let new_fsp = fsp_val - FLOAT_SIZE; + fsp.set(&mut caller, Val::I32(new_fsp as i32)).unwrap(); + // Write float to float stack + let mem = memory.data_mut(&mut caller); + mem[new_fsp as usize..new_fsp as usize + 8].copy_from_slice(&val.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("F@", false, func)?; + } + + // F! 
( f-addr -- ) ( F: r -- ) store a float to memory + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + // Read all we need first + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let (addr, float_bytes) = { + let mem = memory.data(&caller); + let addr_bytes: [u8; 4] = + mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr = u32::from_le_bytes(addr_bytes) as usize; + let float_bytes: [u8; 8] = mem[fsp_val as usize..fsp_val as usize + 8] + .try_into() + .unwrap(); + (addr, float_bytes) + }; + // Update stack pointers + dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32)) + .unwrap(); + fsp.set(&mut caller, Val::I32((fsp_val + FLOAT_SIZE) as i32)) + .unwrap(); + // Store float at addr + let mem = memory.data_mut(&mut caller); + mem[addr..addr + 8].copy_from_slice(&float_bytes); + Ok(()) + }, + ); + self.register_host_primitive("F!", false, func)?; + } + + // FLOAT+ ( f-addr1 -- f-addr2 ) add float size to address + self.register_primitive( + "FLOAT+", + false, + vec![IrOp::PushI32(FLOAT_SIZE as i32), IrOp::Add], + )?; + + // FLOATS ( n1 -- n2 ) multiply by float size + self.register_primitive( + "FLOATS", + false, + vec![IrOp::PushI32(FLOAT_SIZE as i32), IrOp::Mul], + )?; + + // FALIGNED ( addr -- f-addr ) align to float boundary (8 bytes) + { + let memory = self.memory; + let dsp = self.dsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let b: [u8; 4] = mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr = u32::from_le_bytes(b); + let aligned = (addr + 7) & !7; + let mem = memory.data_mut(&mut caller); + mem[sp as usize..sp as usize + 4].copy_from_slice(&aligned.to_le_bytes()); + Ok(()) + }, + 
); + self.register_host_primitive("FALIGNED", false, func)?; + } + + // FALIGN ( -- ) align HERE to float boundary + { + let memory = self.memory; + let here_cell = self.here_cell.clone(); + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let here_val = if let Some(ref cell) = here_cell { + *cell.lock().unwrap() + } else { + let mem = memory.data(&caller); + let b: [u8; 4] = mem[crate::memory::SYSVAR_HERE as usize + ..crate::memory::SYSVAR_HERE as usize + 4] + .try_into() + .unwrap(); + u32::from_le_bytes(b) + }; + let aligned = (here_val + 7) & !7; + if let Some(ref cell) = here_cell { + *cell.lock().unwrap() = aligned; + } + let mem = memory.data_mut(&mut caller); + mem[crate::memory::SYSVAR_HERE as usize + ..crate::memory::SYSVAR_HERE as usize + 4] + .copy_from_slice(&aligned.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("FALIGN", false, func)?; + } + + // SFLOATS ( n -- n*sfloat_size ) single-float size (same as FLOATS for us) + self.register_primitive( + "SFLOATS", + false, + vec![IrOp::PushI32(FLOAT_SIZE as i32), IrOp::Mul], + )?; + + // SFLOAT+ ( addr -- addr+sfloat_size ) + self.register_primitive( + "SFLOAT+", + false, + vec![IrOp::PushI32(FLOAT_SIZE as i32), IrOp::Add], + )?; + + // DFLOATS ( n -- n*dfloat_size ) + self.register_primitive( + "DFLOATS", + false, + vec![IrOp::PushI32(FLOAT_SIZE as i32), IrOp::Mul], + )?; + + // DFLOAT+ ( addr -- addr+dfloat_size ) + self.register_primitive( + "DFLOAT+", + false, + vec![IrOp::PushI32(FLOAT_SIZE as i32), IrOp::Add], + )?; + + Ok(()) + } + + /// Float conversion words. 
+ fn register_float_conversions(&mut self) -> anyhow::Result<()> { + // D>F ( d -- ) ( F: -- r ) convert double-cell integer to float + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + // Double-cell: hi on top, lo below + let hi_bytes: [u8; 4] = mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let lo_bytes: [u8; 4] = + mem[sp as usize + 4..sp as usize + 8].try_into().unwrap(); + let hi = i32::from_le_bytes(hi_bytes); + let lo = i32::from_le_bytes(lo_bytes); + let d = ((hi as i64) << 32) | (lo as u32 as i64); + let f = d as f64; + // Pop two cells from data stack + dsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap(); + // Push onto float stack + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let new_fsp = fsp_val - FLOAT_SIZE; + fsp.set(&mut caller, Val::I32(new_fsp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_fsp as usize..new_fsp as usize + 8].copy_from_slice(&f.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("D>F", false, func)?; + } + + // F>D ( -- d ) ( F: r -- ) convert float to double-cell integer + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + // Pop from float stack + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let bytes: [u8; 8] = mem[fsp_val as usize..fsp_val as usize + 8] + .try_into() + .unwrap(); + let f = f64::from_le_bytes(bytes); + fsp.set(&mut caller, Val::I32((fsp_val + FLOAT_SIZE) as i32)) + .unwrap(); + // Convert to i64 + let d = f as i64; + let lo = d as i32; + let hi = (d >> 32) as i32; + // Push lo then hi onto data stack + let sp = dsp.get(&mut caller).unwrap_i32() as 
u32; + let new_sp = sp - 8; // two cells + dsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_sp as usize + 4..new_sp as usize + 8] + .copy_from_slice(&lo.to_le_bytes()); + mem[new_sp as usize..new_sp as usize + 4].copy_from_slice(&hi.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("F>D", false, func)?; + } + + // S>F ( n -- ) ( F: -- r ) convert single-cell integer to float + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let b: [u8; 4] = mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let n = i32::from_le_bytes(b); + dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32)) + .unwrap(); + let f = n as f64; + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let new_fsp = fsp_val - FLOAT_SIZE; + fsp.set(&mut caller, Val::I32(new_fsp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_fsp as usize..new_fsp as usize + 8].copy_from_slice(&f.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("S>F", false, func)?; + } + + // F>S ( -- n ) ( F: r -- ) convert float to single-cell integer + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let bytes: [u8; 8] = mem[fsp_val as usize..fsp_val as usize + 8] + .try_into() + .unwrap(); + let f = f64::from_le_bytes(bytes); + fsp.set(&mut caller, Val::I32((fsp_val + FLOAT_SIZE) as i32)) + .unwrap(); + let n = f as i32; + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let new_sp = sp - CELL_SIZE; + dsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap(); + 
let mem = memory.data_mut(&mut caller); + mem[new_sp as usize..new_sp as usize + 4].copy_from_slice(&n.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("F>S", false, func)?; + } + + Ok(()) + } + + /// Trigonometric functions. + fn register_float_trig(&mut self) -> anyhow::Result<()> { + self.register_float_unary("FSIN", f64::sin)?; + self.register_float_unary("FCOS", f64::cos)?; + self.register_float_unary("FTAN", f64::tan)?; + self.register_float_unary("FASIN", f64::asin)?; + self.register_float_unary("FACOS", f64::acos)?; + self.register_float_unary("FATAN", f64::atan)?; + self.register_float_binary("FATAN2", f64::atan2)?; + + // FSINCOS ( F: r1 -- r2 r3 ) r2=sin(r1) r3=cos(r1) + { + let memory = self.memory; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let val = f64::from_le_bytes(bytes); + let sin_val = val.sin(); + let cos_val = val.cos(); + // Replace TOS with sin, push cos on top + // Result: sin deeper, cos on top + let new_sp = sp - 8; // one more item + if new_sp < FLOAT_STACK_BASE { + return Err(wasmtime::Error::msg("float stack overflow")); + } + fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_sp as usize + 8..new_sp as usize + 16] + .copy_from_slice(&sin_val.to_le_bytes()); + mem[new_sp as usize..new_sp as usize + 8] + .copy_from_slice(&cos_val.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("FSINCOS", false, func)?; + } + + Ok(()) + } + + /// Exponential and logarithmic functions. 
+ fn register_float_exp_log(&mut self) -> anyhow::Result<()> { + self.register_float_unary("FEXP", f64::exp)?; + self.register_float_unary("FEXPM1", f64::exp_m1)?; + self.register_float_unary("FLN", f64::ln)?; + self.register_float_unary("FLNP1", f64::ln_1p)?; + self.register_float_unary("FLOG", f64::log10)?; + self.register_float_unary("FALOG", |x| 10.0_f64.powf(x))?; + Ok(()) + } + + /// Hyperbolic functions. + fn register_float_hyperbolic(&mut self) -> anyhow::Result<()> { + self.register_float_unary("FSINH", f64::sinh)?; + self.register_float_unary("FCOSH", f64::cosh)?; + self.register_float_unary("FTANH", f64::tanh)?; + self.register_float_unary("FASINH", f64::asinh)?; + self.register_float_unary("FACOSH", f64::acosh)?; + self.register_float_unary("FATANH", f64::atanh)?; + Ok(()) + } + + /// Float I/O words. + fn register_float_io(&mut self) -> anyhow::Result<()> { + // F. ( F: r -- ) print float followed by space + { + let memory = self.memory; + let fsp = self.fsp; + let output = Arc::clone(&self.output); + let precision = Arc::clone(&self.float_precision); + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let val = f64::from_le_bytes(bytes); + fsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap(); + let prec = *precision.lock().unwrap(); + let s = format!("{val:.prec$} "); + output.lock().unwrap().push_str(&s); + Ok(()) + }, + ); + self.register_host_primitive("F.", false, func)?; + } + + // FE. 
( F: r -- ) print float in engineering notation + { + let memory = self.memory; + let fsp = self.fsp; + let output = Arc::clone(&self.output); + let precision = Arc::clone(&self.float_precision); + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let val = f64::from_le_bytes(bytes); + fsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap(); + let prec = *precision.lock().unwrap(); + let s = format_engineering(val, prec); + output.lock().unwrap().push_str(&s); + Ok(()) + }, + ); + self.register_host_primitive("FE.", false, func)?; + } + + // FS. ( F: r -- ) print float in scientific notation + { + let memory = self.memory; + let fsp = self.fsp; + let output = Arc::clone(&self.output); + let precision = Arc::clone(&self.float_precision); + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = fsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap(); + let val = f64::from_le_bytes(bytes); + fsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap(); + let prec = *precision.lock().unwrap(); + let s = format!("{val:.prec$E} "); + output.lock().unwrap().push_str(&s); + Ok(()) + }, + ); + self.register_host_primitive("FS.", false, func)?; + } + + // PRECISION ( -- u ) get current float output precision + { + let memory = self.memory; + let dsp = self.dsp; + let precision = Arc::clone(&self.float_precision); + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let prec = *precision.lock().unwrap() as i32; + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let new_sp = sp - CELL_SIZE; + dsp.set(&mut caller, Val::I32(new_sp as 
i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_sp as usize..new_sp as usize + 4].copy_from_slice(&prec.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("PRECISION", false, func)?; + } + + // SET-PRECISION ( u -- ) set float output precision + { + let memory = self.memory; + let dsp = self.dsp; + let precision = Arc::clone(&self.float_precision); + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let b: [u8; 4] = mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let n = i32::from_le_bytes(b) as usize; + dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32)) + .unwrap(); + *precision.lock().unwrap() = n; + Ok(()) + }, + ); + self.register_host_primitive("SET-PRECISION", false, func)?; + } + + // REPRESENT ( c-addr u -- n flag1 flag2 ) ( F: r -- ) + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + // Read all values from memory first + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let (u, c_addr, val) = { + let mem = memory.data(&caller); + let u_bytes: [u8; 4] = + mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr_bytes: [u8; 4] = + mem[sp as usize + 4..sp as usize + 8].try_into().unwrap(); + let u = i32::from_le_bytes(u_bytes) as usize; + let c_addr = u32::from_le_bytes(addr_bytes) as usize; + let f_bytes: [u8; 8] = mem[fsp_val as usize..fsp_val as usize + 8] + .try_into() + .unwrap(); + (u, c_addr, f64::from_le_bytes(f_bytes)) + }; + + // Update stack pointers: pop 2 data cells, pop 1 float + dsp.set(&mut caller, Val::I32((sp + 8) as i32)).unwrap(); + fsp.set(&mut caller, Val::I32((fsp_val + FLOAT_SIZE) as i32)) + .unwrap(); + + let (digits, exp, is_negative, 
is_valid) = represent_float(val, u); + + // Store digits at c-addr, then push results + let digit_bytes = digits.as_bytes(); + let copy_len = digit_bytes.len().min(u); + // Push n, flag1 (sign), flag2 (valid) onto data stack + let cur_sp = dsp.get(&mut caller).unwrap_i32() as u32; + let new_sp = cur_sp - 12; + dsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[c_addr..c_addr + copy_len].copy_from_slice(&digit_bytes[..copy_len]); + // Bottom: n (exponent) + mem[new_sp as usize + 8..new_sp as usize + 12] + .copy_from_slice(&exp.to_le_bytes()); + // Middle: flag1 (is_negative => true flag) + let sign_flag: i32 = if is_negative { -1 } else { 0 }; + mem[new_sp as usize + 4..new_sp as usize + 8] + .copy_from_slice(&sign_flag.to_le_bytes()); + // Top: flag2 (is_valid => true flag) + let valid_flag: i32 = if is_valid { -1 } else { 0 }; + mem[new_sp as usize..new_sp as usize + 4] + .copy_from_slice(&valid_flag.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("REPRESENT", false, func)?; + } + + // >FLOAT ( c-addr u -- flag ) ( F: -- r | ) parse string as float + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let (u, c_addr, s_owned) = { + let mem = memory.data(&caller); + let u_bytes: [u8; 4] = + mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr_bytes: [u8; 4] = + mem[sp as usize + 4..sp as usize + 8].try_into().unwrap(); + let u = i32::from_le_bytes(u_bytes) as usize; + let c_addr = u32::from_le_bytes(addr_bytes) as usize; + let s = std::str::from_utf8(&mem[c_addr..c_addr + u]) + .unwrap_or("") + .to_string(); + (u, c_addr, s) + }; + let _ = (u, c_addr); + // Pop u and c-addr (2 cells), will push back 1 cell (flag) + dsp.set(&mut caller, Val::I32((sp + 4) as i32)).unwrap(); + + let result = 
parse_forth_float(&s_owned); + + match result { + Some(f) => { + // Push float onto float stack + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let new_fsp = fsp_val - FLOAT_SIZE; + fsp.set(&mut caller, Val::I32(new_fsp as i32)).unwrap(); + let flag_sp = dsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data_mut(&mut caller); + mem[new_fsp as usize..new_fsp as usize + 8] + .copy_from_slice(&f.to_le_bytes()); + mem[flag_sp as usize..flag_sp as usize + 4] + .copy_from_slice(&(-1_i32).to_le_bytes()); + } + None => { + let flag_sp = dsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data_mut(&mut caller); + mem[flag_sp as usize..flag_sp as usize + 4] + .copy_from_slice(&0_i32.to_le_bytes()); + } + } + Ok(()) + }, + ); + self.register_host_primitive(">FLOAT", false, func)?; + } + + Ok(()) + } + + /// Miscellaneous float words: FVARIABLE, FCONSTANT, FVALUE, >FLOAT parsing. + fn register_float_misc(&mut self) -> anyhow::Result<()> { + // FVARIABLE, FCONSTANT, FVALUE are handled in interpret_token_immediate + // as special tokens (like VARIABLE/CONSTANT/VALUE). + + // SF! 
( sf-addr -- ) ( F: r -- ) store as single-precision float (f32) + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let (addr, f32_bytes) = { + let mem = memory.data(&caller); + let addr_bytes: [u8; 4] = + mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr = u32::from_le_bytes(addr_bytes) as usize; + let f_bytes: [u8; 8] = mem[fsp_val as usize..fsp_val as usize + 8] + .try_into() + .unwrap(); + let val = f64::from_le_bytes(f_bytes); + (addr, (val as f32).to_le_bytes()) + }; + dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32)) + .unwrap(); + fsp.set(&mut caller, Val::I32((fsp_val + FLOAT_SIZE) as i32)) + .unwrap(); + let mem = memory.data_mut(&mut caller); + mem[addr..addr + 4].copy_from_slice(&f32_bytes); + Ok(()) + }, + ); + self.register_host_primitive("SF!", false, func)?; + } + + // SF@ ( sf-addr -- ) ( F: -- r ) fetch single-precision float (f32) + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let val = { + let mem = memory.data(&caller); + let addr_bytes: [u8; 4] = + mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr = u32::from_le_bytes(addr_bytes) as usize; + let f32_bytes: [u8; 4] = mem[addr..addr + 4].try_into().unwrap(); + f32::from_le_bytes(f32_bytes) as f64 + }; + dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32)) + .unwrap(); + let new_fsp = fsp_val - FLOAT_SIZE; + fsp.set(&mut caller, Val::I32(new_fsp as i32)).unwrap(); + let mem = memory.data_mut(&mut caller); + mem[new_fsp as usize..new_fsp as usize + 
8].copy_from_slice(&val.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("SF@", false, func)?; + } + + // DF! ( df-addr -- ) ( F: r -- ) same as F! (our floats are already f64) + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let (addr, float_bytes) = { + let mem = memory.data(&caller); + let addr_bytes: [u8; 4] = + mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr = u32::from_le_bytes(addr_bytes) as usize; + let float_bytes: [u8; 8] = mem[fsp_val as usize..fsp_val as usize + 8] + .try_into() + .unwrap(); + (addr, float_bytes) + }; + dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32)) + .unwrap(); + fsp.set(&mut caller, Val::I32((fsp_val + FLOAT_SIZE) as i32)) + .unwrap(); + let mem = memory.data_mut(&mut caller); + mem[addr..addr + 8].copy_from_slice(&float_bytes); + Ok(()) + }, + ); + self.register_host_primitive("DF!", false, func)?; + } + + // DF@ ( df-addr -- ) ( F: -- r ) same as F@ (our floats are already f64) + { + let memory = self.memory; + let dsp = self.dsp; + let fsp = self.fsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let fsp_val = fsp.get(&mut caller).unwrap_i32() as u32; + let val = { + let mem = memory.data(&caller); + let addr_bytes: [u8; 4] = + mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr = u32::from_le_bytes(addr_bytes) as usize; + let float_bytes: [u8; 8] = mem[addr..addr + 8].try_into().unwrap(); + f64::from_le_bytes(float_bytes) + }; + dsp.set(&mut caller, Val::I32((sp + CELL_SIZE) as i32)) + .unwrap(); + let new_fsp = fsp_val - FLOAT_SIZE; + fsp.set(&mut caller, Val::I32(new_fsp as i32)).unwrap(); + let mem = 
memory.data_mut(&mut caller); + mem[new_fsp as usize..new_fsp as usize + 8].copy_from_slice(&val.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("DF@", false, func)?; + } + + // SFALIGNED, DFALIGNED (alignment words for single/double floats) + { + let memory = self.memory; + let dsp = self.dsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let b: [u8; 4] = mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr = u32::from_le_bytes(b); + let aligned = (addr + 3) & !3; // 4-byte alignment for single float + let mem = memory.data_mut(&mut caller); + mem[sp as usize..sp as usize + 4].copy_from_slice(&aligned.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("SFALIGNED", false, func)?; + } + + // DFALIGNED is the same as FALIGNED (8-byte alignment) + { + let memory = self.memory; + let dsp = self.dsp; + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let sp = dsp.get(&mut caller).unwrap_i32() as u32; + let mem = memory.data(&caller); + let b: [u8; 4] = mem[sp as usize..sp as usize + 4].try_into().unwrap(); + let addr = u32::from_le_bytes(b); + let aligned = (addr + 7) & !7; + let mem = memory.data_mut(&mut caller); + mem[sp as usize..sp as usize + 4].copy_from_slice(&aligned.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("DFALIGNED", false, func)?; + } + + // SFALIGN, DFALIGN (align HERE) + // Not commonly needed but let's register stubs + // SFALIGN aligns to 4, DFALIGN aligns to 8 + { + let memory = self.memory; + let here_cell = self.here_cell.clone(); + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let here_val = if let Some(ref cell) = here_cell { + *cell.lock().unwrap() + } else { + let mem = memory.data(&caller); + let b: [u8; 
4] = mem[crate::memory::SYSVAR_HERE as usize + ..crate::memory::SYSVAR_HERE as usize + 4] + .try_into() + .unwrap(); + u32::from_le_bytes(b) + }; + let aligned = (here_val + 3) & !3; + if let Some(ref cell) = here_cell { + *cell.lock().unwrap() = aligned; + } + let mem = memory.data_mut(&mut caller); + mem[crate::memory::SYSVAR_HERE as usize + ..crate::memory::SYSVAR_HERE as usize + 4] + .copy_from_slice(&aligned.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("SFALIGN", false, func)?; + } + + { + let memory = self.memory; + let here_cell = self.here_cell.clone(); + let func = Func::new( + &mut self.store, + FuncType::new(&self.engine, [], []), + move |mut caller, _, _| { + let here_val = if let Some(ref cell) = here_cell { + *cell.lock().unwrap() + } else { + let mem = memory.data(&caller); + let b: [u8; 4] = mem[crate::memory::SYSVAR_HERE as usize + ..crate::memory::SYSVAR_HERE as usize + 4] + .try_into() + .unwrap(); + u32::from_le_bytes(b) + }; + let aligned = (here_val + 7) & !7; + if let Some(ref cell) = here_cell { + *cell.lock().unwrap() = aligned; + } + let mem = memory.data_mut(&mut caller); + mem[crate::memory::SYSVAR_HERE as usize + ..crate::memory::SYSVAR_HERE as usize + 4] + .copy_from_slice(&aligned.to_le_bytes()); + Ok(()) + }, + ); + self.register_host_primitive("DFALIGN", false, func)?; + } + + Ok(()) + } + + /// Allocate a function table slot for an anonymous host function. + /// Returns a `WordId` that can be used in `IrOp::Call`. + /// Does NOT touch the dictionary, so it's safe during colon compilation. 
+    /// Allocate a function table slot for an anonymous host function.
+    ///
+    /// Returns a `WordId` usable in `IrOp::Call`. Does NOT create a dictionary
+    /// entry, so it is safe to call in the middle of a colon compilation.
+    fn install_anon_func(&mut self, func: Func) -> anyhow::Result<WordId> {
+        let idx = self.next_table_index;
+        self.next_table_index += 1;
+        // Also advance the dictionary's fn index counter to stay in sync:
+        // table slots and dictionary fn indices share one numbering space,
+        // so an anonymous slot must still consume a dictionary index.
+        self.dictionary.reserve_fn_index();
+        self.ensure_table_size(idx)?;
+        self.table
+            .set(&mut self.store, idx as u64, Ref::Func(Some(func)))?;
+        Ok(WordId(idx))
+    }
+
+    /// Compile a float literal for use inside a colon definition.
+    /// Creates a tiny host function that pushes the given f64 onto the float
+    /// stack, installs it anonymously, and emits an `IrOp::Call` to it.
+    fn compile_float_literal(&mut self, val: f64) -> anyhow::Result<()> {
+        let memory = self.memory;
+        let fsp = self.fsp;
+        let func = Func::new(
+            &mut self.store,
+            FuncType::new(&self.engine, [], []),
+            move |mut caller, _, _| {
+                // Float stack grows downward: push = decrement fsp, then store.
+                let sp = fsp.get(&mut caller).unwrap_i32() as u32;
+                let new_sp = sp - FLOAT_SIZE;
+                if new_sp < FLOAT_STACK_BASE {
+                    return Err(wasmtime::Error::msg("float stack overflow"));
+                }
+                fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap();
+                let mem = memory.data_mut(&mut caller);
+                mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&val.to_le_bytes());
+                Ok(())
+            },
+        );
+        let word_id = self.install_anon_func(func)?;
+        self.push_ir(IrOp::Call(word_id));
+        Ok(())
+    }
+
+    /// Create a host function that pops from float stack and stores at the given address.
+    /// Used for `TO <fvalue-name>` in compile mode.
+    fn make_fvalue_store(&mut self, pfa: u32) -> anyhow::Result<WordId> {
+        let memory = self.memory;
+        let fsp = self.fsp;
+        let func = Func::new(
+            &mut self.store,
+            FuncType::new(&self.engine, [], []),
+            move |mut caller, _, _| {
+                // Pop TOS from the float stack and write it to the FVALUE's PFA.
+                // NOTE(review): no underflow check against the float stack top
+                // here — popping with an empty float stack reads stale memory;
+                // confirm callers guard via FDEPTH or accept the GIGO behavior.
+                let sp = fsp.get(&mut caller).unwrap_i32() as u32;
+                let mem = memory.data(&caller);
+                let bytes: [u8; 8] = mem[sp as usize..sp as usize + 8].try_into().unwrap();
+                fsp.set(&mut caller, Val::I32((sp + FLOAT_SIZE) as i32))
+                    .unwrap();
+                let mem = memory.data_mut(&mut caller);
+                mem[pfa as usize..pfa as usize + 8].copy_from_slice(&bytes);
+                Ok(())
+            },
+        );
+        self.install_anon_func(func)
+    }
+
+    /// FVARIABLE -- allocate 8 bytes, word pushes address
+    ///
+    /// Parses a name from the input, allocates an 8-byte-aligned cell in user
+    /// memory (initialized to 0.0), and compiles a word that pushes that
+    /// address onto the DATA stack (not the float stack), per the standard.
+    fn define_fvariable(&mut self) -> anyhow::Result<()> {
+        let name = self
+            .next_token()
+            .ok_or_else(|| anyhow::anyhow!("FVARIABLE: expected name"))?;
+
+        let word_id = self
+            .dictionary
+            .create(&name, false)
+            .map_err(|e| anyhow::anyhow!("{e}"))?;
+
+        // Allocate 8 bytes aligned (round HERE up to an 8-byte boundary)
+        self.refresh_user_here();
+        let addr = (self.user_here + 7) & !7;
+        self.user_here = addr + FLOAT_SIZE;
+
+        // Initialize to zero
+        let data = self.memory.data_mut(&mut self.store);
+        data[addr as usize..addr as usize + 8].copy_from_slice(&0.0_f64.to_le_bytes());
+
+        // Compile a word that pushes the address onto the DATA stack.
+        // The address is baked in as an i32 literal at codegen time.
+        let ir_body = vec![IrOp::PushI32(addr as i32)];
+        let config = CodegenConfig {
+            base_fn_index: word_id.0,
+            table_size: self.table_size(),
+        };
+        let compiled = compile_word(&name, &ir_body, &config)
+            .map_err(|e| anyhow::anyhow!("codegen error for FVARIABLE {name}: {e}"))?;
+
+        self.instantiate_and_install(&compiled, word_id)?;
+        self.dictionary.reveal();
+        self.sync_word_lookup(&name, word_id, false);
+        self.next_table_index = self.next_table_index.max(word_id.0 + 1);
+        self.sync_here_cell();
+
+        Ok(())
+    }
+
+    /// FCONSTANT ( F: r -- ) -- create a word that pushes r onto float stack
+    ///
+    /// Pops r at definition time and closes over it in a host function, so the
+    /// constant needs no backing memory cell.
+    fn define_fconstant(&mut self) -> anyhow::Result<()> {
+        let val = self.fpop()?;
+        let name = self
+            .next_token()
+            .ok_or_else(|| anyhow::anyhow!("FCONSTANT: expected name"))?;
+
+        let word_id = self
+            .dictionary
+            .create(&name, false)
+            .map_err(|e| anyhow::anyhow!("{e}"))?;
+
+        // Create a host function that pushes the constant onto float stack
+        let memory = self.memory;
+        let fsp = self.fsp;
+        let func = Func::new(
+            &mut self.store,
+            FuncType::new(&self.engine, [], []),
+            move |mut caller, _, _| {
+                let sp = fsp.get(&mut caller).unwrap_i32() as u32;
+                let new_sp = sp - FLOAT_SIZE;
+                if new_sp < FLOAT_STACK_BASE {
+                    return Err(wasmtime::Error::msg("float stack overflow"));
+                }
+                fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap();
+                let mem = memory.data_mut(&mut caller);
+                mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&val.to_le_bytes());
+                Ok(())
+            },
+        );
+
+        self.ensure_table_size(word_id.0)?;
+        self.table
+            .set(&mut self.store, word_id.0 as u64, Ref::Func(Some(func)))?;
+        self.dictionary.reveal();
+        self.sync_word_lookup(&name, word_id, false);
+        self.next_table_index = self.next_table_index.max(word_id.0 + 1);
+
+        Ok(())
+    }
+
+    /// FVALUE ( F: r -- ) -- create a word that fetches r from storage
+    ///
+    /// Unlike FCONSTANT, the value lives in an allocated memory cell so that
+    /// `TO <name>` can rewrite it later; the xt-to-PFA mapping registered at
+    /// the end is what TO uses to find the cell.
+    fn define_fvalue(&mut self) -> anyhow::Result<()> {
+        let val = self.fpop()?;
+        let name = self
+            .next_token()
+            .ok_or_else(|| anyhow::anyhow!("FVALUE: expected name"))?;
+
+        let word_id = self
+            .dictionary
+            .create(&name, false)
+            .map_err(|e| anyhow::anyhow!("{e}"))?;
+
+        // Allocate 8 bytes aligned for the value's storage
+        self.refresh_user_here();
+        let val_addr = (self.user_here + 7) & !7;
+        self.user_here = val_addr + FLOAT_SIZE;
+
+        // Initialize the storage with the given value
+        let data = self.memory.data_mut(&mut self.store);
+        data[val_addr as usize..val_addr as usize + 8].copy_from_slice(&val.to_le_bytes());
+
+        // Create a host function that fetches from storage and pushes onto float stack
+        let memory = self.memory;
+        let fsp = self.fsp;
+        let func = Func::new(
+            &mut self.store,
+            FuncType::new(&self.engine, [], []),
+            move |mut caller, _, _| {
+                // Read the cell first (immutable borrow), then push.
+                let mem = memory.data(&caller);
+                let bytes: [u8; 8] = mem[val_addr as usize..val_addr as usize + 8]
+                    .try_into()
+                    .unwrap();
+                let sp = fsp.get(&mut caller).unwrap_i32() as u32;
+                let new_sp = sp - FLOAT_SIZE;
+                if new_sp < FLOAT_STACK_BASE {
+                    return Err(wasmtime::Error::msg("float stack overflow"));
+                }
+                fsp.set(&mut caller, Val::I32(new_sp as i32)).unwrap();
+                let mem = memory.data_mut(&mut caller);
+                mem[new_sp as usize..new_sp as usize + 8].copy_from_slice(&bytes);
+                Ok(())
+            },
+        );
+
+        self.ensure_table_size(word_id.0)?;
+        self.table
+            .set(&mut self.store, word_id.0 as u64, Ref::Func(Some(func)))?;
+        self.dictionary.reveal();
+        self.sync_word_lookup(&name, word_id, false);
+        self.next_table_index = self.next_table_index.max(word_id.0 + 1);
+        // Map xt -> PFA for TO
+        self.word_pfa_map.insert(word_id.0, val_addr);
+        self.sync_pfa_map(word_id.0, val_addr);
+        self.fvalue_words.insert(word_id.0);
+        self.sync_here_cell();
+
+        Ok(())
+    }
+}
+
+/// Format a float in engineering notation (exponent is a multiple of 3),
+/// followed by a trailing space, for FE. output.
+///
+/// NOTE(review): when rounding carries over (e.g. 999.9996 at low precision)
+/// the mantissa can print as "1000.000" instead of renormalizing to the next
+/// exponent — confirm whether that is acceptable for FE. output. Likewise
+/// `log10().floor()` may be off by one at exact powers of ten due to f64
+/// rounding; verify against the float test suite.
+fn format_engineering(val: f64, prec: usize) -> String {
+    if val == 0.0 {
+        return format!("0.{:0>width$}E0 ", "", width = prec);
+    }
+    let abs_val = val.abs();
+    let exp = abs_val.log10().floor() as i32;
+    // rem_euclid keeps the result in 0..3 even for negative exponents,
+    // so eng_exp is the largest multiple of 3 <= exp.
+    let eng_exp = exp - exp.rem_euclid(3);
+    let mantissa = val / 10.0_f64.powi(eng_exp);
+    format!("{mantissa:.prec$}E{eng_exp} ")
+}
+
+/// Parse a Forth float format string into f64.
+/// Parse a string in Forth float syntax into an f64.
+///
+/// Accepted forms (case-insensitive, 'D' treated as 'E'): "1E", "1.5E2",
+/// "-3.14E0", "1E+5", "1.5" (point, no exponent). An empty/blank string
+/// yields 0.0 per the Forth-2012 >FLOAT special case. Returns `None` for
+/// anything else (letters other than E/D, multiple E's, embedded spaces,
+/// digitless mantissas).
+///
+/// NOTE(review): plain digit strings without '.' or 'E' (e.g. "123") are
+/// rejected here; Forth-2012 >FLOAT arguably accepts them as 123E0. If this
+/// same parser also gates float-literal recognition in the outer interpreter,
+/// that rejection is what keeps "123" an integer — confirm before changing.
+fn parse_forth_float(s: &str) -> Option<f64> {
+    let s = s.trim();
+    // Empty string or all spaces = 0.0 (Forth 2012 >FLOAT special case)
+    if s.is_empty() {
+        return Some(0.0);
+    }
+    let upper = s.to_ascii_uppercase();
+
+    // Reject anything with letters other than E or D
+    // (also screens out "inf"/"nan", which f64::parse would accept)
+    for c in upper.chars() {
+        if c.is_ascii_alphabetic() && c != 'E' && c != 'D' {
+            return None;
+        }
+    }
+
+    // Replace 'D' with 'E' for Rust parsing ('D' is the double-precision
+    // exponent marker in traditional Forth sources)
+    let normalized = upper.replace('D', "E");
+
+    // Check that there's at least one digit somewhere
+    let has_digit = normalized.chars().any(|c| c.is_ascii_digit());
+    if !has_digit {
+        return None;
+    }
+
+    // Must contain 'E' or a '.' to be a valid float
+    if !normalized.contains('E') {
+        if normalized.contains('.') {
+            return normalized.parse::<f64>().ok();
+        }
+        // Just digits with no E and no dot -- not a valid float for >FLOAT
+        return None;
+    }
+
+    // Must not have multiple E's
+    if normalized.matches('E').count() > 1 {
+        return None;
+    }
+
+    // Must not contain spaces within the number (leading/trailing were trimmed)
+    if normalized.contains(' ') {
+        return None;
+    }
+
+    // Split on E, verify the mantissa part has digits
+    let parts: Vec<&str> = normalized.splitn(2, 'E').collect();
+    let mantissa = parts[0];
+    // Strip sign from mantissa
+    let mantissa_stripped = mantissa.trim_start_matches(['+', '-']);
+    // Must have at least one digit in mantissa (rejects "E5", "-.E2", ...)
+    if !mantissa_stripped.chars().any(|c| c.is_ascii_digit()) {
+        return None;
+    }
+
+    // Trailing E without exponent: "1E" means "1E0" (likewise "1E+"/"1E-")
+    let s = if normalized.ends_with('E') || normalized.ends_with("E+") || normalized.ends_with("E-")
+    {
+        format!("{normalized}0")
+    } else {
+        normalized
+    };
+
+    s.parse::<f64>().ok()
+}
+
+/// REPRESENT helper: convert f64 to digit string.
+/// REPRESENT helper: convert an f64 into its decimal-digit representation.
+///
+/// Returns `(digits, exp, is_negative, is_valid)` where `digits` is a string
+/// of exactly `buf_len` decimal digits and the value is interpreted as
+/// 0.<digits> * 10^exp (the Forth-2012 REPRESENT convention). `is_valid` is
+/// false for NaN, infinity, or a zero-length buffer; the digit buffer is
+/// zero-filled in those cases.
+///
+/// NOTE(review): the exponent comes from `log10().floor()`, which can be off
+/// by one at exact powers of ten due to f64 rounding — confirm against the
+/// compliance tests. The `format!("{:.0}", scaled.round())` double-rounds,
+/// which is harmless but redundant.
+fn represent_float(val: f64, buf_len: usize) -> (String, i32, bool, bool) {
+    if buf_len == 0 {
+        return (String::new(), 0, val.is_sign_negative(), false);
+    }
+    if val.is_nan() {
+        return ("0".repeat(buf_len), 0, false, false);
+    }
+    if val.is_infinite() {
+        return ("0".repeat(buf_len), 0, val < 0.0, false);
+    }
+    let is_negative = val.is_sign_negative();
+    let abs_val = val.abs();
+    if abs_val == 0.0 {
+        // Preserves the sign of -0.0 via is_negative
+        return ("0".repeat(buf_len), 0, is_negative, true);
+    }
+    // exp such that abs_val ~= 0.d1d2... * 10^exp
+    let exp = abs_val.log10().floor() as i32 + 1;
+    // Scale so the first buf_len significant digits land left of the point
+    let scaled = abs_val / 10.0_f64.powi(exp - buf_len as i32);
+    let digits = format!("{:.0}", scaled.round());
+    // Handle carry (e.g. 9.95 with buf_len=2 rounds to "100"): keep the
+    // leading buf_len digits and bump the exponent by one.
+    if digits.len() > buf_len {
+        let truncated = &digits[..buf_len];
+        return (truncated.to_string(), exp + 1, is_negative, true);
+    }
+    // Left-pad with zeros for values whose leading digits round to fewer
+    // characters than buf_len
+    let padded = format!("{digits:0>buf_len$}");
+    (padded, exp, is_negative, true)
}

/// Format a signed 64-bit integer in the given base, followed by a space.
@@ -8137,4 +9826,248 @@ mod tests { let stack = vm.data_stack(); assert_eq!(stack[0], dup_xt); } + + // -- Floating-Point word set tests -- + + fn eval_float_stack(input: &str) -> Vec { + let mut vm = ForthVM::new().unwrap(); + vm.evaluate(input).unwrap(); + vm.float_stack() + } + + #[test] + fn test_float_literal_interpret() { + let fs = eval_float_stack("1E"); + assert_eq!(fs.len(), 1); + assert!((fs[0] - 1.0).abs() < 1e-15); + } + + #[test] + fn test_float_literal_with_exponent() { + let fs = eval_float_stack("1.5E2"); + assert!((fs[0] - 150.0).abs() < 1e-10); + } + + #[test] + fn test_float_add() { + assert_eq!(eval_output("1E 2E F+ F."), "3.000000 "); + } + + #[test] + fn test_float_sub() { + assert_eq!(eval_output("5E 3E F- F."), "2.000000 "); + } + + #[test] + fn test_float_mul() { + assert_eq!(eval_output("3E 4E F* F."), "12.000000 "); + } + + #[test] + fn test_float_div() { + assert_eq!(eval_output("10E 4E F/ F."), "2.500000 "); + } + + #[test] + fn test_float_negate() { + assert_eq!(eval_output("3E FNEGATE F."), "-3.000000 "); + } + + #[test] + fn test_float_abs() { + assert_eq!(eval_output("-5E FABS F."), "5.000000 "); + } + + #[test] + fn test_fdepth() { + assert_eq!(eval_stack("FDEPTH"), vec![0]); + assert_eq!(eval_stack("1E FDEPTH"), vec![1]); + assert_eq!(eval_stack("1E 2E FDEPTH"), vec![2]); + } + + #[test] + fn test_fdrop() { + assert_eq!(eval_stack("1E 2E FDROP FDEPTH"), vec![1]); + } + + #[test] + fn test_fdup() { + assert_eq!(eval_stack("3E FDUP FDEPTH"), vec![2]); + } + + #[test] + fn test_fswap() { + assert_eq!(eval_output("1E 2E FSWAP F. F."), "1.000000 2.000000 "); + } + + #[test] + fn test_fover() { + assert_eq!( + eval_output("1E 2E FOVER F. F. F."), + "1.000000 2.000000 1.000000 " + ); + } + + #[test] + fn test_frot() { + assert_eq!( + eval_output("1E 2E 3E FROT F. F. 
F."), + "1.000000 3.000000 2.000000 " + ); + } + + #[test] + fn test_f0_eq() { + assert_eq!(eval_stack("0E F0="), vec![-1]); + assert_eq!(eval_stack("1E F0="), vec![0]); + } + + #[test] + fn test_f0_lt() { + assert_eq!(eval_stack("-1E F0<"), vec![-1]); + assert_eq!(eval_stack("0E F0<"), vec![0]); + assert_eq!(eval_stack("1E F0<"), vec![0]); + } + + #[test] + fn test_f_eq() { + assert_eq!(eval_stack("1E 1E F="), vec![-1]); + assert_eq!(eval_stack("1E 2E F="), vec![0]); + } + + #[test] + fn test_f_lt() { + assert_eq!(eval_stack("1E 2E F<"), vec![-1]); + assert_eq!(eval_stack("2E 1E F<"), vec![0]); + } + + #[test] + fn test_s_to_f_f_to_s() { + assert_eq!(eval_stack("42 S>F F>S"), vec![42]); + assert_eq!(eval_stack("-7 S>F F>S"), vec![-7]); + } + + #[test] + fn test_d_to_f_f_to_d() { + assert_eq!(eval_stack("1. D>F F>D"), vec![0, 1]); // 1. = lo=1, hi=0 + } + + #[test] + fn test_float_literal_compile_mode() { + assert_eq!(eval_stack(": TEST 3.14E0 F>S ; TEST"), vec![3]); + } + + #[test] + fn test_float_compile_fplus() { + assert_eq!(eval_output(": FTEST 1E 2E F+ ; FTEST F."), "3.000000 "); + } + + #[test] + fn test_fvariable() { + assert_eq!(eval_output("FVARIABLE X 3.14E0 X F! X F@ F."), "3.140000 "); + } + + #[test] + fn test_fconstant() { + assert_eq!(eval_output("3.14E0 FCONSTANT PI PI F."), "3.140000 "); + } + + #[test] + fn test_fvalue_and_to() { + assert_eq!( + eval_output("1E FVALUE V V F. 2E TO V V F."), + "1.000000 2.000000 " + ); + } + + #[test] + fn test_fliteral() { + assert_eq!(eval_output(": FT [ -2E ] FLITERAL F. 
; FT"), "-2.000000 "); + } + + #[test] + fn test_fsqrt() { + assert_eq!(eval_output("4E FSQRT F."), "2.000000 "); + } + + #[test] + fn test_fsin_cos() { + // sin(0) = 0, cos(0) = 1 + assert_eq!(eval_stack("0E FSIN F>S"), vec![0]); + assert_eq!(eval_stack("0E FCOS F>S"), vec![1]); + } + + #[test] + fn test_fexp_fln() { + assert_eq!(eval_stack("0E FEXP F>S"), vec![1]); // e^0 = 1 + assert_eq!(eval_stack("1E FLN F>S"), vec![0]); // ln(1) = 0 + } + + #[test] + fn test_floor_fround() { + assert_eq!(eval_output("1.7E FLOOR F."), "1.000000 "); + assert_eq!(eval_output("-1.3E FLOOR F."), "-2.000000 "); + } + + #[test] + fn test_fpower() { + assert_eq!(eval_output("2E 3E F** F."), "8.000000 "); + } + + #[test] + fn test_fmax_fmin() { + assert_eq!(eval_output("3E 5E FMAX F."), "5.000000 "); + assert_eq!(eval_output("3E 5E FMIN F."), "3.000000 "); + } + + #[test] + fn test_precision() { + assert_eq!(eval_output("3 SET-PRECISION 1E F."), "1.000 "); + } + + #[test] + fn test_f_store_fetch() { + assert_eq!( + eval_output("VARIABLE BUF 2 CELLS ALLOT 42E BUF F! BUF F@ F."), + "42.000000 " + ); + } + + #[test] + fn test_float_plus_floats() { + assert_eq!(eval_stack("0 FLOAT+"), vec![8]); + assert_eq!(eval_stack("3 FLOATS"), vec![24]); + } + + #[test] + fn test_represent() { + // 1E with 5 digits should give "10000" and exponent 1 + let mut vm = ForthVM::new().unwrap(); + vm.evaluate("CREATE FBUF 20 ALLOT").unwrap(); + vm.evaluate("1E FBUF 5 REPRESENT").unwrap(); + let stack = vm.data_stack(); + // Stack should be: exponent=1, sign=0 (not negative), valid=-1 (true) + // Top first: valid, sign, exponent + assert_eq!(stack[0], -1); // valid = true + assert_eq!(stack[1], 0); // not negative + assert_eq!(stack[2], 1); // exponent + } + + #[test] + fn test_to_float() { + // >FLOAT with "1E" should return true and push 1.0 + assert_eq!(eval_stack(r#"S" 1E" >FLOAT"#), vec![-1]); + // >FLOAT with "." should return false + assert_eq!(eval_stack(r#"S" ." 
>FLOAT"#), vec![0]); + } + + #[test] + fn test_f_tilde() { + // Exact comparison: F~ with 0E + assert_eq!(eval_stack("1E 1E 0E F~"), vec![-1]); + assert_eq!(eval_stack("1E 2E 0E F~"), vec![0]); + // Absolute comparison + assert_eq!(eval_stack("1E 1.5E 1E F~"), vec![-1]); // |1-1.5| < 1 + assert_eq!(eval_stack("1E 2.5E 1E F~"), vec![0]); // |1-2.5| = 1.5 >= 1 + } } diff --git a/docs/APPLICATIONS.md b/docs/APPLICATIONS.md new file mode 100644 index 0000000..7a6dca6 --- /dev/null +++ b/docs/APPLICATIONS.md @@ -0,0 +1,890 @@ +# The Unreasonable Effectiveness of Stack Machines + +_How Forth — and WAFER — can serve as infrastructure for data analytics, +databases, AI inference, AI code generation, and AI agent control._ + +--- + +Forth is 55 years old. It has no type system, no garbage collector, no package +manager, no syntax to speak of. By most conventional measures, it shouldn't +still be relevant. + +But it keeps showing up at the edges — in firmware, in space probes, in +real-time systems, in places where correctness and determinism matter more than +developer ergonomics. That's worth paying attention to. + +The properties that make Forth unusual — concatenative composition, zero-cost +abstraction through word definition, a stack-based execution model that maps +directly to hardware — happen to line up surprisingly well with what five of +the most active areas in modern computing are independently reaching for: + +1. **Data analytics** wants composable, streaming pipelines. +2. **Database engines** want stack-based virtual machines for query execution. +3. **AI inference** wants tiny, deterministic, embeddable runtimes. +4. **AI code generation** wants the smallest possible target language. +5. **AI agent systems** want plans that are also executable programs. + +Forth won't single-handedly solve any of these. 
But it offers a useful lens +for understanding what each of them actually needs — and WAFER, a Forth that +compiles to WebAssembly, is in a good position to explore that space. + +WAFER (WebAssembly Forth Engine in Rust) JIT-compiles each Forth word to its +own WASM module, linked through shared linear memory, globals, and a function +table. It runs anywhere WASM runs: browsers, edge devices, servers, embedded +systems. It has 160+ words, 100% Forth 2012 compliance on 10 word sets, and +fits in ~50 KB. It has exception handling (`CATCH`/`THROW`), metaprogramming +(`DOES>`), dynamic compilation (`EVALUATE`), and an optimization pipeline +designed for stack-to-local promotion that can achieve 7x speedups. + +This document explores what becomes possible when you take these properties +seriously. + +--- + +## 1. Data Analytics: Pipelines Without Plumbing + +### The Problem with Pipelines + +Every data analytics framework reinvents the same idea: take data, push it +through a sequence of transformations, collect the result. Pandas chains +methods. Spark builds DAGs. dplyr pipes with `%>%`. Unix pipes bytes through +`|`. They all converge on the same shape: **linear composition of operations +on an implicit data flow**. + +This is exactly what Forth does. It has done it since 1970. The data stack +_is_ the pipeline. Each word _is_ a transformation. Composition is +juxtaposition — you don't pipe, you don't chain, you don't bind. You just +write the words next to each other. + +```forth +\ Pandas: df['amount'].where(df['amount'] > 0).mean() +\ Forth: +: POSITIVE? ( n -- n flag ) DUP 0> ; +: FILTER-POSITIVE ( addr n -- addr' n' ) + 0 >R 0 >R \ count and sum accumulators on return stack + 0 DO + DUP I CELLS + @ + POSITIVE? IF R> + >R R> 1+ >R THEN + LOOP DROP + R> R> \ ( sum count ) +; +: MEAN ( sum count -- avg ) / ; + +data 100 FILTER-POSITIVE MEAN . +``` + +This goes a bit deeper than syntactic sugar. The absence of intermediate +variables is a structural property. 
In a Pandas chain, every `.method()` +returns a new DataFrame object that must be allocated, tracked, and eventually +collected. In Forth, the data flows through the stack with zero allocation. +The pipeline _is_ the execution. + +### Streaming and Incremental Computation + +The stack model is inherently streaming. A word consumes its inputs and +produces its outputs in the same motion. There is no "collect all data first, +then process" step unless you explicitly build one. This makes Forth natural +for: + +- **Event stream processing**: each event lands on the stack, a word + processes it, the result is consumed by the next word. +- **Incremental aggregation**: running sums, counts, and statistics + maintained on the return stack across invocations. +- **Windowed computation**: a circular buffer in linear memory with + stack-based access patterns. + +```forth +\ Running average over a stream of values +VARIABLE running-sum +VARIABLE running-count + +: UPDATE-AVG ( new-value -- running-avg ) + running-sum @ + DUP running-sum ! + running-count @ 1+ DUP running-count ! + / +; + +\ Each incoming value: +42 UPDATE-AVG . \ prints running average after adding 42 +17 UPDATE-AVG . \ prints updated average after adding 17 +``` + +### Client-Side Analytics via WASM + +WAFER compiles to WebAssembly. This means analytics can run _in the browser_ +with no server round-trips. A user uploads a CSV, WAFER parses and processes +it entirely client-side, and the results render immediately. No data leaves +the machine. No API calls. No latency. + +This isn't just a nice demo. For privacy-sensitive analytics (healthcare, +finance, GDPR-regulated data), client-side processing can be a compliance +requirement, not just a nice-to-have. WAFER's deterministic execution (no GC +pauses, no background threads, fixed memory layout) makes it predictable +enough for real-time dashboards. 
+ +### Domain-Specific Languages + +Forth's defining feature is that you build the language up to your problem. +An analytics team doesn't write Forth — they write _their DSL_, which +happens to be implemented in Forth: + +```forth +\ Define a mini analytics vocabulary +: COLUMN ( col# -- addr n ) table-base SWAP col-offset + col-length ; +: SUM ( addr n -- total ) 0 ROT ROT 0 DO OVER I CELLS + @ + LOOP NIP ; +: COUNT ( addr n -- n ) NIP ; +: AVG ( addr n -- avg ) 2DUP SUM -ROT COUNT / ; +: WHERE> ( addr n thresh -- addr' n' ) filter-gt ; + +\ The analyst writes: +3 COLUMN 1000 WHERE> AVG . +\ "Average of column 3 where values exceed 1000" +``` + +The DSL compiles to WASM through WAFER's IR pipeline. There is no +interpreter overhead at query time. The analyst's vocabulary _is_ the +optimized code. + +### A Different Way to Look at It + +Most languages treat the absence of named variables as a limitation. But in +data pipelines, it can actually be a **feature**. Named intermediates create +coupling points — places where code can refer to stale state, where +refactoring requires renaming, where parallelization requires dependency +analysis. Point-free composition through a stack sidesteps this whole class +of problems. The data is always _here_, on top of the stack, ready for the +next transformation. + +--- + +## 2. Database Engine: The Query VM You Already Have + +### Databases Already Think in Stacks + +SQLite — the most deployed database engine in the world — executes queries +through the VDBE (Virtual Database Engine), a stack-based bytecode virtual +machine. When you write `SELECT * FROM users WHERE age > 30`, SQLite's query +planner compiles it into a sequence of stack operations: open cursor, seek, +compare, jump, emit row. + +PostgreSQL's executor runs a tree of plan nodes, each of which pushes tuples +upward. MySQL's handler interface is a stack of operations. 
CockroachDB +compiles SQL to a vectorized execution engine that operates on batches — but +the control flow is still a stack of operators. + +There's a pattern here: **query execution engines tend to converge on +stack machines**. Forth just happens to already be one, with no extra +abstraction layers in between. + +### Query Plans as Forth Programs + +A SQL query plan is a tree. Flattened into execution order, it becomes a +sequence of operations — which is exactly a Forth program: + +```sql +SELECT name, salary FROM employees WHERE dept = 'ENG' AND salary > 100000; +``` + +The query plan, expressed as Forth: + +```forth +\ Primitives provided by the storage engine +\ SCAN ( table -- cursor ) +\ NEXT-ROW ( cursor -- cursor flag ) flag=true if row available +\ COL@ ( cursor col# -- value ) +\ EMIT-ROW ( v1 v2 -- ) send to result set +\ CLOSE ( cursor -- ) + +: MATCH-DEPT? ( cursor -- cursor flag ) DUP 2 COL@ S" ENG" COMPARE 0= ; +: MATCH-SAL? ( cursor -- cursor flag ) DUP 3 COL@ 100000 > ; +: PROJECT ( cursor -- ) DUP 0 COL@ OVER 3 COL@ EMIT-ROW ; + +: QUERY ( -- ) + employees SCAN + BEGIN + NEXT-ROW + WHILE + MATCH-DEPT? IF + MATCH-SAL? IF + PROJECT + THEN + THEN + REPEAT + CLOSE +; +``` + +This isn't just pseudocode, either. Every word here could be a real WAFER +word backed by storage primitives implemented as host functions. The query +compiles through WAFER's IR pipeline to native WASM, with the same +optimization opportunities as any other Forth word: inlining, constant +folding, dead code elimination. + +### EVALUATE as Dynamic Query Compilation + +SQL databases accept queries as strings and compile them at runtime. Forth +has `EVALUATE`, which does exactly the same thing — takes a string and +compiles/executes it: + +```forth +\ Build a query string dynamically +S" employees SCAN BEGIN NEXT-ROW WHILE MATCH-DEPT? IF PROJECT THEN REPEAT CLOSE" +EVALUATE +``` + +The difference from SQL: the "query language" and the "implementation +language" are the same. 
There is no impedance mismatch between the language +the user writes queries in and the language the engine executes them in. A +user-defined function is just another word. An index lookup is just another +word. A join strategy is just another word. They all compose the same way. + +### Linear Memory as Storage Pages + +WAFER's linear memory model maps directly to how databases manage storage. +A database page is a fixed-size block of bytes at a known offset — exactly +what Forth's `@` and `!` operate on. B-tree nodes are structures in linear +memory traversed by pointer arithmetic: + +```forth +\ B-tree node layout: +\ +0: key count (cell) +\ +4: is-leaf flag (cell) +\ +8: keys array (key-count cells) +\ +8+4*key-count: child pointers (key-count+1 cells) + +: NODE-KEYS ( node -- addr ) 8 + ; +: NODE-KEY@ ( node i -- key ) CELLS SWAP NODE-KEYS + @ ; +: NODE-CHILD@ ( node i -- child ) + OVER NODE-KEYS + OVER @ CELLS + \ skip past keys array + SWAP CELLS + 4 + \ index into children + @ +; + +: BTREE-SEARCH ( node target-key -- addr|0 ) + OVER @ 0= IF 2DROP 0 EXIT THEN \ empty node + OVER 4 + @ IF \ leaf node + LEAF-SEARCH + ELSE + INTERNAL-SEARCH \ recurse into child + THEN +; +``` + +### WASM Sandboxing for User-Defined Functions + +Safely executing user-defined functions (UDFs) is one of the trickier +problems in database engines. PostgreSQL UDFs in C can crash the server. +JavaScript UDFs require embedding V8. Python UDFs tend to be slow. + +WAFER UDFs compile to WASM and execute in a sandbox with bounded memory, +bounded execution time, and no access to anything outside the linear memory +they're given. A malicious UDF can't read other users' data, can't make +network calls, can't crash the host. WAFER gets this for free — it's +inherent to WASM's security model. 
+ +```forth +\ User defines a custom scoring function +: SCORE ( age salary -- score ) + 1000 / \ salary contribution (salary/1000) + SWAP 50 - ABS \ age penalty (distance from 50) + - \ final score +; + +\ Engine uses it in a query +: RANKED-QUERY ( -- ) + employees SCAN + BEGIN NEXT-ROW WHILE + DUP 1 COL@ OVER 3 COL@ SCORE + 50 > IF PROJECT THEN + REPEAT CLOSE +; +``` + +The `SCORE` function compiles to a WASM module through WAFER's JIT. It runs +at near-native speed, sandboxed, with no FFI overhead. + +### A Different Way to Look at It + +Database engineers put a lot of effort into building query VMs — designing +bytecode formats, writing interpreters, adding JIT compilation. In a sense, +they're often reinventing something Forth-shaped each time. It's worth asking: +what if you just started with Forth and built the storage layer underneath it? + +--- + +## 3. AI Inference: Neural Networks as Word Composition + +### Layers Are Words, Forward Pass Is Composition + +A neural network's forward pass is a pipeline: input tensor enters, passes +through a sequence of layers (linear transform, activation, normalization), +and a prediction exits. Each layer takes a tensor and produces a tensor. + +In Forth terms: each layer is a word. The tensor sits on the stack. The +forward pass is the composition of those words: + +```forth +\ Assuming tensor operations as primitives (host functions): +\ T-MATMUL ( tensor weights -- tensor ) +\ T-ADD ( tensor bias -- tensor ) +\ T-RELU ( tensor -- tensor ) +\ T-SOFTMAX ( tensor -- tensor ) + +: LINEAR1 ( tensor -- tensor ) w1 T-MATMUL b1 T-ADD ; +: LINEAR2 ( tensor -- tensor ) w2 T-MATMUL b2 T-ADD ; +: LINEAR3 ( tensor -- tensor ) w3 T-MATMUL b3 T-ADD ; + +: CLASSIFIER ( tensor -- tensor ) + LINEAR1 T-RELU + LINEAR2 T-RELU + LINEAR3 T-SOFTMAX +; + +input-data CLASSIFIER \ forward pass +``` + +This maps more directly than you might expect. 
The compositional structure of +neural networks lines up nicely with the compositional structure of Forth +programs. The stack carries the data flow. The words are the layers. The +dictionary holds the model architecture. + +### Quantized Inference on the Integer Stack + +Most production inference runs quantized — INT8 or INT4 weights, integer +arithmetic, no floating point. Forth's native data type is the integer cell. +WAFER's `i32` stack operations map directly to quantized tensor operations: + +```forth +\ INT8 quantized dot product of two vectors +: QDOT ( addr1 addr2 n -- result ) + 0 >R \ accumulator on return stack + 0 DO + OVER I + C@ 127 - \ load and de-bias first element + OVER I + C@ 127 - \ load and de-bias second element + * R> + >R \ multiply-accumulate + LOOP + 2DROP R> +; + +\ Quantized linear layer +: QLINEAR ( input-addr weight-addr rows cols -- output-addr ) + \ For each output neuron, compute QDOT with input + output-buf >R + 0 DO + 2DUP I row-offset + SWAP QDOT + R@ I CELLS + ! + LOOP + 2DROP R> +; +``` + +No framework dependency, no Python interpreter, no CUDA runtime — just +integer arithmetic on a stack, compiled to WASM, running on any device. + +### Edge AI: The 50 KB Runtime + +ML inference frameworks tend to be big. PyTorch is ~500 MB. TensorFlow Lite +is ~1 MB for the runtime alone. ONNX Runtime is ~10 MB. + +WAFER is ~50 KB for the full Forth system. The model weights dominate the +binary size, not the runtime. For edge devices — IoT sensors, wearables, +microcontrollers, browser tabs — that size difference can be the difference +between "fits" and "doesn't fit." + +WASM's portability means the same inference code runs on an ARM +microcontroller, in a browser, on a server, without recompilation. Write the +model once in Forth, deploy everywhere WASM reaches. + +### DOES> for Architecture Generation + +Forth's `DOES>` is a metaprogramming facility: it creates words that create +other words, each with custom runtime behavior. 
This is exactly what neural +architecture construction needs: + +```forth +\ LAYER is a defining word that creates layer words +: LAYER ( weights bias rows cols -- ) + CREATE , , , , \ store dimensions and pointers + DOES> ( tensor -- tensor ) + DUP >R \ save parameter field address + R@ @ R@ 4 + @ \ get cols, rows + R@ 8 + @ \ get weights address + T-MATMUL + R> 12 + @ \ get bias address + T-ADD +; + +\ Define the network architecture +w1 b1 768 512 LAYER EMBED +w2 b2 512 256 LAYER HIDDEN1 +w3 b3 256 10 LAYER OUTPUT + +\ The architecture is now executable +: MODEL ( tensor -- tensor ) EMBED T-RELU HIDDEN1 T-RELU OUTPUT T-SOFTMAX ; +``` + +Each `LAYER` invocation creates a new word with its own weights and +dimensions baked in. The `MODEL` word composes them. This is the same +pattern as `nn.Sequential` in PyTorch — but it compiles to WASM, has zero +framework overhead, and the "architecture definition" and the "executable +model" are the same thing. + +### Automatic Differentiation via Dual Numbers + +Backpropagation is reverse-mode automatic differentiation. There is an +elegant formulation using dual numbers (a value paired with its derivative) +that maps to Forth's double-cell operations: + +```forth +\ A dual number is a pair ( value derivative ) stored as a double cell +\ WAFER's double-cell words (D+, D-, D*, 2DUP, etc.) operate on these natively + +\ Dual addition: (a, a') + (b, b') = (a+b, a'+b') +: D+DUAL ( a a' b b' -- a+b a'+b' ) + ROT + \ a' + b' + >R + R> \ a + b, then restore derivative +; + +\ Dual multiplication: (a, a') * (b, b') = (a*b, a*b' + a'*b) +: D*DUAL ( a a' b b' -- a*b a*b'+a'*b ) + 3 PICK * \ a * b' + >R + ROT * \ a' * b + R> + \ a*b' + a'*b = derivative + >R + * \ a * b = value + R> +; +``` + +The chain rule emerges naturally: composing dual-number operations through a +sequence of words automatically computes the derivative of the whole +pipeline. This is the same principle behind JAX's `jvp` — but expressed as +stack operations. 
+ +### A Different Way to Look at It + +Most of the ML ecosystem's complexity lives in _training_. Inference, by +comparison, is fairly straightforward: load weights, multiply matrices, apply +activations, read output. That's a pipeline of arithmetic operations — which +is pretty much what Forth was designed for. The industry tends to wrap +inference in 500 MB frameworks because training needed those frameworks, and +the two haven't been fully separated. A 50 KB Forth runtime doing quantized +integer operations might be closer to what inference actually needs than we +usually assume. + +--- + +## 4. AI Generating Code: The Smallest Target Language + +### The Token Economy + +When an LLM generates code, every token costs money and adds latency. A +Python solution to "compute the average of a list" looks like: + +```python +def average(numbers): + if not numbers: + return 0 + return sum(numbers) / len(numbers) +``` + +That is 25 tokens. The Forth equivalent: + +```forth +: AVERAGE ( addr n -- avg ) 2DUP SUM -ROT NIP / ; +``` + +That is 12 tokens. For the same semantic content, Forth uses roughly half +the tokens. At scale — millions of API calls, each generating hundreds of +lines — this is a meaningful cost reduction. But the token savings are the +least interesting advantage. + +### Minimal Syntax, Maximal Verifiability + +Forth has essentially no syntax. There are words separated by spaces. There +are numbers. There are a few special constructs (`:` for definitions, `IF` +/`THEN` for conditionals, `DO`/`LOOP` for iteration). That's about it. + +An LLM generating Python must get indentation right, match parentheses and +brackets, handle keyword arguments, manage import statements, respect method +resolution order, and navigate a standard library of thousands of functions. +An LLM generating Forth mostly just needs to get the stack effect right. +That's the main failure mode worth worrying about. 
+ +And stack effects are **mechanically verifiable**: + +```forth +\ Stack effect: ( n1 n2 -- n3 ) +\ Verification: start with 2 items on stack, end with 1 +: ADD-AND-DOUBLE ( n1 n2 -- n3 ) + 2* ; + +\ Test: +3 4 ADD-AND-DOUBLE \ stack should contain: 14 +``` + +You don't need a type checker or static analysis. Just run the word with +known inputs and check the stack. If the stack depth and values match the +declared effect, the word is correct. It's hard to think of another practical +language where verification is this straightforward. + +### Self-Extending Vocabulary + +LLMs struggle with large codebases because context windows are finite. A +Python project with 50 files and 10,000 lines requires the LLM to hold (or +retrieve) vast amounts of context to generate correct code. + +Forth's defining characteristic is that you build the language up to your +problem. The LLM doesn't need to generate a 100-line solution. It generates +5-line words, each building on the previous ones: + +```forth +\ Step 1: LLM generates basic operations +: CLAMP ( n lo hi -- n' ) ROT MIN MAX ; +: BETWEEN? ( n lo hi -- flag ) OVER - >R - R> U< ; + +\ Step 2: LLM generates higher-level operations using step 1 +: NORMALIZE ( n -- n' ) 0 255 CLAMP ; +: IN-RANGE? ( n -- flag ) 0 100 BETWEEN? ; + +\ Step 3: LLM generates application logic using steps 1-2 +: PROCESS-SENSOR ( raw -- calibrated ) + offset @ - \ remove sensor offset + NORMALIZE \ clamp to valid range + scale @ * 1000 / \ apply calibration scale +; +``` + +Each step requires only the _names_ of previously defined words, not their +implementations. The dictionary serves as a compressed representation of the +entire program. An LLM can generate correct code by knowing only the word +names and their stack effects — a few dozen tokens of context instead of +thousands of lines. + +### WASM Sandbox: Safe Execution of Untrusted Code + +AI-generated code generally needs to be executed to be verified. 
Running +arbitrary Python is tricky from a security perspective — file system access, +network calls, `import os`, `eval()`. Sandboxing Python typically requires +containerization, seccomp filters, or virtual machines. + +WAFER compiles to WASM, which executes in a sandbox by construction. A +WAFER program: + +- Cannot access the file system +- Cannot make network calls +- Cannot read memory outside its linear memory +- Cannot execute longer than the host allows (fuel metering) +- Cannot consume more memory than the host allocates + +You can run AI-generated Forth with roughly the same confidence as a pure +mathematical function. The sandbox isn't a bolt-on — it's just how WASM +works. + +```forth +\ AI generates this code. Is it safe to run? Yes, always. +: FIBONACCI ( n -- fib ) + DUP 2 < IF EXIT THEN + DUP 1- RECURSE + SWAP 2 - RECURSE + + +; +``` + +There's nothing this word can do except compute. No side effects, no +escape hatches. The WASM sandbox guarantees that structurally. + +### A Different Way to Look at It + +The conventional wisdom is that LLMs need expressive, high-level languages +to generate useful code. But there's a good case for the opposite: what LLMs +really benefit from are **verifiable** languages — ones where correctness can +be checked cheaply and deterministically. Expressiveness can actually work +against you here: more syntax means more ways to be wrong, more edge cases +to handle, more context to maintain. Forth's extreme minimalism starts to +look less like a limitation and more like an advantage: generate a few small +words, verify each one by running it, compose them into larger programs with +confidence. The language that's hardest for humans to read might just be the +easiest for machines to write correctly. + +--- + +## 5. AI Agent Control: Plans That Execute Themselves + +### The Plan-Program Gap + +When an AI agent "plans," it produces a sequence of steps in natural +language: + +> 1. Search for files matching "*.config" +> 2. 
Read each file and extract the "timeout" field +> 3. If timeout > 30, update it to 30 +> 4. Write the modified files back + +This plan is then "executed" by the agent interpreting each step, calling +tools, handling errors, and managing state — all mediated by the LLM at +every step, consuming tokens and latency for what is fundamentally a +sequential program. + +The gap between "plan" and "program" might be more artificial than it looks. +A plan _is_ a program — we just don't usually give agents a good executable +representation for it. + +Forth could be that representation. + +### Tools as Words + +Every agent tool — file read, web search, code execution, API call — maps +to a Forth word. The agent's toolkit becomes a Forth dictionary: + +```forth +\ Agent tool vocabulary (host functions) +\ SEARCH-FILES ( pattern-addr pattern-len -- results-addr count ) +\ READ-FILE ( path-addr path-len -- content-addr content-len ) +\ WRITE-FILE ( content-addr content-len path-addr path-len -- ) +\ JSON-GET ( json-addr key-addr key-len -- value-addr value-len ) +\ SHELL ( cmd-addr cmd-len -- output-addr output-len ) +\ ASK-USER ( question-addr question-len -- answer-addr answer-len ) +``` + +Now the plan from above becomes an executable program: + +```forth +: UPDATE-TIMEOUTS ( -- ) + S" *.config" SEARCH-FILES \ get matching files + 0 DO \ for each file + DUP I CELLS + @ COUNT \ get filename + 2DUP READ-FILE \ read contents + S" timeout" JSON-GET \ extract timeout field + S>NUMBER DROP \ convert to number + 30 > IF \ if timeout > 30 + 30 SET-TIMEOUT \ update to 30 + WRITE-FILE \ write back + ELSE + 2DROP \ discard unchanged + THEN + LOOP + DROP +; + +UPDATE-TIMEOUTS +``` + +This isn't a description of what to do — it _is_ what to do. The agent +generates it, WAFER compiles it to WASM, and it runs — no LLM in the loop +during execution, no token cost per step, no latency per tool call. + +### Error Handling with CATCH/THROW + +Of course, agent plans fail. Files don't exist. 
APIs return errors. +Permissions get denied. Production agent systems need robust error handling, +which typically means calling the LLM at every step to decide what to do +when something goes wrong. + +WAFER has `CATCH` and `THROW` — structured exception handling that lets +the plan itself define error recovery: + +```forth +: SAFE-READ ( path-addr path-len -- content-addr content-len | 0 0 ) + ['] READ-FILE CATCH IF + 2DROP 0 0 \ file not found: return empty + THEN +; + +: SAFE-UPDATE ( filename-addr filename-len -- ) + 2DUP SAFE-READ \ try to read + DUP 0= IF 2DROP 2DROP EXIT THEN \ skip if file missing + S" timeout" JSON-GET + S>NUMBER DROP + 30 > IF + 30 SET-TIMEOUT + WRITE-FILE + ELSE + 2DROP 2DROP + THEN +; + +: ROBUST-UPDATE-TIMEOUTS ( -- ) + S" *.config" SEARCH-FILES + 0 DO + DUP I CELLS + @ COUNT SAFE-UPDATE + LOOP + DROP +; +``` + +The error handling is part of the plan. The agent generates it once, and it +runs to completion without further LLM intervention. Errors are handled at +the speed of WASM, not the speed of an API call to an LLM. + +### The Dictionary as Growing Capability + +A human Forth programmer builds up vocabulary: small words compose into +larger words, which compose into still larger words. The dictionary grows +with the programmer's understanding of the problem. + +An AI agent does the same thing. Each successfully executed plan leaves +behind defined words that can be reused: + +```forth +\ First task: agent learns to read configs +: READ-CONFIG ( path-addr path-len -- json-addr json-len ) + SAFE-READ DUP 0= IF EXIT THEN JSON-PARSE ; + +\ Second task: agent learns to update configs +: UPDATE-CONFIG ( key-addr key-len value path-addr path-len -- ) + 2DUP READ-CONFIG JSON-SET WRITE-FILE ; + +\ Third task: agent composes previous capabilities +: MIGRATE-CONFIGS ( -- ) + S" *.config" SEARCH-FILES + 0 DO + DUP I CELLS + @ COUNT + S" timeout" 30 ROT ROT UPDATE-CONFIG + LOOP DROP +; + +\ The agent's vocabulary grows with experience. 
+\ MIGRATE-CONFIGS didn't exist before. Now it does. +\ Next time, the agent can use it as a building block. +``` + +You could call this _learned tool use_ — not in the machine learning sense, +but in the software engineering sense. The agent defines new capabilities in +terms of old ones, and the dictionary persists across invocations. Over time, +the agent's vocabulary naturally converges on the abstractions that matter +for its operational domain. + +### REPL as Test-Before-Commit + +Agents that act irreversibly on the first try are risky. WAFER's REPL model +gives agents a natural test-before-commit workflow: + +1. **Define**: Generate and compile the plan as Forth words. +2. **Test**: Run the words against sample data on the stack. +3. **Verify**: Check the stack for expected results. +4. **Execute**: Run the plan for real only after verification passes. + +```forth +\ Step 1: Define +: CALCULATE-DISCOUNT ( price tier -- discounted ) + CASE + 1 OF 10 ENDOF \ tier 1: 10% off + 2 OF 20 ENDOF \ tier 2: 20% off + 3 OF 35 ENDOF \ tier 3: 35% off + 0 SWAP + ENDCASE + 100 SWAP - * 100 / +; + +\ Step 2: Test (no side effects, just stack operations) +1000 1 CALCULATE-DISCOUNT . \ expect 900 +1000 2 CALCULATE-DISCOUNT . \ expect 800 +1000 3 CALCULATE-DISCOUNT . \ expect 650 + +\ Step 3: Verify output matches expectations +\ Step 4: Apply to real data only after tests pass +``` + +The agent can generate, test, and iterate without ever touching production +data. The REPL isn't just a debugging convenience here — it's a safety mechanism +for autonomous agents. + +### Multi-Agent Coordination + +Multiple agents can share a WAFER dictionary through shared linear memory. +One agent defines words. Another agent uses them. 
A coordinator agent +composes them into higher-level plans: + +```forth +\ Agent A defines data retrieval +: FETCH-METRICS ( -- addr n ) metrics-api QUERY PARSE-JSON ; + +\ Agent B defines analysis +: DETECT-ANOMALIES ( addr n -- anomalies-addr n ) + THRESHOLD @ FILTER-ABOVE ; + +\ Agent C defines actions +: ALERT ( anomalies-addr n -- ) + 0 DO DUP I CELLS + @ SEND-ALERT LOOP DROP ; + +\ Coordinator composes them +: MONITOR ( -- ) + BEGIN + FETCH-METRICS DETECT-ANOMALIES + DUP 0> IF ALERT ELSE DROP THEN + 60000 DELAY + AGAIN +; +``` + +Each agent contributes words to a shared vocabulary. The coordinator doesn't +need to understand the implementation of `FETCH-METRICS` or +`DETECT-ANOMALIES` — it only needs to know their stack effects. This is +composability without coupling, coordination without shared state beyond +the dictionary. + +### A Different Way to Look at It + +The AI agent community is building increasingly sophisticated "plan +representations" — DAGs, state machines, behavior trees, ReAct loops — all +trying to bridge the gap between the LLM's natural language output and +actual tool execution. But Forth is already a plan representation that +doubles as an execution engine. It has structured control flow (`IF`/`THEN`, +`DO`/`LOOP`, `BEGIN`/`UNTIL`), error handling (`CATCH`/`THROW`), +composability (word definitions), and a test harness (the REPL and stack). +Maybe the gap between "plan" and "program" doesn't need to be bridged so +much as it needs to be _erased_. 
+ +--- + +## Convergence: Five Problems, One Shape + +These five domains look different on the surface: + +| Domain | Traditional Tool | Core Operation | +| --------------- | ------------------------------ | -------------------- | +| Data analytics | Pandas, Spark | Transform pipeline | +| Database engine | SQLite VDBE, Postgres executor | Query plan execution | +| AI inference | PyTorch, TensorFlow | Layer composition | +| AI codegen | Python, JavaScript | Program synthesis | +| AI agents | LangChain, CrewAI | Plan execution | + +But they share a deep structure: **sequential composition of simple +operations on a data flow**. A data pipeline, a query plan, a forward +pass, a synthesized program, and an agent plan are all the same thing: +a sequence of words applied to a stack. + +Forth noticed this in 1970. Charles Moore designed a language around the +observation that most computation is a pipeline of transformations, and +the simplest way to express pipelines is sequential composition on a +stack. The language has no syntax because pipelines don't need syntax. +It has no type system because the data flow _is_ the type. It has no +package manager because each program builds its own vocabulary from +primitives. + +WAFER brings these ideas to the modern world by targeting WebAssembly — the +universal runtime that runs in browsers, on servers, on edge devices, in +sandboxes. That combination opens up some interesting possibilities: + +- **Analytics in the browser** with no server, no framework, deterministic + execution. +- **Database VMs** that compile queries to native WASM through an existing + Forth JIT. +- **Inference engines** that fit in 50 KB and run on any device WASM + reaches. +- **AI-generated code** in the language with the smallest syntax, cheapest + verification, and safest sandbox. +- **Agent plans** that are executable programs, testable in a REPL, + composable through a growing dictionary. + +None of this requires Forth to change. 
Forth has been this shape for 55 +years. It's kind of fun that the world's problems seem to be circling back +to it. + +--- + +_WAFER is open source. Start at the [repository root](../README.md)._ +_Architecture details: [WAFER.md](WAFER.md). Language introduction: +[FORTH.md](FORTH.md)._