Skip to content

Commit ee0c8cd

Browse files
committed
feat(gpt-bot): Add WASM memory probing for dynamic model selection
- Add probeWasmMemory() to detect browser WASM heap limits via binary search
- Add checkWebGPU() to detect GPU availability for larger models
- Update model selection to use probed limits instead of hardcoded cap
- 1.7B model now selectable when WebGPU available (bypasses WASM heap)
- Update tests to match SmolLM2 configuration
1 parent 5bf5035 commit ee0c8cd

File tree

3 files changed

+168
-37
lines changed

3 files changed

+168
-37
lines changed

demos/chatbot-evolution/js/gpt-bot.js

Lines changed: 115 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,90 @@
44
* Model hierarchy (SmolLM2 family - HuggingFace's browser-optimized models):
55
* 1. SmolLM2-135M-Instruct - Ultra-light, works on any device
66
* 2. SmolLM2-360M-Instruct - Balanced quality/speed (default for 4GB RAM)
7-
* 3. SmolLM2-1.7B-Instruct - Best quality (requires 8GB+ RAM)
7+
* 3. SmolLM2-1.7B-Instruct - Best quality (requires 8GB+ RAM + WebGPU)
88
*
9-
* Auto-selects based on device RAM (navigator.deviceMemory)
9+
* Auto-selects based on device RAM and WASM memory limits.
10+
* When WebGPU is available, larger models become feasible since weights
11+
* go to GPU memory, bypassing WASM heap limits.
1012
*/
1113

1214
export class GPTBot {
15+
// Cached capability detection (computed once)
16+
static _wasmMaxMB = null;
17+
static _webGPUAvailable = null;
18+
19+
/**
20+
* Probe maximum WASM memory available in this browser.
21+
* Uses binary search to find the largest allocatable memory.
22+
* @returns {number} Maximum WASM memory in MB
23+
*/
24+
static probeWasmMemory() {
25+
if (GPTBot._wasmMaxMB !== null) {
26+
return GPTBot._wasmMaxMB;
27+
}
28+
29+
// Binary search for max allocatable WASM pages
30+
// 1 page = 64 KiB, max theoretical = 65536 pages (4GB)
31+
let min = 1;
32+
let max = 65536; // 4GB theoretical max
33+
let best = min;
34+
35+
while (min <= max) {
36+
const mid = Math.floor((min + max) / 2);
37+
try {
38+
// Try to create Memory with this maximum
39+
new WebAssembly.Memory({ initial: 1, maximum: mid });
40+
best = mid;
41+
min = mid + 1;
42+
} catch (e) {
43+
max = mid - 1;
44+
}
45+
}
46+
47+
// Convert pages to MB (1 page = 64 KiB = 0.0625 MB)
48+
GPTBot._wasmMaxMB = Math.floor((best * 64) / 1024);
49+
console.log(`[GPT] Probed WASM memory limit: ${GPTBot._wasmMaxMB}MB (${best} pages)`);
50+
return GPTBot._wasmMaxMB;
51+
}
52+
53+
/**
54+
* Check if WebGPU is available and functional.
55+
* WebGPU allows larger models since weights go to GPU memory.
56+
* @returns {Promise<boolean>}
57+
*/
58+
static async checkWebGPU() {
59+
if (GPTBot._webGPUAvailable !== null) {
60+
return GPTBot._webGPUAvailable;
61+
}
62+
63+
try {
64+
if (!navigator.gpu) {
65+
GPTBot._webGPUAvailable = false;
66+
console.log('[GPT] WebGPU not supported in this browser');
67+
return false;
68+
}
69+
70+
const adapter = await navigator.gpu.requestAdapter();
71+
if (!adapter) {
72+
GPTBot._webGPUAvailable = false;
73+
console.log('[GPT] WebGPU adapter not available');
74+
return false;
75+
}
76+
77+
const limits = adapter.limits;
78+
const maxBufferSize = limits.maxBufferSize || 0;
79+
const maxStorageBufferSize = limits.maxStorageBufferBindingSize || 0;
80+
81+
console.log(`[GPT] WebGPU available - maxBufferSize: ${Math.floor(maxBufferSize / 1024 / 1024)}MB, maxStorageBuffer: ${Math.floor(maxStorageBufferSize / 1024 / 1024)}MB`);
82+
GPTBot._webGPUAvailable = true;
83+
return true;
84+
} catch (e) {
85+
console.log('[GPT] WebGPU check failed:', e.message);
86+
GPTBot._webGPUAvailable = false;
87+
return false;
88+
}
89+
}
90+
1391
constructor() {
1492
this.generator = null;
1593
this.isReady = false;
@@ -19,14 +97,16 @@ export class GPTBot {
1997
this.loadAttempt = 0;
2098

2199
// SmolLM2 family - all have native Transformers.js support (ONNX bundled)
100+
// wasmMinMB: minimum WASM heap needed (weights + runtime overhead)
22101
this.models = [
23102
{
24103
name: 'HuggingFaceTB/SmolLM2-135M-Instruct',
25104
displayName: 'SmolLM2 135M',
26105
dtype: 'q4',
27106
params: '135M',
28107
sizeMB: 85,
29-
minRAM: 2, // Works on 2GB+ devices
108+
wasmMinMB: 300, // 85MB weights + ~200MB runtime
109+
minRAM: 2,
30110
year: 2024,
31111
org: 'HuggingFace'
32112
},
@@ -36,7 +116,8 @@ export class GPTBot {
36116
dtype: 'q4',
37117
params: '360M',
38118
sizeMB: 210,
39-
minRAM: 4, // Recommended for 4GB+ devices
119+
wasmMinMB: 600, // 210MB weights + ~400MB runtime
120+
minRAM: 4,
40121
year: 2024,
41122
org: 'HuggingFace'
42123
},
@@ -46,6 +127,7 @@ export class GPTBot {
46127
dtype: 'q4',
47128
params: '1.7B',
48129
sizeMB: 1410,
130+
wasmMinMB: 2500, // 1410MB weights + ~1GB runtime - exceeds most WASM limits
49131
minRAM: 8,
50132
year: 2024,
51133
org: 'HuggingFace'
@@ -60,23 +142,43 @@ export class GPTBot {
60142
}
61143

62144
/**
63-
* Detect device RAM and select the largest model that fits
64-
* Uses 50% of available RAM as the threshold
145+
* Detect device capabilities and select the best model.
146+
*
147+
* Selection logic:
148+
* 1. Probe WASM memory limit
149+
* 2. Check WebGPU availability (allows larger models)
150+
* 3. Consider device RAM
151+
* 4. Select largest model that fits all constraints
65152
*/
66153
getDefaultModelIndex() {
67154
const deviceRAM = navigator.deviceMemory || 4;
155+
const wasmMaxMB = GPTBot.probeWasmMemory();
156+
157+
// WebGPU check is async, so we optimistically check the cached value
158+
// If WebGPU hasn't been checked yet, assume WASM-only for initial selection
159+
const hasWebGPU = GPTBot._webGPUAvailable === true;
68160

69-
// Cap at 360M (index 1) - 1.7B model exceeds browser WASM memory limits
70-
const maxSafeIndex = 1;
161+
console.log(`[GPT] Capability detection: RAM=${deviceRAM}GB, WASM=${wasmMaxMB}MB, WebGPU=${hasWebGPU}`);
71162

72-
for (let i = Math.min(maxSafeIndex, this.models.length - 1); i >= 0; i--) {
73-
if (deviceRAM >= this.models[i].minRAM) {
74-
console.log(`[GPT] Detected ${deviceRAM}GB RAM, auto-selecting ${this.models[i].displayName}`);
75-
return i;
163+
for (let i = this.models.length - 1; i >= 0; i--) {
164+
const model = this.models[i];
165+
166+
if (deviceRAM < model.minRAM) {
167+
console.log(`[GPT] ${model.displayName}: skipped (needs ${model.minRAM}GB RAM, have ${deviceRAM}GB)`);
168+
continue;
169+
}
170+
171+
// WebGPU bypasses WASM heap limits by loading weights to GPU memory
172+
if (!hasWebGPU && wasmMaxMB < model.wasmMinMB) {
173+
console.log(`[GPT] ${model.displayName}: skipped (needs ${model.wasmMinMB}MB WASM, have ${wasmMaxMB}MB)`);
174+
continue;
76175
}
176+
177+
console.log(`[GPT] Auto-selecting ${model.displayName} (RAM: ${deviceRAM}GB, WASM: ${wasmMaxMB}MB, WebGPU: ${hasWebGPU})`);
178+
return i;
77179
}
78180

79-
console.log(`[GPT] Low RAM (${deviceRAM}GB), using smallest model`);
181+
console.log(`[GPT] Falling back to smallest model (limited resources)`);
80182
return 0;
81183
}
82184

notes/2026-01-04-session-handoff.md

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,39 @@
11
# Session Handoff - January 4, 2026
22

33
**Status:** ✅ All tasks completed
4-
**Completed:** 2026-01-04
4+
**Last Updated:** 2026-01-04 08:15 AM ET
55

6-
## Summary
7-
All three tasks from this handoff were completed:
6+
## Current Session Summary
7+
Implemented WASM memory probing for SmolLM2 model auto-selection in GPT bot.
8+
9+
### Changes Made
10+
1. **Added `GPTBot.probeWasmMemory()`** - Binary search to find max allocatable WASM pages
11+
2. **Added `GPTBot.checkWebGPU()`** - Async check for WebGPU availability with adapter limits
12+
3. **Updated `getDefaultModelIndex()`** - Now uses probed WASM limits + WebGPU detection
13+
4. **Added `wasmMinMB` to model configs** - Minimum WASM heap needed (weights + runtime)
14+
5. **Updated tests** - `test-gpt-bot-loading.mjs` now tests SmolLM2 configuration
15+
16+
### Key Logic
17+
- WASM binary search: tries `WebAssembly.Memory({ initial: 1, maximum: N })` to find max N
18+
- 1 page = 64 KiB, so max pages × 64 / 1024 = max MB
19+
- Model selection: largest model that fits both RAM AND WASM constraints
20+
- WebGPU bypass: when WebGPU available, weights go to GPU memory, relaxing WASM limit
21+
22+
### Files Modified
23+
- `demos/chatbot-evolution/js/gpt-bot.js` - WASM probing, WebGPU detection, model selection
24+
- `tests/test-gpt-bot-loading.mjs` - Updated for SmolLM2 models
25+
26+
### Tests Status
27+
- ✅ All GPT bot tests pass (32/32)
28+
- ✅ All chatbot tests pass (ELIZA, PARRY, ALICE)
29+
30+
### Not Yet Pushed
31+
Run `git status` to see pending changes, then `git push` when ready.
32+
33+
---
34+
35+
## Previous Session Summary (Morning)
36+
All three tasks from previous handoff were completed:
837
1. ✅ Syllabus table fixes (columns, sequential lecture numbering, slide links)
938
2. ✅ Tag filtering removed from demos page
1039
3. ✅ Related Lectures links added to all 15 demos

tests/test-gpt-bot-loading.mjs

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,18 @@ async function testGPTBotStructure() {
5454
);
5555

5656
runner.assert(
57-
botCode.includes('DeepSeek-R1-Distill-Qwen-1.5B'),
58-
'DeepSeek-R1 is configured as primary model'
57+
botCode.includes('SmolLM2-135M-Instruct'),
58+
'SmolLM2 135M is configured as smallest model'
5959
);
6060

6161
runner.assert(
62-
botCode.includes('gemma-3-1b-it'),
63-
'Gemma 3 1B is configured as first fallback'
62+
botCode.includes('SmolLM2-360M-Instruct'),
63+
'SmolLM2 360M is configured as medium model'
6464
);
6565

6666
runner.assert(
67-
botCode.includes('gemma-3-270m-it'),
68-
'Gemma 3 270M is configured as second fallback'
67+
botCode.includes('SmolLM2-1.7B-Instruct'),
68+
'SmolLM2 1.7B is configured as largest model'
6969
);
7070

7171
runner.assert(
@@ -104,8 +104,8 @@ async function testGPTBotStructure() {
104104
);
105105

106106
runner.assert(
107-
botCode.includes('for (let i = 0; i < this.models.length; i++)'),
108-
'Iterates through model fallback chain'
107+
botCode.includes('for (let i = this.selectedModelIndex; i >= 0; i--)'),
108+
'Iterates through model fallback chain (largest to smallest)'
109109
);
110110

111111
runner.assert(
@@ -139,8 +139,8 @@ async function testModelConfigurations() {
139139
runner.assertEqual(modelCount, 3, 'Has exactly 3 models configured');
140140

141141
runner.assert(
142-
modelsSection.includes('onnx-community/'),
143-
'Uses onnx-community models (Transformers.js compatible)'
142+
modelsSection.includes('HuggingFaceTB/'),
143+
'Uses HuggingFaceTB models (Transformers.js compatible)'
144144
);
145145

146146
runner.assert(
@@ -154,8 +154,8 @@ async function testModelConfigurations() {
154154
);
155155

156156
runner.assert(
157-
modelsSection.includes('year: 2025'),
158-
'Models are from 2025 (modern)'
157+
modelsSection.includes('year: 2024'),
158+
'Models are from 2024 (SmolLM2 release)'
159159
);
160160
}
161161

@@ -171,18 +171,18 @@ async function testArchitectureInfo() {
171171
const botCode = readFileSync(botPath, 'utf-8');
172172

173173
runner.assert(
174-
botCode.includes('DeepSeek-R1-Distill-Qwen-1.5B'),
175-
'Has DeepSeek architecture info'
174+
botCode.includes('SmolLM2-135M'),
175+
'Has SmolLM2-135M architecture info'
176176
);
177177

178178
runner.assert(
179-
botCode.includes('Gemma 3 1B IT'),
180-
'Has Gemma 3 1B architecture info'
179+
botCode.includes('SmolLM2-360M'),
180+
'Has SmolLM2-360M architecture info'
181181
);
182182

183183
runner.assert(
184-
botCode.includes('Gemma 3 270M IT'),
185-
'Has Gemma 3 270M architecture info'
184+
botCode.includes('SmolLM2-1.7B'),
185+
'Has SmolLM2-1.7B architecture info'
186186
);
187187

188188
runner.assert(
@@ -191,8 +191,8 @@ async function testArchitectureInfo() {
191191
);
192192

193193
runner.assert(
194-
botCode.includes('Distilled from DeepSeek-R1 reasoning model'),
195-
'Documents DeepSeek distillation origin'
194+
botCode.includes('Optimized for browser/edge deployment'),
195+
'Documents browser optimization'
196196
);
197197

198198
runner.assert(
@@ -212,7 +212,7 @@ async function testResponseHandling() {
212212
const botCode = readFileSync(botPath, 'utf-8');
213213

214214
runner.assert(
215-
botCode.includes('max_new_tokens: 150'),
215+
botCode.includes('max_new_tokens: 256'),
216216
'Limits response length'
217217
);
218218

0 commit comments

Comments (0)