
Commit 902306f

fix(gpt-bot): Use onnx-community models for Transformers.js compatibility
- Replace HuggingFaceTB/SmolLM2-xxx-Instruct with onnx-community models
- SmolLM2 135M/360M ONNX versions work correctly with Transformers.js v3
- Add Qwen2.5-0.5B-Instruct as largest model option (replaces 1.7B)
- Use q4f16 quantization for optimal browser performance
- Update HTML to reflect new model specs and links
- Update tests for new model configuration

The original HuggingFaceTB models have ONNX files but aren't properly
structured for Transformers.js browser compatibility, causing load failures.
1 parent ee0c8cd commit 902306f
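For reference, loading one of these onnx-community models in the browser with Transformers.js v3 looks roughly like the sketch below. This is a minimal illustration, not the demo's actual code; the prompt, generation options, and progress logging are placeholders.

import { pipeline } from '@huggingface/transformers';

// Smallest model from this commit, with the q4f16 quantization it
// configures (~111 MB download). progress_callback is optional.
const generator = await pipeline(
  'text-generation',
  'onnx-community/SmolLM2-135M-Instruct-ONNX',
  { dtype: 'q4f16', progress_callback: (p) => console.log(p.status) }
);

// Instruct models expect chat-formatted messages.
const messages = [{ role: 'user', content: 'Say hello in five words.' }];
const output = await generator(messages, { max_new_tokens: 32 });

// With message input, generated_text is the full message list;
// the last entry is the assistant's reply.
console.log(output[0].generated_text.at(-1).content);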

3 files changed: +54 -54 lines changed

demos/chatbot-evolution/index.html

Lines changed: 16 additions & 16 deletions
@@ -455,11 +455,11 @@ <h2>2020s: GPT & Transformers</h2>
 
 <div class="chatbot-info">
   <div class="info-card">
-    <h3>About SmolLM2</h3>
-    <p><strong>Creator:</strong> HuggingFace (2024)</p>
+    <h3>About Modern LLMs</h3>
+    <p><strong>Models:</strong> SmolLM2 (135M, 360M) & Qwen2.5 (0.5B)</p>
     <p><strong>Method:</strong> Decoder-only transformer with chat templates</p>
-    <p><strong>Models:</strong> 135M, 360M, 1.7B parameters</p>
     <p><strong>Innovation:</strong> Optimized for browser/edge deployment</p>
+    <p><strong>Source:</strong> onnx-community (Transformers.js compatible)</p>
   </div>
 
   <div class="info-card">
@@ -522,7 +522,7 @@ <h4>Decoder-Only Transformer (SmolLM2)</h4>
     <div class="sub-block">Grouped-Query Attention</div>
     <div class="sub-block">SwiGLU FFN</div>
     <div class="sub-block">RMSNorm</div>
-    <div class="block-note">x9 (135M) / x16 (360M) / x24 (1.7B) layers</div>
+    <div class="block-note">x9 (135M) / x16 (360M) / x24 (Qwen 0.5B) layers</div>
   </div>
 </div>
 <div class="arch-arrow">&#8595;</div>
@@ -556,21 +556,21 @@ <h5>Quantization (q4)</h5>
 </div>
 
 <div class="model-specs">
-  <h4>SmolLM2 Model Family</h4>
+  <h4>Available Models</h4>
   <table class="specs-table">
-    <tr><th>Spec</th><th>135M</th><th>360M</th><th>1.7B</th></tr>
-    <tr><td>Parameters</td><td>135 Million</td><td>360 Million</td><td>1.7 Billion</td></tr>
+    <tr><th>Spec</th><th>SmolLM2 135M</th><th>SmolLM2 360M</th><th>Qwen2.5 0.5B</th></tr>
+    <tr><td>Parameters</td><td>135 Million</td><td>360 Million</td><td>494 Million</td></tr>
     <tr><td>Layers</td><td>9</td><td>16</td><td>24</td></tr>
-    <tr><td>Hidden Size</td><td>576</td><td>960</td><td>2048</td></tr>
-    <tr><td>Download (q4)</td><td>~85 MB</td><td>~210 MB</td><td>~1.4 GB</td></tr>
-    <tr><td>Min RAM</td><td>2 GB</td><td>4 GB</td><td>8 GB</td></tr>
-    <tr><td>Context Length</td><td colspan="3">8,192 tokens</td></tr>
+    <tr><td>Hidden Size</td><td>576</td><td>960</td><td>896</td></tr>
+    <tr><td>Download (q4f16)</td><td>~111 MB</td><td>~259 MB</td><td>~460 MB</td></tr>
+    <tr><td>Min RAM</td><td>2 GB</td><td>4 GB</td><td>4 GB</td></tr>
+    <tr><td>Context Length</td><td colspan="2">8,192 tokens</td><td>32,768 tokens</td></tr>
   </table>
   <div class="model-note">
     <strong>Model Links:</strong>
-    <a href="https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct" target="_blank">135M</a> |
-    <a href="https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct" target="_blank">360M</a> |
-    <a href="https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct" target="_blank">1.7B</a>
+    <a href="https://huggingface.co/onnx-community/SmolLM2-135M-Instruct-ONNX" target="_blank">SmolLM2 135M</a> |
+    <a href="https://huggingface.co/onnx-community/SmolLM2-360M-Instruct-ONNX" target="_blank">SmolLM2 360M</a> |
+    <a href="https://huggingface.co/onnx-community/Qwen2.5-0.5B-Instruct" target="_blank">Qwen2.5 0.5B</a>
   </div>
 </div>
 </div>
@@ -639,8 +639,8 @@ <h3>Evolution Stats</h3>
   <span class="stat-value">90M params</span>
 </div>
 <div class="stat-item">
-  <span class="stat-label">SmolLM2 (2024)</span>
-  <span class="stat-value">135M-1.7B params</span>
+  <span class="stat-label">SmolLM2/Qwen (2024)</span>
+  <span class="stat-value">135M-500M params</span>
 </div>
 <div class="stat-item">
 <span class="stat-label">Claude 4 Opus (2025)</span>

demos/chatbot-evolution/js/gpt-bot.js

Lines changed: 26 additions & 26 deletions
@@ -1,14 +1,16 @@
 /**
  * GPT-style Bot (2020s) - Modern LLM via Transformers.js v3
  *
- * Model hierarchy (SmolLM2 family - HuggingFace's browser-optimized models):
- * 1. SmolLM2-135M-Instruct - Ultra-light, works on any device
- * 2. SmolLM2-360M-Instruct - Balanced quality/speed (default for 4GB RAM)
- * 3. SmolLM2-1.7B-Instruct - Best quality (requires 8GB+ RAM + WebGPU)
+ * Model hierarchy (using onnx-community models for Transformers.js compatibility):
+ * 1. SmolLM2-135M-Instruct-ONNX - Ultra-light (~110MB), works on any device
+ * 2. SmolLM2-360M-Instruct-ONNX - Balanced quality/speed (~260MB)
+ * 3. Qwen2.5-0.5B-Instruct - Best quality (~460MB, requires 4GB+ RAM)
+ *
+ * IMPORTANT: Uses onnx-community models which are specifically exported and
+ * tested for Transformers.js browser compatibility. The original HuggingFaceTB
+ * models may have ONNX files but aren't guaranteed to work with Transformers.js.
  *
  * Auto-selects based on device RAM and WASM memory limits.
- * When WebGPU is available, larger models become feasible since weights
- * go to GPU memory, bypassing WASM heap limits.
  */
 
 export class GPTBot {
@@ -96,41 +98,39 @@ export class GPTBot {
     this.onProgress = null;
     this.loadAttempt = 0;
 
-    // SmolLM2 family - all have native Transformers.js support (ONNX bundled)
-    // wasmMinMB: minimum WASM heap needed (weights + runtime overhead)
     this.models = [
       {
-        name: 'HuggingFaceTB/SmolLM2-135M-Instruct',
+        name: 'onnx-community/SmolLM2-135M-Instruct-ONNX',
         displayName: 'SmolLM2 135M',
-        dtype: 'q4',
+        dtype: 'q4f16',
         params: '135M',
-        sizeMB: 85,
-        wasmMinMB: 300, // 85MB weights + ~200MB runtime
+        sizeMB: 111,
+        wasmMinMB: 400,
         minRAM: 2,
         year: 2024,
         org: 'HuggingFace'
       },
       {
-        name: 'HuggingFaceTB/SmolLM2-360M-Instruct',
+        name: 'onnx-community/SmolLM2-360M-Instruct-ONNX',
         displayName: 'SmolLM2 360M',
-        dtype: 'q4',
+        dtype: 'q4f16',
         params: '360M',
-        sizeMB: 210,
-        wasmMinMB: 600, // 210MB weights + ~400MB runtime
+        sizeMB: 259,
+        wasmMinMB: 700,
         minRAM: 4,
         year: 2024,
         org: 'HuggingFace'
       },
       {
-        name: 'HuggingFaceTB/SmolLM2-1.7B-Instruct',
-        displayName: 'SmolLM2 1.7B',
-        dtype: 'q4',
-        params: '1.7B',
-        sizeMB: 1410,
-        wasmMinMB: 2500, // 1410MB weights + ~1GB runtime - exceeds most WASM limits
-        minRAM: 8,
+        name: 'onnx-community/Qwen2.5-0.5B-Instruct',
+        displayName: 'Qwen2.5 0.5B',
+        dtype: 'q4f16',
+        params: '0.5B',
+        sizeMB: 460,
+        wasmMinMB: 1200,
+        minRAM: 4,
         year: 2024,
-        org: 'HuggingFace'
+        org: 'Alibaba'
       }
     ];
 
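The doc comment above says the bot auto-selects a model from device RAM and WASM heap limits, but the selection code itself is outside this diff. A sketch of what such logic could look like against these model entries (hypothetical helper; the real gpt-bot.js may differ):

// Hypothetical sketch only - not the actual selection code in gpt-bot.js.
// Picks the largest model whose requirements fit the device.
function pickModel(models, wasmHeapLimitMB = 2048) {
  // navigator.deviceMemory reports approximate RAM in GB (Chromium-only,
  // capped at 8); fall back to a conservative guess elsewhere.
  const ramGB = navigator.deviceMemory ?? 4;
  return [...models].reverse().find(
    (m) => ramGB >= m.minRAM && m.wasmMinMB <= wasmHeapLimitMB
  ) ?? models[0]; // smallest model as last resort
}

// e.g. pickModel(this.models) on a 4 GB device with a 2 GB WASM heap would
// select Qwen2.5 0.5B (minRAM 4, wasmMinMB 1200 both fit).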

@@ -489,11 +489,11 @@ export class GPTBot {
     const specs = {
       'SmolLM2-135M': { layers: 9, hiddenSize: 576, attentionHeads: 9 },
       'SmolLM2-360M': { layers: 16, hiddenSize: 960, attentionHeads: 15 },
-      'SmolLM2-1.7B': { layers: 24, hiddenSize: 2048, attentionHeads: 32 }
+      'Qwen2.5-0.5B': { layers: 24, hiddenSize: 896, attentionHeads: 14 }
     };
 
     const modelKey = model.name.includes('135M') ? 'SmolLM2-135M' :
-      model.name.includes('360M') ? 'SmolLM2-360M' : 'SmolLM2-1.7B';
+      model.name.includes('360M') ? 'SmolLM2-360M' : 'Qwen2.5-0.5B';
     const spec = specs[modelKey];
 
     return {
tests/test-gpt-bot-loading.mjs

Lines changed: 12 additions & 12 deletions
@@ -54,18 +54,18 @@ async function testGPTBotStructure() {
   );
 
   runner.assert(
-    botCode.includes('SmolLM2-135M-Instruct'),
-    'SmolLM2 135M is configured as smallest model'
+    botCode.includes('SmolLM2-135M-Instruct-ONNX'),
+    'SmolLM2 135M ONNX is configured as smallest model'
   );
 
   runner.assert(
-    botCode.includes('SmolLM2-360M-Instruct'),
-    'SmolLM2 360M is configured as medium model'
+    botCode.includes('SmolLM2-360M-Instruct-ONNX'),
+    'SmolLM2 360M ONNX is configured as medium model'
   );
 
   runner.assert(
-    botCode.includes('SmolLM2-1.7B-Instruct'),
-    'SmolLM2 1.7B is configured as largest model'
+    botCode.includes('Qwen2.5-0.5B-Instruct'),
+    'Qwen2.5 0.5B is configured as largest model'
   );
 
   runner.assert(
@@ -74,8 +74,8 @@ async function testGPTBotStructure() {
   );
 
   runner.assert(
-    botCode.includes('dtype: \'q4\''),
-    'Uses q4 quantization for smaller model size'
+    botCode.includes('dtype: \'q4f16\''),
+    'Uses q4f16 quantization for smaller model size'
   );
 
   runner.assert(
@@ -139,8 +139,8 @@ async function testModelConfigurations() {
   runner.assertEqual(modelCount, 3, 'Has exactly 3 models configured');
 
   runner.assert(
-    modelsSection.includes('HuggingFaceTB/'),
-    'Uses HuggingFaceTB models (Transformers.js compatible)'
+    modelsSection.includes('onnx-community/'),
+    'Uses onnx-community models (Transformers.js compatible)'
   );
 
   runner.assert(
@@ -181,8 +181,8 @@ async function testArchitectureInfo() {
   );
 
   runner.assert(
-    botCode.includes('SmolLM2-1.7B'),
-    'Has SmolLM2-1.7B architecture info'
+    botCode.includes('Qwen2.5-0.5B'),
+    'Has Qwen2.5-0.5B architecture info'
  );
 
   runner.assert(
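Note these are structural tests: they read the bot source as text and assert on substrings rather than actually loading models. Roughly (a sketch; the runner and file-reading setup live elsewhere in the test file, outside this diff):

// Sketch of the pattern these assertions follow; 'runner' is the test
// harness used above, assumed here rather than defined.
import { readFile } from 'node:fs/promises';

const botCode = await readFile('demos/chatbot-evolution/js/gpt-bot.js', 'utf8');
runner.assert(
  botCode.includes("dtype: 'q4f16'"),
  'Uses q4f16 quantization for smaller model size'
);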
