fix: add tokenizer fallback for unknown models and fix test paths

ThomasK33 · ThomasK33 · commit 80ef4f265073 · 2025-10-31T15:13:58.000+01:00
Changes tokenizer to gracefully handle unknown model names by
falling back to a similar model's tokenizer with a warning,
instead of throwing an error. This prevents crashes when new
models are used before tokenizer support is added.

Also fixes runtime tests on macOS by resolving symlinks in temp
paths (/tmp -> /private/tmp) to match git worktree paths.
diff --git a/src/utils/main/tokenizer.ts b/src/utils/main/tokenizer.ts
@@ -35,7 +35,7 @@ const tokenCountCache = new LRUCache<string, number>({
   sizeCalculation: () => 1,
 });
 
-function normalizeModelKey(modelName: string): ModelName {
+function normalizeModelKey(modelName: string): ModelName | null {
   assert(
     typeof modelName === "string" && modelName.length > 0,
     "Model name must be a non-empty string"
@@ -46,11 +46,38 @@ function normalizeModelKey(modelName: string): ModelName {
     override ?? (modelName.includes(":") ? modelName.replace(":", "/") : modelName);
 
   if (!(normalized in models)) {
-    throw new Error(`Invalid model string: ${modelName}`);
+    // Return null for unknown models - caller can decide to fallback or error
+    return null;
   }
   return normalized as ModelName;
 }
 
+/**
+ * Resolves a model string to a ModelName, falling back to a similar model if unknown.
+ * Always logs a warning when falling back, since counts are then only approximate.
+ */
+function resolveModelName(modelString: string): ModelName {
+  const knownModel = normalizeModelKey(modelString);
+  if (knownModel) {
+    return knownModel;
+  }
+
+  // normalizeModelKey accepts both "provider:model" and "provider/model", so the
+  // provider prefix has to be detected with either separator.
+  const provider = modelString.split(/[:/]/)[0];
+
+  // Anthropic models fall back to an Anthropic tokenizer; every other provider
+  // (including unrecognized ones) falls back to the OpenAI tokenizer.
+  const fallbackModel =
+    provider === "anthropic" ? "anthropic/claude-sonnet-4.5" : "openai/gpt-5";
+
+  console.warn(
+    `[tokenizer] Unknown model '${modelString}', using ${fallbackModel} tokenizer for approximate token counting`
+  );
+
+  return fallbackModel as ModelName;
+}
+
 function resolveEncoding(modelName: ModelName): Promise<string> {
   let promise = encodingPromises.get(modelName);
   if (!promise) {
@@ -116,13 +143,17 @@ export function loadTokenizerModules(
   return Promise.allSettled(
     modelsToWarm.map((modelString) => {
       const modelName = normalizeModelKey(modelString);
+      // Skip unknown models during warmup
+      if (!modelName) {
+        return Promise.reject(new Error(`Unknown model: ${modelString}`));
+      }
       return resolveEncoding(modelName);
     })
   );
 }
 
 export async function getTokenizerForModel(modelString: string): Promise<Tokenizer> {
-  const modelName = normalizeModelKey(modelString);
+  const modelName = resolveModelName(modelString);
   const encodingName = await resolveEncoding(modelName);
 
   return {
@@ -132,13 +163,13 @@ export async function getTokenizerForModel(modelString: string): Promise<Tokeniz
 }
 
 export function countTokens(modelString: string, text: string): Promise<number> {
-  const modelName = normalizeModelKey(modelString);
+  const modelName = resolveModelName(modelString);
   return countTokensInternal(modelName, text);
 }
 
 export function countTokensBatch(modelString: string, texts: string[]): Promise<number[]> {
   assert(Array.isArray(texts), "Batch token counting expects an array of strings");
-  const modelName = normalizeModelKey(modelString);
+  const modelName = resolveModelName(modelString);
   return Promise.all(texts.map((text) => countTokensInternal(modelName, text)));
 }
 
diff --git a/tests/runtime/runtime.test.ts b/tests/runtime/runtime.test.ts
@@ -6,6 +6,7 @@
  */
 
 // Jest globals are available automatically - no need to import
+import * as os from "os";
 import * as path from "path";
 import { shouldRunIntegrationTests } from "../testUtils";
 import {
@@ -53,7 +54,8 @@ describeIntegration("Runtime integration tests", () => {
     ({ type }) => {
       // Helper to create runtime for this test type
       // Use a base working directory - TestWorkspace will create subdirectories as needed
-      const getBaseWorkdir = () => (type === "ssh" ? sshConfig!.workdir : "/tmp");
+      // For local runtime, use os.tmpdir() which matches where TestWorkspace creates directories
+      const getBaseWorkdir = () => (type === "ssh" ? sshConfig!.workdir : os.tmpdir());
       const createRuntime = (): Runtime => createTestRuntime(type, getBaseWorkdir(), sshConfig);
 
       describe("exec() - Command execution", () => {
diff --git a/tests/runtime/test-helpers.ts b/tests/runtime/test-helpers.ts
@@ -3,6 +3,7 @@
  */
 
 import * as fs from "fs/promises";
+import { realpathSync } from "fs";
 import * as os from "os";
 import * as path from "path";
 import type { Runtime } from "@/runtime/Runtime";
@@ -25,7 +26,9 @@ export function createTestRuntime(
 ): Runtime {
   switch (type) {
     case "local":
-      return new LocalRuntime(workdir);
+      // Resolve symlinks (e.g., /tmp -> /private/tmp on macOS) to match git worktree paths
+      const resolvedWorkdir = realpathSync(workdir);
+      return new LocalRuntime(resolvedWorkdir);
     case "ssh":
       if (!sshConfig) {
         throw new Error("SSH config required for SSH runtime");
@@ -81,7 +84,9 @@ export class TestWorkspace {
       return new TestWorkspace(runtime, workspacePath, true);
     } else {
       // For local, use temp directory
-      const workspacePath = await fs.mkdtemp(path.join(os.tmpdir(), "runtime-test-"));
+      // Resolve symlinks (e.g., /tmp -> /private/tmp on macOS) to avoid git worktree path mismatches
+      const tempPath = await fs.mkdtemp(path.join(os.tmpdir(), "runtime-test-"));
+      const workspacePath = await fs.realpath(tempPath);
       return new TestWorkspace(runtime, workspacePath, false);
     }
   }