
Commit 057c545

dancergr2m authored and committed
fix(provider/xai): handle inconsistent cached token reporting (#12485)
## background

xAI's token reporting is inconsistent across models. most models report `prompt_tokens`/`input_tokens` inclusive of cached tokens (like OpenAI), but some models (e.g. `grok-4-1-fast-non-reasoning`) report them exclusive of cached tokens, where `cached_tokens > prompt_tokens`.

## summary

- detect which reporting style xAI is using based on whether `cached_tokens <= prompt_tokens`
- when inclusive (normal): subtract cached from prompt to get noCache (OpenAI pattern)
- when exclusive (anomalous): prompt tokens already represent noCache, add cached for total (Anthropic pattern)
- applies to both chat completions and responses APIs
- add unit tests for the non-inclusive reporting edge case
- add responses usage test file

## verification

<details>
<summary>gateway bug case (cached > prompt)</summary>

```
before: total=4142, noCache=-186, cacheRead=4328
after: total=8470, noCache=4142, cacheRead=4328
```

</details>

<details>
<summary>normal case (cached <= prompt)</summary>

```
raw: input_tokens: 12, cached_tokens: 3
sdk: noCache: 9, cacheRead: 3, total: 12
```

</details>

## checklist

- [x] tests have been added / updated (for bug fixes / features)
- [ ] documentation has been added / updated (for bug fixes / features)
- [x] a _patch_ changeset for relevant packages has been added (run `pnpm changeset` in root)
- [x] i have reviewed this pull request (self-review)
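to make the branch logic concrete, here is a minimal standalone sketch of the detection described above (illustrative only; `normalizeInputTokens` is a hypothetical name, the shipped code lives in `convert-xai-chat-usage.ts` and `convert-xai-responses-usage.ts` below):

```ts
// hypothetical helper mirroring the branch logic in this PR
interface XaiInputUsage {
  prompt_tokens: number;
  cached_tokens: number;
}

function normalizeInputTokens({ prompt_tokens, cached_tokens }: XaiInputUsage) {
  // inclusive (OpenAI-style): prompt_tokens already contains the cached tokens
  const inclusive = cached_tokens <= prompt_tokens;
  return {
    total: inclusive ? prompt_tokens : prompt_tokens + cached_tokens,
    noCache: inclusive ? prompt_tokens - cached_tokens : prompt_tokens,
    cacheRead: cached_tokens,
  };
}

// normal case from the verification above:
// { total: 12, noCache: 9, cacheRead: 3 }
console.log(normalizeInputTokens({ prompt_tokens: 12, cached_tokens: 3 }));

// gateway bug case (cached > prompt, exclusive reporting):
// { total: 8470, noCache: 4142, cacheRead: 4328 }
console.log(normalizeInputTokens({ prompt_tokens: 4142, cached_tokens: 4328 }));
```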
1 parent 0b19e76 commit 057c545

9 files changed

Lines changed: 307 additions & 4 deletions


.changeset/stupid-bags-crash.md

Lines changed: 5 additions & 0 deletions
```diff
@@ -0,0 +1,5 @@
+---
+'@ai-sdk/xai': patch
+---
+
+fix(provider/xai): handle inconsistent cached token reporting
```
Lines changed: 44 additions & 0 deletions
```diff
@@ -0,0 +1,44 @@
+import { xai } from '@ai-sdk/xai';
+import { generateText } from 'ai';
+import { run } from '../lib/run';
+
+const models = [
+  'grok-4',
+  'grok-4-1-fast-reasoning',
+  'grok-4-1-fast-non-reasoning',
+  'grok-4-fast-reasoning',
+  'grok-4-fast-non-reasoning',
+  'grok-code-fast-1',
+  'grok-3',
+  'grok-3-fast',
+  'grok-3-mini',
+  'grok-3-mini-fast',
+];
+
+run(async () => {
+  for (const modelId of models) {
+    try {
+      const result = await generateText({
+        model: xai.responses(modelId),
+        prompt: 'Say a single word.',
+      });
+
+      const body = result.response.body as Record<string, any>;
+      const raw = body.usage;
+      const sdk = result.usage;
+
+      console.log(`--- ${modelId} ---`);
+      console.log(
+        `raw: output_tokens=${raw.output_tokens}, reasoning_tokens=${raw.output_tokens_details?.reasoning_tokens ?? 0}, total_tokens=${raw.total_tokens}`,
+      );
+      console.log(
+        `sdk: outputTokens=${sdk.outputTokens}, textTokens=${sdk.outputTokenDetails?.textTokens}, reasoningTokens=${sdk.outputTokenDetails?.reasoningTokens}, totalTokens=${sdk.totalTokens}`,
+      );
+      console.log();
+    } catch (e: any) {
+      console.log(`--- ${modelId} ---`);
+      console.log(`error: ${e.message?.slice(0, 80)}`);
+      console.log();
+    }
+  }
+});
```
Lines changed: 16 additions & 0 deletions
```diff
@@ -0,0 +1,16 @@
+import { xai } from '@ai-sdk/xai';
+import { generateText } from 'ai';
+import { run } from '../lib/run';
+
+run(async () => {
+  const result = await generateText({
+    model: xai.responses('grok-4-1-fast-non-reasoning'),
+    prompt: 'Say a single word.',
+  });
+
+  console.log('text:', result.text);
+  console.log();
+  console.log('raw usage:', JSON.stringify(result.response.body, null, 2));
+  console.log();
+  console.log('sdk usage:', JSON.stringify(result.usage, null, 2));
+});
```
Lines changed: 43 additions & 0 deletions
```diff
@@ -0,0 +1,43 @@
+import { xai } from '@ai-sdk/xai';
+import { streamText } from 'ai';
+import { run } from '../lib/run';
+
+const models = [
+  'grok-4',
+  'grok-4-1-fast-reasoning',
+  'grok-4-1-fast-non-reasoning',
+  'grok-4-fast-reasoning',
+  'grok-4-fast-non-reasoning',
+  'grok-code-fast-1',
+  'grok-3',
+  'grok-3-fast',
+  'grok-3-mini',
+  'grok-3-mini-fast',
+];
+
+run(async () => {
+  for (const modelId of models) {
+    try {
+      const result = streamText({
+        model: xai.responses(modelId),
+        prompt: 'Say a single word.',
+      });
+
+      for await (const textPart of result.textStream) {
+        void textPart;
+      }
+
+      const sdk = await result.usage;
+
+      console.log(`--- ${modelId} ---`);
+      console.log(
+        `sdk: outputTokens=${sdk.outputTokens}, textTokens=${sdk.outputTokenDetails?.textTokens}, reasoningTokens=${sdk.outputTokenDetails?.reasoningTokens}, totalTokens=${sdk.totalTokens}`,
+      );
+      console.log();
+    } catch (e: any) {
+      console.log(`--- ${modelId} ---`);
+      console.log(`error: ${e.message?.slice(0, 80)}`);
+      console.log();
+    }
+  }
+});
```
Lines changed: 18 additions & 0 deletions
```diff
@@ -0,0 +1,18 @@
+import { xai } from '@ai-sdk/xai';
+import { streamText } from 'ai';
+import { run } from '../lib/run';
+
+run(async () => {
+  const result = streamText({
+    model: xai.responses('grok-3-mini'),
+    prompt: 'Say a single word.',
+  });
+
+  for await (const textPart of result.textStream) {
+    process.stdout.write(textPart);
+  }
+
+  console.log();
+  console.log();
+  console.log('sdk usage:', JSON.stringify(await result.usage, null, 2));
+});
```

packages/xai/src/convert-xai-chat-usage.test.ts

Lines changed: 20 additions & 0 deletions
```diff
@@ -113,6 +113,26 @@ describe('convertXaiChatUsage', () => {
     `);
   });
 
+  it('should handle cached_tokens exceeding prompt_tokens (non-inclusive reporting)', () => {
+    const result = convertXaiChatUsage({
+      prompt_tokens: 4142,
+      completion_tokens: 254,
+      total_tokens: 8724,
+      prompt_tokens_details: {
+        cached_tokens: 4328,
+      },
+    });
+
+    expect(result.inputTokens).toMatchInlineSnapshot(`
+      {
+        "cacheRead": 4328,
+        "cacheWrite": undefined,
+        "noCache": 4142,
+        "total": 8470,
+      }
+    `);
+  });
+
   it('should handle null token details', () => {
     const result = convertXaiChatUsage({
       prompt_tokens: 100,
```

packages/xai/src/convert-xai-chat-usage.ts

Lines changed: 8 additions & 2 deletions
```diff
@@ -6,10 +6,16 @@ export function convertXaiChatUsage(usage: XaiChatUsage): LanguageModelV3Usage {
   const reasoningTokens =
     usage.completion_tokens_details?.reasoning_tokens ?? 0;
 
+  const promptTokensIncludesCached = cacheReadTokens <= usage.prompt_tokens;
+
   return {
     inputTokens: {
-      total: usage.prompt_tokens,
-      noCache: usage.prompt_tokens - cacheReadTokens,
+      total: promptTokensIncludesCached
+        ? usage.prompt_tokens
+        : usage.prompt_tokens + cacheReadTokens,
+      noCache: promptTokensIncludesCached
+        ? usage.prompt_tokens - cacheReadTokens
+        : usage.prompt_tokens,
       cacheRead: cacheReadTokens,
       cacheWrite: undefined,
     },
```
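a quick usage sketch of the updated converter covering both reporting styles (the inputs are hypothetical; field names follow the diff and tests above):

```ts
import { convertXaiChatUsage } from './convert-xai-chat-usage';

// inclusive reporting (most models): cached <= prompt,
// so noCache = 12 - 3 = 9 and total stays 12
const inclusive = convertXaiChatUsage({
  prompt_tokens: 12,
  completion_tokens: 5,
  total_tokens: 17,
  prompt_tokens_details: { cached_tokens: 3 },
});

// exclusive reporting (e.g. grok-4-1-fast-non-reasoning): cached > prompt,
// so noCache = 4142 and total = 4142 + 4328 = 8470
const exclusive = convertXaiChatUsage({
  prompt_tokens: 4142,
  completion_tokens: 254,
  total_tokens: 8724,
  prompt_tokens_details: { cached_tokens: 4328 },
});
```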
packages/xai/src/responses/convert-xai-responses-usage.test.ts

Lines changed: 145 additions & 0 deletions
```diff
@@ -0,0 +1,145 @@
+import { convertXaiResponsesUsage } from './convert-xai-responses-usage';
+import { describe, it, expect } from 'vitest';
+
+describe('convertXaiResponsesUsage', () => {
+  it('should convert basic usage without caching or reasoning', () => {
+    const result = convertXaiResponsesUsage({
+      input_tokens: 100,
+      output_tokens: 50,
+    });
+
+    expect(result).toMatchInlineSnapshot(`
+      {
+        "inputTokens": {
+          "cacheRead": 0,
+          "cacheWrite": undefined,
+          "noCache": 100,
+          "total": 100,
+        },
+        "outputTokens": {
+          "reasoning": 0,
+          "text": 50,
+          "total": 50,
+        },
+        "raw": {
+          "input_tokens": 100,
+          "output_tokens": 50,
+        },
+      }
+    `);
+  });
+
+  it('should convert usage with reasoning tokens', () => {
+    const result = convertXaiResponsesUsage({
+      input_tokens: 1941,
+      output_tokens: 583,
+      total_tokens: 2524,
+      output_tokens_details: {
+        reasoning_tokens: 380,
+      },
+    });
+
+    expect(result.outputTokens).toMatchInlineSnapshot(`
+      {
+        "reasoning": 380,
+        "text": 203,
+        "total": 583,
+      }
+    `);
+  });
+
+  it('should convert usage with cached input tokens', () => {
+    const result = convertXaiResponsesUsage({
+      input_tokens: 200,
+      output_tokens: 50,
+      input_tokens_details: {
+        cached_tokens: 150,
+      },
+    });
+
+    expect(result.inputTokens).toMatchInlineSnapshot(`
+      {
+        "cacheRead": 150,
+        "cacheWrite": undefined,
+        "noCache": 50,
+        "total": 200,
+      }
+    `);
+  });
+
+  it('should handle cached_tokens exceeding input_tokens (non-inclusive reporting)', () => {
+    const result = convertXaiResponsesUsage({
+      input_tokens: 4142,
+      output_tokens: 254,
+      input_tokens_details: {
+        cached_tokens: 4328,
+      },
+    });
+
+    expect(result.inputTokens).toMatchInlineSnapshot(`
+      {
+        "cacheRead": 4328,
+        "cacheWrite": undefined,
+        "noCache": 4142,
+        "total": 8470,
+      }
+    `);
+  });
+
+  it('should convert usage with both cached input and reasoning', () => {
+    const result = convertXaiResponsesUsage({
+      input_tokens: 200,
+      output_tokens: 583,
+      input_tokens_details: {
+        cached_tokens: 150,
+      },
+      output_tokens_details: {
+        reasoning_tokens: 380,
+      },
+    });
+
+    expect(result).toMatchInlineSnapshot(`
+      {
+        "inputTokens": {
+          "cacheRead": 150,
+          "cacheWrite": undefined,
+          "noCache": 50,
+          "total": 200,
+        },
+        "outputTokens": {
+          "reasoning": 380,
+          "text": 203,
+          "total": 583,
+        },
+        "raw": {
+          "input_tokens": 200,
+          "input_tokens_details": {
+            "cached_tokens": 150,
+          },
+          "output_tokens": 583,
+          "output_tokens_details": {
+            "reasoning_tokens": 380,
+          },
+        },
+      }
+    `);
+  });
+
+  it('should preserve raw usage data', () => {
+    const rawUsage = {
+      input_tokens: 12,
+      output_tokens: 319,
+      total_tokens: 331,
+      input_tokens_details: {
+        cached_tokens: 2,
+      },
+      output_tokens_details: {
+        reasoning_tokens: 317,
+      },
+    };
+
+    const result = convertXaiResponsesUsage(rawUsage);
+
+    expect(result.raw).toEqual(rawUsage);
+  });
+});
```

packages/xai/src/responses/convert-xai-responses-usage.ts

Lines changed: 8 additions & 2 deletions
```diff
@@ -7,10 +7,16 @@ export function convertXaiResponsesUsage(
   const cacheReadTokens = usage.input_tokens_details?.cached_tokens ?? 0;
   const reasoningTokens = usage.output_tokens_details?.reasoning_tokens ?? 0;
 
+  const inputTokensIncludesCached = cacheReadTokens <= usage.input_tokens;
+
   return {
     inputTokens: {
-      total: usage.input_tokens,
-      noCache: usage.input_tokens - cacheReadTokens,
+      total: inputTokensIncludesCached
+        ? usage.input_tokens
+        : usage.input_tokens + cacheReadTokens,
+      noCache: inputTokensIncludesCached
+        ? usage.input_tokens - cacheReadTokens
+        : usage.input_tokens,
       cacheRead: cacheReadTokens,
       cacheWrite: undefined,
     },
```
