feat(transcription): add pruna/whisper-v3-large transcription model (#51)

TimPietruskyRunPod · web-flow · commit cf0c97698c01 · 2026-01-21T11:12:17.000+01:00
* feat(transcription): add Whisper transcription model support

- Add `transcriptionModel()` and `transcription()` methods to the provider
- Support audio transcription via RunPod's pruna/whisper-v3-large endpoint
- Accept audio as Uint8Array, base64 string, or URL via providerOptions
- Return transcription text, segments with timing, detected language, and duration
- Add unit tests for the transcription model
- Update README with transcription documentation

* docs: use real demo audio URL in README examples

* fix: resolve lint errors in transcription model
diff --git a/.changeset/add-transcription-model.md b/.changeset/add-transcription-model.md
@@ -0,0 +1,10 @@
+---
+"@runpod/ai-sdk-provider": minor
+---
+
+Add transcription model support with `pruna/whisper-v3-large`
+
+- Add `transcriptionModel()` and `transcription()` methods to the provider
+- Support audio transcription via RunPod's Whisper endpoint
+- Accept audio as `Uint8Array` or base64 string, or as a URL via `providerOptions.runpod.audio`
+- Return transcription text, segments with timing, detected language, and duration
diff --git a/README.md b/README.md
@@ -565,6 +565,94 @@ const result = await generateSpeech({
 });
 ```
 
+## Transcription Models
+
+Transcribe audio using the AI SDK's `experimental_transcribe` and `runpod.transcription(...)`:
+
+```ts
+import { runpod } from '@runpod/ai-sdk-provider';
+import { experimental_transcribe as transcribe } from 'ai';
+
+const result = await transcribe({
+  model: runpod.transcription('pruna/whisper-v3-large'),
+  audio: new URL('https://image.runpod.ai/demo/transcription-demo.wav'),
+});
+
+console.log(result.text);
+```
+
+**Returns:**
+
+- `result.text` - Full transcription text
+- `result.segments` - Array of segments with timing info
+  - `segment.text` - Segment text
+  - `segment.startSecond` - Start time in seconds
+  - `segment.endSecond` - End time in seconds
+- `result.language` - Detected language code
+- `result.durationInSeconds` - Audio duration
+- `result.warnings` - Array of any warnings
+- `result.providerMetadata.runpod.jobId` - RunPod job ID
+
+### Audio Input
+
+You can provide audio in several ways:
+
+```ts
+// URL (recommended for large files)
+const result = await transcribe({
+  model: runpod.transcription('pruna/whisper-v3-large'),
+  audio: new URL('https://image.runpod.ai/demo/transcription-demo.wav'),
+});
+
+// Local file as Uint8Array
+import { readFileSync } from 'fs';
+const audioData = readFileSync('./audio.wav');
+
+const localResult = await transcribe({
+  model: runpod.transcription('pruna/whisper-v3-large'),
+  audio: audioData,
+});
+```
+
+### Examples
+
+Check out our [examples](https://github.com/runpod/examples/tree/main/ai-sdk/getting-started) for more code snippets on how to use all the different models.
+
+### Supported Models
+
+- `pruna/whisper-v3-large`
+
+### Provider Options
+
+Use `providerOptions.runpod` for model-specific parameters:
+
+| Option              | Type      | Default | Description                                    |
+| ------------------- | --------- | ------- | ---------------------------------------------- |
+| `audio`             | `string`  | -       | URL to audio file (alternative to binary data) |
+| `prompt`            | `string`  | -       | Context prompt to guide transcription          |
+| `language`          | `string`  | Auto    | ISO-639-1 language code (e.g., 'en', 'es')     |
+| `word_timestamps`   | `boolean` | `false` | Include word-level timestamps                  |
+| `translate`         | `boolean` | `false` | Translate audio to English                     |
+| `enable_vad`        | `boolean` | `false` | Enable voice activity detection                |
+| `maxPollAttempts`   | `number`  | `120`   | Max polling attempts                           |
+| `pollIntervalMillis`| `number`  | `2000`  | Polling interval (ms)                          |
+
+**Example (providerOptions):**
+
+```ts
+const result = await transcribe({
+  model: runpod.transcription('pruna/whisper-v3-large'),
+  audio: new URL('https://image.runpod.ai/demo/transcription-demo.wav'),
+  providerOptions: {
+    runpod: {
+      language: 'en',
+      prompt: 'This is a demo of audio transcription',
+      word_timestamps: true,
+    },
+  },
+});
+```
+
 ## About Runpod
 
 [Runpod](https://runpod.io) is the foundation for developers to build, deploy, and scale custom AI systems.
diff --git a/src/index.ts b/src/index.ts
@@ -3,5 +3,9 @@ export type { RunpodProvider, RunpodProviderSettings } from './runpod-provider';
 export type { RunpodChatModelId } from './runpod-chat-options';
 export type { RunpodCompletionModelId } from './runpod-completion-options';
 export type { RunpodImageModelId } from './runpod-image-options';
+export type {
+  RunpodTranscriptionModelId,
+  RunpodTranscriptionProviderOptions,
+} from './runpod-transcription-options';
 export type { OpenAICompatibleErrorData as RunpodErrorData } from '@ai-sdk/openai-compatible';
 export type { RunpodImageErrorData } from './runpod-error';
diff --git a/src/runpod-error.ts b/src/runpod-error.ts
@@ -9,36 +9,44 @@ export const runpodImageErrorSchema = z.object({
 
 export type RunpodImageErrorData = z.infer<typeof runpodImageErrorSchema>;
 
-export const runpodImageFailedResponseHandler = createJsonErrorResponseHandler({
-  errorSchema: runpodImageErrorSchema as any,
-  errorToMessage: (data: RunpodImageErrorData) => {
-    // Prefer message if available (more descriptive)
-    if (data.message) {
-      return data.message;
-    }
-    
-    // If error field exists, try to extract nested JSON message
-    if (data.error) {
-      // Runpod sometimes returns nested JSON in the error field like:
-      // "Error submitting task: 400, {\"code\":400,\"message\":\"...\"}"
-      // Try to extract the inner message for cleaner error messages
-      // Find the last occurrence of { which likely starts the JSON object
-      const lastBraceIndex = data.error.lastIndexOf('{');
-      if (lastBraceIndex !== -1) {
-        try {
-          const jsonStr = data.error.substring(lastBraceIndex);
-          const nestedError = JSON.parse(jsonStr);
-          if (nestedError.message && typeof nestedError.message === 'string') {
-            return nestedError.message;
-          }
-        } catch {
-          // If parsing fails, fall back to the original error string
+/**
+ * Extracts a human-readable message from a Runpod error payload.
+ *
+ * Resolution order: `data.message` when truthy; otherwise a string `message`
+ * field parsed from JSON embedded at the end of `data.error`; otherwise
+ * `data.error` verbatim; otherwise a fixed fallback string.
+ *
+ * @param data - Error payload typed from `runpodImageErrorSchema`.
+ * @returns The most descriptive message that could be found.
+ */
+function extractErrorMessage(data: RunpodImageErrorData): string {
+  // Prefer the top-level message when present (more descriptive)
+  if (data.message) {
+    return data.message;
+  }
+
+  // Otherwise, if an error field exists, try to extract a nested JSON message from it
+  if (data.error) {
+    // Runpod sometimes returns nested JSON in the error field like:
+    // "Error submitting task: 400, {\"code\":400,\"message\":\"...\"}"
+    // Try to extract the inner message for cleaner error messages
+    // Find the last occurrence of { which likely starts the JSON object. NOTE(review): if the embedded JSON itself contains a nested object, the last { likely points at the inner object, parsing fails, and the raw error string is returned.
+    const lastBraceIndex = data.error.lastIndexOf('{');
+    if (lastBraceIndex !== -1) {
+      try {
+        const jsonStr = data.error.substring(lastBraceIndex);
+        const nestedError = JSON.parse(jsonStr);
+        if (nestedError.message && typeof nestedError.message === 'string') {
+          return nestedError.message;
         }
+      } catch {
+        // Deliberately ignored: if parsing fails, fall through to the original error string
       }
-      return data.error;
     }
-    
-    return 'Unknown Runpod error';
-  },
+    return data.error;
+  }
+
+  return 'Unknown Runpod error';
+}
+
+export const runpodImageFailedResponseHandler = createJsonErrorResponseHandler({ // Error-response handler for image requests.
+  errorSchema: runpodImageErrorSchema as any, // NOTE(review): the cast to any sidesteps the handler's schema typing; the reason is not visible here
+  errorToMessage: extractErrorMessage, // message extraction is shared with the transcription handler
+});
+
+export const runpodTranscriptionFailedResponseHandler = createJsonErrorResponseHandler({ // Error-response handler for transcription requests.
+  errorSchema: runpodImageErrorSchema as any, // NOTE(review): reuses the image error schema; presumably transcription errors share that shape (confirm)
+  errorToMessage: extractErrorMessage, // same message extraction as the image handler
 });
 
diff --git a/src/runpod-provider.test.ts b/src/runpod-provider.test.ts
@@ -5,6 +5,7 @@ import {
 } from '@ai-sdk/openai-compatible';
 import { RunpodImageModel } from './runpod-image-model';
 import { RunpodSpeechModel } from './runpod-speech-model';
+import { RunpodTranscriptionModel } from './runpod-transcription-model';
 import { loadApiKey } from '@ai-sdk/provider-utils';
 import { createRunpod } from './runpod-provider';
 import { describe, it, expect, vi, beforeEach, Mock } from 'vitest';
@@ -26,6 +27,10 @@ vi.mock('./runpod-speech-model', () => ({
   RunpodSpeechModel: vi.fn(),
 }));
 
+vi.mock('./runpod-transcription-model', () => ({
+  RunpodTranscriptionModel: vi.fn(),
+}));
+
 vi.mock('@ai-sdk/provider-utils', () => ({
   loadApiKey: vi.fn().mockReturnValue('mock-api-key'),
   withoutTrailingSlash: vi.fn((url) => url),
@@ -245,4 +250,58 @@ describe('RunpodProvider', () => {
       );
     });
   });
+
+  describe('transcriptionModel', () => { // Checks the (modelId, { baseURL }) arguments passed to the mocked RunpodTranscriptionModel constructor.
+    it('should use mapping for known transcription model IDs', () => {
+      const provider = createRunpod();
+
+      provider.transcriptionModel('pruna/whisper-v3-large');
+
+      expect((RunpodTranscriptionModel as any).mock.calls[0][0]).toBe( // calls[0] assumes the mock is cleared between tests (TODO confirm in beforeEach)
+        'pruna/whisper-v3-large'
+      );
+      expect((RunpodTranscriptionModel as any).mock.calls[0][1].baseURL).toBe(
+        'https://api.runpod.ai/v2/whisper-v3-large'
+      );
+    });
+
+    it('should construct a transcription model for a serverless endpoint id', () => {
+      const provider = createRunpod();
+      const modelId = 'uhyz0hnkemrk6r'; // not a key in the known-model mapping, so the default base URL is expected
+
+      const model = provider.transcriptionModel(modelId);
+      expect(model).toBeInstanceOf(RunpodTranscriptionModel);
+
+      expect((RunpodTranscriptionModel as any).mock.calls[0][0]).toBe(modelId);
+      expect((RunpodTranscriptionModel as any).mock.calls[0][1].baseURL).toBe(
+        `https://api.runpod.ai/v2/${modelId}`
+      );
+    });
+
+    it('should accept a Runpod Console endpoint URL', () => {
+      const provider = createRunpod();
+      const url =
+        'https://console.runpod.io/serverless/user/endpoint/uhyz0hnkemrk6r';
+
+      provider.transcriptionModel(url);
+
+      expect((RunpodTranscriptionModel as any).mock.calls[0][0]).toBe( // the model ID is expected to be the endpoint ID taken from the console URL
+        'uhyz0hnkemrk6r'
+      );
+      expect((RunpodTranscriptionModel as any).mock.calls[0][1].baseURL).toBe(
+        'https://api.runpod.ai/v2/uhyz0hnkemrk6r'
+      );
+    });
+  });
+
+  describe('transcription', () => { // Covers the provider.transcription entry point.
+    it('should be an alias for transcriptionModel', () => {
+      const provider = createRunpod();
+      const modelId = 'pruna/whisper-v3-large';
+
+      const model = provider.transcription(modelId);
+
+      expect(model).toBeInstanceOf(RunpodTranscriptionModel); // NOTE(review): only proves a model was constructed; it does not assert identity with provider.transcriptionModel
+    });
+  });
 });
diff --git a/src/runpod-provider.ts b/src/runpod-provider.ts
@@ -1,4 +1,9 @@
-import { ImageModelV3, LanguageModelV3, SpeechModelV3 } from '@ai-sdk/provider';
+import {
+  ImageModelV3,
+  LanguageModelV3,
+  SpeechModelV3,
+  TranscriptionModelV3,
+} from '@ai-sdk/provider';
 import {
   OpenAICompatibleChatLanguageModel,
   OpenAICompatibleCompletionLanguageModel,
@@ -10,6 +15,7 @@ import {
 } from '@ai-sdk/provider-utils';
 import { RunpodImageModel } from './runpod-image-model';
 import { RunpodSpeechModel } from './runpod-speech-model';
+import { RunpodTranscriptionModel } from './runpod-transcription-model';
 
 export interface RunpodProviderSettings {
   /**
@@ -72,6 +78,16 @@ Creates a speech model for speech generation.
 Creates a speech model for speech generation.
 */
   speech(modelId: string): SpeechModelV3;
+
+  /**
+Creates a transcription model for audio transcription.
+*/
+  transcriptionModel(modelId: string): TranscriptionModelV3;
+
+  /**
+Alias for `transcriptionModel`: creates a transcription model for audio transcription.
+*/
+  transcription(modelId: string): TranscriptionModelV3;
 }
 
 // Mapping of Runpod model IDs to their endpoint URLs
@@ -123,6 +139,11 @@ const SPEECH_MODEL_ID_TO_ENDPOINT_URL: Record<string, string> = {
   'resembleai/chatterbox-turbo': 'https://api.runpod.ai/v2/chatterbox-turbo/',
 };
 
+/*
+ * Mapping of known Runpod transcription model IDs to their serverless endpoint
+ * base URLs. createTranscriptionModel consults this table first; IDs that are
+ * not listed here are resolved by its fallback logic instead.
+ */
+const TRANSCRIPTION_MODEL_ID_TO_ENDPOINT_URL: Record<string, string> = {
+  'pruna/whisper-v3-large': 'https://api.runpod.ai/v2/whisper-v3-large',
+};
+
 // Mapping of Runpod model IDs to their OpenAI model names
 const MODEL_ID_TO_OPENAI_NAME: Record<string, string> = {
   'qwen/qwen3-32b-awq': 'Qwen/Qwen3-32B-AWQ',
@@ -272,6 +293,28 @@ export function createRunpod(
     });
   };
 
+  const createTranscriptionModel = (modelId: string) => { // Builds a RunpodTranscriptionModel from a known model ID, an endpoint ID, or a URL.
+    const endpointIdFromConsole = parseRunpodConsoleEndpointId(modelId); // presumably the endpoint ID when modelId is a Runpod Console URL, otherwise nullish (verify against the helper)
+    const normalizedModelId = endpointIdFromConsole ?? modelId;
+
+    /*
+     * Base URL resolution order:
+     *   1. the explicit mapping for known transcription model IDs;
+     *   2. the normalized ID itself, used verbatim, when it starts with http;
+     *   3. otherwise the api.runpod.ai v2 endpoint built from the normalized ID.
+     * NOTE(review): the prefix test in step 2 also matches a bare endpoint ID
+     * that merely begins with the letters http; consider testing for a full
+     * URL scheme instead.
+     */
+    const mappedBaseURL =
+      TRANSCRIPTION_MODEL_ID_TO_ENDPOINT_URL[normalizedModelId];
+
+    const baseURL =
+      mappedBaseURL ??
+      (normalizedModelId.startsWith('http')
+        ? normalizedModelId
+        : `https://api.runpod.ai/v2/${normalizedModelId}`);
+
+    return new RunpodTranscriptionModel(normalizedModelId, {
+      provider: 'runpod.transcription',
+      baseURL,
+      headers: getHeaders,
+      fetch: runpodFetch,
+    });
+  };
+
   const provider = (modelId: string) => createChatModel(modelId);
 
   provider.completionModel = createCompletionModel;
@@ -281,6 +324,8 @@ export function createRunpod(
   provider.image = createImageModel;
   provider.speechModel = createSpeechModel;
   provider.speech = createSpeechModel;
+  provider.transcriptionModel = createTranscriptionModel;
+  provider.transcription = createTranscriptionModel;
 
   return provider;
 }
diff --git a/src/runpod-transcription-model.test.ts b/src/runpod-transcription-model.test.ts
diff --git a/src/runpod-transcription-model.ts b/src/runpod-transcription-model.ts
diff --git a/src/runpod-transcription-options.ts b/src/runpod-transcription-options.ts