import { Anthropic } from "@anthropic-ai/sdk"
import OpenAI from "openai"
import { withRetry } from "../retry"
import { ApiHandler } from ".."
import { ApiHandlerOptions, ModelInfo, openAiModelInfoSaneDefaults } from "../../shared/api"
import { convertToOpenAiMessages } from "../transform/openai-format"
import { ApiStream } from "../transform/stream"

export class FireworksHandler implements ApiHandler {
	private options: ApiHandlerOptions
	private client: OpenAI

	constructor(options: ApiHandlerOptions) {
		this.options = options
		this.client = new OpenAI({
			baseURL: "https://api.fireworks.ai/inference/v1",
			apiKey: this.options.fireworksApiKey,
		})
	}

	@withRetry()
	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
		const modelId = this.options.fireworksModelId ?? ""

		const openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
			{ role: "system", content: systemPrompt },
			...convertToOpenAiMessages(messages),
		]

		const stream = await this.client.chat.completions.create({
			model: modelId,
			...(this.options.fireworksModelMaxCompletionTokens
				? { max_completion_tokens: this.options.fireworksModelMaxCompletionTokens }
				: {}),
			...(this.options.fireworksModelMaxTokens ? { max_tokens: this.options.fireworksModelMaxTokens } : {}),
			messages: openAiMessages,
			stream: true,
			stream_options: { include_usage: true },
			temperature: 0,
		})

		let reasoning: string | null = null
		for await (const chunk of stream) {
			const delta = chunk.choices[0]?.delta

			// Some models wrap chain-of-thought in <think>...</think> tags inside the regular
			// content stream. While inside such a block, accumulate the text so it can be
			// emitted as reasoning rather than as regular output.
			if (reasoning || delta?.content?.includes("<think>")) {
				reasoning = (reasoning || "") + (delta?.content ?? "")
			}

			if (delta?.content && !reasoning) {
				yield {
					type: "text",
					text: delta.content,
				}
			}

			if (reasoning || (delta && "reasoning_content" in delta && delta.reasoning_content)) {
				yield {
					type: "reasoning",
					reasoning: delta?.content || ((delta as any)?.reasoning_content as string | undefined) || "",
				}
				if (reasoning?.includes("</think>")) {
					// Reset so the next chunk is treated as regular content
					reasoning = null
				}
			}

			if (chunk.usage) {
				yield {
					type: "usage",
					// Like DeepSeek (see context caching: https://api-docs.deepseek.com/guides/kv_cache),
					// prompt_tokens here is the total input, i.e. the sum of cache hits and misses,
					// whereas Anthropic reports cached and uncached input tokens separately. This matters
					// for 1) the context-management truncation algorithm and 2) cost calculation.
					// NOTE: we report both input and cache stats, but for now the input price is set to 0
					// since all cost calculation is done from cache hits/misses.
					inputTokens: chunk.usage.prompt_tokens || 0,
					outputTokens: chunk.usage.completion_tokens || 0,
					// @ts-ignore-next-line
					cacheReadTokens: chunk.usage.prompt_cache_hit_tokens || 0,
					// @ts-ignore-next-line
					cacheWriteTokens: chunk.usage.prompt_cache_miss_tokens || 0,
				}
			}
		}
	}

	getModel(): { id: string; info: ModelInfo } {
		return {
			id: this.options.fireworksModelId ?? "",
			info: openAiModelInfoSaneDefaults,
		}
	}
}
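
// A minimal consumption sketch, not part of the handler itself: assuming an
// ApiHandlerOptions value carrying a Fireworks API key and model id (the values
// below are placeholders, not real credentials or model names), a caller might
// drain the ApiStream produced by createMessage roughly like this:
//
// const handler = new FireworksHandler({
// 	fireworksApiKey: "<your-fireworks-api-key>",
// 	fireworksModelId: "accounts/fireworks/models/<some-model>",
// } as ApiHandlerOptions)
//
// for await (const chunk of handler.createMessage("You are a helpful assistant.", [
// 	{ role: "user", content: "Hello!" },
// ])) {
// 	if (chunk.type === "text") {
// 		process.stdout.write(chunk.text)
// 	}
// }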