diff --git a/.gitignore b/.gitignore index c4c6365..28a0e82 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,6 @@ Thumbs.db tests/ notes/ test-update.ts + +# Documentation (local development only) +docs/ diff --git a/docs/providers/README.md b/docs/providers/README.md deleted file mode 100644 index 29af628..0000000 --- a/docs/providers/README.md +++ /dev/null @@ -1,339 +0,0 @@ -# Provider API Formats Reference - -This directory contains documentation for each AI provider's API format, designed to help the context pruning plugin implement provider-specific logic. - -## Sources - -All information in these docs was gathered from: - -### Primary Sources - -| Source | Location | Description | -|--------|----------|-------------| -| **Vercel AI SDK** | https://github.com/vercel/ai | Provider conversion logic in `packages/{provider}/src/` | -| **OpenCode Source** | `/packages/opencode/src/provider/` | Custom transforms and provider loading | -| **models.dev API** | https://models.dev/api.json | Authoritative provider list with npm packages | - -### Key AI SDK Files - -| Provider | Conversion File | -|----------|-----------------| -| OpenAI | `packages/openai/src/chat/openai-chat-language-model.ts`, `packages/openai/src/responses/openai-responses-language-model.ts` | -| OpenAI-Compatible | `packages/openai-compatible/src/chat/openai-compatible-chat-language-model.ts` | -| Anthropic | `packages/anthropic/src/convert-to-anthropic-messages-prompt.ts`, `packages/anthropic/src/anthropic-messages-language-model.ts` | -| Google | `packages/google/src/convert-to-google-generative-ai-messages.ts`, `packages/google/src/google-generative-ai-language-model.ts` | -| AWS Bedrock | `packages/amazon-bedrock/src/convert-to-bedrock-chat-messages.ts`, `packages/amazon-bedrock/src/bedrock-chat-language-model.ts` | -| Mistral | `packages/mistral/src/convert-to-mistral-chat-messages.ts`, `packages/mistral/src/mistral-chat-language-model.ts` | -| Cohere | `packages/cohere/src/convert-to-cohere-chat-prompt.ts`, `packages/cohere/src/cohere-chat-language-model.ts` | - -### OpenCode Custom Transform Files - -| File | Purpose | -|------|---------| -| `src/provider/transform.ts` | Provider-specific message normalization, caching hints, schema transforms | -| `src/provider/provider.ts` | Provider loading, custom loaders, SDK instantiation | -| `src/provider/models.ts` | Model database schema, models.dev integration | -| `src/session/message-v2.ts` | Internal message structure, `toModelMessage()` conversion | - -### Official API Documentation - -| Provider | Documentation URL | -|----------|-------------------| -| OpenAI | https://platform.openai.com/docs/api-reference | -| Anthropic | https://docs.anthropic.com/en/api | -| Google Gemini | https://ai.google.dev/api/rest | -| AWS Bedrock | https://docs.aws.amazon.com/bedrock/latest/APIReference/ | -| Mistral | https://docs.mistral.ai/api/ | -| Cohere | https://docs.cohere.com/reference/chat | - ---- - -## Format Categories - -Providers fall into several format categories based on their API structure: - -### 1. OpenAI Chat Completions Format -**Most common format - used by ~60 providers** - -Key identifiers: -- `body.messages[]` array -- Tool results: `role: "tool"`, `tool_call_id` -- System in messages array - -Providers: openai, together, deepseek, groq, fireworks, hyperbolic, novita, cerebras, sambanova, perplexity, openrouter, and most others - -### 2. 
OpenAI Responses Format (newer) -**Used by OpenAI GPT models via responses API** - -Key identifiers: -- `body.input[]` array -- Tool results: `type: "function_call_output"`, `call_id` - -Providers: openai (responses endpoint), azure (responses endpoint) - -### 3. Anthropic Format -**Distinct format with cache control** - -Key identifiers: -- `body.messages[]` but tool results in user messages -- Tool results: `type: "tool_result"`, `tool_use_id` -- Top-level `system` array -- `cache_control` support - -Providers: anthropic - -### 4. Google Gemini Format -**Position-based tool correlation** - -Key identifiers: -- `body.contents[]` array -- Tool results: `functionResponse` parts (no IDs!) -- Roles: `user`/`model` only -- Top-level `systemInstruction` - -Providers: google, google-vertex - -### 5. AWS Bedrock Format -**Converse API with cache points** - -Key identifiers: -- Top-level `system` array -- Tool results: `toolResult` blocks with `toolUseId` -- `cachePoint` blocks - -Providers: amazon-bedrock - -### 6. Mistral Format (OpenAI-like with quirks) -**Strict ID requirements** - -Key identifiers: -- OpenAI-like but 9-char alphanumeric tool IDs required -- User content always array - -Providers: mistral - -### 7. Cohere Format -**RAG-native with citations** - -Key identifiers: -- Uses `p`/`k` instead of `top_p`/`top_k` -- Uppercase tool choice values -- `documents` array for RAG - -Providers: cohere - -## Quick Reference: Thinking/Reasoning - -| Format | Request Config | Response Structure | Encrypted? | Signature? | -|--------|---------------|-------------------|------------|------------| -| OpenAI Responses | `reasoning: {effort, summary}` | `{type: "reasoning", encrypted_content, summary}` | Yes | No | -| Anthropic | `thinking: {type, budget_tokens}` | `{type: "thinking", thinking, signature}` | Partial* | Yes | -| Google Gemini | `thinkingConfig: {thinkingBudget}` | `{text, thought: true, thoughtSignature}` | No | Optional | -| AWS Bedrock | `additionalModelRequestFields.thinking` | `{reasoningContent: {reasoningText/redactedReasoning}}` | Partial* | Yes | -| Mistral | N/A (model decides) | `{type: "thinking", thinking: [{type: "text", text}]}` | No | No | -| Cohere | `thinking: {type, token_budget}` | `{type: "thinking", thinking: "..."}` | No | No | - -*Partial = has both visible (`thinking`/`reasoningText`) and redacted (`redacted_thinking`/`redactedReasoning`) variants - -**Key differences:** -- **OpenAI**: Reasoning is always encrypted; only summary is readable -- **Anthropic/Bedrock**: Can have visible thinking with signature, or redacted thinking -- **Gemini**: Thinking is a text part with `thought: true` flag -- **Mistral**: Thinking is nested array of text parts -- **Cohere**: Thinking is plain string - -**SDK normalization**: All formats are converted to `{type: "reasoning", text: "..."}` by the AI SDK - -## Quick Reference: Tool Call ID Fields - -| Format | Tool Call ID Field | Tool Result ID Field | -|--------|-------------------|---------------------| -| OpenAI Chat | `tool_calls[].id` | `tool_call_id` | -| OpenAI Responses | `call_id` | `call_id` | -| Anthropic | `tool_use.id` | `tool_use_id` | -| Gemini | **NONE (position-based)** | **NONE** | -| Bedrock | `toolUse.toolUseId` | `toolResult.toolUseId` | -| Mistral | `tool_calls[].id` (9-char) | `tool_call_id` | -| Cohere | `tool_calls[].id` | `tool_call_id` | - -## Detection Strategy - -To detect which format a request uses: - -```typescript -function detectFormat(body: unknown): string { - if (body.input && 
Array.isArray(body.input)) return 'openai-responses' - if (body.contents && Array.isArray(body.contents)) return 'gemini' - if (body.system && Array.isArray(body.system) && body.inferenceConfig) return 'bedrock' - if (body.messages) { - // Check first message structure for Anthropic vs OpenAI - const msg = body.messages[0] - if (msg?.content?.[0]?.type === 'tool_result') return 'anthropic' - if (msg?.content?.[0]?.tool_use_id) return 'anthropic' - } - return 'openai-chat' // Default -} -``` - -## Files - -- [openai.md](./openai.md) - OpenAI Chat Completions & Responses API -- [anthropic.md](./anthropic.md) - Anthropic Messages API -- [google-gemini.md](./google-gemini.md) - Google Generative AI (Gemini) -- [aws-bedrock.md](./aws-bedrock.md) - AWS Bedrock Converse API -- [mistral.md](./mistral.md) - Mistral API -- [cohere.md](./cohere.md) - Cohere Chat API -- [openai-compatible.md](./openai-compatible.md) - OpenAI-compatible providers - -## Context Pruning Universal Rules - -1. **Tool call/result pairing**: Always prune tool calls and their results together -2. **Message alternation**: Most APIs expect alternating user/assistant messages -3. **System preservation**: System messages typically should not be pruned -4. **ID correlation**: Maintain ID relationships when pruning (except Gemini which is position-based) -5. **Cache markers**: Consider preserving cache control markers when present - ---- - -## Complete Provider List (models.dev) - -Every provider from models.dev and its API format: - -### OpenAI Chat Format (43 providers) -*Uses `@ai-sdk/openai-compatible` - standard OpenAI messages format* - -| Provider ID | Name | Notes | -|-------------|------|-------| -| `agentrouter` | AgentRouter | | -| `alibaba` | Alibaba | | -| `alibaba-cn` | Alibaba (China) | | -| `bailing` | Bailing | | -| `baseten` | Baseten | | -| `chutes` | Chutes | | -| `cortecs` | Cortecs | | -| `deepseek` | DeepSeek | Reasoning models (R1) | -| `fastrouter` | FastRouter | | -| `fireworks-ai` | Fireworks AI | | -| `github-copilot` | GitHub Copilot | | -| `github-models` | GitHub Models | | -| `huggingface` | Hugging Face | | -| `iflowcn` | iFlow | | -| `inception` | Inception | | -| `inference` | Inference | | -| `io-net` | IO.NET | | -| `llama` | Llama | | -| `lmstudio` | LMStudio | Local inference | -| `lucidquery` | LucidQuery AI | | -| `modelscope` | ModelScope | | -| `moonshotai` | Moonshot AI | | -| `moonshotai-cn` | Moonshot AI (China) | | -| `morph` | Morph | | -| `nebius` | Nebius Token Factory | | -| `nvidia` | Nvidia | | -| `opencode` | OpenCode Zen | | -| `openrouter` | OpenRouter | Meta-provider, cache support | -| `ovhcloud` | OVHcloud AI Endpoints | | -| `poe` | Poe | | -| `requesty` | Requesty | | -| `scaleway` | Scaleway | | -| `siliconflow` | SiliconFlow | | -| `submodel` | submodel | | -| `synthetic` | Synthetic | | -| `upstage` | Upstage | | -| `venice` | Venice AI | | -| `vultr` | Vultr | | -| `wandb` | Weights & Biases | | -| `zai` | Z.AI | | -| `zai-coding-plan` | Z.AI Coding Plan | | -| `zenmux` | ZenMux | | -| `zhipuai` | Zhipu AI | | -| `zhipuai-coding-plan` | Zhipu AI Coding Plan | | - -### OpenAI Native Format (1 provider) -*Uses `@ai-sdk/openai` - supports both Chat Completions and Responses API* - -| Provider ID | Name | Notes | -|-------------|------|-------| -| `openai` | OpenAI | Responses API for GPT-4.1+ | - -### Azure Format (2 providers) -*Uses `@ai-sdk/azure` - OpenAI format with Azure auth* - -| Provider ID | Name | Notes | -|-------------|------|-------| -| `azure` | Azure | 
Supports Responses API | -| `azure-cognitive-services` | Azure Cognitive Services | | - -### Anthropic Format (4 providers) -*Uses `@ai-sdk/anthropic` - distinct message format with cache control* - -| Provider ID | Name | Notes | -|-------------|------|-------| -| `anthropic` | Anthropic | Native Anthropic API | -| `kimi-for-coding` | Kimi For Coding | Uses Anthropic format | -| `minimax` | MiniMax | Uses Anthropic format | -| `minimax-cn` | MiniMax (China) | Uses Anthropic format | - -### Google Gemini Format (3 providers) -*Uses `@ai-sdk/google` or `@ai-sdk/google-vertex` - POSITION-BASED tool correlation* - -| Provider ID | Name | Notes | -|-------------|------|-------| -| `google` | Google | Native Gemini API | -| `google-vertex` | Vertex | Google Cloud Vertex AI | -| `google-vertex-anthropic` | Vertex (Anthropic) | Claude via Vertex | - -### AWS Bedrock Format (1 provider) -*Uses `@ai-sdk/amazon-bedrock` - Converse API with cachePoint* - -| Provider ID | Name | Notes | -|-------------|------|-------| -| `amazon-bedrock` | Amazon Bedrock | Multi-model, cachePoint support | - -### Mistral Format (1 provider) -*Uses `@ai-sdk/mistral` - requires 9-char alphanumeric tool IDs* - -| Provider ID | Name | Notes | -|-------------|------|-------| -| `mistral` | Mistral | Strict tool ID format | - -### Cohere Format (1 provider) -*Uses `@ai-sdk/cohere` - RAG-native with citations* - -| Provider ID | Name | Notes | -|-------------|------|-------| -| `cohere` | Cohere | Uses `p`/`k`, uppercase tool choice | - -### Specialized SDK Providers (13 providers) -*Use provider-specific SDKs but follow OpenAI-like format* - -| Provider ID | Name | SDK | Format | -|-------------|------|-----|--------| -| `cerebras` | Cerebras | `@ai-sdk/cerebras` | OpenAI-like | -| `deepinfra` | Deep Infra | `@ai-sdk/deepinfra` | OpenAI-like | -| `groq` | Groq | `@ai-sdk/groq` | OpenAI-like | -| `perplexity` | Perplexity | `@ai-sdk/perplexity` | OpenAI-like | -| `togetherai` | Together AI | `@ai-sdk/togetherai` | OpenAI-like | -| `xai` | xAI | `@ai-sdk/xai` | OpenAI-like | -| `vercel` | Vercel AI Gateway | `@ai-sdk/gateway` | OpenAI-like | -| `v0` | v0 | `@ai-sdk/vercel` | OpenAI-like | -| `cloudflare-workers-ai` | Cloudflare Workers AI | `workers-ai-provider` | OpenAI-like | -| `ollama-cloud` | Ollama Cloud | `ai-sdk-ollama` | OpenAI-like | -| `aihubmix` | AIHubMix | `@aihubmix/ai-sdk-provider` | OpenAI-like | -| `sap-ai-core` | SAP AI Core | `@mymediset/sap-ai-provider` | OpenAI-like | - ---- - -## Format Summary - -| Format | Provider Count | Tool ID Field | Key Identifier | -|--------|---------------|---------------|----------------| -| OpenAI Chat | 56 | `tool_call_id` | `body.messages[]` | -| OpenAI Responses | 2 | `call_id` | `body.input[]` | -| Anthropic | 4 | `tool_use_id` | `tool_result` in user msg | -| Google Gemini | 3 | **NONE** | `body.contents[]` | -| AWS Bedrock | 1 | `toolUseId` | `body.inferenceConfig` | -| Mistral | 1 | `tool_call_id` (9-char) | Check provider ID | -| Cohere | 1 | `tool_call_id` | Check provider ID | - -**Total: 69 providers** diff --git a/docs/providers/anthropic.md b/docs/providers/anthropic.md deleted file mode 100644 index d1610fa..0000000 --- a/docs/providers/anthropic.md +++ /dev/null @@ -1,216 +0,0 @@ -# Anthropic Messages API Format - -Anthropic uses a distinct message format with unique features like cache control and extended thinking. 
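Before the detailed sections below, here is a minimal sketch of the biggest structural difference from OpenAI: tool results travel inside `user` messages and are keyed by `tool_use_id`, not sent under a separate `tool` role. The helper is illustrative only, not part of any SDK.

```typescript
// Hypothetical helper: reshape an OpenAI-style tool result message
// into Anthropic's shape, where results are content parts of a user message.
function toAnthropicToolResult(msg: {
  role: "tool";
  tool_call_id: string;
  content: string;
}) {
  return {
    role: "user" as const,
    content: [
      {
        type: "tool_result" as const,
        tool_use_id: msg.tool_call_id, // the ID field is renamed, not just moved
        content: msg.content,
      },
    ],
  };
}
```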
- -## Sources - -- **AI SDK**: `packages/anthropic/src/convert-to-anthropic-messages-prompt.ts`, `packages/anthropic/src/anthropic-messages-language-model.ts` -- **OpenCode Transform**: `src/provider/transform.ts` (toolCallId sanitization, cache control) -- **Official Docs**: https://docs.anthropic.com/en/api/messages - -## Request Structure - -```json -{ - "model": "claude-sonnet-4-5", - "max_tokens": 4096, - "temperature": 1.0, - "stream": true, - "system": [ - {"type": "text", "text": "System instructions", "cache_control": {"type": "ephemeral"}} - ], - "messages": [...], - "tools": [...], - "tool_choice": {"type": "auto"}, - "thinking": {"type": "enabled", "budget_tokens": 10000} -} -``` - -## Key Differences from OpenAI - -| Feature | OpenAI | Anthropic | -|---------|--------|-----------| -| System message | In messages array | Top-level `system` array | -| Tool results | `role: "tool"` message | In `user` message with `type: "tool_result"` | -| Tool call ID field | `tool_call_id` | `tool_use_id` | -| Caching | Not available | `cache_control` on content blocks | - -## Message Roles - -Only **two roles**: `user` and `assistant`. Tool results are embedded in user messages. - -## Message Formats - -### System Message (top-level, not in messages) -```json -{ - "system": [ - { - "type": "text", - "text": "You are a helpful assistant.", - "cache_control": {"type": "ephemeral"} - } - ] -} -``` - -### User Message -```json -{ - "role": "user", - "content": [ - {"type": "text", "text": "Hello", "cache_control": {"type": "ephemeral"}}, - {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": "..."}}, - {"type": "document", "source": {"type": "base64", "media_type": "application/pdf", "data": "..."}, "title": "Doc"} - ] -} -``` - -### Assistant Message with Tool Use -```json -{ - "role": "assistant", - "content": [ - {"type": "text", "text": "Let me check the weather."}, - { - "type": "tool_use", - "id": "toolu_01XYZ", - "name": "get_weather", - "input": {"location": "San Francisco"}, - "cache_control": {"type": "ephemeral"} - } - ] -} -``` - -### Tool Result (in user message) -```json -{ - "role": "user", - "content": [ - { - "type": "tool_result", - "tool_use_id": "toolu_01XYZ", - "content": "72°F and sunny", - "is_error": false, - "cache_control": {"type": "ephemeral"} - } - ] -} -``` - -## Thinking/Reasoning (Extended Thinking) - -### Request Configuration -```json -{ - "thinking": { - "type": "enabled", - "budget_tokens": 10000 - } -} -``` - -**Parameters:** -- `type`: `"enabled"` or `"disabled"` -- `budget_tokens`: Token budget for thinking (minimum 1024) - -**Constraints when thinking enabled:** -- `temperature`, `topK`, `topP` are **NOT supported** (ignored with warnings) -- `max_tokens` is automatically adjusted to include `budget_tokens` -- Minimum budget is 1,024 tokens - -### Response Content Blocks - -**Thinking Block** (visible reasoning): -```json -{ - "type": "thinking", - "thinking": "Let me analyze this step by step...", - "signature": "cryptographic_signature_for_verification" -} -``` - -**Redacted Thinking Block** (hidden reasoning): -```json -{ - "type": "redacted_thinking", - "data": "encrypted_base64_redacted_content" -} -``` - -### Streaming Deltas -```json -{"type": "thinking_delta", "thinking": "reasoning chunk..."} -{"type": "signature_delta", "signature": "sig_chunk"} -``` - -### SDK Conversion -The AI SDK converts Anthropic's `thinking` blocks to a unified `reasoning` type: -```typescript -// Anthropic response -{type: "thinking", 
thinking: "...", signature: "..."} - -// Converted to SDK format -{type: "reasoning", text: "...", signature: "..."} -``` - -### Context Pruning for Thinking -- **Cannot apply cache_control** to thinking or redacted_thinking blocks -- **Signatures are cryptographic** - preserve for verification if replaying -- **Redacted thinking** contains encrypted content that cannot be inspected -- Consider thinking blocks as important context but potentially large - -## Tool Definition - -```json -{ - "name": "get_weather", - "description": "Get weather for a location", - "input_schema": { - "type": "object", - "properties": {"location": {"type": "string"}}, - "required": ["location"] - }, - "cache_control": {"type": "ephemeral"} -} -``` - -### Tool Choice Options -- `{"type": "auto"}` - Model decides -- `{"type": "any"}` - Force tool use -- `{"type": "tool", "name": "get_weather"}` - Force specific tool - -## Cache Control - -```json -{"type": "ephemeral", "ttl": "5m"} -``` - -**Limits**: Maximum **4 cache breakpoints** per request - -**Applicable to**: system messages, user/assistant content parts, tool results, tool definitions - -**NOT applicable to**: `thinking` blocks, `redacted_thinking` blocks - -## Special Tool Types - -**Server Tool Use** (provider-executed): -```json -{"type": "server_tool_use", "id": "...", "name": "web_search", "input": {...}} -``` -Names: `web_fetch`, `web_search`, `code_execution`, `bash_code_execution`, `text_editor_code_execution` - -**MCP Tool Use**: -```json -{"type": "mcp_tool_use", "id": "...", "name": "custom_tool", "server_name": "my-mcp-server", "input": {...}} -``` - -## Context Pruning Considerations - -1. **Tool correlation**: Uses `tool_use_id` (not `tool_call_id`) -2. **Tool results in user messages**: Unlike OpenAI, tool results are `content` parts in user messages -3. **Message merging**: Consecutive user messages are merged; consecutive assistant messages are merged -4. **Cache breakpoints**: Preserve `cache_control` markers when possible (max 4) -5. **Thinking blocks**: Have signatures for verification; handle with care -6. **Paired pruning**: `tool_use` and corresponding `tool_result` must be pruned together diff --git a/docs/providers/aws-bedrock.md b/docs/providers/aws-bedrock.md deleted file mode 100644 index f1c4479..0000000 --- a/docs/providers/aws-bedrock.md +++ /dev/null @@ -1,287 +0,0 @@ -# AWS Bedrock API Format - -AWS Bedrock uses the Converse API with unique content block types and caching via `cachePoint`. 
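Before the detailed sections below, here is a minimal sketch of the caching mechanic that distinguishes Bedrock: `cachePoint` blocks sit between content blocks instead of being attached to them the way Anthropic's `cache_control` is. The helper is illustrative only, not part of the AI SDK.

```typescript
// Hypothetical helper: build a Converse `system` array whose trailing
// cachePoint marks everything before it as cacheable.
type BedrockSystemBlock =
  | { text: string }
  | { cachePoint: { type: "default" } };

function systemWithCachePoint(text: string): BedrockSystemBlock[] {
  return [{ text }, { cachePoint: { type: "default" } }];
}
```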
- -## Sources - -- **AI SDK**: `packages/amazon-bedrock/src/convert-to-bedrock-chat-messages.ts`, `packages/amazon-bedrock/src/bedrock-chat-language-model.ts` -- **OpenCode Transform**: `src/provider/transform.ts` (cachePoint insertion) -- **Official Docs**: https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_Converse.html - -## Request Structure - -```json -{ - "system": [ - {"text": "System message"}, - {"cachePoint": {"type": "default"}} - ], - "messages": [ - {"role": "user", "content": [...]}, - {"role": "assistant", "content": [...]} - ], - "inferenceConfig": { - "maxTokens": 4096, - "temperature": 0.7, - "topP": 0.9, - "topK": 50, - "stopSequences": ["END"] - }, - "toolConfig": { - "tools": [...], - "toolChoice": {"auto": {}} - }, - "additionalModelRequestFields": { - "thinking": {"type": "enabled", "budget_tokens": 10000} - } -} -``` - -## Key Differences from OpenAI - -| Feature | OpenAI | Bedrock | -|---------|--------|--------| -| System message | In messages | Top-level `system` array | -| Tool calls | `tool_calls` array | `toolUse` content block | -| Tool results | `role: "tool"` | `toolResult` in user content | -| Tool call ID | `tool_call_id` | `toolUseId` | -| Caching | Not available | `cachePoint` blocks | - -## Message Roles - -Only **two roles**: `user` and `assistant`. Tool results go in user messages. - -## Content Block Types - -### Text Block -```json -{"text": "Hello, how can I help?"} -``` - -### Image Block -```json -{ - "image": { - "format": "jpeg", - "source": {"bytes": ""} - } -} -``` -Formats: `jpeg`, `png`, `gif`, `webp` - -### Document Block -```json -{ - "document": { - "format": "pdf", - "name": "document-1", - "source": {"bytes": ""}, - "citations": {"enabled": true} - } -} -``` -Formats: `pdf`, `csv`, `doc`, `docx`, `xls`, `xlsx`, `html`, `txt`, `md` - -### Tool Use Block (Assistant calling tool) -```json -{ - "toolUse": { - "toolUseId": "tool_call_123", - "name": "get_weather", - "input": {"city": "Seattle"} - } -} -``` - -### Tool Result Block (User providing result) -```json -{ - "toolResult": { - "toolUseId": "tool_call_123", - "content": [ - {"text": "Temperature: 72F"}, - {"image": {"format": "png", "source": {"bytes": "..."}}} - ] - } -} -``` - -### Reasoning Block (Anthropic models) -```json -{ - "reasoningContent": { - "reasoningText": { - "text": "Let me think through this...", - "signature": "" - } - } -} -``` - -## Thinking/Reasoning (Anthropic Models via Bedrock) - -### Request Configuration -```json -{ - "additionalModelRequestFields": { - "thinking": { - "type": "enabled", - "budget_tokens": 10000 - } - } -} -``` - -**Note**: Bedrock uses `reasoningConfig` in the SDK which gets transformed to Anthropic's `thinking` format in `additionalModelRequestFields`. 
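A hedged sketch of that mapping is shown below; the helper is hypothetical and does not mirror the SDK's actual internals.

```typescript
// Hypothetical helper: translate a generic reasoning budget into the
// Anthropic-style `thinking` payload that Bedrock expects.
function toBedrockThinking(budgetTokens: number) {
  return {
    additionalModelRequestFields: {
      // The minimum accepted budget is 1,024 tokens.
      thinking: { type: "enabled", budget_tokens: Math.max(budgetTokens, 1024) },
    },
  };
}
```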
- -**Parameters:** -- `type`: `"enabled"` or `"disabled"` -- `budget_tokens`: Token budget for thinking (minimum 1024) - -### Response Content Blocks - -**Reasoning Text Block** (visible reasoning): -```json -{ - "reasoningContent": { - "reasoningText": { - "text": "Let me analyze this step by step...", - "signature": "cryptographic_signature_for_verification" - } - } -} -``` - -**Redacted Reasoning Block** (hidden reasoning): -```json -{ - "reasoningContent": { - "redactedReasoning": { - "data": "encrypted_base64_redacted_content" - } - } -} -``` - -### SDK Conversion -The AI SDK converts Bedrock's reasoning blocks to unified format: -```typescript -// Bedrock response -{reasoningContent: {reasoningText: {text: "...", signature: "..."}}} - -// Converted to SDK format -{type: "reasoning", text: "...", signature: "..."} - -// Redacted version -{reasoningContent: {redactedReasoning: {data: "..."}}} - -// Converted to SDK format -{type: "redacted-reasoning", data: "..."} -``` - -### Context Pruning for Reasoning -- **Signatures are cryptographic** - preserve for verification -- **Redacted reasoning** contains encrypted content that cannot be inspected -- Reasoning blocks appear in assistant message content -- Consider reasoning as important but potentially large context - -### Cache Point -```json -{"cachePoint": {"type": "default"}} -``` - -## Caching Mechanism - -Cache points can be inserted at: -1. In system messages - After each system message -2. In user message content - After content blocks -3. In assistant message content - After content blocks -4. In tool configuration - After tool definitions - -## Tool Definition - -```json -{ - "tools": [ - { - "toolSpec": { - "name": "get_weather", - "description": "Get weather for a city", - "inputSchema": { - "json": { - "type": "object", - "properties": {"city": {"type": "string"}}, - "required": ["city"] - } - } - } - }, - {"cachePoint": {"type": "default"}} - ], - "toolChoice": {"auto": {}} -} -``` - -### Tool Choice Options -- `{"auto": {}}` - Model decides -- `{"any": {}}` - Force tool use (maps to "required") -- `{"tool": {"name": "tool_name"}}` - Force specific tool - -## Complete Example - -```json -{ - "system": [ - {"text": "You are a helpful assistant."}, - {"cachePoint": {"type": "default"}} - ], - "messages": [ - { - "role": "user", - "content": [{"text": "What's the weather in Seattle?"}] - }, - { - "role": "assistant", - "content": [{ - "toolUse": { - "toolUseId": "call_001", - "name": "get_weather", - "input": {"city": "Seattle"} - } - }] - }, - { - "role": "user", - "content": [ - { - "toolResult": { - "toolUseId": "call_001", - "content": [{"text": "{\"temperature\": 72, \"condition\": \"sunny\"}"}] - } - }, - {"cachePoint": {"type": "default"}} - ] - } - ], - "toolConfig": { - "tools": [{"toolSpec": {"name": "get_weather", "description": "Get weather", "inputSchema": {"json": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}}}}], - "toolChoice": {"auto": {}} - } -} -``` - -## Unique Behaviors - -1. **Trailing whitespace trimming**: Last text block in assistant messages is trimmed -2. **Empty text blocks skipped**: Whitespace-only text blocks are filtered -3. **Temperature clamping**: Clamped to [0, 1] range -4. **Tool content filtering**: If no tools available, tool content is removed with warning - -## Context Pruning Considerations - -1. **Tool correlation**: Uses `toolUseId` for correlation -2. **Tool results in user messages**: `toolResult` blocks are in user message content -3. 
**Message grouping**: Consecutive same-role messages are merged -4. **Cache points**: Preserve `cachePoint` markers when beneficial -5. **Paired pruning**: `toolUse` and corresponding `toolResult` must be pruned together -6. **System first**: System messages must come before user/assistant messages diff --git a/docs/providers/cohere.md b/docs/providers/cohere.md deleted file mode 100644 index a1927fb..0000000 --- a/docs/providers/cohere.md +++ /dev/null @@ -1,282 +0,0 @@ -# Cohere API Format - -Cohere uses a chat-based API with unique features like built-in RAG via `documents` and citations. - -## Request Structure - -```json -{ - "model": "command-r-plus", - "messages": [...], - "max_tokens": 4096, - "temperature": 0.7, - "p": 0.9, - "k": 40, - "frequency_penalty": 0.0, - "presence_penalty": 0.0, - "seed": 12345, - "stop_sequences": ["END"], - "response_format": {"type": "json_object"}, - "tools": [...], - "tool_choice": "REQUIRED", - "documents": [...], - "thinking": {"type": "enabled", "token_budget": 2048} -} -``` - -## Key Differences from OpenAI - -| Feature | OpenAI | Cohere | -|---------|--------|-------| -| Top-p parameter | `top_p` | `p` | -| Top-k parameter | `top_k` | `k` | -| Tool choice required | `"required"` | `"REQUIRED"` (uppercase) | -| RAG | Not built-in | `documents` array | -| Citations | Not built-in | Automatic with documents | - -## Message Formats - -### System Message -```json -{"role": "system", "content": "You are a helpful assistant."} -``` - -### User Message (text only) -```json -{"role": "user", "content": "What is the weather today?"} -``` -**Note**: Files/documents are extracted to top-level `documents` array for RAG. - -### Assistant Message -```json -{ - "role": "assistant", - "content": "The weather is sunny.", - "tool_plan": undefined, - "tool_calls": undefined -} -``` - -### Assistant Message with Tool Calls -```json -{ - "role": "assistant", - "content": undefined, - "tool_plan": undefined, - "tool_calls": [{ - "id": "call_abc123", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{\"location\": \"San Francisco\"}" - } - }] -} -``` -**Key quirk**: When `tool_calls` present, `content` is `undefined`. - -### Tool Result Message -```json -{ - "role": "tool", - "tool_call_id": "call_abc123", - "content": "{\"temperature\": 72, \"conditions\": \"sunny\"}" -} -``` - -## Tool Definition - -```json -{ - "tools": [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get weather for a location", - "parameters": { - "type": "object", - "properties": {"location": {"type": "string"}}, - "required": ["location"] - } - } - }], - "tool_choice": "REQUIRED" -} -``` - -### Tool Choice Values (UPPERCASE) -- `undefined` - Auto (model decides) -- `"NONE"` - Disable tool use -- `"REQUIRED"` - Force tool use - -**Note**: To force a specific tool, filter `tools` array and set `tool_choice: "REQUIRED"`. 
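A hedged sketch of that workaround follows; the helper and the trimmed-down tool type are illustrative, not Cohere SDK APIs.

```typescript
// Hypothetical helper: Cohere has no "force this specific tool" option,
// so narrow the tools array and require tool use instead.
type CohereTool = { type: "function"; function: { name: string } };

function forceCohereTool(tools: CohereTool[], name: string) {
  return {
    tools: tools.filter((t) => t.function.name === name),
    tool_choice: "REQUIRED" as const,
  };
}
```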
- -## RAG via Documents - -```json -{ - "documents": [ - { - "data": { - "text": "Document content here", - "title": "Optional Title" - } - } - ] -} -``` - -## Response Structure - -```json -{ - "generation_id": "abc-123", - "message": { - "role": "assistant", - "content": [ - {"type": "text", "text": "Response here."}, - {"type": "thinking", "thinking": "Reasoning..."} - ], - "tool_plan": "I will call the API", - "tool_calls": [...], - "citations": [{ - "start": 0, - "end": 10, - "text": "cited text", - "sources": [{"type": "document", "id": "doc1", "document": {...}}] - }] - }, - "finish_reason": "COMPLETE", - "usage": {...} -} -``` - -**Note**: Response `content` is an **array** of typed objects (unlike request which uses string). - -## Unique Features - -1. **Thinking mode**: Native reasoning via `thinking` config, returns `{"type": "thinking"}` blocks -2. **Citations**: Automatic source citations when using `documents` -3. **Tool plan**: `tool_plan` field explains tool usage reasoning -4. **Null arguments**: May return `"null"` for parameterless tools (normalize to `"{}"`) - -## Thinking/Reasoning - -### Request Configuration -```json -{ - "thinking": { - "type": "enabled", - "token_budget": 2048 - } -} -``` - -**Parameters:** -- `type`: `"enabled"` or `"disabled"` -- `token_budget`: Token budget for thinking - -### Response Content Blocks - -**Thinking Block** (in response content array): -```json -{ - "type": "thinking", - "thinking": "Let me reason through this problem..." -} -``` - -**Note**: Unlike Mistral, Cohere's `thinking` field is a **string**, not an array. - -### Response Structure with Thinking -```json -{ - "message": { - "role": "assistant", - "content": [ - {"type": "thinking", "thinking": "First, I need to consider..."}, - {"type": "text", "text": "Based on my analysis..."} - ] - } -} -``` - -### Streaming Events for Thinking -```json -// content-start (thinking) -{"type": "content-start", "index": 0, "delta": {"message": {"content": {"type": "thinking", "thinking": ""}}}} - -// content-delta (thinking) -{"type": "content-delta", "index": 0, "delta": {"message": {"content": {"thinking": "reasoning chunk..."}}}} -``` - -### SDK Conversion -The AI SDK converts Cohere's thinking blocks to unified format: -```typescript -// Cohere response content -{type: "thinking", thinking: "..."} - -// Converted to SDK format -{type: "reasoning", text: "..."} -``` - -### Context Pruning for Thinking -- Thinking blocks appear in response `content` array -- No signatures or encryption - content is plaintext string -- Consider thinking as important context but potentially large -- Thinking appears before text content in the response - -## Complete Example - -```json -{ - "model": "command-r-plus", - "messages": [ - {"role": "system", "content": "You are a weather assistant."}, - {"role": "user", "content": "Weather in Paris?"}, - { - "role": "assistant", - "content": undefined, - "tool_plan": undefined, - "tool_calls": [{ - "id": "call_001", - "type": "function", - "function": {"name": "get_weather", "arguments": "{\"location\":\"Paris\"}"} - }] - }, - { - "role": "tool", - "tool_call_id": "call_001", - "content": "{\"temperature\":18,\"conditions\":\"cloudy\"}" - } - ], - "tools": [{ - "type": "function", - "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}} - }], - "max_tokens": 1024, - "temperature": 0.7 -} -``` - -## Streaming Events - -| Event | Purpose | 
-|-------|--------| -| `message-start` | Start of response | -| `content-start` | Start of text/thinking block | -| `content-delta` | Text or thinking chunk | -| `tool-plan-delta` | Tool planning reasoning | -| `tool-call-start` | Start of tool call | -| `tool-call-delta` | Tool call arguments chunk | -| `message-end` | Final with `finish_reason` and `usage` | - -## Context Pruning Considerations - -1. **Tool correlation**: Uses `tool_call_id` like OpenAI -2. **Separate tool results**: Each result is a separate message (not grouped) -3. **Content exclusivity**: When `tool_calls` present, `content` is `undefined` -4. **Response vs request format**: Response content is array, request is string -5. **Uppercase tool choice**: Use `"NONE"` and `"REQUIRED"` (not lowercase) -6. **Paired pruning**: Tool calls and results must be pruned together -7. **Documents top-level**: RAG documents are separate from messages diff --git a/docs/providers/google-gemini.md b/docs/providers/google-gemini.md deleted file mode 100644 index 8ab69b1..0000000 --- a/docs/providers/google-gemini.md +++ /dev/null @@ -1,255 +0,0 @@ -# Google Gemini API Format - -Google's Generative AI (Gemini) uses a unique format with **position-based tool correlation** (no tool call IDs). - -## Sources - -- **AI SDK**: `packages/google/src/convert-to-google-generative-ai-messages.ts`, `packages/google/src/google-generative-ai-language-model.ts` -- **Schema Conversion**: `packages/google/src/convert-json-schema-to-openapi-schema.ts` -- **OpenCode Transform**: `src/provider/transform.ts` (schema integer→string enum conversion) -- **Official Docs**: https://ai.google.dev/api/rest/v1/models/generateContent - -## Request Structure - -```json -{ - "systemInstruction": { - "parts": [{"text": "System prompt text"}] - }, - "contents": [ - {"role": "user", "parts": [...]}, - {"role": "model", "parts": [...]} - ], - "generationConfig": { - "maxOutputTokens": 1024, - "temperature": 0.7, - "topK": 40, - "topP": 0.95, - "responseMimeType": "application/json", - "responseSchema": {...} - }, - "tools": [...], - "toolConfig": { - "functionCallingConfig": {"mode": "AUTO"} - } -} -``` - -## Key Differences from OpenAI - -| Feature | OpenAI | Gemini | -|---------|--------|--------| -| Message container | `messages[]` | `contents[]` | -| System message | In messages | Top-level `systemInstruction` | -| Roles | system/user/assistant/tool | user/model only | -| Tool call IDs | ID-based correlation | **POSITION-BASED** | -| Tool results | Separate `tool` role | In `user` message as `functionResponse` | - -## Message Roles - -Only **two roles**: `user` and `model` - -| SDK Role | Gemini Role | -|----------|-------------| -| `system` | `systemInstruction` (top-level) | -| `user` | `user` | -| `assistant` | `model` | -| `tool` (results) | `user` (with `functionResponse`) | - -## Content Parts - -### Text Part -```json -{"text": "Hello, how are you?"} -``` - -### Thinking Part -```json -{"text": "Let me think...", "thought": true, "thoughtSignature": "sig-for-caching"} -``` - -## Thinking/Reasoning - -### Request Configuration -```json -{ - "generationConfig": { - "thinkingConfig": { - "thinkingBudget": 8192, - "includeThoughts": true - } - } -} -``` - -**Parameters:** -- `thinkingBudget`: Token budget for thinking -- `includeThoughts`: Whether to include thinking in response (default true) - -### Response Content Parts - -**Thinking Part** (in model message): -```json -{ - "text": "Let me reason through this problem...", - "thought": true, - "thoughtSignature": 
"signature_for_caching" -} -``` - -**Key fields:** -- `thought: true` - Marks this part as reasoning content -- `thoughtSignature` - Optional signature for caching/verification - -### Usage Tracking -```json -{ - "usageMetadata": { - "promptTokenCount": 100, - "candidatesTokenCount": 200, - "thoughtsTokenCount": 150 - } -} -``` - -### SDK Conversion -The AI SDK converts Gemini's thought parts to unified `reasoning` type: -```typescript -// Gemini response part -{text: "...", thought: true, thoughtSignature: "..."} - -// Converted to SDK format -{type: "reasoning", text: "...", signature: "..."} -``` - -### Context Pruning for Thinking -- **Thought parts are regular text parts** with `thought: true` flag -- **thoughtSignature** should be preserved if present (used for caching) -- Thinking parts appear in `model` role messages -- Consider thinking as important but potentially large context - -## Image (inline base64) -```json -{"inlineData": {"mimeType": "image/jpeg", "data": "base64-encoded-data"}} -``` - -### Image (file URI) -```json -{"fileData": {"mimeType": "image/png", "fileUri": "gs://bucket/path/image.png"}} -``` - -### Function Call (tool invocation) -```json -{"functionCall": {"name": "get_weather", "args": {"location": "Tokyo"}}} -``` - -### Function Response (tool result) -```json -{"functionResponse": {"name": "get_weather", "response": {"name": "get_weather", "content": "{\"temp\": 22}"}}} -``` - -## CRITICAL: Position-Based Tool Correlation - -**Gemini does NOT use tool call IDs.** Tool results are correlated by **position/order**. - -### Tool Call (model message) -```json -{ - "role": "model", - "parts": [ - {"functionCall": {"name": "get_weather", "args": {"location": "SF"}}}, - {"functionCall": {"name": "get_time", "args": {"timezone": "PST"}}} - ] -} -``` - -### Tool Results (user message) - ORDER MUST MATCH -```json -{ - "role": "user", - "parts": [ - {"functionResponse": {"name": "get_weather", "response": {"name": "get_weather", "content": "72F"}}}, - {"functionResponse": {"name": "get_time", "response": {"name": "get_time", "content": "2:30 PM"}}} - ] -} -``` - -## Tool Definition - -```json -{ - "tools": [{ - "functionDeclarations": [{ - "name": "get_weather", - "description": "Get the current weather", - "parameters": { - "type": "object", - "properties": {"location": {"type": "string"}}, - "required": ["location"] - } - }] - }], - "toolConfig": { - "functionCallingConfig": {"mode": "AUTO"} - } -} -``` - -### Tool Config Modes -- `AUTO` - Model decides -- `NONE` - Disable tools -- `ANY` - Force tool use -- `ANY` + `allowedFunctionNames` - Force specific tools - -### Provider-Defined Tools -```json -{"googleSearch": {}}, -{"urlContext": {}}, -{"codeExecution": {}} -``` - -## Schema Conversion (JSON Schema to OpenAPI) - -Gemini requires **OpenAPI 3.0 schema format**: - -| JSON Schema | OpenAPI | -|-------------|---------| -| `const: value` | `enum: [value]` | -| `type: ["string", "null"]` | `anyOf` + `nullable: true` | - -## Gemma Model Handling - -For `gemma-*` models, system instructions are **prepended to first user message**: -```json -{ - "contents": [{ - "role": "user", - "parts": [{"text": "System prompt\n\nActual user message"}] - }] -} -``` - -## Complete Example - -```json -{ - "systemInstruction": {"parts": [{"text": "You are a weather assistant."}]}, - "contents": [ - {"role": "user", "parts": [{"text": "Weather in Tokyo?"}]}, - {"role": "model", "parts": [{"functionCall": {"name": "get_weather", "args": {"location": "Tokyo"}}}]}, - {"role": "user", 
"parts": [{"functionResponse": {"name": "get_weather", "response": {"name": "get_weather", "content": "22C cloudy"}}}]}, - {"role": "model", "parts": [{"text": "Tokyo is 22C and cloudy."}]} - ], - "tools": [{"functionDeclarations": [{"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}}]}] -} -``` - -## Context Pruning Considerations - -1. **POSITION-BASED CORRELATION**: Tool calls and results must be pruned TOGETHER and order preserved -2. **No IDs**: Cannot selectively prune individual tool results - entire pairs must go -3. **System separate**: `systemInstruction` is top-level, typically should NOT be pruned -4. **Alternation required**: Must maintain alternating `user`/`model` pattern -5. **Multi-part messages**: Each message can have multiple parts; prune entire messages, not parts -6. **Tool results are user role**: `functionResponse` parts are in `user` messages -7. **thoughtSignature**: Used for caching reasoning; preserve if present diff --git a/docs/providers/mistral.md b/docs/providers/mistral.md deleted file mode 100644 index 3767830..0000000 --- a/docs/providers/mistral.md +++ /dev/null @@ -1,226 +0,0 @@ -# Mistral API Format - -Mistral uses an OpenAI-compatible format but with **strict tool call ID requirements**. - -## Sources - -- **AI SDK**: `packages/mistral/src/convert-to-mistral-chat-messages.ts`, `packages/mistral/src/mistral-chat-language-model.ts` -- **OpenCode Transform**: `src/provider/transform.ts` (9-char alphanumeric ID normalization) -- **Official Docs**: https://docs.mistral.ai/api/#tag/chat - -## Request Structure - -```json -{ - "model": "mistral-large-latest", - "messages": [...], - "max_tokens": 4096, - "temperature": 0.7, - "top_p": 1.0, - "random_seed": 42, - "safe_prompt": false, - "stream": false, - "response_format": {"type": "json_object"}, - "tools": [...], - "tool_choice": "auto" -} -``` - -## CRITICAL: Tool Call ID Requirement - -**Mistral requires tool call IDs to be exactly 9 alphanumeric characters.** - -| Valid | Invalid | -|-------|--------| -| `abc123xyz` | `call_abc123` (too long, has underscore) | -| `A1B2C3D4E` | `12345` (too short) | -| `def456uvw` | `abc-123-xy` (has hyphens) | - -## Key Differences from OpenAI - -| Feature | OpenAI | Mistral | -|---------|--------|--------| -| Tool call ID format | `call_*` (variable) | **Exactly 9 alphanumeric** | -| Tool choice `required` | `"required"` | `"any"` | -| User content | String or array | **Always array** | -| Assistant `prefix` | Not supported | Supported | -| Stop sequences | Supported | Not supported | -| Frequency/presence penalty | Supported | Not supported | - -## Message Formats - -### System Message -```json -{"role": "system", "content": "You are a helpful assistant."} -``` - -### User Message (always array) -```json -{ - "role": "user", - "content": [ - {"type": "text", "text": "What's in this image?"}, - {"type": "image_url", "image_url": "https://example.com/image.jpg"}, - {"type": "document_url", "document_url": "data:application/pdf;base64,..."} - ] -} -``` - -### Assistant Message -```json -{ - "role": "assistant", - "content": "Here's the analysis...", - "prefix": true, - "tool_calls": [ - { - "id": "abc123xyz", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{\"location\":\"San Francisco\"}" - } - } - ] -} -``` - -### Tool Result Message -```json -{ - "role": "tool", - "name": "get_weather", - "tool_call_id": "abc123xyz", - "content": 
"{\"temperature\": 72, \"condition\": \"sunny\"}" -} -``` - -## Tool Definition - -```json -{ - "tools": [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get weather for a location", - "parameters": { - "type": "object", - "properties": {"location": {"type": "string"}}, - "required": ["location"] - }, - "strict": true - } - }], - "tool_choice": "auto" -} -``` - -### Tool Choice Options -- `"auto"` - Model decides -- `"none"` - Disable tool calling -- `"any"` - Force tool use (NOT `"required"`) -- `{"type": "function", "function": {"name": "..."}}` - Force specific tool - -## Unique Features - -1. **Prefix flag**: `prefix: true` on assistant messages for continuation mode -2. **PDF support**: Via `document_url` content type with base64 -3. **Thinking mode**: Returns `{"type": "thinking", "thinking": [...]}` content blocks - -## Thinking/Reasoning (Magistral Models) - -### Response Content Structure - -Mistral's reasoning models (Magistral) return thinking in the response content: - -**Thinking Block** (in assistant message content): -```json -{ - "type": "thinking", - "thinking": [ - {"type": "text", "text": "Let me reason through this..."} - ] -} -``` - -**Note**: The `thinking` field is an **array** of text parts, not a string. - -### Streaming Response -When streaming, content can be a string OR array: -```json -{ - "choices": [{ - "delta": { - "role": "assistant", - "content": [ - {"type": "thinking", "thinking": [{"type": "text", "text": "reasoning..."}]}, - {"type": "text", "text": "final response"} - ] - } - }] -} -``` - -### SDK Conversion -The AI SDK extracts and converts Mistral's thinking blocks: -```typescript -// Mistral response content -{type: "thinking", thinking: [{type: "text", text: "..."}]} - -// Converted to SDK format -{type: "reasoning", text: "..."} -``` - -### Context Pruning for Thinking -- Thinking blocks appear as content items in assistant messages -- The nested `thinking` array contains text parts to concatenate -- No signatures or encryption - content is plaintext -- Consider thinking as important context but potentially large - -## Complete Example - -```json -{ - "model": "mistral-large-latest", - "messages": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": [{"type": "text", "text": "Weather in NYC?"}]}, - { - "role": "assistant", - "content": "", - "tool_calls": [{ - "id": "abc123xyz", - "type": "function", - "function": {"name": "get_weather", "arguments": "{\"location\":\"New York City\"}"} - }] - }, - { - "role": "tool", - "name": "get_weather", - "tool_call_id": "abc123xyz", - "content": "{\"temperature\":72,\"condition\":\"sunny\"}" - } - ], - "tools": [{ - "type": "function", - "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}} - }], - "tool_choice": "auto" -} -``` - -## Unsupported Features - -- `topK` -- `frequencyPenalty` -- `presencePenalty` -- `stopSequences` - -## Context Pruning Considerations - -1. **9-char alphanumeric IDs**: When generating synthetic tool calls, IDs must be exactly 9 alphanumeric chars -2. **Tool correlation**: Uses `tool_call_id` like OpenAI -3. **User content always array**: Even single text becomes `[{"type": "text", "text": "..."}]` -4. **Tool name in result**: Tool result includes `name` field alongside `tool_call_id` -5. 
**Paired pruning**: Tool calls and results must be pruned together diff --git a/docs/providers/openai-compatible.md b/docs/providers/openai-compatible.md deleted file mode 100644 index 3406248..0000000 --- a/docs/providers/openai-compatible.md +++ /dev/null @@ -1,135 +0,0 @@ -# OpenAI-Compatible Providers - -Most providers in models.dev use the OpenAI Chat Completions format via `@ai-sdk/openai-compatible`. This document covers these providers and any provider-specific quirks. - -## Standard OpenAI Chat Completions Format - -See [openai.md](./openai.md) for the full format specification. - -### Quick Reference - -```json -{ - "model": "model-name", - "messages": [ - {"role": "system", "content": "..."}, - {"role": "user", "content": "..."}, - {"role": "assistant", "content": "...", "tool_calls": [...]}, - {"role": "tool", "tool_call_id": "...", "content": "..."} - ], - "tools": [...], - "tool_choice": "auto" -} -``` - -## Providers Using OpenAI-Compatible Format - -Based on models.dev, these providers use `@ai-sdk/openai-compatible`: - -| Provider | Base URL | Notes | -|----------|----------|-------| -| together | api.together.xyz | | -| deepseek | api.deepseek.com | | -| groq | api.groq.com | Very fast inference | -| fireworks | api.fireworks.ai | | -| hyperbolic | api.hyperbolic.xyz | | -| novita | api.novita.ai | | -| cerebras | api.cerebras.ai | | -| sambanova | api.sambanova.ai | | -| nebius | api.studio.nebius.ai | | -| chutes | api.chutes.ai | | -| openrouter | openrouter.ai | Meta-provider | -| kluster | api.kluster.ai | | -| glhf | glhf.chat | | -| scaleway | api.scaleway.ai | | -| lepton | api.lepton.ai | | -| nano-gpt | api.nano-gpt.com | | -| arcee | api.arcee.ai | | -| inference-net | api.inference.net | | -| nineteen | api.nineteen.ai | | -| targon | api.targon.ai | | -| req-ai | api.req.ai | | -| vllm | (self-hosted) | | -| ollama | localhost:11434 | Local models | -| lmstudio | localhost:1234 | Local models | -| jan | localhost:1337 | Local models | -| any-provider | (configurable) | Generic OpenAI-compatible | - -## Provider-Specific Quirks - -### OpenRouter -- Acts as a meta-provider routing to various backends -- May have different caching semantics -- Supports `cache_control` similar to Anthropic when routing to Claude - -### Groq -- Extremely fast inference -- Limited model selection -- May have stricter rate limits - -### DeepSeek -- Supports reasoning models (DeepSeek R1) -- May include thinking/reasoning in responses - -### Ollama / LM Studio / Jan -- Local inference -- No rate limits but hardware-dependent -- May not support all features (vision, tools) - -### Together AI -- Wide model selection -- Good tool support -- Supports streaming - -## Caching Considerations - -Some OpenAI-compatible providers support caching hints: - -```json -{ - "role": "user", - "content": "...", - "cache_control": {"type": "ephemeral"} -} -``` - -Supported by: -- OpenRouter (when routing to Anthropic) -- Some enterprise deployments - -## Vision Support - -Not all OpenAI-compatible providers support vision. Check model capabilities: - -```json -{ - "role": "user", - "content": [ - {"type": "text", "text": "What's in this image?"}, - {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}} - ] -} -``` - -## Tool Support - -Tool support varies by provider and model. 
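One pragmatic pattern, sketched below with a hand-maintained capability map (the map and model names are hypothetical), is to gate tool parameters per model before building the request. The common gaps such a map has to cover are listed next.

```typescript
// Hypothetical capability map; entries are illustrative, not real data.
const supportsTools: Record<string, boolean> = {
  "provider/model-with-tools": true,
  "provider/small-local-model": false,
};

function buildRequest(model: string, messages: unknown[], tools: unknown[]) {
  // Omit tool fields entirely for models that would reject them.
  return supportsTools[model]
    ? { model, messages, tools, tool_choice: "auto" }
    : { model, messages };
}
```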
Common limitations: -- Some models don't support parallel tool calls -- Some models don't support structured outputs/strict mode -- Response format (`json_object`) support varies - -## Context Pruning Considerations - -1. **Standard ID correlation**: All use `tool_call_id` for tool result correlation -2. **Consistent message format**: Messages follow OpenAI structure -3. **Feature detection**: May need to check model capabilities at runtime -4. **Cache support varies**: Not all providers honor cache hints -5. **Paired pruning**: Tool calls and results must be pruned together - -## Detection - -OpenAI-compatible requests can be detected by: -- `body.messages` array present -- Messages have `role` field with values: `system`, `user`, `assistant`, `tool` -- Tool results have `tool_call_id` field -- No special top-level fields like `contents` (Gemini) or `system` array (Bedrock/Anthropic) diff --git a/docs/providers/openai.md b/docs/providers/openai.md deleted file mode 100644 index db24be4..0000000 --- a/docs/providers/openai.md +++ /dev/null @@ -1,223 +0,0 @@ -# OpenAI API Format - -OpenAI offers two API formats: **Chat Completions** (original) and **Responses** (newer). - -## Sources - -- **AI SDK**: `packages/openai/src/chat/openai-chat-language-model.ts`, `packages/openai/src/responses/openai-responses-language-model.ts` -- **AI SDK OpenAI-Compatible**: `packages/openai-compatible/src/chat/openai-compatible-chat-language-model.ts` -- **Official Docs**: https://platform.openai.com/docs/api-reference/chat -- **Responses API**: https://platform.openai.com/docs/api-reference/responses - -## Chat Completions API (`/chat/completions`) - -### Request Structure - -```json -{ - "model": "gpt-4o", - "messages": [...], - "tools": [...], - "tool_choice": "auto" | "none" | "required" | {"type": "function", "function": {"name": "..."}}, - "max_tokens": 4096, - "temperature": 0.7, - "response_format": {"type": "json_object"} | {"type": "json_schema", "json_schema": {...}}, - "stream": false -} -``` - -### Message Roles - -| Role | Description | -|------|-------------| -| `system` | System instructions | -| `user` | User input | -| `assistant` | Model responses | -| `tool` | Tool/function results | - -### Message Formats - -**System Message:** -```json -{"role": "system", "content": "You are a helpful assistant."} -``` - -**User Message (multimodal):** -```json -{ - "role": "user", - "content": [ - {"type": "text", "text": "What's in this image?"}, - {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg", "detail": "auto"}}, - {"type": "file", "file": {"file_id": "file-abc123"}} - ] -} -``` - -**Assistant Message with Tool Calls:** -```json -{ - "role": "assistant", - "content": null, - "tool_calls": [ - { - "id": "call_abc123", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{\"location\": \"San Francisco\"}" - } - } - ] -} -``` - -**Tool Result Message:** -```json -{ - "role": "tool", - "tool_call_id": "call_abc123", - "content": "{\"temperature\": 72, \"condition\": \"sunny\"}" -} -``` - -### Tool Definition - -```json -{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string"} - }, - "required": ["location"] - }, - "strict": true - } -} -``` - ---- - -## Responses API (`/responses`) - -### Key Differences from Chat Completions - -| Feature | Chat Completions | Responses API | 
-|---------|-----------------|---------------| -| Message array | `messages` | `input` | -| Tool call ID field | `tool_call_id` | `call_id` | -| System message | In messages | `instructions` field or in input | -| Token limit | `max_tokens` | `max_output_tokens` | -| Reasoning | Not supported | `reasoning` config | - -### Request Structure - -```json -{ - "model": "gpt-4o", - "input": [...], - "instructions": "Optional system instructions", - "tools": [...], - "tool_choice": "auto" | "none" | "required" | {"type": "function", "name": "..."}, - "max_output_tokens": 4096, - "previous_response_id": "resp_abc123", - "reasoning": { - "effort": "medium", - "summary": "auto" - }, - "stream": false -} -``` - -## Thinking/Reasoning (Responses API only) - -### Request Configuration -```json -{ - "reasoning": { - "effort": "low" | "medium" | "high", - "summary": "auto" | "concise" | "detailed" - } -} -``` - -**Parameters:** -- `effort`: How much reasoning effort (affects token usage) -- `summary`: How to summarize reasoning in response - -**Constraints when reasoning enabled:** -- `temperature` is **NOT supported** (use default) -- `topP` is **NOT supported** -- Only available on reasoning models (o1, o3, etc.) - -### Response Output Items - -**Reasoning Item** (in output array): -```json -{ - "type": "reasoning", - "id": "reasoning_abc123", - "encrypted_content": "encrypted_base64_reasoning_content", - "summary": [ - {"type": "summary_text", "text": "I analyzed the problem by..."} - ] -} -``` - -**Key fields:** -- `encrypted_content`: The actual reasoning is encrypted/hidden -- `summary`: Optional human-readable summary of reasoning - -### Usage Tracking -```json -{ - "usage": { - "input_tokens": 100, - "output_tokens": 200, - "output_tokens_details": { - "reasoning_tokens": 150 - } - } -} -``` - -### SDK Conversion -The AI SDK handles reasoning items: -```typescript -// OpenAI Responses output -{type: "reasoning", id: "...", encrypted_content: "...", summary: [...]} - -// Kept as reasoning type in SDK -{type: "reasoning", reasoningId: "...", text: "summary text"} -``` - -### Context Pruning for Reasoning -- **Encrypted content** cannot be inspected or modified -- **Summaries** provide readable insight into reasoning -- Reasoning items appear as separate items in `output` array -- `reasoning_tokens` in usage helps track cost - ---- - -## Context Pruning Considerations - -1. **Tool correlation**: Both formats use ID-based correlation (`tool_call_id` or `call_id`) -2. **Paired pruning**: Tool calls and their results should be pruned together -3. **Message roles**: 4 distinct roles in Chat Completions; Responses API uses item types -4. **Content types**: User content is `type: "text"/"image_url"` in Chat, `type: "input_text"/"input_image"` in Responses -5. **Assistant content**: String in Chat Completions, `output_text` array in Responses - -## OpenAI-Compatible Providers - -Most providers in models.dev use the OpenAI Chat Completions format via `@ai-sdk/openai-compatible`: -- together, deepseek, groq, fireworks, hyperbolic, novita, cerebras, sambanova, etc. - -These providers accept the same request format but may have different: -- Supported models -- Rate limits -- Feature availability (vision, tool use, etc.) diff --git a/lib/prompts/prune-nudge.txt b/lib/prompts/prune-nudge.txt index ed2078a..ef84d35 100644 --- a/lib/prompts/prune-nudge.txt +++ b/lib/prompts/prune-nudge.txt @@ -2,9 +2,9 @@ **CRITICAL CONTEXT WARNING:** Your context window is filling with tool outputs. 
Strict adherence to context hygiene is required. **Immediate Actions Required:** -1. **Garbage Collect:** If you read files or ran commands that yielded no value, prune them NOW. Do not summarize them. -2. **Task Cleanup:** If a sub-task is complete, prune the tools used. -3. **Consolidate:** If you are holding valuable raw data, you *must* distill the insights into your narrative and prune the raw entry. +1. **Task Completion:** If a sub-task is complete, prune the tools used. No distillation. +2. **Noise Removal:** If you read files or ran commands that yielded no value, prune them NOW. No distillation. +3. **Consolidation:** If you are holding valuable raw data, you *must* distill the insights into `metadata.distillation` and prune the raw entry. **Protocol:** You should prioritize this cleanup, but do not interrupt a critical atomic operation if one is in progress. Once the immediate step is done, you must prune. diff --git a/lib/prompts/prune-system-prompt.txt b/lib/prompts/prune-system-prompt.txt index 7307884..397de09 100644 --- a/lib/prompts/prune-system-prompt.txt +++ b/lib/prompts/prune-system-prompt.txt @@ -8,14 +8,14 @@ PRUNE METHODICALLY - CONSOLIDATE YOUR ACTIONS Every tool call adds to your context debt. You MUST pay this down regularly and stay on top of context accumulation by pruning. Consolidate your prunes for efficiency; it is rarely worth pruning a single tiny tool output unless it is pure noise. Evaluate what SHOULD be pruned before jumping the gun. WHEN TO PRUNE? THE THREE SCENARIOS TO CONSIDER -1. TASK COMPLETION: When work is done, quietly prune the tools that aren't needed anymore -2. NOISE REMOVAL: If outputs are irrelevant, unhelpful, or superseded by newer info, prune IMMEDIATELY. No distillation - gun it down -3. CONTEXT CONSOLIDATION: When pruning valuable context to the task at hand, you MUST ALWAYS distill key findings into your narrative BEFORE pruning. Be surgical and strategic in what you extract. THINK: high signal, low noise +1. TASK COMPLETION: When work is done, quietly prune the tools that aren't needed anymore. No distillation. +2. NOISE REMOVAL: If outputs are irrelevant, unhelpful, or superseded by newer info, prune. No distillation. +3. CONTEXT CONSOLIDATION: When pruning context that is valuable to the task at hand, you MUST ALWAYS provide the key findings in the `metadata.distillation` parameter of the `prune` tool (as an object). Be surgical and strategic in what you extract. THINK: high signal, low noise You WILL use the `prune` tool when ANY of these are true: - Task or sub-task is complete - You are about to start a new phase of work -- You have distilled enough information in your messages to prune related tools +- You have gathered enough information to prune related tools and preserve their value in the `metadata.distillation` parameter - Context contains tool outputs that are unhelpful, noisy, or made obsolete by newer outputs - Write or edit operations are complete (pruning removes the large input content) @@ -26,7 +26,7 @@ You MUST NOT prune when: Pruning that forces you to re-call the same tool later is a net loss. Only prune when you're confident the information won't be needed again. NOTES -When in doubt, keep it. Prune frequently yet remain strategic and consolidate your actions. +When in doubt, keep it. Consolidate your actions and aim for high-impact prunes that significantly reduce context size. FAILURE TO PRUNE will result in context leakage and DEGRADED PERFORMANCE.
There may be tools in the session context that do not appear in the list; this is expected. You can ONLY prune what you see in the prunable list. diff --git a/lib/prompts/prune-tool-spec.txt b/lib/prompts/prune-tool-spec.txt index c11d46f..450ea9a 100644 --- a/lib/prompts/prune-tool-spec.txt +++ b/lib/prompts/prune-tool-spec.txt @@ -1,4 +1,4 @@ -Prunes tool outputs from context to manage conversation size and reduce noise. For `write` and `edit` tools, the input content is pruned instead of the output. +Prunes tool outputs from context to manage conversation size and reduce noise. ## IMPORTANT: The Prunable List A prunable list is injected into user messages showing the tool outputs available for pruning, whenever there are any. Each line has the format `ID: tool, parameter` (e.g., `20: read, /path/to/file.ts`). You MUST only use numeric IDs that appear in this list to select which tools to prune. @@ -7,58 +7,63 @@ A prunable list is injected into user messages ## CRITICAL: When and How to Prune -You must use this tool in three specific scenarios. The rules for distillation (summarizing findings) differ for each. **You must specify the reason as the first element of the `ids` array** to indicate which scenario applies. +You must use this tool in three specific scenarios. The rules for distillation (summarizing findings) differ for each. **You must provide a `metadata` object with a `reason` and optional `distillation`** to indicate which scenario applies. ### 1. Task Completion (Clean Up) — reason: `completion` **When:** You have successfully completed a specific unit of work (e.g., fixed a bug, wrote a file, answered a question). **Action:** Prune the tools used for that task. -**Distillation:** NOT REQUIRED. Since the task is done, the raw data is no longer needed. Simply state that the task is complete. +**Distillation:** FORBIDDEN. Do not summarize completed work. ### 2. Removing Noise (Garbage Collection) — reason: `noise` **When:** You have read files or run commands that turned out to be irrelevant, unhelpful, or outdated (meaning later tools have provided fresher, more accurate information). **Action:** Prune these specific tool outputs immediately. -**Distillation:** FORBIDDEN. Do not pollute the context by summarizing useless information. Just cut it out. +**Distillation:** FORBIDDEN. Do not summarize noise. ### 3. Context Conservation (Research & Consolidation) — reason: `consolidation` -**When:** You have gathered useful information. Prune frequently as you work (e.g., after reading a few files), rather than waiting for a "long" phase to end. +**When:** You have gathered useful information. Wait until you have several items or a few large outputs to prune, rather than doing tiny, frequent prunes. Aim for high-impact prunes that significantly reduce context size. **Action:** Convert raw data into distilled knowledge. This allows you to discard large outputs (like full file reads) while keeping only the specific parts you need (like a single function signature or constant). -**Distillation:** MANDATORY. Before pruning, you *must* explicitly summarize the key findings from *every* tool you plan to prune. - - **Extract specific value:** If you read a large file but only care about one function, record that function's details and prune the whole read. - - Narrative format: "I found X in file Y..." - - Capture all relevant details (function names, logic, constraints). - - Once distilled into your response history, the raw tool output can be safely pruned.
+**Distillation:** MANDATORY. You MUST provide the distilled findings in the `metadata.distillation` parameter of the `prune` tool (as an object). + - **Extract specific value:** If you read a large file but only care about one function, record that function's details. + - **Consolidate:** When pruning multiple tools, your distillation object MUST aggregate findings from ALL of them. Ensure you capture any information necessary to solve the current task. + - Structure: Map the `ID` from the prunable list to its distilled findings. + Example: `{ "20": { "findings": "...", "logic": "..." } }` + - Capture all relevant details (function names, logic, constraints) to ensure no signal is lost. + - Prioritize information that is essential for the immediate next steps of your plan. + - Once distilled into the `metadata` object, the raw tool output can be safely pruned. - **Know when distillation isn't enough:** If you'll need to edit a file, grep for exact strings, or reference precise syntax, keep the raw output. Distillation works for understanding; implementation often requires the original. - **Prefer keeping over re-fetching:** If uncertain whether you'll need the output again, keep it. The cost of retaining context is lower than the cost of redundant tool calls. ## Best Practices -- **Consolidate your prunes:** Don't prune a single small tool output (like a short bash command) unless it's pure noise. Wait until you have several items or a few large outputs to prune. Aim for high-impact prunes that significantly reduce context size or noise. -- **Don't wait too long:** Prune frequently to keep the context agile, but balance this with the need for consolidation. -- **Be surgical:** You can mix strategies. Prune noise without comment, while distilling useful context in the same turn. -- **Verify:** Ensure you have captured what you need before deleting useful raw data. -- **Think ahead:** Before pruning, ask: "Will I need this output for an upcoming task?" If you researched a file you'll later edit, or gathered context for implementation, do NOT prune it—even if you've distilled findings. Distillation captures *knowledge*; implementation requires *context*. +- **Strategic Consolidation:** Don't prune single small tool outputs (like short bash commands) unless they are pure noise. Instead, wait until you have several items or large outputs to perform high-impact prunes. This balances the need for an agile context with the efficiency of larger batches. +- **Think ahead:** Before pruning, ask: "Will I need this output for an upcoming task?" If you researched a file you'll later edit, or gathered context for implementation, do NOT prune it. ## Examples Assistant: [Reads 'wrong_file.ts'] This file isn't relevant to the auth system. I'll remove it to clear the context. -[Uses prune with ids: ["noise", "5"]] +[Uses prune with ids: ["5"], metadata: { "reason": "noise" }] Assistant: [Reads 5 different config files] -I have analyzed the configuration. Here is the distillation: -- 'config.ts' uses port 3000. -- 'db.ts' connects to mongo:27017. -- The other 3 files were defaults. -I have preserved the signals above, so I am now pruning the raw reads. +I'll preserve the configuration details and prune the raw reads.
+[Uses prune with ids: ["10", "11", "12", "13", "14"], metadata: { + "reason": "consolidation", + "distillation": { + "10": "uses port 3000", + "11": "connects to mongo:27017", + "12": "defines shared constants", + "13": "exports defaults", + "14": "unused fallback" + } +}] Assistant: [Runs tests, they pass] -The tests passed. The feature is verified. -[Uses prune with ids: ["completion", "20", "21"]] +The tests passed. I'll clean up now. +[Uses prune with ids: ["20", "21"], metadata: { "reason": "completion" }] @@ -69,5 +74,5 @@ I've understood the auth flow. I'll need to modify this file to add the new vali Assistant: [Edits 'auth.ts' to add validation] The edit was successful. I no longer need the raw edit content in context. -[Uses prune with ids: ["completion", "15"]] +[Uses prune with ids: ["15"], metadata: { "reason": "completion" }] diff --git a/lib/strategies/prune-tool.ts b/lib/strategies/prune-tool.ts index c8f6d38..285a88d 100644 --- a/lib/strategies/prune-tool.ts +++ b/lib/strategies/prune-tool.ts @@ -34,8 +34,14 @@ export function createPruneTool( ids: tool.schema.array( tool.schema.string() ).describe( - "First element is the reason ('completion', 'noise', 'consolidation'), followed by numeric IDs as strings to prune" + "Numeric IDs as strings to prune from the list" ), + metadata: tool.schema.object({ + reason: tool.schema.enum(["completion", "noise", "consolidation"]).describe("The reason for pruning"), + distillation: tool.schema.record(tool.schema.string(), tool.schema.any()).optional().describe( + "An object containing detailed summaries or extractions of the key findings from the tools being pruned. This is REQUIRED for 'consolidation'." + ), + }).describe("Metadata about the pruning operation."), }, async execute(args, toolCtx) { const { client, state, logger, config, workingDirectory } = ctx @@ -49,21 +55,20 @@ export function createPruneTool( return "No IDs provided. Check the list for available IDs to prune." } - // Parse reason from first element, numeric IDs from the rest - - const reason = args.ids[0]; - const validReasons = ["completion", "noise", "consolidation"] as const - if (typeof reason !== "string" || !validReasons.includes(reason as any)) { - logger.debug("Invalid pruning reason provided: " + reason) - return "No valid pruning reason found. Use 'completion', 'noise', or 'consolidation' as the first element." + if (!args.metadata || !args.metadata.reason) { + logger.debug("Prune tool called without metadata.reason: " + JSON.stringify(args)) + return "Missing metadata.reason. Provide metadata: { reason: 'completion' | 'noise' | 'consolidation' }" } - const numericToolIds: number[] = args.ids.slice(1) + const { reason, distillation } = args.metadata + + // The tool spec marks distillation as REQUIRED for 'consolidation'; enforce that here. + if (reason === "consolidation" && !distillation) { + logger.debug("Consolidation prune called without metadata.distillation: " + JSON.stringify(args)) + return "Missing metadata.distillation. Provide distilled findings when reason is 'consolidation'." + } + + const numericToolIds: number[] = args.ids .map(id => parseInt(id, 10)) .filter((n): n is number => !isNaN(n)) + if (numericToolIds.length === 0) { logger.debug("No numeric tool IDs provided for pruning, yet prune tool was called: " + JSON.stringify(args)) - return "No numeric IDs provided. 
Format: ids: [id1, id2, ...]" } // Fetch messages to calculate tokens and find current agent @@ -133,11 +138,17 @@ export function createPruneTool( saveSessionState(state, logger) .catch(err => logger.error("Failed to persist state", { error: err.message })) - return formatPruningResultForTool( + const result = formatPruningResultForTool( pruneToolIds, toolMetadata, workingDirectory ) + + // Log the distillation payload so consolidation prunes leave an audit trail. + if (distillation) { + logger.info("Distillation data received", { distillation }) + } + + return result }, }) }
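
For reviewers, here is a minimal, self-contained TypeScript sketch of the argument shape the new schema accepts, together with an illustrative guard for the consolidation rule described in the tool spec. The `PruneArgs` type and `validatePruneArgs` helper are hypothetical names introduced for illustration; they mirror, but are not part of, the `tool.schema` definition in `lib/strategies/prune-tool.ts`:

```typescript
// Hypothetical standalone model of the new prune-tool arguments (illustration only).
type PruneReason = "completion" | "noise" | "consolidation"

interface PruneArgs {
  ids: string[] // numeric IDs from the prunable list, as strings
  metadata: {
    reason: PruneReason
    // Required when reason === "consolidation", per the tool spec
    distillation?: Record<string, unknown>
  }
}

// Returns an error message, or null when the arguments are valid.
function validatePruneArgs(args: PruneArgs): string | null {
  if (args.ids.length === 0) return "No IDs provided."
  if (args.ids.some(id => isNaN(parseInt(id, 10)))) {
    return "All IDs must be numeric strings from the prunable list."
  }
  if (args.metadata.reason === "consolidation" && !args.metadata.distillation) {
    return "Missing metadata.distillation for a consolidation prune."
  }
  return null
}

// Example: a consolidation prune keyed by prunable-list ID,
// matching the examples in prune-tool-spec.txt.
const example: PruneArgs = {
  ids: ["10", "11"],
  metadata: {
    reason: "consolidation",
    distillation: {
      "10": "config.ts uses port 3000",
      "11": "db.ts connects to mongo:27017",
    },
  },
}

console.log(validatePruneArgs(example)) // -> null
```

Keying each distillation entry by its prunable-list ID makes it cheap to confirm that every pruned output left distilled findings behind.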