diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index b7b6e892..00000000 --- a/.dockerignore +++ /dev/null @@ -1,44 +0,0 @@ -# Git -.git -.gitignore - -# Python -__pycache__ -*.py[cod] -*$py.class -*.so -.Python -.env -.venv -env/ -venv/ -ENV/ - -# IDE -.idea/ -.vscode/ -*.swp -*.swo - -# Build -*.egg-info/ -dist/ -build/ -.eggs/ - -# Logs (will be mounted as volume) -logs/ - -# OAuth credentials (will be mounted as volume) -oauth_creds/ - -# Documentation -*.md -!README.md - -# GitHub -.github/ - -# Misc -.DS_Store -*.log diff --git a/.env.example b/.env.example deleted file mode 100644 index 387829a2..00000000 --- a/.env.example +++ /dev/null @@ -1,434 +0,0 @@ -# ============================================================================== -# || LLM API Key Proxy - Environment Variable Configuration || -# ============================================================================== -# -# This file provides an example configuration for the proxy server. -# Copy this file to a new file named '.env' in the same directory -# and replace the placeholder values with your actual credentials and settings. -# - -# ------------------------------------------------------------------------------ -# | [REQUIRED] Proxy Server Settings | -# ------------------------------------------------------------------------------ - -# A secret key used to authenticate requests to THIS proxy server. -# This can be any string. Your client application must send this key in the -# 'Authorization' header as a Bearer token (e.g., "Authorization: Bearer YOUR_PROXY_API_KEY"). -PROXY_API_KEY="YOUR_PROXY_API_KEY" - - -# ------------------------------------------------------------------------------ -# | [API KEYS] Provider API Keys | -# ------------------------------------------------------------------------------ -# -# The proxy automatically discovers API keys from environment variables. -# To add multiple keys for a single provider, increment the number at the end -# of the variable name (e.g., GEMINI_API_KEY_1, GEMINI_API_KEY_2). -# -# The provider name is derived from the part of the variable name before "_API_KEY". -# For example, 'GEMINI_API_KEY_1' configures the 'gemini' provider. -# - -# --- Google Gemini --- -GEMINI_API_KEY_1="YOUR_GEMINI_API_KEY_1" -GEMINI_API_KEY_2="YOUR_GEMINI_API_KEY_2" - -# --- OpenAI / Azure OpenAI --- -# For Azure, ensure your key has access to the desired models. -OPENAI_API_KEY_1="YOUR_OPENAI_OR_AZURE_API_KEY" - -# --- Anthropic (Claude) --- -ANTHROPIC_API_KEY_1="YOUR_ANTHROPIC_API_KEY" - -# --- OpenRouter --- -OPENROUTER_API_KEY_1="YOUR_OPENROUTER_API_KEY" - -# --- Groq --- -GROQ_API_KEY_1="YOUR_GROQ_API_KEY" - -# --- Mistral AI --- -MISTRAL_API_KEY_1="YOUR_MISTRAL_API_KEY" - -# --- NVIDIA NIM --- -NVIDIA_API_KEY_1="YOUR_NVIDIA_API_KEY" - -# --- Co:here --- -COHERE_API_KEY_1="YOUR_COHERE_API_KEY" - -# --- AWS Bedrock --- -# Note: Bedrock authentication is typically handled via AWS IAM roles or -# environment variables like AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY. -# Only set this if you are using a specific API key for Bedrock. -BEDROCK_API_KEY_1="" - -# --- Chutes --- -CHUTES_API_KEY_1="YOUR_CHUTES_API_KEY" - - -# ------------------------------------------------------------------------------ -# | [OAUTH] Provider OAuth 2.0 Credentials | -# ------------------------------------------------------------------------------ -# -# The proxy now uses a "local-first" approach for OAuth credentials. -# All OAuth credentials are managed within the 'oauth_creds/' directory. 
-# -# HOW IT WORKS: -# 1. On the first run, if you provide a path to an existing credential file -# (e.g., from ~/.gemini/), the proxy will COPY it into the local -# 'oauth_creds/' directory with a standardized name (e.g., 'gemini_cli_oauth_1.json'). -# 2. On all subsequent runs, the proxy will ONLY use the files found inside -# 'oauth_creds/'. It will no longer scan system-wide directories. -# 3. To add a new account, either use the '--add-credential' tool or manually -# place a new, valid credential file in the 'oauth_creds/' directory. -# -# Use the variables below for the ONE-TIME setup to import existing credentials. -# After the first successful run, you can clear these paths. -# - -# --- Google Gemini (gcloud CLI) --- -# Path to your gcloud ADC file (e.g., ~/.config/gcloud/application_default_credentials.json) -# or a credential file from the official 'gemini' CLI (e.g., ~/.gemini/credentials.json). -GEMINI_CLI_OAUTH_1="" - -# --- Qwen / Dashscope (Code Companion) --- -# Path to your Qwen credential file (e.g., ~/.qwen/oauth_creds.json). -QWEN_CODE_OAUTH_1="" - -# --- iFlow --- -# Path to your iFlow credential file (e.g., ~/.iflow/oauth_creds.json). -IFLOW_OAUTH_1="" - - -# ------------------------------------------------------------------------------ -# | [ADVANCED] Provider-Specific Settings | -# ------------------------------------------------------------------------------ - -# --- Gemini CLI Project ID --- -# Required if you are using the Gemini CLI OAuth provider and the proxy -# cannot automatically determine your Google Cloud Project ID. -GEMINI_CLI_PROJECT_ID="" - -# --- Model Ignore Lists --- -# Specify a comma-separated list of model names to exclude from a provider's -# available models. This is useful for filtering out models you don't want to use. -# -# Format: IGNORE_MODELS_="model-1,model-2,model-3" -# -# Example: -# IGNORE_MODELS_GEMINI="gemini-1.0-pro-vision-latest,gemini-1.0-pro-latest" -# IGNORE_MODELS_OPENAI="gpt-4-turbo,gpt-3.5-turbo-instruct" -IGNORE_MODELS_GEMINI="" -IGNORE_MODELS_OPENAI="" - -# --- Model Whitelists (Overrides Blacklists) --- -# Specify a comma-separated list of model names to ALWAYS include from a -# provider's list. This acts as an override for the ignore list. -# -# HOW IT WORKS: -# 1. A model on a whitelist will ALWAYS be available, even if it's also on an -# ignore list (or if the ignore list is set to "*"). -# 2. For any models NOT on the whitelist, the standard ignore list logic applies. -# -# This allows for two main use cases: -# - "Pure Whitelist" Mode: Set IGNORE_MODELS_="*" and then specify -# only the models you want in WHITELIST_MODELS_. -# - "Exemption" Mode: Blacklist a broad range of models (e.g., "*-preview*") -# and then use the whitelist to exempt specific preview models you want to test. -# -# Format: WHITELIST_MODELS_="model-1,model-2" -# -# Example of a pure whitelist for Gemini: -# IGNORE_MODELS_GEMINI="*" -# WHITELIST_MODELS_GEMINI="gemini-1.5-pro-latest,gemini-1.5-flash-latest" -WHITELIST_MODELS_GEMINI="" -WHITELIST_MODELS_OPENAI="" - -# --- Maximum Concurrent Requests Per Key --- -# Controls how many concurrent requests for the SAME model can use the SAME key. -# This is useful for providers that can handle concurrent requests without rate limiting. -# Default is 1 (no concurrency, current behavior). 
-# -# Format: MAX_CONCURRENT_REQUESTS_PER_KEY_= -# -# Example: -# MAX_CONCURRENT_REQUESTS_PER_KEY_OPENAI=3 # Allow 3 concurrent requests per OpenAI key -# MAX_CONCURRENT_REQUESTS_PER_KEY_GEMINI=1 # Allow only 1 request per Gemini key (default) -# -MAX_CONCURRENT_REQUESTS_PER_KEY_OPENAI=1 -MAX_CONCURRENT_REQUESTS_PER_KEY_GEMINI=1 -MAX_CONCURRENT_REQUESTS_PER_KEY_ANTHROPIC=1 -MAX_CONCURRENT_REQUESTS_PER_KEY_IFLOW=1 - -# --- Credential Rotation Mode --- -# Controls how credentials are rotated when multiple are available for a provider. -# This affects how the proxy selects the next credential to use for requests. -# -# Available modes: -# balanced - (Default) Rotate credentials evenly across requests to distribute load. -# Best for API keys with per-minute rate limits. -# sequential - Use one credential until it's exhausted (429 error), then switch to next. -# Best for credentials with daily/weekly quotas (e.g., free tier accounts). -# When a credential hits quota, it's put on cooldown based on the reset time -# parsed from the provider's error response. -# -# Format: ROTATION_MODE_= -# -# Provider Defaults: -# - antigravity: sequential (free tier accounts with daily quotas) -# - All others: balanced -# -# Example: -# ROTATION_MODE_GEMINI=sequential # Use Gemini keys until quota exhausted -# ROTATION_MODE_OPENAI=balanced # Distribute load across OpenAI keys (default) -# ROTATION_MODE_ANTIGRAVITY=balanced # Override Antigravity's sequential default -# -# ROTATION_MODE_GEMINI=balanced -# ROTATION_MODE_ANTIGRAVITY=sequential - -# --- Priority-Based Concurrency Multipliers --- -# Credentials can be assigned to priority tiers (1=highest, 2, 3, etc.). -# Each tier can have a concurrency multiplier that increases the effective -# concurrent request limit for credentials in that tier. -# -# How it works: -# effective_concurrent_limit = MAX_CONCURRENT_REQUESTS_PER_KEY * tier_multiplier -# -# This allows paid/premium credentials to handle more concurrent requests than -# free tier credentials, regardless of rotation mode. -# -# Provider Defaults (built into provider classes): -# Antigravity: -# Priority 1: 5x (paid ultra tier) -# Priority 2: 3x (standard paid tier) -# Priority 3+: 2x (sequential mode) or 1x (balanced mode) -# Gemini CLI: -# Priority 1: 5x -# Priority 2: 3x -# Others: 1x (all modes) -# -# Format: CONCURRENCY_MULTIPLIER__PRIORITY_= -# -# Mode-specific overrides (optional): -# Format: CONCURRENCY_MULTIPLIER__PRIORITY__= -# -# Examples: -# CONCURRENCY_MULTIPLIER_ANTIGRAVITY_PRIORITY_1=10 # Override P1 to 10x -# CONCURRENCY_MULTIPLIER_ANTIGRAVITY_PRIORITY_3=1 # Override P3 to 1x -# CONCURRENCY_MULTIPLIER_ANTIGRAVITY_PRIORITY_2_BALANCED=1 # P2 = 1x in balanced mode only - -# --- Model Quota Groups --- -# Models that share quota/cooldown timing. When one model in a group hits -# quota exhausted (429), all models in the group receive the same cooldown timestamp. -# They also reset (archive stats) together when the quota period expires. -# -# This is useful for providers where multiple model variants share the same -# underlying quota (e.g., Claude Sonnet and Opus on Antigravity). 
-# -# Format: QUOTA_GROUPS__="model1,model2,model3" -# -# To DISABLE a default group, set it to empty string: -# QUOTA_GROUPS_ANTIGRAVITY_CLAUDE="" -# -# Default groups: -# ANTIGRAVITY.CLAUDE: claude-sonnet-4-5,claude-opus-4-5 -# -# Examples: -# QUOTA_GROUPS_ANTIGRAVITY_CLAUDE="claude-sonnet-4-5,claude-opus-4-5" -# QUOTA_GROUPS_ANTIGRAVITY_GEMINI="gemini-3-pro-preview,gemini-3-pro-image-preview" - -# ------------------------------------------------------------------------------ -# | [ADVANCED] Fair Cycle Rotation | -# ------------------------------------------------------------------------------ -# -# Ensures each credential exhausts at least once before any can be reused. -# Prevents one credential from being repeatedly used while others sit idle. -# -# Provider Defaults (see src/rotator_library/config/defaults.py): -# - Enabled: sequential rotation mode only (balanced mode = disabled) -# - Tracking Mode: model_group (track per quota group) -# - Cross-Tier: false (each priority tier cycles independently) -# - Cycle Duration: 86400 seconds (24 hours) -# - Exhaustion Threshold: 300 seconds (5 minutes) -# -# Format: FAIR_CYCLE_{PROVIDER}=true/false -# Example: -# FAIR_CYCLE_ANTIGRAVITY=true -# FAIR_CYCLE_GEMINI_CLI=false - -# Tracking mode: "model_group" (per quota group) or "credential" (global per key) -# FAIR_CYCLE_TRACKING_MODE_ANTIGRAVITY=model_group - -# Cross-tier: true = ALL credentials must exhaust regardless of tier -# FAIR_CYCLE_CROSS_TIER_ANTIGRAVITY=false - -# Cycle duration in seconds -# FAIR_CYCLE_DURATION_ANTIGRAVITY=86400 - -# Exhaustion threshold - cooldown must exceed this to count as "exhausted" -# EXHAUSTION_COOLDOWN_THRESHOLD_ANTIGRAVITY=300 -# EXHAUSTION_COOLDOWN_THRESHOLD=300 # Global fallback for all providers - -# ------------------------------------------------------------------------------ -# | [ADVANCED] Custom Caps | -# ------------------------------------------------------------------------------ -# -# Set custom usage limits per tier, per model/group that are MORE restrictive -# than actual API limits. When the cap is reached, credential goes on cooldown -# BEFORE hitting the actual API limit. -# -# Cap values: absolute number (100) or percentage ("80%") -# Cooldown modes: quota_reset | offset: | fixed: -# -# Format: CUSTOM_CAP_{PROVIDER}_T{TIER}_{MODEL_OR_GROUP}= -# Format: CUSTOM_CAP_COOLDOWN_{PROVIDER}_T{TIER}_{MODEL_OR_GROUP}=: -# -# Name transformations for env vars: -# - Dashes (-) -> Underscores (_) -# - Dots (.) -> Underscores (_) -# - All UPPERCASE -# Example: claude-opus-4.5 -> CLAUDE_OPUS_4_5 -# -# Tier syntax: -# - Single tier: T2 (tier 2) -# - Multi-tier: T2_3 (tiers 2 and 3 share config) -# - Default: TDEFAULT (fallback for unlisted tiers) -# -# Examples: -# CUSTOM_CAP_ANTIGRAVITY_T2_CLAUDE=100 -# CUSTOM_CAP_COOLDOWN_ANTIGRAVITY_T2_CLAUDE=quota_reset -# -# CUSTOM_CAP_ANTIGRAVITY_T3_CLAUDE=30 -# CUSTOM_CAP_COOLDOWN_ANTIGRAVITY_T3_CLAUDE=offset:3600 -# -# CUSTOM_CAP_ANTIGRAVITY_TDEFAULT_CLAUDE=80% -# -# CUSTOM_CAP_ANTIGRAVITY_T2_3_G25_FLASH=80% -# CUSTOM_CAP_COOLDOWN_ANTIGRAVITY_T2_3_G25_FLASH=offset:1800 - -# ------------------------------------------------------------------------------ -# | [ADVANCED] Proxy Configuration | -# ------------------------------------------------------------------------------ - -# --- OAuth Refresh Interval --- -# How often, in seconds, the background refresher should check and refresh -# expired OAuth tokens. 
-# Default: 600 (10 minutes) -# OAUTH_REFRESH_INTERVAL=600 - -# --- Skip OAuth Initialization --- -# Set to "true" to prevent the proxy from performing the interactive OAuth -# setup/validation flow on startup. This is highly recommended for non-interactive -# environments like Docker containers or automated scripts. -# Ensure your credentials in 'oauth_creds/' are valid before enabling this. -SKIP_OAUTH_INIT_CHECK=false - -# --- Global Request Timeout --- -# Maximum time (in seconds) a request can wait for an available credential. -# If all credentials are on cooldown and none will become available within -# this timeout, the request fails fast with a clear error message. -# Increase this value if you have limited credentials and want to wait -# longer for capacity (e.g., when credentials hit rate limits). -# Default: 30 seconds -# GLOBAL_TIMEOUT=30 - -# ------------------------------------------------------------------------------ -# | [ADVANCED] HTTP Timeout Configuration | -# ------------------------------------------------------------------------------ -# -# Controls timeouts for HTTP requests to provider APIs. -# All values are in seconds. -# - -# Connection establishment timeout (default: 30) -# TIMEOUT_CONNECT=30 - -# Request body send timeout (default: 30) -# TIMEOUT_WRITE=30 - -# Connection pool acquisition timeout (default: 60) -# TIMEOUT_POOL=60 - -# Read timeout between streaming chunks (default: 300 = 5 minutes) -# If no data arrives for this duration, the connection is considered stalled. -# TIMEOUT_READ_STREAMING=300 - -# Read timeout for non-streaming responses (default: 600 = 10 minutes) -# Some LLM responses take significant time to generate. -# TIMEOUT_READ_NON_STREAMING=600 - -# ------------------------------------------------------------------------------ -# | [ADVANCED] Antigravity Provider Configuration | -# ------------------------------------------------------------------------------ -# -# Configuration for the Antigravity (Google Code Assist) provider. -# These settings control retry behavior and prompt handling. -# - -# --- Empty Response Handling --- -# When Antigravity returns an empty response (no content, no tool calls), -# the proxy will automatically retry up to this many attempts. -# Default: 6 attempts -# ANTIGRAVITY_EMPTY_RESPONSE_ATTEMPTS=6 - -# Delay in seconds between empty response retries. -# Default: 3 seconds -# ANTIGRAVITY_EMPTY_RESPONSE_RETRY_DELAY=3 - -# --- Malformed Function Call Handling --- -# When Gemini 3 returns MALFORMED_FUNCTION_CALL (invalid JSON syntax), -# the proxy injects corrective messages and retries. -# Default: 2 retries -# ANTIGRAVITY_MALFORMED_CALL_RETRIES=2 - -# Delay in seconds between malformed call retries. -# Default: 1 second -# ANTIGRAVITY_MALFORMED_CALL_DELAY=1 - -# --- System Instruction Configuration --- -# When true, prepend the Antigravity agent system instruction. -# Default: true -# ANTIGRAVITY_PREPEND_INSTRUCTION=true - -# When true, inject an identity override instruction after the Antigravity prompt. -# This tells the model to disregard the Antigravity identity. -# Default: true -# ANTIGRAVITY_INJECT_IDENTITY_OVERRIDE=true - -# When true, use shortened versions of prompts to reduce context bloat. 
-# Default: true -# ANTIGRAVITY_USE_SHORT_PROMPTS=true - -# ------------------------------------------------------------------------------ -# | [ADVANCED] Gemini CLI Provider Configuration | -# ------------------------------------------------------------------------------ -# -# Configuration for the Gemini CLI (Google Code Assist) provider. -# - -# OAuth callback port for interactive re-authentication. -# Default: 8085 -# GEMINI_CLI_OAUTH_PORT=8085 - -# ------------------------------------------------------------------------------ -# | [ADVANCED] Antigravity OAuth Configuration | -# ------------------------------------------------------------------------------ -# -# OAuth callback port for Antigravity interactive re-authentication. -# Default: 8085 (same as Gemini CLI, shared) -# ANTIGRAVITY_OAUTH_PORT=8085 - -# ------------------------------------------------------------------------------ -# | [ADVANCED] Debugging / Logging | -# ------------------------------------------------------------------------------ - -# --- LiteLLM Pydantic Warning Suppression --- -# LiteLLM produces harmless Pydantic serialization warnings during streaming -# due to a known issue with response types (Message, StreamingChoices) having -# mismatched field counts. These warnings don't affect functionality. -# See: https://github.com/BerriAI/litellm/issues/11759 -# -# NOTE: This is a workaround. Remove once litellm patches the issue above. -# -# Set to "0" to show these warnings (useful for debugging). -# Default: "1" (suppress warnings) -# SUPPRESS_LITELLM_SERIALIZATION_WARNINGS=1 \ No newline at end of file diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 3711fdfd..00000000 --- a/.gitignore +++ /dev/null @@ -1,134 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.pyc -*.pyo -*.pyd - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are created by PyInstaller. 
-# See a comprehensive list at https://github.com/github/gitignore/blob/main/Python.gitignore -# -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyderworkspace - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ -test_proxy.py -start_proxy.bat -key_usage.json -staged_changes.txt -launcher_config.json -quota_viewer_config.json -cache/antigravity/thought_signatures.json -logs/ -cache/ -*.env - -oauth_creds/ - diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md deleted file mode 100644 index f7ebbde5..00000000 --- a/DOCUMENTATION.md +++ /dev/null @@ -1,1928 +0,0 @@ -# Technical Documentation: Universal LLM API Proxy & Resilience Library - -This document provides a detailed technical explanation of the project's architecture, internal components, and data flows. It is intended for developers who want to understand how the system achieves high availability and resilience. - -## 1. Architecture Overview - -The project is a monorepo containing two primary components: - -1. **The Proxy Application (`proxy_app`)**: This is the user-facing component. It's a FastAPI application that acts as a universal gateway. It uses `litellm` to translate requests to various provider formats and includes: - * **Batch Manager**: Optimizes high-volume embedding requests. - * **Detailed Logger**: Provides per-request file logging for debugging. - * **OpenAI-Compatible Endpoints**: `/v1/chat/completions`, `/v1/embeddings`, etc. - * **Anthropic-Compatible Endpoints**: `/v1/messages`, `/v1/messages/count_tokens` for Claude Code and other Anthropic API clients. - * **Model Filter GUI**: Visual interface for configuring model ignore/whitelist rules per provider (see Section 6). -2. **The Resilience Library (`rotator_library`)**: This is the core engine that provides high availability. It is consumed by the proxy app to manage a pool of API keys, handle errors gracefully, and ensure requests are completed successfully even when individual keys or provider endpoints face issues. - -This architecture cleanly separates the API interface from the resilience logic, making the library a portable and powerful tool for any application needing robust API key management. - ---- - -## 2. `rotator_library` - The Resilience Engine - -This library is the heart of the project, containing all the logic for managing a pool of API keys, tracking their usage, and handling provider interactions to ensure application resilience. - -### 2.1. `client.py` - The `RotatingClient` - -The `RotatingClient` is the central class that orchestrates all operations. It is designed as a long-lived, async-native object. 
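For orientation, here is a minimal sketch of how an application might drive the client once constructed (the full constructor options are covered under Initialization below). The import path and the keyword names mirror the litellm/OpenAI call shape and are assumptions, not the library's confirmed public API:

```python
import asyncio
from rotator_library import RotatingClient  # import path assumed from the repo layout

async def main():
    # One provider, two keys; every other setting left at its documented default.
    client = RotatingClient(api_keys={"gemini": ["KEY_1", "KEY_2"]})

    # acompletion is the async entry point described in this section; the
    # model/messages keywords follow the OpenAI-style shape used by litellm.
    response = await client.acompletion(
        model="gemini/gemini-2.5-flash",
        messages=[{"role": "user", "content": "Hello"}],
    )
    print(response)

asyncio.run(main())
```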
- -#### Initialization - -The client is initialized with your provider API keys, retry settings, and a new `global_timeout`. - -```python -client = RotatingClient( - api_keys=api_keys, - oauth_credentials=oauth_credentials, - max_retries=2, - usage_file_path="key_usage.json", - configure_logging=True, - global_timeout=30, - abort_on_callback_error=True, - litellm_provider_params={}, - ignore_models={}, - whitelist_models={}, - enable_request_logging=False, - max_concurrent_requests_per_key={} -) -``` - -- `api_keys` (`Optional[Dict[str, List[str]]]`, default: `None`): A dictionary mapping provider names to a list of API keys. -- `oauth_credentials` (`Optional[Dict[str, List[str]]]`, default: `None`): A dictionary mapping provider names to a list of file paths to OAuth credential JSON files. -- `max_retries` (`int`, default: `2`): The number of times to retry a request with the *same key* if a transient server error occurs. -- `usage_file_path` (`str`, default: `"key_usage.json"`): The path to the JSON file where usage statistics are persisted. -- `configure_logging` (`bool`, default: `True`): If `True`, configures the library's logger to propagate logs to the root logger. -- `global_timeout` (`int`, default: `30`): A hard time limit (in seconds) for the entire request lifecycle. -- `abort_on_callback_error` (`bool`, default: `True`): If `True`, any exception raised by `pre_request_callback` will abort the request. -- `litellm_provider_params` (`Optional[Dict[str, Any]]`, default: `None`): Extra parameters to pass to `litellm` for specific providers. -- `ignore_models` (`Optional[Dict[str, List[str]]]`, default: `None`): Blacklist of models to exclude (supports wildcards). -- `whitelist_models` (`Optional[Dict[str, List[str]]]`, default: `None`): Whitelist of models to always include, overriding `ignore_models`. -- `enable_request_logging` (`bool`, default: `False`): If `True`, enables detailed per-request file logging. -- `max_concurrent_requests_per_key` (`Optional[Dict[str, int]]`, default: `None`): Max concurrent requests allowed for a single API key per provider. -- `rotation_tolerance` (`float`, default: `3.0`): Controls the credential rotation strategy. See Section 2.2 for details. - -#### Core Responsibilities - -* **Lifecycle Management**: Manages a shared `httpx.AsyncClient` for all non-blocking HTTP requests. -* **Key Management**: Interfacing with the `UsageManager` to acquire and release API keys based on load and health. -* **Plugin System**: Dynamically loading and using provider-specific plugins from the `providers/` directory. -* **Execution Logic**: Executing API calls via `litellm` with a robust, **deadline-driven** retry and key selection strategy. -* **Streaming Safety**: Providing a safe, stateful wrapper (`_safe_streaming_wrapper`) for handling streaming responses, buffering incomplete JSON chunks, and detecting mid-stream errors. -* **Model Filtering**: Filtering available models using configurable whitelists and blacklists. -* **Request Sanitization**: Automatically cleaning invalid parameters (like `dimensions` for non-OpenAI models) via `request_sanitizer.py`. - -#### Model Filtering Logic - -The `RotatingClient` provides fine-grained control over which models are exposed via the `/v1/models` endpoint. This is handled by the `get_available_models` method. - -The logic applies in the following order: -1. 
**Whitelist Check**: If a provider has a whitelist defined (`WHITELIST_MODELS_`), any model on that list will **always be available**, even if it matches a blacklist pattern. This acts as a definitive override. -2. **Blacklist Check**: For any model *not* on the whitelist, the client checks the blacklist (`IGNORE_MODELS_`). If the model matches a blacklist pattern (supports wildcards like `*-preview`), it is excluded. -3. **Default**: If a model is on neither list, it is included. - -#### Request Lifecycle: A Deadline-Driven Approach - -The request lifecycle has been designed around a single, authoritative time budget to ensure predictable performance: - -1. **Deadline Establishment**: The moment `acompletion` or `aembedding` is called, a `deadline` is calculated: `time.time() + self.global_timeout`. This `deadline` is the absolute point in time by which the entire operation must complete. -2. **Deadline-Aware Key Selection**: The main loop checks this deadline before every key acquisition attempt. If the deadline is exceeded, the request fails immediately. -3. **Deadline-Aware Key Acquisition**: The `UsageManager` itself takes this `deadline`. It will only wait for a key (if all are busy) until the deadline is reached. -4. **Deadline-Aware Retries**: If a transient error occurs (like a 500 or 429), the client calculates the backoff time. If waiting would push the total time past the deadline, the wait is skipped, and the client immediately rotates to the next key. - -#### Streaming Resilience - -The `_safe_streaming_wrapper` is a critical component for stability. It: -* **Buffers Fragments**: Reads raw chunks from the stream and buffers them until a valid JSON object can be parsed. This handles providers that may split JSON tokens across network packets. -* **Error Interception**: Detects if a chunk contains an API error (like a quota limit) instead of content, and raises a specific `StreamedAPIError`. -* **Quota Handling**: If a specific "quota exceeded" error is detected mid-stream multiple times, it can terminate the stream gracefully to prevent infinite retry loops on oversized inputs. - -### 2.2. `usage_manager.py` - Stateful Concurrency & Usage Management - -This class is the stateful core of the library, managing concurrency, usage tracking, cooldowns, and quota resets. - -#### Key Concepts - -* **Async-Native & Lazy-Loaded**: Fully asynchronous, using `aiofiles` for non-blocking file I/O. Usage data is loaded only when needed. -* **Fine-Grained Locking**: Each API key has its own `asyncio.Lock` and `asyncio.Condition`. This allows for highly granular control. -* **Multiple Reset Modes**: Supports three reset strategies: - - **per_model**: Each model has independent usage window with authoritative `quota_reset_ts` (from provider errors) - - **credential**: One window per credential with custom duration (e.g., 5 hours, 7 days) - - **daily**: Legacy daily reset at `daily_reset_time_utc` -* **Model Quota Groups**: Models can be grouped to share quota limits. When one model in a group hits quota, all receive the same reset timestamp. - -#### Tiered Key Acquisition Strategy - -The `acquire_key` method uses a sophisticated strategy to balance load: - -1. **Filtering**: Keys currently on cooldown (global or model-specific) are excluded. -2. 
**Rotation Mode**: Determines credential selection strategy: - * **Balanced Mode** (default): Credentials sorted by usage count - least-used first for even distribution - * **Sequential Mode**: Credentials sorted by usage count descending - most-used first to maintain sticky behavior until exhausted -3. **Tiering**: Valid keys are split into two tiers: - * **Tier 1 (Ideal)**: Keys that are completely idle (0 concurrent requests). - * **Tier 2 (Acceptable)**: Keys that are busy but still under their configured `MAX_CONCURRENT_REQUESTS_PER_KEY_` limit for the requested model. This allows a single key to be used multiple times for the same model, maximizing throughput. -4. **Selection Strategy** (configurable via `rotation_tolerance`): - * **Deterministic (tolerance=0.0)**: Within each tier, keys are sorted by daily usage count and the least-used key is always selected. This provides perfect load balance but predictable patterns. - * **Weighted Random (tolerance>0, default)**: Keys are selected randomly with weights biased toward less-used ones: - - Formula: `weight = (max_usage - credential_usage) + tolerance + 1` - - `tolerance=2.0` (recommended): Balanced randomness - credentials within 2 uses of the maximum can still be selected with reasonable probability - - `tolerance=5.0+`: High randomness - even heavily-used credentials have significant probability - - **Security Benefit**: Unpredictable selection patterns make rate limit detection and fingerprinting harder - - **Load Balance**: Lower-usage credentials still preferred, maintaining reasonable distribution -5. **Concurrency Limits**: Checks against `max_concurrent` limits (with priority multipliers applied) to prevent overloading a single key. -6. **Priority Groups**: When credential prioritization is enabled, higher-tier credentials (lower priority numbers) are tried first before moving to lower tiers. - -#### Failure Handling & Cooldowns - -* **Escalating Backoff**: When a failure occurs, the key gets a temporary cooldown for that specific model. Consecutive failures increase this time (10s -> 30s -> 60s -> 120s). -* **Key-Level Lockouts**: If a key accumulates failures across multiple distinct models (3+), it is assumed to be dead/revoked and placed on a global 5-minute lockout. -* **Authentication Errors**: Immediate 5-minute global lockout. -* **Quota Exhausted Errors**: When a provider returns a quota exhausted error with an authoritative reset timestamp: - - The `quota_reset_ts` is extracted from the error response (via provider's `parse_quota_error()` method) - - Applied to the affected model (and all models in its quota group if defined) - - Cooldown preserved even during daily/window resets until the actual quota reset time - - Logs show the exact reset time in local timezone with ISO format - -### 2.3. `batch_manager.py` - Efficient Request Aggregation - -The `EmbeddingBatcher` class optimizes high-throughput embedding workloads. - -* **Mechanism**: It uses an `asyncio.Queue` to collect incoming requests. -* **Triggers**: A batch is dispatched when either: - 1. The queue size reaches `batch_size` (default: 64). - 2. A time window (`timeout`, default: 0.1s) elapses since the first request in the batch. -* **Efficiency**: This reduces dozens of HTTP calls to a single API request, significantly reducing overhead and rate limit usage. - -### 2.4. 
`background_refresher.py` - Automated Token Maintenance & Provider Jobs - -The `BackgroundRefresher` manages background tasks for the proxy, including OAuth token refresh and provider-specific periodic jobs. - -#### OAuth Token Refresh - -* **Periodic Checks**: It runs a background task that wakes up at a configurable interval (default: 600 seconds/10 minutes via `OAUTH_REFRESH_INTERVAL`). -* **Proactive Refresh**: It iterates through all loaded OAuth credentials and calls their `proactively_refresh` method to ensure tokens are valid before they are needed. - -#### Provider-Specific Background Jobs - -Providers can define their own background jobs that run on independent schedules: - -* **Independent Timers**: Each provider's job runs on its own interval, separate from the OAuth refresh cycle. -* **Configuration**: Providers implement `get_background_job_config()` to define their job settings. -* **Execution**: Providers implement `run_background_job()` to execute the periodic task. - -**Provider Job Configuration:** -```python -def get_background_job_config(self) -> Optional[Dict[str, Any]]: - """Return configuration for provider-specific background job.""" - return { - "interval": 300, # seconds between runs - "name": "quota_refresh", # for logging - "run_on_start": True, # whether to run immediately at startup - } - -async def run_background_job( - self, - usage_manager: "UsageManager", - credentials: List[str], -) -> None: - """Execute the provider's periodic background job.""" - # Provider-specific logic here - pass -``` - -**Current Provider Jobs:** - -| Provider | Job Name | Default Interval | Purpose | -|----------|----------|------------------|---------| -| Antigravity | `antigravity_quota_refresh` | 300s (5 min) | Fetches quota status from API to update remaining quota estimates | -| Gemini CLI | `gemini_cli_quota_refresh` | 300s (5 min) | Fetches quota status from `retrieveUserQuota` API to update remaining quota estimates | - -### 2.6. Credential Management Architecture - -The `CredentialManager` class (`credential_manager.py`) centralizes the lifecycle of all API credentials. It adheres to a "Local First" philosophy. - -#### 2.6.1. Automated Discovery & Preparation - -On startup (unless `SKIP_OAUTH_INIT_CHECK=true`), the manager performs a comprehensive sweep: - -1. **System-Wide Scan**: Searches for OAuth credential files in standard locations: - - `~/.gemini/` → All `*.json` files (typically `credentials.json`) - - `~/.qwen/` → All `*.json` files (typically `oauth_creds.json`) - - `~/.iflow/` → All `*. json` files - -2. **Local Import**: Valid credentials are **copied** (not moved) to the project's `oauth_creds/` directory with standardized names: - - `gemini_cli_oauth_1.json`, `gemini_cli_oauth_2.json`, etc. - - `qwen_code_oauth_1.json`, `qwen_code_oauth_2.json`, etc. - - `iflow_oauth_1.json`, `iflow_oauth_2.json`, etc. - -3. **Intelligent Deduplication**: - - The manager inspects each credential file for a `_proxy_metadata` field containing the user's email or ID - - If this field doesn't exist, it's added during import using provider-specific APIs (e.g., fetching Google account email for Gemini) - - Duplicate accounts (same email/ID) are detected and skipped with a warning log - - Prevents the same account from being added multiple times, even if the files are in different locations - -4. **Isolation**: The project's credentials in `oauth_creds/` are completely isolated from system-wide credentials, preventing cross-contamination - -#### 2.6.2. 
Credential Loading & Stateless Operation - -The manager supports loading credentials from two sources, with a clear priority: - -**Priority 1: Local Files** (`oauth_creds/` directory) -- Standard `.json` files are loaded first -- Naming convention: `{provider}_oauth_{number}.json` -- Example: `oauth_creds/gemini_cli_oauth_1.json` - -**Priority 2: Environment Variables** (Stateless Deployment) -- If no local files are found, the manager checks for provider-specific environment variables -- This is the key to "Stateless Deployment" for platforms like Railway, Render, Heroku -- Credentials are referenced internally using `env://` URIs (e.g., `env://gemini_cli/1`) - -**Gemini CLI Environment Variables:** - -Single credential (legacy format): -``` -GEMINI_CLI_ACCESS_TOKEN -GEMINI_CLI_REFRESH_TOKEN -GEMINI_CLI_EXPIRY_DATE -GEMINI_CLI_EMAIL -GEMINI_CLI_PROJECT_ID (optional) -GEMINI_CLI_TIER (optional: standard-tier or free-tier) -``` - -Multiple credentials (use `_N_` suffix where N is 1, 2, 3...): -``` -GEMINI_CLI_1_ACCESS_TOKEN -GEMINI_CLI_1_REFRESH_TOKEN -GEMINI_CLI_1_EXPIRY_DATE -GEMINI_CLI_1_EMAIL -GEMINI_CLI_1_PROJECT_ID (optional) -GEMINI_CLI_1_TIER (optional) - -GEMINI_CLI_2_ACCESS_TOKEN -GEMINI_CLI_2_REFRESH_TOKEN -... -``` - -**Antigravity Environment Variables:** - -Same pattern as Gemini CLI: -``` -ANTIGRAVITY_1_ACCESS_TOKEN -ANTIGRAVITY_1_REFRESH_TOKEN -ANTIGRAVITY_1_EXPIRY_DATE -ANTIGRAVITY_1_EMAIL -ANTIGRAVITY_1_PROJECT_ID (optional) -ANTIGRAVITY_1_TIER (optional) -``` - -**Qwen Code Environment Variables:** -``` -QWEN_CODE_ACCESS_TOKEN -QWEN_CODE_REFRESH_TOKEN -QWEN_CODE_EXPIRY_DATE -QWEN_CODE_EMAIL -``` - -**iFlow Environment Variables:** -``` -IFLOW_ACCESS_TOKEN -IFLOW_REFRESH_TOKEN -IFLOW_EXPIRY_DATE -IFLOW_EMAIL -IFLOW_API_KEY -``` - -**How it works:** -- If the manager finds (e.g.) `GEMINI_CLI_ACCESS_TOKEN` or `GEMINI_CLI_1_ACCESS_TOKEN`, it constructs an in-memory credential object that mimics the file structure -- The credential is referenced internally as `env://gemini_cli/0` (legacy) or `env://gemini_cli/1` (numbered) -- The credential behaves exactly like a file-based credential (automatic refresh, expiry detection, etc.) -- No physical files are created or needed on the host system -- Perfect for ephemeral containers or read-only filesystems - -**env:// URI Format:** -``` -env://{provider}/{index} - -Examples: -- env://gemini_cli/1 → GEMINI_CLI_1_ACCESS_TOKEN, etc. -- env://gemini_cli/0 → GEMINI_CLI_ACCESS_TOKEN (legacy single credential) -- env://antigravity/1 → ANTIGRAVITY_1_ACCESS_TOKEN, etc. -``` - -#### 2.6.3. Credential Tool Integration - -The `credential_tool.py` provides a user-friendly CLI interface to the `CredentialManager`: - -**Key Functions:** -1. **OAuth Setup**: Wraps provider-specific `AuthBase` classes (`GeminiAuthBase`, `QwenAuthBase`, `IFlowAuthBase`) to handle interactive login flows -2. **Credential Export**: Reads local `.json` files and generates `.env` format output for stateless deployment -3. **API Key Management**: Adds or updates `PROVIDER_API_KEY_N` entries in the `.env` file - ---- - -### 2.7. Request Sanitizer (`request_sanitizer.py`) - -The `sanitize_request_payload` function ensures requests are compatible with each provider's specific requirements: - -**Parameter Cleaning Logic:** - -1. **`dimensions` Parameter**: - - Only supported by OpenAI's `text-embedding-3-small` and `text-embedding-3-large` models - - Automatically removed for all other models to prevent `400 Bad Request` errors - -2. 
**`thinking` Parameter** (Gemini-specific): - - Format: `{"type": "enabled", "budget_tokens": -1}` - - Only valid for `gemini/gemini-2.5-pro` and `gemini/gemini-2.5-flash` - - Removed for all other models - -**Provider-Specific Tool Schema Cleaning:** - -Implemented in individual provider classes (`QwenCodeProvider`, `IFlowProvider`): - -- **Recursively removes** unsupported properties from tool function schemas: - - `strict`: OpenAI-specific, causes validation errors on Qwen/iFlow - - `additionalProperties`: Same issue -- **Prevents `400 Bad Request` errors** when using complex tool definitions -- Applied automatically before sending requests to the provider - ---- - -### 2.8. Error Classification (`error_handler.py`) - -The `ClassifiedError` class wraps all exceptions from `litellm` and categorizes them for intelligent handling: - -**Error Types:** -```python -class ErrorType(Enum): - RATE_LIMIT = "rate_limit" # 429 errors, temporary backoff needed - AUTHENTICATION = "authentication" # 401/403, invalid/revoked key - SERVER_ERROR = "server_error" # 500/502/503, provider infrastructure issues - QUOTA = "quota" # Daily/monthly quota exceeded - CONTEXT_LENGTH = "context_length" # Input too long for model - CONTENT_FILTER = "content_filter" # Request blocked by safety filters - NOT_FOUND = "not_found" # Model/endpoint doesn't exist - TIMEOUT = "timeout" # Request took too long - UNKNOWN = "unknown" # Unclassified error -``` - -**Classification Logic:** - -1. **Status Code Analysis**: Primary classification method - - `401`/`403` → `AUTHENTICATION` - - `429` → `RATE_LIMIT` - - `400` with "context_length" or "tokens" → `CONTEXT_LENGTH` - - `400` with "quota" → `QUOTA` - - `500`/`502`/`503` → `SERVER_ERROR` - -2. **Special Exception Types**: - - `EmptyResponseError` → `SERVER_ERROR` (status 503, rotatable) - - `TransientQuotaError` → `SERVER_ERROR` (status 503, rotatable - bare 429 without retry info) - -3. **Message Analysis**: Fallback for ambiguous errors - - Searches for keywords like "quota exceeded", "rate limit", "invalid api key" - -4. **Provider-Specific Overrides**: Some providers use non-standard error formats - -**Usage in Client:** -- `AUTHENTICATION` → Immediate 5-minute global lockout -- `RATE_LIMIT`/`QUOTA` → Escalating per-model cooldown -- `SERVER_ERROR` → Retry with same key (up to `max_retries`), then rotate -- `CONTEXT_LENGTH`/`CONTENT_FILTER` → Immediate failure (user needs to fix request) - ---- - -### 2.9. Cooldown Management (`cooldown_manager.py`) - -The `CooldownManager` handles IP or account-level rate limiting that affects all keys for a provider: - -**Purpose:** -- Some providers (like NVIDIA NIM) have rate limits tied to account/IP rather than API key -- When a 429 error occurs, ALL keys for that provider must be paused - -**Key Methods:** - -1. **`is_cooling_down(provider: str) -> bool`**: - - Checks if a provider is currently in a global cooldown period - - Returns `True` if the current time is still within the cooldown window - -2. **`start_cooldown(provider: str, duration: int)`**: - - Initiates or extends a cooldown for a provider - - Duration is typically 60-120 seconds for 429 errors - -3. 
**`get_cooldown_remaining(provider: str) -> float`**: - - Returns remaining cooldown time in seconds - - Used for logging and diagnostics - -**Integration with UsageManager:** -- When a key fails with `RATE_LIMIT` error type, the client checks if it's likely an IP-level limit -- If so, `CooldownManager.start_cooldown()` is called for the entire provider -- All subsequent `acquire_key()` calls for that provider will wait until the cooldown expires - - -### 2.10. Credential Prioritization System (`client.py` & `usage_manager.py`) - -The library now includes an intelligent credential prioritization system that automatically detects credential tiers and ensures optimal credential selection for each request. - -**Key Concepts:** - -- **Provider-Level Priorities**: Providers can implement `get_credential_priority()` to return a priority level (1=highest, 10=lowest) for each credential -- **Model-Level Requirements**: Providers can implement `get_model_tier_requirement()` to specify minimum priority required for specific models -- **Automatic Filtering**: The client automatically filters out incompatible credentials before making requests -- **Priority-Aware Selection**: The `UsageManager` prioritizes higher-tier credentials (lower numbers) within the same priority group - -**Implementation Example (Gemini CLI):** - -```python -def get_credential_priority(self, credential: str) -> Optional[int]: - """Returns priority based on Gemini tier.""" - tier = self.project_tier_cache.get(credential) - if not tier: - return None # Not yet discovered - - # Paid tiers get highest priority - if tier not in ['free-tier', 'legacy-tier', 'unknown']: - return 1 - - # Free tier gets lower priority - if tier == 'free-tier': - return 2 - - return 10 - -def get_model_tier_requirement(self, model: str) -> Optional[int]: - """Returns minimum priority required for model.""" - if model.startswith("gemini-3-"): - return 1 # Only paid tier (priority 1) credentials - - return None # All other models have no restrictions -``` - -**Provider Support:** - -The following providers implement credential prioritization: - -- **Gemini CLI**: Paid tier (priority 1), Free tier (priority 2), Legacy/Unknown (priority 10). Gemini 3 models require paid tier. -- **Antigravity**: Same priority system as Gemini CLI. No model-tier restrictions (all models work on all tiers). Paid tier resets every 5 hours, free tier resets weekly. - -**Usage Manager Integration:** - -The `acquire_key()` method has been enhanced to: -1. Group credentials by priority level -2. Try highest priority group first (priority 1, then 2, etc.) -3. Within each group, use existing tier1/tier2 logic (idle keys first, then busy keys) -4. Load balance within priority groups by usage count -5. Only move to next priority if all higher-priority credentials are exhausted - -**Benefits:** - -- Ensures paid-tier credentials are always used for premium models -- Prevents failed requests due to tier restrictions -- Optimal cost distribution (free tier used when possible, paid when required) -- Graceful fallback if primary credentials are unavailable - ---- - -### 2.11. Provider Cache System (`providers/provider_cache.py`) - -A modular, shared caching system for providers to persist conversation state across requests. 
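Before the architecture notes below, a rough sketch of the dual-TTL idea the cache is built around: a short-lived in-memory entry backed by a longer-lived on-disk record. Every name, path, and value here is illustrative and is not the module's actual API; the real module batches its disk writes in a background task.

```python
import json
import time
from pathlib import Path

MEMORY_TTL = 3600   # 1 hour in-memory lifetime (illustrative default)
DISK_TTL = 86400    # 24 hour on-disk lifetime (illustrative default)

_memory: dict[str, tuple[float, object]] = {}
_disk_dir = Path("cache/example_provider")  # hypothetical location

def cache_set(key: str, value: object) -> None:
    """Store in memory immediately and mirror to disk (the real module batches writes)."""
    _memory[key] = (time.time(), value)
    _disk_dir.mkdir(parents=True, exist_ok=True)
    (_disk_dir / f"{key}.json").write_text(json.dumps({"ts": time.time(), "value": value}))

def cache_get(key: str):
    """Serve from memory while fresh, then fall back to disk while its longer TTL holds."""
    hit = _memory.get(key)
    if hit and time.time() - hit[0] < MEMORY_TTL:
        return hit[1]
    path = _disk_dir / f"{key}.json"
    if path.exists():
        record = json.loads(path.read_text())
        if time.time() - record["ts"] < DISK_TTL:
            _memory[key] = (record["ts"], record["value"])  # repopulate the memory tier
            return record["value"]
    return None
```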
- -**Architecture:** - -- **Dual-TTL Design**: Short-lived memory cache (default: 1 hour) + longer-lived disk persistence (default: 24 hours) -- **Background Persistence**: Batched disk writes every 60 seconds (configurable) -- **Automatic Cleanup**: Background task removes expired entries from memory cache - -### 2.15. Antigravity Quota Tracker (`providers/utilities/antigravity_quota_tracker.py`) - -A mixin class providing quota tracking functionality for the Antigravity provider. This enables accurate remaining quota estimation based on API-fetched baselines and local request counting. - -#### Core Concepts - -**Quota Baseline Tracking:** -- Periodically fetches quota status from the Antigravity `fetchAvailableModels` API -- Stores the remaining fraction as a baseline in UsageManager -- Tracks requests since baseline to estimate current remaining quota -- Syncs local request count with API's authoritative values - -**Quota Cost Constants:** -Based on empirical testing (see `docs/ANTIGRAVITY_QUOTA_REPORT.md`), quota costs are known per model and tier: - -| Tier | Model Group | Cost per Request | Requests per 100% | -|------|-------------|------------------|-------------------| -| standard-tier | Claude/GPT-OSS | 0.40% | 250 | -| standard-tier | Gemini 3 Pro | 0.25% | 400 | -| standard-tier | Gemini 2.5 Flash | 0.0333% | ~3000 | -| free-tier | Claude/GPT-OSS | 1.333% | 75 | -| free-tier | Gemini 3 Pro | 0.40% | 250 | - -**Model Name Mappings:** -Some user-facing model names don't exist directly in the API response: -- `claude-opus-4-5` → `claude-opus-4-5-thinking` (Opus only exists as thinking variant) -- `gemini-3-pro-preview` → `gemini-3-pro-high` (preview maps to high by default) - -#### Key Methods - -**`fetch_quota_from_api(credential_path)`:** -Fetches current quota status from the Antigravity API. Returns remaining fraction and reset times for all models. - -**`estimate_remaining_quota(credential_path, model, model_data, tier)`:** -Estimates remaining quota based on baseline + request tracking. Returns confidence level (high/medium/low) based on baseline age. - -**`refresh_active_quota_baselines(credentials, usage_data)`:** -Only refreshes baselines for credentials that have been used recently (within the refresh interval). - -**`discover_quota_costs(credential_path, models_to_test)`:** -Manual utility to discover quota costs by making test requests and measuring before/after quota. Saves learned costs to `cache/antigravity/learned_quota_costs.json`. - -#### Integration with Background Jobs - -The Antigravity provider defines a background job for quota baseline refresh: - -```python -def get_background_job_config(self) -> Optional[Dict[str, Any]]: - return { - "interval": 300, # 5 minutes (configurable via ANTIGRAVITY_QUOTA_REFRESH_INTERVAL) - "name": "quota_baseline_refresh", - "run_on_start": True, - } -``` - -This job: -1. Identifies credentials used since the last refresh -2. Fetches current quota from the API for those credentials -3. Updates baselines in UsageManager for accurate estimation - -#### Data Storage - -Quota baselines are stored in UsageManager's per-model data: - -```json -{ - "credential_path": { - "models": { - "antigravity/claude-sonnet-4-5": { - "request_count": 15, - "baseline_remaining_fraction": 0.94, - "baseline_fetched_at": 1734567890.0, - "requests_at_baseline": 15, - "quota_max_requests": 250, - "quota_display": "15/250" - } - } - } -} -``` - -### 2.16. 
TransientQuotaError (`error_handler.py`) - -A new error type for handling bare 429 responses without retry timing information. - -**When Raised:** -- Provider returns HTTP 429 status code -- Response doesn't contain retry timing info (no `quotaResetTimeStamp` or `retryDelay`) -- After internal retry attempts are exhausted - -**Behavior:** -- Classified as `server_error` (status 503) rather than quota exhaustion -- Causes credential rotation to try the next credential -- Does NOT trigger long-term quota cooldowns - -**Implementation in Antigravity:** -```python -# Non-streaming and streaming both retry bare 429s -for attempt in range(EMPTY_RESPONSE_MAX_ATTEMPTS): - try: - result = await self._handle_request(...) - except httpx.HTTPStatusError as e: - if e.response.status_code == 429: - quota_info = self.parse_quota_error(e) - if quota_info is None: - # Bare 429 - retry like empty response - if attempt < EMPTY_RESPONSE_MAX_ATTEMPTS - 1: - await asyncio.sleep(EMPTY_RESPONSE_RETRY_DELAY) - continue - else: - raise TransientQuotaError(provider, model, message) - # Has retry info - real quota exhaustion - raise -``` - -**Rationale:** -Some 429 responses are transient rate limits rather than true quota exhaustion. These occur when the API is temporarily overloaded but the credential still has quota available. Retrying internally before rotating credentials provides better resilience. - -### 2.17. Gemini CLI Quota Tracker (`providers/utilities/gemini_cli_quota_tracker.py`) - -A mixin class providing quota tracking functionality for the Gemini CLI provider. This mirrors the Antigravity quota tracker (Section 2.15) and enables accurate remaining quota estimation based on API-fetched baselines and local request counting. - -#### Core Concepts - -**Quota Baseline Tracking:** -- Periodically fetches quota status from the `retrieveUserQuota` API endpoint -- Stores the remaining fraction as a baseline in UsageManager -- Tracks requests since baseline to estimate current remaining quota -- Syncs local request count with API's authoritative values - -**Quota Cost Constants:** -Based on empirical testing, quota limits are known per model and tier: - -| Tier | Model Group | Max Requests per 100% | -|------|-------------|----------------------| -| standard-tier | Pro (gemini-2.5-pro, gemini-3-pro-preview) | 250 | -| standard-tier | 2.5-Flash (gemini-2.0-flash, gemini-2.5-flash, gemini-2.5-flash-lite) | 1500 | -| standard-tier | 3-Flash (gemini-3-flash-preview) | 1500 | -| free-tier | Pro | 100 | -| free-tier | 2.5-Flash | 1000 | -| free-tier | 3-Flash | 1000 | - -**Reset Windows:** -- All tiers use 24-hour fixed windows from first request (verified 2026-01-07) -- The reset time is set when the first request is made and does NOT roll forward - -**Model Quota Groups:** -Models that share quota limits are grouped together: -- `pro`: `gemini-2.5-pro`, `gemini-3-pro-preview` -- `25-flash`: `gemini-2.0-flash`, `gemini-2.5-flash`, `gemini-2.5-flash-lite` -- `3-flash`: `gemini-3-flash-preview` - -Groups can be overridden via environment variables: `QUOTA_GROUPS_GEMINI_CLI_{GROUP}="model1,model2"` - -#### Key Methods - -**`retrieve_user_quota(credential_path)`:** -Fetches current quota status from the Gemini CLI `retrieveUserQuota` API. Returns remaining fraction and reset times for all models. - -**`get_all_quota_info(credential_paths, oauth_base_dir, usage_data, include_estimates)`:** -Gets structured quota info for all credentials, suitable for the TUI quota viewer and stats endpoint. 
- -**`get_max_requests_for_model(model, tier)`:** -Returns the maximum number of requests for a model/tier combination. Uses learned values if available, otherwise falls back to defaults. - -**`discover_quota_costs(credential_path, models_to_test)`:** -Manual utility to discover quota costs by making test requests and measuring before/after quota. Saves learned costs to `cache/gemini_cli/learned_quota_costs.json`. - -#### Integration with Background Jobs - -The Gemini CLI provider defines a background job for quota baseline refresh: - -```python -def get_background_job_config(self) -> Optional[Dict[str, Any]]: - return { - "interval": 300, # 5 minutes (configurable via GEMINI_CLI_QUOTA_REFRESH_INTERVAL) - "name": "gemini_cli_quota_refresh", - "run_on_start": True, - } -``` - -This job: -1. On first run: Fetches quota for ALL credentials to establish baselines -2. On subsequent runs: Only fetches for credentials used since last refresh -3. Updates baselines in UsageManager for accurate estimation - -#### Data Storage - -Quota baselines are stored in UsageManager's per-model data: - -```json -{ - "credential_path": { - "models": { - "gemini_cli/gemini-2.5-pro": { - "request_count": 15, - "baseline_remaining_fraction": 0.94, - "baseline_fetched_at": 1734567890.0, - "requests_at_baseline": 15, - "quota_max_requests": 250, - "quota_display": "15/250" - } - } - } -} -``` - -#### Environment Variables - -```env -# Background job interval in seconds (default: 300 = 5 min) -GEMINI_CLI_QUOTA_REFRESH_INTERVAL=300 - -# Override default quota groups -QUOTA_GROUPS_GEMINI_CLI_PRO="gemini-2.5-pro,gemini-3-pro-preview" -QUOTA_GROUPS_GEMINI_CLI_25_FLASH="gemini-2.0-flash,gemini-2.5-flash,gemini-2.5-flash-lite" -QUOTA_GROUPS_GEMINI_CLI_3_FLASH="gemini-3-flash-preview" -``` - -### 2.18. Shared Gemini OAuth Utilities (`providers/utilities/`) - -The PR refactors shared logic between Gemini CLI and Antigravity providers into reusable utility modules: - -| Module | Purpose | -|--------|---------| -| `gemini_shared_utils.py` | Shared constants (FINISH_REASON_MAP, DEFAULT_SAFETY_SETTINGS, CODE_ASSIST_ENDPOINT), helper functions (env_bool, env_int, inline_schema_refs, recursively_parse_json_strings) | -| `base_quota_tracker.py` | Abstract base class for quota tracking with learned costs, credential discovery, and baseline management | -| `gemini_credential_manager.py` | Mixin for OAuth credential tier management, initialization, and background job interface | -| `gemini_file_logger.py` | Transaction-level file logging for debugging API requests and responses | -| `gemini_tool_handler.py` | Tool schema transformation and Gemini 3 tool fix logic | - -**Benefits:** -- Eliminates code duplication between Gemini CLI and Antigravity providers -- Single source of truth for shared constants and logic -- Easier maintenance and bug fixes -- Consistent behavior across Google OAuth-based providers - -### 2.19. Fair Cycle Rotation - -Fair Cycle Rotation ensures each credential is used at least once before any credential can be reused within a tier. This prevents a single credential from being repeatedly used and exhausted while others sit idle. 
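The mechanics are spelled out below (Problem Solved / Solution / Configuration); schematically, the bookkeeping amounts to marking a credential exhausted when it receives a long cooldown and skipping it until every other credential has exhausted too, or the cycle window lapses. The following is a simplified sketch with hypothetical names, using the documented defaults (300 s threshold, 24 h cycle), not the library's implementation:

```python
import time

EXHAUSTION_THRESHOLD = 300   # cooldowns longer than this count as "exhausted"
CYCLE_DURATION = 86400       # reset the cycle after 24 hours regardless

# Hypothetical in-memory shape; the real state lives under "__fair_cycle__" in key_usage.json.
cycle = {"started_at": time.time(), "exhausted": set()}

def on_cooldown(credential: str, cooldown_seconds: float, all_credentials: set) -> None:
    """Mark a credential exhausted once its cooldown exceeds the threshold."""
    if cooldown_seconds > EXHAUSTION_THRESHOLD:
        cycle["exhausted"].add(credential)
    # Reset when every credential has exhausted or the cycle window has elapsed.
    if cycle["exhausted"] >= all_credentials or time.time() - cycle["started_at"] > CYCLE_DURATION:
        cycle["exhausted"].clear()
        cycle["started_at"] = time.time()

def is_eligible(credential: str) -> bool:
    """Exhausted credentials are skipped until the cycle resets."""
    return credential not in cycle["exhausted"]
```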
- -**Problem Solved:** -- In sequential mode, the same high-priority credential might be used repeatedly -- When exhausted, it gets a cooldown, but after cooldown expires, it's used again -- Other credentials of the same tier never get used - -**Solution:** -- When a credential hits a long cooldown (> threshold), mark it as "exhausted" -- Exhausted credentials are skipped until ALL credentials in the tier exhaust -- Once all exhaust OR cycle duration expires, the cycle resets - -**Configuration (Environment Variables):** - -| Variable | Type | Default | Description | -|----------|------|---------|-------------| -| `FAIR_CYCLE_{PROVIDER}` | bool | sequential only | Enable/disable fair cycle | -| `FAIR_CYCLE_TRACKING_MODE_{PROVIDER}` | string | `model_group` | `model_group` or `credential` | -| `FAIR_CYCLE_CROSS_TIER_{PROVIDER}` | bool | `false` | Track across all tiers | -| `FAIR_CYCLE_DURATION_{PROVIDER}` | int | `86400` | Cycle duration in seconds | -| `EXHAUSTION_COOLDOWN_THRESHOLD_{PROVIDER}` | int | `300` | Threshold in seconds | - -**Defaults:** All defaults are defined in `src/rotator_library/config/defaults.py`. - -**Logging Format:** -``` -Acquiring key for model antigravity/claude-opus-4.5. Tried keys: 0/12(17,cd:3,fc:2) -# Breakdown: 0 tried, 12 available, 17 total, 3 on cooldown, 2 fair-cycle excluded -``` - -**Persistence:** -Cycle state is persisted in `key_usage.json` under the `__fair_cycle__` key. - -### 2.20. Custom Caps - -Custom Caps allow setting custom usage limits per tier, per model/group that are MORE restrictive than actual API limits. When the custom cap is reached, the credential is put on cooldown BEFORE hitting the actual API limit. - -**Use Cases:** -- Pace usage across quota window (don't burn 150 requests in first hour) -- Reserve capacity for certain times of day -- Add safety buffer (stop at 120/150 to avoid edge cases) -- Extend cooldown beyond natural reset for pacing - -**Key Principle: More Restrictive Only** -- Custom cap is always <= actual max (clamped if set higher) -- Custom cooldown is always >= natural reset time (clamped if set shorter) - -**Configuration (Environment Variables):** - -```bash -# Format -CUSTOM_CAP_{PROVIDER}_T{TIER}_{MODEL_OR_GROUP}= -CUSTOM_CAP_COOLDOWN_{PROVIDER}_T{TIER}_{MODEL_OR_GROUP}=: - -# Examples -CUSTOM_CAP_ANTIGRAVITY_T2_CLAUDE=100 -CUSTOM_CAP_COOLDOWN_ANTIGRAVITY_T2_CLAUDE=quota_reset - -CUSTOM_CAP_ANTIGRAVITY_T3_CLAUDE=30 -CUSTOM_CAP_COOLDOWN_ANTIGRAVITY_T3_CLAUDE=offset:3600 -``` - -**Cap Values:** -- Absolute number: `100` -- Percentage of actual max: `"80%"` - -**Cooldown Modes:** - -| Mode | Formula | Use Case | -|------|---------|----------| -| `quota_reset` | `quota_reset_ts` | Same as natural behavior | -| `offset` | `quota_reset_ts + value` | Add buffer time | -| `fixed` | `window_start_ts + value` | Fixed window from start | - -**Resolution Priority:** -1. Tier + Model (most specific) -2. Tier + Group (model's quota group) -3. Default + Model -4. Default + Group -5. No custom cap (use actual API limits) - -**Integration with Fair Cycle:** -When a custom cap triggers a cooldown longer than the exhaustion threshold, it also marks the credential as exhausted for fair cycle rotation. - -**Defaults:** See `src/rotator_library/config/defaults.py` for all configurable defaults. - -### 2.21. Anthropic API Compatibility (`anthropic_compat/`) - -A translation layer that enables Anthropic API clients (like Claude Code) to use any OpenAI-compatible provider through the proxy. 
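From a client's perspective, this means pointing an Anthropic-style request at the proxy's `/v1/messages` endpoint and authenticating with the proxy's own key. A minimal sketch follows; the host, port, and model name are illustrative:

```python
import httpx

# Anthropic-format request sent straight to the proxy; either the x-api-key
# header (Anthropic style) or an "Authorization: Bearer ..." header works.
resp = httpx.post(
    "http://localhost:8000/v1/messages",           # host/port are illustrative
    headers={"x-api-key": "YOUR_PROXY_API_KEY"},
    json={
        "model": "antigravity/claude-sonnet-4-5",  # any model the proxy exposes
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hello"}],
    },
    timeout=60,
)
print(resp.json())
```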
- -#### Architecture - -The module consists of three components: - -| File | Purpose | -|------|---------| -| `models.py` | Pydantic models for Anthropic request/response formats (`AnthropicMessagesRequest`, `AnthropicMessage`, `AnthropicTool`, etc.) | -| `translator.py` | Bidirectional format translation functions | -| `streaming.py` | SSE format conversion for streaming responses | - -#### Request Translation (`translate_anthropic_request`) - -Converts Anthropic Messages API requests to OpenAI Chat Completions format: - -**Message Conversion:** -- Anthropic `system` field → OpenAI system message -- `content` blocks (text, image, tool_use, tool_result) → OpenAI format -- Image blocks with base64 data → OpenAI `image_url` with data URI -- Document blocks (PDF, etc.) → OpenAI `image_url` format - -**Tool Conversion:** -- Anthropic `tools` with `input_schema` → OpenAI `tools` with `parameters` -- `tool_choice.type: "any"` → `"required"` -- `tool_choice.type: "tool"` → `{"type": "function", "function": {"name": ...}}` - -**Thinking Configuration:** -- `thinking.type: "enabled"` → `reasoning_effort: "high"` + `thinking_budget` -- `thinking.type: "disabled"` → `reasoning_effort: "disable"` -- Opus models default to thinking enabled - -**Special Handling:** -- Reorders assistant content blocks: thinking → text → tool_use -- Injects `[Continue]` prompt for fresh thinking turns -- Preserves thinking signatures for multi-turn conversations - -#### Response Translation (`openai_to_anthropic_response`) - -Converts OpenAI Chat Completions responses to Anthropic Messages format: - -**Content Blocks:** -- `reasoning_content` → thinking block with signature -- `content` → text block -- `tool_calls` → tool_use blocks with parsed JSON input - -**Field Mapping:** -- `finish_reason: "stop"` → `stop_reason: "end_turn"` -- `finish_reason: "length"` → `stop_reason: "max_tokens"` -- `finish_reason: "tool_calls"` → `stop_reason: "tool_use"` - -**Usage Translation:** -- `prompt_tokens` minus `cached_tokens` → `input_tokens` -- `completion_tokens` → `output_tokens` -- `prompt_tokens_details.cached_tokens` → `cache_read_input_tokens` - -#### Streaming Wrapper (`anthropic_streaming_wrapper`) - -Converts OpenAI SSE streaming format to Anthropic's event-based format: - -**Event Types Generated:** -``` -message_start → Initial message metadata -content_block_start → Start of text/thinking/tool_use block -content_block_delta → Incremental content (text_delta, thinking_delta, input_json_delta) -content_block_stop → End of content block -message_delta → Final metadata (stop_reason, usage) -message_stop → End of message -``` - -**Features:** -- Accumulates tool call arguments across chunks -- Handles thinking/reasoning content from `delta.reasoning_content` -- Proper block indexing for multiple content blocks -- Cache token handling in usage statistics -- Error recovery with proper message structure - -#### Client Integration - -The `RotatingClient` provides two methods for Anthropic compatibility: - -```python -async def anthropic_messages(self, request, raw_request=None, pre_request_callback=None): - """Handle Anthropic Messages API requests.""" - # 1. Translate Anthropic request to OpenAI format - # 2. Call acompletion() with translated request - # 3. Convert response back to Anthropic format - # 4. 
For streaming: wrap with anthropic_streaming_wrapper - -async def anthropic_count_tokens(self, request): - """Count tokens for Anthropic-format request.""" - # Translates messages and tools, then uses token_count() -``` - -#### Authentication - -The proxy accepts both Anthropic and OpenAI authentication styles: -- `x-api-key` header (Anthropic style) -- `Authorization: Bearer` header (OpenAI style) - -### 3.5. Antigravity (`antigravity_provider.py`) - -The most sophisticated provider implementation, supporting Google's internal Antigravity API for Gemini 3 and Claude models (including **Claude Opus 4.5**, Anthropic's most powerful model). - -#### Architecture - -- **Unified Streaming/Non-Streaming**: Single code path handles both response types with optimal transformations -- **Thought Signature Caching**: Server-side caching of encrypted signatures for multi-turn Gemini 3 conversations -- **Model-Specific Logic**: Automatic configuration based on model type (Gemini 3, Claude Sonnet, Claude Opus) -- **Credential Prioritization**: Automatic tier detection with paid credentials prioritized over free (paid tier resets every 5 hours, free tier resets weekly) -- **Sequential Rotation Mode**: Default rotation mode is sequential (use credentials until exhausted) to maximize thought signature cache hits -- **Per-Model Quota Tracking**: Each model tracks independent usage windows with authoritative reset timestamps from quota errors -- **Quota Groups**: Models that share quota limits are grouped together (Claude/GPT-OSS share quota, Gemini 3 Pro variants share quota, Gemini 2.5 Flash variants share quota) -- **Priority Multipliers**: Paid tier credentials get higher concurrency limits (Priority 1: 5x, Priority 2: 3x, Priority 3+: 2x in sequential mode) -- **Quota Baseline Tracking**: Background job fetches quota status from API to provide accurate remaining quota estimates -- **TransientQuotaError Handling**: Bare 429 responses (without retry info) are retried internally before credential rotation - -#### Model Support - -**Gemini 3 Pro:** -- Uses `thinkingLevel` parameter (string: "low" or "high") -- **Tool Hallucination Prevention**: - - Automatic system instruction injection explaining custom tool schema rules - - Parameter signature injection into tool descriptions (e.g., "STRICT PARAMETERS: files (ARRAY_OF_OBJECTS[path: string REQUIRED, ...])") - - Namespace prefix for tool names (`gemini3_` prefix) to avoid training data conflicts - - Malformed JSON auto-correction (handles extra trailing braces) -- **ThoughtSignature Management**: - - Caching signatures from responses for reuse in follow-up messages - - Automatic injection into functionCalls for multi-turn conversations - - Fallback to bypass value if signature unavailable -- **Parallel Tool Usage Instruction**: Configurable instruction injection to encourage parallel tool calls (disabled by default for Gemini 3) - -**Gemini 2.5 Flash:** -- Uses `-thinking` variant when `reasoning_effort` is provided -- Shares quota with `gemini-2.5-flash-thinking` and `gemini-2.5-flash-lite` variants -- Parallel tool usage instruction configurable - -**Gemini 2.5 Flash Lite:** -- Configurable thinking budget, no name change required -- Shares quota with Flash variants - -**Claude Opus 4.5:** -- Anthropic's most powerful model, now available via Antigravity proxy -- **Always uses thinking variant** - `claude-opus-4-5-thinking` is the only available variant (non-thinking version doesn't exist) -- Uses `thinkingBudget` parameter for extended thinking control 
(-1 for auto, 0 to disable, or specific token count) -- Full support for tool use with schema cleaning -- Same thinking preservation and sanitization features as Sonnet -- Increased default max output tokens to 64000 to accommodate thinking output - -**Claude Sonnet 4.5:** -- Proxied through Antigravity API -- **Supports both thinking and non-thinking modes**: - - With `reasoning_effort`: Uses `claude-sonnet-4-5-thinking` variant with `thinkingBudget` - - Without `reasoning_effort`: Uses standard `claude-sonnet-4-5` variant -- **Thinking Preservation**: Caches thinking content using composite keys (tool_call_id + text_hash) -- **Schema Cleaning**: Removes unsupported properties (`$schema`, `additionalProperties`, `const` → `enum`) -- **Parallel Tool Usage Instruction**: Automatic instruction injection to encourage parallel tool calls (enabled by default for Claude) - -**GPT-OSS 120B Medium:** -- OpenAI-compatible model available via Antigravity -- Shares quota with Claude models (Claude/GPT-OSS quota group) - -#### Base URL Fallback - -Automatic fallback chain for resilience: -1. `daily-cloudcode-pa.sandbox.googleapis.com` (primary sandbox) -2. `autopush-cloudcode-pa.sandbox.googleapis.com` (fallback sandbox) -3. `cloudcode-pa.googleapis.com` (production fallback) - -#### Message Transformation - -**OpenAI → Gemini Format:** -- System messages → `systemInstruction` with parts array -- Multi-part content (text + images) → `inlineData` format -- Tool calls → `functionCall` with args and id -- Tool responses → `functionResponse` with name and response -- ThoughtSignatures preserved/injected as needed - -**Tool Response Grouping:** -- Converts linear format (call, response, call, response) to grouped format -- Groups all function calls in one `model` message -- Groups all responses in one `user` message -- Required for Antigravity API compatibility - -#### Configuration (Environment Variables) - -```env -# Cache control -ANTIGRAVITY_SIGNATURE_CACHE_TTL=3600 # Memory cache TTL -ANTIGRAVITY_SIGNATURE_DISK_TTL=86400 # Disk cache TTL -ANTIGRAVITY_ENABLE_SIGNATURE_CACHE=true - -# Feature flags -ANTIGRAVITY_PRESERVE_THOUGHT_SIGNATURES=true # Include signatures in client responses -ANTIGRAVITY_ENABLE_DYNAMIC_MODELS=false # Use API model discovery -ANTIGRAVITY_GEMINI3_TOOL_FIX=true # Enable Gemini 3 hallucination prevention -ANTIGRAVITY_CLAUDE_THINKING_SANITIZATION=true # Enable Claude thinking mode auto-correction - -# Gemini 3 tool fix customization -ANTIGRAVITY_GEMINI3_TOOL_PREFIX="gemini3_" # Namespace prefix -ANTIGRAVITY_GEMINI3_DESCRIPTION_PROMPT="\n\nSTRICT PARAMETERS: {params}." -ANTIGRAVITY_GEMINI3_SYSTEM_INSTRUCTION="..." # Full system prompt - -# Parallel tool usage instruction -ANTIGRAVITY_PARALLEL_TOOL_INSTRUCTION_CLAUDE=true # Inject parallel tool instruction for Claude (default: true) -ANTIGRAVITY_PARALLEL_TOOL_INSTRUCTION_GEMINI3=false # Inject parallel tool instruction for Gemini 3 (default: false) -ANTIGRAVITY_PARALLEL_TOOL_INSTRUCTION="..." # Custom instruction text - -# Quota tracking -ANTIGRAVITY_QUOTA_REFRESH_INTERVAL=300 # Background quota refresh interval in seconds (default: 300 = 5 min) -``` - -#### Claude Extended Thinking Sanitization - -The provider now includes robust automatic sanitization for Claude's extended thinking mode, handling all common error scenarios with conversation history. 
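-
-The problem, handled scenarios, and recovery logic are described below. To make the core rule concrete first, here is a simplified illustration in the Gemini message format the sanitizer operates on; the shapes are abbreviated and the helper is hypothetical, not the provider's actual code.
-
-```python
-# Simplified illustration of the invariant the sanitizer enforces: when extended
-# thinking is enabled, the final assistant ("model") turn must lead with a
-# thinking part, marked "thought": True in Gemini format. Hypothetical helper.
-final_assistant_turn = {
-    "role": "model",
-    "parts": [
-        {"text": "Inspect the file before editing.", "thought": True},  # thinking part
-        {"functionCall": {"name": "read_file", "args": {"path": "main.py"}}},
-    ],
-}
-
-def starts_with_thinking(message: dict) -> bool:
-    parts = message.get("parts", [])
-    return bool(parts) and parts[0].get("thought") is True
-
-assert starts_with_thinking(final_assistant_turn)
-```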
- -**Problem**: Claude's extended thinking API requires strict consistency in thinking blocks: -- If thinking is enabled, the final assistant turn must start with a thinking block -- If thinking is disabled, no thinking blocks can be present in the final turn -- Tool use loops are part of a single "assistant turn" -- You **cannot** toggle thinking mode mid-turn (this is invalid per Claude API) - -**Scenarios Handled**: - -| Scenario | Action | -|----------|--------| -| Tool loop WITH thinking + thinking enabled | Preserve thinking, continue normally | -| Tool loop WITHOUT thinking + thinking enabled | **Inject synthetic closure** to start fresh turn with thinking | -| Thinking disabled | Strip all thinking blocks | -| Normal conversation (no tool loop) | Strip old thinking, new response adds thinking naturally | -| Function call ID mismatch | Three-tier recovery: ID match → name match → fallback | -| Missing tool responses | Automatic placeholder injection | -| Compacted/cached conversations | Recover thinking from cache post-transformation | - -**Key Implementation Details**: - -The `_sanitize_thinking_for_claude()` method now: -- Operates on Gemini-format messages (`parts[]` with `"thought": true` markers) -- Detects tool results as user messages with `functionResponse` parts -- Uses `_analyze_turn_state()` to classify conversation state on Gemini format -- Recovers thinking from cache when client strips reasoning_content -- When enabling thinking in a tool loop started without thinking: - - Injects synthetic assistant message to close the previous turn - - Allows Claude to start fresh turn with thinking capability - -**Function Call Response Grouping**: - -The enhanced pairing system ensures conversation history integrity: -``` -Problem: Client/proxy may mutate response IDs or lose responses during context processing - -Solution: -1. Try direct ID match (tool_call_id == response.id) -2. If no match, try function name match (tool.name == response.name) -3. If still no match, use order-based fallback (nth tool → nth response) -4. Repair "unknown_function" responses with correct names -5. Create placeholders for completely missing responses -``` - -**Configuration**: -```env -ANTIGRAVITY_CLAUDE_THINKING_SANITIZATION=true # Enable/disable auto-correction (default: true) -``` - -**Note**: These fixes ensure Claude thinking mode works seamlessly with tool use, model switching, context compression, and cached conversations. No manual intervention required. - -#### File Logging - -Optional transaction logging for debugging: -- Enabled via `enable_request_logging` parameter -- Creates `logs/antigravity_logs/TIMESTAMP_MODEL_UUID/` directory per request -- Logs: `request_payload.json`, `response_stream.log`, `final_response.json`, `error.log` - ---- - - -- **Atomic Disk Writes**: Uses temp-file-and-move pattern to prevent corruption - -**Key Methods:** - -1. **`store(key, value)`**: Synchronously queues value for storage (schedules async write) -2. **`retrieve(key)`**: Synchronously retrieves from memory, optionally schedules disk fallback -3. **`store_async(key, value)`**: Awaitable storage for guaranteed persistence -4. 
**`retrieve_async(key)`**: Awaitable retrieval with disk fallback - -**Use Cases:** - -- **Gemini 3 ThoughtSignatures**: Caching tool call signatures for multi-turn conversations -- **Claude Thinking**: Preserving thinking content for consistency across conversation turns -- **Any Transient State**: Generic key-value storage for provider-specific needs - -**Configuration (Environment Variables):** - -```env -# Cache control (prefix can be customized per cache instance) -PROVIDER_CACHE_ENABLE=true -PROVIDER_CACHE_WRITE_INTERVAL=60 # seconds between disk writes -PROVIDER_CACHE_CLEANUP_INTERVAL=1800 # 30 min between cleanups - -# Gemini 3 specific -GEMINI_CLI_SIGNATURE_CACHE_ENABLE=true -GEMINI_CLI_SIGNATURE_CACHE_TTL=3600 # 1 hour memory TTL -GEMINI_CLI_SIGNATURE_DISK_TTL=86400 # 24 hours disk TTL -``` - -**File Structure:** - -``` -cache/ -├── gemini_cli/ -│ └── gemini3_signatures.json -└── antigravity/ - ├── gemini3_signatures.json - └── claude_thinking.json -``` - ---- - -### 2.13. Sequential Rotation & Per-Model Quota Tracking - -A comprehensive credential rotation and quota management system introduced in PR #31. - -#### Rotation Modes - -Two rotation strategies are available per provider: - -**Balanced Mode (Default)**: -- Distributes load evenly across all credentials -- Least-used credentials selected first -- Best for providers with per-minute rate limits -- Prevents any single credential from being overused - -**Sequential Mode**: -- Uses one credential until it's exhausted (429 quota error) -- Switches to next credential only after current one fails -- Most-used credentials selected first (sticky behavior) -- Best for providers with daily/weekly quotas -- Maximizes cache hit rates (e.g., Antigravity thought signatures) -- Default for Antigravity provider - -**Configuration**: -```env -# Set per provider -ROTATION_MODE_GEMINI=sequential -ROTATION_MODE_OPENAI=balanced -ROTATION_MODE_ANTIGRAVITY=balanced # Override default -``` - -#### Per-Model Quota Tracking - -Instead of tracking usage at the credential level, the system now supports granular per-model tracking: - -**Data Structure** (when `mode="per_model"`): -```json -{ - "credential_id": { - "models": { - "gemini-2.5-pro": { - "window_start_ts": 1733678400.0, - "quota_reset_ts": 1733696400.0, - "success_count": 15, - "prompt_tokens": 5000, - "completion_tokens": 1000, - "approx_cost": 0.05, - "window_started": "2025-12-08 14:00:00 +0100", - "quota_resets": "2025-12-08 19:00:00 +0100" - } - }, - "global": {...}, - "model_cooldowns": {...} - } -} -``` - -**Key Features**: -- Each model tracks its own usage window independently -- `window_start_ts`: When the current quota period started -- `quota_reset_ts`: Authoritative reset time from provider error response -- Human-readable timestamps added for debugging -- Supports custom window durations (5h, 7d, etc.) - -#### Provider-Specific Quota Parsing - -Providers can implement `parse_quota_error()` to extract precise reset times from error responses: - -```python -@staticmethod -def parse_quota_error(error, error_body) -> Optional[Dict]: - """Extract quota reset timestamp from provider error. 
- - Returns: - { - 'quota_reset_timestamp': 1733696400.0, # Unix timestamp - 'retry_after': 18000 # Seconds until reset - } - """ -``` - -**Google RPC Format** (Antigravity, Gemini CLI): -- Parses `RetryInfo` and `ErrorInfo` from error details -- Handles duration strings: `"143h4m52.73s"` or `"515092.73s"` -- Extracts `quotaResetTimeStamp` and converts to Unix timestamp -- Falls back to `quotaResetDelay` if timestamp not available - -**Example Error Response**: -```json -{ - "error": { - "code": 429, - "message": "Quota exceeded", - "details": [{ - "@type": "type.googleapis.com/google.rpc.RetryInfo", - "retryDelay": "143h4m52.73s" - }, { - "@type": "type.googleapis.com/google.rpc.ErrorInfo", - "metadata": { - "quotaResetTimeStamp": "2025-12-08T19:00:00Z" - } - }] - } -} -``` - -#### Model Quota Groups - -Models that share the same quota limits can be grouped: - -**Configuration**: -```env -# Models in a group share quota/cooldown timing -QUOTA_GROUPS_ANTIGRAVITY_CLAUDE="claude-sonnet-4-5,claude-sonnet-4-5-thinking,claude-opus-4-5,claude-opus-4-5-thinking,gpt-oss-120b-medium" -QUOTA_GROUPS_ANTIGRAVITY_GEMINI_3_PRO="gemini-3-pro-high,gemini-3-pro-low,gemini-3-pro-preview" -QUOTA_GROUPS_ANTIGRAVITY_GEMINI_2_5_FLASH="gemini-2.5-flash,gemini-2.5-flash-thinking,gemini-2.5-flash-lite" - -# To disable a default group: -QUOTA_GROUPS_ANTIGRAVITY_CLAUDE="" -``` - -**Default Quota Groups (Antigravity)**: - -| Group Name | Models | Shared Quota | -|------------|--------|--------------| -| `claude` | claude-sonnet-4-5, claude-sonnet-4-5-thinking, claude-opus-4-5, claude-opus-4-5-thinking, gpt-oss-120b-medium | Yes (Claude and GPT-OSS share quota) | -| `gemini-3-pro` | gemini-3-pro-high, gemini-3-pro-low, gemini-3-pro-preview | Yes | -| `gemini-2.5-flash` | gemini-2.5-flash, gemini-2.5-flash-thinking, gemini-2.5-flash-lite | Yes | - -**Behavior**: -- When one model hits quota, all models in the group receive the same `quota_reset_ts` -- Group resets only when ALL models' quotas have reset -- Preserves unexpired cooldowns during other resets - -**Provider Implementation**: -```python -class AntigravityProvider(ProviderInterface): - model_quota_groups = { - # Claude and GPT-OSS share the same quota pool - "claude": [ - "claude-sonnet-4-5", - "claude-sonnet-4-5-thinking", - "claude-opus-4-5", - "claude-opus-4-5-thinking", - "gpt-oss-120b-medium", - ], - # Gemini 3 Pro variants share quota - "gemini-3-pro": [ - "gemini-3-pro-high", - "gemini-3-pro-low", - "gemini-3-pro-preview", - ], - # Gemini 2.5 Flash variants share quota - "gemini-2.5-flash": [ - "gemini-2.5-flash", - "gemini-2.5-flash-thinking", - "gemini-2.5-flash-lite", - ], - } -``` - -#### Priority-Based Concurrency Multipliers - -Credentials can be assigned to priority tiers with configurable concurrency limits: - -**Configuration**: -```env -# Universal multipliers (all modes) -CONCURRENCY_MULTIPLIER_ANTIGRAVITY_PRIORITY_1=10 -CONCURRENCY_MULTIPLIER_ANTIGRAVITY_PRIORITY_2=3 - -# Mode-specific overrides -CONCURRENCY_MULTIPLIER_ANTIGRAVITY_PRIORITY_2_BALANCED=1 # Lower in balanced mode -``` - -**How it works**: -```python -effective_concurrent_limit = MAX_CONCURRENT_REQUESTS_PER_KEY * tier_multiplier -``` - -**Provider Defaults** (Antigravity): -- Priority 1 (paid ultra): 5x multiplier -- Priority 2 (standard paid): 3x multiplier -- Priority 3+ (free): 2x (sequential mode) or 1x (balanced mode) - -**Benefits**: -- Paid credentials handle more load without manual configuration -- Different concurrency for different rotation modes -- Automatic tier 
detection based on credential properties - -#### Reset Window Configuration - -Providers can specify custom reset windows per priority tier: - -```python -class AntigravityProvider(ProviderInterface): - usage_reset_configs = { - frozenset([1, 2]): UsageResetConfigDef( - mode="per_model", - window_hours=5, # 5-hour rolling window for paid tiers - field_name="5h_window" - ), - frozenset([3, 4, 5]): UsageResetConfigDef( - mode="per_model", - window_hours=168, # 7-day window for free tier - field_name="7d_window" - ) - } -``` - -**Supported Modes**: -- `per_model`: Independent window per model with authoritative reset times -- `credential`: Single window per credential (legacy) -- `daily`: Daily reset at configured UTC hour (legacy) - -#### Usage Flow - -1. **Request arrives** for model X with credential Y -2. **Check rotation mode**: Sequential or balanced? -3. **Select credential**: - - Filter by priority tier requirements - - Apply concurrency multiplier for effective limit - - Sort by rotation mode strategy -4. **Check quota**: - - Load model's usage data - - Check if within window (window_start_ts to quota_reset_ts) - - Check model quota groups for combined usage -5. **Execute request** -6. **On success**: Increment model usage count -7. **On quota error**: - - Parse error for `quota_reset_ts` - - Apply to model (and quota group) - - Credential remains on cooldown until reset time -8. **On window expiration**: - - Archive model data to global stats - - Start fresh window with new `window_start_ts` - - Preserve unexpired quota cooldowns - ---- - -### 2.12. Google OAuth Base (`providers/google_oauth_base.py`) - -A refactored, reusable OAuth2 base class that eliminates code duplication across Google-based providers. - -**Refactoring Benefits:** - -- **Single Source of Truth**: All OAuth logic centralized in one class -- **Easy Provider Addition**: New providers only need to override constants -- **Consistent Behavior**: Token refresh, expiry handling, and validation work identically across providers -- **Maintainability**: OAuth bugs fixed once apply to all inheriting providers - -**Provider Implementation:** - -```python -class AntigravityAuthBase(GoogleOAuthBase): - # Required overrides - CLIENT_ID = "antigravity-client-id" - CLIENT_SECRET = "antigravity-secret" - OAUTH_SCOPES = [ - "https://www.googleapis.com/auth/cloud-platform", - "https://www.googleapis.com/auth/cclog", # Antigravity-specific - "https://www.googleapis.com/auth/experimentsandconfigs", - ] - ENV_PREFIX = "ANTIGRAVITY" # Used for env var loading - - # Optional overrides (defaults provided) - CALLBACK_PORT = 51121 - CALLBACK_PATH = "/oauthcallback" -``` - -**Inherited Features:** - -- Automatic token refresh with exponential backoff -- Invalid grant re-authentication flow -- Stateless deployment support (env var loading) -- Atomic credential file writes -- Headless environment detection -- Sequential refresh queue processing - -#### OAuth Callback Port Configuration - -Each OAuth provider uses a local callback server during authentication. The callback port can be customized via environment variables to avoid conflicts with other services. - -**Default Ports:** - -| Provider | Default Port | Environment Variable | -|----------|-------------|---------------------| -| Gemini CLI | 8085 | `GEMINI_CLI_OAUTH_PORT` | -| Antigravity | 51121 | `ANTIGRAVITY_OAUTH_PORT` | -| iFlow | 11451 | `IFLOW_OAUTH_PORT` | - -**Configuration Methods:** - -1. **Via TUI Settings Menu:** - - Main Menu → `4. View Provider & Advanced Settings` → `1. 
Launch Settings Tool` - - Select the provider (Gemini CLI, Antigravity, or iFlow) - - Modify the `*_OAUTH_PORT` setting - - Use "Reset to Default" to restore the original port - -2. **Via `.env` file:** - ```env - # Custom OAuth callback ports (optional) - GEMINI_CLI_OAUTH_PORT=8085 - ANTIGRAVITY_OAUTH_PORT=51121 - IFLOW_OAUTH_PORT=11451 - ``` - -**When to Change Ports:** - -- If the default port conflicts with another service on your system -- If running multiple proxy instances on the same machine -- If firewall rules require specific port ranges - -**Note:** Port changes take effect on the next OAuth authentication attempt. Existing tokens are not affected. - ---- - -### 2.14. HTTP Timeout Configuration (`timeout_config.py`) - -Centralized timeout configuration for all HTTP requests to LLM providers. - -#### Purpose - -The `TimeoutConfig` class provides fine-grained control over HTTP timeouts for streaming and non-streaming LLM requests. This addresses the common issue of proxy hangs when upstream providers stall during connection establishment or response generation. - -#### Timeout Types Explained - -| Timeout | Description | -|---------|-------------| -| **connect** | Maximum time to establish a TCP/TLS connection to the upstream server | -| **read** | Maximum time to wait between receiving data chunks (resets on each chunk for streaming) | -| **write** | Maximum time to wait while sending the request body | -| **pool** | Maximum time to wait for a connection from the connection pool | - -#### Default Values - -| Setting | Streaming | Non-Streaming | Rationale | -|---------|-----------|---------------|-----------| -| **connect** | 30s | 30s | Fast fail if server is unreachable | -| **read** | 180s (3 min) | 600s (10 min) | Streaming expects periodic chunks; non-streaming may wait for full generation | -| **write** | 30s | 30s | Request bodies are typically small | -| **pool** | 60s | 60s | Reasonable wait for connection pool | - -#### Environment Variable Overrides - -All timeout values can be customized via environment variables: - -```env -# Connection establishment timeout (seconds) -TIMEOUT_CONNECT=30 - -# Request body send timeout (seconds) -TIMEOUT_WRITE=30 - -# Connection pool acquisition timeout (seconds) -TIMEOUT_POOL=60 - -# Read timeout between chunks for streaming requests (seconds) -# If no data arrives for this duration, the connection is considered stalled -TIMEOUT_READ_STREAMING=180 - -# Read timeout for non-streaming responses (seconds) -# Longer to accommodate models that take time to generate full responses -TIMEOUT_READ_NON_STREAMING=600 -``` - -#### Streaming vs Non-Streaming Behavior - -**Streaming Requests** (`TimeoutConfig.streaming()`): -- Uses shorter read timeout (default 3 minutes) -- Timer resets every time a chunk arrives -- If no data for 3 minutes → connection considered dead → failover to next credential -- Appropriate for chat completions where tokens should arrive periodically - -**Non-Streaming Requests** (`TimeoutConfig.non_streaming()`): -- Uses longer read timeout (default 10 minutes) -- Server may take significant time to generate the complete response before sending anything -- Complex reasoning tasks or large outputs may legitimately take several minutes -- Only used by Antigravity provider's `_handle_non_streaming()` method - -#### Provider Usage - -The following providers use `TimeoutConfig`: - -| Provider | Method | Timeout Type | -|----------|--------|--------------| -| `antigravity_provider.py` | `_handle_non_streaming()` | 
`non_streaming()` | -| `antigravity_provider.py` | `_handle_streaming()` | `streaming()` | -| `gemini_cli_provider.py` | `acompletion()` | `streaming()` | -| `iflow_provider.py` | `acompletion()` | `streaming()` | -| `qwen_code_provider.py` | `acompletion()` | `streaming()` | - -**Note:** iFlow, Qwen Code, and Gemini CLI providers always use streaming internally (even for non-streaming requests), aggregating chunks into a complete response. Only Antigravity has a true non-streaming path. - -#### Tuning Recommendations - -| Use Case | Recommendation | -|----------|----------------| -| **Long thinking tasks** | Increase `TIMEOUT_READ_STREAMING` to 300-360s | -| **Unstable network** | Increase `TIMEOUT_CONNECT` to 60s | -| **High concurrency** | Increase `TIMEOUT_POOL` if seeing pool exhaustion | -| **Large context/output** | Increase `TIMEOUT_READ_NON_STREAMING` to 900s+ | - -#### Example Configuration - -```env -# For environments with complex reasoning tasks -TIMEOUT_READ_STREAMING=300 -TIMEOUT_READ_NON_STREAMING=900 - -# For unstable network conditions -TIMEOUT_CONNECT=60 -TIMEOUT_POOL=120 -``` - ---- - - ---- - -## 3. Provider Specific Implementations - -The library handles provider idiosyncrasies through specialized "Provider" classes in `src/rotator_library/providers/`. - -### 3.1. Gemini CLI (`gemini_cli_provider.py`) - -The `GeminiCliProvider` is the most complex implementation, mimicking the Google Cloud Code extension. - -**New in PR #62**: -- **Quota Baseline Tracking**: Background job fetches quota status from API (`retrieveUserQuota`) to provide accurate remaining quota estimates -- **GeminiCliQuotaTracker Mixin**: Inherits from `BaseQuotaTracker` for shared quota infrastructure with Antigravity -- **env:// Credential Support**: Environment-based credentials are detected and loaded via `env://gemini_cli/N` URIs -- **Quota Groups**: Models sharing quota are grouped (`pro`, `25-flash`, `3-flash`) for accurate cooldown propagation -- **24-Hour Fixed Windows**: All tiers use fixed 24-hour windows from first request (verified 2026-01-07) - -**From PR #31**: -- **Quota Parsing**: Implements `parse_quota_error()` using Google RPC format parser -- **Tier Configuration**: Defines `tier_priorities` and `usage_reset_configs` for automatic priority resolution -- **Sequential Rotation**: Defaults to sequential mode (uses credentials until quota exhausted) -- **Priority Multipliers**: Same as Antigravity (P1: 5x, P2: 3x, others: 2x in sequential mode) - -#### Authentication (`gemini_auth_base.py`) - - * **Device Flow**: Uses a standard OAuth 2.0 flow. The `credential_tool` spins up a local web server (default: `localhost:8085`, configurable via `GEMINI_CLI_OAUTH_PORT`) to capture the callback from Google's auth page. - * **Token Lifecycle**: - * **Proactive Refresh**: Tokens are refreshed 5 minutes before expiry. - * **Atomic Writes**: Credential files are updated using a temp-file-and-move strategy to prevent corruption during writes. - * **Revocation Handling**: If a `400` or `401` occurs during refresh, the token is marked as revoked, preventing infinite retry loops. - -#### Project ID Discovery (Zero-Config) - -The provider employs a sophisticated, cached discovery mechanism to find a valid Google Cloud Project ID: -1. **Configuration**: Checks `GEMINI_CLI_PROJECT_ID` first. -2. **Code Assist API**: Tries `CODE_ASSIST_ENDPOINT:loadCodeAssist`. This returns the project associated with the Cloud Code extension. -3. 
**Onboarding Flow**: If step 2 fails, it triggers the `onboardUser` endpoint. This initiates a Long-Running Operation (LRO) that automatically provisions a free-tier Google Cloud Project for the user. The proxy polls this operation for up to 5 minutes until completion. -4. **Resource Manager**: As a final fallback, it lists all active projects via the Cloud Resource Manager API and selects the first one. - -#### Rate Limit Handling - -* **Internal Endpoints**: Uses `https://cloudcode-pa.googleapis.com/v1internal`, which typically has higher quotas than the public API. -* **Smart Fallback**: If `gemini-2.5-pro` hits a rate limit (`429`), the provider transparently retries the request using `gemini-2.5-pro-preview-06-05`. This fallback chain is configurable in code. - -#### Quota Tracking - -The provider implements quota tracking via the `GeminiCliQuotaTracker` mixin (see Section 2.17): - -* **Real-Time Quota API**: Fetches quota status from `retrieveUserQuota` endpoint -* **Background Refresh**: Configurable interval (default: 5 minutes) via `GEMINI_CLI_QUOTA_REFRESH_INTERVAL` -* **Model Quota Groups**: Pro models share quota, Flash 2.x models share quota, Flash 3 is standalone - -**Default Quota Groups:** - -| Group Name | Models | Verified Sharing | -|------------|--------|------------------| -| `pro` | gemini-2.5-pro, gemini-3-pro-preview | Yes (same bucket) | -| `25-flash` | gemini-2.0-flash, gemini-2.5-flash, gemini-2.5-flash-lite | Yes (same bucket) | -| `3-flash` | gemini-3-flash-preview | Standalone | - -**Quota Limits by Tier:** - -| Tier | Pro Group | Flash Groups | -|------|-----------|--------------| -| standard-tier | 250 requests/24h | 1500 requests/24h | -| free-tier | 100 requests/24h | 1000 requests/24h | - -#### Configuration (Environment Variables) - -```env -# Quota tracking -GEMINI_CLI_QUOTA_REFRESH_INTERVAL=300 # Background refresh interval (default: 5 min) - -# Override quota groups -QUOTA_GROUPS_GEMINI_CLI_PRO="gemini-2.5-pro,gemini-3-pro-preview" -QUOTA_GROUPS_GEMINI_CLI_25_FLASH="gemini-2.0-flash,gemini-2.5-flash,gemini-2.5-flash-lite" -QUOTA_GROUPS_GEMINI_CLI_3_FLASH="gemini-3-flash-preview" -``` - -### 3.2. Qwen Code (`qwen_code_provider.py`) - -* **Dual Auth**: Supports both standard API keys (direct) and OAuth (via `QwenAuthBase`). -* **Device Flow**: Implements the OAuth Device Authorization Grant (RFC 8628). It displays a code to the user and polls the token endpoint until the user authorizes the device in their browser. -* **Dummy Tool Injection**: To work around a Qwen API bug where streams hang if `tools` is empty but `tool_choice` logic is present, the provider injects a benign `do_not_call_me` tool. -* **Schema Cleaning**: Recursively removes `strict` and `additionalProperties` from tool schemas, as Qwen's validation is stricter than OpenAI's. -* **Reasoning Parsing**: Detects `` tags in the raw stream and redirects their content to a separate `reasoning_content` field in the delta, mimicking the OpenAI o1 format. - -### 3.3. iFlow (`iflow_provider.py`) - -* **Hybrid Auth**: Uses a custom OAuth flow (Authorization Code) to obtain an `access_token`. However, the *actual* API calls use a separate `apiKey` that is retrieved from the user's profile (`/api/oauth/getUserInfo`) using the access token. -* **Callback Server**: The auth flow spins up a local server (default: port `11451`, configurable via `IFLOW_OAUTH_PORT`) to capture the redirect. -* **Token Management**: Automatically refreshes the OAuth token and re-fetches the API key if needed. 
-* **Schema Cleaning**: Similar to Qwen, it aggressively sanitizes tool schemas to prevent 400 errors. -* **Dedicated Logging**: Implements `_IFlowFileLogger` to capture raw chunks for debugging proprietary API behaviors. - -### 3.4. Google Gemini (`gemini_provider.py`) - -* **Thinking Parameter**: Automatically handles the `thinking` parameter transformation required for Gemini 2.5 models (`thinking` -> `gemini-2.5-pro` reasoning parameter). -* **Safety Settings**: Ensures default safety settings (blocking nothing) are applied if not provided, preventing over-sensitive refusals. - ---- - -## 4. Logging & Debugging - -### `detailed_logger.py` - -To facilitate robust debugging, the proxy includes a comprehensive transaction logging system. - -* **Unique IDs**: Every request generates a UUID. -* **Directory Structure**: Logs are stored in `logs/detailed_logs/YYYYMMDD_HHMMSS_{uuid}/`. -* **Artifacts**: - * `request.json`: The exact payload sent to the proxy. - * `final_response.json`: The complete reassembled response. - * `streaming_chunks.jsonl`: A line-by-line log of every SSE chunk received from the provider. - * `metadata.json`: Performance metrics (duration, token usage, model used). - -This level of detail allows developers to trace exactly why a request failed or why a specific key was rotated. - ---- - -## 5. Runtime Resilience - -The proxy is engineered to maintain high availability even in the face of runtime filesystem disruptions. This "Runtime Resilience" capability ensures that the service continues to process API requests even if data files or directories are deleted while the application is running. - -### 5.1. Centralized Resilient I/O (`resilient_io.py`) - -All file operations are centralized in a single utility module that provides consistent error handling, graceful degradation, and automatic retry with shutdown flush: - -#### `BufferedWriteRegistry` (Singleton) - -Global registry for buffered writes with periodic retry and shutdown flush. 
Ensures critical data is saved even if disk writes fail temporarily: - -- **Per-file buffering**: Each file path has its own pending write (latest data always wins) -- **Periodic retries**: Background thread retries failed writes every 30 seconds -- **Shutdown flush**: `atexit` hook ensures final write attempt on app exit (Ctrl+C) -- **Thread-safe**: Safe for concurrent access from multiple threads - -```python -# Get the singleton instance -registry = BufferedWriteRegistry.get_instance() - -# Check pending writes (for monitoring) -pending_count = registry.get_pending_count() -pending_files = registry.get_pending_paths() - -# Manual flush (optional - atexit handles this automatically) -results = registry.flush_all() # Returns {path: success_bool} - -# Manual shutdown (if needed before atexit) -results = registry.shutdown() -``` - -#### `ResilientStateWriter` - -For stateful files that must persist (usage stats): -- **Memory-first**: Always updates in-memory state before attempting disk write -- **Atomic writes**: Uses tempfile + move pattern to prevent corruption -- **Automatic retry with backoff**: If disk fails, waits `retry_interval` seconds before trying again -- **Shutdown integration**: Registers with `BufferedWriteRegistry` on failure for final flush -- **Health monitoring**: Exposes `is_healthy` property for monitoring - -```python -writer = ResilientStateWriter("data.json", logger, retry_interval=30.0) -writer.write({"key": "value"}) # Always succeeds (memory update) -if not writer.is_healthy: - logger.warning("Disk writes failing, data in memory only") -# On next write() call after retry_interval, disk write is attempted again -# On app exit (Ctrl+C), BufferedWriteRegistry attempts final save -``` - -#### `safe_write_json()` - -For JSON writes with configurable options (credentials, cache): - -| Parameter | Default | Description | -|-----------|---------|-------------| -| `path` | required | File path to write to | -| `data` | required | JSON-serializable data | -| `logger` | required | Logger for warnings | -| `atomic` | `True` | Use atomic write pattern (tempfile + move) | -| `indent` | `2` | JSON indentation level | -| `ensure_ascii` | `True` | Escape non-ASCII characters | -| `secure_permissions` | `False` | Set file permissions to 0o600 | -| `buffer_on_failure` | `False` | Register with BufferedWriteRegistry on failure | - -When `buffer_on_failure=True`: -- Failed writes are registered with `BufferedWriteRegistry` -- Data is retried every 30 seconds in background -- On app exit, final write attempt is made automatically -- Success unregisters the pending write - -```python -# For critical data (auth tokens) - use buffer_on_failure -safe_write_json(path, creds, logger, secure_permissions=True, buffer_on_failure=True) - -# For non-critical data (logs) - no buffering needed -safe_write_json(path, data, logger) -``` - -#### `safe_log_write()` - -For log files where occasional loss is acceptable: -- Fire-and-forget pattern -- Creates parent directories if needed -- Returns `True`/`False`, never raises -- **No buffering** - logs are dropped on failure - -#### `safe_mkdir()` - -For directory creation with error handling. - -### 5.2. Resilience Hierarchy - -The system follows a strict hierarchy of survival: - -1. **Core API Handling (Level 1)**: The Python runtime keeps all necessary code in memory. Deleting source code files while the proxy is running will **not** crash active requests. - -2. **Credential Management (Level 2)**: OAuth tokens are cached in memory first. 
If credential files are deleted, the proxy continues using cached tokens. If a token refresh succeeds but the file cannot be written, the new token is buffered for retry and saved on shutdown. - -3. **Usage Tracking (Level 3)**: Usage statistics (`key_usage.json`) are maintained in memory via `ResilientStateWriter`. If the file is deleted, the system tracks usage internally and attempts to recreate the file on the next save interval. Pending writes are flushed on shutdown. - -4. **Provider Cache (Level 4)**: The provider cache tracks disk health and continues operating in memory-only mode if disk writes fail. Has its own shutdown mechanism. - -5. **Logging (Level 5)**: Logging is treated as non-critical. If the `logs/` directory is removed, the system attempts to recreate it. If creation fails, logging degrades gracefully without interrupting the request flow. **No buffering or retry**. - -### 5.3. Component Integration - -| Component | Utility Used | Behavior on Disk Failure | Shutdown Flush | -|-----------|--------------|--------------------------|----------------| -| `UsageManager` | `ResilientStateWriter` | Continues in memory, retries after 30s | Yes (via registry) | -| `GoogleOAuthBase` | `safe_write_json(buffer_on_failure=True)` | Memory cache preserved, buffered for retry | Yes (via registry) | -| `QwenAuthBase` | `safe_write_json(buffer_on_failure=True)` | Memory cache preserved, buffered for retry | Yes (via registry) | -| `IFlowAuthBase` | `safe_write_json(buffer_on_failure=True)` | Memory cache preserved, buffered for retry | Yes (via registry) | -| `ProviderCache` | `safe_write_json` + own shutdown | Retries via own background loop | Yes (own mechanism) | -| `DetailedLogger` | `safe_write_json` | Logs dropped, no crash | No | -| `failure_logger` | Python `logging.RotatingFileHandler` | Falls back to NullHandler | No | - -### 5.4. Shutdown Behavior - -When the application exits (including Ctrl+C): - -1. **atexit handler fires**: `BufferedWriteRegistry._atexit_handler()` is called -2. **Pending writes counted**: Registry checks how many files have pending writes -3. **Flush attempted**: Each pending file gets a final write attempt -4. **Results logged**: - - Success: `"Shutdown flush: all N write(s) succeeded"` - - Partial: `"Shutdown flush: X succeeded, Y failed"` with failed file names - -**Console output example:** -``` -INFO:rotator_library.resilient_io:Flushing 2 pending write(s) on shutdown... -INFO:rotator_library.resilient_io:Shutdown flush: all 2 write(s) succeeded -``` - -### 5.5. "Develop While Running" - -This architecture supports a robust development workflow: - -- **Log Cleanup**: You can safely run `rm -rf logs/` while the proxy is serving traffic. The system will recreate the directory structure on the next request. -- **Config Reset**: Deleting `key_usage.json` resets the persistence layer, but the running instance preserves its current in-memory counts for load balancing consistency. -- **File Recovery**: If you delete a critical file, the system attempts directory auto-recreation before every write operation. -- **Safe Exit**: Ctrl+C triggers graceful shutdown with final data flush attempt. - -### 5.6. Graceful Degradation & Data Loss - -While functionality is preserved, persistence may be compromised during filesystem failures: - -- **Logs**: If disk writes fail, detailed request logs may be lost (no buffering). -- **Usage Stats**: Buffered in memory and flushed on shutdown. Data loss only if shutdown flush also fails. 
-- **Credentials**: Buffered in memory and flushed on shutdown. Re-authentication only needed if shutdown flush fails. -- **Cache**: Provider cache entries may need to be regenerated after restart if its own shutdown mechanism fails. - -### 5.7. Monitoring Disk Health - -Components expose health information for monitoring: - -```python -# BufferedWriteRegistry -registry = BufferedWriteRegistry.get_instance() -pending = registry.get_pending_count() # Number of files with pending writes -files = registry.get_pending_paths() # List of pending file names - -# UsageManager -writer = usage_manager._state_writer -health = writer.get_health_info() -# Returns: {"healthy": True, "failure_count": 0, "last_success": 1234567890.0, ...} - -# ProviderCache -stats = cache.get_stats() -# Includes: {"disk_available": True, "disk_errors": 0, ...} -``` - ---- - -## 6. Model Filter GUI - -The Model Filter GUI (`model_filter_gui.py`) provides a visual interface for configuring model ignore and whitelist rules per provider. It replaces the need to manually edit `IGNORE_MODELS_*` and `WHITELIST_MODELS_*` environment variables. - -### 6.1. Overview - -**Purpose**: Visually manage which models are exposed via the `/v1/models` endpoint for each provider. - -**Launch**: -```bash -python -c "from src.proxy_app.model_filter_gui import run_model_filter_gui; run_model_filter_gui()" -``` - -Or via the launcher TUI if integrated. - -### 6.2. Features - -#### Core Functionality - -- **Provider Selection**: Dropdown to switch between available providers with automatic model fetching -- **Ignore Rules**: Pattern-based rules (supports wildcards like `*-preview`, `gpt-4*`) to exclude models -- **Whitelist Rules**: Pattern-based rules to explicitly include models, overriding ignore rules -- **Real-time Preview**: Typing in rule input fields highlights affected models before committing -- **Rule-Model Linking**: Click a model to highlight the affecting rule; click a rule to highlight all affected models -- **Persistence**: Rules saved to `.env` file in standard `IGNORE_MODELS_` and `WHITELIST_MODELS_` format - -#### Dual-Pane Model View - -The interface displays two synchronized lists: - -| Left Pane | Right Pane | -|-----------|------------| -| All fetched models (plain text) | Same models with color-coded status | -| Shows total count | Shows available/ignored count | -| Scrolls in sync with right pane | Color indicates affecting rule | - -**Color Coding**: -- **Green**: Model is available (no rule affects it, or whitelisted) -- **Red/Orange tones**: Model is ignored (color matches the specific ignore rule) -- **Blue/Teal tones**: Model is explicitly whitelisted (color matches the whitelist rule) - -#### Rule Management - -- **Comma-separated input**: Add multiple rules at once (e.g., `*-preview, *-beta, gpt-3.5*`) -- **Wildcard support**: `*` matches any characters (e.g., `gemini-*-preview`) -- **Affected count**: Each rule shows how many models it affects -- **Tooltips**: Hover over a rule to see the list of affected models -- **Instant delete**: Click the × button to remove a rule immediately - -### 6.3. Keyboard Shortcuts - -| Shortcut | Action | -|----------|--------| -| `Ctrl+S` | Save changes to `.env` | -| `Ctrl+R` | Refresh models from provider | -| `Ctrl+F` | Focus search field | -| `F1` | Show help dialog | -| `Escape` | Clear search / Clear highlights | - -### 6.4. 
Context Menu - -Right-click on any model to access: - -- **Add to Ignore List**: Creates an ignore rule for the exact model name -- **Add to Whitelist**: Creates a whitelist rule for the exact model name -- **View Affecting Rule**: Highlights the rule that affects this model -- **Copy Model Name**: Copies the full model ID to clipboard - -### 6.5. Integration with Proxy - -The GUI modifies the same environment variables that the `RotatingClient` reads: - -1. **GUI saves rules** → Updates `.env` file -2. **Proxy reads on startup** → Loads `IGNORE_MODELS_*` and `WHITELIST_MODELS_*` -3. **Proxy applies rules** → `get_available_models()` filters based on rules - -**Note**: The proxy must be restarted to pick up rule changes made via the GUI (or use the Launcher TUI's reload functionality if available). - diff --git a/Deployment guide.md b/Deployment guide.md deleted file mode 100644 index 44c7e033..00000000 --- a/Deployment guide.md +++ /dev/null @@ -1,753 +0,0 @@ -# Easy Guide to Deploying LLM-API-Key-Proxy on Render - -This guide walks you through deploying the [LLM-API-Key-Proxy](https://github.com/Mirrowel/LLM-API-Key-Proxy) as a hosted service on Render.com. The project provides a universal, OpenAI-compatible API endpoint for all your LLM providers (like Gemini or OpenAI), powered by an intelligent key management library. It's perfect for integrating with platforms like JanitorAI, where you can use it as a custom proxy for highly available and resilient chats. - -The process is beginner-friendly and takes about 15-30 minutes. We'll use Render's free tier (with limitations like sleep after 15 minutes of inactivity) and upload your `.env` file as a secret for easy key management—no manual entry of variables required. - -## Prerequisites - -- A free Render.com account (sign up at render.com). -- A GitHub account (for forking the repo). -- Basic terminal access (e.g., Command Prompt, Terminal, or Git Bash). -- API keys from LLM providers (e.g., Gemini, OpenAI—get them from their dashboards). For details on supported providers and how to format their keys (e.g., API key naming conventions), refer to the [LiteLLM Providers Documentation](https://docs.litellm.ai/docs/providers). - -**Note**: You don't need Python installed for initial testing—use the pre-compiled Windows EXE from the repo's releases for a quick local trial. - -## Step 1: Test Locally with the Compiled EXE (No Python Required) - -Before deploying, try the proxy locally to ensure your keys work. This uses a pre-built executable that's easy to set up. - -1. Go to the repo's [GitHub Releases page](https://github.com/Mirrowel/LLM-API-Key-Proxy/releases). -2. Download the latest release ZIP file (e.g., for Windows). -3. Unzip the file. -4. Double-click `setup_env.bat`. A window will open—follow the prompts to add your PROXY_API_KEY (a strong secret you create) and provider keys. Use the [LiteLLM Providers Documentation](https://docs.litellm.ai/docs/providers) for guidance on key formats (e.g., `GEMINI_API_KEY_1="your-key"`). -5. Double-click `proxy_app.exe` to start the proxy. It runs at `http://127.0.0.1:8000`—visit in a browser to confirm "API Key Proxy is running". -6. 
Test with curl (replace with your PROXY_API_KEY): - -``` -curl -X POST http://127.0.0.1:8000/v1/chat/completions -H "Content-Type: application/json" -H "Authorization: Bearer your-proxy-key" -d '{"model": "gemini/gemini-2.5-flash", "messages": [{"role": "user", "content": "What is the capital of France?"}]}' -``` - - - Expected: A JSON response with the answer (e.g., "Paris"). - -If it works, you're ready to deploy. If not, double-check your keys against LiteLLM docs. - -## Step 2: Fork and Prepare the Repository - -1. Go to the original repo: [https://github.com/Mirrowel/LLM-API-Key-Proxy](https://github.com/Mirrowel/LLM-API-Key-Proxy). -2. Click **Fork** in the top-right to create your own copy (this lets you make changes if needed). -3. Clone your forked repo locally: - -``` -git clone https://github.com/YOUR-USERNAME/LLM-API-Key-Proxy.git -cd LLM-API-Key-Proxy -``` - -## Step 3: Assemble Your .env File - -The proxy uses a `.env` file to store your API keys securely. We'll create this based on the repo's documentation. - -1. In your cloned repo, copy the example: `copy .env.example .env` (Windows) or `cp .env.example .env` (macOS/Linux). -2. Open `.env` in a text editor (e.g., Notepad or VS Code). -3. Add your keys following the format from the repo's README and [LiteLLM Providers Documentation](https://docs.litellm.ai/docs/providers): - - **PROXY_API_KEY**: Create a strong, unique secret (e.g., "my-super-secret-proxy-key"). This authenticates requests to your proxy. - - **Provider Keys**: Add keys for your chosen providers. You can add multiple per provider (e.g., \_1, \_2) for rotation. - -Example `.env` (customize with your real keys): - -``` -# Your proxy's authentication key (invent a strong one) -PROXY_API_KEY="my-super-secret-proxy-key" - -# Provider API keys (get from provider dashboards; see LiteLLM docs for formats) -GEMINI_API_KEY_1="your-gemini-key-here" -GEMINI_API_KEY_2="another-gemini-key" - -OPENROUTER_API_KEY_1="your-openrouter-key" -``` - - - Supported providers: Check LiteLLM docs for a full list and specifics (e.g., GEMINI, OPENROUTER, NVIDIA_NIM). - - Tip: Start with 1-2 providers to test. Don't share this file publicly! - -### Advanced: Stateless Deployment for OAuth Providers (Gemini CLI, Qwen, iFlow) - -If you are using providers that require complex OAuth files (like **Gemini CLI**, **Qwen Code**, or **iFlow**), you don't need to upload the JSON files manually. The proxy includes a tool to "export" these credentials into environment variables. - -1. Run the credential tool locally: `python -m rotator_library.credential_tool` -2. Select the "Export ... to .env" option for your provider. -3. The tool will generate a file (e.g., `gemini_cli_user_at_gmail.env`) containing variables like `GEMINI_CLI_ACCESS_TOKEN`, `GEMINI_CLI_REFRESH_TOKEN`, etc. -4. Copy the contents of this file and paste them directly into your `.env` file or Render's "Environment Variables" section. -5. The proxy will automatically detect and use these variables—no file upload required! - -### Advanced: Antigravity OAuth Provider - -The Antigravity provider requires OAuth2 authentication similar to Gemini CLI. It provides access to: - -- Gemini 2.5 models (Pro/Flash) -- Gemini 3 models (Pro/Image-preview) - **requires paid-tier Google Cloud project** -- Claude Sonnet 4.5 via Google's Antigravity proxy - -**Setting up Antigravity locally:** - -1. Run the credential tool: `python -m rotator_library.credential_tool` -2. Select "Add OAuth Credential" and choose "Antigravity" -3. 
Complete the OAuth flow in your browser -4. The credential is saved to `oauth_creds/antigravity_oauth_1.json` - -**Exporting for stateless deployment:** - -1. Run: `python -m rotator_library.credential_tool` -2. Select "Export Antigravity to .env" -3. Copy the generated environment variables to your deployment platform: - ```env - ANTIGRAVITY_ACCESS_TOKEN="..." - ANTIGRAVITY_REFRESH_TOKEN="..." - ANTIGRAVITY_EXPIRY_DATE="..." - ANTIGRAVITY_EMAIL="your-email@gmail.com" - ``` - -**Important Notes:** - -- Antigravity uses Google OAuth with additional scopes for cloud platform access -- Gemini 3 models require a paid-tier Google Cloud project (free tier will fail) -- The provider automatically handles thought signature caching for multi-turn conversations -- Tool hallucination prevention is enabled by default for Gemini 3 models - -4. Save the file. (We'll upload it to Render in Step 5.) - -## Step 4: Create a New Web Service on Render - -1. Log in to render.com and go to your Dashboard. -2. Click **New > Web Service**. -3. Choose **Build and deploy from a Git repository** > **Next**. -4. Connect your GitHub account and select your forked repo. -5. In the setup form: - - **Name**: Something like "llm-api-key-proxy". - - **Region**: Choose one close to you (e.g., Oregon for US West). - - **Branch**: "main" (or your default). - - **Runtime**: Python 3. - - **Build Command**: `pip install -r requirements.txt`. - - **Start Command**: `uvicorn src.proxy_app.main:app --host 0.0.0.0 --port $PORT`. - - **Instance Type**: Free (for testing; upgrade later for always-on service). -6. Click **Create Web Service**. Render will build and deploy—watch the progress in the Events tab. - -## Step 5: Upload .env as a Secret File - -Render mounts secret files securely at runtime, making your `.env` available to the app without exposing it. - -1. In your new service's Dashboard, go to **Environment > Secret Files**. -2. Click **Add Secret File**. -3. **File Path**: Don't change. Keep it as root directory of the repo. -4. **Contents**: Upload the `.env` file you created previously. -5. Save. This injects the file for the app to load via `dotenv` (already in the code). -6. Trigger a redeploy: Go to **Deploy > Manual Deploy** > **Deploy HEAD** (or push a small change to your repo). - -Your keys are now loaded automatically! - -## Step 6: Test Your Deployed Proxy - -1. Note your service URL: It's in the Dashboard (e.g., https://llm-api-key-proxy.onrender.com). -2. Test with curl (replace with your PROXY_API_KEY): - -``` -curl -X POST https://your-service.onrender.com/v1/chat/completions -H "Content-Type: application/json" -H "Authorization: Bearer your-proxy-key" -d '{"model": "gemini/gemini-2.5-flash", "messages": [{"role": "user", "content": "What is the capital of France?"}]}' -``` - - - Expected: A JSON response with the answer (e.g., "Paris"). - -3. Check logs in Render's Dashboard for startup messages (e.g., "RotatingClient initialized"). - -## Step 7: Integrate with JanitorAI - -1. Log in to janitorai.com and go to API settings (usually in a chat or account menu). -2. Select "Proxy" mode. -3. **API URL**: `https://your-service.onrender.com/v1`. -4. **API Key**: Your PROXY_API_KEY (from .env). -5. **Model**: Format as "provider/model" (e.g., "gemini/gemini-2.5-flash"; check LiteLLM docs for options). -6. Save and test a chat—messages should route through your proxy. - -## Troubleshooting - -- **Build Fails**: Check Render logs for missing dependencies—ensure `requirements.txt` is up to date. 
-- **401 Unauthorized**: Verify your PROXY_API_KEY matches exactly (case-sensitive) and includes "Bearer " in requests. Or you have no keys for the provider/model added that you are trying to use. -- **405 on OPTIONS**: If CORS issues arise, add the middleware from Step 3 and redeploy. -- **Service Sleeps**: Free tier sleeps after inactivity—first requests may delay. -- **Provider Key Issues**: Double-check formats in [LiteLLM Providers Documentation](https://docs.litellm.ai/docs/providers). -- **More Help**: Check Render docs or the repo's README. If stuck, share error logs. - -That is it. - ---- - -## Appendix: Deploying with Docker - -Docker provides a consistent, portable deployment option for any platform. The proxy image is automatically built and published to GitHub Container Registry (GHCR) on every push to `main` or `dev` branches. - -### Quick Start with Docker Compose - -This is the **fastest way** to deploy the proxy using Docker. - -1. **Create your configuration files:** - -```bash -# Clone the repo (or just download docker-compose.yml and .env.example) -git clone https://github.com/Mirrowel/LLM-API-Key-Proxy.git -cd LLM-API-Key-Proxy - -# Create your .env file -cp .env.example .env -nano .env # Add your PROXY_API_KEY and provider keys - -# Create key_usage.json file (required before first run) -touch key_usage.json -``` - -> **Important:** You must create `key_usage.json` before running Docker Compose. If this file doesn't exist on the host, Docker will create it as a directory instead of a file, causing the container to fail. - -2. **Start the proxy:** - -```bash -docker compose up -d -``` - -3. **Verify it's running:** - -```bash -# Check container status -docker compose ps - -# View logs -docker compose logs -f - -# Test the endpoint -curl http://localhost:8000/ -``` - -### Manual Docker Run - -If you prefer not to use Docker Compose: - -```bash -# Create necessary directories and files -mkdir -p oauth_creds logs -touch key_usage.json - -# Run the container -docker run -d \ - --name llm-api-proxy \ - --restart unless-stopped \ - -p 8000:8000 \ - -v $(pwd)/.env:/app/.env:ro \ - -v $(pwd)/oauth_creds:/app/oauth_creds \ - -v $(pwd)/logs:/app/logs \ - -v $(pwd)/key_usage.json:/app/key_usage.json \ - -e SKIP_OAUTH_INIT_CHECK=true \ - -e PYTHONUNBUFFERED=1 \ - ghcr.io/mirrowel/llm-api-key-proxy:latest -``` - -### Available Image Tags - -| Tag | Description | Use Case | -| ----------------------- | ----------------------------------------------- | -------------------- | -| `latest` | Latest stable build from `main` branch | Production | -| `dev-latest` | Latest build from `dev` branch | Testing new features | -| `YYYYMMDD-HHMMSS-` | Specific version with timestamp and commit hash | Pinned deployments | - -Example using a specific version: - -```bash -docker pull ghcr.io/mirrowel/llm-api-key-proxy:20250106-143022-abc1234 -``` - -### Volume Mounts Explained - -| Host Path | Container Path | Purpose | Mode | -| ------------------ | --------------------- | --------------------------------- | ----------------- | -| `./.env` | `/app/.env` | Configuration and API keys | Read-only (`:ro`) | -| `./oauth_creds/` | `/app/oauth_creds/` | OAuth credential JSON files | Read-write | -| `./logs/` | `/app/logs/` | Request logs and detailed logging | Read-write | -| `./key_usage.json` | `/app/key_usage.json` | Usage statistics persistence | Read-write | - -### Setting Up OAuth Providers with Docker - -OAuth providers (Antigravity, Gemini CLI, Qwen Code, iFlow) require interactive browser 
authentication. Since Docker containers run headless, you must authenticate **outside the container** first. - -#### Option 1: Authenticate Locally, Mount Credentials (Recommended) - -1. **Set up the project locally:** - -```bash -git clone https://github.com/Mirrowel/LLM-API-Key-Proxy.git -cd LLM-API-Key-Proxy -pip install -r requirements.txt -``` - -2. **Run the credential tool and complete OAuth flows:** - -```bash -python -m rotator_library.credential_tool -# Select "Add OAuth Credential" → Choose provider -# Complete authentication in browser -``` - -3. **Deploy with Docker, mounting the oauth_creds directory:** - -```bash -docker compose up -d -# The oauth_creds/ directory is automatically mounted -``` - -#### Option 2: Export Credentials to Environment Variables - -For truly stateless deployments (no mounted credential files): - -1. **Complete OAuth locally as above** - -2. **Export credentials to environment variables:** - -```bash -python -m rotator_library.credential_tool -# Select "Export [Provider] to .env" -``` - -3. **Add the exported variables to your `.env` file:** - -```env -# Example for Antigravity -ANTIGRAVITY_ACCESS_TOKEN="ya29.a0AfB_byD..." -ANTIGRAVITY_REFRESH_TOKEN="1//0gL6dK9..." -ANTIGRAVITY_EXPIRY_DATE="1735901234567" -ANTIGRAVITY_EMAIL="user@gmail.com" -ANTIGRAVITY_CLIENT_ID="1071006060591-..." -ANTIGRAVITY_CLIENT_SECRET="GOCSPX-..." -``` - -4. **Deploy with Docker:** - -```bash -docker compose up -d -# Credentials are loaded from .env, no oauth_creds mount needed -``` - -### Development: Building Locally - -For development or customization, use the development compose file: - -```bash -# Build and run from local source -docker compose -f docker-compose.dev.yml up -d --build - -# Rebuild after code changes -docker compose -f docker-compose.dev.yml up -d --build --force-recreate -``` - -### Container Management - -```bash -# Stop the proxy -docker compose down - -# Restart the proxy -docker compose restart - -# View real-time logs -docker compose logs -f - -# Check container resource usage -docker stats llm-api-proxy - -# Update to latest image -docker compose pull -docker compose up -d -``` - -### Docker on Different Platforms - -The image is built for both `linux/amd64` and `linux/arm64` architectures, so it works on: - -- Linux servers (x86_64, ARM64) -- macOS (Intel and Apple Silicon) -- Windows with WSL2/Docker Desktop -- Raspberry Pi 4+ (ARM64) - -### Troubleshooting Docker Deployment - -| Issue | Solution | -| ----------------------------- | ---------------------------------------------------------------------------------------------------------------- | -| Container exits immediately | Check logs: `docker compose logs` — likely missing `.env` or invalid config | -| Permission denied on volumes | Ensure directories exist and have correct permissions: `mkdir -p oauth_creds logs && chmod 755 oauth_creds logs` | -| OAuth credentials not loading | Verify `oauth_creds/` is mounted and contains valid JSON files, or check environment variables are set | -| Port already in use | Change the port mapping: `-p 9000:8000` or edit `docker-compose.yml` | -| Image not updating | Force pull: `docker compose pull && docker compose up -d` | - ---- - -## Appendix: Deploying to a Custom VPS - -If you're deploying the proxy to a **custom VPS** (DigitalOcean, AWS EC2, Linode, etc.) instead of Render.com, you'll encounter special considerations when setting up OAuth providers (Antigravity, Gemini CLI, iFlow). This section covers the professional deployment workflow. 
- -### Understanding the OAuth Callback Problem - -OAuth providers like Antigravity, Gemini CLI, and iFlow require an interactive authentication flow that: - -1. Opens a browser for you to log in -2. Redirects back to a **local callback server** running on specific ports -3. Receives an authorization code to exchange for tokens - -The callback servers bind to `localhost` on these ports: - -| Provider | Port | Notes | -| --------------- | ----- | ---------------------------------------------- | -| **Antigravity** | 51121 | Google OAuth with extended scopes | -| **Gemini CLI** | 8085 | Google OAuth for Gemini API | -| **iFlow** | 11451 | Authorization Code flow with API key fetch | -| **Qwen Code** | N/A | Uses Device Code flow - works on remote VPS ✅ | - -**The Issue**: When running on a remote VPS, your local browser cannot reach `http://localhost:51121` (or other callback ports) on the remote server, causing authentication to fail with a "connection refused" error. - -### Recommended Deployment Workflow - -There are **three professional approaches** to handle OAuth authentication for VPS deployment, listed from most recommended to least: - ---- - -### **Option 1: Authenticate Locally, Deploy Credentials (RECOMMENDED)** - -This is the **cleanest and most secure** approach. Complete OAuth flows on your local machine, export to environment variables, then deploy. - -#### Step 1: Clone and Set Up Locally - -```bash -# On your local development machine -git clone https://github.com/YOUR-USERNAME/LLM-API-Key-Proxy.git -cd LLM-API-Key-Proxy - -# Install dependencies -pip install -r requirements.txt -``` - -#### Step 2: Run OAuth Authentication Locally - -```bash -# Start the credential tool -python -m rotator_library.credential_tool -``` - -Select **"Add OAuth Credential"** and choose your provider: - -- Antigravity -- Gemini CLI -- iFlow -- Qwen Code (works directly on VPS, but can authenticate locally too) - -The tool will: - -1. Open your browser automatically -2. Start a local callback server -3. Complete the OAuth flow -4. Save credentials to `oauth_creds/_oauth_N.json` - -#### Step 3: Export Credentials to Environment Variables - -Still in the credential tool, select the export option for each provider: - -- **"Export Antigravity to .env"** -- **"Export Gemini CLI to .env"** -- **"Export iFlow to .env"** -- **"Export Qwen Code to .env"** - -The tool generates a `.env` file snippet like: - -```env -# Antigravity OAuth Credentials -ANTIGRAVITY_ACCESS_TOKEN="ya29.a0AfB_byD..." -ANTIGRAVITY_REFRESH_TOKEN="1//0gL6dK9..." -ANTIGRAVITY_EXPIRY_DATE="1735901234567" -ANTIGRAVITY_EMAIL="user@gmail.com" -ANTIGRAVITY_CLIENT_ID="1071006060591-..." -ANTIGRAVITY_CLIENT_SECRET="GOCSPX-..." -ANTIGRAVITY_TOKEN_URI="https://oauth2.googleapis.com/token" -ANTIGRAVITY_UNIVERSE_DOMAIN="googleapis.com" -``` - -Copy these variables to a file (e.g., `oauth_credentials.env`). 
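-Before moving on to Step 4, it can help to sanity-check the exported file. The snippet below is a minimal check, assuming the file is named `oauth_credentials.env` as in the example above and that your providers use the variable prefixes shown earlier:
-
-```bash
-# List the exported OAuth variable names (values hidden) to confirm the export worked
-grep -E '^(ANTIGRAVITY|GEMINI_CLI|IFLOW|QWEN_CODE)' oauth_credentials.env | cut -d= -f1
-
-# A refresh token is the critical piece for long-running deployments
-grep -c 'REFRESH_TOKEN' oauth_credentials.env
-```
-
-If no refresh token shows up, re-run the export before deploying.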
- -#### Step 4: Deploy to VPS - -**Method A: Using Environment Variables (Recommended)** - -```bash -# On your VPS -cd /path/to/LLM-API-Key-Proxy - -# Create or edit .env file -nano .env - -# Paste the exported environment variables -# Also add your PROXY_API_KEY and other provider keys - -# Start the proxy -uvicorn src.proxy_app.main:app --host 0.0.0.0 --port 8000 -``` - -**Method B: Upload Credential Files** - -```bash -# On your local machine - copy credential files to VPS -scp -r oauth_creds/ user@your-vps-ip:/path/to/LLM-API-Key-Proxy/ - -# On VPS - verify files exist -ls -la oauth_creds/ - -# Start the proxy -uvicorn src.proxy_app.main:app --host 0.0.0.0 --port 8000 -``` - -> **Note**: Environment variables are preferred for production deployments (more secure, easier to manage, works with container orchestration). - ---- - -### **Option 2: SSH Port Forwarding (For Direct VPS Authentication)** - -If you need to authenticate directly on the VPS (e.g., you don't have a local development environment), use SSH port forwarding to create secure tunnels. - -#### How It Works - -SSH tunnels forward ports from your local machine to the remote VPS, allowing your local browser to reach the callback servers. - -#### Step-by-Step Process - -**Step 1: Create SSH Tunnels** - -From your **local machine**, open a terminal and run: - -```bash -# Forward all OAuth callback ports at once -ssh -L 51121:localhost:51121 -L 8085:localhost:8085 -L 11451:localhost:11451 user@your-vps-ip - -# Alternative: Forward ports individually as needed -ssh -L 51121:localhost:51121 user@your-vps-ip # For Antigravity -ssh -L 8085:localhost:8085 user@your-vps-ip # For Gemini CLI -ssh -L 11451:localhost:11451 user@your-vps-ip # For iFlow -``` - -**Keep this SSH session open** during the entire authentication process. - -**Step 2: Run Credential Tool on VPS** - -In the same SSH terminal (or open a new SSH connection): - -```bash -cd /path/to/LLM-API-Key-Proxy - -# Ensure Python dependencies are installed -pip install -r requirements.txt - -# Run the credential tool -python -m rotator_library.credential_tool -``` - -**Step 3: Complete OAuth Flow** - -1. Select **"Add OAuth Credential"** → Choose your provider -2. The tool displays an authorization URL -3. **Click the URL in your local browser** (works because of the SSH tunnel!) -4. Complete the authentication flow -5. The browser redirects to `localhost:` - **this now routes through the tunnel to your VPS** -6. Credentials are saved to `oauth_creds/` on the VPS - -**Step 4: Export to Environment Variables** - -Still in the credential tool: - -1. Select the export option for each provider -2. Copy the generated environment variables -3. Add them to `/path/to/LLM-API-Key-Proxy/.env` on your VPS - -**Step 5: Close Tunnels and Deploy** - -```bash -# Exit the SSH session with tunnels (Ctrl+D or type 'exit') -# Tunnels are no longer needed - -# Start the proxy on VPS (in a screen/tmux session or as a service) -uvicorn src.proxy_app.main:app --host 0.0.0.0 --port 8000 -``` - ---- - -### **Option 3: Copy Credential Files to VPS** - -If you've already authenticated locally and have credential files, you can copy them directly. 
- -#### Copy OAuth Credential Files - -```bash -# From your local machine -scp -r oauth_creds/ user@your-vps-ip:/path/to/LLM-API-Key-Proxy/ - -# Verify on VPS -ssh user@your-vps-ip -ls -la /path/to/LLM-API-Key-Proxy/oauth_creds/ -``` - -Expected files: - -- `antigravity_oauth_1.json` -- `gemini_cli_oauth_1.json` -- `iflow_oauth_1.json` -- `qwen_code_oauth_1.json` - -#### Configure .env to Use Credential Files - -On your VPS, edit `.env`: - -```env -# Option A: Use credential files directly (not recommended for production) -# No special configuration needed - the proxy auto-detects oauth_creds/ folder - -# Option B: Export to environment variables (recommended) -# Run credential tool and export each provider to .env -``` - ---- - -### Environment Variables vs. Credential Files - -| Aspect | Environment Variables | Credential Files | -| -------------------------- | --------------------------------------- | --------------------------------------- | -| **Security** | ✅ More secure (no files on disk) | ⚠️ Files readable if server compromised | -| **Container-Friendly** | ✅ Perfect for Docker/K8s | ❌ Requires volume mounts | -| **Ease of Rotation** | ✅ Update .env and restart | ⚠️ Need to regenerate JSON files | -| **Backup/Version Control** | ✅ Easy to manage with secrets managers | ❌ Binary files, harder to manage | -| **Auto-Refresh** | ✅ Uses refresh tokens | ✅ Uses refresh tokens | -| **Recommended For** | Production deployments | Local development / testing | - -**Best Practice**: Always export to environment variables for VPS/cloud deployments. - ---- - -### Production Deployment Checklist - -#### Security Best Practices - -- [ ] Never commit `.env` or `oauth_creds/` to version control -- [ ] Use environment variables instead of credential files in production -- [ ] Secure your VPS firewall - **do not** open OAuth callback ports (51121, 8085, 11451) to public internet -- [ ] Use SSH port forwarding only during initial authentication -- [ ] Rotate credentials regularly using the credential tool's export feature -- [ ] Set file permissions on `.env`: `chmod 600 .env` - -#### Firewall Configuration - -OAuth callback ports should **never** be publicly exposed: - -```bash -# ❌ DO NOT DO THIS - keeps ports closed -# sudo ufw allow 51121/tcp -# sudo ufw allow 8085/tcp -# sudo ufw allow 11451/tcp - -# ✅ Only open your proxy API port -sudo ufw allow 8000/tcp - -# Check firewall status -sudo ufw status -``` - -The SSH tunnel method works **without** opening these ports because traffic routes through the SSH connection (port 22). - -#### Running as a Service - -Create a systemd service file on your VPS: - -```bash -# Create service file -sudo nano /etc/systemd/system/llm-proxy.service -``` - -```ini -[Unit] -Description=LLM API Key Proxy -After=network.target - -[Service] -Type=simple -User=your-username -WorkingDirectory=/path/to/LLM-API-Key-Proxy -Environment="PATH=/path/to/python/bin" -ExecStart=/path/to/python/bin/uvicorn src.proxy_app.main:app --host 0.0.0.0 --port 8000 -Restart=always -RestartSec=10 - -[Install] -WantedBy=multi-user.target -``` - -```bash -# Enable and start the service -sudo systemctl daemon-reload -sudo systemctl enable llm-proxy -sudo systemctl start llm-proxy - -# Check status -sudo systemctl status llm-proxy - -# View logs -sudo journalctl -u llm-proxy -f -``` - ---- - -### Troubleshooting VPS Deployment - -#### "localhost:51121 connection refused" Error - -**Cause**: Trying to authenticate directly on VPS without SSH tunnel. 
- -**Solution**: Use Option 1 (authenticate locally) or Option 2 (SSH port forwarding). - -#### OAuth Credentials Not Loading - -```bash -# Check if environment variables are set -printenv | grep -E '(ANTIGRAVITY|GEMINI_CLI|IFLOW|QWEN_CODE)' - -# Verify .env file exists and is readable -ls -la .env -cat .env | grep -E '(ANTIGRAVITY|GEMINI_CLI|IFLOW|QWEN_CODE)' - -# Check credential files if using file-based approach -ls -la oauth_creds/ -``` - -#### Token Refresh Failing - -The proxy automatically refreshes tokens using refresh tokens. If refresh fails: - -1. **Re-authenticate**: Run credential tool again and export new credentials -2. **Check token expiry**: Some providers require periodic re-authentication -3. **Verify credentials**: Ensure `REFRESH_TOKEN` is present in environment variables - -#### Permission Denied on .env - -```bash -# Set correct permissions -chmod 600 .env -chown your-username:your-username .env -``` - ---- - -### Summary: VPS Deployment Best Practices - -1. **Authenticate locally** on your development machine (easiest, most secure) -2. **Export to environment variables** using the credential tool's built-in export feature -3. **Deploy to VPS** by adding environment variables to `.env` -4. **Never open OAuth callback ports** to the public internet -5. **Use SSH port forwarding** only if you must authenticate directly on VPS -6. **Run as a systemd service** for production reliability -7. **Monitor logs** for authentication errors and token refresh issues - -This approach ensures secure, production-ready deployment while maintaining the convenience of OAuth authentication. diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index aafcb117..00000000 --- a/Dockerfile +++ /dev/null @@ -1,49 +0,0 @@ -# Build stage -FROM python:3.11-slim AS builder - -WORKDIR /app - -# Install build dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - gcc \ - && rm -rf /var/lib/apt/lists/* - -# Set PATH for user-installed packages in builder stage -ENV PATH=/root/.local/bin:$PATH - -# Copy requirements first for better caching -COPY requirements.txt . - -# Copy the local rotator_library for editable install -COPY src/rotator_library ./src/rotator_library - -# Install dependencies -RUN pip install --no-cache-dir --user -r requirements.txt - -# Production stage -FROM python:3.11-slim - -WORKDIR /app - -# Copy installed packages from builder -COPY --from=builder /root/.local /root/.local - -# Make sure scripts in .local are usable -ENV PATH=/root/.local/bin:$PATH - -# Copy application code -COPY src/ ./src/ - -# Create directories for logs and oauth credentials -RUN mkdir -p logs oauth_creds - -# Expose the default port -EXPOSE 8000 - -# Set environment variables -ENV PYTHONUNBUFFERED=1 -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONPATH=/app/src - -# Default command - runs proxy with the correct PYTHONPATH -CMD ["python", "src/proxy_app/main.py", "--port", "8000"] diff --git a/LICENSE b/LICENSE deleted file mode 100644 index ff19bcb5..00000000 --- a/LICENSE +++ /dev/null @@ -1,7 +0,0 @@ -This project contains components with different licenses. - -- The core library, located in `src/rotator_library/`, is licensed under the GNU Lesser General Public License, Version 3.0. Copies of the license can be found in `src/rotator_library/COPYING` and `src/rotator_library/COPYING.LESSER`. - -- The proxy application, located in `src/proxy_app/`, is licensed under the MIT License. A copy of the license can be found in `src/proxy_app/LICENSE`. 
- -Please see the individual license files for the full terms. \ No newline at end of file diff --git a/README.md b/README.md deleted file mode 100644 index a7c3c438..00000000 --- a/README.md +++ /dev/null @@ -1,999 +0,0 @@ -# Universal LLM API Proxy & Resilience Library -[![ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/C0C0UZS4P) -[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/Mirrowel/LLM-API-Key-Proxy) [![zread](https://img.shields.io/badge/Ask_Zread-_.svg?style=flat&color=00b0aa&labelColor=000000&logo=data%3Aimage%2Fsvg%2Bxml%3Bbase64%2CPHN2ZyB3aWR0aD0iMTYiIGhlaWdodD0iMTYiIHZpZXdCb3g9IjAgMCAxNiAxNiIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTQuOTYxNTYgMS42MDAxSDIuMjQxNTZDMS44ODgxIDEuNjAwMSAxLjYwMTU2IDEuODg2NjQgMS42MDE1NiAyLjI0MDFWNC45NjAxQzEuNjAxNTYgNS4zMTM1NiAxLjg4ODEgNS42MDAxIDIuMjQxNTYgNS42MDAxSDQuOTYxNTZDNS4zMTUwMiA1LjYwMDEgNS42MDE1NiA1LjMxMzU2IDUuNjAxNTYgNC45NjAxVjIuMjQwMUM1LjYwMTU2IDEuODg2NjQgNS4zMTUwMiAxLjYwMDEgNC45NjE1NiAxLjYwMDFaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik00Ljk2MTU2IDEwLjM5OTlIMi4yNDE1NkMxLjg4ODEgMTAuMzk5OSAxLjYwMTU2IDEwLjY4NjQgMS42MDE1NiAxMS4wMzk5VjEzLjc1OTlDMS42MDE1NiAxNC4xMTM0IDEuODg4MSAxNC4zOTk5IDIuMjQxNTYgMTQuMzk5OUg0Ljk2MTU2QzUuMzE1MDIgMTQuMzk5OSA1LjYwMTU2IDE0LjExMzQgNS42MDE1NiAxMy43NTk5VjExLjAzOTlDNS42MDE1NiAxMC42ODY0IDUuMzE1MDIgMTAuMzk5OSA0Ljk2MTU2IDEwLjM5OTlaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik0xMy43NTg0IDEuNjAwMUgxMS4wMzg0QzEwLjY4NSAxLjYwMDEgMTAuMzk4NCAxLjg4NjY0IDEwLjM5ODQgMi4yNDAxVjQuOTYwMUMxMC4zOTg0IDUuMzEzNTYgMTAuNjg1IDUuNjAwMSAxMS4wMzg0IDUuNjAwMUgxMy43NTg0QzE0LjExMTkgNS42MDAxIDE0LjM5ODQgNS4zMTM1NiAxNC4zOTg0IDQuOTYwMVYyLjI0MDFDMTQuMzk4NCAxLjg4NjY0IDE0LjExMTkgMS42MDAxIDEzLjc1ODQgMS42MDAxWiIgZmlsbD0iI2ZmZiIvPgo8cGF0aCBkPSJNNCAxMkwxMiA0TDQgMTJaIiBmaWxsPSIjZmZmIi8%2BCjxwYXRoIGQ9Ik00IDEyTDEyIDQiIHN0cm9rZT0iI2ZmZiIgc3Ryb2tlLXdpZHRoPSIxLjUiIHN0cm9rZS1saW5lY2FwPSJyb3VuZCIvPgo8L3N2Zz4K&logoColor=ffffff)](https://zread.ai/Mirrowel/LLM-API-Key-Proxy) - -**One proxy. Any LLM provider. Zero code changes.** - -A self-hosted proxy that provides OpenAI and Anthropic compatible API endpoints for all your LLM providers. Works with any application that supports custom OpenAI or Anthropic base URLs—including Claude Code, Opencode, and more—no code changes required in your existing tools. - -This project consists of two components: - -1. **The API Proxy** — A FastAPI application providing universal `/v1/chat/completions` (OpenAI) and `/v1/messages` (Anthropic) endpoints -2. **The Resilience Library** — A reusable Python library for intelligent API key management, rotation, and failover - ---- - -## Why Use This? - -- **Universal Compatibility** — Works with any app supporting OpenAI or Anthropic APIs: Claude Code, Opencode, Continue, Roo/Kilo Code, Cursor, JanitorAI, SillyTavern, custom applications, and more -- **One Endpoint, Many Providers** — Configure Gemini, OpenAI, Anthropic, and [any LiteLLM-supported provider](https://docs.litellm.ai/docs/providers) once. Access them all through a single API key -- **Anthropic API Compatible** — Use Claude Code or any Anthropic SDK client with non-Anthropic providers like Gemini, OpenAI, or custom models -- **Built-in Resilience** — Automatic key rotation, failover on errors, rate limit handling, and intelligent cooldowns -- **Exclusive Provider Support** — Includes custom providers not available elsewhere: **Antigravity** (Gemini 3 + Claude Sonnet/Opus 4.5), **Gemini CLI**, **Qwen Code**, and **iFlow** - ---- - -## Quick Start - -### Windows - -1. 
**Download** the latest release from [GitHub Releases](https://github.com/Mirrowel/LLM-API-Key-Proxy/releases/latest) -2. **Unzip** the downloaded file -3. **Run** `proxy_app.exe` — the interactive TUI launcher opens - - - -### macOS / Linux - -```bash -# Download and extract the release for your platform -chmod +x proxy_app -./proxy_app -``` - -### Docker - -**Using the pre-built image (recommended):** - -```bash -# Pull and run directly -docker run -d \ - --name llm-api-proxy \ - -p 8000:8000 \ - -v $(pwd)/.env:/app/.env:ro \ - -v $(pwd)/oauth_creds:/app/oauth_creds \ - -v $(pwd)/logs:/app/logs \ - -e SKIP_OAUTH_INIT_CHECK=true \ - ghcr.io/mirrowel/llm-api-key-proxy:latest -``` - -**Using Docker Compose:** - -```bash -# Create your .env file and key_usage.json first, then: -cp .env.example .env -touch key_usage.json -docker compose up -d -``` - -> **Important:** You must create both `.env` and `key_usage.json` files before running Docker Compose. If `key_usage.json` doesn't exist, Docker will create it as a directory instead of a file, causing errors. - -> **Note:** For OAuth providers, complete authentication locally first using the credential tool, then mount the `oauth_creds/` directory or export credentials to environment variables. - -### From Source - -```bash -git clone https://github.com/Mirrowel/LLM-API-Key-Proxy.git -cd LLM-API-Key-Proxy -python3 -m venv venv -source venv/bin/activate # Windows: venv\Scripts\activate -pip install -r requirements.txt -python src/proxy_app/main.py -``` - -> **Tip:** Running with command-line arguments (e.g., `--host 0.0.0.0 --port 8000`) bypasses the TUI and starts the proxy directly. - ---- - -## Connecting to the Proxy - -Once the proxy is running, configure your application with these settings: - -| Setting | Value | -|---------|-------| -| **Base URL / API Endpoint** | `http://127.0.0.1:8000/v1` | -| **API Key** | Your `PROXY_API_KEY` | - -### Model Format: `provider/model_name` - -**Important:** Models must be specified in the format `provider/model_name`. The `provider/` prefix tells the proxy which backend to route the request to. - -``` -gemini/gemini-2.5-flash ← Gemini API -openai/gpt-4o ← OpenAI API -anthropic/claude-3-5-sonnet ← Anthropic API -openrouter/anthropic/claude-3-opus ← OpenRouter -gemini_cli/gemini-2.5-pro ← Gemini CLI (OAuth) -antigravity/gemini-3-pro-preview ← Antigravity (Gemini 3, Claude Opus 4.5) -``` - -### Usage Examples - -
-Python (OpenAI Library) - -```python -from openai import OpenAI - -client = OpenAI( - base_url="http://127.0.0.1:8000/v1", - api_key="your-proxy-api-key" -) - -response = client.chat.completions.create( - model="gemini/gemini-2.5-flash", # provider/model format - messages=[{"role": "user", "content": "Hello!"}] -) -print(response.choices[0].message.content) -``` - -
- -
-curl - -```bash -curl -X POST http://127.0.0.1:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer your-proxy-api-key" \ - -d '{ - "model": "gemini/gemini-2.5-flash", - "messages": [{"role": "user", "content": "What is the capital of France?"}] - }' -``` - -
- -
-JanitorAI / SillyTavern / Other Chat UIs - -1. Go to **API Settings** -2. Select **"Proxy"** or **"Custom OpenAI"** mode -3. Configure: - - **API URL:** `http://127.0.0.1:8000/v1` - - **API Key:** Your `PROXY_API_KEY` - - **Model:** `provider/model_name` (e.g., `gemini/gemini-2.5-flash`) -4. Save and start chatting - -
- -
-Continue / Cursor / IDE Extensions - -In your configuration file (e.g., `config.json`): - -```json -{ - "models": [ - { - "title": "Gemini via Proxy", - "provider": "openai", - "model": "gemini/gemini-2.5-flash", - "apiBase": "http://127.0.0.1:8000/v1", - "apiKey": "your-proxy-api-key" - } - ] -} -``` - -
- -
-Claude Code - -Claude Code natively supports custom Anthropic API endpoints. The recommended setup is to edit your Claude Code `settings.json`: - -```json -{ - "env": { - "ANTHROPIC_AUTH_TOKEN": "your-proxy-api-key", - "ANTHROPIC_BASE_URL": "http://127.0.0.1:8000", - "ANTHROPIC_DEFAULT_OPUS_MODEL": "gemini/gemini-3-pro", - "ANTHROPIC_DEFAULT_SONNET_MODEL": "gemini/gemini-3-flash", - "ANTHROPIC_DEFAULT_HAIKU_MODEL": "openai/gpt-5-mini" - } -} -``` - -Now you can use Claude Code with Gemini, OpenAI, or any other configured provider. - -
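-If you prefer not to edit `settings.json`, the same values from the `env` block above can usually be supplied as shell environment variables before launching Claude Code — a sketch assuming the proxy runs on the default local port:
-
-```bash
-export ANTHROPIC_BASE_URL="http://127.0.0.1:8000"
-export ANTHROPIC_AUTH_TOKEN="your-proxy-api-key"
-# Optional model override, mirroring the settings.json example above
-export ANTHROPIC_DEFAULT_SONNET_MODEL="gemini/gemini-3-flash"
-# Then launch Claude Code as usual
-claude
-```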
- -
-Anthropic Python SDK - -```python -from anthropic import Anthropic - -client = Anthropic( - base_url="http://127.0.0.1:8000", - api_key="your-proxy-api-key" -) - -# Use any provider through Anthropic's API format -response = client.messages.create( - model="gemini/gemini-3-flash", # provider/model format - max_tokens=1024, - messages=[{"role": "user", "content": "Hello!"}] -) -print(response.content[0].text) -``` - -
- -### API Endpoints - -| Endpoint | Description | -|----------|-------------| -| `GET /` | Status check — confirms proxy is running | -| `POST /v1/chat/completions` | Chat completions (OpenAI format) | -| `POST /v1/messages` | Chat completions (Anthropic format) — Claude Code compatible | -| `POST /v1/messages/count_tokens` | Count tokens for Anthropic-format requests | -| `POST /v1/embeddings` | Text embeddings | -| `GET /v1/models` | List all available models with pricing & capabilities | -| `GET /v1/models/{model_id}` | Get details for a specific model | -| `GET /v1/providers` | List configured providers | -| `POST /v1/token-count` | Calculate token count for a payload | -| `POST /v1/cost-estimate` | Estimate cost based on token counts | - -> **Tip:** The `/v1/models` endpoint is useful for discovering available models in your client. Many apps can fetch this list automatically. Add `?enriched=false` for a minimal response without pricing data. - ---- - -## Managing Credentials - -The proxy includes an interactive tool for managing all your API keys and OAuth credentials. - -### Using the TUI - - - -1. Run the proxy without arguments to open the TUI -2. Select **"🔑 Manage Credentials"** -3. Choose to add API keys or OAuth credentials - -### Using the Command Line - -```bash -python -m rotator_library.credential_tool -``` - -### Credential Types - -| Type | Providers | How to Add | -|------|-----------|------------| -| **API Keys** | Gemini, OpenAI, Anthropic, OpenRouter, Groq, Mistral, NVIDIA, Cohere, Chutes | Enter key in TUI or add to `.env` | -| **OAuth** | Gemini CLI, Antigravity, Qwen Code, iFlow | Interactive browser login via credential tool | - -### The `.env` File - -Credentials are stored in a `.env` file. You can edit it directly or use the TUI: - -```env -# Required: Authentication key for YOUR proxy -PROXY_API_KEY="your-secret-proxy-key" - -# Provider API Keys (add multiple with _1, _2, etc.) -GEMINI_API_KEY_1="your-gemini-key" -GEMINI_API_KEY_2="another-gemini-key" -OPENAI_API_KEY_1="your-openai-key" -ANTHROPIC_API_KEY_1="your-anthropic-key" -``` - -> Copy `.env.example` to `.env` as a starting point. - ---- - -## The Resilience Library - -The proxy is powered by a standalone Python library that you can use directly in your own applications. - -### Key Features - -- **Async-native** with `asyncio` and `httpx` -- **Intelligent key selection** with tiered, model-aware locking -- **Deadline-driven requests** with configurable global timeout -- **Automatic failover** between keys on errors -- **OAuth support** for Gemini CLI, Antigravity, Qwen, iFlow -- **Stateless deployment ready** — load credentials from environment variables - -### Basic Usage - -```python -from rotator_library import RotatingClient - -client = RotatingClient( - api_keys={"gemini": ["key1", "key2"], "openai": ["key3"]}, - global_timeout=30, - max_retries=2 -) - -async with client: - response = await client.acompletion( - model="gemini/gemini-2.5-flash", - messages=[{"role": "user", "content": "Hello!"}] - ) -``` - -### Library Documentation - -See the [Library README](src/rotator_library/README.md) for complete documentation including: -- All initialization parameters -- Streaming support -- Error handling and cooldown strategies -- Provider plugin system -- Credential prioritization - ---- - -## Interactive TUI - -The proxy includes a powerful text-based UI for configuration and management. 
- - - -### TUI Features - -- **🚀 Run Proxy** — Start the server with saved settings -- **⚙️ Configure Settings** — Host, port, API key, request logging -- **🔑 Manage Credentials** — Add/edit API keys and OAuth credentials -- **📊 View Status** — See configured providers and credential counts -- **🔧 Advanced Settings** — Custom providers, model definitions, concurrency - -### Configuration Files - -| File | Contents | -|------|----------| -| `.env` | All credentials and advanced settings | -| `launcher_config.json` | TUI-specific settings (host, port, logging) | - ---- - -## Features - -### Core Capabilities - -- **Universal OpenAI-compatible endpoint** for all providers -- **Multi-provider support** via [LiteLLM](https://docs.litellm.ai/docs/providers) fallback -- **Automatic key rotation** and load balancing -- **Interactive TUI** for easy configuration -- **Detailed request logging** for debugging - -
-🛡️ Resilience & High Availability - -- **Global timeout** with deadline-driven retries -- **Escalating cooldowns** per model (10s → 30s → 60s → 120s) -- **Key-level lockouts** for consistently failing keys -- **Stream error detection** and graceful recovery -- **Batch embedding aggregation** for improved throughput -- **Automatic daily resets** for cooldowns and usage stats - -
- -
-🔑 Credential Management - -- **Auto-discovery** of API keys from environment variables -- **OAuth discovery** from standard paths (`~/.gemini/`, `~/.qwen/`, `~/.iflow/`) -- **Duplicate detection** warns when same account added multiple times -- **Credential prioritization** — paid tier used before free tier -- **Stateless deployment** — export OAuth to environment variables -- **Local-first storage** — credentials isolated in `oauth_creds/` directory - -
- -
-⚙️ Advanced Configuration - -- **Model whitelists/blacklists** with wildcard support -- **Per-provider concurrency limits** (`MAX_CONCURRENT_REQUESTS_PER_KEY_`) -- **Rotation modes** — balanced (distribute load) or sequential (use until exhausted) -- **Priority multipliers** — higher concurrency for paid credentials -- **Model quota groups** — shared cooldowns for related models -- **Temperature override** — prevent tool hallucination issues -- **Weighted random rotation** — unpredictable selection patterns - -
- -
-🔌 Provider-Specific Features - -**Gemini CLI:** - -- Zero-config Google Cloud project discovery -- Internal API access with higher rate limits -- Automatic fallback to preview models on rate limit -- Paid vs free tier detection - -**Antigravity:** - -- Gemini 3 Pro with `thinkingLevel` support -- Gemini 2.5 Flash/Flash Lite with thinking mode -- Claude Opus 4.5 (thinking mode) -- Claude Sonnet 4.5 (thinking and non-thinking) -- GPT-OSS 120B Medium -- Thought signature caching for multi-turn conversations -- Tool hallucination prevention -- Quota baseline tracking with background refresh -- Parallel tool usage instruction injection -- **Quota Groups**: Models that share quota are automatically grouped: - - Claude/GPT-OSS: `claude-sonnet-4-5`, `claude-opus-4-5`, `gpt-oss-120b-medium` - - Gemini 3 Pro: `gemini-3-pro-high`, `gemini-3-pro-low`, `gemini-3-pro-preview` - - Gemini 2.5 Flash: `gemini-2.5-flash`, `gemini-2.5-flash-thinking`, `gemini-2.5-flash-lite` - - All models in a group deplete the usage of the group equally. So in claude group - it is beneficial to use only Opus, and forget about Sonnet and GPT-OSS. - -**Qwen Code:** - -- Dual auth (API key + OAuth Device Flow) -- `` tag parsing as `reasoning_content` -- Tool schema cleaning - -**iFlow:** - -- Dual auth (API key + OAuth Authorization Code) -- Hybrid auth with separate API key fetch -- Tool schema cleaning - -**NVIDIA NIM:** - -- Dynamic model discovery -- DeepSeek thinking support - -
- -
-📝 Logging & Debugging - -- **Per-request file logging** with `--enable-request-logging` -- **Unique request directories** with full transaction details -- **Streaming chunk capture** for debugging -- **Performance metadata** (duration, tokens, model used) -- **Provider-specific logs** for Qwen, iFlow, Antigravity - -
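-A quick way to see this in practice, assuming you run from source with default settings:
-
-```bash
-# Start the proxy with per-request logging enabled
-python src/proxy_app/main.py --enable-request-logging
-
-# After sending a request, inspect the captured transaction directories
-ls logs/detailed_logs/
-```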
- ---- - -## Advanced Configuration - -
-Environment Variables Reference - -### Proxy Settings - -| Variable | Description | Default | -|----------|-------------|---------| -| `PROXY_API_KEY` | Authentication key for your proxy | Required | -| `OAUTH_REFRESH_INTERVAL` | Token refresh check interval (seconds) | `600` | -| `SKIP_OAUTH_INIT_CHECK` | Skip interactive OAuth setup on startup | `false` | - -### Per-Provider Settings - -| Pattern | Description | Example | -|---------|-------------|---------| -| `_API_KEY_` | API key for provider | `GEMINI_API_KEY_1` | -| `MAX_CONCURRENT_REQUESTS_PER_KEY_` | Concurrent request limit | `MAX_CONCURRENT_REQUESTS_PER_KEY_OPENAI=3` | -| `ROTATION_MODE_` | `balanced` or `sequential` | `ROTATION_MODE_GEMINI=sequential` | -| `IGNORE_MODELS_` | Blacklist (comma-separated, supports `*`) | `IGNORE_MODELS_OPENAI=*-preview*` | -| `WHITELIST_MODELS_` | Whitelist (overrides blacklist) | `WHITELIST_MODELS_GEMINI=gemini-2.5-pro` | - -### Advanced Features - -| Variable | Description | -|----------|-------------| -| `ROTATION_TOLERANCE` | `0.0`=deterministic, `3.0`=weighted random (default) | -| `CONCURRENCY_MULTIPLIER__PRIORITY_` | Concurrency multiplier per priority tier | -| `QUOTA_GROUPS__` | Models sharing quota limits | -| `OVERRIDE_TEMPERATURE_ZERO` | `remove` or `set` to prevent tool hallucination | -| `GEMINI_CLI_QUOTA_REFRESH_INTERVAL` | Quota baseline refresh interval in seconds (default: 300) | -| `ANTIGRAVITY_QUOTA_REFRESH_INTERVAL` | Quota baseline refresh interval in seconds (default: 300) | - -
- -
-Model Filtering (Whitelists & Blacklists) - -Control which models are exposed through your proxy. - -### Blacklist Only - -```env -# Hide all preview models -IGNORE_MODELS_OPENAI="*-preview*" -``` - -### Pure Whitelist Mode - -```env -# Block all, then allow specific models -IGNORE_MODELS_GEMINI="*" -WHITELIST_MODELS_GEMINI="gemini-2.5-pro,gemini-2.5-flash" -``` - -### Exemption Mode - -```env -# Block preview models, but allow one specific preview -IGNORE_MODELS_OPENAI="*-preview*" -WHITELIST_MODELS_OPENAI="gpt-4o-2024-08-06-preview" -``` - -**Logic order:** Whitelist check → Blacklist check → Default allow - -
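-After changing filters, restart the proxy and confirm which models it now exposes. This check assumes the proxy is running locally on the default port:
-
-```bash
-curl -s -H "Authorization: Bearer your-proxy-api-key" \
-  "http://127.0.0.1:8000/v1/models?enriched=false"
-```
-
-Only models that pass the whitelist/blacklist logic above should appear in the response.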
- -
-Concurrency & Rotation Settings - -### Concurrency Limits - -```env -# Allow 3 concurrent requests per OpenAI key -MAX_CONCURRENT_REQUESTS_PER_KEY_OPENAI=3 - -# Default is 1 (no concurrency) -MAX_CONCURRENT_REQUESTS_PER_KEY_GEMINI=1 -``` - -### Rotation Modes - -```env -# balanced (default): Distribute load evenly - best for per-minute rate limits -ROTATION_MODE_OPENAI=balanced - -# sequential: Use until exhausted - best for daily/weekly quotas -ROTATION_MODE_GEMINI=sequential -``` - -### Priority Multipliers - -Paid credentials can handle more concurrent requests: - -```env -# Priority 1 (paid ultra): 10x concurrency -CONCURRENCY_MULTIPLIER_ANTIGRAVITY_PRIORITY_1=10 - -# Priority 2 (standard paid): 3x -CONCURRENCY_MULTIPLIER_ANTIGRAVITY_PRIORITY_2=3 -``` - -### Model Quota Groups - -Models sharing quota limits: - -```env -# Claude models share quota - when one hits limit, both cool down -QUOTA_GROUPS_ANTIGRAVITY_CLAUDE="claude-sonnet-4-5,claude-opus-4-5" -``` - -
- -
-Timeout Configuration - -Fine-grained control over HTTP timeouts: - -```env -TIMEOUT_CONNECT=30 # Connection establishment -TIMEOUT_WRITE=30 # Request body send -TIMEOUT_POOL=60 # Connection pool acquisition -TIMEOUT_READ_STREAMING=180 # Between streaming chunks (3 min) -TIMEOUT_READ_NON_STREAMING=600 # Full response wait (10 min) -``` - -**Recommendations:** - -- Long thinking tasks: Increase `TIMEOUT_READ_STREAMING` to 300-360s -- Unstable network: Increase `TIMEOUT_CONNECT` to 60s -- Large outputs: Increase `TIMEOUT_READ_NON_STREAMING` to 900s+ - -
- ---- - -## OAuth Providers - -
-Gemini CLI - -Uses Google OAuth to access internal Gemini endpoints with higher rate limits. - -**Setup:** - -1. Run `python -m rotator_library.credential_tool` -2. Select "Add OAuth Credential" → "Gemini CLI" -3. Complete browser authentication -4. Credentials saved to `oauth_creds/gemini_cli_oauth_1.json` - -**Features:** - -- Zero-config project discovery -- Automatic free-tier project onboarding -- Paid vs free tier detection -- Smart fallback on rate limits -- Quota baseline tracking with background refresh (accurate remaining quota estimates) -- Sequential rotation mode (uses credentials until quota exhausted) - -**Quota Groups:** Models that share quota are automatically grouped: -- **Pro**: `gemini-2.5-pro`, `gemini-3-pro-preview` -- **2.5-Flash**: `gemini-2.0-flash`, `gemini-2.5-flash`, `gemini-2.5-flash-lite` -- **3-Flash**: `gemini-3-flash-preview` - -All models in a group deplete the shared quota equally. 24-hour per-model quota windows. - -**Environment Variables (for stateless deployment):** - -Single credential (legacy): -```env -GEMINI_CLI_ACCESS_TOKEN="ya29.your-access-token" -GEMINI_CLI_REFRESH_TOKEN="1//your-refresh-token" -GEMINI_CLI_EXPIRY_DATE="1234567890000" -GEMINI_CLI_EMAIL="your-email@gmail.com" -GEMINI_CLI_PROJECT_ID="your-gcp-project-id" # Optional -GEMINI_CLI_TIER="standard-tier" # Optional: standard-tier or free-tier -``` - -Multiple credentials (use `_N_` suffix where N is 1, 2, 3...): -```env -GEMINI_CLI_1_ACCESS_TOKEN="ya29.first-token" -GEMINI_CLI_1_REFRESH_TOKEN="1//first-refresh" -GEMINI_CLI_1_EXPIRY_DATE="1234567890000" -GEMINI_CLI_1_EMAIL="first@gmail.com" -GEMINI_CLI_1_PROJECT_ID="project-1" -GEMINI_CLI_1_TIER="standard-tier" - -GEMINI_CLI_2_ACCESS_TOKEN="ya29.second-token" -GEMINI_CLI_2_REFRESH_TOKEN="1//second-refresh" -GEMINI_CLI_2_EXPIRY_DATE="1234567890000" -GEMINI_CLI_2_EMAIL="second@gmail.com" -GEMINI_CLI_2_PROJECT_ID="project-2" -GEMINI_CLI_2_TIER="free-tier" -``` - -**Feature Toggles:** -```env -GEMINI_CLI_QUOTA_REFRESH_INTERVAL=300 # Quota refresh interval in seconds (default: 300 = 5 min) -``` - -
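-Once a credential is loaded, a quick way to confirm the provider was picked up (assuming the proxy runs locally on the default port):
-
-```bash
-# "gemini_cli" should appear in the configured provider list
-curl -s -H "Authorization: Bearer your-proxy-api-key" http://127.0.0.1:8000/v1/providers
-```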
- -
-Antigravity (Gemini 3 + Claude Opus 4.5) - -Access Google's internal Antigravity API for cutting-edge models. - -**Supported Models:** - -- **Gemini 3 Pro** — with `thinkingLevel` support (low/high) -- **Gemini 2.5 Flash** — with thinking mode support -- **Gemini 2.5 Flash Lite** — configurable thinking budget -- **Claude Opus 4.5** — Anthropic's most powerful model (thinking mode only) -- **Claude Sonnet 4.5** — supports both thinking and non-thinking modes -- **GPT-OSS 120B** — OpenAI-compatible model - -**Setup:** - -1. Run `python -m rotator_library.credential_tool` -2. Select "Add OAuth Credential" → "Antigravity" -3. Complete browser authentication - -**Advanced Features:** - -- Thought signature caching for multi-turn conversations -- Tool hallucination prevention via parameter signature injection -- Automatic thinking block sanitization for Claude -- Credential prioritization (paid resets every 5 hours, free weekly) -- Quota baseline tracking with background refresh (accurate remaining quota estimates) -- Parallel tool usage instruction injection for Claude - -**Environment Variables:** - -```env -ANTIGRAVITY_ACCESS_TOKEN="ya29.your-access-token" -ANTIGRAVITY_REFRESH_TOKEN="1//your-refresh-token" -ANTIGRAVITY_EXPIRY_DATE="1234567890000" -ANTIGRAVITY_EMAIL="your-email@gmail.com" - -# Feature toggles -ANTIGRAVITY_ENABLE_SIGNATURE_CACHE=true -ANTIGRAVITY_GEMINI3_TOOL_FIX=true -ANTIGRAVITY_QUOTA_REFRESH_INTERVAL=300 # Quota refresh interval (seconds) -ANTIGRAVITY_PARALLEL_TOOL_INSTRUCTION_CLAUDE=true # Parallel tool instruction for Claude -``` - -> **Note:** Gemini 3 models require a paid-tier Google Cloud project. - -
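-With a credential configured, Antigravity models are called through the normal chat endpoint like any other provider. A minimal sketch, assuming the proxy runs locally on the default port:
-
-```bash
-curl -s -X POST http://127.0.0.1:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer your-proxy-api-key" \
-  -d '{"model": "antigravity/claude-sonnet-4-5", "messages": [{"role": "user", "content": "Hello!"}]}'
-```
-
-Keep in mind that Claude Sonnet/Opus and GPT-OSS share one quota group, so heavy testing against one model depletes the others.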
- -
-Qwen Code - -Uses OAuth Device Flow for Qwen/Dashscope APIs. - -**Setup:** - -1. Run the credential tool -2. Select "Add OAuth Credential" → "Qwen Code" -3. Enter the code displayed in your browser -4. Or add API key directly: `QWEN_CODE_API_KEY_1="your-key"` - -**Features:** - -- Dual auth (API key or OAuth) -- `` tag parsing as `reasoning_content` -- Automatic tool schema cleaning -- Custom models via `QWEN_CODE_MODELS` env var - -
- -
-iFlow - -Uses OAuth Authorization Code flow with local callback server. - -**Setup:** - -1. Run the credential tool -2. Select "Add OAuth Credential" → "iFlow" -3. Complete browser authentication (callback on port 11451) -4. Or add API key directly: `IFLOW_API_KEY_1="sk-your-key"` - -**Features:** - -- Dual auth (API key or OAuth) -- Hybrid auth (OAuth token fetches separate API key) -- Automatic tool schema cleaning -- Custom models via `IFLOW_MODELS` env var - -
- -
-Stateless Deployment (Export to Environment Variables) - -For platforms without file persistence (Railway, Render, Vercel): - -1. **Set up credentials locally:** - - ```bash - python -m rotator_library.credential_tool - # Complete OAuth flows - ``` - -2. **Export to environment variables:** - - ```bash - python -m rotator_library.credential_tool - # Select "Export [Provider] to .env" - ``` - -3. **Copy generated variables to your platform:** - The tool creates files like `gemini_cli_credential_1.env` containing all necessary variables. - -4. **Set `SKIP_OAUTH_INIT_CHECK=true`** to skip interactive validation on startup. - -
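-For example, to fold an exported credential into the `.env` you deploy (file name taken from the tool's output described above — adjust to whatever it actually generated for you):
-
-```bash
-# Append the exported OAuth variables to your deployment .env
-cat gemini_cli_credential_1.env >> .env
-echo 'SKIP_OAUTH_INIT_CHECK=true' >> .env
-```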
- -
-OAuth Callback Port Configuration - -Customize OAuth callback ports if defaults conflict: - -| Provider | Default Port | Environment Variable | -| ----------- | ------------ | ------------------------ | -| Gemini CLI | 8085 | `GEMINI_CLI_OAUTH_PORT` | -| Antigravity | 51121 | `ANTIGRAVITY_OAUTH_PORT` | -| iFlow | 11451 | `IFLOW_OAUTH_PORT` | - -
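-If you suspect a conflict, you can check whether a default callback port is already taken before starting the OAuth flow — a quick check with standard tools, assuming a Linux/macOS shell:
-
-```bash
-# Anything already listening on the Gemini CLI callback port?
-lsof -iTCP:8085 -sTCP:LISTEN
-# or: ss -ltnp | grep ':8085'
-```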
- ---- - -## Deployment - -
-Command-Line Arguments - -```bash -python src/proxy_app/main.py [OPTIONS] - -Options: - --host TEXT Host to bind (default: 0.0.0.0) - --port INTEGER Port to run on (default: 8000) - --enable-request-logging Enable detailed per-request logging - --add-credential Launch interactive credential setup tool -``` - -**Examples:** - -```bash -# Run on custom port -python src/proxy_app/main.py --host 127.0.0.1 --port 9000 - -# Run with logging -python src/proxy_app/main.py --enable-request-logging - -# Add credentials without starting proxy -python src/proxy_app/main.py --add-credential -``` - -
- -
-Render / Railway / Vercel - -See the [Deployment Guide](Deployment%20guide.md) for complete instructions. - -**Quick Setup:** - -1. Fork the repository -2. Create a `.env` file with your credentials -3. Create a new Web Service pointing to your repo -4. Set build command: `pip install -r requirements.txt` -5. Set start command: `uvicorn src.proxy_app.main:app --host 0.0.0.0 --port $PORT` -6. Upload `.env` as a secret file - -**OAuth Credentials:** -Export OAuth credentials to environment variables using the credential tool, then add them to your platform's environment settings. - -
- -
-Docker - -The proxy is available as a multi-architecture Docker image (amd64/arm64) from GitHub Container Registry. - -**Quick Start with Docker Compose:** - -```bash -# 1. Create your .env file with PROXY_API_KEY and provider keys -cp .env.example .env -nano .env - -# 2. Create key_usage.json file (required before first run) -touch key_usage.json - -# 3. Start the proxy -docker compose up -d - -# 4. Check logs -docker compose logs -f -``` - -> **Important:** You must create `key_usage.json` before running Docker Compose. If this file doesn't exist on the host, Docker will create it as a directory instead of a file, causing the container to fail. - -**Manual Docker Run:** - -```bash -# Create key_usage.json if it doesn't exist -touch key_usage.json - -docker run -d \ - --name llm-api-proxy \ - --restart unless-stopped \ - -p 8000:8000 \ - -v $(pwd)/.env:/app/.env:ro \ - -v $(pwd)/oauth_creds:/app/oauth_creds \ - -v $(pwd)/logs:/app/logs \ - -v $(pwd)/key_usage.json:/app/key_usage.json \ - -e SKIP_OAUTH_INIT_CHECK=true \ - -e PYTHONUNBUFFERED=1 \ - ghcr.io/mirrowel/llm-api-key-proxy:latest -``` - -**Development with Local Build:** - -```bash -# Build and run locally -docker compose -f docker-compose.dev.yml up -d --build -``` - -**Volume Mounts:** - -| Path | Purpose | -| ---------------- | -------------------------------------- | -| `.env` | Configuration and API keys (read-only) | -| `oauth_creds/` | OAuth credential files (persistent) | -| `logs/` | Request logs and detailed logging | -| `key_usage.json` | Usage statistics persistence | - -**Image Tags:** - -| Tag | Description | -| ----------------------- | ------------------------------------------ | -| `latest` | Latest stable from `main` branch | -| `dev-latest` | Latest from `dev` branch | -| `YYYYMMDD-HHMMSS-` | Specific version with timestamp and commit | - -**OAuth with Docker:** - -For OAuth providers (Antigravity, Gemini CLI, etc.), you must authenticate locally first: - -1. Run `python -m rotator_library.credential_tool` on your local machine -2. Complete OAuth flows in browser -3. Either: - - Mount `oauth_creds/` directory to container, or - - Export credentials to `.env` using the export option - -
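-To verify the credentials actually reached the container (service name `llm-proxy`, as in `docker-compose.yml`):
-
-```bash
-# Mounted credential files
-docker compose exec llm-proxy ls /app/oauth_creds
-
-# Or, for the environment-variable route
-docker compose exec llm-proxy printenv | grep -E '(ANTIGRAVITY|GEMINI_CLI|IFLOW|QWEN_CODE)'
-```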
- -
-Custom VPS / Systemd - -**Option 1: Authenticate locally, deploy credentials** - -1. Complete OAuth flows on your local machine -2. Export to environment variables -3. Deploy `.env` to your server - -**Option 2: SSH Port Forwarding** - -```bash -# Forward callback ports through SSH -ssh -L 51121:localhost:51121 -L 8085:localhost:8085 user@your-vps - -# Then run credential tool on the VPS -``` - -**Systemd Service:** - -```ini -[Unit] -Description=LLM API Key Proxy -After=network.target - -[Service] -Type=simple -WorkingDirectory=/path/to/LLM-API-Key-Proxy -ExecStart=/path/to/python -m uvicorn src.proxy_app.main:app --host 0.0.0.0 --port 8000 -Restart=always - -[Install] -WantedBy=multi-user.target -``` - -See [VPS Deployment](Deployment%20guide.md#appendix-deploying-to-a-custom-vps) for complete guide. - -
- ---- - -## Troubleshooting - -| Issue | Solution | -|-------|----------| -| `401 Unauthorized` | Verify `PROXY_API_KEY` matches your `Authorization: Bearer` header exactly | -| `500 Internal Server Error` | Check provider key validity; enable `--enable-request-logging` for details | -| All keys on cooldown | All keys failed recently; check `logs/detailed_logs/` for upstream errors | -| Model not found | Verify format is `provider/model_name` (e.g., `gemini/gemini-2.5-flash`) | -| OAuth callback failed | Ensure callback port (8085, 51121, 11451) isn't blocked by firewall | -| Streaming hangs | Increase `TIMEOUT_READ_STREAMING`; check provider status | - -**Detailed Logs:** - -When `--enable-request-logging` is enabled, check `logs/detailed_logs/` for: - -- `request.json` — Exact request payload -- `final_response.json` — Complete response or error -- `streaming_chunks.jsonl` — All SSE chunks received -- `metadata.json` — Performance metrics - ---- - -## Documentation - -| Document | Description | -|----------|-------------| -| [Technical Documentation](DOCUMENTATION.md) | Architecture, internals, provider implementations | -| [Library README](src/rotator_library/README.md) | Using the resilience library directly | -| [Deployment Guide](Deployment%20guide.md) | Hosting on Render, Railway, VPS | -| [.env.example](.env.example) | Complete environment variable reference | - ---- - -## License - -This project is dual-licensed: - -- **Proxy Application** (`src/proxy_app/`) — [MIT License](src/proxy_app/LICENSE) -- **Resilience Library** (`src/rotator_library/`) — [LGPL-3.0](src/rotator_library/COPYING.LESSER) diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml deleted file mode 100644 index 36458929..00000000 --- a/docker-compose.dev.yml +++ /dev/null @@ -1,30 +0,0 @@ -services: - llm-proxy: - build: - context: . 
- dockerfile: Dockerfile - container_name: llm-api-proxy-dev - restart: unless-stopped - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - ports: - - "${PORT:-8000}:8000" - volumes: - # Mount .env files for configuration - - ./.env:/app/.env:ro - # Mount oauth_creds directory for OAuth credentials persistence - - ./oauth_creds:/app/oauth_creds - # Mount logs directory for persistent logging - - ./logs:/app/logs - # Mount key_usage.json for usage statistics persistence - - ./key_usage.json:/app/key_usage.json - # Optionally mount additional .env files (e.g., combined credential files) - # - ./antigravity_all_combined.env:/app/antigravity_all_combined.env:ro - environment: - # Skip OAuth interactive initialization in container (non-interactive) - - SKIP_OAUTH_INIT_CHECK=true - # Ensure Python output is not buffered - - PYTHONUNBUFFERED=1 diff --git a/docker-compose.tls.yml b/docker-compose.tls.yml deleted file mode 100644 index e210423f..00000000 --- a/docker-compose.tls.yml +++ /dev/null @@ -1,47 +0,0 @@ -services: - nginx-proxy-manager: - image: "jc21/nginx-proxy-manager:latest" - container_name: nginx-proxy-manager - restart: unless-stopped - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - ports: - - "80:80" # Public HTTP - - "443:443" # Public HTTPS - - "81:81" # Admin Web Interface - volumes: - - ./data:/data - - ./letsencrypt:/etc/letsencrypt - # This allows the proxy to talk to other containers using "host.docker.internal" - extra_hosts: - - "host.docker.internal:host-gateway" - llm-proxy: - image: ghcr.io/mirrowel/llm-api-key-proxy:latest - container_name: llm-api-proxy-tls - restart: unless-stopped - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - ports: - - "${PORT:-8000}:8000" - volumes: - # Mount .env files for configuration - - ./.env:/app/.env:ro - # Mount oauth_creds directory for OAuth credentials persistence - - ./oauth_creds:/app/oauth_creds - # Mount logs directory for persistent logging - - ./logs:/app/logs - # Mount key_usage.json for usage statistics persistence - - ./key_usage.json:/app/key_usage.json - # Optionally mount additional .env files (e.g., combined credential files) - # - ./antigravity_all_combined.env:/app/antigravity_all_combined.env:ro - environment: - # Skip OAuth interactive initialization in container (non-interactive) - - SKIP_OAUTH_INIT_CHECK=true - # Ensure Python output is not buffered - - PYTHONUNBUFFERED=1 diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 31964b60..00000000 --- a/docker-compose.yml +++ /dev/null @@ -1,28 +0,0 @@ -services: - llm-proxy: - image: ghcr.io/mirrowel/llm-api-key-proxy:latest - container_name: llm-api-proxy - restart: unless-stopped - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - ports: - - "${PORT:-8000}:8000" - volumes: - # Mount .env files for configuration - - ./.env:/app/.env:ro - # Mount oauth_creds directory for OAuth credentials persistence - - ./oauth_creds:/app/oauth_creds - # Mount logs directory for persistent logging - - ./logs:/app/logs - # Mount key_usage.json for usage statistics persistence - - ./key_usage.json:/app/key_usage.json - # Optionally mount additional .env files (e.g., combined credential files) - # - ./antigravity_all_combined.env:/app/antigravity_all_combined.env:ro - environment: - # Skip OAuth interactive initialization in container (non-interactive) - - SKIP_OAUTH_INIT_CHECK=true - # Ensure Python output is not buffered - - 
PYTHONUNBUFFERED=1 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1f5d4985..00000000 --- a/requirements.txt +++ /dev/null @@ -1,27 +0,0 @@ -# FastAPI framework for building the proxy server -fastapi -# ASGI server for running the FastAPI application -uvicorn -# For loading environment variables from a .env file -python-dotenv - -# Installs the local rotator_library in editable mode --e src/rotator_library - -# A library for calling LLM APIs with a consistent format -litellm - -filelock -httpx -aiofiles -aiohttp - -colorlog - -rich - -# GUI for model filter configuration -customtkinter - -# For building the executable -pyinstaller diff --git a/src/proxy_app/LICENSE b/src/proxy_app/LICENSE deleted file mode 100644 index 2810a890..00000000 --- a/src/proxy_app/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2025 Mirrowel - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/src/proxy_app/__init__.py b/src/proxy_app/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/proxy_app/batch_manager.py b/src/proxy_app/batch_manager.py deleted file mode 100644 index 90888950..00000000 --- a/src/proxy_app/batch_manager.py +++ /dev/null @@ -1,81 +0,0 @@ -import asyncio -from typing import List, Dict, Any, Tuple -import time -from rotator_library import RotatingClient - -class EmbeddingBatcher: - def __init__(self, client: RotatingClient, batch_size: int = 64, timeout: float = 0.1): - self.client = client - self.batch_size = batch_size - self.timeout = timeout - self.queue = asyncio.Queue() - self.worker_task = asyncio.create_task(self._batch_worker()) - - async def add_request(self, request_data: Dict[str, Any]) -> Any: - future = asyncio.Future() - await self.queue.put((request_data, future)) - return await future - - async def _batch_worker(self): - while True: - batch, futures = await self._gather_batch() - if not batch: - continue - - try: - # Assume all requests in a batch use the same model and other settings - model = batch[0]["model"] - inputs = [item["input"][0] for item in batch] # Extract single string input - - batched_request = { - "model": model, - "input": inputs - } - - # Pass through any other relevant parameters from the first request - for key in ["input_type", "dimensions", "user"]: - if key in batch[0]: - batched_request[key] = batch[0][key] - - response = await self.client.aembedding(**batched_request) - - # Distribute results back to the original requesters - for i, future in enumerate(futures): - # Create a new response object for each item in the batch - single_response_data = { - "object": response.object, - "model": response.model, - "data": [response.data[i]], - "usage": response.usage # Usage is for the whole batch - } - future.set_result(single_response_data) - - except Exception as e: - for future in futures: - future.set_exception(e) - - async def _gather_batch(self) -> Tuple[List[Dict[str, Any]], List[asyncio.Future]]: - batch = [] - futures = [] - start_time = time.time() - - while len(batch) < self.batch_size and (time.time() - start_time) < self.timeout: - try: - # Wait for an item with a timeout - timeout = self.timeout - (time.time() - start_time) - if timeout <= 0: - break - request, future = await asyncio.wait_for(self.queue.get(), timeout=timeout) - batch.append(request) - futures.append(future) - except asyncio.TimeoutError: - break - - return batch, futures - - async def stop(self): - self.worker_task.cancel() - try: - await self.worker_task - except asyncio.CancelledError: - pass \ No newline at end of file diff --git a/src/proxy_app/build.py b/src/proxy_app/build.py deleted file mode 100644 index 7aee640b..00000000 --- a/src/proxy_app/build.py +++ /dev/null @@ -1,92 +0,0 @@ -import os -import sys -import platform -import subprocess - - -def get_providers(): - """ - Scans the 'src/rotator_library/providers' directory to find all provider modules. - Returns a list of hidden import arguments for PyInstaller. 
- """ - hidden_imports = [] - # Get the absolute path to the directory containing this script - script_dir = os.path.dirname(os.path.abspath(__file__)) - # Construct the path to the providers directory relative to this script's location - providers_path = os.path.join(script_dir, "..", "rotator_library", "providers") - - if not os.path.isdir(providers_path): - print(f"Error: Directory not found at '{os.path.abspath(providers_path)}'") - return [] - - for filename in os.listdir(providers_path): - if filename.endswith("_provider.py") and filename != "__init__.py": - module_name = f"rotator_library.providers.{filename[:-3]}" - hidden_imports.append(f"--hidden-import={module_name}") - return hidden_imports - - -def main(): - """ - Constructs and runs the PyInstaller command to build the executable. - """ - # Base PyInstaller command with optimizations - command = [ - sys.executable, - "-m", - "PyInstaller", - "--onefile", - "--name", - "proxy_app", - "--paths", - "../", - "--paths", - ".", - # Core imports - "--hidden-import=rotator_library", - "--hidden-import=tiktoken_ext.openai_public", - "--hidden-import=tiktoken_ext", - "--collect-data", - "litellm", - # Optimization: Exclude unused heavy modules - "--exclude-module=matplotlib", - "--exclude-module=IPython", - "--exclude-module=jupyter", - "--exclude-module=notebook", - "--exclude-module=PIL.ImageTk", - # Optimization: Enable UPX compression (if available) - "--upx-dir=upx" - if platform.system() != "Darwin" - else "--noupx", # macOS has issues with UPX - # Optimization: Strip debug symbols (smaller binary) - "--strip" - if platform.system() != "Windows" - else "--console", # Windows gets clean console - ] - - # Add hidden imports for providers - provider_imports = get_providers() - if not provider_imports: - print( - "Warning: No providers found. The build might not include any LLM providers." - ) - command.extend(provider_imports) - - # Add the main script - command.append("main.py") - - # Execute the command - print(f"Running command: {' '.join(command)}") - try: - # Run PyInstaller from the script's directory to ensure relative paths are correct - script_dir = os.path.dirname(os.path.abspath(__file__)) - subprocess.run(command, check=True, cwd=script_dir) - print("Build successful!") - except subprocess.CalledProcessError as e: - print(f"Build failed with error: {e}") - except FileNotFoundError: - print("Error: PyInstaller is not installed or not in the system's PATH.") - - -if __name__ == "__main__": - main() diff --git a/src/proxy_app/detailed_logger.py b/src/proxy_app/detailed_logger.py deleted file mode 100644 index 8bc18c72..00000000 --- a/src/proxy_app/detailed_logger.py +++ /dev/null @@ -1,184 +0,0 @@ -# src/proxy_app/detailed_logger.py -""" -Raw I/O Logger for the Proxy Layer. - -This logger captures the UNMODIFIED HTTP request and response at the proxy boundary. -It is disabled by default and should only be enabled for debugging the proxy itself. - -Use this when you need to: -- Verify that requests/responses are not being corrupted -- Debug HTTP-level issues between the client and proxy -- Capture exact payloads as received/sent by the proxy - -For normal request/response logging with provider correlation, use the -TransactionLogger in the rotator_library instead (enabled via --enable-request-logging). 
- -Directory structure: - logs/raw_io/{YYYYMMDD_HHMMSS}_{request_id}/ - request.json # Unmodified incoming HTTP request - streaming_chunks.jsonl # If streaming mode - final_response.json # Unmodified outgoing HTTP response - metadata.json # Summary metadata -""" - -import json -import time -import uuid -from datetime import datetime -from pathlib import Path -from typing import Any, Dict, Optional -import logging - -from rotator_library.utils.resilient_io import ( - safe_write_json, - safe_log_write, - safe_mkdir, -) -from rotator_library.utils.paths import get_logs_dir - - -def _get_raw_io_logs_dir() -> Path: - """Get the raw I/O logs directory, creating it if needed.""" - logs_dir = get_logs_dir() - raw_io_dir = logs_dir / "raw_io" - raw_io_dir.mkdir(parents=True, exist_ok=True) - return raw_io_dir - - -class RawIOLogger: - """ - Logs raw HTTP request/response at the proxy boundary. - - This captures the EXACT data as received from and sent to the client, - without any transformations. Useful for debugging the proxy itself. - - DISABLED by default. Enable with --enable-raw-logging flag. - - Uses fire-and-forget logging - if disk writes fail, logs are dropped (not buffered) - to prevent memory issues, especially with streaming responses. - """ - - def __init__(self): - """ - Initializes the logger for a single request, creating a unique directory - to store all related log files. - """ - self.start_time = time.time() - self.request_id = str(uuid.uuid4()) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - self.log_dir = _get_raw_io_logs_dir() / f"{timestamp}_{self.request_id}" - self.streaming = False - self._dir_available = safe_mkdir(self.log_dir, logging) - - def _write_json(self, filename: str, data: Dict[str, Any]): - """Helper to write data to a JSON file in the log directory.""" - if not self._dir_available: - # Try to create directory again in case it was recreated - self._dir_available = safe_mkdir(self.log_dir, logging) - if not self._dir_available: - return - - safe_write_json( - self.log_dir / filename, - data, - logging, - atomic=False, - indent=4, - ensure_ascii=False, - ) - - def log_request(self, headers: Dict[str, Any], body: Dict[str, Any]): - """Logs the raw incoming request details.""" - self.streaming = body.get("stream", False) - request_data = { - "request_id": self.request_id, - "timestamp_utc": datetime.utcnow().isoformat(), - "headers": dict(headers), - "body": body, - } - self._write_json("request.json", request_data) - - def log_stream_chunk(self, chunk: Dict[str, Any]): - """Logs an individual chunk from a streaming response to a JSON Lines file.""" - if not self._dir_available: - return - - log_entry = {"timestamp_utc": datetime.utcnow().isoformat(), "chunk": chunk} - content = json.dumps(log_entry, ensure_ascii=False) + "\n" - safe_log_write(self.log_dir / "streaming_chunks.jsonl", content, logging) - - def log_final_response( - self, status_code: int, headers: Optional[Dict[str, Any]], body: Dict[str, Any] - ): - """Logs the raw outgoing response.""" - end_time = time.time() - duration_ms = (end_time - self.start_time) * 1000 - - response_data = { - "request_id": self.request_id, - "timestamp_utc": datetime.utcnow().isoformat(), - "status_code": status_code, - "duration_ms": round(duration_ms), - "headers": dict(headers) if headers else None, - "body": body, - } - self._write_json("final_response.json", response_data) - self._log_metadata(response_data) - - def _extract_reasoning(self, response_body: Dict[str, Any]) -> Optional[str]: - """Recursively 
searches for and extracts 'reasoning' fields from the response body.""" - if not isinstance(response_body, dict): - return None - - if "reasoning" in response_body: - return response_body["reasoning"] - - if "choices" in response_body and response_body["choices"]: - message = response_body["choices"][0].get("message", {}) - if "reasoning" in message: - return message["reasoning"] - if "reasoning_content" in message: - return message["reasoning_content"] - - return None - - def _log_metadata(self, response_data: Dict[str, Any]): - """Logs a summary of the transaction for quick analysis.""" - usage = response_data.get("body", {}).get("usage") or {} - model = response_data.get("body", {}).get("model", "N/A") - finish_reason = "N/A" - if ( - "choices" in response_data.get("body", {}) - and response_data["body"]["choices"] - ): - finish_reason = response_data["body"]["choices"][0].get( - "finish_reason", "N/A" - ) - - metadata = { - "request_id": self.request_id, - "timestamp_utc": response_data["timestamp_utc"], - "duration_ms": response_data["duration_ms"], - "status_code": response_data["status_code"], - "model": model, - "streaming": self.streaming, - "usage": { - "prompt_tokens": usage.get("prompt_tokens"), - "completion_tokens": usage.get("completion_tokens"), - "total_tokens": usage.get("total_tokens"), - }, - "finish_reason": finish_reason, - "reasoning_found": False, - "reasoning_content": None, - } - - reasoning = self._extract_reasoning(response_data.get("body", {})) - if reasoning: - metadata["reasoning_found"] = True - metadata["reasoning_content"] = reasoning - - self._write_json("metadata.json", metadata) - - -# Backward compatibility alias -DetailedLogger = RawIOLogger diff --git a/src/proxy_app/launcher_tui.py b/src/proxy_app/launcher_tui.py deleted file mode 100644 index 60b73fba..00000000 --- a/src/proxy_app/launcher_tui.py +++ /dev/null @@ -1,1081 +0,0 @@ -""" -Interactive TUI launcher for the LLM API Key Proxy. -Provides a beautiful Rich-based interface for configuration and execution. -""" - -import json -import os -import sys -from pathlib import Path -from rich.console import Console -from rich.prompt import IntPrompt, Prompt -from rich.panel import Panel -from rich.text import Text -from dotenv import load_dotenv, set_key - -console = Console() - - -def _get_env_file() -> Path: - """ - Get .env file path (lightweight - no heavy imports). - - Returns: - Path to .env file - EXE directory if frozen, else current working directory - """ - if getattr(sys, "frozen", False): - # Running as PyInstaller EXE - use EXE's directory - return Path(sys.executable).parent / ".env" - # Running as script - use current working directory - return Path.cwd() / ".env" - - -def clear_screen(subtitle: str = ""): - """ - Cross-platform terminal clear with optional header. - - Uses native OS commands instead of ANSI escape sequences: - - Windows (conhost & Windows Terminal): cls - - Unix-like systems (Linux, Mac): clear - - Args: - subtitle: If provided, displays a header panel with this subtitle. - If empty/None, just clears the screen. 
- """ - os.system("cls" if os.name == "nt" else "clear") - if subtitle: - console.print( - Panel( - f"[bold cyan]{subtitle}[/bold cyan]", - title="--- API Key Proxy ---", - ) - ) - - -class LauncherConfig: - """Manages launcher_config.json (host, port, logging only)""" - - def __init__(self, config_path: Path = Path("launcher_config.json")): - self.config_path = config_path - self.defaults = { - "host": "127.0.0.1", - "port": 8000, - "enable_request_logging": False, - "enable_raw_logging": False, - } - self.config = self.load() - - def load(self) -> dict: - """Load config from file or create with defaults.""" - if self.config_path.exists(): - try: - with open(self.config_path, "r") as f: - config = json.load(f) - # Merge with defaults for any missing keys - for key, value in self.defaults.items(): - if key not in config: - config[key] = value - return config - except (json.JSONDecodeError, IOError): - return self.defaults.copy() - return self.defaults.copy() - - def save(self): - """Save current config to file.""" - import datetime - - self.config["last_updated"] = datetime.datetime.now().isoformat() - try: - with open(self.config_path, "w") as f: - json.dump(self.config, f, indent=2) - except IOError as e: - console.print(f"[red]Error saving config: {e}[/red]") - - def update(self, **kwargs): - """Update config values.""" - self.config.update(kwargs) - self.save() - - @staticmethod - def update_proxy_api_key(new_key: str): - """Update PROXY_API_KEY in .env only""" - env_file = _get_env_file() - set_key(str(env_file), "PROXY_API_KEY", new_key) - load_dotenv(dotenv_path=env_file, override=True) - - -class SettingsDetector: - """Detects settings from .env for display""" - - @staticmethod - def _load_local_env() -> dict: - """Load environment variables from local .env file only""" - env_file = _get_env_file() - env_dict = {} - if not env_file.exists(): - return env_dict - try: - with open(env_file, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line or line.startswith("#"): - continue - if "=" in line: - key, _, value = line.partition("=") - key, value = key.strip(), value.strip() - if value and value[0] in ('"', "'") and value[-1] == value[0]: - value = value[1:-1] - env_dict[key] = value - except (IOError, OSError): - pass - return env_dict - - @staticmethod - def get_all_settings() -> dict: - """Returns comprehensive settings overview (includes provider_settings which triggers heavy imports)""" - return { - "credentials": SettingsDetector.detect_credentials(), - "custom_bases": SettingsDetector.detect_custom_api_bases(), - "model_definitions": SettingsDetector.detect_model_definitions(), - "concurrency_limits": SettingsDetector.detect_concurrency_limits(), - "model_filters": SettingsDetector.detect_model_filters(), - "provider_settings": SettingsDetector.detect_provider_settings(), - } - - @staticmethod - def get_basic_settings() -> dict: - """Returns basic settings overview without provider_settings (avoids heavy imports)""" - return { - "credentials": SettingsDetector.detect_credentials(), - "custom_bases": SettingsDetector.detect_custom_api_bases(), - "model_definitions": SettingsDetector.detect_model_definitions(), - "concurrency_limits": SettingsDetector.detect_concurrency_limits(), - "model_filters": SettingsDetector.detect_model_filters(), - } - - @staticmethod - def detect_credentials() -> dict: - """Detect API keys and OAuth credentials""" - import re - from pathlib import Path - - providers = {} - - # Scan for API keys - env_vars = 
SettingsDetector._load_local_env() - for key, value in env_vars.items(): - if "_API_KEY" in key and key != "PROXY_API_KEY": - provider = key.split("_API_KEY")[0].lower() - if provider not in providers: - providers[provider] = {"api_keys": 0, "oauth": 0, "custom": False} - providers[provider]["api_keys"] += 1 - - # Scan for file-based OAuth credentials - oauth_dir = Path("oauth_creds") - if oauth_dir.exists(): - for file in oauth_dir.glob("*_oauth_*.json"): - provider = file.name.split("_oauth_")[0] - if provider not in providers: - providers[provider] = {"api_keys": 0, "oauth": 0, "custom": False} - providers[provider]["oauth"] += 1 - - # Scan for env-based OAuth credentials - # Maps provider name to the ENV_PREFIX used by the provider - # (duplicated from credential_manager to avoid heavy imports) - env_oauth_providers = { - "gemini_cli": "GEMINI_CLI", - "antigravity": "ANTIGRAVITY", - "qwen_code": "QWEN_CODE", - "iflow": "IFLOW", - } - - for provider, env_prefix in env_oauth_providers.items(): - oauth_count = 0 - - # Check numbered credentials (PROVIDER_N_ACCESS_TOKEN pattern) - numbered_pattern = re.compile(rf"^{env_prefix}_(\d+)_ACCESS_TOKEN$") - for key in env_vars.keys(): - match = numbered_pattern.match(key) - if match: - index = match.group(1) - refresh_key = f"{env_prefix}_{index}_REFRESH_TOKEN" - if refresh_key in env_vars and env_vars[refresh_key]: - oauth_count += 1 - - # Check legacy single credential (if no numbered found) - if oauth_count == 0: - access_key = f"{env_prefix}_ACCESS_TOKEN" - refresh_key = f"{env_prefix}_REFRESH_TOKEN" - if env_vars.get(access_key) and env_vars.get(refresh_key): - oauth_count = 1 - - if oauth_count > 0: - if provider not in providers: - providers[provider] = {"api_keys": 0, "oauth": 0, "custom": False} - providers[provider]["oauth"] += oauth_count - - # Mark custom providers (have API_BASE set) - for provider in providers: - if os.getenv(f"{provider.upper()}_API_BASE"): - providers[provider]["custom"] = True - - return providers - - @staticmethod - def detect_custom_api_bases() -> dict: - """Detect custom API base URLs (not in hardcoded map)""" - from proxy_app.provider_urls import PROVIDER_URL_MAP - - bases = {} - env_vars = SettingsDetector._load_local_env() - for key, value in env_vars.items(): - if key.endswith("_API_BASE"): - provider = key.replace("_API_BASE", "").lower() - # Only include if NOT in hardcoded map - if provider not in PROVIDER_URL_MAP: - bases[provider] = value - return bases - - @staticmethod - def detect_model_definitions() -> dict: - """Detect provider model definitions""" - models = {} - env_vars = SettingsDetector._load_local_env() - for key, value in env_vars.items(): - if key.endswith("_MODELS"): - provider = key.replace("_MODELS", "").lower() - try: - parsed = json.loads(value) - if isinstance(parsed, dict): - models[provider] = len(parsed) - elif isinstance(parsed, list): - models[provider] = len(parsed) - except (json.JSONDecodeError, ValueError): - pass - return models - - @staticmethod - def detect_concurrency_limits() -> dict: - """Detect max concurrent requests per key""" - limits = {} - env_vars = SettingsDetector._load_local_env() - for key, value in env_vars.items(): - if key.startswith("MAX_CONCURRENT_REQUESTS_PER_KEY_"): - provider = key.replace("MAX_CONCURRENT_REQUESTS_PER_KEY_", "").lower() - try: - limits[provider] = int(value) - except (json.JSONDecodeError, ValueError): - pass - return limits - - @staticmethod - def detect_model_filters() -> dict: - """Detect active model filters (basic info only: 
defined or not)""" - filters = {} - env_vars = SettingsDetector._load_local_env() - for key, value in env_vars.items(): - if key.startswith("IGNORE_MODELS_") or key.startswith("WHITELIST_MODELS_"): - filter_type = "ignore" if key.startswith("IGNORE") else "whitelist" - provider = key.replace(f"{filter_type.upper()}_MODELS_", "").lower() - if provider not in filters: - filters[provider] = {"has_ignore": False, "has_whitelist": False} - if filter_type == "ignore": - filters[provider]["has_ignore"] = True - else: - filters[provider]["has_whitelist"] = True - return filters - - @staticmethod - def detect_provider_settings() -> dict: - """Detect provider-specific settings (Antigravity, Gemini CLI)""" - try: - from proxy_app.settings_tool import PROVIDER_SETTINGS_MAP - except ImportError: - # Fallback for direct execution or testing - from .settings_tool import PROVIDER_SETTINGS_MAP - - provider_settings = {} - env_vars = SettingsDetector._load_local_env() - - for provider, definitions in PROVIDER_SETTINGS_MAP.items(): - modified_count = 0 - for key, definition in definitions.items(): - env_value = env_vars.get(key) - if env_value is not None: - # Check if value differs from default - default = definition.get("default") - setting_type = definition.get("type", "str") - - try: - if setting_type == "bool": - current = env_value.lower() in ("true", "1", "yes") - elif setting_type == "int": - current = int(env_value) - else: - current = env_value - - if current != default: - modified_count += 1 - except (ValueError, AttributeError): - pass - - if modified_count > 0: - provider_settings[provider] = modified_count - - return provider_settings - - -class LauncherTUI: - """Main launcher interface""" - - def __init__(self): - self.console = Console() - self.config = LauncherConfig() - self.running = True - self.env_file = _get_env_file() - # Load .env file to ensure environment variables are available - load_dotenv(dotenv_path=self.env_file, override=True) - - def needs_onboarding(self) -> bool: - """Check if onboarding is needed""" - return not self.env_file.exists() or not os.getenv("PROXY_API_KEY") - - def run(self): - """Main TUI loop""" - while self.running: - self.show_main_menu() - - def show_main_menu(self): - """Display main menu and handle selection""" - clear_screen() - - # Detect basic settings (excludes provider_settings to avoid heavy imports) - settings = SettingsDetector.get_basic_settings() - credentials = settings["credentials"] - custom_bases = settings["custom_bases"] - - # Check if setup is needed - show_warning = self.needs_onboarding() - - # Build title with GitHub link - self.console.print( - Panel.fit( - "[bold cyan]🚀 LLM API Key Proxy - Interactive Launcher[/bold cyan]", - border_style="cyan", - ) - ) - self.console.print( - "[dim]GitHub: [blue underline]https://github.com/Mirrowel/LLM-API-Key-Proxy[/blue underline][/dim]" - ) - - # Show warning if .env file doesn't exist - if show_warning: - self.console.print() - self.console.print( - Panel( - Text.from_markup( - "⚠️ [bold yellow]INITIAL SETUP REQUIRED[/bold yellow]\n\n" - "The proxy needs initial configuration:\n" - " ❌ No .env file found\n\n" - "Why this matters:\n" - " • The .env file stores your credentials and settings\n" - " • PROXY_API_KEY protects your proxy from unauthorized access\n" - " • Provider API keys enable LLM access\n\n" - "What to do:\n" - ' 1. Select option "3. Manage Credentials" to launch the credential tool\n' - " 2. The tool will create .env and set up PROXY_API_KEY automatically\n" - " 3. 
You can add provider credentials (API keys or OAuth)\n\n" - "⚠️ Note: The credential tool adds PROXY_API_KEY by default.\n" - " You can remove it later if you want an unsecured proxy." - ), - border_style="yellow", - expand=False, - ) - ) - # Show security warning if PROXY_API_KEY is missing (but .env exists) - elif not os.getenv("PROXY_API_KEY"): - self.console.print() - self.console.print( - Panel( - Text.from_markup( - "⚠️ [bold red]SECURITY WARNING: PROXY_API_KEY Not Set[/bold red]\n\n" - "Your proxy is currently UNSECURED!\n" - "Anyone can access it without authentication.\n\n" - "This is a serious security risk if your proxy is accessible\n" - "from the internet or untrusted networks.\n\n" - "👉 [bold]Recommended:[/bold] Set PROXY_API_KEY in .env file\n" - ' Use option "2. Configure Proxy Settings" → "3. Set Proxy API Key"\n' - ' or option "3. Manage Credentials"' - ), - border_style="red", - expand=False, - ) - ) - - # Show config - self.console.print() - self.console.print("[bold]📋 Proxy Configuration[/bold]") - self.console.print("━" * 70) - self.console.print(f" Host: {self.config.config['host']}") - self.console.print(f" Port: {self.config.config['port']}") - self.console.print( - f" Transaction Logging: {'✅ Enabled' if self.config.config['enable_request_logging'] else '❌ Disabled'}" - ) - self.console.print( - f" Raw I/O Logging: {'✅ Enabled' if self.config.config.get('enable_raw_logging', False) else '❌ Disabled'}" - ) - - # Show actual API key value - proxy_key = os.getenv("PROXY_API_KEY") - if proxy_key: - self.console.print(f" Proxy API Key: {proxy_key}") - else: - self.console.print(" Proxy API Key: [red]Not Set (INSECURE!)[/red]") - - # Show status summary - self.console.print() - self.console.print("[bold]📊 Status Summary[/bold]") - self.console.print("━" * 70) - provider_count = len(credentials) - custom_count = len(custom_bases) - - self.console.print(f" Providers: {provider_count} configured") - self.console.print(f" Custom Providers: {custom_count} configured") - # Note: provider_settings detection is deferred to avoid heavy imports on startup - has_advanced = bool( - settings["model_definitions"] - or settings["concurrency_limits"] - or settings["model_filters"] - ) - self.console.print( - f" Advanced Settings: {'Active (view in menu 4)' if has_advanced else 'None (view menu 4 for details)'}" - ) - - # Show menu - self.console.print() - self.console.print("━" * 70) - self.console.print() - self.console.print("[bold]🎯 Main Menu[/bold]") - self.console.print() - if show_warning: - self.console.print(" 1. ▶️ Run Proxy Server") - self.console.print(" 2. ⚙️ Configure Proxy Settings") - self.console.print( - " 3. 🔑 Manage Credentials ⬅️ [bold yellow]Start here![/bold yellow]" - ) - else: - self.console.print(" 1. ▶️ Run Proxy Server") - self.console.print(" 2. ⚙️ Configure Proxy Settings") - self.console.print(" 3. 🔑 Manage Credentials") - - self.console.print(" 4. 📊 View Provider & Advanced Settings") - self.console.print(" 5. 📈 View Quota & Usage Stats (Alpha)") - self.console.print(" 6. 🔄 Reload Configuration") - self.console.print(" 7. ℹ️ About") - self.console.print(" 8. 
🚪 Exit") - - self.console.print() - self.console.print("━" * 70) - self.console.print() - - choice = Prompt.ask( - "Select option", - choices=["1", "2", "3", "4", "5", "6", "7", "8"], - show_choices=False, - ) - - if choice == "1": - self.run_proxy() - elif choice == "2": - self.show_config_menu() - elif choice == "3": - self.launch_credential_tool() - elif choice == "4": - self.show_provider_settings_menu() - elif choice == "5": - self.launch_quota_viewer() - elif choice == "6": - load_dotenv(dotenv_path=_get_env_file(), override=True) - self.config = LauncherConfig() # Reload config - self.console.print("\n[green]✅ Configuration reloaded![/green]") - elif choice == "7": - self.show_about() - elif choice == "8": - self.running = False - sys.exit(0) - - def confirm_setting_change(self, setting_name: str, warning_lines: list) -> bool: - """ - Display a warning and require Y/N (case-sensitive) confirmation. - Re-prompts until user enters exactly 'Y' or 'N'. - Returns True only if user enters 'Y'. - """ - clear_screen() - self.console.print() - self.console.print( - Panel( - Text.from_markup( - f"[bold yellow]⚠️ WARNING: You are about to change the {setting_name}[/bold yellow]\n\n" - + "\n".join(warning_lines) - + "\n\n[bold]If you are not sure about changing this - don't.[/bold]" - ), - border_style="yellow", - expand=False, - ) - ) - - while True: - response = Prompt.ask( - "Enter [bold]Y[/bold] to confirm, [bold]N[/bold] to cancel (case-sensitive)" - ) - if response == "Y": - return True - elif response == "N": - self.console.print("\n[dim]Operation cancelled.[/dim]") - return False - else: - self.console.print( - "[red]Please enter exactly 'Y' or 'N' (case-sensitive)[/red]" - ) - - def show_config_menu(self): - """Display configuration sub-menu""" - while True: - clear_screen() - - self.console.print( - Panel.fit( - "[bold cyan]⚙️ Proxy Configuration[/bold cyan]", border_style="cyan" - ) - ) - - self.console.print() - self.console.print("[bold]📋 Current Settings[/bold]") - self.console.print("━" * 70) - self.console.print(f" Host: {self.config.config['host']}") - self.console.print(f" Port: {self.config.config['port']}") - self.console.print( - f" Transaction Logging: {'✅ Enabled' if self.config.config['enable_request_logging'] else '❌ Disabled'}" - ) - self.console.print( - f" Raw I/O Logging: {'✅ Enabled' if self.config.config.get('enable_raw_logging', False) else '❌ Disabled'}" - ) - self.console.print( - f" Proxy API Key: {'✅ Set' if os.getenv('PROXY_API_KEY') else '❌ Not Set'}" - ) - - self.console.print() - self.console.print("━" * 70) - self.console.print() - self.console.print("[bold]⚙️ Configuration Options[/bold]") - self.console.print() - self.console.print(" 1. 🌐 Set Host IP") - self.console.print(" 2. 🔌 Set Port") - self.console.print(" 3. 🔑 Set Proxy API Key") - self.console.print(" 4. 📝 Toggle Transaction Logging") - self.console.print(" 5. 📋 Toggle Raw I/O Logging") - self.console.print(" 6. 🔄 Reset to Default Settings") - self.console.print(" 7. 
↩️ Back to Main Menu") - - self.console.print() - self.console.print("━" * 70) - self.console.print() - - choice = Prompt.ask( - "Select option", - choices=["1", "2", "3", "4", "5", "6", "7"], - show_choices=False, - ) - - if choice == "1": - # Show warning and require confirmation - confirmed = self.confirm_setting_change( - "Host IP", - [ - "Changing the host IP affects which network interfaces the proxy listens on:", - " • [cyan]127.0.0.1[/cyan] = Local access only (recommended for development)", - " • [cyan]0.0.0.0[/cyan] = Accessible from all network interfaces", - "", - "Applications configured to connect to the old host may fail to connect.", - ], - ) - if not confirmed: - continue - - new_host = Prompt.ask( - "Enter new host IP", default=self.config.config["host"] - ) - self.config.update(host=new_host) - self.console.print(f"\n[green]✅ Host updated to: {new_host}[/green]") - elif choice == "2": - # Show warning and require confirmation - confirmed = self.confirm_setting_change( - "Port", - [ - "Changing the port will affect all applications currently configured", - "to connect to your proxy on the existing port.", - "", - "Applications using the old port will fail to connect.", - ], - ) - if not confirmed: - continue - - new_port = IntPrompt.ask( - "Enter new port", default=self.config.config["port"] - ) - if 1 <= new_port <= 65535: - self.config.update(port=new_port) - self.console.print( - f"\n[green]✅ Port updated to: {new_port}[/green]" - ) - else: - self.console.print("\n[red]❌ Port must be between 1-65535[/red]") - elif choice == "3": - # Show warning and require confirmation - confirmed = self.confirm_setting_change( - "Proxy API Key", - [ - "This is the authentication key that applications use to access your proxy.", - "", - "[bold red]⚠️ Changing this will BREAK all applications currently configured", - " with the existing API key![/bold red]", - "", - "[bold cyan]💡 If you want to add provider API keys (OpenAI, Gemini, etc.),", - ' go to "3. 
🔑 Manage Credentials" in the main menu instead.[/bold cyan]', - ], - ) - if not confirmed: - continue - - current = os.getenv("PROXY_API_KEY", "") - new_key = Prompt.ask( - "Enter new Proxy API Key (leave empty to disable authentication)", - default=current, - ) - - if new_key != current: - # If setting to empty, show additional warning - if not new_key: - self.console.print( - "\n[bold red]⚠️ Authentication will be DISABLED - anyone can access your proxy![/bold red]" - ) - Prompt.ask("Press Enter to continue", default="") - - LauncherConfig.update_proxy_api_key(new_key) - - if new_key: - self.console.print( - "\n[green]✅ Proxy API Key updated successfully![/green]" - ) - self.console.print(" Updated in .env file") - else: - self.console.print( - "\n[yellow]⚠️ Proxy API Key cleared - authentication disabled![/yellow]" - ) - self.console.print(" Updated in .env file") - else: - self.console.print("\n[yellow]No changes made[/yellow]") - elif choice == "4": - current = self.config.config["enable_request_logging"] - self.config.update(enable_request_logging=not current) - self.console.print( - f"\n[green]✅ Transaction Logging {'enabled' if not current else 'disabled'}![/green]" - ) - elif choice == "5": - current = self.config.config.get("enable_raw_logging", False) - self.config.update(enable_raw_logging=not current) - self.console.print( - f"\n[green]✅ Raw I/O Logging {'enabled' if not current else 'disabled'}![/green]" - ) - elif choice == "6": - # Reset to Default Settings - # Define defaults - default_host = "127.0.0.1" - default_port = 8000 - default_logging = False - default_raw_logging = False - default_api_key = "VerysecretKey" - - # Get current values - current_host = self.config.config["host"] - current_port = self.config.config["port"] - current_logging = self.config.config["enable_request_logging"] - current_raw_logging = self.config.config.get( - "enable_raw_logging", False - ) - current_api_key = os.getenv("PROXY_API_KEY", "") - - # Build comparison table - warning_lines = [ - "This will reset ALL proxy settings to their defaults:", - "", - "[bold] Setting Current Value → Default Value[/bold]", - " " + "─" * 62, - f" Host IP {current_host:20} → {default_host}", - f" Port {str(current_port):20} → {default_port}", - f" Transaction Logging {'Enabled':20} → Disabled" - if current_logging - else f" Transaction Logging {'Disabled':20} → Disabled", - f" Raw I/O Logging {'Enabled':20} → Disabled" - if current_raw_logging - else f" Raw I/O Logging {'Disabled':20} → Disabled", - f" Proxy API Key {current_api_key[:20]:20} → {default_api_key}", - "", - "[bold red]⚠️ This may break applications configured with current settings![/bold red]", - ] - - confirmed = self.confirm_setting_change( - "Settings (Reset to Defaults)", warning_lines - ) - if not confirmed: - continue - - # Apply defaults - self.config.update( - host=default_host, - port=default_port, - enable_request_logging=default_logging, - enable_raw_logging=default_raw_logging, - ) - LauncherConfig.update_proxy_api_key(default_api_key) - - self.console.print( - "\n[green]✅ All settings have been reset to defaults![/green]" - ) - self.console.print(f" Host: {default_host}") - self.console.print(f" Port: {default_port}") - self.console.print(f" Transaction Logging: Disabled") - self.console.print(f" Raw I/O Logging: Disabled") - self.console.print(f" Proxy API Key: {default_api_key}") - elif choice == "7": - break - - def show_provider_settings_menu(self): - """Display provider/advanced settings (read-only + launch tool)""" - 
clear_screen() - - # Use basic settings to avoid heavy imports - provider_settings deferred to Settings Tool - settings = SettingsDetector.get_basic_settings() - - credentials = settings["credentials"] - custom_bases = settings["custom_bases"] - model_defs = settings["model_definitions"] - concurrency = settings["concurrency_limits"] - filters = settings["model_filters"] - - self.console.print( - Panel.fit( - "[bold cyan]📊 Provider & Advanced Settings[/bold cyan]", - border_style="cyan", - ) - ) - - # Configured Providers - self.console.print() - self.console.print("[bold]📊 Configured Providers[/bold]") - self.console.print("━" * 70) - if credentials: - for provider, info in credentials.items(): - provider_name = provider.title() - parts = [] - if info["api_keys"] > 0: - parts.append( - f"{info['api_keys']} API key{'s' if info['api_keys'] > 1 else ''}" - ) - if info["oauth"] > 0: - parts.append( - f"{info['oauth']} OAuth credential{'s' if info['oauth'] > 1 else ''}" - ) - - display = " + ".join(parts) - if info["custom"]: - display += " (Custom)" - - self.console.print(f" ✅ {provider_name:20} {display}") - else: - self.console.print(" [dim]No providers configured[/dim]") - - # Custom API Bases - if custom_bases: - self.console.print() - self.console.print("[bold]🌐 Custom API Bases[/bold]") - self.console.print("━" * 70) - for provider, base in custom_bases.items(): - self.console.print(f" • {provider:15} {base}") - - # Model Definitions - if model_defs: - self.console.print() - self.console.print("[bold]📦 Provider Model Definitions[/bold]") - self.console.print("━" * 70) - for provider, count in model_defs.items(): - self.console.print( - f" • {provider:15} {count} model{'s' if count > 1 else ''} configured" - ) - - # Concurrency Limits - if concurrency: - self.console.print() - self.console.print("[bold]⚡ Concurrency Limits[/bold]") - self.console.print("━" * 70) - for provider, limit in concurrency.items(): - self.console.print(f" • {provider:15} {limit} requests/key") - self.console.print(" • Default: 1 request/key (all others)") - - # Model Filters (basic info only) - if filters: - self.console.print() - self.console.print("[bold]🎯 Model Filters[/bold]") - self.console.print("━" * 70) - for provider, filter_info in filters.items(): - status_parts = [] - if filter_info["has_whitelist"]: - status_parts.append("Whitelist") - if filter_info["has_ignore"]: - status_parts.append("Ignore list") - status = " + ".join(status_parts) if status_parts else "None" - self.console.print(f" • {provider:15} ✅ {status}") - - # Provider-Specific Settings (deferred to Settings Tool to avoid heavy imports) - self.console.print() - self.console.print("[bold]🔬 Provider-Specific Settings[/bold]") - self.console.print("━" * 70) - self.console.print( - " [dim]Launch Settings Tool to view/configure provider-specific settings[/dim]" - ) - - # Actions - self.console.print() - self.console.print("━" * 70) - self.console.print() - self.console.print("[bold]💡 Actions[/bold]") - self.console.print() - self.console.print( - " 1. 🔧 Launch Settings Tool (configure advanced settings)" - ) - self.console.print(" 2. 
↩️ Back to Main Menu") - - self.console.print() - self.console.print("━" * 70) - self.console.print( - "[dim]ℹ️ Advanced settings are stored in .env file.\n Use the Settings Tool to configure them interactively.[/dim]" - ) - self.console.print() - self.console.print( - "[dim]⚠️ Note: Settings Tool supports only common configuration types.\n For complex settings, edit .env directly.[/dim]" - ) - self.console.print() - - choice = Prompt.ask("Select option", choices=["1", "2"], show_choices=False) - - if choice == "1": - self.launch_settings_tool() - # choice == "2" returns to main menu - - def launch_credential_tool(self): - """Launch credential management tool""" - import time - - # CRITICAL: Show full loading UI to replace the 6-7 second blank wait - clear_screen() - - _start_time = time.time() - - # Show the same header as standalone mode - self.console.print("━" * 70) - self.console.print("Interactive Credential Setup Tool") - self.console.print("GitHub: https://github.com/Mirrowel/LLM-API-Key-Proxy") - self.console.print("━" * 70) - self.console.print("Loading credential management components...") - - # Now import with spinner (this is where the 6-7 second delay happens) - with self.console.status("Initializing credential tool...", spinner="dots"): - from rotator_library.credential_tool import ( - run_credential_tool, - _ensure_providers_loaded, - ) - - _, PROVIDER_PLUGINS = _ensure_providers_loaded() - self.console.print("✓ Credential tool initialized") - - _elapsed = time.time() - _start_time - self.console.print( - f"✓ Tool ready in {_elapsed:.2f}s ({len(PROVIDER_PLUGINS)} providers available)" - ) - - # Small delay to let user see the ready message - time.sleep(0.5) - - # Run the tool with from_launcher=True to skip duplicate loading screen - run_credential_tool(from_launcher=True) - # Reload environment after credential tool - load_dotenv(dotenv_path=_get_env_file(), override=True) - - def launch_settings_tool(self): - """Launch settings configuration tool""" - import time - - clear_screen() - - self.console.print("━" * 70) - self.console.print("Advanced Settings Configuration Tool") - self.console.print("━" * 70) - - _start_time = time.time() - - with self.console.status("Initializing settings tool...", spinner="dots"): - from proxy_app.settings_tool import run_settings_tool - - _elapsed = time.time() - _start_time - self.console.print(f"✓ Settings tool ready in {_elapsed:.2f}s") - - time.sleep(0.3) - - run_settings_tool() - # Reload environment after settings tool - load_dotenv(dotenv_path=_get_env_file(), override=True) - - def launch_quota_viewer(self): - """Launch the quota stats viewer""" - clear_screen() - - self.console.print("━" * 70) - self.console.print("Quota & Usage Statistics Viewer") - self.console.print("━" * 70) - self.console.print() - - # Import the lightweight viewer (no heavy imports) - from proxy_app.quota_viewer import run_quota_viewer - - run_quota_viewer() - - def show_about(self): - """Display About page with project information""" - clear_screen() - - self.console.print( - Panel.fit( - "[bold cyan]ℹ️ About LLM API Key Proxy[/bold cyan]", border_style="cyan" - ) - ) - - self.console.print() - self.console.print("[bold]📦 Project Information[/bold]") - self.console.print("━" * 70) - self.console.print(" [bold cyan]LLM API Key Proxy[/bold cyan]") - self.console.print( - " A lightweight, high-performance proxy server for managing" - ) - self.console.print(" LLM API keys with automatic rotation and OAuth support") - self.console.print() - self.console.print( - 
" [dim]GitHub:[/dim] [blue underline]https://github.com/Mirrowel/LLM-API-Key-Proxy[/blue underline]" - ) - - self.console.print() - self.console.print("[bold]✨ Key Features[/bold]") - self.console.print("━" * 70) - self.console.print( - " • [green]Smart Key Rotation[/green] - Automatic rotation across multiple API keys" - ) - self.console.print( - " • [green]OAuth Support[/green] - Automated OAuth flows for supported providers" - ) - self.console.print( - " • [green]Multiple Providers[/green] - Support for 10+ LLM providers" - ) - self.console.print( - " • [green]Custom Providers[/green] - Easy integration of custom OpenAI-compatible APIs" - ) - self.console.print( - " • [green]Advanced Filtering[/green] - Model whitelists and ignore lists per provider" - ) - self.console.print( - " • [green]Concurrency Control[/green] - Per-key rate limiting and request management" - ) - self.console.print( - " • [green]Cost Tracking[/green] - Track usage and costs across all providers" - ) - self.console.print( - " • [green]Interactive TUI[/green] - Beautiful terminal interface for easy configuration" - ) - - self.console.print() - self.console.print("[bold]📝 License & Credits[/bold]") - self.console.print("━" * 70) - self.console.print(" Made with ❤️ by the community") - self.console.print(" Open source - contributions welcome!") - - self.console.print() - self.console.print("━" * 70) - self.console.print() - - Prompt.ask("Press Enter to return to main menu", default="") - - def run_proxy(self): - """Prepare and launch proxy in same window""" - # Check if forced onboarding needed - if self.needs_onboarding(): - clear_screen() - self.console.print( - Panel( - Text.from_markup( - "⚠️ [bold yellow]Setup Required[/bold yellow]\n\n" - "Cannot start without .env.\n" - "Launching credential tool..." - ), - border_style="yellow", - ) - ) - - # Force credential tool - from rotator_library.credential_tool import ( - ensure_env_defaults, - run_credential_tool, - ) - - ensure_env_defaults() - load_dotenv(dotenv_path=_get_env_file(), override=True) - run_credential_tool() - load_dotenv(dotenv_path=_get_env_file(), override=True) - - # Check again after credential tool - if not os.getenv("PROXY_API_KEY"): - self.console.print( - "\n[red]❌ PROXY_API_KEY still not set. 
Cannot start proxy.[/red]" - ) - return - - # Clear console and modify sys.argv - clear_screen() - self.console.print( - f"\n[bold green]🚀 Starting proxy on {self.config.config['host']}:{self.config.config['port']}...[/bold green]\n" - ) - - # Brief pause so user sees the message before main.py takes over - import time - - time.sleep(0.5) - - # Reconstruct sys.argv for main.py - sys.argv = [ - "main.py", - "--host", - self.config.config["host"], - "--port", - str(self.config.config["port"]), - ] - if self.config.config["enable_request_logging"]: - sys.argv.append("--enable-request-logging") - if self.config.config.get("enable_raw_logging", False): - sys.argv.append("--enable-raw-logging") - - # Exit TUI - main.py will continue execution - self.running = False - - -def run_launcher_tui(): - """Entry point for launcher TUI""" - tui = LauncherTUI() - tui.run() diff --git a/src/proxy_app/main.py b/src/proxy_app/main.py deleted file mode 100644 index 4d8dba99..00000000 --- a/src/proxy_app/main.py +++ /dev/null @@ -1,1719 +0,0 @@ -import time -import uuid - -# Phase 1: Minimal imports for arg parsing and TUI -import asyncio -import os -from pathlib import Path -import sys -import argparse -import logging - -# --- Argument Parsing (BEFORE heavy imports) --- -parser = argparse.ArgumentParser(description="API Key Proxy Server") -parser.add_argument( - "--host", type=str, default="0.0.0.0", help="Host to bind the server to." -) -parser.add_argument("--port", type=int, default=8000, help="Port to run the server on.") -parser.add_argument( - "--enable-request-logging", - action="store_true", - help="Enable transaction logging in the library (logs request/response with provider correlation).", -) -parser.add_argument( - "--enable-raw-logging", - action="store_true", - help="Enable raw I/O logging at proxy boundary (captures unmodified HTTP data, disabled by default).", -) -parser.add_argument( - "--add-credential", - action="store_true", - help="Launch the interactive tool to add a new OAuth credential.", -) -args, _ = parser.parse_known_args() - -# Add the 'src' directory to the Python path -sys.path.append(str(Path(__file__).resolve().parent.parent)) - -# Check if we should launch TUI (no arguments = TUI mode) -if len(sys.argv) == 1: - # TUI MODE - Load ONLY what's needed for the launcher (fast path!) 
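The TUI-to-server handoff hinges on rewriting sys.argv before main.py re-runs its module-level argparse. A condensed sketch of that pattern (flag names mirror the parser definitions in main.py; illustrative only, not a drop-in replacement for run_proxy):

import sys

# What launcher_tui's run_proxy() effectively does before returning control:
sys.argv = ["main.py", "--host", "127.0.0.1", "--port", "8000"]

enable_request_logging = True  # in the real code this comes from launcher_config.json
if enable_request_logging:
    sys.argv.append("--enable-request-logging")

# main.py then calls parser.parse_args() again, so the launched server behaves
# exactly as if it had been started from the shell:
#   python main.py --host 127.0.0.1 --port 8000 --enable-request-logging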
- from proxy_app.launcher_tui import run_launcher_tui - - run_launcher_tui() - # Launcher modifies sys.argv and returns, or exits if user chose Exit - # If we get here, user chose "Run Proxy" and sys.argv is modified - # Re-parse arguments with modified sys.argv - args = parser.parse_args() - -# Check if credential tool mode (also doesn't need heavy proxy imports) -if args.add_credential: - from rotator_library.credential_tool import run_credential_tool - - run_credential_tool() - sys.exit(0) - -# If we get here, we're ACTUALLY running the proxy - NOW show startup messages and start timer -_start_time = time.time() - -# Load all .env files from root folder (main .env first, then any additional *.env files) -from dotenv import load_dotenv -from glob import glob - -# Get the application root directory (EXE dir if frozen, else CWD) -# Inlined here to avoid triggering heavy rotator_library imports before loading screen -if getattr(sys, "frozen", False): - _root_dir = Path(sys.executable).parent -else: - _root_dir = Path.cwd() - -# Load main .env first -load_dotenv(_root_dir / ".env") - -# Load any additional .env files (e.g., antigravity_all_combined.env, gemini_cli_all_combined.env) -_env_files_found = list(_root_dir.glob("*.env")) -for _env_file in sorted(_root_dir.glob("*.env")): - if _env_file.name != ".env": # Skip main .env (already loaded) - load_dotenv(_env_file, override=False) # Don't override existing values - -# Log discovered .env files for deployment verification -if _env_files_found: - _env_names = [_ef.name for _ef in _env_files_found] - print(f"📁 Loaded {len(_env_files_found)} .env file(s): {', '.join(_env_names)}") - -# Get proxy API key for display -proxy_api_key = os.getenv("PROXY_API_KEY") -if proxy_api_key: - key_display = f"✓ {proxy_api_key}" -else: - key_display = "✗ Not Set (INSECURE - anyone can access!)" - -print("━" * 70) -print(f"Starting proxy on {args.host}:{args.port}") -print(f"Proxy API Key: {key_display}") -print(f"GitHub: https://github.com/Mirrowel/LLM-API-Key-Proxy") -print("━" * 70) -print("Loading server components...") - - -# Phase 2: Load Rich for loading spinner (lightweight) -from rich.console import Console - -_console = Console() - -# Phase 3: Heavy dependencies with granular loading messages -print(" → Loading FastAPI framework...") -with _console.status("[dim]Loading FastAPI framework...", spinner="dots"): - from contextlib import asynccontextmanager - from fastapi import FastAPI, Request, HTTPException, Depends - from fastapi.middleware.cors import CORSMiddleware - from fastapi.responses import StreamingResponse, JSONResponse - from fastapi.security import APIKeyHeader - -print(" → Loading core dependencies...") -with _console.status("[dim]Loading core dependencies...", spinner="dots"): - from dotenv import load_dotenv - import colorlog - import json - from typing import AsyncGenerator, Any, List, Optional, Union - from pydantic import BaseModel, ConfigDict, Field - - # --- Early Log Level Configuration --- - logging.getLogger("LiteLLM").setLevel(logging.WARNING) - -print(" → Loading LiteLLM library...") -with _console.status("[dim]Loading LiteLLM library...", spinner="dots"): - import litellm - -# Phase 4: Application imports with granular loading messages -print(" → Initializing proxy core...") -with _console.status("[dim]Initializing proxy core...", spinner="dots"): - from rotator_library import RotatingClient - from rotator_library.credential_manager import CredentialManager - from rotator_library.background_refresher import 
BackgroundRefresher - from rotator_library.model_info_service import init_model_info_service - from proxy_app.request_logger import log_request_to_console - from proxy_app.batch_manager import EmbeddingBatcher - from proxy_app.detailed_logger import RawIOLogger - -print(" → Discovering provider plugins...") -# Provider lazy loading happens during import, so time it here -_provider_start = time.time() -with _console.status("[dim]Discovering provider plugins...", spinner="dots"): - from rotator_library import ( - PROVIDER_PLUGINS, - ) # This triggers lazy load via __getattr__ -_provider_time = time.time() - _provider_start - -# Get count after import (without timing to avoid double-counting) -_plugin_count = len(PROVIDER_PLUGINS) - - -# --- Pydantic Models --- -class EmbeddingRequest(BaseModel): - model: str - input: Union[str, List[str]] - input_type: Optional[str] = None - dimensions: Optional[int] = None - user: Optional[str] = None - - -class ModelCard(BaseModel): - """Basic model card for minimal response.""" - - id: str - object: str = "model" - created: int = Field(default_factory=lambda: int(time.time())) - owned_by: str = "Mirro-Proxy" - - -class ModelCapabilities(BaseModel): - """Model capability flags.""" - - tool_choice: bool = False - function_calling: bool = False - reasoning: bool = False - vision: bool = False - system_messages: bool = True - prompt_caching: bool = False - assistant_prefill: bool = False - - -class EnrichedModelCard(BaseModel): - """Extended model card with pricing and capabilities.""" - - id: str - object: str = "model" - created: int = Field(default_factory=lambda: int(time.time())) - owned_by: str = "unknown" - # Pricing (optional - may not be available for all models) - input_cost_per_token: Optional[float] = None - output_cost_per_token: Optional[float] = None - cache_read_input_token_cost: Optional[float] = None - cache_creation_input_token_cost: Optional[float] = None - # Limits (optional) - max_input_tokens: Optional[int] = None - max_output_tokens: Optional[int] = None - context_window: Optional[int] = None - # Capabilities - mode: str = "chat" - supported_modalities: List[str] = Field(default_factory=lambda: ["text"]) - supported_output_modalities: List[str] = Field(default_factory=lambda: ["text"]) - capabilities: Optional[ModelCapabilities] = None - # Debug info (optional) - _sources: Optional[List[str]] = None - _match_type: Optional[str] = None - - model_config = ConfigDict(extra="allow") # Allow extra fields from the service - - -class ModelList(BaseModel): - """List of models response.""" - - object: str = "list" - data: List[ModelCard] - - -class EnrichedModelList(BaseModel): - """List of enriched models with pricing and capabilities.""" - - object: str = "list" - data: List[EnrichedModelCard] - - -# --- Anthropic API Models (imported from library) --- -from rotator_library.anthropic_compat import ( - AnthropicMessagesRequest, - AnthropicCountTokensRequest, -) - - -# Calculate total loading time -_elapsed = time.time() - _start_time -print( - f"✓ Server ready in {_elapsed:.2f}s ({_plugin_count} providers discovered in {_provider_time:.2f}s)" -) - -# Clear screen and reprint header for clean startup view -# This pushes loading messages up (still in scroll history) but shows a clean final screen -import os as _os_module - -_os_module.system("cls" if _os_module.name == "nt" else "clear") - -# Reprint header -print("━" * 70) -print(f"Starting proxy on {args.host}:{args.port}") -print(f"Proxy API Key: {key_display}") -print(f"GitHub: 
https://github.com/Mirrowel/LLM-API-Key-Proxy") -print("━" * 70) -print( - f"✓ Server ready in {_elapsed:.2f}s ({_plugin_count} providers discovered in {_provider_time:.2f}s)" -) - - -# Note: Debug logging will be added after logging configuration below - -# --- Logging Configuration --- -# Import path utilities here (after loading screen) to avoid triggering heavy imports early -from rotator_library.utils.paths import get_logs_dir, get_data_file - -LOG_DIR = get_logs_dir(_root_dir) - -# Configure a console handler with color (INFO and above only, no DEBUG) -console_handler = colorlog.StreamHandler(sys.stdout) -console_handler.setLevel(logging.INFO) -formatter = colorlog.ColoredFormatter( - "%(log_color)s%(message)s", - log_colors={ - "DEBUG": "cyan", - "INFO": "green", - "WARNING": "yellow", - "ERROR": "red", - "CRITICAL": "red,bg_white", - }, -) -console_handler.setFormatter(formatter) - -# Configure a file handler for INFO-level logs and higher -info_file_handler = logging.FileHandler(LOG_DIR / "proxy.log", encoding="utf-8") -info_file_handler.setLevel(logging.INFO) -info_file_handler.setFormatter( - logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -) - -# Configure a dedicated file handler for all DEBUG-level logs -debug_file_handler = logging.FileHandler(LOG_DIR / "proxy_debug.log", encoding="utf-8") -debug_file_handler.setLevel(logging.DEBUG) -debug_file_handler.setFormatter( - logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -) - - -# Create a filter to ensure the debug handler ONLY gets DEBUG messages from the rotator_library -class RotatorDebugFilter(logging.Filter): - def filter(self, record): - return record.levelno == logging.DEBUG and record.name.startswith( - "rotator_library" - ) - - -debug_file_handler.addFilter(RotatorDebugFilter()) - -# Configure a console handler with color -console_handler = colorlog.StreamHandler(sys.stdout) -console_handler.setLevel(logging.INFO) -formatter = colorlog.ColoredFormatter( - "%(log_color)s%(message)s", - log_colors={ - "DEBUG": "cyan", - "INFO": "green", - "WARNING": "yellow", - "ERROR": "red", - "CRITICAL": "red,bg_white", - }, -) -console_handler.setFormatter(formatter) - - -# Add a filter to prevent any LiteLLM logs from cluttering the console -class NoLiteLLMLogFilter(logging.Filter): - def filter(self, record): - return not record.name.startswith("LiteLLM") - - -console_handler.addFilter(NoLiteLLMLogFilter()) - -# Get the root logger and set it to DEBUG to capture all messages -root_logger = logging.getLogger() -root_logger.setLevel(logging.DEBUG) - -# Add all handlers to the root logger -root_logger.addHandler(info_file_handler) -root_logger.addHandler(console_handler) -root_logger.addHandler(debug_file_handler) - -# Silence other noisy loggers by setting their level higher than root -logging.getLogger("uvicorn").setLevel(logging.WARNING) -logging.getLogger("httpx").setLevel(logging.WARNING) - -# Isolate LiteLLM's logger to prevent it from reaching the console. -# We will capture its logs via the logger_fn callback in the client instead. 
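To make the handler/filter layering above easier to follow, here is a self-contained sketch of the same routing idea: the root logger stays at DEBUG, a filtered file handler receives only rotator_library DEBUG records, and the console handler stays at INFO (file and logger names mirror the configuration above; the extra proxy.log INFO handler is omitted for brevity):

import logging

class RotatorDebugFilter(logging.Filter):
    # Pass only DEBUG records emitted by rotator_library.* loggers.
    def filter(self, record: logging.LogRecord) -> bool:
        return record.levelno == logging.DEBUG and record.name.startswith("rotator_library")

debug_handler = logging.FileHandler("proxy_debug.log", encoding="utf-8")
debug_handler.setLevel(logging.DEBUG)
debug_handler.addFilter(RotatorDebugFilter())

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

root = logging.getLogger()
root.setLevel(logging.DEBUG)  # the root must be permissive; the handlers do the filtering
root.addHandler(debug_handler)
root.addHandler(console_handler)

logging.getLogger("rotator_library.client").debug("written to proxy_debug.log only")
logging.getLogger("rotator_library.client").info("shown on the console (and, in the full setup, proxy.log)")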
-litellm_logger = logging.getLogger("LiteLLM") -litellm_logger.handlers = [] -litellm_logger.propagate = False - -# Now that logging is configured, log the module load time to debug file only -logging.debug(f"Modules loaded in {_elapsed:.2f}s") - -# Load environment variables from .env file -load_dotenv(_root_dir / ".env") - -# --- Configuration --- -USE_EMBEDDING_BATCHER = False -ENABLE_REQUEST_LOGGING = args.enable_request_logging -ENABLE_RAW_LOGGING = args.enable_raw_logging -if ENABLE_REQUEST_LOGGING: - logging.info( - "Transaction logging is enabled (library-level with provider correlation)." - ) -if ENABLE_RAW_LOGGING: - logging.info("Raw I/O logging is enabled (proxy boundary, unmodified HTTP data).") -PROXY_API_KEY = os.getenv("PROXY_API_KEY") -# Note: PROXY_API_KEY validation moved to server startup to allow credential tool to run first - -# Discover API keys from environment variables -api_keys = {} -for key, value in os.environ.items(): - if "_API_KEY" in key and key != "PROXY_API_KEY": - provider = key.split("_API_KEY")[0].lower() - if provider not in api_keys: - api_keys[provider] = [] - api_keys[provider].append(value) - -# Load model ignore lists from environment variables -ignore_models = {} -for key, value in os.environ.items(): - if key.startswith("IGNORE_MODELS_"): - provider = key.replace("IGNORE_MODELS_", "").lower() - models_to_ignore = [ - model.strip() for model in value.split(",") if model.strip() - ] - ignore_models[provider] = models_to_ignore - logging.debug( - f"Loaded ignore list for provider '{provider}': {models_to_ignore}" - ) - -# Load model whitelist from environment variables -whitelist_models = {} -for key, value in os.environ.items(): - if key.startswith("WHITELIST_MODELS_"): - provider = key.replace("WHITELIST_MODELS_", "").lower() - models_to_whitelist = [ - model.strip() for model in value.split(",") if model.strip() - ] - whitelist_models[provider] = models_to_whitelist - logging.debug( - f"Loaded whitelist for provider '{provider}': {models_to_whitelist}" - ) - -# Load max concurrent requests per key from environment variables -max_concurrent_requests_per_key = {} -for key, value in os.environ.items(): - if key.startswith("MAX_CONCURRENT_REQUESTS_PER_KEY_"): - provider = key.replace("MAX_CONCURRENT_REQUESTS_PER_KEY_", "").lower() - try: - max_concurrent = int(value) - if max_concurrent < 1: - logging.warning( - f"Invalid max_concurrent value for provider '{provider}': {value}. Must be >= 1. Using default (1)." - ) - max_concurrent = 1 - max_concurrent_requests_per_key[provider] = max_concurrent - logging.debug( - f"Loaded max concurrent requests for provider '{provider}': {max_concurrent}" - ) - except ValueError: - logging.warning( - f"Invalid max_concurrent value for provider '{provider}': {value}. Using default (1)." - ) - - -# --- Lifespan Management --- -@asynccontextmanager -async def lifespan(app: FastAPI): - """Manage the RotatingClient's lifecycle with the app's lifespan.""" - # [MODIFIED] Perform skippable OAuth initialization at startup - skip_oauth_init = os.getenv("SKIP_OAUTH_INIT_CHECK", "false").lower() == "true" - - # The CredentialManager now handles all discovery, including .env overrides. - # We pass all environment variables to it for this purpose. 
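As a worked example of the environment-variable discovery loops above, a .env with the following placeholder entries would be grouped per provider as shown:

# Given these entries in the environment / .env (placeholder values):
#   GEMINI_API_KEY_1=key-one
#   GEMINI_API_KEY_2=key-two
#   OPENROUTER_API_KEY_1=key-three
#   PROXY_API_KEY=...                      # skipped explicitly by the loop
#   IGNORE_MODELS_GEMINI=legacy-model-a, legacy-model-b
#   MAX_CONCURRENT_REQUESTS_PER_KEY_GEMINI=4
#
# the loops above produce:
api_keys = {
    "gemini": ["key-one", "key-two"],
    "openrouter": ["key-three"],
}
ignore_models = {
    "gemini": ["legacy-model-a", "legacy-model-b"],
}
max_concurrent_requests_per_key = {
    "gemini": 4,
}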
- cred_manager = CredentialManager(os.environ) - oauth_credentials = cred_manager.discover_and_prepare() - - if not skip_oauth_init and oauth_credentials: - logging.info("Starting OAuth credential validation and deduplication...") - processed_emails = {} # email -> {provider: path} - credentials_to_initialize = {} # provider -> [paths] - final_oauth_credentials = {} - - # --- Pass 1: Pre-initialization Scan & Deduplication --- - # logging.info("Pass 1: Scanning for existing metadata to find duplicates...") - for provider, paths in oauth_credentials.items(): - if provider not in credentials_to_initialize: - credentials_to_initialize[provider] = [] - for path in paths: - # Skip env-based credentials (virtual paths) - they don't have metadata files - if path.startswith("env://"): - credentials_to_initialize[provider].append(path) - continue - - try: - with open(path, "r") as f: - data = json.load(f) - metadata = data.get("_proxy_metadata", {}) - email = metadata.get("email") - - if email: - if email not in processed_emails: - processed_emails[email] = {} - - if provider in processed_emails[email]: - original_path = processed_emails[email][provider] - logging.warning( - f"Duplicate for '{email}' on '{provider}' found in pre-scan: '{Path(path).name}'. Original: '{Path(original_path).name}'. Skipping." - ) - continue - else: - processed_emails[email][provider] = path - - credentials_to_initialize[provider].append(path) - - except (FileNotFoundError, json.JSONDecodeError) as e: - logging.warning( - f"Could not pre-read metadata from '{path}': {e}. Will process during initialization." - ) - credentials_to_initialize[provider].append(path) - - # --- Pass 2: Parallel Initialization of Filtered Credentials --- - # logging.info("Pass 2: Initializing unique credentials and performing final check...") - async def process_credential(provider: str, path: str, provider_instance): - """Process a single credential: initialize and fetch user info.""" - try: - await provider_instance.initialize_token(path) - - if not hasattr(provider_instance, "get_user_info"): - return (provider, path, None, None) - - user_info = await provider_instance.get_user_info(path) - email = user_info.get("email") - return (provider, path, email, None) - - except Exception as e: - logging.error( - f"Failed to process OAuth token for {provider} at '{path}': {e}" - ) - return (provider, path, None, e) - - # Collect all tasks for parallel execution - tasks = [] - for provider, paths in credentials_to_initialize.items(): - if not paths: - continue - - provider_plugin_class = PROVIDER_PLUGINS.get(provider) - if not provider_plugin_class: - continue - - provider_instance = provider_plugin_class() - - for path in paths: - tasks.append(process_credential(provider, path, provider_instance)) - - # Execute all credential processing tasks in parallel - results = await asyncio.gather(*tasks, return_exceptions=True) - - # --- Pass 3: Sequential Deduplication and Final Assembly --- - for result in results: - # Handle exceptions from gather - if isinstance(result, Exception): - logging.error(f"Credential processing raised exception: {result}") - continue - - provider, path, email, error = result - - # Skip if there was an error - if error: - continue - - # If provider doesn't support get_user_info, add directly - if email is None: - if provider not in final_oauth_credentials: - final_oauth_credentials[provider] = [] - final_oauth_credentials[provider].append(path) - continue - - # Handle empty email - if not email: - logging.warning( - f"Could not 
retrieve email for '{path}'. Treating as unique." - ) - if provider not in final_oauth_credentials: - final_oauth_credentials[provider] = [] - final_oauth_credentials[provider].append(path) - continue - - # Deduplication check - if email not in processed_emails: - processed_emails[email] = {} - - if ( - provider in processed_emails[email] - and processed_emails[email][provider] != path - ): - original_path = processed_emails[email][provider] - logging.warning( - f"Duplicate for '{email}' on '{provider}' found post-init: '{Path(path).name}'. Original: '{Path(original_path).name}'. Skipping." - ) - continue - else: - processed_emails[email][provider] = path - if provider not in final_oauth_credentials: - final_oauth_credentials[provider] = [] - final_oauth_credentials[provider].append(path) - - # Update metadata (skip for env-based credentials - they don't have files) - if not path.startswith("env://"): - try: - with open(path, "r+") as f: - data = json.load(f) - metadata = data.get("_proxy_metadata", {}) - metadata["email"] = email - metadata["last_check_timestamp"] = time.time() - data["_proxy_metadata"] = metadata - f.seek(0) - json.dump(data, f, indent=2) - f.truncate() - except Exception as e: - logging.error(f"Failed to update metadata for '{path}': {e}") - - logging.info("OAuth credential processing complete.") - oauth_credentials = final_oauth_credentials - - # [NEW] Load provider-specific params - litellm_provider_params = { - "gemini_cli": {"project_id": os.getenv("GEMINI_CLI_PROJECT_ID")} - } - - # Load global timeout from environment (default 30 seconds) - global_timeout = int(os.getenv("GLOBAL_TIMEOUT", "30")) - - # The client now uses the root logger configuration - client = RotatingClient( - api_keys=api_keys, - oauth_credentials=oauth_credentials, # Pass OAuth config - configure_logging=True, - global_timeout=global_timeout, - litellm_provider_params=litellm_provider_params, - ignore_models=ignore_models, - whitelist_models=whitelist_models, - enable_request_logging=ENABLE_REQUEST_LOGGING, - max_concurrent_requests_per_key=max_concurrent_requests_per_key, - ) - - # Log loaded credentials summary (compact, always visible for deployment verification) - # _api_summary = ', '.join([f"{p}:{len(c)}" for p, c in api_keys.items()]) if api_keys else "none" - # _oauth_summary = ', '.join([f"{p}:{len(c)}" for p, c in oauth_credentials.items()]) if oauth_credentials else "none" - # _total_summary = ', '.join([f"{p}:{len(c)}" for p, c in client.all_credentials.items()]) - # print(f"🔑 Credentials loaded: {_total_summary} (API: {_api_summary} | OAuth: {_oauth_summary})") - client.background_refresher.start() # Start the background task - app.state.rotating_client = client - - # Warn if no provider credentials are configured - if not client.all_credentials: - logging.warning("=" * 70) - logging.warning("⚠️ NO PROVIDER CREDENTIALS CONFIGURED") - logging.warning("The proxy is running but cannot serve any LLM requests.") - logging.warning( - "Launch the credential tool to add API keys or OAuth credentials." 
- ) - logging.warning(" • Executable: Run with --add-credential flag") - logging.warning(" • Source: python src/proxy_app/main.py --add-credential") - logging.warning("=" * 70) - - os.environ["LITELLM_LOG"] = "ERROR" - litellm.set_verbose = False - litellm.drop_params = True - if USE_EMBEDDING_BATCHER: - batcher = EmbeddingBatcher(client=client) - app.state.embedding_batcher = batcher - logging.info("RotatingClient and EmbeddingBatcher initialized.") - else: - app.state.embedding_batcher = None - logging.info("RotatingClient initialized (EmbeddingBatcher disabled).") - - # Start model info service in background (fetches pricing/capabilities data) - # This runs asynchronously and doesn't block proxy startup - model_info_service = await init_model_info_service() - app.state.model_info_service = model_info_service - logging.info("Model info service started (fetching pricing data in background).") - - yield - - await client.background_refresher.stop() # Stop the background task on shutdown - if app.state.embedding_batcher: - await app.state.embedding_batcher.stop() - await client.close() - - # Stop model info service - if hasattr(app.state, "model_info_service") and app.state.model_info_service: - await app.state.model_info_service.stop() - - if app.state.embedding_batcher: - logging.info("RotatingClient and EmbeddingBatcher closed.") - else: - logging.info("RotatingClient closed.") - - -# --- FastAPI App Setup --- -app = FastAPI(lifespan=lifespan) - -# Add CORS middleware to allow all origins, methods, and headers -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], # Allows all origins - allow_credentials=True, - allow_methods=["*"], # Allows all methods - allow_headers=["*"], # Allows all headers -) -api_key_header = APIKeyHeader(name="Authorization", auto_error=False) - - -def get_rotating_client(request: Request) -> RotatingClient: - """Dependency to get the rotating client instance from the app state.""" - return request.app.state.rotating_client - - -def get_embedding_batcher(request: Request) -> EmbeddingBatcher: - """Dependency to get the embedding batcher instance from the app state.""" - return request.app.state.embedding_batcher - - -async def verify_api_key(auth: str = Depends(api_key_header)): - """Dependency to verify the proxy API key.""" - # If PROXY_API_KEY is not set or empty, skip verification (open access) - if not PROXY_API_KEY: - return auth - if not auth or auth != f"Bearer {PROXY_API_KEY}": - raise HTTPException(status_code=401, detail="Invalid or missing API Key") - return auth - - -# --- Anthropic API Key Header --- -anthropic_api_key_header = APIKeyHeader(name="x-api-key", auto_error=False) - - -async def verify_anthropic_api_key( - x_api_key: str = Depends(anthropic_api_key_header), - auth: str = Depends(api_key_header), -): - """ - Dependency to verify API key for Anthropic endpoints. - Accepts either x-api-key header (Anthropic style) or Authorization Bearer (OpenAI style). 
- """ - # Check x-api-key first (Anthropic style) - if x_api_key and x_api_key == PROXY_API_KEY: - return x_api_key - # Fall back to Bearer token (OpenAI style) - if auth and auth == f"Bearer {PROXY_API_KEY}": - return auth - raise HTTPException(status_code=401, detail="Invalid or missing API Key") - - -async def streaming_response_wrapper( - request: Request, - request_data: dict, - response_stream: AsyncGenerator[str, None], - logger: Optional[RawIOLogger] = None, -) -> AsyncGenerator[str, None]: - """ - Wraps a streaming response to log the full response after completion - and ensures any errors during the stream are sent to the client. - """ - response_chunks = [] - full_response = {} - - try: - async for chunk_str in response_stream: - if await request.is_disconnected(): - logging.warning("Client disconnected, stopping stream.") - break - yield chunk_str - if chunk_str.strip() and chunk_str.startswith("data:"): - content = chunk_str[len("data:") :].strip() - if content != "[DONE]": - try: - chunk_data = json.loads(content) - response_chunks.append(chunk_data) - if logger: - logger.log_stream_chunk(chunk_data) - except json.JSONDecodeError: - pass - except Exception as e: - logging.error(f"An error occurred during the response stream: {e}") - # Yield a final error message to the client to ensure they are not left hanging. - error_payload = { - "error": { - "message": f"An unexpected error occurred during the stream: {str(e)}", - "type": "proxy_internal_error", - "code": 500, - } - } - yield f"data: {json.dumps(error_payload)}\n\n" - yield "data: [DONE]\n\n" - # Also log this as a failed request - if logger: - logger.log_final_response( - status_code=500, headers=None, body={"error": str(e)} - ) - return # Stop further processing - finally: - if response_chunks: - # --- Aggregation Logic --- - final_message = {"role": "assistant"} - aggregated_tool_calls = {} - usage_data = None - finish_reason = None - - for chunk in response_chunks: - if "choices" in chunk and chunk["choices"]: - choice = chunk["choices"][0] - delta = choice.get("delta", {}) - - # Dynamically aggregate all fields from the delta - for key, value in delta.items(): - if value is None: - continue - - if key == "content": - if "content" not in final_message: - final_message["content"] = "" - if value: - final_message["content"] += value - - elif key == "tool_calls": - for tc_chunk in value: - index = tc_chunk["index"] - if index not in aggregated_tool_calls: - aggregated_tool_calls[index] = { - "type": "function", - "function": {"name": "", "arguments": ""}, - } - # Ensure 'function' key exists for this index before accessing its sub-keys - if "function" not in aggregated_tool_calls[index]: - aggregated_tool_calls[index]["function"] = { - "name": "", - "arguments": "", - } - if tc_chunk.get("id"): - aggregated_tool_calls[index]["id"] = tc_chunk["id"] - if "function" in tc_chunk: - if "name" in tc_chunk["function"]: - if tc_chunk["function"]["name"] is not None: - aggregated_tool_calls[index]["function"][ - "name" - ] += tc_chunk["function"]["name"] - if "arguments" in tc_chunk["function"]: - if ( - tc_chunk["function"]["arguments"] - is not None - ): - aggregated_tool_calls[index]["function"][ - "arguments" - ] += tc_chunk["function"]["arguments"] - - elif key == "function_call": - if "function_call" not in final_message: - final_message["function_call"] = { - "name": "", - "arguments": "", - } - if "name" in value: - if value["name"] is not None: - final_message["function_call"]["name"] += value[ - "name" - ] - if 
"arguments" in value: - if value["arguments"] is not None: - final_message["function_call"]["arguments"] += ( - value["arguments"] - ) - - else: # Generic key handling for other data like 'reasoning' - # FIX: Role should always replace, never concatenate - if key == "role": - final_message[key] = value - elif key not in final_message: - final_message[key] = value - elif isinstance(final_message.get(key), str): - final_message[key] += value - else: - final_message[key] = value - - if "finish_reason" in choice and choice["finish_reason"]: - finish_reason = choice["finish_reason"] - - if "usage" in chunk and chunk["usage"]: - usage_data = chunk["usage"] - - # --- Final Response Construction --- - if aggregated_tool_calls: - final_message["tool_calls"] = list(aggregated_tool_calls.values()) - # CRITICAL FIX: Override finish_reason when tool_calls exist - # This ensures OpenCode and other agentic systems continue the conversation loop - finish_reason = "tool_calls" - - # Ensure standard fields are present for consistent logging - for field in ["content", "tool_calls", "function_call"]: - if field not in final_message: - final_message[field] = None - - first_chunk = response_chunks[0] - final_choice = { - "index": 0, - "message": final_message, - "finish_reason": finish_reason, - } - - full_response = { - "id": first_chunk.get("id"), - "object": "chat.completion", - "created": first_chunk.get("created"), - "model": first_chunk.get("model"), - "choices": [final_choice], - "usage": usage_data, - } - - if logger: - logger.log_final_response( - status_code=200, - headers=None, # Headers are not available at this stage - body=full_response, - ) - - -@app.post("/v1/chat/completions") -async def chat_completions( - request: Request, - client: RotatingClient = Depends(get_rotating_client), - _=Depends(verify_api_key), -): - """ - OpenAI-compatible endpoint powered by the RotatingClient. - Handles both streaming and non-streaming responses and logs them. - """ - # Raw I/O logger captures unmodified HTTP data at proxy boundary (disabled by default) - raw_logger = RawIOLogger() if ENABLE_RAW_LOGGING else None - try: - # Read and parse the request body only once at the beginning. - try: - request_data = await request.json() - except json.JSONDecodeError: - raise HTTPException(status_code=400, detail="Invalid JSON in request body.") - - # Global temperature=0 override (controlled by .env variable, default: OFF) - # Low temperature makes models deterministic and prone to following training data - # instead of actual schemas, which can cause tool hallucination - # Modes: "remove" = delete temperature key, "set" = change to 1.0, "false" = disabled - override_temp_zero = os.getenv("OVERRIDE_TEMPERATURE_ZERO", "false").lower() - - if ( - override_temp_zero in ("remove", "set", "true", "1", "yes") - and "temperature" in request_data - and request_data["temperature"] == 0 - ): - if override_temp_zero == "remove": - # Remove temperature key entirely - del request_data["temperature"] - logging.debug( - "OVERRIDE_TEMPERATURE_ZERO=remove: Removed temperature=0 from request" - ) - else: - # Set to 1.0 (for "set", "true", "1", "yes") - request_data["temperature"] = 1.0 - logging.debug( - "OVERRIDE_TEMPERATURE_ZERO=set: Converting temperature=0 to temperature=1.0" - ) - - # If raw logging is enabled, capture the unmodified request data. - if raw_logger: - raw_logger.log_request(headers=request.headers, body=request_data) - - # Extract and log specific reasoning parameters for monitoring. 
- model = request_data.get("model") - generation_cfg = ( - request_data.get("generationConfig", {}) - or request_data.get("generation_config", {}) - or {} - ) - reasoning_effort = request_data.get("reasoning_effort") or generation_cfg.get( - "reasoning_effort" - ) - - logging.getLogger("rotator_library").debug( - f"Handling reasoning parameters: model={model}, reasoning_effort={reasoning_effort}" - ) - - # Log basic request info to console (this is a separate, simpler logger). - log_request_to_console( - url=str(request.url), - headers=dict(request.headers), - client_info=(request.client.host, request.client.port), - request_data=request_data, - ) - is_streaming = request_data.get("stream", False) - - if is_streaming: - response_generator = client.acompletion(request=request, **request_data) - return StreamingResponse( - streaming_response_wrapper( - request, request_data, response_generator, raw_logger - ), - media_type="text/event-stream", - ) - else: - response = await client.acompletion(request=request, **request_data) - if raw_logger: - # Assuming response has status_code and headers attributes - # This might need adjustment based on the actual response object - response_headers = ( - response.headers if hasattr(response, "headers") else None - ) - status_code = ( - response.status_code if hasattr(response, "status_code") else 200 - ) - raw_logger.log_final_response( - status_code=status_code, - headers=response_headers, - body=response.model_dump(), - ) - return response - - except ( - litellm.InvalidRequestError, - ValueError, - litellm.ContextWindowExceededError, - ) as e: - raise HTTPException(status_code=400, detail=f"Invalid Request: {str(e)}") - except litellm.AuthenticationError as e: - raise HTTPException(status_code=401, detail=f"Authentication Error: {str(e)}") - except litellm.RateLimitError as e: - raise HTTPException(status_code=429, detail=f"Rate Limit Exceeded: {str(e)}") - except (litellm.ServiceUnavailableError, litellm.APIConnectionError) as e: - raise HTTPException(status_code=503, detail=f"Service Unavailable: {str(e)}") - except litellm.Timeout as e: - raise HTTPException(status_code=504, detail=f"Gateway Timeout: {str(e)}") - except (litellm.InternalServerError, litellm.OpenAIError) as e: - raise HTTPException(status_code=502, detail=f"Bad Gateway: {str(e)}") - except Exception as e: - logging.error(f"Request failed after all retries: {e}") - # Optionally log the failed request via the raw I/O logger if it is enabled - if ENABLE_REQUEST_LOGGING: - try: - request_data = await request.json() - except json.JSONDecodeError: - request_data = {"error": "Could not parse request body"} - if raw_logger: - raw_logger.log_final_response( - status_code=500, headers=None, body={"error": str(e)} - ) - raise HTTPException(status_code=500, detail=str(e)) - - -# --- Anthropic Messages API Endpoint --- -@app.post("/v1/messages") -async def anthropic_messages( - request: Request, - body: AnthropicMessagesRequest, - client: RotatingClient = Depends(get_rotating_client), - _=Depends(verify_anthropic_api_key), -): - """ - Anthropic-compatible Messages API endpoint. - - Accepts requests in Anthropic's format and returns responses in Anthropic's format. - Internally translates to OpenAI format for processing via LiteLLM. - - This endpoint is compatible with Claude Code and other Anthropic API clients.
- """ - # Initialize raw I/O logger if enabled (for debugging proxy boundary) - logger = RawIOLogger() if ENABLE_RAW_LOGGING else None - - # Log raw Anthropic request if raw logging is enabled - if logger: - logger.log_request( - headers=dict(request.headers), - body=body.model_dump(exclude_none=True), - ) - - try: - # Log the request to console - log_request_to_console( - url=str(request.url), - headers=dict(request.headers), - client_info=( - request.client.host if request.client else "unknown", - request.client.port if request.client else 0, - ), - request_data=body.model_dump(exclude_none=True), - ) - - # Use the library method to handle the request - result = await client.anthropic_messages(body, raw_request=request) - - if body.stream: - # Streaming response - return StreamingResponse( - result, - media_type="text/event-stream", - headers={ - "Cache-Control": "no-cache", - "Connection": "keep-alive", - "X-Accel-Buffering": "no", - }, - ) - else: - # Non-streaming response - if logger: - logger.log_final_response( - status_code=200, - headers=None, - body=result, - ) - return JSONResponse(content=result) - - except ( - litellm.InvalidRequestError, - ValueError, - litellm.ContextWindowExceededError, - ) as e: - error_response = { - "type": "error", - "error": {"type": "invalid_request_error", "message": str(e)}, - } - raise HTTPException(status_code=400, detail=error_response) - except litellm.AuthenticationError as e: - error_response = { - "type": "error", - "error": {"type": "authentication_error", "message": str(e)}, - } - raise HTTPException(status_code=401, detail=error_response) - except litellm.RateLimitError as e: - error_response = { - "type": "error", - "error": {"type": "rate_limit_error", "message": str(e)}, - } - raise HTTPException(status_code=429, detail=error_response) - except (litellm.ServiceUnavailableError, litellm.APIConnectionError) as e: - error_response = { - "type": "error", - "error": {"type": "api_error", "message": str(e)}, - } - raise HTTPException(status_code=503, detail=error_response) - except litellm.Timeout as e: - error_response = { - "type": "error", - "error": {"type": "api_error", "message": f"Request timed out: {str(e)}"}, - } - raise HTTPException(status_code=504, detail=error_response) - except Exception as e: - logging.error(f"Anthropic messages endpoint error: {e}") - if logger: - logger.log_final_response( - status_code=500, - headers=None, - body={"error": str(e)}, - ) - error_response = { - "type": "error", - "error": {"type": "api_error", "message": str(e)}, - } - raise HTTPException(status_code=500, detail=error_response) - - -# --- Anthropic Count Tokens Endpoint --- -@app.post("/v1/messages/count_tokens") -async def anthropic_count_tokens( - request: Request, - body: AnthropicCountTokensRequest, - client: RotatingClient = Depends(get_rotating_client), - _=Depends(verify_anthropic_api_key), -): - """ - Anthropic-compatible count_tokens endpoint. - - Counts the number of tokens that would be used by a Messages API request. - This is useful for estimating costs and managing context windows. - - Accepts requests in Anthropic's format and returns token count in Anthropic's format. 
- """ - try: - # Use the library method to handle the request - result = await client.anthropic_count_tokens(body) - return JSONResponse(content=result) - - except ( - litellm.InvalidRequestError, - ValueError, - litellm.ContextWindowExceededError, - ) as e: - error_response = { - "type": "error", - "error": {"type": "invalid_request_error", "message": str(e)}, - } - raise HTTPException(status_code=400, detail=error_response) - except litellm.AuthenticationError as e: - error_response = { - "type": "error", - "error": {"type": "authentication_error", "message": str(e)}, - } - raise HTTPException(status_code=401, detail=error_response) - except Exception as e: - logging.error(f"Anthropic count_tokens endpoint error: {e}") - error_response = { - "type": "error", - "error": {"type": "api_error", "message": str(e)}, - } - raise HTTPException(status_code=500, detail=error_response) - - -@app.post("/v1/embeddings") -async def embeddings( - request: Request, - body: EmbeddingRequest, - client: RotatingClient = Depends(get_rotating_client), - batcher: Optional[EmbeddingBatcher] = Depends(get_embedding_batcher), - _=Depends(verify_api_key), -): - """ - OpenAI-compatible endpoint for creating embeddings. - Supports two modes based on the USE_EMBEDDING_BATCHER flag: - - True: Uses a server-side batcher for high throughput. - - False: Passes requests directly to the provider. - """ - try: - request_data = body.model_dump(exclude_none=True) - log_request_to_console( - url=str(request.url), - headers=dict(request.headers), - client_info=(request.client.host, request.client.port), - request_data=request_data, - ) - if USE_EMBEDDING_BATCHER and batcher: - # --- Server-Side Batching Logic --- - request_data = body.model_dump(exclude_none=True) - inputs = request_data.get("input", []) - if isinstance(inputs, str): - inputs = [inputs] - - tasks = [] - for single_input in inputs: - individual_request = request_data.copy() - individual_request["input"] = single_input - tasks.append(batcher.add_request(individual_request)) - - results = await asyncio.gather(*tasks) - - all_data = [] - total_prompt_tokens = 0 - total_tokens = 0 - for i, result in enumerate(results): - result["data"][0]["index"] = i - all_data.extend(result["data"]) - total_prompt_tokens += result["usage"]["prompt_tokens"] - total_tokens += result["usage"]["total_tokens"] - - final_response_data = { - "object": "list", - "model": results[0]["model"], - "data": all_data, - "usage": { - "prompt_tokens": total_prompt_tokens, - "total_tokens": total_tokens, - }, - } - response = litellm.EmbeddingResponse(**final_response_data) - - else: - # --- Direct Pass-Through Logic --- - request_data = body.model_dump(exclude_none=True) - if isinstance(request_data.get("input"), str): - request_data["input"] = [request_data["input"]] - - response = await client.aembedding(request=request, **request_data) - - return response - - except HTTPException as e: - # Re-raise HTTPException to ensure it's not caught by the generic Exception handler - raise e - except ( - litellm.InvalidRequestError, - ValueError, - litellm.ContextWindowExceededError, - ) as e: - raise HTTPException(status_code=400, detail=f"Invalid Request: {str(e)}") - except litellm.AuthenticationError as e: - raise HTTPException(status_code=401, detail=f"Authentication Error: {str(e)}") - except litellm.RateLimitError as e: - raise HTTPException(status_code=429, detail=f"Rate Limit Exceeded: {str(e)}") - except (litellm.ServiceUnavailableError, litellm.APIConnectionError) as e: - raise 
HTTPException(status_code=503, detail=f"Service Unavailable: {str(e)}") - except litellm.Timeout as e: - raise HTTPException(status_code=504, detail=f"Gateway Timeout: {str(e)}") - except (litellm.InternalServerError, litellm.OpenAIError) as e: - raise HTTPException(status_code=502, detail=f"Bad Gateway: {str(e)}") - except Exception as e: - logging.error(f"Embedding request failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.get("/") -def read_root(): - return {"Status": "API Key Proxy is running"} - - -@app.get("/v1/models") -async def list_models( - request: Request, - client: RotatingClient = Depends(get_rotating_client), - _=Depends(verify_api_key), - enriched: bool = True, -): - """ - Returns a list of available models in the OpenAI-compatible format. - - Query Parameters: - enriched: If True (default), returns detailed model info with pricing and capabilities. - If False, returns minimal OpenAI-compatible response. - """ - model_ids = await client.get_all_available_models(grouped=False) - - if enriched and hasattr(request.app.state, "model_info_service"): - model_info_service = request.app.state.model_info_service - if model_info_service.is_ready: - # Return enriched model data - enriched_data = model_info_service.enrich_model_list(model_ids) - return {"object": "list", "data": enriched_data} - - # Fallback to basic model cards - model_cards = [ - { - "id": model_id, - "object": "model", - "created": int(time.time()), - "owned_by": "Mirro-Proxy", - } - for model_id in model_ids - ] - return {"object": "list", "data": model_cards} - - -@app.get("/v1/models/{model_id:path}") -async def get_model( - model_id: str, - request: Request, - _=Depends(verify_api_key), -): - """ - Returns detailed information about a specific model. - - Path Parameters: - model_id: The model ID (e.g., "anthropic/claude-3-opus", "openrouter/openai/gpt-4") - """ - if hasattr(request.app.state, "model_info_service"): - model_info_service = request.app.state.model_info_service - if model_info_service.is_ready: - info = model_info_service.get_model_info(model_id) - if info: - return info.to_dict() - - # Return basic info if service not ready or model not found - return { - "id": model_id, - "object": "model", - "created": int(time.time()), - "owned_by": model_id.split("/")[0] if "/" in model_id else "unknown", - } - - -@app.get("/v1/model-info/stats") -async def model_info_stats( - request: Request, - _=Depends(verify_api_key), -): - """ - Returns statistics about the model info service (for monitoring/debugging). - """ - if hasattr(request.app.state, "model_info_service"): - return request.app.state.model_info_service.get_stats() - return {"error": "Model info service not initialized"} - - -@app.get("/v1/providers") -async def list_providers(_=Depends(verify_api_key)): - """ - Returns a list of all available providers. - """ - return list(PROVIDER_PLUGINS.keys()) - - -@app.get("/v1/quota-stats") -async def get_quota_stats( - request: Request, - client: RotatingClient = Depends(get_rotating_client), - _=Depends(verify_api_key), - provider: str = None, -): - """ - Returns quota and usage statistics for all credentials. - - This returns cached data from the proxy without making external API calls. - Use POST to reload from disk or force refresh from external APIs. 
- - Query Parameters: - provider: Optional filter to return stats for a specific provider only - - Returns: - { - "providers": { - "provider_name": { - "credential_count": int, - "active_count": int, - "on_cooldown_count": int, - "exhausted_count": int, - "total_requests": int, - "tokens": {...}, - "approx_cost": float | null, - "quota_groups": {...}, // For Antigravity - "credentials": [...] - } - }, - "summary": {...}, - "data_source": "cache", - "timestamp": float - } - """ - try: - stats = await client.get_quota_stats(provider_filter=provider) - return stats - except Exception as e: - logging.error(f"Failed to get quota stats: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.post("/v1/quota-stats") -async def refresh_quota_stats( - request: Request, - client: RotatingClient = Depends(get_rotating_client), - _=Depends(verify_api_key), -): - """ - Refresh quota and usage statistics. - - Request body: - { - "action": "reload" | "force_refresh", - "scope": "all" | "provider" | "credential", - "provider": "antigravity", // required if scope != "all" - "credential": "antigravity_oauth_1.json" // required if scope == "credential" - } - - Actions: - - reload: Re-read data from disk (no external API calls) - - force_refresh: For Antigravity, fetch live quota from API. - For other providers, same as reload. - - Returns: - Same as GET, plus a "refresh_result" field with operation details. - """ - try: - data = await request.json() - action = data.get("action", "reload") - scope = data.get("scope", "all") - provider = data.get("provider") - credential = data.get("credential") - - # Validate parameters - if action not in ("reload", "force_refresh"): - raise HTTPException( - status_code=400, - detail="action must be 'reload' or 'force_refresh'", - ) - - if scope not in ("all", "provider", "credential"): - raise HTTPException( - status_code=400, - detail="scope must be 'all', 'provider', or 'credential'", - ) - - if scope in ("provider", "credential") and not provider: - raise HTTPException( - status_code=400, - detail="'provider' is required when scope is 'provider' or 'credential'", - ) - - if scope == "credential" and not credential: - raise HTTPException( - status_code=400, - detail="'credential' is required when scope is 'credential'", - ) - - refresh_result = { - "action": action, - "scope": scope, - "provider": provider, - "credential": credential, - } - - if action == "reload": - # Just reload from disk - start_time = time.time() - await client.reload_usage_from_disk() - refresh_result["duration_ms"] = int((time.time() - start_time) * 1000) - refresh_result["success"] = True - refresh_result["message"] = "Reloaded usage data from disk" - - elif action == "force_refresh": - # Force refresh from external API (for supported providers like Antigravity) - result = await client.force_refresh_quota( - provider=provider if scope in ("provider", "credential") else None, - credential=credential if scope == "credential" else None, - ) - refresh_result.update(result) - refresh_result["success"] = result["failed_count"] == 0 - - # Get updated stats - stats = await client.get_quota_stats(provider_filter=provider) - stats["refresh_result"] = refresh_result - stats["data_source"] = "refreshed" - - return stats - - except HTTPException: - raise - except Exception as e: - logging.error(f"Failed to refresh quota stats: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.post("/v1/token-count") -async def token_count( - request: Request, - client: RotatingClient = 
Depends(get_rotating_client), - _=Depends(verify_api_key), -): - """ - Calculates the token count for a given list of messages and a model. - """ - try: - data = await request.json() - model = data.get("model") - messages = data.get("messages") - - if not model or not messages: - raise HTTPException( - status_code=400, detail="'model' and 'messages' are required." - ) - - count = client.token_count(**data) - return {"token_count": count} - - except Exception as e: - logging.error(f"Token count failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.post("/v1/cost-estimate") -async def cost_estimate(request: Request, _=Depends(verify_api_key)): - """ - Estimates the cost for a request based on token counts and model pricing. - - Request body: - { - "model": "anthropic/claude-3-opus", - "prompt_tokens": 1000, - "completion_tokens": 500, - "cache_read_tokens": 0, # optional - "cache_creation_tokens": 0 # optional - } - - Returns: - { - "model": "anthropic/claude-3-opus", - "cost": 0.0375, - "currency": "USD", - "pricing": { - "input_cost_per_token": 0.000015, - "output_cost_per_token": 0.000075 - }, - "source": "model_info_service" # or "litellm_fallback" - } - """ - try: - data = await request.json() - model = data.get("model") - prompt_tokens = data.get("prompt_tokens", 0) - completion_tokens = data.get("completion_tokens", 0) - cache_read_tokens = data.get("cache_read_tokens", 0) - cache_creation_tokens = data.get("cache_creation_tokens", 0) - - if not model: - raise HTTPException(status_code=400, detail="'model' is required.") - - result = { - "model": model, - "cost": None, - "currency": "USD", - "pricing": {}, - "source": None, - } - - # Try model info service first - if hasattr(request.app.state, "model_info_service"): - model_info_service = request.app.state.model_info_service - if model_info_service.is_ready: - cost = model_info_service.calculate_cost( - model, - prompt_tokens, - completion_tokens, - cache_read_tokens, - cache_creation_tokens, - ) - if cost is not None: - cost_info = model_info_service.get_cost_info(model) - result["cost"] = cost - result["pricing"] = cost_info or {} - result["source"] = "model_info_service" - return result - - # Fallback to litellm - try: - import litellm - - # Create a mock response for cost calculation - model_info = litellm.get_model_info(model) - input_cost = model_info.get("input_cost_per_token", 0) - output_cost = model_info.get("output_cost_per_token", 0) - - if input_cost or output_cost: - cost = (prompt_tokens * input_cost) + (completion_tokens * output_cost) - result["cost"] = cost - result["pricing"] = { - "input_cost_per_token": input_cost, - "output_cost_per_token": output_cost, - } - result["source"] = "litellm_fallback" - return result - except Exception: - pass - - result["source"] = "unknown" - result["error"] = "Pricing data not available for this model" - return result - - except HTTPException: - raise - except Exception as e: - logging.error(f"Cost estimate failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -if __name__ == "__main__": - # Define ENV_FILE for onboarding checks using centralized path - ENV_FILE = get_data_file(".env") - - # Check if launcher TUI should be shown (no arguments provided) - if len(sys.argv) == 1: - # No arguments - show launcher TUI (lazy import) - from proxy_app.launcher_tui import run_launcher_tui - - run_launcher_tui() - # Launcher modifies sys.argv and returns, or exits if user chose Exit - # If we get here, user chose "Run Proxy" and sys.argv is modified 
- # Re-parse arguments with modified sys.argv - args = parser.parse_args() - - def needs_onboarding() -> bool: - """ - Check if the proxy needs onboarding (first-time setup). - Returns True if onboarding is needed, False otherwise. - """ - # Only check if .env file exists - # PROXY_API_KEY is optional (will show warning if not set) - if not ENV_FILE.is_file(): - return True - - return False - - def show_onboarding_message(): - """Display clear explanatory message for why onboarding is needed.""" - os.system( - "cls" if os.name == "nt" else "clear" - ) # Clear terminal for clean presentation - console.print( - Panel.fit( - "[bold cyan]🚀 LLM API Key Proxy - First Time Setup[/bold cyan]", - border_style="cyan", - ) - ) - console.print("[bold yellow]⚠️ Configuration Required[/bold yellow]\n") - - console.print("The proxy needs initial configuration:") - console.print(" [red]❌ No .env file found[/red]") - - console.print("\n[bold]Why this matters:[/bold]") - console.print(" • The .env file stores your credentials and settings") - console.print(" • PROXY_API_KEY protects your proxy from unauthorized access") - console.print(" • Provider API keys enable LLM access") - - console.print("\n[bold]What happens next:[/bold]") - console.print(" 1. We'll create a .env file with PROXY_API_KEY") - console.print(" 2. You can add LLM provider credentials (API keys or OAuth)") - console.print(" 3. The proxy will then start normally") - - console.print( - "\n[bold yellow]⚠️ Note:[/bold yellow] The credential tool adds PROXY_API_KEY by default." - ) - console.print(" You can remove it later if you want an unsecured proxy.\n") - - console.input( - "[bold green]Press Enter to launch the credential setup tool...[/bold green]" - ) - - # Check if user explicitly wants to add credentials - if args.add_credential: - # Import and call ensure_env_defaults to create .env and PROXY_API_KEY if needed - from rotator_library.credential_tool import ensure_env_defaults - - ensure_env_defaults() - # Reload environment variables after ensure_env_defaults creates/updates .env - load_dotenv(ENV_FILE, override=True) - run_credential_tool() - else: - # Check if onboarding is needed - if needs_onboarding(): - # Import console from rich for better messaging - from rich.console import Console - from rich.panel import Panel - - console = Console() - - # Show clear explanatory message - show_onboarding_message() - - # Launch credential tool automatically - from rotator_library.credential_tool import ensure_env_defaults - - ensure_env_defaults() - load_dotenv(ENV_FILE, override=True) - run_credential_tool() - - # After credential tool exits, reload and re-check - load_dotenv(ENV_FILE, override=True) - # Re-read PROXY_API_KEY from environment - PROXY_API_KEY = os.getenv("PROXY_API_KEY") - - # Verify onboarding is complete - if needs_onboarding(): - console.print("\n[bold red]❌ Configuration incomplete.[/bold red]") - console.print( - "The proxy still cannot start. Please ensure PROXY_API_KEY is set in .env\n" - ) - sys.exit(1) - else: - console.print("\n[bold green]✅ Configuration complete![/bold green]") - console.print("\nStarting proxy server...\n") - - import uvicorn - - uvicorn.run(app, host=args.host, port=args.port) diff --git a/src/proxy_app/model_filter_gui.py b/src/proxy_app/model_filter_gui.py deleted file mode 100644 index 9680e24a..00000000 --- a/src/proxy_app/model_filter_gui.py +++ /dev/null @@ -1,3636 +0,0 @@ -""" -Model Filter GUI - Visual editor for model ignore/whitelist rules. 
- -A CustomTkinter application that provides a friendly interface for managing -which models are available per provider through ignore lists and whitelists. - -Features: -- Two synchronized model lists showing all fetched models and their filtered status -- Color-coded rules with visual association to affected models -- Real-time filtering preview as you type patterns -- Click interactions to highlight rule-model relationships -- Right-click context menus for quick actions -- Comprehensive help documentation -""" - -import customtkinter as ctk -from tkinter import Menu -import asyncio -import fnmatch -import platform -import threading -import os -import re -import traceback -from pathlib import Path -from dataclasses import dataclass, field -from typing import List, Dict, Tuple, Optional, Callable, Set -from dotenv import load_dotenv, set_key, unset_key - - -# ════════════════════════════════════════════════════════════════════════════════ -# CONSTANTS & CONFIGURATION -# ════════════════════════════════════════════════════════════════════════════════ - -# Window settings -WINDOW_TITLE = "Model Filter Configuration" -WINDOW_DEFAULT_SIZE = "1000x750" -WINDOW_MIN_WIDTH = 600 -WINDOW_MIN_HEIGHT = 400 - -# Color scheme (dark mode) -BG_PRIMARY = "#1a1a2e" # Main background -BG_SECONDARY = "#16213e" # Card/panel background -BG_TERTIARY = "#0f0f1a" # Input fields, lists -BG_HOVER = "#1f2b47" # Hover state -BORDER_COLOR = "#2a2a4a" # Subtle borders -TEXT_PRIMARY = "#e8e8e8" # Main text -TEXT_SECONDARY = "#a0a0a0" # Muted text -TEXT_MUTED = "#666680" # Very muted text -ACCENT_BLUE = "#4a9eff" # Primary accent -ACCENT_GREEN = "#2ecc71" # Success/normal -ACCENT_RED = "#e74c3c" # Danger/ignore -ACCENT_YELLOW = "#f1c40f" # Warning - -# Status colors -NORMAL_COLOR = "#2ecc71" # Green - models not affected by any rule -HIGHLIGHT_BG = "#2a3a5a" # Background for highlighted items - -# Ignore rules - warm color progression (reds/oranges) -IGNORE_COLORS = [ - "#e74c3c", # Bright red - "#c0392b", # Dark red - "#e67e22", # Orange - "#d35400", # Dark orange - "#f39c12", # Gold - "#e91e63", # Pink - "#ff5722", # Deep orange - "#f44336", # Material red - "#ff6b6b", # Coral - "#ff8a65", # Light deep orange -] - -# Whitelist rules - cool color progression (blues/teals) -WHITELIST_COLORS = [ - "#3498db", # Blue - "#2980b9", # Dark blue - "#1abc9c", # Teal - "#16a085", # Dark teal - "#9b59b6", # Purple - "#8e44ad", # Dark purple - "#00bcd4", # Cyan - "#2196f3", # Material blue - "#64b5f6", # Light blue - "#4dd0e1", # Light cyan -] - -# Font configuration -FONT_FAMILY = "Segoe UI" -FONT_SIZE_SMALL = 11 -FONT_SIZE_NORMAL = 12 -FONT_SIZE_LARGE = 14 -FONT_SIZE_TITLE = 16 -FONT_SIZE_HEADER = 20 - - -# ════════════════════════════════════════════════════════════════════════════════ -# CROSS-PLATFORM UTILITIES -# ════════════════════════════════════════════════════════════════════════════════ - - -def get_scroll_delta(event) -> int: - """ - Calculate scroll delta in a cross-platform manner. - - On Windows, event.delta is typically ±120 per notch. - On macOS, event.delta is typically ±1 per scroll event. - On Linux/X11, behavior varies but is usually similar to macOS. - - Returns a normalized scroll direction value (typically ±1). 
- """ - system = platform.system() - if system == "Darwin": # macOS - return -event.delta - elif system == "Linux": - # Linux with X11 typically uses ±1 like macOS - # but some configurations may use larger values - if abs(event.delta) >= 120: - return -1 * (event.delta // 120) - return -event.delta - else: # Windows - return -1 * (event.delta // 120) - - -# ════════════════════════════════════════════════════════════════════════════════ -# DATA CLASSES -# ════════════════════════════════════════════════════════════════════════════════ - - -@dataclass -class FilterRule: - """Represents a single filter rule (ignore or whitelist pattern).""" - - pattern: str - color: str - rule_type: str # 'ignore' or 'whitelist' - affected_count: int = 0 - affected_models: List[str] = field(default_factory=list) - - def __hash__(self): - return hash((self.pattern, self.rule_type)) - - def __eq__(self, other): - if not isinstance(other, FilterRule): - return False - return self.pattern == other.pattern and self.rule_type == other.rule_type - - -@dataclass -class ModelStatus: - """Status information for a single model.""" - - model_id: str - status: str # 'normal', 'ignored', 'whitelisted' - color: str - affecting_rule: Optional[FilterRule] = None - - @property - def display_name(self) -> str: - """Get the model name without provider prefix for display.""" - if "/" in self.model_id: - return self.model_id.split("/", 1)[1] - return self.model_id - - @property - def provider(self) -> str: - """Extract provider from model ID.""" - if "/" in self.model_id: - return self.model_id.split("/")[0] - return "" - - -# ════════════════════════════════════════════════════════════════════════════════ -# FILTER ENGINE -# ════════════════════════════════════════════════════════════════════════════════ - - -class FilterEngine: - """ - Core filtering logic with rule management. - - Handles pattern matching, rule storage, and status calculation. - Tracks changes for save/discard functionality. - Uses caching for performance with large model lists. 
- """ - - def __init__(self): - self.ignore_rules: List[FilterRule] = [] - self.whitelist_rules: List[FilterRule] = [] - self._ignore_color_index = 0 - self._whitelist_color_index = 0 - self._original_ignore_patterns: Set[str] = set() - self._original_whitelist_patterns: Set[str] = set() - self._current_provider: Optional[str] = None - - # Caching for performance - self._status_cache: Dict[str, ModelStatus] = {} - self._available_count_cache: Optional[Tuple[int, int]] = None - self._cache_valid: bool = False - - def _invalidate_cache(self): - """Mark cache as stale (call when rules change).""" - self._status_cache.clear() - self._available_count_cache = None - self._cache_valid = False - - def reset(self): - """Clear all rules and reset state.""" - self.ignore_rules.clear() - self.whitelist_rules.clear() - self._ignore_color_index = 0 - self._whitelist_color_index = 0 - self._original_ignore_patterns.clear() - self._original_whitelist_patterns.clear() - self._invalidate_cache() - - def _get_next_ignore_color(self) -> str: - """Get next color for ignore rules (cycles through palette).""" - color = IGNORE_COLORS[self._ignore_color_index % len(IGNORE_COLORS)] - self._ignore_color_index += 1 - return color - - def _get_next_whitelist_color(self) -> str: - """Get next color for whitelist rules (cycles through palette).""" - color = WHITELIST_COLORS[self._whitelist_color_index % len(WHITELIST_COLORS)] - self._whitelist_color_index += 1 - return color - - def add_ignore_rule(self, pattern: str) -> Optional[FilterRule]: - """Add a new ignore rule. Returns the rule if added, None if duplicate.""" - pattern = pattern.strip() - if not pattern: - return None - - # Check for duplicates - for rule in self.ignore_rules: - if rule.pattern == pattern: - return None - - rule = FilterRule( - pattern=pattern, color=self._get_next_ignore_color(), rule_type="ignore" - ) - self.ignore_rules.append(rule) - self._invalidate_cache() - return rule - - def add_whitelist_rule(self, pattern: str) -> Optional[FilterRule]: - """Add a new whitelist rule. Returns the rule if added, None if duplicate.""" - pattern = pattern.strip() - if not pattern: - return None - - # Check for duplicates - for rule in self.whitelist_rules: - if rule.pattern == pattern: - return None - - rule = FilterRule( - pattern=pattern, - color=self._get_next_whitelist_color(), - rule_type="whitelist", - ) - self.whitelist_rules.append(rule) - self._invalidate_cache() - return rule - - def remove_ignore_rule(self, pattern: str) -> bool: - """Remove an ignore rule by pattern. Returns True if removed.""" - for i, rule in enumerate(self.ignore_rules): - if rule.pattern == pattern: - self.ignore_rules.pop(i) - self._invalidate_cache() - return True - return False - - def remove_whitelist_rule(self, pattern: str) -> bool: - """Remove a whitelist rule by pattern. Returns True if removed.""" - for i, rule in enumerate(self.whitelist_rules): - if rule.pattern == pattern: - self.whitelist_rules.pop(i) - self._invalidate_cache() - return True - return False - - def _pattern_matches(self, model_id: str, pattern: str) -> bool: - """ - Check if a pattern matches a model ID. - - Supports full glob/fnmatch syntax: - - Exact match: "gpt-4" matches only "gpt-4" - - Prefix wildcard: "gpt-4*" matches "gpt-4", "gpt-4-turbo", etc. - - Suffix wildcard: "*-preview" matches "gpt-4-preview", "o1-preview", etc. - - Contains wildcard: "*-preview*" matches anything containing "-preview" - - Match all: "*" matches everything - - Single char wildcard: "gpt-?" 
matches "gpt-4", "gpt-5", etc. - - Character sets: "gpt-[45]*" matches "gpt-4*", "gpt-5*" - """ - # Extract model name without provider prefix - if "/" in model_id: - provider_model_name = model_id.split("/", 1)[1] - else: - provider_model_name = model_id - - # Use fnmatch for full glob pattern support - # Match against both the provider model name and the full model ID - return fnmatch.fnmatch(provider_model_name, pattern) or fnmatch.fnmatch( - model_id, pattern - ) - - def pattern_is_covered_by(self, new_pattern: str, existing_pattern: str) -> bool: - """ - Check if new_pattern is already covered by existing_pattern. - - A pattern A is covered by pattern B if every model that would match A - would also match B. - - Examples: - - "gpt-4" is covered by "gpt-4*" (prefix covers exact) - - "gpt-4-turbo" is covered by "gpt-4*" (prefix covers longer) - - "gpt-4*" is covered by "gpt-*" (broader prefix covers narrower) - - Anything is covered by "*" (match-all covers everything) - - "gpt-4" is covered by "gpt-4" (exact duplicate) - """ - # Exact duplicate - if new_pattern == existing_pattern: - return True - - # Existing is wildcard-all - covers everything - if existing_pattern == "*": - return True - - # If existing is a prefix wildcard - if existing_pattern.endswith("*"): - existing_prefix = existing_pattern[:-1] - - # New is exact match - check if it starts with existing prefix - if not new_pattern.endswith("*"): - return new_pattern.startswith(existing_prefix) - - # New is also a prefix wildcard - check if new prefix starts with existing - new_prefix = new_pattern[:-1] - return new_prefix.startswith(existing_prefix) - - # Existing is exact match - only covers exact duplicate (already handled) - return False - - def is_pattern_covered(self, new_pattern: str, rule_type: str) -> bool: - """ - Check if a new pattern is already covered by any existing rule of the same type. - """ - rules = self.ignore_rules if rule_type == "ignore" else self.whitelist_rules - for rule in rules: - if self.pattern_is_covered_by(new_pattern, rule.pattern): - return True - return False - - def get_covered_patterns(self, new_pattern: str, rule_type: str) -> List[str]: - """ - Get list of existing patterns that would be covered (made redundant) - by adding new_pattern. - - Used for smart merge: when adding a broader pattern, remove the - narrower patterns it covers. - """ - rules = self.ignore_rules if rule_type == "ignore" else self.whitelist_rules - covered = [] - for rule in rules: - if self.pattern_is_covered_by(rule.pattern, new_pattern): - # The existing rule would be covered by the new pattern - covered.append(rule.pattern) - return covered - - def _compute_status(self, model_id: str) -> ModelStatus: - """ - Compute the status of a model based on current rules (no caching). 
- - Priority: Whitelist > Ignore > Normal - """ - # Check whitelist first (takes priority) - for rule in self.whitelist_rules: - if self._pattern_matches(model_id, rule.pattern): - return ModelStatus( - model_id=model_id, - status="whitelisted", - color=rule.color, - affecting_rule=rule, - ) - - # Then check ignore - for rule in self.ignore_rules: - if self._pattern_matches(model_id, rule.pattern): - return ModelStatus( - model_id=model_id, - status="ignored", - color=rule.color, - affecting_rule=rule, - ) - - # Default: normal - return ModelStatus( - model_id=model_id, status="normal", color=NORMAL_COLOR, affecting_rule=None - ) - - def get_model_status(self, model_id: str) -> ModelStatus: - """Get status for a model (uses cache if available).""" - if model_id in self._status_cache: - return self._status_cache[model_id] - return self._compute_status(model_id) - - def _rebuild_cache(self, models: List[str]): - """Rebuild the entire status cache in one efficient pass.""" - self._status_cache.clear() - - # Reset rule counts - for rule in self.ignore_rules + self.whitelist_rules: - rule.affected_count = 0 - rule.affected_models = [] - - available = 0 - for model_id in models: - status = self._compute_status(model_id) - self._status_cache[model_id] = status - - if status.affecting_rule: - status.affecting_rule.affected_count += 1 - status.affecting_rule.affected_models.append(model_id) - - if status.status != "ignored": - available += 1 - - self._available_count_cache = (available, len(models)) - self._cache_valid = True - - def get_all_statuses(self, models: List[str]) -> List[ModelStatus]: - """Get status for all models (rebuilds cache if invalid).""" - if not self._cache_valid: - self._rebuild_cache(models) - return [self._status_cache.get(m, self._compute_status(m)) for m in models] - - def update_affected_counts(self, models: List[str]): - """Update the affected_count and affected_models for all rules.""" - # This now just ensures cache is valid - counts are updated in _rebuild_cache - if not self._cache_valid: - self._rebuild_cache(models) - - def get_available_count(self, models: List[str]) -> Tuple[int, int]: - """Returns (available_count, total_count) from cache.""" - if not self._cache_valid: - self._rebuild_cache(models) - return self._available_count_cache or (0, 0) - - def preview_pattern( - self, pattern: str, rule_type: str, models: List[str] - ) -> List[str]: - """ - Preview which models would be affected by a pattern without adding it. - Returns list of affected model IDs. 
- """ - affected = [] - pattern = pattern.strip() - if not pattern: - return affected - - for model_id in models: - if self._pattern_matches(model_id, pattern): - affected.append(model_id) - - return affected - - def load_from_env(self, provider: str): - """Load ignore/whitelist rules for a provider from environment.""" - self.reset() - self._current_provider = provider - load_dotenv(override=True) - - # Load ignore list - ignore_key = f"IGNORE_MODELS_{provider.upper()}" - ignore_value = os.getenv(ignore_key, "") - if ignore_value: - patterns = [p.strip() for p in ignore_value.split(",") if p.strip()] - for pattern in patterns: - self.add_ignore_rule(pattern) - self._original_ignore_patterns = set(patterns) - - # Load whitelist - whitelist_key = f"WHITELIST_MODELS_{provider.upper()}" - whitelist_value = os.getenv(whitelist_key, "") - if whitelist_value: - patterns = [p.strip() for p in whitelist_value.split(",") if p.strip()] - for pattern in patterns: - self.add_whitelist_rule(pattern) - self._original_whitelist_patterns = set(patterns) - - def save_to_env(self, provider: str) -> bool: - """ - Save current rules to .env file. - Returns True if successful. - """ - env_path = Path.cwd() / ".env" - - try: - ignore_key = f"IGNORE_MODELS_{provider.upper()}" - whitelist_key = f"WHITELIST_MODELS_{provider.upper()}" - - # Save ignore patterns - ignore_patterns = [rule.pattern for rule in self.ignore_rules] - if ignore_patterns: - set_key(str(env_path), ignore_key, ",".join(ignore_patterns)) - else: - # Remove the key if no patterns - unset_key(str(env_path), ignore_key) - - # Save whitelist patterns - whitelist_patterns = [rule.pattern for rule in self.whitelist_rules] - if whitelist_patterns: - set_key(str(env_path), whitelist_key, ",".join(whitelist_patterns)) - else: - unset_key(str(env_path), whitelist_key) - - # Update original state - self._original_ignore_patterns = set(ignore_patterns) - self._original_whitelist_patterns = set(whitelist_patterns) - - return True - except Exception as e: - print(f"Error saving to .env: {e}") - traceback.print_exc() - return False - - def has_unsaved_changes(self) -> bool: - """Check if current rules differ from saved state.""" - current_ignore = set(rule.pattern for rule in self.ignore_rules) - current_whitelist = set(rule.pattern for rule in self.whitelist_rules) - - return ( - current_ignore != self._original_ignore_patterns - or current_whitelist != self._original_whitelist_patterns - ) - - def discard_changes(self): - """Reload rules from environment, discarding unsaved changes.""" - if self._current_provider: - self.load_from_env(self._current_provider) - - -# ════════════════════════════════════════════════════════════════════════════════ -# MODEL FETCHER -# ════════════════════════════════════════════════════════════════════════════════ - -# Global cache for fetched models (persists across provider switches) -_model_cache: Dict[str, List[str]] = {} - - -class ModelFetcher: - """ - Handles async model fetching from providers. - - Runs fetching in a background thread to avoid blocking the GUI. - Includes caching to avoid refetching on every provider switch. - """ - - @staticmethod - def get_cached_models(provider: str) -> Optional[List[str]]: - """Get cached models for a provider, if available.""" - return _model_cache.get(provider) - - @staticmethod - def clear_cache(provider: Optional[str] = None): - """Clear model cache. 
If provider specified, only clear that provider.""" - if provider: - _model_cache.pop(provider, None) - else: - _model_cache.clear() - - @staticmethod - def get_available_providers() -> List[str]: - """Get list of providers that have credentials configured.""" - providers = set() - load_dotenv(override=True) - - # Scan environment for API keys (handles numbered keys like GEMINI_API_KEY_1) - for key in os.environ: - if "_API_KEY" in key and "PROXY_API_KEY" not in key: - # Extract provider: NVIDIA_NIM_API_KEY_1 -> nvidia_nim - provider = key.split("_API_KEY")[0].lower() - providers.add(provider) - - # Check for OAuth providers - oauth_dir = Path("oauth_creds") - if oauth_dir.exists(): - for file in oauth_dir.glob("*_oauth_*.json"): - provider = file.name.split("_oauth_")[0] - providers.add(provider) - - return sorted(list(providers)) - - @staticmethod - def _find_credential(provider: str) -> Optional[str]: - """Find a credential for a provider (handles numbered keys).""" - load_dotenv(override=True) - provider_upper = provider.upper() - - # Try exact match first (e.g., GEMINI_API_KEY) - exact_key = f"{provider_upper}_API_KEY" - if os.getenv(exact_key): - return os.getenv(exact_key) - - # Look for numbered keys (e.g., GEMINI_API_KEY_1, NVIDIA_NIM_API_KEY_1) - for key, value in os.environ.items(): - if key.startswith(f"{provider_upper}_API_KEY") and value: - return value - - # Check for OAuth credentials - oauth_dir = Path("oauth_creds") - if oauth_dir.exists(): - oauth_files = list(oauth_dir.glob(f"{provider}_oauth_*.json")) - if oauth_files: - return str(oauth_files[0]) - - return None - - @staticmethod - async def _fetch_models_async(provider: str) -> Tuple[List[str], Optional[str]]: - """ - Async implementation of model fetching. - Returns: (models_list, error_message_or_none) - """ - try: - import httpx - from rotator_library.providers import PROVIDER_PLUGINS - - # Get credential - credential = ModelFetcher._find_credential(provider) - if not credential: - return [], f"No credentials found for '{provider}'" - - # Get provider class - provider_class = PROVIDER_PLUGINS.get(provider.lower()) - if not provider_class: - return [], f"Unknown provider: '{provider}'" - - # Fetch models - async with httpx.AsyncClient(timeout=30.0) as client: - instance = provider_class() - models = await instance.get_models(credential, client) - return models, None - - except ImportError as e: - return [], f"Import error: {e}" - except Exception as e: - return [], f"Failed to fetch: {str(e)}" - - @staticmethod - def fetch_models( - provider: str, - on_success: Callable[[List[str]], None], - on_error: Callable[[str], None], - on_start: Optional[Callable[[], None]] = None, - force_refresh: bool = False, - ): - """ - Fetch models in a background thread. 
-
-        Args:
-            provider: Provider name (e.g., 'openai', 'gemini')
-            on_success: Callback with list of model IDs
-            on_error: Callback with error message
-            on_start: Optional callback when fetching starts
-            force_refresh: If True, bypass cache and fetch fresh
-        """
-        # Check cache first (unless force refresh)
-        if not force_refresh:
-            cached = ModelFetcher.get_cached_models(provider)
-            if cached is not None:
-                on_success(cached)
-                return
-
-        def run_fetch():
-            if on_start:
-                on_start()
-
-            try:
-                # Run async fetch in new event loop
-                loop = asyncio.new_event_loop()
-                asyncio.set_event_loop(loop)
-                try:
-                    models, error = loop.run_until_complete(
-                        ModelFetcher._fetch_models_async(provider)
-                    )
-                    # Clean up any pending tasks to avoid warnings
-                    pending = asyncio.all_tasks(loop)
-                    for task in pending:
-                        task.cancel()
-                    if pending:
-                        loop.run_until_complete(
-                            asyncio.gather(*pending, return_exceptions=True)
-                        )
-                finally:
-                    loop.run_until_complete(loop.shutdown_asyncgens())
-                    loop.close()
-
-                if error:
-                    on_error(error)
-                else:
-                    # Cache the results
-                    _model_cache[provider] = models
-                    on_success(models)
-
-            except Exception as e:
-                on_error(str(e))
-
-        thread = threading.Thread(target=run_fetch, daemon=True)
-        thread.start()
-
-
-# ════════════════════════════════════════════════════════════════════════════════
-# HELP WINDOW
-# ════════════════════════════════════════════════════════════════════════════════
-
-
-class HelpWindow(ctk.CTkToplevel):
-    """
-    Modal help popup with comprehensive filtering documentation.
-    Uses CTkTextbox for proper scrolling with dark theme styling.
-    """
-
-    def __init__(self, parent):
-        super().__init__(parent)
-
-        self.title("Help - Model Filtering")
-        self.geometry("700x600")
-        self.minsize(600, 500)
-
-        # Make modal
-        self.transient(parent)
-        self.grab_set()
-
-        # Configure appearance
-        self.configure(fg_color=BG_PRIMARY)
-
-        # Build content
-        self._create_content()
-
-        # Center on parent
-        self.update_idletasks()
-        x = parent.winfo_x() + (parent.winfo_width() - self.winfo_width()) // 2
-        y = parent.winfo_y() + (parent.winfo_height() - self.winfo_height()) // 2
-        self.geometry(f"+{x}+{y}")
-
-        # Focus
-        self.focus_force()
-
-        # Bind escape to close
-        self.bind("<Escape>", lambda e: self.destroy())
-
-    def _create_content(self):
-        """Build the help content using CTkTextbox for proper scrolling."""
-        # Main container
-        main_frame = ctk.CTkFrame(self, fg_color="transparent")
-        main_frame.pack(fill="both", expand=True, padx=20, pady=(20, 10))
-
-        # Use CTkTextbox - CustomTkinter's styled text widget with built-in scrolling
-        self.text_box = ctk.CTkTextbox(
-            main_frame,
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-            fg_color=BG_SECONDARY,
-            text_color=TEXT_SECONDARY,
-            corner_radius=8,
-            wrap="word",
-            activate_scrollbars=True,
-        )
-        self.text_box.pack(fill="both", expand=True)
-
-        # Configure text tags for formatting
-        # Access the underlying tk.Text widget for tag configuration
-        text_widget = self.text_box._textbox
-
-        text_widget.tag_configure(
-            "title",
-            font=(FONT_FAMILY, FONT_SIZE_HEADER, "bold"),
-            foreground=TEXT_PRIMARY,
-            spacing1=5,
-            spacing3=15,
-        )
-        text_widget.tag_configure(
-            "section_title",
-            font=(FONT_FAMILY, FONT_SIZE_LARGE, "bold"),
-            foreground=ACCENT_BLUE,
-            spacing1=20,
-            spacing3=8,
-        )
-        text_widget.tag_configure(
-            "separator",
-            font=(FONT_FAMILY, 6),
-            foreground=BORDER_COLOR,
-            spacing3=5,
-        )
-        text_widget.tag_configure(
-            "content",
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-            foreground=TEXT_SECONDARY,
-            spacing1=2,
-            spacing3=5,
-            lmargin1=5,
-            lmargin2=5,
-        )
-
-        # Insert content
-        self._insert_help_content()
-
-        # Make read-only by disabling
-        self.text_box.configure(state="disabled")
-
-        # Bind mouse wheel for faster scrolling on the internal canvas
-        self.text_box.bind("<MouseWheel>", self._on_mousewheel)
-        # Also bind on the textbox's internal widget
-        self.text_box._textbox.bind("<MouseWheel>", self._on_mousewheel)
-
-        # Close button at bottom
-        btn_frame = ctk.CTkFrame(self, fg_color="transparent")
-        btn_frame.pack(fill="x", padx=20, pady=(10, 15))
-
-        close_btn = ctk.CTkButton(
-            btn_frame,
-            text="Got it!",
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL, "bold"),
-            fg_color=ACCENT_BLUE,
-            hover_color="#3a8aee",
-            height=40,
-            width=120,
-            command=self.destroy,
-        )
-        close_btn.pack()
-
-    def _on_mousewheel(self, event):
-        """Handle mouse wheel with faster scrolling."""
-        # CTkTextbox uses _textbox internally
-        # Use larger scroll amount (3 units) for faster scrolling in help window
-        delta = get_scroll_delta(event) * 3
-        self.text_box._textbox.yview_scroll(delta, "units")
-        return "break"
-
-    def _insert_help_content(self):
-        """Insert all help text with formatting."""
-        # Access internal text widget for inserting with tags
-        text_widget = self.text_box._textbox
-
-        # Title
-        text_widget.insert("end", "📖 Model Filtering Guide\n", "title")
-
-        # Sections with emojis
-        sections = [
-            (
-                "🎯 Overview",
-                """Model filtering allows you to control which models are available through your proxy for each provider.
-
-• Use the IGNORE list to block specific models
-• Use the WHITELIST to ensure specific models are always available
-• Whitelist ALWAYS takes priority over Ignore""",
-            ),
-            (
-                "⚖️ Filtering Priority",
-                """When a model is checked, the following order is used:
-
-1. WHITELIST CHECK
-   If the model matches any whitelist pattern → AVAILABLE
-   (Whitelist overrides everything else)
-
-2. IGNORE CHECK
-   If the model matches any ignore pattern → BLOCKED
-
-3. DEFAULT
-   If no patterns match → AVAILABLE""",
-            ),
-            (
-                "✏️ Pattern Syntax",
-                """Full glob/wildcard patterns are supported:
-
-EXACT MATCH
-  Pattern: gpt-4
-  Matches: only "gpt-4", nothing else
-
-PREFIX WILDCARD
-  Pattern: gpt-4*
-  Matches: "gpt-4", "gpt-4-turbo", "gpt-4-preview", etc.
-
-SUFFIX WILDCARD
-  Pattern: *-preview
-  Matches: "gpt-4-preview", "o1-preview", etc.
-
-CONTAINS WILDCARD
-  Pattern: *-preview*
-  Matches: anything containing "-preview"
-
-MATCH ALL
-  Pattern: *
-  Matches: every model for this provider
-
-SINGLE CHARACTER
-  Pattern: gpt-?
-  Matches: "gpt-4", "gpt-5", etc.
(any single char) - -CHARACTER SET - Pattern: gpt-[45]* - Matches: "gpt-4", "gpt-4-turbo", "gpt-5", etc.""", - ), - ( - "💡 Common Patterns", - """BLOCK ALL, ALLOW SPECIFIC: - Ignore: * - Whitelist: gpt-4o, gpt-4o-mini - Result: Only gpt-4o and gpt-4o-mini available - -BLOCK PREVIEW MODELS: - Ignore: *-preview, *-preview* - Result: All preview variants blocked - -BLOCK SPECIFIC SERIES: - Ignore: o1*, dall-e* - Result: All o1 and DALL-E models blocked - -ALLOW ONLY LATEST: - Ignore: * - Whitelist: *-latest - Result: Only models ending in "-latest" available""", - ), - ( - "🖱️ Interface Guide", - """PROVIDER DROPDOWN - Select which provider to configure - -MODEL LISTS - • Left list: All fetched models (unfiltered) - • Right list: Same models with colored status - • Green = Available (normal) - • Red/Orange tones = Blocked (ignored) - • Blue/Teal tones = Whitelisted - -SEARCH BOX - Filter both lists to find specific models quickly - -CLICKING MODELS - • Left-click: Highlight the rule affecting this model - • Right-click: Context menu with quick actions - -CLICKING RULES - • Highlights all models affected by that rule - • Shows which models will be blocked/allowed - -RULE INPUT (Merge Mode) - • Enter patterns separated by commas - • Only adds patterns not covered by existing rules - • Press Add or Enter to create rules - -IMPORT BUTTON (Replace Mode) - • Replaces ALL existing rules with imported ones - • Paste comma-separated patterns - -DELETE RULES - • Click the × button on any rule to remove it""", - ), - ( - "⌨️ Keyboard Shortcuts", - """Ctrl+S Save changes -Ctrl+R Refresh models from provider -Ctrl+F Focus search box -F1 Open this help window -Escape Clear search / Close dialogs""", - ), - ( - "💾 Saving Changes", - """Changes are saved to your .env file in this format: - - IGNORE_MODELS_OPENAI=pattern1,pattern2* - WHITELIST_MODELS_OPENAI=specific-model - -Click "Save" to persist changes, or "Discard" to revert. 
-
-Closing the window with unsaved changes will prompt you.""",
-            ),
-        ]
-
-        for section_title, content in sections:
-            text_widget.insert("end", f"\n{section_title}\n", "section_title")
-            text_widget.insert("end", "─" * 50 + "\n", "separator")
-            text_widget.insert("end", content.strip() + "\n", "content")
-
-
-# ════════════════════════════════════════════════════════════════════════════════
-# CUSTOM DIALOG
-# ════════════════════════════════════════════════════════════════════════════════
-
-
-class UnsavedChangesDialog(ctk.CTkToplevel):
-    """Modal dialog for unsaved changes confirmation."""
-
-    def __init__(self, parent):
-        super().__init__(parent)
-
-        self.result: Optional[str] = None  # 'save', 'discard', 'cancel'
-
-        self.title("Unsaved Changes")
-        self.geometry("400x180")
-        self.resizable(False, False)
-
-        # Make modal
-        self.transient(parent)
-        self.grab_set()
-
-        # Configure appearance
-        self.configure(fg_color=BG_PRIMARY)
-
-        # Build content
-        self._create_content()
-
-        # Center on parent
-        self.update_idletasks()
-        x = parent.winfo_x() + (parent.winfo_width() - self.winfo_width()) // 2
-        y = parent.winfo_y() + (parent.winfo_height() - self.winfo_height()) // 2
-        self.geometry(f"+{x}+{y}")
-
-        # Focus
-        self.focus_force()
-
-        # Bind escape to cancel
-        self.bind("<Escape>", lambda e: self._on_cancel())
-
-        # Handle window close
-        self.protocol("WM_DELETE_WINDOW", self._on_cancel)
-
-    def _create_content(self):
-        """Build dialog content."""
-        # Icon and message
-        msg_frame = ctk.CTkFrame(self, fg_color="transparent")
-        msg_frame.pack(fill="x", padx=30, pady=(25, 15))
-
-        icon = ctk.CTkLabel(
-            msg_frame, text="⚠️", font=(FONT_FAMILY, 32), text_color=ACCENT_YELLOW
-        )
-        icon.pack(side="left", padx=(0, 15))
-
-        text_frame = ctk.CTkFrame(msg_frame, fg_color="transparent")
-        text_frame.pack(side="left", fill="x", expand=True)
-
-        title = ctk.CTkLabel(
-            text_frame,
-            text="Unsaved Changes",
-            font=(FONT_FAMILY, FONT_SIZE_LARGE, "bold"),
-            text_color=TEXT_PRIMARY,
-            anchor="w",
-        )
-        title.pack(anchor="w")
-
-        subtitle = ctk.CTkLabel(
-            text_frame,
-            text="You have unsaved filter changes.\nWhat would you like to do?",
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-            text_color=TEXT_SECONDARY,
-            anchor="w",
-            justify="left",
-        )
-        subtitle.pack(anchor="w")
-
-        # Buttons
-        btn_frame = ctk.CTkFrame(self, fg_color="transparent")
-        btn_frame.pack(fill="x", padx=30, pady=(10, 25))
-
-        cancel_btn = ctk.CTkButton(
-            btn_frame,
-            text="Cancel",
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-            fg_color=BG_SECONDARY,
-            hover_color=BG_HOVER,
-            border_width=1,
-            border_color=BORDER_COLOR,
-            width=100,
-            command=self._on_cancel,
-        )
-        cancel_btn.pack(side="right", padx=(10, 0))
-
-        discard_btn = ctk.CTkButton(
-            btn_frame,
-            text="Discard",
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-            fg_color=ACCENT_RED,
-            hover_color="#c0392b",
-            width=100,
-            command=self._on_discard,
-        )
-        discard_btn.pack(side="right", padx=(10, 0))
-
-        save_btn = ctk.CTkButton(
-            btn_frame,
-            text="Save",
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-            fg_color=ACCENT_GREEN,
-            hover_color="#27ae60",
-            width=100,
-            command=self._on_save,
-        )
-        save_btn.pack(side="right")
-
-    def _on_save(self):
-        self.result = "save"
-        self.destroy()
-
-    def _on_discard(self):
-        self.result = "discard"
-        self.destroy()
-
-    def _on_cancel(self):
-        self.result = "cancel"
-        self.destroy()
-
-    def show(self) -> Optional[str]:
-        """Show dialog and return result."""
-        self.wait_window()
-        return self.result
-
-
-class ImportRulesDialog(ctk.CTkToplevel):
-    """Modal dialog for importing rules from comma-separated text."""
-
-    def __init__(self, parent, rule_type: str):
-        super().__init__(parent)
-
-        self.result: Optional[List[str]] = None
-        self.rule_type = rule_type
-
-        title_text = (
-            "Import Ignore Rules" if rule_type == "ignore" else "Import Whitelist Rules"
-        )
-        self.title(title_text)
-        self.geometry("500x300")
-        self.minsize(400, 250)
-
-        # Make modal
-        self.transient(parent)
-        self.grab_set()
-
-        # Configure appearance
-        self.configure(fg_color=BG_PRIMARY)
-
-        # Build content
-        self._create_content()
-
-        # Center on parent
-        self.update_idletasks()
-        x = parent.winfo_x() + (parent.winfo_width() - self.winfo_width()) // 2
-        y = parent.winfo_y() + (parent.winfo_height() - self.winfo_height()) // 2
-        self.geometry(f"+{x}+{y}")
-
-        # Focus
-        self.focus_force()
-        self.text_box.focus_set()
-
-        # Bind escape to cancel
-        self.bind("<Escape>", lambda e: self._on_cancel())
-
-        # Handle window close
-        self.protocol("WM_DELETE_WINDOW", self._on_cancel)
-
-    def _create_content(self):
-        """Build dialog content."""
-        # Instructions at TOP
-        instruction_frame = ctk.CTkFrame(self, fg_color="transparent")
-        instruction_frame.pack(fill="x", padx=20, pady=(15, 10))
-
-        instruction = ctk.CTkLabel(
-            instruction_frame,
-            text="Paste comma-separated patterns below (will REPLACE all existing rules):",
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-            text_color=TEXT_PRIMARY,
-            anchor="w",
-        )
-        instruction.pack(anchor="w")
-
-        example = ctk.CTkLabel(
-            instruction_frame,
-            text="Example: gpt-4*, claude-3*, model-name",
-            font=(FONT_FAMILY, FONT_SIZE_SMALL),
-            text_color=TEXT_MUTED,
-            anchor="w",
-        )
-        example.pack(anchor="w")
-
-        # Buttons at BOTTOM - pack BEFORE textbox to reserve space
-        btn_frame = ctk.CTkFrame(self, fg_color="transparent", height=50)
-        btn_frame.pack(side="bottom", fill="x", padx=20, pady=(10, 15))
-        btn_frame.pack_propagate(False)
-
-        cancel_btn = ctk.CTkButton(
-            btn_frame,
-            text="Cancel",
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-            fg_color=BG_SECONDARY,
-            hover_color=BG_HOVER,
-            border_width=1,
-            border_color=BORDER_COLOR,
-            width=100,
-            height=32,
-            command=self._on_cancel,
-        )
-        cancel_btn.pack(side="right", padx=(10, 0))
-
-        import_btn = ctk.CTkButton(
-            btn_frame,
-            text="Replace All",
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL, "bold"),
-            fg_color=ACCENT_BLUE,
-            hover_color="#3a8aee",
-            width=110,
-            height=32,
-            command=self._on_import,
-        )
-        import_btn.pack(side="right")
-
-        # Text box fills MIDDLE space - pack LAST
-        self.text_box = ctk.CTkTextbox(
-            self,
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-            fg_color=BG_TERTIARY,
-            border_color=BORDER_COLOR,
-            border_width=1,
-            text_color=TEXT_PRIMARY,
-            wrap="word",
-        )
-        self.text_box.pack(fill="both", expand=True, padx=20, pady=(0, 0))
-
-        # Bind Ctrl+Enter to import
-        self.text_box.bind("<Control-Return>", lambda e: self._on_import())
-
-    def _on_import(self):
-        """Parse and return the patterns."""
-        text = self.text_box.get("1.0", "end").strip()
-        if text:
-            # Parse comma-separated patterns
-            patterns = [p.strip() for p in text.split(",") if p.strip()]
-            self.result = patterns
-        else:
-            self.result = []
-        self.destroy()
-
-    def _on_cancel(self):
-        self.result = None
-        self.destroy()
-
-    def show(self) -> Optional[List[str]]:
-        """Show dialog and return list of patterns, or None if cancelled."""
-        self.wait_window()
-        return self.result
-
-
-class ImportResultDialog(ctk.CTkToplevel):
-    """Simple dialog showing import results."""
-
-    def __init__(self, parent, added: int, skipped: int, is_replace: bool = False):
-        super().__init__(parent)
-
-        self.title("Import Complete")
-        self.geometry("380x160")
-        self.resizable(False, False)
-
-        # Make modal
-        self.transient(parent)
-        self.grab_set()
-
-        # Configure appearance
-        self.configure(fg_color=BG_PRIMARY)
-
-        # Build content
-        self._create_content(added, skipped, is_replace)
-
-        # Center on parent
-        self.update_idletasks()
-        x = parent.winfo_x() + (parent.winfo_width() - self.winfo_width()) // 2
-        y = parent.winfo_y() + (parent.winfo_height() - self.winfo_height()) // 2
-        self.geometry(f"+{x}+{y}")
-
-        # Focus
-        self.focus_force()
-
-        # Bind escape and enter to close
-        self.bind("<Escape>", lambda e: self.destroy())
-        self.bind("<Return>", lambda e: self.destroy())
-
-    def _create_content(self, added: int, skipped: int, is_replace: bool):
-        """Build dialog content."""
-        # Icon and message
-        msg_frame = ctk.CTkFrame(self, fg_color="transparent")
-        msg_frame.pack(fill="x", padx=30, pady=(25, 15))
-
-        icon = ctk.CTkLabel(
-            msg_frame,
-            text="✅" if added > 0 else "ℹ️",
-            font=(FONT_FAMILY, 28),
-            text_color=ACCENT_GREEN if added > 0 else ACCENT_BLUE,
-        )
-        icon.pack(side="left", padx=(0, 15))
-
-        text_frame = ctk.CTkFrame(msg_frame, fg_color="transparent")
-        text_frame.pack(side="left", fill="x", expand=True)
-
-        # Title text differs based on mode
-        if is_replace:
-            if added > 0:
-                added_text = f"Replaced with {added} rule{'s' if added != 1 else ''}"
-            else:
-                added_text = "All rules cleared"
-        else:
-            if added > 0:
-                added_text = f"Added {added} rule{'s' if added != 1 else ''}"
-            else:
-                added_text = "No new rules added"
-
-        title = ctk.CTkLabel(
-            text_frame,
-            text=added_text,
-            font=(FONT_FAMILY, FONT_SIZE_LARGE, "bold"),
-            text_color=TEXT_PRIMARY,
-            anchor="w",
-        )
-        title.pack(anchor="w")
-
-        # Subtitle for skipped/duplicates
-        if skipped > 0:
-            skip_text = f"{skipped} duplicate{'s' if skipped != 1 else ''} skipped"
-            subtitle = ctk.CTkLabel(
-                text_frame,
-                text=skip_text,
-                font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-                text_color=TEXT_MUTED,
-                anchor="w",
-            )
-            subtitle.pack(anchor="w")
-
-        # OK button
-        btn_frame = ctk.CTkFrame(self, fg_color="transparent")
-        btn_frame.pack(fill="x", padx=30, pady=(0, 20))
-
-        ok_btn = ctk.CTkButton(
-            btn_frame,
-            text="OK",
-            font=(FONT_FAMILY, FONT_SIZE_NORMAL),
-            fg_color=ACCENT_BLUE,
-            hover_color="#3a8aee",
-            width=80,
-            command=self.destroy,
-        )
-        ok_btn.pack(side="right")
-
-
-# ════════════════════════════════════════════════════════════════════════════════
-# TOOLTIP
-# ════════════════════════════════════════════════════════════════════════════════
-
-
-class ToolTip:
-    """Simple tooltip implementation for CustomTkinter widgets."""
-
-    def __init__(self, widget, text: str, delay: int = 500):
-        self.widget = widget
-        self.text = text
-        self.delay = delay
-        self.tooltip_window = None
-        self.after_id = None
-
-        widget.bind("<Enter>", self._schedule_show)
-        widget.bind("<Leave>", self._hide)
-        widget.bind("