diff --git a/src/rotator_library/providers/gemini_auth_base.py b/src/rotator_library/providers/gemini_auth_base.py
index 64428935..45d52bc3 100644
--- a/src/rotator_library/providers/gemini_auth_base.py
+++ b/src/rotator_library/providers/gemini_auth_base.py
@@ -14,12 +14,34 @@
 lib_logger = logging.getLogger("rotator_library")

-# Headers for Gemini CLI auth/discovery calls
-# Uses KV string format for Client-Metadata (different from Antigravity's JSON format)
+# Headers for Gemini CLI auth/discovery calls (loadCodeAssist, onboardUser, etc.)
+#
+# For the OAuth/Code Assist path, native gemini-cli only sends:
+#   - Content-Type: application/json
+#   - Authorization: Bearer
+#   - User-Agent: GeminiCLI/${version} (${platform}; ${arch})
+#
+# Headers NOT sent by the native CLI (confirmed via explore agent analysis of server.ts):
+#   - X-Goog-Api-Client: not used in the Code Assist path
+#   - Client-Metadata: sent in the REQUEST BODY for these endpoints, not as an HTTP header
+#
+# Note: the commented-out headers below previously worked well for SDK fingerprinting.
+# Uncomment them if you want to try SDK mimicry for potential rate-limit benefits.
+#
+# Source: gemini-cli/packages/core/src/code_assist/server.ts:284-290
 GEMINI_CLI_AUTH_HEADERS = {
-    "User-Agent": "google-api-nodejs-client/9.15.1",
-    "X-Goog-Api-Client": "gl-node/22.17.0",
-    "Client-Metadata": "ideType=IDE_UNSPECIFIED,platform=PLATFORM_UNSPECIFIED,pluginType=GEMINI",
+    "User-Agent": "GeminiCLI/0.26.0 (win32; x64)",
+    # -------------------------------------------------------------------------
+    # COMMENTED OUT - Not sent by native gemini-cli for the OAuth/Code Assist path
+    # -------------------------------------------------------------------------
+    # "X-Goog-Api-Client": "gl-node/22.17.0 gdcl/1.30.0",  # SDK mimicry - not used by native CLI
+    # "Client-Metadata": (  # Sent in body, not as header
+    #     "ideType=IDE_UNSPECIFIED,"
+    #     "pluginType=GEMINI,"
+    #     "ideVersion=0.26.0,"
+    #     "platform=WINDOWS_AMD64,"
+    #     "updateChannel=stable"
+    # ),
 }
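For illustration, a minimal standalone sketch of how this native-style User-Agent could be assembled dynamically instead of hardcoded. The helper name and the 0.26.0 version pin are assumptions, and the platform/arch mappings are best-effort guesses at the values the native CLI reports:

# Illustrative sketch (not part of the patch): building the native-style
# User-Agent string from the local environment. The helper name and the
# "0.26.0" pin are assumptions; the mappings are best-effort guesses.
import platform

def build_gemini_cli_user_agent(version: str = "0.26.0") -> str:
    system_map = {"windows": "win32", "darwin": "darwin", "linux": "linux"}
    arch_map = {"amd64": "x64", "x86_64": "x64", "arm64": "arm64"}
    system = system_map.get(platform.system().lower(), platform.system().lower())
    arch = arch_map.get(platform.machine().lower(), platform.machine().lower())
    # Native format: GeminiCLI/${version} (${platform}; ${arch})
    return f"GeminiCLI/{version} ({system}; {arch})"

print(build_gemini_cli_user_agent())  # e.g. "GeminiCLI/0.26.0 (win32; x64)"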
+ """ + return secrets.token_hex(7) # 14 hex characters + + def _generate_stable_session_id(self, contents: List[Dict[str, Any]]) -> str: + """ + Generate a stable session ID based on the first user message. + + This ensures: + - Same conversation = same session_id (even across server restarts) + - Different conversations = different session_ids + - Multi-user scenarios are properly isolated + + Uses SHA256 hash of the first user message to create a deterministic + UUID-formatted session ID. Falls back to random UUID if no user message. + + This approach mirrors Antigravity's _generate_stable_session_id() but + uses UUID format instead of the -{number} format to match native + gemini-cli's crypto.randomUUID() output format. + + Args: + contents: List of message contents in Gemini format + + Returns: + UUID-formatted session ID string + """ + # Find first user message text + for content in contents: + if content.get("role") == "user": + parts = content.get("parts", []) + for part in parts: + if isinstance(part, dict): + text = part.get("text", "") + if text: + # SHA256 hash and use first 16 bytes to create UUID + h = hashlib.sha256(text.encode("utf-8")).digest() + # Format as UUID (8-4-4-4-12 hex chars) + return f"{h[:4].hex()}-{h[4:6].hex()}-{h[6:8].hex()}-{h[8:10].hex()}-{h[10:16].hex()}" + + # Fallback to random UUID if no user message found + return str(uuid.uuid4()) + + def _get_gemini_cli_request_headers(self, model: str) -> Dict[str, str]: + """ + Build request headers matching native gemini-cli client. + + For the OAuth/Code Assist path, native gemini-cli only sends: + - Content-Type: application/json (handled by httpx) + - Authorization: Bearer (handled by auth_header) + - User-Agent: GeminiCLI/${version}/${model} (${platform}; ${arch}) + + Headers NOT sent by native CLI (confirmed via explore agent analysis): + - X-Goog-Api-Client: Not used in Code Assist path (only in SDK/API key path) + - Client-Metadata: Not sent as HTTP header (only in request body for management endpoints) + - X-Goog-User-Project: Only used in MCP path, causes 403 errors in Code Assist + + Source: gemini-cli/packages/core/src/code_assist/server.ts:332 + Source: gemini-cli/packages/core/src/core/contentGenerator.ts:129 + """ + model_name = model.split("/")[-1].replace(":thinking", "") + + # Hardcoded to Windows x64 platform (matching common development environment) + # Native format: GeminiCLI/${version}/${model} (${platform}; ${arch}) + user_agent = f"GeminiCLI/0.26.0/{model_name} (win32; x64)" + + # ========================================================================= + # COMMENTED OUT HEADERS - Not sent by native gemini-cli for Code Assist path + # Keeping these for reference as they worked well for SDK mimicry. + # Uncomment if rate limiting issues arise and you want to try SDK fingerprinting. + # ========================================================================= + + # X-Goog-Api-Client: Mimics @google/genai SDK but native CLI doesn't send this + # for OAuth/Code Assist path (only set when using API key authentication) + # x_goog_api_client = "gl-node/22.17.0 gdcl/1.30.0" + + # Client-Metadata: Native CLI sends this in REQUEST BODY for management endpoints + # (loadCodeAssist, onboardUser, listExperiments, recordCodeAssistMetrics) + # but NOT as an HTTP header for generateContent requests. 
@@ -1336,12 +1438,20 @@ async def do_call(attempt_model: str, is_fallback: bool = False):
             # Fix tool response grouping (handles ID mismatches, missing responses)
             contents = self._fix_tool_response_grouping(contents)

+            # Generate a unique prompt ID for this request (matches native gemini-cli)
+            # Source: gemini-cli/packages/cli/src/gemini.tsx line 668
+            user_prompt_id = self._generate_user_prompt_id()
+
+            # Build the payload matching the native gemini-cli structure
+            # Source: gemini-cli/packages/core/src/code_assist/converter.ts lines 31-48
             request_payload = {
                 "model": model_name,
                 "project": project_id,
+                "user_prompt_id": user_prompt_id,
                 "request": {
                     "contents": contents,
                     "generationConfig": gen_config,
+                    "session_id": self._generate_stable_session_id(contents),
                 },
             }
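For reference, a sketch of the wire shape this produces; the structure follows converter.ts as cited above, while every field value below is an invented sample:

# Illustrative example (not part of the patch) of the resulting payload shape.
example_payload = {
    "model": "gemini-2.5-pro",              # hypothetical model name
    "project": "my-gcp-project",            # hypothetical project ID
    "user_prompt_id": "a3f9c21d4b7e08",     # 14 hex chars from token_hex(7)
    "request": {
        "contents": [{"role": "user", "parts": [{"text": "Hello"}]}],
        "generationConfig": {"temperature": 0.7},
        "session_id": "0f3a9b2c-1d4e-5f60-7a8b-9c0d1e2f3a4b",  # hash-derived UUID
    },
}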
@@ -1386,8 +1496,6 @@ async def do_call(attempt_model: str, is_fallback: bool = False):
             # lib_logger.debug(f"Gemini CLI Request Payload: {json.dumps(request_payload, indent=2)}")
             file_logger.log_request(request_payload)

-            url = f"{CODE_ASSIST_ENDPOINT}:streamGenerateContent"
-
             async def stream_handler():
                 # Track state across chunks for tool indexing
                 accumulator = {
@@ -1396,119 +1504,165 @@ async def stream_handler():
                     "is_complete": False,
                 }

+                # Build headers matching the native gemini-cli client fingerprint
                 final_headers = auth_header.copy()
-                final_headers.update(
-                    {
-                        "User-Agent": "google-api-nodejs-client/9.15.1",
-                        "X-Goog-Api-Client": "gl-node/22.17.0",
-                        "Client-Metadata": "ideType=IDE_UNSPECIFIED,platform=PLATFORM_UNSPECIFIED,pluginType=GEMINI",
-                        "Accept": "application/json",
-                    }
-                )
-                try:
-                    async with client.stream(
-                        "POST",
-                        url,
-                        headers=final_headers,
-                        json=request_payload,
-                        params={"alt": "sse"},
-                        timeout=TimeoutConfig.streaming(),
-                    ) as response:
-                        # Read and log error body before raise_for_status for better debugging
-                        if response.status_code >= 400:
+                final_headers.update(self._get_gemini_cli_request_headers(model_name))
+
+                # Endpoint fallback loop: try the sandbox first, then production.
+                # This mirrors the opencode-antigravity-auth plugin behavior.
+                last_endpoint_error = None
+                for endpoint_idx, base_endpoint in enumerate(
+                    GEMINI_CLI_ENDPOINT_FALLBACKS
+                ):
+                    url = f"{base_endpoint}:streamGenerateContent"
+                    is_fallback = endpoint_idx > 0
+
+                    if is_fallback:
+                        lib_logger.debug(
+                            f"Endpoint fallback: trying {base_endpoint} after previous endpoint failed"
+                        )
+
+                    try:
+                        async with client.stream(
+                            "POST",
+                            url,
+                            headers=final_headers,
+                            json=request_payload,
+                            params={"alt": "sse"},
+                            timeout=TimeoutConfig.streaming(),
+                        ) as response:
+                            # Read and log the error body before raise_for_status for better debugging
+                            if response.status_code >= 400:
+                                try:
+                                    error_body = await response.aread()
+                                    lib_logger.error(
+                                        f"Gemini CLI API error {response.status_code}: {error_body.decode()}"
+                                    )
+                                    file_logger.log_error(
+                                        f"API error {response.status_code}: {error_body.decode()}"
+                                    )
+                                except Exception:
+                                    pass
+
+                            # This will raise an HTTPStatusError for 4xx/5xx responses
+                            response.raise_for_status()
+
+                            async for line in response.aiter_lines():
+                                file_logger.log_response_chunk(line)
+                                if line.startswith("data: "):
+                                    data_str = line[6:]
+                                    if data_str == "[DONE]":
+                                        break
+                                    try:
+                                        chunk = json.loads(data_str)
+                                        for (
+                                            openai_chunk
+                                        ) in self._convert_chunk_to_openai(
+                                            chunk, model, accumulator
+                                        ):
+                                            yield litellm.ModelResponse(**openai_chunk)
+                                    except json.JSONDecodeError:
+                                        lib_logger.warning(
+                                            f"Could not decode JSON from Gemini CLI: {line}"
+                                        )

+                            # Emit a final chunk if the stream ended without usageMetadata;
+                            # the client will determine the correct finish_reason
+                            if not accumulator.get("is_complete"):
+                                final_chunk = {
+                                    "id": f"chatcmpl-geminicli-{time.time()}",
+                                    "object": "chat.completion.chunk",
+                                    "created": int(time.time()),
+                                    "model": model,
+                                    "choices": [
+                                        {"index": 0, "delta": {}, "finish_reason": None}
+                                    ],
+                                    # Include minimal usage to signal this is the final chunk
+                                    "usage": {
+                                        "prompt_tokens": 0,
+                                        "completion_tokens": 1,
+                                        "total_tokens": 1,
+                                    },
+                                }
+                                yield litellm.ModelResponse(**final_chunk)
+
+                        # Success - exit the endpoint fallback loop
+                        return
+
+                    except httpx.HTTPStatusError as e:
+                        error_body = None
+                        if e.response is not None:
                             try:
-                                error_body = await response.aread()
-                                lib_logger.error(
-                                    f"Gemini CLI API error {response.status_code}: {error_body.decode()}"
-                                )
-                                file_logger.log_error(
-                                    f"API error {response.status_code}: {error_body.decode()}"
-                                )
+                                error_body = e.response.text
                             except Exception:
                                 pass

-                        # This will raise an HTTPStatusError for 4xx/5xx responses
-                        response.raise_for_status()
+                        # Only log to the file logger (for detailed logging)
+                        if error_body:
+                            file_logger.log_error(
+                                f"HTTPStatusError {e.response.status_code}: {error_body}"
+                            )
+                        else:
+                            file_logger.log_error(
+                                f"HTTPStatusError {e.response.status_code}: {str(e)}"
+                            )

-                        async for line in response.aiter_lines():
-                            file_logger.log_response_chunk(line)
-                            if line.startswith("data: "):
-                                data_str = line[6:]
-                                if data_str == "[DONE]":
-                                    break
-                                try:
-                                    chunk = json.loads(data_str)
-                                    for openai_chunk in self._convert_chunk_to_openai(
-                                        chunk, model, accumulator
-                                    ):
-                                        yield litellm.ModelResponse(**openai_chunk)
-                                except json.JSONDecodeError:
-                                    lib_logger.warning(
-                                        f"Could not decode JSON from Gemini CLI: {line}"
-                                    )
+                        # 429 rate limit - don't fall back to the next endpoint; let the rotator handle it
+                        if e.response.status_code == 429:
+                            # Extract the retry-after time from the error body
+                            retry_after = extract_retry_after_from_body(error_body)
+                            retry_info = (
+                                f" (retry after {retry_after}s)" if retry_after else ""
+                            )
+                            error_msg = f"Gemini CLI rate limit exceeded{retry_info}"
+                            if error_body:
+                                error_msg = f"{error_msg} | {error_body}"
+                            # Only log at debug level - rotation happens silently
+                            lib_logger.debug(
+                                f"Gemini CLI 429 rate limit: retry_after={retry_after}s"
+                            )
+                            raise RateLimitError(
+                                message=error_msg,
+                                llm_provider="gemini_cli",
+                                model=model,
+                                response=e.response,
+                            )

-                        # Emit final chunk if stream ended without usageMetadata
-                        # Client will determine the correct finish_reason
-                        if not accumulator.get("is_complete"):
-                            final_chunk = {
-                                "id": f"chatcmpl-geminicli-{time.time()}",
-                                "object": "chat.completion.chunk",
-                                "created": int(time.time()),
-                                "model": model,
-                                "choices": [
-                                    {"index": 0, "delta": {}, "finish_reason": None}
-                                ],
-                                # Include minimal usage to signal this is the final chunk
-                                "usage": {
-                                    "prompt_tokens": 0,
-                                    "completion_tokens": 1,
-                                    "total_tokens": 1,
-                                },
-                            }
-                            yield litellm.ModelResponse(**final_chunk)
-
-                except httpx.HTTPStatusError as e:
-                    error_body = None
-                    if e.response is not None:
-                        try:
-                            error_body = e.response.text
-                        except Exception:
-                            pass
-
-                    # Only log to file logger (for detailed logging)
-                    if error_body:
-                        file_logger.log_error(
-                            f"HTTPStatusError {e.response.status_code}: {error_body}"
-                        )
-                    else:
+                        # 5xx server errors - try the next endpoint if available
+                        if e.response.status_code >= 500:
+                            last_endpoint_error = e
+                            if endpoint_idx < len(GEMINI_CLI_ENDPOINT_FALLBACKS) - 1:
+                                lib_logger.warning(
+                                    f"Endpoint {base_endpoint} returned {e.response.status_code}, trying fallback"
+                                )
+                                continue
+                            # No more endpoints to try
+                            raise e
+
+                        # Other 4xx errors - don't fall back; re-raise
+                        raise e
+
+                    except (httpx.ConnectError, httpx.TimeoutException) as e:
+                        # Connection/timeout errors - try the next endpoint if available
+                        last_endpoint_error = e
                         file_logger.log_error(
-                            f"HTTPStatusError {e.response.status_code}: {str(e)}"
+                            f"Connection error to {base_endpoint}: {str(e)}"
                         )
+                        if endpoint_idx < len(GEMINI_CLI_ENDPOINT_FALLBACKS) - 1:
+                            lib_logger.warning(
+                                f"Connection error to {base_endpoint}, trying fallback endpoint"
+                            )
+                            continue
+                        # No more endpoints to try
+                        raise e

-                    if e.response.status_code == 429:
-                        # Extract retry-after time from the error body
-                        retry_after = extract_retry_after_from_body(error_body)
-                        retry_info = (
-                            f" (retry after {retry_after}s)" if retry_after else ""
-                        )
-                        error_msg = f"Gemini CLI rate limit exceeded{retry_info}"
-                        if error_body:
-                            error_msg = f"{error_msg} | {error_body}"
-                        # Only log at debug level - rotation happens silently
-                        lib_logger.debug(
-                            f"Gemini CLI 429 rate limit: retry_after={retry_after}s"
-                        )
-                        raise RateLimitError(
-                            message=error_msg,
-                            llm_provider="gemini_cli",
-                            model=model,
-                            response=e.response,
-                        )
-                    # Re-raise other status errors to be handled by the main acompletion logic
-                    raise e
-                except Exception as e:
-                    file_logger.log_error(f"Stream handler exception: {str(e)}")
-                    raise
+                    except Exception as e:
+                        file_logger.log_error(f"Stream handler exception: {str(e)}")
+                        raise
+
+                # If we get here, all endpoints failed (shouldn't happen due to the raise in the loop)
+                if last_endpoint_error:
+                    raise last_endpoint_error

             async def logging_stream_wrapper():
                 """Wraps the stream to log the final reassembled response."""
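The stream handler above consumes Google's server-sent-event framing. A minimal standalone sketch of that framing, with an invented sample chunk:

# Illustrative sketch (not part of the patch): events arrive as "data: <json>"
# lines, "[DONE]" terminates the stream, and anything else (keep-alives,
# blank lines) is skipped.
import json

def parse_sse_lines(lines):
    for line in lines:
        if not line.startswith("data: "):
            continue
        data_str = line[len("data: "):]
        if data_str == "[DONE]":
            return
        yield json.loads(data_str)

sample = [
    'data: {"candidates": [{"content": {"parts": [{"text": "Hi"}]}}]}',
    "data: [DONE]",
]
for chunk in parse_sse_lines(sample):
    print(chunk["candidates"][0]["content"]["parts"][0]["text"])  # -> Hi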
@@ -1626,10 +1780,14 @@ async def count_tokens(
         # Fix tool response grouping (handles ID mismatches, missing responses)
         contents = self._fix_tool_response_grouping(contents)

-        # Build request payload
+        # Build the request payload matching the native gemini-cli structure
         request_payload = {
+            "model": model_name,
+            "project": project_id,
+            "user_prompt_id": self._generate_user_prompt_id(),
             "request": {
                 "contents": contents,
+                "session_id": self._generate_stable_session_id(contents),
             },
         }
@@ -1643,37 +1801,54 @@ async def count_tokens(
                 {"functionDeclarations": function_declarations}
             ]

-        # Make the request
-        url = f"{CODE_ASSIST_ENDPOINT}:countTokens"
+        # Build headers matching the native gemini-cli client fingerprint
         headers = auth_header.copy()
-        headers.update(
-            {
-                "User-Agent": "google-api-nodejs-client/9.15.1",
-                "X-Goog-Api-Client": "gl-node/22.17.0",
-                "Client-Metadata": "ideType=IDE_UNSPECIFIED,platform=PLATFORM_UNSPECIFIED,pluginType=GEMINI",
-                "Accept": "application/json",
-            }
-        )
+        headers.update(self._get_gemini_cli_request_headers(model_name))

-        try:
-            response = await client.post(
-                url, headers=headers, json=request_payload, timeout=30
-            )
-            response.raise_for_status()
-            data = response.json()
+        # Endpoint fallback loop: try the sandbox first, then production
+        for endpoint_idx, base_endpoint in enumerate(GEMINI_CLI_ENDPOINT_FALLBACKS):
+            url = f"{base_endpoint}:countTokens"
+            try:
+                response = await client.post(
+                    url, headers=headers, json=request_payload, timeout=30
+                )
+                response.raise_for_status()
+                data = response.json()

-            # Extract token counts from response
-            total_tokens = data.get("totalTokens", 0)
+                # Extract token counts from the response
+                total_tokens = data.get("totalTokens", 0)

-            return {
-                "prompt_tokens": total_tokens,
-                "total_tokens": total_tokens,
-            }
+                return {
+                    "prompt_tokens": total_tokens,
+                    "total_tokens": total_tokens,
+                }
+
+            except httpx.HTTPStatusError as e:
+                # 5xx errors - try the next endpoint if available
+                if (
+                    e.response.status_code >= 500
+                    and endpoint_idx < len(GEMINI_CLI_ENDPOINT_FALLBACKS) - 1
+                ):
+                    lib_logger.warning(
+                        f"countTokens: endpoint {base_endpoint} returned {e.response.status_code}, trying fallback"
+                    )
+                    continue
+                lib_logger.error(f"Failed to count tokens: {e}")
+                # Return 0 on error rather than raising
+                return {"prompt_tokens": 0, "total_tokens": 0}
+
+            except (httpx.ConnectError, httpx.TimeoutException) as e:
+                # Connection errors - try the next endpoint if available
+                if endpoint_idx < len(GEMINI_CLI_ENDPOINT_FALLBACKS) - 1:
+                    lib_logger.warning(
+                        f"countTokens: connection error to {base_endpoint}, trying fallback"
+                    )
+                    continue
+                lib_logger.error(f"Failed to count tokens: {e}")
+                return {"prompt_tokens": 0, "total_tokens": 0}

-        except httpx.HTTPStatusError as e:
-            lib_logger.error(f"Failed to count tokens: {e}")
-            # Return 0 on error rather than raising
-            return {"prompt_tokens": 0, "total_tokens": 0}
+        # Shouldn't reach here, but return 0 as a fallback
+        return {"prompt_tokens": 0, "total_tokens": 0}

     # Use the shared GeminiAuthBase for auth logic
     async def get_models(self, credential: str, client: httpx.AsyncClient) -> List[str]:
diff --git a/src/rotator_library/providers/utilities/gemini_shared_utils.py b/src/rotator_library/providers/utilities/gemini_shared_utils.py
index 90a35894..05d36d98 100644
--- a/src/rotator_library/providers/utilities/gemini_shared_utils.py
+++ b/src/rotator_library/providers/utilities/gemini_shared_utils.py
@@ -39,6 +39,14 @@ def env_int(key: str, default: int) -> int:
 # Google Code Assist API endpoint (used by Gemini CLI and Antigravity providers)
 CODE_ASSIST_ENDPOINT = "https://cloudcode-pa.googleapis.com/v1internal"

+# Gemini CLI endpoint fallback chain.
+# Sandbox endpoints may have separate (and higher) rate limits than production.
+# Order: sandbox daily -> production (fallback)
+GEMINI_CLI_ENDPOINT_FALLBACKS = [
+    "https://daily-cloudcode-pa.sandbox.googleapis.com/v1internal",  # Sandbox daily
+    "https://cloudcode-pa.googleapis.com/v1internal",  # Production fallback
+]
+
 # =============================================================================
 # GEMINI 3 TOOL RENAMING CONSTANTS
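A minimal synchronous sketch of the fallback policy both call sites above implement against this list. The helper name is an assumption, and the retry semantics are simplified: 429 surfaces immediately for the key rotator to handle, while 5xx and connection errors advance to the next endpoint:

# Illustrative sketch (not part of the patch) of the endpoint fallback policy.
import httpx

def post_with_fallback(path: str, payload: dict, headers: dict) -> httpx.Response:
    last_error = None
    for idx, base in enumerate(GEMINI_CLI_ENDPOINT_FALLBACKS):
        try:
            resp = httpx.post(
                f"{base}{path}", json=payload, headers=headers, timeout=30
            )
            if resp.status_code == 429:
                resp.raise_for_status()  # rate limit: no fallback, raise now
            if resp.status_code >= 500 and idx < len(GEMINI_CLI_ENDPOINT_FALLBACKS) - 1:
                last_error = httpx.HTTPStatusError(
                    "server error", request=resp.request, response=resp
                )
                continue  # 5xx: try the next endpoint
            resp.raise_for_status()
            return resp
        except (httpx.ConnectError, httpx.TimeoutException) as e:
            last_error = e  # network error: try the next endpoint
    raise last_error

# Usage (hypothetical): post_with_fallback(":countTokens", payload, headers)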