diff --git a/src/rotator_library/providers/gemini_auth_base.py b/src/rotator_library/providers/gemini_auth_base.py
index 64428935..45d52bc3 100644
--- a/src/rotator_library/providers/gemini_auth_base.py
+++ b/src/rotator_library/providers/gemini_auth_base.py
@@ -14,12 +14,34 @@
 lib_logger = logging.getLogger("rotator_library")

-# Headers for Gemini CLI auth/discovery calls
-# Uses KV string format for Client-Metadata (different from Antigravity's JSON format)
+# Headers for Gemini CLI auth/discovery calls (loadCodeAssist, onboardUser, etc.)
+#
+# For the OAuth/Code Assist path, native gemini-cli only sends:
+#   - Content-Type: application/json
+#   - Authorization: Bearer
+#   - User-Agent: GeminiCLI/${version} (${platform}; ${arch})
+#
+# Headers NOT sent by the native CLI (confirmed via explore agent analysis of server.ts):
+#   - X-Goog-Api-Client: not used in the Code Assist path
+#   - Client-Metadata: sent in the REQUEST BODY for these endpoints, not as an HTTP header
+#
+# Note: the commented-out headers below previously worked well for SDK fingerprinting.
+# Uncomment them if you want to try SDK mimicry for potential rate-limit benefits.
+#
+# Source: gemini-cli/packages/core/src/code_assist/server.ts:284-290
 GEMINI_CLI_AUTH_HEADERS = {
-    "User-Agent": "google-api-nodejs-client/9.15.1",
-    "X-Goog-Api-Client": "gl-node/22.17.0",
-    "Client-Metadata": "ideType=IDE_UNSPECIFIED,platform=PLATFORM_UNSPECIFIED,pluginType=GEMINI",
+    "User-Agent": "GeminiCLI/0.26.0 (win32; x64)",
+    # -------------------------------------------------------------------------
+    # COMMENTED OUT - Not sent by native gemini-cli for the OAuth/Code Assist path
+    # -------------------------------------------------------------------------
+    # "X-Goog-Api-Client": "gl-node/22.17.0 gdcl/1.30.0",  # SDK mimicry - not used by native CLI
+    # "Client-Metadata": (  # Sent in body, not as header
+    #     "ideType=IDE_UNSPECIFIED,"
+    #     "pluginType=GEMINI,"
+    #     "ideVersion=0.26.0,"
+    #     "platform=WINDOWS_AMD64,"
+    #     "updateChannel=stable"
+    # ),
 }
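For illustration, a minimal standalone sketch of how this native-style User-Agent could be assembled dynamically instead of hardcoded. The helper name and the 0.26.0 version pin are assumptions, and the platform/arch mappings are best-effort guesses at the values the native CLI reports:

# Illustrative sketch (not part of the patch): building the native-style
# User-Agent string from the local environment. The helper name and the
# "0.26.0" pin are assumptions; the mappings are best-effort guesses.
import platform

def build_gemini_cli_user_agent(version: str = "0.26.0") -> str:
    system_map = {"windows": "win32", "darwin": "darwin", "linux": "linux"}
    arch_map = {"amd64": "x64", "x86_64": "x64", "arm64": "arm64"}
    system = system_map.get(platform.system().lower(), platform.system().lower())
    arch = arch_map.get(platform.machine().lower(), platform.machine().lower())
    # Native format: GeminiCLI/${version} (${platform}; ${arch})
    return f"GeminiCLI/{version} ({system}; {arch})"

print(build_gemini_cli_user_agent())  # e.g. "GeminiCLI/0.26.0 (win32; x64)"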
+ """ + return secrets.token_hex(7) # 14 hex characters + + def _generate_stable_session_id(self, contents: List[Dict[str, Any]]) -> str: + """ + Generate a stable session ID based on the first user message. + + This ensures: + - Same conversation = same session_id (even across server restarts) + - Different conversations = different session_ids + - Multi-user scenarios are properly isolated + + Uses SHA256 hash of the first user message to create a deterministic + UUID-formatted session ID. Falls back to random UUID if no user message. + + This approach mirrors Antigravity's _generate_stable_session_id() but + uses UUID format instead of the -{number} format to match native + gemini-cli's crypto.randomUUID() output format. + + Args: + contents: List of message contents in Gemini format + + Returns: + UUID-formatted session ID string + """ + # Find first user message text + for content in contents: + if content.get("role") == "user": + parts = content.get("parts", []) + for part in parts: + if isinstance(part, dict): + text = part.get("text", "") + if text: + # SHA256 hash and use first 16 bytes to create UUID + h = hashlib.sha256(text.encode("utf-8")).digest() + # Format as UUID (8-4-4-4-12 hex chars) + return f"{h[:4].hex()}-{h[4:6].hex()}-{h[6:8].hex()}-{h[8:10].hex()}-{h[10:16].hex()}" + + # Fallback to random UUID if no user message found + return str(uuid.uuid4()) + + def _get_gemini_cli_request_headers(self, model: str) -> Dict[str, str]: + """ + Build request headers matching native gemini-cli client. + + For the OAuth/Code Assist path, native gemini-cli only sends: + - Content-Type: application/json (handled by httpx) + - Authorization: Bearer (handled by auth_header) + - User-Agent: GeminiCLI/${version}/${model} (${platform}; ${arch}) + + Headers NOT sent by native CLI (confirmed via explore agent analysis): + - X-Goog-Api-Client: Not used in Code Assist path (only in SDK/API key path) + - Client-Metadata: Not sent as HTTP header (only in request body for management endpoints) + - X-Goog-User-Project: Only used in MCP path, causes 403 errors in Code Assist + + Source: gemini-cli/packages/core/src/code_assist/server.ts:332 + Source: gemini-cli/packages/core/src/core/contentGenerator.ts:129 + """ + model_name = model.split("/")[-1].replace(":thinking", "") + + # Hardcoded to Windows x64 platform (matching common development environment) + # Native format: GeminiCLI/${version}/${model} (${platform}; ${arch}) + user_agent = f"GeminiCLI/0.26.0/{model_name} (win32; x64)" + + # ========================================================================= + # COMMENTED OUT HEADERS - Not sent by native gemini-cli for Code Assist path + # Keeping these for reference as they worked well for SDK mimicry. + # Uncomment if rate limiting issues arise and you want to try SDK fingerprinting. + # ========================================================================= + + # X-Goog-Api-Client: Mimics @google/genai SDK but native CLI doesn't send this + # for OAuth/Code Assist path (only set when using API key authentication) + # x_goog_api_client = "gl-node/22.17.0 gdcl/1.30.0" + + # Client-Metadata: Native CLI sends this in REQUEST BODY for management endpoints + # (loadCodeAssist, onboardUser, listExperiments, recordCodeAssistMetrics) + # but NOT as an HTTP header for generateContent requests. 
@@ -1336,12 +1438,20 @@ async def do_call(attempt_model: str, is_fallback: bool = False):
             # Fix tool response grouping (handles ID mismatches, missing responses)
             contents = self._fix_tool_response_grouping(contents)

+            # Generate a unique prompt ID for this request (matches native gemini-cli)
+            # Source: gemini-cli/packages/cli/src/gemini.tsx line 668
+            user_prompt_id = self._generate_user_prompt_id()
+
+            # Build the payload matching the native gemini-cli structure
+            # Source: gemini-cli/packages/core/src/code_assist/converter.ts lines 31-48
             request_payload = {
                 "model": model_name,
                 "project": project_id,
+                "user_prompt_id": user_prompt_id,
                 "request": {
                     "contents": contents,
                     "generationConfig": gen_config,
+                    "session_id": self._generate_stable_session_id(contents),
                 },
             }
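For reference, a sketch of the wire shape this produces; the structure follows converter.ts as cited above, while every field value below is an invented sample:

# Illustrative example (not part of the patch) of the resulting payload shape.
example_payload = {
    "model": "gemini-2.5-pro",              # hypothetical model name
    "project": "my-gcp-project",            # hypothetical project ID
    "user_prompt_id": "a3f9c21d4b7e08",     # 14 hex chars from token_hex(7)
    "request": {
        "contents": [{"role": "user", "parts": [{"text": "Hello"}]}],
        "generationConfig": {"temperature": 0.7},
        "session_id": "0f3a9b2c-1d4e-5f60-7a8b-9c0d1e2f3a4b",  # hash-derived UUID
    },
}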
@@ -1386,8 +1496,6 @@ async def do_call(attempt_model: str, is_fallback: bool = False):
             # lib_logger.debug(f"Gemini CLI Request Payload: {json.dumps(request_payload, indent=2)}")
             file_logger.log_request(request_payload)

-            url = f"{CODE_ASSIST_ENDPOINT}:streamGenerateContent"
-
             async def stream_handler():
                 # Track state across chunks for tool indexing
                 accumulator = {
@@ -1396,119 +1504,165 @@ async def stream_handler():
                     "is_complete": False,
                 }

+                # Build headers matching the native gemini-cli client fingerprint
                 final_headers = auth_header.copy()
-                final_headers.update(
-                    {
-                        "User-Agent": "google-api-nodejs-client/9.15.1",
-                        "X-Goog-Api-Client": "gl-node/22.17.0",
-                        "Client-Metadata": "ideType=IDE_UNSPECIFIED,platform=PLATFORM_UNSPECIFIED,pluginType=GEMINI",
-                        "Accept": "application/json",
-                    }
-                )
-                try:
-                    async with client.stream(
-                        "POST",
-                        url,
-                        headers=final_headers,
-                        json=request_payload,
-                        params={"alt": "sse"},
-                        timeout=TimeoutConfig.streaming(),
-                    ) as response:
-                        # Read and log error body before raise_for_status for better debugging
-                        if response.status_code >= 400:
+                final_headers.update(self._get_gemini_cli_request_headers(model_name))
+
+                # Endpoint fallback loop: try the sandbox first, then production.
+                # This mirrors the opencode-antigravity-auth plugin behavior.
+                last_endpoint_error = None
+                for endpoint_idx, base_endpoint in enumerate(
+                    GEMINI_CLI_ENDPOINT_FALLBACKS
+                ):
+                    url = f"{base_endpoint}:streamGenerateContent"
+                    is_fallback = endpoint_idx > 0
+
+                    if is_fallback:
+                        lib_logger.debug(
+                            f"Endpoint fallback: trying {base_endpoint} after previous endpoint failed"
+                        )
+
+                    try:
+                        async with client.stream(
+                            "POST",
+                            url,
+                            headers=final_headers,
+                            json=request_payload,
+                            params={"alt": "sse"},
+                            timeout=TimeoutConfig.streaming(),
+                        ) as response:
+                            # Read and log the error body before raise_for_status for better debugging
+                            if response.status_code >= 400:
+                                try:
+                                    error_body = await response.aread()
+                                    lib_logger.error(
+                                        f"Gemini CLI API error {response.status_code}: {error_body.decode()}"
+                                    )
+                                    file_logger.log_error(
+                                        f"API error {response.status_code}: {error_body.decode()}"
+                                    )
+                                except Exception:
+                                    pass
+
+                            # This will raise an HTTPStatusError for 4xx/5xx responses
+                            response.raise_for_status()
+
+                            async for line in response.aiter_lines():
+                                file_logger.log_response_chunk(line)
+                                if line.startswith("data: "):
+                                    data_str = line[6:]
+                                    if data_str == "[DONE]":
+                                        break
+                                    try:
+                                        chunk = json.loads(data_str)
+                                        for (
+                                            openai_chunk
+                                        ) in self._convert_chunk_to_openai(
+                                            chunk, model, accumulator
+                                        ):
+                                            yield litellm.ModelResponse(**openai_chunk)
+                                    except json.JSONDecodeError:
+                                        lib_logger.warning(
+                                            f"Could not decode JSON from Gemini CLI: {line}"
+                                        )

+                            # Emit a final chunk if the stream ended without usageMetadata;
+                            # the client will determine the correct finish_reason
+                            if not accumulator.get("is_complete"):
+                                final_chunk = {
+                                    "id": f"chatcmpl-geminicli-{time.time()}",
+                                    "object": "chat.completion.chunk",
+                                    "created": int(time.time()),
+                                    "model": model,
+                                    "choices": [
+                                        {"index": 0, "delta": {}, "finish_reason": None}
+                                    ],
+                                    # Include minimal usage to signal this is the final chunk
+                                    "usage": {
+                                        "prompt_tokens": 0,
+                                        "completion_tokens": 1,
+                                        "total_tokens": 1,
+                                    },
+                                }
+                                yield litellm.ModelResponse(**final_chunk)
+
+                        # Success - exit the endpoint fallback loop
+                        return
+
+                    except httpx.HTTPStatusError as e:
+                        error_body = None
+                        if e.response is not None:
                             try:
-                                error_body = await response.aread()
-                                lib_logger.error(
-                                    f"Gemini CLI API error {response.status_code}: {error_body.decode()}"
-                                )
-                                file_logger.log_error(
-                                    f"API error {response.status_code}: {error_body.decode()}"
-                                )
+                                error_body = e.response.text
                             except Exception:
                                 pass

-                        # This will raise an HTTPStatusError for 4xx/5xx responses
-                        response.raise_for_status()
+                        # Only log to the file logger (for detailed logging)
+                        if error_body:
+                            file_logger.log_error(
+                                f"HTTPStatusError {e.response.status_code}: {error_body}"
+                            )
+                        else:
+                            file_logger.log_error(
+                                f"HTTPStatusError {e.response.status_code}: {str(e)}"
+                            )

-                        async for line in response.aiter_lines():
-                            file_logger.log_response_chunk(line)
-                            if line.startswith("data: "):
-                                data_str = line[6:]
-                                if data_str == "[DONE]":
-                                    break
-                                try:
-                                    chunk = json.loads(data_str)
-                                    for openai_chunk in self._convert_chunk_to_openai(
-                                        chunk, model, accumulator
-                                    ):
-                                        yield litellm.ModelResponse(**openai_chunk)
-                                except json.JSONDecodeError:
-                                    lib_logger.warning(
-                                        f"Could not decode JSON from Gemini CLI: {line}"
-                                    )
+                        # 429 rate limit - don't fall back to the next endpoint; let the rotator handle it
+                        if e.response.status_code == 429:
+                            # Extract the retry-after time from the error body
+                            retry_after = extract_retry_after_from_body(error_body)
+                            retry_info = (
+                                f" (retry after {retry_after}s)" if retry_after else ""
+                            )
+                            error_msg = f"Gemini CLI rate limit exceeded{retry_info}"
+                            if error_body:
+                                error_msg = f"{error_msg} | {error_body}"
+                            # Only log at debug level - rotation happens silently
+                            lib_logger.debug(
+                                f"Gemini CLI 429 rate limit: retry_after={retry_after}s"
+                            )
+                            raise RateLimitError(
+                                message=error_msg,
+                                llm_provider="gemini_cli",
+                                model=model,
+                                response=e.response,
+                            )

-                        # Emit final chunk if stream ended without usageMetadata
-                        # Client will determine the correct finish_reason
-                        if not accumulator.get("is_complete"):
-                            final_chunk = {
-                                "id": f"chatcmpl-geminicli-{time.time()}",
-                                "object": "chat.completion.chunk",
-                                "created": int(time.time()),
-                                "model": model,
-                                "choices": [
-                                    {"index": 0, "delta": {}, "finish_reason": None}
-                                ],
-                                # Include minimal usage to signal this is the final chunk
-                                "usage": {
-                                    "prompt_tokens": 0,
-                                    "completion_tokens": 1,
-                                    "total_tokens": 1,
-                                },
-                            }
-                            yield litellm.ModelResponse(**final_chunk)
-
-                except httpx.HTTPStatusError as e:
-                    error_body = None
-                    if e.response is not None:
-                        try:
-                            error_body = e.response.text
-                        except Exception:
-                            pass
-
-                    # Only log to file logger (for detailed logging)
-                    if error_body:
-                        file_logger.log_error(
-                            f"HTTPStatusError {e.response.status_code}: {error_body}"
-                        )
-                    else:
+                        # 5xx server errors - try the next endpoint if available
+                        if e.response.status_code >= 500:
+                            last_endpoint_error = e
+                            if endpoint_idx < len(GEMINI_CLI_ENDPOINT_FALLBACKS) - 1:
+                                lib_logger.warning(
+                                    f"Endpoint {base_endpoint} returned {e.response.status_code}, trying fallback"
+                                )
+                                continue
+                            # No more endpoints to try
+                            raise e
+
+                        # Other 4xx errors - don't fall back; re-raise
+                        raise e
+
+                    except (httpx.ConnectError, httpx.TimeoutException) as e:
+                        # Connection/timeout errors - try the next endpoint if available
+                        last_endpoint_error = e
                         file_logger.log_error(
-                            f"HTTPStatusError {e.response.status_code}: {str(e)}"
+                            f"Connection error to {base_endpoint}: {str(e)}"
                         )
+                        if endpoint_idx < len(GEMINI_CLI_ENDPOINT_FALLBACKS) - 1:
+                            lib_logger.warning(
+                                f"Connection error to {base_endpoint}, trying fallback endpoint"
+                            )
+                            continue
+                        # No more endpoints to try
+                        raise e

-                    if e.response.status_code == 429:
-                        # Extract retry-after time from the error body
-                        retry_after = extract_retry_after_from_body(error_body)
-                        retry_info = (
-                            f" (retry after {retry_after}s)" if retry_after else ""
-                        )
-                        error_msg = f"Gemini CLI rate limit exceeded{retry_info}"
-                        if error_body:
-                            error_msg = f"{error_msg} | {error_body}"
-                        # Only log at debug level - rotation happens silently
-                        lib_logger.debug(
-                            f"Gemini CLI 429 rate limit: retry_after={retry_after}s"
-                        )
-                        raise RateLimitError(
-                            message=error_msg,
-                            llm_provider="gemini_cli",
-                            model=model,
-                            response=e.response,
-                        )
-                    # Re-raise other status errors to be handled by the main acompletion logic
-                    raise e
-                except Exception as e:
-                    file_logger.log_error(f"Stream handler exception: {str(e)}")
-                    raise
+                    except Exception as e:
+                        file_logger.log_error(f"Stream handler exception: {str(e)}")
+                        raise
+
+                # If we get here, all endpoints failed (shouldn't happen due to the raise in the loop)
+                if last_endpoint_error:
+                    raise last_endpoint_error

             async def logging_stream_wrapper():
                 """Wraps the stream to log the final reassembled response."""
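The stream handler above consumes Google's server-sent-event framing. A minimal standalone sketch of that framing, with an invented sample chunk:

# Illustrative sketch (not part of the patch): events arrive as "data: <json>"
# lines, "[DONE]" terminates the stream, and anything else (keep-alives,
# blank lines) is skipped.
import json

def parse_sse_lines(lines):
    for line in lines:
        if not line.startswith("data: "):
            continue
        data_str = line[len("data: "):]
        if data_str == "[DONE]":
            return
        yield json.loads(data_str)

sample = [
    'data: {"candidates": [{"content": {"parts": [{"text": "Hi"}]}}]}',
    "data: [DONE]",
]
for chunk in parse_sse_lines(sample):
    print(chunk["candidates"][0]["content"]["parts"][0]["text"])  # -> Hi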
@@ -1626,10 +1780,14 @@ async def count_tokens(
         # Fix tool response grouping (handles ID mismatches, missing responses)
         contents = self._fix_tool_response_grouping(contents)

-        # Build request payload
+        # Build the request payload matching the native gemini-cli structure
         request_payload = {
+            "model": model_name,
+            "project": project_id,
+            "user_prompt_id": self._generate_user_prompt_id(),
             "request": {
                 "contents": contents,
+                "session_id": self._generate_stable_session_id(contents),
             },
         }
@@ -1643,37 +1801,54 @@ async def count_tokens(
                 {"functionDeclarations": function_declarations}
             ]

-        # Make the request
-        url = f"{CODE_ASSIST_ENDPOINT}:countTokens"
+        # Build headers matching the native gemini-cli client fingerprint
         headers = auth_header.copy()
-        headers.update(
-            {
-                "User-Agent": "google-api-nodejs-client/9.15.1",
-                "X-Goog-Api-Client": "gl-node/22.17.0",
-                "Client-Metadata": "ideType=IDE_UNSPECIFIED,platform=PLATFORM_UNSPECIFIED,pluginType=GEMINI",
-                "Accept": "application/json",
-            }
-        )
+        headers.update(self._get_gemini_cli_request_headers(model_name))

-        try:
-            response = await client.post(
-                url, headers=headers, json=request_payload, timeout=30
-            )
-            response.raise_for_status()
-            data = response.json()
+        # Endpoint fallback loop: try the sandbox first, then production
+        for endpoint_idx, base_endpoint in enumerate(GEMINI_CLI_ENDPOINT_FALLBACKS):
+            url = f"{base_endpoint}:countTokens"
+            try:
+                response = await client.post(
+                    url, headers=headers, json=request_payload, timeout=30
+                )
+                response.raise_for_status()
+                data = response.json()

-            # Extract token counts from response
-            total_tokens = data.get("totalTokens", 0)
+                # Extract token counts from the response
+                total_tokens = data.get("totalTokens", 0)

-            return {
-                "prompt_tokens": total_tokens,
-                "total_tokens": total_tokens,
-            }
+                return {
+                    "prompt_tokens": total_tokens,
+                    "total_tokens": total_tokens,
+                }
+
+            except httpx.HTTPStatusError as e:
+                # 5xx errors - try the next endpoint if available
+                if (
+                    e.response.status_code >= 500
+                    and endpoint_idx < len(GEMINI_CLI_ENDPOINT_FALLBACKS) - 1
+                ):
+                    lib_logger.warning(
+                        f"countTokens: endpoint {base_endpoint} returned {e.response.status_code}, trying fallback"
+                    )
+                    continue
+                lib_logger.error(f"Failed to count tokens: {e}")
+                # Return 0 on error rather than raising
+                return {"prompt_tokens": 0, "total_tokens": 0}
+
+            except (httpx.ConnectError, httpx.TimeoutException) as e:
+                # Connection errors - try the next endpoint if available
+                if endpoint_idx < len(GEMINI_CLI_ENDPOINT_FALLBACKS) - 1:
+                    lib_logger.warning(
+                        f"countTokens: connection error to {base_endpoint}, trying fallback"
+                    )
+                    continue
+                lib_logger.error(f"Failed to count tokens: {e}")
+                return {"prompt_tokens": 0, "total_tokens": 0}

-        except httpx.HTTPStatusError as e:
-            lib_logger.error(f"Failed to count tokens: {e}")
-            # Return 0 on error rather than raising
-            return {"prompt_tokens": 0, "total_tokens": 0}
+        # Shouldn't reach here, but return 0 as a fallback
+        return {"prompt_tokens": 0, "total_tokens": 0}

     # Use the shared GeminiAuthBase for auth logic
     async def get_models(self, credential: str, client: httpx.AsyncClient) -> List[str]:
diff --git a/src/rotator_library/providers/utilities/gemini_shared_utils.py b/src/rotator_library/providers/utilities/gemini_shared_utils.py
index 90a35894..05d36d98 100644
--- a/src/rotator_library/providers/utilities/gemini_shared_utils.py
+++ b/src/rotator_library/providers/utilities/gemini_shared_utils.py
@@ -39,6 +39,14 @@ def env_int(key: str, default: int) -> int:
 # Google Code Assist API endpoint (used by Gemini CLI and Antigravity providers)
 CODE_ASSIST_ENDPOINT = "https://cloudcode-pa.googleapis.com/v1internal"

+# Gemini CLI endpoint fallback chain.
+# Sandbox endpoints may have separate (and higher) rate limits than production.
+# Order: sandbox daily -> production (fallback)
+GEMINI_CLI_ENDPOINT_FALLBACKS = [
+    "https://daily-cloudcode-pa.sandbox.googleapis.com/v1internal",  # Sandbox daily
+    "https://cloudcode-pa.googleapis.com/v1internal",  # Production fallback
+]
+
 # =============================================================================
 # GEMINI 3 TOOL RENAMING CONSTANTS
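A minimal synchronous sketch of the fallback policy both call sites above implement against this list. The helper name is an assumption, and the retry semantics are simplified: 429 surfaces immediately for the key rotator to handle, while 5xx and connection errors advance to the next endpoint:

# Illustrative sketch (not part of the patch) of the endpoint fallback policy.
import httpx

def post_with_fallback(path: str, payload: dict, headers: dict) -> httpx.Response:
    last_error = None
    for idx, base in enumerate(GEMINI_CLI_ENDPOINT_FALLBACKS):
        try:
            resp = httpx.post(
                f"{base}{path}", json=payload, headers=headers, timeout=30
            )
            if resp.status_code == 429:
                resp.raise_for_status()  # rate limit: no fallback, raise now
            if resp.status_code >= 500 and idx < len(GEMINI_CLI_ENDPOINT_FALLBACKS) - 1:
                last_error = httpx.HTTPStatusError(
                    "server error", request=resp.request, response=resp
                )
                continue  # 5xx: try the next endpoint
            resp.raise_for_status()
            return resp
        except (httpx.ConnectError, httpx.TimeoutException) as e:
            last_error = e  # network error: try the next endpoint
    raise last_error

# Usage (hypothetical): post_with_fallback(":countTokens", payload, headers)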