Release v0.6.3: Enhanced configurability and reliability improvements
Added partitionable concurrency gates with per-tenant/location keys to prevent
cross-tenant starvation. Concurrency permit wait timeout is now configurable
via permit_timeout_ms (defaults to :infinity). Permit holders are monitored
and permits are automatically reclaimed if processes die without releasing.
Added per-request timeout overrides for HTTP and streaming operations. The global
default timeout increased from 30s to 120s. Streaming gains a tunable backoff
ceiling (max_backoff_ms), a connect timeout, and a configurable ManagerV2 cleanup delay.
Context cache TTL defaults are now configurable via the application environment.
The rate limiter's retry-delay fallback is similarly configurable for cases where
API responses lack explicit retry timing.
Fixed streaming client memory leaks by removing persistent_term state tracking.
SSE parse errors now properly surface as errors instead of being silently dropped.
Streaming backoff and connection timeouts are now tunable parameters.
All timeout and concurrency parameters support per-call overrides while
maintaining sensible global defaults. Documentation updated throughout to
reflect new configuration options and behavioral changes.
Streaming knobs: pass `timeout:` (per attempt, default `config :gemini_ex, :timeout` = 120_000), `max_retries:` (default 3), `max_backoff_ms:` (default 10_000), and `connect_timeout:` (default 5_000). Manager cleanup delay can be tuned via `config :gemini_ex, :streaming, cleanup_delay_ms: ...`.
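Taken together, a streaming call with these knobs might look like the sketch below. The option names and config keys come from the notes above; the `Gemini.stream_generate/2` function name and the `cleanup_delay_ms` value are illustrative assumptions.

```elixir
# config/config.exs — global defaults (timeout and knob values shown are the defaults)
import Config

config :gemini_ex, timeout: 120_000                       # per-attempt timeout
config :gemini_ex, :streaming, cleanup_delay_ms: 30_000   # hypothetical value

# Per-call overrides; the function name is illustrative:
{:ok, stream_id} =
  Gemini.stream_generate("Summarize this document",
    timeout: 60_000,         # this attempt only, instead of the 120_000 default
    max_retries: 3,          # default
    max_backoff_ms: 10_000,  # default backoff ceiling between retries
    connect_timeout: 5_000   # default connect budget
  )
```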
### Rate Limiting & Concurrency (built-in)
- Enabled by default: requests block when over budget; non-blocking mode returns `{:error, {:rate_limited, retry_at, details}}` with `retry_at` set to the window end.
- Cached context tokens are counted toward budgets. When you precompute cache size, you can pass `estimated_cached_tokens:` alongside `estimated_input_tokens:` to budget correctly before the API reports usage.
- Optional `max_budget_wait_ms` caps how long blocking calls sleep for a full window; if the cap is hit and the window is still full, you get a `rate_limited` error with `retry_at` set to the actual window end.
- Concurrency gate: `max_concurrency_per_model` plus `permit_timeout_ms` (default `:infinity`, per-call override). `non_blocking: true` is the fail-fast path (returns `{:error, :no_permit_available}` immediately).
- Partition the gate with `concurrency_key:` (e.g., tenant/location) to avoid cross-tenant starvation; default key is the model name.
- Permit leak protection: holders are monitored; if a holder dies without releasing, its permits are reclaimed automatically.
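A sketch of how the budget options above combine in non-blocking mode. The option names and error shapes come from the notes; the `Gemini.generate/2` entry point is an assumption.

```elixir
opts = [
  non_blocking: true,              # fail fast instead of sleeping on a full window
  max_budget_wait_ms: 30_000,      # blocking mode only: cap the wait for a window
  estimated_input_tokens: 1_200,
  estimated_cached_tokens: 8_000   # precomputed cache size, counted up front
]

case Gemini.generate(prompt, opts) do
  {:ok, response} ->
    response

  {:error, {:rate_limited, retry_at, _details}} ->
    # retry_at marks the end of the current window; reschedule for then.
    {:retry_at, retry_at}

  {:error, :no_permit_available} ->
    # Concurrency gate full under non_blocking: true — shed load immediately.
    :overloaded
end
```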
### Timeouts (HTTP & Streaming)
- Global HTTP/stream timeout default is 120_000ms via `config :gemini_ex, :timeout`.
- Per-call override: `timeout:` on any request/stream.
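For example (the config key and `timeout:` option come from the notes above; the generate function name is illustrative):

```elixir
# config/config.exs — raise the global default from 120_000 ms
config :gemini_ex, timeout: 180_000

# One long-running request can still override it per call:
Gemini.generate(long_prompt, timeout: 300_000)
```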
**TTL defaults:** The default cache TTL is configurable via `config :gemini_ex, :context_cache, default_ttl_seconds: ...` (defaults to 3_600). You can also override per call with `default_ttl_seconds:` or pass `:ttl`/`:expire_time` explicitly.
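A minimal sketch of the TTL options, assuming a cache-creation function such as `Gemini.create_cached_context/2` (the function name and the `:ttl` value format are illustrative; the config key and option names come from the notes above):

```elixir
# config/config.exs — application-wide default (3_600 is the shipped default)
config :gemini_ex, :context_cache, default_ttl_seconds: 7_200

# Per-call: override the default, or pass an explicit TTL/expiry instead.
Gemini.create_cached_context(contents, default_ttl_seconds: 1_800)
Gemini.create_cached_context(contents, ttl: "900s")  # value format is an assumption
Gemini.create_cached_context(contents, expire_time: ~U[2025-06-01 00:00:00Z])
```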
The concurrency gate is per model by default (all callers to the same model share a queue). Use `concurrency_key:` to partition by tenant/location. `permit_timeout_ms` defaults to `:infinity`; a waiter only errors if you explicitly set a finite cap and it expires. Use `non_blocking: true` to fail fast instead of queueing.
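The blocking and fail-fast paths side by side (options as described above; the function name and tuple-shaped partition key are illustrative):

```elixir
# Blocking: queue for a permit, but give up after 10 s instead of :infinity.
Gemini.generate(prompt,
  concurrency_key: {tenant_id, location},  # partition: no cross-tenant starvation
  permit_timeout_ms: 10_000
)

# Fail-fast: never queue; surface backpressure to the caller immediately.
case Gemini.generate(prompt, concurrency_key: {tenant_id, location}, non_blocking: true) do
  {:ok, resp} -> resp
  {:error, :no_permit_available} -> :overloaded
end
```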