Skip to content

Commit 01829da

Browse files
committed
feat(validation): add input length guard; expand models; handle truncation
- Bblslug::translate(): - Integrate TextLengthValidator right after filters to fail early when prepared input exceeds model capacity. - Validation/TextLengthValidator: - New validator using model limits (estimated_max_chars, max_tokens, max_output_tokens) with a 4 chars/token heuristic. - Configurable overhead buffer (default 2000) and fallback reserve (% of total tokens) when max_output_tokens unknown. - Returns detailed error with overage and guidance to split input or reduce output. - Models (resources/models.yaml): - OpenAI: add gpt-5 / gpt-5-mini / gpt-5-nano; set limits (max_tokens/max_output_tokens/estimated_max_chars); expose reasoning tokens via usage.completion_tokens_details.reasoning_tokens; add limits for gpt-4o / gpt-4o-mini; restore explicit limits for gpt-4 / gpt-4-turbo. - Google: add usage breakdown for thoughts; keep 2.0-flash defined and list after 2.5 family. - X.ai: normalize usage breakdown keys; set explicit limits for grok-4 / grok-3 / grok-3-mini. - AnthropicDriver: - Detect `finish_reason=length` and fail fast with a clear error message; extract raw content early. - README: - Sync supported model list with registry, including OpenAI GPT-5 family and Gemini ordering.
1 parent 44fa820 commit 01829da

File tree

5 files changed

+184
-45
lines changed

5 files changed

+184
-45
lines changed

README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,25 +14,26 @@ APIs supported:
1414
- `deepl:free` - DeepL free tier
1515
- `deepl:pro` - DeepL pro tier
1616
- Google (Gemini):
17-
- `google:gemini-2.0-flash` - Gemini 2.0 Flash
1817
- `google:gemini-2.5-flash` - Gemini 2.5 Flash
1918
- `google:gemini-2.5-flash-lite` - Gemini 2.5 Flash Lite
2019
- `google:gemini-2.5-pro` - Gemini 2.5 Pro
20+
- `google:gemini-2.0-flash` - Gemini 2.0 Flash
2121
- OpenAI (GPT):
22-
- `openai:gpt-4` - OpenAI GPT-4
23-
- `openai:gpt-4-turbo` - OpenAI GPT-4 Turbo
22+
- `openai:gpt-5` - OpenAI GPT-5
23+
- `openai:gpt-5-mini` - OpenAI GPT-5 Mini
24+
- `openai:gpt-5-nano` - OpenAI GPT-5 Nano
2425
- `openai:gpt-4o` - OpenAI GPT-4o
2526
- `openai:gpt-4o-mini` - OpenAI GPT-4o Mini
27+
- `openai:gpt-4` - OpenAI GPT-4
28+
- `openai:gpt-4-turbo` - OpenAI GPT-4 Turbo
2629
- Yandex:
2730
- `yandex:gpt-lite` - YandexGPT Lite
2831
- `yandex:gpt-pro` - YandexGPT Pro
2932
- `yandex:gpt-32k` - YandexGPT Pro 32K
3033
- X.ai:
3134
- `xai:grok-4` - Grok 4
3235
- `xai:grok-3` - Grok 3
33-
- `xai:grok-3-fast` - Grok 3 Fast
3436
- `xai:grok-3-mini` - Grok 3 Mini
35-
- `xai:grok-3-mini-fast` - Grok 3 Mini Fast
3637

3738
## Features
3839

resources/models.yaml

Lines changed: 80 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -119,19 +119,9 @@ google:
119119
breakdown:
120120
prompt: promptTokenCount
121121
candidates: candidatesTokenCount
122+
thoughts: thoughtsTokenCount
122123

123124
models:
124-
gemini-2.0-flash:
125-
name: 'Gemini 2.0 Flash'
126-
endpoint: 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent'
127-
defaults:
128-
model: gemini-2.0-flash
129-
limits:
130-
max_tokens: 131072
131-
token_estimator: gpt
132-
estimated_max_chars: 524288
133-
notes: 'Low-latency Flash model, balanced cost and performance.'
134-
135125
gemini-2.5-flash:
136126
name: 'Gemini 2.5 Flash'
137127
endpoint: 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent'
@@ -171,6 +161,17 @@ google:
171161
estimated_max_chars: 1048576
172162
notes: 'Top-tier Pro model for longest contexts and highest accuracy.'
173163

164+
gemini-2.0-flash:
165+
name: 'Gemini 2.0 Flash'
166+
endpoint: 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent'
167+
defaults:
168+
model: gemini-2.0-flash
169+
limits:
170+
max_tokens: 131072
171+
token_estimator: gpt
172+
estimated_max_chars: 524288
173+
notes: 'Low-latency Flash model, balanced cost and performance.'
174+
174175

175176
# -------------------------------------------------------------------
176177
# OpenAI GPT
@@ -189,41 +190,87 @@ openai:
189190
headers:
190191
- 'Content-Type: application/json'
191192
limits:
192-
max_tokens: 128000
193193
token_estimator: gpt
194-
estimated_max_chars: 512000
195194
usage:
196195
tokens:
197196
total: total_tokens
198197
breakdown:
199198
prompt: prompt_tokens
200199
completion: completion_tokens
200+
reasoning: completion_tokens_details.reasoning_tokens
201201

202202
models:
203-
gpt-4:
204-
name: 'OpenAI GPT-4'
203+
gpt-5:
204+
name: 'OpenAI GPT-5'
205205
defaults:
206-
model: gpt-4
207-
notes: 'Classic GPT-4 model: highest reliability.'
206+
model: gpt-5
207+
temperature: 1
208+
limits:
209+
max_tokens: 400000
210+
max_output_tokens: 128000
211+
estimated_max_chars: 1600000 # 400k * 4
212+
notes: 'Flagship model with enhanced reasoning, multimodal capabilities, and extended context support.'
208213

209-
gpt-4-turbo:
210-
name: 'OpenAI GPT-4 Turbo'
214+
gpt-5-mini:
215+
name: 'OpenAI GPT-5 Mini'
211216
defaults:
212-
model: gpt-4-turbo
213-
notes: 'Fast & cost-effective GPT-4 quality.'
217+
model: gpt-5-mini
218+
temperature: 1
219+
limits:
220+
max_tokens: 400000
221+
max_output_tokens: 128000
222+
estimated_max_chars: 1600000
223+
notes: 'Compact, cost-efficient GPT-5 variant suitable for high-volume or latency-sensitive translation tasks.'
224+
225+
gpt-5-nano:
226+
name: 'OpenAI GPT-5 Nano'
227+
defaults:
228+
model: gpt-5-nano
229+
temperature: 1
230+
limits:
231+
max_tokens: 400000
232+
max_output_tokens: 128000
233+
estimated_max_chars: 1600000
234+
notes: 'Ultra-fast GPT-5 variant optimized for small-scale and real-time translation tasks.'
214235

215236
gpt-4o:
216237
name: 'OpenAI GPT-4o'
217238
defaults:
218239
model: gpt-4o
240+
limits:
241+
max_tokens: 128000
242+
max_output_tokens: 16384
243+
estimated_max_chars: 512000
219244
notes: 'Highly accurate with flexible prompts, ideal for AI-assisted adaptive translation.'
220245

221246
gpt-4o-mini:
222247
name: 'OpenAI GPT-4o Mini'
223248
defaults:
224249
model: gpt-4o-mini
250+
limits:
251+
max_tokens: 128000
252+
max_output_tokens: 16384
253+
estimated_max_chars: 512000
225254
notes: 'Lightweight GPT-4o: lower latency/cost.'
226255

256+
gpt-4:
257+
name: 'OpenAI GPT-4'
258+
defaults:
259+
model: gpt-4
260+
limits:
261+
max_tokens: 8192
262+
estimated_max_chars: 32768
263+
notes: 'Classic GPT-4 model: highest reliability.'
264+
265+
gpt-4-turbo:
266+
name: 'OpenAI GPT-4 Turbo'
267+
defaults:
268+
model: gpt-4-turbo
269+
limits:
270+
max_tokens: 128000
271+
estimated_max_chars: 512000
272+
notes: 'Fast & cost-effective GPT-4 quality.'
273+
227274

228275
# -------------------------------------------------------------------
229276
# Yandex Foundation Models
@@ -313,44 +360,39 @@ xai:
313360
headers:
314361
- 'Content-Type: application/json'
315362
limits:
316-
max_tokens: 262144
317363
token_estimator: gpt
318-
estimated_max_chars: 1048576
319364
usage:
320365
tokens:
321-
total: total_tokens
366+
total: total_tokens
322367
breakdown:
323-
prompt: prompt_tokens
324-
completion: completion_tokens
368+
prompt: prompt_tokens
369+
completion: completion_tokens
325370
reasoning: completion_tokens_details.reasoning_tokens
326371

327372
models:
328373
grok-4:
329374
name: 'Grok 4'
330375
defaults:
331376
model: grok-4
377+
limits:
378+
max_tokens: 256000
379+
estimated_max_chars: 1024000
332380
notes: 'Scientist-grade reasoning, coding mode, and real-time internet understanding'
333381

334382
grok-3:
335383
name: 'Grok 3'
336384
defaults:
337385
model: grok-3
386+
limits:
387+
max_tokens: 131072
388+
estimated_max_chars: 524288
338389
notes: 'Optimized for logical reasoning, math problem-solving, and real-time data with DeepSearch'
339390

340-
grok-3-fast:
341-
name: 'Grok 3 Fast'
342-
defaults:
343-
model: grok-3-fast
344-
notes: 'Optimized for fastest Grok 3 inference'
345-
346391
grok-3-mini:
347392
name: 'Grok 3 Mini'
348393
defaults:
349394
model: grok-3-mini
395+
limits:
396+
max_tokens: 131072
397+
estimated_max_chars: 524288
350398
notes: 'Compact variant balancing Grok 3 performance and efficiency'
351-
352-
grok-3-mini-fast:
353-
name: 'Grok 3 Mini Fast'
354-
defaults:
355-
model: grok-3-mini-fast
356-
notes: 'Mini variant optimized for fastest inference with compact footprint'

src/Bblslug/Bblslug.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
use Bblslug\Validation\HtmlValidator;
1111
use Bblslug\Validation\JsonValidator;
1212
use Bblslug\Validation\Schema;
13+
use Bblslug\Validation\TextLengthValidator;
1314

1415
class Bblslug
1516
{
@@ -167,6 +168,15 @@ public static function translate(
167168
$prepared = $filterManager->apply($text);
168169
$preparedLength = mb_strlen($prepared);
169170

171+
// Length guard: make sure prepared text fits model constraints
172+
$lengthValidator = TextLengthValidator::fromModelConfig($model);
173+
$lenResult = $lengthValidator->validate($prepared);
174+
if (! $lenResult->isValid()) {
175+
throw new \RuntimeException(
176+
"Input length exceeds model limits: " . implode('; ', $lenResult->getErrors())
177+
);
178+
}
179+
170180
// Prepare options for driver, merging in any CLI-provided variables
171181
$options = array_merge(
172182
[

src/Bblslug/Models/Drivers/AnthropicDriver.php

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,21 @@ public function parseResponse(array $config, string $responseBody): array
115115
throw new \RuntimeException("Anthropic API error: {$message}");
116116
}
117117

118+
// Extract raw content early (may be partial when truncated)
119+
$contentRaw = $data['choices'][0]['message']['content'] ?? '';
120+
$contentRaw = is_string($contentRaw) ? $contentRaw : '';
121+
122+
// If Anthropic cut output by tokens, fail with a clear message before marker search
123+
$finishReason = $data['choices'][0]['finish_reason'] ?? null;
124+
if ($finishReason === 'length') {
125+
throw new \RuntimeException(
126+
"Anthropic: translation was truncated (finish_reason=length) — increase max_tokens or split input. "
127+
);
128+
}
129+
118130
// Validate content
119-
$content = $data['choices'][0]['message']['content'] ?? null;
120-
if (!is_string($content)) {
131+
$content = $contentRaw;
132+
if ($content === '') {
121133
throw new \RuntimeException("Anthropic translation failed: {$responseBody}");
122134
}
123135

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
<?php

declare(strict_types=1);

namespace Bblslug\Validation;

/**
 * Guards prepared input text against a model's context-size limits.
 *
 * The effective character cap is derived either directly from the model's
 * estimated_max_chars, or from its token limits via a ~4 chars/token
 * heuristic, minus a fixed character buffer reserved for prompt
 * scaffolding and filter markers.
 */
class TextLengthValidator implements ValidatorInterface
{
    // Effective input cap in characters; overhead is already subtracted.
    // A value of 0 means "unknown limit" and disables the check.
    private int $limitChars;

    // Safety buffer reserved for prompts/markers; kept for error reporting.
    private int $overheadChars;

    /**
     * @param int $limitChars    Hard cap for prepared input length (in chars)
     * @param int $overheadChars Safety buffer to account for prompts/markers/etc.
     */
    public function __construct(int $limitChars, int $overheadChars = 2000)
    {
        // Clamp at 0: a non-positive resulting limit disables validation
        // rather than rejecting all input (see validate()).
        $this->limitChars = max(0, $limitChars - max(0, $overheadChars));
        $this->overheadChars = $overheadChars;
    }

    /**
     * Validate that the prepared text fits within the effective limit.
     *
     * A zero limit means the model's capacity is unknown; such input is
     * always accepted.
     */
    public function validate(string $content): ValidationResult
    {
        // mb_strlen: limits are in characters, not bytes.
        $len = mb_strlen($content);

        if ($this->limitChars > 0 && $len > $this->limitChars) {
            $excess = $len - $this->limitChars;

            // Note: the reported limit is the post-overhead cap, so the
            // message says the overhead has already been reserved.
            return ValidationResult::failure([
                sprintf(
                    'Prepared text length %d exceeds limit %d by %d chars ' .
                    '(after reserving %d chars of overhead). ' .
                    'Split input or reduce max output tokens.',
                    $len,
                    $this->limitChars,
                    $excess,
                    $this->overheadChars
                )
            ]);
        }

        return ValidationResult::success();
    }

    /**
     * Build validator from model config.
     * Uses estimated_max_chars, max_tokens and (if present) max_output_tokens.
     *
     * @param array<string,mixed> $model
     * @param int $fallbackReservePct Reserve percent when max_output_tokens unknown
     *                                (clamped to 0..100 so an out-of-range value
     *                                cannot silently disable validation).
     * @param int $overheadChars      Prompt/markers safety buffer (e.g. 2000).
     */
    public static function fromModelConfig(array $model, int $fallbackReservePct = 20, int $overheadChars = 2000): self
    {
        $limits = $model['limits'] ?? [];

        $estimatedMaxChars = (int)($limits['estimated_max_chars'] ?? 0);
        $maxTokens = (int)($limits['max_tokens'] ?? 0);
        $maxOutTokens = (int)($limits['max_output_tokens'] ?? 0);

        // Prefer a token-based calculation if we know the total budget.
        if ($maxTokens > 0) {
            // Reserve room for the model's output: its declared
            // max_output_tokens when known, otherwise a percentage of the
            // total token budget.
            $reservePct = min(100, max(0, $fallbackReservePct));
            $reservedOut = $maxOutTokens > 0
                ? $maxOutTokens
                : (int)max(1, floor($maxTokens * ($reservePct / 100)));

            $inputTokenBudget = max(0, $maxTokens - $reservedOut);
            $charsByTokens = $inputTokenBudget * 4; // ≈ 4 chars/token heuristic

            // When both estimates exist, honour the more conservative one.
            $limitChars = $estimatedMaxChars > 0
                ? min($estimatedMaxChars, $charsByTokens)
                : $charsByTokens;
        } else {
            // No token info — rely only on estimated_max_chars
            // (0 leaves validation disabled).
            $limitChars = $estimatedMaxChars;
        }

        return new self($limitChars, $overheadChars);
    }
}

0 commit comments

Comments
 (0)