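fix: Bump max tokens limits

Raise the SUMMARY_MAX_TOKENS default from 2000 to 4000 and the
SEARXNG_TIMEOUT_SECONDS default from 30 to 60; update .env.example to match
(including LLM_TIMEOUT_SECONDS=120) and move its inline comments onto their
own lines. Also switch the default SUMMARY_MODEL to qwen3.5:27b and rename
ResponsesAPIResponse.created to created_at.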

SkylarKelty · SkylarKelty · commit 55f41cb757ac · 2026-03-11T13:16:08.000Z
diff --git a/.env.example b/.env.example
@@ -17,37 +17,53 @@ LITELLM_BASE_URL=http://localhost:11434/api
 # Can also use OPENAI_API_KEY
 LITELLM_API_KEY=sk-your-key-here
 
-# Model to use for summarization (default: arc:apex)
+# Model to use for summarization (default: qwen3.5:27b)
 # Must be a valid model for your LITELLM_BASE_URL
-SUMMARY_MODEL=arc:apex
+SUMMARY_MODEL=qwen3.5:27b
 
 # Deep research settings (static - configured via env, not per-request)
-DEEP_RESEARCH_STAGES=2          # Number of outline sections
-DEEP_RESEARCH_PASSES=1          # Research passes - each refines queries
-DEEP_RESEARCH_SUBQUERIES=5      # Queries per section per pass
-DEEP_RESEARCH_RESULTS_PER_QUERY=10  # Search results per subquery
-DEEP_RESEARCH_MAX_TOKENS=8000       # Max tokens for final essay
+# Number of outline sections
+DEEP_RESEARCH_STAGES=2
+# Research passes - each refines queries
+DEEP_RESEARCH_PASSES=1
+# Queries per section per pass
+DEEP_RESEARCH_SUBQUERIES=5
+# Search results per subquery
+DEEP_RESEARCH_RESULTS_PER_QUERY=10
+# Max tokens for final essay
+DEEP_RESEARCH_MAX_TOKENS=8000
 
 # Content extraction - fetches full page text for richer synthesis
-DEEP_RESEARCH_CONTENT_EXTRACTION=true   # Toggle page fetching on/off
-DEEP_RESEARCH_PAGES_PER_SECTION=3       # Top pages to extract per section
-DEEP_RESEARCH_CONTENT_MAX_CHARS=3000    # Max chars of extracted text per page
+# Toggle page fetching on/off
+DEEP_RESEARCH_CONTENT_EXTRACTION=true
+# Top pages to extract per section
+DEEP_RESEARCH_PAGES_PER_SECTION=3
+# Max chars of extracted text per page
+DEEP_RESEARCH_CONTENT_MAX_CHARS=3000
 
 # Shallow research settings (static - configured via env, not per-request)
-SHALLOW_RESEARCH_STAGES=1               # Number of outline sections
-SHALLOW_RESEARCH_PASSES=1               # Research passes
-SHALLOW_RESEARCH_SUBQUERIES=3           # Queries per section per pass
-SHALLOW_RESEARCH_RESULTS_PER_QUERY=5    # Search results per subquery
-SHALLOW_RESEARCH_MAX_TOKENS=4000        # Max tokens for final essay
+# Number of outline sections
+SHALLOW_RESEARCH_STAGES=1
+# Research passes
+SHALLOW_RESEARCH_PASSES=1
+# Queries per section per pass
+SHALLOW_RESEARCH_SUBQUERIES=3
+# Search results per subquery
+SHALLOW_RESEARCH_RESULTS_PER_QUERY=5
+# Max tokens for final essay
+SHALLOW_RESEARCH_MAX_TOKENS=4000
 
 # Shallow content extraction - typically disabled for faster responses
-SHALLOW_RESEARCH_CONTENT_EXTRACTION=false   # Toggle page fetching on/off
-SHALLOW_RESEARCH_PAGES_PER_SECTION=2        # Top pages to extract per section
-SHALLOW_RESEARCH_CONTENT_MAX_CHARS=2000     # Max chars of extracted text per page
+# Toggle page fetching on/off
+SHALLOW_RESEARCH_CONTENT_EXTRACTION=false
+# Top pages to extract per section
+SHALLOW_RESEARCH_PAGES_PER_SECTION=2
+# Max chars of extracted text per page
+SHALLOW_RESEARCH_CONTENT_MAX_CHARS=2000
 
 # Optional runtime settings
 ENABLE_SUMMARY=true
-SUMMARY_MAX_TOKENS=1024
-SEARXNG_TIMEOUT_SECONDS=30
-LLM_TIMEOUT_SECONDS=60
+SUMMARY_MAX_TOKENS=4000
+SEARXNG_TIMEOUT_SECONDS=60
+LLM_TIMEOUT_SECONDS=120
 LOG_LEVEL=INFO
diff --git a/artemis/config.py b/artemis/config.py
@@ -349,7 +349,7 @@ def get_settings() -> Settings:
             os.getenv("SEARXNG_API_BASE", "http://localhost:8888"),
         ),
         searxng_timeout_seconds=_parse_float(
-            "SEARXNG_TIMEOUT_SECONDS", 30.0, minimum=0.5, maximum=300.0
+            "SEARXNG_TIMEOUT_SECONDS", 60.0, minimum=0.5, maximum=300.0
         ),
         litellm_base_url=_validate_url(
             "LITELLM_BASE_URL",
@@ -360,9 +360,9 @@ def get_settings() -> Settings:
         llm_timeout_seconds=_parse_float(
             "LLM_TIMEOUT_SECONDS", 120.0, minimum=1.0, maximum=600.0
         ),
-        summary_model=os.getenv("SUMMARY_MODEL", "arc:apex").strip() or "arc:apex",
+        summary_model=os.getenv("SUMMARY_MODEL", "qwen3.5:27b").strip() or "qwen3.5:27b",
         summary_max_tokens=_parse_int(
-            "SUMMARY_MAX_TOKENS", 2000, minimum=512, maximum=16384
+            "SUMMARY_MAX_TOKENS", 4000, minimum=512, maximum=16384
         ),
         enable_summary=_parse_bool("ENABLE_SUMMARY", True),
         deep_research_stages=_parse_int(
diff --git a/artemis/main.py b/artemis/main.py
@@ -632,7 +632,7 @@ async def responses(
 
         return ResponsesAPIResponse(
             id=str(uuid.uuid4()),
-            created=_created_timestamp(),
+            created_at=_created_timestamp(),
             model=preset_config.model_name,
             output=[
                 _message_output(research_run.essay),
@@ -651,7 +651,7 @@ async def responses(
 
     return ResponsesAPIResponse(
         id=str(uuid.uuid4()),
-        created=_created_timestamp(),
+        created_at=_created_timestamp(),
         model="artemis-search",
         output=[
             _message_output(summary or _fallback_text(results)),
diff --git a/artemis/models.py b/artemis/models.py
@@ -204,7 +204,7 @@ class ResponsesAPIResponse(BaseModel):
 
     Attributes:
         id: Unique response identifier
-        created: Unix timestamp of creation
+        created_at: Unix timestamp of creation
         model: Model identifier used
         status: Always "completed" for successful responses
         output: List of message and/or search results blocks
@@ -214,7 +214,7 @@ class ResponsesAPIResponse(BaseModel):
 
     id: str
     object: Literal["response"] = "response"
-    created: int
+    created_at: int
     model: str
     status: Literal["completed"] = "completed"
     output: list[AssistantMessage | SearchResultsBlock]
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -24,7 +24,7 @@ services:
       - ALLOWED_ORIGINS=http://localhost:3000
       - ARTEMIS_API_KEY=${ARTEMIS_API_KEY:-}
       - ENABLE_SUMMARY=true
-      - SUMMARY_MODEL=openai/gpt-4o-mini
+      - SUMMARY_MODEL=qwen3.5:27b
       # Add your API key:
       # - OPENAI_API_KEY=sk-...
     depends_on:
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -107,7 +107,7 @@ class ResponsesAPIResponseTestCase(unittest.TestCase):
     def test_defaults(self) -> None:
         resp = ResponsesAPIResponse(
             id="test-id",
-            created=1000,
+            created_at=1000,
             model="artemis-search",
             output=[],
         )
@@ -119,7 +119,7 @@ def test_defaults(self) -> None:
     def test_full_response_serialization(self) -> None:
         resp = ResponsesAPIResponse(
             id="id-1",
-            created=1234567890,
+            created_at=1234567890,
             model="artemis-search",
             output=[
                 AssistantMessage(