Skip to content

Commit 71e0f5c

Browse files
Fix structured LLM response handling, update eval model name
Extract text content from thinking+text block responses (gpt-oss on Groq returns structured blocks instead of plain strings). Add extract_text_content utility used by all agents. Update frontend model display to show GPT-OSS-120B for evaluation.
1 parent a740b44 commit 71e0f5c

File tree

7 files changed

+38
-13
lines changed

7 files changed

+38
-13
lines changed

frontend/index.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1235,7 +1235,7 @@ <h3>Status</h3>
12351235
<a href="https://docs.annotation.garden/projects/hedit/telemetry" target="_blank" rel="noopener noreferrer" title="Learn more about telemetry">Learn more</a>
12361236
</div>
12371237
<div class="model-info">
1238-
<span>Models: <a href="https://openrouter.ai/anthropic/claude-haiku-4.5" target="_blank" rel="noopener noreferrer">Claude Haiku 4.5</a> (annotation) · <a href="https://openrouter.ai/qwen/qwen3-vl-30b-a3b-instruct" target="_blank" rel="noopener noreferrer">Qwen3-VL-30B</a> (vision) · <a href="https://openrouter.ai/qwen/qwen3-235b-a22b-2507" target="_blank" rel="noopener noreferrer">Qwen3-235B</a> (evaluation) · To change models or use programmatically, install <a href="https://pypi.org/project/hedit/" target="_blank" rel="noopener noreferrer">HEDit CLI</a></span>
1238+
<span>Models: <a href="https://openrouter.ai/anthropic/claude-haiku-4.5" target="_blank" rel="noopener noreferrer">Claude Haiku 4.5</a> (annotation) · <a href="https://openrouter.ai/qwen/qwen3-vl-30b-a3b-instruct" target="_blank" rel="noopener noreferrer">Qwen3-VL-30B</a> (vision) · <a href="https://openrouter.ai/openai/gpt-oss-120b" target="_blank" rel="noopener noreferrer">GPT-OSS-120B</a> (evaluation) · To change models or use programmatically, install <a href="https://pypi.org/project/hedit/" target="_blank" rel="noopener noreferrer">HEDit CLI</a></span>
12391239
</div>
12401240

12411241
<script src="config.js"></script>

src/agents/annotation_agent.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from langchain_core.messages import HumanMessage, SystemMessage
1212

1313
from src.agents.state import HedAnnotationState
14+
from src.utils import extract_text_content
1415
from src.utils.hed_comprehensive_guide import get_comprehensive_hed_guide
1516
from src.utils.json_schema_loader import HedJsonSchemaLoader, load_latest_schema
1617

@@ -212,8 +213,7 @@ async def annotate(self, state: HedAnnotationState) -> dict:
212213
except Exception as e:
213214
logger.error("LLM invocation failed: %s", e, exc_info=True)
214215
raise
215-
content = response.content
216-
raw_annotation = content.strip() if isinstance(content, str) else str(content)
216+
raw_annotation = extract_text_content(response.content)
217217

218218
# Clean up LLM output - extract just the HED annotation
219219
annotation = self._extract_hed_annotation(raw_annotation)

src/agents/assessment_agent.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from langchain_core.messages import HumanMessage, SystemMessage
1212

1313
from src.agents.state import HedAnnotationState
14+
from src.utils import extract_text_content
1415

1516
logger = logging.getLogger(__name__)
1617

@@ -112,8 +113,7 @@ async def assess(self, state: HedAnnotationState) -> dict:
112113
except Exception as e:
113114
logger.error("Assessment LLM invocation failed: %s", e, exc_info=True)
114115
raise
115-
content = response.content
116-
feedback = content.strip() if isinstance(content, str) else str(content)
116+
feedback = extract_text_content(response.content)
117117

118118
# Parse completion status from assessment feedback
119119
# Format is "COMPLETENESS: complete" and "STATUS: COMPLETE"

src/agents/evaluation_agent.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from langchain_core.messages import HumanMessage, SystemMessage
1313

1414
from src.agents.state import HedAnnotationState
15+
from src.utils import extract_text_content
1516
from src.utils.json_schema_loader import HedJsonSchemaLoader, load_latest_schema
1617

1718
logger = logging.getLogger(__name__)
@@ -172,8 +173,7 @@ async def evaluate(self, state: HedAnnotationState) -> dict:
172173
except Exception as e:
173174
logger.error("Evaluation LLM invocation failed: %s", e, exc_info=True)
174175
raise
175-
content = response.content
176-
feedback = content.strip() if isinstance(content, str) else str(content)
176+
feedback = extract_text_content(response.content)
177177

178178
# Parse decision with multiple fallbacks
179179
is_faithful = self._parse_decision(feedback)

src/agents/feedback_summarizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from langchain_core.messages import HumanMessage, SystemMessage
1111

1212
from src.agents.state import HedAnnotationState
13+
from src.utils import extract_text_content
1314

1415
logger = logging.getLogger(__name__)
1516

@@ -121,8 +122,7 @@ async def summarize(self, state: HedAnnotationState) -> dict:
121122
except Exception as e:
122123
logger.error("Feedback summarization LLM invocation failed: %s", e, exc_info=True)
123124
raise
124-
content = response.content
125-
summarized_feedback = content.strip() if isinstance(content, str) else str(content)
125+
summarized_feedback = extract_text_content(response.content)
126126

127127
# Replace verbose feedback with summary (only augmented fields for LLM, not raw for users)
128128
return {

src/agents/vision_agent.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from langchain_core.language_models import BaseChatModel
88
from langchain_core.messages import HumanMessage
99

10+
from src.utils import extract_text_content
1011
from src.utils.image_processing import prepare_image_for_vision_model
1112

1213
DEFAULT_VISION_PROMPT = """Describe what you see in this image. Include the setting, main elements, colors, lighting, and overall composition. Be specific and detailed. Form the response as a continuous paragraph. Maximum 200 words."""
@@ -70,8 +71,7 @@ async def describe_image(
7071

7172
# Generate description
7273
response = await self.llm.ainvoke([message])
73-
content = response.content
74-
description = content.strip() if isinstance(content, str) else str(content)
74+
description = extract_text_content(response.content)
7575

7676
return {
7777
"description": description,
@@ -115,8 +115,7 @@ def describe_image_sync(
115115

116116
# Generate description
117117
response = self.llm.invoke([message])
118-
content = response.content
119-
description = content.strip() if isinstance(content, str) else str(content)
118+
description = extract_text_content(response.content)
120119

121120
return {
122121
"description": description,

src/utils/__init__.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,27 @@
11
"""Utility functions."""
2+
3+
4+
def extract_text_content(content: object) -> str:
    """Extract text from an LLM response content field.

    Some models (e.g. gpt-oss on Groq) return structured blocks including
    thinking and text parts instead of a plain string. This helper extracts
    only the text parts and joins them.

    Args:
        content: The ``response.content`` value — either a plain string, or
            a list whose items are plain strings or dicts with ``type`` and
            ``text``/``thinking`` keys (LangChain allows both item forms).

    Returns:
        The concatenated text content, stripped of whitespace.
    """
    if isinstance(content, str):
        return content.strip()
    if isinstance(content, list):
        parts: list[str] = []
        for block in content:
            if isinstance(block, str):
                # LangChain content lists may contain bare strings, not
                # just typed dict blocks — include them as text.
                parts.append(block)
            elif isinstance(block, dict) and block.get("type") == "text":
                parts.append(block.get("text", ""))
        if parts:
            return "\n".join(parts).strip()
    # Fallback for unrecognized structures (e.g. a thinking-only list):
    # stringify so callers always receive a str.
    return str(content).strip()

0 commit comments

Comments
 (0)