Skip to content

Commit ac84093

Browse files
Merge pull request #121 from Annotation-Garden/develop
Release v0.7.5: Fix evaluation loops, structured response handling
2 parents 7a19ae4 + 0eb224f commit ac84093

File tree

17 files changed

+362
-119
lines changed

17 files changed

+362
-119
lines changed

.env.example

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,10 @@ OPENROUTER_API_KEY=your-openrouter-api-key-here
6767
ANNOTATION_MODEL=mistralai/mistral-small-3.2-24b-instruct
6868
ANNOTATION_PROVIDER=mistral
6969

70-
# Evaluation/Assessment Model (consistent quality checks: Qwen3-235B via DeepInfra)
70+
# Evaluation/Assessment Model (fast quality checks: GPT-OSS-120B via Groq)
7171
# Used for evaluation, assessment, and feedback agents
72-
# Leave EVALUATION_PROVIDER empty to let OpenRouter auto-route
73-
EVALUATION_MODEL=qwen/qwen3-235b-a22b-2507
74-
EVALUATION_PROVIDER=deepinfra/fp8
72+
EVALUATION_MODEL=openai/gpt-oss-120b
73+
EVALUATION_PROVIDER=groq
7574

7675
# Vision Model (image description: Qwen3-VL via deepinfra)
7776
VISION_MODEL=qwen/qwen3-vl-30b-a3b-instruct
@@ -142,8 +141,8 @@ API_WORKERS=4
142141
# ============================================================================
143142
# Workflow Configuration
144143
# ============================================================================
145-
MAX_VALIDATION_ATTEMPTS=5
146-
MAX_TOTAL_ITERATIONS=10
144+
MAX_VALIDATION_ATTEMPTS=3
145+
MAX_TOTAL_ITERATIONS=4
147146

148147
# ============================================================================
149148
# Logging
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
name: Auto Dev Bump on Develop
2+
3+
on:
4+
push:
5+
branches:
6+
- develop
7+
paths-ignore:
8+
- '.github/**'
9+
- 'docs/**'
10+
- '**/*.md'
11+
- '.context/**'
12+
- '.rules/**'
13+
- '.serena/**'
14+
15+
permissions:
16+
contents: write
17+
18+
jobs:
19+
auto-dev-bump:
20+
name: Auto Bump Dev Version
21+
runs-on: ubuntu-latest
22+
# Skip if commit message already contains a version bump (avoid loops)
23+
if: >-
24+
!contains(github.event.head_commit.message, 'Bump version to')
25+
26+
steps:
27+
- name: Checkout code
28+
uses: actions/checkout@v6
29+
with:
30+
fetch-depth: 0
31+
token: ${{ secrets.RELEASE_PAT }}
32+
33+
- name: Set up Python
34+
uses: actions/setup-python@v6
35+
with:
36+
python-version: '3.12'
37+
38+
- name: Configure Git
39+
run: |
40+
git config user.name "github-actions[bot]"
41+
git config user.email "github-actions[bot]@users.noreply.github.com"
42+
43+
- name: Get current version
44+
id: current
45+
run: |
46+
VERSION=$(grep -m1 'version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')
47+
echo "version=$VERSION" >> $GITHUB_OUTPUT
48+
echo "Current version: $VERSION"
49+
50+
- name: Calculate new dev version
51+
id: new_version
52+
run: |
53+
VERSION="${{ steps.current.outputs.version }}"
54+
55+
# Parse version: expects format like 0.7.5.dev0 or 0.7.5
56+
if [[ "$VERSION" =~ ^([0-9]+\.[0-9]+\.[0-9]+)(\.dev([0-9]+))?$ ]]; then
57+
BASE="${BASH_REMATCH[1]}"
58+
DEV_NUM="${BASH_REMATCH[3]}"
59+
60+
if [ -z "$DEV_NUM" ]; then
61+
# No dev suffix yet, start at dev0
62+
NEW_VERSION="${BASE}.dev0"
63+
else
64+
# Increment dev number
65+
NEW_DEV=$((DEV_NUM + 1))
66+
NEW_VERSION="${BASE}.dev${NEW_DEV}"
67+
fi
68+
else
69+
echo "Unexpected version format for develop: $VERSION (expected X.Y.Z or X.Y.Z.devN)"
70+
exit 1
71+
fi
72+
73+
echo "new_version=$NEW_VERSION" >> $GITHUB_OUTPUT
74+
echo "New version: $NEW_VERSION"
75+
76+
- name: Update version files
77+
run: |
78+
NEW_VERSION="${{ steps.new_version.outputs.new_version }}"
79+
80+
# Parse version for __version_info__
81+
MAJOR=$(echo "$NEW_VERSION" | cut -d. -f1)
82+
MINOR=$(echo "$NEW_VERSION" | cut -d. -f2)
83+
PATCH=$(echo "$NEW_VERSION" | cut -d. -f3)
84+
85+
# Update pyproject.toml
86+
sed -i "s/^version = .*/version = \"$NEW_VERSION\"/" pyproject.toml
87+
88+
# Update src/version.py
89+
cat > src/version.py << PYEOF
90+
"""Version information for HEDit."""
91+
92+
__version__ = "$NEW_VERSION"
93+
__version_info__ = ($MAJOR, $MINOR, $PATCH, "dev")
94+
95+
96+
def get_version() -> str:
97+
"""Get the current version string."""
98+
return __version__
99+
100+
101+
def get_version_info() -> tuple:
102+
"""Get the version info tuple (major, minor, patch, prerelease)."""
103+
return __version_info__
104+
PYEOF
105+
# Fix indentation (remove leading spaces from heredoc)
106+
sed -i 's/^ //' src/version.py
107+
108+
- name: Commit version bump
109+
run: |
110+
NEW_VERSION="${{ steps.new_version.outputs.new_version }}"
111+
git add pyproject.toml src/version.py
112+
git commit -m "Bump version to $NEW_VERSION"
113+
git push origin develop
114+
115+
- name: Create and push tag
116+
run: |
117+
NEW_VERSION="${{ steps.new_version.outputs.new_version }}"
118+
git tag "v$NEW_VERSION"
119+
git push origin "v$NEW_VERSION"
120+
echo "Created tag: v$NEW_VERSION"

frontend/index.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1235,7 +1235,7 @@ <h3>Status</h3>
12351235
<a href="https://docs.annotation.garden/projects/hedit/telemetry" target="_blank" rel="noopener noreferrer" title="Learn more about telemetry">Learn more</a>
12361236
</div>
12371237
<div class="model-info">
1238-
<span>Models: <a href="https://openrouter.ai/anthropic/claude-haiku-4.5" target="_blank" rel="noopener noreferrer">Claude Haiku 4.5</a> (annotation) · <a href="https://openrouter.ai/qwen/qwen3-vl-30b-a3b-instruct" target="_blank" rel="noopener noreferrer">Qwen3-VL-30B</a> (vision) · <a href="https://openrouter.ai/qwen/qwen3-235b-a22b-2507" target="_blank" rel="noopener noreferrer">Qwen3-235B</a> (evaluation) · To change models or use programmatically, install <a href="https://pypi.org/project/hedit/" target="_blank" rel="noopener noreferrer">HEDit CLI</a></span>
1238+
<span>Models: <a href="https://openrouter.ai/anthropic/claude-haiku-4.5" target="_blank" rel="noopener noreferrer">Claude Haiku 4.5</a> (annotation) · <a href="https://openrouter.ai/qwen/qwen3-vl-30b-a3b-instruct" target="_blank" rel="noopener noreferrer">Qwen3-VL-30B</a> (vision) · <a href="https://openrouter.ai/openai/gpt-oss-120b" target="_blank" rel="noopener noreferrer">GPT-OSS-120B</a> (evaluation) · To change models or use programmatically, install <a href="https://pypi.org/project/hedit/" target="_blank" rel="noopener noreferrer">HEDit CLI</a></span>
12391239
</div>
12401240

12411241
<script src="config.js"></script>

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "hedit"
7-
version = "0.7.4a4"
7+
version = "0.7.5.dev0"
88
description = "Multi-agent system for HED annotation generation and validation"
99
readme = "PKG_README.md"
1010
requires-python = ">=3.12"

src/agents/annotation_agent.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from langchain_core.messages import HumanMessage, SystemMessage
1212

1313
from src.agents.state import HedAnnotationState
14+
from src.utils import extract_text_content
1415
from src.utils.hed_comprehensive_guide import get_comprehensive_hed_guide
1516
from src.utils.json_schema_loader import HedJsonSchemaLoader, load_latest_schema
1617

@@ -212,8 +213,7 @@ async def annotate(self, state: HedAnnotationState) -> dict:
212213
except Exception as e:
213214
logger.error("LLM invocation failed: %s", e, exc_info=True)
214215
raise
215-
content = response.content
216-
raw_annotation = content.strip() if isinstance(content, str) else str(content)
216+
raw_annotation = extract_text_content(response.content)
217217

218218
# Clean up LLM output - extract just the HED annotation
219219
annotation = self._extract_hed_annotation(raw_annotation)

src/agents/assessment_agent.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,16 @@
44
elements or dimensions in the HED annotation.
55
"""
66

7+
import logging
78
from pathlib import Path
89

910
from langchain_core.language_models import BaseChatModel
1011
from langchain_core.messages import HumanMessage, SystemMessage
1112

1213
from src.agents.state import HedAnnotationState
14+
from src.utils import extract_text_content
15+
16+
logger = logging.getLogger(__name__)
1317

1418

1519
class AssessmentAgent:
@@ -104,9 +108,12 @@ async def assess(self, state: HedAnnotationState) -> dict:
104108
HumanMessage(content=user_prompt),
105109
]
106110

107-
response = await self.llm.ainvoke(messages)
108-
content = response.content
109-
feedback = content.strip() if isinstance(content, str) else str(content)
111+
try:
112+
response = await self.llm.ainvoke(messages)
113+
except Exception as e:
114+
logger.error("Assessment LLM invocation failed: %s", e, exc_info=True)
115+
raise
116+
feedback = extract_text_content(response.content)
110117

111118
# Parse completion status from assessment feedback
112119
# Format is "COMPLETENESS: complete" and "STATUS: COMPLETE"

src/agents/evaluation_agent.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,19 @@
44
the original natural language event description.
55
"""
66

7+
import logging
8+
import re
79
from pathlib import Path
810

911
from langchain_core.language_models import BaseChatModel
1012
from langchain_core.messages import HumanMessage, SystemMessage
1113

1214
from src.agents.state import HedAnnotationState
15+
from src.utils import extract_text_content
1316
from src.utils.json_schema_loader import HedJsonSchemaLoader, load_latest_schema
1417

18+
logger = logging.getLogger(__name__)
19+
1520

1621
class EvaluationAgent:
1722
"""Agent that evaluates the faithfulness of HED annotations.
@@ -163,9 +168,12 @@ async def evaluate(self, state: HedAnnotationState) -> dict:
163168
HumanMessage(content=user_prompt),
164169
]
165170

166-
response = await self.llm.ainvoke(messages)
167-
content = response.content
168-
feedback = content.strip() if isinstance(content, str) else str(content)
171+
try:
172+
response = await self.llm.ainvoke(messages)
173+
except Exception as e:
174+
logger.error("Evaluation LLM invocation failed: %s", e, exc_info=True)
175+
raise
176+
feedback = extract_text_content(response.content)
169177

170178
# Parse decision with multiple fallbacks
171179
is_faithful = self._parse_decision(feedback)
@@ -186,8 +194,6 @@ def _parse_decision(self, feedback: str) -> bool:
186194
Returns:
187195
True if annotation should be accepted, False if needs refinement
188196
"""
189-
import re
190-
191197
feedback_lower = feedback.lower()
192198

193199
# Check for explicit DECISION line
@@ -201,19 +207,16 @@ def _parse_decision(self, feedback: str) -> bool:
201207
result = faithful_match.group(1)
202208
return result in ["yes", "partial"] # Accept partial as good enough!
203209

204-
# Fallback: look for positive indicators
205-
positive_indicators = ["accept", "good", "sufficient", "adequate", "captures well"]
206-
negative_indicators = ["refine", "missing", "incorrect", "inaccurate", "lacks"]
207-
208-
positive_score = sum(1 for indicator in positive_indicators if indicator in feedback_lower)
209-
negative_score = sum(1 for indicator in negative_indicators if indicator in feedback_lower)
210+
# Fallback: look for explicit refine indicators only
211+
refine_indicators = ["refine", "incorrect", "inaccurate", "wrong"]
212+
if any(indicator in feedback_lower for indicator in refine_indicators):
213+
return False
210214

211-
# If more positive than negative, accept
212-
if positive_score > negative_score:
213-
return True
214-
215-
# Default to refine if ambiguous (conservative)
216-
return False
215+
# Default to accept if ambiguous -- avoid unnecessary refinement loops
216+
logger.debug(
217+
"Evaluation parsing: no explicit DECISION/FAITHFUL/refine indicator found; defaulting to ACCEPT"
218+
)
219+
return True
217220

218221
def _check_tags_and_suggest(self, annotation: str) -> str:
219222
"""Check annotation for invalid tags and suggest alternatives.

src/agents/feedback_summarizer.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,15 @@
44
into concise, actionable points for the annotation agent.
55
"""
66

7+
import logging
8+
79
from langchain_core.language_models import BaseChatModel
810
from langchain_core.messages import HumanMessage, SystemMessage
911

1012
from src.agents.state import HedAnnotationState
13+
from src.utils import extract_text_content
14+
15+
logger = logging.getLogger(__name__)
1116

1217

1318
class FeedbackSummarizer:
@@ -112,9 +117,12 @@ async def summarize(self, state: HedAnnotationState) -> dict:
112117
HumanMessage(content=user_prompt),
113118
]
114119

115-
response = await self.llm.ainvoke(messages)
116-
content = response.content
117-
summarized_feedback = content.strip() if isinstance(content, str) else str(content)
120+
try:
121+
response = await self.llm.ainvoke(messages)
122+
except Exception as e:
123+
logger.error("Feedback summarization LLM invocation failed: %s", e, exc_info=True)
124+
raise
125+
summarized_feedback = extract_text_content(response.content)
118126

119127
# Replace verbose feedback with summary (only augmented fields for LLM, not raw for users)
120128
return {

src/agents/state.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ class HedAnnotationState(TypedDict):
8181
def create_initial_state(
8282
input_description: str,
8383
schema_version: str = "8.4.0",
84-
max_validation_attempts: int = 5,
85-
max_total_iterations: int = 10,
84+
max_validation_attempts: int = 3,
85+
max_total_iterations: int | None = None,
8686
run_assessment: bool = False,
8787
extracted_keywords: list[str] | None = None,
8888
semantic_hints: list[dict] | None = None,
@@ -93,8 +93,8 @@ def create_initial_state(
9393
Args:
9494
input_description: Natural language event description to annotate
9595
schema_version: HED schema version to use (default: "8.4.0")
96-
max_validation_attempts: Maximum validation retry attempts (default: 5)
97-
max_total_iterations: Maximum total iterations to prevent infinite loops (default: 10)
96+
max_validation_attempts: Maximum validation retry attempts (default: 3)
97+
max_total_iterations: Maximum total iterations (default: max_validation_attempts + 1)
9898
run_assessment: Whether to run final assessment (default: False)
9999
extracted_keywords: Pre-extracted keywords from description (optional)
100100
semantic_hints: Pre-computed semantic search hints (optional)
@@ -103,6 +103,9 @@ def create_initial_state(
103103
Returns:
104104
Initial HedAnnotationState
105105
"""
106+
if max_total_iterations is None:
107+
max_total_iterations = max_validation_attempts + 1
108+
106109
return HedAnnotationState(
107110
messages=[],
108111
input_description=input_description,

src/agents/vision_agent.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from langchain_core.language_models import BaseChatModel
88
from langchain_core.messages import HumanMessage
99

10+
from src.utils import extract_text_content
1011
from src.utils.image_processing import prepare_image_for_vision_model
1112

1213
DEFAULT_VISION_PROMPT = """Describe what you see in this image. Include the setting, main elements, colors, lighting, and overall composition. Be specific and detailed. Form the response as a continuous paragraph. Maximum 200 words."""
@@ -70,8 +71,7 @@ async def describe_image(
7071

7172
# Generate description
7273
response = await self.llm.ainvoke([message])
73-
content = response.content
74-
description = content.strip() if isinstance(content, str) else str(content)
74+
description = extract_text_content(response.content)
7575

7676
return {
7777
"description": description,
@@ -115,8 +115,7 @@ def describe_image_sync(
115115

116116
# Generate description
117117
response = self.llm.invoke([message])
118-
content = response.content
119-
description = content.strip() if isinstance(content, str) else str(content)
118+
description = extract_text_content(response.content)
120119

121120
return {
122121
"description": description,

0 commit comments

Comments (0)