Skip to content

Commit ac84093

Browse files
Merge pull request #121 from Annotation-Garden/develop
Release v0.7.5: Fix evaluation loops, structured response handling
2 parents 7a19ae4 + 0eb224f commit ac84093

File tree

17 files changed

+362
-119
lines changed

17 files changed

+362
-119
lines changed

.env.example

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,10 @@ OPENROUTER_API_KEY=your-openrouter-api-key-here
6767
ANNOTATION_MODEL=mistralai/mistral-small-3.2-24b-instruct
6868
ANNOTATION_PROVIDER=mistral
6969

70-
# Evaluation/Assessment Model (consistent quality checks: Qwen3-235B via DeepInfra)
70+
# Evaluation/Assessment Model (fast quality checks: GPT-OSS-120B via Groq)
7171
# Used for evaluation, assessment, and feedback agents
72-
# Leave EVALUATION_PROVIDER empty to let OpenRouter auto-route
73-
EVALUATION_MODEL=qwen/qwen3-235b-a22b-2507
74-
EVALUATION_PROVIDER=deepinfra/fp8
72+
EVALUATION_MODEL=openai/gpt-oss-120b
73+
EVALUATION_PROVIDER=groq
7574

7675
# Vision Model (image description: Qwen3-VL via deepinfra)
7776
VISION_MODEL=qwen/qwen3-vl-30b-a3b-instruct
@@ -142,8 +141,8 @@ API_WORKERS=4
142141
# ============================================================================
143142
# Workflow Configuration
144143
# ============================================================================
145-
MAX_VALIDATION_ATTEMPTS=5
146-
MAX_TOTAL_ITERATIONS=10
144+
MAX_VALIDATION_ATTEMPTS=3
145+
MAX_TOTAL_ITERATIONS=4
147146

148147
# ============================================================================
149148
# Logging
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
name: Auto Dev Bump on Develop
2+
3+
on:
4+
push:
5+
branches:
6+
- develop
7+
paths-ignore:
8+
- '.github/**'
9+
- 'docs/**'
10+
- '**/*.md'
11+
- '.context/**'
12+
- '.rules/**'
13+
- '.serena/**'
14+
15+
permissions:
16+
contents: write
17+
18+
jobs:
19+
auto-dev-bump:
20+
name: Auto Bump Dev Version
21+
runs-on: ubuntu-latest
22+
# Skip if commit message already contains a version bump (avoid loops)
23+
if: >-
24+
!contains(github.event.head_commit.message, 'Bump version to')
25+
26+
steps:
27+
- name: Checkout code
28+
uses: actions/checkout@v6
29+
with:
30+
fetch-depth: 0
31+
token: ${{ secrets.RELEASE_PAT }}
32+
33+
- name: Set up Python
34+
uses: actions/setup-python@v6
35+
with:
36+
python-version: '3.12'
37+
38+
- name: Configure Git
39+
run: |
40+
git config user.name "github-actions[bot]"
41+
git config user.email "github-actions[bot]@users.noreply.github.com"
42+
43+
- name: Get current version
44+
id: current
45+
run: |
46+
VERSION=$(grep -m1 'version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')
47+
echo "version=$VERSION" >> $GITHUB_OUTPUT
48+
echo "Current version: $VERSION"
49+
50+
- name: Calculate new dev version
51+
id: new_version
52+
run: |
53+
VERSION="${{ steps.current.outputs.version }}"
54+
55+
# Parse version: expects format like 0.7.5.dev0 or 0.7.5
56+
if [[ "$VERSION" =~ ^([0-9]+\.[0-9]+\.[0-9]+)(\.dev([0-9]+))?$ ]]; then
57+
BASE="${BASH_REMATCH[1]}"
58+
DEV_NUM="${BASH_REMATCH[3]}"
59+
60+
if [ -z "$DEV_NUM" ]; then
61+
# No dev suffix yet, start at dev0
62+
NEW_VERSION="${BASE}.dev0"
63+
else
64+
# Increment dev number
65+
NEW_DEV=$((DEV_NUM + 1))
66+
NEW_VERSION="${BASE}.dev${NEW_DEV}"
67+
fi
68+
else
69+
echo "Unexpected version format for develop: $VERSION (expected X.Y.Z or X.Y.Z.devN)"
70+
exit 1
71+
fi
72+
73+
echo "new_version=$NEW_VERSION" >> $GITHUB_OUTPUT
74+
echo "New version: $NEW_VERSION"
75+
76+
- name: Update version files
77+
run: |
78+
NEW_VERSION="${{ steps.new_version.outputs.new_version }}"
79+
80+
# Parse version for __version_info__
81+
MAJOR=$(echo "$NEW_VERSION" | cut -d. -f1)
82+
MINOR=$(echo "$NEW_VERSION" | cut -d. -f2)
83+
PATCH=$(echo "$NEW_VERSION" | cut -d. -f3)
84+
85+
# Update pyproject.toml
86+
sed -i "s/^version = .*/version = \"$NEW_VERSION\"/" pyproject.toml
87+
88+
# Update src/version.py
89+
cat > src/version.py << PYEOF
90+
"""Version information for HEDit."""
91+
92+
__version__ = "$NEW_VERSION"
93+
__version_info__ = ($MAJOR, $MINOR, $PATCH, "dev")
94+
95+
96+
def get_version() -> str:
97+
"""Get the current version string."""
98+
return __version__
99+
100+
101+
def get_version_info() -> tuple:
102+
"""Get the version info tuple (major, minor, patch, prerelease)."""
103+
return __version_info__
104+
PYEOF
105+
# Fix indentation (remove leading spaces from heredoc)
106+
sed -i 's/^ //' src/version.py
107+
108+
- name: Commit version bump
109+
run: |
110+
NEW_VERSION="${{ steps.new_version.outputs.new_version }}"
111+
git add pyproject.toml src/version.py
112+
git commit -m "Bump version to $NEW_VERSION"
113+
git push origin develop
114+
115+
- name: Create and push tag
116+
run: |
117+
NEW_VERSION="${{ steps.new_version.outputs.new_version }}"
118+
git tag "v$NEW_VERSION"
119+
git push origin "v$NEW_VERSION"
120+
echo "Created tag: v$NEW_VERSION"

frontend/index.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1235,7 +1235,7 @@ <h3>Status</h3>
12351235
<a href="https://docs.annotation.garden/projects/hedit/telemetry" target="_blank" rel="noopener noreferrer" title="Learn more about telemetry">Learn more</a>
12361236
</div>
12371237
<div class="model-info">
1238-
<span>Models: <a href="https://openrouter.ai/anthropic/claude-haiku-4.5" target="_blank" rel="noopener noreferrer">Claude Haiku 4.5</a> (annotation) · <a href="https://openrouter.ai/qwen/qwen3-vl-30b-a3b-instruct" target="_blank" rel="noopener noreferrer">Qwen3-VL-30B</a> (vision) · <a href="https://openrouter.ai/qwen/qwen3-235b-a22b-2507" target="_blank" rel="noopener noreferrer">Qwen3-235B</a> (evaluation) · To change models or use programmatically, install <a href="https://pypi.org/project/hedit/" target="_blank" rel="noopener noreferrer">HEDit CLI</a></span>
1238+
<span>Models: <a href="https://openrouter.ai/anthropic/claude-haiku-4.5" target="_blank" rel="noopener noreferrer">Claude Haiku 4.5</a> (annotation) · <a href="https://openrouter.ai/qwen/qwen3-vl-30b-a3b-instruct" target="_blank" rel="noopener noreferrer">Qwen3-VL-30B</a> (vision) · <a href="https://openrouter.ai/openai/gpt-oss-120b" target="_blank" rel="noopener noreferrer">GPT-OSS-120B</a> (evaluation) · To change models or use programmatically, install <a href="https://pypi.org/project/hedit/" target="_blank" rel="noopener noreferrer">HEDit CLI</a></span>
12391239
</div>
12401240

12411241
<script src="config.js"></script>

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "hedit"
7-
version = "0.7.4a4"
7+
version = "0.7.5.dev0"
88
description = "Multi-agent system for HED annotation generation and validation"
99
readme = "PKG_README.md"
1010
requires-python = ">=3.12"

src/agents/annotation_agent.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from langchain_core.messages import HumanMessage, SystemMessage
1212

1313
from src.agents.state import HedAnnotationState
14+
from src.utils import extract_text_content
1415
from src.utils.hed_comprehensive_guide import get_comprehensive_hed_guide
1516
from src.utils.json_schema_loader import HedJsonSchemaLoader, load_latest_schema
1617

@@ -212,8 +213,7 @@ async def annotate(self, state: HedAnnotationState) -> dict:
212213
except Exception as e:
213214
logger.error("LLM invocation failed: %s", e, exc_info=True)
214215
raise
215-
content = response.content
216-
raw_annotation = content.strip() if isinstance(content, str) else str(content)
216+
raw_annotation = extract_text_content(response.content)
217217

218218
# Clean up LLM output - extract just the HED annotation
219219
annotation = self._extract_hed_annotation(raw_annotation)

src/agents/assessment_agent.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,16 @@
44
elements or dimensions in the HED annotation.
55
"""
66

7+
import logging
78
from pathlib import Path
89

910
from langchain_core.language_models import BaseChatModel
1011
from langchain_core.messages import HumanMessage, SystemMessage
1112

1213
from src.agents.state import HedAnnotationState
14+
from src.utils import extract_text_content
15+
16+
logger = logging.getLogger(__name__)
1317

1418

1519
class AssessmentAgent:
@@ -104,9 +108,12 @@ async def assess(self, state: HedAnnotationState) -> dict:
104108
HumanMessage(content=user_prompt),
105109
]
106110

107-
response = await self.llm.ainvoke(messages)
108-
content = response.content
109-
feedback = content.strip() if isinstance(content, str) else str(content)
111+
try:
112+
response = await self.llm.ainvoke(messages)
113+
except Exception as e:
114+
logger.error("Assessment LLM invocation failed: %s", e, exc_info=True)
115+
raise
116+
feedback = extract_text_content(response.content)
110117

111118
# Parse completion status from assessment feedback
112119
# Format is "COMPLETENESS: complete" and "STATUS: COMPLETE"

src/agents/evaluation_agent.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,19 @@
44
the original natural language event description.
55
"""
66

7+
import logging
8+
import re
79
from pathlib import Path
810

911
from langchain_core.language_models import BaseChatModel
1012
from langchain_core.messages import HumanMessage, SystemMessage
1113

1214
from src.agents.state import HedAnnotationState
15+
from src.utils import extract_text_content
1316
from src.utils.json_schema_loader import HedJsonSchemaLoader, load_latest_schema
1417

18+
logger = logging.getLogger(__name__)
19+
1520

1621
class EvaluationAgent:
1722
"""Agent that evaluates the faithfulness of HED annotations.
@@ -163,9 +168,12 @@ async def evaluate(self, state: HedAnnotationState) -> dict:
163168
HumanMessage(content=user_prompt),
164169
]
165170

166-
response = await self.llm.ainvoke(messages)
167-
content = response.content
168-
feedback = content.strip() if isinstance(content, str) else str(content)
171+
try:
172+
response = await self.llm.ainvoke(messages)
173+
except Exception as e:
174+
logger.error("Evaluation LLM invocation failed: %s", e, exc_info=True)
175+
raise
176+
feedback = extract_text_content(response.content)
169177

170178
# Parse decision with multiple fallbacks
171179
is_faithful = self._parse_decision(feedback)
@@ -186,8 +194,6 @@ def _parse_decision(self, feedback: str) -> bool:
186194
Returns:
187195
True if annotation should be accepted, False if needs refinement
188196
"""
189-
import re
190-
191197
feedback_lower = feedback.lower()
192198

193199
# Check for explicit DECISION line
@@ -201,19 +207,16 @@ def _parse_decision(self, feedback: str) -> bool:
201207
result = faithful_match.group(1)
202208
return result in ["yes", "partial"] # Accept partial as good enough!
203209

204-
# Fallback: look for positive indicators
205-
positive_indicators = ["accept", "good", "sufficient", "adequate", "captures well"]
206-
negative_indicators = ["refine", "missing", "incorrect", "inaccurate", "lacks"]
207-
208-
positive_score = sum(1 for indicator in positive_indicators if indicator in feedback_lower)
209-
negative_score = sum(1 for indicator in negative_indicators if indicator in feedback_lower)
210+
# Fallback: look for explicit refine indicators only
211+
refine_indicators = ["refine", "incorrect", "inaccurate", "wrong"]
212+
if any(indicator in feedback_lower for indicator in refine_indicators):
213+
return False
210214

211-
# If more positive than negative, accept
212-
if positive_score > negative_score:
213-
return True
214-
215-
# Default to refine if ambiguous (conservative)
216-
return False
215+
# Default to accept if ambiguous -- avoid unnecessary refinement loops
216+
logger.debug(
217+
"Evaluation parsing: no explicit DECISION/FAITHFUL/refine indicator found; defaulting to ACCEPT"
218+
)
219+
return True
217220

218221
def _check_tags_and_suggest(self, annotation: str) -> str:
219222
"""Check annotation for invalid tags and suggest alternatives.

src/agents/feedback_summarizer.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,15 @@
44
into concise, actionable points for the annotation agent.
55
"""
66

7+
import logging
8+
79
from langchain_core.language_models import BaseChatModel
810
from langchain_core.messages import HumanMessage, SystemMessage
911

1012
from src.agents.state import HedAnnotationState
13+
from src.utils import extract_text_content
14+
15+
logger = logging.getLogger(__name__)
1116

1217

1318
class FeedbackSummarizer:
@@ -112,9 +117,12 @@ async def summarize(self, state: HedAnnotationState) -> dict:
112117
HumanMessage(content=user_prompt),
113118
]
114119

115-
response = await self.llm.ainvoke(messages)
116-
content = response.content
117-
summarized_feedback = content.strip() if isinstance(content, str) else str(content)
120+
try:
121+
response = await self.llm.ainvoke(messages)
122+
except Exception as e:
123+
logger.error("Feedback summarization LLM invocation failed: %s", e, exc_info=True)
124+
raise
125+
summarized_feedback = extract_text_content(response.content)
118126

119127
# Replace verbose feedback with summary (only augmented fields for LLM, not raw for users)
120128
return {

src/agents/state.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ class HedAnnotationState(TypedDict):
8181
def create_initial_state(
8282
input_description: str,
8383
schema_version: str = "8.4.0",
84-
max_validation_attempts: int = 5,
85-
max_total_iterations: int = 10,
84+
max_validation_attempts: int = 3,
85+
max_total_iterations: int | None = None,
8686
run_assessment: bool = False,
8787
extracted_keywords: list[str] | None = None,
8888
semantic_hints: list[dict] | None = None,
@@ -93,8 +93,8 @@ def create_initial_state(
9393
Args:
9494
input_description: Natural language event description to annotate
9595
schema_version: HED schema version to use (default: "8.4.0")
96-
max_validation_attempts: Maximum validation retry attempts (default: 5)
97-
max_total_iterations: Maximum total iterations to prevent infinite loops (default: 10)
96+
max_validation_attempts: Maximum validation retry attempts (default: 3)
97+
max_total_iterations: Maximum total iterations (default: max_validation_attempts + 1)
9898
run_assessment: Whether to run final assessment (default: False)
9999
extracted_keywords: Pre-extracted keywords from description (optional)
100100
semantic_hints: Pre-computed semantic search hints (optional)
@@ -103,6 +103,9 @@ def create_initial_state(
103103
Returns:
104104
Initial HedAnnotationState
105105
"""
106+
if max_total_iterations is None:
107+
max_total_iterations = max_validation_attempts + 1
108+
106109
return HedAnnotationState(
107110
messages=[],
108111
input_description=input_description,

src/agents/vision_agent.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from langchain_core.language_models import BaseChatModel
88
from langchain_core.messages import HumanMessage
99

10+
from src.utils import extract_text_content
1011
from src.utils.image_processing import prepare_image_for_vision_model
1112

1213
DEFAULT_VISION_PROMPT = """Describe what you see in this image. Include the setting, main elements, colors, lighting, and overall composition. Be specific and detailed. Form the response as a continuous paragraph. Maximum 200 words."""
@@ -70,8 +71,7 @@ async def describe_image(
7071

7172
# Generate description
7273
response = await self.llm.ainvoke([message])
73-
content = response.content
74-
description = content.strip() if isinstance(content, str) else str(content)
74+
description = extract_text_content(response.content)
7575

7676
return {
7777
"description": description,
@@ -115,8 +115,7 @@ def describe_image_sync(
115115

116116
# Generate description
117117
response = self.llm.invoke([message])
118-
content = response.content
119-
description = content.strip() if isinstance(content, str) else str(content)
118+
description = extract_text_content(response.content)
120119

121120
return {
122121
"description": description,

0 commit comments

Comments (0)