Skip to content

Commit d3adb98

Browse files
authored
Merge pull request #5 from atasoglu/develop
chore(release): update version to 0.4.0 with new quality tagging syst…
2 parents 8521d3a + 2889299 commit d3adb98

File tree

5 files changed

+65
-2
lines changed

5 files changed

+65
-2
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve
88

99
Nothing yet.
1010

11+
## [0.4.0] - 2025-01-10
12+
### Added
13+
- Quality tagging system for generated records
14+
- `generate_quality_tags()` method in `JudgeResponse` to automatically tag samples based on judge scores
15+
- Tags include an overall quality level (`high_quality`, `medium_quality`, or `low_quality`) and dimension-specific tags (`excellent_tool_selection`/`poor_tool_selection`, `excellent_arguments`/`poor_arguments`, `high_clarity`/`low_clarity`)
16+
- Configurable thresholds for quality classification
17+
- `quality_tags` field automatically populated in generated records
18+
1119
## [0.3.0] - 2025-01-10
1220
### Added
1321
- Hugging Face dataset integration utilities in `examples/nano_tool_calling_v1/`

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ toolsgen = ["prompts/*.txt"]
1313

1414
[project]
1515
name = "toolsgen"
16-
version = "0.3.0"
16+
version = "0.4.0"
1717
description = "Generate tool-calling datasets from OpenAI-compatible tool specs"
1818
readme = "README.md"
1919
requires-python = ">=3.9"

src/toolsgen/core/record_builder.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ def _build_record(
5454
"model": role_config.judge.model,
5555
"temperature": role_config.judge.temperature,
5656
}
57+
quality_tags = []
5758
try:
5859
judge_result = judge_tool_calls(
5960
client=judge_client,
@@ -65,6 +66,7 @@ def _build_record(
6566
max_tokens=role_config.judge.max_tokens,
6667
)
6768
judge_dict.update(judge_result.to_dict())
69+
quality_tags = judge_result.generate_quality_tags()
6870
except Exception:
6971
pass # Continue without judge data
7072

@@ -76,6 +78,7 @@ def _build_record(
7678
assistant_calls=tool_calls,
7779
problem_metadata={"generated": True, "user_request": user_request},
7880
judge=judge_dict,
81+
quality_tags=quality_tags,
7982
tools_metadata={"num_tools": len(tools)},
8083
)
8184

src/toolsgen/judge.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,58 @@ def to_dict(self) -> Dict[str, Any]:
6060
"rubric_version": "0.1.0",
6161
}
6262

63+
def generate_quality_tags(
    self,
    high_quality_threshold: float = 0.9,
    medium_quality_threshold: float = 0.7,
    excellent_dimension_pct: float = 0.875,
    poor_dimension_pct: float = 0.5,
) -> List[str]:
    """Generate quality tags based on the judge scores.

    Exactly one overall tag is always produced from ``self.score``
    (``high_quality``, ``medium_quality`` or ``low_quality``). Each of the
    three dimensions then contributes at most one tag, depending on how its
    score compares with a fraction of that dimension's maximum (0.4 for
    ``tool_relevance`` and ``argument_quality``, 0.2 for ``clarity``).
    Dimension scores that fall between the "poor" and "excellent" cut-offs
    produce no tag.

    Args:
        high_quality_threshold: Minimum overall score for the
            ``high_quality`` tag (default: 0.9).
        medium_quality_threshold: Minimum overall score for the
            ``medium_quality`` tag (default: 0.7).
        excellent_dimension_pct: Fraction (0-1, not 0-100) of a dimension's
            maximum score at or above which the "excellent"/"high" tag is
            added (default: 0.875 = 87.5%).
        poor_dimension_pct: Fraction (0-1) of a dimension's maximum score
            below which the "poor"/"low" tag is added (default: 0.5 = 50%).

    Returns:
        List of quality tags, ordered as: overall tag, tool selection,
        arguments, clarity.
    """
    import math

    # Absolute slack used for every threshold comparison. The cut-offs are
    # float products, and e.g. 0.4 * 0.875 evaluates to 0.35000000000000003,
    # so a score of exactly 0.35 would otherwise miss the "excellent" tag.
    tolerance = 1e-9

    def reaches(value: float, threshold: float) -> bool:
        """Return True when value >= threshold, allowing float rounding."""
        return value >= threshold or math.isclose(
            value, threshold, rel_tol=0.0, abs_tol=tolerance
        )

    tags: List[str] = []

    # Overall quality: exactly one of the three tags is emitted.
    if reaches(self.score, high_quality_threshold):
        tags.append("high_quality")
    elif reaches(self.score, medium_quality_threshold):
        tags.append("medium_quality")
    else:
        tags.append("low_quality")

    # Dimension-specific tags: (score, max possible score, tag when
    # excellent, tag when poor). Order here defines the output order.
    dimensions = (
        (self.tool_relevance, 0.4, "excellent_tool_selection", "poor_tool_selection"),
        (self.argument_quality, 0.4, "excellent_arguments", "poor_arguments"),
        (self.clarity, 0.2, "high_clarity", "low_clarity"),
    )
    for value, max_score, excellent_tag, poor_tag in dimensions:
        if reaches(value, max_score * excellent_dimension_pct):
            tags.append(excellent_tag)
        elif not reaches(value, max_score * poor_dimension_pct):
            # Strictly below the poor cut-off (a score sitting on the
            # cut-off itself is not considered poor).
            tags.append(poor_tag)

    return tags
114+
63115

64116
def judge_tool_calls(
65117
client: OpenAI,

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)