Merge pull request #5 from atasoglu/develop

atasoglu · web-flow · commit d3adb980c305 · 2025-11-10T22:35:08.000+03:00
chore(release): update version to 0.4.0 with new quality tagging syst…
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,14 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve
 
 Nothing yet.
 
+## [0.4.0] - 2025-11-10
+### Added
+- Quality tagging system for generated records
+  - `generate_quality_tags()` method in `JudgeResponse` to automatically tag samples based on judge scores
+  - Tags include overall quality levels (`high_quality`/`medium_quality`/`low_quality`) and dimension-specific tags (`excellent_tool_selection`/`poor_tool_selection`, `excellent_arguments`/`poor_arguments`, `high_clarity`/`low_clarity`)
+  - Configurable thresholds for quality classification
+  - `quality_tags` field automatically populated in generated records
+
 ## [0.3.0] - 2025-01-10
 ### Added
 - Hugging Face dataset integration utilities in `examples/nano_tool_calling_v1/`
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,7 @@ toolsgen = ["prompts/*.txt"]
 
 [project]
 name = "toolsgen"
-version = "0.3.0"
+version = "0.4.0"
 description = "Generate tool-calling datasets from OpenAI-compatible tool specs"
 readme = "README.md"
 requires-python = ">=3.9"
diff --git a/src/toolsgen/core/record_builder.py b/src/toolsgen/core/record_builder.py
@@ -54,6 +54,7 @@ def _build_record(
         "model": role_config.judge.model,
         "temperature": role_config.judge.temperature,
     }
+    quality_tags = []
     try:
         judge_result = judge_tool_calls(
             client=judge_client,
@@ -65,6 +66,7 @@ def _build_record(
             max_tokens=role_config.judge.max_tokens,
         )
         judge_dict.update(judge_result.to_dict())
+        quality_tags = judge_result.generate_quality_tags()
     except Exception:
         pass  # Continue without judge data
 
@@ -76,6 +78,7 @@ def _build_record(
         assistant_calls=tool_calls,
         problem_metadata={"generated": True, "user_request": user_request},
         judge=judge_dict,
+        quality_tags=quality_tags,
         tools_metadata={"num_tools": len(tools)},
     )
 
diff --git a/src/toolsgen/judge.py b/src/toolsgen/judge.py
@@ -60,6 +60,58 @@ def to_dict(self) -> Dict[str, Any]:
             "rubric_version": "0.1.0",
         }
 
+    def generate_quality_tags(
+        self,
+        high_quality_threshold: float = 0.9,
+        medium_quality_threshold: float = 0.7,
+        excellent_dimension_pct: float = 0.875,
+        poor_dimension_pct: float = 0.5,
+    ) -> List[str]:
+        """Generate quality tags based on scores.
+
+        Cut-offs are inclusive and compared with a small tolerance: products
+        such as ``0.4 * 0.875`` evaluate to ``0.35000000000000003``, so a plain
+        ``>=`` would deny the "excellent" tag to a score of exactly ``0.35``.
+
+        Args:
+            high_quality_threshold: Overall score threshold for high_quality tag (default: 0.9).
+            medium_quality_threshold: Overall score threshold for medium_quality tag (default: 0.7).
+            excellent_dimension_pct: Percentage of max score for excellent tags (default: 0.875 = 87.5%).
+            poor_dimension_pct: Percentage of max score for poor tags (default: 0.5 = 50%).
+
+        Returns:
+            List of quality tags describing the sample: the overall tag first,
+            then at most one tag each for tool selection, arguments, clarity.
+        """
+        eps = 1e-9  # Absorbs float rounding in threshold products and scores.
+
+        def reaches(value: float, cutoff: float) -> bool:
+            """Return True when value meets cutoff, allowing for rounding."""
+            return value >= cutoff - eps
+
+        # Overall quality
+        if reaches(self.score, high_quality_threshold):
+            tags = ["high_quality"]
+        elif reaches(self.score, medium_quality_threshold):
+            tags = ["medium_quality"]
+        else:
+            tags = ["low_quality"]
+
+        # Dimension-specific tags (based on percentage of max possible score).
+        # Each row: (value, max possible score, excellent tag, poor tag).
+        dimensions = (
+            (self.tool_relevance, 0.4, "excellent_tool_selection", "poor_tool_selection"),
+            (self.argument_quality, 0.4, "excellent_arguments", "poor_arguments"),
+            (self.clarity, 0.2, "high_clarity", "low_clarity"),
+        )
+        for value, max_score, excellent_tag, poor_tag in dimensions:
+            if reaches(value, max_score * excellent_dimension_pct):
+                tags.append(excellent_tag)
+            elif not reaches(value, max_score * poor_dimension_pct):
+                tags.append(poor_tag)
+
+        return tags
+
 
 def judge_tool_calls(
     client: OpenAI,
diff --git a/uv.lock b/uv.lock