Skip to content

Commit 1d59d92

Browse files
authored
Merge pull request #7 from atasoglu/example/turkish-tool-calling-v1
Example/turkish tool calling v1
2 parents 74a62ac + d541dd2 commit 1d59d92

File tree

12 files changed

+717
-1
lines changed

12 files changed

+717
-1
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,4 +205,5 @@ __marimo__/
205205
.cursor/
206206

207207
# Example outputs
208-
examples/*/output/
208+
examples/*/*output*/
209+
*.jsonl
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# Turkish Tool Calling v1
2+
3+
A synthetic Turkish tool-calling dataset generated using [ToolsGen](https://github.com/atasoglu/toolsgen) with Qwen models via OpenRouter.
4+
5+
## Dataset Details
6+
7+
- **Generated with**: ToolsGen
8+
- **Total Samples**: 1,000
9+
- **Language**: Turkish
10+
- **Format**: Single-turn conversations with tool calls
11+
12+
### Models Used
13+
14+
- **Problem Generator**: qwen/qwen3-235b-a22b-2507 (temp=1.0)
15+
- **Tool Caller**: qwen/qwen3-235b-a22b-2507 (temp=0.0)
16+
- **Judge**: qwen/qwen3-235b-a22b-2507 (temp=0.0)
17+
18+
## Dataset Structure
19+
20+
Each record contains:
21+
22+
```json
23+
{
24+
"id": "record_000000",
25+
"language": "turkish",
26+
"tools": [...],
27+
"messages": [
28+
{"role": "user", "content": "İstanbul'da hava durumu nasıl?"}
29+
],
30+
"assistant_calls": [
31+
{
32+
"id": "call_...",
33+
"type": "function",
34+
"function": {
35+
"name": "get_weather",
36+
"arguments": "{\"location\": \"Istanbul, Turkey\"}"
37+
}
38+
}
39+
],
40+
"problem_metadata": {...},
41+
"judge": {
42+
"tool_relevance": 0.4,
43+
"argument_quality": 0.38,
44+
"clarity": 0.2,
45+
"score": 0.98,
46+
"verdict": "accept",
47+
"rationale": "...",
48+
"rubric_version": "0.1.0",
49+
"model": "qwen/qwen3-235b-a22b-2507",
50+
"temperature": 0.0
51+
},
52+
"quality_tags": [],
53+
"tools_metadata": {"num_tools": 2}
54+
}
55+
```
56+
57+
## Generation Details
58+
59+
### Configuration
60+
61+
- **Strategy**: Random tool sampling
62+
- **Tools per sample**: 1-8 (k_min=1, k_max=8)
63+
- **Max attempts**: 1
64+
- **Train split**: 80%
65+
- **Seed**: Random (1-10M range)
66+
67+
### Quality Control
68+
69+
All samples passed through an LLM-as-a-judge evaluation with a multi-dimensional rubric:
70+
71+
- **Tool Relevance** (40%): Are the selected tools appropriate?
72+
- **Argument Quality** (38%): Are arguments valid and plausible?
73+
- **Clarity** (20%): Is the response complete and clear?
74+
75+
Samples with `score >= 0.7` and `verdict == "accept"` are included.
76+
77+
## Usage
78+
79+
```python
80+
from datasets import load_dataset
81+
82+
dataset = load_dataset("atasoglu/turkish-tool-calling-v1")
83+
84+
# Access a sample
85+
sample = dataset["train"][0]
86+
print(sample["messages"])
87+
print(sample["assistant_calls"])
88+
```
89+
90+
## Limitations
91+
92+
- Single-turn conversations only
93+
- Turkish language only
94+
- Synthetic data generated by LLMs (may contain artifacts)
95+
- No actual tool execution or validation
96+
- Judge scores are model-based assessments
97+
98+
## Citation
99+
100+
```bibtex
101+
@software{toolsgen2025,
102+
title = {ToolsGen: Synthetic Tool-Calling Dataset Generator},
103+
author = {Ataşoğlu, Ahmet},
104+
year = {2025},
105+
url = {https://github.com/atasoglu/toolsgen}
106+
}
107+
```
108+
109+
## License
110+
111+
MIT License
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from random import randint

from toolsgen import (
    GenerationConfig,
    ModelConfig,
    RoleBasedModelConfig,
)

# A single Qwen model serves all three pipeline roles; only temperature differs.
_MODEL_ID = "qwen/qwen3-235b-a22b-2507"

# Fresh random seed per run so repeated generations produce different samples.
seed = randint(1, 10_000_000)
print(f"Using seed: {seed}")

# Route all OpenAI-compatible requests through OpenRouter.
openai_params = dict(
    base_url="https://openrouter.ai/api/v1",
)

# Dataset-level generation settings: 1000 Turkish single-turn samples,
# 1-8 randomly sampled (and shuffled) tools per sample, 80/20 train split.
gen_config = GenerationConfig(
    num_samples=1000,
    strategy="random",
    seed=seed,
    train_split=0.8,
    language="turkish",
    max_attempts=1,
    k_min=1,
    k_max=8,
    shuffle_tools=True,
)


def _role_model(temperature: float) -> ModelConfig:
    """Build the shared ModelConfig for one pipeline role at *temperature*."""
    return ModelConfig(
        model=_MODEL_ID,
        temperature=temperature,
        openai_params=openai_params,
        max_tokens=500,
    )


# Per-role model assignments: a creative problem generator, and
# deterministic (temperature 0) tool caller and judge.
role_config = RoleBasedModelConfig(
    problem_generator=_role_model(1.0),
    tool_caller=_role_model(0),
    judge=_role_model(0),
)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from pathlib import Path
2+
from dotenv import load_dotenv
3+
from preprocessing import load_tools_from_file
4+
from config import gen_config, role_config
5+
from toolsgen import generate_dataset
6+
from uuid import uuid4
7+
8+
# Load environment variables (e.g. API keys for OpenRouter) from a .env file.
load_dotenv()


def main() -> None:
    """Generate the Turkish tool-calling dataset and print a run summary."""
    # Load the tool definitions prepared by the preprocessing step.
    tools = list(load_tools_from_file("tools.jsonl"))
    print("Loaded tools from file.")
    print(f"Number of tools loaded: {len(tools)}")

    # Unique output directory per run (random hex suffix — not a timestamp).
    output_dir = Path(__file__).parent / f"output_{uuid4().hex}"

    # Run the full generation pipeline (problem generator → tool caller → judge).
    manifest = generate_dataset(output_dir, gen_config, role_config, tools=tools)

    # Summarize the run for the operator.
    print(
        f"\n✓ Generated {manifest['num_generated']}/{manifest['num_requested']} records"
    )
    if manifest["num_failed"] > 0:
        print(f" Failed: {manifest['num_failed']} attempts")
    print(f" Problem Generator: {role_config.problem_generator.model}")
    print(f" Tool Caller: {role_config.tool_caller.model}")
    print(f" Judge: {role_config.judge.model}")
    print(f" Output: {output_dir}")


if __name__ == "__main__":
    main()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Public API of the postprocessing package: filesystem streaming helpers
# used to merge per-run JSONL outputs into one file.
from .streamer import get_dirs, get_jsonl_files, read_lines, save_line

__all__ = ["get_dirs", "get_jsonl_files", "read_lines", "save_line"]
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import json
2+
from .streamer import get_dirs, get_jsonl_files, read_lines, save_line
3+
from pathlib import Path
4+
from typing import Optional
5+
6+
7+
def postprocess(line: str, max_newlines: int = 10) -> Optional[dict]:
    """Parse one JSONL line, rejecting records with too many escaped newlines.

    A raw JSONL line is a single physical line, so newlines inside its JSON
    string values appear as the two-character escape sequence ``\\n``.  Lines
    whose total escape count exceeds *max_newlines*, or that are not valid
    JSON, are rejected.

    Args:
        line: One raw line read from a JSONL file.
        max_newlines: Maximum number of ``\\n`` escape sequences allowed.

    Returns:
        The decoded record, or ``None`` when the line is rejected.
    """
    # Count literal backslash-n escapes (r"\n"), not real newline characters;
    # reject directly instead of raising an exception just to catch it below.
    if line.count(r"\n") > max_newlines:
        return None
    try:
        return json.loads(line)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; malformed lines are
        # filtered out rather than aborting the whole run.
        return None
15+
16+
17+
def main() -> None:
    """Merge every per-run output directory's JSONL files into one file.

    Walks each subdirectory of the current working directory, postprocesses
    every line of every ``*.jsonl`` file found, re-numbers accepted records
    sequentially, and appends them to ``postprocessed.jsonl``.
    """
    success = 0
    failed = 0
    base_dir = Path.cwd()
    output = base_dir / "postprocessed.jsonl"
    output.touch(exist_ok=True)
    # `subdir` rather than `dir` — avoid shadowing the builtin.
    for subdir in get_dirs(base_dir):
        for jsonl_file in get_jsonl_files(subdir):
            for line in read_lines(jsonl_file):
                json_dict = postprocess(line)
                if json_dict is not None:
                    success += 1
                    # Re-number accepted records so ids stay contiguous
                    # across the merged source files.
                    json_dict["id"] = f"record_{success:06d}"
                    save_line(output, json.dumps(json_dict, ensure_ascii=False))
                else:
                    failed += 1
                # In-place progress line (carriage return, no newline).
                print(
                    f"\rProcessed lines: {success + failed} (Success: {success}, Failed: {failed})",
                    end="\r",
                )
    # "Processed" = every line seen, matching the progress line above
    # (the original printed only `success` here, under the wrong label).
    print(f"\nTotal processed lines: {success + failed}")
    print(f"Total failed lines: {failed}")


if __name__ == "__main__":
    main()
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from pathlib import Path
2+
from typing import Generator
3+
4+
5+
def get_jsonl_files(directory: Path) -> Generator[Path, None, None]:
    """Yield every ``*.jsonl`` file directly inside *directory*."""
    for path in directory.glob("*.jsonl"):
        yield path
8+
9+
10+
def get_dirs(directory: Path) -> Generator[Path, None, None]:
    """Yield the immediate subdirectories of *directory*."""
    yield from (entry for entry in directory.iterdir() if entry.is_dir())
15+
16+
17+
def read_lines(file_path: Path) -> Generator[str, None, None]:
    """Yield each line of *file_path*, stripped of surrounding whitespace."""
    with file_path.open("r", encoding="utf-8") as handle:
        yield from (raw.strip() for raw in handle)
22+
23+
24+
def save_line(file_path: Path, line: str):
    """Append *line*, followed by a newline, to *file_path*."""
    with file_path.open("a", encoding="utf-8") as handle:
        handle.write(f"{line}\n")
28+
29+
30+
def count_newlines(s: str) -> int:
    """Return how many newline characters appear in *s*."""
    # Splitting on "\n" produces one more segment than there are newlines.
    return len(s.split("\n")) - 1
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Public API of the preprocessing package: stream tool definitions from the
# source datasets and round-trip them through a JSONL file.
from .streamer import (
    stream_tools_from_datasets,
    save_tools_to_file,
    load_tools_from_file,
)

__all__ = ["stream_tools_from_datasets", "save_tools_to_file", "load_tools_from_file"]
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import time
2+
from .streamer import stream_tools_from_datasets, save_tools_to_file
3+
from dotenv import load_dotenv
4+
from pathlib import Path
5+
from typing import Generator
6+
7+
# Load environment variables (presumably dataset/API credentials — verify
# against the .env file) before any dataset access.
load_dotenv()
# The merged tool definitions are written at the example root, one level
# above this package directory.
example_dir = Path(__file__).parent.parent
file_path = example_dir / "tools.jsonl"
10+
11+
12+
def stream_wrapper(stream: Generator[dict, None, None]) -> Generator[dict, None, None]:
    """Pass *stream* through unchanged while reporting progress and timing.

    Yields every item of *stream* as-is, printing an in-place progress
    counter after each item and a final summary with the elapsed time.
    """
    processed = 0
    started = time.time()
    for item in stream:
        processed += 1
        yield item
        # Progress is reported after the consumer has handled the item.
        print(f"Processed {processed} tools...", end="\r")
    elapsed = time.time() - started
    print(f"Finished processing {processed} tool definitions in {elapsed:.2f} seconds.")
21+
22+
23+
def main():
    """Collect tool definitions from the source datasets into tools.jsonl."""
    dataset_ids = [
        "argilla/Synth-APIGen-v0.1",
        "Salesforce/xlam-function-calling-60k",
        "argilla-warehouse/python-seed-tools",
    ]
    tool_stream = stream_tools_from_datasets(dataset_ids, debug=False)
    # Wrap the stream so progress is printed while the file is being written.
    save_tools_to_file(stream_wrapper(tool_stream), str(file_path))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)