Skip to content

Commit 1d59d92

Browse files
authored
Merge pull request #7 from atasoglu/example/turkish-tool-calling-v1
Example/turkish tool calling v1
2 parents 74a62ac + d541dd2 commit 1d59d92

File tree

12 files changed

+717
-1
lines changed

12 files changed

+717
-1
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,4 +205,5 @@ __marimo__/
205205
.cursor/
206206

207207
# Example outputs
208-
examples/*/output/
208+
examples/*/*output*/
209+
*.jsonl
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# Turkish Tool Calling v1
2+
3+
A synthetic Turkish tool-calling dataset generated using [ToolsGen](https://github.com/atasoglu/toolsgen) with Qwen models via OpenRouter.
4+
5+
## Dataset Details
6+
7+
- **Generated with**: ToolsGen
8+
- **Total Samples**: 1,000
9+
- **Language**: Turkish
10+
- **Format**: Single-turn conversations with tool calls
11+
12+
### Models Used
13+
14+
- **Problem Generator**: qwen/qwen3-235b-a22b-2507 (temp=1.0)
15+
- **Tool Caller**: qwen/qwen3-235b-a22b-2507 (temp=0.0)
16+
- **Judge**: qwen/qwen3-235b-a22b-2507 (temp=0.0)
17+
18+
## Dataset Structure
19+
20+
Each record contains:
21+
22+
```json
23+
{
24+
"id": "record_000000",
25+
"language": "turkish",
26+
"tools": [...],
27+
"messages": [
28+
{"role": "user", "content": "İstanbul'da hava durumu nasıl?"}
29+
],
30+
"assistant_calls": [
31+
{
32+
"id": "call_...",
33+
"type": "function",
34+
"function": {
35+
"name": "get_weather",
36+
"arguments": "{\"location\": \"Istanbul, Turkey\"}"
37+
}
38+
}
39+
],
40+
"problem_metadata": {...},
41+
"judge": {
42+
"tool_relevance": 0.4,
43+
"argument_quality": 0.38,
44+
"clarity": 0.2,
45+
"score": 0.98,
46+
"verdict": "accept",
47+
"rationale": "...",
48+
"rubric_version": "0.1.0",
49+
"model": "qwen/qwen3-235b-a22b-2507",
50+
"temperature": 0.0
51+
},
52+
"quality_tags": [],
53+
"tools_metadata": {"num_tools": 2}
54+
}
55+
```
56+
57+
## Generation Details
58+
59+
### Configuration
60+
61+
- **Strategy**: Random tool sampling
62+
- **Tools per sample**: 1-8 (k_min=1, k_max=8)
63+
- **Max attempts**: 1
64+
- **Train split**: 80%
65+
- **Seed**: Random (1-10M range)
66+
67+
### Quality Control
68+
69+
All samples passed through an LLM-as-a-judge evaluation with a multi-dimensional rubric:
70+
71+
- **Tool Relevance** (40%): Are the selected tools appropriate?
72+
- **Argument Quality** (38%): Are arguments valid and plausible?
73+
- **Clarity** (20%): Is the response complete and clear?
74+
75+
Samples with `score >= 0.7` and `verdict == "accept"` are included.
76+
77+
## Usage
78+
79+
```python
80+
from datasets import load_dataset
81+
82+
dataset = load_dataset("atasoglu/turkish-tool-calling-v1")
83+
84+
# Access a sample
85+
sample = dataset["train"][0]
86+
print(sample["messages"])
87+
print(sample["assistant_calls"])
88+
```
89+
90+
## Limitations
91+
92+
- Single-turn conversations only
93+
- Turkish language only
94+
- Synthetic data generated by LLMs (may contain artifacts)
95+
- No actual tool execution or validation
96+
- Judge scores are model-based assessments
97+
98+
## Citation
99+
100+
```bibtex
101+
@software{toolsgen2025,
102+
title = {ToolsGen: Synthetic Tool-Calling Dataset Generator},
103+
author = {Ataşoğlu, Ahmet},
104+
year = {2025},
105+
url = {https://github.com/atasoglu/toolsgen}
106+
}
107+
```
108+
109+
## License
110+
111+
MIT License
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from random import randint

from toolsgen import (
    GenerationConfig,
    ModelConfig,
    RoleBasedModelConfig,
)

# A single Qwen model serves all three pipeline roles; only temperature differs.
_MODEL_ID = "qwen/qwen3-235b-a22b-2507"

# Fresh random seed per run so repeated generations produce different samples.
seed = randint(1, 10_000_000)
print(f"Using seed: {seed}")

# Route all OpenAI-compatible requests through OpenRouter.
openai_params = dict(
    base_url="https://openrouter.ai/api/v1",
)

# Dataset-level generation settings: 1000 Turkish single-turn samples,
# 1-8 randomly sampled (and shuffled) tools per sample, 80/20 train split.
gen_config = GenerationConfig(
    num_samples=1000,
    strategy="random",
    seed=seed,
    train_split=0.8,
    language="turkish",
    max_attempts=1,
    k_min=1,
    k_max=8,
    shuffle_tools=True,
)


def _role_model(temperature: float) -> ModelConfig:
    """Build the shared ModelConfig for one pipeline role at *temperature*."""
    return ModelConfig(
        model=_MODEL_ID,
        temperature=temperature,
        openai_params=openai_params,
        max_tokens=500,
    )


# Per-role model assignments: a creative problem generator, and
# deterministic (temperature 0) tool caller and judge.
role_config = RoleBasedModelConfig(
    problem_generator=_role_model(1.0),
    tool_caller=_role_model(0),
    judge=_role_model(0),
)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from pathlib import Path
2+
from dotenv import load_dotenv
3+
from preprocessing import load_tools_from_file
4+
from config import gen_config, role_config
5+
from toolsgen import generate_dataset
6+
from uuid import uuid4
7+
8+
# Load environment variables (e.g. API keys for OpenRouter) from a .env file.
load_dotenv()


def main() -> None:
    """Generate the Turkish tool-calling dataset and print a run summary."""
    # Load the tool definitions prepared by the preprocessing step.
    tools = list(load_tools_from_file("tools.jsonl"))
    print("Loaded tools from file.")
    print(f"Number of tools loaded: {len(tools)}")

    # Unique output directory per run (random hex suffix — not a timestamp).
    output_dir = Path(__file__).parent / f"output_{uuid4().hex}"

    # Run the full generation pipeline (problem generator → tool caller → judge).
    manifest = generate_dataset(output_dir, gen_config, role_config, tools=tools)

    # Summarize the run for the operator.
    print(
        f"\n✓ Generated {manifest['num_generated']}/{manifest['num_requested']} records"
    )
    if manifest["num_failed"] > 0:
        print(f" Failed: {manifest['num_failed']} attempts")
    print(f" Problem Generator: {role_config.problem_generator.model}")
    print(f" Tool Caller: {role_config.tool_caller.model}")
    print(f" Judge: {role_config.judge.model}")
    print(f" Output: {output_dir}")


if __name__ == "__main__":
    main()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Public API of the postprocessing package: filesystem streaming helpers
# used to merge per-run JSONL outputs into one file.
from .streamer import get_dirs, get_jsonl_files, read_lines, save_line

__all__ = ["get_dirs", "get_jsonl_files", "read_lines", "save_line"]
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import json
2+
from .streamer import get_dirs, get_jsonl_files, read_lines, save_line
3+
from pathlib import Path
4+
from typing import Optional
5+
6+
7+
def postprocess(line: str, max_newlines: int = 10) -> Optional[dict]:
    """Parse one JSONL line, rejecting records with too many escaped newlines.

    A raw JSONL line is a single physical line, so newlines inside its JSON
    string values appear as the two-character escape sequence ``\\n``.  Lines
    whose total escape count exceeds *max_newlines*, or that are not valid
    JSON, are rejected.

    Args:
        line: One raw line read from a JSONL file.
        max_newlines: Maximum number of ``\\n`` escape sequences allowed.

    Returns:
        The decoded record, or ``None`` when the line is rejected.
    """
    # Count literal backslash-n escapes (r"\n"), not real newline characters;
    # reject directly instead of raising an exception just to catch it below.
    if line.count(r"\n") > max_newlines:
        return None
    try:
        return json.loads(line)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; malformed lines are
        # filtered out rather than aborting the whole run.
        return None
15+
16+
17+
def main() -> None:
    """Merge every per-run output directory's JSONL files into one file.

    Walks each subdirectory of the current working directory, postprocesses
    every line of every ``*.jsonl`` file found, re-numbers accepted records
    sequentially, and appends them to ``postprocessed.jsonl``.
    """
    success = 0
    failed = 0
    base_dir = Path.cwd()
    output = base_dir / "postprocessed.jsonl"
    output.touch(exist_ok=True)
    # `subdir` rather than `dir` — avoid shadowing the builtin.
    for subdir in get_dirs(base_dir):
        for jsonl_file in get_jsonl_files(subdir):
            for line in read_lines(jsonl_file):
                json_dict = postprocess(line)
                if json_dict is not None:
                    success += 1
                    # Re-number accepted records so ids stay contiguous
                    # across the merged source files.
                    json_dict["id"] = f"record_{success:06d}"
                    save_line(output, json.dumps(json_dict, ensure_ascii=False))
                else:
                    failed += 1
                # In-place progress line (carriage return, no newline).
                print(
                    f"\rProcessed lines: {success + failed} (Success: {success}, Failed: {failed})",
                    end="\r",
                )
    # "Processed" = every line seen, matching the progress line above
    # (the original printed only `success` here, under the wrong label).
    print(f"\nTotal processed lines: {success + failed}")
    print(f"Total failed lines: {failed}")


if __name__ == "__main__":
    main()
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from pathlib import Path
2+
from typing import Generator
3+
4+
5+
def get_jsonl_files(directory: Path) -> Generator[Path, None, None]:
    """Yield every ``*.jsonl`` file directly inside *directory*."""
    for path in directory.glob("*.jsonl"):
        yield path
8+
9+
10+
def get_dirs(directory: Path) -> Generator[Path, None, None]:
    """Yield the immediate subdirectories of *directory*."""
    yield from (entry for entry in directory.iterdir() if entry.is_dir())
15+
16+
17+
def read_lines(file_path: Path) -> Generator[str, None, None]:
    """Yield each line of *file_path*, stripped of surrounding whitespace."""
    with file_path.open("r", encoding="utf-8") as handle:
        yield from (raw.strip() for raw in handle)
22+
23+
24+
def save_line(file_path: Path, line: str):
    """Append *line*, followed by a newline, to *file_path*."""
    with file_path.open("a", encoding="utf-8") as handle:
        handle.write(f"{line}\n")
28+
29+
30+
def count_newlines(s: str) -> int:
    """Return how many newline characters appear in *s*."""
    # Splitting on "\n" produces one more segment than there are newlines.
    return len(s.split("\n")) - 1
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Public API of the preprocessing package: stream tool definitions from the
# source datasets and round-trip them through a JSONL file.
from .streamer import (
    stream_tools_from_datasets,
    save_tools_to_file,
    load_tools_from_file,
)

__all__ = ["stream_tools_from_datasets", "save_tools_to_file", "load_tools_from_file"]
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import time
2+
from .streamer import stream_tools_from_datasets, save_tools_to_file
3+
from dotenv import load_dotenv
4+
from pathlib import Path
5+
from typing import Generator
6+
7+
# Load environment variables (presumably dataset/API credentials — verify
# against the .env file) before any dataset access.
load_dotenv()
# The merged tool definitions are written at the example root, one level
# above this package directory.
example_dir = Path(__file__).parent.parent
file_path = example_dir / "tools.jsonl"
10+
11+
12+
def stream_wrapper(stream: Generator[dict, None, None]) -> Generator[dict, None, None]:
    """Pass *stream* through unchanged while reporting progress and timing.

    Yields every item of *stream* as-is, printing an in-place progress
    counter after each item and a final summary with the elapsed time.
    """
    processed = 0
    started = time.time()
    for item in stream:
        processed += 1
        yield item
        # Progress is reported after the consumer has handled the item.
        print(f"Processed {processed} tools...", end="\r")
    elapsed = time.time() - started
    print(f"Finished processing {processed} tool definitions in {elapsed:.2f} seconds.")
21+
22+
23+
def main():
    """Collect tool definitions from the source datasets into tools.jsonl."""
    dataset_ids = [
        "argilla/Synth-APIGen-v0.1",
        "Salesforce/xlam-function-calling-60k",
        "argilla-warehouse/python-seed-tools",
    ]
    tool_stream = stream_tools_from_datasets(dataset_ids, debug=False)
    # Wrap the stream so progress is printed while the file is being written.
    save_tools_to_file(stream_wrapper(tool_stream), str(file_path))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)