Skip to content

Commit 4a3734d

Browse files
authored
Merge pull request #2 from atasoglu/feat/parallel-generation
feat: Add parallel generation support with multiprocessing and update…
2 parents 74cd385 + 058aed5 commit 4a3734d

File tree

13 files changed

+642
-194
lines changed

13 files changed

+642
-194
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,15 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve
88

99
Nothing yet.
1010

11+
## [0.2.0] - 2025-11-09
12+
### Added
13+
- Parallel generation support with multiprocessing via `--workers` and `--worker-batch-size` CLI flags
14+
- `num_workers` and `worker_batch_size` configuration options in `GenerationConfig`
15+
- Parallel generation example in `examples/parallel/`
16+
17+
### Fixed
18+
- Fixed tool subset diversity preservation in parallel mode by sorting records by original sample index before assigning final IDs
19+
1120
## [0.1.4] - 2025-11-09
1221
### Changed
1322
- Made `max_tokens` optional across all chat completion helpers and dataset flows so callers can rely on model defaults unless a limit is explicitly set.

README.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ ToolsGen automates the creation of tool-calling datasets for training and evalua
2323
- **Hugging Face Ready**: JSONL output format compatible with Hugging Face datasets
2424
- **Configurable Quality Control**: Adjustable scoring thresholds and retry mechanisms
2525
- **Train/Val Splitting**: Built-in dataset splitting for model training workflows
26+
- **Parallel Generation**: Multiprocessing pipeline to accelerate dataset creation on multi-core hosts
2627

2728
## Requirements
2829

@@ -62,9 +63,19 @@ toolsgen generate \
6263
--strategy param_aware \
6364
--seed 42 \
6465
--train-split 0.9 \
66+
--workers 4 \
67+
--worker-batch-size 8 \
6568
--problem-model gpt-4o-mini --problem-temp 0.9 \
6669
--caller-model gpt-4o --caller-temp 0.3 \
6770
--judge-model gpt-4o --judge-temp 0.0
71+
72+
# Parallel generation with 6 workers processing 4 samples per task
73+
toolsgen generate \
74+
--tools tools.json \
75+
--out output_dir \
76+
--num 500 \
77+
--workers 6 \
78+
--worker-batch-size 4
6879
```
6980

7081
### Python API Usage
@@ -87,6 +98,8 @@ gen_config = GenerationConfig(
8798
train_split=0.9, # 90% train, 10% validation
8899
batch_size=10, # optional: iterate tools in batches
89100
shuffle_tools=True, # optional: reshuffle tools between batches
101+
num_workers=4, # enable multiprocessing
102+
worker_batch_size=2, # samples per worker task
90103
)
91104

92105
model_config = ModelConfig(
@@ -213,7 +226,7 @@ For detailed information about the system architecture, pipeline, and core compo
213226
### Planned Features
214227
- [ ] Multi-turn conversation support
215228
- [ ] Custom prompt template system
216-
- [ ] Parallel generation with multiprocessing
229+
- [x] Parallel generation with multiprocessing
217230
- [ ] Additional sampling strategies (coverage-based, difficulty-based)
218231
- [ ] Integration with Hugging Face Hub for direct dataset uploads
219232
- [ ] Support for more LLM providers (Anthropic, Cohere, etc.)

examples/parallel/example.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""Parallel generation example - accelerate dataset creation with multiprocessing."""
2+
3+
from pathlib import Path
4+
5+
from dotenv import load_dotenv
6+
7+
from toolsgen import GenerationConfig, ModelConfig, generate_dataset
8+
9+
# Automatically load environment variables from .env if present
10+
load_dotenv()
11+
12+
# NOTE: Set OPENAI_API_KEY environment variable before running
13+
# e.g. export OPENAI_API_KEY="your-api-key-here"
14+
15+
16+
def main() -> None:
17+
tools_path = Path(__file__).parent / "tools.json"
18+
output_dir = Path(__file__).parent / "output"
19+
20+
# Configure multiprocessing with 6 workers, each handling 3 samples per task
21+
# Shuffle tools between batches to mix coverage while the workers run in parallel
22+
# Increase num_samples to better showcase the throughput benefits of multiprocessing
23+
24+
gen_config = GenerationConfig(
25+
num_samples=30,
26+
strategy="semantic",
27+
seed=2025,
28+
max_attempts=3,
29+
batch_size=2,
30+
shuffle_tools=True,
31+
num_workers=6,
32+
worker_batch_size=3,
33+
)
34+
35+
# Single model configuration shared across roles
36+
model_config = ModelConfig(
37+
model="gpt-4o-mini",
38+
temperature=0.6,
39+
)
40+
41+
manifest = generate_dataset(
42+
output_dir, gen_config, model_config, tools_path=tools_path
43+
)
44+
45+
print(
46+
f"\n✓ Parallel run complete: {manifest['num_generated']}/{manifest['num_requested']} records"
47+
)
48+
if manifest["num_failed"]:
49+
print(f" Failed attempts: {manifest['num_failed']}")
50+
print(f" Workers used: {gen_config.num_workers}")
51+
print(f" Worker batch size: {gen_config.worker_batch_size}")
52+
print(f" Output directory: {output_dir}")
53+
54+
55+
if __name__ == "__main__":
56+
main()

examples/parallel/tools.json

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
[
2+
{
3+
"function": {
4+
"description": "Get current weather for a location",
5+
"name": "get_weather",
6+
"parameters": {
7+
"properties": {
8+
"location": {
9+
"description": "City name or coordinates",
10+
"type": "string"
11+
},
12+
"unit": {
13+
"description": "Temperature unit",
14+
"enum": [
15+
"celsius",
16+
"fahrenheit"
17+
],
18+
"type": "string"
19+
}
20+
},
21+
"required": [
22+
"location"
23+
],
24+
"type": "object"
25+
}
26+
},
27+
"type": "function"
28+
},
29+
{
30+
"function": {
31+
"description": "Perform basic arithmetic operations",
32+
"name": "calculate",
33+
"parameters": {
34+
"properties": {
35+
"a": {
36+
"description": "First number",
37+
"type": "number"
38+
},
39+
"b": {
40+
"description": "Second number",
41+
"type": "number"
42+
},
43+
"operation": {
44+
"description": "Arithmetic operation",
45+
"enum": [
46+
"add",
47+
"subtract",
48+
"multiply",
49+
"divide"
50+
],
51+
"type": "string"
52+
}
53+
},
54+
"required": [
55+
"operation",
56+
"a",
57+
"b"
58+
],
59+
"type": "object"
60+
}
61+
},
62+
"type": "function"
63+
}
64+
]

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ toolsgen = ["prompts/*.txt"]
1313

1414
[project]
1515
name = "toolsgen"
16-
version = "0.1.4"
16+
version = "0.2.0"
1717
description = "Generate tool-calling datasets from OpenAI-compatible tool specs"
1818
readme = "README.md"
1919
requires-python = ">=3.9"

src/toolsgen/cli.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,18 @@ def create_parser() -> argparse.ArgumentParser:
9696
action="store_true",
9797
help="Shuffle tool order before batching (default: disabled)",
9898
)
99+
gen_parser.add_argument(
100+
"--workers",
101+
type=int,
102+
default=1,
103+
help="Number of worker processes for generation (default: 1)",
104+
)
105+
gen_parser.add_argument(
106+
"--worker-batch-size",
107+
type=int,
108+
default=1,
109+
help="Number of samples each worker processes per task (default: 1)",
110+
)
99111

100112
# Model config
101113
gen_parser.add_argument(
@@ -187,6 +199,14 @@ def cmd_generate(args: argparse.Namespace) -> None:
187199
print("Error: --temperature must be between 0.0 and 2.0", file=sys.stderr)
188200
sys.exit(1)
189201

202+
if args.workers < 1:
203+
print("Error: --workers must be at least 1", file=sys.stderr)
204+
sys.exit(1)
205+
206+
if args.worker_batch_size < 1:
207+
print("Error: --worker-batch-size must be at least 1", file=sys.stderr)
208+
sys.exit(1)
209+
190210
# Create generation config
191211
gen_config = GenerationConfig(
192212
num_samples=args.num,
@@ -197,6 +217,8 @@ def cmd_generate(args: argparse.Namespace) -> None:
197217
max_attempts=args.max_attempts,
198218
batch_size=args.batch_size,
199219
shuffle_tools=args.shuffle_tools,
220+
num_workers=args.workers,
221+
worker_batch_size=args.worker_batch_size,
200222
)
201223

202224
# Create model config

src/toolsgen/core/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ class GenerationConfig:
2121
k_max: Maximum number of tools per sample. Default None (uses all available tools).
2222
batch_size: Optional chunk size for tool batching. Default None (single batch).
2323
shuffle_tools: Whether to shuffle tools before batching. Default False.
24+
num_workers: Number of concurrent worker processes. Default 1 (sequential).
25+
worker_batch_size: Samples processed per worker task submission. Default 1.
2426
"""
2527

2628
num_samples: int = 10
@@ -33,6 +35,8 @@ class GenerationConfig:
3335
k_max: Optional[int] = None
3436
batch_size: Optional[int] = None
3537
shuffle_tools: bool = False
38+
num_workers: int = 1
39+
worker_batch_size: int = 1
3640

3741

3842
@dataclass

0 commit comments

Comments
 (0)