Skip to content

Commit 4a3734d

Browse files
authored
Merge pull request #2 from atasoglu/feat/parallel-generation
feat: Add parallel generation support with multiprocessing and update…
2 parents 74cd385 + 058aed5 commit 4a3734d

File tree

13 files changed

+642
-194
lines changed

13 files changed

+642
-194
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,15 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve
88

99
Nothing yet.
1010

11+
## [0.2.0] - 2025-11-09
12+
### Added
13+
- Parallel generation support with multiprocessing via `--workers` and `--worker-batch-size` CLI flags
14+
- `num_workers` and `worker_batch_size` configuration options in `GenerationConfig`
15+
- Parallel generation example in `examples/parallel/`
16+
17+
### Fixed
18+
- Fixed tool subset diversity preservation in parallel mode by sorting records by original sample index before assigning final IDs
19+
1120
## [0.1.4] - 2025-11-09
1221
### Changed
1322
- Made `max_tokens` optional across all chat completion helpers and dataset flows so callers can rely on model defaults unless a limit is explicitly set.

README.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ ToolsGen automates the creation of tool-calling datasets for training and evalua
2323
- **Hugging Face Ready**: JSONL output format compatible with Hugging Face datasets
2424
- **Configurable Quality Control**: Adjustable scoring thresholds and retry mechanisms
2525
- **Train/Val Splitting**: Built-in dataset splitting for model training workflows
26+
- **Parallel Generation**: Multiprocessing pipeline to accelerate dataset creation on multi-core hosts
2627

2728
## Requirements
2829

@@ -62,9 +63,19 @@ toolsgen generate \
6263
--strategy param_aware \
6364
--seed 42 \
6465
--train-split 0.9 \
66+
--workers 4 \
67+
--worker-batch-size 8 \
6568
--problem-model gpt-4o-mini --problem-temp 0.9 \
6669
--caller-model gpt-4o --caller-temp 0.3 \
6770
--judge-model gpt-4o --judge-temp 0.0
71+
72+
# Parallel generation with 6 workers processing 4 samples per task
73+
toolsgen generate \
74+
--tools tools.json \
75+
--out output_dir \
76+
--num 500 \
77+
--workers 6 \
78+
--worker-batch-size 4
6879
```
6980

7081
### Python API Usage
@@ -87,6 +98,8 @@ gen_config = GenerationConfig(
8798
train_split=0.9, # 90% train, 10% validation
8899
batch_size=10, # optional: iterate tools in batches
89100
shuffle_tools=True, # optional: reshuffle tools between batches
101+
num_workers=4, # enable multiprocessing
102+
worker_batch_size=2, # samples per worker task
90103
)
91104

92105
model_config = ModelConfig(
@@ -213,7 +226,7 @@ For detailed information about the system architecture, pipeline, and core compo
213226
### Planned Features
214227
- [ ] Multi-turn conversation support
215228
- [ ] Custom prompt template system
216-
- [ ] Parallel generation with multiprocessing
229+
- [x] Parallel generation with multiprocessing
217230
- [ ] Additional sampling strategies (coverage-based, difficulty-based)
218231
- [ ] Integration with Hugging Face Hub for direct dataset uploads
219232
- [ ] Support for more LLM providers (Anthropic, Cohere, etc.)

examples/parallel/example.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""Parallel generation example - accelerate dataset creation with multiprocessing."""
2+
3+
from pathlib import Path
4+
5+
from dotenv import load_dotenv
6+
7+
from toolsgen import GenerationConfig, ModelConfig, generate_dataset
8+
9+
# Automatically load environment variables from .env if present
10+
load_dotenv()
11+
12+
# NOTE: Set OPENAI_API_KEY environment variable before running
13+
# e.g. export OPENAI_API_KEY="your-api-key-here"
14+
15+
16+
def main() -> None:
17+
tools_path = Path(__file__).parent / "tools.json"
18+
output_dir = Path(__file__).parent / "output"
19+
20+
# Configure multiprocessing with 6 workers, each handling 3 samples per task
21+
# Shuffle tools between batches to mix coverage while the workers run in parallel
22+
# Increase num_samples to better showcase the throughput benefits of multiprocessing
23+
24+
gen_config = GenerationConfig(
25+
num_samples=30,
26+
strategy="semantic",
27+
seed=2025,
28+
max_attempts=3,
29+
batch_size=2,
30+
shuffle_tools=True,
31+
num_workers=6,
32+
worker_batch_size=3,
33+
)
34+
35+
# Single model configuration shared across roles
36+
model_config = ModelConfig(
37+
model="gpt-4o-mini",
38+
temperature=0.6,
39+
)
40+
41+
manifest = generate_dataset(
42+
output_dir, gen_config, model_config, tools_path=tools_path
43+
)
44+
45+
print(
46+
f"\n✓ Parallel run complete: {manifest['num_generated']}/{manifest['num_requested']} records"
47+
)
48+
if manifest["num_failed"]:
49+
print(f" Failed attempts: {manifest['num_failed']}")
50+
print(f" Workers used: {gen_config.num_workers}")
51+
print(f" Worker batch size: {gen_config.worker_batch_size}")
52+
print(f" Output directory: {output_dir}")
53+
54+
55+
if __name__ == "__main__":
56+
main()

examples/parallel/tools.json

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
[
2+
{
3+
"function": {
4+
"description": "Get current weather for a location",
5+
"name": "get_weather",
6+
"parameters": {
7+
"properties": {
8+
"location": {
9+
"description": "City name or coordinates",
10+
"type": "string"
11+
},
12+
"unit": {
13+
"description": "Temperature unit",
14+
"enum": [
15+
"celsius",
16+
"fahrenheit"
17+
],
18+
"type": "string"
19+
}
20+
},
21+
"required": [
22+
"location"
23+
],
24+
"type": "object"
25+
}
26+
},
27+
"type": "function"
28+
},
29+
{
30+
"function": {
31+
"description": "Perform basic arithmetic operations",
32+
"name": "calculate",
33+
"parameters": {
34+
"properties": {
35+
"a": {
36+
"description": "First number",
37+
"type": "number"
38+
},
39+
"b": {
40+
"description": "Second number",
41+
"type": "number"
42+
},
43+
"operation": {
44+
"description": "Arithmetic operation",
45+
"enum": [
46+
"add",
47+
"subtract",
48+
"multiply",
49+
"divide"
50+
],
51+
"type": "string"
52+
}
53+
},
54+
"required": [
55+
"operation",
56+
"a",
57+
"b"
58+
],
59+
"type": "object"
60+
}
61+
},
62+
"type": "function"
63+
}
64+
]

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ toolsgen = ["prompts/*.txt"]
1313

1414
[project]
1515
name = "toolsgen"
16-
version = "0.1.4"
16+
version = "0.2.0"
1717
description = "Generate tool-calling datasets from OpenAI-compatible tool specs"
1818
readme = "README.md"
1919
requires-python = ">=3.9"

src/toolsgen/cli.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,18 @@ def create_parser() -> argparse.ArgumentParser:
9696
action="store_true",
9797
help="Shuffle tool order before batching (default: disabled)",
9898
)
99+
gen_parser.add_argument(
100+
"--workers",
101+
type=int,
102+
default=1,
103+
help="Number of worker processes for generation (default: 1)",
104+
)
105+
gen_parser.add_argument(
106+
"--worker-batch-size",
107+
type=int,
108+
default=1,
109+
help="Number of samples each worker processes per task (default: 1)",
110+
)
99111

100112
# Model config
101113
gen_parser.add_argument(
@@ -187,6 +199,14 @@ def cmd_generate(args: argparse.Namespace) -> None:
187199
print("Error: --temperature must be between 0.0 and 2.0", file=sys.stderr)
188200
sys.exit(1)
189201

202+
if args.workers < 1:
203+
print("Error: --workers must be at least 1", file=sys.stderr)
204+
sys.exit(1)
205+
206+
if args.worker_batch_size < 1:
207+
print("Error: --worker-batch-size must be at least 1", file=sys.stderr)
208+
sys.exit(1)
209+
190210
# Create generation config
191211
gen_config = GenerationConfig(
192212
num_samples=args.num,
@@ -197,6 +217,8 @@ def cmd_generate(args: argparse.Namespace) -> None:
197217
max_attempts=args.max_attempts,
198218
batch_size=args.batch_size,
199219
shuffle_tools=args.shuffle_tools,
220+
num_workers=args.workers,
221+
worker_batch_size=args.worker_batch_size,
200222
)
201223

202224
# Create model config

src/toolsgen/core/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ class GenerationConfig:
2121
k_max: Maximum number of tools per sample. Default None (uses all available tools).
2222
batch_size: Optional chunk size for tool batching. Default None (single batch).
2323
shuffle_tools: Whether to shuffle tools before batching. Default False.
24+
num_workers: Number of concurrent worker processes. Default 1 (sequential).
25+
worker_batch_size: Samples processed per worker task submission. Default 1.
2426
"""
2527

2628
num_samples: int = 10
@@ -33,6 +35,8 @@ class GenerationConfig:
3335
k_max: Optional[int] = None
3436
batch_size: Optional[int] = None
3537
shuffle_tools: bool = False
38+
num_workers: int = 1
39+
worker_batch_size: int = 1
3640

3741

3842
@dataclass

0 commit comments

Comments
 (0)