feat: Add Hugging Face Hub push functionality with CLI options and environment support

atasoglu · atasoglu · commit 1f55a33fbcbd · 2025-11-10T23:15:32.000+03:00
- Introduced command-line arguments for pushing datasets to Hugging Face Hub (`--push-to-hub`, `--repo-id`, `--hf-token`, `--private`)
- Updated generate command to optionally push generated datasets directly to HF Hub
- Added environment variable support for HF token via `.env` file using `python-dotenv`
- Created new `hf_hub.py` module for HF Hub upload implementation
- Enhanced README with example commands for generating and pushing datasets
- Updated `.env.example` to include optional `HF_TOKEN` for HF Hub authentication
diff --git a/.env.example b/.env.example
@@ -16,3 +16,6 @@ OPENAI_API_KEY=sk-your-api-key-here
 # vLLM / Local server
 # OPENAI_BASE_URL=http://localhost:8000/v1
 # OPENAI_API_KEY=dummy-key
+
+# Optional: Hugging Face token to push generated datasets
+# HF_TOKEN=your-huggingface-token-here
diff --git a/README.md b/README.md
@@ -76,16 +76,26 @@ toolsgen generate \
   --num 500 \
   --workers 6 \
   --worker-batch-size 4
+
+# Generate and push directly to Hugging Face Hub
+export HF_TOKEN="your-hf-token-here"
+toolsgen generate \
+  --tools tools.json \
+  --out output_dir \
+  --num 100 \
+  --push-to-hub \
+  --repo-id username/dataset-name
 ```
 
 ### Python API Usage
 
 ```python
-import os
 from pathlib import Path
+from dotenv import load_dotenv
+
 from toolsgen.core import GenerationConfig, ModelConfig, generate_dataset
 
-os.environ["OPENAI_API_KEY"] = "your-api-key-here"
+load_dotenv()  # Load from .env file
 
 # Configuration
 tools_path = Path("tools.json")
@@ -119,6 +129,50 @@ print(f"Generated {manifest['num_generated']}/{manifest['num_requested']} record
 print(f"Failed: {manifest['num_failed']} attempts")
 ```
 
+### Push to Hugging Face Hub
+
+```python
+from pathlib import Path
+from dotenv import load_dotenv
+
+from toolsgen import GenerationConfig, ModelConfig, generate_dataset, push_to_hub
+
+load_dotenv()  # Load from .env file
+
+tools_path = Path("tools.json")
+output_dir = Path("output")
+
+gen_config = GenerationConfig(
+    num_samples=100,
+    strategy="random",
+    seed=42,
+    train_split=0.9,
+)
+
+model_config = ModelConfig(
+    model="gpt-4o-mini",
+    temperature=0.7,
+)
+
+# Generate dataset
+manifest = generate_dataset(
+    output_dir=output_dir,
+    gen_config=gen_config,
+    model_config=model_config,
+    tools_path=tools_path,
+)
+
+# Push to Hub
+hub_info = push_to_hub(
+    output_dir=output_dir,
+    repo_id="username/dataset-name",
+    private=False,
+)
+
+print(f"Generated: {manifest['num_generated']} records")
+print(f"Repository: {hub_info['repo_url']}")
+```
+
 See `examples/` directory for complete working examples.
 
 **Note**: The examples in `examples/` use `python-dotenv` for convenience (load API keys from `.env` file). Install it with `pip install python-dotenv` if you want to use this approach.
@@ -228,7 +282,7 @@ For detailed information about the system architecture, pipeline, and core compo
 - [ ] Custom prompt template system
 - [x] Parallel generation with multiprocessing
 - [ ] Additional sampling strategies (coverage-based, difficulty-based)
-- [ ] Integration with Hugging Face Hub for direct dataset uploads
+- [x] Integration with Hugging Face Hub for direct dataset uploads
 - [ ] Support for more LLM providers (Anthropic, Cohere, etc.)
 - [ ] Web UI for dataset inspection and curation
 - [ ] Advanced filtering and deduplication
diff --git a/examples/hf_hub_upload/README.md b/examples/hf_hub_upload/README.md
@@ -0,0 +1,76 @@
+# Hugging Face Hub Upload Example
+
+This example demonstrates how to generate a dataset and push it directly to Hugging Face Hub.
+
+## Prerequisites
+
+1. OpenAI API key
+2. Hugging Face account and token with write access
+
+## Setup
+
+```bash
+# Install dependencies
+pip install toolsgen huggingface_hub python-dotenv
+
+# Create .env file from example
+cp .env.example .env
+
+# Edit .env and add your API keys
+# OPENAI_API_KEY=your-openai-api-key
+# HF_TOKEN=your-huggingface-token
+```
+
+## Usage
+
+### Python API
+
+```bash
+python example.py
+```
+
+Make sure to update the `repo_id` in the script to your own repository name.
+
+### CLI
+
+```bash
+toolsgen generate \
+  --tools ../basic/tools.json \
+  --out output \
+  --num 50 \
+  --push-to-hub \
+  --repo-id your-username/your-dataset-name
+```
+
+## What Gets Uploaded
+
+The following files are automatically uploaded to your HF Hub repository:
+
+- `train.jsonl` - Training dataset
+- `val.jsonl` - Validation dataset (if train_split < 1.0)
+- `manifest.json` - Generation metadata
+- `README.md` - Auto-generated dataset card
+
+## Repository Visibility
+
+By default, repositories are public. To create a private repository:
+
+**Python API:**
+```python
+hub_info = push_to_hub(
+    output_dir=output_dir,
+    repo_id="username/dataset-name",
+    private=True,
+)
+```
+
+**CLI:**
+```bash
+toolsgen generate ... --push-to-hub --repo-id username/dataset-name --private
+```
+
+## Notes
+
+- The HF token can be provided via the `--hf-token` flag or the `HF_TOKEN` environment variable
+- If a repository already exists, it will be updated with new files
+- A dataset card (README.md) is automatically generated if not present
diff --git a/examples/hf_hub_upload/example.py b/examples/hf_hub_upload/example.py
@@ -0,0 +1,45 @@
+"""Example: Generate dataset and push to Hugging Face Hub."""
+
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+from toolsgen import GenerationConfig, ModelConfig, generate_dataset, push_to_hub
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Configuration
+tools_path = Path(__file__).parent.parent / "basic" / "tools.json"
+output_dir = Path(__file__).parent / "output"
+
+gen_config = GenerationConfig(
+    num_samples=50,
+    strategy="random",
+    seed=42,
+    train_split=0.9,
+)
+
+model_config = ModelConfig(
+    model="gpt-4o-mini",
+    temperature=0.7,
+)
+
+# Generate dataset
+manifest = generate_dataset(
+    output_dir=output_dir,
+    gen_config=gen_config,
+    model_config=model_config,
+    tools_path=tools_path,
+)
+
+# Push to Hub
+hub_info = push_to_hub(
+    output_dir=output_dir,
+    repo_id="your-username/your-dataset-name",  # Change this!
+    private=False,
+)
+
+print("\n✓ Dataset generated and uploaded!")
+print(f"  Generated: {manifest['num_generated']} records")
+print(f"  Repository: {hub_info['repo_url']}")
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -3,3 +3,4 @@ pytest-cov>=5.0.0
 pre-commit>=3.7.0
 ruff>=0.6.0
 python-dotenv>=1.0.0
+huggingface_hub>=0.20.0
diff --git a/src/toolsgen/__init__.py b/src/toolsgen/__init__.py
@@ -14,6 +14,7 @@
     load_tool_specs,
     write_dataset_jsonl,
 )
+from .hf_hub import push_to_hub
 from .judge import JudgeResponse, judge_tool_calls
 from .problem_generator import generate_problem
 from .tool_caller import generate_tool_calls
@@ -55,6 +56,8 @@
     "generate_dataset",
     "load_tool_specs",
     "write_dataset_jsonl",
+    # HF Hub
+    "push_to_hub",
     # Judge
     "JudgeResponse",
     "judge_tool_calls",
diff --git a/src/toolsgen/cli.py b/src/toolsgen/cli.py
@@ -168,6 +168,28 @@ def create_parser() -> argparse.ArgumentParser:
         help="Temperature for judging (defaults to --temperature)",
     )
 
+    # Hugging Face Hub options
+    gen_parser.add_argument(
+        "--push-to-hub",
+        action="store_true",
+        help="Push dataset to Hugging Face Hub after generation",
+    )
+    gen_parser.add_argument(
+        "--repo-id",
+        default=None,
+        help="HF Hub repository ID (e.g., 'username/dataset-name')",
+    )
+    gen_parser.add_argument(
+        "--hf-token",
+        default=None,
+        help="HF API token (defaults to HF_TOKEN env var)",
+    )
+    gen_parser.add_argument(
+        "--private",
+        action="store_true",
+        help="Create private repository on HF Hub",
+    )
+
     return parser
 
 
@@ -275,9 +297,15 @@ def cmd_generate(args: argparse.Namespace) -> None:
             max_tokens=args.max_tokens,
         )
 
+    # Validate HF Hub options
+    if args.push_to_hub and not args.repo_id:
+        print("Error: --repo-id is required when using --push-to-hub", file=sys.stderr)
+        sys.exit(1)
+
     # Generate dataset
     try:
         print(f"Generating {args.num} samples using {args.model}...")
+
         manifest = generate_dataset(
             args.out, gen_config, model_config, tools_path=args.tools
         )
@@ -297,6 +325,20 @@ def cmd_generate(args: argparse.Namespace) -> None:
 
         print(f"  - Manifest: {args.out / 'manifest.json'}")
 
+        if args.push_to_hub:
+            from .hf_hub import push_to_hub
+
+            print("\nPushing to Hugging Face Hub...")
+            hub_info = push_to_hub(
+                output_dir=args.out,
+                repo_id=args.repo_id,
+                token=args.hf_token,
+                private=args.private,
+            )
+            print("✓ Pushed to Hugging Face Hub")
+            print(f"  - Repository: {hub_info['repo_url']}")
+            print(f"  - Files uploaded: {', '.join(hub_info['files_uploaded'])}")
+
     except ValueError as e:
         print(f"Error: {e}", file=sys.stderr)
         sys.exit(1)
diff --git a/src/toolsgen/hf_hub.py b/src/toolsgen/hf_hub.py
diff --git a/tests/test_hf_hub.py b/tests/test_hf_hub.py