diff --git a/tests/test_kaggle.py b/tests/test_kaggle.py
new file mode 100644
index 0000000..2a1e94f
--- /dev/null
+++ b/tests/test_kaggle.py
@@ -0,0 +1,190 @@
+"""Tests for Kaggle integration module."""
+
+import pytest
+from toon.kaggle import (
+    is_kaggle_slug,
+    csv_to_records,
+    parse_croissant,
+    croissant_to_summary,
+    find_best_csv,
+)
+from pathlib import Path
+import tempfile
+
+
+class TestIsKaggleSlug:
+    """Tests for is_kaggle_slug function."""
+
+    def test_valid_slug(self):
+        """Test valid Kaggle slugs."""
+        assert is_kaggle_slug("username/dataset-name") is True
+        assert is_kaggle_slug("user123/my-dataset") is True
+        assert is_kaggle_slug("org-name/dataset_v2") is True
+
+    def test_invalid_slug(self):
+        """Test invalid Kaggle slugs."""
+        assert is_kaggle_slug("not-a-slug") is False
+        assert is_kaggle_slug("username/dataset/extra") is False
+        assert is_kaggle_slug("") is False
+        assert is_kaggle_slug("/dataset") is False
+
+
+class TestCsvToRecords:
+    """Tests for csv_to_records function."""
+
+    def test_basic_csv(self):
+        """Test basic CSV conversion."""
+        csv_data = "name,age,city\nAlice,30,NYC\nBob,25,LA"
+        result = csv_to_records(csv_data)
+
+        assert len(result) == 2
+        assert result[0] == {"name": "Alice", "age": "30", "city": "NYC"}
+        assert result[1] == {"name": "Bob", "age": "25", "city": "LA"}
+
+    def test_empty_csv(self):
+        """Test empty CSV (headers only)."""
+        csv_data = "name,age\n"
+        result = csv_to_records(csv_data)
+        assert result == []
+
+    def test_csv_with_quotes(self):
+        """Test CSV with quoted fields."""
+        csv_data = 'name,description\nAlice,"Hello, World"\nBob,"Line1\nLine2"'
+        result = csv_to_records(csv_data)
+
+        assert len(result) == 2
+        assert result[0]["description"] == "Hello, World"
+
+
+class TestParseCroissant:
+    """Tests for parse_croissant function."""
+
+    def test_basic_metadata(self):
+        """Test parsing basic Croissant metadata."""
+        metadata = {
+            "name": "Test Dataset",
+            "description": "A test dataset",
+            "distribution": [
+                {
+                    "name": "data.csv",
+                    "encodingFormat": "text/csv",
+                    "contentUrl": "https://example.com/data.csv",
+                }
+            ],
+            "recordSet": [
+                {
+                    "name": "data.csv",
+                    "field": [
+                        {"name": "id", "dataType": ["sc:Integer"]},
+                        {"name": "value", "dataType": ["sc:Float"]},
+                    ],
+                }
+            ],
+        }
+
+        result = parse_croissant(metadata)
+
+        assert result["name"] == "Test Dataset"
+        assert result["description"] == "A test dataset"
+        assert len(result["files"]) == 1
+        assert result["files"][0]["name"] == "data.csv"
+        assert len(result["schema"]["data.csv"]) == 2
+        assert result["schema"]["data.csv"][0]["name"] == "id"
+        assert result["schema"]["data.csv"][0]["type"] == "Integer"
+
+    def test_kaggle_url_extraction(self):
+        """Test Kaggle slug extraction from URL."""
+        metadata = {
+            "name": "Kaggle Dataset",
+            "distribution": [
+                {
+                    "name": "archive.zip",
+                    "contentUrl": "https://www.kaggle.com/api/v1/datasets/download/user/dataset?version=1",
+                }
+            ],
+            "recordSet": [],
+        }
+
+        result = parse_croissant(metadata)
+        assert result["kaggle_slug"] == "user/dataset"
+
+    def test_empty_metadata(self):
+        """Test parsing empty metadata."""
+        result = parse_croissant({})
+
+        assert result["name"] == "Unknown"
+        assert result["description"] == ""
+        assert result["files"] == []
+        assert result["schema"] == {}
+
+
+class TestCroissantToSummary:
+    """Tests for croissant_to_summary function."""
+
+    def test_summary_output(self):
+        """Test summary string generation."""
+        info = {
+            "name": "Air Quality Dataset",
+            "schema": {
+                "data.csv": [
+                    {"name": "Date", "type": "Date"},
+                    {"name": "AQI", "type": "Float"},
+                ]
+            },
+            "kaggle_slug": "user/air-quality",
+        }
+
+        result = croissant_to_summary(info)
+
+        assert "# Dataset: Air Quality Dataset" in result
+        assert "Date:Date" in result
+        assert "AQI:Float" in result
+        assert "toon user/air-quality --kaggle" in result
+
+
+class TestFindBestCsv:
+    """Tests for find_best_csv function."""
+
+    def test_finds_csv(self):
+        """Test finding CSV in file list."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create test files
+            csv1 = Path(tmpdir) / "data.csv"
+            csv2 = Path(tmpdir) / "all_data.csv"
+            txt = Path(tmpdir) / "readme.txt"
+
+            csv1.write_text("a,b\n1,2")
+            csv2.write_text("a,b,c\n1,2,3\n4,5,6")  # Larger
+            txt.write_text("readme")
+
+            files = [csv1, csv2, txt]
+            result = find_best_csv(files)
+
+            # Should prefer "all_data.csv" due to "all" in name
+            assert result == csv2
+
+    def test_no_csv(self):
+        """Test when no CSV files exist."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            txt = Path(tmpdir) / "readme.txt"
+            txt.write_text("readme")
+
+            result = find_best_csv([txt])
+            assert result is None
+
+    def test_prefers_main_patterns(self):
+        """Test preference for files with main/full/combined in name."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Use "big" instead of "small" - "small" contains "all"!
+            big = Path(tmpdir) / "big.csv"
+            combined = Path(tmpdir) / "combined.csv"
+
+            # Make big.csv actually larger in bytes
+            big.write_text("a,b\n" + "1,2\n" * 100)
+            combined.write_text("a,b\n1,2")
+
+            files = [big, combined]
+            result = find_best_csv(files)
+
+            # Should prefer "combined" despite being smaller
+            assert result == combined
diff --git a/toon/__init__.py b/toon/__init__.py
index 5d1eee4..6ff2d94 100644
--- a/toon/__init__.py
+++ b/toon/__init__.py
@@ -23,6 +23,32 @@ def decode_to_pydantic(*args, **kwargs):
     def generate_structure_from_pydantic(*args, **kwargs):
         raise ImportError("generate_structure_from_pydantic requires pydantic to be installed. Please install pydantic to use this feature.")
 
+# Kaggle integration (optional - requires kaggle installation)
+try:
+    from .kaggle import (
+        download_dataset,
+        find_best_csv,
+        csv_to_records,
+        parse_croissant,
+        croissant_to_summary,
+        is_kaggle_slug,
+    )
+    _KAGGLE_AVAILABLE = True
+except ImportError:
+    _KAGGLE_AVAILABLE = False
+    def download_dataset(*args, **kwargs):
+        raise ImportError("download_dataset requires kaggle to be installed. Please install kaggle to use this feature.")
+    def find_best_csv(*args, **kwargs):
+        raise ImportError("find_best_csv requires kaggle to be installed. Please install kaggle to use this feature.")
+    def csv_to_records(*args, **kwargs):
+        raise ImportError("csv_to_records requires kaggle to be installed. Please install kaggle to use this feature.")
+    def parse_croissant(*args, **kwargs):
+        raise ImportError("parse_croissant requires kaggle to be installed. Please install kaggle to use this feature.")
+    def croissant_to_summary(*args, **kwargs):
+        raise ImportError("croissant_to_summary requires kaggle to be installed. Please install kaggle to use this feature.")
+    def is_kaggle_slug(*args, **kwargs):
+        raise ImportError("is_kaggle_slug requires kaggle to be installed. Please install kaggle to use this feature.")
+
 __version__ = '1.0.0'
 
 __all__ = [
     'encode',
@@ -31,6 +57,12 @@ def generate_structure_from_pydantic(*args, **kwargs):
     'encode_pydantic',
     'decode_to_pydantic',
     'generate_structure_from_pydantic',
+    'download_dataset',
+    'find_best_csv',
+    'csv_to_records',
+    'parse_croissant',
+    'croissant_to_summary',
+    'is_kaggle_slug',
     'COMMA',
     'TAB',
     'PIPE',
diff --git a/toon/cli.py b/toon/cli.py
index c872e4c..03d0039 100644
--- a/toon/cli.py
+++ b/toon/cli.py
@@ -11,6 +11,19 @@ except ImportError:
     TIKTOKEN_AVAILABLE = False
 
+try:
+    from .kaggle import (
+        is_kaggle_slug,
+        download_dataset,
+        find_best_csv,
+        csv_to_records,
+        parse_croissant,
+        croissant_to_summary,
+    )
+    KAGGLE_AVAILABLE = True
+except ImportError:
+    KAGGLE_AVAILABLE = False
+
 from . import encode, decode
 
 
@@ -104,18 +117,27 @@ def main():
 Examples:
   # Encode JSON file to TOON
   toon input.json -o output.toon
-
+
   # Decode TOON file to JSON
   toon input.toon -o output.json
-
+
   # Pipe JSON and encode to TOON
   echo '{"key": "value"}' | toon -e
-
+
   # Force decode mode with custom delimiter
   toon input.txt -d --delimiter tab
-
+
   # Show token statistics
   toon input.json --stats
+
+  # Download Kaggle dataset and convert to TOON
+  toon username/dataset-name --kaggle --stats
+
+  # Select specific file from Kaggle dataset
+  toon username/dataset-name --kaggle --file data.csv
+
+  # Parse Croissant metadata to see dataset schema
+  toon metadata.json --croissant
 """
     )
 
@@ -177,13 +199,133 @@
         default='off',
         help='Path expansion mode (decode only, default: off)'
     )
-
+    parser.add_argument(
+        '--kaggle',
+        action='store_true',
+        help='Treat input as Kaggle dataset slug (e.g., username/dataset-name)'
+    )
+    parser.add_argument(
+        '--croissant',
+        action='store_true',
+        help='Parse input as Croissant (ML Commons) metadata and show schema'
+    )
+    parser.add_argument(
+        '--file', '-f',
+        dest='select_file',
+        help='Select specific file from Kaggle dataset (use with --kaggle)'
+    )
+
     args = parser.parse_args()
 
     # Validate arguments
     if args.encode and args.decode:
         parser.error('Cannot specify both --encode and --decode')
-
+
+    # Handle Kaggle dataset download
+    if args.kaggle or (KAGGLE_AVAILABLE and args.input and is_kaggle_slug(args.input)):
+        if not KAGGLE_AVAILABLE:
+            print('Error: Kaggle support requires the kaggle package. '
+                  'Install with: pip install kaggle', file=sys.stderr)
+            return 1
+
+        try:
+            print(f'Downloading Kaggle dataset: {args.input}', file=sys.stderr)
+            files = download_dataset(args.input)
+
+            # Find the target file
+            if args.select_file:
+                target = next(
+                    (f for f in files if args.select_file in f.name),
+                    None
+                )
+                if not target:
+                    print(f'Error: No file matching "{args.select_file}" in dataset',
+                          file=sys.stderr)
+                    print(f'Available files: {[f.name for f in files]}', file=sys.stderr)
+                    return 1
+            else:
+                target = find_best_csv(files)
+                if not target:
+                    # Try JSON files
+                    json_files = [f for f in files if f.suffix.lower() == '.json']
+                    target = json_files[0] if json_files else None
+
+            if not target:
+                print('Error: No CSV or JSON files found in dataset', file=sys.stderr)
+                return 1
+
+            print(f'Using: {target.name}', file=sys.stderr)
+
+            # Read and convert
+            content = target.read_text(encoding='utf-8', errors='replace')
+
+            if target.suffix.lower() == '.csv':
+                data = csv_to_records(content)
+            else:
+                data = json.loads(content)
+
+            # Encode to TOON
+            options = {
+                'delimiter': args.delimiter,
+                'indent': args.indent,
+                'key_folding': args.key_folding,
+            }
+            if args.flatten_depth is not None:
+                options['flatten_depth'] = args.flatten_depth
+
+            output_content = encode(data, options)
+            input_content = json.dumps(data)  # For stats comparison
+
+            # Show statistics if requested
+            if args.stats:
+                input_tokens = count_tokens(input_content)
+                output_tokens = count_tokens(output_content)
+
+                print(f'Input (JSON): {len(input_content)} bytes', file=sys.stderr)
+                print(f'Output (TOON): {len(output_content)} bytes', file=sys.stderr)
+                if len(input_content) > 0:
+                    print(f'Size reduction: {(1 - len(output_content) / len(input_content)) * 100:.1f}%',
+                          file=sys.stderr)
+
+                if input_tokens is not None and output_tokens is not None:
+                    print(f'Input tokens: {input_tokens}', file=sys.stderr)
+                    print(f'Output tokens: {output_tokens}', file=sys.stderr)
+                    print(f'Token reduction: {(1 - output_tokens / input_tokens) * 100:.1f}%',
+                          file=sys.stderr)
+                else:
+                    print('(Install tiktoken for token statistics)', file=sys.stderr)
+
+                print('---', file=sys.stderr)
+
+            write_output(output_content, args.output)
+            return 0
+
+        except Exception as e:
+            print(f'Error: {e}', file=sys.stderr)
+            return 1
+
+    # Handle Croissant metadata parsing
+    if args.croissant:
+        if not KAGGLE_AVAILABLE:
+            print('Error: Croissant support requires the kaggle module.', file=sys.stderr)
+            return 1
+
+        try:
+            input_content = read_input(args.input)
+            metadata = json.loads(input_content)
+            info = parse_croissant(metadata)
+            output_content = croissant_to_summary(info)
+
+            print(f'Dataset: {info["name"]}', file=sys.stderr)
+            print(f'Files: {[f["name"] for f in info["files"]]}', file=sys.stderr)
+
+            write_output(output_content, args.output)
+            return 0
+
+        except Exception as e:
+            print(f'Error parsing Croissant metadata: {e}', file=sys.stderr)
+            return 1
+
     try:
         # Read input
         input_content = read_input(args.input)
diff --git a/toon/kaggle.py b/toon/kaggle.py
new file mode 100644
index 0000000..b5f7c01
--- /dev/null
+++ b/toon/kaggle.py
@@ -0,0 +1,265 @@
+"""Kaggle dataset integration for TOON.
+
+This module provides utilities for downloading Kaggle datasets and parsing
+Croissant (ML Commons) metadata, making it easy to convert dataset files
+directly to TOON format.
+
+Example:
+    >>> from toon.kaggle import download_dataset, parse_croissant
+    >>> files = download_dataset("username/dataset-name", "/tmp/data")
+    >>> # Or parse Croissant metadata to understand dataset structure
+    >>> info = parse_croissant(metadata_dict)
+"""
+
+from __future__ import annotations
+
+import csv
+import io
+import json
+import re
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Any, Optional
+
+
+def is_kaggle_slug(s: str) -> bool:
+    """Check if string is a valid Kaggle dataset slug.
+
+    Args:
+        s: String to check
+
+    Returns:
+        True if string matches pattern 'username/dataset-name'
+
+    Example:
+        >>> is_kaggle_slug("username/my-dataset")
+        True
+        >>> is_kaggle_slug("/path/to/file.csv")
+        False
+    """
+    import os
+    return bool(re.match(r"^[\w-]+/[\w-]+$", s)) and not os.path.exists(s)
+
+
+def download_dataset(
+    slug: str,
+    output_dir: Optional[str] = None,
+    unzip: bool = True
+) -> list[Path]:
+    """Download a Kaggle dataset.
+
+    Requires the Kaggle CLI to be installed and configured with API credentials.
+    See: https://github.com/Kaggle/kaggle-api#api-credentials
+
+    Args:
+        slug: Kaggle dataset slug (e.g., 'username/dataset-name')
+        output_dir: Directory to download to (default: temp directory)
+        unzip: Whether to unzip the downloaded archive (default: True)
+
+    Returns:
+        List of paths to downloaded/extracted files
+
+    Raises:
+        RuntimeError: If Kaggle CLI is not installed or download fails
+        FileNotFoundError: If no files are found after download
+
+    Example:
+        >>> files = download_dataset("youssefelebiary/global-air-quality-2025")
+        >>> csv_files = [f for f in files if f.suffix == '.csv']
+    """
+    if output_dir is None:
+        output_dir = tempfile.mkdtemp(prefix="toon_kaggle_")
+
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    cmd = ["kaggle", "datasets", "download", "-d", slug, "-p", str(output_path)]
+    if unzip:
+        cmd.append("--unzip")
+
+    try:
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+    except FileNotFoundError:
+        raise RuntimeError(
+            "Kaggle CLI not found. Install with: pip install kaggle\n"
+            "Then configure credentials: https://github.com/Kaggle/kaggle-api#api-credentials"
+        )
+
+    if result.returncode != 0:
+        raise RuntimeError(f"Kaggle download failed: {result.stderr}")
+
+    files = list(output_path.rglob("*"))
+    files = [f for f in files if f.is_file()]
+
+    if not files:
+        raise FileNotFoundError(f"No files found after downloading {slug}")
+
+    return files
+
+
+def find_best_csv(files: list[Path]) -> Optional[Path]:
+    """Find the best CSV file from a list of files.
+
+    Heuristics:
+    - Prefers files with 'all', 'full', 'combined', or 'main' in the name
+    - Falls back to the largest CSV file
+
+    Args:
+        files: List of file paths
+
+    Returns:
+        Path to the best CSV file, or None if no CSVs found
+
+    Example:
+        >>> files = list(Path("/data").rglob("*"))
+        >>> best = find_best_csv(files)
+    """
+    csv_files = [f for f in files if f.suffix.lower() == ".csv"]
+
+    if not csv_files:
+        return None
+
+    # Look for common "main" file patterns
+    main_patterns = ["all", "full", "combined", "main", "complete"]
+    for pattern in main_patterns:
+        for f in csv_files:
+            if pattern in f.stem.lower():
+                return f
+
+    # Fall back to largest file
+    return max(csv_files, key=lambda f: f.stat().st_size)
+
+
+def csv_to_records(csv_content: str) -> list[dict[str, Any]]:
+    """Convert CSV string to list of dictionaries.
+
+    Args:
+        csv_content: CSV data as string
+
+    Returns:
+        List of dictionaries, one per row
+
+    Example:
+        >>> data = csv_to_records("name,age\\nAlice,30\\nBob,25")
+        >>> data[0]
+        {'name': 'Alice', 'age': '30'}
+    """
+    reader = csv.DictReader(io.StringIO(csv_content))
+    return list(reader)
+
+
+def parse_croissant(metadata: dict[str, Any]) -> dict[str, Any]:
+    """Parse Croissant (ML Commons) JSON-LD metadata.
+
+    Croissant is the ML Commons standard for dataset documentation.
+    See: https://mlcommons.org/croissant/
+
+    Args:
+        metadata: Parsed Croissant JSON-LD document
+
+    Returns:
+        Dictionary with:
+        - name: Dataset name
+        - description: Dataset description
+        - files: List of file info dicts with name, url, contained_in
+        - schema: Dict mapping table names to field definitions
+        - kaggle_slug: Kaggle dataset slug if detectable from URLs
+
+    Example:
+        >>> with open("metadata.json") as f:
+        ...     metadata = json.load(f)
+        >>> info = parse_croissant(metadata)
+        >>> print(info['name'])
+        'Global Air Quality Dataset'
+    """
+    info: dict[str, Any] = {
+        "name": metadata.get("name", "Unknown"),
+        "description": metadata.get("description", ""),
+        "files": [],
+        "schema": {},
+        "kaggle_slug": None,
+    }
+
+    # Extract file distribution
+    for dist in metadata.get("distribution", []):
+        file_info = {
+            "name": dist.get("name"),
+            "url": dist.get("contentUrl"),
+            "encoding": dist.get("encodingFormat"),
+            "contained_in": dist.get("containedIn", {}).get("@id") if isinstance(
+                dist.get("containedIn"), dict
+            ) else dist.get("containedIn"),
+        }
+        info["files"].append(file_info)
+
+        # Try to extract Kaggle slug from URL
+        url = dist.get("contentUrl", "")
+        if "kaggle.com" in url and info["kaggle_slug"] is None:
+            match = re.search(r"datasets/download/([^?]+)", url)
+            if match:
+                info["kaggle_slug"] = match.group(1)
+
+    # Extract schema from recordSet
+    for record_set in metadata.get("recordSet", []):
+        fields = []
+        for field in record_set.get("field", []):
+            field_name = field.get("name")
+            if field_name:
+                data_types = field.get("dataType", ["unknown"])
+                type_str = data_types[0] if data_types else "unknown"
+                # Clean up schema.org prefixes
+                type_str = type_str.replace("sc:", "").replace("https://schema.org/", "")
+
+                fields.append({
+                    "name": field_name,
+                    "type": type_str,
+                    "description": field.get("description", ""),
+                })
+
+        if fields:
+            table_name = record_set.get("name", "default")
+            info["schema"][table_name] = fields
+
+    return info
+
+
+def croissant_to_summary(info: dict[str, Any]) -> str:
+    """Generate a human-readable summary from parsed Croissant metadata.
+
+    Args:
+        info: Parsed Croissant info from parse_croissant()
+
+    Returns:
+        Formatted summary string suitable for display or LLM context
+
+    Example:
+        >>> info = parse_croissant(metadata)
+        >>> print(croissant_to_summary(info))
+        # Dataset: Global Air Quality
+        # Schema:
+        # data.csv: Date:Date, City:Text, AQI:Float
+    """
+    lines = [
+        f"# Dataset: {info['name']}",
+        "# Schema:",
+    ]
+
+    for table, fields in info["schema"].items():
+        field_strs = [f"{f['name']}:{f['type']}" for f in fields if f["name"]]
+        if field_strs:
+            lines.append(f"# {table}: {', '.join(field_strs)}")
+
+    if info["kaggle_slug"]:
+        lines.extend([
+            "#",
+            "# To download and convert this dataset:",
+            f"# toon {info['kaggle_slug']} --kaggle",
+        ])
+
+    return "\n".join(lines)
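
For reference, a minimal end-to-end sketch of the Python API introduced above. The dataset slug, metadata file name, and encode option values are illustrative placeholders (the kaggle CLI must be installed and configured with API credentials); the encode(data, options) call simply mirrors the call pattern used in toon/cli.py.

    import json
    from toon import encode
    from toon.kaggle import (
        download_dataset, find_best_csv, csv_to_records,
        parse_croissant, croissant_to_summary,
    )

    # Download a dataset and convert its "best" CSV to TOON.
    files = download_dataset("username/dataset-name")  # shells out to `kaggle datasets download`
    target = find_best_csv(files)  # prefers 'all'/'full'/'combined'/'main'/'complete', else the largest CSV
    if target is not None:
        records = csv_to_records(target.read_text(encoding="utf-8", errors="replace"))
        # Option keys follow toon/cli.py; the values here are placeholders, not verified defaults.
        print(encode(records, {"delimiter": ",", "indent": 2, "key_folding": "off"}))

    # Or inspect a Croissant (ML Commons) metadata file before downloading anything.
    with open("metadata.json") as f:
        print(croissant_to_summary(parse_croissant(json.load(f))))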