diff --git a/tests/test_kaggle.py b/tests/test_kaggle.py
new file mode 100644
index 0000000..2a1e94f
--- /dev/null
+++ b/tests/test_kaggle.py
@@ -0,0 +1,190 @@
+"""Tests for Kaggle integration module."""
+
+import pytest
+from toon.kaggle import (
+    is_kaggle_slug,
+    csv_to_records,
+    parse_croissant,
+    croissant_to_summary,
+    find_best_csv,
+)
+from pathlib import Path
+import tempfile
+
+
+class TestIsKaggleSlug:
+    """Tests for is_kaggle_slug function."""
+
+    def test_valid_slug(self):
+        """Test valid Kaggle slugs."""
+        assert is_kaggle_slug("username/dataset-name") is True
+        assert is_kaggle_slug("user123/my-dataset") is True
+        assert is_kaggle_slug("org-name/dataset_v2") is True
+
+    def test_invalid_slug(self):
+        """Test invalid Kaggle slugs."""
+        assert is_kaggle_slug("not-a-slug") is False
+        assert is_kaggle_slug("username/dataset/extra") is False
+        assert is_kaggle_slug("") is False
+        assert is_kaggle_slug("/dataset") is False
+
+
+class TestCsvToRecords:
+    """Tests for csv_to_records function."""
+
+    def test_basic_csv(self):
+        """Test basic CSV conversion."""
+        csv_data = "name,age,city\nAlice,30,NYC\nBob,25,LA"
+        result = csv_to_records(csv_data)
+
+        assert len(result) == 2
+        assert result[0] == {"name": "Alice", "age": "30", "city": "NYC"}
+        assert result[1] == {"name": "Bob", "age": "25", "city": "LA"}
+
+    def test_empty_csv(self):
+        """Test empty CSV (headers only)."""
+        csv_data = "name,age\n"
+        result = csv_to_records(csv_data)
+        assert result == []
+
+    def test_csv_with_quotes(self):
+        """Test CSV with quoted fields."""
+        csv_data = 'name,description\nAlice,"Hello, World"\nBob,"Line1\nLine2"'
+        result = csv_to_records(csv_data)
+
+        assert len(result) == 2
+        assert result[0]["description"] == "Hello, World"
+
+
+class TestParseCroissant:
+    """Tests for parse_croissant function."""
+
+    def test_basic_metadata(self):
+        """Test parsing basic Croissant metadata."""
+        metadata = {
+            "name": "Test Dataset",
+            "description": "A test dataset",
+            "distribution": [
+                {
+                    "name": "data.csv",
+                    "encodingFormat": "text/csv",
+                    "contentUrl": "https://example.com/data.csv",
+                }
+            ],
+            "recordSet": [
+                {
+                    "name": "data.csv",
+                    "field": [
+                        {"name": "id", "dataType": ["sc:Integer"]},
+                        {"name": "value", "dataType": ["sc:Float"]},
+                    ],
+                }
+            ],
+        }
+
+        result = parse_croissant(metadata)
+
+        assert result["name"] == "Test Dataset"
+        assert result["description"] == "A test dataset"
+        assert len(result["files"]) == 1
+        assert result["files"][0]["name"] == "data.csv"
+        assert len(result["schema"]["data.csv"]) == 2
+        assert result["schema"]["data.csv"][0]["name"] == "id"
+        assert result["schema"]["data.csv"][0]["type"] == "Integer"
+
+    def test_kaggle_url_extraction(self):
+        """Test Kaggle slug extraction from URL."""
+        metadata = {
+            "name": "Kaggle Dataset",
+            "distribution": [
+                {
+                    "name": "archive.zip",
+                    "contentUrl": "https://www.kaggle.com/api/v1/datasets/download/user/dataset?version=1",
+                }
+            ],
+            "recordSet": [],
+        }
+
+        result = parse_croissant(metadata)
+        assert result["kaggle_slug"] == "user/dataset"
+
+    def test_empty_metadata(self):
+        """Test parsing empty metadata."""
+        result = parse_croissant({})
+
+        assert result["name"] == "Unknown"
+        assert result["description"] == ""
+        assert result["files"] == []
+        assert result["schema"] == {}
+
+
+class TestCroissantToSummary:
+    """Tests for croissant_to_summary function."""
+
+    def test_summary_output(self):
+        """Test summary string generation."""
+        info = {
+            "name": "Air Quality Dataset",
+            "schema": {
+                "data.csv": [
+                    {"name": "Date", "type": "Date"},
+                    {"name": "AQI", "type": "Float"},
+                ]
+            },
+            "kaggle_slug": "user/air-quality",
+        }
+
+        result = croissant_to_summary(info)
+
+        assert "# Dataset: Air Quality Dataset" in result
+        assert "Date:Date" in result
+        assert "AQI:Float" in result
+        assert "toon user/air-quality --kaggle" in result
+
+
+class TestFindBestCsv:
+    """Tests for find_best_csv function."""
+
+    def test_finds_csv(self):
+        """Test finding CSV in file list."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create test files
+            csv1 = Path(tmpdir) / "data.csv"
+            csv2 = Path(tmpdir) / "all_data.csv"
+            txt = Path(tmpdir) / "readme.txt"
+
+            csv1.write_text("a,b\n1,2")
+            csv2.write_text("a,b,c\n1,2,3\n4,5,6")  # Larger
+            txt.write_text("readme")
+
+            files = [csv1, csv2, txt]
+            result = find_best_csv(files)
+
+            # Should prefer "all_data.csv" due to "all" in name
+            assert result == csv2
+
+    def test_no_csv(self):
+        """Test when no CSV files exist."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            txt = Path(tmpdir) / "readme.txt"
+            txt.write_text("readme")
+
+            result = find_best_csv([txt])
+            assert result is None
+
+    def test_prefers_main_patterns(self):
+        """Test preference for files with main/full/combined in name."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Use "big" instead of "small" - "small" contains "all"!
+            big = Path(tmpdir) / "big.csv"
+            combined = Path(tmpdir) / "combined.csv"
+
+            # Make big.csv actually larger in bytes
+            big.write_text("a,b\n" + "1,2\n" * 100)
+            combined.write_text("a,b\n1,2")
+
+            files = [big, combined]
+            result = find_best_csv(files)
+
+            # Should prefer "combined" despite being smaller
+            assert result == combined
diff --git a/toon/__init__.py b/toon/__init__.py
index 5d1eee4..6ff2d94 100644
--- a/toon/__init__.py
+++ b/toon/__init__.py
@@ -23,6 +23,32 @@ def decode_to_pydantic(*args, **kwargs):
     def generate_structure_from_pydantic(*args, **kwargs):
         raise ImportError("generate_structure_from_pydantic requires pydantic to be installed. Please install pydantic to use this feature.")
 
+# Kaggle integration (optional - requires kaggle installation)
+try:
+    from .kaggle import (
+        download_dataset,
+        find_best_csv,
+        csv_to_records,
+        parse_croissant,
+        croissant_to_summary,
+        is_kaggle_slug,
+    )
+    _KAGGLE_AVAILABLE = True
+except ImportError:
+    _KAGGLE_AVAILABLE = False
+    def download_dataset(*args, **kwargs):
+        raise ImportError("download_dataset requires kaggle to be installed. Please install kaggle to use this feature.")
+    def find_best_csv(*args, **kwargs):
+        raise ImportError("find_best_csv requires kaggle to be installed. Please install kaggle to use this feature.")
+    def csv_to_records(*args, **kwargs):
+        raise ImportError("csv_to_records requires kaggle to be installed. Please install kaggle to use this feature.")
+    def parse_croissant(*args, **kwargs):
+        raise ImportError("parse_croissant requires kaggle to be installed. Please install kaggle to use this feature.")
+    def croissant_to_summary(*args, **kwargs):
+        raise ImportError("croissant_to_summary requires kaggle to be installed. Please install kaggle to use this feature.")
+    def is_kaggle_slug(*args, **kwargs):
+        raise ImportError("is_kaggle_slug requires kaggle to be installed. Please install kaggle to use this feature.")
+
 __version__ = '1.0.0'
 
 __all__ = [
     'encode',
@@ -31,6 +57,12 @@ def generate_structure_from_pydantic(*args, **kwargs):
     'encode_pydantic',
     'decode_to_pydantic',
     'generate_structure_from_pydantic',
+    'download_dataset',
+    'find_best_csv',
+    'csv_to_records',
+    'parse_croissant',
+    'croissant_to_summary',
+    'is_kaggle_slug',
     'COMMA',
     'TAB',
     'PIPE',
diff --git a/toon/cli.py b/toon/cli.py
index c872e4c..03d0039 100644
--- a/toon/cli.py
+++ b/toon/cli.py
@@ -11,6 +11,19 @@ except ImportError:
     TIKTOKEN_AVAILABLE = False
 
+try:
+    from .kaggle import (
+        is_kaggle_slug,
+        download_dataset,
+        find_best_csv,
+        csv_to_records,
+        parse_croissant,
+        croissant_to_summary,
+    )
+    KAGGLE_AVAILABLE = True
+except ImportError:
+    KAGGLE_AVAILABLE = False
+
 from . import encode, decode
 
 
@@ -104,18 +117,27 @@ def main():
 Examples:
   # Encode JSON file to TOON
   toon input.json -o output.toon
-
+
   # Decode TOON file to JSON
   toon input.toon -o output.json
-
+
   # Pipe JSON and encode to TOON
   echo '{"key": "value"}' | toon -e
-
+
   # Force decode mode with custom delimiter
   toon input.txt -d --delimiter tab
-
+
   # Show token statistics
   toon input.json --stats
+
+  # Download Kaggle dataset and convert to TOON
+  toon username/dataset-name --kaggle --stats
+
+  # Select specific file from Kaggle dataset
+  toon username/dataset-name --kaggle --file data.csv
+
+  # Parse Croissant metadata to see dataset schema
+  toon metadata.json --croissant
 """
     )
 
@@ -177,13 +199,133 @@
         default='off',
         help='Path expansion mode (decode only, default: off)'
     )
-
+    parser.add_argument(
+        '--kaggle',
+        action='store_true',
+        help='Treat input as Kaggle dataset slug (e.g., username/dataset-name)'
+    )
+    parser.add_argument(
+        '--croissant',
+        action='store_true',
+        help='Parse input as Croissant (ML Commons) metadata and show schema'
+    )
+    parser.add_argument(
+        '--file', '-f',
+        dest='select_file',
+        help='Select specific file from Kaggle dataset (use with --kaggle)'
+    )
+
     args = parser.parse_args()
 
     # Validate arguments
     if args.encode and args.decode:
         parser.error('Cannot specify both --encode and --decode')
-
+
+    # Handle Kaggle dataset download
+    if args.kaggle or (KAGGLE_AVAILABLE and args.input and is_kaggle_slug(args.input)):
+        if not KAGGLE_AVAILABLE:
+            print('Error: Kaggle support requires the kaggle package. '
+                  'Install with: pip install kaggle', file=sys.stderr)
+            return 1
+
+        try:
+            print(f'Downloading Kaggle dataset: {args.input}', file=sys.stderr)
+            files = download_dataset(args.input)
+
+            # Find the target file
+            if args.select_file:
+                target = next(
+                    (f for f in files if args.select_file in f.name),
+                    None
+                )
+                if not target:
+                    print(f'Error: No file matching "{args.select_file}" in dataset',
+                          file=sys.stderr)
+                    print(f'Available files: {[f.name for f in files]}', file=sys.stderr)
+                    return 1
+            else:
+                target = find_best_csv(files)
+                if not target:
+                    # Try JSON files
+                    json_files = [f for f in files if f.suffix.lower() == '.json']
+                    target = json_files[0] if json_files else None
+
+            if not target:
+                print('Error: No CSV or JSON files found in dataset', file=sys.stderr)
+                return 1
+
+            print(f'Using: {target.name}', file=sys.stderr)
+
+            # Read and convert
+            content = target.read_text(encoding='utf-8', errors='replace')
+
+            if target.suffix.lower() == '.csv':
+                data = csv_to_records(content)
+            else:
+                data = json.loads(content)
+
+            # Encode to TOON
+            options = {
+                'delimiter': args.delimiter,
+                'indent': args.indent,
+                'key_folding': args.key_folding,
+            }
+            if args.flatten_depth is not None:
+                options['flatten_depth'] = args.flatten_depth
+
+            output_content = encode(data, options)
+            input_content = json.dumps(data)  # For stats comparison
+
+            # Show statistics if requested
+            if args.stats:
+                input_tokens = count_tokens(input_content)
+                output_tokens = count_tokens(output_content)
+
+                print(f'Input (JSON): {len(input_content)} bytes', file=sys.stderr)
+                print(f'Output (TOON): {len(output_content)} bytes', file=sys.stderr)
+                if len(input_content) > 0:
+                    print(f'Size reduction: {(1 - len(output_content) / len(input_content)) * 100:.1f}%',
+                          file=sys.stderr)
+
+                if input_tokens is not None and output_tokens is not None:
+                    print(f'Input tokens: {input_tokens}', file=sys.stderr)
+                    print(f'Output tokens: {output_tokens}', file=sys.stderr)
+                    print(f'Token reduction: {(1 - output_tokens / input_tokens) * 100:.1f}%',
+                          file=sys.stderr)
+                else:
+                    print('(Install tiktoken for token statistics)', file=sys.stderr)
+
+                print('---', file=sys.stderr)
+
+            write_output(output_content, args.output)
+            return 0
+
+        except Exception as e:
+            print(f'Error: {e}', file=sys.stderr)
+            return 1
+
+    # Handle Croissant metadata parsing
+    if args.croissant:
+        if not KAGGLE_AVAILABLE:
+            print('Error: Croissant support requires the kaggle module.', file=sys.stderr)
+            return 1
+
+        try:
+            input_content = read_input(args.input)
+            metadata = json.loads(input_content)
+            info = parse_croissant(metadata)
+            output_content = croissant_to_summary(info)
+
+            print(f'Dataset: {info["name"]}', file=sys.stderr)
+            print(f'Files: {[f["name"] for f in info["files"]]}', file=sys.stderr)
+
+            write_output(output_content, args.output)
+            return 0
+
+        except Exception as e:
+            print(f'Error parsing Croissant metadata: {e}', file=sys.stderr)
+            return 1
+
     try:
         # Read input
         input_content = read_input(args.input)
diff --git a/toon/kaggle.py b/toon/kaggle.py
new file mode 100644
index 0000000..b5f7c01
--- /dev/null
+++ b/toon/kaggle.py
@@ -0,0 +1,265 @@
+"""Kaggle dataset integration for TOON.
+
+This module provides utilities for downloading Kaggle datasets and parsing
+Croissant (ML Commons) metadata, making it easy to convert dataset files
+directly to TOON format.
+
+Example:
+    >>> from toon.kaggle import download_dataset, parse_croissant
+    >>> files = download_dataset("username/dataset-name", "/tmp/data")
+    >>> # Or parse Croissant metadata to understand dataset structure
+    >>> info = parse_croissant(metadata_dict)
+"""
+
+from __future__ import annotations
+
+import csv
+import io
+import json
+import re
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Any, Optional
+
+
+def is_kaggle_slug(s: str) -> bool:
+    """Check if string is a valid Kaggle dataset slug.
+
+    Args:
+        s: String to check
+
+    Returns:
+        True if string matches pattern 'username/dataset-name'
+
+    Example:
+        >>> is_kaggle_slug("username/my-dataset")
+        True
+        >>> is_kaggle_slug("/path/to/file.csv")
+        False
+    """
+    import os
+    return bool(re.match(r"^[\w-]+/[\w-]+$", s)) and not os.path.exists(s)
+
+
+def download_dataset(
+    slug: str,
+    output_dir: Optional[str] = None,
+    unzip: bool = True
+) -> list[Path]:
+    """Download a Kaggle dataset.
+
+    Requires the Kaggle CLI to be installed and configured with API credentials.
+    See: https://github.com/Kaggle/kaggle-api#api-credentials
+
+    Args:
+        slug: Kaggle dataset slug (e.g., 'username/dataset-name')
+        output_dir: Directory to download to (default: temp directory)
+        unzip: Whether to unzip the downloaded archive (default: True)
+
+    Returns:
+        List of paths to downloaded/extracted files
+
+    Raises:
+        RuntimeError: If Kaggle CLI is not installed or download fails
+        FileNotFoundError: If no files are found after download
+
+    Example:
+        >>> files = download_dataset("youssefelebiary/global-air-quality-2025")
+        >>> csv_files = [f for f in files if f.suffix == '.csv']
+    """
+    if output_dir is None:
+        output_dir = tempfile.mkdtemp(prefix="toon_kaggle_")
+
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    cmd = ["kaggle", "datasets", "download", "-d", slug, "-p", str(output_path)]
+    if unzip:
+        cmd.append("--unzip")
+
+    try:
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+    except FileNotFoundError:
+        raise RuntimeError(
+            "Kaggle CLI not found. Install with: pip install kaggle\n"
+            "Then configure credentials: https://github.com/Kaggle/kaggle-api#api-credentials"
+        )
+
+    if result.returncode != 0:
+        raise RuntimeError(f"Kaggle download failed: {result.stderr}")
+
+    files = list(output_path.rglob("*"))
+    files = [f for f in files if f.is_file()]
+
+    if not files:
+        raise FileNotFoundError(f"No files found after downloading {slug}")
+
+    return files
+
+
+def find_best_csv(files: list[Path]) -> Optional[Path]:
+    """Find the best CSV file from a list of files.
+
+    Heuristics:
+    - Prefers files with 'all', 'full', 'combined', or 'main' in the name
+    - Falls back to the largest CSV file
+
+    Args:
+        files: List of file paths
+
+    Returns:
+        Path to the best CSV file, or None if no CSVs found
+
+    Example:
+        >>> files = list(Path("/data").rglob("*"))
+        >>> best = find_best_csv(files)
+    """
+    csv_files = [f for f in files if f.suffix.lower() == ".csv"]
+
+    if not csv_files:
+        return None
+
+    # Look for common "main" file patterns
+    main_patterns = ["all", "full", "combined", "main", "complete"]
+    for pattern in main_patterns:
+        for f in csv_files:
+            if pattern in f.stem.lower():
+                return f
+
+    # Fall back to largest file
+    return max(csv_files, key=lambda f: f.stat().st_size)
+
+
+def csv_to_records(csv_content: str) -> list[dict[str, Any]]:
+    """Convert CSV string to list of dictionaries.
+
+    Args:
+        csv_content: CSV data as string
+
+    Returns:
+        List of dictionaries, one per row
+
+    Example:
+        >>> data = csv_to_records("name,age\\nAlice,30\\nBob,25")
+        >>> data[0]
+        {'name': 'Alice', 'age': '30'}
+    """
+    reader = csv.DictReader(io.StringIO(csv_content))
+    return list(reader)
+
+
+def parse_croissant(metadata: dict[str, Any]) -> dict[str, Any]:
+    """Parse Croissant (ML Commons) JSON-LD metadata.
+
+    Croissant is the ML Commons standard for dataset documentation.
+    See: https://mlcommons.org/croissant/
+
+    Args:
+        metadata: Parsed Croissant JSON-LD document
+
+    Returns:
+        Dictionary with:
+        - name: Dataset name
+        - description: Dataset description
+        - files: List of file info dicts with name, url, contained_in
+        - schema: Dict mapping table names to field definitions
+        - kaggle_slug: Kaggle dataset slug if detectable from URLs
+
+    Example:
+        >>> with open("metadata.json") as f:
+        ...     metadata = json.load(f)
+        >>> info = parse_croissant(metadata)
+        >>> print(info['name'])
+        'Global Air Quality Dataset'
+    """
+    info: dict[str, Any] = {
+        "name": metadata.get("name", "Unknown"),
+        "description": metadata.get("description", ""),
+        "files": [],
+        "schema": {},
+        "kaggle_slug": None,
+    }
+
+    # Extract file distribution
+    for dist in metadata.get("distribution", []):
+        file_info = {
+            "name": dist.get("name"),
+            "url": dist.get("contentUrl"),
+            "encoding": dist.get("encodingFormat"),
+            "contained_in": dist.get("containedIn", {}).get("@id") if isinstance(
+                dist.get("containedIn"), dict
+            ) else dist.get("containedIn"),
+        }
+        info["files"].append(file_info)
+
+        # Try to extract Kaggle slug from URL
+        url = dist.get("contentUrl", "")
+        if "kaggle.com" in url and info["kaggle_slug"] is None:
+            match = re.search(r"datasets/download/([^?]+)", url)
+            if match:
+                info["kaggle_slug"] = match.group(1)
+
+    # Extract schema from recordSet
+    for record_set in metadata.get("recordSet", []):
+        fields = []
+        for field in record_set.get("field", []):
+            field_name = field.get("name")
+            if field_name:
+                data_types = field.get("dataType", ["unknown"])
+                type_str = data_types[0] if data_types else "unknown"
+                # Clean up schema.org prefixes
+                type_str = type_str.replace("sc:", "").replace("https://schema.org/", "")
+
+                fields.append({
+                    "name": field_name,
+                    "type": type_str,
+                    "description": field.get("description", ""),
+                })
+
+        if fields:
+            table_name = record_set.get("name", "default")
+            info["schema"][table_name] = fields
+
+    return info
+
+
+def croissant_to_summary(info: dict[str, Any]) -> str:
+    """Generate a human-readable summary from parsed Croissant metadata.
+
+    Args:
+        info: Parsed Croissant info from parse_croissant()
+
+    Returns:
+        Formatted summary string suitable for display or LLM context
+
+    Example:
+        >>> info = parse_croissant(metadata)
+        >>> print(croissant_to_summary(info))
+        # Dataset: Global Air Quality
+        # Schema:
+        # data.csv: Date:Date, City:Text, AQI:Float
+    """
+    lines = [
+        f"# Dataset: {info['name']}",
+        "# Schema:",
+    ]
+
+    for table, fields in info["schema"].items():
+        field_strs = [f"{f['name']}:{f['type']}" for f in fields if f["name"]]
+        if field_strs:
+            lines.append(f"# {table}: {', '.join(field_strs)}")
+
+    if info["kaggle_slug"]:
+        lines.extend([
+            "#",
+            "# To download and convert this dataset:",
+            f"# toon {info['kaggle_slug']} --kaggle",
+        ])
+
+    return "\n".join(lines)
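
For reference, a minimal end-to-end sketch of the Python API introduced above. The dataset slug, metadata file name, and encode option values are illustrative placeholders (the kaggle CLI must be installed and configured with API credentials); the encode(data, options) call simply mirrors the call pattern used in toon/cli.py.

    import json
    from toon import encode
    from toon.kaggle import (
        download_dataset, find_best_csv, csv_to_records,
        parse_croissant, croissant_to_summary,
    )

    # Download a dataset and convert its "best" CSV to TOON.
    files = download_dataset("username/dataset-name")  # shells out to `kaggle datasets download`
    target = find_best_csv(files)  # prefers 'all'/'full'/'combined'/'main'/'complete', else the largest CSV
    if target is not None:
        records = csv_to_records(target.read_text(encoding="utf-8", errors="replace"))
        # Option keys follow toon/cli.py; the values here are placeholders, not verified defaults.
        print(encode(records, {"delimiter": ",", "indent": 2, "key_folding": "off"}))

    # Or inspect a Croissant (ML Commons) metadata file before downloading anything.
    with open("metadata.json") as f:
        print(croissant_to_summary(parse_croissant(json.load(f))))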