CCExtractor · 03-SudheshnaReddy · Jan 3, 2026 · Jan 3, 2026 · Jan 3, 2026 · Jan 3, 2026
@@ -6,6 +6,10 @@
 from mod_regression.update_regression import update_expected_results
 from run import app
 
+import json
+from pathlib import Path
+from mod_regression.sample_inventory import inventory_samples
+
 
 @app.cli.command('update')
 @click.argument('path_to_ccex')
@@ -29,6 +33,35 @@ def update_results(path_to_ccex):
     click.echo('update function finished')
     return 0
 
+def inventory_command():
+    """Generate a JSON inventory of the sample files from the command line.
+
+    Options are read from ``sys.argv``:
+
+    * ``--samples``: directory to scan (default ``TestData``).
+    * ``--output``: JSON file to write (default
+      ``metadata/sample_inventory.json``); missing parent directories are
+      created.
+
+    The list returned by ``inventory_samples`` is written as indented JSON
+    and a one-line summary is printed.
+
+    Raises:
+        SystemExit: With status 2 (argparse usage error) when the samples
+            path is not an existing directory.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Generate sample inventory")
+    parser.add_argument(
+        "--samples",
+        default="TestData",
+        help="Path to samples directory"
+    )
+    parser.add_argument(
+        "--output",
+        default="metadata/sample_inventory.json",
+        help="Output JSON file"
+    )
+
+    args = parser.parse_args()
+
+    samples_dir = Path(args.samples)
+    out = Path(args.output)
+
+    # Scanning a missing directory finds no files, so without this check a
+    # typo in --samples would silently produce an empty inventory file.
+    if not samples_dir.is_dir():
+        parser.error(f"samples directory not found: {samples_dir}")
+
+    inventory = inventory_samples(samples_dir)
+
+    out.parent.mkdir(parents=True, exist_ok=True)
+    # Explicit encoding keeps the output independent of the locale.
+    out.write_text(json.dumps(inventory, indent=2), encoding="utf-8")
+
+    print(f"Inventory written: {out} ({len(inventory)} samples)")
 
 if __name__ == '__main__':
+    import sys
+
+    # Handle the argparse-based "inventory" sub-command BEFORE handing
+    # control to click: a click group invoked like app.cli() terminates the
+    # process when it finishes, so code placed after it would never run.
+    if len(sys.argv) > 1 and sys.argv[1] == "inventory":
+        # Drop the sub-command name so argparse only sees its own options.
+        sys.argv.pop(1)
+        inventory_command()
+        sys.exit(0)
+
     app.cli()
@@ -0,0 +1,90 @@
+import json
+import subprocess
+import hashlib
+from pathlib import Path
+
+
+def _run(cmd):
+    """Run *cmd* and capture its output, returning ``None`` on failure.
+
+    Args:
+        cmd: Argument list handed to ``subprocess.run`` (no shell involved).
+
+    Returns:
+        The ``subprocess.CompletedProcess`` with ``stdout`` and ``stderr``
+        decoded to text. A non-zero exit status is NOT treated as a failure;
+        callers inspect ``returncode`` themselves. ``None`` is returned when
+        the program cannot be started or exceeds the 30-second timeout.
+    """
+    try:
+        return subprocess.run(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            # Tool output may contain bytes that are invalid in the locale
+            # encoding; substitute them rather than raise UnicodeDecodeError,
+            # which the except clause below would not catch.
+            errors="replace",
+            timeout=30,
+            check=False
+        )
+    except (OSError, subprocess.SubprocessError):
+        return None
+
+
+
+def sha256sum(path: Path) -> str:
+    """Return the hexadecimal SHA-256 digest of the file at *path*.
+
+    The file is consumed in 8192-byte pieces so that large samples are
+    never held in memory all at once.
+    """
+    digest = hashlib.sha256()
+    with path.open("rb") as stream:
+        while True:
+            block = stream.read(8192)
+            if not block:
+                break
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def probe_sample(sample_path: Path) -> dict:
+    """Collect metadata about a single sample file.
+
+    The file is hashed, then inspected with the external ``ffprobe`` and
+    ``ccextractor`` programs. Both probes are best-effort: when a tool is
+    unavailable, fails, or prints something unusable, the corresponding
+    fields simply keep their defaults.
+
+    Args:
+        sample_path: Path of the file to inspect.
+
+    Returns:
+        A dict with the keys ``path`` (str), ``sha256`` (hex digest),
+        ``container`` (ffprobe ``format_name`` or ``None``), ``streams``
+        (list of ``{"type", "codec"}`` dicts), ``caption_types_detected``
+        (list of labels) and ``duration_sec`` (float or ``None``).
+    """
+    result = {
+        "path": str(sample_path),
+        "sha256": sha256sum(sample_path),
+        "container": None,
+        "streams": [],
+        "caption_types_detected": [],
+        "duration_sec": None,
+    }
+
+    # ---- ffprobe ----
+    ffprobe = _run([
+        "ffprobe",
+        "-v", "error",
+        "-show_format",
+        "-show_streams",
+        "-print_format", "json",
+        str(sample_path)
+    ])
+
+    if ffprobe and ffprobe.returncode == 0:
+        try:
+            meta = json.loads(ffprobe.stdout)
+        except ValueError:
+            meta = None
+
+        if isinstance(meta, dict):
+            fmt = meta.get("format")
+            if not isinstance(fmt, dict):
+                fmt = {}
+            result["container"] = fmt.get("format_name")
+
+            # Parse the duration on its own: a missing or non-numeric value
+            # must not prevent the stream list below from being recorded.
+            try:
+                result["duration_sec"] = float(fmt["duration"])
+            except (KeyError, TypeError, ValueError):
+                pass
+
+            streams = meta.get("streams")
+            if isinstance(streams, list):
+                for s in streams:
+                    if isinstance(s, dict):
+                        result["streams"].append({
+                            "type": s.get("codec_type"),
+                            "codec": s.get("codec_name")
+                        })
+
+    # ---- CCExtractor ----
+    cce = _run([
+        "ccextractor",
+        str(sample_path),
+        "-stdout"
+    ])
+
+    if cce and cce.returncode == 0:
+        stderr = (cce.stderr or "").lower()
+        # NOTE(review): plain substring heuristic on the tool's diagnostic
+        # output; any "608"/"708"/"dvb" text (for example inside an echoed
+        # file name) would also match -- confirm against real output.
+        markers = (
+            ("608", "CEA-608"),
+            ("708", "CEA-708"),
+            ("dvb", "DVB"),
+        )
+        for needle, label in markers:
+            if needle in stderr:
+                result["caption_types_detected"].append(label)
+
+    return result
+
+
+def inventory_samples(sample_root: Path) -> list:
+    """Probe every regular file found below *sample_root*.
+
+    Args:
+        sample_root: Directory that is searched recursively.
+
+    Returns:
+        A list with one ``probe_sample`` result per file, ordered by path.
+        The list is empty when no files are found.
+    """
+    # rglob() yields entries in filesystem order; sorting keeps the
+    # generated inventory reproducible from one run or machine to the next.
+    files = sorted(p for p in sample_root.rglob("*") if p.is_file())
+    return [probe_sample(p) for p in files]
@@ -0,0 +1,13 @@
+from pathlib import Path
+import json
+from mod_regression.sample_inventory import inventory_samples
+
+
+def test_inventory_multiple_files(tmp_path):
+    """Two files in the sample root yield two entries, each with a hash."""
+    for name, payload in (("a.ts", b"a"), ("b.ts", b"b")):
+        (tmp_path / name).write_bytes(payload)
+
+    entries = inventory_samples(tmp_path)
+
+    assert len(entries) == 2
+    for entry in entries:
+        assert "sha256" in entry
@@ -0,0 +1,30 @@
+import math
+import tempfile
+from pathlib import Path
+from unittest import mock
+from mod_regression.sample_inventory import probe_sample
+
+
+def fake_run(cmd, **kwargs):
+    """Stand-in for ``subprocess.run`` that always reports success.
+
+    Whatever command and keyword arguments are passed, the returned object
+    has ``returncode`` 0, a fixed ffprobe-style JSON document in ``stdout``
+    and a fixed CEA-608 detection message in ``stderr``.
+    """
+    class _FakeCompleted:
+        """Bare attribute holder mimicking a completed process."""
+
+    outcome = _FakeCompleted()
+    outcome.returncode = 0
+    outcome.stdout = (
+        '{"format":{"format_name":"mpegts","duration":"10.0"},'
+        '"streams":[{"codec_type":"video","codec_name":"h264"}]}'
+    )
+    outcome.stderr = "Detected CEA-608 captions"
+    return outcome
+
+
+@mock.patch("mod_regression.sample_inventory.subprocess.run", side_effect=fake_run)
+def test_probe_sample_basic(mock_run):
+    """probe_sample maps the faked tool output into its result dict."""
+    with tempfile.TemporaryDirectory() as workdir:
+        sample = Path(workdir, "sample.ts")
+        sample.write_bytes(b"dummy")
+
+        info = probe_sample(sample)
+
+        # Container and duration come from the faked ffprobe JSON.
+        assert info["container"] == "mpegts"
+        assert math.isclose(info["duration_sec"], 10.0, rel_tol=1e-9)
+        # The caption type comes from the faked ccextractor stderr text.
+        detected = info["caption_types_detected"]
+        assert "CEA-608" in detected
+        first_stream = info["streams"][0]
+        assert first_stream["codec"] == "h264"