Commit c922540
Add -x/--extract option to har command
The new --extract / -x option extracts all resources from the HAR file into a directory parallel to the HAR file, with meaningful filenames derived from URLs and extensions based on content-type. Example usage:

    shot-scraper har https://example.com/ --extract

This creates both example-com.har and an example-com/ directory containing all the resources. Works with both .har and .har.zip formats:

    shot-scraper har https://example.com/ --extract --zip

For the is-it-a-bird demo, use:

    shot-scraper har https://tools.simonwillison.net/is-it-a-bird \
        --extract -o isitabird.har \
        -j "document.querySelector('button')?.click()" \
        --wait-for "document.body.innerText.includes('Model loaded')" \
        --timeout 120000

Implements:

- New extension_for_content_type() utility for mapping MIME types
- New filename_for_har_entry() for deriving filenames from HAR entries
- Extraction logic handles both plain HAR and zip formats
- Supports content stored in HAR text field or as _file reference in zip

Closes #XXX
1 parent 8435690 commit c922540
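The commit message notes that a response body can live either in the HAR entry's `text` field or as a `_file` reference inside the zip. A minimal sketch of the three `response["content"]` shapes the new extraction code distinguishes (field names match the code in the diff below; the values here are invented for illustration):

```python
import base64

# Three shapes of response["content"] handled by _extract_har_entry():
inline_text = {"text": "<html>...</html>"}                   # plain text body
inline_b64 = {"text": "iVBORw0KGgo=", "encoding": "base64"}  # binary body, base64-encoded
in_zip = {"_file": "abc123.png"}  # body stored as a separate member of the .har.zip

# Plain text is UTF-8 encoded before writing; base64 is decoded;
# _file is read straight out of the open zip archive.
assert base64.b64decode(inline_b64["text"]).startswith(b"\x89PNG")
```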

File tree

5 files changed: +412 -2 lines changed

docs/har.md

Lines changed: 20 additions & 0 deletions
````diff
@@ -55,6 +55,23 @@
 
 You can record multiple pages to a single HTTP Archive using the {ref}`shot-scraper multi --har option<multi-har>`.
 
+## Extracting resources from HAR files
+
+Use the `--extract` or `-x` option to automatically extract all resources from the HAR file into a directory:
+
+```bash
+shot-scraper har https://datasette.io/ --extract
+```
+This will create both `datasette-io.har` and a `datasette-io/` directory containing all resources with meaningful filenames derived from their URLs.
+
+The extracted files use extensions based on their content-type. For example, a request to `/api/data` that returns JSON will be saved with a `.json` extension.
+
+You can combine this with `--zip`:
+```bash
+shot-scraper har https://datasette.io/ --extract --zip
+```
+This creates `datasette-io.har.zip` and extracts resources to the `datasette-io/` directory.
+
 ## `shot-scraper har --help`
 
 Full `--help` for this command:
@@ -87,8 +104,11 @@ Usage: shot-scraper har [OPTIONS] URL
   Use --zip to save as a .har.zip file instead, or specify a filename ending in
   .har.zip
 
+  Use --extract / -x to also extract all resources from the HAR into a directory
+
 Options:
   -z, --zip              Save as a .har.zip file
+  -x, --extract          Extract resources from the HAR file into a directory
   -a, --auth FILENAME    Path to JSON authentication context file
   -o, --output FILE      HAR filename
   --wait INTEGER         Wait this many milliseconds before taking the
````
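To make the `/api/data` example above concrete, here is roughly what the new `filename_for_har_entry()` helper (added in `shot_scraper/utils.py` below) produces for a couple of URL/content-type pairs, traced from the logic in the diff:

```python
from shot_scraper.utils import filename_for_har_entry

# Extension-less URL: the content-type supplies the extension
filename_for_har_entry(
    "https://datasette.io/api/data", "application/json; charset=utf-8"
)
# -> "datasette-io-api-data.json"

# A URL extension that agrees with the content-type is kept as-is
filename_for_har_entry("https://datasette.io/static/app.css", "text/css")
# -> "datasette-io-static-app.css"
```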

shot_scraper/cli.py

Lines changed: 116 additions & 1 deletion
```diff
@@ -1,3 +1,4 @@
+import base64
 import secrets
 import subprocess
 import sys
@@ -6,14 +7,20 @@
 import json
 import os
 import pathlib
+import zipfile
 from runpy import run_module
 from click_default_group import DefaultGroup
 import yaml
 import click
 from playwright.sync_api import sync_playwright, Error, TimeoutError
 
 
-from shot_scraper.utils import filename_for_url, load_github_script, url_or_file_path
+from shot_scraper.utils import (
+    filename_for_url,
+    filename_for_har_entry,
+    load_github_script,
+    url_or_file_path,
+)
 
 BROWSERS = ("chromium", "firefox", "webkit", "chrome", "chrome-beta")
 
@@ -711,6 +718,13 @@ def accessibility(
 @cli.command()
 @click.argument("url")
 @click.option("zip_", "-z", "--zip", is_flag=True, help="Save as a .har.zip file")
+@click.option(
+    "extract",
+    "-x",
+    "--extract",
+    is_flag=True,
+    help="Extract resources from the HAR file into a directory",
+)
 @click.option(
     "-a",
     "--auth",
@@ -740,6 +754,7 @@ def accessibility(
 def har(
     url,
     zip_,
+    extract,
     auth,
     output,
     wait,
@@ -765,6 +780,8 @@ def har(
         shot-scraper har https://datasette.io/ -o trace.har
 
     Use --zip to save as a .har.zip file instead, or specify a filename ending in .har.zip
+
+    Use --extract / -x to also extract all resources from the HAR into a directory
     """
     if output is None:
         output = filename_for_url(
@@ -799,6 +816,104 @@ def har(
         context.close()
         browser_obj.close()
 
+    if extract:
+        _extract_har_resources(output)
+
+
+def _extract_har_resources(har_path):
+    """Extract resources from a HAR file into a directory."""
+    har_path = pathlib.Path(har_path)
+
+    # Determine if it's a zip file
+    is_zip = zipfile.is_zipfile(har_path)
+
+    # Determine extract directory name (parallel to har file)
+    if str(har_path).endswith(".har.zip"):
+        extract_dir = har_path.parent / har_path.name.replace(".har.zip", "")
+    else:
+        extract_dir = har_path.parent / har_path.name.replace(".har", "")
+
+    # Create the extract directory
+    extract_dir.mkdir(exist_ok=True)
+
+    # Track existing files to handle duplicates
+    existing_files = set()
+
+    def file_exists_in_dir(filename):
+        return filename in existing_files
+
+    # Load the HAR data (and keep zip file open if needed)
+    if is_zip:
+        with zipfile.ZipFile(har_path) as zf:
+            with zf.open("har.har") as har_file:
+                har_data = json.load(har_file)
+
+            # Extract each entry (with zip file open for _file references)
+            for entry in har_data.get("log", {}).get("entries", []):
+                _extract_har_entry(entry, extract_dir, existing_files, file_exists_in_dir, zf)
+    else:
+        with open(har_path) as har_file:
+            har_data = json.load(har_file)
+
+        # Extract each entry
+        for entry in har_data.get("log", {}).get("entries", []):
+            _extract_har_entry(entry, extract_dir, existing_files, file_exists_in_dir, None)
+
+    click.echo(f"Extracted resources to: {extract_dir}", err=True)
+
+
+def _extract_har_entry(entry, extract_dir, existing_files, file_exists_fn, zip_file):
+    """Extract a single HAR entry to the extract directory."""
+    request = entry.get("request", {})
+    response = entry.get("response", {})
+    content = response.get("content", {})
+
+    url = request.get("url", "")
+    if not url:
+        return
+
+    # Get content-type from response headers
+    content_type = None
+    for header in response.get("headers", []):
+        if header.get("name", "").lower() == "content-type":
+            content_type = header.get("value", "")
+            break
+
+    # Get the content - either from text field or from _file reference in zip
+    text = content.get("text", "")
+    encoding = content.get("encoding", "")
+    file_ref = content.get("_file", "")
+
+    data = None
+
+    if file_ref and zip_file:
+        # Content is stored as a separate file in the zip
+        try:
+            with zip_file.open(file_ref) as f:
+                data = f.read()
+        except KeyError:
+            pass
+    elif text:
+        # Decode the content from text field
+        if encoding == "base64":
+            try:
+                data = base64.b64decode(text)
+            except Exception:
+                return
+        else:
+            data = text.encode("utf-8")
+
+    if not data:
+        return
+
+    # Generate filename
+    filename = filename_for_har_entry(url, content_type, file_exists=file_exists_fn)
+    existing_files.add(filename)
+
+    # Write the file
+    file_path = extract_dir / filename
+    file_path.write_bytes(data)
+
 
 @cli.command()
 @click.argument("url")
```
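Because extraction is driven entirely by the saved archive, the new helper can in principle be pointed at a HAR file that already exists on disk, without re-recording. A sketch under that assumption (`_extract_har_resources()` is a private helper introduced in this commit, not a supported public interface):

```python
from shot_scraper.cli import _extract_har_resources

# Re-run extraction against a previously recorded archive. Both formats
# work because the helper sniffs the file with zipfile.is_zipfile().
_extract_har_resources("datasette-io.har")      # -> creates datasette-io/
_extract_har_resources("datasette-io.har.zip")  # -> same directory name
```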

shot_scraper/utils.py

Lines changed: 111 additions & 0 deletions
```diff
@@ -1,8 +1,50 @@
 import urllib.parse
 import re
+import os.path
 
 disallowed_re = re.compile("[^a-zA-Z0-9_-]")
 
+# Map content-type to file extension
+CONTENT_TYPE_EXTENSIONS = {
+    "text/html": "html",
+    "text/css": "css",
+    "application/javascript": "js",
+    "text/javascript": "js",
+    "application/json": "json",
+    "image/png": "png",
+    "image/jpeg": "jpg",
+    "image/gif": "gif",
+    "image/webp": "webp",
+    "image/svg+xml": "svg",
+    "application/pdf": "pdf",
+    "text/plain": "txt",
+    "application/xml": "xml",
+    "text/xml": "xml",
+    "font/woff2": "woff2",
+    "font/woff": "woff",
+    "application/font-woff": "woff",
+}
+
+# Map file extension to expected content-type prefix
+EXTENSION_CONTENT_TYPES = {
+    "html": "text/html",
+    "htm": "text/html",
+    "css": "text/css",
+    "js": "application/javascript",
+    "json": "application/json",
+    "png": "image/png",
+    "jpg": "image/jpeg",
+    "jpeg": "image/jpeg",
+    "gif": "image/gif",
+    "webp": "image/webp",
+    "svg": "image/svg+xml",
+    "pdf": "application/pdf",
+    "txt": "text/plain",
+    "xml": "application/xml",
+    "woff2": "font/woff2",
+    "woff": "font/woff",
+}
+
 
 def file_exists_never(filename):
     return False
@@ -72,3 +114,72 @@ def load_github_script(github_path: str) -> str:
         )
     except urllib.error.URLError as e:
         raise ValueError(f"Error fetching from GitHub: {e}")
+
+
+def extension_for_content_type(content_type):
+    """
+    Return the file extension for a given content-type.
+
+    Returns None if the content-type is unknown or empty.
+    """
+    if not content_type:
+        return None
+    # Strip charset and other parameters
+    mime_type = content_type.split(";")[0].strip().lower()
+    return CONTENT_TYPE_EXTENSIONS.get(mime_type)
+
+
+def filename_for_har_entry(url, content_type, file_exists=file_exists_never):
+    """
+    Derive a filename for a HAR entry based on its URL and content-type.
+
+    Uses the URL to generate a base filename, then determines the extension:
+    - If the URL has an extension that matches the content-type, use it
+    - If the URL has no extension, or the extension doesn't match, use content-type
+    - If neither URL nor content-type provide an extension, use .bin
+    """
+    bits = urllib.parse.urlparse(url)
+    url_path = bits.path
+
+    # Try to get extension from URL path
+    path_base, url_ext_with_dot = os.path.splitext(url_path)
+    url_ext = url_ext_with_dot.lstrip(".").lower() if url_ext_with_dot else None
+
+    # Get extension from content-type
+    ct_ext = extension_for_content_type(content_type)
+
+    # Determine if URL extension matches content-type
+    url_ext_matches_ct = False
+    if url_ext and ct_ext:
+        expected_ct = EXTENSION_CONTENT_TYPES.get(url_ext, "").lower()
+        actual_ct = content_type.split(";")[0].strip().lower() if content_type else ""
+        if expected_ct and expected_ct == actual_ct:
+            url_ext_matches_ct = True
+        elif url_ext in ("jpg", "jpeg") and ct_ext in ("jpg", "jpeg"):
+            url_ext_matches_ct = True
+
+    # Get base filename from URL (netloc + path, excluding query)
+    # Only strip extension from path if it matches content-type
+    if url_ext and url_ext_matches_ct:
+        path_for_base = path_base
+    else:
+        path_for_base = url_path
+    base = (bits.netloc + path_for_base).replace(".", "-").replace("/", "-").rstrip("-")
+    base = disallowed_re.sub("", base).lstrip("-")
+
+    # Determine final extension
+    if url_ext_matches_ct:
+        ext = url_ext
+    elif ct_ext:
+        ext = ct_ext
+    elif url_ext:
+        ext = url_ext
+    else:
+        ext = "bin"
+
+    filename = f"{base}.{ext}"
+    suffix = 0
+    while file_exists(filename):
+        suffix += 1
+        filename = f"{base}.{suffix}.{ext}"
+    return filename
```
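Two edge cases worth noting in the code above: unknown content-types fall back to `.bin` when the URL offers no extension, and name collisions are resolved with a numeric suffix via the `file_exists` callback. A quick sketch of both behaviors, traced from the logic in the diff:

```python
from shot_scraper.utils import extension_for_content_type, filename_for_har_entry

# Charset parameters are stripped; unmapped MIME types return None
assert extension_for_content_type("text/html; charset=utf-8") == "html"
assert extension_for_content_type("application/octet-stream") is None

# No URL extension and an unmapped content-type -> .bin fallback
assert (
    filename_for_har_entry("https://datasette.io/download", "application/octet-stream")
    == "datasette-io-download.bin"
)

# Collisions get a numeric suffix inserted before the extension
seen = set()
first = filename_for_har_entry("https://datasette.io/", "text/html", file_exists=seen.__contains__)
seen.add(first)
second = filename_for_har_entry("https://datasette.io/", "text/html", file_exists=seen.__contains__)
assert (first, second) == ("datasette-io.html", "datasette-io.1.html")
```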
