Merged
20 changes: 20 additions & 0 deletions docs/har.md
@@ -55,6 +55,23 @@ Archive: datasette-io.har.zip

You can record multiple pages to a single HTTP Archive using the {ref}`shot-scraper multi --har option<multi-har>`.

## Extracting resources from HAR files

Use the `--extract` or `-x` option to automatically extract all resources from the HAR file into a directory:

```bash
shot-scraper har https://datasette.io/ --extract
```
This will create both `datasette-io.har` and a `datasette-io/` directory containing all resources with meaningful filenames derived from their URLs.

The extracted files use extensions based on their content-type. For example, a request to `/api/data` that returns JSON will be saved with a `.json` extension, under a name derived from its URL such as `datasette-io-api-data.json`.

You can combine this with `--zip`:
```bash
shot-scraper har https://datasette.io/ --extract --zip
```
This creates `datasette-io.har.zip` and extracts resources to the `datasette-io/` directory.

## `shot-scraper har --help`

Full `--help` for this command:
@@ -87,8 +104,11 @@ Usage: shot-scraper har [OPTIONS] URL
Use --zip to save as a .har.zip file instead, or specify a filename ending in
.har.zip
Use --extract / -x to also extract all resources from the HAR into a directory
Options:
  -z, --zip            Save as a .har.zip file
  -x, --extract        Extract resources from the HAR file into a directory
  -a, --auth FILENAME  Path to JSON authentication context file
  -o, --output FILE    HAR filename
  --wait INTEGER       Wait this many milliseconds before taking the
117 changes: 116 additions & 1 deletion shot_scraper/cli.py
@@ -1,3 +1,4 @@
import base64
import secrets
import subprocess
import sys
@@ -6,14 +7,20 @@
import json
import os
import pathlib
import zipfile
from runpy import run_module
from click_default_group import DefaultGroup
import yaml
import click
from playwright.sync_api import sync_playwright, Error, TimeoutError


from shot_scraper.utils import filename_for_url, load_github_script, url_or_file_path
from shot_scraper.utils import (
    filename_for_url,
    filename_for_har_entry,
    load_github_script,
    url_or_file_path,
)

BROWSERS = ("chromium", "firefox", "webkit", "chrome", "chrome-beta")

@@ -712,6 +719,13 @@ def accessibility(
@cli.command()
@click.argument("url")
@click.option("zip_", "-z", "--zip", is_flag=True, help="Save as a .har.zip file")
@click.option(
"extract",
"-x",
"--extract",
is_flag=True,
help="Extract resources from the HAR file into a directory",
)
@click.option(
"-a",
"--auth",
@@ -741,6 +755,7 @@ def accessibility(
def har(
    url,
    zip_,
    extract,
    auth,
    output,
    wait,
@@ -766,6 +781,8 @@ def har(
        shot-scraper har https://datasette.io/ -o trace.har

    Use --zip to save as a .har.zip file instead, or specify a filename ending in .har.zip

    Use --extract / -x to also extract all resources from the HAR into a directory
    """
    if output is None:
        output = filename_for_url(
@@ -800,6 +817,104 @@ def har(
        context.close()
        browser_obj.close()

    if extract:
        _extract_har_resources(output)


def _extract_har_resources(har_path):
    """Extract resources from a HAR file into a directory."""
    har_path = pathlib.Path(har_path)

    # Determine if it's a zip file
    is_zip = zipfile.is_zipfile(har_path)

    # Determine extract directory name (parallel to har file)
    if str(har_path).endswith(".har.zip"):
        extract_dir = har_path.parent / har_path.name.replace(".har.zip", "")
    else:
        extract_dir = har_path.parent / har_path.name.replace(".har", "")

    # Create the extract directory
    extract_dir.mkdir(exist_ok=True)

    # Track existing files to handle duplicates
    existing_files = set()

    def file_exists_in_dir(filename):
        return filename in existing_files

    # Load the HAR data (and keep zip file open if needed)
    if is_zip:
        with zipfile.ZipFile(har_path) as zf:
            with zf.open("har.har") as har_file:
                har_data = json.load(har_file)

            # Extract each entry (with zip file open for _file references)
            for entry in har_data.get("log", {}).get("entries", []):
                _extract_har_entry(entry, extract_dir, existing_files, file_exists_in_dir, zf)
    else:
        with open(har_path) as har_file:
            har_data = json.load(har_file)

        # Extract each entry
        for entry in har_data.get("log", {}).get("entries", []):
            _extract_har_entry(entry, extract_dir, existing_files, file_exists_in_dir, None)

    click.echo(f"Extracted resources to: {extract_dir}", err=True)


def _extract_har_entry(entry, extract_dir, existing_files, file_exists_fn, zip_file):
    """Extract a single HAR entry to the extract directory."""
    request = entry.get("request", {})
    response = entry.get("response", {})
    content = response.get("content", {})

    url = request.get("url", "")
    if not url:
        return

    # Get content-type from response headers
    content_type = None
    for header in response.get("headers", []):
        if header.get("name", "").lower() == "content-type":
            content_type = header.get("value", "")
            break

    # Get the content - either from text field or from _file reference in zip
    text = content.get("text", "")
    encoding = content.get("encoding", "")
    file_ref = content.get("_file", "")

    data = None

    if file_ref and zip_file:
        # Content is stored as a separate file in the zip
        try:
            with zip_file.open(file_ref) as f:
                data = f.read()
        except KeyError:
            pass
    elif text:
        # Decode the content from text field
        if encoding == "base64":
            try:
                data = base64.b64decode(text)
            except Exception:
                return
        else:
            data = text.encode("utf-8")

    if not data:
        return

    # Generate filename
    filename = filename_for_har_entry(url, content_type, file_exists=file_exists_fn)
    existing_files.add(filename)

    # Write the file
    file_path = extract_dir / filename
    file_path.write_bytes(data)
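
To make the flow above concrete, a rough sketch of feeding one made-up HAR entry through `_extract_har_entry` (the entry dict, directory name and expected output filename are illustrative assumptions, not captured data):

```python
import pathlib

from shot_scraper.cli import _extract_har_entry

# A made-up minimal HAR entry: a JSON API response with a base64-encoded body
entry = {
    "request": {"url": "https://datasette.io/api/data"},
    "response": {
        "headers": [{"name": "Content-Type", "value": "application/json"}],
        "content": {"text": "eyJvayI6IHRydWV9", "encoding": "base64"},
    },
}

extract_dir = pathlib.Path("datasette-io")
extract_dir.mkdir(exist_ok=True)
existing_files = set()

_extract_har_entry(entry, extract_dir, existing_files, lambda name: name in existing_files, None)
# With the filename rules in utils.py this writes datasette-io-api-data.json
# containing the decoded bytes b'{"ok": true}'.
```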


@cli.command()
@click.argument("url")
111 changes: 111 additions & 0 deletions shot_scraper/utils.py
@@ -1,8 +1,50 @@
import urllib.parse
import re
import os.path

disallowed_re = re.compile("[^a-zA-Z0-9_-]")

# Map content-type to file extension
CONTENT_TYPE_EXTENSIONS = {
    "text/html": "html",
    "text/css": "css",
    "application/javascript": "js",
    "text/javascript": "js",
    "application/json": "json",
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/gif": "gif",
    "image/webp": "webp",
    "image/svg+xml": "svg",
    "application/pdf": "pdf",
    "text/plain": "txt",
    "application/xml": "xml",
    "text/xml": "xml",
    "font/woff2": "woff2",
    "font/woff": "woff",
    "application/font-woff": "woff",
}

# Map file extension to expected content-type prefix
EXTENSION_CONTENT_TYPES = {
    "html": "text/html",
    "htm": "text/html",
    "css": "text/css",
    "js": "application/javascript",
    "json": "application/json",
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "gif": "image/gif",
    "webp": "image/webp",
    "svg": "image/svg+xml",
    "pdf": "application/pdf",
    "txt": "text/plain",
    "xml": "application/xml",
    "woff2": "font/woff2",
    "woff": "font/woff",
}


def file_exists_never(filename):
    return False
@@ -72,3 +114,72 @@ def load_github_script(github_path: str) -> str:
        )
    except urllib.error.URLError as e:
        raise ValueError(f"Error fetching from GitHub: {e}")


def extension_for_content_type(content_type):
    """
    Return the file extension for a given content-type.
    Returns None if the content-type is unknown or empty.
    """
    if not content_type:
        return None
    # Strip charset and other parameters
    mime_type = content_type.split(";")[0].strip().lower()
    return CONTENT_TYPE_EXTENSIONS.get(mime_type)
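
A quick illustration of the lookup behaviour (example values chosen for this sketch, not taken from the tests):

```python
# Charset and other parameters are stripped before the lookup
assert extension_for_content_type("text/html; charset=utf-8") == "html"
assert extension_for_content_type("application/json") == "json"
# Unknown or missing content-types fall through to None
assert extension_for_content_type("application/x-made-up") is None
assert extension_for_content_type(None) is None
```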


def filename_for_har_entry(url, content_type, file_exists=file_exists_never):
    """
    Derive a filename for a HAR entry based on its URL and content-type.
    Uses the URL to generate a base filename, then determines the extension:
    - If the URL has an extension that matches the content-type, use it
    - If the URL has no extension, or the extension doesn't match, use content-type
    - If neither URL nor content-type provide an extension, use .bin
    """
    bits = urllib.parse.urlparse(url)
    url_path = bits.path

    # Try to get extension from URL path
    path_base, url_ext_with_dot = os.path.splitext(url_path)
    url_ext = url_ext_with_dot.lstrip(".").lower() if url_ext_with_dot else None

    # Get extension from content-type
    ct_ext = extension_for_content_type(content_type)

    # Determine if URL extension matches content-type
    url_ext_matches_ct = False
    if url_ext and ct_ext:
        expected_ct = EXTENSION_CONTENT_TYPES.get(url_ext, "").lower()
        actual_ct = content_type.split(";")[0].strip().lower() if content_type else ""
        if expected_ct and expected_ct == actual_ct:
            url_ext_matches_ct = True
        elif url_ext in ("jpg", "jpeg") and ct_ext in ("jpg", "jpeg"):
            url_ext_matches_ct = True

    # Get base filename from URL (netloc + path, excluding query)
    # Only strip extension from path if it matches content-type
    if url_ext and url_ext_matches_ct:
        path_for_base = path_base
    else:
        path_for_base = url_path
    base = (bits.netloc + path_for_base).replace(".", "-").replace("/", "-").rstrip("-")
    base = disallowed_re.sub("", base).lstrip("-")

    # Determine final extension
    if url_ext_matches_ct:
        ext = url_ext
    elif ct_ext:
        ext = ct_ext
    elif url_ext:
        ext = url_ext
    else:
        ext = "bin"

    filename = f"{base}.{ext}"
    suffix = 0
    while file_exists(filename):
        suffix += 1
        filename = f"{base}.{suffix}.{ext}"
    return filename
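
A few illustrative calls showing how these rules play out (URLs are assumed examples):

```python
# No extension in the URL: the content-type decides
assert (
    filename_for_har_entry("https://datasette.io/api/data", "application/json")
    == "datasette-io-api-data.json"
)
# URL extension that matches the content-type is kept
assert (
    filename_for_har_entry("https://datasette.io/static/app.css", "text/css")
    == "datasette-io-static-app.css"
)
# Neither the URL nor the content-type gives an extension: fall back to .bin
assert filename_for_har_entry("https://datasette.io/ping", None) == "datasette-io-ping.bin"
```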