Commit c922540
Add -x/--extract option to har command
The new --extract / -x option extracts all resources from the HAR file into a directory parallel to the HAR file, with meaningful filenames derived from URLs and extensions based on content-type. Example usage:

    shot-scraper har https://example.com/ --extract

This creates both example-com.har and an example-com/ directory containing all the resources. Works with both .har and .har.zip formats:

    shot-scraper har https://example.com/ --extract --zip

For the is-it-a-bird demo, use:

    shot-scraper har https://tools.simonwillison.net/is-it-a-bird \
        --extract -o isitabird.har \
        -j "document.querySelector('button')?.click()" \
        --wait-for "document.body.innerText.includes('Model loaded')" \
        --timeout 120000

Implements:

- New extension_for_content_type() utility for mapping MIME types
- New filename_for_har_entry() for deriving filenames from HAR entries
- Extraction logic handles both plain HAR and zip formats
- Supports content stored in HAR text field or as _file reference in zip

Closes #XXX
1 parent 8435690 commit c922540
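The commit message notes that a response body can live either in the HAR entry's `text` field or as a `_file` reference inside the zip. A minimal sketch of the three `response["content"]` shapes the new extraction code distinguishes (field names match the code in the diff below; the values here are invented for illustration):

```python
import base64

# Three shapes of response["content"] handled by _extract_har_entry():
inline_text = {"text": "<html>...</html>"}                   # plain text body
inline_b64 = {"text": "iVBORw0KGgo=", "encoding": "base64"}  # binary body, base64-encoded
in_zip = {"_file": "abc123.png"}  # body stored as a separate member of the .har.zip

# Plain text is UTF-8 encoded before writing; base64 is decoded;
# _file is read straight out of the open zip archive.
assert base64.b64decode(inline_b64["text"]).startswith(b"\x89PNG")
```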

File tree

5 files changed: +412 -2 lines changed

docs/har.md

Lines changed: 20 additions & 0 deletions
````diff
@@ -55,6 +55,23 @@
 
 You can record multiple pages to a single HTTP Archive using the {ref}`shot-scraper multi --har option<multi-har>`.
 
+## Extracting resources from HAR files
+
+Use the `--extract` or `-x` option to automatically extract all resources from the HAR file into a directory:
+
+```bash
+shot-scraper har https://datasette.io/ --extract
+```
+This will create both `datasette-io.har` and a `datasette-io/` directory containing all resources with meaningful filenames derived from their URLs.
+
+The extracted files use extensions based on their content-type. For example, a request to `/api/data` that returns JSON will be saved with a `.json` extension.
+
+You can combine this with `--zip`:
+```bash
+shot-scraper har https://datasette.io/ --extract --zip
+```
+This creates `datasette-io.har.zip` and extracts resources to the `datasette-io/` directory.
+
 ## `shot-scraper har --help`
 
 Full `--help` for this command:
@@ -87,8 +104,11 @@ Usage: shot-scraper har [OPTIONS] URL
   Use --zip to save as a .har.zip file instead, or specify a filename ending in
   .har.zip
 
+  Use --extract / -x to also extract all resources from the HAR into a directory
+
 Options:
   -z, --zip              Save as a .har.zip file
+  -x, --extract          Extract resources from the HAR file into a directory
   -a, --auth FILENAME    Path to JSON authentication context file
   -o, --output FILE      HAR filename
   --wait INTEGER         Wait this many milliseconds before taking the
````
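To make the `/api/data` example above concrete, here is roughly what the new `filename_for_har_entry()` helper (added in `shot_scraper/utils.py` below) produces for a couple of URL/content-type pairs, traced from the logic in the diff:

```python
from shot_scraper.utils import filename_for_har_entry

# Extension-less URL: the content-type supplies the extension
filename_for_har_entry(
    "https://datasette.io/api/data", "application/json; charset=utf-8"
)
# -> "datasette-io-api-data.json"

# A URL extension that agrees with the content-type is kept as-is
filename_for_har_entry("https://datasette.io/static/app.css", "text/css")
# -> "datasette-io-static-app.css"
```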

shot_scraper/cli.py

Lines changed: 116 additions & 1 deletion
```diff
@@ -1,3 +1,4 @@
+import base64
 import secrets
 import subprocess
 import sys
@@ -6,14 +7,20 @@
 import json
 import os
 import pathlib
+import zipfile
 from runpy import run_module
 from click_default_group import DefaultGroup
 import yaml
 import click
 from playwright.sync_api import sync_playwright, Error, TimeoutError
 
 
-from shot_scraper.utils import filename_for_url, load_github_script, url_or_file_path
+from shot_scraper.utils import (
+    filename_for_url,
+    filename_for_har_entry,
+    load_github_script,
+    url_or_file_path,
+)
 
 BROWSERS = ("chromium", "firefox", "webkit", "chrome", "chrome-beta")
 
@@ -711,6 +718,13 @@ def accessibility(
 @cli.command()
 @click.argument("url")
 @click.option("zip_", "-z", "--zip", is_flag=True, help="Save as a .har.zip file")
+@click.option(
+    "extract",
+    "-x",
+    "--extract",
+    is_flag=True,
+    help="Extract resources from the HAR file into a directory",
+)
 @click.option(
     "-a",
     "--auth",
@@ -740,6 +754,7 @@ def accessibility(
 def har(
     url,
     zip_,
+    extract,
     auth,
     output,
     wait,
@@ -765,6 +780,8 @@ def har(
         shot-scraper har https://datasette.io/ -o trace.har
 
     Use --zip to save as a .har.zip file instead, or specify a filename ending in .har.zip
+
+    Use --extract / -x to also extract all resources from the HAR into a directory
     """
     if output is None:
         output = filename_for_url(
@@ -799,6 +816,104 @@ def har(
         context.close()
         browser_obj.close()
 
+    if extract:
+        _extract_har_resources(output)
+
+
+def _extract_har_resources(har_path):
+    """Extract resources from a HAR file into a directory."""
+    har_path = pathlib.Path(har_path)
+
+    # Determine if it's a zip file
+    is_zip = zipfile.is_zipfile(har_path)
+
+    # Determine extract directory name (parallel to har file)
+    if str(har_path).endswith(".har.zip"):
+        extract_dir = har_path.parent / har_path.name.replace(".har.zip", "")
+    else:
+        extract_dir = har_path.parent / har_path.name.replace(".har", "")
+
+    # Create the extract directory
+    extract_dir.mkdir(exist_ok=True)
+
+    # Track existing files to handle duplicates
+    existing_files = set()
+
+    def file_exists_in_dir(filename):
+        return filename in existing_files
+
+    # Load the HAR data (and keep zip file open if needed)
+    if is_zip:
+        with zipfile.ZipFile(har_path) as zf:
+            with zf.open("har.har") as har_file:
+                har_data = json.load(har_file)
+
+            # Extract each entry (with zip file open for _file references)
+            for entry in har_data.get("log", {}).get("entries", []):
+                _extract_har_entry(entry, extract_dir, existing_files, file_exists_in_dir, zf)
+    else:
+        with open(har_path) as har_file:
+            har_data = json.load(har_file)
+
+        # Extract each entry
+        for entry in har_data.get("log", {}).get("entries", []):
+            _extract_har_entry(entry, extract_dir, existing_files, file_exists_in_dir, None)
+
+    click.echo(f"Extracted resources to: {extract_dir}", err=True)
+
+
+def _extract_har_entry(entry, extract_dir, existing_files, file_exists_fn, zip_file):
+    """Extract a single HAR entry to the extract directory."""
+    request = entry.get("request", {})
+    response = entry.get("response", {})
+    content = response.get("content", {})
+
+    url = request.get("url", "")
+    if not url:
+        return
+
+    # Get content-type from response headers
+    content_type = None
+    for header in response.get("headers", []):
+        if header.get("name", "").lower() == "content-type":
+            content_type = header.get("value", "")
+            break
+
+    # Get the content - either from text field or from _file reference in zip
+    text = content.get("text", "")
+    encoding = content.get("encoding", "")
+    file_ref = content.get("_file", "")
+
+    data = None
+
+    if file_ref and zip_file:
+        # Content is stored as a separate file in the zip
+        try:
+            with zip_file.open(file_ref) as f:
+                data = f.read()
+        except KeyError:
+            pass
+    elif text:
+        # Decode the content from text field
+        if encoding == "base64":
+            try:
+                data = base64.b64decode(text)
+            except Exception:
+                return
+        else:
+            data = text.encode("utf-8")
+
+    if not data:
+        return
+
+    # Generate filename
+    filename = filename_for_har_entry(url, content_type, file_exists=file_exists_fn)
+    existing_files.add(filename)
+
+    # Write the file
+    file_path = extract_dir / filename
+    file_path.write_bytes(data)
+
 
 @cli.command()
 @click.argument("url")
```
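Because extraction is driven entirely by the saved archive, the new helper can in principle be pointed at a HAR file that already exists on disk, without re-recording. A sketch under that assumption (`_extract_har_resources()` is a private helper introduced in this commit, not a supported public interface):

```python
from shot_scraper.cli import _extract_har_resources

# Re-run extraction against a previously recorded archive. Both formats
# work because the helper sniffs the file with zipfile.is_zipfile().
_extract_har_resources("datasette-io.har")      # -> creates datasette-io/
_extract_har_resources("datasette-io.har.zip")  # -> same directory name
```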

shot_scraper/utils.py

Lines changed: 111 additions & 0 deletions
```diff
@@ -1,8 +1,50 @@
 import urllib.parse
 import re
+import os.path
 
 disallowed_re = re.compile("[^a-zA-Z0-9_-]")
 
+# Map content-type to file extension
+CONTENT_TYPE_EXTENSIONS = {
+    "text/html": "html",
+    "text/css": "css",
+    "application/javascript": "js",
+    "text/javascript": "js",
+    "application/json": "json",
+    "image/png": "png",
+    "image/jpeg": "jpg",
+    "image/gif": "gif",
+    "image/webp": "webp",
+    "image/svg+xml": "svg",
+    "application/pdf": "pdf",
+    "text/plain": "txt",
+    "application/xml": "xml",
+    "text/xml": "xml",
+    "font/woff2": "woff2",
+    "font/woff": "woff",
+    "application/font-woff": "woff",
+}
+
+# Map file extension to expected content-type prefix
+EXTENSION_CONTENT_TYPES = {
+    "html": "text/html",
+    "htm": "text/html",
+    "css": "text/css",
+    "js": "application/javascript",
+    "json": "application/json",
+    "png": "image/png",
+    "jpg": "image/jpeg",
+    "jpeg": "image/jpeg",
+    "gif": "image/gif",
+    "webp": "image/webp",
+    "svg": "image/svg+xml",
+    "pdf": "application/pdf",
+    "txt": "text/plain",
+    "xml": "application/xml",
+    "woff2": "font/woff2",
+    "woff": "font/woff",
+}
+
 
 def file_exists_never(filename):
     return False
@@ -72,3 +114,72 @@ def load_github_script(github_path: str) -> str:
         )
     except urllib.error.URLError as e:
         raise ValueError(f"Error fetching from GitHub: {e}")
+
+
+def extension_for_content_type(content_type):
+    """
+    Return the file extension for a given content-type.
+
+    Returns None if the content-type is unknown or empty.
+    """
+    if not content_type:
+        return None
+    # Strip charset and other parameters
+    mime_type = content_type.split(";")[0].strip().lower()
+    return CONTENT_TYPE_EXTENSIONS.get(mime_type)
+
+
+def filename_for_har_entry(url, content_type, file_exists=file_exists_never):
+    """
+    Derive a filename for a HAR entry based on its URL and content-type.
+
+    Uses the URL to generate a base filename, then determines the extension:
+    - If the URL has an extension that matches the content-type, use it
+    - If the URL has no extension, or the extension doesn't match, use content-type
+    - If neither URL nor content-type provide an extension, use .bin
+    """
+    bits = urllib.parse.urlparse(url)
+    url_path = bits.path
+
+    # Try to get extension from URL path
+    path_base, url_ext_with_dot = os.path.splitext(url_path)
+    url_ext = url_ext_with_dot.lstrip(".").lower() if url_ext_with_dot else None
+
+    # Get extension from content-type
+    ct_ext = extension_for_content_type(content_type)
+
+    # Determine if URL extension matches content-type
+    url_ext_matches_ct = False
+    if url_ext and ct_ext:
+        expected_ct = EXTENSION_CONTENT_TYPES.get(url_ext, "").lower()
+        actual_ct = content_type.split(";")[0].strip().lower() if content_type else ""
+        if expected_ct and expected_ct == actual_ct:
+            url_ext_matches_ct = True
+        elif url_ext in ("jpg", "jpeg") and ct_ext in ("jpg", "jpeg"):
+            url_ext_matches_ct = True
+
+    # Get base filename from URL (netloc + path, excluding query)
+    # Only strip extension from path if it matches content-type
+    if url_ext and url_ext_matches_ct:
+        path_for_base = path_base
+    else:
+        path_for_base = url_path
+    base = (bits.netloc + path_for_base).replace(".", "-").replace("/", "-").rstrip("-")
+    base = disallowed_re.sub("", base).lstrip("-")
+
+    # Determine final extension
+    if url_ext_matches_ct:
+        ext = url_ext
+    elif ct_ext:
+        ext = ct_ext
+    elif url_ext:
+        ext = url_ext
+    else:
+        ext = "bin"
+
+    filename = f"{base}.{ext}"
+    suffix = 0
+    while file_exists(filename):
+        suffix += 1
+        filename = f"{base}.{suffix}.{ext}"
+    return filename
```
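Two edge cases worth noting in the code above: unknown content-types fall back to `.bin` when the URL offers no extension, and name collisions are resolved with a numeric suffix via the `file_exists` callback. A quick sketch of both behaviors, traced from the logic in the diff:

```python
from shot_scraper.utils import extension_for_content_type, filename_for_har_entry

# Charset parameters are stripped; unmapped MIME types return None
assert extension_for_content_type("text/html; charset=utf-8") == "html"
assert extension_for_content_type("application/octet-stream") is None

# No URL extension and an unmapped content-type -> .bin fallback
assert (
    filename_for_har_entry("https://datasette.io/download", "application/octet-stream")
    == "datasette-io-download.bin"
)

# Collisions get a numeric suffix inserted before the extension
seen = set()
first = filename_for_har_entry("https://datasette.io/", "text/html", file_exists=seen.__contains__)
seen.add(first)
second = filename_for_har_entry("https://datasette.io/", "text/html", file_exists=seen.__contains__)
assert (first, second) == ("datasette-io.html", "datasette-io.1.html")
```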
