
Commit a6ca48d

Add -x/--extract option to har command (#183)
* Add -x/--extract option to har command

The new --extract / -x option extracts all resources from the HAR file into a directory parallel to the HAR file, with meaningful filenames derived from URLs and extensions based on content-type.

Example usage:

    shot-scraper har https://example.com/ --extract

This creates both example-com.har and an example-com/ directory containing all the resources.

Works with both .har and .har.zip formats:

    shot-scraper har https://example.com/ --extract --zip

For the is-it-a-bird demo, use:

    shot-scraper har https://tools.simonwillison.net/is-it-a-bird \
      --extract -o isitabird.har \
      -j "document.querySelector('button')?.click()" \
      --wait-for "document.body.innerText.includes('Model loaded')" \
      --timeout 120000

Implements:

- New extension_for_content_type() utility for mapping MIME types
- New filename_for_har_entry() for deriving filenames from HAR entries
- Extraction logic handles both plain HAR and zip formats
- Supports content stored in HAR text field or as _file reference in zip

Closes #XXX

* Drop testing on 3.9, test on 3.14

---------

https://gistpreview.github.io/?8958f38250b3bf8f5693fa7f0c73b57c/index.html
1 parent 51afe62 commit a6ca48d


5 files changed: +412 -2 lines changed


docs/har.md

Lines changed: 20 additions & 0 deletions
@@ -55,6 +55,23 @@ Archive: datasette-io.har.zip
 
 You can record multiple pages to a single HTTP Archive using the {ref}`shot-scraper multi --har option<multi-har>`.
 
+## Extracting resources from HAR files
+
+Use the `--extract` or `-x` option to automatically extract all resources from the HAR file into a directory:
+
+```bash
+shot-scraper har https://datasette.io/ --extract
+```
+This will create both `datasette-io.har` and a `datasette-io/` directory containing all resources with meaningful filenames derived from their URLs.
+
+The extracted files use extensions based on their content-type. For example, a request to `/api/data` that returns JSON will be saved with a `.json` extension.
+
+You can combine this with `--zip`:
+```bash
+shot-scraper har https://datasette.io/ --extract --zip
+```
+This creates `datasette-io.har.zip` and extracts resources to the `datasette-io/` directory.
+
 ## `shot-scraper har --help`
 
 Full `--help` for this command:
@@ -87,8 +104,11 @@ Usage: shot-scraper har [OPTIONS] URL
   Use --zip to save as a .har.zip file instead, or specify a filename ending in
   .har.zip
 
+  Use --extract / -x to also extract all resources from the HAR into a directory
+
 Options:
   -z, --zip               Save as a .har.zip file
+  -x, --extract           Extract resources from the HAR file into a directory
   -a, --auth FILENAME     Path to JSON authentication context file
   -o, --output FILE       HAR filename
   --wait INTEGER          Wait this many milliseconds before taking the
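
The filenames mentioned above come from the new `filename_for_har_entry()` helper added to `shot_scraper/utils.py` later in this diff. A rough illustration only; the URLs here are made up and the expected outputs are my reading of that helper, not results from a real run:

```python
from shot_scraper.utils import filename_for_har_entry

# Hypothetical examples: an extensionless JSON API path and a CSS file
print(filename_for_har_entry("https://datasette.io/api/data", "application/json; charset=utf-8"))
# expected: datasette-io-api-data.json (extension taken from the content-type)

print(filename_for_har_entry("https://datasette.io/static/app.css", "text/css"))
# expected: datasette-io-static-app.css (URL extension matches the content-type, so it is kept)
```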

shot_scraper/cli.py

Lines changed: 116 additions & 1 deletion
@@ -1,3 +1,4 @@
+import base64
 import secrets
 import subprocess
 import sys
@@ -6,14 +7,20 @@
 import json
 import os
 import pathlib
+import zipfile
 from runpy import run_module
 from click_default_group import DefaultGroup
 import yaml
 import click
 from playwright.sync_api import sync_playwright, Error, TimeoutError
 
 
-from shot_scraper.utils import filename_for_url, load_github_script, url_or_file_path
+from shot_scraper.utils import (
+    filename_for_url,
+    filename_for_har_entry,
+    load_github_script,
+    url_or_file_path,
+)
 
 BROWSERS = ("chromium", "firefox", "webkit", "chrome", "chrome-beta")
 
@@ -712,6 +719,13 @@ def accessibility(
 @cli.command()
 @click.argument("url")
 @click.option("zip_", "-z", "--zip", is_flag=True, help="Save as a .har.zip file")
+@click.option(
+    "extract",
+    "-x",
+    "--extract",
+    is_flag=True,
+    help="Extract resources from the HAR file into a directory",
+)
 @click.option(
     "-a",
     "--auth",
@@ -741,6 +755,7 @@ def accessibility(
 def har(
     url,
     zip_,
+    extract,
     auth,
     output,
     wait,
@@ -766,6 +781,8 @@ def har(
         shot-scraper har https://datasette.io/ -o trace.har
 
     Use --zip to save as a .har.zip file instead, or specify a filename ending in .har.zip
+
+    Use --extract / -x to also extract all resources from the HAR into a directory
     """
     if output is None:
         output = filename_for_url(
@@ -800,6 +817,104 @@ def har(
         context.close()
         browser_obj.close()
 
+    if extract:
+        _extract_har_resources(output)
+
+
+def _extract_har_resources(har_path):
+    """Extract resources from a HAR file into a directory."""
+    har_path = pathlib.Path(har_path)
+
+    # Determine if it's a zip file
+    is_zip = zipfile.is_zipfile(har_path)
+
+    # Determine extract directory name (parallel to har file)
+    if str(har_path).endswith(".har.zip"):
+        extract_dir = har_path.parent / har_path.name.replace(".har.zip", "")
+    else:
+        extract_dir = har_path.parent / har_path.name.replace(".har", "")
+
+    # Create the extract directory
+    extract_dir.mkdir(exist_ok=True)
+
+    # Track existing files to handle duplicates
+    existing_files = set()
+
+    def file_exists_in_dir(filename):
+        return filename in existing_files
+
+    # Load the HAR data (and keep zip file open if needed)
+    if is_zip:
+        with zipfile.ZipFile(har_path) as zf:
+            with zf.open("har.har") as har_file:
+                har_data = json.load(har_file)
+
+            # Extract each entry (with zip file open for _file references)
+            for entry in har_data.get("log", {}).get("entries", []):
+                _extract_har_entry(entry, extract_dir, existing_files, file_exists_in_dir, zf)
+    else:
+        with open(har_path) as har_file:
+            har_data = json.load(har_file)
+
+        # Extract each entry
+        for entry in har_data.get("log", {}).get("entries", []):
+            _extract_har_entry(entry, extract_dir, existing_files, file_exists_in_dir, None)
+
+    click.echo(f"Extracted resources to: {extract_dir}", err=True)
+
+
+def _extract_har_entry(entry, extract_dir, existing_files, file_exists_fn, zip_file):
+    """Extract a single HAR entry to the extract directory."""
+    request = entry.get("request", {})
+    response = entry.get("response", {})
+    content = response.get("content", {})
+
+    url = request.get("url", "")
+    if not url:
+        return
+
+    # Get content-type from response headers
+    content_type = None
+    for header in response.get("headers", []):
+        if header.get("name", "").lower() == "content-type":
+            content_type = header.get("value", "")
+            break
+
+    # Get the content - either from text field or from _file reference in zip
+    text = content.get("text", "")
+    encoding = content.get("encoding", "")
+    file_ref = content.get("_file", "")
+
+    data = None
+
+    if file_ref and zip_file:
+        # Content is stored as a separate file in the zip
+        try:
+            with zip_file.open(file_ref) as f:
+                data = f.read()
+        except KeyError:
+            pass
+    elif text:
+        # Decode the content from text field
+        if encoding == "base64":
+            try:
+                data = base64.b64decode(text)
+            except Exception:
+                return
+        else:
+            data = text.encode("utf-8")
+
+    if not data:
+        return
+
+    # Generate filename
+    filename = filename_for_har_entry(url, content_type, file_exists=file_exists_fn)
+    existing_files.add(filename)
+
+    # Write the file
+    file_path = extract_dir / filename
+    file_path.write_bytes(data)
+
 
 @cli.command()
 @click.argument("url")
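
For orientation, `_extract_har_entry()` above only looks at a handful of keys on each entry. A hand-written sketch of the two shapes it handles (these example entries are illustrative, not copied from a real capture): the first carries its body inline in `content.text`, the second points at a separate member of the `.har.zip` via the non-standard `_file` key:

```python
# Hypothetical HAR entries illustrating the two storage forms handled above
inline_entry = {
    "request": {"url": "https://example.com/logo.png"},
    "response": {
        "headers": [{"name": "Content-Type", "value": "image/png"}],
        # body carried inline, base64-encoded
        "content": {"text": "iVBORw0KGgo=", "encoding": "base64"},
    },
}

zipped_entry = {
    "request": {"url": "https://example.com/app.js"},
    "response": {
        "headers": [{"name": "Content-Type", "value": "application/javascript"}],
        # body stored as a separate member inside the .har.zip archive
        "content": {"_file": "resource-1.js"},
    },
}
```

In the inline case the text is base64-decoded (or UTF-8 encoded when no `encoding` is set); in the zip case the referenced member is read from the still-open `ZipFile`.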

shot_scraper/utils.py

Lines changed: 111 additions & 0 deletions
@@ -1,8 +1,50 @@
 import urllib.parse
 import re
+import os.path
 
 disallowed_re = re.compile("[^a-zA-Z0-9_-]")
 
+# Map content-type to file extension
+CONTENT_TYPE_EXTENSIONS = {
+    "text/html": "html",
+    "text/css": "css",
+    "application/javascript": "js",
+    "text/javascript": "js",
+    "application/json": "json",
+    "image/png": "png",
+    "image/jpeg": "jpg",
+    "image/gif": "gif",
+    "image/webp": "webp",
+    "image/svg+xml": "svg",
+    "application/pdf": "pdf",
+    "text/plain": "txt",
+    "application/xml": "xml",
+    "text/xml": "xml",
+    "font/woff2": "woff2",
+    "font/woff": "woff",
+    "application/font-woff": "woff",
+}
+
+# Map file extension to expected content-type prefix
+EXTENSION_CONTENT_TYPES = {
+    "html": "text/html",
+    "htm": "text/html",
+    "css": "text/css",
+    "js": "application/javascript",
+    "json": "application/json",
+    "png": "image/png",
+    "jpg": "image/jpeg",
+    "jpeg": "image/jpeg",
+    "gif": "image/gif",
+    "webp": "image/webp",
+    "svg": "image/svg+xml",
+    "pdf": "application/pdf",
+    "txt": "text/plain",
+    "xml": "application/xml",
+    "woff2": "font/woff2",
+    "woff": "font/woff",
+}
+
 
 def file_exists_never(filename):
     return False
@@ -72,3 +114,72 @@ def load_github_script(github_path: str) -> str:
         )
     except urllib.error.URLError as e:
         raise ValueError(f"Error fetching from GitHub: {e}")
+
+
+def extension_for_content_type(content_type):
+    """
+    Return the file extension for a given content-type.
+
+    Returns None if the content-type is unknown or empty.
+    """
+    if not content_type:
+        return None
+    # Strip charset and other parameters
+    mime_type = content_type.split(";")[0].strip().lower()
+    return CONTENT_TYPE_EXTENSIONS.get(mime_type)
+
+
+def filename_for_har_entry(url, content_type, file_exists=file_exists_never):
+    """
+    Derive a filename for a HAR entry based on its URL and content-type.
+
+    Uses the URL to generate a base filename, then determines the extension:
+    - If the URL has an extension that matches the content-type, use it
+    - If the URL has no extension, or the extension doesn't match, use content-type
+    - If neither URL nor content-type provide an extension, use .bin
+    """
+    bits = urllib.parse.urlparse(url)
+    url_path = bits.path
+
+    # Try to get extension from URL path
+    path_base, url_ext_with_dot = os.path.splitext(url_path)
+    url_ext = url_ext_with_dot.lstrip(".").lower() if url_ext_with_dot else None
+
+    # Get extension from content-type
+    ct_ext = extension_for_content_type(content_type)
+
+    # Determine if URL extension matches content-type
+    url_ext_matches_ct = False
+    if url_ext and ct_ext:
+        expected_ct = EXTENSION_CONTENT_TYPES.get(url_ext, "").lower()
+        actual_ct = content_type.split(";")[0].strip().lower() if content_type else ""
+        if expected_ct and expected_ct == actual_ct:
+            url_ext_matches_ct = True
+        elif url_ext in ("jpg", "jpeg") and ct_ext in ("jpg", "jpeg"):
+            url_ext_matches_ct = True
+
+    # Get base filename from URL (netloc + path, excluding query)
+    # Only strip extension from path if it matches content-type
+    if url_ext and url_ext_matches_ct:
+        path_for_base = path_base
+    else:
+        path_for_base = url_path
+    base = (bits.netloc + path_for_base).replace(".", "-").replace("/", "-").rstrip("-")
+    base = disallowed_re.sub("", base).lstrip("-")
+
+    # Determine final extension
+    if url_ext_matches_ct:
+        ext = url_ext
+    elif ct_ext:
+        ext = ct_ext
+    elif url_ext:
+        ext = url_ext
+    else:
+        ext = "bin"
+
+    filename = f"{base}.{ext}"
+    suffix = 0
+    while file_exists(filename):
+        suffix += 1
+        filename = f"{base}.{suffix}.{ext}"
+    return filename
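
A quick sketch of how the duplicate-suffix loop at the end of `filename_for_har_entry()` is expected to behave, using a throwaway set as the `file_exists` callback, mirroring how `_extract_har_resources()` tracks already-written names (the URLs here are made up; the expected output follows from the logic above):

```python
from shot_scraper.utils import filename_for_har_entry

seen = set()

def exists(name):
    return name in seen

# Two URLs that differ only by query string collapse to the same base name,
# so the second call should get a numeric suffix inserted before the extension
for url in ("https://datasette.io/search?q=a", "https://datasette.io/search?q=b"):
    name = filename_for_har_entry(url, "text/html", file_exists=exists)
    seen.add(name)
    print(name)
# expected: datasette-io-search.html, then datasette-io-search.1.html
```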
