Skip to content

Commit 477572c

Browse files
authored
shot-scraper har command (#165)
* shot-scraper har command, closes #146 * Tests for shot-scraper har command * Docs for shot-scraper-har
1 parent 7b8acee commit 477572c

File tree

5 files changed

+274
-0
lines changed

5 files changed

+274
-0
lines changed

docs/har.md

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Saving a web page to an HTTP Archive
2+
3+
An HTTP Archive file captures the full details of a series of HTTP requests and responses as JSON.
4+
5+
The `shot-scraper har` command can save a `*.har.zip` file that contains both that JSON data and the content of any assets that were loaded by the page.
6+
```bash
7+
shot-scraper har https://datasette.io/
8+
```
9+
This will save to `datasette-io.har.zip`. You can use `-o` to specify a filename:
10+
```bash
11+
shot-scraper har https://datasette.io/tutorials/learn-sql \
12+
-o learn-sql.har.zip
13+
```
14+
You can list the contents of the resulting `.har.zip` archive using `unzip -l`:
15+
```bash
16+
unzip -l datasette-io.har.zip
17+
```
18+
```
19+
Archive: datasette-io.har.zip
20+
Length Date Time Name
21+
--------- ---------- ----- ----
22+
39067 02-13-2025 10:33 41824dbd0c51f584faf0e2c4e88de01b8a5dcdcd.html
23+
4052 02-13-2025 10:33 34972651f161f0396c697c65ef9aaeb2c9ac50c4.css
24+
2501 02-13-2025 10:33 9f612e71165058f0046d8bf8fec12af7eb15f39d.css
25+
10916 02-13-2025 10:33 2737174596eafba6e249022203c324605f023cdd.svg
26+
5557 02-13-2025 10:33 427504aa6ef5a8786f90fb2de636133b3fc6d1fe.js
27+
1393 02-13-2025 10:33 25c68a82b654c9d844c604565dab4785161ef697.js
28+
1170 02-13-2025 10:33 31c073551ef5c84324073edfc7b118f81ce9a7d2.svg
29+
1158 02-13-2025 10:33 1e0c64af7e6a4712f5e7d1917d9555bbc3d01f7a.svg
30+
1161 02-13-2025 10:33 ec8282b36a166d63fae4c04166bb81f945660435.svg
31+
3373 02-13-2025 10:33 5f85a11ef89c0e3f237c8e926c1cb66727182102.svg
32+
1134 02-13-2025 10:33 3b9d8109b919dfe9393dab2376fe03267dadc1f1.svg
33+
31670 02-13-2025 10:33 469f0d28af6c026dcae8c81731e2b0484aeac92c.jpeg
34+
1157 02-13-2025 10:33 b7786336bfce38a9677d26dc9ef468bb1ed45e19.svg
35+
50494 02-13-2025 10:33 har.har
36+
--------- -------
37+
154803 14 files
38+
```
39+
40+
## `shot-scraper har --help`
41+
42+
Full `--help` for this command:
43+
44+
<!-- [[[cog
45+
import cog
46+
from shot_scraper import cli
47+
from click.testing import CliRunner
48+
runner = CliRunner()
49+
result = runner.invoke(cli.cli, ["har", "--help"])
50+
help = result.output.replace("Usage: cli", "Usage: shot-scraper")
51+
cog.out(
52+
"```\n{}\n```\n".format(help.strip())
53+
)
54+
]]] -->
55+
```
56+
Usage: shot-scraper har [OPTIONS] URL
57+
58+
Record a HAR file for the specified page
59+
60+
Usage:
61+
62+
shot-scraper har https://datasette.io/
63+
64+
Options:
65+
-a, --auth FILENAME Path to JSON authentication context file
66+
-o, --output FILE HAR filename
67+
--timeout INTEGER Wait this many milliseconds before failing
68+
--log-console Write console.log() to stderr
69+
--fail Fail with an error code if a page returns an HTTP error
70+
--skip Skip pages that return HTTP errors
71+
--bypass-csp Bypass Content-Security-Policy
72+
--auth-password TEXT Password for HTTP Basic authentication
73+
--auth-username TEXT Username for HTTP Basic authentication
74+
--help Show this message and exit.
75+
```
76+
<!-- [[[end]]] -->

docs/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ multi
2626
javascript
2727
pdf
2828
html
29+
har
2930
accessibility
3031
github-actions
3132
contributing

shot_scraper/cli.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,7 @@ def _browser_context(
401401
bypass_csp=False,
402402
auth_username=None,
403403
auth_password=None,
404+
record_har_path=None,
404405
):
405406
browser_kwargs = dict(
406407
headless=not interactive, devtools=devtools, args=browser_args
@@ -430,6 +431,8 @@ def _browser_context(
430431
"username": auth_username,
431432
"password": auth_password,
432433
}
434+
if record_har_path:
435+
context_args["record_har_path"] = record_har_path
433436
context = browser_obj.new_context(**context_args)
434437
if timeout:
435438
context.set_default_timeout(timeout)
@@ -670,6 +673,71 @@ def accessibility(
670673
output.write("\n")
671674

672675

676+
@cli.command()
@click.argument("url")
@click.option(
    "-a",
    "--auth",
    type=click.File("r"),
    help="Path to JSON authentication context file",
)
@click.option(
    "-o",
    "--output",
    type=click.Path(file_okay=True, dir_okay=False, writable=True, allow_dash=False),
    help="HAR filename",
)
@click.option(
    "--timeout",
    type=int,
    help="Wait this many milliseconds before failing",
)
@log_console_option
@skip_fail_options
@bypass_csp_option
@http_auth_options
def har(
    url,
    auth,
    output,
    timeout,
    log_console,
    skip,
    fail,
    bypass_csp,
    auth_username,
    auth_password,
):
    """
    Record a HAR file for the specified page

    Usage:

        shot-scraper har https://datasette.io/
    """
    # NOTE: the docstring above is emitted verbatim as the command's --help
    # text, so it is kept exactly as-is.

    # No -o/--output given: derive a "<name>.har.zip" filename from the URL.
    # NOTE(review): file_exists=os.path.exists presumably lets the helper avoid
    # clobbering an existing file -- confirm against filename_for_url.
    if output is None:
        output = filename_for_url(url, ext="har.zip", file_exists=os.path.exists)

    url = url_or_file_path(url, _check_and_absolutize)

    with sync_playwright() as playwright:
        # HAR recording is enabled on the browser context itself via
        # record_har_path, so it must be requested when the context is built.
        context, browser_obj = _browser_context(
            playwright,
            auth,
            timeout=timeout,
            bypass_csp=bypass_csp,
            auth_username=auth_username,
            auth_password=auth_password,
            record_har_path=str(output),
        )
        page = context.new_page()
        if log_console:
            page.on("console", console_log)

        main_response = page.goto(url)
        # Applies the --skip / --fail handling to the navigation response.
        skip_or_fail(main_response, skip, fail)

        # Playwright only writes the recorded HAR to disk when the context is
        # closed, so close the context before closing the browser.
        context.close()
        browser_obj.close()
739+
740+
673741
@cli.command()
674742
@click.argument("url")
675743
@click.argument("javascript", required=False)

tests/conftest.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import pytest
2+
import subprocess
3+
import tempfile
4+
import time
5+
import socket
6+
import urllib.request
7+
import urllib.error
8+
from dataclasses import dataclass
9+
from pathlib import Path
10+
from contextlib import closing
11+
12+
13+
@dataclass
class HTTPServer:
    """Container for HTTP server information."""

    # Base URL of the running server, e.g. "http://localhost:<port>".
    base_url: str
    # Directory whose contents the server is serving.
    base_dir: Path
19+
20+
21+
def find_free_port() -> int:
    """Find an available port by creating a temporary socket.

    Binding to port 0 asks the operating system to pick an unused port. The
    socket is closed again before returning, so there is a small window in
    which another process could claim the port before the caller uses it.

    Returns:
        The port number the operating system assigned.
    """
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        sock.bind(("", 0))
        sock.listen(1)
        # getsockname() returns (host, port) for AF_INET sockets.
        port = sock.getsockname()[1]
    return port
28+
29+
30+
def _wait_for_server(process, base_url, max_retries, retry_delay):
    """Poll base_url until it answers with HTTP 200; return True on success.

    Returns False if the server process exits before it starts answering, or
    if max_retries attempts have been made without a 200 response.
    """
    for _ in range(max_retries):
        if process.poll() is not None:
            # The server died (e.g. the port was taken): retrying is pointless.
            return False
        try:
            # A timeout stops a wedged server from hanging the test run.
            with urllib.request.urlopen(base_url, timeout=5) as response:
                if response.status == 200:
                    return True
        except OSError:
            # URLError, ConnectionRefusedError and socket timeouts are all
            # OSError subclasses; any of them means "not ready yet".
            pass
        time.sleep(retry_delay)
    return False


def _stop_server(process):
    """Terminate the server process and return its (stdout, stderr) bytes.

    communicate() is used rather than wait() so that the stdout/stderr pipes
    are drained and closed. If the process ignores terminate() for 5 seconds
    it is killed.
    """
    if process.poll() is None:
        process.terminate()
    try:
        return process.communicate(timeout=5)
    except subprocess.TimeoutExpired:
        # Force kill if still running
        process.kill()
        return process.communicate()


@pytest.fixture
def http_server():
    """
    Pytest fixture that starts a Python HTTP server in a subprocess.

    Creates a temporary directory with an index.html file, starts the server
    on an available port, and cleans up afterwards.

    Yields:
        HTTPServer: Object containing server information:
            - base_url: The base URL of the running server
            - base_dir: Path to the temporary directory serving files

    Raises:
        RuntimeError: If the server does not start answering requests.
    """
    # Local import so this block does not depend on the module's import list.
    import sys

    # Find an available port
    port = find_free_port()

    # Create temp directory
    with tempfile.TemporaryDirectory() as temp_dir:
        base_dir = Path(temp_dir)

        # Create index.html
        index_path = base_dir / "index.html"
        index_path.write_text("<html><body>Hello World</body></html>")

        # Start server process in temp directory. sys.executable guarantees
        # the same interpreter that is running the tests, even when no
        # "python" executable is on PATH.
        process = subprocess.Popen(
            [sys.executable, "-m", "http.server", str(port)],
            cwd=temp_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        # Wait for server to start
        base_url = f"http://localhost:{port}"
        max_retries = 5
        retry_delay = 0.5

        if not _wait_for_server(process, base_url, max_retries, retry_delay):
            stdout, stderr = _stop_server(process)
            raise RuntimeError(
                f"Failed to start HTTP server on port {port}.\n"
                f"stdout: {stdout.decode()}\n"
                f"stderr: {stderr.decode()}"
            )

        try:
            yield HTTPServer(base_url=base_url, base_dir=base_dir)
        finally:
            # Clean up
            _stop_server(process)

tests/test_shot_scraper.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import pytest
44
import textwrap
55
from shot_scraper.cli import cli
6+
import zipfile
7+
import json
68

79

810
def test_version():
@@ -212,3 +214,38 @@ def test_error_on_invalid_scale_factors(command, args, expected):
212214
result = runner.invoke(cli, [command, "-"] + args)
213215
assert result.exit_code == 1
214216
assert result.output == expected
217+
218+
219+
@pytest.mark.parametrize("output", (None, "output.har.zip"))
def test_har(http_server, output):
    """`shot-scraper har` writes exactly one valid *.har.zip archive.

    Runs once with the default output filename and once with --output.
    """
    runner = CliRunner()
    with runner.isolated_filesystem():
        # Should be no files
        here = pathlib.Path(".")
        assert list(here.glob("*.*")) == []

        args = ["har", http_server.base_url]
        if output:
            args.extend(("--output", output))
        result = runner.invoke(cli, args)
        # Include the command output so a failure is diagnosable.
        assert result.exit_code == 0, result.output

        # Should have created exactly one .har.zip archive
        har_files = list(here.glob("*.har.zip"))
        assert len(har_files) == 1
        if output:
            assert har_files[0] == pathlib.Path(output)

        # It should contain *.html and har.har
        with zipfile.ZipFile(har_files[0]) as zip_file:
            file_list = zip_file.namelist()
            # Match on the suffix so e.g. "x.html.js" cannot satisfy the check.
            assert any(name.endswith(".html") for name in file_list)
            assert "har.har" in file_list

            # har.har should be JSON with the expected structure
            with zip_file.open("har.har") as har_file:
                har_content = json.loads(har_file.read())

        assert "log" in har_content
        assert "entries" in har_content["log"]
        # Verify entries is a non-empty list
        entries = har_content["log"]["entries"]
        assert isinstance(entries, list)
        assert len(entries) > 0

0 commit comments

Comments
 (0)