Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions docs/har.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Saving a web page to an HTTP Archive

An HTTP Archive file captures the full details of a series of HTTP requests and responses as JSON.

The `shot-scraper har` command can save a `*.har.zip` file that contains both that JSON data and the content of any assets that were loaded by the page.
```bash
shot-scraper har https://datasette.io/
```
This will save to `datasette-io.har.zip`. You can use `-o` to specify a filename:
```bash
shot-scraper har https://datasette.io/tutorials/learn-sql \
-o learn-sql.har.zip
```
You can view the contents of the resulting `.har.zip` file using `unzip -l`:
```bash
unzip -l datasette-io.har.zip
```
```
Archive: datasette-io.har.zip
Length Date Time Name
--------- ---------- ----- ----
39067 02-13-2025 10:33 41824dbd0c51f584faf0e2c4e88de01b8a5dcdcd.html
4052 02-13-2025 10:33 34972651f161f0396c697c65ef9aaeb2c9ac50c4.css
2501 02-13-2025 10:33 9f612e71165058f0046d8bf8fec12af7eb15f39d.css
10916 02-13-2025 10:33 2737174596eafba6e249022203c324605f023cdd.svg
5557 02-13-2025 10:33 427504aa6ef5a8786f90fb2de636133b3fc6d1fe.js
1393 02-13-2025 10:33 25c68a82b654c9d844c604565dab4785161ef697.js
1170 02-13-2025 10:33 31c073551ef5c84324073edfc7b118f81ce9a7d2.svg
1158 02-13-2025 10:33 1e0c64af7e6a4712f5e7d1917d9555bbc3d01f7a.svg
1161 02-13-2025 10:33 ec8282b36a166d63fae4c04166bb81f945660435.svg
3373 02-13-2025 10:33 5f85a11ef89c0e3f237c8e926c1cb66727182102.svg
1134 02-13-2025 10:33 3b9d8109b919dfe9393dab2376fe03267dadc1f1.svg
31670 02-13-2025 10:33 469f0d28af6c026dcae8c81731e2b0484aeac92c.jpeg
1157 02-13-2025 10:33 b7786336bfce38a9677d26dc9ef468bb1ed45e19.svg
50494 02-13-2025 10:33 har.har
--------- -------
154803 14 files
```

## `shot-scraper har --help`

Full `--help` for this command:

<!-- [[[cog
import cog
from shot_scraper import cli
from click.testing import CliRunner
runner = CliRunner()
result = runner.invoke(cli.cli, ["har", "--help"])
help = result.output.replace("Usage: cli", "Usage: shot-scraper")
cog.out(
"```\n{}\n```\n".format(help.strip())
)
]]] -->
```
Usage: shot-scraper har [OPTIONS] URL

Record a HAR file for the specified page

Usage:

shot-scraper har https://datasette.io/

Options:
-a, --auth FILENAME Path to JSON authentication context file
-o, --output FILE HAR filename
--timeout INTEGER Wait this many milliseconds before failing
--log-console Write console.log() to stderr
--fail Fail with an error code if a page returns an HTTP error
--skip Skip pages that return HTTP errors
--bypass-csp Bypass Content-Security-Policy
--auth-password TEXT Password for HTTP Basic authentication
--auth-username TEXT Username for HTTP Basic authentication
--help Show this message and exit.
```
<!-- [[[end]]] -->
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ multi
javascript
pdf
html
har
accessibility
github-actions
contributing
Expand Down
68 changes: 68 additions & 0 deletions shot_scraper/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ def _browser_context(
bypass_csp=False,
auth_username=None,
auth_password=None,
record_har_path=None,
):
browser_kwargs = dict(
headless=not interactive, devtools=devtools, args=browser_args
Expand Down Expand Up @@ -430,6 +431,8 @@ def _browser_context(
"username": auth_username,
"password": auth_password,
}
if record_har_path:
context_args["record_har_path"] = record_har_path
context = browser_obj.new_context(**context_args)
if timeout:
context.set_default_timeout(timeout)
Expand Down Expand Up @@ -670,6 +673,71 @@ def accessibility(
output.write("\n")


@cli.command()
@click.argument("url")
@click.option(
    "-a",
    "--auth",
    type=click.File("r"),
    help="Path to JSON authentication context file",
)
@click.option(
    "-o",
    "--output",
    type=click.Path(file_okay=True, dir_okay=False, writable=True, allow_dash=False),
    help="HAR filename",
)
@click.option(
    "--timeout",
    type=int,
    help="Wait this many milliseconds before failing",
)
@log_console_option
@skip_fail_options
@bypass_csp_option
@http_auth_options
def har(
    url,
    auth,
    output,
    timeout,
    log_console,
    skip,
    fail,
    bypass_csp,
    auth_username,
    auth_password,
):
    """
    Record a HAR file for the specified page

    Usage:

        shot-scraper har https://datasette.io/
    """
    # No --output given: derive a filename such as "datasette-io.har.zip"
    # from the URL (file_exists is presumably used by filename_for_url to
    # pick a name that does not overwrite an existing file - TODO confirm)
    if output is None:
        output = filename_for_url(url, ext="har.zip", file_exists=os.path.exists)

    # Normalize the argument (the helper's name suggests it also resolves
    # local file paths to absolute paths - see url_or_file_path)
    url = url_or_file_path(url, _check_and_absolutize)
    with sync_playwright() as p:
        # record_har_path makes Playwright capture all HTTP traffic for
        # this browser context into a HAR archive at the given path
        context, browser_obj = _browser_context(
            p,
            auth,
            timeout=timeout,
            bypass_csp=bypass_csp,
            auth_username=auth_username,
            auth_password=auth_password,
            record_har_path=str(output),
        )
        page = context.new_page()
        if log_console:
            # Mirror the page's console.log() output to stderr
            page.on("console", console_log)
        response = page.goto(url)
        # Honour --skip / --fail when the page returns an HTTP error
        skip_or_fail(response, skip, fail)
        # Playwright flushes the HAR file to disk when the context closes,
        # so context.close() must run before browser_obj.close()
        context.close()
        browser_obj.close()


@cli.command()
@click.argument("url")
@click.argument("javascript", required=False)
Expand Down
92 changes: 92 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import socket
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
from contextlib import closing
from dataclasses import dataclass
from pathlib import Path

import pytest


@dataclass
class HTTPServer:
    """Details of a running test HTTP server, as yielded by the http_server fixture."""

    # Root URL of the running server, e.g. "http://localhost:<port>"
    base_url: str
    # Temporary directory whose contents the server is serving
    base_dir: Path


def find_free_port():
    """Return a TCP port number that is currently available.

    Binds a throwaway socket to port 0 so the operating system assigns a
    free ephemeral port, reads that port number back, then closes the
    socket so the caller can reuse the port.
    """
    temp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with closing(temp_sock):
        temp_sock.bind(("", 0))
        temp_sock.listen(1)
        return temp_sock.getsockname()[1]


@pytest.fixture
def http_server():
    """
    Pytest fixture that starts a Python HTTP server in a subprocess.

    Creates a temporary directory with an index.html file, starts the server
    on an available port, and cleans up both the process and the directory
    afterwards.

    Yields:
        HTTPServer: Object containing server information:
            - base_url: The base URL of the running server
            - base_dir: Path to the temporary directory serving files

    Raises:
        RuntimeError: if the server does not respond after several retries.
    """
    # Find an available port
    port = find_free_port()

    # Create temp directory with a minimal page to serve
    with tempfile.TemporaryDirectory() as temp_dir:
        base_dir = Path(temp_dir)

        index_path = base_dir / "index.html"
        index_path.write_text("<html><body>Hello World</body></html>")

        # Use the interpreter running the tests rather than whatever
        # "python" happens to be on PATH (which may be absent or a
        # different Python entirely)
        process = subprocess.Popen(
            [sys.executable, "-m", "http.server", str(port)],
            cwd=temp_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        base_url = f"http://localhost:{port}"
        max_retries = 5
        retry_delay = 0.5

        # Poll until the server responds; the for/else raises if it never does
        for _ in range(max_retries):
            try:
                with urllib.request.urlopen(base_url) as response:
                    if response.status == 200:
                        break
                # Server answered but not with 200 yet - wait before retrying
                # (previously this path looped without sleeping)
                time.sleep(retry_delay)
            except (urllib.error.URLError, ConnectionRefusedError):
                time.sleep(retry_delay)
        else:
            process.terminate()
            stdout, stderr = process.communicate()
            raise RuntimeError(
                f"Failed to start HTTP server on port {port}.\n"
                f"stdout: {stdout.decode()}\n"
                f"stderr: {stderr.decode()}"
            )

        try:
            yield HTTPServer(base_url=base_url, base_dir=base_dir)
        finally:
            # Clean up: ask politely first, then force kill.
            process.terminate()
            try:
                process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # wait(timeout=...) raises on timeout rather than returning,
                # so a poll()/kill() check placed after it would never run -
                # the force-kill must live in this except branch.
                process.kill()
                process.wait()
37 changes: 37 additions & 0 deletions tests/test_shot_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import pytest
import textwrap
from shot_scraper.cli import cli
import zipfile
import json


def test_version():
Expand Down Expand Up @@ -212,3 +214,38 @@ def test_error_on_invalid_scale_factors(command, args, expected):
result = runner.invoke(cli, [command, "-"] + args)
assert result.exit_code == 1
assert result.output == expected


@pytest.mark.parametrize("output", (None, "output.har.zip"))
def test_har(http_server, output):
    """Running `shot-scraper har` should produce a single valid .har.zip archive."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        cwd = pathlib.Path(".")
        # Sanity check: the isolated filesystem starts out empty
        assert not list(cwd.glob("*.*"))
        cli_args = ["har", http_server.base_url]
        if output:
            cli_args += ["--output", output]
        result = runner.invoke(cli, cli_args)
        assert result.exit_code == 0
        # Exactly one HAR archive should now exist
        archives = list(cwd.glob("*.har.zip"))
        assert len(archives) == 1
        if output:
            assert archives[0] == pathlib.Path(output)
        # The archive should hold the page HTML plus the HAR JSON itself
        with zipfile.ZipFile(archives[0]) as zip_file:
            names = zip_file.namelist()
            assert any(".html" in name for name in names)
            assert "har.har" in names

            # har.har should be JSON with the standard HAR structure:
            # {"log": {"entries": [...]}} with at least one entry
            with zip_file.open("har.har") as har_file:
                har_data = json.loads(har_file.read())
                assert "log" in har_data
                assert "entries" in har_data["log"]
                assert isinstance(har_data["log"]["entries"], list)
                assert har_data["log"]["entries"]
Loading