Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions docs/har.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Saving a web page to an HTTP Archive

An HTTP Archive file captures the full details of a series of HTTP requests and responses as JSON.

The `shot-scraper har` command can save a `*.har.zip` file that contains both that JSON data and the content of any assets that were loaded by the page.
```bash
shot-scraper har https://datasette.io/
```
This will save to `datasette-io.har.zip`. You can use `-o` to specify a filename:
```bash
shot-scraper har https://datasette.io/tutorials/learn-sql \
-o learn-sql.har.zip
```
You can view the contents of the resulting `.har.zip` file using `unzip -l`:
```bash
unzip -l datasette-io.har.zip
```
```
Archive: datasette-io.har.zip
Length Date Time Name
--------- ---------- ----- ----
39067 02-13-2025 10:33 41824dbd0c51f584faf0e2c4e88de01b8a5dcdcd.html
4052 02-13-2025 10:33 34972651f161f0396c697c65ef9aaeb2c9ac50c4.css
2501 02-13-2025 10:33 9f612e71165058f0046d8bf8fec12af7eb15f39d.css
10916 02-13-2025 10:33 2737174596eafba6e249022203c324605f023cdd.svg
5557 02-13-2025 10:33 427504aa6ef5a8786f90fb2de636133b3fc6d1fe.js
1393 02-13-2025 10:33 25c68a82b654c9d844c604565dab4785161ef697.js
1170 02-13-2025 10:33 31c073551ef5c84324073edfc7b118f81ce9a7d2.svg
1158 02-13-2025 10:33 1e0c64af7e6a4712f5e7d1917d9555bbc3d01f7a.svg
1161 02-13-2025 10:33 ec8282b36a166d63fae4c04166bb81f945660435.svg
3373 02-13-2025 10:33 5f85a11ef89c0e3f237c8e926c1cb66727182102.svg
1134 02-13-2025 10:33 3b9d8109b919dfe9393dab2376fe03267dadc1f1.svg
31670 02-13-2025 10:33 469f0d28af6c026dcae8c81731e2b0484aeac92c.jpeg
1157 02-13-2025 10:33 b7786336bfce38a9677d26dc9ef468bb1ed45e19.svg
50494 02-13-2025 10:33 har.har
--------- -------
154803 14 files
```

## `shot-scraper har --help`

Full `--help` for this command:

<!-- [[[cog
import cog
from shot_scraper import cli
from click.testing import CliRunner
runner = CliRunner()
result = runner.invoke(cli.cli, ["har", "--help"])
help = result.output.replace("Usage: cli", "Usage: shot-scraper")
cog.out(
"```\n{}\n```\n".format(help.strip())
)
]]] -->
```
Usage: shot-scraper har [OPTIONS] URL

Record a HAR file for the specified page

Usage:

shot-scraper har https://datasette.io/

Options:
-a, --auth FILENAME Path to JSON authentication context file
-o, --output FILE HAR filename
--timeout INTEGER Wait this many milliseconds before failing
--log-console Write console.log() to stderr
--fail Fail with an error code if a page returns an HTTP error
--skip Skip pages that return HTTP errors
--bypass-csp Bypass Content-Security-Policy
--auth-password TEXT Password for HTTP Basic authentication
--auth-username TEXT Username for HTTP Basic authentication
--help Show this message and exit.
```
<!-- [[[end]]] -->
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ multi
javascript
pdf
html
har
accessibility
github-actions
contributing
Expand Down
68 changes: 68 additions & 0 deletions shot_scraper/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ def _browser_context(
bypass_csp=False,
auth_username=None,
auth_password=None,
record_har_path=None,
):
browser_kwargs = dict(
headless=not interactive, devtools=devtools, args=browser_args
Expand Down Expand Up @@ -430,6 +431,8 @@ def _browser_context(
"username": auth_username,
"password": auth_password,
}
if record_har_path:
context_args["record_har_path"] = record_har_path
context = browser_obj.new_context(**context_args)
if timeout:
context.set_default_timeout(timeout)
Expand Down Expand Up @@ -670,6 +673,71 @@ def accessibility(
output.write("\n")


@cli.command()
@click.argument("url")
@click.option(
    "-a",
    "--auth",
    type=click.File("r"),
    help="Path to JSON authentication context file",
)
@click.option(
    "-o",
    "--output",
    type=click.Path(file_okay=True, dir_okay=False, writable=True, allow_dash=False),
    help="HAR filename",
)
@click.option(
    "--timeout",
    type=int,
    help="Wait this many milliseconds before failing",
)
@log_console_option
@skip_fail_options
@bypass_csp_option
@http_auth_options
def har(
    url,
    auth,
    output,
    timeout,
    log_console,
    skip,
    fail,
    bypass_csp,
    auth_username,
    auth_password,
):
    """
    Record a HAR file for the specified page

    Usage:

        shot-scraper har https://datasette.io/
    """
    # No --output given: derive a filename such as "datasette-io.har.zip"
    # from the URL (file_exists is presumably used by filename_for_url to
    # pick a name that does not overwrite an existing file - TODO confirm)
    if output is None:
        output = filename_for_url(url, ext="har.zip", file_exists=os.path.exists)

    # Normalize the argument (the helper's name suggests it also resolves
    # local file paths to absolute paths - see url_or_file_path)
    url = url_or_file_path(url, _check_and_absolutize)
    with sync_playwright() as p:
        # record_har_path makes Playwright capture all HTTP traffic for
        # this browser context into a HAR archive at the given path
        context, browser_obj = _browser_context(
            p,
            auth,
            timeout=timeout,
            bypass_csp=bypass_csp,
            auth_username=auth_username,
            auth_password=auth_password,
            record_har_path=str(output),
        )
        page = context.new_page()
        if log_console:
            # Mirror the page's console.log() output to stderr
            page.on("console", console_log)
        response = page.goto(url)
        # Honour --skip / --fail when the page returns an HTTP error
        skip_or_fail(response, skip, fail)
        # Playwright flushes the HAR file to disk when the context closes,
        # so context.close() must run before browser_obj.close()
        context.close()
        browser_obj.close()


@cli.command()
@click.argument("url")
@click.argument("javascript", required=False)
Expand Down
92 changes: 92 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import socket
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
from contextlib import closing
from dataclasses import dataclass
from pathlib import Path

import pytest


@dataclass
class HTTPServer:
    """Details of a running test HTTP server, as yielded by the http_server fixture."""

    # Root URL of the running server, e.g. "http://localhost:<port>"
    base_url: str
    # Temporary directory whose contents the server is serving
    base_dir: Path


def find_free_port():
    """Return a TCP port number that is currently available.

    Binds a throwaway socket to port 0 so the operating system assigns a
    free ephemeral port, reads that port number back, then closes the
    socket so the caller can reuse the port.
    """
    temp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with closing(temp_sock):
        temp_sock.bind(("", 0))
        temp_sock.listen(1)
        return temp_sock.getsockname()[1]


@pytest.fixture
def http_server():
    """
    Pytest fixture that starts a Python HTTP server in a subprocess.

    Creates a temporary directory with an index.html file, starts the server
    on an available port, and cleans up both the process and the directory
    afterwards.

    Yields:
        HTTPServer: Object containing server information:
            - base_url: The base URL of the running server
            - base_dir: Path to the temporary directory serving files

    Raises:
        RuntimeError: if the server does not respond after several retries.
    """
    # Find an available port
    port = find_free_port()

    # Create temp directory with a minimal page to serve
    with tempfile.TemporaryDirectory() as temp_dir:
        base_dir = Path(temp_dir)

        index_path = base_dir / "index.html"
        index_path.write_text("<html><body>Hello World</body></html>")

        # Use the interpreter running the tests rather than whatever
        # "python" happens to be on PATH (which may be absent or a
        # different Python entirely)
        process = subprocess.Popen(
            [sys.executable, "-m", "http.server", str(port)],
            cwd=temp_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        base_url = f"http://localhost:{port}"
        max_retries = 5
        retry_delay = 0.5

        # Poll until the server responds; the for/else raises if it never does
        for _ in range(max_retries):
            try:
                with urllib.request.urlopen(base_url) as response:
                    if response.status == 200:
                        break
                # Server answered but not with 200 yet - wait before retrying
                # (previously this path looped without sleeping)
                time.sleep(retry_delay)
            except (urllib.error.URLError, ConnectionRefusedError):
                time.sleep(retry_delay)
        else:
            process.terminate()
            stdout, stderr = process.communicate()
            raise RuntimeError(
                f"Failed to start HTTP server on port {port}.\n"
                f"stdout: {stdout.decode()}\n"
                f"stderr: {stderr.decode()}"
            )

        try:
            yield HTTPServer(base_url=base_url, base_dir=base_dir)
        finally:
            # Clean up: ask politely first, then force kill.
            process.terminate()
            try:
                process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # wait(timeout=...) raises on timeout rather than returning,
                # so a poll()/kill() check placed after it would never run -
                # the force-kill must live in this except branch.
                process.kill()
                process.wait()
37 changes: 37 additions & 0 deletions tests/test_shot_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import pytest
import textwrap
from shot_scraper.cli import cli
import zipfile
import json


def test_version():
Expand Down Expand Up @@ -212,3 +214,38 @@ def test_error_on_invalid_scale_factors(command, args, expected):
result = runner.invoke(cli, [command, "-"] + args)
assert result.exit_code == 1
assert result.output == expected


@pytest.mark.parametrize("output", (None, "output.har.zip"))
def test_har(http_server, output):
    """Running `shot-scraper har` should produce a single valid .har.zip archive."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        cwd = pathlib.Path(".")
        # Sanity check: the isolated filesystem starts out empty
        assert not list(cwd.glob("*.*"))
        cli_args = ["har", http_server.base_url]
        if output:
            cli_args += ["--output", output]
        result = runner.invoke(cli, cli_args)
        assert result.exit_code == 0
        # Exactly one HAR archive should now exist
        archives = list(cwd.glob("*.har.zip"))
        assert len(archives) == 1
        if output:
            assert archives[0] == pathlib.Path(output)
        # The archive should hold the page HTML plus the HAR JSON itself
        with zipfile.ZipFile(archives[0]) as zip_file:
            names = zip_file.namelist()
            assert any(".html" in name for name in names)
            assert "har.har" in names

            # har.har should be JSON with the standard HAR structure:
            # {"log": {"entries": [...]}} with at least one entry
            with zip_file.open("har.har") as har_file:
                har_data = json.loads(har_file.read())
                assert "log" in har_data
                assert "entries" in har_data["log"]
                assert isinstance(har_data["log"]["entries"], list)
                assert har_data["log"]["entries"]
Loading