shot-scraper javascript -i gh:simonw/readability feature

simonw · simonw · commit 78bf24b8868c · 2025-03-24T17:26:56.000-07:00
Closes #173 Also replaced SkyPack with jsdelivr in examples
diff --git a/docs/javascript.md b/docs/javascript.md
@@ -55,13 +55,13 @@ shot-scraper javascript https://www.example.com/ "
 
 ## Using async/await
 
-You can pass an `async` function if you want to use `await`, including to import modules from external URLs. This example loads the [Readability.js](https://github.com/mozilla/readability) library from [Skypack](https://www.skypack.dev/) and uses it to extract the core content of a page:
+You can pass an `async` function if you want to use `await`, including to import modules from external URLs. This example loads the [Readability.js](https://github.com/mozilla/readability) library from [jsDelivr](https://www.jsdelivr.com/) and uses it to extract the core content of a page:
 
 ```bash
 shot-scraper javascript \
   https://simonwillison.net/2022/Mar/14/scraping-web-pages-shot-scraper/ "
 async () => {
-  const readability = await import('https://cdn.skypack.dev/@mozilla/readability');
+  const readability = await import('https://cdn.jsdelivr.net/npm/@mozilla/readability@0.6.0/+esm');
   return (new readability.Readability(document)).parse();
 }"
 ```
@@ -108,17 +108,6 @@ Output:
 ```
 "content-security-policy ignored"
 ```
-
-## Running JavaScript from a file
-
-You can also save JavaScript to a file and execute it like this:
-```bash
-shot-scraper javascript datasette.io -i script.js
-```
-Or read it from standard input like this:
-```bash
-echo "document.title" | shot-scraper javascript datasette.io
-```
 ## Using this for automated tests
 
 If a JavaScript error occurs, a stack trace will be written to standard error and the tool will terminate with an exit code of 1.
@@ -136,16 +125,43 @@ This example [uses GitHub Actions](https://docs.github.com/en/actions/quickstart
       }"
 ```
 
+## Running JavaScript from a file
+
+You can also save JavaScript to a file and execute it like this:
+```bash
+shot-scraper javascript datasette.io -i script.js
+```
+Or read it from standard input like this:
+```bash
+echo "document.title" | shot-scraper javascript datasette.io
+```
+
+## Running JavaScript from GitHub
+
+A special `gh:` prefix can be used to load scripts from GitHub.
+
+You can use this with a full `username/repo/path/to/file.js` path to a script in a public GitHub repository (the file is fetched from the repository's `main` branch) like this:
+
+```bash
+shot-scraper javascript datasette.io -i gh:simonw/shot-scraper-scripts/readability.js
+```
+Or, by convention, if the script lives in a repo called `shot-scraper-scripts`, you can omit the repo name (and the `.js` extension too) like this:
+
+```bash
+shot-scraper javascript datasette.io -i gh:simonw/readability
+```
+Both of these examples will execute [readability.js](https://github.com/simonw/shot-scraper-scripts/blob/main/readability.js), explained in the next section.
+
 ## Example: Extracting page content with Readability.js
 
 [Readability.js](https://github.com/mozilla/readability) is "a standalone version of the readability library used for Firefox Reader View." It lets you parse the content on a web page and extract just the title, content, byline and some other key metadata.
 
-The following recipe imports the library from the [Skypack CDN](https://www.skypack.dev/), runs it against the current page and returns the results to the console as JSON:
+The following recipe imports the library from the [jsDelivr CDN](https://www.jsdelivr.com/), runs it against the current page and returns the results to the console as JSON:
 
 ```bash
 shot-scraper javascript https://simonwillison.net/2022/Mar/24/datasette-061/ "
 async () => {
-  const readability = await import('https://cdn.skypack.dev/@mozilla/readability');
+  const readability = await import('https://cdn.jsdelivr.net/npm/@mozilla/readability@0.6.0/+esm');
   return (new readability.Readability(document)).parse();
 }"
 ```
diff --git a/shot_scraper/cli.py b/shot_scraper/cli.py
@@ -13,7 +13,7 @@
 from playwright.sync_api import sync_playwright, Error, TimeoutError
 
 
-from shot_scraper.utils import filename_for_url, url_or_file_path
+from shot_scraper.utils import filename_for_url, load_github_script, url_or_file_path
 
 BROWSERS = ("chromium", "firefox", "webkit", "chrome", "chrome-beta")
 
@@ -806,9 +806,8 @@ def har(
 @click.option(
     "-i",
     "--input",
-    type=click.File("r"),
     default="-",
-    help="Read input JavaScript from this file",
+    help="Read input JavaScript from this file or a GitHub repo with gh: prefix",
 )
 @click.option(
     "-a",
@@ -881,7 +880,20 @@ def javascript(
     If a JavaScript error occurs an exit code of 1 will be returned.
     """
     if not javascript:
-        javascript = input.read()
+        if input.startswith("gh:"):
+            try:
+                javascript = load_github_script(input[3:])
+            except ValueError as ex:
+                raise click.ClickException(str(ex))
+        elif input == "-":
+            javascript = sys.stdin.read()
+        else:
+            try:
+                with open(input, "r") as f:
+                    javascript = f.read()
+            except Exception as e:
+                raise click.ClickException(f"Failed to read file '{input}': {e}")
+
     url = url_or_file_path(url, _check_and_absolutize)
     with sync_playwright() as p:
         context, browser_obj = _browser_context(
diff --git a/shot_scraper/utils.py b/shot_scraper/utils.py
@@ -30,3 +30,45 @@ def url_or_file_path(url, file_exists=file_exists_never):
     if not (url.startswith("http://") or url.startswith("https://")):
         return f"http://{url}"
     return url
+
+
+def load_github_script(github_path: str) -> str:
+    """
+    Load JavaScript script from GitHub
+
+    Format: username/repo/path/to/file.js
+      or username/file.js which means username/shot-scraper-scripts/file.js
+    """
+    if not github_path.endswith(".js"):
+        github_path += ".js"
+    parts = github_path.split("/")
+
+    if len(parts) == 2:
+        # Short form: username/file.js
+        username, file_name = parts
+        parts = [username, "shot-scraper-scripts", file_name]
+
+    if len(parts) < 3:
+        raise ValueError(
+            "GitHub path format should be 'username/repo/path/to/file.js' or 'username/file.js'"
+        )
+
+    username = parts[0]
+    repo = parts[1]
+    file_path = "/".join(parts[2:])
+
+    # Fetch from GitHub
+    import urllib.request
+
+    url = f"https://raw.githubusercontent.com/{username}/{repo}/main/{file_path}"
+    try:
+        with urllib.request.urlopen(url) as response:
+            if response.status == 200:
+                return response.read().decode("utf-8")
+            else:
+                raise ValueError(
+                    f"Failed to load content from GitHub: HTTP {response.status}\n"
+                    f"URL: {url}"
+                )
+    except urllib.error.URLError as e:
+        raise ValueError(f"Error fetching from GitHub: {e}")
diff --git a/tests/test_shot_scraper.py b/tests/test_shot_scraper.py
@@ -1,4 +1,5 @@
 import pathlib
+from unittest.mock import patch, MagicMock
 import textwrap
 from click.testing import CliRunner
 import pytest
@@ -132,6 +133,39 @@ def test_javascript(args, expected):
         assert result.output == expected
 
 
+def test_javascript_input_file():
+    runner = CliRunner()
+    with runner.isolated_filesystem():
+        open("index.html", "w").write(TEST_HTML)
+        open("script.js", "w").write("document.title")
+        result = runner.invoke(cli, ["javascript", "index.html", "-i", "script.js"])
+        assert result.exit_code == 0, str(result.exception)
+        assert result.output == '"Test title"\n'
+
+
+def test_javascript_input_github():
+    mock_response = MagicMock()
+    mock_response.status = 200
+    mock_response.read.return_value = b"document.title"
+    mock_urlopen = MagicMock()
+    mock_urlopen.__enter__.return_value = mock_response
+    mock_context = MagicMock()
+    mock_context.return_value = mock_urlopen
+
+    runner = CliRunner()
+    with patch("urllib.request.urlopen", mock_context):
+        with runner.isolated_filesystem():
+            open("index.html", "w").write(TEST_HTML)
+            result = runner.invoke(
+                cli, ["javascript", "index.html", "-i", "gh:simonw/title"]
+            )
+            assert result.exit_code == 0, str(result.exception)
+            assert result.output == '"Test title"\n'
+            mock_context.assert_called_once_with(
+                "https://raw.githubusercontent.com/simonw/shot-scraper-scripts/main/title.js"
+            )
+
+
 @pytest.mark.parametrize(
     "args,expected",
     (