Skip to content

Commit 78bf24b

Browse files
committed
shot-scraper javascript -i gh:simonw/readability feature
Closes #173 Also replaced SkyPack with jsdelivr in examples
1 parent 3657b69 commit 78bf24b

File tree

4 files changed

+127
-19
lines changed

4 files changed

+127
-19
lines changed

docs/javascript.md

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,13 @@ shot-scraper javascript https://www.example.com/ "
5555

5656
## Using async/await
5757

58-
You can pass an `async` function if you want to use `await`, including to import modules from external URLs. This example loads the [Readability.js](https://github.com/mozilla/readability) library from [Skypack](https://www.skypack.dev/) and uses it to extract the core content of a page:
58+
You can pass an `async` function if you want to use `await`, including to import modules from external URLs. This example loads the [Readability.js](https://github.com/mozilla/readability) library from [jsDelivr](https://www.jsdelivr.com/) and uses it to extract the core content of a page:
5959

6060
```bash
6161
shot-scraper javascript \
6262
https://simonwillison.net/2022/Mar/14/scraping-web-pages-shot-scraper/ "
6363
async () => {
64-
const readability = await import('https://cdn.skypack.dev/@mozilla/readability');
64+
const readability = await import('https://cdn.jsdelivr.net/npm/@mozilla/readability@0.6.0/+esm');
6565
return (new readability.Readability(document)).parse();
6666
}"
6767
```
@@ -108,17 +108,6 @@ Output:
108108
```
109109
"content-security-policy ignored"
110110
```
111-
112-
## Running JavaScript from a file
113-
114-
You can also save JavaScript to a file and execute it like this:
115-
```bash
116-
shot-scraper javascript datasette.io -i script.js
117-
```
118-
Or read it from standard input like this:
119-
```bash
120-
echo "document.title" | shot-scraper javascript datasette.io
121-
```
122111
## Using this for automated tests
123112

124113
If a JavaScript error occurs, a stack trace will be written to standard error and the tool will terminate with an exit code of 1.
@@ -136,16 +125,47 @@ This example [uses GitHub Actions](https://docs.github.com/en/actions/quickstart
136125
}"
137126
```
138127
128+
## Running JavaScript from a file
129+
130+
You can also save JavaScript to a file and execute it like this:
131+
```bash
132+
shot-scraper javascript datasette.io -i script.js
133+
```
134+
Or read it from standard input like this:
135+
```bash
136+
echo "document.title" | shot-scraper javascript datasette.io
137+
```
138+
Or pass `-i -` to read it from standard input explicitly:
139+
```bash
140+
echo "document.title" | shot-scraper javascript datasette.io -i -
141+
```
142+
143+
## Running JavaScript from GitHub
144+
145+
A special `gh:` prefix can be used to load scripts from GitHub.
146+
147+
You can use this with a full path to a `script.js` file in a public GitHub repository like this:
148+
149+
```bash
150+
shot-scraper javascript datasette.io -i gh:simonw/shot-scraper-scripts/readability.js
151+
```
152+
Or by convention if the script lives in a repo called `shot-scraper-scripts` you can omit that (and the `.js` extension too) like this:
153+
154+
```bash
155+
shot-scraper javascript datasette.io -i gh:simonw/readability
156+
```
157+
Both of these examples will execute [readability.js](https://github.com/simonw/shot-scraper-scripts/blob/main/readability.js), explained in the next section.
158+
139159
## Example: Extracting page content with Readability.js
140160

141161
[Readability.js](https://github.com/mozilla/readability) is "a standalone version of the readability library used for Firefox Reader View." It lets you parse the content on a web page and extract just the title, content, byline and some other key metadata.
142162

143-
The following recipe imports the library from the [Skypack CDN](https://www.skypack.dev/), runs it against the current page and returns the results to the console as JSON:
163+
The following recipe imports the library from the [jsDelivr CDN](https://www.jsdelivr.com/), runs it against the current page and returns the results to the console as JSON:
144164

145165
```bash
146166
shot-scraper javascript https://simonwillison.net/2022/Mar/24/datasette-061/ "
147167
async () => {
148-
const readability = await import('https://cdn.skypack.dev/@mozilla/readability');
168+
const readability = await import('https://cdn.jsdelivr.net/npm/@mozilla/readability@0.6.0/+esm');
149169
return (new readability.Readability(document)).parse();
150170
}"
151171
```

shot_scraper/cli.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from playwright.sync_api import sync_playwright, Error, TimeoutError
1414

1515

16-
from shot_scraper.utils import filename_for_url, url_or_file_path
16+
from shot_scraper.utils import filename_for_url, load_github_script, url_or_file_path
1717

1818
BROWSERS = ("chromium", "firefox", "webkit", "chrome", "chrome-beta")
1919

@@ -806,9 +806,8 @@ def har(
806806
@click.option(
807807
"-i",
808808
"--input",
809-
type=click.File("r"),
810809
default="-",
811-
help="Read input JavaScript from this file",
810+
help="Read input JavaScript from this file or a GitHub repo with gh: prefix",
812811
)
813812
@click.option(
814813
"-a",
@@ -881,7 +880,20 @@ def javascript(
881880
If a JavaScript error occurs an exit code of 1 will be returned.
882881
"""
883882
if not javascript:
884-
javascript = input.read()
883+
if input.startswith("gh:"):
884+
try:
885+
javascript = load_github_script(input[3:])
886+
except ValueError as ex:
887+
raise click.ClickException(str(ex))
888+
elif input == "-":
889+
javascript = sys.stdin.read()
890+
else:
891+
try:
892+
with open(input, "r") as f:
893+
javascript = f.read()
894+
except Exception as e:
895+
raise click.ClickException(f"Failed to read file '{input}': {e}")
896+
885897
url = url_or_file_path(url, _check_and_absolutize)
886898
with sync_playwright() as p:
887899
context, browser_obj = _browser_context(

shot_scraper/utils.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,45 @@ def url_or_file_path(url, file_exists=file_exists_never):
3030
if not (url.startswith("http://") or url.startswith("https://")):
3131
return f"http://{url}"
3232
return url
33+
34+
35+
def load_github_script(github_path: str) -> str:
    """
    Load a JavaScript script from a public GitHub repository.

    Format: username/repo/path/to/file.js
      or username/file.js which means username/shot-scraper-scripts/file.js

    A ".js" extension is appended to the final path segment if it is
    missing. The file is fetched from the "main" branch using
    raw.githubusercontent.com.

    Returns the script source decoded as UTF-8.

    Raises ValueError if the path is malformed (too few or empty
    segments) or if the content could not be fetched.
    """
    # Function-scope imports, as in the original. urllib.error is imported
    # explicitly because the except clause below refers to it by name.
    import urllib.error
    import urllib.request

    parts = github_path.split("/")

    # Validate before touching the network: an empty segment such as
    # "/title" or "user//file.js" would otherwise build a malformed URL.
    if len(parts) < 2 or not all(parts):
        raise ValueError(
            "GitHub path format should be 'username/repo/path/to/file.js' or 'username/file.js'"
        )

    if not parts[-1].endswith(".js"):
        parts[-1] += ".js"

    if len(parts) == 2:
        # Short form: username/file.js lives in the conventional repo
        username, file_name = parts
        parts = [username, "shot-scraper-scripts", file_name]

    username = parts[0]
    repo = parts[1]
    file_path = "/".join(parts[2:])

    # Fetch from GitHub
    url = f"https://raw.githubusercontent.com/{username}/{repo}/main/{file_path}"
    try:
        with urllib.request.urlopen(url) as response:
            if response.status == 200:
                return response.read().decode("utf-8")
            else:
                raise ValueError(
                    f"Failed to load content from GitHub: HTTP {response.status}\n"
                    f"URL: {url}"
                )
    except urllib.error.URLError as e:
        # HTTPError (e.g. 404) is a URLError subclass, so it lands here too.
        raise ValueError(f"Error fetching from GitHub: {e}") from e

tests/test_shot_scraper.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pathlib
2+
from unittest.mock import patch, MagicMock
23
import textwrap
34
from click.testing import CliRunner
45
import pytest
@@ -132,6 +133,39 @@ def test_javascript(args, expected):
132133
assert result.output == expected
133134

134135

136+
def test_javascript_input_file():
    """The -i option should execute JavaScript read from a file on disk."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        # write_text() closes each file, so the content is flushed to disk
        # before the CLI reads it (no reliance on garbage collection).
        pathlib.Path("index.html").write_text(TEST_HTML)
        pathlib.Path("script.js").write_text("document.title")
        result = runner.invoke(cli, ["javascript", "index.html", "-i", "script.js"])
        assert result.exit_code == 0, str(result.exception)
        assert result.output == '"Test title"\n'
144+
145+
146+
def test_javascript_input_github():
    """The -i gh:user/name form should fetch the script from GitHub."""
    # Fake HTTP response yielded when the urlopen() result is used as a
    # context manager.
    mock_response = MagicMock()
    mock_response.status = 200
    mock_response.read.return_value = b"document.title"
    # Stand-in for urllib.request.urlopen itself; calling it returns an
    # object whose __enter__ hands back the fake response.
    mock_urlopen = MagicMock()
    mock_urlopen.return_value.__enter__.return_value = mock_response

    runner = CliRunner()
    with patch("urllib.request.urlopen", mock_urlopen):
        with runner.isolated_filesystem():
            # write_text() closes the file so the HTML is flushed to disk
            # before the CLI loads it.
            pathlib.Path("index.html").write_text(TEST_HTML)
            result = runner.invoke(
                cli, ["javascript", "index.html", "-i", "gh:simonw/title"]
            )
            assert result.exit_code == 0, str(result.exception)
            assert result.output == '"Test title"\n'
            # The short form must expand to the shot-scraper-scripts repo
            # and gain the .js extension.
            mock_urlopen.assert_called_once_with(
                "https://raw.githubusercontent.com/simonw/shot-scraper-scripts/main/title.js"
            )
167+
168+
135169
@pytest.mark.parametrize(
136170
"args,expected",
137171
(

0 commit comments

Comments
 (0)