
Commit 0efa5a0

Add depth limiting and path globs options (#65)
1 parent 196a310 commit 0efa5a0

8 files changed, +206 -3 lines changed

.changeset/early-tables-crash.md

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+---
+'lighthouse-parade': minor
+---
+
+Add options: `--max-crawl-depth`, `--include-path-glob`, `--exclude-path-glob`
+
+- `--max-crawl-depth`: Control the maximum depth of crawled links. 1 means only the entry page will be used. 2 means the entry page and any page linked directly from the entry page will be used.
+- `--include-path-glob`: Specify a glob (in quotes) for paths to match. Links to non-matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to allow multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.
+- `--exclude-path-glob`: Specify a glob (in quotes) for paths to exclude. Links to matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to exclude multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.

README.md

Lines changed: 5 additions & 2 deletions

@@ -33,9 +33,12 @@ Runs a crawler on the provided URL. Discovers all URLs and runs a lighthouse rep
 ### Options
 
 ```
---ignore-robots            Crawl pages even if they are listed in the site's robots.txt (default: false)
+--ignore-robots            Crawl pages even if they are listed in the site's robots.txt (default false)
 --crawler-user-agent       Pass a user agent string to be used by the crawler (not by Lighthouse)
---lighthouse-concurrency   Control the maximum number of ligthhouse reports to run concurrently (default: number of CPU cores minus one)
+--lighthouse-concurrency   Control the maximum number of ligthhouse reports to run concurrently (default number of CPU cores minus one)
+--max-crawl-depth          Control the maximum depth of crawled links. 1 means only the entry page will be used. 2 means the entry page and any page linked directly from the entry page will be used.
+--include-path-glob        Specify a glob (in quotes) for paths to match. Links to non-matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to allow multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.
+--exclude-path-glob        Specify a glob (in quotes) for paths to exclude. Links to matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to exclude multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.
 -v, --version              Displays current version
 -h, --help                 Displays help text
 ```
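Taken together, a crawl can be narrowed like this. The first line mirrors the `.example()` call added in cli.ts below; the second line's globs are illustrative and show that an include flag may be repeated:

```
lighthouse-parade https://cloudfour.com --exclude-path-glob "/thinks/*" --max-crawl-depth 2
lighthouse-parade https://cloudfour.com --include-path-glob "/thinks/**" --include-path-glob "/examples/**"
```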

cli.ts

Lines changed: 63 additions & 1 deletion

@@ -25,8 +25,25 @@ const symbols = {
   success: kleur.green('✔'),
 };
 
+const toArray = <T extends unknown>(input: T) =>
+  Array.isArray(input) ? input : [input];
+
+/** Returns whether the given path is a full URL (with protocol, domain, etc.) */
+const isFullURL = (path: string) => {
+  try {
+    // eslint-disable-next-line no-new
+    new URL(path);
+    return true;
+  } catch {}
+
+  return false;
+};
+
 sade('lighthouse-parade <url> [dataDirectory]', true)
   .version(version)
+  .example(
+    'https://cloudfour.com --exclude-path-glob "/thinks/*" --max-crawl-depth 2'
+  )
   .describe(
     'Crawls the site at the provided URL, recording the lighthouse scores for each URL found. The lighthouse data will be stored in the provided directory, which defaults to ./data/YYYY-MM-DDTTZ_HH_MM'
   )
@@ -44,6 +61,18 @@ sade('lighthouse-parade <url> [dataDirectory]', true)
     'Control the maximum number of ligthhouse reports to run concurrently',
     os.cpus().length - 1
   )
+  .option(
+    '--max-crawl-depth',
+    'Control the maximum depth of crawled links. 1 means only the entry page will be used. 2 means the entry page and any page linked directly from the entry page will be used.'
+  )
+  .option(
+    '--include-path-glob',
+    'Specify a glob (in quotes) for paths to match. Links to non-matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to allow multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.'
+  )
+  .option(
+    '--exclude-path-glob',
+    'Specify a glob (in quotes) for paths to exclude. Links to matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to exclude multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.'
+  )
   .action(
     (
       url,
@@ -65,7 +94,37 @@ sade('lighthouse-parade <url> [dataDirectory]', true)
 
       const userAgent: unknown = opts['crawler-user-agent'];
       if (userAgent !== undefined && typeof userAgent !== 'string') {
-        throw new Error('--crawler-user-agent flag must be a string');
+        throw new Error('--crawler-user-agent must be a string');
+      }
+
+      const maxCrawlDepth: unknown = opts['max-crawl-depth'];
+
+      if (maxCrawlDepth !== undefined && typeof maxCrawlDepth !== 'number') {
+        throw new Error('--max-crawl-depth must be a number');
+      }
+
+      const includePathGlob: unknown[] = toArray(
+        opts['include-path-glob'] as unknown
+      ).filter((glob) => glob !== undefined);
+
+      if (includePathGlob.some((glob) => typeof glob !== 'string')) {
+        throw new Error('--include-path-glob must be string(s)');
+      }
+
+      if ((includePathGlob as string[]).some(isFullURL)) {
+        throw new Error('--include-path-glob must be path(s), not full URL(s)');
+      }
+
+      const excludePathGlob: unknown[] = toArray(
+        opts['exclude-path-glob'] as unknown
+      ).filter((glob) => glob !== undefined);
+
+      if (excludePathGlob.some((glob) => typeof glob !== 'string')) {
+        throw new Error('--exclude-path-glob must be string(s)');
+      }
+
+      if ((excludePathGlob as string[]).some(isFullURL)) {
+        throw new Error('--exclude-path-glob must be path(s), not full URL(s)');
       }
 
       const lighthouseConcurrency = opts['lighthouse-concurrency'];
@@ -74,6 +133,9 @@ sade('lighthouse-parade <url> [dataDirectory]', true)
         ignoreRobotsTxt,
         dataDirectory: dataDirPath,
         lighthouseConcurrency,
+        maxCrawlDepth,
+        includePathGlob: includePathGlob as string[],
+        excludePathGlob: excludePathGlob as string[],
       });
 
       const enum State {
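Two details in the helpers above are worth spelling out: `toArray(...).filter(...)` normalizes the flag values because sade appears to hand back a bare value when a flag is passed once, an array when it is repeated, and `undefined` when it is absent (an assumption inferred from the normalization code, not stated in the diff); and `isFullURL` relies on `new URL(path)` throwing for relative paths. A minimal sketch with illustrative values:

```ts
// Mirrors the helpers defined in cli.ts above; flag values are illustrative.
const toArray = <T extends unknown>(input: T) =>
  Array.isArray(input) ? input : [input];

const isFullURL = (path: string) => {
  try {
    new URL(path); // throws for relative paths, since no base URL is given
    return true;
  } catch {}
  return false;
};

toArray('/thinks/**');                             // ['/thinks/**']            (flag passed once)
toArray(['/thinks/**', '/blog/**']);               // ['/thinks/**', '/blog/**'] (flag repeated)
toArray(undefined).filter((g) => g !== undefined); // []                        (flag absent)

isFullURL('https://cloudfour.com/thinks'); // true  -> rejected with an error
isFullURL('/thinks/**');                   // false -> accepted as a path glob
```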

crawl.ts

Lines changed: 34 additions & 0 deletions

@@ -3,11 +3,18 @@ import type { QueueItem } from 'simplecrawler/queue';
 import type { IncomingMessage } from 'http';
 import { createEmitter } from './emitter';
 import { isContentTypeHtml } from './utilities';
+import globrex from 'globrex';
 
 export interface CrawlOptions {
   /** Whether to crawl pages even if they are listed in the site's robots.txt */
   ignoreRobotsTxt: boolean;
   userAgent?: string;
+  /** Maximum depth of fetched links */
+  maxCrawlDepth?: number;
+  /** Any path that doesn't match these globs will not be crawled. If the array is empty, all paths are allowed. */
+  includePathGlob: string[];
+  /** Any path that matches these globs will not be crawled. */
+  excludePathGlob: string[];
 }
 
 export type CrawlerEvents = {
@@ -26,6 +33,11 @@ export const crawl = (siteUrl: string, opts: CrawlOptions) => {
   const crawler = new Crawler(siteUrl);
   if (opts.userAgent) crawler.userAgent = opts.userAgent;
   crawler.respectRobotsTxt = !opts.ignoreRobotsTxt;
+  if (opts.maxCrawlDepth !== undefined) crawler.maxDepth = opts.maxCrawlDepth;
+
+  crawler.addFetchCondition(
+    createUrlFilter(opts.includePathGlob, opts.excludePathGlob)
+  );
 
   const emitWarning = (queueItem: QueueItem, response: IncomingMessage) => {
     emit(
@@ -53,3 +65,25 @@ export const crawl = (siteUrl: string, opts: CrawlOptions) => {
 
   return { on, promise };
 };
+
+export const createUrlFilter = (
+  includeGlob: string[],
+  excludeGlob: string[]
+) => {
+  const pathIncludeRegexes = includeGlob.map(
+    (glob) => globrex(glob.replace(/\/$/, ''), globOpts).regex
+  );
+  const pathExcludeRegexes = excludeGlob.map(
+    (glob) => globrex(glob.replace(/\/$/, ''), globOpts).regex
+  );
+  return ({ path }: { path: string }) => {
+    const withoutTrailingSlash = path.replace(/\/$/, '');
+    return (
+      (pathIncludeRegexes.length === 0 ||
+        pathIncludeRegexes.some((regex) => regex.test(withoutTrailingSlash))) &&
+      !pathExcludeRegexes.some((regex) => regex.test(withoutTrailingSlash))
+    );
+  };
+};
+
+const globOpts: globrex.Options = { globstar: true, extended: true };
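For a sense of how the filter decides, the sketch below converts a single glob the same way `createUrlFilter` does and checks a few paths. The paths are illustrative; the test file below is the authoritative reference for the intended behavior:

```ts
import globrex from 'globrex';

// Same options as `globOpts` above: with `globstar`, `*` stays within a single
// path segment; `extended` allows patterns such as `{foo,bar}`.
const { regex } = globrex('/thinks/*', { globstar: true, extended: true });

regex.test('/thinks/design');       // true  — `*` matches exactly one segment
regex.test('/thinks/design/tools'); // false — `*` does not cross `/`
regex.test('/thinks');              // false — a segment after /thinks/ is required
```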

package-lock.json

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default.

package.json

Lines changed: 2 additions & 0 deletions

@@ -29,6 +29,7 @@
     "csv": "^5.3.2",
     "csv-parse": "^4.12.0",
     "csv-stringify": "^5.5.1",
+    "globrex": "^0.1.2",
     "kleur": "^4.1.3",
     "lighthouse": "^6.4.0",
     "log-update": "^4.0.0",
@@ -46,6 +47,7 @@
     "@changesets/changelog-github": "^0.2.7",
     "@changesets/cli": "^2.10.3",
     "@cloudfour/eslint-plugin": "^14.0.0",
+    "@types/globrex": "^0.1.0",
     "@types/jest": "^26.0.14",
     "@types/sade": "^1.7.2",
     "@types/simplecrawler": "^1.1.1",

test/path-filter-globs.test.ts

Lines changed: 78 additions & 0 deletions

@@ -0,0 +1,78 @@
+import { createUrlFilter } from '../crawl';
+
+test('if the include array is empty allow any path', () => {
+  const filter = createUrlFilter([], []);
+  expect(filter({ path: '/foo' })).toBeTruthy();
+  expect(filter({ path: '/' })).toBeTruthy();
+  expect(filter({ path: '/asdf/1234' })).toBeTruthy();
+});
+
+test('only allow items matching static include glob', () => {
+  const filter = createUrlFilter(['/foo'], []);
+  expect(filter({ path: '/foo' })).toBeTruthy();
+  expect(filter({ path: '/foo/' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar' })).toBeFalsy();
+  expect(filter({ path: '/foobar' })).toBeFalsy();
+  expect(filter({ path: '/asdf' })).toBeFalsy();
+});
+
+test('only allow items matching include glob', () => {
+  const filter = createUrlFilter(['/foo/*'], []);
+  expect(filter({ path: '/foo' })).toBeFalsy();
+  expect(filter({ path: '/foo/bar' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar/' })).toBeTruthy();
+  expect(filter({ path: '/asdf' })).toBeFalsy();
+});
+
+test('allow items from multiple include globs', () => {
+  const filter = createUrlFilter(['/foo/*', '/foo'], []);
+  expect(filter({ path: '/foo/' })).toBeTruthy();
+  expect(filter({ path: '/foo' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar/' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar/baz' })).toBeFalsy();
+  expect(filter({ path: '/asdf' })).toBeFalsy();
+});
+
+test('glob with star in the middle', () => {
+  const filter = createUrlFilter(['/foo/*/bar'], []);
+  expect(filter({ path: '/foo/asdf/bar' })).toBeTruthy();
+  expect(filter({ path: '/foo/asdf/bar/' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar' })).toBeFalsy();
+  expect(filter({ path: '/foo/asdf/1234' })).toBeFalsy();
+});
+
+test('removes trailing slash from glob', () => {
+  const filter = createUrlFilter(['/foo/'], []);
+  expect(filter({ path: '/foo/' })).toBeTruthy();
+  expect(filter({ path: '/foo' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar' })).toBeFalsy();
+});
+
+test('exclude has higher precedence than include', () => {
+  const filter = createUrlFilter(['/foo/*'], ['/foo/asdf']);
+  expect(filter({ path: '/foo/bar' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar/' })).toBeTruthy();
+  expect(filter({ path: '/foo/asdf' })).toBeFalsy();
+  expect(filter({ path: '/foo/bar/baz' })).toBeFalsy();
+  expect(filter({ path: '/foo/asdfasdf' })).toBeTruthy();
+});
+
+test('globstar and globs in exclude', () => {
+  const filter = createUrlFilter(['/foo/**'], ['/foo/asdf/**']);
+  expect(filter({ path: '/foo' })).toBeFalsy();
+  expect(filter({ path: '/foo/sdf' })).toBeTruthy();
+  expect(filter({ path: '/foo/sdf/asdf' })).toBeTruthy();
+  expect(filter({ path: '/foo/sdf/asdf/' })).toBeTruthy();
+  expect(filter({ path: '/foo/asdf' })).toBeTruthy();
+  expect(filter({ path: '/foo/asdf/foo' })).toBeFalsy();
+  expect(filter({ path: '/foo/asdf/foo/bar' })).toBeFalsy();
+});
+
+test('fancy globs', () => {
+  const filter = createUrlFilter(['/{foo,bar}/*'], []);
+  expect(filter({ path: '/foo/asdf' })).toBeTruthy();
+  expect(filter({ path: '/bar/asdf' })).toBeTruthy();
+  expect(filter({ path: '/bar/asdf/' })).toBeTruthy();
+  expect(filter({ path: '/1234/asdf' })).toBeFalsy();
+});

test/scan-task.test.ts

Lines changed: 4 additions & 0 deletions

@@ -9,6 +9,8 @@ test('Displays useful error if no pages are found while crawling', async () => {
   const { fakeCrawler, emit: scanEmit } = createFakeCrawler();
   const emitter = scan('https://nonexistent-website.com', {
     ignoreRobotsTxt: false,
+    includePathGlob: [],
+    excludePathGlob: [],
     dataDirectory: 'foo',
     lighthouseConcurrency: 1,
     crawler: fakeCrawler,
@@ -60,6 +62,8 @@ test('Fires correct lighthouse events as pages are found', async () => {
 
   const emitter = scan('https://google.com', {
     ignoreRobotsTxt: false,
+    includePathGlob: [],
+    excludePathGlob: [],
     dataDirectory: 'foo',
     lighthouseConcurrency: 1,
     lighthouse: (url) => {
