
Commit 0efa5a0

Add depth limiting and path globs options (#65)
1 parent 196a310 commit 0efa5a0

8 files changed, +206 -3 lines changed

.changeset/early-tables-crash.md

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+---
+'lighthouse-parade': minor
+---
+
+Add options: `--max-crawl-depth`, `--include-path-glob`, `--exclude-path-glob`
+
+- `--max-crawl-depth`: Control the maximum depth of crawled links. 1 means only the entry page will be used. 2 means the entry page and any page linked directly from the entry page will be used.
+- `--include-path-glob`: Specify a glob (in quotes) for paths to match. Links to non-matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to allow multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.
+- `--exclude-path-glob`: Specify a glob (in quotes) for paths to exclude. Links to matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to exclude multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.

README.md

Lines changed: 5 additions & 2 deletions

@@ -33,9 +33,12 @@ Runs a crawler on the provided URL. Discovers all URLs and runs a lighthouse rep
 ### Options
 
 ```
---ignore-robots            Crawl pages even if they are listed in the site's robots.txt (default: false)
+--ignore-robots            Crawl pages even if they are listed in the site's robots.txt (default false)
 --crawler-user-agent       Pass a user agent string to be used by the crawler (not by Lighthouse)
---lighthouse-concurrency   Control the maximum number of ligthhouse reports to run concurrently (default: number of CPU cores minus one)
+--lighthouse-concurrency   Control the maximum number of ligthhouse reports to run concurrently (default number of CPU cores minus one)
+--max-crawl-depth          Control the maximum depth of crawled links. 1 means only the entry page will be used. 2 means the entry page and any page linked directly from the entry page will be used.
+--include-path-glob        Specify a glob (in quotes) for paths to match. Links to non-matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to allow multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.
+--exclude-path-glob        Specify a glob (in quotes) for paths to exclude. Links to matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to exclude multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.
 -v, --version              Displays current version
 -h, --help                 Displays help text
 ```
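Taken together, a crawl can be narrowed like this. The first line mirrors the `.example()` call added in cli.ts below; the second line's globs are illustrative and show that an include flag may be repeated:

```
lighthouse-parade https://cloudfour.com --exclude-path-glob "/thinks/*" --max-crawl-depth 2
lighthouse-parade https://cloudfour.com --include-path-glob "/thinks/**" --include-path-glob "/examples/**"
```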

cli.ts

Lines changed: 63 additions & 1 deletion

@@ -25,8 +25,25 @@ const symbols = {
   success: kleur.green('✔'),
 };
 
+const toArray = <T extends unknown>(input: T) =>
+  Array.isArray(input) ? input : [input];
+
+/** Returns whether the given path is a full URL (with protocol, domain, etc.) */
+const isFullURL = (path: string) => {
+  try {
+    // eslint-disable-next-line no-new
+    new URL(path);
+    return true;
+  } catch {}
+
+  return false;
+};
+
 sade('lighthouse-parade <url> [dataDirectory]', true)
   .version(version)
+  .example(
+    'https://cloudfour.com --exclude-path-glob "/thinks/*" --max-crawl-depth 2'
+  )
   .describe(
     'Crawls the site at the provided URL, recording the lighthouse scores for each URL found. The lighthouse data will be stored in the provided directory, which defaults to ./data/YYYY-MM-DDTTZ_HH_MM'
   )
@@ -44,6 +61,18 @@ sade('lighthouse-parade <url> [dataDirectory]', true)
     'Control the maximum number of ligthhouse reports to run concurrently',
     os.cpus().length - 1
   )
+  .option(
+    '--max-crawl-depth',
+    'Control the maximum depth of crawled links. 1 means only the entry page will be used. 2 means the entry page and any page linked directly from the entry page will be used.'
+  )
+  .option(
+    '--include-path-glob',
+    'Specify a glob (in quotes) for paths to match. Links to non-matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to allow multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.'
+  )
+  .option(
+    '--exclude-path-glob',
+    'Specify a glob (in quotes) for paths to exclude. Links to matched paths will not be crawled. The entry page will be crawled regardless of this flag. This flag can be specified multiple times to exclude multiple paths. `*` matches one url segment, `**` matches multiple segments. Trailing slashes are ignored.'
+  )
   .action(
     (
       url,
@@ -65,7 +94,37 @@ sade('lighthouse-parade <url> [dataDirectory]', true)
 
       const userAgent: unknown = opts['crawler-user-agent'];
       if (userAgent !== undefined && typeof userAgent !== 'string') {
-        throw new Error('--crawler-user-agent flag must be a string');
+        throw new Error('--crawler-user-agent must be a string');
+      }
+
+      const maxCrawlDepth: unknown = opts['max-crawl-depth'];
+
+      if (maxCrawlDepth !== undefined && typeof maxCrawlDepth !== 'number') {
+        throw new Error('--max-crawl-depth must be a number');
+      }
+
+      const includePathGlob: unknown[] = toArray(
+        opts['include-path-glob'] as unknown
+      ).filter((glob) => glob !== undefined);
+
+      if (includePathGlob.some((glob) => typeof glob !== 'string')) {
+        throw new Error('--include-path-glob must be string(s)');
+      }
+
+      if ((includePathGlob as string[]).some(isFullURL)) {
+        throw new Error('--include-path-glob must be path(s), not full URL(s)');
+      }
+
+      const excludePathGlob: unknown[] = toArray(
+        opts['exclude-path-glob'] as unknown
+      ).filter((glob) => glob !== undefined);
+
+      if (excludePathGlob.some((glob) => typeof glob !== 'string')) {
+        throw new Error('--exclude-path-glob must be string(s)');
+      }
+
+      if ((excludePathGlob as string[]).some(isFullURL)) {
+        throw new Error('--exclude-path-glob must be path(s), not full URL(s)');
       }
 
       const lighthouseConcurrency = opts['lighthouse-concurrency'];
@@ -74,6 +133,9 @@ sade('lighthouse-parade <url> [dataDirectory]', true)
         ignoreRobotsTxt,
         dataDirectory: dataDirPath,
         lighthouseConcurrency,
+        maxCrawlDepth,
+        includePathGlob: includePathGlob as string[],
+        excludePathGlob: excludePathGlob as string[],
       });
 
       const enum State {
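Two details in the helpers above are worth spelling out: `toArray(...).filter(...)` normalizes the flag values because sade appears to hand back a bare value when a flag is passed once, an array when it is repeated, and `undefined` when it is absent (an assumption inferred from the normalization code, not stated in the diff); and `isFullURL` relies on `new URL(path)` throwing for relative paths. A minimal sketch with illustrative values:

```ts
// Mirrors the helpers defined in cli.ts above; flag values are illustrative.
const toArray = <T extends unknown>(input: T) =>
  Array.isArray(input) ? input : [input];

const isFullURL = (path: string) => {
  try {
    new URL(path); // throws for relative paths, since no base URL is given
    return true;
  } catch {}
  return false;
};

toArray('/thinks/**');                             // ['/thinks/**']            (flag passed once)
toArray(['/thinks/**', '/blog/**']);               // ['/thinks/**', '/blog/**'] (flag repeated)
toArray(undefined).filter((g) => g !== undefined); // []                        (flag absent)

isFullURL('https://cloudfour.com/thinks'); // true  -> rejected with an error
isFullURL('/thinks/**');                   // false -> accepted as a path glob
```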

crawl.ts

Lines changed: 34 additions & 0 deletions

@@ -3,11 +3,18 @@ import type { QueueItem } from 'simplecrawler/queue';
 import type { IncomingMessage } from 'http';
 import { createEmitter } from './emitter';
 import { isContentTypeHtml } from './utilities';
+import globrex from 'globrex';
 
 export interface CrawlOptions {
   /** Whether to crawl pages even if they are listed in the site's robots.txt */
   ignoreRobotsTxt: boolean;
   userAgent?: string;
+  /** Maximum depth of fetched links */
+  maxCrawlDepth?: number;
+  /** Any path that doesn't match these globs will not be crawled. If the array is empty, all paths are allowed. */
+  includePathGlob: string[];
+  /** Any path that matches these globs will not be crawled. */
+  excludePathGlob: string[];
 }
 
 export type CrawlerEvents = {
@@ -26,6 +33,11 @@ export const crawl = (siteUrl: string, opts: CrawlOptions) => {
   const crawler = new Crawler(siteUrl);
   if (opts.userAgent) crawler.userAgent = opts.userAgent;
   crawler.respectRobotsTxt = !opts.ignoreRobotsTxt;
+  if (opts.maxCrawlDepth !== undefined) crawler.maxDepth = opts.maxCrawlDepth;
+
+  crawler.addFetchCondition(
+    createUrlFilter(opts.includePathGlob, opts.excludePathGlob)
+  );
 
   const emitWarning = (queueItem: QueueItem, response: IncomingMessage) => {
     emit(
@@ -53,3 +65,25 @@ export const crawl = (siteUrl: string, opts: CrawlOptions) => {
 
   return { on, promise };
 };
+
+export const createUrlFilter = (
+  includeGlob: string[],
+  excludeGlob: string[]
+) => {
+  const pathIncludeRegexes = includeGlob.map(
+    (glob) => globrex(glob.replace(/\/$/, ''), globOpts).regex
+  );
+  const pathExcludeRegexes = excludeGlob.map(
+    (glob) => globrex(glob.replace(/\/$/, ''), globOpts).regex
+  );
+  return ({ path }: { path: string }) => {
+    const withoutTrailingSlash = path.replace(/\/$/, '');
+    return (
+      (pathIncludeRegexes.length === 0 ||
+        pathIncludeRegexes.some((regex) => regex.test(withoutTrailingSlash))) &&
+      !pathExcludeRegexes.some((regex) => regex.test(withoutTrailingSlash))
+    );
+  };
+};
+
+const globOpts: globrex.Options = { globstar: true, extended: true };
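For a sense of how the filter decides, the sketch below converts a single glob the same way `createUrlFilter` does and checks a few paths. The paths are illustrative; the test file below is the authoritative reference for the intended behavior:

```ts
import globrex from 'globrex';

// Same options as `globOpts` above: with `globstar`, `*` stays within a single
// path segment; `extended` allows patterns such as `{foo,bar}`.
const { regex } = globrex('/thinks/*', { globstar: true, extended: true });

regex.test('/thinks/design');       // true  — `*` matches exactly one segment
regex.test('/thinks/design/tools'); // false — `*` does not cross `/`
regex.test('/thinks');              // false — a segment after /thinks/ is required
```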

package-lock.json

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default.

package.json

Lines changed: 2 additions & 0 deletions

@@ -29,6 +29,7 @@
     "csv": "^5.3.2",
     "csv-parse": "^4.12.0",
     "csv-stringify": "^5.5.1",
+    "globrex": "^0.1.2",
     "kleur": "^4.1.3",
     "lighthouse": "^6.4.0",
     "log-update": "^4.0.0",
@@ -46,6 +47,7 @@
     "@changesets/changelog-github": "^0.2.7",
     "@changesets/cli": "^2.10.3",
     "@cloudfour/eslint-plugin": "^14.0.0",
+    "@types/globrex": "^0.1.0",
     "@types/jest": "^26.0.14",
     "@types/sade": "^1.7.2",
     "@types/simplecrawler": "^1.1.1",

test/path-filter-globs.test.ts

Lines changed: 78 additions & 0 deletions

@@ -0,0 +1,78 @@
+import { createUrlFilter } from '../crawl';
+
+test('if the include array is empty allow any path', () => {
+  const filter = createUrlFilter([], []);
+  expect(filter({ path: '/foo' })).toBeTruthy();
+  expect(filter({ path: '/' })).toBeTruthy();
+  expect(filter({ path: '/asdf/1234' })).toBeTruthy();
+});
+
+test('only allow items matching static include glob', () => {
+  const filter = createUrlFilter(['/foo'], []);
+  expect(filter({ path: '/foo' })).toBeTruthy();
+  expect(filter({ path: '/foo/' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar' })).toBeFalsy();
+  expect(filter({ path: '/foobar' })).toBeFalsy();
+  expect(filter({ path: '/asdf' })).toBeFalsy();
+});
+
+test('only allow items matching include glob', () => {
+  const filter = createUrlFilter(['/foo/*'], []);
+  expect(filter({ path: '/foo' })).toBeFalsy();
+  expect(filter({ path: '/foo/bar' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar/' })).toBeTruthy();
+  expect(filter({ path: '/asdf' })).toBeFalsy();
+});
+
+test('allow items from multiple include globs', () => {
+  const filter = createUrlFilter(['/foo/*', '/foo'], []);
+  expect(filter({ path: '/foo/' })).toBeTruthy();
+  expect(filter({ path: '/foo' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar/' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar/baz' })).toBeFalsy();
+  expect(filter({ path: '/asdf' })).toBeFalsy();
+});
+
+test('glob with star in the middle', () => {
+  const filter = createUrlFilter(['/foo/*/bar'], []);
+  expect(filter({ path: '/foo/asdf/bar' })).toBeTruthy();
+  expect(filter({ path: '/foo/asdf/bar/' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar' })).toBeFalsy();
+  expect(filter({ path: '/foo/asdf/1234' })).toBeFalsy();
+});
+
+test('removes trailing slash from glob', () => {
+  const filter = createUrlFilter(['/foo/'], []);
+  expect(filter({ path: '/foo/' })).toBeTruthy();
+  expect(filter({ path: '/foo' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar' })).toBeFalsy();
+});
+
+test('exclude has higher precedence than include', () => {
+  const filter = createUrlFilter(['/foo/*'], ['/foo/asdf']);
+  expect(filter({ path: '/foo/bar' })).toBeTruthy();
+  expect(filter({ path: '/foo/bar/' })).toBeTruthy();
+  expect(filter({ path: '/foo/asdf' })).toBeFalsy();
+  expect(filter({ path: '/foo/bar/baz' })).toBeFalsy();
+  expect(filter({ path: '/foo/asdfasdf' })).toBeTruthy();
+});
+
+test('globstar and globs in exclude', () => {
+  const filter = createUrlFilter(['/foo/**'], ['/foo/asdf/**']);
+  expect(filter({ path: '/foo' })).toBeFalsy();
+  expect(filter({ path: '/foo/sdf' })).toBeTruthy();
+  expect(filter({ path: '/foo/sdf/asdf' })).toBeTruthy();
+  expect(filter({ path: '/foo/sdf/asdf/' })).toBeTruthy();
+  expect(filter({ path: '/foo/asdf' })).toBeTruthy();
+  expect(filter({ path: '/foo/asdf/foo' })).toBeFalsy();
+  expect(filter({ path: '/foo/asdf/foo/bar' })).toBeFalsy();
+});
+
+test('fancy globs', () => {
+  const filter = createUrlFilter(['/{foo,bar}/*'], []);
+  expect(filter({ path: '/foo/asdf' })).toBeTruthy();
+  expect(filter({ path: '/bar/asdf' })).toBeTruthy();
+  expect(filter({ path: '/bar/asdf/' })).toBeTruthy();
+  expect(filter({ path: '/1234/asdf' })).toBeFalsy();
+});

test/scan-task.test.ts

Lines changed: 4 additions & 0 deletions

@@ -9,6 +9,8 @@ test('Displays useful error if no pages are found while crawling', async () => {
   const { fakeCrawler, emit: scanEmit } = createFakeCrawler();
   const emitter = scan('https://nonexistent-website.com', {
     ignoreRobotsTxt: false,
+    includePathGlob: [],
+    excludePathGlob: [],
     dataDirectory: 'foo',
     lighthouseConcurrency: 1,
     crawler: fakeCrawler,
@@ -60,6 +62,8 @@ test('Fires correct lighthouse events as pages are found', async () => {
 
   const emitter = scan('https://google.com', {
     ignoreRobotsTxt: false,
+    includePathGlob: [],
+    excludePathGlob: [],
     dataDirectory: 'foo',
     lighthouseConcurrency: 1,
     lighthouse: (url) => {
