Skip to content

Commit eda38df

Browse files
author
Peter Bengtsson
authored
fetch external urls better in link checker (github#30387)
1 parent 80f0502 commit eda38df

File tree

1 file changed

+205
-57
lines changed

1 file changed

+205
-57
lines changed

script/rendered-content-link-checker.js

Lines changed: 205 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import path from 'path'
1212
import cheerio from 'cheerio'
1313
import { program, Option, InvalidArgumentError } from 'commander'
1414
import chalk from 'chalk'
15-
import got from 'got'
15+
import got, { RequestError } from 'got'
1616

1717
import shortVersions from '../middleware/contextualizers/short-versions.js'
1818
import contextualize from '../middleware/context.js'
@@ -74,6 +74,9 @@ program
7474
.option('-v, --verbose', 'Verbose outputs')
7575
.option('--debug', "Loud about everything it's doing")
7676
.option('--random', 'Load pages in a random order (useful for debugging)')
77+
.option('--patient', 'Give external link checking longer timeouts and more retries')
78+
.option('-o, --out <file>', 'Put warnings and errors into a file instead of stdout')
79+
.option('--json-output', 'Print JSON to stdout or file instead')
7780
.option('--max <number>', 'integer argument (default: none)', (value) => {
7881
const parsed = parseInt(value, 10)
7982
if (isNaN(parsed)) {
@@ -107,7 +110,19 @@ program
107110
main(program.opts(), program.args)
108111

109112
async function main(opts, files) {
110-
const { random, language, filter, exit, debug, max, verbose, list, checkExternalLinks } = opts
113+
const {
114+
random,
115+
language,
116+
filter,
117+
exit,
118+
debug,
119+
max,
120+
verbose,
121+
list,
122+
checkExternalLinks,
123+
jsonOutput,
124+
out,
125+
} = opts
111126

112127
// Note! The reason we're using `warmServer()` in this script,
113128
// even though there's no server involved, is because
@@ -162,6 +177,9 @@ async function main(opts, files) {
162177
)
163178
const processPagesEnd = new Date()
164179
const flaws = flawsGroups.flat()
180+
if (jsonOutput) {
181+
jsonPrintFlaws(flaws, opts)
182+
}
165183

166184
debug && printGlobalCacheHitRatio()
167185

@@ -172,6 +190,9 @@ async function main(opts, files) {
172190
console.log(`Took ${getDurationString(processPagesStart, processPagesEnd)}`)
173191

174192
summarizeFlaws(flaws)
193+
if (out && flaws.length > 0) {
194+
console.log(`All flaws written to ${chalk.bold(out)}`)
195+
}
175196
}
176197

177198
if (exit) {
@@ -244,7 +265,7 @@ function getPages(pageList, languages, filters, files, max) {
244265
}
245266

246267
async function processPage(page, pageMap, redirects, opts) {
247-
const { bail, verboseUrl } = opts
268+
const { bail, verboseUrl, jsonOutput, out } = opts
248269

249270
const allFlawsEach = await Promise.all(
250271
page.permalinks.map((permalink) => processPermalink(permalink, page, pageMap, redirects, opts))
@@ -253,17 +274,23 @@ async function processPage(page, pageMap, redirects, opts) {
253274
const allFlaws = allFlawsEach.flat()
254275

255276
if (bail && allFlaws.length > 0) {
256-
printFlaws(allFlaws, verboseUrl)
277+
if (jsonOutput) {
278+
jsonPrintFlaws(allFlaws, opts)
279+
} else {
280+
printFlaws(allFlaws, { verboseUrl, out })
281+
}
257282
process.exit(1)
258283
}
259284

260-
printFlaws(allFlaws, verboseUrl)
285+
if (!jsonOutput) {
286+
printFlaws(allFlaws, { verboseUrl, out })
287+
}
261288

262289
return allFlaws
263290
}
264291

265292
async function processPermalink(permalink, page, pageMap, redirects, opts) {
266-
const { level, checkAnchors, checkImages, checkExternalLinks } = opts
293+
const { level, checkAnchors, checkImages, checkExternalLinks, verbose, patient } = opts
267294
const html = await renderInnerHTML(page, permalink)
268295
const $ = cheerio.load(html)
269296
const flaws = []
@@ -291,7 +318,8 @@ async function processPermalink(permalink, page, pageMap, redirects, opts) {
291318
redirects,
292319
pageMap,
293320
checkAnchors,
294-
checkExternalLinks
321+
checkExternalLinks,
322+
{ verbose, patient }
295323
)
296324

297325
if (flaw) {
@@ -347,36 +375,92 @@ async function processPermalink(permalink, page, pageMap, redirects, opts) {
347375
return flaws
348376
}
349377

350-
function printFlaws(flaws, verboseUrl = null) {
378+
/**
 * Serialize flaws as pretty-printed JSON, grouped by the page's pretty
 * full path, and either write the result to a file or print it to stdout.
 *
 * Link flaws (those with an `href`) are recorded as
 * `{ href, url, text, flaw }`; image flaws (those with a `src`) are
 * recorded as `{ src }`. A flaw with neither contributes no record.
 *
 * @param {Array<object>} flaws - Objects carrying `page`, `permalink`,
 *   `href`, `text`, `src` and `flaw`.
 * @param {object} [options]
 * @param {string|null} [options.verboseUrl] - When set, used as the base
 *   against which `permalink.href` is resolved into an absolute URL.
 * @param {string|null} [options.out] - When set, the JSON is written to
 *   this file (replacing its contents) instead of being logged.
 */
function jsonPrintFlaws(flaws, { verboseUrl = null, out = null } = {}) {
  const flawsByPath = {}

  for (const entry of flaws) {
    const { page, permalink, href, text, src, flaw } = entry
    const pathKey = prettyFullPath(page.fullPath)

    if (!(pathKey in flawsByPath)) {
      flawsByPath[pathKey] = []
    }

    let record = null
    if (href) {
      const url = verboseUrl ? new URL(permalink.href, verboseUrl).toString() : permalink.href
      record = { href, url, text, flaw }
    } else if (src) {
      record = { src }
    }

    if (record) {
      flawsByPath[pathKey].push(record)
    }
  }

  const serialized = JSON.stringify(flawsByPath, undefined, 2)
  if (!out) {
    console.log(serialized)
    return
  }
  fs.writeFileSync(out, `${serialized}\n`, 'utf-8')
}
406+
407+
/**
 * Print flaws in a human-readable form, one group of lines per flaw.
 *
 * When `out` is set every line is appended to that file without any
 * terminal styling; otherwise lines go to stdout and the page path, the
 * href/src and the flaw message are styled with chalk.
 *
 * @param {Array<object>} flaws - Objects carrying `page`, `permalink`,
 *   `href`, `text`, `src` and `flaw`.
 * @param {object} [options]
 * @param {string|null} [options.verboseUrl] - When set, the permalink is
 *   printed as an absolute URL resolved against this base.
 * @param {string|null} [options.out] - File to append to instead of stdout.
 * @throws {Error} If a flaw has neither `href` nor `src`.
 */
function printFlaws(flaws, { verboseUrl = null, out = null } = {}) {
  const toFile = Boolean(out)

  // Styling is applied only for terminal output; file output stays plain.
  const unstyled = (value) => value
  const bold = toFile ? unstyled : (value) => chalk.bold(value)
  const critical = toFile ? unstyled : (value) => chalk.red(value)
  const warning = toFile ? unstyled : (value) => chalk.yellow(value)

  const emit = (line) => {
    if (toFile) {
      fs.appendFileSync(out, `${line}\n`, 'utf-8')
      return
    }
    console.log(line)
  }

  let lastPage = null
  let lastPermalinkHref = null

  for (const entry of flaws) {
    const { page, permalink, href, text, src, flaw } = entry
    const fullPath = prettyFullPath(page.fullPath)

    // Only print the page header when the page changes.
    if (lastPage !== page) {
      emit(`PAGE: ${bold(fullPath)}`)
    }
    lastPage = page

    if (!href && !src) {
      throw new Error("Flaw has neither 'href' nor 'src'")
    }

    if (href) {
      // Only print the permalink line when it differs from the previous one.
      if (lastPermalinkHref !== permalink.href) {
        emit(
          verboseUrl
            ? ` URL: ${new URL(permalink.href, verboseUrl).toString()}`
            : ` PERMALINK: ${permalink.href}`
        )
      }
      lastPermalinkHref = permalink.href

      emit(` HREF: ${bold(href)}`)
      emit(` TEXT: ${text}`)
    } else {
      emit(` IMG SRC: ${bold(src)}`)
    }

    emit(` FLAW: ${flaw.CRITICAL ? critical(flaw.CRITICAL) : warning(flaw.WARNING)}`)
    emit('')
  }
}
382466

@@ -402,7 +486,8 @@ async function checkHrefLink(
402486
redirects,
403487
pageMap,
404488
checkAnchors = false,
405-
checkExternalLinks = false
489+
checkExternalLinks = false,
490+
{ verbose = false, patient = false } = {}
406491
) {
407492
if (href === '#') {
408493
if (checkAnchors) {
@@ -456,42 +541,38 @@ async function checkHrefLink(
456541
if (linksToSkip(href)) {
457542
return
458543
}
459-
let failed = false
460-
461-
try {
462-
failed = await checkExternalURL(href)
463-
} catch (err) {
464-
return { WARNING: `Got error when testing ${href}: ${err.toString()}` }
465-
}
466-
if (failed) {
467-
return { CRITICAL: 'Broken external link ' }
544+
const { ok, ...info } = await checkExternalURL(href, { verbose, patient })
545+
if (!ok) {
546+
return { CRITICAL: `Broken external link (${JSON.stringify(info)})` }
468547
}
469548
}
470549
}
471550

472-
const externalResponseCache = new Map()
473-
const externalResponseWaiting = new Set()
551+
const _fetchCache = new Map()
552+
/**
 * Check an external URL, sharing one in-flight/finished check per URL.
 *
 * The fragment (everything from the first `#`) is stripped before the
 * lookup, so links that differ only by anchor share a single request.
 * What is cached is the promise returned by `innerFetch`, so concurrent
 * callers for the same URL all await the same request.
 *
 * @param {string} url - Must start with `https://`.
 * @param {object} [options]
 * @param {boolean} [options.verbose] - Forwarded to `innerFetch`.
 * @param {boolean} [options.patient] - Forwarded to `innerFetch`.
 * @returns {Promise<object>} Whatever `innerFetch` resolves to.
 * @throws {Error} 'Invalid URL' if `url` does not start with `https://`.
 */
async function checkExternalURL(url, { verbose = false, patient = false } = {}) {
  if (!url.startsWith('https://')) {
    throw new Error('Invalid URL')
  }

  const [urlWithoutFragment] = url.split('#')

  const alreadyRequested = _fetchCache.get(urlWithoutFragment)
  if (alreadyRequested !== undefined) {
    return alreadyRequested
  }

  const pending = innerFetch(urlWithoutFragment, { verbose, patient })
  _fetchCache.set(urlWithoutFragment, pending)
  return pending
}
474560

475561
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
476562

477-
async function checkExternalURL(url) {
478-
if (!url.startsWith('https://')) throw new Error('Invalid URL')
563+
// Global for recording which domains we get rate-limited on.
564+
// For example, if you got rate limited on `something.github.com/foo`
565+
// and now we're asked to fetch for `something.github.com/bar`
566+
// it's good to know to not bother yet.
567+
const _rateLimitedDomains = new Map()
479568

480-
if (externalResponseCache.has(url)) {
481-
const result = externalResponseCache.get(url)
482-
return result
483-
}
484-
if (externalResponseWaiting.has(url)) {
485-
// Because this whole script is based on `Promise.all()` you can't
486-
// guarantee that you first make the list of external URLs distinct,
487-
// so you'll end up with N concurrent threads that both start,
488-
// waiting for the same URL to check.
489-
// If there's one going on, sleep and retry all over.
490-
await sleep(500 + Math.random() * 100)
491-
return await checkExternalURL(url)
492-
}
493-
externalResponseWaiting.add(url)
569+
async function innerFetch(url, config = {}) {
570+
const { verbose, useGET, patient } = config
494571

572+
const { hostname } = new URL(url)
573+
if (_rateLimitedDomains.has(hostname)) {
574+
await sleep(_rateLimitedDomains.get(hostname))
575+
}
495576
// The way `got` does retries:
496577
//
497578
// sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
@@ -507,20 +588,87 @@ async function checkExternalURL(url) {
507588
// So there's no point in trying more attempts than 3 because it would
508589
// just timeout on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
509590
const retry = {
510-
limit: 3,
591+
limit: patient ? 5 : 2,
511592
}
512-
const timeout = 2000
593+
const timeout = { request: patient ? 10000 : 2000 }
513594

514-
const r = await got(url, {
515-
throwHttpErrors: false,
516-
retry,
517-
timeout,
518-
})
595+
const headers = {
596+
'User-Agent':
597+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
598+
}
599+
600+
const retries = config.retries || 0
601+
const httpFunction = useGET ? got.get : got.head
519602

520-
const failed = r.statusCode !== 200
521-
externalResponseCache.set(url, failed)
522-
externalResponseWaiting.delete(url)
523-
return failed
603+
if (verbose) console.log(`External URL ${useGET ? 'GET' : 'HEAD'}: ${url} (retries: ${retries})`)
604+
try {
605+
const r = await httpFunction(url, {
606+
headers,
607+
throwHttpErrors: false,
608+
retry,
609+
timeout,
610+
})
611+
if (verbose) {
612+
console.log(
613+
`External URL ${useGET ? 'GET' : 'HEAD'} ${url}: ${r.statusCode} (retries: ${retries})`
614+
)
615+
}
616+
617+
// If we get rate limited, remember that this hostname is now all
618+
// rate limited. And sleep for the number of seconds that the
619+
// `retry-after` header indicated.
620+
if (r.statusCode === 429) {
621+
let sleepTime = Math.min(
622+
60_000,
623+
Math.max(10_000, getRetryAfterSleep(r.headers['retry-after']))
624+
)
625+
// Sprinkle a little jitter so it doesn't all start again all
626+
// at the same time
627+
sleepTime += Math.random() * 10 * 1000
628+
// Give it a bit extra when we can be really patient
629+
if (patient) sleepTime += 30 * 1000
630+
631+
_rateLimitedDomains.set(hostname, sleepTime + Math.random() * 10 * 1000)
632+
if (verbose)
633+
console.log(
634+
chalk.yellow(
635+
`Rate limited on ${hostname} (${url}). Sleeping for ${(sleepTime / 1000).toFixed(1)}s`
636+
)
637+
)
638+
await sleep(sleepTime)
639+
return innerFetch(url, Object.assign({}, config, { retries: retries + 1 }))
640+
} else {
641+
_rateLimitedDomains.delete(hostname)
642+
}
643+
644+
// Perhaps the server doesn't support HEAD requests.
645+
// If so, try again with a regular GET.
646+
if ((r.statusCode === 405 || r.statusCode === 404) && !useGET) {
647+
return innerFetch(url, Object.assign({}, config, { useGET: true }))
648+
}
649+
if (verbose) {
650+
console.log((r.ok ? chalk.green : chalk.red)(`${r.statusCode} on ${url}`))
651+
}
652+
return { ok: r.ok, statusCode: r.statusCode }
653+
} catch (err) {
654+
if (err instanceof RequestError) {
655+
if (verbose) {
656+
console.log(chalk.yellow(`RequestError (${err.message}) on ${url}`))
657+
}
658+
return { ok: false, requestError: err.message }
659+
}
660+
throw err
661+
}
662+
}
663+
664+
/**
 * Return the number of milliseconds to wait, from a `Retry-After` header value.
 *
 * The header is either a number of seconds or a date. A numeric value is
 * converted to milliseconds; anything else is parsed as a date and the
 * result is the time remaining until that date.
 *
 * The result is always a finite, non-negative number: a missing header, an
 * unparseable value, a negative number of seconds or a date in the past all
 * yield 0. (Without this, an unparseable value produced NaN, which then
 * propagated through the caller's `Math.min`/`Math.max` clamping.)
 *
 * @param {string|undefined} headerValue - Raw `Retry-After` header value.
 * @returns {number} Milliseconds to sleep, never negative and never NaN.
 */
function getRetryAfterSleep(headerValue) {
  if (!headerValue) return 0

  const seconds = Number.parseFloat(headerValue)
  const ms = Number.isNaN(seconds)
    ? new Date(headerValue).getTime() - Date.now()
    : Math.round(seconds * 1000)

  // NaN (invalid date) and ±Infinity are not usable delays.
  return Number.isFinite(ms) ? Math.max(0, ms) : 0
}
525673

526674
function checkImageSrc(src, $) {

0 commit comments

Comments
 (0)