@@ -12,7 +12,7 @@ import path from 'path'
12
12
import cheerio from 'cheerio'
13
13
import { program , Option , InvalidArgumentError } from 'commander'
14
14
import chalk from 'chalk'
15
- import got from 'got'
15
+ import got , { RequestError } from 'got'
16
16
17
17
import shortVersions from '../middleware/contextualizers/short-versions.js'
18
18
import contextualize from '../middleware/context.js'
@@ -74,6 +74,9 @@ program
74
74
. option ( '-v, --verbose' , 'Verbose outputs' )
75
75
. option ( '--debug' , "Loud about everything it's doing" )
76
76
. option ( '--random' , 'Load pages in a random order (useful for debugging)' )
77
+ . option ( '--patient' , 'Give external link checking longer timeouts and more retries' )
78
+ . option ( '-o, --out <file>' , 'Put warnings and errors into a file instead of stdout' )
79
+ . option ( '--json-output' , 'Print JSON to stdout or file instead' )
77
80
. option ( '--max <number>' , 'integer argument (default: none)' , ( value ) => {
78
81
const parsed = parseInt ( value , 10 )
79
82
if ( isNaN ( parsed ) ) {
@@ -107,7 +110,19 @@ program
107
110
main ( program . opts ( ) , program . args )
108
111
109
112
async function main ( opts , files ) {
110
- const { random, language, filter, exit, debug, max, verbose, list, checkExternalLinks } = opts
113
+ const {
114
+ random,
115
+ language,
116
+ filter,
117
+ exit,
118
+ debug,
119
+ max,
120
+ verbose,
121
+ list,
122
+ checkExternalLinks,
123
+ jsonOutput,
124
+ out,
125
+ } = opts
111
126
112
127
// Note! The reason we're using `warmServer()` in this script,
113
128
// even though there's no server involved, is because
@@ -162,6 +177,9 @@ async function main(opts, files) {
162
177
)
163
178
const processPagesEnd = new Date ( )
164
179
const flaws = flawsGroups . flat ( )
180
+ if ( jsonOutput ) {
181
+ jsonPrintFlaws ( flaws , opts )
182
+ }
165
183
166
184
debug && printGlobalCacheHitRatio ( )
167
185
@@ -172,6 +190,9 @@ async function main(opts, files) {
172
190
console . log ( `Took ${ getDurationString ( processPagesStart , processPagesEnd ) } ` )
173
191
174
192
summarizeFlaws ( flaws )
193
+ if ( out && flaws . length > 0 ) {
194
+ console . log ( `All flaws written to ${ chalk . bold ( out ) } ` )
195
+ }
175
196
}
176
197
177
198
if ( exit ) {
@@ -244,7 +265,7 @@ function getPages(pageList, languages, filters, files, max) {
244
265
}
245
266
246
267
async function processPage ( page , pageMap , redirects , opts ) {
247
- const { bail, verboseUrl } = opts
268
+ const { bail, verboseUrl, jsonOutput , out } = opts
248
269
249
270
const allFlawsEach = await Promise . all (
250
271
page . permalinks . map ( ( permalink ) => processPermalink ( permalink , page , pageMap , redirects , opts ) )
@@ -253,17 +274,23 @@ async function processPage(page, pageMap, redirects, opts) {
253
274
const allFlaws = allFlawsEach . flat ( )
254
275
255
276
if ( bail && allFlaws . length > 0 ) {
256
- printFlaws ( allFlaws , verboseUrl )
277
+ if ( jsonOutput ) {
278
+ jsonPrintFlaws ( allFlaws , opts )
279
+ } else {
280
+ printFlaws ( allFlaws , { verboseUrl, out } )
281
+ }
257
282
process . exit ( 1 )
258
283
}
259
284
260
- printFlaws ( allFlaws , verboseUrl )
285
+ if ( ! jsonOutput ) {
286
+ printFlaws ( allFlaws , { verboseUrl, out } )
287
+ }
261
288
262
289
return allFlaws
263
290
}
264
291
265
292
async function processPermalink ( permalink , page , pageMap , redirects , opts ) {
266
- const { level, checkAnchors, checkImages, checkExternalLinks } = opts
293
+ const { level, checkAnchors, checkImages, checkExternalLinks, verbose , patient } = opts
267
294
const html = await renderInnerHTML ( page , permalink )
268
295
const $ = cheerio . load ( html )
269
296
const flaws = [ ]
@@ -291,7 +318,8 @@ async function processPermalink(permalink, page, pageMap, redirects, opts) {
291
318
redirects ,
292
319
pageMap ,
293
320
checkAnchors ,
294
- checkExternalLinks
321
+ checkExternalLinks ,
322
+ { verbose, patient }
295
323
)
296
324
297
325
if ( flaw ) {
@@ -347,36 +375,92 @@ async function processPermalink(permalink, page, pageMap, redirects, opts) {
347
375
return flaws
348
376
}
349
377
350
- function printFlaws ( flaws , verboseUrl = null ) {
378
/**
 * Serialize flaws as pretty-printed JSON, grouped by the page's
 * pretty-printed source path, and emit the result in one go.
 *
 * Each link flaw becomes `{ href, url, text, flaw }` and each image flaw
 * becomes `{ src, flaw }`, so consumers can always tell *what* is wrong,
 * not just *where*.
 *
 * @param {Array<object>} flaws - Flaw records; each is read for `page`,
 *   `permalink`, `href`, `text`, `src` and `flaw`.
 * @param {object} [options]
 * @param {?string} [options.verboseUrl] - When set, link flaw URLs are made
 *   absolute by resolving the permalink against this base.
 * @param {?string} [options.out] - When set, the JSON is written to this
 *   file (overwriting it); otherwise it is printed to stdout.
 * @throws {Error} If a flaw has neither `href` nor `src` (same contract as
 *   `printFlaws`).
 */
function jsonPrintFlaws(flaws, { verboseUrl = null, out = null } = {}) {
  const printableFlaws = {}
  for (const { page, permalink, href, text, src, flaw } of flaws) {
    const fullPath = prettyFullPath(page.fullPath)

    if (!(fullPath in printableFlaws)) {
      printableFlaws[fullPath] = []
    }
    if (href) {
      printableFlaws[fullPath].push({
        href,
        url: verboseUrl ? new URL(permalink.href, verboseUrl).toString() : permalink.href,
        text,
        flaw,
      })
    } else if (src) {
      // Keep the flaw description for images too; without it the JSON only
      // says that *something* is wrong with the image.
      printableFlaws[fullPath].push({
        src,
        flaw,
      })
    } else {
      // Fail loudly instead of silently dropping a malformed flaw.
      throw new Error("Flaw has neither 'href' nor 'src'")
    }
  }
  const message = JSON.stringify(printableFlaws, undefined, 2)
  if (out) {
    fs.writeFileSync(out, message + '\n', 'utf-8')
  } else {
    console.log(message)
  }
}
406
+
407
/**
 * Print flaws in a human-readable layout: a PAGE header whenever the page
 * changes, a URL/PERMALINK line whenever the permalink changes, then the
 * details of each flaw followed by an empty line.
 *
 * @param {Array<object>} flaws - Flaw records; each is read for `page`,
 *   `permalink`, `href`, `text`, `src` and `flaw`.
 * @param {object} [options]
 * @param {?string} [options.verboseUrl] - Base used to print absolute URLs
 *   instead of permalinks.
 * @param {?string} [options.out] - When set, every line is appended to this
 *   file without colors; otherwise lines go to stdout with chalk styling.
 * @throws {Error} If a flaw has neither `href` nor `src`.
 */
function printFlaws(flaws, { verboseUrl = null, out = null } = {}) {
  const toFile = Boolean(out)

  // One sink for every line: append to the file, or print to stdout.
  const emit = toFile
    ? (line) => fs.appendFileSync(out, `${line} \n`, 'utf-8')
    : (line) => console.log(line)

  // Styling is only applied on stdout; file output stays plain text.
  const plain = (value) => value
  const bold = toFile ? plain : (value) => chalk.bold(value)
  const critical = toFile ? plain : (value) => chalk.red(value)
  const warning = toFile ? plain : (value) => chalk.yellow(value)

  let lastPage = null
  let lastPermalinkHref = null

  for (const { page, permalink, href, text, src, flaw } of flaws) {
    const fullPath = prettyFullPath(page.fullPath)
    if (page !== lastPage) {
      emit(`PAGE: ${bold(fullPath)} `)
      lastPage = page
    }

    if (!href && !src) {
      throw new Error("Flaw has neither 'href' nor 'src'")
    }

    if (href) {
      if (lastPermalinkHref !== permalink.href) {
        const location = verboseUrl
          ? ` URL: ${new URL(permalink.href, verboseUrl).toString()} `
          : ` PERMALINK: ${permalink.href} `
        emit(location)
      }
      lastPermalinkHref = permalink.href

      emit(` HREF: ${bold(href)} `)
      emit(` TEXT: ${text} `)
    } else {
      emit(` IMG SRC: ${bold(src)} `)
    }

    emit(` FLAW: ${flaw.CRITICAL ? critical(flaw.CRITICAL) : warning(flaw.WARNING)} `)
    emit('')
  }
}
382
466
@@ -402,7 +486,8 @@ async function checkHrefLink(
402
486
redirects ,
403
487
pageMap ,
404
488
checkAnchors = false ,
405
- checkExternalLinks = false
489
+ checkExternalLinks = false ,
490
+ { verbose = false , patient = false } = { }
406
491
) {
407
492
if ( href === '#' ) {
408
493
if ( checkAnchors ) {
@@ -456,42 +541,38 @@ async function checkHrefLink(
456
541
if ( linksToSkip ( href ) ) {
457
542
return
458
543
}
459
- let failed = false
460
-
461
- try {
462
- failed = await checkExternalURL ( href )
463
- } catch ( err ) {
464
- return { WARNING : `Got error when testing ${ href } : ${ err . toString ( ) } ` }
465
- }
466
- if ( failed ) {
467
- return { CRITICAL : 'Broken external link ' }
544
+ const { ok, ...info } = await checkExternalURL ( href , { verbose, patient } )
545
+ if ( ! ok ) {
546
+ return { CRITICAL : `Broken external link (${ JSON . stringify ( info ) } )` }
468
547
}
469
548
}
470
549
}
471
550
472
- const externalResponseCache = new Map ( )
473
- const externalResponseWaiting = new Set ( )
551
+ const _fetchCache = new Map ( )
552
/**
 * Check an external URL, issuing at most one `innerFetch` per distinct URL
 * (ignoring any `#fragment`).
 *
 * The value stored in `_fetchCache` is whatever `innerFetch` returns, cached
 * before it is awaited, so callers asking for the same URL while a check is
 * still running share that one result.
 *
 * @param {string} url - Must start with `https://`.
 * @param {object} [options]
 * @param {boolean} [options.verbose=false] - Forwarded to `innerFetch`.
 * @param {boolean} [options.patient=false] - Forwarded to `innerFetch`.
 * @returns {Promise<object>} The result of `innerFetch` for the URL.
 * @throws {Error} 'Invalid URL' when the URL is not https.
 */
async function checkExternalURL(url, { verbose = false, patient = false } = {}) {
  const isHttps = url.startsWith('https://')
  if (!isHttps) {
    throw new Error('Invalid URL')
  }

  // The fragment never reaches the server, so it must not split the cache.
  const [withoutFragment] = url.split('#')

  if (_fetchCache.has(withoutFragment)) {
    return _fetchCache.get(withoutFragment)
  }

  const pending = innerFetch(withoutFragment, { verbose, patient })
  _fetchCache.set(withoutFragment, pending)
  return pending
}
474
560
475
561
const sleep = ( ms ) => new Promise ( ( resolve ) => setTimeout ( resolve , ms ) )
476
562
477
- async function checkExternalURL ( url ) {
478
- if ( ! url . startsWith ( 'https://' ) ) throw new Error ( 'Invalid URL' )
563
+ // Global for recording which domains we get rate-limited on.
564
+ // For example, if you got rate limited on `something.github.com/foo`
565
+ // and now we're asked to fetch for `something.github.com/bar`
566
+ // it's good to know to not bother yet.
567
+ const _rateLimitedDomains = new Map ( )
479
568
480
- if ( externalResponseCache . has ( url ) ) {
481
- const result = externalResponseCache . get ( url )
482
- return result
483
- }
484
- if ( externalResponseWaiting . has ( url ) ) {
485
- // Because this whole script is based on `Promise.all()` you can't
486
- // guarantee that you first make the list of external URLs distinct,
487
- // so you'll end up with N concurrent threads that both start,
488
- // waiting for the same URL to check.
489
- // If there's one going on, sleep and retry all over.
490
- await sleep ( 500 + Math . random ( ) * 100 )
491
- return await checkExternalURL ( url )
492
- }
493
- externalResponseWaiting . add ( url )
569
+ async function innerFetch ( url , config = { } ) {
570
+ const { verbose, useGET, patient } = config
494
571
572
+ const { hostname } = new URL ( url )
573
+ if ( _rateLimitedDomains . has ( hostname ) ) {
574
+ await sleep ( _rateLimitedDomains . get ( hostname ) )
575
+ }
495
576
// The way `got` does retries:
496
577
//
497
578
// sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
@@ -507,20 +588,87 @@ async function checkExternalURL(url) {
507
588
// So there's no point in trying more attempts than 3 because it would
508
589
// just timeout on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
509
590
const retry = {
510
- limit : 3 ,
591
+ limit : patient ? 5 : 2 ,
511
592
}
512
- const timeout = 2000
593
+ const timeout = { request : patient ? 10000 : 2000 }
513
594
514
- const r = await got ( url , {
515
- throwHttpErrors : false ,
516
- retry,
517
- timeout,
518
- } )
595
+ const headers = {
596
+ 'User-Agent' :
597
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36' ,
598
+ }
599
+
600
+ const retries = config . retries || 0
601
+ const httpFunction = useGET ? got . get : got . head
519
602
520
- const failed = r . statusCode !== 200
521
- externalResponseCache . set ( url , failed )
522
- externalResponseWaiting . delete ( url )
523
- return failed
603
+ if ( verbose ) console . log ( `External URL ${ useGET ? 'GET' : 'HEAD' } : ${ url } (retries: ${ retries } )` )
604
+ try {
605
+ const r = await httpFunction ( url , {
606
+ headers,
607
+ throwHttpErrors : false ,
608
+ retry,
609
+ timeout,
610
+ } )
611
+ if ( verbose ) {
612
+ console . log (
613
+ `External URL ${ useGET ? 'GET' : 'HEAD' } ${ url } : ${ r . statusCode } (retries: ${ retries } )`
614
+ )
615
+ }
616
+
617
+ // If we get rate limited, remember that this hostname is now all
618
+ // rate limited. And sleep for the number of seconds that the
619
+ // `retry-after` header indicated.
620
+ if ( r . statusCode === 429 ) {
621
+ let sleepTime = Math . min (
622
+ 60_000 ,
623
+ Math . max ( 10_000 , getRetryAfterSleep ( r . headers [ 'retry-after' ] ) )
624
+ )
625
+ // Sprinkle a little jitter so it doesn't all start again all
626
+ // at the same time
627
+ sleepTime += Math . random ( ) * 10 * 1000
628
+ // Give it a bit extra when we can be really patient
629
+ if ( patient ) sleepTime += 30 * 1000
630
+
631
+ _rateLimitedDomains . set ( hostname , sleepTime + Math . random ( ) * 10 * 1000 )
632
+ if ( verbose )
633
+ console . log (
634
+ chalk . yellow (
635
+ `Rate limited on ${ hostname } (${ url } ). Sleeping for ${ ( sleepTime / 1000 ) . toFixed ( 1 ) } s`
636
+ )
637
+ )
638
+ await sleep ( sleepTime )
639
+ return innerFetch ( url , Object . assign ( { } , config , { retries : retries + 1 } ) )
640
+ } else {
641
+ _rateLimitedDomains . delete ( hostname )
642
+ }
643
+
644
+ // Perhaps the server doesn't support HEAD requests.
645
+ // If so, try again with a regular GET.
646
+ if ( ( r . statusCode === 405 || r . statusCode === 404 ) && ! useGET ) {
647
+ return innerFetch ( url , Object . assign ( { } , config , { useGET : true } ) )
648
+ }
649
+ if ( verbose ) {
650
+ console . log ( ( r . ok ? chalk . green : chalk . red ) ( `${ r . statusCode } on ${ url } ` ) )
651
+ }
652
+ return { ok : r . ok , statusCode : r . statusCode }
653
+ } catch ( err ) {
654
+ if ( err instanceof RequestError ) {
655
+ if ( verbose ) {
656
+ console . log ( chalk . yellow ( `RequestError (${ err . message } ) on ${ url } ` ) )
657
+ }
658
+ return { ok : false , requestError : err . message }
659
+ }
660
+ throw err
661
+ }
662
+ }
663
+
664
/**
 * Convert a `Retry-After` header value into a number of milliseconds to wait.
 *
 * The header is either a number of seconds (e.g. `"120"`) or an HTTP date
 * (e.g. `"Wed, 21 Oct 2015 07:28:00 GMT"`).
 *
 * @param {string|undefined|null} headerValue - Raw header value, if any.
 * @returns {number} A finite, non-negative number of milliseconds. Returns 0
 *   when the header is missing, unparseable, negative, or a date in the past,
 *   so callers can safely feed the result into `Math.max`/`Math.min`.
 */
function getRetryAfterSleep(headerValue) {
  if (!headerValue) return 0

  const raw = String(headerValue).trim()

  // `Number()` (unlike `parseFloat()`) rejects strings that merely *start*
  // with digits, so date-like values fall through to the date branch.
  const seconds = Number(raw)
  if (raw !== '' && Number.isFinite(seconds)) {
    return Math.max(0, Math.round(seconds * 1000))
  }

  // An invalid date yields NaN, which must not leak out: NaN would poison
  // the caller's min/max clamping and turn the back-off into a no-op sleep.
  const untilDate = new Date(raw).getTime() - Date.now()
  return Number.isFinite(untilDate) ? Math.max(0, untilDate) : 0
}
525
673
526
674
function checkImageSrc ( src , $ ) {
0 commit comments