@@ -1073,6 +1073,20 @@ async function checkExternalURLCached(
10731073 const now = new Date ( ) . getTime ( )
10741074 const url = href . split ( '#' ) [ 0 ]
10751075
1076+ // Skip domains that are currently rate limited (with 1 hour TTL)
1077+ const { hostname } = new URL ( url )
1078+ const rateLimitTime = _rateLimitedDomains . get ( hostname )
1079+ if ( rateLimitTime ) {
1080+ const oneHourAgo = Date . now ( ) - 60 * 60 * 1000 // 1 hour in ms
1081+ if ( rateLimitTime > oneHourAgo ) {
1082+ if ( verbose ) core . info ( `Skipping ${ url } - domain ${ hostname } is rate limited` )
1083+ return { ok : false , statusCode : 429 , skipReason : 'Domain rate limited' }
1084+ } else {
1085+ // Rate limit has expired, remove it
1086+ _rateLimitedDomains . delete ( hostname )
1087+ }
1088+ }
1089+
10761090 if ( cacheMaxAge ) {
10771091 const tooOld = now - Math . floor ( jitter ( cacheMaxAge , 10 ) )
10781092 if ( db && db . data . urls [ url ] ) {
@@ -1122,105 +1136,70 @@ async function checkExternalURL(
11221136 return _fetchCache . get ( cleanURL )
11231137}
11241138
1125- const sleep = ( ms : number ) => new Promise ( ( resolve ) => setTimeout ( resolve , ms ) )
1126-
1127- // Global for recording which domains we get rate-limited on.
1128- // For example, if you got rate limited on `something.github.com/foo`
1129- // and now we're asked to fetch for `something.github.com/bar`
1130- // it's good to know to now bother yet.
1131- const _rateLimitedDomains = new Map ( )
1139+ // Track domains that have returned 429 to skip future requests
1140+ // Maps hostname to timestamp when rate limit was detected
1141+ const _rateLimitedDomains = new Map < string , number > ( )
11321142
11331143async function innerFetch (
11341144 core : CoreInject ,
11351145 url : string ,
1136- config : { verbose ?: boolean ; useGET ?: boolean ; patient ?: boolean ; retries ?: number } = { } ,
1146+ config : { verbose ?: boolean ; patient ?: boolean ; retries ?: number } = { } ,
11371147) {
1138- const { verbose, useGET, patient } = config
1139-
1140- const { hostname } = new URL ( url )
1141- if ( _rateLimitedDomains . has ( hostname ) ) {
1142- await sleep ( _rateLimitedDomains . get ( hostname ) )
1143- }
1144- // The way `got` does retries:
1145- //
1146- // sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
1147- //
1148- // So, it means:
1149- //
1150- // 1. ~1000ms
1151- // 2. ~2000ms
1152- // 3. ~4000ms
1153- //
1154- // ...if the limit we set is 3.
1155- // Our own timeout, in @/frame/middleware/timeout.js defaults to 10 seconds.
1156- // So there's no point in trying more attempts than 3 because it would
1157- // just timeout on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
1158- const retry = {
1159- limit : patient ? 6 : 2 ,
1160- }
1161- const timeout = { request : patient ? 10000 : 2000 }
1148+ const { verbose, patient } = config
11621149
11631150 const headers = {
11641151 'User-Agent' :
11651152 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36' ,
11661153 }
11671154
1168- const retries = config . retries || 0
1169- const method = useGET ? 'GET' : 'HEAD'
1155+ const retries = patient ? 3 : 2
1156+ const timeout = patient ? 10000 : 5000
1157+
1158+ if ( verbose ) core . info ( `External URL HEAD: ${ url } ` )
11701159
1171- if ( verbose ) core . info ( `External URL ${ method } : ${ url } (retries: ${ retries } )` )
11721160 try {
1173- const r = await fetchWithRetry (
1161+ // Try HEAD first
1162+ let r = await fetchWithRetry (
11741163 url ,
11751164 {
1176- method,
1165+ method : 'HEAD' ,
11771166 headers,
11781167 } ,
11791168 {
1180- retries : retry . limit ,
1181- timeout : timeout . request ,
1169+ retries,
1170+ timeout,
11821171 throwHttpErrors : false ,
11831172 } ,
11841173 )
1174+
1175+ // If HEAD doesn't work, try GET
1176+ if ( r . status === 405 || r . status === 404 || r . status === 403 ) {
1177+ if ( verbose ) core . info ( `External URL GET: ${ url } (HEAD failed with ${ r . status } )` )
1178+ r = await fetchWithRetry (
1179+ url ,
1180+ {
1181+ method : 'GET' ,
1182+ headers,
1183+ } ,
1184+ {
1185+ retries,
1186+ timeout,
1187+ throwHttpErrors : false ,
1188+ } ,
1189+ )
1190+ }
1191+
11851192 if ( verbose ) {
1186- core . info ( `External URL ${ method } ${ url } : ${ r . status } (retries: ${ retries } ) ` )
1193+ core . info ( `External URL ${ url } : ${ r . status } ` )
11871194 }
11881195
1189- // If we get rate limited, remember that this hostname is now all
1190- // rate limited. And sleep for the number of seconds that the
1191- // `retry-after` header indicated.
1196+ // Track rate limited domains with timestamp
1197+ const { hostname } = new URL ( url )
11921198 if ( r . status === 429 ) {
1193- let sleepTime = Math . min (
1194- 60_000 ,
1195- Math . max (
1196- 10_000 ,
1197- r . headers . get ( 'retry-after' ) ? getRetryAfterSleep ( r . headers . get ( 'retry-after' ) ) : 1_000 ,
1198- ) ,
1199- )
1200- // Sprinkle a little jitter so it doesn't all start again all
1201- // at the same time
1202- sleepTime += Math . random ( ) * 10 * 1000
1203- // Give it a bit extra when we can be really patient
1204- if ( patient ) sleepTime += 30 * 1000
1205-
1206- _rateLimitedDomains . set ( hostname , sleepTime + Math . random ( ) * 10 * 1000 )
1207- if ( verbose )
1208- core . info (
1209- chalk . yellow (
1210- `Rate limited on ${ hostname } (${ url } ). Sleeping for ${ ( sleepTime / 1000 ) . toFixed ( 1 ) } s` ,
1211- ) ,
1212- )
1213- await sleep ( sleepTime )
1214- return innerFetch ( core , url , Object . assign ( { } , config , { retries : retries + 1 } ) )
1215- } else {
1216- _rateLimitedDomains . delete ( hostname )
1199+ _rateLimitedDomains . set ( hostname , Date . now ( ) )
1200+ if ( verbose ) core . info ( `Domain ${ hostname } is now rate limited for 1 hour` )
12171201 }
12181202
1219- // Perhaps the server doesn't support HEAD requests.
1220- // If so, try again with a regular GET.
1221- if ( ( r . status === 405 || r . status === 404 || r . status === 403 ) && ! useGET ) {
1222- return innerFetch ( core , url , Object . assign ( { } , config , { useGET : true } ) )
1223- }
12241203 if ( verbose ) {
12251204 core . info ( ( r . ok ? chalk . green : chalk . red ) ( `${ r . status } on ${ url } ` ) )
12261205 }
@@ -1236,17 +1215,6 @@ async function innerFetch(
12361215 }
12371216}
12381217
1239- // Return number of milliseconds from a `Retry-After` header value
1240- function getRetryAfterSleep ( headerValue : string | null ) {
1241- if ( ! headerValue ) return 0
1242- let ms = Math . round ( parseFloat ( headerValue ) * 1000 )
1243- if ( isNaN ( ms ) ) {
1244- const nextDate = new Date ( headerValue )
1245- ms = Math . max ( 0 , nextDate . getTime ( ) - new Date ( ) . getTime ( ) )
1246- }
1247- return ms
1248- }
1249-
12501218function checkImageSrc ( src : string ) {
12511219 if ( ! src . startsWith ( '/' ) && ! src . startsWith ( 'http' ) ) {
12521220 return { CRITICAL : 'Image path is not absolute. Should start with a /' }
0 commit comments