Skip to content

Commit 060ccba

Browse files
authored
Fix excessive link checker retries with domain-level rate limiting (#57510)
1 parent ce8415e commit 060ccba

File tree

1 file changed

+51
-83
lines changed

1 file changed

+51
-83
lines changed

src/links/scripts/rendered-content-link-checker.ts

Lines changed: 51 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,6 +1073,20 @@ async function checkExternalURLCached(
10731073
const now = new Date().getTime()
10741074
const url = href.split('#')[0]
10751075

1076+
// Skip domains that are currently rate limited (with 1 hour TTL)
1077+
const { hostname } = new URL(url)
1078+
const rateLimitTime = _rateLimitedDomains.get(hostname)
1079+
if (rateLimitTime) {
1080+
const oneHourAgo = Date.now() - 60 * 60 * 1000 // 1 hour in ms
1081+
if (rateLimitTime > oneHourAgo) {
1082+
if (verbose) core.info(`Skipping ${url} - domain ${hostname} is rate limited`)
1083+
return { ok: false, statusCode: 429, skipReason: 'Domain rate limited' }
1084+
} else {
1085+
// Rate limit has expired, remove it
1086+
_rateLimitedDomains.delete(hostname)
1087+
}
1088+
}
1089+
10761090
if (cacheMaxAge) {
10771091
const tooOld = now - Math.floor(jitter(cacheMaxAge, 10))
10781092
if (db && db.data.urls[url]) {
@@ -1122,105 +1136,70 @@ async function checkExternalURL(
11221136
return _fetchCache.get(cleanURL)
11231137
}
11241138

1125-
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms))
1126-
1127-
// Global for recording which domains we get rate-limited on.
1128-
// For example, if you got rate limited on `something.github.com/foo`
1129-
// and now we're asked to fetch for `something.github.com/bar`
1130-
// it's good to know to now bother yet.
1131-
const _rateLimitedDomains = new Map()
1139+
// Track domains that have returned 429 to skip future requests
1140+
// Maps hostname to timestamp when rate limit was detected
1141+
const _rateLimitedDomains = new Map<string, number>()
11321142

11331143
async function innerFetch(
11341144
core: CoreInject,
11351145
url: string,
1136-
config: { verbose?: boolean; useGET?: boolean; patient?: boolean; retries?: number } = {},
1146+
config: { verbose?: boolean; patient?: boolean; retries?: number } = {},
11371147
) {
1138-
const { verbose, useGET, patient } = config
1139-
1140-
const { hostname } = new URL(url)
1141-
if (_rateLimitedDomains.has(hostname)) {
1142-
await sleep(_rateLimitedDomains.get(hostname))
1143-
}
1144-
// The way `got` does retries:
1145-
//
1146-
// sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
1147-
//
1148-
// So, it means:
1149-
//
1150-
// 1. ~1000ms
1151-
// 2. ~2000ms
1152-
// 3. ~4000ms
1153-
//
1154-
// ...if the limit we set is 3.
1155-
// Our own timeout, in @/frame/middleware/timeout.js defaults to 10 seconds.
1156-
// So there's no point in trying more attempts than 3 because it would
1157-
// just timeout on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
1158-
const retry = {
1159-
limit: patient ? 6 : 2,
1160-
}
1161-
const timeout = { request: patient ? 10000 : 2000 }
1148+
const { verbose, patient } = config
11621149

11631150
const headers = {
11641151
'User-Agent':
11651152
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
11661153
}
11671154

1168-
const retries = config.retries || 0
1169-
const method = useGET ? 'GET' : 'HEAD'
1155+
const retries = patient ? 3 : 2
1156+
const timeout = patient ? 10000 : 5000
1157+
1158+
if (verbose) core.info(`External URL HEAD: ${url}`)
11701159

1171-
if (verbose) core.info(`External URL ${method}: ${url} (retries: ${retries})`)
11721160
try {
1173-
const r = await fetchWithRetry(
1161+
// Try HEAD first
1162+
let r = await fetchWithRetry(
11741163
url,
11751164
{
1176-
method,
1165+
method: 'HEAD',
11771166
headers,
11781167
},
11791168
{
1180-
retries: retry.limit,
1181-
timeout: timeout.request,
1169+
retries,
1170+
timeout,
11821171
throwHttpErrors: false,
11831172
},
11841173
)
1174+
1175+
// If HEAD doesn't work, try GET
1176+
if (r.status === 405 || r.status === 404 || r.status === 403) {
1177+
if (verbose) core.info(`External URL GET: ${url} (HEAD failed with ${r.status})`)
1178+
r = await fetchWithRetry(
1179+
url,
1180+
{
1181+
method: 'GET',
1182+
headers,
1183+
},
1184+
{
1185+
retries,
1186+
timeout,
1187+
throwHttpErrors: false,
1188+
},
1189+
)
1190+
}
1191+
11851192
if (verbose) {
1186-
core.info(`External URL ${method} ${url}: ${r.status} (retries: ${retries})`)
1193+
core.info(`External URL ${url}: ${r.status}`)
11871194
}
11881195

1189-
// If we get rate limited, remember that this hostname is now all
1190-
// rate limited. And sleep for the number of seconds that the
1191-
// `retry-after` header indicated.
1196+
// Track rate limited domains with timestamp
1197+
const { hostname } = new URL(url)
11921198
if (r.status === 429) {
1193-
let sleepTime = Math.min(
1194-
60_000,
1195-
Math.max(
1196-
10_000,
1197-
r.headers.get('retry-after') ? getRetryAfterSleep(r.headers.get('retry-after')) : 1_000,
1198-
),
1199-
)
1200-
// Sprinkle a little jitter so it doesn't all start again all
1201-
// at the same time
1202-
sleepTime += Math.random() * 10 * 1000
1203-
// Give it a bit extra when we can be really patient
1204-
if (patient) sleepTime += 30 * 1000
1205-
1206-
_rateLimitedDomains.set(hostname, sleepTime + Math.random() * 10 * 1000)
1207-
if (verbose)
1208-
core.info(
1209-
chalk.yellow(
1210-
`Rate limited on ${hostname} (${url}). Sleeping for ${(sleepTime / 1000).toFixed(1)}s`,
1211-
),
1212-
)
1213-
await sleep(sleepTime)
1214-
return innerFetch(core, url, Object.assign({}, config, { retries: retries + 1 }))
1215-
} else {
1216-
_rateLimitedDomains.delete(hostname)
1199+
_rateLimitedDomains.set(hostname, Date.now())
1200+
if (verbose) core.info(`Domain ${hostname} is now rate limited for 1 hour`)
12171201
}
12181202

1219-
// Perhaps the server doesn't support HEAD requests.
1220-
// If so, try again with a regular GET.
1221-
if ((r.status === 405 || r.status === 404 || r.status === 403) && !useGET) {
1222-
return innerFetch(core, url, Object.assign({}, config, { useGET: true }))
1223-
}
12241203
if (verbose) {
12251204
core.info((r.ok ? chalk.green : chalk.red)(`${r.status} on ${url}`))
12261205
}
@@ -1236,17 +1215,6 @@ async function innerFetch(
12361215
}
12371216
}
12381217

1239-
// Return number of milliseconds from a `Retry-After` header value
1240-
function getRetryAfterSleep(headerValue: string | null) {
1241-
if (!headerValue) return 0
1242-
let ms = Math.round(parseFloat(headerValue) * 1000)
1243-
if (isNaN(ms)) {
1244-
const nextDate = new Date(headerValue)
1245-
ms = Math.max(0, nextDate.getTime() - new Date().getTime())
1246-
}
1247-
return ms
1248-
}
1249-
12501218
function checkImageSrc(src: string) {
12511219
if (!src.startsWith('/') && !src.startsWith('http')) {
12521220
return { CRITICAL: 'Image path is not absolute. Should start with a /' }

0 commit comments

Comments
 (0)