From 55998fcb1f857f8e5969347e57716a56f0a1e17b Mon Sep 17 00:00:00 2001 From: Pedro Sousa <680496+pedrosousa@users.noreply.github.com> Date: Tue, 17 Dec 2024 10:19:07 +0000 Subject: [PATCH 1/5] [Docs] Update API link crawler --- bin/crawl-api-links.js | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/bin/crawl-api-links.js b/bin/crawl-api-links.js index 7dae34ef6eedc9f..f99be4c81b6bdca 100644 --- a/bin/crawl-api-links.js +++ b/bin/crawl-api-links.js @@ -21,6 +21,13 @@ async function checkLinks() { }); const page = await browser.newPage(); + // skip image requests + await page.setRequestInterception(true); + page.on("request", (request) => { + if (request.resourceType() === "image") request.abort(); + else request.continue(); + }); + const sitemapUrl = "https://developers.cloudflare.com/sitemap.xml"; await page.goto(sitemapUrl, { timeout: navigationTimeout }); @@ -51,22 +58,17 @@ async function checkLinks() { } if ( - pageLink.includes("developers.cloudflare.com/api/operations/") || - pageLink.startsWith("/api/operations/") + pageLink.includes("developers.cloudflare.com/api/resources/") || + pageLink.startsWith("/api/resources/") ) { console.log(`Evaluating link: ${pageLink}`); - await page.goto(pageLink, { + const response = await page.goto(pageLink, { waitUntil: "networkidle0", timeout: navigationTimeout, }); visitedLinks.push(pageLink); - const statusCode = await page.evaluate(() => { - return { - url: window.location.href, - }; - }); - if (statusCode.url === "https://developers.cloudflare.com/api/") { + if (response.status() === 404) { brokenLinks.push(pageLink); } } From a0f23853b6c0fbd6fb4dcb231650e97d7f69c736 Mon Sep 17 00:00:00 2001 From: Pedro Sousa <680496+pedrosousa@users.noreply.github.com> Date: Fri, 3 Jan 2025 12:05:38 +0000 Subject: [PATCH 2/5] Re-add crawler script entry to package.json --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index 38c85fecbe56505..a1c5fbed8932bc8 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,7 @@ "check:astro": "npm run sync && astro check", "check:functions": "npx tsc --noEmit -p ./functions/tsconfig.json", "check:worker": "npx tsc --noEmit -p ./worker/tsconfig.json", + "crawl-api-links": "node bin/crawl-api-links.js", "dev": "npx astro dev", "format": "npm run format:core && npm run format:data", "format:core": "npx prettier --write \"**/*.{js,jsx,ts,tsx,mjs,css}\"", From 785268849416a64dbfea7ecc37f6233a9bc821ba Mon Sep 17 00:00:00 2001 From: Pedro Sousa <680496+pedrosousa@users.noreply.github.com> Date: Fri, 3 Jan 2025 12:10:13 +0000 Subject: [PATCH 3/5] Update sitemap URL --- bin/crawl-api-links.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/crawl-api-links.js b/bin/crawl-api-links.js index f99be4c81b6bdca..05ce9ae0bcda301 100644 --- a/bin/crawl-api-links.js +++ b/bin/crawl-api-links.js @@ -28,7 +28,7 @@ async function checkLinks() { else request.continue(); }); - const sitemapUrl = "https://developers.cloudflare.com/sitemap.xml"; + const sitemapUrl = "https://developers.cloudflare.com/sitemap-0.xml"; await page.goto(sitemapUrl, { timeout: navigationTimeout }); const sitemapLinks = await page.$$eval("url loc", (elements) => From 410d39048d819de758353112f41a0a95a046d999 Mon Sep 17 00:00:00 2001 From: Pedro Sousa <680496+pedrosousa@users.noreply.github.com> Date: Fri, 3 Jan 2025 12:10:54 +0000 Subject: [PATCH 4/5] Handle missing response --- bin/crawl-api-links.js | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/crawl-api-links.js b/bin/crawl-api-links.js index 05ce9ae0bcda301..d06214fb1e93461 100644 --- a/bin/crawl-api-links.js +++ b/bin/crawl-api-links.js @@ -68,8 +68,12 @@ async function checkLinks() { }); visitedLinks.push(pageLink); - if (response.status() === 404) { - brokenLinks.push(pageLink); + if (response) { + if (response.status() === 404) { + brokenLinks.push(pageLink); + } + } else { + console.log("WARNING: Didn't receive a response... skipping."); } } } From 221e05de31f835325227353494a3cc2836e1214d Mon Sep 17 00:00:00 2001 From: Pedro Sousa <680496+pedrosousa@users.noreply.github.com> Date: Fri, 3 Jan 2025 17:40:23 +0000 Subject: [PATCH 5/5] Revert to original sitemap URL (we're checking loc elements in the script) --- bin/crawl-api-links.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/crawl-api-links.js b/bin/crawl-api-links.js index d06214fb1e93461..2e31f9c35f66ca9 100644 --- a/bin/crawl-api-links.js +++ b/bin/crawl-api-links.js @@ -28,7 +28,7 @@ async function checkLinks() { else request.continue(); }); - const sitemapUrl = "https://developers.cloudflare.com/sitemap-0.xml"; + const sitemapUrl = "https://developers.cloudflare.com/sitemap.xml"; await page.goto(sitemapUrl, { timeout: navigationTimeout }); const sitemapLinks = await page.$$eval("url loc", (elements) =>