Skip to content

Commit 5639932

Browse files
dslovinsky and claude authored
fix: crawl sitemap index to resolve actual page URLs (#1031)
The new docs site uses a sitemapindex at sitemap.xml that references child sitemaps (sitemap-0.xml, etc.) instead of listing pages directly. This recursively follows the index to collect all page URLs. Co-authored-by: dslovinsky <dslovinsky@users.noreply.github.com> Co-authored-by: Claude <noreply@anthropic.com>
1 parent e3a5873 commit 5639932

File tree

1 file changed

+39
-16
lines changed

1 file changed

+39
-16
lines changed

scripts/generate-metadata.ts

Lines changed: 39 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
// Cursor-generated
21
import * as fs from "fs";
32
import * as path from "path";
43

@@ -7,6 +6,43 @@ const OUTPUT_FILE = path.join(API_SPECS_DIR, "metadata.json");
76
const API_SPECS_URL = "https://dev-docs.alchemy.com";
87
const DOCS_URL = "https://www.alchemy.com/docs";
98

9+
/**
 * Extracts the text content of every `<loc>` element in a sitemap document.
 *
 * Each value is trimmed and has the five predefined XML entities decoded
 * (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`), because the sitemap protocol
 * requires URLs to be entity-escaped inside the XML. Empty `<loc>` elements
 * are skipped.
 *
 * @param xml - Raw XML text of a sitemap or sitemap index.
 * @returns The decoded `<loc>` values in document order; an empty array when
 *   the document contains none.
 */
function extractLocs(xml: string): string[] {
  const xmlEntities: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
  };

  // `[\s\S]` rather than `.` so a value wrapped onto its own line still matches.
  const tags = xml.match(/<loc>[\s\S]*?<\/loc>/g) ?? [];

  return tags
    .map((tag) =>
      tag
        .replace(/<\/?loc>/g, "")
        .trim()
        // Single pass, so "&amp;lt;" decodes to "&lt;" and is not decoded twice.
        .replace(
          /&(amp|lt|gt|quot|apos);/g,
          (entity: string, name: string) => xmlEntities[name] ?? entity,
        ),
    )
    .filter((loc) => loc.length > 0);
}
16+
17+
/**
 * Reports whether the given XML text contains an opening `<sitemapindex` tag.
 *
 * @param xml - Raw XML text of a fetched sitemap document.
 * @returns `true` when the substring `<sitemapindex` occurs anywhere in `xml`.
 */
function isSitemapIndex(xml: string): boolean {
  const indexRootMarker = "<sitemapindex";
  return xml.indexOf(indexRootMarker) !== -1;
}
20+
21+
/**
 * Fetches a sitemap and returns the URLs it lists, following sitemap indexes
 * recursively.
 *
 * If the fetched document is a sitemap index, every child sitemap it
 * references is fetched in parallel and the results are concatenated in
 * index order. Otherwise the document's own `<loc>` values are returned.
 *
 * @param sitemapUrl - Absolute URL of the sitemap or sitemap index to fetch.
 * @param visited - Sitemap URLs already requested during this crawl. Callers
 *   normally omit it; it is threaded through recursive calls so a
 *   self-referencing or cyclic index cannot recurse forever and a sitemap
 *   listed twice is only fetched once.
 * @returns The collected URLs; an empty array when `sitemapUrl` was already
 *   visited.
 * @throws Error when a request returns a non-OK HTTP status.
 */
async function fetchSitemapUrls(
  sitemapUrl: string,
  visited: Set<string> = new Set<string>(),
): Promise<string[]> {
  // Checked and recorded before the first await, so parallel sibling calls
  // sharing this set cannot both start fetching the same sitemap.
  if (visited.has(sitemapUrl)) {
    console.warn(`Skipping already-visited sitemap ${sitemapUrl}`);
    return [];
  }
  visited.add(sitemapUrl);

  const response = await fetch(sitemapUrl);
  if (!response.ok) {
    throw new Error(
      `Failed to fetch ${sitemapUrl}: ${response.status} ${response.statusText}`,
    );
  }

  const xml = await response.text();

  if (!isSitemapIndex(xml)) {
    return extractLocs(xml);
  }

  // In a sitemap index the <loc> values point at child sitemaps, not pages.
  const childSitemapUrls = extractLocs(xml);
  console.info(
    `Found sitemap index with ${childSitemapUrls.length} sitemap(s)`,
  );

  const results = await Promise.all(
    childSitemapUrls.map((url) => fetchSitemapUrls(url, visited)),
  );
  return results.flat();
}
45+
1046
(async () => {
1147
try {
1248
const files: string[] = [];
@@ -27,22 +63,9 @@ const DOCS_URL = "https://www.alchemy.com/docs";
2763

2864
traverse(API_SPECS_DIR);
2965

30-
// Fetch and parse sitemap
31-
const sitemapResponse = await fetch(`${DOCS_URL}/sitemap.xml`);
32-
33-
if (!sitemapResponse.ok) {
34-
throw new Error(`Failed to fetch sitemap: ${sitemapResponse.statusText}`);
35-
}
36-
37-
const sitemapXml = await sitemapResponse.text();
38-
39-
// Extract URLs using regex and remove host
40-
const urls =
41-
sitemapXml
42-
.match(/<loc>(.*?)<\/loc>/g)
43-
?.map((url) => url.replace(/<\/?loc>/g, "")) || [];
66+
const urls = await fetchSitemapUrls(`${DOCS_URL}/sitemap.xml`);
67+
console.info(`Collected ${urls.length} page URL(s) from sitemap`);
4468

45-
// Write to file
4669
fs.writeFileSync(OUTPUT_FILE, JSON.stringify({ files, urls }, null, 2));
4770
console.info(`Successfully generated metadata file at ${OUTPUT_FILE}`);
4871
} catch (error) {

0 commit comments

Comments
 (0)