Skip to content

Commit a8d44c6

Browse files
authored
SUP-611 Serve real robots.txt and sitemap.xml; disallow /search (#89)
* SUP-611 Fix robots.txt/sitemap.xml and noindex /search
* SUP-611 Harden robots/sitemap and revert search meta override
* SUP-611 Fix SEO title fallback; normalize site URL
* SUP-611 Revert SEO changes
* SUP-611 Adjust robots rules for noindex; escape sitemap XML
* SUP-611 Keep changes focused: disallow /search via robots; drop X-Robots-Tag
* Apply suggestion from @redreceipt
* SUP-611 Cache robots/sitemap; remove volatile lastmod
1 parent 040aa66 commit a8d44c6

File tree

2 files changed

+104
-0
lines changed

2 files changed

+104
-0
lines changed

pages/robots.txt.js

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/**
 * Resolve the canonical site origin used to build absolute URLs.
 *
 * Reads NEXT_PUBLIC_SITE_URL when set, falling back to the production
 * domain, and strips any trailing slashes so callers can append paths.
 *
 * @returns {string} Canonical origin without a trailing slash.
 */
function getCanonicalSiteUrl() {
  const configured = process.env.NEXT_PUBLIC_SITE_URL;
  let origin = configured || 'https://longhollow.com';
  while (origin.endsWith('/')) {
    origin = origin.slice(0, -1);
  }
  return origin;
}
5+
6+
/**
 * Serve a real robots.txt as plain text.
 *
 * Without this route, requests for /robots.txt fall through to
 * `pages/[slug].js` and come back as HTML, which crawlers may
 * misinterpret.
 *
 * @param {{ req: object, res: object }} ctx - Next.js SSR context.
 * @returns {Promise<{ props: {} }>} Empty props; the response body has
 *   already been written directly to `res`.
 */
export async function getServerSideProps({ req, res }) {
  void req;

  const baseUrl = getCanonicalSiteUrl();

  // `/search` is server-rendered and expensive; disallowing it reduces
  // crawl traffic from well-behaved crawlers.
  const directives = [
    'User-agent: *',
    'Disallow: /search',
    '',
    `Sitemap: ${baseUrl}/sitemap.xml`,
    '',
  ];

  res.setHeader('Content-Type', 'text/plain; charset=utf-8');
  // Cache at the edge; this content is effectively static between deploys.
  res.setHeader(
    'Cache-Control',
    'public, max-age=0, s-maxage=86400, stale-while-revalidate=604800'
  );
  res.write(directives.join('\n'));
  res.end();

  return { props: {} };
}
32+
33+
/**
 * Page component for /robots.txt. Renders nothing: the response body is
 * fully written inside `getServerSideProps`, so React has nothing to emit.
 */
const RobotsTxt = () => null;

export default RobotsTxt;

pages/sitemap.xml.js

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/**
 * Resolve the canonical site origin for sitemap URLs.
 *
 * Deliberately ignores Host/X-Forwarded-* request headers so that
 * attacker-controlled values are never reflected into the XML; only the
 * configured canonical origin (or the production default) is used.
 *
 * @returns {string} Canonical origin without a trailing slash.
 */
function getCanonicalSiteUrl() {
  let origin = process.env.NEXT_PUBLIC_SITE_URL || 'https://longhollow.com';
  while (origin.endsWith('/')) {
    origin = origin.slice(0, -1);
  }
  return origin;
}
7+
8+
/**
 * Escape the five XML special characters so `value` can be safely embedded
 * in element content or attribute values.
 *
 * The ampersand MUST be replaced first; otherwise the `&` introduced by
 * the other entity replacements would itself be double-escaped.
 *
 * @param {*} value - Coerced to a string before escaping.
 * @returns {string} XML-safe text.
 */
function xmlEscape(value) {
  return String(value)
    .replace(/&/g, '&amp;') // bug fix: was `.replace(/&/g, '&')`, a no-op that left '&' unescaped
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');
}
16+
17+
/**
 * Render a complete sitemap document for the given paths.
 *
 * @param {{ baseUrl: string, urls: string[] }} opts - `baseUrl` is the
 *   canonical origin (no trailing slash); `urls` are site-relative paths
 *   beginning with '/'.
 * @returns {string} Full XML document text, terminated by a newline.
 */
function buildSitemapXml({ baseUrl, urls }) {
  const urlBlocks = [];
  for (const path of urls) {
    const loc = xmlEscape(`${baseUrl}${path}`);
    urlBlocks.push(`  <url>\n    <loc>${loc}</loc>\n  </url>`);
  }

  const lines = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    urlBlocks.join('\n'),
    '</urlset>',
    '',
  ];
  return lines.join('\n');
}
37+
38+
/**
 * Serve a real sitemap.xml.
 *
 * Without this route, requests for /sitemap.xml fall through to
 * `pages/[slug].js` and return HTML with a 200 status, which breaks
 * crawler behavior.
 *
 * @param {{ req: object, res: object }} ctx - Next.js SSR context.
 * @returns {Promise<{ props: {} }>} Empty props; the response body has
 *   already been written directly to `res`.
 */
export async function getServerSideProps({ req, res }) {
  void req;

  // Conservative, hand-curated list: core landing pages only. The rest of
  // the site is CMS-driven and would need an explicit "list all slugs"
  // query to be complete.
  const corePaths = [
    '/',
    '/about',
    '/connect',
    '/next-steps',
    '/watch',
    '/privacy-policy',
    '/terms-of-use',
  ];

  const xml = buildSitemapXml({
    baseUrl: getCanonicalSiteUrl(),
    urls: corePaths,
  });

  res.setHeader('Content-Type', 'application/xml; charset=utf-8');
  // Cache at the edge; this content is effectively static between deploys.
  res.setHeader(
    'Cache-Control',
    'public, max-age=0, s-maxage=86400, stale-while-revalidate=604800'
  );
  res.write(xml);
  res.end();

  return { props: {} };
}
66+
67+
/**
 * Page component for /sitemap.xml. Renders nothing: the response body is
 * fully written inside `getServerSideProps`, so React has nothing to emit.
 */
const SitemapXml = () => null;

export default SitemapXml;

0 commit comments

Comments (0)