Skip to content

Commit a8d44c6

Browse files
authored
SUP-611 Serve real robots.txt and sitemap.xml; disallow /search (#89)
* SUP-611 Fix robots.txt/sitemap.xml and noindex /search
* SUP-611 Harden robots/sitemap and revert search meta override
* SUP-611 Fix SEO title fallback; normalize site URL
* SUP-611 Revert SEO changes
* SUP-611 Adjust robots rules for noindex; escape sitemap XML
* SUP-611 Keep changes focused: disallow /search via robots; drop X-Robots-Tag
* Apply suggestion from @redreceipt
* SUP-611 Cache robots/sitemap; remove volatile lastmod
1 parent 040aa66 commit a8d44c6

File tree

2 files changed

+104
-0
lines changed

2 files changed

+104
-0
lines changed

pages/robots.txt.js

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/**
 * Resolve the canonical site origin used to build absolute URLs.
 *
 * Reads NEXT_PUBLIC_SITE_URL when set, falling back to the production
 * domain, and strips any trailing slashes so callers can append paths.
 *
 * @returns {string} Canonical origin without a trailing slash.
 */
function getCanonicalSiteUrl() {
  const configured = process.env.NEXT_PUBLIC_SITE_URL;
  let origin = configured || 'https://longhollow.com';
  while (origin.endsWith('/')) {
    origin = origin.slice(0, -1);
  }
  return origin;
}
5+
6+
/**
 * Serve a real robots.txt as plain text.
 *
 * Without this route, requests for /robots.txt fall through to
 * `pages/[slug].js` and come back as HTML, which crawlers may
 * misinterpret.
 *
 * @param {{ req: object, res: object }} ctx - Next.js SSR context.
 * @returns {Promise<{ props: {} }>} Empty props; the response body has
 *   already been written directly to `res`.
 */
export async function getServerSideProps({ req, res }) {
  void req;

  const baseUrl = getCanonicalSiteUrl();

  // `/search` is server-rendered and expensive; disallowing it reduces
  // crawl traffic from well-behaved crawlers.
  const directives = [
    'User-agent: *',
    'Disallow: /search',
    '',
    `Sitemap: ${baseUrl}/sitemap.xml`,
    '',
  ];

  res.setHeader('Content-Type', 'text/plain; charset=utf-8');
  // Cache at the edge; this content is effectively static between deploys.
  res.setHeader(
    'Cache-Control',
    'public, max-age=0, s-maxage=86400, stale-while-revalidate=604800'
  );
  res.write(directives.join('\n'));
  res.end();

  return { props: {} };
}
32+
33+
/**
 * Page component for /robots.txt. Renders nothing: the response body is
 * fully written inside `getServerSideProps`, so React has nothing to emit.
 */
const RobotsTxt = () => null;

export default RobotsTxt;

pages/sitemap.xml.js

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/**
 * Resolve the canonical site origin for sitemap URLs.
 *
 * Deliberately ignores Host/X-Forwarded-* request headers so that
 * attacker-controlled values are never reflected into the XML; only the
 * configured canonical origin (or the production default) is used.
 *
 * @returns {string} Canonical origin without a trailing slash.
 */
function getCanonicalSiteUrl() {
  let origin = process.env.NEXT_PUBLIC_SITE_URL || 'https://longhollow.com';
  while (origin.endsWith('/')) {
    origin = origin.slice(0, -1);
  }
  return origin;
}
7+
8+
/**
 * Escape the five XML special characters so `value` can be safely embedded
 * in element content or attribute values.
 *
 * The ampersand MUST be replaced first; otherwise the `&` introduced by
 * the other entity replacements would itself be double-escaped.
 *
 * @param {*} value - Coerced to a string before escaping.
 * @returns {string} XML-safe text.
 */
function xmlEscape(value) {
  return String(value)
    .replace(/&/g, '&amp;') // bug fix: was `.replace(/&/g, '&')`, a no-op that left '&' unescaped
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');
}
16+
17+
/**
 * Render a complete sitemap document for the given paths.
 *
 * @param {{ baseUrl: string, urls: string[] }} opts - `baseUrl` is the
 *   canonical origin (no trailing slash); `urls` are site-relative paths
 *   beginning with '/'.
 * @returns {string} Full XML document text, terminated by a newline.
 */
function buildSitemapXml({ baseUrl, urls }) {
  const urlBlocks = [];
  for (const path of urls) {
    const loc = xmlEscape(`${baseUrl}${path}`);
    urlBlocks.push(`  <url>\n    <loc>${loc}</loc>\n  </url>`);
  }

  const lines = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    urlBlocks.join('\n'),
    '</urlset>',
    '',
  ];
  return lines.join('\n');
}
37+
38+
/**
 * Serve a real sitemap.xml.
 *
 * Without this route, requests for /sitemap.xml fall through to
 * `pages/[slug].js` and return HTML with a 200 status, which breaks
 * crawler behavior.
 *
 * @param {{ req: object, res: object }} ctx - Next.js SSR context.
 * @returns {Promise<{ props: {} }>} Empty props; the response body has
 *   already been written directly to `res`.
 */
export async function getServerSideProps({ req, res }) {
  void req;

  // Conservative, hand-curated list: core landing pages only. The rest of
  // the site is CMS-driven and would need an explicit "list all slugs"
  // query to be complete.
  const corePaths = [
    '/',
    '/about',
    '/connect',
    '/next-steps',
    '/watch',
    '/privacy-policy',
    '/terms-of-use',
  ];

  const xml = buildSitemapXml({
    baseUrl: getCanonicalSiteUrl(),
    urls: corePaths,
  });

  res.setHeader('Content-Type', 'application/xml; charset=utf-8');
  // Cache at the edge; this content is effectively static between deploys.
  res.setHeader(
    'Cache-Control',
    'public, max-age=0, s-maxage=86400, stale-while-revalidate=604800'
  );
  res.write(xml);
  res.end();

  return { props: {} };
}
66+
67+
/**
 * Page component for /sitemap.xml. Renders nothing: the response body is
 * fully written inside `getServerSideProps`, so React has nothing to emit.
 */
const SitemapXml = () => null;

export default SitemapXml;

0 commit comments

Comments (0)