Skip to content

Commit 264e2dd

Browse files
luandro and claude authored
feat(seo): implement conditional robots.txt and noindex for staging environments (#111)
Prevent staging and PR preview sites from being indexed by search engines while allowing production to be properly crawled. Changes: - Add generate-robots-txt.ts script that creates different robots.txt based on IS_PRODUCTION env var (allow for production, disallow for staging) - Configure Docusaurus noIndex option conditionally (noindex meta tags for staging) - Enable sitemap plugin only for production builds - Update all deployment workflows to set IS_PRODUCTION=true for production only - Add static/robots.txt to .gitignore (generated at build time) This implements the robots.txt strategy from issue #25 to prevent: - Duplicate content issues between staging and production - Users finding outdated/draft content via search - SEO confusion between environments Closes #25 Co-authored-by: Claude <noreply@anthropic.com>
1 parent f980504 commit 264e2dd

File tree

7 files changed

+113
-27
lines changed

7 files changed

+113
-27
lines changed

.github/workflows/deploy-pr-preview.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ jobs:
339339
find static/images -type f 2>/dev/null | wc -l | xargs echo " - Images:"
340340
341341
- name: Build documentation
342+
# IS_PRODUCTION not set - generates noindex meta tags and disallow robots.txt
342343
run: bun run build
343344

344345
- name: Deploy to Cloudflare Pages (PR Preview)

.github/workflows/deploy-production.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ jobs:
101101
run: bun install
102102

103103
- name: Build documentation
104+
env:
105+
# Only set IS_PRODUCTION=true for actual production deployments
106+
# Test deployments should have noindex to prevent search engine indexing
107+
IS_PRODUCTION: ${{ inputs.environment != 'test' && 'true' || 'false' }}
104108
run: bun run build
105109

106110
- name: Deploy to Cloudflare Pages

.github/workflows/deploy-staging.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ jobs:
104104
env:
105105
BASE_URL: /comapeo-docs/
106106
DEFAULT_DOCS_PAGE: ${{ secrets.DEFAULT_DOCS_PAGE }}
107+
# IS_PRODUCTION not set - generates noindex meta tags and disallow robots.txt
107108
run: bun run build
108109

109110
- name: Upload artifact

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ favicon.svg
5959
/i18n/
6060
/static/images/
6161

62+
# Generated robots.txt (created at build time based on IS_PRODUCTION env var)
63+
/static/robots.txt
64+
6265
# Keep .gitkeep files for directory structure
6366
!.gitkeep
6467

docusaurus.config.ts

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,10 @@ const DEFAULT_DOCS_PAGE = resolveDefaultDocsPage(
142142
ALL_DOC_PATHS
143143
);
144144

145+
// Determine if this is a production build (for SEO settings)
146+
// Production allows indexing; staging/preview does not
147+
const isProduction = process.env.IS_PRODUCTION === "true";
148+
145149
// This runs in Node.js - Don't use client-side code here (browser APIs, JSX...)
146150

147151
const config: Config = {
@@ -167,6 +171,10 @@ const config: Config = {
167171

168172
onBrokenLinks: "warn",
169173

174+
// Prevent search engines from indexing staging/preview builds
175+
// Only production (IS_PRODUCTION=true) should be indexed
176+
noIndex: !isProduction,
177+
170178
// Even if you don't use internationalization, you can use this field to set
171179
// useful metadata like html lang. For example, if your site is Chinese, you
172180
// may want to replace "en" with "zh-Hans".
@@ -236,32 +244,6 @@ const config: Config = {
236244
],
237245
},
238246
],
239-
// [
240-
// '@docusaurus/preset-classic',
241-
// {
242-
// sitemap: {
243-
// lastmod: 'date',
244-
// changefreq: 'weekly',
245-
// priority: 0.5,
246-
// ignorePatterns: ['/tags/**'],
247-
// filename: 'sitemap.xml',
248-
// createSitemapItems: async (params) => {
249-
// const { defaultCreateSitemapItems, ...rest } = params;
250-
// const items = await defaultCreateSitemapItems(rest);
251-
// return items.filter((item) => !item.url.includes('/page/'));
252-
// },
253-
// },
254-
// },
255-
// ],
256-
// [
257-
// '@docusaurus/preset-classic',
258-
// {
259-
// gtag: {
260-
// trackingID: 'G-999X9XX9XX',
261-
// anonymizeIP: true,
262-
// },
263-
// },
264-
// ],
265247
[
266248
"@docusaurus/plugin-ideal-image",
267249
{
@@ -295,6 +277,16 @@ const config: Config = {
295277
theme: {
296278
customCss: "./src/css/custom.css",
297279
},
280+
// Enable sitemap for production, disable for staging/preview
281+
sitemap: isProduction
282+
? {
283+
lastmod: "date",
284+
changefreq: "weekly",
285+
priority: 0.5,
286+
ignorePatterns: ["/tags/**"],
287+
filename: "sitemap.xml",
288+
}
289+
: false,
298290
} satisfies Preset.Options,
299291
],
300292
],

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
"lint": "eslint src --ext .js,.ts,.tsx --fix",
1111
"lint:fix": "eslint src --ext .js,.ts,.tsx --fix --fix-type problem,suggestion,layout",
1212
"fix:frontmatter": "bun scripts/fix-frontmatter.ts",
13-
"build": "bun run fix:frontmatter && docusaurus build",
13+
"generate:robots": "bun scripts/generate-robots-txt.ts",
14+
"build": "bun run fix:frontmatter && bun run generate:robots && docusaurus build",
1415
"notion:fetch": "bun scripts/notion-fetch",
1516
"notion:fetch-one": "bun scripts/notion-fetch-one",
1617
"notion:translate": "bun scripts/notion-translate",

scripts/generate-robots-txt.ts

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/**
2+
* Generates robots.txt based on deployment environment.
3+
*
4+
* - Production (IS_PRODUCTION=true): Allow crawling, include sitemap
5+
* - Staging/Preview: Disallow all crawling
6+
*
7+
* The noindex meta tag (configured in docusaurus.config.ts) provides the
8+
* primary protection against indexing. The robots.txt Disallow directive
9+
* provides an additional signal to well-behaved crawlers.
10+
*
11+
* Note: Google recommends allowing crawling so crawlers can see noindex tags.
12+
* However, the Disallow approach is used here as defense-in-depth since:
13+
* 1. Well-behaved crawlers (Google, Bing) respect robots.txt and won't crawl
14+
* 2. For any pages that do get crawled, the noindex meta tag prevents indexing
15+
* 3. This dual approach is commonly used for staging environments
16+
*
17+
* Usage:
18+
* IS_PRODUCTION=true bun scripts/generate-robots-txt.ts
19+
* bun scripts/generate-robots-txt.ts # defaults to staging/disallow
20+
*/
21+
22+
import fs from "node:fs";
23+
import path from "node:path";
24+
import { fileURLToPath } from "node:url";
25+
26+
// Get directory of current script (compatible with Node.js 18+)
27+
const __filename = fileURLToPath(import.meta.url);
28+
const __dirname = path.dirname(__filename);
29+
30+
const STATIC_DIR = path.join(__dirname, "..", "static");
31+
const ROBOTS_PATH = path.join(STATIC_DIR, "robots.txt");
32+
33+
// Production URL for sitemap reference
34+
const PRODUCTION_URL = "https://docs.comapeo.app";
35+
36+
const isProduction = process.env.IS_PRODUCTION === "true";
37+
38+
// Generate appropriate robots.txt content
39+
const generateRobotsTxt = (): string => {
40+
if (isProduction) {
41+
// Production: Allow crawling with sitemap reference
42+
return `# robots.txt for ${PRODUCTION_URL}
43+
# Generated automatically during build
44+
45+
User-agent: *
46+
Allow: /
47+
48+
# Sitemap location
49+
Sitemap: ${PRODUCTION_URL}/sitemap.xml
50+
`;
51+
} else {
52+
// Staging/Preview: Disallow all crawling
53+
// Combined with noindex meta tags for defense-in-depth
54+
return `# robots.txt for staging/preview environment
55+
# Generated automatically during build
56+
# This file prevents search engines from indexing staging content
57+
58+
User-agent: *
59+
Disallow: /
60+
61+
# Note: This is a staging/preview environment.
62+
# Production site is at ${PRODUCTION_URL}
63+
`;
64+
}
65+
};
66+
67+
// Main execution with error handling
68+
try {
69+
// Ensure static directory exists
70+
if (!fs.existsSync(STATIC_DIR)) {
71+
fs.mkdirSync(STATIC_DIR, { recursive: true });
72+
}
73+
74+
// Write robots.txt
75+
const content = generateRobotsTxt();
76+
fs.writeFileSync(ROBOTS_PATH, content, "utf-8");
77+
78+
const envLabel = isProduction ? "production (allow)" : "staging (disallow)";
79+
console.log(`✅ Generated robots.txt for ${envLabel}`);
80+
console.log(` Path: ${ROBOTS_PATH}`);
81+
} catch (error) {
82+
console.error("❌ Failed to generate robots.txt:", error);
83+
process.exit(1);
84+
}

0 commit comments

Comments (0)