Skip to content

Commit 264e2dd

Browse files
luandro and claude authored
feat(seo): implement conditional robots.txt and noindex for staging environments (#111)
Prevent staging and PR preview sites from being indexed by search engines while allowing production to be properly crawled. Changes: - Add generate-robots-txt.ts script that creates different robots.txt based on IS_PRODUCTION env var (allow for production, disallow for staging) - Configure Docusaurus noIndex option conditionally (noindex meta tags for staging) - Enable sitemap plugin only for production builds - Update all deployment workflows to set IS_PRODUCTION=true for production only - Add static/robots.txt to .gitignore (generated at build time) This implements the robots.txt strategy from issue #25 to prevent: - Duplicate content issues between staging and production - Users finding outdated/draft content via search - SEO confusion between environments Closes #25 Co-authored-by: Claude <noreply@anthropic.com>
1 parent f980504 commit 264e2dd

File tree

7 files changed

+113
-27
lines changed

7 files changed

+113
-27
lines changed

.github/workflows/deploy-pr-preview.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ jobs:
339339
find static/images -type f 2>/dev/null | wc -l | xargs echo " - Images:"
340340
341341
- name: Build documentation
342+
# IS_PRODUCTION not set - generates noindex meta tags and disallow robots.txt
342343
run: bun run build
343344

344345
- name: Deploy to Cloudflare Pages (PR Preview)

.github/workflows/deploy-production.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ jobs:
101101
run: bun install
102102

103103
- name: Build documentation
104+
env:
105+
# Only set IS_PRODUCTION=true for actual production deployments
106+
# Test deployments should have noindex to prevent search engine indexing
107+
IS_PRODUCTION: ${{ inputs.environment != 'test' && 'true' || 'false' }}
104108
run: bun run build
105109

106110
- name: Deploy to Cloudflare Pages

.github/workflows/deploy-staging.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ jobs:
104104
env:
105105
BASE_URL: /comapeo-docs/
106106
DEFAULT_DOCS_PAGE: ${{ secrets.DEFAULT_DOCS_PAGE }}
107+
# IS_PRODUCTION not set - generates noindex meta tags and disallow robots.txt
107108
run: bun run build
108109

109110
- name: Upload artifact

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ favicon.svg
5959
/i18n/
6060
/static/images/
6161

62+
# Generated robots.txt (created at build time based on IS_PRODUCTION env var)
63+
/static/robots.txt
64+
6265
# Keep .gitkeep files for directory structure
6366
!.gitkeep
6467

docusaurus.config.ts

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,10 @@ const DEFAULT_DOCS_PAGE = resolveDefaultDocsPage(
142142
ALL_DOC_PATHS
143143
);
144144

145+
// Determine if this is a production build (for SEO settings)
146+
// Production allows indexing; staging/preview does not
147+
const isProduction = process.env.IS_PRODUCTION === "true";
148+
145149
// This runs in Node.js - Don't use client-side code here (browser APIs, JSX...)
146150

147151
const config: Config = {
@@ -167,6 +171,10 @@ const config: Config = {
167171

168172
onBrokenLinks: "warn",
169173

174+
// Prevent search engines from indexing staging/preview builds
175+
// Only production (IS_PRODUCTION=true) should be indexed
176+
noIndex: !isProduction,
177+
170178
// Even if you don't use internationalization, you can use this field to set
171179
// useful metadata like html lang. For example, if your site is Chinese, you
172180
// may want to replace "en" with "zh-Hans".
@@ -236,32 +244,6 @@ const config: Config = {
236244
],
237245
},
238246
],
239-
// [
240-
// '@docusaurus/preset-classic',
241-
// {
242-
// sitemap: {
243-
// lastmod: 'date',
244-
// changefreq: 'weekly',
245-
// priority: 0.5,
246-
// ignorePatterns: ['/tags/**'],
247-
// filename: 'sitemap.xml',
248-
// createSitemapItems: async (params) => {
249-
// const { defaultCreateSitemapItems, ...rest } = params;
250-
// const items = await defaultCreateSitemapItems(rest);
251-
// return items.filter((item) => !item.url.includes('/page/'));
252-
// },
253-
// },
254-
// },
255-
// ],
256-
// [
257-
// '@docusaurus/preset-classic',
258-
// {
259-
// gtag: {
260-
// trackingID: 'G-999X9XX9XX',
261-
// anonymizeIP: true,
262-
// },
263-
// },
264-
// ],
265247
[
266248
"@docusaurus/plugin-ideal-image",
267249
{
@@ -295,6 +277,16 @@ const config: Config = {
295277
theme: {
296278
customCss: "./src/css/custom.css",
297279
},
280+
// Enable sitemap for production, disable for staging/preview
281+
sitemap: isProduction
282+
? {
283+
lastmod: "date",
284+
changefreq: "weekly",
285+
priority: 0.5,
286+
ignorePatterns: ["/tags/**"],
287+
filename: "sitemap.xml",
288+
}
289+
: false,
298290
} satisfies Preset.Options,
299291
],
300292
],

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
"lint": "eslint src --ext .js,.ts,.tsx --fix",
1111
"lint:fix": "eslint src --ext .js,.ts,.tsx --fix --fix-type problem,suggestion,layout",
1212
"fix:frontmatter": "bun scripts/fix-frontmatter.ts",
13-
"build": "bun run fix:frontmatter && docusaurus build",
13+
"generate:robots": "bun scripts/generate-robots-txt.ts",
14+
"build": "bun run fix:frontmatter && bun run generate:robots && docusaurus build",
1415
"notion:fetch": "bun scripts/notion-fetch",
1516
"notion:fetch-one": "bun scripts/notion-fetch-one",
1617
"notion:translate": "bun scripts/notion-translate",

scripts/generate-robots-txt.ts

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/**
2+
* Generates robots.txt based on deployment environment.
3+
*
4+
* - Production (IS_PRODUCTION=true): Allow crawling, include sitemap
5+
* - Staging/Preview: Disallow all crawling
6+
*
7+
* The noindex meta tag (configured in docusaurus.config.ts) provides the
8+
* primary protection against indexing. The robots.txt Disallow directive
9+
* provides an additional signal to well-behaved crawlers.
10+
*
11+
* Note: Google recommends allowing crawling so crawlers can see noindex tags.
12+
* However, the Disallow approach is used here as defense-in-depth since:
13+
* 1. Well-behaved crawlers (Google, Bing) respect robots.txt and won't crawl
14+
* 2. For any pages that do get crawled, the noindex meta tag prevents indexing
15+
* 3. This dual approach is commonly used for staging environments
16+
*
17+
* Usage:
18+
* IS_PRODUCTION=true bun scripts/generate-robots-txt.ts
19+
* bun scripts/generate-robots-txt.ts # defaults to staging/disallow
20+
*/
21+
22+
import fs from "node:fs";
23+
import path from "node:path";
24+
import { fileURLToPath } from "node:url";
25+
26+
// Get directory of current script (compatible with Node.js 18+)
27+
const __filename = fileURLToPath(import.meta.url);
28+
const __dirname = path.dirname(__filename);
29+
30+
const STATIC_DIR = path.join(__dirname, "..", "static");
31+
const ROBOTS_PATH = path.join(STATIC_DIR, "robots.txt");
32+
33+
// Production URL for sitemap reference
34+
const PRODUCTION_URL = "https://docs.comapeo.app";
35+
36+
const isProduction = process.env.IS_PRODUCTION === "true";
37+
38+
// Generate appropriate robots.txt content
39+
const generateRobotsTxt = (): string => {
40+
if (isProduction) {
41+
// Production: Allow crawling with sitemap reference
42+
return `# robots.txt for ${PRODUCTION_URL}
43+
# Generated automatically during build
44+
45+
User-agent: *
46+
Allow: /
47+
48+
# Sitemap location
49+
Sitemap: ${PRODUCTION_URL}/sitemap.xml
50+
`;
51+
} else {
52+
// Staging/Preview: Disallow all crawling
53+
// Combined with noindex meta tags for defense-in-depth
54+
return `# robots.txt for staging/preview environment
55+
# Generated automatically during build
56+
# This file prevents search engines from indexing staging content
57+
58+
User-agent: *
59+
Disallow: /
60+
61+
# Note: This is a staging/preview environment.
62+
# Production site is at ${PRODUCTION_URL}
63+
`;
64+
}
65+
};
66+
67+
// Main execution with error handling
68+
try {
69+
// Ensure static directory exists
70+
if (!fs.existsSync(STATIC_DIR)) {
71+
fs.mkdirSync(STATIC_DIR, { recursive: true });
72+
}
73+
74+
// Write robots.txt
75+
const content = generateRobotsTxt();
76+
fs.writeFileSync(ROBOTS_PATH, content, "utf-8");
77+
78+
const envLabel = isProduction ? "production (allow)" : "staging (disallow)";
79+
console.log(`✅ Generated robots.txt for ${envLabel}`);
80+
console.log(` Path: ${ROBOTS_PATH}`);
81+
} catch (error) {
82+
console.error("❌ Failed to generate robots.txt:", error);
83+
process.exit(1);
84+
}

0 commit comments

Comments (0)