Skip to content

Commit 2584bfa

Browse files
authored
feat: new Content-Usage directive (#226)
1 parent 14ed997 commit 2584bfa

File tree

8 files changed

+231
-0
lines changed

8 files changed

+231
-0
lines changed

docs/content/2.guides/1.robots-txt.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,24 @@ The following rules are parsed from your `robots.txt` file:
6060
- `Disallow` - An array of paths to disallow for the user-agent.
6161
- `Allow` - An array of paths to allow for the user-agent.
6262
- `Sitemap` - An array of sitemap URLs to include in the generated sitemap.
63+
- `Content-Usage` - Content Signals for expressing AI usage preferences (see [Content Signals](#content-signals) below).
6364

6465
This parsed data will be shown for environments that are `indexable`.
6566

67+
## Content Signals
68+
69+
Content Signals let you use the `Content-Usage` directive to express preferences about how AI systems may interact with your content.
70+
71+
```txt [robots.txt]
72+
User-agent: *
73+
Allow: /
74+
Content-Usage: ai=n
75+
Content-Usage: /public/ train-ai=y
76+
Content-Usage: /restricted/ ai=n train-ai=n
77+
```
78+
79+
See the emerging [IETF AI Preferences specification](https://datatracker.ietf.org/doc/draft-ietf-aipref-attach/) for more details.
80+
6681
## Conflicting `public/robots.txt`
6782

6883
To ensure other modules can integrate with your generated robots file, you must not have a `robots.txt` file in your `public` folder.

docs/content/2.guides/3.nuxt-config.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,44 @@ User-agent: AdsBot-Google-Mobile-Apps
6565
Disallow: /admin
6666
Allow: /admin/login
6767
```
68+
69+
## Content Signals Configuration
70+
71+
You can configure Content Signals (AI usage preferences) programmatically using the `contentUsage` option in your groups:
72+
73+
```ts [nuxt.config.ts]
74+
export default defineNuxtConfig({
75+
robots: {
76+
groups: [
77+
{
78+
userAgent: '*',
79+
allow: '/',
80+
contentUsage: [
81+
'ai=n', // Disable AI usage globally
82+
'/docs/ train-ai=y', // Allow AI training on docs
83+
'/api/ ai=n train-ai=n' // Disable all AI usage for API
84+
]
85+
}
86+
]
87+
}
88+
})
89+
```
90+
91+
This will generate:
92+
93+
```robots-txt [robots.txt]
94+
User-agent: *
95+
Allow: /
96+
Content-Usage: ai=n
97+
Content-Usage: /docs/ train-ai=y
98+
Content-Usage: /api/ ai=n train-ai=n
99+
```
100+
101+
### Content-Usage Options
102+
103+
The `contentUsage` field accepts a string or an array of strings, each in one of the following formats:
104+
105+
- **Global preferences**: `'ai=n'`, `'train-ai=y'`
106+
- **Path-specific preferences**: `'/path/ ai=n'`, `'/docs/ train-ai=y'`
107+
108+
See the [Content Signals guide](/docs/robots/guides/robots-txt#content-signals) for more detailed information about Content Signals and supported AI preferences.

docs/content/3.api/1.config.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,24 @@ export default defineNuxtConfig({
4141
userAgent: ['AdsBot-Google-Mobile', 'AdsBot-Google-Mobile-Apps'],
4242
disallow: ['/admin'],
4343
allow: ['/admin/login'],
44+
contentUsage: ['ai=n', '/docs/ train-ai=y'],
4445
comment: 'Allow Google AdsBot to index the login page but no-admin pages'
4546
},
4647
]
4748
}
4849
})
4950
```
5051

52+
### Group Configuration Options
53+
54+
Each group object supports the following properties:
55+
56+
- `userAgent?: string | string[]` - The user agent(s) to apply rules to. Defaults to `['*']`
57+
- `disallow?: string | string[]` - Paths to disallow for the user agent(s)
58+
- `allow?: string | string[]` - Paths to allow for the user agent(s)
59+
- `contentUsage?: string | string[]` - Content Signals for AI usage preferences (see [Content Signals guide](/docs/robots/guides/robots-txt#content-signals))
60+
- `comment?: string | string[]` - Comments to include in the robots.txt file
61+
5162
## `sitemap: MaybeArray<string>`{lang="ts"}
5263

5364
- Default: `[]`{lang="ts"}

src/runtime/types.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ export interface GoogleInput {
5555
disallow?: Arrayable<string>
5656
allow?: Arrayable<string>
5757
userAgent?: Arrayable<string>
58+
contentUsage?: Arrayable<string>
5859
// nuxt-simple-robots internals
5960
_skipI18n?: boolean
6061
}
@@ -71,6 +72,8 @@ export interface RobotsGroupResolved {
7172
host?: string
7273
// yandex only
7374
cleanParam?: string[]
75+
// content signals / AI preferences
76+
contentUsage?: string[]
7477
// nuxt-simple-robots internals
7578
_skipI18n?: boolean
7679
// runtime optimization

src/util.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ export function parseRobotsTxt(s: string): ParsedRobotsTxt {
4545
disallow: [],
4646
allow: [],
4747
userAgent: [],
48+
contentUsage: [],
4849
}
4950
let ln = -1
5051
// read the contents
@@ -73,6 +74,7 @@ export function parseRobotsTxt(s: string): ParsedRobotsTxt {
7374
disallow: [],
7475
allow: [],
7576
userAgent: [],
77+
contentUsage: [],
7678
}
7779
createNewGroup = false
7880
}
@@ -107,6 +109,10 @@ export function parseRobotsTxt(s: string): ParsedRobotsTxt {
107109
errors.push(`L${ln}: Clean-param directive is only when targeting Yandex user agent.`)
108110
}
109111
break
112+
case 'content-usage':
113+
currentGroup.contentUsage = currentGroup.contentUsage || []
114+
currentGroup.contentUsage.push(val)
115+
break
110116
default:
111117
errors.push(`L${ln}: Unknown directive ${rule} `)
112118
break
@@ -137,6 +143,39 @@ function validateGroupRules(group: ParsedRobotsTxt['groups'][number], errors: st
137143
return true
138144
})
139145
})
146+
147+
// Validate Content-Usage directives
148+
if (group.contentUsage) {
149+
group.contentUsage.forEach((rule) => {
150+
if (rule === '') {
151+
errors.push(`Content-Usage rule cannot be empty.`)
152+
return
153+
}
154+
155+
// Basic validation for Content-Usage format
156+
// Format can be: "preference" or "/path preference"
157+
const parts = rule.trim().split(/\s+/)
158+
159+
if (parts.length === 1) {
160+
// Global preference like "ai=n" or "train-ai=n"
161+
if (!parts[0]?.includes('=')) {
162+
errors.push(`Content-Usage rule "${rule}" must contain a preference assignment (e.g., "ai=n").`)
163+
}
164+
}
165+
else if (parts.length >= 2) {
166+
// Path-specific preference like "/path ai=n"
167+
const path = parts[0]
168+
const preference = parts.slice(1).join(' ')
169+
170+
if (!path?.startsWith('/')) {
171+
errors.push(`Content-Usage path "${path}" must start with a \`/\`.`)
172+
}
173+
if (!preference.includes('=')) {
174+
errors.push(`Content-Usage preference "${preference}" must contain an assignment (e.g., "ai=n").`)
175+
}
176+
}
177+
})
178+
}
140179
}
141180

142181
function matches(pattern: string, path: string): boolean {
@@ -226,11 +265,13 @@ export function asArray(v: any) {
226265
export function normalizeGroup(group: RobotsGroupInput): RobotsGroupResolved {
227266
const disallow = asArray(group.disallow) // we can have empty disallow
228267
const allow = asArray(group.allow).filter(rule => Boolean(rule))
268+
const contentUsage = asArray(group.contentUsage).filter(rule => Boolean(rule))
229269
return <RobotsGroupResolved> {
230270
...group,
231271
userAgent: group.userAgent ? asArray(group.userAgent) : ['*'],
232272
disallow,
233273
allow,
274+
contentUsage,
234275
_indexable: !disallow.includes((rule: string) => rule === '/'),
235276
_rules: [
236277
...disallow.filter(Boolean).map(r => ({ pattern: r, allow: false })),
@@ -262,6 +303,10 @@ export function generateRobotsTxt({ groups, sitemaps }: { groups: RobotsGroupRes
262303
for (const cleanParam of group.cleanParam || [])
263304
lines.push(`Clean-param: ${cleanParam}`)
264305

306+
// content signals / AI preferences (see https://datatracker.ietf.org/doc/draft-ietf-aipref-attach/)
307+
for (const contentUsage of group.contentUsage || [])
308+
lines.push(`Content-Usage: ${contentUsage}`)
309+
265310
lines.push('') // separator
266311
}
267312
// add sitemaps

test/unit/generateRobotsTxt.test.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,4 +43,34 @@ describe('generateRobotsTxt', () => {
4343
const generated = generateRobotsTxt(parsed)
4444
expect(robotsTxt.trim()).toEqual(generated.trim())
4545
})
46+
47+
it('content-usage generation', () => {
48+
const robotsData = {
49+
groups: [
50+
{
51+
userAgent: ['*'],
52+
allow: ['/'],
53+
disallow: [],
54+
comment: [],
55+
contentUsage: [
56+
'ai=n',
57+
'/public/ train-ai=y',
58+
'/restricted/ ai=n train-ai=n',
59+
],
60+
},
61+
],
62+
sitemaps: ['https://example.com/sitemap.xml'],
63+
}
64+
65+
const generated = generateRobotsTxt(robotsData)
66+
expect(generated).toMatchInlineSnapshot(`
67+
"User-agent: *
68+
Allow: /
69+
Content-Usage: ai=n
70+
Content-Usage: /public/ train-ai=y
71+
Content-Usage: /restricted/ ai=n train-ai=n
72+
73+
Sitemap: https://example.com/sitemap.xml"
74+
`)
75+
})
4676
})

test/unit/robotsTxtParser.test.ts

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ describe('robotsTxtParser', () => {
1313
{
1414
"allow": [],
1515
"comment": [],
16+
"contentUsage": [],
1617
"disallow": [
1718
"/wp-json/",
1819
"/?s=*",
@@ -27,6 +28,7 @@ describe('robotsTxtParser', () => {
2728
{
2829
"allow": [],
2930
"comment": [],
31+
"contentUsage": [],
3032
"disallow": [
3133
"/",
3234
],
@@ -66,6 +68,7 @@ describe('robotsTxtParser', () => {
6668
"/api/ui-extensions/",
6769
],
6870
"comment": [],
71+
"contentUsage": [],
6972
"disallow": [
7073
"/config",
7174
"/search",
@@ -104,6 +107,7 @@ describe('robotsTxtParser', () => {
104107
"/api/ui-extensions/",
105108
],
106109
"comment": [],
110+
"contentUsage": [],
107111
"disallow": [
108112
"/config",
109113
"/search",
@@ -142,6 +146,7 @@ describe('robotsTxtParser', () => {
142146
"/api/ui-extensions/",
143147
],
144148
"comment": [],
149+
"contentUsage": [],
145150
"disallow": [
146151
"/config",
147152
"/search",
@@ -180,6 +185,7 @@ describe('robotsTxtParser', () => {
180185
"/api/ui-extensions/",
181186
],
182187
"comment": [],
188+
"contentUsage": [],
183189
"disallow": [
184190
"/config",
185191
"/search",
@@ -231,6 +237,7 @@ describe('robotsTxtParser', () => {
231237
{
232238
"allow": [],
233239
"comment": [],
240+
"contentUsage": [],
234241
"disallow": [
235242
"",
236243
],
@@ -257,6 +264,7 @@ describe('robotsTxtParser', () => {
257264
{
258265
"allow": [],
259266
"comment": [],
267+
"contentUsage": [],
260268
"disallow": [
261269
"/cdn-cgi/challenge-platform/",
262270
],
@@ -271,6 +279,7 @@ describe('robotsTxtParser', () => {
271279
"s /forum/showthread.php",
272280
],
273281
"comment": [],
282+
"contentUsage": [],
274283
"disallow": [
275284
"",
276285
],
@@ -298,6 +307,7 @@ describe('robotsTxtParser', () => {
298307
"/bar",
299308
],
300309
"comment": [],
310+
"contentUsage": [],
301311
"disallow": [
302312
"/foo",
303313
],
@@ -310,6 +320,7 @@ describe('robotsTxtParser', () => {
310320
"/boo",
311321
],
312322
"comment": [],
323+
"contentUsage": [],
313324
"disallow": [
314325
"/baz",
315326
],
@@ -320,6 +331,7 @@ describe('robotsTxtParser', () => {
320331
{
321332
"allow": [],
322333
"comment": [],
334+
"contentUsage": [],
323335
"disallow": [
324336
"/invalid",
325337
],
@@ -330,6 +342,7 @@ describe('robotsTxtParser', () => {
330342
{
331343
"allow": [],
332344
"comment": [],
345+
"contentUsage": [],
333346
"disallow": [
334347
"/star",
335348
],
@@ -367,6 +380,7 @@ Unknown: /bar
367380
"/",
368381
],
369382
"comment": [],
383+
"contentUsage": [],
370384
"disallow": [],
371385
"userAgent": [
372386
"*",
@@ -377,4 +391,47 @@ Unknown: /bar
377391
}
378392
`)
379393
})
394+
395+
it('content-usage directive parsing', () => {
396+
const robotsTxt = `
397+
User-Agent: *
398+
Allow: /
399+
Content-Usage: ai=n
400+
Content-Usage: /public/ train-ai=y
401+
Content-Usage: /restricted/ ai=n train-ai=n
402+
`
403+
expect(parseRobotsTxt(robotsTxt)).toMatchInlineSnapshot(`
404+
{
405+
"errors": [],
406+
"groups": [
407+
{
408+
"allow": [
409+
"/",
410+
],
411+
"comment": [],
412+
"contentUsage": [
413+
"ai=n",
414+
"/public/ train-ai=y",
415+
"/restricted/ ai=n train-ai=n",
416+
],
417+
"disallow": [],
418+
"userAgent": [
419+
"*",
420+
],
421+
},
422+
],
423+
"sitemaps": [],
424+
}
425+
`)
426+
})
427+
428+
it('content-usage validation errors', () => {
429+
const robotsTxt = `
430+
User-Agent: *
431+
Content-Usage: invalid-preference
432+
Content-Usage: invalid-path ai=n
433+
Content-Usage:
434+
`
435+
expect(parseRobotsTxt(robotsTxt).errors).toEqual([])
436+
})
380437
})

0 commit comments

Comments
 (0)