Skip to content

Commit 2da0125

Browse files
Implements the Content Signals extension of robots.txt
Signed-off-by: Jon Stovell <jonstovell@gmail.com>
1 parent dc051c1 commit 2da0125

File tree

4 files changed

+178
-74
lines changed

4 files changed

+178
-74
lines changed

Languages/en_US/Help.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,10 @@
587587
</li>
588588
</ul>';
589589
$helptxt['robots_txt'] = 'The robots.txt file is used to implement the <a href="https://www.rfc-editor.org/rfc/rfc9309.html" class="bbc_link">Robots Exclusion Protocol</a>, a standard used by websites to indicate to search engine spiders and other web robots which portions of the website they are allowed to visit. This file is typically located in your website’s root directory.<br><br>SMF adds some rules to this file in order to guide spiders away from URLs that they should not bother to crawl. This improves efficiency and reduces server load when a spider is crawling your forum.';
590+
$helptxt['robots_txt_ai_train'] = 'This setting tells artificial intelligence companies whether they are permitted to use the content of your forum to train their AI bots. Unfortunately, there is no way to force AI companies to obey these rules, but well-behaved ones will respect them.';
591+
$helptxt['robots_txt_search'] = 'This setting tells search engines whether they are permitted to show links to your forum in their search results. Most forums will want this enabled, but if you want your forum not to appear in search results, you can disable it.';
592+
$helptxt['robots_txt_ai_input'] = 'This setting tells search engines and other web robots whether they are permitted to use the content of your forum to generate AI summaries, etc. Unfortunately, there is no way to force search engines that index your site to obey these rules, but well-behaved ones will respect them.';
593+
$helptxt['robots_txt_ai_train'] = 'This setting tells artificial intelligence companies whether they are permitted to use the content of your forum to train their AI bots. Unfortunately, there is no way to force AI companies to obey these rules, but well-behaved ones will respect them.';
590594

591595
$helptxt['birthday_email'] = 'Choose the index of the birthday email message to use. A preview will be shown in the Email Subject and Email Body fields.<br><strong>Note:</strong> Selecting this setting does not automatically enable birthday emails. To enable birthday emails use the <a href="{scripturl}?action=admin;area=scheduledtasks;{session_var}={session_id}" target="_blank" rel="noopener">Scheduled Tasks</a> page and enable the birthday email task.';
592596
$helptxt['pm_bcc'] = 'When sending a personal message you can choose to add a recipient as BCC (Blind Carbon Copy). BCC recipients do not have their identities revealed to the other recipients of the message.';

Languages/en_US/Search.php

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,9 @@
185185
$txt['robots_txt_info'] = 'Enter the path to your robots.txt file so that SMF can append rules to it.';
186186
$txt['robots_txt_auto'] = 'Detect path';
187187
$txt['robots_txt_not_writable'] = 'The robots.txt file is not writable.';
188+
$txt['robots_txt_search'] = 'Allow search engines to link to the forum in their search results';
189+
$txt['robots_txt_ai_input'] = 'Allow search engines to generate AI summaries of forum content';
190+
$txt['robots_txt_ai_train'] = 'Allow forum content to be used for AI training';
188191

189192
$txt['meta_keywords'] = 'Meta keywords associated with forum';
190193
$txt['meta_keywords_note'] = 'In most situations, this should be left blank.';

Sources/Actions/Admin/SearchEngines.php

Lines changed: 167 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -830,16 +830,23 @@ function disableFields()
830830
}
831831
disableFields();';
832832

833-
// Now the setting for robots.txt.
833+
// Now the settings for robots.txt.
834834
$config_vars[] = '';
835835

836836
if (empty(Config::$modSettings['robots_txt'])) {
837+
// Make it easy for the admin to populate the setting.
837838
$post_input = '<button class="button floatnone" onclick="document.getElementById(\'robots_txt\').value = ' . Utils::escapeJavaScript(self::detectRobotsTxt()) . '; return false;">' . Lang::getTxt('robots_txt_auto', file: 'Search') . '</button>';
838839
} elseif (!is_writable(Config::$modSettings['robots_txt'])) {
840+
// Warn if robots.txt is not writable.
839841
$invalid = true;
840842
$post_input = '<br><span class="error">' . Lang::getTxt('robots_txt_not_writable', file: 'Search') . '</span>';
841843
}
842844

845+
// If robots_txt_search is not set, assume true.
846+
if (!isset(Config::$modSettings['robots_txt_search'])) {
847+
Config::$modSettings['robots_txt_search'] = true;
848+
}
849+
843850
$config_vars = array_merge($config_vars, [
844851
[
845852
'text',
@@ -849,6 +856,18 @@ function disableFields()
849856
'invalid' => $invalid ?? false,
850857
'postinput' => $post_input ?? '',
851858
],
859+
[
860+
'check',
861+
'robots_txt_search',
862+
],
863+
[
864+
'check',
865+
'robots_txt_ai_input',
866+
],
867+
[
868+
'check',
869+
'robots_txt_ai_train',
870+
],
852871
[
853872
'large_text',
854873
'meta_keywords',
@@ -1207,38 +1226,99 @@ protected static function addRobotsTxtRules(): void
12071226
$boardpath = Url::create(Config::$boardurl)->path;
12081227
$scriptpath = Url::create(Config::$scripturl)->path;
12091228

1229+
// Content Signals Policy text.
1230+
$content_signals_policy = <<<'END'
1231+
# As a condition of accessing this website, you agree
1232+
# to abide by the following content signals:
1233+
1234+
# (a) If a content-signal = yes, you may collect
1235+
# content for the corresponding use.
1236+
# (b) If a content-signal = no, you may not collect
1237+
# content for the corresponding use.
1238+
# (c) If the website operator does not include a
1239+
# content signal for a corresponding use, the website
1240+
# operator neither grants nor restricts permission via
1241+
# content signal with respect to the corresponding use.
1242+
1243+
# The content signals and their meanings are:
1244+
1245+
# search: building a search index and providing search
1246+
# results (e.g., returning hyperlinks and short
1247+
# excerpts from your website's contents). Search does
1248+
# not include providing AI-generated search summaries.
1249+
# ai-input: inputting content into one or more AI
1250+
# models (e.g., retrieval augmented generation,
1251+
# grounding, or other real-time taking of content for
1252+
# generative AI search answers).
1253+
# ai-train: training or fine-tuning AI models.
1254+
1255+
# ANY RESTRICTIONS EXPRESSED VIA CONTENT SIGNALS ARE
1256+
# EXPRESS RESERVATIONS OF RIGHTS UNDER ARTICLE 4 OF THE
1257+
# EUROPEAN UNION DIRECTIVE 2019/790 ON COPYRIGHT AND
1258+
# RELATED RIGHTS IN THE DIGITAL SINGLE MARKET.
1259+
1260+
1261+
END;
1262+
12101263
// Define the rules we want to include.
1264+
//
1265+
// Top level keys are user agent strings.
1266+
// For example, '*' means 'User-Agent: *', i.e. all robots.
1267+
// If a mod wants to add specific rules for certain robots, new keys
1268+
// can be added to the $rules array targeting those specific robots.
1269+
//
1270+
// Within each user agent's ruleset, keys are path patterns as described
1271+
// in RFC 9309. There are three possible values:
1272+
//
1273+
// - false
1274+
// The user agent is not allowed to crawl the indicated path.
1275+
//
1276+
// - true
1277+
// The user agent is allowed to crawl the indicated path with no
1278+
// restrictions on how the content may be used.
1279+
//
1280+
// - an array containing 'search', 'ai-train', and 'ai-input' elements,
1281+
// each of which takes a boolean value.
1282+
// The user agent is allowed to crawl the indicated path, but a
1283+
// Content-Signal line will be inserted to indicate restrictions
1284+
// on how the content may be used.
12111285
$rules = [
12121286
'*' => [
1213-
'allow' => [],
1214-
'disallow' => [
1215-
// Frequently occurring non-canonical URLs (both normal and queryless)
1216-
$boardpath . '/*PHPSESSID=',
1217-
$boardpath . '/*;topicseen',
1218-
$boardpath . '/*.msg',
1219-
$boardpath . '/*.new',
1220-
$boardpath . '/*.from',
1221-
$boardpath . '/msgs/',
1222-
// Normal URLs of actions that always set Utils::$context['robot_no_index'] to true
1223-
$scriptpath . '?action=admin',
1224-
$scriptpath . '?action=credits',
1225-
$scriptpath . '?action=moderate',
1226-
$scriptpath . '?action=post',
1227-
$scriptpath . '?action=printpage',
1228-
$scriptpath . '?action=reminder',
1229-
$scriptpath . '?action=reporttm',
1230-
$scriptpath . '?action=search',
1231-
$scriptpath . '?action=who',
1232-
// Queryless URLs of actions that always set Utils::$context['robot_no_index'] to true
1233-
$boardpath . '/*/credits',
1234-
$boardpath . '/*/moderate',
1235-
$boardpath . '/*/post',
1236-
$boardpath . '/*/printpage',
1237-
$boardpath . '/*/reminder',
1238-
$boardpath . '/*/reporttm',
1239-
$boardpath . '/*/search',
1240-
$boardpath . '/*/who',
1287+
// Allow robots to crawl the forum, subject to the Content-Signal policy.
1288+
$boardpath . '/' => [
1289+
// If robots_txt_search is not set, assume true.
1290+
'search' => !empty(Config::$modSettings['robots_txt_search'] ?? true),
1291+
// If the AI settings are not set, assume false.
1292+
'ai-train' => !empty(Config::$modSettings['robots_txt_ai_train']),
1293+
'ai-input' => !empty(Config::$modSettings['robots_txt_ai_input']),
12411294
],
1295+
1296+
// Disallow frequently occurring non-canonical URLs.
1297+
$boardpath . '/*PHPSESSID=' => false,
1298+
$boardpath . '/*;topicseen' => false,
1299+
$boardpath . '/*.msg' => false,
1300+
$boardpath . '/*.new' => false,
1301+
$boardpath . '/*.from' => false,
1302+
$boardpath . '/msgs/' => false,
1303+
1304+
// Disallow actions that always set Utils::$context['robot_no_index'] to true.
1305+
$scriptpath . '?action=admin' => false,
1306+
$scriptpath . '?action=credits' => false,
1307+
$scriptpath . '?action=moderate' => false,
1308+
$scriptpath . '?action=post' => false,
1309+
$scriptpath . '?action=printpage' => false,
1310+
$scriptpath . '?action=reminder' => false,
1311+
$scriptpath . '?action=reporttm' => false,
1312+
$scriptpath . '?action=search' => false,
1313+
$scriptpath . '?action=who' => false,
1314+
$boardpath . '/*/credits' => false,
1315+
$boardpath . '/*/moderate' => false,
1316+
$boardpath . '/*/post' => false,
1317+
$boardpath . '/*/printpage' => false,
1318+
$boardpath . '/*/reminder' => false,
1319+
$boardpath . '/*/reporttm' => false,
1320+
$boardpath . '/*/search' => false,
1321+
$boardpath . '/*/who' => false,
12421322
],
12431323
];
12441324

@@ -1250,61 +1330,49 @@ protected static function addRobotsTxtRules(): void
12501330
if (is_file(Config::$modSettings['robots_txt'])) {
12511331
$hash = md5_file(Config::$modSettings['robots_txt']);
12521332

1253-
$user_agents_in_group = [];
1254-
$current_user_agent = '';
1255-
$insert = false;
1333+
$user_agents = [];
1334+
$in_user_agents = false;
12561335

12571336
// Keep all existing content and filter out anything in $rules that already exists.
12581337
foreach (file(Config::$modSettings['robots_txt']) as $line) {
12591338
// Found a new user agent line.
12601339
if (preg_match('/^\h*user-agent:\h*([^\n]+)/i', $line, $matches)) {
1261-
$user_agents_in_group[] = $matches[1];
1262-
$current_user_agent = $matches[1];
1263-
1264-
if ($insert === null) {
1265-
$insert = true;
1340+
if (!$in_user_agents) {
1341+
$user_agents = [];
12661342
}
1267-
} elseif (preg_match('/^\h*($|#)/i', $line)) {
1268-
$insert = true;
1269-
} else {
1270-
$insert = null;
1271-
}
12721343

1273-
// Insert our rules before comments, blank lines, or the start
1274-
// of a new user agent group, but only if user agent that these
1275-
// rules are for was the only one in its group.
1276-
if (!empty($insert) && \count($user_agents_in_group) === 1) {
1277-
foreach ($user_agents_in_group as $user_agent) {
1278-
if (!isset($rules[$user_agent])) {
1279-
continue;
1280-
}
1281-
1282-
foreach ($rules[$user_agent] as $type => $patterns) {
1283-
foreach ($patterns as $pattern) {
1284-
$new_content[] = ucfirst($type) . ': ' . $pattern . "\n";
1285-
}
1286-
}
1287-
1288-
// Don't do the same rules twice.
1289-
unset($rules[$user_agent]);
1290-
}
1291-
1292-
$insert = false;
1344+
$user_agents[] = $matches[1];
1345+
$in_user_agents = true;
12931346
}
12941347

12951348
// Append this line.
12961349
$new_content[] = $line;
12971350

12981351
// Filter out anything in $rules that already exists.
1299-
if (preg_match('/^\h*((?:dis)?allow)\h*:\h*([^\n]+)/i', $line, $matches)) {
1300-
$type = strtolower($matches[1]);
1301-
$pattern = $matches[2];
1302-
1303-
if (isset($rules[$current_user_agent][$type])) {
1304-
$rules[$current_user_agent][$type] = array_diff(
1305-
$rules[$current_user_agent][$type],
1306-
[$pattern],
1307-
);
1352+
if (preg_match('/^\h*((?:dis)?allow|content-signal)\h*:\h*([^\n]+)/i', $line, $matches)) {
1353+
$in_user_agents = false;
1354+
$allowed = strtolower($matches[1]) !== 'disallow';
1355+
1356+
if (strtolower($matches[1]) !== 'content-signal') {
1357+
$pattern = $matches[2];
1358+
$signal = null;
1359+
} else {
1360+
[$pattern, $signal] = preg_split('/\h+/', $matches[2], 2);
1361+
parse_str(preg_replace('/\h*,\h*/', '&', $signal), $signal);
1362+
$signal = array_map(fn($arg) => $arg === 'yes', $signal);
1363+
}
1364+
1365+
foreach ($user_agents as $user_agent) {
1366+
if (!isset($rules[$user_agent][$pattern])) {
1367+
continue;
1368+
}
1369+
1370+
if (
1371+
$rules[$user_agent][$pattern] === $allowed
1372+
|| $rules[$user_agent][$pattern] === $signal
1373+
) {
1374+
unset($rules[$user_agent][$pattern]);
1375+
}
13081376
}
13091377
}
13101378
}
@@ -1342,19 +1410,44 @@ protected static function addRobotsTxtRules(): void
13421410

13431411
// Append any new rules that haven't already been inserted.
13441412
foreach ($rules as $user_agent => $rule_parts) {
1413+
if (empty($rule_parts)) {
1414+
continue;
1415+
}
1416+
13451417
$new_content[] = "\n";
13461418
$new_content[] = 'User-agent: ' . $user_agent . "\n";
13471419

1348-
foreach ($rule_parts as $type => $patterns) {
1349-
foreach ($patterns as $pattern) {
1350-
$new_content[] = ucfirst($type) . ': ' . $pattern . "\n";
1420+
foreach ($rule_parts as $pattern => $rule) {
1421+
if (\is_array($rule)) {
1422+
switch (array_filter($rule)) {
1423+
// If all uses are allowed, then allow the whole thing.
1424+
case $rule:
1425+
$rule = true;
1426+
break;
1427+
1428+
// If all uses are disallowed, then disallow the whole thing.
1429+
case []:
1430+
$rule = false;
1431+
break;
1432+
1433+
// Indicate what types of use are allowed or disallowed.
1434+
default:
1435+
$new_content[] = 'Content-Signal: ' . $pattern . ' ' . implode(', ', array_map(fn($k, $v) => $k . '=' . ($v ? 'yes' : 'no'), array_keys($rule), $rule)) . "\n";
1436+
break;
1437+
}
13511438
}
1439+
1440+
$new_content[] = (empty($rule) ? 'Disallow: ' : 'Allow: ') . $pattern . "\n";
13521441
}
13531442
}
13541443

13551444
// Finalize the content.
13561445
$new_content = trim(implode('', $new_content)) . "\n";
13571446

1447+
if (!str_contains($new_content, $content_signals_policy)) {
1448+
$new_content = $content_signals_policy . $new_content;
1449+
}
1450+
13581451
// If nothing changed, bail out.
13591452
if (isset($hash) && md5($new_content) === $hash) {
13601453
return;

Sources/Db/Schema/v3_0/Settings.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,10 @@ class Settings extends Table
630630
'variable' => 'reserveWord',
631631
'value' => '0',
632632
],
633+
[
634+
'variable' => 'robots_txt_search',
635+
'value' => '1',
636+
],
633637
[
634638
'variable' => 'samesiteCookies',
635639
'value' => 'lax',

0 commit comments

Comments
 (0)