Commit 3f6e419

Automatically populate robots.txt
Signed-off-by: Jon Stovell <jonstovell@gmail.com>
1 parent 5a0150e commit 3f6e419

3 files changed: +204 -0

Languages/en_US/Help.php

Lines changed: 1 addition & 0 deletions
@@ -570,6 +570,7 @@
 		As above except only Administrators can see spider status - to all other users spiders appear as guests.
 	</li>
 </ul>';
+$helptxt['robots_txt'] = 'The robots.txt file is used to implement the <a href="https://www.rfc-editor.org/rfc/rfc9309.html" class="bbc_link">Robots Exclusion Protocol</a>, a standard used by websites to indicate to search engine spiders and other web robots which portions of the website they are allowed to visit. This file is typically located in your website’s root directory.<br><br>SMF adds some rules to this file in order to guide spiders away from URLs that they should not bother to crawl. This improves efficiency and reduces server load when a spider is crawling your forum.';
 
 $helptxt['birthday_email'] = 'Choose the index of the birthday email message to use. A preview will be shown in the Email Subject and Email Body fields.<br><strong>Note:</strong> Selecting this setting does not automatically enable birthday emails. To enable birthday emails use the <a href="{scripturl}?action=admin;area=scheduledtasks;{session_var}={session_id}" target="_blank" rel="noopener">Scheduled Tasks</a> page and enable the birthday email task.';
 $helptxt['pm_bcc'] = 'When sending a personal message you can choose to add a recipient as BCC (Blind Carbon Copy). BCC recipients do not have their identities revealed to the other recipients of the message.';
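
For illustration, the rules that SMF appends would render in a site's robots.txt roughly like this (a sketch assuming the forum's script URL resolves to /forum/index.php; the real path comes from Config::$scripturl):

	User-agent: *
	Disallow: /forum/index.php?msg=*

This steers spiders away from per-message permalinks, which duplicate content that is already reachable through the canonical topic URLs.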

Languages/en_US/Search.php

Lines changed: 5 additions & 0 deletions
@@ -180,4 +180,9 @@
 $txt['spider_stats_page_hits'] = 'Page Hits';
 $txt['spider_stats_no_entries'] = 'There are currently no spider statistics available.';
 
+$txt['robots_txt'] = 'Add SMF rules to robots.txt';
+$txt['robots_txt_info'] = 'Enter the path to your robots.txt file so that SMF can append rules to it.';
+$txt['robots_txt_auto'] = 'Detect path';
+$txt['robots_txt_not_writable'] = 'The robots.txt file is not writable.';
+
 ?>

Sources/Actions/Admin/SearchEngines.php

Lines changed: 198 additions & 0 deletions
@@ -30,6 +30,7 @@
 use SMF\SecurityToken;
 use SMF\Theme;
 use SMF\Time;
+use SMF\Url;
 use SMF\User;
 use SMF\Utils;
 
@@ -637,6 +638,7 @@ public function settings(): void
 		ACP::saveDBSettings($config_vars);
 
 		self::recacheSpiderNames();
+		self::addRobotsTxtRules();
 
 		$_SESSION['adm-save'] = true;
 		Utils::redirectexit('action=admin;area=sengines;sa=settings');
@@ -794,6 +796,33 @@ function disableFields()
 	}
 	disableFields();';
 
+		// Now the setting for robots.txt.
+		$config_vars[] = '';
+
+		if (empty(Config::$modSettings['robots_txt'])) {
+			$detected_path = self::detectRobotsTxt();
+
+			if (
+				$detected_path !== ''
+				&& $detected_path !== (Config::$modSettings['robots_txt'] ?? '')
+				&& is_writable($detected_path)
+			) {
+				$post_input = '<button class="button floatnone" onclick="document.getElementById(\'robots_txt\').value = ' . Utils::escapeJavaScript($detected_path) . '; return false;">' . Lang::getTxt('robots_txt_auto') . '</button>';
+			}
+		} elseif (!is_writable(Config::$modSettings['robots_txt'])) {
+			$invalid = true;
+			$post_input = '<br><span class="error">' . Lang::$txt['robots_txt_not_writable'] . '</span>';
+		}
+
+		$config_vars[] = [
+			'text',
+			'robots_txt',
+			'subtext' => Lang::$txt['robots_txt_info'],
+			'size' => 45,
+			'invalid' => $invalid ?? false,
+			'postinput' => $post_input ?? '',
+		];
+
 		IntegrationHook::call('integrate_modify_search_engine_settings', [&$config_vars]);
 
 		return $config_vars;
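
Because $config_vars is passed by reference to integrate_modify_search_engine_settings, a mod can append its own entries at this point. A minimal sketch, assuming a mod registers the function below for that hook (the function and setting names are hypothetical):

	// Hypothetical integration function for integrate_modify_search_engine_settings.
	function myMod_search_engine_settings(array &$config_vars): void
	{
		// Append a hypothetical on/off setting after SMF's own entries.
		$config_vars[] = ['check', 'myMod_extra_spider_logging'];
	}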
@@ -1100,6 +1129,175 @@ protected function __construct()
 
 		Utils::$context['sub_action'] = &$this->subaction;
 	}
+
+	/**
+	 * Finds and returns the file path to robots.txt, or else the file path
+	 * where it should be created if it doesn't already exist.
+	 *
+	 * @return string The path to robots.txt.
+	 */
+	protected static function detectRobotsTxt(): string
+	{
+		// First try $_SERVER['CONTEXT_DOCUMENT_ROOT'], then try $_SERVER['DOCUMENT_ROOT'].
+		foreach (['CONTEXT_DOCUMENT_ROOT', 'DOCUMENT_ROOT'] as $var) {
+			if (
+				isset($_SERVER[$var])
+				&& str_starts_with(
+					strtr(Config::$boarddir, ['/' => DIRECTORY_SEPARATOR]),
+					strtr($_SERVER[$var], ['/' => DIRECTORY_SEPARATOR]),
+				)
+			) {
+				return rtrim(strtr($_SERVER[$var], ['/' => DIRECTORY_SEPARATOR]), DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR . 'robots.txt';
+			}
+		}
+
+		// If the server has an odd configuration, try to figure out the path ourselves.
+		$path_from_boarddir = strtr(Config::$boarddir, ['/' => DIRECTORY_SEPARATOR]);
+		$path_from_boardurl = strtr(Url::create(Config::$boardurl)->path, ['/' => DIRECTORY_SEPARATOR]);
+
+		// Walk up the path until we find the document root.
+		while (
+			// Stop if we find robots.txt.
+			!file_exists($path_from_boarddir . DIRECTORY_SEPARATOR . 'robots.txt')
+			// Stop if the URL path and the filesystem path diverge.
+			&& basename($path_from_boarddir) === basename($path_from_boardurl)
+			// Stop if we get to the root of the path according to the URL.
+			&& dirname($path_from_boardurl) !== $path_from_boardurl
+		) {
+			$path_from_boarddir = dirname($path_from_boarddir);
+			$path_from_boardurl = dirname($path_from_boardurl);
+		}
+
+		return $path_from_boarddir . DIRECTORY_SEPARATOR . 'robots.txt';
+	}
+
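A worked example of the fallback walk, under hypothetical values: if Config::$boarddir is /var/www/example.com/forum and Config::$boardurl is https://example.com/forum, the first iteration finds no robots.txt in the board directory, sees that both paths end in 'forum', and strips one segment from each, leaving /var/www/example.com and the URL path '/'. The loop then stops because the URL path has reached its root, and the method returns /var/www/example.com/robots.txt, the document root in this layout.
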
+	/**
+	 * Checks whether robots.txt is writable and, if so, adds some rules to it
+	 * for SMF purposes.
+	 */
+	protected static function addRobotsTxtRules(): void
+	{
+		// Can we write to the file?
+		if (
+			(Config::$modSettings['robots_txt'] ?? '') === ''
+			|| (
+				is_file(Config::$modSettings['robots_txt'])
+				&& !Utils::makeWritable(Config::$modSettings['robots_txt'])
+			)
+			|| (
+				!file_exists(Config::$modSettings['robots_txt'])
+				&& !Utils::makeWritable(dirname(Config::$modSettings['robots_txt']))
+			)
+		) {
+			return;
+		}
+
+		// Define the rules we want to include.
+		$rules = [
+			'*' => [
+				'allow' => [],
+				'disallow' => [
+					Url::create(Config::$scripturl)->path . '?msg=*',
+				],
+			],
+		];
+
+		IntegrationHook::call('integrate_robots_txt_rules', [&$rules]);
+
+		// Build the new file content.
+		$new_content = [];
+
+		if (is_file(Config::$modSettings['robots_txt'])) {
+			$hash = md5_file(Config::$modSettings['robots_txt']);
+
+			$current_user_agent = '';
+
+			// Keep all existing content and filter out anything in $rules that already exists.
+			foreach (file(Config::$modSettings['robots_txt']) as $line) {
+				// Found a new user agent line.
+				if (preg_match('/^user-agent:\h*([^\n]+)/i', $line, $matches)) {
+					$prev_user_agent = $current_user_agent;
+					$current_user_agent = $matches[1];
+
+					// Append any new rules for the previous user agent.
+					if (isset($rules[$prev_user_agent])) {
+						foreach ($rules[$prev_user_agent] as $type => $patterns) {
+							foreach ($patterns as $pattern) {
+								$new_content[] = ucfirst($type) . ': ' . $pattern . "\n";
+							}
+						}
+					}
+
+					// Don't do the same rules twice.
+					unset($rules[$prev_user_agent]);
+				}
+
+				// Append this line.
+				$new_content[] = $line;
+
+				// Filter out anything in $rules that already exists.
+				if (preg_match('/^((?:dis)?allow):\h*([^\n]+)/i', $line, $matches)) {
+					$type = strtolower($matches[1]);
+					$pattern = $matches[2];
+
+					$rules[$current_user_agent][$type] = array_diff(
+						$rules[$current_user_agent][$type],
+						[$pattern],
+					);
+				}
+			}
+		}
+
+		// Filter out empty $rules.
+		foreach ($rules as $user_agent => $rule_parts) {
+			foreach ($rule_parts as $type => $patterns) {
+				if ($rules[$user_agent][$type] === []) {
+					unset($rules[$user_agent][$type]);
+				}
+			}
+
+			if ($rules[$user_agent] === []) {
+				unset($rules[$user_agent]);
+			}
+		}
+
+		// Append the new rules.
+		foreach ($rules as $user_agent => $rule_parts) {
+			$new_content[] = "\n";
+			$new_content[] = 'User-agent: ' . $user_agent . "\n";
+
+			foreach ($rule_parts as $type => $patterns) {
+				foreach ($patterns as $pattern) {
+					$new_content[] = ucfirst($type) . ': ' . $pattern . "\n";
+				}
+			}
+		}
+
+		// Finalize the content.
+		$new_content = trim(implode('', $new_content)) . "\n";
+
+		// If nothing changed, bail out.
+		if (isset($hash) && md5($new_content) === $hash) {
+			return;
+		}
+
+		// Where should we save the backup file?
+		if (Utils::makeWritable(dirname(Config::$modSettings['robots_txt']))) {
+			$backup_file = preg_replace('/\.txt$/', '.' . (date_create('now UTC')->format('Ymd\THis\Z')) . '.txt', Config::$modSettings['robots_txt']);
+		} elseif (Utils::makeWritable(Config::$boarddir)) {
+			$backup_file = Config::$boarddir . DIRECTORY_SEPARATOR . 'robots.' . (date_create('now UTC')->format('Ymd\THis\Z')) . '.txt';
+		} else {
+			$backup_file = null;
+		}
+
+		// Write the new content to disk.
+		Config::safeFileWrite(
+			file: Config::$modSettings['robots_txt'],
+			data: $new_content,
+			backup_file: $backup_file,
+		);
+	}
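
Since $rules is also passed by reference to integrate_robots_txt_rules, mods can contribute entries before the file is rewritten. A minimal sketch (the hook name is from this diff; the function name and pattern are hypothetical):

	// Hypothetical integration function for integrate_robots_txt_rules.
	function myMod_robots_txt_rules(array &$rules): void
	{
		// Ask all user agents to skip a hypothetical mod-provided action.
		$rules['*']['disallow'][] = '/forum/index.php?action=mymodaction';
	}

The deduplication pass above filters out any contributed pattern that already appears under the same user agent in the existing file, so hook functions do not need to guard against duplicates themselves.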
 }
 
 ?>
