Skip to content

Commit bbd08fb

Browse files
authored
Merge pull request #990 from 10up/feature/736
Add feature to block AI data scraping bots
2 parents 639cb38 + a2d3409 commit bbd08fb

File tree

12 files changed

+181
-12
lines changed

12 files changed

+181
-12
lines changed

.github/workflows/vipcs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,6 @@ jobs:
3333
with:
3434
standard: WordPress-VIP-Go
3535
use_local_config: false
36-
enable_warnings: true
36+
enable_warnings: false
3737
only_changed_lines: false
3838
excludes: tests

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ Tap into leading cloud-based services like [OpenAI](https://openai.com/), [Micro
3434
* Smartly crop images using [Microsoft Azure's AI Vision API](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/)
3535
* Scan PDF files for embedded text and save for use in post meta using [Microsoft Azure's AI Vision API](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/)
3636
* Bulk classify content with [WP-CLI](https://wp-cli.org/)
37+
* Modify your `robots.txt` file to block the most common AI data scraping bots from crawling your site
3738

3839
### Language Processing
3940

@@ -158,7 +159,7 @@ ClassifAI is a sophisticated solution that we want organizations of all shapes a
158159
- Check for an email from `ClassifAI Team` which contains the registration key.
159160
- Note that the email will be sent from `opensource@10up.com`, so please whitelist this email address if needed.
160161

161-
### 2. Configure ClassifAI Registration Key under Tools > ClassifAI > ClassifAI Registration
162+
### 2. Configure ClassifAI Registration Key under Tools > ClassifAI > Settings
162163

163164
- In the `Registered Email` field, enter the email you used for registration.
164165
- In the `Registration Key` field, enter the registration key from the email in step 1 above.

assets/img/screenshot-6.png

197 KB
Loading

includes/Classifai/Admin/templates/classifai-header.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@
6363

6464
$services_menu = Classifai\get_services_menu();
6565
$classifai_settings = array(
66-
'classifai_settings' => __( 'ClassifAI Registration', 'classifai' ),
66+
'classifai_settings' => __( 'Settings', 'classifai' ),
6767
);
6868

6969
$classifai_header_menu = array_merge( $classifai_settings, $services_menu );

includes/Classifai/Plugin.php

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ public function enable() {
3939
add_action( 'admin_init', [ $this, 'add_privacy_policy_content' ] );
4040
add_action( 'admin_init', [ $this, 'maybe_migrate_to_v3' ] );
4141
add_action( 'admin_enqueue_scripts', [ $this, 'enqueue_admin_assets' ] );
42-
add_filter( 'plugin_action_links_' . CLASSIFAI_PLUGIN_BASENAME, array( $this, 'filter_plugin_action_links' ) );
42+
add_filter( 'plugin_action_links_' . CLASSIFAI_PLUGIN_BASENAME, [ $this, 'filter_plugin_action_links' ] );
43+
add_filter( 'robots_txt', [ $this, 'maybe_block_ai_crawlers' ] );
4344
add_action( 'after_classifai_init', [ $this, 'load_action_scheduler' ] );
4445
}
4546

@@ -248,6 +249,59 @@ public function filter_plugin_action_links( $links ): array {
248249
);
249250
}
250251

252+
/**
 * Maybe block AI crawlers from indexing the site.
 *
 * Callback for the `robots_txt` filter. When the general "Block AI Bots"
 * setting is enabled, a `Disallow: /` group is appended to the robots.txt
 * content for each known AI data scraping user agent. When the setting is
 * disabled or missing, the content is returned untouched.
 *
 * @param string $robots_txt The robots.txt content.
 * @return string The robots.txt content.
 */
public function maybe_block_ai_crawlers( $robots_txt ) {
	$service_manager = new Services\ServicesManager();
	$settings        = $service_manager->get_settings();
	$block_ai_bots   = is_array( $settings ) ? ( $settings['block_ai_bots'] ?? '0' ) : '0';

	// Only block AI bots if the setting is enabled. The settings UI stores the
	// string '1', but the value is saved without normalization, so also accept
	// an integer or boolean "on" value instead of silently ignoring it.
	if ( ! in_array( $block_ai_bots, [ '1', 1, true ], true ) ) {
		return $robots_txt;
	}

	// Ensure the content is a string, in case some other plugin has messed up.
	if ( ! is_string( $robots_txt ) ) {
		$robots_txt = (string) $robots_txt;
	}

	// User agent => description (with a link to the vendor documentation).
	// NOTE: OpenAI documents its token as "GPTBot"; robots.txt user-agent
	// matching is case-insensitive, and the existing spelling is kept because
	// the settings description and the integration tests reference it.
	$crawlers = [
		'Applebot-Extended'  => 'Apple crawler (https://support.apple.com/en-us/119829)',
		'CCBot'              => 'Common Crawl crawler (https://commoncrawl.org/ccbot)',
		'ClaudeBot'          => 'Anthropic crawler (https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)',
		'FacebookBot'        => 'Facebook crawler (https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/)',
		'Google-Extended'    => 'Google crawler (https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)',
		'GPTbot'             => 'OpenAI GPTBot crawler (https://platform.openai.com/docs/bots)',
		'Meta-ExternalAgent' => 'Meta crawler (https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/)',
	];

	$groups = [];

	foreach ( $crawlers as $user_agent => $description ) {
		$groups[] = sprintf( "## %s\nUser-agent: %s\nDisallow: /\n", $description, $user_agent );
	}

	// Start on a fresh line so the first group never gets glued onto whatever
	// rule currently ends the file; groups are separated by a blank line.
	return $robots_txt . "\n" . implode( "\n", $groups );
}
304+
251305
/**
252306
* Load the Action Scheduler library.
253307
*/

includes/Classifai/Services/ServicesManager.php

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ public function register_recommendation_service_features( array $features ): arr
138138
/**
139139
* Get general ClassifAI settings
140140
*
141-
* @param string $index Optional specific setting to be retrieved.
141+
* @param string|bool $index Optional specific setting to be retrieved. If false, all settings will be returned.
142142
* @return mixed
143143
*/
144144
public function get_settings( $index = false ) {
@@ -187,8 +187,9 @@ public function register_settings() {
187187
public function sanitize_settings( $settings ): array {
188188
$new_settings = [];
189189

190-
if ( isset( $settings['email'] )
191-
&& isset( $settings['license_key'] )
190+
// Save registration settings.
191+
if ( ! empty( $settings['email'] )
192+
&& ! empty( $settings['license_key'] )
192193
&& $this->check_license_key( $settings['email'], $settings['license_key'] )
193194
) {
194195
$new_settings['valid_license'] = true;
@@ -207,6 +208,9 @@ public function sanitize_settings( $settings ): array {
207208
);
208209
}
209210

211+
// Save block AI bots setting.
212+
$new_settings['block_ai_bots'] = $settings['block_ai_bots'] ?? '0';
213+
210214
return $new_settings;
211215
}
212216

readme.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Tap into leading cloud-based services like [OpenAI](https://openai.com/), [Micro
3535
* Smartly crop images using [Microsoft Azure's AI Vision API](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/)
3636
* Scan PDF files for embedded text and save for use in post meta using [Microsoft Azure's AI Vision API](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/)
3737
* Bulk classify content with [WP-CLI](https://wp-cli.org/)
38+
* Modify your `robots.txt` file to block the most common AI data scraping bots from crawling your site
3839

3940
**Requirements**
4041

src/js/settings/components/classifai-registration/index.js

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import {
1111
Flex,
1212
FlexItem,
1313
__experimentalInputControl as InputControl, // eslint-disable-line @wordpress/no-unsafe-wp-apis
14+
ToggleControl,
1415
} from '@wordpress/components';
1516
import { __, sprintf } from '@wordpress/i18n';
1617
import apiFetch from '@wordpress/api-fetch';
@@ -105,6 +106,7 @@ export const ClassifAIRegistrationForm = ( { onSaveSuccess = () => {} } ) => {
105106
onChange={ ( value ) => {
106107
setSettings( { ...settings, email: value } );
107108
} }
109+
__next40pxDefaultSize
108110
/>
109111
</SettingsRow>
110112
<SettingsRow
@@ -140,6 +142,37 @@ export const ClassifAIRegistrationForm = ( { onSaveSuccess = () => {} } ) => {
140142
license_key: value,
141143
} );
142144
} }
145+
__next40pxDefaultSize
146+
/>
147+
</SettingsRow>
148+
</PanelBody>
149+
</Panel>
150+
<Panel
151+
header={ __( 'General Settings', 'classifai' ) }
152+
className="settings-panel"
153+
>
154+
<PanelBody>
155+
<SettingsRow
156+
label={ __( 'Block AI Bots', 'classifai' ) }
157+
description={
158+
<>
159+
{ __(
160+
'If you turn on this setting, ClassifAI will modify your robots.txt file to request that known AI data scraping bots do not index your site. This will not block AI search bots, just data scraping bots. At the moment, the following bots are blocked: Applebot-Extended, CCBot, ClaudeBot, FacebookBot, Google-Extended, GPTbot, Meta-ExternalAgent.',
161+
'classifai'
162+
) }
163+
</>
164+
}
165+
>
166+
<ToggleControl
167+
className="classifai-enable-bot-block"
168+
checked={ settings?.block_ai_bots === '1' }
169+
onChange={ ( value ) => {
170+
setSettings( {
171+
...settings,
172+
block_ai_bots: value ? '1' : '0',
173+
} );
174+
} }
175+
__nextHasNoMarginBottom
143176
/>
144177
</SettingsRow>
145178
</PanelBody>

src/js/settings/components/classifai-settings/index.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,15 +108,15 @@ export const ServiceNavigation = () => {
108108
</NavLink>
109109
) ) }
110110
<NavLink
111-
to="classifai_registration"
112-
key="classifai_registration"
111+
to="settings"
112+
key="settings"
113113
className={ ( { isActive } ) =>
114114
isActive
115115
? 'active-tab classifai-tabs-item'
116116
: 'classifai-tabs-item'
117117
}
118118
>
119-
{ __( 'ClassifAI Registration', 'classifai' ) }
119+
{ __( 'Settings', 'classifai' ) }
120120
</NavLink>
121121
</div>
122122
</>
@@ -239,7 +239,7 @@ export const ClassifAISettings = () => {
239239
element={ <FeatureSettingsWrapper /> }
240240
/>
241241
<Route
242-
path="classifai_registration"
242+
path="settings"
243243
element={ <ClassifAIRegistration /> }
244244
/>
245245
{ /* When no routes match, it will redirect to this route path. Note that it should be registered above. */ }

tests/cypress/integration/admin.test.js

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,69 @@ describe( 'Admin can login and make sure plugin is activated', () => {
4747
.first()
4848
.contains( 'Image Processing' );
4949
} );
50+
51+
it( 'Can visit the general settings page and see all settings.', () => {
	const inputControl = '.components-input-control';
	const botToggle = '.classifai-enable-bot-block input';

	// The "Settings" tab should be the active navigation item.
	cy.visitFeatureSettings( 'settings' );
	cy.get( '.classifai-tabs' ).should( 'exist' );
	cy.get( '.classifai-tabs a.active-tab' ).first().contains( 'Settings' );

	// Registration fields (email and key) are rendered.
	cy.get( `${ inputControl } input[type="email"]` ).should( 'exist' );
	cy.get( `${ inputControl } input[type="password"]` ).should( 'exist' );

	// The "Block AI Bots" toggle is rendered and is off by default.
	cy.get( botToggle ).should( 'exist' );
	cy.get( botToggle ).should( 'not.be.checked' );
} );
71+
72+
it( 'Can turn on "Block AI Bots" setting and it works.', () => {
	const botToggle = '.classifai-enable-bot-block input';

	// User agents expected in robots.txt while the setting is on,
	// listed in the order they are asserted.
	const blockedBots = [
		'Applebot-Extended',
		'CCBot',
		'ClaudeBot',
		'FacebookBot',
		'Google-Extended',
		'GPTbot',
		'Meta-ExternalAgent',
	];

	cy.visitFeatureSettings( 'settings' );

	// Enable the setting and save.
	cy.get( botToggle ).check();
	cy.saveGeneralSettings();

	// Every known bot should now have a group in robots.txt.
	cy.request( '/robots.txt' ).then( ( { body } ) => {
		blockedBots.forEach( ( bot ) => {
			expect( body ).to.contain( `User-agent: ${ bot }` );
		} );
	} );

	// Disable the setting again and save.
	cy.get( botToggle ).uncheck();
	cy.saveGeneralSettings();

	// None of the bots should be listed in robots.txt anymore.
	cy.request( '/robots.txt' ).then( ( { body } ) => {
		blockedBots.forEach( ( bot ) => {
			expect( body ).to.not.contain( `User-agent: ${ bot }` );
		} );
	} );
} );
50115
} );

0 commit comments

Comments
 (0)