Skip to content

Commit 899db61

Browse files
authored
Strip unnecessary HTML attributes from federated content (#2643)
1 parent cb99315 commit 899db61

File tree

5 files changed

+242
-4
lines changed

5 files changed

+242
-4
lines changed

.github/changelog/2619-clean-html

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Significance: patch
2+
Type: changed
3+
4+
Reduce federated content size by removing unnecessary HTML attributes.

includes/class-sanitize.php

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,4 +215,79 @@ public static function content( $content ) {
215215
public static function strip_whitespace( $content ) {
216216
return \trim( \preg_replace( '/>[\n\r\t]+</', '><', $content ) );
217217
}
218+
219+
/**
 * Clean HTML for ActivityPub federation.
 *
 * Starts from the tag/attribute map WordPress allows for post content and
 * drops global attributes (class, id, style, data-*, aria-*, ...) that only
 * increase payload size. `class` and `title` are kept on anchors and `class`
 * on spans, because microformats rely on them.
 *
 * @see https://github.com/Automattic/wordpress-activitypub/issues/2619
 *
 * @param string|null $content The HTML content to clean.
 *
 * @return string|null The cleaned HTML content. Empty input (including null)
 *                     is returned unchanged.
 */
public static function clean_html( $content ) {
	if ( empty( $content ) ) {
		return $content;
	}

	// Start with all WordPress allowed post tags.
	$allowed_html = \wp_kses_allowed_html( 'post' );

	// Global attributes to remove from all elements.
	$remove_attrs = array(
		'aria-controls',
		'aria-current',
		'aria-describedby',
		'aria-details',
		'aria-expanded',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-live',
		'class',
		'data-*',
		'decoding',
		'dir',
		'hidden',
		'id',
		'lang',
		'loading',
		'role',
		'style',
		'tabindex',
		'title',
		'xml:lang',
	);

	/**
	 * Filter the global attributes to remove from all elements.
	 *
	 * @param array $remove_attrs Global attributes to remove.
	 */
	$remove_attrs = \apply_filters( 'activitypub_remove_html_attributes', $remove_attrs );

	/*
	 * The list comes back from a filter, so it may no longer be a clean list
	 * of strings. Normalise it once, outside the loop, so array_flip() cannot
	 * warn or throw on unexpected values.
	 */
	$remove_keys = \array_flip( \array_filter( (array) $remove_attrs, 'is_string' ) );

	// Remove global attributes from all tags.
	foreach ( $allowed_html as $tag => $attrs ) {
		// Third-party kses filters may register a tag with a non-array value; there is nothing to strip then.
		if ( ! \is_array( $attrs ) ) {
			continue;
		}

		$allowed_html[ $tag ] = \array_diff_key( $attrs, $remove_keys );
	}

	// Re-add the attributes microformats need: class/title on anchors, class on spans.
	$keep_attrs = array(
		'a'    => array( 'class', 'title' ),
		'span' => array( 'class' ),
	);

	foreach ( $keep_attrs as $tag => $attrs ) {
		// Make sure the entry is an array before writing into it (it may be missing or a scalar).
		if ( ! isset( $allowed_html[ $tag ] ) || ! \is_array( $allowed_html[ $tag ] ) ) {
			$allowed_html[ $tag ] = array();
		}

		foreach ( $attrs as $attr ) {
			$allowed_html[ $tag ][ $attr ] = true;
		}
	}

	/**
	 * Filter the final allowed HTML for ActivityPub content.
	 *
	 * @param array $allowed_html The allowed HTML structure for wp_kses.
	 */
	$allowed_html = \apply_filters( 'activitypub_allowed_html', $allowed_html );

	return \wp_kses( $content, $allowed_html, \wp_allowed_protocols() );
}
218293
}

includes/class-shortcodes.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ public static function content( $attributes, $content, $tag ) {
191191
// Replace script and style elements.
192192
$content = \preg_replace( '@<(script|style)[^>]*?>.*?</\\1>@si', '', $content );
193193
$content = \strip_shortcodes( $content );
194+
$content = Sanitize::clean_html( $content );
194195
$content = Sanitize::strip_whitespace( $content );
195196

196197
add_shortcode( 'ap_content', array( 'Activitypub\Shortcodes', 'content' ) );

tests/phpunit/tests/includes/class-test-sanitize.php

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,4 +385,158 @@ public function strip_whitespace_provider() {
385385
public function test_strip_whitespace( $input, $expected ) {
386386
$this->assertSame( $expected, Sanitize::strip_whitespace( $input ) );
387387
}
388+
389+
/**
 * Data provider for clean_html tests.
 *
 * Each dataset is a pair: the raw HTML handed to Sanitize::clean_html()
 * and the exact markup expected back.
 *
 * @return array Test data with input and expected output.
 */
public function clean_html_provider() {
	$datasets = array(
		// Trivial input.
		'empty_string'             => array( '', '' ),

		// Global attributes are dropped, except class/title on anchors.
		'removes_class_from_p'     => array(
			'<p class="wp-block-paragraph">Hello</p>',
			'<p>Hello</p>',
		),
		'preserves_class_on_a'     => array(
			'<a href="https://example.com" class="u-url mention">Link</a>',
			'<a href="https://example.com" class="u-url mention">Link</a>',
		),
		'removes_id'               => array(
			'<span id="main-content">Content</span>',
			'<span>Content</span>',
		),
		'removes_style'            => array(
			'<span style="color: red;">Styled</span>',
			'<span>Styled</span>',
		),
		'removes_data_attributes'  => array(
			'<span data-id="123" data-custom="value">Content</span>',
			'<span>Content</span>',
		),
		'strips_loading_decoding'  => array(
			'<img src="image.jpg" loading="lazy" decoding="async" alt="Test" />',
			'<img src="image.jpg" alt="Test" />',
		),

		// Links: functional attributes survive, unsafe protocols do not.
		'preserves_href'           => array(
			'<a href="https://example.com">Link</a>',
			'<a href="https://example.com">Link</a>',
		),
		'strips_bad_protocol'      => array(
			'<a href="javascript:alert(1)">Link</a>',
			'<a href="alert(1)">Link</a>',
		),
		'preserves_img_essentials' => array(
			'<img src="image.jpg" alt="Desc" width="300" height="200" />',
			'<img src="image.jpg" alt="Desc" width="300" height="200" />',
		),
		'preserves_title'          => array(
			'<a href="https://example.com" title="Example">Link</a>',
			'<a href="https://example.com" title="Example">Link</a>',
		),
		'preserves_rel_and_target' => array(
			'<a href="https://example.com" rel="me" target="_blank">Link</a>',
			'<a href="https://example.com" rel="me" target="_blank">Link</a>',
		),
		'strips_lang_dir'          => array(
			'<p lang="en" dir="ltr">Hello</p>',
			'<p>Hello</p>',
		),

		// Element-specific attributes.
		'preserves_cite'           => array(
			'<blockquote cite="https://example.com">Quote</blockquote>',
			'<blockquote cite="https://example.com">Quote</blockquote>',
		),
		'preserves_video_attrs'    => array(
			'<video src="video.mp4" width="640" height="360" controls poster="thumb.jpg"></video>',
			'<video src="video.mp4" width="640" height="360" controls poster="thumb.jpg"></video>',
		),
		'preserves_audio_attrs'    => array(
			'<audio src="audio.mp3" controls></audio>',
			'<audio src="audio.mp3" controls></audio>',
		),
		'strips_hreflang'          => array(
			'<a href="https://example.de" hreflang="de">German</a>',
			'<a href="https://example.de">German</a>',
		),
		'preserves_details_open'   => array(
			'<details open><summary>Title</summary></details>',
			'<details open><summary>Title</summary></details>',
		),

		// Structural edge cases.
		'self_closing_tags'        => array(
			'<br class="clear" />',
			'<br />',
		),
		'no_attributes'            => array(
			'<p>Simple paragraph</p>',
			'<p>Simple paragraph</p>',
		),
		'plain_text'               => array(
			'Just plain text',
			'Just plain text',
		),

		// Realistic block-editor output.
		'complex_wordpress_figure' => array(
			'<figure class="wp-block-image size-large"><img loading="lazy" decoding="async" width="1024" height="768" src="https://example.com/image.jpg" alt="Test" class="wp-image-123" data-id="123" /><figcaption class="wp-element-caption">Caption</figcaption></figure>',
			'<figure><img width="1024" height="768" src="https://example.com/image.jpg" alt="Test" /><figcaption>Caption</figcaption></figure>',
		),
	);

	return $datasets;
}
483+
484+
/**
 * Run clean_html against every dataset from the provider.
 *
 * @dataProvider clean_html_provider
 * @covers ::clean_html
 *
 * @param string $input    Raw HTML passed to clean_html.
 * @param string $expected Markup clean_html must return.
 */
public function test_clean_html( $input, $expected ) {
	$cleaned = Sanitize::clean_html( $input );

	$this->assertSame( $expected, $cleaned );
}
496+
497+
/**
 * A null argument must come back as null rather than being cast to a string.
 *
 * @covers ::clean_html
 */
public function test_clean_html_null() {
	$cleaned = Sanitize::clean_html( null );

	$this->assertNull( $cleaned );
}
505+
506+
/**
 * Test the activitypub_allowed_html filter.
 *
 * Registers a callback that re-allows `data-custom` on spans and checks
 * that only this attribute survives while other data-* attributes are
 * still stripped.
 *
 * @covers ::clean_html
 */
public function test_allowed_html_filter() {
	// Keep a handle on the closure so exactly this callback can be removed again.
	$allow_data_custom = function ( $allowed_html ) {
		// Add data-custom attribute to span.
		$allowed_html['span']['data-custom'] = true;
		return $allowed_html;
	};

	add_filter( 'activitypub_allowed_html', $allow_data_custom );

	$input    = '<span data-custom="allowed" data-other="removed">Content</span>';
	$expected = '<span data-custom="allowed">Content</span>';

	try {
		$this->assertSame( $expected, Sanitize::clean_html( $input ) );
	} finally {
		// Always detach, even when the assertion fails, so the filter cannot leak into later tests.
		remove_filter( 'activitypub_allowed_html', $allow_data_custom );
	}
}
527+
528+
/**
 * The rel attribute on anchors must pass through clean_html untouched.
 *
 * @covers ::clean_html
 */
public function test_rel_attribute_preserved() {
	// Same anchor, once per rel value; the markup is expected back verbatim.
	foreach ( array( 'mention', 'nofollow' ) as $rel ) {
		$markup = '<a href="https://example.com" rel="' . $rel . '">Link</a>';

		$this->assertSame( $markup, Sanitize::clean_html( $markup ) );
	}
}
388542
}

tests/phpunit/tests/includes/transformer/class-test-post.php

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -796,7 +796,8 @@ public function test_reply_block_transforms_to_mention_link_when_first_block() {
796796
$object = Post::transform( $post )->to_object();
797797

798798
// Assert that the reply block was transformed into a mention link.
799-
$this->assertStringContainsString( '<p class="ap-reply-mention"><a rel="mention ugc" href="https://example.com/posts/123" title="@[email protected]">@author</a></p>', $object->get_content() );
799+
// Note: clean_html() strips class from <p> and the mention link doesn't include u-in-reply-to class.
800+
$this->assertStringContainsString( '<p><a rel="mention ugc" href="https://example.com/posts/123" title="@[email protected]">@author</a></p>', $object->get_content() );
800801

801802
// Clean up.
802803
remove_filter( 'activitypub_pre_http_get_remote_object', $filter_remote_object );
@@ -827,7 +828,8 @@ public function test_reply_block_not_transformed_when_not_first_block() {
827828
$content = $object->get_content();
828829

829830
// Assert that the reply block was not transformed into a mention link.
830-
$this->assertStringContainsString( '<div class="activitypub-reply-block wp-block-activitypub-reply" aria-label="Reply" data-in-reply-to="https://example.com/posts/123"><p><a title="This post is a response to the referenced content." aria-label="This post is a response to the referenced content." href="https://example.com/posts/123" class="u-in-reply-to" target="_blank">&#8620;example.com/posts/123</a></p></div>', $content );
831+
// Note: clean_html() strips class/aria-label/data-* from <div> but preserves class/title on <a>.
832+
$this->assertStringContainsString( '<div><p><a title="This post is a response to the referenced content." href="https://example.com/posts/123" class="u-in-reply-to" target="_blank">&#8620;example.com/posts/123</a></p></div>', $content );
831833
}
832834

833835
/**
@@ -882,10 +884,12 @@ public function test_multiple_reply_blocks_only_first_becomes_mention() {
882884
$content = $object->get_content();
883885

884886
// Assert that the first reply block was transformed into a mention link.
885-
$this->assertStringContainsString( '<p class="ap-reply-mention"><a rel="mention ugc" href="https://example.com/posts/123" title="@[email protected]">@author1</a></p>', $content );
887+
// Note: clean_html() strips class from <p> and the mention link doesn't include u-in-reply-to class.
888+
$this->assertStringContainsString( '<p><a rel="mention ugc" href="https://example.com/posts/123" title="@[email protected]">@author1</a></p>', $content );
886889

887890
// Assert that the second reply block was NOT transformed into a mention link (should remain as regular reply block).
888-
$this->assertStringContainsString( '<div class="activitypub-reply-block wp-block-activitypub-reply" aria-label="Reply" data-in-reply-to="https://other.site/posts/456"><p><a title="This post is a response to the referenced content." aria-label="This post is a response to the referenced content." href="https://other.site/posts/456" class="u-in-reply-to" target="_blank">&#8620;other.site/posts/456</a></p></div>', $content );
891+
// Note: clean_html() strips class/aria-label/data-* from <div> but preserves class/title on <a>.
892+
$this->assertStringContainsString( '<div><p><a title="This post is a response to the referenced content." href="https://other.site/posts/456" class="u-in-reply-to" target="_blank">&#8620;other.site/posts/456</a></p></div>', $content );
889893

890894
// Clean up.
891895
remove_filter( 'activitypub_pre_http_get_remote_object', $filter_remote_object );

0 commit comments

Comments
 (0)