Skip to content

Commit a4b21c1

Browse files
committed
feat: strip HTML comments from RSS feed descriptions
Add automatic HTML comment removal from all feed item descriptions to ensure clean content in RSS and JSON feeds. Comments are stripped from all content sources including markdown, page.meta descriptions, and full page content.

- Add strip_html_comments() static method to Util class
- Update get_description_or_abstract() to strip comments in all code paths
- Handle both inline and multiline HTML comments
- Preserve content structure while removing comment artifacts

This prevents HTML comments (including TODO notes, analytics codes, and other internal annotations) from appearing in published feeds.

Co-Authored-By: Claude <<EMAIL_ADDRESS>>
1 parent 8b37500 commit a4b21c1

File tree

1 file changed

+32
-10
lines changed

1 file changed

+32
-10
lines changed

mkdocs_rss_plugin/util.py

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
from functools import lru_cache
1212
from mimetypes import guess_type
1313
from pathlib import Path
14+
from re import DOTALL
15+
from re import sub as re_sub
1416
from typing import Any, Literal
1517
from urllib.parse import urlencode, urlparse, urlunparse
1618

@@ -486,6 +488,18 @@ def get_date_from_meta(
486488

487489
return out_date
488490

491+
@staticmethod
def strip_html_comments(html_content: str) -> str:
    """Remove HTML comments from content.

    Every ``<!-- ... -->`` span is deleted, including comments that run
    over several lines. Text around a comment is left untouched, so any
    whitespace that surrounded the comment stays in the output.

    Args:
        html_content (str): HTML content potentially containing comments

    Returns:
        str: HTML content with comments removed. A value that is not a
            string (for example ``None``) is returned unchanged instead of
            raising.
    """
    # This helper is applied to explicit descriptions, which presumably come
    # from page metadata and are therefore not guaranteed to be text. Hand
    # non-string values back untouched rather than letting the regex call
    # fail with a TypeError.
    if not isinstance(html_content, str):
        return html_content

    # Non-greedy match so each comment stops at its own "-->"; DOTALL lets
    # "." cross newlines so multiline comments are removed as well.
    return re_sub(r"<!--.*?-->", "", html_content, flags=DOTALL)
502+
489503
def get_description_or_abstract(
490504
self,
491505
in_page: Page,
@@ -514,12 +528,14 @@ def get_description_or_abstract(
514528
# If the full page is wanted (unlimited chars count)
515529
if chars_count == -1 and (in_page.content or in_page.markdown):
516530
if in_page.content:
517-
return in_page.content
531+
return self.strip_html_comments(in_page.content)
518532
else:
519-
return markdown.markdown(in_page.markdown, output_format="html5")
533+
return self.strip_html_comments(
534+
markdown.markdown(in_page.markdown, output_format="html5")
535+
)
520536
# If the description is explicitly given
521537
elif description:
522-
return description
538+
return self.strip_html_comments(description)
523539
# If the abstract is cut by the delimiter
524540
elif (
525541
abstract_delimiter
@@ -528,18 +544,24 @@ def get_description_or_abstract(
528544
)
529545
> -1
530546
):
531-
return markdown.markdown(
532-
in_page.markdown[:excerpt_separator_position],
533-
output_format="html5",
547+
return self.strip_html_comments(
548+
markdown.markdown(
549+
in_page.markdown[:excerpt_separator_position],
550+
output_format="html5",
551+
)
534552
)
535553
# Use first chars_count from the markdown
536554
elif chars_count > 0 and in_page.markdown:
537555
if len(in_page.markdown) <= chars_count:
538-
return markdown.markdown(in_page.markdown, output_format="html5")
556+
return self.strip_html_comments(
557+
markdown.markdown(in_page.markdown, output_format="html5")
558+
)
539559
else:
540-
return markdown.markdown(
541-
f"{in_page.markdown[: chars_count - 3]}...",
542-
output_format="html5",
560+
return self.strip_html_comments(
561+
markdown.markdown(
562+
f"{in_page.markdown[: chars_count - 3]}...",
563+
output_format="html5",
564+
)
543565
)
544566
# No explicit description and no (or empty) abstract found
545567
else:

0 commit comments

Comments
 (0)