feat: strip HTML comments from RSS feed descriptions

manuzhang · manuzhang · commit a4b21c17bd79 · 2026-01-22T23:53:35.000+08:00
Add automatic HTML comment removal from all feed item descriptions to
ensure clean content in RSS and JSON feeds. Comments are stripped from
all content sources including markdown, page.meta descriptions, and
full page content.

- Add strip_html_comments() static method to Util class
- Update get_description_or_abstract() to strip comments in all code paths
- Handle both inline and multiline HTML comments
- Preserve content structure while removing comment artifacts

This prevents HTML comments (including TODO notes, analytics codes,
and other internal annotations) from appearing in published feeds.

Co-Authored-By: Claude <<EMAIL_ADDRESS>>
diff --git a/mkdocs_rss_plugin/util.py b/mkdocs_rss_plugin/util.py
@@ -11,6 +11,8 @@
 from functools import lru_cache
 from mimetypes import guess_type
 from pathlib import Path
+from re import DOTALL
+from re import sub as re_sub
 from typing import Any, Literal
 from urllib.parse import urlencode, urlparse, urlunparse
 
@@ -486,6 +488,18 @@ def get_date_from_meta(
 
         return out_date
 
+    @staticmethod
+    def strip_html_comments(html_content: str) -> str:
+        """Return content with inline and multiline HTML comments removed.
+
+        Args:
+            html_content (str): HTML to clean. A non-str value is returned
+                unchanged, since the regex substitution would raise TypeError
+        """
+        if not isinstance(html_content, str):
+            return html_content
+        return re_sub(r"<!--.*?-->", "", html_content, flags=DOTALL)
+
     def get_description_or_abstract(
         self,
         in_page: Page,
@@ -514,12 +528,14 @@ def get_description_or_abstract(
         # If the full page is wanted (unlimited chars count)
         if chars_count == -1 and (in_page.content or in_page.markdown):
             if in_page.content:
-                return in_page.content
+                return self.strip_html_comments(in_page.content)
             else:
-                return markdown.markdown(in_page.markdown, output_format="html5")
+                return self.strip_html_comments(
+                    markdown.markdown(in_page.markdown, output_format="html5")
+                )
         # If the description is explicitly given
         elif description:
-            return description
+            return self.strip_html_comments(description)
         # If the abstract is cut by the delimiter
         elif (
             abstract_delimiter
@@ -528,18 +544,24 @@ def get_description_or_abstract(
             )
             > -1
         ):
-            return markdown.markdown(
-                in_page.markdown[:excerpt_separator_position],
-                output_format="html5",
+            return self.strip_html_comments(
+                markdown.markdown(
+                    in_page.markdown[:excerpt_separator_position],
+                    output_format="html5",
+                )
             )
         # Use first chars_count from the markdown
         elif chars_count > 0 and in_page.markdown:
             if len(in_page.markdown) <= chars_count:
-                return markdown.markdown(in_page.markdown, output_format="html5")
+                return self.strip_html_comments(
+                    markdown.markdown(in_page.markdown, output_format="html5")
+                )
             else:
-                return markdown.markdown(
-                    f"{in_page.markdown[: chars_count - 3]}...",
-                    output_format="html5",
+                return self.strip_html_comments(
+                    markdown.markdown(
+                        f"{in_page.markdown[: chars_count - 3]}...",
+                        output_format="html5",
+                    )
                 )
         # No explicit description and no (or empty) abstract found
         else: