refactor: improve HTML sanitization in clean_html()

nbalne · nbalne · commit 75b94c6b18c2 · 2025-09-08T07:02:32.000Z
diff --git a/course_discovery/apps/course_metadata/tests/test_utils.py b/course_discovery/apps/course_metadata/tests/test_utils.py
@@ -880,10 +880,20 @@ class UtilsTests(TestCase):
             '<p>Some text</p>\n<p>· Item 1</p>\n<ul>\n<li>Item 2</li>\n</ul>\n<p>Regular paragraph</p>\n<p>· Item 3</p>'
         )
     )
+    @ddt.data(
+        (
+            '<p><em>The content of this course also forms part of the six-month online<a href="https://example.com">Example Link</a></em></p>',  # pylint: disable=line-too-long
+            '<p><em>The content of this course also forms part of the six-month online <a href="https://example.com">Example Link</a></em></p>'  # pylint: disable=line-too-long
+        ),
+        (
+            '<div><p>online course.</p><p><strong>Module 1:</strong></p></div>',
+            '<p>online course. <strong>Module 1:</strong></p>'
+        )
+    )
     @ddt.unpack
     def test_clean_html(self, content, expected):
-        """ Verify the method removes unnecessary HTML attributes. """
-        assert clean_html(content) == expected
+        result = clean_html(content)
+        assert result == expected, f"\nExpected:\n{expected}\nGot:\n{result}"
 
     def test_skill_data_transformation(self):
         category_data = {
diff --git a/course_discovery/apps/course_metadata/utils.py b/course_discovery/apps/course_metadata/utils.py
@@ -740,7 +740,6 @@ def handle_tag(self, tag, attrs, start):
         """
         if not self.is_p_tag_with_dir:
             super().handle_tag(tag, attrs, start)
-
         elif tag not in HTML_TAGS_ATTRIBUTE_WHITELIST and tag != 'span':
             if start:
                 self.outtextf(f'<{tag}')
@@ -773,28 +772,25 @@ def clean_html(content):
     (indicating right-to-left direction), this method will ensure that the 'dir' attribute is preserved
     or added to maintain consistency with the original content.
     """
+    if not content:
+        return ''
     LIST_TAGS = ['ul', 'ol']
     is_list_with_dir_attr_present = False
-
-    cleaned = content.replace('&nbsp;', '')  # Keeping the removal of nbsps for historical consistency
-    # Parse the HTML using BeautifulSoup
+    cleaned = content.replace('&nbsp;', '')
     soup = BeautifulSoup(cleaned, 'lxml')
-
     for tag in soup.find_all(LIST_TAGS, dir="rtl"):
         tag.attrs.pop('dir')
         is_list_with_dir_attr_present = True
-
-    cleaned = str(soup)
-    # Need to clean empty <b> and <p> tags which are converted to <hr/> by html2text
-    cleaned = cleaned.replace('<p><b></b></p>', '')
+    cleaned = str(soup).replace('<p><b></b></p>', '')
     html_converter = HTML2TextWithLangSpans(bodywidth=None)
     html_converter.wrap_links = False
-    cleaned = html_converter.handle(cleaned).strip()
-    cleaned = markdown.markdown(cleaned)
-    for tag in LIST_TAGS:
-        cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">') if is_list_with_dir_attr_present else cleaned
-
-    return cleaned
+    markdown_text = html_converter.handle(cleaned).strip()
+    cleaned = markdown.markdown(markdown_text)
+    cleaned = re.sub(r'([^\s>])\s*(<a\b)', r'\1 \2', cleaned)
+    if is_list_with_dir_attr_present:
+        for tag in LIST_TAGS:
+            cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">')
+    return cleaned.strip()
 
 
 def get_file_from_drive_link(image_url):