Skip to content

Commit 75b94c6

Browse files
committed
refactor: clean_html() to improve HTML sanitization
1 parent d06d7de commit 75b94c6

File tree

2 files changed

+23
-17
lines changed

2 files changed

+23
-17
lines changed

course_discovery/apps/course_metadata/tests/test_utils.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -880,10 +880,20 @@ class UtilsTests(TestCase):
880880
'<p>Some text</p>\n<p>· Item 1</p>\n<ul>\n<li>Item 2</li>\n</ul>\n<p>Regular paragraph</p>\n<p>· Item 3</p>'
881881
)
882882
)
883+
@ddt.data(
884+
(
885+
'<p><em>The content of this course also forms part of the six-month online<a href="https://example.com">Example Link</a></em></p>', # pylint: disable=line-too-long
886+
'<p><em>The content of this course also forms part of the six-month online <a href="https://example.com">Example Link</a></em></p>' # pylint: disable=line-too-long
887+
),
888+
(
889+
'<div><p>online course.</p><p><strong>Module 1:</strong></p></div>',
890+
'<p>online course. <strong>Module 1:</strong></p>'
891+
)
892+
)
883893
@ddt.unpack
884894
def test_clean_html(self, content, expected):
885-
""" Verify the method removes unnecessary HTML attributes. """
886-
assert clean_html(content) == expected
895+
result = clean_html(content)
896+
assert result == expected, f"\nExpected:\n{expected}\nGot:\n{result}"
887897

888898
def test_skill_data_transformation(self):
889899
category_data = {

course_discovery/apps/course_metadata/utils.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -740,7 +740,6 @@ def handle_tag(self, tag, attrs, start):
740740
"""
741741
if not self.is_p_tag_with_dir:
742742
super().handle_tag(tag, attrs, start)
743-
744743
elif tag not in HTML_TAGS_ATTRIBUTE_WHITELIST and tag != 'span':
745744
if start:
746745
self.outtextf(f'<{tag}')
@@ -773,28 +772,25 @@ def clean_html(content):
773772
(indicating right-to-left direction), this method will ensure that the 'dir' attribute is preserved
774773
or added to maintain consistency with the original content.
775774
"""
775+
if not content:
776+
return ''
776777
LIST_TAGS = ['ul', 'ol']
777778
is_list_with_dir_attr_present = False
778-
779-
cleaned = content.replace('&nbsp;', '') # Keeping the removal of nbsps for historical consistency
780-
# Parse the HTML using BeautifulSoup
779+
cleaned = content.replace('&nbsp;', '')
781780
soup = BeautifulSoup(cleaned, 'lxml')
782-
783781
for tag in soup.find_all(LIST_TAGS, dir="rtl"):
784782
tag.attrs.pop('dir')
785783
is_list_with_dir_attr_present = True
786-
787-
cleaned = str(soup)
788-
# Need to clean empty <b> and <p> tags which are converted to <hr/> by html2text
789-
cleaned = cleaned.replace('<p><b></b></p>', '')
784+
cleaned = str(soup).replace('<p><b></b></p>', '')
790785
html_converter = HTML2TextWithLangSpans(bodywidth=None)
791786
html_converter.wrap_links = False
792-
cleaned = html_converter.handle(cleaned).strip()
793-
cleaned = markdown.markdown(cleaned)
794-
for tag in LIST_TAGS:
795-
cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">') if is_list_with_dir_attr_present else cleaned
796-
797-
return cleaned
787+
markdown_text = html_converter.handle(cleaned).strip()
788+
cleaned = markdown.markdown(markdown_text)
789+
cleaned = re.sub(r'([^\s>])\s*(<a\b)', r'\1 \2', cleaned)
790+
if is_list_with_dir_attr_present:
791+
for tag in LIST_TAGS:
792+
cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">')
793+
return cleaned.strip()
798794

799795

800796
def get_file_from_drive_link(image_url):

0 commit comments

Comments
 (0)