v0.75055 - html sanitizer fixes; beautifulsoup4

FlyingFathead · FlyingFathead · commit 8c5d8d869a1a · 2024-10-21T18:49:59.000+03:00
diff --git a/README.md b/README.md
@@ -236,6 +236,7 @@ If you run into any issues, consult the logs or reach out on the repository's [I
 ---
 
 # Changelog
+- v0.75055 - fixes to the HTML sanitizer (for Telegram's API; better handling of malformed HTML); now uses BeautifulSoup4 for parsing
 - v0.75054 - small fixes and more error catching in `calc_module.py`
 - v0.75053 - only include eligible territories in U.S. NWS queries
   - list of queried / eligible territories can be set in `config.ini` under the `NWS` section
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
+beautifulsoup4>=4.12.3
 configparser>=6.0.0
 elastic-transport>=8.15.0
 elasticsearch>=8.15.1
diff --git a/src/main.py b/src/main.py
@@ -8,7 +8,7 @@
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # version of this program
-version_number = "0.75054"
+version_number = "0.75055"
 
 # Add the project root directory to Python's path
 import sys
diff --git a/src/text_message_handler.py b/src/text_message_handler.py
@@ -19,6 +19,7 @@
 from utils import holiday_replacements
 import holidays
 import pytz
+from bs4 import BeautifulSoup
 
 from telegram import Update
 from telegram.ext import CallbackContext
@@ -607,13 +608,14 @@ async def handle_message(bot, update: Update, context: CallbackContext, logger)
                         # Ensure the bot has a substantive response to send
                         if bot_reply:
                             # Function to clean unsupported tags
-                            def sanitize_html(content):
-                                # Remove unsupported HTML tags
-                                for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
-                                    content = content.replace(tag, '')
-                                # Optionally: Replace line breaks with "\n" to preserve formatting
-                                content = content.replace('<br>', '\n').replace('<br/>', '\n')
-                                return content
+                            # # // old method
+                            # def sanitize_html(content):
+                            #     # Remove unsupported HTML tags
+                            #     for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
+                            #         content = content.replace(tag, '')
+                            #     # Optionally: Replace line breaks with "\n" to preserve formatting
+                            #     content = content.replace('<br>', '\n').replace('<br/>', '\n')
+                            #     return content
 
                             # Convert markdown to HTML
                             escaped_reply = markdown_to_html(bot_reply)
@@ -679,13 +681,13 @@ def sanitize_html(content):
                         # Ensure the bot has a substantive response to send
                         if bot_reply:
                             # Function to clean unsupported tags
-                            def sanitize_html(content):
-                                # Remove unsupported HTML tags
-                                for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
-                                    content = content.replace(tag, '')
-                                # Optionally: Replace line breaks with "\n" to preserve formatting
-                                content = content.replace('<br>', '\n').replace('<br/>', '\n')
-                                return content
+                            # def sanitize_html(content):
+                            #     # Remove unsupported HTML tags
+                            #     for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
+                            #         content = content.replace(tag, '')
+                            #     # Optionally: Replace line breaks with "\n" to preserve formatting
+                            #     content = content.replace('<br>', '\n').replace('<br/>', '\n')
+                            #     return content
 
                             # Convert markdown to HTML
                             escaped_reply = markdown_to_html(bot_reply)
@@ -1019,6 +1021,8 @@ def sanitize_html(content):
                 #     parse_mode=ParseMode.HTML
                 # )
 
+                escaped_reply = sanitize_html(escaped_reply)
+
                 message_parts = split_message(escaped_reply)
 
                 for part in message_parts:
@@ -1280,6 +1284,19 @@ def split_message(message, max_length=4000):
 
     return message_parts
 
+# Sanitize an HTML string: parse it with BeautifulSoup and strip every tag that is not in the allow-list below.
+def sanitize_html(content):
+    soup = BeautifulSoup(content, 'html.parser')
+
+    # Unwrap (remove the tag itself, keep its children/text) every element whose name is not allow-listed.
+    for tag in soup.find_all():
+        if tag.name not in ['b', 'i', 'u', 's', 'a', 'code', 'pre']:
+            tag.unwrap()
+
+    # Serialize the tree back to a string. NOTE(review): any repair of malformed or mis-nested markup comes from this parse/serialize round trip, not from explicit logic here -- confirm the output satisfies Telegram.
+    content = str(soup)
+    return content
+
 # # // (old request type)
 # async def make_api_request(bot, chat_history, timeout=30):
 #     # Prepare the payload for the API request with updated chat history

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+beautifulsoup4>=4.12.3`
`1`	`2`	`configparser>=6.0.0`
`2`	`3`	`elastic-transport>=8.15.0`
`3`	`4`	`elasticsearch>=8.15.1`
Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
`9`	`9`	`#`
`10`	`10`	`# version of this program`
`11`		`-version_number = "0.75054"`
	`11`	`+version_number = "0.75055"`
`12`	`12`
`13`	`13`	`# Add the project root directory to Python's path`
`14`	`14`	`import sys`