Skip to content

Commit 8c5d8d8

Browse files
committed
v0.75055 - html sanitizer fixes; beautifulsoup4
1 parent c3c6379 commit 8c5d8d8

File tree

4 files changed

+34
-15
lines changed

4 files changed

+34
-15
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ If you run into any issues, consult the logs or reach out on the repository's [I
236236
---
237237

238238
# Changelog
239+
- v0.75055 - fixes to the HTML sanitizer (for Telegram's API; better handling of malformed HTML); now uses BeautifulSoup4 for parsing
239240
- v0.75054 - small fixes and more error catching in `calc_module.py`
240241
- v0.75053 - only include eligible territories in U.S. NWS queries
241242
- list of queried / eligible territories can be set in `config.ini` under the `NWS` section

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
beautifulsoup4>=4.12.3
12
configparser>=6.0.0
23
elastic-transport>=8.15.0
34
elasticsearch>=8.15.1

src/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
99
#
1010
# version of this program
11-
version_number = "0.75054"
11+
version_number = "0.75055"
1212

1313
# Add the project root directory to Python's path
1414
import sys

src/text_message_handler.py

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from utils import holiday_replacements
2020
import holidays
2121
import pytz
22+
from bs4 import BeautifulSoup
2223

2324
from telegram import Update
2425
from telegram.ext import CallbackContext
@@ -607,13 +608,14 @@ async def handle_message(bot, update: Update, context: CallbackContext, logger)
607608
# Ensure the bot has a substantive response to send
608609
if bot_reply:
609610
# Function to clean unsupported tags
610-
def sanitize_html(content):
611-
# Remove unsupported HTML tags
612-
for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
613-
content = content.replace(tag, '')
614-
# Optionally: Replace line breaks with "\n" to preserve formatting
615-
content = content.replace('<br>', '\n').replace('<br/>', '\n')
616-
return content
611+
# # // old method
612+
# def sanitize_html(content):
613+
# # Remove unsupported HTML tags
614+
# for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
615+
# content = content.replace(tag, '')
616+
# # Optionally: Replace line breaks with "\n" to preserve formatting
617+
# content = content.replace('<br>', '\n').replace('<br/>', '\n')
618+
# return content
617619

618620
# Convert markdown to HTML
619621
escaped_reply = markdown_to_html(bot_reply)
@@ -679,13 +681,13 @@ def sanitize_html(content):
679681
# Ensure the bot has a substantive response to send
680682
if bot_reply:
681683
# Function to clean unsupported tags
682-
def sanitize_html(content):
683-
# Remove unsupported HTML tags
684-
for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
685-
content = content.replace(tag, '')
686-
# Optionally: Replace line breaks with "\n" to preserve formatting
687-
content = content.replace('<br>', '\n').replace('<br/>', '\n')
688-
return content
684+
# def sanitize_html(content):
685+
# # Remove unsupported HTML tags
686+
# for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
687+
# content = content.replace(tag, '')
688+
# # Optionally: Replace line breaks with "\n" to preserve formatting
689+
# content = content.replace('<br>', '\n').replace('<br/>', '\n')
690+
# return content
689691

690692
# Convert markdown to HTML
691693
escaped_reply = markdown_to_html(bot_reply)
@@ -1019,6 +1021,8 @@ def sanitize_html(content):
10191021
# parse_mode=ParseMode.HTML
10201022
# )
10211023

1024+
escaped_reply = sanitize_html(escaped_reply)
1025+
10221026
message_parts = split_message(escaped_reply)
10231027

10241028
for part in message_parts:
@@ -1280,6 +1284,19 @@ def split_message(message, max_length=4000):
12801284

12811285
return message_parts
12821286

1287+
# sanitize html
1288+
# Tags accepted by Telegram's HTML parse mode. `span` is handled separately
# because Telegram only accepts it in the form <span class="tg-spoiler">.
_TELEGRAM_ALLOWED_TAGS = frozenset({
    'b', 'strong', 'i', 'em', 'u', 'ins', 's', 'strike', 'del',
    'a', 'code', 'pre', 'blockquote', 'tg-spoiler', 'tg-emoji',
})


def sanitize_html(content):
    """Strip HTML tags that Telegram's HTML parse mode does not accept.

    The markup is parsed with BeautifulSoup's ``html.parser`` and then
    re-serialized, so the returned string is whatever the parser made of
    malformed input rather than the raw text.

    Args:
        content: HTML string to clean. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The HTML as a string in which:
        * tags in the Telegram whitelist are kept as-is,
        * ``<span>`` is kept only when its class list contains
          ``tg-spoiler``,
        * ``<br>`` is replaced by a newline so line breaks survive,
        * every other tag is unwrapped (tag removed, children/text kept).
    """
    if not content:
        return content

    soup = BeautifulSoup(content, 'html.parser')

    # find_all() returns a materialized result list, so editing the tree
    # inside the loop does not disturb the iteration.
    for tag in soup.find_all(True):
        if tag.name in _TELEGRAM_ALLOWED_TAGS:
            continue

        if tag.name == 'span' and 'tg-spoiler' in (tag.get('class') or []):
            continue

        if tag.name == 'br':
            # Unwrapping <br> would silently join the neighbouring lines.
            tag.replace_with('\n')
        else:
            tag.unwrap()

    # Serialize the cleaned tree back to markup.
    return str(soup)
1299+
12831300
# # // (old request type)
12841301
# async def make_api_request(bot, chat_history, timeout=30):
12851302
# # Prepare the payload for the API request with updated chat history

0 commit comments

Comments
 (0)