From 3d40dd6c4b06f0245c34beefa4d0bbd3e3e201ad Mon Sep 17 00:00:00 2001 From: Carmine Sacco Date: Mon, 13 Oct 2025 22:21:46 +0200 Subject: [PATCH] fix: bug 4105 clean_bullets didn't remove en-dashes used as bullets --- CHANGELOG.md | 1 + test_unstructured/cleaners/test_core.py | 1 + unstructured/nlp/patterns.py | 1 + 3 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed763c8ba6..91c261b322 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ ### Features ### Fixes +- Fixed `clean_bullets` so it also removes en-dashes (–) used as bullets ## 0.18.15 diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index eec8edd2b9..f4fb66ca8b 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -30,6 +30,7 @@ def test_clean_non_ascii_chars(text, expected): ("● An excellent point! ●●●", "An excellent point! ●●●"), ("An excellent point!", "An excellent point!"), ("Morse code! ●●●", "Morse code! ●●●"), + ("– An excellent point!", "An excellent point!") ], ) def test_clean_bullets(text, expected): diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py index fe73c6c0dc..f7f2084dbc 100644 --- a/unstructured/nlp/patterns.py +++ b/unstructured/nlp/patterns.py @@ -56,6 +56,7 @@ r"\*", "\x95", "·", + "\u2013", ] BULLETS_PATTERN = "|".join(UNICODE_BULLETS) UNICODE_BULLETS_RE = re.compile(f"(?:{BULLETS_PATTERN})(?!{BULLETS_PATTERN})")