Skip to content

Commit e0eb66d

Browse files
authored
feat: add staging brick to clean non-ascii characters from unicode (#366)
1 parent edb847c commit e0eb66d

File tree

5 files changed

+58
-1
lines changed

5 files changed

+58
-1
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.5.5-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
* Add `clean_non_ascii_chars` to remove non-ASCII characters from a unicode string
8+
9+
### Fixes
10+
111
## 0.5.4
212

313
### Enhancements

docs/source/bricks.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,23 @@ Examples:
613613
clean_postfix(text, r"(END|STOP)", ignore_case=True)
614614
615615
616+
``clean_non_ascii_chars``
617+
-------------------------
618+
619+
Removes non-ascii characters from a string.
620+
621+
Examples:
622+
623+
.. code:: python
624+
625+
from unstructured.cleaners.core import clean_non_ascii_chars
626+
627+
text = "\x88This text contains®non-ascii characters!●"
628+
629+
# Returns "This text containsnon-ascii characters!"
630+
clean_non_ascii_chars(text)
631+
632+
616633
``extract_text_before``
617634
-----------------------
618635

test_unstructured/cleaners/test_core.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,24 @@
33
from unstructured.cleaners import core
44

55

6+
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "\x88This text contains non-ascii characters!\x88",
            "This text contains non-ascii characters!",
        ),
        ("\x93A lovely quote!\x94", "A lovely quote!"),
        ("● An excellent point! ●●●", " An excellent point! "),
        ("Item\xa01A", "Item1A"),
        ("Our dog's bowl.", "Our dog's bowl."),
        ("5 w=E2=80=99s", "5 w=E2=80=99s"),
        # Same input as the example in docs/source/bricks.rst: a non-ASCII
        # character between two words is dropped, not replaced by a space.
        (
            "\x88This text contains®non-ascii characters!●",
            "This text containsnon-ascii characters!",
        ),
        # Empty input must round-trip to an empty string.
        ("", ""),
    ],
)
def test_clean_non_ascii_chars(text, expected):
    """Check that non-ASCII characters are removed and ASCII text is untouched."""
    assert core.clean_non_ascii_chars(text) == expected
22+
23+
624
@pytest.mark.parametrize(
725
("text", "expected"),
826
[

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.4" # pragma: no cover
1+
__version__ = "0.5.5-dev0" # pragma: no cover

unstructured/cleaners/core.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,18 @@
66
from unstructured.nlp.patterns import UNICODE_BULLETS_RE
77

88

9+
def clean_non_ascii_chars(text: str) -> str:
    r"""Removes every non-ascii character from a unicode string.

    The text is encoded to ASCII with the ``"ignore"`` error handler, which
    drops each character that has no ASCII representation, and the resulting
    bytes are decoded back to ``str``. Characters are removed, not replaced,
    so no whitespace is inserted where they used to be.

    Parameters
    ----------
    text
        The string to clean.

    Returns
    -------
    The input string with all non-ascii characters removed.

    Example
    -------
    \x88This text contains non-ascii characters!\x88
        -> This text contains non-ascii characters!
    """
    # The docstring is a raw string so the ``\x88`` in the example is shown
    # literally instead of being stored as an invisible control character.
    ascii_bytes = text.encode("ascii", "ignore")
    # The bytes are pure ASCII by construction, so decoding cannot fail.
    return ascii_bytes.decode("ascii")
19+
20+
921
def clean_bullets(text) -> str:
1022
"""Cleans unicode bullets from a section of text.
1123

0 commit comments

Comments
 (0)