File tree Expand file tree Collapse file tree 5 files changed +58
-1
lines changed
test_unstructured/cleaners Expand file tree Collapse file tree 5 files changed +58
-1
lines changed Original file line number Diff line number Diff line change 1+ ## 0.5.5-dev0
2+
3+ ### Enhancements
4+
5+ ### Features
6+
7+ * Add `clean_non_ascii_chars` to remove non-ascii characters from unicode string
8+
9+ ### Fixes
10+
111## 0.5.4
212
313### Enhancements
Original file line number Diff line number Diff line change @@ -613,6 +613,23 @@ Examples:
613613 clean_postfix(text, r"(END|STOP)", ignore_case=True)
614614
615615
616+ ``clean_non_ascii_chars``
617+ -------------------------
618+
619+ Removes non-ascii characters from a string.
620+
621+ Examples:
622+
623+ .. code:: python
624+
625+ from unstructured.cleaners.core import clean_non_ascii_chars
626+
627+ text = "\x88This text contains®non-ascii characters!●"
628+
629+ # Returns "This text containsnon-ascii characters!"
630+ clean_non_ascii_chars(text)
631+
632+
616633 ``extract_text_before``
617634-----------------------
618635
Original file line number Diff line number Diff line change 33from unstructured .cleaners import core
44
55
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Non-ascii control characters at both ends are dropped.
        (
            "\x88This text contains non-ascii characters!\x88",
            "This text contains non-ascii characters!",
        ),
        ("\x93A lovely quote!\x94", "A lovely quote!"),
        # Bullets are removed but the surrounding ascii spaces are kept.
        ("● An excellent point! ●●●", " An excellent point! "),
        # A non-breaking space is non-ascii, so it is removed, not replaced.
        ("Item\xa01A", "Item1A"),
        # Pure-ascii input must come back unchanged.
        ("Our dog's bowl.", "Our dog's bowl."),
        ("5 w=E2=80=99s", "5 w=E2=80=99s"),
    ],
)
def test_clean_non_ascii_chars(text, expected):
    """Checks that non-ascii characters are stripped and ascii text is kept."""
    assert core.clean_non_ascii_chars(text) == expected
22+
23+
624@pytest .mark .parametrize (
725 ("text" , "expected" ),
826 [
Original file line number Diff line number Diff line change 1- __version__ = "0.5.4 " # pragma: no cover
1+ __version__ = "0.5.5-dev0 " # pragma: no cover
Original file line number Diff line number Diff line change 66from unstructured .nlp .patterns import UNICODE_BULLETS_RE
77
88
def clean_non_ascii_chars(text: str) -> str:
    r"""Cleans non-ascii characters from unicode string.

    Every character outside the ascii range is dropped; nothing is
    substituted for it, and ascii characters (including whitespace) are
    returned unchanged.

    Parameters
    ----------
    text
        The unicode string to clean.

    Returns
    -------
    str
        ``text`` with all non-ascii characters removed.

    Example
    -------
    \x88This text contains non-ascii characters!\x88
        -> This text contains non-ascii characters!
    """
    # errors="ignore" makes the encoder skip any character it cannot
    # represent instead of raising UnicodeEncodeError.
    ascii_bytes = text.encode("ascii", errors="ignore")
    # Only ascii bytes remain, so decoding with the same codec cannot fail.
    return ascii_bytes.decode("ascii")
19+
20+
921def clean_bullets (text ) -> str :
1022 """Cleans unicode bullets from a section of text.
1123
You can’t perform that action at this time.
0 commit comments