feat: add staging brick to clean non-ascii characters from unicode (#366)

natygyoon · web-flow · commit e0eb66de529c · 2023-03-14T21:31:51.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.5.5-dev0
+
+### Enhancements
+
+### Features
+
+* Add `clean_non_ascii_chars` to remove non-ascii characters from a unicode string
+
+### Fixes
+
 ## 0.5.4
 
 ### Enhancements
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -613,6 +613,23 @@ Examples:
   clean_postfix(text, r"(END|STOP)", ignore_case=True)
 
 
+``clean_non_ascii_chars``
+-------------------------
+
+Removes non-ascii characters from a string.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.cleaners.core import clean_non_ascii_chars
+
+  text = "\x88This text contains®non-ascii characters!●"
+
+  # Returns "This text containsnon-ascii characters!"
+  clean_non_ascii_chars(text)
+
+
 ``extract_text_before``
 -----------------------
 
diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
@@ -3,6 +3,24 @@
 from unstructured.cleaners import core
 
 
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        (
+            "\x88This text contains non-ascii characters!\x88",
+            "This text contains non-ascii characters!",
+        ),
+        ("\x93A lovely quote!\x94", "A lovely quote!"),
+        ("● An excellent point! ●●●", " An excellent point! "),
+        ("Item\xa01A", "Item1A"),
+        ("Our dog&apos;s bowl.", "Our dog&apos;s bowl."),
+        ("5 w=E2=80=99s", "5 w=E2=80=99s"),
+    ],
+)
+def test_clean_non_ascii_chars(text, expected):
+    assert core.clean_non_ascii_chars(text) == expected
+
+
 @pytest.mark.parametrize(
     ("text", "expected"),
     [
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.4"  # pragma: no cover
+__version__ = "0.5.5-dev0"  # pragma: no cover
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
@@ -6,6 +6,18 @@
 from unstructured.nlp.patterns import UNICODE_BULLETS_RE
 
 
+def clean_non_ascii_chars(text) -> str:
+    """Cleans non-ascii characters from unicode string.
+
+    Example
+    -------
+    \x88This text contains non-ascii characters!\x88
+        -> This text contains non-ascii characters!
+    """
+    en = text.encode("ascii", "ignore")
+    return en.decode()
+
+
 def clean_bullets(text) -> str:
     """Cleans unicode bullets from a section of text.
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-__version__ = "0.5.4"  # pragma: no cover` |
| | `1` | `+__version__ = "0.5.5-dev0"  # pragma: no cover` |