Unstructured-IO
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 1 addition & 1 deletion
diff --git a/Makefile b/Makefile
Lines changed: 2 additions & 0 deletions
diff --git a/examples/sec-sentiment-analysis/fetch.py b/examples/sec-sentiment-analysis/fetch.py
Lines changed: 9 additions & 9 deletions
diff --git a/requirements/test.in b/requirements/test.in
Lines changed: 1 addition & 0 deletions
diff --git a/requirements/test.txt b/requirements/test.txt
Lines changed: 6 additions & 0 deletions
diff --git a/setup.py b/setup.py
Lines changed: 2 additions & 2 deletions
diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
Lines changed: 20 additions & 12 deletions
diff --git a/test_unstructured/cleaners/test_extract.py b/test_unstructured/cleaners/test_extract.py
Lines changed: 12 additions & 5 deletions
diff --git a/test_unstructured/cleaners/test_translate.py b/test_unstructured/cleaners/test_translate.py
Lines changed: 1 addition & 1 deletion
diff --git a/test_unstructured/documents/test_base.py b/test_unstructured/documents/test_base.py
Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,4 @@
-## 0.4.16-dev3
+## 0.4.16-dev4
 
 ### Enhancements
 
 
@@ -116,6 +116,7 @@ check: check-src check-tests check-version
 ## check-src:               runs linters (source only, no tests)
 .PHONY: check-src
 check-src:
+	ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --ignore PT011,PT012,SIM117
 	black --line-length 100 ${PACKAGE_NAME} --check
 	flake8 ${PACKAGE_NAME}
 	mypy ${PACKAGE_NAME} --ignore-missing-imports --check-untyped-defs
@@ -140,6 +141,7 @@ check-version:
 ## tidy:                    run black
 .PHONY: tidy
 tidy:
+	ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --fix-only || true
 	black --line-length 100 ${PACKAGE_NAME}
 	black --line-length 100 test_${PACKAGE_NAME}
 
 
@@ -2,10 +2,10 @@
 import json
 import os
 import re
-import requests
-from typing import Final, List, Optional, Tuple, Union
 import webbrowser
+from typing import Final, List, Optional, Tuple, Union
 
+import requests
 from ratelimit import limits, sleep_and_retry
 
 SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
@@ -23,7 +23,7 @@
 
 
 def get_filing(
-    cik: Union[str, int], accession_number: Union[str, int], company: str, email: str
+    cik: Union[str, int], accession_number: Union[str, int], company: str, email: str,
 ) -> str:
     """Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate
     limits specified on the SEC website.
@@ -35,7 +35,7 @@ def get_filing(
 @sleep_and_retry
 @limits(calls=10, period=1)
 def _get_filing(
-    session: requests.Session, cik: Union[str, int], accession_number: Union[str, int]
+    session: requests.Session, cik: Union[str, int], accession_number: Union[str, int],
 ) -> str:
     """Wrapped so filings can be retrieved with an existing session."""
     url = archive_url(cik, accession_number)
@@ -70,7 +70,7 @@ def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict:
 
 
 def _get_recent_acc_num_by_cik(
-    session: requests.Session, cik: Union[str, int], form_types: List[str]
+    session: requests.Session, cik: Union[str, int], form_types: List[str],
 ) -> Tuple[str, str]:
     """Returns accession number and form type for the most recent filing for one of the
     given form_types (AKA filing types) for a given cik."""
@@ -120,7 +120,7 @@ def get_form_by_ticker(
     session = _get_session(company, email)
     cik = get_cik_by_ticker(session, ticker)
     return get_form_by_cik(
-        cik, form_type, allow_amended_filing=allow_amended_filing, company=company, email=email
+        cik, form_type, allow_amended_filing=allow_amended_filing, company=company, email=email,
     )
 
 
@@ -148,7 +148,7 @@ def get_form_by_cik(
     """
     session = _get_session(company, email)
     acc_num, _ = _get_recent_acc_num_by_cik(
-        session, cik, _form_types(form_type, allow_amended_filing)
+        session, cik, _form_types(form_type, allow_amended_filing),
     )
     text = _get_filing(session, cik, acc_num)
     return text
@@ -173,7 +173,7 @@ def open_form_by_ticker(
     session = _get_session(company, email)
     cik = get_cik_by_ticker(session, ticker)
     acc_num, _ = _get_recent_acc_num_by_cik(
-        session, cik, _form_types(form_type, allow_amended_filing)
+        session, cik, _form_types(form_type, allow_amended_filing),
     )
     open_form(cik, acc_num)
 
@@ -219,7 +219,7 @@ def _get_session(company: Optional[str] = None, email: Optional[str] = None) ->
         {
             "User-Agent": f"{company} {email}",
             "Content-Type": "text/html",
-        }
+        },
     )
     return session
 
 
@@ -13,6 +13,7 @@ pytest-cov
 label_studio_sdk==0.0.17
 types-requests
 vcrpy
+ruff
 
 # NOTE(robinson) - The following pins are to address
 # vulnerabilities in dependency scans
 
@@ -18,6 +18,10 @@ click==8.1.3
     # via
     #   -r requirements/test.in
     #   black
+colorama==0.4.6
+    # via
+    #   click
+    #   pytest
 coverage[toml]==7.1.0
     # via
     #   -r requirements/test.in
@@ -70,6 +74,8 @@ pyyaml==6.0
     # via vcrpy
 requests==2.28.2
     # via label-studio-sdk
+ruff==0.0.252
+    # via -r requirements/test.in
 six==1.16.0
     # via vcrpy
 tomli==2.0.1
 
@@ -17,14 +17,14 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-from setuptools import setup, find_packages
+from setuptools import find_packages, setup
 
 from unstructured.__version__ import __version__
 
 setup(
     name="unstructured",
     description="A library that prepares raw documents for downstream ML tasks.",
-    long_description=open("README.md", "r", encoding="utf-8").read(),
+    long_description=open("README.md", encoding="utf-8").read(),  # noqa: SIM115
     long_description_content_type="text/markdown",
     keywords="NLP PDF HTML CV XML parsing preprocessing",
     url="https://github.com/Unstructured-IO/unstructured",
 
@@ -1,10 +1,10 @@
 import pytest
 
-import unstructured.cleaners.core as core
+from unstructured.cleaners import core
 
 
 @pytest.mark.parametrize(
-    "text, expected",
+    ("text", "expected"),
     [
         ("● An excellent point!", "An excellent point!"),
         ("● An excellent point! ●●●", "An excellent point! ●●●"),
@@ -18,7 +18,7 @@ def test_clean_bullets(text, expected):
 
 
 @pytest.mark.parametrize(
-    "text, expected",
+    ("text", "expected"),
     [
         ("1. Introduction:", "Introduction:"),
         ("a. Introduction:", "Introduction:"),
@@ -43,7 +43,7 @@ def test_clean_ordered_bullets(text, expected):
 
 
 @pytest.mark.parametrize(
-    "text, expected",
+    ("text", "expected"),
     [
         ("\x93A lovely quote!\x94", "“A lovely quote!”"),
         ("\x91A lovely quote!\x92", "‘A lovely quote!’"),
@@ -55,15 +55,15 @@ def test_replace_unicode_quotes(text, expected):
 
 
 @pytest.mark.parametrize(
-    "text, expected",
+    ("text", "expected"),
     [("5 w=E2=80=99s", "5 w’s")],
 )
 def test_replace_mime_encodings(text, expected):
     assert core.replace_mime_encodings(text=text) == expected
 
 
 @pytest.mark.parametrize(
-    "text, expected",
+    ("text", "expected"),
     [
         ("“A lovely quote!”", "A lovely quote"),
         ("‘A lovely quote!’", "A lovely quote"),
@@ -75,7 +75,7 @@ def test_remove_punctuation(text, expected):
 
 
 @pytest.mark.parametrize(
-    "text, expected",
+    ("text", "expected"),
     [
         ("RISK\n\nFACTORS", "RISK FACTORS"),
         ("Item\xa01A", "Item 1A"),
@@ -89,7 +89,7 @@ def test_clean_extra_whitespace(text, expected):
 
 
 @pytest.mark.parametrize(
-    "text, expected",
+    ("text", "expected"),
     [
         ("Risk-factors", "Risk factors"),
         ("Risk – factors", "Risk   factors"),
@@ -103,7 +103,7 @@ def test_clean_dashes(text, expected):
 
 
 @pytest.mark.parametrize(
-    "text, expected",
+    ("text", "expected"),
     [
         ("Item 1A:", "Item 1A"),
         ("Item 1A;", "Item 1A"),
@@ -118,7 +118,7 @@ def test_clean_trailing_punctuation(text, expected):
 
 
 @pytest.mark.parametrize(
-    "text, pattern, ignore_case, strip, expected",
+    ("text", "pattern", "ignore_case", "strip", "expected"),
     [
         ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
         ("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
@@ -131,7 +131,7 @@ def test_clean_prefix(text, pattern, ignore_case, strip, expected):
 
 
 @pytest.mark.parametrize(
-    "text, pattern, ignore_case, strip, expected",
+    ("text", "pattern", "ignore_case", "strip", "expected"),
     [
         ("The END! END", r"(END|STOP)", False, True, "The END!"),
         ("The END! STOP", r"(END|STOP)", False, True, "The END!"),
@@ -145,7 +145,15 @@ def test_clean_postfix(text, pattern, ignore_case, strip, expected):
 
 @pytest.mark.parametrize(
     # NOTE(yuming): Tests combined cleaners
-    "text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected",
+    (
+        "text",
+        "extra_whitespace",
+        "dashes",
+        "bullets",
+        "lowercase",
+        "trailing_punctuation",
+        "expected",
+    ),
     [
         ("  Risk-factors ", True, True, False, False, False, "Risk factors"),
         ("● Point!  ●●● ", True, False, True, False, False, "Point! ●●●"),
 
@@ -1,7 +1,8 @@
-import pytest
 import datetime
 
-import unstructured.cleaners.extract as extract
+import pytest
+
+from unstructured.cleaners import extract
 
 EMAIL_META_DATA_INPUT = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
     \n ABC.DEF.local ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
@@ -53,12 +54,18 @@ def test_extract_mapi_id():
 
 def test_extract_datetimetz():
     assert extract.extract_datetimetz(EMAIL_META_DATA_INPUT) == datetime.datetime(
-        2021, 3, 26, 11, 4, 9, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200))
+        2021,
+        3,
+        26,
+        11,
+        4,
+        9,
+        tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)),
     )
 
 
 @pytest.mark.parametrize(
-    "text, expected",
+    ("text", "expected"),
     [
         ("215-867-5309", "215-867-5309"),
         ("Phone Number: +1 215.867.5309", "+1 215.867.5309"),
@@ -71,7 +78,7 @@ def test_extract_us_phone_number(text, expected):
 
 
 @pytest.mark.parametrize(
-    "text, expected",
+    ("text", "expected"),
     [
         ("1. Introduction:", ("1", None, None)),
         ("a. Introduction:", ("a", None, None)),
 
@@ -1,6 +1,6 @@
 import pytest
 
-import unstructured.cleaners.translate as translate
+from unstructured.cleaners import translate
 
 
 def test_get_opus_mt_model_name():
 
@@ -1,4 +1,5 @@
 import pytest
+
 from unstructured.documents.base import Document, Page
 from unstructured.documents.elements import NarrativeText, Title
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-## 0.4.16-dev3`
	`1`	`+## 0.4.16-dev4`
`2`	`2`
`3`	`3`	`### Enhancements`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`import pytest`
	`2`	`+`
`2`	`3`	`from unstructured.documents.base import Document, Page`
`3`	`4`	`from unstructured.documents.elements import NarrativeText, Title`
`4`	`5`