Skip to content

Commit 5eb1466

Browse files
authored
Resolve various style issues to improve overall code quality (#282)
* Apply import sorting: `ruff . --select I --fix`
* Remove unnecessary open mode parameter: `ruff . --select UP015 --fix`
* Use f-string formatting rather than `.format`
* Remove extraneous parentheses. Also use `""` instead of `str()`
* Resolve missing trailing commas: `ruff . --select COM --fix`
* Rewrite `list()` and `dict()` calls using literals: `ruff . --select C4 --fix`
* Add `()` to `pytest.fixture`, use tuples for parametrize, etc.: `ruff . --select PT --fix`
* Simplify code (merge conditionals, context managers): `ruff . --select SIM --fix`
* Import without unnecessary alias: `ruff . --select PLR0402 --fix`
* Apply formatting via black
* Rewrite ValueError somewhat. Slightly unrelated to the rest of the PR
* Apply formatting to tests via black
* Update expected exception message to match 0d81564
* Satisfy E501 (line too long) in test
* Update changelog & version
* Add ruff to `make tidy` and test deps
* Run 'make tidy'
* Update changelog & version
* Update changelog & version
* Add ruff to 'check' target. Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a `noqa: SIM115`, but especially the one in `__init__` may need some attention. That said, that refactor is out of scope of this PR.
1 parent 5db94fd commit 5eb1466

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

74 files changed

+601
-473
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.4.16-dev3
1+
## 0.4.16-dev4
22

33
### Enhancements
44

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ check: check-src check-tests check-version
116116
## check-src: runs linters (source only, no tests)
117117
.PHONY: check-src
118118
check-src:
119+
ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --ignore PT011,PT012,SIM117
119120
black --line-length 100 ${PACKAGE_NAME} --check
120121
flake8 ${PACKAGE_NAME}
121122
mypy ${PACKAGE_NAME} --ignore-missing-imports --check-untyped-defs
@@ -140,6 +141,7 @@ check-version:
140141
## tidy: run black
141142
.PHONY: tidy
142143
tidy:
144+
ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --fix-only || true
143145
black --line-length 100 ${PACKAGE_NAME}
144146
black --line-length 100 test_${PACKAGE_NAME}
145147

examples/sec-sentiment-analysis/fetch.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22
import json
33
import os
44
import re
5-
import requests
6-
from typing import Final, List, Optional, Tuple, Union
75
import webbrowser
6+
from typing import Final, List, Optional, Tuple, Union
87

8+
import requests
99
from ratelimit import limits, sleep_and_retry
1010

1111
SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
@@ -23,7 +23,7 @@
2323

2424

2525
def get_filing(
26-
cik: Union[str, int], accession_number: Union[str, int], company: str, email: str
26+
cik: Union[str, int], accession_number: Union[str, int], company: str, email: str,
2727
) -> str:
2828
"""Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate
2929
limits specified on the SEC website.
@@ -35,7 +35,7 @@ def get_filing(
3535
@sleep_and_retry
3636
@limits(calls=10, period=1)
3737
def _get_filing(
38-
session: requests.Session, cik: Union[str, int], accession_number: Union[str, int]
38+
session: requests.Session, cik: Union[str, int], accession_number: Union[str, int],
3939
) -> str:
4040
"""Wrapped so filings can be retrieved with an existing session."""
4141
url = archive_url(cik, accession_number)
@@ -70,7 +70,7 @@ def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict:
7070

7171

7272
def _get_recent_acc_num_by_cik(
73-
session: requests.Session, cik: Union[str, int], form_types: List[str]
73+
session: requests.Session, cik: Union[str, int], form_types: List[str],
7474
) -> Tuple[str, str]:
7575
"""Returns accession number and form type for the most recent filing for one of the
7676
given form_types (AKA filing types) for a given cik."""
@@ -120,7 +120,7 @@ def get_form_by_ticker(
120120
session = _get_session(company, email)
121121
cik = get_cik_by_ticker(session, ticker)
122122
return get_form_by_cik(
123-
cik, form_type, allow_amended_filing=allow_amended_filing, company=company, email=email
123+
cik, form_type, allow_amended_filing=allow_amended_filing, company=company, email=email,
124124
)
125125

126126

@@ -148,7 +148,7 @@ def get_form_by_cik(
148148
"""
149149
session = _get_session(company, email)
150150
acc_num, _ = _get_recent_acc_num_by_cik(
151-
session, cik, _form_types(form_type, allow_amended_filing)
151+
session, cik, _form_types(form_type, allow_amended_filing),
152152
)
153153
text = _get_filing(session, cik, acc_num)
154154
return text
@@ -173,7 +173,7 @@ def open_form_by_ticker(
173173
session = _get_session(company, email)
174174
cik = get_cik_by_ticker(session, ticker)
175175
acc_num, _ = _get_recent_acc_num_by_cik(
176-
session, cik, _form_types(form_type, allow_amended_filing)
176+
session, cik, _form_types(form_type, allow_amended_filing),
177177
)
178178
open_form(cik, acc_num)
179179

@@ -219,7 +219,7 @@ def _get_session(company: Optional[str] = None, email: Optional[str] = None) ->
219219
{
220220
"User-Agent": f"{company} {email}",
221221
"Content-Type": "text/html",
222-
}
222+
},
223223
)
224224
return session
225225

requirements/test.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ pytest-cov
1313
label_studio_sdk==0.0.17
1414
types-requests
1515
vcrpy
16+
ruff
1617

1718
# NOTE(robinson) - The following pins are to address
1819
# vulnerabilities in dependency scans

requirements/test.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ click==8.1.3
1818
# via
1919
# -r requirements/test.in
2020
# black
21+
colorama==0.4.6
22+
# via
23+
# click
24+
# pytest
2125
coverage[toml]==7.1.0
2226
# via
2327
# -r requirements/test.in
@@ -70,6 +74,8 @@ pyyaml==6.0
7074
# via vcrpy
7175
requests==2.28.2
7276
# via label-studio-sdk
77+
ruff==0.0.252
78+
# via -r requirements/test.in
7379
six==1.16.0
7480
# via vcrpy
7581
tomli==2.0.1

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@
1717
See the License for the specific language governing permissions and
1818
limitations under the License.
1919
"""
20-
from setuptools import setup, find_packages
20+
from setuptools import find_packages, setup
2121

2222
from unstructured.__version__ import __version__
2323

2424
setup(
2525
name="unstructured",
2626
description="A library that prepares raw documents for downstream ML tasks.",
27-
long_description=open("README.md", "r", encoding="utf-8").read(),
27+
long_description=open("README.md", encoding="utf-8").read(), # noqa: SIM115
2828
long_description_content_type="text/markdown",
2929
keywords="NLP PDF HTML CV XML parsing preprocessing",
3030
url="https://github.com/Unstructured-IO/unstructured",

test_unstructured/cleaners/test_core.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
import pytest
22

3-
import unstructured.cleaners.core as core
3+
from unstructured.cleaners import core
44

55

66
@pytest.mark.parametrize(
7-
"text, expected",
7+
("text", "expected"),
88
[
99
("● An excellent point!", "An excellent point!"),
1010
("● An excellent point! ●●●", "An excellent point! ●●●"),
@@ -18,7 +18,7 @@ def test_clean_bullets(text, expected):
1818

1919

2020
@pytest.mark.parametrize(
21-
"text, expected",
21+
("text", "expected"),
2222
[
2323
("1. Introduction:", "Introduction:"),
2424
("a. Introduction:", "Introduction:"),
@@ -43,7 +43,7 @@ def test_clean_ordered_bullets(text, expected):
4343

4444

4545
@pytest.mark.parametrize(
46-
"text, expected",
46+
("text", "expected"),
4747
[
4848
("\x93A lovely quote!\x94", "“A lovely quote!”"),
4949
("\x91A lovely quote!\x92", "‘A lovely quote!’"),
@@ -55,15 +55,15 @@ def test_replace_unicode_quotes(text, expected):
5555

5656

5757
@pytest.mark.parametrize(
58-
"text, expected",
58+
("text", "expected"),
5959
[("5 w=E2=80=99s", "5 w’s")],
6060
)
6161
def test_replace_mime_encodings(text, expected):
6262
assert core.replace_mime_encodings(text=text) == expected
6363

6464

6565
@pytest.mark.parametrize(
66-
"text, expected",
66+
("text", "expected"),
6767
[
6868
("“A lovely quote!”", "A lovely quote"),
6969
("‘A lovely quote!’", "A lovely quote"),
@@ -75,7 +75,7 @@ def test_remove_punctuation(text, expected):
7575

7676

7777
@pytest.mark.parametrize(
78-
"text, expected",
78+
("text", "expected"),
7979
[
8080
("RISK\n\nFACTORS", "RISK FACTORS"),
8181
("Item\xa01A", "Item 1A"),
@@ -89,7 +89,7 @@ def test_clean_extra_whitespace(text, expected):
8989

9090

9191
@pytest.mark.parametrize(
92-
"text, expected",
92+
("text", "expected"),
9393
[
9494
("Risk-factors", "Risk factors"),
9595
("Risk – factors", "Risk factors"),
@@ -103,7 +103,7 @@ def test_clean_dashes(text, expected):
103103

104104

105105
@pytest.mark.parametrize(
106-
"text, expected",
106+
("text", "expected"),
107107
[
108108
("Item 1A:", "Item 1A"),
109109
("Item 1A;", "Item 1A"),
@@ -118,7 +118,7 @@ def test_clean_trailing_punctuation(text, expected):
118118

119119

120120
@pytest.mark.parametrize(
121-
"text, pattern, ignore_case, strip, expected",
121+
("text", "pattern", "ignore_case", "strip", "expected"),
122122
[
123123
("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
124124
("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
@@ -131,7 +131,7 @@ def test_clean_prefix(text, pattern, ignore_case, strip, expected):
131131

132132

133133
@pytest.mark.parametrize(
134-
"text, pattern, ignore_case, strip, expected",
134+
("text", "pattern", "ignore_case", "strip", "expected"),
135135
[
136136
("The END! END", r"(END|STOP)", False, True, "The END!"),
137137
("The END! STOP", r"(END|STOP)", False, True, "The END!"),
@@ -145,7 +145,15 @@ def test_clean_postfix(text, pattern, ignore_case, strip, expected):
145145

146146
@pytest.mark.parametrize(
147147
# NOTE(yuming): Tests combined cleaners
148-
"text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected",
148+
(
149+
"text",
150+
"extra_whitespace",
151+
"dashes",
152+
"bullets",
153+
"lowercase",
154+
"trailing_punctuation",
155+
"expected",
156+
),
149157
[
150158
(" Risk-factors ", True, True, False, False, False, "Risk factors"),
151159
("● Point! ●●● ", True, False, True, False, False, "Point! ●●●"),

test_unstructured/cleaners/test_extract.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
import pytest
21
import datetime
32

4-
import unstructured.cleaners.extract as extract
3+
import pytest
4+
5+
from unstructured.cleaners import extract
56

67
EMAIL_META_DATA_INPUT = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
78
\n ABC.DEF.local ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
@@ -53,12 +54,18 @@ def test_extract_mapi_id():
5354

5455
def test_extract_datetimetz():
5556
assert extract.extract_datetimetz(EMAIL_META_DATA_INPUT) == datetime.datetime(
56-
2021, 3, 26, 11, 4, 9, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200))
57+
2021,
58+
3,
59+
26,
60+
11,
61+
4,
62+
9,
63+
tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)),
5764
)
5865

5966

6067
@pytest.mark.parametrize(
61-
"text, expected",
68+
("text", "expected"),
6269
[
6370
("215-867-5309", "215-867-5309"),
6471
("Phone Number: +1 215.867.5309", "+1 215.867.5309"),
@@ -71,7 +78,7 @@ def test_extract_us_phone_number(text, expected):
7178

7279

7380
@pytest.mark.parametrize(
74-
"text, expected",
81+
("text", "expected"),
7582
[
7683
("1. Introduction:", ("1", None, None)),
7784
("a. Introduction:", ("a", None, None)),

test_unstructured/cleaners/test_translate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import pytest
22

3-
import unstructured.cleaners.translate as translate
3+
from unstructured.cleaners import translate
44

55

66
def test_get_opus_mt_model_name():

test_unstructured/documents/test_base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pytest
2+
23
from unstructured.documents.base import Document, Page
34
from unstructured.documents.elements import NarrativeText, Title
45

0 commit comments

Comments
 (0)