Skip to content

Commit 10f0d54

Browse files
authored
build: remove ruff version upper bound (#3829)
**Summary** Remove pin on `ruff` linter and fix the handful of lint errors a newer version catches.
1 parent b092fb7 commit 10f0d54

File tree

10 files changed

+81
-83
lines changed

10 files changed

+81
-83
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.16.12-dev1
1+
## 0.16.12-dev2
22

33
### Enhancements
44

@@ -8,6 +8,8 @@
88

99
### Fixes
1010

11+
- **Upgrade ruff to latest.** Previously the ruff version was pinned to `<0.5`. Remove that pin and fix the handful of lint errors that resulted.
12+
1113
## 0.16.11
1214

1315
### Enhancements

pyproject.toml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,17 @@ verboseOutput = true
1212

1313
[tool.ruff]
1414
line-length = 100
15+
target-version = "py39"
1516

16-
# -- changes made here should also be made in `.pre-commit-config.yaml` and `Makefile` --
17-
lint.select = [
17+
[tool.ruff.lint]
18+
ignore = [
19+
"COM812", # -- over-aggressively insists on trailing commas where not desirable --
20+
"PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) --
21+
"PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception --
22+
"PT012", # -- pytest.raises() block should contain a single simple statement --
23+
"SIM117", # -- merge `with` statements for context managers that have same scope --
24+
]
25+
select = [
1826
"C4", # -- flake8-comprehensions --
1927
"COM", # -- flake8-commas --
2028
"E", # -- pycodestyle errors --
@@ -29,11 +37,3 @@ lint.select = [
2937
"UP034", # -- Avoid extraneous parentheses --
3038
"W", # -- Warnings, including invalid escape-sequence --
3139
]
32-
lint.ignore = [
33-
"COM812", # -- over-aggressively insists on trailing commas where not desirable --
34-
"PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) --
35-
"PT005", # -- flags mock fixtures with names intentionally matching private method name --
36-
"PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception --
37-
"PT012", # -- pytest.raises() block should contain a single simple statement --
38-
"SIM117", # -- merge `with` statements for context managers that have same scope --
39-
]

requirements/test.in

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,7 @@ mypy
1111
pydantic
1212
pytest-cov
1313
pytest-mock
14-
# NOTE(robison) - we need to do additional cleanup to pass
15-
# linting for the latest version of ruff
16-
ruff<0.5.0
14+
ruff
1715
types-Markdown
1816
types-requests
1917
types-tabulate

requirements/test.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ requests==2.32.3
171171
# requests-mock
172172
requests-mock==1.12.1
173173
# via label-studio-sdk
174-
ruff==0.4.10
174+
ruff==0.8.3
175175
# via -r ./test.in
176176
semantic-version==2.10.0
177177
# via liccheck

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -245,12 +245,14 @@ def _test(result):
245245
_test(result)
246246
else:
247247
with open(filename, "rb") as test_file:
248-
spooled_temp_file = SpooledTemporaryFile()
249-
spooled_temp_file.write(test_file.read())
250-
spooled_temp_file.seek(0)
251-
result = pdf.partition_pdf(
252-
file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
253-
)
248+
with SpooledTemporaryFile() as spooled_temp_file:
249+
spooled_temp_file.write(test_file.read())
250+
spooled_temp_file.seek(0)
251+
result = pdf.partition_pdf(
252+
file=spooled_temp_file,
253+
strategy=strategy,
254+
starting_page_number=starting_page_number,
255+
)
254256
_test(result)
255257

256258

@@ -757,14 +759,14 @@ def test_partition_pdf_metadata_date(
757759
)
758760
else:
759761
with open(filename, "rb") as test_file:
760-
spooled_temp_file = SpooledTemporaryFile()
761-
spooled_temp_file.write(test_file.read())
762-
spooled_temp_file.seek(0)
763-
elements = pdf.partition_pdf(
764-
file=spooled_temp_file,
765-
strategy=strategy,
766-
metadata_last_modified=metadata_last_modified,
767-
)
762+
with SpooledTemporaryFile() as spooled_temp_file:
763+
spooled_temp_file.write(test_file.read())
764+
spooled_temp_file.seek(0)
765+
elements = pdf.partition_pdf(
766+
file=spooled_temp_file,
767+
strategy=strategy,
768+
metadata_last_modified=metadata_last_modified,
769+
)
768770

769771
assert {el.metadata.last_modified for el in elements} == {expected_last_modified}
770772

@@ -1131,15 +1133,15 @@ def test_partition_pdf_with_ocr_only_strategy(
11311133
)
11321134
else:
11331135
with open(filename, "rb") as test_file:
1134-
spooled_temp_file = SpooledTemporaryFile()
1135-
spooled_temp_file.write(test_file.read())
1136-
spooled_temp_file.seek(0)
1137-
elements = pdf.partition_pdf(
1138-
file=spooled_temp_file,
1139-
strategy=PartitionStrategy.OCR_ONLY,
1140-
languages=["eng"],
1141-
is_image=is_image,
1142-
)
1136+
with SpooledTemporaryFile() as spooled_temp_file:
1137+
spooled_temp_file.write(test_file.read())
1138+
spooled_temp_file.seek(0)
1139+
elements = pdf.partition_pdf(
1140+
file=spooled_temp_file,
1141+
strategy=PartitionStrategy.OCR_ONLY,
1142+
languages=["eng"],
1143+
is_image=is_image,
1144+
)
11431145

11441146
assert elements[0].metadata.languages == ["eng"]
11451147
# check pages

test_unstructured/partition/test_docx.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -77,14 +77,15 @@ def test_partition_docx_with_spooled_file(
7777
`python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need
7878
to ensure the source file is appropriately converted in this case.
7979
"""
80-
with open(mock_document_file_path, "rb") as test_file:
81-
spooled_temp_file = tempfile.SpooledTemporaryFile()
82-
spooled_temp_file.write(test_file.read())
80+
with tempfile.SpooledTemporaryFile() as spooled_temp_file:
81+
with open(mock_document_file_path, "rb") as test_file:
82+
spooled_temp_file.write(test_file.read())
8383
spooled_temp_file.seek(0)
84+
8485
elements = partition_docx(file=spooled_temp_file)
85-
assert elements == expected_elements
86-
for element in elements:
87-
assert element.metadata.filename is None
86+
87+
assert elements == expected_elements
88+
assert all(e.metadata.filename is None for e in elements)
8889

8990

9091
def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: list[Text]):
@@ -921,16 +922,16 @@ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
921922
def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
922923
self, opts_args: dict[str, Any]
923924
):
924-
spooled_temp_file = tempfile.SpooledTemporaryFile()
925-
spooled_temp_file.write(b"abcdefg")
926-
opts_args["file"] = spooled_temp_file
927-
opts = DocxPartitionerOptions(**opts_args)
925+
with tempfile.SpooledTemporaryFile() as spooled_temp_file:
926+
spooled_temp_file.write(b"abcdefg")
927+
opts_args["file"] = spooled_temp_file
928+
opts = DocxPartitionerOptions(**opts_args)
928929

929-
docx_file = opts._docx_file
930+
docx_file = opts._docx_file
930931

931-
assert docx_file is not spooled_temp_file
932-
assert isinstance(docx_file, io.BytesIO)
933-
assert docx_file.getvalue() == b"abcdefg"
932+
assert docx_file is not spooled_temp_file
933+
assert isinstance(docx_file, io.BytesIO)
934+
assert docx_file.getvalue() == b"abcdefg"
934935

935936
def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
936937
self, opts_args: dict[str, Any]

test_unstructured/partition/test_pptx.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,12 @@ def test_partition_pptx_with_spooled_file():
7474
7575
Including one that does not have its read-pointer set to the start.
7676
"""
77-
with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
78-
spooled_temp_file = tempfile.SpooledTemporaryFile()
79-
spooled_temp_file.write(test_file.read())
77+
with tempfile.SpooledTemporaryFile() as spooled_temp_file:
78+
with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
79+
spooled_temp_file.write(test_file.read())
80+
8081
elements = partition_pptx(file=spooled_temp_file)
82+
8183
assert elements == EXPECTED_PPTX_OUTPUT
8284
for element in elements:
8385
assert element.metadata.filename is None
@@ -701,16 +703,16 @@ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
701703
def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
702704
self, opts_args: dict[str, Any]
703705
):
704-
spooled_temp_file = tempfile.SpooledTemporaryFile()
705-
spooled_temp_file.write(b"abcdefg")
706-
opts_args["file"] = spooled_temp_file
707-
opts = PptxPartitionerOptions(**opts_args)
706+
with tempfile.SpooledTemporaryFile() as spooled_temp_file:
707+
spooled_temp_file.write(b"abcdefg")
708+
opts_args["file"] = spooled_temp_file
709+
opts = PptxPartitionerOptions(**opts_args)
708710

709-
pptx_file = opts.pptx_file
711+
pptx_file = opts.pptx_file
710712

711-
assert pptx_file is not spooled_temp_file
712-
assert isinstance(pptx_file, io.BytesIO)
713-
assert pptx_file.getvalue() == b"abcdefg"
713+
assert pptx_file is not spooled_temp_file
714+
assert isinstance(pptx_file, io.BytesIO)
715+
assert pptx_file.getvalue() == b"abcdefg"
714716

715717
def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
716718
self, opts_args: dict[str, Any]

test_unstructured/partition/test_xlsx.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,12 @@ def test_partition_xlsx_from_filename():
6464

6565

6666
def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji():
67-
f = tempfile.SpooledTemporaryFile()
68-
with open("example-docs/emoji.xlsx", "rb") as g:
69-
f.write(g.read())
70-
elements = partition_xlsx(file=f, include_header=False)
67+
with tempfile.SpooledTemporaryFile() as f:
68+
with open("example-docs/emoji.xlsx", "rb") as g:
69+
f.write(g.read())
70+
71+
elements = partition_xlsx(file=f, include_header=False)
72+
7173
assert sum(isinstance(element, Text) for element in elements) == 1
7274
assert len(elements) == 1
7375
assert clean_extra_whitespace(elements[0].text) == "🤠😅"

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.12-dev1" # pragma: no cover
1+
__version__ = "0.16.12-dev2" # pragma: no cover

unstructured/partition/html/transformations.py

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ def ontology_to_unstructured_elements(
5353
"""
5454
elements_to_return = []
5555
if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
56-
5756
if page_number is None and isinstance(ontology_element, ontology.Page):
5857
page_number = ontology_element.page_number
5958

@@ -200,10 +199,7 @@ def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
200199
if any(isinstance(ontology_element, class_) for class_ in text_classes):
201200
return True
202201

203-
if any(ontology_element.elementType == category for category in text_categories):
204-
return True
205-
206-
return False
202+
return any(ontology_element.elementType == category for category in text_categories)
207203

208204

209205
def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
@@ -218,10 +214,7 @@ def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
218214
if any(isinstance(ontology_element, class_) for class_ in inline_classes):
219215
return True
220216

221-
if any(ontology_element.elementType == category for category in inline_categories):
222-
return True
223-
224-
return False
217+
return any(ontology_element.elementType == category for category in inline_categories)
225218

226219

227220
def unstructured_elements_to_ontology(
@@ -327,10 +320,7 @@ def is_empty(tag):
327320
if tag.attrs:
328321
return False
329322

330-
if not tag.get_text(strip=True):
331-
return True
332-
333-
return False
323+
return bool(not tag.get_text(strip=True))
334324

335325
def remove_empty_tags(soup):
336326
for tag in soup.find_all():
@@ -419,8 +409,9 @@ def extract_tag_and_ontology_class_from_tag(
419409

420410
# Scenario 1: Valid Ontology Element
421411
if soup.attrs.get("class"):
422-
html_tag, element_class = soup.name, HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get(
423-
(soup.name, soup.attrs["class"][0])
412+
html_tag, element_class = (
413+
soup.name,
414+
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get((soup.name, soup.attrs["class"][0])),
424415
)
425416

426417
# Scenario 2: HTML tag incorrect, CSS class correct

0 commit comments

Comments
 (0)