Skip to content

Commit a915231

Browse files
authored
refactor: Introduce 'exactly_one' to simplify partitioning functions (#343)
1 parent 70420b5 commit a915231

File tree

11 files changed

+81
-65
lines changed

11 files changed

+81
-65
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
1-
## 0.5.3-dev3
1+
## 0.5.3-dev4
22

33
### Enhancements
44

5+
* Simplify partitioning functions.
56
* Improve logging for ingest CLI.
67

78
### Features

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.5.3-dev3" # pragma: no cover
1+
__version__ = "0.5.3-dev4" # pragma: no cover

unstructured/partition/common.py

Lines changed: 23 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -52,12 +52,17 @@ def layout_list_to_list_items(text: str, coordinates: List[float]) -> List[Eleme
5252
list_items: List[Element] = []
5353
for text_segment in split_items:
5454
if len(text_segment.strip()) > 0:
55-
list_items.append(ListItem(text=text_segment.strip(), coordinates=coordinates))
55+
list_items.append(
56+
ListItem(text=text_segment.strip(), coordinates=coordinates),
57+
)
5658

5759
return list_items
5860

5961

60-
def document_to_element_list(document, include_page_breaks: bool = False) -> List[Element]:
62+
def document_to_element_list(
63+
document,
64+
include_page_breaks: bool = False,
65+
) -> List[Element]:
6166
"""Converts a DocumentLayout object to a list of unstructured elements."""
6267
elements: List[Element] = []
6368
num_pages = len(document.pages)
@@ -124,3 +129,19 @@ def convert_office_doc(input_filename: str, output_directory: str, target_format
124129
- Mac: https://formulae.brew.sh/cask/libreoffice
125130
- Debian: https://wiki.debian.org/LibreOffice""",
126131
)
132+
133+
134+
def exactly_one(**kwargs) -> None:
    """
    Verify arguments; exactly one of all keyword arguments must not be None.

    Values are compared against None by identity, so falsy values such as
    "" or 0 still count as "specified".

    Raises
    ------
    ValueError
        If zero, or more than one, of the keyword arguments is not None
        (including the case where no keyword arguments are passed at all).

    Example:
    >>> exactly_one(filename=filename, file=file, text=text, url=url)
    """
    specified = sum(value is not None for value in kwargs.values())
    if specified == 1:
        return

    names = list(kwargs)
    if len(names) > 1:
        message = f"Exactly one of {', '.join(names[:-1])} and {names[-1]} must be specified."
    elif names:
        message = f"{names[0]} must be specified."
    else:
        # No keyword arguments at all: previously this path hit names[0] on an
        # empty list and leaked an IndexError instead of the ValueError that
        # callers of this validator expect.
        message = "Exactly one keyword argument must be specified."
    raise ValueError(message)

unstructured/partition/doc.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
from typing import IO, List, Optional
44

55
from unstructured.documents.elements import Element
6-
from unstructured.partition.common import convert_office_doc
6+
from unstructured.partition.common import convert_office_doc, exactly_one
77
from unstructured.partition.docx import partition_docx
88

99

@@ -12,25 +12,25 @@ def partition_doc(filename: Optional[str] = None, file: Optional[IO] = None) ->
1212
1313
Parameters
1414
----------
15-
filename
15+
filename
1616
A string defining the target filename path.
1717
file
1818
A file-like object using "rb" mode --> open(filename, "rb").
1919
"""
20-
if not any([filename, file]):
21-
raise ValueError("One of filename or file must be specified.")
20+
# Verify that exactly one of the arguments was provided
21+
exactly_one(filename=filename, file=file)
2222

23-
if filename is not None and not file:
23+
if filename is not None:
2424
_, filename_no_path = os.path.split(os.path.abspath(filename))
2525
base_filename, _ = os.path.splitext(filename_no_path)
26-
elif file is not None and not filename:
26+
elif file is not None:
2727
tmp = tempfile.NamedTemporaryFile(delete=False)
2828
tmp.write(file.read())
2929
tmp.close()
3030
filename = tmp.name
3131
_, filename_no_path = os.path.split(os.path.abspath(tmp.name))
3232
else:
33-
raise ValueError("Only one of filename or file can be specified.")
33+
raise ValueError("Exactly one of filename and file must be specified.")
3434

3535
if not os.path.exists(filename):
3636
raise ValueError(f"The file {filename} does not exist.")

unstructured/partition/docx.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
Text,
1313
Title,
1414
)
15+
from unstructured.partition.common import exactly_one
1516
from unstructured.partition.text_type import (
1617
is_bulleted_text,
1718
is_possible_narrative_text,
@@ -75,15 +76,13 @@ def partition_docx(
7576
metadata.
7677
"""
7778

78-
if not any([filename, file]):
79-
raise ValueError("One of filename or file must be specified.")
79+
# Verify that exactly one of the arguments was provided
80+
exactly_one(filename=filename, file=file)
8081

81-
if filename is not None and not file:
82+
if filename is not None:
8283
document = docx.Document(filename)
83-
elif file is not None and not filename:
84+
elif file is not None:
8485
document = docx.Document(file)
85-
else:
86-
raise ValueError("Only one of filename or file can be specified.")
8786

8887
metadata_filename = metadata_filename or filename
8988
elements: List[Element] = []

unstructured/partition/email.py

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,8 @@
44
from email.message import Message
55
from typing import IO, Dict, List, Optional, Tuple, Union
66

7+
from unstructured.partition.common import exactly_one
8+
79
if sys.version_info < (3, 8):
810
from typing_extensions import Final
911
else:
@@ -166,14 +168,14 @@ def partition_email(
166168
f"Valid content sources are: {VALID_CONTENT_SOURCES}",
167169
)
168170

169-
if not any([filename, file, text]):
170-
raise ValueError("One of filename, file, or text must be specified.")
171+
# Verify that exactly one of the arguments was provided
172+
exactly_one(filename=filename, file=file, text=text)
171173

172-
if filename is not None and not file and not text:
174+
if filename is not None:
173175
with open(filename) as f:
174176
msg = email.message_from_file(f)
175177

176-
elif file is not None and not filename and not text:
178+
elif file is not None:
177179
file_content = file.read()
178180
if isinstance(file_content, bytes):
179181
file_text = file_content.decode(encoding)
@@ -182,13 +184,10 @@ def partition_email(
182184

183185
msg = email.message_from_string(file_text)
184186

185-
elif text is not None and not filename and not file:
187+
elif text is not None:
186188
_text: str = str(text)
187189
msg = email.message_from_string(_text)
188190

189-
else:
190-
raise ValueError("Only one of filename, file, or text can be specified.")
191-
192191
content_map: Dict[str, str] = {}
193192
for part in msg.walk():
194193
# NOTE(robinson) - content disposition is None for the content of the email itself.

unstructured/partition/html.py

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,11 @@
55
from unstructured.documents.elements import Element
66
from unstructured.documents.html import HTMLDocument
77
from unstructured.documents.xml import VALID_PARSERS
8-
from unstructured.partition.common import add_element_metadata, document_to_element_list
8+
from unstructured.partition.common import (
9+
add_element_metadata,
10+
document_to_element_list,
11+
exactly_one,
12+
)
913

1014

1115
def partition_html(
@@ -40,16 +44,16 @@ def partition_html(
4044
parser
4145
The parser to use for parsing the HTML document. If None, default parser will be used.
4246
"""
43-
if not any([filename, file, text, url]):
44-
raise ValueError("One of filename, file, or text must be specified.")
47+
# Verify that exactly one of the arguments was provided
48+
exactly_one(filename=filename, file=file, text=text, url=url)
4549

4650
if not encoding:
4751
encoding = "utf-8"
4852

49-
if filename is not None and not file and not text and not url:
53+
if filename is not None:
5054
document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding)
5155

52-
elif file is not None and not filename and not text and not url:
56+
elif file is not None:
5357
file_content = file.read()
5458
if isinstance(file_content, bytes):
5559
file_text = file_content.decode(encoding)
@@ -58,11 +62,11 @@ def partition_html(
5862

5963
document = HTMLDocument.from_string(file_text, parser=parser)
6064

61-
elif text is not None and not filename and not file and not url:
65+
elif text is not None:
6266
_text: str = str(text)
6367
document = HTMLDocument.from_string(_text, parser=parser)
6468

65-
elif url is not None and not filename and not file and not text:
69+
elif url is not None:
6670
response = requests.get(url)
6771
if not response.ok:
6872
raise ValueError(f"URL return an error: {response.status_code}")
@@ -73,9 +77,6 @@ def partition_html(
7377

7478
document = HTMLDocument.from_string(response.text, parser=parser)
7579

76-
else:
77-
raise ValueError("Only one of filename, file, or text can be specified.")
78-
7980
layout_elements = document_to_element_list(document, include_page_breaks=include_page_breaks)
8081
if include_metadata:
8182
return add_element_metadata(

unstructured/partition/md.py

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55

66
from unstructured.documents.elements import Element
77
from unstructured.documents.xml import VALID_PARSERS
8+
from unstructured.partition.common import exactly_one
89
from unstructured.partition.html import partition_html
910

1011

@@ -23,20 +24,17 @@ def partition_md(
2324
include_metadata: bool = True,
2425
parser: VALID_PARSERS = None,
2526
) -> List[Element]:
26-
if not any([filename, file, text, url]):
27-
raise ValueError("One of filename, file, or text must be specified.")
27+
# Verify that exactly one of the arguments was provided
28+
exactly_one(filename=filename, file=file, text=text, url=url)
2829

29-
if filename is not None and not file and not text and not url:
30+
if filename is not None:
3031
with open(filename, encoding="utf8") as f:
3132
text = optional_decode(f.read())
3233

33-
elif file is not None and not filename and not text and not url:
34+
elif file is not None:
3435
text = optional_decode(file.read())
3536

36-
elif text is not None and not filename and not file and not url:
37-
pass
38-
39-
elif url is not None and not filename and not file and not text:
37+
elif url is not None:
4038
response = requests.get(url)
4139
if not response.ok:
4240
raise ValueError(f"URL return an error: {response.status_code}")
@@ -47,8 +45,8 @@ def partition_md(
4745

4846
text = response.text
4947

50-
else:
51-
raise ValueError("Only one of filename, file, or text can be specified.")
48+
elif text is None:
49+
raise ValueError("Exactly one of filename, file, url or text must be specified.")
5250

5351
html = markdown.markdown(text)
5452

unstructured/partition/ppt.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
from typing import IO, List, Optional
44

55
from unstructured.documents.elements import Element
6-
from unstructured.partition.common import convert_office_doc
6+
from unstructured.partition.common import convert_office_doc, exactly_one
77
from unstructured.partition.pptx import partition_pptx
88

99

@@ -23,20 +23,20 @@ def partition_ppt(
2323
include_page_breaks
2424
If True, includes a PageBreak element between slides
2525
"""
26-
if not any([filename, file]):
27-
raise ValueError("One of filename or file must be specified.")
26+
# Verify that exactly one of the arguments was provided
27+
exactly_one(filename=filename, file=file)
2828

29-
if filename is not None and not file:
29+
if filename is not None:
3030
_, filename_no_path = os.path.split(os.path.abspath(filename))
3131
base_filename, _ = os.path.splitext(filename_no_path)
32-
elif file is not None and not filename:
32+
elif file is not None:
3333
tmp = tempfile.NamedTemporaryFile(delete=False)
3434
tmp.write(file.read())
3535
tmp.close()
3636
filename = tmp.name
3737
_, filename_no_path = os.path.split(os.path.abspath(tmp.name))
3838
else:
39-
raise ValueError("Only one of filename or file can be specified.")
39+
raise ValueError("Exactly one of filename and file must be specified.")
4040

4141
if not os.path.exists(filename):
4242
raise ValueError(f"The file {filename} does not exist.")

unstructured/partition/pptx.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
Text,
1212
Title,
1313
)
14+
from unstructured.partition.common import exactly_one
1415
from unstructured.partition.text_type import (
1516
is_possible_narrative_text,
1617
is_possible_title,
@@ -41,15 +42,13 @@ def partition_pptx(
4142
metadata.
4243
"""
4344

44-
if not any([filename, file]):
45-
raise ValueError("One of filename or file must be specified.")
45+
# Verify that exactly one of the arguments was provided
46+
exactly_one(filename=filename, file=file)
4647

47-
if filename is not None and not file:
48+
if filename is not None:
4849
presentation = pptx.Presentation(filename)
49-
elif file is not None and not filename:
50+
elif file is not None:
5051
presentation = pptx.Presentation(file)
51-
else:
52-
raise ValueError("Only one of filename or file can be specified.")
5352

5453
elements: List[Element] = []
5554
metadata_filename = metadata_filename or filename

0 commit comments

Comments
 (0)