Skip to content

Commit c1ba090

Browse files
authored
fix: suppress file conversion warnings in convert_office_doc (#703)
* test that output is suppressed * add test for error output * changelog and version
1 parent 559a557 commit c1ba090

File tree

5 files changed

+46
-14
lines changed

5 files changed

+46
-14
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.7.3-dev0
1+
## 0.7.3-dev1
22

33
### Enhancements
44

@@ -8,6 +8,7 @@
88

99
### Fixes
1010

11+
* `convert_office_doc` no longer prints file conversion info messages to stdout.
1112
* `partition_via_api` reflects the actual filetype for the file processed in the API.
1213

1314
## 0.7.2

test_unstructured/partition/test_common.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,3 +147,19 @@ def test_normalize_layout_element_bulleted_list():
147147
ListItem(text="You're cool too.", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))),
148148
ListItem(text="We're all cool!", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))),
149149
]
150+
151+
152+
class MockPopenWithError:
153+
def __init__(self, *args, **kwargs):
154+
pass
155+
156+
def communicate(self):
157+
return b"", b"an error occurred"
158+
159+
160+
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
161+
import subprocess
162+
163+
monkeypatch.setattr(subprocess, "Popen", MockPopenWithError)
164+
common.convert_office_doc("no-real.docx", "fake-directory", target_format="docx")
165+
assert "an error occurred" in caplog.text

test_unstructured/partition/test_doc.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def expected_elements():
5656
]
5757

5858

59-
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
59+
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir, capsys):
6060
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
6161
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
6262
mock_document.save(docx_filename)
@@ -67,6 +67,9 @@ def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
6767
assert elements[0].metadata.filename == "mock_document.doc"
6868
assert elements[0].metadata.file_directory == tmpdir.dirname
6969

70+
assert capsys.readouterr().out == ""
71+
assert capsys.readouterr().err == ""
72+
7073

7174
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
7275
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
@@ -84,7 +87,7 @@ def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmp
8487
partition_doc(filename=doc_filename)
8588

8689

87-
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
90+
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir, capsys):
8891
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
8992
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
9093
mock_document.save(docx_filename)
@@ -94,6 +97,9 @@ def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
9497
elements = partition_doc(file=f)
9598
assert elements == expected_elements
9699

100+
assert capsys.readouterr().out == ""
101+
assert capsys.readouterr().err == ""
102+
97103

98104
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
99105
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.3-dev0" # pragma: no cover
1+
__version__ = "0.7.3-dev1" # pragma: no cover

unstructured/partition/common.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
PageBreak,
1616
Text,
1717
)
18+
from unstructured.logger import logger
1819
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
1920

2021

@@ -139,18 +140,22 @@ def convert_office_doc(input_filename: str, output_directory: str, target_format
139140
# users who do not have LibreOffice installed
140141
# ref: https://stackoverflow.com/questions/38468442/
141142
# multiple-doc-to-docx-file-conversion-using-python
143+
command = [
144+
"soffice",
145+
"--headless",
146+
"--convert-to",
147+
target_format,
148+
"--outdir",
149+
output_directory,
150+
input_filename,
151+
]
142152
try:
143-
subprocess.call(
144-
[
145-
"soffice",
146-
"--headless",
147-
"--convert-to",
148-
target_format,
149-
"--outdir",
150-
output_directory,
151-
input_filename,
152-
],
153+
process = subprocess.Popen(
154+
command,
155+
stdout=subprocess.PIPE,
156+
stderr=subprocess.PIPE,
153157
)
158+
output, error = process.communicate()
154159
except FileNotFoundError:
155160
raise FileNotFoundError(
156161
"""soffice command was not found. Please install libreoffice
@@ -161,6 +166,10 @@ def convert_office_doc(input_filename: str, output_directory: str, target_format
161166
- Debian: https://wiki.debian.org/LibreOffice""",
162167
)
163168

169+
logger.info(output.decode().strip())
170+
if error:
171+
logger.error(error.decode().strip())
172+
164173

165174
def exactly_one(**kwargs) -> None:
166175
"""

0 commit comments

Comments
 (0)