diff --git a/requirements/base.in b/requirements/base.in index cc2b27d8ad..09c1dc196d 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -22,3 +22,4 @@ tqdm psutil python-oxmsg html5lib +chardet diff --git a/unstructured/partition/common/common.py b/unstructured/partition/common/common.py index 267630a87b..9c4d492753 100644 --- a/unstructured/partition/common/common.py +++ b/unstructured/partition/common/common.py @@ -1,5 +1,6 @@ from __future__ import annotations +import chardet import numbers import subprocess from io import BufferedReader, BytesIO, TextIOWrapper @@ -296,7 +297,9 @@ def convert_office_doc( wait_time = 0 sleep_time = 0.1 output = subprocess.run(command, capture_output=True) - message = output.stdout.decode().strip() + detected_encoding = chardet.detect(output.stdout) + encoding = detected_encoding['encoding'] or 'utf-8' # Default to utf-8 if detection fails + message = output.stdout.decode(encoding).strip() # we can't rely on returncode unfortunately because on macOS it would return 0 even when the # command failed to run; instead we have to rely on the stdout being empty as a sign of the # process failed