Skip to content

Commit 6bc1168

Browse files
authored
enhancement: add encoding to elements_to_json and elements_from_json (#694)
* add encoding to elements_to_json and elements_from_json
* version and changelog
* add new test
* fix version
* revert test file
* blank line to test
* no blank line
1 parent c6dc466 commit 6bc1168

File tree

4 files changed

+19
-5
lines changed

4 files changed

+19
-5
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
## 0.7.2-dev6
1+
## 0.7.2
22

33
### Enhancements
44

5+
* Adds an optional encoding kwarg to `elements_to_json` and `elements_from_json`
56
* Bump version of base image to use new stable version of tesseract
67

78
### Features

test_unstructured/staging/test_base_staging.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
Text,
2020
Title,
2121
)
22+
from unstructured.partition.text import partition_text
2223
from unstructured.staging import base
2324

2425

@@ -134,3 +135,10 @@ def test_serialized_deserialize_elements_to_json(tmpdir):
134135
elements_str = base.elements_to_json(elements)
135136
new_elements_text = base.elements_from_json(text=elements_str)
136137
assert elements == new_elements_text
138+
139+
140+
def test_read_and_write_json_with_encoding(filename="example-docs/fake-text-utf-16-be.txt"):
141+
elements = partition_text(filename=filename)
142+
base.elements_to_json(elements, filename=filename, encoding="utf-16")
143+
new_elements_filename = base.elements_from_json(filename=filename, encoding="utf-16")
144+
assert elements == new_elements_filename

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.2-dev6" # pragma: no cover
1+
__version__ = "0.7.2" # pragma: no cover

unstructured/staging/base.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,15 @@ def elements_to_json(
4747
elements: List[Element],
4848
filename: Optional[str] = None,
4949
indent: int = 4,
50+
encoding: str = "utf-8",
5051
) -> Optional[str]:
5152
"""
5253
Saves a list of elements to a JSON file if filename is specified.
5354
Otherwise, return the list of elements as a string.
5455
"""
5556
element_dict = convert_to_dict(elements)
5657
if filename is not None:
57-
with open(filename, "w") as f:
58+
with open(filename, "w", encoding=encoding) as f:
5859
json.dump(element_dict, f, indent=indent)
5960
return None
6061
else:
@@ -105,12 +106,16 @@ def dict_to_elements(element_dict: List[Dict[str, Any]]) -> List[Element]:
105106
return isd_to_elements(element_dict)
106107

107108

108-
def elements_from_json(filename: str = "", text: str = "") -> List[Element]:
109+
def elements_from_json(
110+
filename: str = "",
111+
text: str = "",
112+
encoding: str = "utf-8",
113+
) -> List[Element]:
109114
"""Loads a list of elements from a JSON file or a string."""
110115
exactly_one(filename=filename, text=text)
111116

112117
if filename:
113-
with open(filename) as f:
118+
with open(filename, encoding=encoding) as f:
114119
element_dict = json.load(f)
115120
return dict_to_elements(element_dict)
116121
else:

0 commit comments

Comments
 (0)