Skip to content

Commit e230364

Browse files
authored
bugfix/drop use of ndjson dep, use local code (#3886)
### Description Avoid using the ndjson dependency because of its restrictive license; use a local implementation instead.
1 parent 8f2a719 commit e230364

File tree

7 files changed

+75
-11
lines changed

7 files changed

+75
-11
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.16.16-dev0
1+
## 0.16.16-dev1
22

33
### Enhancements
44

@@ -7,6 +7,8 @@
77

88
### Fixes
99

10+
* **Drop usage of ndjson dependency**
11+
1012
## 0.16.15
1113

1214
### Enhancements

requirements/base.in

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,3 @@ tqdm
2222
psutil
2323
python-oxmsg
2424
html5lib
25-
ndjson

requirements/base.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,6 @@ mypy-extensions==1.0.0
7272
# via
7373
# typing-inspect
7474
# unstructured-client
75-
ndjson==0.3.1
76-
# via -r ./base.in
7775
nest-asyncio==1.6.0
7876
# via unstructured-client
7977
nltk==3.9.1

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.16-dev0" # pragma: no cover
1+
__version__ = "0.16.16-dev1" # pragma: no cover

unstructured/file_utils/ndjson.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""
2+
Adds support for working with newline-delimited JSON (ndjson) files. This format is useful for
3+
streaming JSON content, which would otherwise not be possible with plain JSON files.
4+
"""
5+
6+
import json
7+
from typing import IO, Any
8+
9+
10+
def dumps(obj: list[dict[str, Any]], **kwargs) -> str:
    """
    Converts the list of dictionaries into its ndjson string representation

    Each dictionary is serialized with json.dumps onto its own line and the lines are
    joined with a single newline character. No trailing newline is appended, and an
    empty list produces an empty string.

    Args:
        obj (list[dict[str, Any]]): List of dictionaries to convert
        **kwargs: Additional keyword arguments to pass to json.dumps. Any `indent`
            value is ignored because pretty-printing spreads one record over several
            lines, which is not valid ndjson.

    Returns:
        str: string representation of the list of dictionaries, one record per line
    """
    # Indent breaks ndjson formatting: every record has to stay on a single line
    kwargs["indent"] = None
    return "\n".join(json.dumps(each, **kwargs) for each in obj)
22+
23+
24+
def dump(obj: list[dict[str, Any]], fp: IO, **kwargs) -> None:
    """
    Serializes the list of dictionaries as ndjson and writes it to the given file

    The whole document is rendered with `dumps` and handed to `fp.write` in one call.

    Args:
        obj (list[dict[str, Any]]): List of dictionaries to write, one per line
        fp (IO): Writable file object that receives the serialized text
        **kwargs: Additional keyword arguments forwarded to json.dumps; a supplied
            `indent` is always replaced with None

    Returns:
        None
    """
    # A pretty-printed record would span several lines, so any caller-supplied
    # indent is overridden before serializing.
    options = {**kwargs, "indent": None}
    fp.write(dumps(obj, **options))
40+
41+
42+
def loads(s: str, **kwargs) -> list[dict[str, Any]]:
    """
    Converts the raw ndjson string into a list of dictionaries

    Records are separated by LF, CRLF or CR line endings only. Other characters that
    str.splitlines treats as line boundaries (for example U+2028, U+2029 and U+0085)
    are legal unescaped inside JSON strings, so they are left untouched instead of
    cutting a record in half. Blank or whitespace-only lines are skipped, which means
    an empty input or a trailing newline yields no extra records.

    Args:
        s (str): Raw string to convert. Bytes input is also accepted and is split on
            ASCII line endings before each line is passed to json.loads.
        **kwargs: Additional keyword arguments to pass to json.loads

    Returns:
        list[dict[str, Any]]: List of dictionaries parsed from the input string

    Raises:
        json.JSONDecodeError: If any non-blank line is not valid JSON
    """
    if isinstance(s, str):
        # Normalize CRLF / CR to LF, then split on LF only so that Unicode line
        # separators embedded in string values stay inside their record.
        lines = s.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    else:
        # bytes.splitlines only recognizes ASCII line endings, so it is already safe.
        lines = s.splitlines()
    return [json.loads(line, **kwargs) for line in lines if line.strip()]
54+
55+
56+
def load(fp: IO, **kwargs) -> list[dict[str, Any]]:
    """
    Reads an ndjson file and returns its records as a list of dictionaries

    The file is read in full with `fp.read()` and the resulting text is parsed by
    `loads`.

    Args:
        fp (IO): Readable file object holding the ndjson content
        **kwargs: Additional keyword arguments forwarded to json.loads

    Returns:
        list[dict[str, Any]]: List of dictionaries parsed from the file
    """
    raw_text = fp.read()
    return loads(raw_text, **kwargs)

unstructured/partition/ndjson.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,14 @@
1212
import json
1313
from typing import IO, Any, Optional
1414

15-
import ndjson
16-
1715
from unstructured.chunking import add_chunking_strategy
1816
from unstructured.documents.elements import Element, process_metadata
1917
from unstructured.file_utils.filetype import (
2018
FileType,
2119
add_metadata_with_filetype,
2220
is_ndjson_processable,
2321
)
22+
from unstructured.file_utils.ndjson import loads as ndjson_loads
2423
from unstructured.partition.common.common import exactly_one
2524
from unstructured.partition.common.metadata import get_last_modified_date
2625
from unstructured.staging.base import elements_from_dicts
@@ -74,7 +73,7 @@ def partition_ndjson(
7473
)
7574

7675
try:
77-
element_dicts = ndjson.loads(file_text)
76+
element_dicts = ndjson_loads(file_text)
7877
elements = elements_from_dicts(element_dicts)
7978
except json.JSONDecodeError:
8079
raise ValueError("Not a valid ndjson")

unstructured/staging/base.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,14 @@
99
from datetime import datetime
1010
from typing import Any, Iterable, Optional, Sequence, cast
1111

12-
import ndjson
13-
1412
from unstructured.documents.coordinates import PixelSpace
1513
from unstructured.documents.elements import (
1614
TYPE_TO_TEXT_ELEMENT_MAP,
1715
CheckBox,
1816
Element,
1917
ElementMetadata,
2018
)
19+
from unstructured.file_utils.ndjson import dumps as ndjson_dumps
2120
from unstructured.partition.common.common import exactly_one
2221
from unstructured.utils import Point, dependency_exists, requires_dependencies
2322

@@ -168,7 +167,7 @@ def elements_to_ndjson(
168167
# -- serialize `elements` as a newline-delimited JSON (ndjson) string --
169168
precision_adjusted_elements = _fix_metadata_field_precision(elements)
170169
element_dicts = elements_to_dicts(precision_adjusted_elements)
171-
ndjson_str = ndjson.dumps(element_dicts, sort_keys=True)
170+
ndjson_str = ndjson_dumps(element_dicts, sort_keys=True)
172171

173172
if filename is not None:
174173
with open(filename, "w", encoding=encoding) as f:

0 commit comments

Comments
 (0)