Skip to content

Commit 50ea6fe

Browse files
authored
feat: add ndjson support (#3845)
### Description Add NDJSON file type support and treat it the same as JSON files.
1 parent b3a2dd4 commit 50ea6fe

File tree

21 files changed

+670
-110
lines changed

21 files changed

+670
-110
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
### Enhancements
44

55
- **Prepare auto-partitioning for pluggable partitioners**. Move toward a uniform partitioner call signature so a custom or override partitioner can be registered without code changes.
6+
- **Add NDJSON file type support**. NDJSON files are now recognized as a file type and are treated the same as JSON files.
67

78
### Features
89

example-docs/simple.ndjson

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{"element_id": "a06d2d9e65212d4aa955c3ab32950ffa", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "These are a few of my favorite things:", "type": "Title"}
2+
{"element_id": "b334c93e9b1cbca3b6f6d78ce8bc2484", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"}, "text": "Parrots", "type": "ListItem"}
3+
{"element_id": "76469ecb9f1459943c8d8cca1a550b5a", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"}, "text": "Hockey", "type": "ListItem"}
4+
{"element_id": "261fac731945a138415adc2dd4434b17", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "Analysis", "type": "Title"}
5+
{"element_id": "95f392d32c5271bfdb30eaef45921e59", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "This is my first thought. This is my second thought.", "type": "NarrativeText"}
6+
{"element_id": "0de25bd6f0d74bc4f909f2678f385736", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "This is my third thought.", "type": "NarrativeText"}
7+
{"element_id": "f296a3bc8a901f19199fda1da92829b6", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "2023", "type": "UncategorizedText"}
8+
{"element_id": "78c62edbc674fdca0f6a0e3ffb459f86", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "DOYLESTOWN, PA 18901", "type": "Address"}

example-docs/spring-weather.html.ndjson

Lines changed: 35 additions & 0 deletions
Large diffs are not rendered by default.

requirements/base.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ tqdm
2222
psutil
2323
python-oxmsg
2424
html5lib
25+
ndjson

requirements/base.txt

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44
#
55
# pip-compile ./base.in
66
#
7-
anyio==4.6.2.post1
7+
anyio==4.7.0
88
# via httpx
99
backoff==2.2.1
1010
# via -r ./base.in
1111
beautifulsoup4==4.12.3
1212
# via -r ./base.in
13-
certifi==2024.8.30
13+
certifi==2024.12.14
1414
# via
1515
# httpcore
1616
# httpx
@@ -28,13 +28,13 @@ click==8.1.7
2828
# via
2929
# nltk
3030
# python-oxmsg
31-
cryptography==43.0.3
31+
cryptography==44.0.0
3232
# via unstructured-client
3333
dataclasses-json==0.6.7
3434
# via
3535
# -r ./base.in
3636
# unstructured-client
37-
deepdiff==8.0.1
37+
deepdiff==8.1.1
3838
# via unstructured-client
3939
emoji==2.14.0
4040
# via -r ./base.in
@@ -46,9 +46,9 @@ h11==0.14.0
4646
# via httpcore
4747
html5lib==1.1
4848
# via -r ./base.in
49-
httpcore==1.0.6
49+
httpcore==1.0.7
5050
# via httpx
51-
httpx==0.27.2
51+
httpx==0.28.1
5252
# via unstructured-client
5353
idna==3.10
5454
# via
@@ -64,14 +64,16 @@ langdetect==1.0.9
6464
# via -r ./base.in
6565
lxml==5.3.0
6666
# via -r ./base.in
67-
marshmallow==3.23.0
67+
marshmallow==3.23.1
6868
# via
6969
# dataclasses-json
7070
# unstructured-client
7171
mypy-extensions==1.0.0
7272
# via
7373
# typing-inspect
7474
# unstructured-client
75+
ndjson==0.3.1
76+
# via -r ./base.in
7577
nest-asyncio==1.6.0
7678
# via unstructured-client
7779
nltk==3.9.1
@@ -80,17 +82,17 @@ numpy==1.26.4
8082
# via -r ./base.in
8183
olefile==0.47
8284
# via python-oxmsg
83-
orderly-set==5.2.2
85+
orderly-set==5.2.3
8486
# via deepdiff
85-
packaging==24.1
87+
packaging==24.2
8688
# via
8789
# marshmallow
8890
# unstructured-client
8991
psutil==6.1.0
9092
# via -r ./base.in
9193
pycparser==2.22
9294
# via cffi
93-
pypdf==5.0.1
95+
pypdf==5.1.0
9496
# via unstructured-client
9597
python-dateutil==2.9.0.post0
9698
# via unstructured-client
@@ -100,9 +102,9 @@ python-magic==0.4.27
100102
# via -r ./base.in
101103
python-oxmsg==0.0.1
102104
# via -r ./base.in
103-
rapidfuzz==3.10.1
105+
rapidfuzz==3.11.0
104106
# via -r ./base.in
105-
regex==2024.9.11
107+
regex==2024.11.6
106108
# via nltk
107109
requests==2.32.3
108110
# via
@@ -111,19 +113,17 @@ requests==2.32.3
111113
# unstructured-client
112114
requests-toolbelt==1.0.0
113115
# via unstructured-client
114-
six==1.16.0
116+
six==1.17.0
115117
# via
116118
# html5lib
117119
# langdetect
118120
# python-dateutil
119121
# unstructured-client
120122
sniffio==1.3.1
121-
# via
122-
# anyio
123-
# httpx
123+
# via anyio
124124
soupsieve==2.6
125125
# via beautifulsoup4
126-
tqdm==4.66.5
126+
tqdm==4.67.1
127127
# via
128128
# -r ./base.in
129129
# nltk
@@ -150,5 +150,5 @@ urllib3==1.26.20
150150
# unstructured-client
151151
webencodings==0.5.1
152152
# via html5lib
153-
wrapt==1.16.0
153+
wrapt==1.17.0
154154
# via -r ./base.in

requirements/dev.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@ distlib==0.3.9
1717
# via virtualenv
1818
filelock==3.16.1
1919
# via virtualenv
20-
identify==2.6.1
20+
identify==2.6.3
2121
# via pre-commit
2222
importlib-metadata==8.5.0
2323
# via
2424
# -c ././deps/constraints.txt
2525
# build
2626
nodeenv==1.9.1
2727
# via pre-commit
28-
packaging==24.1
28+
packaging==24.2
2929
# via
3030
# -c ./base.txt
3131
# -c ./test.txt
@@ -46,16 +46,16 @@ pyyaml==6.0.2
4646
# via
4747
# -c ./test.txt
4848
# pre-commit
49-
tomli==2.0.2
49+
tomli==2.2.1
5050
# via
5151
# -c ./test.txt
5252
# build
5353
# pip-tools
54-
virtualenv==20.27.0
54+
virtualenv==20.28.0
5555
# via pre-commit
56-
wheel==0.44.0
56+
wheel==0.45.1
5757
# via pip-tools
58-
zipp==3.20.2
58+
zipp==3.21.0
5959
# via importlib-metadata
6060

6161
# The following packages are considered to be unsafe in a requirements file:

requirements/extra-csv.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ python-dateutil==2.9.0.post0
1616
# pandas
1717
pytz==2024.2
1818
# via pandas
19-
six==1.16.0
19+
six==1.17.0
2020
# via
2121
# -c ./base.txt
2222
# python-dateutil

requirements/extra-markdown.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@ importlib-metadata==8.5.0
1010
# markdown
1111
markdown==3.7
1212
# via -r ./extra-markdown.in
13-
zipp==3.20.2
13+
zipp==3.21.0
1414
# via importlib-metadata

requirements/extra-paddleocr.txt

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44
#
55
# pip-compile ./extra-paddleocr.in
66
#
7-
anyio==4.6.2.post1
7+
anyio==4.7.0
88
# via
99
# -c ./base.txt
1010
# httpx
1111
astor==0.8.1
1212
# via paddlepaddle
13-
certifi==2024.8.30
13+
certifi==2024.12.14
1414
# via
1515
# -c ./base.txt
1616
# httpcore
@@ -32,17 +32,17 @@ exceptiongroup==1.2.2
3232
# via
3333
# -c ./base.txt
3434
# anyio
35-
fonttools==4.54.1
35+
fonttools==4.55.3
3636
# via matplotlib
3737
h11==0.14.0
3838
# via
3939
# -c ./base.txt
4040
# httpcore
41-
httpcore==1.0.6
41+
httpcore==1.0.7
4242
# via
4343
# -c ./base.txt
4444
# httpx
45-
httpx==0.27.2
45+
httpx==0.28.1
4646
# via
4747
# -c ./base.txt
4848
# paddlepaddle
@@ -52,7 +52,7 @@ idna==3.10
5252
# anyio
5353
# httpx
5454
# requests
55-
imageio==2.36.0
55+
imageio==2.36.1
5656
# via
5757
# imgaug
5858
# scikit-image
@@ -64,7 +64,7 @@ kiwisolver==1.4.7
6464
# via matplotlib
6565
lazy-loader==0.4
6666
# via scikit-image
67-
matplotlib==3.9.2
67+
matplotlib==3.9.4
6868
# via imgaug
6969
networkx==3.2.1
7070
# via
@@ -94,7 +94,7 @@ opencv-python==4.10.0.84
9494
# unstructured-paddleocr
9595
opt-einsum==3.3.0
9696
# via paddlepaddle
97-
packaging==24.1
97+
packaging==24.2
9898
# via
9999
# -c ./base.txt
100100
# lazy-loader
@@ -127,7 +127,7 @@ python-dateutil==2.9.0.post0
127127
# matplotlib
128128
pyyaml==6.0.2
129129
# via unstructured-paddleocr
130-
rapidfuzz==3.10.1
130+
rapidfuzz==3.11.0
131131
# via
132132
# -c ./base.txt
133133
# unstructured-paddleocr
@@ -147,7 +147,7 @@ shapely==2.0.6
147147
# via
148148
# imgaug
149149
# unstructured-paddleocr
150-
six==1.16.0
150+
six==1.17.0
151151
# via
152152
# -c ./base.txt
153153
# imgaug
@@ -156,10 +156,9 @@ sniffio==1.3.1
156156
# via
157157
# -c ./base.txt
158158
# anyio
159-
# httpx
160159
tifffile==2024.8.30
161160
# via scikit-image
162-
tqdm==4.66.5
161+
tqdm==4.67.1
163162
# via
164163
# -c ./base.txt
165164
# unstructured-paddleocr
@@ -175,5 +174,5 @@ urllib3==1.26.20
175174
# -c ././deps/constraints.txt
176175
# -c ./base.txt
177176
# requests
178-
zipp==3.20.2
177+
zipp==3.21.0
179178
# via importlib-resources

0 commit comments

Comments
 (0)