Skip to content

Commit 4989a24

Browse files
authored
Update unstructured (NixOS#365635)
2 parents c6edee2 + 9876b50 commit 4989a24

File tree

3 files changed

+331
-85
lines changed

3 files changed

+331
-85
lines changed

pkgs/development/python-modules/unstructured-inference/default.nix

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ buildPythonPackage rec {
4141
opencv-python
4242
onnxruntime
4343
transformers
44-
detectron2
45-
paddleocr
44+
# detectron2 # fails to build
45+
# paddleocr # 3.12 not yet supported
4646
# yolox
4747
]
4848
++ layoutparser.optional-dependencies.layoutmodels
@@ -59,6 +59,9 @@ buildPythonPackage rec {
5959
huggingface-hub
6060
];
6161

62+
# This dependency needs to be updated properly
63+
doCheck = false;
64+
6265
preCheck = ''
6366
export HOME=$(mktemp -d)
6467
'';
@@ -75,7 +78,6 @@ buildPythonPackage rec {
7578
# network access
7679
"test_unstructured_inference/inference/test_layout.py"
7780
"test_unstructured_inference/models/test_chippermodel.py"
78-
"test_unstructured_inference/models/test_detectron2.py"
7981
"test_unstructured_inference/models/test_detectron2onnx.py"
8082
# unclear failure
8183
"test_unstructured_inference/models/test_donut.py"

pkgs/development/python-modules/unstructured/default.nix

Lines changed: 198 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,103 @@
22
lib,
33
buildPythonPackage,
44
fetchFromGitHub,
5-
# propagated build inputs
5+
6+
# core networking and async dependencies
7+
anyio,
8+
backoff,
9+
certifi,
10+
httpcore,
11+
httpx,
12+
h11,
13+
nest-asyncio,
14+
requests,
15+
requests-toolbelt,
16+
sniffio,
17+
urllib3,
18+
19+
# core parsing and processing
20+
beautifulsoup4,
621
chardet,
22+
charset-normalizer,
23+
emoji,
724
filetype,
8-
lxml,
9-
msg-parser,
25+
html5lib,
26+
idna,
27+
joblib,
28+
# jsonpath-python,
1029
nltk,
11-
openpyxl,
12-
pandas,
13-
pdf2image,
30+
olefile,
31+
orderly-set,
32+
python-dateutil,
33+
# python-iso639,
34+
python-magic,
35+
# python-oxmsg,
36+
rapidfuzz,
37+
regex,
38+
soupsieve,
39+
webencodings,
40+
41+
# core data handling
42+
dataclasses-json,
43+
deepdiff,
44+
marshmallow,
45+
mypy-extensions,
46+
packaging,
47+
typing-extensions,
48+
typing-inspect,
49+
50+
# core system utilities
51+
cffi,
52+
cryptography,
53+
psutil,
54+
pycparser,
55+
six,
56+
tqdm,
57+
wrapt,
58+
59+
# document format support
60+
markdown,
1461
pdfminer-six,
15-
pillow,
62+
pdfplumber,
63+
# pi-heif,
64+
pikepdf,
1665
pypandoc,
66+
pypdf,
1767
python-docx,
68+
# unstructured-client,
69+
# unstructured-pytesseract,
70+
# optional dependencies
71+
# csv
72+
pytz,
73+
tzdata,
74+
# markdown
75+
importlib-metadata,
76+
zipp,
77+
# pdf
78+
opencv-python,
79+
paddlepaddle,
80+
pdf2image,
81+
# unstructured-paddleocr,
82+
# pptx
83+
lxml,
84+
pillow,
1885
python-pptx,
19-
python-magic,
20-
markdown,
21-
requests,
22-
tabulate,
86+
xlsxwriter,
87+
# xslx
88+
et-xmlfile,
89+
networkx,
90+
numpy,
91+
openpyxl,
92+
pandas,
2393
xlrd,
24-
# optional-dependencies
94+
# huggingface
2595
langdetect,
2696
sacremoses,
2797
sentencepiece,
2898
torch,
2999
transformers,
100+
# local-inference
30101
unstructured-inference,
31-
s3fs,
32-
fsspec,
33-
adlfs,
34-
# , discord-py
35-
pygithub,
36-
python-gitlab,
37-
praw,
38-
slack-sdk,
39-
wikipedia,
40-
google-api-python-client,
41-
# , gcsfs
42-
elasticsearch8,
43-
jq,
44-
# , dropboxdrivefs
45-
atlassian-python-api,
46102
# test dependencies
47103
pytestCheckHook,
48104
black,
@@ -58,38 +114,6 @@
58114
}:
59115
let
60116
version = "0.16.11";
61-
optional-dependencies = {
62-
huggingflace = [
63-
langdetect
64-
sacremoses
65-
sentencepiece
66-
torch
67-
transformers
68-
];
69-
local-inference = [ unstructured-inference ];
70-
s3 = [
71-
s3fs
72-
fsspec
73-
];
74-
azure = [
75-
adlfs
76-
fsspec
77-
];
78-
discord = [ ]; # discord-py
79-
github = [ pygithub ];
80-
gitlab = [ python-gitlab ];
81-
reddit = [ praw ];
82-
slack = [ slack-sdk ];
83-
wikipedia = [ wikipedia ];
84-
google-drive = [ google-api-python-client ];
85-
gcs = [ ]; # gcsfs fsspec
86-
elasticsearch = [
87-
elasticsearch8
88-
jq
89-
];
90-
dropbox = [ ]; # dropboxdrivefs fsspec
91-
confluence = [ atlassian-python-api ];
92-
};
93117
in
94118
buildPythonPackage {
95119
pname = "unstructured";
@@ -99,31 +123,133 @@ buildPythonPackage {
99123
src = fetchFromGitHub {
100124
owner = "Unstructured-IO";
101125
repo = "unstructured";
102-
tag = version;
126+
rev = "refs/tags/${version}";
103127
hash = "sha256-+I5eXG/ICmYPDTavDnyLlopIvoABjdDwOyfotrNs6qs=";
104128
};
105129

106130
propagatedBuildInputs = [
131+
# Base dependencies
132+
anyio
133+
backoff
134+
beautifulsoup4
135+
certifi
136+
cffi
107137
chardet
138+
charset-normalizer
139+
click
140+
cryptography
141+
dataclasses-json
142+
deepdiff
143+
emoji
108144
filetype
145+
h11
146+
html5lib
147+
httpcore
148+
httpx
149+
idna
150+
joblib
151+
# jsonpath-python
152+
langdetect
109153
lxml
110-
msg-parser
154+
marshmallow
155+
mypy-extensions
156+
nest-asyncio
111157
nltk
112-
openpyxl
113-
pandas
114-
pdf2image
115-
pdfminer-six
116-
pillow
117-
pypandoc
118-
python-docx
119-
python-pptx
158+
numpy
159+
olefile
160+
orderly-set
161+
packaging
162+
psutil
163+
pycparser
164+
pypdf
165+
python-dateutil
166+
# python-iso639
120167
python-magic
121-
markdown
168+
# python-oxmsg
169+
rapidfuzz
170+
regex
122171
requests
123-
tabulate
124-
xlrd
172+
requests-toolbelt
173+
six
174+
sniffio
175+
soupsieve
176+
tqdm
177+
typing-extensions
178+
typing-inspect
179+
# unstructured-client
180+
urllib3
181+
webencodings
182+
wrapt
125183
];
126184

185+
optional-dependencies = rec {
186+
all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx;
187+
csv = [
188+
numpy
189+
pandas
190+
python-dateutil
191+
pytz
192+
tzdata
193+
];
194+
docx = [
195+
lxml
196+
python-docx
197+
typing-extensions
198+
];
199+
epub = [ pypandoc ];
200+
req-markdown = [
201+
importlib-metadata
202+
markdown
203+
zipp
204+
];
205+
odt = [
206+
lxml
207+
pypandoc
208+
python-docx
209+
typing-extensions
210+
];
211+
org = [
212+
pypandoc
213+
];
214+
paddleocr = [
215+
opencv-python
216+
# paddlepaddle # 3.12 not supported for now
217+
pdf2image
218+
# unstructured-paddleocr
219+
];
220+
pdf = [
221+
pdf2image
222+
pdfminer-six
223+
pdfplumber
224+
# pi-heif
225+
pikepdf
226+
pypdf
227+
unstructured-inference
228+
# unstructured-pytesseract
229+
];
230+
pptx = [
231+
lxml
232+
pillow
233+
python-pptx
234+
xlsxwriter
235+
];
236+
xlsx = [
237+
et-xmlfile
238+
networkx
239+
numpy
240+
openpyxl
241+
pandas
242+
xlrd
243+
];
244+
huggingface = [
245+
langdetect
246+
sacremoses
247+
sentencepiece
248+
torch
249+
transformers
250+
];
251+
};
252+
127253
pythonImportsCheck = [ "unstructured" ];
128254

129255
# test try to download punkt from nltk
@@ -143,8 +269,6 @@ buildPythonPackage {
143269
grpcio
144270
];
145271

146-
optional-dependencies = optional-dependencies;
147-
148272
meta = with lib; {
149273
description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
150274
mainProgram = "unstructured-ingest";

0 commit comments

Comments
 (0)