22 lib ,
33 buildPythonPackage ,
44 fetchFromGitHub ,
5- # propagated build inputs
5+
6+ # core networking and async dependencies
7+ anyio ,
8+ backoff ,
9+ certifi ,
10+ httpcore ,
11+ httpx ,
12+ h11 ,
13+ nest-asyncio ,
14+ requests ,
15+ requests-toolbelt ,
16+ sniffio ,
17+ urllib3 ,
18+
19+ # core parsing and processing
20+ beautifulsoup4 ,
621 chardet ,
22+ charset-normalizer ,
23+ emoji ,
724 filetype ,
8- lxml ,
9- msg-parser ,
25+ html5lib ,
26+ idna ,
27+ joblib ,
28+ # jsonpath-python,
1029 nltk ,
11- openpyxl ,
12- pandas ,
13- pdf2image ,
30+ olefile ,
31+ orderly-set ,
32+ python-dateutil ,
33+ # python-iso639,
34+ python-magic ,
35+ # python-oxmsg,
36+ rapidfuzz ,
37+ regex ,
38+ soupsieve ,
39+ webencodings ,
40+
41+ # core data handling
42+ dataclasses-json ,
43+ deepdiff ,
44+ marshmallow ,
45+ mypy-extensions ,
46+ packaging ,
47+ typing-extensions ,
48+ typing-inspect ,
49+
50+ # core system utilities
51+ cffi ,
52+ cryptography ,
53+ psutil ,
54+ pycparser ,
55+ six ,
56+ tqdm ,
57+ wrapt ,
58+
59+ # document format support
60+ markdown ,
1461 pdfminer-six ,
15- pillow ,
62+ pdfplumber ,
63+ # pi-heif,
64+ pikepdf ,
1665 pypandoc ,
66+ pypdf ,
1767 python-docx ,
68+ # unstructured-client,
69+ # unstructured-pytesseract,
70+ # optional dependencies
71+ # csv
72+ pytz ,
73+ tzdata ,
74+ # markdown
75+ importlib-metadata ,
76+ zipp ,
77+ # pdf
78+ opencv-python ,
79+ paddlepaddle ,
80+ pdf2image ,
81+ # unstructured-paddleocr,
82+ # pptx
83+ lxml ,
84+ pillow ,
1885 python-pptx ,
19- python-magic ,
20- markdown ,
21- requests ,
22- tabulate ,
86+ xlsxwriter ,
87+ # xlsx
88+ et-xmlfile ,
89+ networkx ,
90+ numpy ,
91+ openpyxl ,
92+ pandas ,
2393 xlrd ,
24- # optional-dependencies
94+ # huggingface
2595 langdetect ,
2696 sacremoses ,
2797 sentencepiece ,
2898 torch ,
2999 transformers ,
100+ # local-inference
30101 unstructured-inference ,
31- s3fs ,
32- fsspec ,
33- adlfs ,
34- # , discord-py
35- pygithub ,
36- python-gitlab ,
37- praw ,
38- slack-sdk ,
39- wikipedia ,
40- google-api-python-client ,
41- # , gcsfs
42- elasticsearch8 ,
43- jq ,
44- # , dropboxdrivefs
45- atlassian-python-api ,
46102 # test dependencies
47103 pytestCheckHook ,
48104 black ,
58114} :
59115let
60116 version = "0.16.11" ;
61- optional-dependencies = {
62- huggingflace = [
63- langdetect
64- sacremoses
65- sentencepiece
66- torch
67- transformers
68- ] ;
69- local-inference = [ unstructured-inference ] ;
70- s3 = [
71- s3fs
72- fsspec
73- ] ;
74- azure = [
75- adlfs
76- fsspec
77- ] ;
78- discord = [ ] ; # discord-py
79- github = [ pygithub ] ;
80- gitlab = [ python-gitlab ] ;
81- reddit = [ praw ] ;
82- slack = [ slack-sdk ] ;
83- wikipedia = [ wikipedia ] ;
84- google-drive = [ google-api-python-client ] ;
85- gcs = [ ] ; # gcsfs fsspec
86- elasticsearch = [
87- elasticsearch8
88- jq
89- ] ;
90- dropbox = [ ] ; # dropboxdrivefs fsspec
91- confluence = [ atlassian-python-api ] ;
92- } ;
93117in
94118buildPythonPackage {
95119 pname = "unstructured" ;
@@ -99,31 +123,133 @@ buildPythonPackage {
99123 src = fetchFromGitHub {
100124 owner = "Unstructured-IO" ;
101125 repo = "unstructured" ;
102- tag = version ;
126+ rev = "refs/tags/ ${ version } " ;
103127 hash = "sha256-+I5eXG/ICmYPDTavDnyLlopIvoABjdDwOyfotrNs6qs=" ;
104128 } ;
105129
106130 propagatedBuildInputs = [
131+ # Base dependencies
132+ anyio
133+ backoff
134+ beautifulsoup4
135+ certifi
136+ cffi
107137 chardet
138+ charset-normalizer
139+ click
140+ cryptography
141+ dataclasses-json
142+ deepdiff
143+ emoji
108144 filetype
145+ h11
146+ html5lib
147+ httpcore
148+ httpx
149+ idna
150+ joblib
151+ # jsonpath-python
152+ langdetect
109153 lxml
110- msg-parser
154+ marshmallow
155+ mypy-extensions
156+ nest-asyncio
111157 nltk
112- openpyxl
113- pandas
114- pdf2image
115- pdfminer-six
116- pillow
117- pypandoc
118- python-docx
119- python-pptx
158+ numpy
159+ olefile
160+ orderly-set
161+ packaging
162+ psutil
163+ pycparser
164+ pypdf
165+ python-dateutil
166+ # python-iso639
120167 python-magic
121- markdown
168+ # python-oxmsg
169+ rapidfuzz
170+ regex
122171 requests
123- tabulate
124- xlrd
172+ requests-toolbelt
173+ six
174+ sniffio
175+ soupsieve
176+ tqdm
177+ typing-extensions
178+ typing-inspect
179+ # unstructured-client
180+ urllib3
181+ webencodings
182+ wrapt
125183 ] ;
126184
185+ optional-dependencies = rec {
186+ all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx ;
187+ csv = [
188+ numpy
189+ pandas
190+ python-dateutil
191+ pytz
192+ tzdata
193+ ] ;
194+ docx = [
195+ lxml
196+ python-docx
197+ typing-extensions
198+ ] ;
199+ epub = [ pypandoc ] ;
200+ req-markdown = [
201+ importlib-metadata
202+ markdown
203+ zipp
204+ ] ;
205+ odt = [
206+ lxml
207+ pypandoc
208+ python-docx
209+ typing-extensions
210+ ] ;
211+ org = [
212+ pypandoc
213+ ] ;
214+ paddleocr = [
215+ opencv-python
216+ # paddlepaddle # 3.12 not supported for now
217+ pdf2image
218+ # unstructured-paddleocr
219+ ] ;
220+ pdf = [
221+ pdf2image
222+ pdfminer-six
223+ pdfplumber
224+ # pi-heif
225+ pikepdf
226+ pypdf
227+ unstructured-inference
228+ # unstructured-pytesseract
229+ ] ;
230+ pptx = [
231+ lxml
232+ pillow
233+ python-pptx
234+ xlsxwriter
235+ ] ;
236+ xlsx = [
237+ et-xmlfile
238+ networkx
239+ numpy
240+ openpyxl
241+ pandas
242+ xlrd
243+ ] ;
244+ huggingface = [
245+ langdetect
246+ sacremoses
247+ sentencepiece
248+ torch
249+ transformers
250+ ] ;
251+ } ;
252+
127253 pythonImportsCheck = [ "unstructured" ] ;
128254
129255 # tests try to download punkt from nltk
@@ -143,8 +269,6 @@ buildPythonPackage {
143269 grpcio
144270 ] ;
145271
146- optional-dependencies = optional-dependencies ;
147-
148272 meta = with lib ; {
149273 description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines" ;
150274 mainProgram = "unstructured-ingest" ;
0 commit comments