Skip to content

Commit c403237

Browse files
feat/optimize fsspec indexing (#148)
* Use data from initial request to populate metadata object
* bump changelog
* fix fsspec metadata
* fix azure connector
* fix dropbox connector
* fix dropbox connector
* fix gcs created at date
* Add file id
* Update ingest test fixtures (#151)

Co-authored-by: rbiseck3 <[email protected]>

---------

Co-authored-by: Unstructured-DevOps <[email protected]>
1 parent a57927c commit c403237

28 files changed

+788
-709
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,10 @@
1-
## 0.0.22-dev1
1+
## 0.0.22-dev2
22

33
### Enhancements
44

55
* **Add documentation for developing sources/destinations**
66
* **Leverage `uv` for pip compile**
7+
* **Use incoming fsspec data to populate metadata** Rather than making additional calls to collect metadata after the initial file listing, use the connector-specific data returned by that listing to populate the metadata.
78

89
## 0.0.21
910

test_e2e/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json

Lines changed: 26 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
"page_number": 1,
1212
"data_source": {
1313
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
14-
"version": "237960874052008560436652606947751982249",
14+
"version": "0x8DB214A673DD8D8",
1515
"record_locator": {
1616
"protocol": "abfs",
1717
"remote_file_path": "abfs://container1/"
@@ -35,7 +35,7 @@
3535
"page_number": 1,
3636
"data_source": {
3737
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
38-
"version": "237960874052008560436652606947751982249",
38+
"version": "0x8DB214A673DD8D8",
3939
"record_locator": {
4040
"protocol": "abfs",
4141
"remote_file_path": "abfs://container1/"
@@ -59,7 +59,7 @@
5959
"page_number": 1,
6060
"data_source": {
6161
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
62-
"version": "237960874052008560436652606947751982249",
62+
"version": "0x8DB214A673DD8D8",
6363
"record_locator": {
6464
"protocol": "abfs",
6565
"remote_file_path": "abfs://container1/"
@@ -83,7 +83,7 @@
8383
"page_number": 1,
8484
"data_source": {
8585
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
86-
"version": "237960874052008560436652606947751982249",
86+
"version": "0x8DB214A673DD8D8",
8787
"record_locator": {
8888
"protocol": "abfs",
8989
"remote_file_path": "abfs://container1/"
@@ -107,7 +107,7 @@
107107
"page_number": 1,
108108
"data_source": {
109109
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
110-
"version": "237960874052008560436652606947751982249",
110+
"version": "0x8DB214A673DD8D8",
111111
"record_locator": {
112112
"protocol": "abfs",
113113
"remote_file_path": "abfs://container1/"
@@ -131,7 +131,7 @@
131131
"page_number": 1,
132132
"data_source": {
133133
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
134-
"version": "237960874052008560436652606947751982249",
134+
"version": "0x8DB214A673DD8D8",
135135
"record_locator": {
136136
"protocol": "abfs",
137137
"remote_file_path": "abfs://container1/"
@@ -155,7 +155,7 @@
155155
"page_number": 1,
156156
"data_source": {
157157
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
158-
"version": "237960874052008560436652606947751982249",
158+
"version": "0x8DB214A673DD8D8",
159159
"record_locator": {
160160
"protocol": "abfs",
161161
"remote_file_path": "abfs://container1/"
@@ -179,7 +179,7 @@
179179
"page_number": 1,
180180
"data_source": {
181181
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
182-
"version": "237960874052008560436652606947751982249",
182+
"version": "0x8DB214A673DD8D8",
183183
"record_locator": {
184184
"protocol": "abfs",
185185
"remote_file_path": "abfs://container1/"
@@ -203,7 +203,7 @@
203203
"page_number": 1,
204204
"data_source": {
205205
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
206-
"version": "237960874052008560436652606947751982249",
206+
"version": "0x8DB214A673DD8D8",
207207
"record_locator": {
208208
"protocol": "abfs",
209209
"remote_file_path": "abfs://container1/"
@@ -227,7 +227,7 @@
227227
"page_number": 1,
228228
"data_source": {
229229
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
230-
"version": "237960874052008560436652606947751982249",
230+
"version": "0x8DB214A673DD8D8",
231231
"record_locator": {
232232
"protocol": "abfs",
233233
"remote_file_path": "abfs://container1/"
@@ -251,7 +251,7 @@
251251
"page_number": 1,
252252
"data_source": {
253253
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
254-
"version": "237960874052008560436652606947751982249",
254+
"version": "0x8DB214A673DD8D8",
255255
"record_locator": {
256256
"protocol": "abfs",
257257
"remote_file_path": "abfs://container1/"
@@ -275,7 +275,7 @@
275275
"page_number": 1,
276276
"data_source": {
277277
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
278-
"version": "237960874052008560436652606947751982249",
278+
"version": "0x8DB214A673DD8D8",
279279
"record_locator": {
280280
"protocol": "abfs",
281281
"remote_file_path": "abfs://container1/"
@@ -299,7 +299,7 @@
299299
"page_number": 1,
300300
"data_source": {
301301
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
302-
"version": "237960874052008560436652606947751982249",
302+
"version": "0x8DB214A673DD8D8",
303303
"record_locator": {
304304
"protocol": "abfs",
305305
"remote_file_path": "abfs://container1/"
@@ -323,7 +323,7 @@
323323
"page_number": 1,
324324
"data_source": {
325325
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
326-
"version": "237960874052008560436652606947751982249",
326+
"version": "0x8DB214A673DD8D8",
327327
"record_locator": {
328328
"protocol": "abfs",
329329
"remote_file_path": "abfs://container1/"
@@ -347,7 +347,7 @@
347347
"page_number": 2,
348348
"data_source": {
349349
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
350-
"version": "237960874052008560436652606947751982249",
350+
"version": "0x8DB214A673DD8D8",
351351
"record_locator": {
352352
"protocol": "abfs",
353353
"remote_file_path": "abfs://container1/"
@@ -371,7 +371,7 @@
371371
"page_number": 2,
372372
"data_source": {
373373
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
374-
"version": "237960874052008560436652606947751982249",
374+
"version": "0x8DB214A673DD8D8",
375375
"record_locator": {
376376
"protocol": "abfs",
377377
"remote_file_path": "abfs://container1/"
@@ -395,7 +395,7 @@
395395
"page_number": 2,
396396
"data_source": {
397397
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
398-
"version": "237960874052008560436652606947751982249",
398+
"version": "0x8DB214A673DD8D8",
399399
"record_locator": {
400400
"protocol": "abfs",
401401
"remote_file_path": "abfs://container1/"
@@ -419,7 +419,7 @@
419419
"page_number": 2,
420420
"data_source": {
421421
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
422-
"version": "237960874052008560436652606947751982249",
422+
"version": "0x8DB214A673DD8D8",
423423
"record_locator": {
424424
"protocol": "abfs",
425425
"remote_file_path": "abfs://container1/"
@@ -443,7 +443,7 @@
443443
"page_number": 2,
444444
"data_source": {
445445
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
446-
"version": "237960874052008560436652606947751982249",
446+
"version": "0x8DB214A673DD8D8",
447447
"record_locator": {
448448
"protocol": "abfs",
449449
"remote_file_path": "abfs://container1/"
@@ -467,7 +467,7 @@
467467
"page_number": 2,
468468
"data_source": {
469469
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
470-
"version": "237960874052008560436652606947751982249",
470+
"version": "0x8DB214A673DD8D8",
471471
"record_locator": {
472472
"protocol": "abfs",
473473
"remote_file_path": "abfs://container1/"
@@ -491,7 +491,7 @@
491491
"page_number": 2,
492492
"data_source": {
493493
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
494-
"version": "237960874052008560436652606947751982249",
494+
"version": "0x8DB214A673DD8D8",
495495
"record_locator": {
496496
"protocol": "abfs",
497497
"remote_file_path": "abfs://container1/"
@@ -515,7 +515,7 @@
515515
"page_number": 2,
516516
"data_source": {
517517
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
518-
"version": "237960874052008560436652606947751982249",
518+
"version": "0x8DB214A673DD8D8",
519519
"record_locator": {
520520
"protocol": "abfs",
521521
"remote_file_path": "abfs://container1/"
@@ -539,7 +539,7 @@
539539
"page_number": 2,
540540
"data_source": {
541541
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
542-
"version": "237960874052008560436652606947751982249",
542+
"version": "0x8DB214A673DD8D8",
543543
"record_locator": {
544544
"protocol": "abfs",
545545
"remote_file_path": "abfs://container1/"
@@ -563,7 +563,7 @@
563563
"page_number": 2,
564564
"data_source": {
565565
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
566-
"version": "237960874052008560436652606947751982249",
566+
"version": "0x8DB214A673DD8D8",
567567
"record_locator": {
568568
"protocol": "abfs",
569569
"remote_file_path": "abfs://container1/"
@@ -587,7 +587,7 @@
587587
"page_number": 2,
588588
"data_source": {
589589
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
590-
"version": "237960874052008560436652606947751982249",
590+
"version": "0x8DB214A673DD8D8",
591591
"record_locator": {
592592
"protocol": "abfs",
593593
"remote_file_path": "abfs://container1/"
@@ -611,7 +611,7 @@
611611
"page_number": 2,
612612
"data_source": {
613613
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
614-
"version": "237960874052008560436652606947751982249",
614+
"version": "0x8DB214A673DD8D8",
615615
"record_locator": {
616616
"protocol": "abfs",
617617
"remote_file_path": "abfs://container1/"

0 commit comments

Comments
 (0)