Skip to content

Commit 1df7908

Browse files
feat: save file id for all fsspec connectors if present (#3405)
### Description If the id value exists in the stats response from fsspec, save it as a `file_id` field in the metadata being persisted on each element. --------- Co-authored-by: ryannikolaidis <[email protected]> Co-authored-by: rbiseck3 <[email protected]>
1 parent 0eb461a commit 1df7908

File tree

17 files changed

+163
-81
lines changed

17 files changed

+163
-81
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.15.0-dev15
1+
## 0.15.0-dev16
22

33
### Enhancements
44

test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
"version": "83125548004193369404829885052395764226",
1515
"record_locator": {
1616
"protocol": "box",
17-
"remote_file_path": "box://utic-test-ingest-fixtures"
17+
"remote_file_path": "box://utic-test-ingest-fixtures",
18+
"file_id": "1255888824139"
1819
},
1920
"date_created": "1688874451.0",
2021
"date_modified": "1688874451.0"
@@ -41,7 +42,8 @@
4142
"version": "83125548004193369404829885052395764226",
4243
"record_locator": {
4344
"protocol": "box",
44-
"remote_file_path": "box://utic-test-ingest-fixtures"
45+
"remote_file_path": "box://utic-test-ingest-fixtures",
46+
"file_id": "1255888824139"
4547
},
4648
"date_created": "1688874451.0",
4749
"date_modified": "1688874451.0"
@@ -68,7 +70,8 @@
6870
"version": "83125548004193369404829885052395764226",
6971
"record_locator": {
7072
"protocol": "box",
71-
"remote_file_path": "box://utic-test-ingest-fixtures"
73+
"remote_file_path": "box://utic-test-ingest-fixtures",
74+
"file_id": "1255888824139"
7275
},
7376
"date_created": "1688874451.0",
7477
"date_modified": "1688874451.0"
@@ -89,7 +92,8 @@
8992
"version": "83125548004193369404829885052395764226",
9093
"record_locator": {
9194
"protocol": "box",
92-
"remote_file_path": "box://utic-test-ingest-fixtures"
95+
"remote_file_path": "box://utic-test-ingest-fixtures",
96+
"file_id": "1255888824139"
9397
},
9498
"date_created": "1688874451.0",
9599
"date_modified": "1688874451.0"
@@ -110,7 +114,8 @@
110114
"version": "83125548004193369404829885052395764226",
111115
"record_locator": {
112116
"protocol": "box",
113-
"remote_file_path": "box://utic-test-ingest-fixtures"
117+
"remote_file_path": "box://utic-test-ingest-fixtures",
118+
"file_id": "1255888824139"
114119
},
115120
"date_created": "1688874451.0",
116121
"date_modified": "1688874451.0"
@@ -131,7 +136,8 @@
131136
"version": "83125548004193369404829885052395764226",
132137
"record_locator": {
133138
"protocol": "box",
134-
"remote_file_path": "box://utic-test-ingest-fixtures"
139+
"remote_file_path": "box://utic-test-ingest-fixtures",
140+
"file_id": "1255888824139"
135141
},
136142
"date_created": "1688874451.0",
137143
"date_modified": "1688874451.0"
@@ -152,7 +158,8 @@
152158
"version": "83125548004193369404829885052395764226",
153159
"record_locator": {
154160
"protocol": "box",
155-
"remote_file_path": "box://utic-test-ingest-fixtures"
161+
"remote_file_path": "box://utic-test-ingest-fixtures",
162+
"file_id": "1255888824139"
156163
},
157164
"date_created": "1688874451.0",
158165
"date_modified": "1688874451.0"
@@ -173,7 +180,8 @@
173180
"version": "83125548004193369404829885052395764226",
174181
"record_locator": {
175182
"protocol": "box",
176-
"remote_file_path": "box://utic-test-ingest-fixtures"
183+
"remote_file_path": "box://utic-test-ingest-fixtures",
184+
"file_id": "1255888824139"
177185
},
178186
"date_created": "1688874451.0",
179187
"date_modified": "1688874451.0"
@@ -194,7 +202,8 @@
194202
"version": "83125548004193369404829885052395764226",
195203
"record_locator": {
196204
"protocol": "box",
197-
"remote_file_path": "box://utic-test-ingest-fixtures"
205+
"remote_file_path": "box://utic-test-ingest-fixtures",
206+
"file_id": "1255888824139"
198207
},
199208
"date_created": "1688874451.0",
200209
"date_modified": "1688874451.0"
@@ -215,7 +224,8 @@
215224
"version": "83125548004193369404829885052395764226",
216225
"record_locator": {
217226
"protocol": "box",
218-
"remote_file_path": "box://utic-test-ingest-fixtures"
227+
"remote_file_path": "box://utic-test-ingest-fixtures",
228+
"file_id": "1255888824139"
219229
},
220230
"date_created": "1688874451.0",
221231
"date_modified": "1688874451.0"
@@ -236,7 +246,8 @@
236246
"version": "83125548004193369404829885052395764226",
237247
"record_locator": {
238248
"protocol": "box",
239-
"remote_file_path": "box://utic-test-ingest-fixtures"
249+
"remote_file_path": "box://utic-test-ingest-fixtures",
250+
"file_id": "1255888824139"
240251
},
241252
"date_created": "1688874451.0",
242253
"date_modified": "1688874451.0"
@@ -257,7 +268,8 @@
257268
"version": "83125548004193369404829885052395764226",
258269
"record_locator": {
259270
"protocol": "box",
260-
"remote_file_path": "box://utic-test-ingest-fixtures"
271+
"remote_file_path": "box://utic-test-ingest-fixtures",
272+
"file_id": "1255888824139"
261273
},
262274
"date_created": "1688874451.0",
263275
"date_modified": "1688874451.0"
@@ -278,7 +290,8 @@
278290
"version": "83125548004193369404829885052395764226",
279291
"record_locator": {
280292
"protocol": "box",
281-
"remote_file_path": "box://utic-test-ingest-fixtures"
293+
"remote_file_path": "box://utic-test-ingest-fixtures",
294+
"file_id": "1255888824139"
282295
},
283296
"date_created": "1688874451.0",
284297
"date_modified": "1688874451.0"
@@ -299,7 +312,8 @@
299312
"version": "83125548004193369404829885052395764226",
300313
"record_locator": {
301314
"protocol": "box",
302-
"remote_file_path": "box://utic-test-ingest-fixtures"
315+
"remote_file_path": "box://utic-test-ingest-fixtures",
316+
"file_id": "1255888824139"
303317
},
304318
"date_created": "1688874451.0",
305319
"date_modified": "1688874451.0"
@@ -321,7 +335,8 @@
321335
"version": "83125548004193369404829885052395764226",
322336
"record_locator": {
323337
"protocol": "box",
324-
"remote_file_path": "box://utic-test-ingest-fixtures"
338+
"remote_file_path": "box://utic-test-ingest-fixtures",
339+
"file_id": "1255888824139"
325340
},
326341
"date_created": "1688874451.0",
327342
"date_modified": "1688874451.0"

test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
"version": "77943175838335685751163845636763163681",
1515
"record_locator": {
1616
"protocol": "box",
17-
"remote_file_path": "box://utic-test-ingest-fixtures"
17+
"remote_file_path": "box://utic-test-ingest-fixtures",
18+
"file_id": "1255892530552"
1819
},
1920
"date_created": "1688874401.0",
2021
"date_modified": "1688874401.0"

test_unstructured_ingest/expected-structured-output/box/nested-1/nested-2/ideas-page.html.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
"version": "293680985726204769765169474511274942733",
1515
"record_locator": {
1616
"protocol": "box",
17-
"remote_file_path": "box://utic-test-ingest-fixtures"
17+
"remote_file_path": "box://utic-test-ingest-fixtures",
18+
"file_id": "1255884723846"
1819
},
1920
"date_created": "1688874389.0",
2021
"date_modified": "1688874389.0"

test_unstructured_ingest/expected-structured-output/box/science-exploration-1p.pptx.json

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
"version": "309546934335254463247992132065898582121",
1515
"record_locator": {
1616
"protocol": "box",
17-
"remote_file_path": "box://utic-test-ingest-fixtures"
17+
"remote_file_path": "box://utic-test-ingest-fixtures",
18+
"file_id": "1255894255490"
1819
},
1920
"date_created": "1688874428.0",
2021
"date_modified": "1688874428.0"
@@ -36,7 +37,8 @@
3637
"version": "309546934335254463247992132065898582121",
3738
"record_locator": {
3839
"protocol": "box",
39-
"remote_file_path": "box://utic-test-ingest-fixtures"
40+
"remote_file_path": "box://utic-test-ingest-fixtures",
41+
"file_id": "1255894255490"
4042
},
4143
"date_created": "1688874428.0",
4244
"date_modified": "1688874428.0"
@@ -58,7 +60,8 @@
5860
"version": "309546934335254463247992132065898582121",
5961
"record_locator": {
6062
"protocol": "box",
61-
"remote_file_path": "box://utic-test-ingest-fixtures"
63+
"remote_file_path": "box://utic-test-ingest-fixtures",
64+
"file_id": "1255894255490"
6265
},
6366
"date_created": "1688874428.0",
6467
"date_modified": "1688874428.0"
@@ -80,7 +83,8 @@
8083
"version": "309546934335254463247992132065898582121",
8184
"record_locator": {
8285
"protocol": "box",
83-
"remote_file_path": "box://utic-test-ingest-fixtures"
86+
"remote_file_path": "box://utic-test-ingest-fixtures",
87+
"file_id": "1255894255490"
8488
},
8589
"date_created": "1688874428.0",
8690
"date_modified": "1688874428.0"
@@ -102,7 +106,8 @@
102106
"version": "309546934335254463247992132065898582121",
103107
"record_locator": {
104108
"protocol": "box",
105-
"remote_file_path": "box://utic-test-ingest-fixtures"
109+
"remote_file_path": "box://utic-test-ingest-fixtures",
110+
"file_id": "1255894255490"
106111
},
107112
"date_created": "1688874428.0",
108113
"date_modified": "1688874428.0"
@@ -124,7 +129,8 @@
124129
"version": "309546934335254463247992132065898582121",
125130
"record_locator": {
126131
"protocol": "box",
127-
"remote_file_path": "box://utic-test-ingest-fixtures"
132+
"remote_file_path": "box://utic-test-ingest-fixtures",
133+
"file_id": "1255894255490"
128134
},
129135
"date_created": "1688874428.0",
130136
"date_modified": "1688874428.0"
@@ -146,7 +152,8 @@
146152
"version": "309546934335254463247992132065898582121",
147153
"record_locator": {
148154
"protocol": "box",
149-
"remote_file_path": "box://utic-test-ingest-fixtures"
155+
"remote_file_path": "box://utic-test-ingest-fixtures",
156+
"file_id": "1255894255490"
150157
},
151158
"date_created": "1688874428.0",
152159
"date_modified": "1688874428.0"
@@ -168,7 +175,8 @@
168175
"version": "309546934335254463247992132065898582121",
169176
"record_locator": {
170177
"protocol": "box",
171-
"remote_file_path": "box://utic-test-ingest-fixtures"
178+
"remote_file_path": "box://utic-test-ingest-fixtures",
179+
"file_id": "1255894255490"
172180
},
173181
"date_created": "1688874428.0",
174182
"date_modified": "1688874428.0"
@@ -190,7 +198,8 @@
190198
"version": "309546934335254463247992132065898582121",
191199
"record_locator": {
192200
"protocol": "box",
193-
"remote_file_path": "box://utic-test-ingest-fixtures"
201+
"remote_file_path": "box://utic-test-ingest-fixtures",
202+
"file_id": "1255894255490"
194203
},
195204
"date_created": "1688874428.0",
196205
"date_modified": "1688874428.0"
@@ -212,7 +221,8 @@
212221
"version": "309546934335254463247992132065898582121",
213222
"record_locator": {
214223
"protocol": "box",
215-
"remote_file_path": "box://utic-test-ingest-fixtures"
224+
"remote_file_path": "box://utic-test-ingest-fixtures",
225+
"file_id": "1255894255490"
216226
},
217227
"date_created": "1688874428.0",
218228
"date_modified": "1688874428.0"
@@ -234,7 +244,8 @@
234244
"version": "309546934335254463247992132065898582121",
235245
"record_locator": {
236246
"protocol": "box",
237-
"remote_file_path": "box://utic-test-ingest-fixtures"
247+
"remote_file_path": "box://utic-test-ingest-fixtures",
248+
"file_id": "1255894255490"
238249
},
239250
"date_created": "1688874428.0",
240251
"date_modified": "1688874428.0"
@@ -256,7 +267,8 @@
256267
"version": "309546934335254463247992132065898582121",
257268
"record_locator": {
258269
"protocol": "box",
259-
"remote_file_path": "box://utic-test-ingest-fixtures"
270+
"remote_file_path": "box://utic-test-ingest-fixtures",
271+
"file_id": "1255894255490"
260272
},
261273
"date_created": "1688874428.0",
262274
"date_modified": "1688874428.0"
@@ -278,7 +290,8 @@
278290
"version": "309546934335254463247992132065898582121",
279291
"record_locator": {
280292
"protocol": "box",
281-
"remote_file_path": "box://utic-test-ingest-fixtures"
293+
"remote_file_path": "box://utic-test-ingest-fixtures",
294+
"file_id": "1255894255490"
282295
},
283296
"date_created": "1688874428.0",
284297
"date_modified": "1688874428.0"

0 commit comments

Comments
 (0)