Skip to content

Commit 898d642

Browse files
authored
bugfix/fix data read (#438)
* fix stagers that were only reading json rather than have support for ndjson * bump version
1 parent ec84155 commit 898d642

File tree

6 files changed

+28
-12
lines changed

6 files changed

+28
-12
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.5.18
2+
3+
### Fixes
4+
5+
* **Fix missing support for NDJSON in stagers**
6+
17
## 0.5.17
28

39
### Fixes

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.17" # pragma: no cover
1+
__version__ = "0.5.18" # pragma: no cover

unstructured_ingest/utils/data_prep.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,16 @@ def get_data(path: Union[Path, str]) -> list[dict]:
192192
logger.warning(f"failed to read {path} as parquet: {e}")
193193

194194

195+
def get_json_data(path: Path) -> list[dict]:
196+
with path.open() as f:
197+
if path.suffix == ".json":
198+
return json.load(f)
199+
elif path.suffix == ".ndjson":
200+
return ndjson.load(f)
201+
else:
202+
raise ValueError(f"Unsupported file type: {path}")
203+
204+
195205
def get_data_df(path: Path) -> pd.DataFrame:
196206
with path.open() as f:
197207
if path.suffix == ".json":

unstructured_ingest/v2/processes/connectors/delta_table.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import json
21
import os
32
import traceback
43
from dataclasses import dataclass, field
@@ -11,7 +10,7 @@
1110
from pydantic import Field, Secret
1211

1312
from unstructured_ingest.error import DestinationConnectionError
14-
from unstructured_ingest.utils.data_prep import get_data_df
13+
from unstructured_ingest.utils.data_prep import get_data_df, get_json_data
1514
from unstructured_ingest.utils.dep_check import requires_dependencies
1615
from unstructured_ingest.utils.table import convert_to_pandas_dataframe
1716
from unstructured_ingest.v2.interfaces import (
@@ -86,9 +85,7 @@ def run(
8685
output_filename: str,
8786
**kwargs: Any,
8887
) -> Path:
89-
with open(elements_filepath) as elements_file:
90-
elements_contents = json.load(elements_file)
91-
88+
elements_contents = get_json_data(elements_filepath)
9289
output_path = Path(output_dir) / Path(f"{output_filename}.parquet")
9390

9491
df = convert_to_pandas_dataframe(elements_dict=elements_contents)

unstructured_ingest/v2/processes/connectors/neo4j.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from __future__ import annotations
22

33
import asyncio
4-
import json
54
import uuid
65
from collections import defaultdict
76
from contextlib import asynccontextmanager
@@ -14,7 +13,7 @@
1413

1514
from unstructured_ingest.error import DestinationConnectionError
1615
from unstructured_ingest.logger import logger
17-
from unstructured_ingest.utils.data_prep import batch_generator
16+
from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
1817
from unstructured_ingest.utils.dep_check import requires_dependencies
1918
from unstructured_ingest.v2.interfaces import (
2019
AccessConfig,
@@ -97,8 +96,7 @@ def run( # type: ignore
9796
output_filename: str,
9897
**kwargs: Any,
9998
) -> Path:
100-
with elements_filepath.open() as file:
101-
elements = json.load(file)
99+
elements = get_json_data(elements_filepath)
102100

103101
nx_graph = self._create_lexical_graph(
104102
elements, self._create_document_node(file_data=file_data)
@@ -294,8 +292,7 @@ def is_async(self):
294292
return True
295293

296294
async def run_async(self, path: Path, file_data: FileData, **kwargs) -> None: # type: ignore
297-
with path.open() as file:
298-
staged_data = json.load(file)
295+
staged_data = get_json_data(path)
299296

300297
graph_data = _GraphData.model_validate(staged_data)
301298
async with self.connection_config.get_client() as client:

unstructured_ingest/v2/processes/connectors/onedrive.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@
3535
DestinationRegistryEntry,
3636
SourceRegistryEntry,
3737
)
38+
from unstructured_ingest.v2.processes.utils.blob_storage import (
39+
BlobStoreUploadStager,
40+
BlobStoreUploadStagerConfig,
41+
)
3842

3943
if TYPE_CHECKING:
4044
from office365.graph_client import GraphClient
@@ -428,4 +432,6 @@ def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
428432
connection_config=OnedriveConnectionConfig,
429433
uploader=OnedriveUploader,
430434
uploader_config=OnedriveUploaderConfig,
435+
upload_stager_config=BlobStoreUploadStagerConfig,
436+
upload_stager=BlobStoreUploadStager,
431437
)

0 commit comments

Comments
 (0)