22import json
33from datetime import datetime
44from pathlib import Path
5- from typing import Any , Generator , Iterable , Optional , Sequence , TypeVar , Union , cast
6-
7- import pandas as pd
5+ from typing import TYPE_CHECKING , Any , Generator , Iterable , Optional , Sequence , TypeVar , Union , cast
86
97from unstructured_ingest .utils import ndjson
8+ from unstructured_ingest .utils .dep_check import requires_dependencies
109from unstructured_ingest .v2 .logger import logger
1110
11+ if TYPE_CHECKING :
12+ from pandas import DataFrame
13+
# strptime-style format strings: date only, "T"-separated date-time,
# "+"-separated date-time, and "T"-separated date-time with a UTC offset (%z).
# NOTE(review): presumably tried in order when parsing date strings elsewhere
# in this module -- confirm against the consumer, which is not visible here.
1214DATE_FORMATS = ("%Y-%m-%d" , "%Y-%m-%dT%H:%M:%S" , "%Y-%m-%d+%H:%M:%S" , "%Y-%m-%dT%H:%M:%S%z" )
1315
# Generic element type variable, and an alias for "iterable of T".
1416T = TypeVar ("T" )
1517IterableT = Iterable [T ]
1618
1719
def split_dataframe(df: "DataFrame", chunk_size: int = 100) -> Generator["DataFrame", None, None]:
    """Yield consecutive row-wise slices of *df*, each at most *chunk_size* long.

    Slices are produced in order and together cover every row exactly once.
    The last slice may be shorter than *chunk_size*. No empty slice is ever
    yielded: an input whose length is an exact multiple of *chunk_size* does
    not get a trailing empty chunk, and an empty input yields nothing.

    Args:
        df: Object supporting ``len()`` and positional slicing (``df[a:b]``).
        chunk_size: Maximum number of rows per yielded slice; must be positive.

    Yields:
        Successive ``df[start:start + chunk_size]`` slices.

    Raises:
        ValueError: If *chunk_size* is not a positive integer. Because this is
            a generator, the error surfaces on first iteration.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    # Stepping by chunk_size (rather than computing `len // chunk_size + 1`)
    # avoids emitting a spurious empty chunk when len(df) divides evenly.
    for start in range(0, len(df), chunk_size):
        yield df[start : start + chunk_size]
@@ -144,9 +146,13 @@ def get_data_by_suffix(path: Path) -> list[dict]:
144146 elif path .suffix == ".ndjson" :
145147 return ndjson .load (f )
146148 elif path .suffix == ".csv" :
149+ import pandas as pd
150+
147151 df = pd .read_csv (path )
148152 return df .to_dict (orient = "records" )
149153 elif path .suffix == ".parquet" :
154+ import pandas as pd
155+
150156 df = pd .read_parquet (path )
151157 return df .to_dict (orient = "records" )
152158 else :
@@ -180,6 +186,9 @@ def get_data(path: Union[Path, str]) -> list[dict]:
180186 return ndjson .load (f )
181187 except Exception as e :
182188 logger .warning (f"failed to read { path } as ndjson: { e } " )
189+
190+ import pandas as pd
191+
183192 try :
184193 df = pd .read_csv (path )
185194 return df .to_dict (orient = "records" )
@@ -202,7 +211,10 @@ def get_json_data(path: Path) -> list[dict]:
202211 raise ValueError (f"Unsupported file type: { path } " )
203212
204213
205- def get_data_df (path : Path ) -> pd .DataFrame :
214+ @requires_dependencies (["pandas" ])
215+ def get_data_df (path : Path ) -> "DataFrame" :
216+ import pandas as pd
217+
206218 with path .open () as f :
207219 if path .suffix == ".json" :
208220 data = json .load (f )
0 commit comments