diff --git a/python/deltalake/_internal.pyi b/python/deltalake/_internal.pyi index 228488d91a..bc73f3942b 100644 --- a/python/deltalake/_internal.pyi +++ b/python/deltalake/_internal.pyi @@ -31,6 +31,7 @@ class RawDeltaTable: storage_options: Optional[Dict[str, str]], without_files: bool, log_buffer_size: Optional[int], + load_lazy: bool, ) -> None: ... @staticmethod def get_table_uri_from_data_catalog( diff --git a/python/deltalake/table.py b/python/deltalake/table.py index e7b7613599..f10c82635f 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -235,6 +235,7 @@ def __init__( storage_options: Optional[Dict[str, str]] = None, without_files: bool = False, log_buffer_size: Optional[int] = None, + load_lazy: bool = False, ): """ Create the Delta Table from a path with an optional version. @@ -253,7 +254,7 @@ def __init__( This can decrease latency if there are many files in the log since the last checkpoint, but will also increase memory usage. Possible rate limits of the storage backend should also be considered for optimal performance. Defaults to 4 * number of cpus. - + load_lazy: when true the table metadata isn't loaded """ self._storage_options = storage_options self._table = RawDeltaTable( @@ -262,8 +263,12 @@ def __init__( storage_options=storage_options, without_files=without_files, log_buffer_size=log_buffer_size, + load_lazy=load_lazy, ) - self._metadata = Metadata(self._table) + if load_lazy: + self._metadata = None + else: + self._metadata = Metadata(self._table) @classmethod def from_data_catalog( @@ -498,6 +503,8 @@ def metadata(self) -> Metadata: Returns: the current Metadata registered in the transaction log """ + if not self._metadata: + self._metadata = Metadata(self._table) return self._metadata def protocol(self) -> ProtocolVersions: diff --git a/python/src/lib.rs b/python/src/lib.rs index 5741bd40d2..ee6c1de5d2 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -88,13 +88,14 @@ struct RawDeltaTableMetaData { #[pymethods] impl RawDeltaTable { #[new] - #[pyo3(signature = (table_uri, version = None, storage_options = None, without_files = false, log_buffer_size = None))] + #[pyo3(signature = (table_uri, version = None, storage_options = None, without_files = false, log_buffer_size = None, load_lazy = false))] fn new( table_uri: &str, version: Option, storage_options: Option>, without_files: bool, log_buffer_size: Option, + load_lazy: bool, ) -> PyResult { let mut builder = deltalake::DeltaTableBuilder::from_uri(table_uri); let options = storage_options.clone().unwrap_or_default(); @@ -112,8 +113,11 @@ impl RawDeltaTable { .with_log_buffer_size(buf_size) .map_err(PythonError::from)?; } - - let table = rt()?.block_on(builder.load()).map_err(PythonError::from)?; + let table = if !load_lazy { + rt()?.block_on(builder.load()).map_err(PythonError::from)? + } else { + builder.build().map_err(PythonError::from)? + }; Ok(RawDeltaTable { _table: table, _config: FsConfig { diff --git a/python/tests/test_table_read.py b/python/tests/test_table_read.py index a49374e710..e760116910 100644 --- a/python/tests/test_table_read.py +++ b/python/tests/test_table_read.py @@ -63,6 +63,25 @@ def test_read_simple_table_using_options_to_dict(): assert dt.to_pyarrow_dataset().to_table().to_pydict() == {"value": [1, 2, 3]} +def test_simple_table_lazy_loading(): + table_path = "../crates/deltalake-core/tests/data/simple_table" + dt = DeltaTable(table_path, load_lazy=True) + dt.load_version(2) + assert dt.version() == 2 + + +def test_simple_table_lazy_loading_with_options(): + table_path = "../crates/deltalake-core/tests/data/simple_table" + dt = DeltaTable( + table_path, + storage_options={}, + without_files=False, + log_buffer_size=1, + load_lazy=True, + ) + assert isinstance(dt, DeltaTable) + + def test_load_with_datetime(): log_dir = "../crates/deltalake-core/tests/data/simple_table/_delta_log" log_mtime_pair = [