@@ -368,7 +368,8 @@ def rows_index_with_empty_dataset(
368368) -> Generator [RowsIndex , None , None ]:
369369 with ds_empty_fs .open ("default/train/0000.parquet" ) as f :
370370 with patch ("libcommon.parquet_utils.HTTPFile" , return_value = f ):
371- yield indexer .get_rows_index ("ds_empty" , "default" , "train" )
371+ data_store = f"file://{ ds_empty_fs .local_root_dir } "
372+ yield indexer .get_rows_index ("ds_empty" , "default" , "train" , data_store = data_store )
372373
373374
374375@pytest .fixture
@@ -386,7 +387,8 @@ def rows_index_with_too_big_rows(
386387 )
387388 with ds_sharded_fs .open ("default/train/0003.parquet" ) as f :
388389 with patch ("libcommon.parquet_utils.HTTPFile" , return_value = f ):
389- yield indexer .get_rows_index ("ds_sharded" , "default" , "train" )
390+ data_store = f"file://{ ds_sharded_fs .local_root_dir } "
391+ yield indexer .get_rows_index ("ds_sharded" , "default" , "train" , data_store = data_store )
390392
391393
392394@pytest .fixture
@@ -465,24 +467,18 @@ def test_rows_index_query_with_parquet_metadata(
465467 with pytest .raises (IndexError ):
466468 rows_index_with_parquet_metadata .query (offset = - 1 , length = 2 )
467469
470+ # test the same with page pruning API
471+ import libviewer as lv # type: ignore [import-untyped]
468472
469- def test_rows_index_query_with_page_pruning (rows_index_with_parquet_metadata : RowsIndex , ds_sharded : Dataset ) -> None :
470- from libviewer import Dataset as LibviewerDataset # type: ignore [import-untyped]
471-
472- assert isinstance (rows_index_with_parquet_metadata .viewer_index , LibviewerDataset )
473-
473+ assert isinstance (rows_index_with_parquet_metadata .viewer_index , lv .Dataset )
474474 result = rows_index_with_parquet_metadata .query_with_page_pruning (offset = 1 , length = 3 )
475475 assert result .to_pydict () == ds_sharded [1 :4 ]
476-
477476 result = rows_index_with_parquet_metadata .query_with_page_pruning (offset = 1 , length = 0 )
478477 assert result .to_pydict () == ds_sharded [:0 ]
479-
480478 result = rows_index_with_parquet_metadata .query_with_page_pruning (offset = 999999 , length = 1 )
481479 assert result .to_pydict () == ds_sharded [:0 ]
482-
483480 result = rows_index_with_parquet_metadata .query_with_page_pruning (offset = 1 , length = 99999999 )
484481 assert result .to_pydict () == ds_sharded [1 :]
485-
486482 with pytest .raises (IndexError ):
487483 rows_index_with_parquet_metadata .query_with_page_pruning (offset = 0 , length = - 1 )
488484 with pytest .raises (IndexError ):
def test_rows_index_query_with_too_big_rows(rows_index_with_too_big_rows: RowsIndex) -> None:
    """A shard whose rows exceed the size limit must raise TooBigRows on query."""
    with pytest.raises(TooBigRows):
        rows_index_with_too_big_rows.query(offset=0, length=3)

    # test the same with page pruning API
    with pytest.raises(TooBigRows):
        rows_index_with_too_big_rows.query_with_page_pruning(offset=0, length=2)
496496
def test_rows_index_query_with_empty_dataset(rows_index_with_empty_dataset: RowsIndex, ds_sharded: Dataset) -> None:
    """Querying an empty dataset returns zero rows; negative offsets raise IndexError."""
    assert isinstance(rows_index_with_empty_dataset.parquet_index, ParquetIndexWithMetadata)
    assert rows_index_with_empty_dataset.query(offset=0, length=1).to_pydict() == ds_sharded[:0]
    with pytest.raises(IndexError):
        rows_index_with_empty_dataset.query(offset=-1, length=2)

    # test the same with page pruning API
    import libviewer as lv  # type: ignore[import-untyped]

    assert isinstance(rows_index_with_empty_dataset.viewer_index, lv.Dataset)
    result = rows_index_with_empty_dataset.query_with_page_pruning(offset=0, length=1)
    assert result.to_pydict() == ds_sharded[:0]
    with pytest.raises(IndexError):
        rows_index_with_empty_dataset.query_with_page_pruning(offset=-1, length=2)
503511
504512def test_indexer_schema_mistmatch_error (
505513 indexer : Indexer ,
0 commit comments