
Commit 51ff463

refactor(libcommon): remove Indexer in favor of local caching in the rows endpoint (#3251)
* refactor(libcommon): remove the effectively unused arguments of `Indexer`
* style: remove unnecessarry imports
* refactor(libcommon): remove `Indexer`
* refactor(services): directly create `RowsIndex` instead of `Indexer`
* test(libcommon): fix `test_rows_index_query_with_empty_dataset` to use `ds_empty`
* chore: missing import and mypy types
* style: fix import order
* fix(libcommon): cache the latest instance of `RowsIndex`
* test(libcommon): add a test for caching the latest RowsIndex instance
* fix(libcommon): only cache RowsIndex when serving from the rows endpoint
* test(libcommon): remove previously added test case for caching RowIndex instances
* chore: missing type annotations
1 parent c7d0081 commit 51ff463

4 files changed, +117 −106 lines changed


libs/libcommon/src/libcommon/parquet_utils.py

Lines changed: 0 additions & 28 deletions
@@ -601,31 +601,3 @@ def query_truncated_binary(self, offset: int, length: int) -> tuple[pa.Table, li
             f" split={self.split}, offset={offset}, length={length}, with truncated binary"
         )
         return self.parquet_index.query_truncated_binary(offset=offset, length=length)
-
-
-class Indexer:
-    def __init__(
-        self,
-        parquet_metadata_directory: StrPath,
-        httpfs: HTTPFileSystem,
-        max_arrow_data_in_memory: int,
-    ):
-        self.parquet_metadata_directory = parquet_metadata_directory
-        self.httpfs = httpfs
-        self.max_arrow_data_in_memory = max_arrow_data_in_memory
-
-    @lru_cache(maxsize=1)
-    def get_rows_index(
-        self,
-        dataset: str,
-        config: str,
-        split: str,
-    ) -> RowsIndex:
-        return RowsIndex(
-            dataset=dataset,
-            config=config,
-            split=split,
-            httpfs=self.httpfs,
-            parquet_metadata_directory=self.parquet_metadata_directory,
-            max_arrow_data_in_memory=self.max_arrow_data_in_memory,
-        )
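The removed `Indexer.get_rows_index` relied on `functools.lru_cache(maxsize=1)`, which only remembers the single most recent argument tuple; the refactor keeps the same decorator but moves it into the rows endpoint. A minimal sketch of those semantics, with a plain string standing in for `RowsIndex` and made-up dataset names:

from functools import lru_cache

@lru_cache(maxsize=1)
def get_rows_index(dataset: str, config: str, split: str) -> str:
    # stand-in for building a RowsIndex, which reads the dataset's parquet
    # metadata from the cache database on every construction
    return f"index:{dataset}/{config}/{split}"

get_rows_index("glue", "cola", "train")         # miss: the index is built
get_rows_index("glue", "cola", "train")         # hit: the cached instance is reused
get_rows_index("squad", "plain_text", "train")  # miss: evicts the previous entry
print(get_rows_index.cache_info())
# CacheInfo(hits=1, misses=2, maxsize=1, currsize=1)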

libs/libcommon/tests/test_parquet_utils.py

Lines changed: 80 additions & 54 deletions
@@ -17,7 +17,6 @@
 from fsspec.implementations.http import HTTPFileSystem
 
 from libcommon.parquet_utils import (
-    Indexer,
     ParquetIndexWithMetadata,
     RowsIndex,
     SchemaMismatchError,
@@ -346,56 +345,25 @@ def dataset_image_with_config_parquet() -> dict[str, Any]:
     return config_parquet_content
 
 
+# TODO(kszucs): this fixture is used in a single test case, but the tests starts
+# to fail if I move the index creation there.
 @pytest.fixture
 def rows_index_with_parquet_metadata(
-    indexer: Indexer,
     ds_sharded: Dataset,
     ds_sharded_fs: AbstractFileSystem,
     dataset_sharded_with_config_parquet_metadata: dict[str, Any],
-) -> Generator[RowsIndex, None, None]:
-    with ds_sharded_fs.open("default/train/0003.parquet") as f:
-        with patch("libcommon.parquet_utils.HTTPFile", return_value=f):
-            yield indexer.get_rows_index("ds_sharded", "default", "train")
-
-
-@pytest.fixture
-def rows_index_with_empty_dataset(
-    indexer: Indexer,
-    ds_empty: Dataset,
-    ds_empty_fs: AbstractFileSystem,
-    dataset_empty_with_config_parquet_metadata: dict[str, Any],
-) -> Generator[RowsIndex, None, None]:
-    with ds_empty_fs.open("default/train/0000.parquet") as f:
-        with patch("libcommon.parquet_utils.HTTPFile", return_value=f):
-            yield indexer.get_rows_index("ds_empty", "default", "train")
-
-
-@pytest.fixture
-def rows_index_with_too_big_rows(
     parquet_metadata_directory: StrPath,
-    ds_sharded: Dataset,
-    ds_sharded_fs: AbstractFileSystem,
-    dataset_sharded_with_config_parquet_metadata: dict[str, Any],
 ) -> Generator[RowsIndex, None, None]:
-    indexer = Indexer(
-        parquet_metadata_directory=parquet_metadata_directory,
-        httpfs=HTTPFileSystem(),
-        max_arrow_data_in_memory=1,
-    )
     with ds_sharded_fs.open("default/train/0003.parquet") as f:
         with patch("libcommon.parquet_utils.HTTPFile", return_value=f):
-            yield indexer.get_rows_index("ds_sharded", "default", "train")
-
-
-@pytest.fixture
-def indexer(
-    parquet_metadata_directory: StrPath,
-) -> Indexer:
-    return Indexer(
-        parquet_metadata_directory=parquet_metadata_directory,
-        httpfs=HTTPFileSystem(),
-        max_arrow_data_in_memory=9999999999,
-    )
+            yield RowsIndex(
+                dataset="ds_sharded",
+                config="default",
+                split="train",
+                parquet_metadata_directory=parquet_metadata_directory,
+                httpfs=HTTPFileSystem(),
+                max_arrow_data_in_memory=9999999999,
+            )
 
 
 def test_parquet_export_is_partial() -> None:
@@ -411,11 +379,22 @@ def test_parquet_export_is_partial() -> None:
 
 
 def test_indexer_get_rows_index_with_parquet_metadata(
-    indexer: Indexer, ds: Dataset, ds_fs: AbstractFileSystem, dataset_with_config_parquet_metadata: dict[str, Any]
+    ds: Dataset,
+    ds_fs: AbstractFileSystem,
+    parquet_metadata_directory: StrPath,
+    dataset_with_config_parquet_metadata: dict[str, Any],
 ) -> None:
     with ds_fs.open("default/train/0000.parquet") as f:
         with patch("libcommon.parquet_utils.HTTPFile", return_value=f):
-            index = indexer.get_rows_index("ds", "default", "train")
+            index = RowsIndex(
+                dataset="ds",
+                config="default",
+                split="train",
+                parquet_metadata_directory=parquet_metadata_directory,
+                httpfs=HTTPFileSystem(),
+                max_arrow_data_in_memory=9999999999,
+            )
+
     assert isinstance(index.parquet_index, ParquetIndexWithMetadata)
     assert index.parquet_index.features == ds.features
     assert index.parquet_index.num_rows == [len(ds)]
@@ -429,15 +408,23 @@ def test_indexer_get_rows_index_with_parquet_metadata(
 
 
 def test_indexer_get_rows_index_sharded_with_parquet_metadata(
-    indexer: Indexer,
     ds: Dataset,
     ds_sharded: Dataset,
     ds_sharded_fs: AbstractFileSystem,
+    parquet_metadata_directory: StrPath,
     dataset_sharded_with_config_parquet_metadata: dict[str, Any],
 ) -> None:
     with ds_sharded_fs.open("default/train/0003.parquet") as f:
         with patch("libcommon.parquet_utils.HTTPFile", return_value=f):
-            index = indexer.get_rows_index("ds_sharded", "default", "train")
+            index = RowsIndex(
+                dataset="ds_sharded",
+                config="default",
+                split="train",
+                parquet_metadata_directory=parquet_metadata_directory,
+                httpfs=HTTPFileSystem(),
+                max_arrow_data_in_memory=9999999999,
+            )
+
     assert isinstance(index.parquet_index, ParquetIndexWithMetadata)
     assert index.parquet_index.features == ds_sharded.features
     assert index.parquet_index.num_rows == [len(ds)] * 4
@@ -463,28 +450,67 @@ def test_rows_index_query_with_parquet_metadata(
         rows_index_with_parquet_metadata.query(offset=-1, length=2)
 
 
-def test_rows_index_query_with_too_big_rows(rows_index_with_too_big_rows: RowsIndex, ds_sharded: Dataset) -> None:
+def test_rows_index_query_with_too_big_rows(
+    parquet_metadata_directory: StrPath,
+    ds_sharded: Dataset,
+    ds_sharded_fs: AbstractFileSystem,
+    dataset_sharded_with_config_parquet_metadata: dict[str, Any],
+) -> None:
+    with ds_sharded_fs.open("default/train/0003.parquet") as f:
+        with patch("libcommon.parquet_utils.HTTPFile", return_value=f):
+            index = RowsIndex(
+                dataset="ds_sharded",
+                config="default",
+                split="train",
+                parquet_metadata_directory=parquet_metadata_directory,
+                httpfs=HTTPFileSystem(),
+                max_arrow_data_in_memory=1,
+            )
+
     with pytest.raises(TooBigRows):
-        rows_index_with_too_big_rows.query(offset=0, length=3)
+        index.query(offset=0, length=3)
 
 
-def test_rows_index_query_with_empty_dataset(rows_index_with_empty_dataset: RowsIndex, ds_sharded: Dataset) -> None:
-    assert isinstance(rows_index_with_empty_dataset.parquet_index, ParquetIndexWithMetadata)
-    assert rows_index_with_empty_dataset.query(offset=0, length=1).to_pydict() == ds_sharded[:0]
+def test_rows_index_query_with_empty_dataset(
+    ds_empty: Dataset,
+    ds_empty_fs: AbstractFileSystem,
+    dataset_empty_with_config_parquet_metadata: dict[str, Any],
+    parquet_metadata_directory: StrPath,
+) -> None:
+    with ds_empty_fs.open("default/train/0000.parquet") as f:
+        with patch("libcommon.parquet_utils.HTTPFile", return_value=f):
+            index = RowsIndex(
+                dataset="ds_empty",
+                config="default",
+                split="train",
+                parquet_metadata_directory=parquet_metadata_directory,
+                httpfs=HTTPFileSystem(),
+                max_arrow_data_in_memory=9999999999,
+            )
+
+    assert isinstance(index.parquet_index, ParquetIndexWithMetadata)
+    assert index.query(offset=0, length=1).to_pydict() == ds_empty[:0]
     with pytest.raises(IndexError):
-        rows_index_with_empty_dataset.query(offset=-1, length=2)
+        index.query(offset=-1, length=2)
 
 
 def test_indexer_schema_mistmatch_error(
-    indexer: Indexer,
     ds_sharded_fs: AbstractFileSystem,
     ds_sharded_fs_with_different_schema: AbstractFileSystem,
     dataset_sharded_with_config_parquet_metadata: dict[str, Any],
+    parquet_metadata_directory: StrPath,
 ) -> None:
     with ds_sharded_fs_with_different_schema.open("default/train/0000.parquet") as first_parquet:
         with ds_sharded_fs_with_different_schema.open("default/train/0001.parquet") as second_parquet:
             with patch("libcommon.parquet_utils.HTTPFile", side_effect=[first_parquet, second_parquet]):
-                index = indexer.get_rows_index("ds_sharded", "default", "train")
+                index = RowsIndex(
+                    dataset="ds_sharded",
+                    config="default",
+                    split="train",
+                    parquet_metadata_directory=parquet_metadata_directory,
+                    httpfs=HTTPFileSystem(),
+                    max_arrow_data_in_memory=9999999999,
+                )
                 with pytest.raises(SchemaMismatchError):
                     index.query(offset=0, length=3)

services/rows/src/rows/routes/rows.py

Lines changed: 19 additions & 12 deletions
@@ -2,6 +2,7 @@
 # Copyright 2022 The HuggingFace Authors.
 
 import logging
+from functools import lru_cache
 from http import HTTPStatus
 from typing import Optional
 
@@ -22,7 +23,7 @@
     try_backfill_dataset_then_raise,
 )
 from libcommon.constants import CONFIG_PARQUET_METADATA_KIND
-from libcommon.parquet_utils import Indexer, TooBigRows
+from libcommon.parquet_utils import RowsIndex, TooBigRows
 from libcommon.prometheus import StepProfiler
 from libcommon.simple_cache import CachedArtifactError, CachedArtifactNotFoundError
 from libcommon.storage import StrPath
@@ -48,14 +49,24 @@ def create_rows_endpoint(
     max_age_short: int = 0,
     storage_clients: Optional[list[StorageClient]] = None,
 ) -> Endpoint:
-    indexer = Indexer(
-        parquet_metadata_directory=parquet_metadata_directory,
-        httpfs=HTTPFileSystem(headers={"authorization": f"Bearer {hf_token}"}),
-        max_arrow_data_in_memory=max_arrow_data_in_memory,
-    )
+    httpfs = HTTPFileSystem(headers={"authorization": f"Bearer {hf_token}"})
+
+    @lru_cache(maxsize=1)
+    def get_rows_index(dataset: str, config: str, split: str) -> RowsIndex:
+        # cache the RowsIndex instance and therefore save one call to Mongo
+        # if multiple queries to the same dataset are done in a row (90% of
+        # requests in a short time window are to the same dataset)
+        return RowsIndex(
+            dataset=dataset,
+            config=config,
+            split=split,
+            httpfs=httpfs,
+            max_arrow_data_in_memory=max_arrow_data_in_memory,
+            parquet_metadata_directory=parquet_metadata_directory,
+        )
 
     async def rows_endpoint(request: Request) -> Response:
-        await indexer.httpfs.set_session()
+        await httpfs.set_session()
         revision: Optional[str] = None
         with StepProfiler(method="rows_endpoint", step="all"):
             try:
@@ -84,11 +95,7 @@ async def rows_endpoint(request: Request) -> Response:
                 )
                 try:
                     with StepProfiler(method="rows_endpoint", step="get row groups index"):
-                        rows_index = indexer.get_rows_index(
-                            dataset=dataset,
-                            config=config,
-                            split=split,
-                        )
+                        rows_index = get_rows_index(dataset=dataset, config=config, split=split)
                         revision = rows_index.revision
                     with StepProfiler(method="rows_endpoint", step="query the rows"):
                         try:
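The new endpoint-local helper closes over the endpoint configuration (`httpfs`, `max_arrow_data_in_memory`, `parquet_metadata_directory`), so the cache key is just the `(dataset, config, split)` triple. A rough sketch of the pattern, with a hypothetical `StubRowsIndex` and `create_rows_endpoint_like` standing in for the real `RowsIndex` and endpoint factory (the real constructor performs the Mongo lookup the comment above refers to):

from dataclasses import dataclass
from functools import lru_cache

@dataclass
class StubRowsIndex:
    # stand-in for libcommon.parquet_utils.RowsIndex; constructing the real
    # class resolves the dataset's parquet metadata (the "call to Mongo")
    dataset: str
    config: str
    split: str

def create_rows_endpoint_like(parquet_metadata_directory: str, max_arrow_data_in_memory: int):
    @lru_cache(maxsize=1)
    def get_rows_index(dataset: str, config: str, split: str) -> StubRowsIndex:
        # the configuration above is captured by the closure, so only the
        # dataset coordinates participate in the cache key
        return StubRowsIndex(dataset=dataset, config=config, split=split)

    return get_rows_index

get_rows_index = create_rows_endpoint_like("/parquet-metadata", 300_000_000)
first = get_rows_index("glue", "cola", "train")
second = get_rows_index("glue", "cola", "train")
assert first is second  # back-to-back requests to the same split reuse the cached index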

services/worker/src/worker/job_runners/split/first_rows.py

Lines changed: 18 additions & 12 deletions
@@ -18,7 +18,7 @@
     SplitParquetSchemaMismatchError,
     TooBigContentError,
 )
-from libcommon.parquet_utils import EmptyParquetMetadataError, Indexer, SchemaMismatchError, TooBigRows
+from libcommon.parquet_utils import EmptyParquetMetadataError, RowsIndex, SchemaMismatchError, TooBigRows
 from libcommon.simple_cache import CachedArtifactError, CachedArtifactNotFoundError
 from libcommon.storage import StrPath
 from libcommon.storage_client import StorageClient
@@ -41,7 +41,9 @@ def compute_first_rows_from_parquet_response(
     rows_max_number: int,
     rows_min_number: int,
     columns_max_number: int,
-    indexer: Indexer,
+    httpfs: HTTPFileSystem,
+    max_arrow_data_in_memory: int,
+    parquet_metadata_directory: StrPath,
 ) -> SplitFirstRowsResponse:
     """
     Compute the response of 'split-first-rows' for one specific split of a dataset from the parquet files.
@@ -67,8 +69,12 @@ def compute_first_rows_from_parquet_response(
             The minimum number of rows of the response.
         columns_max_number (`int`):
             The maximum number of columns supported.
-        indexer (`Indexer`):
-            An indexer to get the rows index.
+        httpfs (`HTTPFileSystem`):
+            An HTTP filesystem to access the parquet files.
+        parquet_metadata_directory (`StrPath`):
+            The local directory where the parquet metadata are stored.
+        max_arrow_data_in_memory (`int`):
+            The maximum size in bytes of Arrow data loaded in memory.
 
     Raises:
         [~`libcommon.exceptions.ParquetResponseEmptyError`]:
@@ -85,10 +91,13 @@ def compute_first_rows_from_parquet_response(
     logging.info(f"compute 'split-first-rows' from parquet for {dataset=} {config=} {split=}")
 
     try:
-        rows_index = indexer.get_rows_index(
+        rows_index = RowsIndex(
             dataset=dataset,
             config=config,
             split=split,
+            httpfs=httpfs,
+            max_arrow_data_in_memory=max_arrow_data_in_memory,
+            parquet_metadata_directory=parquet_metadata_directory,
         )
     except EmptyParquetMetadataError:
         raise ParquetResponseEmptyError("No parquet files found.")
@@ -272,7 +281,6 @@ def get_rows_content(rows_max_number: int) -> RowsContent:
 
 class SplitFirstRowsJobRunner(SplitJobRunnerWithDatasetsCache):
     first_rows_config: FirstRowsConfig
-    indexer: Indexer
 
     @staticmethod
     def get_job_type() -> str:
@@ -293,11 +301,7 @@ def __init__(
         )
         self.first_rows_config = app_config.first_rows
         self.parquet_metadata_directory = parquet_metadata_directory
-        self.indexer = Indexer(
-            parquet_metadata_directory=parquet_metadata_directory,
-            httpfs=HTTPFileSystem(headers={"authorization": f"Bearer {self.app_config.common.hf_token}"}),
-            max_arrow_data_in_memory=app_config.rows_index.max_arrow_data_in_memory,
-        )
+        self.httpfs = HTTPFileSystem(headers={"authorization": f"Bearer {self.app_config.common.hf_token}"})
         self.storage_client = storage_client
 
     def compute(self) -> CompleteJobResult:
@@ -314,7 +318,9 @@ def compute(self) -> CompleteJobResult:
                     rows_min_number=self.first_rows_config.min_number,
                     rows_max_number=MAX_NUM_ROWS_PER_PAGE,
                     columns_max_number=self.first_rows_config.columns_max_number,
-                    indexer=self.indexer,
+                    httpfs=self.httpfs,
+                    max_arrow_data_in_memory=self.app_config.rows_index.max_arrow_data_in_memory,
+                    parquet_metadata_directory=self.parquet_metadata_directory,
                 )
             )
         except (
