55from dataclasses import dataclass , field
66from functools import lru_cache
77from pathlib import Path
8- from typing import Literal , Optional , TypedDict , Union
8+ from typing import Literal , Optional , TypedDict , Union , Sequence
99
1010import numpy as np
1111import pyarrow as pa
2424from libcommon .simple_cache import get_previous_step_or_raise
2525from libcommon .storage import StrPath
2626from libcommon .viewer_utils .features import get_supported_unsupported_columns
27+ from libviewer import Dataset as LibviewerDataset # type: ignore [import-untyped]
2728
2829# For partial Parquet export we have paths like "en/partial-train/0000.parquet".
2930# "-" is not allowed in split names so we use it in the prefix to avoid collisions.
@@ -460,7 +461,7 @@ def from_parquet_metadata_items(
460461 httpfs : HTTPFileSystem ,
461462 hf_token : Optional [str ],
462463 max_arrow_data_in_memory : int ,
463- unsupported_features : list [FeatureType ] = [] ,
464+ unsupported_features : Sequence [FeatureType ] = () ,
464465 ) -> "ParquetIndexWithMetadata" :
465466 if not parquet_files :
466467 raise EmptyParquetMetadataError ("No parquet files found." )
@@ -515,8 +516,8 @@ def __init__(
515516 hf_token : Optional [str ],
516517 parquet_metadata_directory : StrPath ,
517518 max_arrow_data_in_memory : int ,
518- unsupported_features : list [FeatureType ] = [] ,
519- data_store = "hf://" ,
519+ unsupported_features : Sequence [FeatureType ] = () ,
520+ data_store : str = "hf://" ,
520521 ):
521522 self .dataset = dataset
522523 self .config = config
@@ -532,7 +533,7 @@ def __init__(
532533 )
533534 self ._init_viewer_index (data_store = data_store , metadata_store = f"file://{ parquet_metadata_directory } " )
534535
535- def _init_dataset_info (self , parquet_metadata_directory : StrPath ):
536+ def _init_dataset_info (self , parquet_metadata_directory : StrPath ) -> None :
536537 # get the list of parquet files and features
537538 with StepProfiler (method = "rows_index._get_dataset_metadata" , step = "all" ):
538539 response = get_previous_step_or_raise (
@@ -570,7 +571,7 @@ def _init_parquet_index(
570571 hf_token : Optional [str ],
571572 parquet_metadata_directory : StrPath ,
572573 max_arrow_data_in_memory : int ,
573- unsupported_features : list [FeatureType ] = [ ],
574+ unsupported_features : Sequence [FeatureType ],
574575 ) -> None :
575576 logging .info (
576577 f"Create ParquetIndexWithMetadata for dataset={ self .dataset } , config={ self .config } , split={ self .split } "
@@ -587,12 +588,6 @@ def _init_parquet_index(
587588
588589 def _init_viewer_index (self , data_store : str , metadata_store : str ) -> None :
589590 logging .info (f"Create libviewer.Dataset for dataset={ self .dataset } , config={ self .config } , split={ self .split } " )
590- try :
591- from libviewer import Dataset
592- except ImportError as err :
593- raise ImportError (
594- "libviewer is not installed. Please install it with `pip install libviewer` to use page pruning."
595- ) from err
596591
597592 # construct the required parquet_files list for libviewer.Dataset
598593 files = []
@@ -606,7 +601,7 @@ def _init_viewer_index(self, data_store: str, metadata_store: str) -> None:
606601 }
607602 )
608603
609- self .viewer_index = Dataset (
604+ self .viewer_index = LibviewerDataset (
610605 name = self .dataset ,
611606 files = files ,
612607 revision = self .revision ,
@@ -689,7 +684,7 @@ def __init__(
689684 self .all_columns_supported_datasets_allow_list = all_columns_supported_datasets_allow_list
690685
691686 @lru_cache (maxsize = 1 )
692- def get_rows_index (self , dataset : str , config : str , split : str , data_store = "hf://" ) -> RowsIndex :
687+ def get_rows_index (self , dataset : str , config : str , split : str , data_store : str = "hf://" ) -> RowsIndex :
693688 filter_features = (
694689 self .all_columns_supported_datasets_allow_list != "all"
695690 and dataset not in self .all_columns_supported_datasets_allow_list
0 commit comments