55from dataclasses import dataclass , field
66from functools import lru_cache
77from pathlib import Path
8- from typing import Literal , Optional , TypedDict , Union
8+ from typing import Literal , Optional , TypedDict , Union , Sequence
99
1010import numpy as np
1111import pyarrow as pa
2424from libcommon .simple_cache import get_previous_step_or_raise
2525from libcommon .storage import StrPath
2626from libcommon .viewer_utils .features import get_supported_unsupported_columns
27+ from libviewer import Dataset as LibviewerDataset # type: ignore [import-untyped]
2728
2829# For partial Parquet export we have paths like "en/partial-train/0000.parquet".
2930# "-" is not allowed in split names so we use it in the prefix to avoid collisions.
@@ -460,7 +461,7 @@ def from_parquet_metadata_items(
460461 httpfs : HTTPFileSystem ,
461462 hf_token : Optional [str ],
462463 max_arrow_data_in_memory : int ,
463- unsupported_features : list [FeatureType ] = [] ,
464+ unsupported_features : Sequence [FeatureType ] = () ,
464465 ) -> "ParquetIndexWithMetadata" :
465466 if not parquet_files :
466467 raise EmptyParquetMetadataError ("No parquet files found." )
@@ -515,8 +516,8 @@ def __init__(
515516 hf_token : Optional [str ],
516517 parquet_metadata_directory : StrPath ,
517518 max_arrow_data_in_memory : int ,
518- unsupported_features : list [FeatureType ] = [] ,
519- data_store = "hf://" ,
519+ unsupported_features : Sequence [FeatureType ] = () ,
520+ data_store : str = "hf://" ,
520521 ):
521522 self .dataset = dataset
522523 self .config = config
@@ -532,7 +533,7 @@ def __init__(
532533 )
533534 self ._init_viewer_index (data_store = data_store , metadata_store = f"file://{ parquet_metadata_directory } " )
534535
535- def _init_dataset_info (self , parquet_metadata_directory : StrPath ):
536+ def _init_dataset_info (self , parquet_metadata_directory : StrPath ) -> None :
536537 # get the list of parquet files and features
537538 with StepProfiler (method = "rows_index._get_dataset_metadata" , step = "all" ):
538539 response = get_previous_step_or_raise (
@@ -570,7 +571,7 @@ def _init_parquet_index(
570571 hf_token : Optional [str ],
571572 parquet_metadata_directory : StrPath ,
572573 max_arrow_data_in_memory : int ,
573- unsupported_features : list [FeatureType ] = [ ],
574+ unsupported_features : Sequence [FeatureType ],
574575 ) -> None :
575576 logging .info (
576577 f"Create ParquetIndexWithMetadata for dataset={ self .dataset } , config={ self .config } , split={ self .split } "
@@ -587,12 +588,6 @@ def _init_parquet_index(
587588
588589 def _init_viewer_index (self , data_store : str , metadata_store : str ) -> None :
589590 logging .info (f"Create libviewer.Dataset for dataset={ self .dataset } , config={ self .config } , split={ self .split } " )
590- try :
591- from libviewer import Dataset
592- except ImportError as err :
593- raise ImportError (
594- "libviewer is not installed. Please install it with `pip install libviewer` to use page pruning."
595- ) from err
596591
597592 # construct the required parquet_files list for libviewer.Dataset
598593 files = []
@@ -606,7 +601,7 @@ def _init_viewer_index(self, data_store: str, metadata_store: str) -> None:
606601 }
607602 )
608603
609- self .viewer_index = Dataset (
604+ self .viewer_index = LibviewerDataset (
610605 name = self .dataset ,
611606 files = files ,
612607 revision = self .revision ,
@@ -689,7 +684,7 @@ def __init__(
689684 self .all_columns_supported_datasets_allow_list = all_columns_supported_datasets_allow_list
690685
691686 @lru_cache (maxsize = 1 )
692- def get_rows_index (self , dataset : str , config : str , split : str , data_store = "hf://" ) -> RowsIndex :
687+ def get_rows_index (self , dataset : str , config : str , split : str , data_store : str = "hf://" ) -> RowsIndex :
693688 filter_features = (
694689 self .all_columns_supported_datasets_allow_list != "all"
695690 and dataset not in self .all_columns_supported_datasets_allow_list
0 commit comments