55from dataclasses import dataclass , field
66from functools import lru_cache
77from pathlib import Path
8- from typing import Literal , Optional , TypedDict , Union
8+ from typing import Optional , TypedDict
99
1010import numpy as np
1111import pyarrow as pa
1212import pyarrow .compute as pc
1313import pyarrow .parquet as pq
1414from datasets import Features , Value
15- from datasets .features .features import FeatureType
1615from datasets .table import cast_table_to_schema
1716from datasets .utils .py_utils import size_str
1817from fsspec .implementations .http import HTTPFile , HTTPFileSystem
@@ -458,7 +457,6 @@ def from_parquet_metadata_items(
458457 parquet_metadata_directory : StrPath ,
459458 httpfs : HTTPFileSystem ,
460459 max_arrow_data_in_memory : int ,
461- unsupported_features : list [FeatureType ] = [],
462460 ) -> "ParquetIndexWithMetadata" :
463461 if not parquet_file_metadata_items :
464462 raise EmptyParquetMetadataError ("No parquet files found." )
@@ -488,9 +486,10 @@ def from_parquet_metadata_items(
488486 ):
489487 if features is None : # config-parquet version<6 didn't have features
490488 features = Features .from_arrow_schema (pq .read_schema (metadata_paths [0 ]))
489+ # TODO(kszucs): since unsupported_features is always an empty list, we may omit the call below
491490 supported_columns , unsupported_columns = get_supported_unsupported_columns (
492491 features ,
493- unsupported_features = unsupported_features ,
492+ unsupported_features = [] ,
494493 )
495494 return ParquetIndexWithMetadata (
496495 features = features ,
@@ -515,7 +514,6 @@ def __init__(
515514 httpfs : HfFileSystem ,
516515 parquet_metadata_directory : StrPath ,
517516 max_arrow_data_in_memory : int ,
518- unsupported_features : list [FeatureType ] = [],
519517 ):
520518 self .dataset = dataset
521519 self .config = config
@@ -524,14 +522,12 @@ def __init__(
524522 self .parquet_index = self ._init_parquet_index (
525523 parquet_metadata_directory = parquet_metadata_directory ,
526524 max_arrow_data_in_memory = max_arrow_data_in_memory ,
527- unsupported_features = unsupported_features ,
528525 )
529526
530527 def _init_parquet_index (
531528 self ,
532529 parquet_metadata_directory : StrPath ,
533530 max_arrow_data_in_memory : int ,
534- unsupported_features : list [FeatureType ] = [],
535531 ) -> ParquetIndexWithMetadata :
536532 with StepProfiler (method = "rows_index._init_parquet_index" , step = "all" ):
537533 # get the list of parquet files
@@ -561,7 +557,6 @@ def _init_parquet_index(
561557 parquet_metadata_directory = parquet_metadata_directory ,
562558 httpfs = self .httpfs ,
563559 max_arrow_data_in_memory = max_arrow_data_in_memory ,
564- unsupported_features = unsupported_features ,
565560 )
566561
567562 # note that this cache size is global for the class, not per instance
@@ -614,14 +609,10 @@ def __init__(
614609 parquet_metadata_directory : StrPath ,
615610 httpfs : HTTPFileSystem ,
616611 max_arrow_data_in_memory : int ,
617- unsupported_features : list [FeatureType ] = [],
618- all_columns_supported_datasets_allow_list : Union [Literal ["all" ], list [str ]] = "all" ,
619612 ):
620613 self .parquet_metadata_directory = parquet_metadata_directory
621614 self .httpfs = httpfs
622615 self .max_arrow_data_in_memory = max_arrow_data_in_memory
623- self .unsupported_features = unsupported_features
624- self .all_columns_supported_datasets_allow_list = all_columns_supported_datasets_allow_list
625616
626617 @lru_cache (maxsize = 1 )
627618 def get_rows_index (
@@ -630,17 +621,11 @@ def get_rows_index(
630621 config : str ,
631622 split : str ,
632623 ) -> RowsIndex :
633- filter_features = (
634- self .all_columns_supported_datasets_allow_list != "all"
635- and dataset not in self .all_columns_supported_datasets_allow_list
636- )
637- unsupported_features = self .unsupported_features if filter_features else []
638624 return RowsIndex (
639625 dataset = dataset ,
640626 config = config ,
641627 split = split ,
642628 httpfs = self .httpfs ,
643629 parquet_metadata_directory = self .parquet_metadata_directory ,
644630 max_arrow_data_in_memory = self .max_arrow_data_in_memory ,
645- unsupported_features = unsupported_features ,
646631 )
0 commit comments