Skip to content

Commit c7d0081

Browse files
authored
refactor(libcommon): remove the effectively unused arguments of Indexer (#3250)
* refactor(libcommon): remove the effectively unused arguments of `Indexer` * style: remove unnecessary imports * refactor(libcommon): remove `unsupported_features` argument from `RowsIndex` * style: remove unnecessary imports
1 parent ca647b7 commit c7d0081

File tree

3 files changed

+4
-27
lines changed

3 files changed

+4
-27
lines changed

libs/libcommon/src/libcommon/parquet_utils.py

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,13 @@
55
from dataclasses import dataclass, field
66
from functools import lru_cache
77
from pathlib import Path
8-
from typing import Literal, Optional, TypedDict, Union
8+
from typing import Optional, TypedDict
99

1010
import numpy as np
1111
import pyarrow as pa
1212
import pyarrow.compute as pc
1313
import pyarrow.parquet as pq
1414
from datasets import Features, Value
15-
from datasets.features.features import FeatureType
1615
from datasets.table import cast_table_to_schema
1716
from datasets.utils.py_utils import size_str
1817
from fsspec.implementations.http import HTTPFile, HTTPFileSystem
@@ -458,7 +457,6 @@ def from_parquet_metadata_items(
458457
parquet_metadata_directory: StrPath,
459458
httpfs: HTTPFileSystem,
460459
max_arrow_data_in_memory: int,
461-
unsupported_features: list[FeatureType] = [],
462460
) -> "ParquetIndexWithMetadata":
463461
if not parquet_file_metadata_items:
464462
raise EmptyParquetMetadataError("No parquet files found.")
@@ -488,9 +486,10 @@ def from_parquet_metadata_items(
488486
):
489487
if features is None: # config-parquet version<6 didn't have features
490488
features = Features.from_arrow_schema(pq.read_schema(metadata_paths[0]))
489+
# TODO(kszucs): since unsupported_features is always empty list we may omit the call below
491490
supported_columns, unsupported_columns = get_supported_unsupported_columns(
492491
features,
493-
unsupported_features=unsupported_features,
492+
unsupported_features=[],
494493
)
495494
return ParquetIndexWithMetadata(
496495
features=features,
@@ -515,7 +514,6 @@ def __init__(
515514
httpfs: HfFileSystem,
516515
parquet_metadata_directory: StrPath,
517516
max_arrow_data_in_memory: int,
518-
unsupported_features: list[FeatureType] = [],
519517
):
520518
self.dataset = dataset
521519
self.config = config
@@ -524,14 +522,12 @@ def __init__(
524522
self.parquet_index = self._init_parquet_index(
525523
parquet_metadata_directory=parquet_metadata_directory,
526524
max_arrow_data_in_memory=max_arrow_data_in_memory,
527-
unsupported_features=unsupported_features,
528525
)
529526

530527
def _init_parquet_index(
531528
self,
532529
parquet_metadata_directory: StrPath,
533530
max_arrow_data_in_memory: int,
534-
unsupported_features: list[FeatureType] = [],
535531
) -> ParquetIndexWithMetadata:
536532
with StepProfiler(method="rows_index._init_parquet_index", step="all"):
537533
# get the list of parquet files
@@ -561,7 +557,6 @@ def _init_parquet_index(
561557
parquet_metadata_directory=parquet_metadata_directory,
562558
httpfs=self.httpfs,
563559
max_arrow_data_in_memory=max_arrow_data_in_memory,
564-
unsupported_features=unsupported_features,
565560
)
566561

567562
# note that this cache size is global for the class, not per instance
@@ -614,14 +609,10 @@ def __init__(
614609
parquet_metadata_directory: StrPath,
615610
httpfs: HTTPFileSystem,
616611
max_arrow_data_in_memory: int,
617-
unsupported_features: list[FeatureType] = [],
618-
all_columns_supported_datasets_allow_list: Union[Literal["all"], list[str]] = "all",
619612
):
620613
self.parquet_metadata_directory = parquet_metadata_directory
621614
self.httpfs = httpfs
622615
self.max_arrow_data_in_memory = max_arrow_data_in_memory
623-
self.unsupported_features = unsupported_features
624-
self.all_columns_supported_datasets_allow_list = all_columns_supported_datasets_allow_list
625616

626617
@lru_cache(maxsize=1)
627618
def get_rows_index(
@@ -630,17 +621,11 @@ def get_rows_index(
630621
config: str,
631622
split: str,
632623
) -> RowsIndex:
633-
filter_features = (
634-
self.all_columns_supported_datasets_allow_list != "all"
635-
and dataset not in self.all_columns_supported_datasets_allow_list
636-
)
637-
unsupported_features = self.unsupported_features if filter_features else []
638624
return RowsIndex(
639625
dataset=dataset,
640626
config=config,
641627
split=split,
642628
httpfs=self.httpfs,
643629
parquet_metadata_directory=self.parquet_metadata_directory,
644630
max_arrow_data_in_memory=self.max_arrow_data_in_memory,
645-
unsupported_features=unsupported_features,
646631
)

services/rows/src/rows/routes/rows.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import logging
55
from http import HTTPStatus
6-
from typing import Literal, Optional, Union
6+
from typing import Optional
77

88
from fsspec.implementations.http import HTTPFileSystem
99
from libapi.authentication import auth_check
@@ -33,11 +33,6 @@
3333
logger = logging.getLogger(__name__)
3434

3535

36-
ALL_COLUMNS_SUPPORTED_DATASETS_ALLOW_LIST: Union[Literal["all"], list[str]] = [
37-
"halabi2016/arabic_speech_corpus"
38-
] # for testing
39-
40-
4136
def create_rows_endpoint(
4237
cached_assets_storage_client: StorageClient,
4338
parquet_metadata_directory: StrPath,
@@ -57,7 +52,6 @@ def create_rows_endpoint(
5752
parquet_metadata_directory=parquet_metadata_directory,
5853
httpfs=HTTPFileSystem(headers={"authorization": f"Bearer {hf_token}"}),
5954
max_arrow_data_in_memory=max_arrow_data_in_memory,
60-
all_columns_supported_datasets_allow_list=ALL_COLUMNS_SUPPORTED_DATASETS_ALLOW_LIST,
6155
)
6256

6357
async def rows_endpoint(request: Request) -> Response:

services/worker/src/worker/job_runners/split/first_rows.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -296,8 +296,6 @@ def __init__(
296296
self.indexer = Indexer(
297297
parquet_metadata_directory=parquet_metadata_directory,
298298
httpfs=HTTPFileSystem(headers={"authorization": f"Bearer {self.app_config.common.hf_token}"}),
299-
unsupported_features=[],
300-
all_columns_supported_datasets_allow_list="all",
301299
max_arrow_data_in_memory=app_config.rows_index.max_arrow_data_in_memory,
302300
)
303301
self.storage_client = storage_client

0 commit comments

Comments
 (0)