Commit 689a190

chore: fix mypy errors
1 parent 2dbaf35 commit 689a190

3 files changed: +13 -22 lines changed

libs/libcommon/src/libcommon/parquet_utils.py

Lines changed: 9 additions & 14 deletions
@@ -5,7 +5,7 @@
 from dataclasses import dataclass, field
 from functools import lru_cache
 from pathlib import Path
-from typing import Literal, Optional, TypedDict, Union
+from typing import Literal, Optional, TypedDict, Union, Sequence

 import numpy as np
 import pyarrow as pa
@@ -24,6 +24,7 @@
 from libcommon.simple_cache import get_previous_step_or_raise
 from libcommon.storage import StrPath
 from libcommon.viewer_utils.features import get_supported_unsupported_columns
+from libviewer import Dataset as LibviewerDataset  # type: ignore [import-untyped]

 # For partial Parquet export we have paths like "en/partial-train/0000.parquet".
 # "-" is not allowed is split names so we use it in the prefix to avoid collisions.
@@ -460,7 +461,7 @@ def from_parquet_metadata_items(
         httpfs: HTTPFileSystem,
         hf_token: Optional[str],
         max_arrow_data_in_memory: int,
-        unsupported_features: list[FeatureType] = [],
+        unsupported_features: Sequence[FeatureType] = (),
     ) -> "ParquetIndexWithMetadata":
         if not parquet_files:
             raise EmptyParquetMetadataError("No parquet files found.")
@@ -515,8 +516,8 @@ def __init__(
         hf_token: Optional[str],
         parquet_metadata_directory: StrPath,
         max_arrow_data_in_memory: int,
-        unsupported_features: list[FeatureType] = [],
-        data_store="hf://",
+        unsupported_features: Sequence[FeatureType] = (),
+        data_store: str = "hf://",
     ):
         self.dataset = dataset
         self.config = config
@@ -532,7 +533,7 @@ def __init__(
         )
         self._init_viewer_index(data_store=data_store, metadata_store=f"file://{parquet_metadata_directory}")

-    def _init_dataset_info(self, parquet_metadata_directory: StrPath):
+    def _init_dataset_info(self, parquet_metadata_directory: StrPath) -> None:
         # get the list of parquet files and features
         with StepProfiler(method="rows_index._get_dataset_metadata", step="all"):
             response = get_previous_step_or_raise(
@@ -570,7 +571,7 @@ def _init_parquet_index(
         hf_token: Optional[str],
         parquet_metadata_directory: StrPath,
         max_arrow_data_in_memory: int,
-        unsupported_features: list[FeatureType] = [],
+        unsupported_features: Sequence[FeatureType],
     ) -> None:
         logging.info(
             f"Create ParquetIndexWithMetadata for dataset={self.dataset}, config={self.config}, split={self.split}"
@@ -587,12 +588,6 @@ def _init_parquet_index(

     def _init_viewer_index(self, data_store: str, metadata_store: str) -> None:
         logging.info(f"Create libviewer.Dataset for dataset={self.dataset}, config={self.config}, split={self.split}")
-        try:
-            from libviewer import Dataset
-        except ImportError as err:
-            raise ImportError(
-                "libviewer is not installed. Please install it with `pip install libviewer` to use page pruning."
-            ) from err

         # construct the required parquet_files list for libviewer.Dataset
         files = []
@@ -606,7 +601,7 @@ def _init_viewer_index(self, data_store: str, metadata_store: str) -> None:
             }
         )

-        self.viewer_index = Dataset(
+        self.viewer_index = LibviewerDataset(
             name=self.dataset,
             files=files,
             revision=self.revision,
@@ -689,7 +684,7 @@ def __init__(
         self.all_columns_supported_datasets_allow_list = all_columns_supported_datasets_allow_list

     @lru_cache(maxsize=1)
-    def get_rows_index(self, dataset: str, config: str, split: str, data_store="hf://") -> RowsIndex:
+    def get_rows_index(self, dataset: str, config: str, split: str, data_store: str = "hf://") -> RowsIndex:
         filter_features = (
             self.all_columns_supported_datasets_allow_list != "all"
             and dataset not in self.all_columns_supported_datasets_allow_list
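The change repeated throughout this file is list[FeatureType] = [] becoming Sequence[FeatureType] = (). A plausible reading of the mypy motivation: list is invariant, so a caller holding a list of a narrower element type cannot pass it where list[FeatureType] is expected, while the read-only Sequence is covariant and accepts it. Switching the default from [] to the immutable () also avoids sharing one mutable list across calls. A minimal sketch of the variance difference, using hypothetical Value/ClassLabel stand-ins rather than the real datasets feature types:

from typing import Sequence, Union

class Value: ...
class ClassLabel: ...

# Hypothetical stand-in for the FeatureType union from datasets.
FeatureType = Union[Value, ClassLabel]

def with_list(unsupported_features: list[FeatureType] = []) -> None: ...

def with_sequence(unsupported_features: Sequence[FeatureType] = ()) -> None: ...

labels: list[ClassLabel] = [ClassLabel()]

# mypy rejects this call: list is invariant, so a list[ClassLabel]
# is not a list[Union[Value, ClassLabel]].
with_list(labels)

# mypy accepts this call: Sequence is covariant in its element type.
with_sequence(labels)

The added data_store: str and -> None annotations likely address strict-mode checks such as disallow-untyped-defs / disallow-incomplete-defs, which flag functions with missing parameter or return annotations.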

libs/libcommon/src/libcommon/viewer_utils/features.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 import logging
 import os
 from io import BytesIO
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, Sequence
 from zlib import adler32

 import datasets.config
@@ -533,7 +533,7 @@ def to_features_list(features: Features) -> list[FeatureItem]:

 def get_supported_unsupported_columns(
     features: Features,
-    unsupported_features: list[FeatureType] = [],
+    unsupported_features: Sequence[FeatureType],
 ) -> tuple[list[str], list[str]]:
     supported_columns, unsupported_columns = [], []
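Here the default is dropped entirely, so unsupported_features becomes a required argument of get_supported_unsupported_columns and every caller must pass it explicitly. Independent of the typing question, a mutable default like [] is a classic Python pitfall in its own right: the list is created once at definition time and shared across calls. A minimal demonstration with hypothetical names:

def collect(column: str, seen: list[str] = []) -> list[str]:
    # The default list is created once, when the def statement runs.
    seen.append(column)
    return seen

print(collect("audio"))  # ['audio']
print(collect("image"))  # ['audio', 'image'] -- the same list object again

An immutable default such as () (or None plus an explicit check) sidesteps this, which is presumably why the other signatures in this commit settle on Sequence[FeatureType] = ().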

libs/libcommon/tests/test_parquet_utils.py

Lines changed: 2 additions & 6 deletions
@@ -29,6 +29,7 @@
 from libcommon.resources import CacheMongoResource
 from libcommon.simple_cache import upsert_response
 from libcommon.storage import StrPath
+from libviewer import Dataset as LibviewerDataset  # type: ignore [import-untyped]

 REVISION_NAME = "revision"
 CACHED_ASSETS_FOLDER = "cached-assets"
@@ -467,12 +468,7 @@ def test_rows_index_query_with_parquet_metadata(


 def test_rows_index_query_with_page_pruning(rows_index_with_parquet_metadata: RowsIndex, ds_sharded: Dataset) -> None:
-    try:
-        import libviewer
-    except ImportError:
-        pytest.skip("libviewer is not installed")
-
-    assert isinstance(rows_index_with_parquet_metadata.viewer_index, libviewer.Dataset)
+    assert isinstance(rows_index_with_parquet_metadata.viewer_index, LibviewerDataset)

     result = rows_index_with_parquet_metadata.query_with_page_pruning(offset=1, length=3)
     assert result.to_pydict() == ds_sharded[1:4]
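The try/except import guard and its pytest.skip are gone: libviewer is now imported unconditionally at module level, with the ignore scoped to mypy's import-untyped error code, the one raised when a package is installed but ships neither stubs nor a py.typed marker. A sketch of the behavior with a hypothetical somepkg:

# Without the comment, mypy reports roughly:
#   error: Skipping analyzing "somepkg": module is installed, but missing
#   library stubs or py.typed marker  [import-untyped]
import somepkg  # type: ignore[import-untyped]

# A bare "# type: ignore" would also silence the error, but it suppresses
# every error on the line; the bracketed code narrows it to just this one.

One consequence worth noting: the test now fails instead of skipping when libviewer is absent, i.e. the commit treats libviewer as a hard dependency of libcommon.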
