
Commit f0ba03f

refactor(libcommon): remove now obsolete get_supported_unsupported_columns() function (#3252)
* refactor(libcommon): remove the effectively unused arguments of `Indexer`
* style: remove unnecessary imports
* refactor(libcommon): remove `Indexer`
* test(libcommon): fix `test_rows_index_query_with_empty_dataset` to use `ds_empty`
* fix(libcommon): cache the latest instance of `RowsIndex`
* test(libcommon): add a test for caching the latest RowsIndex instance
* fix(libcommon): only cache RowsIndex when serving from the rows endpoint
* test(libcommon): remove previously added test case for caching RowsIndex instances
* refactor(libcommon): remove now obsolete `get_supported_unsupported_columns()` function
* style(libcommon): remove unnecessary imports
1 parent 51ff463 commit f0ba03f
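The crux of the removal: `get_supported_unsupported_columns()` was only ever called with an empty `unsupported_features` list (the TODO removed from parquet_utils.py below notes this), so it always classified every column as supported, and each call site collapses to listing the feature names. A minimal standalone Python sketch of the equivalence — illustrative only, not code from this diff:

from datasets import Features, Value

# Illustration only: with no unsupported feature types configured, the old
# helper returned (all column names, []), so callers can simply list the keys.
features = Features({"id": Value("int64"), "text": Value("string")})

# Before (simplified): columns, _ = get_supported_unsupported_columns(features, unsupported_features=[])
# After, as done in parquet_utils.py, filter.py and search.py:
columns = list(features.keys())
assert columns == ["id", "text"]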

File tree

11 files changed, +12 -106 lines


libs/libapi/src/libapi/response.py

Lines changed: 0 additions & 6 deletions
@@ -20,16 +20,11 @@ async def create_response(
     pa_table: pa.Table,
     offset: int,
     features: Features,
-    unsupported_columns: list[str],
     num_rows_total: int,
     partial: bool,
     use_row_idx_column: bool = False,
     truncated_columns: Optional[list[str]] = None,
 ) -> PaginatedResponse:
-    if set(pa_table.column_names).intersection(set(unsupported_columns)):
-        raise RuntimeError(
-            "The pyarrow table contains unsupported columns. They should have been ignored in the row group reader."
-        )
     logging.debug(f"create response for {dataset=} {config=} {split=}")
     return {
         "features": [
@@ -46,7 +41,6 @@ async def create_response(
             storage_client=storage_client,
             offset=offset,
             features=features,
-            unsupported_columns=unsupported_columns,
             row_idx_column=ROW_IDX_COLUMN if use_row_idx_column else None,
             truncated_columns=truncated_columns,
         ),

libs/libapi/src/libapi/utils.py

Lines changed: 0 additions & 5 deletions
@@ -205,15 +205,10 @@ async def to_rows_list(
     split: str,
     offset: int,
     features: Features,
-    unsupported_columns: list[str],
     storage_client: StorageClient,
     row_idx_column: Optional[str] = None,
     truncated_columns: Optional[list[str]] = None,
 ) -> list[RowItem]:
-    num_rows = pa_table.num_rows
-    for idx, (column, feature) in enumerate(features.items()):
-        if column in unsupported_columns:
-            pa_table = pa_table.add_column(idx, column, pa.array([None] * num_rows))
     # transform the rows, if needed (e.g. save the images or audio to the assets, and return their URL)
     try:
         transformed_rows = await transform_rows(
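For context, the branch removed above back-filled each unsupported column with an all-null column before transforming the rows; since no column is ever unsupported now, the loop was dead code. A standalone pyarrow sketch of the removed pattern (the column name here is hypothetical):

import pyarrow as pa

# Insert an all-null placeholder column at a given position, as the removed
# loop did for each unsupported column. Sketch only, not repository code.
table = pa.table({"text": ["a", "b", "c"]})
table = table.add_column(0, "audio", pa.array([None] * table.num_rows))
print(table.column_names)  # ['audio', 'text']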

libs/libapi/tests/test_response.py

Lines changed: 0 additions & 4 deletions
@@ -42,7 +42,6 @@ async def test_create_response(storage_client: StorageClient) -> None:
         pa_table=ds.data,
         offset=0,
         features=ds.features,
-        unsupported_columns=[],
         num_rows_total=10,
         partial=False,
     )
@@ -67,7 +66,6 @@ async def test_create_response_with_row_idx_column(storage_client: StorageClient
         pa_table=ds.data,
         offset=0,
         features=ds.features,
-        unsupported_columns=[],
         num_rows_total=10,
         partial=False,
         use_row_idx_column=True,
@@ -96,7 +94,6 @@ async def test_create_response_with_image(image_path: str, storage_client: Stora
         pa_table=ds_image.data,
         offset=0,
         features=ds_image.features,
-        unsupported_columns=[],
         num_rows_total=10,
         partial=False,
     )
@@ -137,7 +134,6 @@ async def test_create_response_with_document(document_path: str, storage_client:
         pa_table=ds_document.data,
         offset=0,
         features=ds_document.features,
-        unsupported_columns=[],
         num_rows_total=10,
         partial=False,
     )

libs/libcommon/src/libcommon/parquet_utils.py

Lines changed: 5 additions & 15 deletions
@@ -22,7 +22,6 @@
 from libcommon.prometheus import StepProfiler
 from libcommon.simple_cache import get_previous_step_or_raise
 from libcommon.storage import StrPath
-from libcommon.viewer_utils.features import get_supported_unsupported_columns
 
 # For partial Parquet export we have paths like "en/partial-train/0000.parquet".
 # "-" is not allowed is split names so we use it in the prefix to avoid collisions.
@@ -181,8 +180,6 @@ def read_size(self, columns: Optional[Iterable[str]] = None) -> int:
 @dataclass
 class ParquetIndexWithMetadata:
     features: Features
-    supported_columns: list[str]
-    unsupported_columns: list[str]
     parquet_files_urls: list[str]
     metadata_paths: list[str]
     num_bytes: list[int]
@@ -329,10 +326,11 @@ def query_truncated_binary(self, offset: int, length: int) -> tuple[pa.Table, li
            )  # we use a minimum length to not end up with too empty cells
        try:
            pa_tables: list[pa.Table] = []
+           columns = list(self.features.keys())
            truncated_columns: set[str] = set()
            for i in range(first_row_group_id, last_row_group_id + 1):
                rg_pa_table, rg_truncated_columns = row_group_readers[i].read_truncated_binary(
-                   self.supported_columns, max_binary_length=max_binary_length
+                   columns, max_binary_length=max_binary_length
                )
                pa_tables.append(rg_pa_table)
                truncated_columns |= set(rg_truncated_columns)
@@ -438,12 +436,10 @@ def query(self, offset: int, length: int) -> pa.Table:
            )
 
        with StepProfiler(method="parquet_index_with_metadata.query", step="read the row groups"):
+           columns = list(self.features.keys())
            try:
                pa_table = pa.concat_tables(
-                   [
-                       row_group_readers[i].read(self.supported_columns)
-                       for i in range(first_row_group_id, last_row_group_id + 1)
-                   ]
+                   [row_group_readers[i].read(columns) for i in range(first_row_group_id, last_row_group_id + 1)]
                )
            except ArrowInvalid as err:
                raise SchemaMismatchError("Parquet files have different schema.", err)
@@ -486,15 +482,9 @@ def from_parquet_metadata_items(
    ):
        if features is None:  # config-parquet version<6 didn't have features
            features = Features.from_arrow_schema(pq.read_schema(metadata_paths[0]))
-       # TODO(kszucs): since unsupported_features is always empty list we may omit the call below
-       supported_columns, unsupported_columns = get_supported_unsupported_columns(
-           features,
-           unsupported_features=[],
-       )
+
        return ParquetIndexWithMetadata(
            features=features,
-           supported_columns=supported_columns,
-           unsupported_columns=unsupported_columns,
            parquet_files_urls=parquet_files_urls,
            metadata_paths=metadata_paths,
            num_bytes=num_bytes,

libs/libcommon/src/libcommon/viewer_utils/features.py

Lines changed: 0 additions & 28 deletions
@@ -29,7 +29,6 @@
     Value,
     Video,
 )
-from datasets.features.features import FeatureType, _visit
 from PIL import Image as PILImage
 
 from libcommon.dtos import FeatureItem
@@ -529,30 +528,3 @@ def to_features_list(features: Features) -> list[FeatureItem]:
         }
         for idx, name in enumerate(features)
     ]
-
-
-def get_supported_unsupported_columns(
-    features: Features,
-    unsupported_features: list[FeatureType] = [],
-) -> tuple[list[str], list[str]]:
-    supported_columns, unsupported_columns = [], []
-
-    for column, feature in features.items():
-        str_column = str(column)
-        supported = True
-
-        def classify(feature: FeatureType) -> None:
-            nonlocal supported
-            for unsupported_feature in unsupported_features:
-                if type(unsupported_feature) is type(feature) is Value:
-                    if unsupported_feature.dtype == feature.dtype:
-                        supported = False
-                elif type(unsupported_feature) is type(feature):
-                    supported = False
-
-        _visit(feature, classify)
-        if supported:
-            supported_columns.append(str_column)
-        else:
-            unsupported_columns.append(str_column)
-    return supported_columns, unsupported_columns

libs/libcommon/tests/viewer_utils/test_features.py

Lines changed: 0 additions & 23 deletions
@@ -10,7 +10,6 @@
 import boto3
 import pytest
 from aiobotocore.response import StreamingBody
-from datasets import Audio, Features, Image, List, Pdf, Value
 from moto import mock_s3
 from urllib3._collections import HTTPHeaderDict
 
@@ -19,7 +18,6 @@
 from libcommon.url_preparator import URLPreparator
 from libcommon.viewer_utils.features import (
     get_cell_value,
-    get_supported_unsupported_columns,
     infer_audio_file_extension,
     to_features_list,
 )
@@ -102,27 +100,6 @@ def test_to_features_list(
     assert first_feature["type"] == datasets_fixture.expected_feature_type
 
 
-def test_get_supported_unsupported_columns() -> None:
-    features = Features(
-        {
-            "audio1": Audio(),
-            "audio2": Audio(sampling_rate=16_000),
-            "audio3": List(Audio()),
-            "image1": Image(),
-            "image2": Image(decode=False),
-            "image3": List(Image()),
-            "string": Value("string"),
-            "binary": Value("binary"),
-            "pdf": Pdf(),
-            "pdf2": [Pdf()],
-        }
-    )
-    unsupported_features = [Value("binary"), Audio()]
-    supported_columns, unsupported_columns = get_supported_unsupported_columns(features, unsupported_features)
-    assert supported_columns == ["image1", "image2", "image3", "string", "pdf", "pdf2"]
-    assert unsupported_columns == ["audio1", "audio2", "audio3", "binary"]
-
-
 # specific test created for https://github.com/huggingface/dataset-viewer/issues/2045
 # which is reproduced only when using s3 for fsspec
 def test_ogg_audio_with_s3(

services/rows/src/rows/routes/rows.py

Lines changed: 0 additions & 1 deletion
@@ -115,7 +115,6 @@ async def rows_endpoint(request: Request) -> Response:
                 pa_table=pa_table,
                 offset=offset,
                 features=rows_index.parquet_index.features,
-                unsupported_columns=rows_index.parquet_index.unsupported_columns,
                 partial=rows_index.parquet_index.partial,
                 num_rows_total=rows_index.parquet_index.num_rows_total,
                 truncated_columns=truncated_columns,

services/search/src/search/routes/filter.py

Lines changed: 3 additions & 7 deletions
@@ -33,7 +33,6 @@
 from libcommon.prometheus import StepProfiler
 from libcommon.storage import StrPath, clean_dir
 from libcommon.storage_client import StorageClient
-from libcommon.viewer_utils.features import get_supported_unsupported_columns
 from starlette.requests import Request
 from starlette.responses import Response
 
@@ -147,15 +146,13 @@ async def filter_endpoint(request: Request) -> Response:
                # features must contain the row idx column for full_text_search
                features = Features.from_dict(content_parquet_metadata["features"])
                features[ROW_IDX_COLUMN] = Value("int64")
-           with StepProfiler(method="filter_endpoint", step="get supported and unsupported columns"):
-               supported_columns, unsupported_columns = get_supported_unsupported_columns(
-                   features,
-               )
+           columns = list(features.keys())
+
            with StepProfiler(method="filter_endpoint", step="execute filter query"):
                num_rows_total, pa_table = await anyio.to_thread.run_sync(
                    execute_filter_query,
                    index_file_location,
-                   supported_columns,
+                   columns,
                    where,
                    orderby,
                    length,
@@ -180,7 +177,6 @@ async def filter_endpoint(request: Request) -> Response:
                pa_table=pa_table,
                offset=offset,
                features=features or Features.from_arrow_schema(pa_table.schema),
-               unsupported_columns=unsupported_columns,
                num_rows_total=num_rows_total,
                partial=partial,
                use_row_idx_column=True,

services/search/src/search/routes/search.py

Lines changed: 4 additions & 14 deletions
@@ -35,10 +35,7 @@
 from libcommon.prometheus import StepProfiler
 from libcommon.storage import StrPath, clean_dir
 from libcommon.storage_client import StorageClient
-from libcommon.viewer_utils.features import (
-    get_supported_unsupported_columns,
-    to_features_list,
-)
+from libcommon.viewer_utils.features import to_features_list
 from starlette.requests import Request
 from starlette.responses import Response
 
@@ -86,14 +83,11 @@ async def create_response(
     storage_client: StorageClient,
     offset: int,
     features: Features,
-    unsupported_columns: list[str],
     num_rows_total: int,
     partial: bool,
 ) -> PaginatedResponse:
     features_without_key = features.copy()
     features_without_key.pop(ROW_IDX_COLUMN, None)
-    if len(pa_table) > 0:
-        pa_table = pa_table.drop(unsupported_columns)
     logging.info(f"create response for {dataset=} {config=} {split=}")
 
     return PaginatedResponse(
@@ -107,7 +101,6 @@ async def create_response(
             storage_client=storage_client,
             offset=offset,
             features=features,
-            unsupported_columns=unsupported_columns,
             row_idx_column=ROW_IDX_COLUMN,
         ),
         num_rows_total=num_rows_total,
@@ -202,16 +195,14 @@ async def search_endpoint(request: Request) -> Response:
                # features must contain the row idx column for full_text_search
                features = Features.from_dict(content_parquet_metadata["features"])
                features[ROW_IDX_COLUMN] = Value("int64")
-           with StepProfiler(method="search_endpoint", step="get supported and unsupported columns"):
-               supported_columns, unsupported_columns = get_supported_unsupported_columns(
-                   features,
-               )
+           columns = list(features.keys())
+
            with StepProfiler(method="search_endpoint", step="perform FTS command"):
                logging.debug(f"connect to index file {index_file_location}")
                num_rows_total, pa_table = await anyio.to_thread.run_sync(
                    full_text_search,
                    index_file_location,
-                   supported_columns,
+                   columns,
                    query,
                    offset,
                    length,
@@ -235,7 +226,6 @@ async def search_endpoint(request: Request) -> Response:
                storage_client=cached_assets_storage_client,
                offset=offset,
                features=features or Features.from_arrow_schema(pa_table.schema),
-               unsupported_columns=unsupported_columns,
                num_rows_total=num_rows_total,
                partial=partial,
            )

services/search/tests/routes/test_filter.py

Lines changed: 0 additions & 1 deletion
@@ -153,7 +153,6 @@ async def test_create_response(ds: Dataset, app_config: AppConfig, storage_clien
         pa_table=pa_table,
         offset=0,
         features=ds.features,
-        unsupported_columns=[],
         num_rows_total=4,
         partial=False,
         use_row_idx_column=True,
