22 | 22 | from libcommon.prometheus import StepProfiler |
23 | 23 | from libcommon.simple_cache import get_previous_step_or_raise |
24 | 24 | from libcommon.storage import StrPath |
25 | | -from libcommon.viewer_utils.features import get_supported_unsupported_columns |
26 | 25 |
27 | 26 | # For partial Parquet export we have paths like "en/partial-train/0000.parquet". |
28 | 27 | # "-" is not allowed in split names so we use it in the prefix to avoid collisions.
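
The naming convention this comment describes can be illustrated with a short sketch (the helper name below is hypothetical, not this file's; only the "partial-" prefix rule comes from the comment above):

    PARTIAL_PREFIX = "partial-"

    def parse_split_directory(directory: str) -> tuple[str, bool]:
        # A "partial-" prefix cannot collide with a real split name,
        # since "-" is forbidden in split names.
        if directory.startswith(PARTIAL_PREFIX):
            return directory[len(PARTIAL_PREFIX):], True
        return directory, False

    assert parse_split_directory("partial-train") == ("train", True)
    assert parse_split_directory("train") == ("train", False)
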
@@ -181,8 +180,6 @@ def read_size(self, columns: Optional[Iterable[str]] = None) -> int: |
181 | 180 | @dataclass |
182 | 181 | class ParquetIndexWithMetadata: |
183 | 182 | features: Features |
184 | | - supported_columns: list[str] |
185 | | - unsupported_columns: list[str] |
186 | 183 | parquet_files_urls: list[str] |
187 | 184 | metadata_paths: list[str] |
188 | 185 | num_bytes: list[int] |
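
For reference, a minimal sketch of the dataclass as it stands after this hunk (abridged to the fields visible here; the real class declares more):

    from dataclasses import dataclass
    from datasets import Features

    @dataclass
    class ParquetIndexWithMetadata:
        features: Features
        parquet_files_urls: list[str]
        metadata_paths: list[str]
        num_bytes: list[int]
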
@@ -329,10 +326,11 @@ def query_truncated_binary(self, offset: int, length: int) -> tuple[pa.Table, li |
329 | 326 | ) # we use a minimum length to avoid ending up with cells that are too empty
330 | 327 | try: |
331 | 328 | pa_tables: list[pa.Table] = [] |
| 329 | + columns = list(self.features.keys()) |
332 | 330 | truncated_columns: set[str] = set() |
333 | 331 | for i in range(first_row_group_id, last_row_group_id + 1): |
334 | 332 | rg_pa_table, rg_truncated_columns = row_group_readers[i].read_truncated_binary( |
335 | | - self.supported_columns, max_binary_length=max_binary_length |
| 333 | + columns, max_binary_length=max_binary_length |
336 | 334 | ) |
337 | 335 | pa_tables.append(rg_pa_table) |
338 | 336 | truncated_columns |= set(rg_truncated_columns) |
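
The replacement works because Features is dict-like, so its keys are exactly the column names; and since get_supported_unsupported_columns was always called with unsupported_features=[] (see the TODO removed in a later hunk), supported_columns was always the full column list. A minimal sketch, assuming the datasets library:

    from datasets import Features, Value

    features = Features({"id": Value("int64"), "text": Value("string")})
    columns = list(features.keys())  # dict-like: keys are the column names
    assert columns == ["id", "text"]
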
@@ -438,12 +436,10 @@ def query(self, offset: int, length: int) -> pa.Table: |
438 | 436 | ) |
439 | 437 |
440 | 438 | with StepProfiler(method="parquet_index_with_metadata.query", step="read the row groups"): |
| 439 | + columns = list(self.features.keys()) |
441 | 440 | try: |
442 | 441 | pa_table = pa.concat_tables( |
443 | | - [ |
444 | | - row_group_readers[i].read(self.supported_columns) |
445 | | - for i in range(first_row_group_id, last_row_group_id + 1) |
446 | | - ] |
| 442 | + [row_group_readers[i].read(columns) for i in range(first_row_group_id, last_row_group_id + 1)] |
447 | 443 | ) |
448 | 444 | except ArrowInvalid as err: |
449 | 445 | raise SchemaMismatchError("Parquet files have different schema.", err) |
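
For context, the column-restricted read that row_group_readers[i].read(columns) performs resembles the public pyarrow.parquet API below (a hedged sketch; the function name and wrapper are illustrative, not this file's helpers):

    import pyarrow as pa
    import pyarrow.parquet as pq

    def read_row_group(path: str, row_group_id: int, columns: list[str]) -> pa.Table:
        # Read a single row group, restricted to the requested columns.
        parquet_file = pq.ParquetFile(path)
        return parquet_file.read_row_group(row_group_id, columns=columns)
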
@@ -486,15 +482,9 @@ def from_parquet_metadata_items( |
486 | 482 | ): |
487 | 483 | if features is None: # config-parquet version<6 didn't have features |
488 | 484 | features = Features.from_arrow_schema(pq.read_schema(metadata_paths[0])) |
489 | | - # TODO(kszucs): since unsupported_features is always empty list we may omit the call below |
490 | | - supported_columns, unsupported_columns = get_supported_unsupported_columns( |
491 | | - features, |
492 | | - unsupported_features=[], |
493 | | - ) |
| 485 | + |
494 | 486 | return ParquetIndexWithMetadata( |
495 | 487 | features=features, |
496 | | - supported_columns=supported_columns, |
497 | | - unsupported_columns=unsupported_columns, |
498 | 488 | parquet_files_urls=parquet_files_urls, |
499 | 489 | metadata_paths=metadata_paths, |
500 | 490 | num_bytes=num_bytes, |
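
The pre-v6 fallback kept in this hunk reconstructs Features from the first metadata file's schema; isolated as a sketch (the function name is hypothetical):

    import pyarrow.parquet as pq
    from datasets import Features

    def infer_features(metadata_paths: list[str]) -> Features:
        # config-parquet responses older than version 6 carry no features,
        # so they are rebuilt from the first parquet metadata file's schema.
        return Features.from_arrow_schema(pq.read_schema(metadata_paths[0]))
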