Skip to content

Commit 3c67982

Browse files
committed
refactor(libcommon): remove unused RowsIndex.partial and duckdb_index_is_partial()
1 parent dd1630b commit 3c67982

File tree

4 files changed

+3
-43
lines changed

4 files changed

+3
-43
lines changed

libs/libcommon/src/libcommon/duckdb_utils.py

Lines changed: 3 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,7 @@
1010
from tqdm.contrib.concurrent import thread_map
1111

1212
from libcommon.constants import ROW_IDX_COLUMN
13-
from libcommon.parquet_utils import (
14-
PARTIAL_PREFIX,
15-
is_list_pa_type,
16-
parquet_export_is_partial,
17-
)
13+
from libcommon.parquet_utils import is_list_pa_type
1814
from libcommon.statistics_utils import (
1915
STRING_DTYPES,
2016
AudioColumn,
@@ -37,8 +33,8 @@
3733
"CREATE OR REPLACE TABLE data AS SELECT {columns} FROM read_parquet({source});"
3834
)
3935
CREATE_TABLE_JOIN_WITH_TRANSFORMED_DATA_COMMAND_FROM_LIST_OF_PARQUET_FILES = """
40-
CREATE OR REPLACE TABLE data AS
41-
SELECT {columns}, transformed_df.* FROM read_parquet({source})
36+
CREATE OR REPLACE TABLE data AS
37+
SELECT {columns}, transformed_df.* FROM read_parquet({source})
4238
POSITIONAL JOIN transformed_df;
4339
"""
4440
CREATE_SEQUENCE_COMMAND = "CREATE OR REPLACE SEQUENCE serial START 0 MINVALUE 0;"
@@ -187,25 +183,6 @@ def compute_transformed_data(parquet_paths: list[Path], features: dict[str, Any]
187183
return transformed_df
188184

189185

190-
def duckdb_index_is_partial(duckdb_index_url: str) -> bool:
191-
"""
192-
Check if the DuckDB index is on the full dataset or if it's partial.
193-
It could be partial for two reasons:
194-
195-
1. if the Parquet export that was used to build it is partial
196-
2. if it's a partial index of the Parquet export
197-
198-
Args:
199-
duckdb_index_url (`str`): The URL of the DuckDB index file.
200-
201-
Returns:
202-
`bool`: True is the DuckDB index is partial,
203-
or False if it's an index of the full dataset.
204-
"""
205-
_, duckdb_index_file_name = duckdb_index_url.rsplit("/", 1)
206-
return parquet_export_is_partial(duckdb_index_url) or duckdb_index_file_name.startswith(PARTIAL_PREFIX)
207-
208-
209186
def create_index(
210187
database: str,
211188
input_table: str,

libs/libcommon/src/libcommon/parquet_utils.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,6 @@ class ParquetIndexWithMetadata:
191191
features: Features
192192
httpfs: HTTPFileSystem
193193
max_arrow_data_in_memory: int
194-
partial: bool
195194
metadata_dir: Path
196195

197196
file_offsets: np.ndarray = field(init=False)
@@ -401,7 +400,6 @@ def from_parquet_metadata_items(
401400
if not parquet_file_metadata_items:
402401
raise EmptyParquetMetadataError("No parquet files found.")
403402

404-
partial = parquet_export_is_partial(parquet_file_metadata_items[0]["url"])
405403
metadata_dir = Path(parquet_metadata_directory)
406404

407405
with StepProfiler(
@@ -425,7 +423,6 @@ def from_parquet_metadata_items(
425423
features=features,
426424
httpfs=httpfs,
427425
max_arrow_data_in_memory=max_arrow_data_in_memory,
428-
partial=partial,
429426
metadata_dir=metadata_dir,
430427
)
431428

libs/libcommon/tests/test_duckdb_utils.py

Lines changed: 0 additions & 13 deletions
This file was deleted.

services/worker/tests/job_runners/config/test_parquet_metadata.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,6 @@ def test_ParquetIndexWithMetadata_query(
404404
features=features,
405405
httpfs=httpfs,
406406
max_arrow_data_in_memory=999999999,
407-
partial=False,
408407
metadata_dir=metadata_dir,
409408
)
410409
with patch("libcommon.parquet_utils.HTTPFile", AuthenticatedHTTPFile):

0 commit comments

Comments
 (0)