Skip to content

Commit 9dc9c82

Browse files
committed
wip - refactor: update deduplication tests to use file names instead of full paths
1 parent 6036e12 commit 9dc9c82

File tree

1 file changed

+14
-13
lines changed

1 file changed

+14
-13
lines changed

tests/table/test_dedup_data_file_filepaths.py

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -90,39 +90,40 @@ def test_overwrite_removes_only_selected_datafile(prepopulated_table: Table, dup
9090

9191
removed_files: List[DataFile] = mt.deduplicate_data_files()
9292

93-
file_paths_after: Set[str] = {df.file_path for df in mt._get_all_datafiles()}
94-
95-
# Both files should remain, since they are not duplicates
96-
assert str(dupe_data_file_path) in file_paths_after, "Expected file_a.parquet to remain in the table"
97-
assert len(removed_files) == 0, "Expected no files to be removed since there are no duplicates"
93+
file_names_after: Set[str] = {df.file_path.split("/")[-1] for df in mt._get_all_datafiles()}
94+
# Only one file with the same name should remain after deduplication
95+
assert dupe_data_file_path.name in file_names_after, f"Expected {dupe_data_file_path.name} to remain in the table"
96+
assert len(file_names_after) == 1, "Expected only one unique file name to remain after deduplication"
97+
# All removed files should have the same file name
98+
assert all(df.file_path.split("/")[-1] == dupe_data_file_path.name for df in removed_files), "All removed files should be duplicates by name"
9899

99100

100101
def test_get_all_datafiles_current_snapshot(prepopulated_table: Table, dupe_data_file_path: Path) -> None:
101102
mt = MaintenanceTable(tbl=prepopulated_table)
102103

103104
datafiles: List[DataFile] = mt._get_all_datafiles()
104-
file_paths: Set[str] = {df.file_path for df in datafiles}
105-
assert str(dupe_data_file_path) in file_paths
105+
file_paths: Set[str] = {df.file_path.split("/")[-1] for df in datafiles}
106+
assert dupe_data_file_path.name in file_paths
106107

107108

108109
def test_get_all_datafiles_all_snapshots(prepopulated_table: Table, dupe_data_file_path: Path) -> None:
109110
mt = MaintenanceTable(tbl=prepopulated_table)
110111

111112
datafiles: List[DataFile] = mt._get_all_datafiles()
112-
file_paths: Set[str] = {df.file_path for df in datafiles}
113-
assert str(dupe_data_file_path) in file_paths
113+
file_paths: Set[str] = {df.file_path.split("/")[-1] for df in datafiles}
114+
assert dupe_data_file_path.name in file_paths
114115

115116

116117
def test_dedup_data_files_removes_duplicates_in_current_snapshot(prepopulated_table: Table, dupe_data_file_path: Path) -> None:
117118
mt = MaintenanceTable(tbl=prepopulated_table)
118119

119120
all_datafiles: List[DataFile] = mt._get_all_datafiles()
120-
file_paths: List[str] = [df.file_path for df in all_datafiles]
121+
file_paths: List[str] = [df.file_path.split("/")[-1] for df in all_datafiles]
121122
# Only one reference should remain after deduplication
122-
assert file_paths.count(str(dupe_data_file_path)) == 1
123+
assert file_paths.count(dupe_data_file_path.name) == 1
123124
removed: List[DataFile] = mt.deduplicate_data_files()
124125

125126
all_datafiles_after: List[DataFile] = mt._get_all_datafiles()
126-
file_paths_after: List[str] = [df.file_path for df in all_datafiles_after]
127-
assert file_paths_after.count(str(dupe_data_file_path)) == 1
127+
file_paths_after: List[str] = [df.file_path.split("/")[-1] for df in all_datafiles_after]
128+
assert file_paths_after.count(dupe_data_file_path.name) == 1
128129
assert all(isinstance(df, DataFile) for df in removed)

0 commit comments

Comments (0)