@@ -90,39 +90,40 @@ def test_overwrite_removes_only_selected_datafile(prepopulated_table: Table, dup
90
90
91
91
removed_files : List [DataFile ] = mt .deduplicate_data_files ()
92
92
93
- file_paths_after : Set [str ] = {df .file_path for df in mt ._get_all_datafiles ()}
94
-
95
- # Both files should remain, since they are not duplicates
96
- assert str (dupe_data_file_path ) in file_paths_after , "Expected file_a.parquet to remain in the table"
97
- assert len (removed_files ) == 0 , "Expected no files to be removed since there are no duplicates"
93
+ file_names_after : Set [str ] = {df .file_path .split ("/" )[- 1 ] for df in mt ._get_all_datafiles ()}
94
+ # Only one file with the same name should remain after deduplication
95
+ assert dupe_data_file_path .name in file_names_after , f"Expected { dupe_data_file_path .name } to remain in the table"
96
+ assert len (file_names_after ) == 1 , "Expected only one unique file name to remain after deduplication"
97
+ # All removed files should have the same file name
98
+ assert all (df .file_path .split ("/" )[- 1 ] == dupe_data_file_path .name for df in removed_files ), "All removed files should be duplicates by name"
98
99
99
100
100
101
def test_get_all_datafiles_current_snapshot(prepopulated_table: Table, dupe_data_file_path: Path) -> None:
    """Check that ``_get_all_datafiles`` lists the fixture's data file.

    Entries are matched by base name (the last ``/``-separated segment of
    each ``file_path``) against ``dupe_data_file_path.name``, rather than by
    the full path string.
    """
    maintenance = MaintenanceTable(tbl=prepopulated_table)
    listed: List[DataFile] = maintenance._get_all_datafiles()

    # Reduce every listed path to its final segment so the comparison does
    # not depend on any prefix carried by the stored file_path.
    base_names: Set[str] = set()
    for entry in listed:
        base_names.add(entry.file_path.split("/")[-1])

    assert dupe_data_file_path.name in base_names
106
107
107
108
108
109
def test_get_all_datafiles_all_snapshots(prepopulated_table: Table, dupe_data_file_path: Path) -> None:
    """Check that the fixture's data file name appears among all data files.

    The comparison uses only the base name of each ``file_path`` (its last
    ``/``-separated segment) and ``dupe_data_file_path.name``.

    NOTE(review): ``_get_all_datafiles`` is called here with no arguments,
    exactly as in the current-snapshot test -- confirm whether this test is
    meant to request a different snapshot scope.
    """
    maintenance = MaintenanceTable(tbl=prepopulated_table)
    listed: List[DataFile] = maintenance._get_all_datafiles()

    # Collect the trailing path segment of every listed data file.
    base_names: Set[str] = set(entry.file_path.split("/")[-1] for entry in listed)

    expected_name = dupe_data_file_path.name
    assert expected_name in base_names
114
115
115
116
116
117
def test_dedup_data_files_removes_duplicates_in_current_snapshot(prepopulated_table: Table, dupe_data_file_path: Path) -> None:
    """Exercise ``deduplicate_data_files`` and check the surviving references.

    File identity is compared by base name (the last ``/``-separated segment
    of ``file_path``) against ``dupe_data_file_path.name``. The test asserts
    that the name is referenced exactly once both before and after the
    deduplication call, and that every returned item is a ``DataFile``.
    """
    maintenance = MaintenanceTable(tbl=prepopulated_table)
    target_name = dupe_data_file_path.name

    # Before deduplication: the table is expected to reference the file name
    # exactly once.
    names_before: List[str] = []
    for entry in maintenance._get_all_datafiles():
        names_before.append(entry.file_path.split("/")[-1])
    assert names_before.count(target_name) == 1

    removed: List[DataFile] = maintenance.deduplicate_data_files()

    # After deduplication: exactly one reference to the file name must remain.
    names_after: List[str] = []
    for entry in maintenance._get_all_datafiles():
        names_after.append(entry.file_path.split("/")[-1])
    assert names_after.count(target_name) == 1

    # Whatever was reported as removed must consist solely of DataFile objects.
    assert all(isinstance(item, DataFile) for item in removed)
0 commit comments