@@ -69,6 +69,7 @@ struct Plan
6969 IcebergHistory history;
7070 std::unordered_map<String, Int64> manifest_file_to_first_snapshot;
7171 std::unordered_map<String, std::vector<String>> manifest_list_to_manifest_files;
72+ std::unordered_map<String, std::unordered_set<Int64>> manifest_file_to_snapshots;
7273 std::unordered_map<Int64, std::vector<std::shared_ptr<DataFilePlan>>> snapshot_id_to_data_files;
7374 std::unordered_map<String, std::shared_ptr<DataFilePlan>> path_to_data_file;
7475 FileNamesGenerator generator;
@@ -111,7 +112,7 @@ Plan getPlan(
111112 IcebergHistory snapshots_info,
112113 const PersistentTableComponents & persistent_table_components,
113114 ObjectStoragePtr object_storage,
114- std::map<String, DB::ObjectStoragePtr> secondary_storages,
115+ std::map<String, DB::ObjectStoragePtr> & secondary_storages,
115116 StorageObjectStorageConfigurationPtr configuration,
116117 ContextPtr context,
117118 CompressionMethod compression_method)
@@ -156,7 +157,9 @@ Plan getPlan(
156157 {
157158 plan.manifest_list_to_manifest_files [snapshot.manifest_list_absolute_path ].push_back (manifest_file.manifest_file_absolute_path );
158159 if (!plan.manifest_file_to_first_snapshot .contains (manifest_file.manifest_file_absolute_path ))
160+ {
159161 plan.manifest_file_to_first_snapshot [manifest_file.manifest_file_absolute_path ] = snapshot.snapshot_id ;
162+ }
160163 auto manifest_file_content = getManifestFile (
161164 object_storage,
162165 configuration,
@@ -174,6 +177,8 @@ Plan getPlan(
174177 manifest_files[manifest_file.manifest_file_absolute_path ]->path = manifest_file.manifest_file_absolute_path ;
175178 }
176179 manifest_files[manifest_file.manifest_file_absolute_path ]->manifest_lists_path .push_back (snapshot.manifest_list_path );
180+ // Track which snapshots this manifest file belongs to
181+ plan.manifest_file_to_snapshots [manifest_file.manifest_file_absolute_path ].insert (snapshot.snapshot_id );
177182 auto data_files = manifest_file_content->getFilesWithoutDeleted (FileContentType::DATA);
178183 auto positional_delete_files = manifest_file_content->getFilesWithoutDeleted (FileContentType::POSITION_DELETE);
179184 for (const auto & pos_delete_file : positional_delete_files)
@@ -185,19 +190,24 @@ Plan getPlan(
185190 if (plan.partitions .size () <= partition_index)
186191 plan.partitions .push_back ({});
187192
188- IcebergDataObjectInfoPtr data_object_info = std::make_shared<IcebergDataObjectInfo>(data_file);
193+ auto [resolved_storage, resolved_key] = resolveObjectStorageForPath (
194+ persistent_table_components.table_location , data_file.file_path , object_storage, secondary_storages, context);
195+
196+ IcebergDataObjectInfoPtr data_object_info = std::make_shared<IcebergDataObjectInfo>(data_file, resolved_storage, resolved_key);
189197 std::shared_ptr<DataFilePlan> data_file_ptr;
190- if (!plan.path_to_data_file .contains (manifest_file.manifest_file_absolute_path ))
198+ std::string storage_identifier = resolved_storage->getDescription () + " :" + resolved_storage->getObjectsNamespace ();
199+ std::string composite_key = storage_identifier + " |" + resolved_key;
200+ if (!plan.path_to_data_file .contains (composite_key))
191201 {
192202 data_file_ptr = std::make_shared<DataFilePlan>(DataFilePlan{
193203 .data_object_info = data_object_info,
194204 .manifest_list = manifest_files[manifest_file.manifest_file_absolute_path ],
195205 .patched_path = plan.generator .generateDataFileName ()});
196- plan.path_to_data_file [manifest_file. manifest_file_absolute_path ] = data_file_ptr;
206+ plan.path_to_data_file [composite_key ] = data_file_ptr;
197207 }
198208 else
199209 {
200- data_file_ptr = plan.path_to_data_file [manifest_file. manifest_file_absolute_path ];
210+ data_file_ptr = plan.path_to_data_file [composite_key ];
201211 }
202212 plan.partitions [partition_index].push_back (data_file_ptr);
203213 plan.snapshot_id_to_data_files [snapshot.snapshot_id ].push_back (plan.partitions [partition_index].back ());
@@ -229,15 +239,18 @@ void writeDataFiles(
229239 ObjectStoragePtr object_storage,
230240 const std::optional<FormatSettings> & format_settings,
231241 ContextPtr context,
232- StorageObjectStorageConfigurationPtr configuration)
242+ StorageObjectStorageConfigurationPtr configuration,
243+ const String & table_location,
244+ std::map<String, ObjectStoragePtr> & secondary_storages)
233245{
234246 for (auto & [_, data_file] : initial_plan.path_to_data_file )
235247 {
236248 auto delete_file_transform = std::make_shared<IcebergBitmapPositionDeleteTransform>(
237- sample_block, data_file->data_object_info , object_storage, format_settings, context);
249+ sample_block, data_file->data_object_info , object_storage, format_settings, context, table_location, secondary_storages );
238250
251+ ObjectStoragePtr storage_to_use = data_file->data_object_info ->getObjectStorage ().value_or (object_storage);
239252 StorageObjectStorage::ObjectInfo object_info (data_file->data_object_info ->getPath ());
240- auto read_buffer = createReadBuffer (object_info, object_storage , context, getLogger (" IcebergCompaction" ));
253+ auto read_buffer = createReadBuffer (object_info, storage_to_use , context, getLogger (" IcebergCompaction" ));
241254
242255 const Settings & settings = context->getSettingsRef ();
243256 auto parser_shared_resources = std::make_shared<FormatParserSharedResources>(
@@ -395,6 +408,9 @@ void writeMetadataFiles(
395408 {
396409 manifest_entry->patched_path = plan.generator .generateManifestEntryName ();
397410 manifest_file_renamings[manifest_entry->path ] = manifest_entry->patched_path .path_in_metadata ;
411+
412+ std::vector<String> unique_data_filenames (data_filenames.begin (), data_filenames.end ());
413+
398414 auto buffer_manifest_entry = object_storage->writeObject (
399415 StoredObject (manifest_entry->patched_path .path_in_storage ),
400416 WriteMode::Rewrite,
@@ -412,7 +428,7 @@ void writeMetadataFiles(
412428 partition_columns,
413429 plan.partition_encoder .getPartitionValue (grouped_by_manifest_files_partitions[manifest_entry]),
414430 ChunkPartitioner (fields_from_partition_spec, current_schema, context, sample_block_).getResultTypes (),
415- std::vector (data_filenames. begin (), data_filenames. end ()) ,
431+ unique_data_filenames ,
416432 manifest_entry->statistics ,
417433 sample_block_,
418434 snapshot,
@@ -441,16 +457,25 @@ void writeMetadataFiles(
441457 if (plan.history [i].added_files == 0 )
442458 continue ;
443459
444- auto initial_manifest_list_name = plan.history [i].manifest_list_path ;
460+ auto initial_manifest_list_name = plan.history [i].manifest_list_absolute_path ;
445461 auto initial_manifest_entries = plan.manifest_list_to_manifest_files [initial_manifest_list_name];
446- auto renamed_manifest_list = manifest_list_renamings[initial_manifest_list_name ];
462+ auto renamed_manifest_list = manifest_list_renamings[plan. history [i]. manifest_list_path ];
447463 std::vector<String> renamed_manifest_entries;
464+ std::unordered_set<String> seen_manifest_entries; // Deduplicate manifest entries
448465 Int32 total_manifest_file_sizes = 0 ;
449466 for (const auto & initial_manifest_entry : initial_manifest_entries)
450467 {
451468 auto renamed_manifest_entry = manifest_file_renamings[initial_manifest_entry];
452469 if (!renamed_manifest_entry.empty ())
453470 {
471+ auto it = plan.manifest_file_to_snapshots .find (initial_manifest_entry);
472+ if (it != plan.manifest_file_to_snapshots .end () && !it->second .contains (plan.history [i].snapshot_id ))
473+ continue ;
474+
475+ if (seen_manifest_entries.contains (renamed_manifest_entry))
476+ continue ;
477+
478+ seen_manifest_entries.insert (renamed_manifest_entry);
454479 renamed_manifest_entries.push_back (renamed_manifest_entry);
455480 total_manifest_file_sizes += manifest_file_sizes[renamed_manifest_entry];
456481 }
@@ -513,7 +538,7 @@ void compactIcebergTable(
513538 IcebergHistory snapshots_info,
514539 const PersistentTableComponents & persistent_table_components,
515540 ObjectStoragePtr object_storage_,
516- std::map<String, DB::ObjectStoragePtr> secondary_storages_,
541+ std::map<String, DB::ObjectStoragePtr> & secondary_storages_,
517542 StorageObjectStorageConfigurationPtr configuration_,
518543 const std::optional<FormatSettings> & format_settings_,
519544 SharedHeader sample_block_,
@@ -525,7 +550,7 @@ void compactIcebergTable(
525550 if (plan.need_optimize )
526551 {
527552 auto old_files = getOldFiles (object_storage_, configuration_);
528- writeDataFiles (plan, sample_block_, object_storage_, format_settings_, context_, configuration_);
553+ writeDataFiles (plan, sample_block_, object_storage_, format_settings_, context_, configuration_, persistent_table_components. table_location , secondary_storages_ );
529554 writeMetadataFiles (plan, object_storage_, configuration_, context_, sample_block_);
530555 clearOldFiles (object_storage_, old_files);
531556 }
0 commit comments