1919
2020#include " iceberg/table_scan.h"
2121
22+ #include < ranges>
23+
2224#include " iceberg/manifest_entry.h"
2325#include " iceberg/manifest_list.h"
2426#include " iceberg/manifest_reader.h"
@@ -64,6 +66,9 @@ int32_t FileScanTask::files_count() const {
6466}
6567
6668int64_t FileScanTask::estimated_row_count () const {
69+ if (data_file_->file_size_in_bytes == 0 ) {
70+ return 0 ;
71+ }
6772 const double scannedFileFraction =
6873 static_cast <double >(length_) / data_file_->file_size_in_bytes ;
6974 return static_cast <int64_t >(scannedFileFraction * data_file_->record_count );
@@ -79,6 +84,7 @@ TableScanBuilder::TableScanBuilder(const Table& table,
7984
8085TableScanBuilder& TableScanBuilder::WithColumnNames (
8186 std::vector<std::string> column_names) {
87+ column_names_.reserve (column_names.size ());
8288 column_names_ = std::move (column_names);
8389 return *this ;
8490}
@@ -205,34 +211,52 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataScan::PlanFiles() const {
205211 }
206212 }
207213
214+ DeleteFileIndex delete_file_index;
215+ delete_file_index.BuildIndex (positional_delete_entries);
216+
208217 // TODO(gty404): build residual expression from filter
209218 std::shared_ptr<Expression> residual;
210-
211219 std::vector<std::shared_ptr<FileScanTask>> tasks;
212220 for (const auto & data_entry : data_entries) {
213- auto matched_deletes = GetMatchedDeletes (*data_entry, positional_delete_entries );
221+ auto matched_deletes = GetMatchedDeletes (*data_entry, delete_file_index );
214222 const auto & data_file = data_entry->data_file ;
215223 tasks.emplace_back (std::make_shared<FileScanTask>(
216- data_file, matched_deletes, 0 , data_file->file_size_in_bytes , residual));
224+ data_file, std::move (matched_deletes), 0 , data_file->file_size_in_bytes ,
225+ std::move (residual)));
217226 }
218227 return tasks;
219228}
220229
221- std::vector<std::shared_ptr<DataFile>> DataScan::GetMatchedDeletes (
222- const ManifestEntry& data_entry,
223- const std::vector<std::unique_ptr<ManifestEntry>>& positional_delete_entries) const {
230+ void DataScan::DeleteFileIndex::BuildIndex (
231+ const std::vector<std::unique_ptr<ManifestEntry>>& entries) {
232+ sequence_index.clear ();
233+
234+ for (const auto & entry : entries) {
235+ const int64_t seq_num =
236+ entry->sequence_number .value_or (Snapshot::kInitialSequenceNumber );
237+ sequence_index.emplace (seq_num, entry.get ());
238+ }
239+ }
240+
241+ std::vector<ManifestEntry*> DataScan::DeleteFileIndex::FindRelevantEntries (
242+ const ManifestEntry& data_entry) const {
243+ std::vector<ManifestEntry*> relevant_deletes;
244+
245+ // Use lower_bound for efficient range search
224246 auto data_sequence_number =
225247 data_entry.sequence_number .value_or (Snapshot::kInitialSequenceNumber );
226- std::vector<const ManifestEntry*> relevant_entries;
227- // TODO(gty404): consider using a more efficient data structure
228- for (const auto & delete_entry : positional_delete_entries) {
229- const int64_t delete_sequence_number =
230- delete_entry->sequence_number .value_or (Snapshot::kInitialSequenceNumber );
231- if (delete_sequence_number >= data_sequence_number) {
232- relevant_entries.push_back (delete_entry.get ());
233- }
248+ for (auto it = sequence_index.lower_bound (data_sequence_number);
249+ it != sequence_index.end (); ++it) {
250+ // Additional filtering logic here
251+ relevant_deletes.push_back (it->second );
234252 }
235253
254+ return relevant_deletes;
255+ }
256+
257+ std::vector<std::shared_ptr<DataFile>> DataScan::GetMatchedDeletes (
258+ const ManifestEntry& data_entry, const DeleteFileIndex& delete_file_index) const {
259+ const auto relevant_entries = delete_file_index.FindRelevantEntries (data_entry);
236260 std::vector<std::shared_ptr<DataFile>> matched_deletes;
237261 if (relevant_entries.empty ()) {
238262 return matched_deletes;
0 commit comments