Skip to content

Commit c7621b3

Browse files
committed
fix lint
1 parent 85802e9 commit c7621b3

File tree

2 files changed

+56
-17
lines changed

2 files changed

+56
-17
lines changed

src/iceberg/table_scan.cc

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
#include "iceberg/table_scan.h"
2121

22+
#include <ranges>
23+
2224
#include "iceberg/manifest_entry.h"
2325
#include "iceberg/manifest_list.h"
2426
#include "iceberg/manifest_reader.h"
@@ -64,6 +66,9 @@ int32_t FileScanTask::files_count() const {
6466
}
6567

6668
int64_t FileScanTask::estimated_row_count() const {
69+
if (data_file_->file_size_in_bytes == 0) {
70+
return 0;
71+
}
6772
const double scannedFileFraction =
6873
static_cast<double>(length_) / data_file_->file_size_in_bytes;
6974
return static_cast<int64_t>(scannedFileFraction * data_file_->record_count);
@@ -79,6 +84,7 @@ TableScanBuilder::TableScanBuilder(const Table& table,
7984

8085
// Stores the given column names on the builder and returns the builder itself
// so that calls can be chained.
//
// `column_names` is taken by value and moved into the member, so callers that
// pass an rvalue pay no copy.
TableScanBuilder& TableScanBuilder::WithColumnNames(
    std::vector<std::string> column_names) {
  // Move-assignment takes over the argument's storage and releases whatever
  // the member held before, so reserving capacity on the member beforehand
  // would only allocate a buffer that is thrown away on the next line.
  column_names_ = std::move(column_names);
  return *this;
}
@@ -205,34 +211,52 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataScan::PlanFiles() const {
205211
}
206212
}
207213

214+
DeleteFileIndex delete_file_index;
215+
delete_file_index.BuildIndex(positional_delete_entries);
216+
208217
// TODO(gty404): build residual expression from filter
209218
std::shared_ptr<Expression> residual;
210-
211219
std::vector<std::shared_ptr<FileScanTask>> tasks;
212220
for (const auto& data_entry : data_entries) {
213-
auto matched_deletes = GetMatchedDeletes(*data_entry, positional_delete_entries);
221+
auto matched_deletes = GetMatchedDeletes(*data_entry, delete_file_index);
214222
const auto& data_file = data_entry->data_file;
215223
tasks.emplace_back(std::make_shared<FileScanTask>(
216-
data_file, matched_deletes, 0, data_file->file_size_in_bytes, residual));
224+
data_file, std::move(matched_deletes), 0, data_file->file_size_in_bytes,
225+
std::move(residual)));
217226
}
218227
return tasks;
219228
}
220229

221-
std::vector<std::shared_ptr<DataFile>> DataScan::GetMatchedDeletes(
222-
const ManifestEntry& data_entry,
223-
const std::vector<std::unique_ptr<ManifestEntry>>& positional_delete_entries) const {
230+
void DataScan::DeleteFileIndex::BuildIndex(
231+
const std::vector<std::unique_ptr<ManifestEntry>>& entries) {
232+
sequence_index.clear();
233+
234+
for (const auto& entry : entries) {
235+
const int64_t seq_num =
236+
entry->sequence_number.value_or(Snapshot::kInitialSequenceNumber);
237+
sequence_index.emplace(seq_num, entry.get());
238+
}
239+
}
240+
241+
// Returns every indexed entry whose sequence number is greater than or equal to
// the sequence number of `data_entry`, in ascending sequence-number order.
// A data entry without a sequence number is treated as having
// Snapshot::kInitialSequenceNumber.
std::vector<ManifestEntry*> DataScan::DeleteFileIndex::FindRelevantEntries(
242+
const ManifestEntry& data_entry) const {
243+
std::vector<ManifestEntry*> relevant_deletes;
244+
245+
// Sequence number of the data entry (initial sequence number when unset); used below as the lower_bound key.
224246
auto data_sequence_number =
225247
data_entry.sequence_number.value_or(Snapshot::kInitialSequenceNumber);
226-
std::vector<const ManifestEntry*> relevant_entries;
227-
// TODO(gty404): consider using a more efficient data structure
228-
for (const auto& delete_entry : positional_delete_entries) {
229-
const int64_t delete_sequence_number =
230-
delete_entry->sequence_number.value_or(Snapshot::kInitialSequenceNumber);
231-
if (delete_sequence_number >= data_sequence_number) {
232-
relevant_entries.push_back(delete_entry.get());
233-
}
248+
for (auto it = sequence_index.lower_bound(data_sequence_number);
249+
it != sequence_index.end(); ++it) {
250+
// NOTE(review): placeholder - no additional filtering is performed yet; every entry from lower_bound to end() is appended as-is.
251+
relevant_deletes.push_back(it->second);
234252
}
235253

254+
return relevant_deletes;
255+
}
256+
257+
std::vector<std::shared_ptr<DataFile>> DataScan::GetMatchedDeletes(
258+
const ManifestEntry& data_entry, const DeleteFileIndex& delete_file_index) const {
259+
const auto relevant_entries = delete_file_index.FindRelevantEntries(data_entry);
236260
std::vector<std::shared_ptr<DataFile>> matched_deletes;
237261
if (relevant_entries.empty()) {
238262
return matched_deletes;

src/iceberg/table_scan.h

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ class ICEBERG_EXPORT FileScanTask : public ScanTask {
4949
int64_t length, std::shared_ptr<Expression> residual);
5050

5151
/// \brief The data file that should be read by this scan task.
52-
virtual const std::shared_ptr<DataFile>& data_file() const;
52+
const std::shared_ptr<DataFile>& data_file() const;
5353

5454
/// \brief The delete files that should be read by this scan task.
5555
const std::vector<std::shared_ptr<DataFile>>& delete_files() const;
@@ -155,12 +155,20 @@ class ICEBERG_EXPORT TableScan {
155155
/// \param file_io File I/O instance for reading manifests and data files.
156156
TableScan(TableScanContext context, std::shared_ptr<FileIO> file_io);
157157

158+
/// \brief Returns the snapshot being scanned.
159+
/// \return A shared pointer to the snapshot.
158160
const std::shared_ptr<Snapshot>& snapshot() const;
159161

162+
/// \brief Returns the projected schema for the scan.
163+
/// \return A shared pointer to the projected schema.
160164
const std::shared_ptr<Schema>& projection() const;
161165

166+
/// \brief Returns the scan context.
167+
/// \return A reference to the TableScanContext.
162168
const TableScanContext& context() const;
163169

170+
/// \brief Returns the file I/O instance used for reading manifests and data files.
171+
/// \return A shared pointer to the FileIO instance.
164172
const std::shared_ptr<FileIO>& io() const;
165173

166174
/// \brief Plans the scan tasks by resolving manifests and data files.
@@ -181,9 +189,16 @@ class ICEBERG_EXPORT DataScan : public TableScan {
181189
Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles() const override;
182190

183191
private:
192+
// Use indexed data structures for efficient lookups
193+
struct DeleteFileIndex {
194+
// Index by sequence number for quick filtering
195+
std::multimap<int64_t, ManifestEntry*> sequence_index;
196+
void BuildIndex(const std::vector<std::unique_ptr<ManifestEntry>>& entries);
197+
std::vector<ManifestEntry*> FindRelevantEntries(
198+
const ManifestEntry& data_entry) const;
199+
};
184200
std::vector<std::shared_ptr<DataFile>> GetMatchedDeletes(
185-
const ManifestEntry& data_entry,
186-
const std::vector<std::unique_ptr<ManifestEntry>>& positional_delete_entries) const;
201+
const ManifestEntry& data_entry, const DeleteFileIndex& delete_file_index) const;
187202
};
188203

189204
} // namespace iceberg

0 commit comments

Comments
 (0)