Skip to content

Commit 8705a82

Browse files
committed
resolve some comments
1 parent e4af0e7 commit 8705a82

File tree

2 files changed

+16
-121
lines changed

2 files changed

+16
-121
lines changed

src/iceberg/table_scan.cc

Lines changed: 9 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -33,95 +33,17 @@
3333

3434
namespace iceberg {
3535

36-
namespace {
37-
/// \brief Use indexed data structures for efficient lookups
38-
class DeleteFileIndex {
39-
public:
40-
/// \brief Build the index from a list of manifest entries.
41-
explicit DeleteFileIndex(const std::vector<std::unique_ptr<ManifestEntry>>& entries) {
42-
for (const auto& entry : entries) {
43-
const int64_t seq_num =
44-
entry->sequence_number.value_or(TableMetadata::kInitialSequenceNumber);
45-
sequence_index.emplace(seq_num, entry.get());
46-
}
47-
}
48-
49-
/// \brief Find delete files that match the sequence number of a data entry.
50-
std::vector<ManifestEntry*> FindRelevantEntries(const ManifestEntry& data_entry) const {
51-
std::vector<ManifestEntry*> relevant_deletes;
52-
53-
// Use lower_bound for efficient range search
54-
auto data_sequence_number =
55-
data_entry.sequence_number.value_or(TableMetadata::kInitialSequenceNumber);
56-
for (auto it = sequence_index.lower_bound(data_sequence_number);
57-
it != sequence_index.end(); ++it) {
58-
// Additional filtering logic here
59-
relevant_deletes.push_back(it->second);
60-
}
61-
62-
return relevant_deletes;
63-
}
64-
65-
private:
66-
/// \brief Index by sequence number for quick filtering
67-
std::multimap<int64_t, ManifestEntry*> sequence_index;
68-
};
69-
70-
/// \brief Get matched delete files for a given data entry.
71-
std::vector<std::shared_ptr<DataFile>> GetMatchedDeletes(
72-
const ManifestEntry& data_entry, const DeleteFileIndex& delete_file_index) {
73-
const auto relevant_entries = delete_file_index.FindRelevantEntries(data_entry);
74-
std::vector<std::shared_ptr<DataFile>> matched_deletes;
75-
if (relevant_entries.empty()) {
76-
return matched_deletes;
77-
}
78-
79-
matched_deletes.reserve(relevant_entries.size());
80-
for (const auto& delete_entry : relevant_entries) {
81-
// TODO(gty404): check if the delete entry contains the data entry's file path
82-
matched_deletes.emplace_back(delete_entry->data_file);
83-
}
84-
return matched_deletes;
85-
}
86-
} // namespace
87-
8836
// implement FileScanTask
89-
FileScanTask::FileScanTask(std::shared_ptr<DataFile> file,
90-
std::vector<std::shared_ptr<DataFile>> delete_files,
91-
std::shared_ptr<Expression> residual)
92-
: data_file_(std::move(file)),
93-
delete_files_(std::move(delete_files)),
94-
residual_(std::move(residual)) {}
37+
FileScanTask::FileScanTask(std::shared_ptr<DataFile> file)
38+
: data_file_(std::move(file)) {}
9539

9640
const std::shared_ptr<DataFile>& FileScanTask::data_file() const { return data_file_; }
9741

98-
const std::vector<std::shared_ptr<DataFile>>& FileScanTask::delete_files() const {
99-
return delete_files_;
100-
}
101-
102-
int64_t FileScanTask::SizeBytes() const {
103-
int64_t size_in_bytes = data_file_->file_size_in_bytes;
104-
std::ranges::for_each(delete_files_, [&size_in_bytes](const auto& delete_file) {
105-
size_in_bytes += delete_file->file_size_in_bytes;
106-
});
107-
return size_in_bytes;
108-
}
42+
int64_t FileScanTask::size_bytes() const { return data_file_->file_size_in_bytes; }
10943

110-
int32_t FileScanTask::FilesCount() const {
111-
return static_cast<int32_t>(delete_files_.size() + 1);
112-
}
44+
int32_t FileScanTask::files_count() const { return 1; }
11345

114-
int64_t FileScanTask::EstimatedRowCount() const {
115-
if (data_file_->file_size_in_bytes == 0) {
116-
return 0;
117-
}
118-
const auto size_in_bytes = data_file_->file_size_in_bytes;
119-
const double scannedFileFraction =
120-
static_cast<double>(size_in_bytes) / data_file_->file_size_in_bytes;
121-
return static_cast<int64_t>(scannedFileFraction * data_file_->record_count);
122-
}
123-
124-
const std::shared_ptr<Expression>& FileScanTask::residual() const { return residual_; }
46+
int64_t FileScanTask::estimated_row_count() const { return data_file_->record_count; }
12547

12648
TableScanBuilder::TableScanBuilder(std::shared_ptr<TableMetadata> table_metadata,
12749
std::shared_ptr<FileIO> file_io)
@@ -175,7 +97,7 @@ Result<std::unique_ptr<TableScan>> TableScanBuilder::Build() {
17597
auto iter = std::ranges::find_if(
17698
table_metadata->snapshots,
17799
[id = *snapshot_id](const auto& snapshot) { return snapshot->snapshot_id == id; });
178-
if (iter == table_metadata->snapshots.end() || *iter == nullptr) {
100+
if (iter == table_metadata->snapshots.end()) {
179101
return NotFound("Snapshot with ID {} is not found", *snapshot_id);
180102
}
181103
context_.snapshot = *iter;
@@ -243,8 +165,7 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() co
243165
CreateManifestListReader(context_.snapshot->manifest_list));
244166
ICEBERG_ASSIGN_OR_RAISE(auto manifest_files, manifest_list_reader->Files());
245167

246-
std::vector<std::unique_ptr<ManifestEntry>> data_entries;
247-
std::vector<std::unique_ptr<ManifestEntry>> positional_delete_entries;
168+
std::vector<std::shared_ptr<FileScanTask>> tasks;
248169
for (const auto& manifest_file : manifest_files) {
249170
ICEBERG_ASSIGN_OR_RAISE(auto manifest_reader,
250171
CreateManifestReader(manifest_file->manifest_path));
@@ -256,29 +177,15 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() co
256177
const auto& data_file = manifest_entry->data_file;
257178
switch (data_file->content) {
258179
case DataFile::Content::kData:
259-
data_entries.push_back(std::move(manifest_entry));
180+
tasks.emplace_back(std::make_shared<FileScanTask>(manifest_entry->data_file));
260181
break;
261182
case DataFile::Content::kPositionDeletes:
262-
// TODO(gty404): check if the sequence number is greater than or equal to the
263-
// minimum sequence number of all manifest entries
264-
positional_delete_entries.push_back(std::move(manifest_entry));
265-
break;
266183
case DataFile::Content::kEqualityDeletes:
267-
return NotSupported("Equality deletes are not supported in data scan");
184+
return NotSupported("Equality/Position deletes are not supported in data scan");
268185
}
269186
}
270187
}
271188

272-
// TODO(gty404): build residual expression from filter
273-
std::shared_ptr<Expression> residual;
274-
std::vector<std::shared_ptr<FileScanTask>> tasks;
275-
DeleteFileIndex delete_file_index(positional_delete_entries);
276-
for (const auto& data_entry : data_entries) {
277-
auto matched_deletes = GetMatchedDeletes(*data_entry, delete_file_index);
278-
const auto& data_file = data_entry->data_file;
279-
tasks.emplace_back(
280-
std::make_shared<FileScanTask>(data_file, std::move(matched_deletes), residual));
281-
}
282189
return tasks;
283190
}
284191

src/iceberg/table_scan.h

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,42 +33,30 @@ class ICEBERG_EXPORT ScanTask {
3333
virtual ~ScanTask() = default;
3434

3535
/// \brief The number of bytes that should be read by this scan task.
36-
virtual int64_t SizeBytes() const = 0;
36+
virtual int64_t size_bytes() const = 0;
3737

3838
/// \brief The number of files that should be read by this scan task.
39-
virtual int32_t FilesCount() const = 0;
39+
virtual int32_t files_count() const = 0;
4040

4141
/// \brief The number of rows that should be read by this scan task.
42-
virtual int64_t EstimatedRowCount() const = 0;
42+
virtual int64_t estimated_row_count() const = 0;
4343
};
4444

4545
/// \brief Task representing a data file and its corresponding delete files.
4646
class ICEBERG_EXPORT FileScanTask : public ScanTask {
4747
public:
48-
FileScanTask(std::shared_ptr<DataFile> file,
49-
std::vector<std::shared_ptr<DataFile>> delete_files,
50-
std::shared_ptr<Expression> residual);
48+
explicit FileScanTask(std::shared_ptr<DataFile> file);
5149

5250
/// \brief The data file that should be read by this scan task.
5351
const std::shared_ptr<DataFile>& data_file() const;
5452

55-
/// \brief The delete files that should be read by this scan task.
56-
const std::vector<std::shared_ptr<DataFile>>& delete_files() const;
57-
58-
/// \brief The residual expression to apply after scanning the data file.
59-
const std::shared_ptr<Expression>& residual() const;
60-
61-
int64_t SizeBytes() const override;
62-
int32_t FilesCount() const override;
63-
int64_t EstimatedRowCount() const override;
53+
int64_t size_bytes() const override;
54+
int32_t files_count() const override;
55+
int64_t estimated_row_count() const override;
6456

6557
private:
6658
/// \brief Data file metadata.
6759
std::shared_ptr<DataFile> data_file_;
68-
/// \brief Delete files metadata.
69-
std::vector<std::shared_ptr<DataFile>> delete_files_;
70-
/// \brief Residual expression to apply.
71-
std::shared_ptr<Expression> residual_;
7260
};
7361

7462
/// \brief Scan context holding snapshot and scan-specific metadata.

0 commit comments

Comments
 (0)