Skip to content

Commit 29e8865

Browse files
committed
resolve some comments
1 parent 368e268 commit 29e8865

File tree

4 files changed

+113
-90
lines changed

4 files changed

+113
-90
lines changed

src/iceberg/table.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "iceberg/schema.h"
2424
#include "iceberg/sort_order.h"
2525
#include "iceberg/table_metadata.h"
26+
#include "iceberg/table_scan.h"
2627

2728
namespace iceberg {
2829

@@ -107,4 +108,8 @@ const std::vector<SnapshotLogEntry>& Table::history() const {
107108

108109
const std::shared_ptr<FileIO>& Table::io() const { return io_; }
109110

111+
std::unique_ptr<TableScanBuilder> Table::NewScan() const {
112+
return std::make_unique<TableScanBuilder>(*this, metadata_);
113+
}
114+
110115
} // namespace iceberg

src/iceberg/table.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ class ICEBERG_EXPORT Table {
108108
///
109109
/// Once a table scan builder is created, it can be refined to project columns and
110110
/// filter data.
111-
virtual std::unique_ptr<TableScanBuilder> NewScan() const = 0;
111+
virtual std::unique_ptr<TableScanBuilder> NewScan() const;
112112

113113
/// \brief Returns a FileIO to read and write table data and metadata files
114114
const std::shared_ptr<FileIO>& io() const;

src/iceberg/table_scan.cc

Lines changed: 72 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,58 @@
3333

3434
namespace iceberg {
3535

36+
namespace {
37+
/// \brief Use indexed data structures for efficient lookups
38+
struct DeleteFileIndex {
39+
/// \brief Index by sequence number for quick filtering
40+
std::multimap<int64_t, ManifestEntry*> sequence_index;
41+
42+
/// \brief Build the index from a list of manifest entries.
43+
void BuildIndex(const std::vector<std::unique_ptr<ManifestEntry>>& entries) {
44+
sequence_index.clear();
45+
46+
for (const auto& entry : entries) {
47+
const int64_t seq_num =
48+
entry->sequence_number.value_or(Snapshot::kInitialSequenceNumber);
49+
sequence_index.emplace(seq_num, entry.get());
50+
}
51+
}
52+
53+
/// \brief Find delete files that match the sequence number of a data entry.
54+
std::vector<ManifestEntry*> FindRelevantEntries(const ManifestEntry& data_entry) const {
55+
std::vector<ManifestEntry*> relevant_deletes;
56+
57+
// Use lower_bound for efficient range search
58+
auto data_sequence_number =
59+
data_entry.sequence_number.value_or(Snapshot::kInitialSequenceNumber);
60+
for (auto it = sequence_index.lower_bound(data_sequence_number);
61+
it != sequence_index.end(); ++it) {
62+
// Additional filtering logic here
63+
relevant_deletes.push_back(it->second);
64+
}
65+
66+
return relevant_deletes;
67+
}
68+
};
69+
70+
/// \brief Get matched delete files for a given data entry.
71+
std::vector<std::shared_ptr<DataFile>> GetMatchedDeletes(
72+
const ManifestEntry& data_entry, const DeleteFileIndex& delete_file_index) {
73+
const auto relevant_entries = delete_file_index.FindRelevantEntries(data_entry);
74+
std::vector<std::shared_ptr<DataFile>> matched_deletes;
75+
if (relevant_entries.empty()) {
76+
return matched_deletes;
77+
}
78+
79+
matched_deletes.reserve(relevant_entries.size());
80+
for (const auto& delete_entry : relevant_entries) {
81+
// TODO(gty404): check if the delete entry contains the data entry's file path
82+
matched_deletes.emplace_back(delete_entry->data_file);
83+
}
84+
return matched_deletes;
85+
}
86+
} // namespace
87+
3688
// implement FileScanTask
3789
FileScanTask::FileScanTask(std::shared_ptr<DataFile> file,
3890
std::vector<std::shared_ptr<DataFile>> delete_files,
@@ -122,43 +174,46 @@ TableScanBuilder& TableScanBuilder::WithLimit(std::optional<int64_t> limit) {
122174

123175
Result<std::unique_ptr<TableScan>> TableScanBuilder::Build() {
124176
if (snapshot_id_) {
125-
ICEBERG_ASSIGN_OR_RAISE(context_.snapshot, table_.snapshot(*snapshot_id_));
177+
ICEBERG_ASSIGN_OR_RAISE(context_.snapshot, table_.SnapshotById(*snapshot_id_));
126178
} else {
127-
context_.snapshot = table_.current_snapshot();
179+
ICEBERG_ASSIGN_OR_RAISE(context_.snapshot, table_.current_snapshot());
128180
}
129181
if (context_.snapshot == nullptr) {
130-
return InvalidArgument("No snapshot found for table {}", table_.name());
182+
return InvalidArgument("No snapshot found for table {}", table_.name().name);
131183
}
132184

133185
if (!context_.projected_schema) {
134186
std::shared_ptr<Schema> schema;
135187
const auto& snapshot = context_.snapshot;
136188
if (snapshot->schema_id) {
137-
const auto& schemas = table_.schemas();
189+
const auto& schemas = *table_.schemas();
138190
if (const auto it = schemas.find(*snapshot->schema_id); it != schemas.end()) {
139191
schema = it->second;
140192
} else {
141193
return InvalidArgument("Schema {} in snapshot {} is not found",
142194
*snapshot->schema_id, snapshot->snapshot_id);
143195
}
144196
} else {
145-
schema = table_.schema();
197+
ICEBERG_ASSIGN_OR_RAISE(schema, table_.schema());
146198
}
147199

148-
// TODO(gty404): collect touched columns from filter expression
149-
std::vector<SchemaField> projected_fields;
150-
projected_fields.reserve(column_names_.size());
151-
for (const auto& column_name : column_names_) {
152-
// TODO(gty404): support case-insensitive column names
153-
auto field_opt = schema->GetFieldByName(column_name);
154-
if (!field_opt) {
155-
return InvalidArgument("Column {} not found in schema", column_name);
200+
if (column_names_.empty()) {
201+
context_.projected_schema = schema;
202+
} else {
203+
// TODO(gty404): collect touched columns from filter expression
204+
std::vector<SchemaField> projected_fields;
205+
projected_fields.reserve(column_names_.size());
206+
for (const auto& column_name : column_names_) {
207+
// TODO(gty404): support case-insensitive column names
208+
auto field_opt = schema->GetFieldByName(column_name);
209+
if (!field_opt) {
210+
return InvalidArgument("Column {} not found in schema", column_name);
211+
}
212+
projected_fields.emplace_back(field_opt.value().get());
156213
}
157-
projected_fields.emplace_back(field_opt.value().get());
214+
context_.projected_schema =
215+
std::make_shared<Schema>(std::move(projected_fields), schema->schema_id());
158216
}
159-
160-
context_.projected_schema =
161-
std::make_shared<Schema>(std::move(projected_fields), schema->schema_id());
162217
}
163218

164219
return std::make_unique<DataScan>(std::move(context_), table_.io());
@@ -227,47 +282,4 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataScan::PlanFiles() const {
227282
return tasks;
228283
}
229284

230-
void DataScan::DeleteFileIndex::BuildIndex(
231-
const std::vector<std::unique_ptr<ManifestEntry>>& entries) {
232-
sequence_index.clear();
233-
234-
for (const auto& entry : entries) {
235-
const int64_t seq_num =
236-
entry->sequence_number.value_or(Snapshot::kInitialSequenceNumber);
237-
sequence_index.emplace(seq_num, entry.get());
238-
}
239-
}
240-
241-
std::vector<ManifestEntry*> DataScan::DeleteFileIndex::FindRelevantEntries(
242-
const ManifestEntry& data_entry) const {
243-
std::vector<ManifestEntry*> relevant_deletes;
244-
245-
// Use lower_bound for efficient range search
246-
auto data_sequence_number =
247-
data_entry.sequence_number.value_or(Snapshot::kInitialSequenceNumber);
248-
for (auto it = sequence_index.lower_bound(data_sequence_number);
249-
it != sequence_index.end(); ++it) {
250-
// Additional filtering logic here
251-
relevant_deletes.push_back(it->second);
252-
}
253-
254-
return relevant_deletes;
255-
}
256-
257-
std::vector<std::shared_ptr<DataFile>> DataScan::GetMatchedDeletes(
258-
const ManifestEntry& data_entry, const DeleteFileIndex& delete_file_index) {
259-
const auto relevant_entries = delete_file_index.FindRelevantEntries(data_entry);
260-
std::vector<std::shared_ptr<DataFile>> matched_deletes;
261-
if (relevant_entries.empty()) {
262-
return matched_deletes;
263-
}
264-
265-
matched_deletes.reserve(relevant_entries.size());
266-
for (const auto& delete_entry : relevant_entries) {
267-
// TODO(gty404): check if the delete entry contains the data entry's file path
268-
matched_deletes.emplace_back(delete_entry->data_file);
269-
}
270-
return matched_deletes;
271-
}
272-
273285
} // namespace iceberg

src/iceberg/table_scan.h

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
namespace iceberg {
2929

30+
/// \brief Represents a task to scan a table or a portion of it.
3031
class ICEBERG_EXPORT ScanTask {
3132
public:
3233
virtual ~ScanTask() = default;
@@ -68,25 +69,34 @@ class ICEBERG_EXPORT FileScanTask : public ScanTask {
6869
int64_t estimated_row_count() const override;
6970

7071
private:
71-
std::shared_ptr<DataFile> data_file_; ///< Data file metadata.
72-
std::vector<std::shared_ptr<DataFile>> delete_files_; ///< Delete files metadata.
73-
74-
int64_t start_; ///< Start byte offset.
75-
int64_t length_; ///< Length in bytes to scan.
76-
77-
std::shared_ptr<Expression> residual_; ///< Residual expression to apply.
72+
/// \brief Data file metadata.
73+
std::shared_ptr<DataFile> data_file_;
74+
/// \brief Delete files metadata.
75+
std::vector<std::shared_ptr<DataFile>> delete_files_;
76+
/// \brief Start byte offset.
77+
int64_t start_;
78+
/// \brief Length in bytes to scan.
79+
int64_t length_;
80+
/// \brief Residual expression to apply.
81+
std::shared_ptr<Expression> residual_;
7882
};
7983

8084
/// \brief Scan context holding snapshot and scan-specific metadata.
8185
struct TableScanContext {
82-
std::shared_ptr<TableMetadata> table_metadata; ///< Table metadata.
83-
std::shared_ptr<Snapshot> snapshot; ///< Snapshot to scan.
84-
std::shared_ptr<Schema> projected_schema; ///< Projected schema.
85-
std::shared_ptr<Expression> filter; ///< Filter expression to apply.
86-
bool case_sensitive = false; ///< Whether the scan is case-sensitive.
87-
std::unordered_map<std::string, std::string>
88-
options; ///< Additional options for the scan.
89-
std::optional<int64_t> limit; ///< Optional limit on the number of rows to scan.
86+
/// \brief Table metadata.
87+
std::shared_ptr<TableMetadata> table_metadata;
88+
/// \brief Snapshot to scan.
89+
std::shared_ptr<Snapshot> snapshot;
90+
/// \brief Projected schema.
91+
std::shared_ptr<Schema> projected_schema;
92+
/// \brief Filter expression to apply.
93+
std::shared_ptr<Expression> filter;
94+
/// \brief Whether the scan is case-sensitive.
95+
bool case_sensitive = false;
96+
/// \brief Additional options for the scan.
97+
std::unordered_map<std::string, std::string> options;
98+
/// \brief Optional limit on the number of rows to scan.
99+
std::optional<int64_t> limit;
90100
};
91101

92102
/// \brief Builder class for creating TableScan instances.
@@ -139,10 +149,14 @@ class ICEBERG_EXPORT TableScanBuilder {
139149
Result<std::unique_ptr<TableScan>> Build();
140150

141151
private:
142-
const Table& table_; ///< Reference to the table to scan.
152+
/// \brief Reference to the table to scan.
153+
const Table& table_;
154+
/// \brief column names to project in the scan.
143155
std::vector<std::string> column_names_;
156+
/// \brief snapshot ID to scan, if specified.
144157
std::optional<int64_t> snapshot_id_;
145-
TableScanContext context_; ///< Context for the scan.
158+
/// \brief Context for the scan, including snapshot, schema, and filter.
159+
TableScanContext context_;
146160
};
147161

148162
/// \brief Represents a configured scan operation on a table.
@@ -176,29 +190,21 @@ class ICEBERG_EXPORT TableScan {
176190
virtual Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles() const = 0;
177191

178192
protected:
193+
/// \brief context for the scan, including snapshot, schema, and filter.
179194
const TableScanContext context_;
195+
/// \brief File I/O instance for reading manifests and data files.
180196
std::shared_ptr<FileIO> file_io_;
181197
};
182198

199+
/// \brief A scan that reads data files and applies delete files to filter rows.
183200
class ICEBERG_EXPORT DataScan : public TableScan {
184201
public:
202+
/// \brief Constructs a DataScan with the given context and file I/O.
185203
DataScan(TableScanContext context, std::shared_ptr<FileIO> file_io);
186204

187205
/// \brief Plans the scan tasks by resolving manifests and data files.
188206
/// \return A Result containing scan tasks or an error.
189207
Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles() const override;
190-
191-
private:
192-
// Use indexed data structures for efficient lookups
193-
struct DeleteFileIndex {
194-
// Index by sequence number for quick filtering
195-
std::multimap<int64_t, ManifestEntry*> sequence_index;
196-
void BuildIndex(const std::vector<std::unique_ptr<ManifestEntry>>& entries);
197-
std::vector<ManifestEntry*> FindRelevantEntries(
198-
const ManifestEntry& data_entry) const;
199-
};
200-
static std::vector<std::shared_ptr<DataFile>> GetMatchedDeletes(
201-
const ManifestEntry& data_entry, const DeleteFileIndex& delete_file_index);
202208
};
203209

204210
} // namespace iceberg

0 commit comments

Comments
 (0)