3333
3434namespace iceberg {
3535
36- namespace {
37- // / \brief Use indexed data structures for efficient lookups
38- class DeleteFileIndex {
39- public:
40- // / \brief Build the index from a list of manifest entries.
41- explicit DeleteFileIndex (const std::vector<std::unique_ptr<ManifestEntry>>& entries) {
42- for (const auto & entry : entries) {
43- const int64_t seq_num =
44- entry->sequence_number .value_or (TableMetadata::kInitialSequenceNumber );
45- sequence_index.emplace (seq_num, entry.get ());
46- }
47- }
48-
49- // / \brief Find delete files that match the sequence number of a data entry.
50- std::vector<ManifestEntry*> FindRelevantEntries (const ManifestEntry& data_entry) const {
51- std::vector<ManifestEntry*> relevant_deletes;
52-
53- // Use lower_bound for efficient range search
54- auto data_sequence_number =
55- data_entry.sequence_number .value_or (TableMetadata::kInitialSequenceNumber );
56- for (auto it = sequence_index.lower_bound (data_sequence_number);
57- it != sequence_index.end (); ++it) {
58- // Additional filtering logic here
59- relevant_deletes.push_back (it->second );
60- }
61-
62- return relevant_deletes;
63- }
64-
65- private:
66- // / \brief Index by sequence number for quick filtering
67- std::multimap<int64_t , ManifestEntry*> sequence_index;
68- };
69-
70- // / \brief Get matched delete files for a given data entry.
71- std::vector<std::shared_ptr<DataFile>> GetMatchedDeletes (
72- const ManifestEntry& data_entry, const DeleteFileIndex& delete_file_index) {
73- const auto relevant_entries = delete_file_index.FindRelevantEntries (data_entry);
74- std::vector<std::shared_ptr<DataFile>> matched_deletes;
75- if (relevant_entries.empty ()) {
76- return matched_deletes;
77- }
78-
79- matched_deletes.reserve (relevant_entries.size ());
80- for (const auto & delete_entry : relevant_entries) {
81- // TODO(gty404): check if the delete entry contains the data entry's file path
82- matched_deletes.emplace_back (delete_entry->data_file );
83- }
84- return matched_deletes;
85- }
86- } // namespace
87-
8836// implement FileScanTask
89- FileScanTask::FileScanTask (std::shared_ptr<DataFile> file,
90- std::vector<std::shared_ptr<DataFile>> delete_files,
91- std::shared_ptr<Expression> residual)
92- : data_file_(std::move(file)),
93- delete_files_ (std::move(delete_files)),
94- residual_(std::move(residual)) {}
37+ FileScanTask::FileScanTask (std::shared_ptr<DataFile> file)
38+ : data_file_(std::move(file)) {}
9539
9640const std::shared_ptr<DataFile>& FileScanTask::data_file () const { return data_file_; }
9741
98- const std::vector<std::shared_ptr<DataFile>>& FileScanTask::delete_files () const {
99- return delete_files_;
100- }
101-
102- int64_t FileScanTask::SizeBytes () const {
103- int64_t size_in_bytes = data_file_->file_size_in_bytes ;
104- std::ranges::for_each (delete_files_, [&size_in_bytes](const auto & delete_file) {
105- size_in_bytes += delete_file->file_size_in_bytes ;
106- });
107- return size_in_bytes;
108- }
42+ int64_t FileScanTask::size_bytes () const { return data_file_->file_size_in_bytes ; }
10943
110- int32_t FileScanTask::FilesCount () const {
111- return static_cast <int32_t >(delete_files_.size () + 1 );
112- }
44+ int32_t FileScanTask::files_count () const { return 1 ; }
11345
114- int64_t FileScanTask::EstimatedRowCount () const {
115- if (data_file_->file_size_in_bytes == 0 ) {
116- return 0 ;
117- }
118- const auto size_in_bytes = data_file_->file_size_in_bytes ;
119- const double scannedFileFraction =
120- static_cast <double >(size_in_bytes) / data_file_->file_size_in_bytes ;
121- return static_cast <int64_t >(scannedFileFraction * data_file_->record_count );
122- }
123-
124- const std::shared_ptr<Expression>& FileScanTask::residual () const { return residual_; }
46+ int64_t FileScanTask::estimated_row_count () const { return data_file_->record_count ; }
12547
12648TableScanBuilder::TableScanBuilder (std::shared_ptr<TableMetadata> table_metadata,
12749 std::shared_ptr<FileIO> file_io)
@@ -175,7 +97,7 @@ Result<std::unique_ptr<TableScan>> TableScanBuilder::Build() {
17597 auto iter = std::ranges::find_if (
17698 table_metadata->snapshots ,
17799 [id = *snapshot_id](const auto & snapshot) { return snapshot->snapshot_id == id; });
178- if (iter == table_metadata->snapshots .end () || *iter == nullptr ) {
100+ if (iter == table_metadata->snapshots .end ()) {
179101 return NotFound (" Snapshot with ID {} is not found" , *snapshot_id);
180102 }
181103 context_.snapshot = *iter;
@@ -243,8 +165,7 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() co
243165 CreateManifestListReader (context_.snapshot ->manifest_list ));
244166 ICEBERG_ASSIGN_OR_RAISE (auto manifest_files, manifest_list_reader->Files ());
245167
246- std::vector<std::unique_ptr<ManifestEntry>> data_entries;
247- std::vector<std::unique_ptr<ManifestEntry>> positional_delete_entries;
168+ std::vector<std::shared_ptr<FileScanTask>> tasks;
248169 for (const auto & manifest_file : manifest_files) {
249170 ICEBERG_ASSIGN_OR_RAISE (auto manifest_reader,
250171 CreateManifestReader (manifest_file->manifest_path ));
@@ -256,29 +177,15 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() co
256177 const auto & data_file = manifest_entry->data_file ;
257178 switch (data_file->content ) {
258179 case DataFile::Content::kData :
259- data_entries. push_back (std::move (manifest_entry));
180+ tasks. emplace_back (std::make_shared<FileScanTask> (manifest_entry-> data_file ));
260181 break ;
261182 case DataFile::Content::kPositionDeletes :
262- // TODO(gty404): check if the sequence number is greater than or equal to the
263- // minimum sequence number of all manifest entries
264- positional_delete_entries.push_back (std::move (manifest_entry));
265- break ;
266183 case DataFile::Content::kEqualityDeletes :
267- return NotSupported (" Equality deletes are not supported in data scan" );
184+ return NotSupported (" Equality/Position deletes are not supported in data scan" );
268185 }
269186 }
270187 }
271188
272- // TODO(gty404): build residual expression from filter
273- std::shared_ptr<Expression> residual;
274- std::vector<std::shared_ptr<FileScanTask>> tasks;
275- DeleteFileIndex delete_file_index (positional_delete_entries);
276- for (const auto & data_entry : data_entries) {
277- auto matched_deletes = GetMatchedDeletes (*data_entry, delete_file_index);
278- const auto & data_file = data_entry->data_file ;
279- tasks.emplace_back (
280- std::make_shared<FileScanTask>(data_file, std::move (matched_deletes), residual));
281- }
282189 return tasks;
283190}
284191
0 commit comments