Skip to content

Commit 607bc4e

Browse files
committed
fix review
1 parent cf2a0ea commit 607bc4e

File tree

5 files changed

+72
-83
lines changed

5 files changed

+72
-83
lines changed

src/iceberg/file_reader.h

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,24 +37,12 @@ namespace iceberg {
3737
/// \brief Base reader class to read data from different file formats.
3838
class ICEBERG_EXPORT Reader : public ArrowArrayReader {
3939
public:
40-
virtual ~Reader() = default;
4140
Reader() = default;
4241
Reader(const Reader&) = delete;
4342
Reader& operator=(const Reader&) = delete;
4443

4544
/// \brief Open the reader.
4645
virtual Status Open(const struct ReaderOptions& options) = 0;
47-
48-
/// \brief Close the reader.
49-
Status Close() override = 0;
50-
51-
/// \brief Read next data from file.
52-
///
53-
/// \return std::nullopt if the reader has no more data, otherwise `ArrowArray`.
54-
Result<std::optional<ArrowArray>> Next() override = 0;
55-
56-
/// \brief Get the schema of the data.
57-
Result<ArrowSchema> Schema() const override = 0;
5846
};
5947

6048
/// \brief A split of the file to read.

src/iceberg/table_scan.cc

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
#include <ranges>
2424
#include <utility>
2525

26-
#include <iceberg/file_format.h>
27-
2826
#include "iceberg/file_reader.h"
2927
#include "iceberg/manifest_entry.h"
3028
#include "iceberg/manifest_list.h"
@@ -37,6 +35,33 @@
3735

3836
namespace iceberg {
3937

38+
// implement FileScanTask
39+
FileScanTask::FileScanTask(std::shared_ptr<DataFile> data_file)
40+
: data_file_(std::move(data_file)) {}
41+
42+
const std::shared_ptr<DataFile>& FileScanTask::data_file() const { return data_file_; }
43+
44+
int64_t FileScanTask::size_bytes() const { return data_file_->file_size_in_bytes; }
45+
46+
int32_t FileScanTask::files_count() const { return 1; }
47+
48+
int64_t FileScanTask::estimated_row_count() const { return data_file_->record_count; }
49+
50+
Result<std::unique_ptr<ArrowArrayReader>> FileScanTask::ToArrowArrayReader(
51+
const std::shared_ptr<Schema>& projected_schema,
52+
const std::shared_ptr<Expression>& filter, const std::shared_ptr<FileIO>& io) const {
53+
const ReaderOptions options{.path = data_file_->file_path,
54+
.length = data_file_->file_size_in_bytes,
55+
.io = io,
56+
.projection = projected_schema,
57+
.filter = filter};
58+
59+
ICEBERG_ASSIGN_OR_RAISE(auto reader,
60+
ReaderFactoryRegistry::Open(data_file_->file_format, options));
61+
62+
return std::move(reader);
63+
}
64+
4065
TableScanBuilder::TableScanBuilder(std::shared_ptr<TableMetadata> table_metadata,
4166
std::shared_ptr<FileIO> file_io)
4267
: file_io_(std::move(file_io)) {
@@ -170,29 +195,4 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() co
170195
return tasks;
171196
}
172197

173-
FileScanTask::FileScanTask(std::shared_ptr<DataFile> data_file)
174-
: data_file_(std::move(data_file)) {}
175-
176-
const std::shared_ptr<DataFile>& FileScanTask::data_file() const { return data_file_; }
177-
178-
int64_t FileScanTask::size_bytes() const { return data_file_->file_size_in_bytes; }
179-
180-
int32_t FileScanTask::files_count() const { return 1; }
181-
182-
int64_t FileScanTask::estimated_row_count() const { return data_file_->record_count; }
183-
184-
Result<std::unique_ptr<ArrowArrayReader>> FileScanTask::ToArrowArrayReader(
185-
const TableScanContext& context, const std::shared_ptr<FileIO>& io) const {
186-
const ReaderOptions options{.path = data_file_->file_path,
187-
.length = data_file_->file_size_in_bytes,
188-
.io = io,
189-
.projection = context.projected_schema,
190-
.filter = context.filter};
191-
192-
ICEBERG_ASSIGN_OR_RAISE(auto reader,
193-
ReaderFactoryRegistry::Open(data_file_->file_format, options));
194-
195-
return std::move(reader);
196-
}
197-
198198
} // namespace iceberg

src/iceberg/table_scan.h

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,43 @@ class ICEBERG_EXPORT ScanTask {
4343
virtual int64_t estimated_row_count() const = 0;
4444
};
4545

46+
/// \brief Task representing a data file and its corresponding delete files.
47+
class ICEBERG_EXPORT FileScanTask : public ScanTask {
48+
public:
49+
explicit FileScanTask(std::shared_ptr<DataFile> data_file);
50+
51+
/// \brief The data file that should be read by this scan task.
52+
const std::shared_ptr<DataFile>& data_file() const;
53+
54+
/// \brief The total size in bytes of the file split to be read.
55+
int64_t size_bytes() const override;
56+
57+
/// \brief The number of files that should be read by this scan task.
58+
int32_t files_count() const override;
59+
60+
/// \brief The number of rows that should be read by this scan task.
61+
int64_t estimated_row_count() const override;
62+
63+
/**
64+
* \brief Returns an ArrowArrayReader to read the data for this task.
65+
*
66+
* This acts as a factory to instantiate a file-format-specific reader (e.g., Parquet)
67+
* based on the metadata in this task and the provided parameters.
68+
*
69+
* \param projected_schema The projected schema for reading the data.
70+
* \param filter Optional filter expression to apply during reading.
71+
* \param io The FileIO instance for accessing the file data.
72+
* \return A Result containing a unique pointer to the reader, or an error on failure.
73+
*/
74+
Result<std::unique_ptr<ArrowArrayReader>> ToArrowArrayReader(
75+
const std::shared_ptr<Schema>& projected_schema,
76+
const std::shared_ptr<Expression>& filter, const std::shared_ptr<FileIO>& io) const;
77+
78+
private:
79+
/// \brief Data file metadata.
80+
std::shared_ptr<DataFile> data_file_;
81+
};
82+
4683
/// \brief Scan context holding snapshot and scan-specific metadata.
4784
struct TableScanContext {
4885
/// \brief Table metadata.
@@ -169,40 +206,4 @@ class ICEBERG_EXPORT DataTableScan : public TableScan {
169206
Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles() const override;
170207
};
171208

172-
/// \brief Task representing a data file and its corresponding delete files.
173-
class ICEBERG_EXPORT FileScanTask : public ScanTask {
174-
public:
175-
explicit FileScanTask(std::shared_ptr<DataFile> data_file);
176-
177-
/// \brief The data file that should be read by this scan task.
178-
const std::shared_ptr<DataFile>& data_file() const;
179-
180-
/// \brief The total size in bytes of the file split to be read.
181-
int64_t size_bytes() const override;
182-
183-
/// \brief The number of files that should be read by this scan task.
184-
int32_t files_count() const override;
185-
186-
/// \brief The number of rows that should be read by this scan task.
187-
int64_t estimated_row_count() const override;
188-
189-
/**
190-
* \brief Creates and returns an ArrowArrayReader to read the data for this task.
191-
*
192-
* This acts as a factory to instantiate a file-format-specific reader (e.g., Parquet)
193-
* based on the metadata in this task and the provided context.
194-
*
195-
* \param context The table scan context, used to configure the reader (e.g., with the
196-
* projected schema).
197-
* \param io The FileIO instance for accessing the file data.
198-
* \return A Result containing a unique pointer to the reader, or an error on failure.
199-
*/
200-
Result<std::unique_ptr<ArrowArrayReader>> ToArrowArrayReader(
201-
const TableScanContext& context, const std::shared_ptr<FileIO>& io) const;
202-
203-
private:
204-
/// \brief Data file metadata.
205-
std::shared_ptr<DataFile> data_file_;
206-
};
207-
208209
} // namespace iceberg

test/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ if(ICEBERG_BUILD_BUNDLE)
122122
SOURCES
123123
parquet_data_test.cc
124124
parquet_schema_test.cc
125-
parquet_test.cc
126-
file_scan_task_test.cc)
125+
parquet_test.cc)
126+
127+
add_iceberg_test(scan_test USE_BUNDLE SOURCES file_scan_task_test.cc)
128+
127129
endif()

test/file_scan_task_test.cc

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -124,14 +124,13 @@ TEST_F(FileScanTaskTest, ReadFullSchema) {
124124
data_file->file_size_in_bytes =
125125
io_internal.fs()->GetFileInfo(temp_parquet_file_).ValueOrDie().size();
126126

127-
TableScanContext context;
128-
context.projected_schema = std::make_shared<Schema>(
127+
auto projected_schema = std::make_shared<Schema>(
129128
std::vector<SchemaField>{SchemaField::MakeRequired(1, "id", int32()),
130129
SchemaField::MakeOptional(2, "name", string())});
131130

132131
FileScanTask task(data_file);
133132

134-
auto reader_result = task.ToArrowArrayReader(context, file_io_);
133+
auto reader_result = task.ToArrowArrayReader(projected_schema, nullptr, file_io_);
135134
ASSERT_THAT(reader_result, IsOk());
136135
auto reader = std::move(reader_result.value());
137136

@@ -149,14 +148,13 @@ TEST_F(FileScanTaskTest, ReadProjectedAndReorderedSchema) {
149148
data_file->file_size_in_bytes =
150149
io_internal.fs()->GetFileInfo(temp_parquet_file_).ValueOrDie().size();
151150

152-
TableScanContext context;
153-
context.projected_schema = std::make_shared<Schema>(
151+
auto projected_schema = std::make_shared<Schema>(
154152
std::vector<SchemaField>{SchemaField::MakeOptional(2, "name", string()),
155153
SchemaField::MakeOptional(3, "score", float64())});
156154

157155
FileScanTask task(data_file);
158156

159-
auto reader_result = task.ToArrowArrayReader(context, file_io_);
157+
auto reader_result = task.ToArrowArrayReader(projected_schema, nullptr, file_io_);
160158
ASSERT_THAT(reader_result, IsOk());
161159
auto reader = std::move(reader_result.value());
162160

0 commit comments

Comments
 (0)