Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions src/iceberg/arrow_array_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

#include <optional>

#include "iceberg/arrow_c_data.h"
#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"

namespace iceberg {

/// \brief Abstract interface for reading data as Arrow arrays, batch by batch.
///
/// Data is exchanged through the Arrow C Data Interface types (`ArrowArray`,
/// `ArrowSchema`). Concrete implementations include the file-format readers
/// (e.g. Avro, Parquet) that derive from `Reader`.
/// NOTE(review): who invokes the release callback of the returned
/// `ArrowArray`/`ArrowSchema` is not stated here — confirm the ownership
/// contract with the implementations.
class ICEBERG_EXPORT ArrowArrayReader {
 public:
  /// \brief Read next batch of data.
  ///
  /// \return std::nullopt if the reader has no more data, otherwise `ArrowArray`.
  virtual Result<std::optional<ArrowArray>> Next() = 0;

  /// \brief Get schema of data returned by `Next`.
  virtual Result<ArrowSchema> Schema() const = 0;

  /// \brief Close this reader and release all resources.
  virtual Status Close() = 0;

  /// Virtual destructor: instances are deleted through base-class pointers
  /// (see FileScanTask::ToArrowArrayReader returning unique_ptr<ArrowArrayReader>).
  virtual ~ArrowArrayReader() = default;
};

} // namespace iceberg
8 changes: 4 additions & 4 deletions src/iceberg/avro/avro_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ class AvroReader::Impl {
return {};
}

Result<ArrowSchema> Schema() {
Result<ArrowSchema> Schema() const {
if (!context_) {
ICEBERG_RETURN_UNEXPECTED(InitReadContext());
}
Expand All @@ -174,7 +174,7 @@ class AvroReader::Impl {
}

private:
Status InitReadContext() {
Status InitReadContext() const {
context_ = std::make_unique<ReadContext>();
context_->datum_ = std::make_unique<::avro::GenericDatum>(reader_->readerSchema());

Expand Down Expand Up @@ -232,14 +232,14 @@ class AvroReader::Impl {
// The avro reader to read the data into a datum.
std::unique_ptr<::avro::DataFileReader<::avro::GenericDatum>> reader_;
// The context to keep track of the reading progress.
std::unique_ptr<ReadContext> context_;
mutable std::unique_ptr<ReadContext> context_;
};

AvroReader::~AvroReader() = default;

Result<std::optional<ArrowArray>> AvroReader::Next() { return impl_->Next(); }

Result<ArrowSchema> AvroReader::Schema() { return impl_->Schema(); }
Result<ArrowSchema> AvroReader::Schema() const { return impl_->Schema(); }

Status AvroReader::Open(const ReaderOptions& options) {
impl_ = std::make_unique<Impl>();
Expand Down
2 changes: 1 addition & 1 deletion src/iceberg/avro/avro_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class ICEBERG_BUNDLE_EXPORT AvroReader : public Reader {

Result<std::optional<ArrowArray>> Next() final;

Result<ArrowSchema> Schema() final;
Result<ArrowSchema> Schema() const final;

private:
class Impl;
Expand Down
9 changes: 5 additions & 4 deletions src/iceberg/file_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <memory>
#include <optional>

#include "iceberg/arrow_array_reader.h"
#include "iceberg/arrow_c_data.h"
#include "iceberg/file_format.h"
#include "iceberg/result.h"
Expand All @@ -34,7 +35,7 @@
namespace iceberg {

/// \brief Base reader class to read data from different file formats.
class ICEBERG_EXPORT Reader {
class ICEBERG_EXPORT Reader : public ArrowArrayReader {
public:
virtual ~Reader() = default;
Reader() = default;
Expand All @@ -45,15 +46,15 @@ class ICEBERG_EXPORT Reader {
virtual Status Open(const struct ReaderOptions& options) = 0;

/// \brief Close the reader.
virtual Status Close() = 0;
Status Close() override = 0;

/// \brief Read next data from file.
///
/// \return std::nullopt if the reader has no more data, otherwise `ArrowArray`.
virtual Result<std::optional<ArrowArray>> Next() = 0;
Result<std::optional<ArrowArray>> Next() override = 0;

/// \brief Get the schema of the data.
virtual Result<ArrowSchema> Schema() = 0;
Result<ArrowSchema> Schema() const override = 0;
};

/// \brief A split of the file to read.
Expand Down
8 changes: 4 additions & 4 deletions src/iceberg/parquet/parquet_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ class ParquetReader::Impl {
}

// Get the schema of the data
Result<ArrowSchema> Schema() {
Result<ArrowSchema> Schema() const {
if (!context_) {
ICEBERG_RETURN_UNEXPECTED(InitReadContext());
}
Expand All @@ -185,7 +185,7 @@ class ParquetReader::Impl {
}

private:
Status InitReadContext() {
Status InitReadContext() const {
context_ = std::make_unique<ReadContext>();

// Build the output Arrow schema
Expand Down Expand Up @@ -239,14 +239,14 @@ class ParquetReader::Impl {
// Parquet file reader to create RecordBatchReader.
std::unique_ptr<::parquet::arrow::FileReader> reader_;
// The context to keep track of the reading progress.
std::unique_ptr<ReadContext> context_;
mutable std::unique_ptr<ReadContext> context_;
};

ParquetReader::~ParquetReader() = default;

Result<std::optional<ArrowArray>> ParquetReader::Next() { return impl_->Next(); }

Result<ArrowSchema> ParquetReader::Schema() { return impl_->Schema(); }
Result<ArrowSchema> ParquetReader::Schema() const { return impl_->Schema(); }

Status ParquetReader::Open(const ReaderOptions& options) {
impl_ = std::make_unique<Impl>();
Expand Down
2 changes: 1 addition & 1 deletion src/iceberg/parquet/parquet_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class ICEBERG_BUNDLE_EXPORT ParquetReader : public Reader {

Result<std::optional<ArrowArray>> Next() final;

Result<ArrowSchema> Schema() final;
Result<ArrowSchema> Schema() const final;

private:
class Impl;
Expand Down
41 changes: 29 additions & 12 deletions src/iceberg/table_scan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@

#include <algorithm>
#include <ranges>
#include <utility>

#include <iceberg/file_format.h>

#include "iceberg/file_reader.h"
#include "iceberg/manifest_entry.h"
#include "iceberg/manifest_list.h"
#include "iceberg/manifest_reader.h"
Expand All @@ -33,18 +37,6 @@

namespace iceberg {

// implement FileScanTask
FileScanTask::FileScanTask(std::shared_ptr<DataFile> data_file)
: data_file_(std::move(data_file)) {}

const std::shared_ptr<DataFile>& FileScanTask::data_file() const { return data_file_; }

int64_t FileScanTask::size_bytes() const { return data_file_->file_size_in_bytes; }

int32_t FileScanTask::files_count() const { return 1; }

int64_t FileScanTask::estimated_row_count() const { return data_file_->record_count; }

TableScanBuilder::TableScanBuilder(std::shared_ptr<TableMetadata> table_metadata,
std::shared_ptr<FileIO> file_io)
: file_io_(std::move(file_io)) {
Expand Down Expand Up @@ -178,4 +170,29 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() co
return tasks;
}

/// \brief Takes shared ownership of the metadata for the single data file
/// this task reads.
FileScanTask::FileScanTask(std::shared_ptr<DataFile> data_file)
    : data_file_(std::move(data_file)) {}

/// \brief The data file that should be read by this scan task.
const std::shared_ptr<DataFile>& FileScanTask::data_file() const { return data_file_; }

/// \brief Total bytes to read; currently the whole file's size.
int64_t FileScanTask::size_bytes() const { return data_file_->file_size_in_bytes; }

/// \brief Always 1: each FileScanTask covers exactly one data file.
int32_t FileScanTask::files_count() const { return 1; }

/// \brief Row-count estimate taken from the file's recorded metadata.
int64_t FileScanTask::estimated_row_count() const { return data_file_->record_count; }

/// \brief Create a file-format-specific reader for this task's data file.
///
/// Builds ReaderOptions from the task's file metadata plus the scan context
/// (projected schema and filter), then asks the factory registry for a reader
/// matching the file's format.
///
/// \param context The table scan context used to configure the reader.
/// \param io The FileIO instance for accessing the file data.
/// \return A Result holding a unique pointer to the reader, or an error.
Result<std::unique_ptr<ArrowArrayReader>> FileScanTask::ToArrowArrayReader(
    const TableScanContext& context, const std::shared_ptr<FileIO>& io) const {
  const ReaderOptions reader_options{
      .path = data_file_->file_path,
      .length = data_file_->file_size_in_bytes,
      .io = io,
      .projection = context.projected_schema,
      .filter = context.filter,
  };

  ICEBERG_ASSIGN_OR_RAISE(
      auto file_reader,
      ReaderFactoryRegistry::Open(data_file_->file_format, reader_options));

  // Explicit move also performs the upcast from the concrete reader's
  // unique_ptr to unique_ptr<ArrowArrayReader>.
  return std::move(file_reader);
}

} // namespace iceberg
54 changes: 37 additions & 17 deletions src/iceberg/table_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <string>
#include <vector>

#include "iceberg/arrow_array_reader.h"
#include "iceberg/manifest_entry.h"
#include "iceberg/type_fwd.h"

Expand All @@ -42,23 +43,6 @@ class ICEBERG_EXPORT ScanTask {
virtual int64_t estimated_row_count() const = 0;
};

/// \brief Task representing a data file and its corresponding delete files.
class ICEBERG_EXPORT FileScanTask : public ScanTask {
public:
explicit FileScanTask(std::shared_ptr<DataFile> data_file);

/// \brief The data file that should be read by this scan task.
const std::shared_ptr<DataFile>& data_file() const;

int64_t size_bytes() const override;
int32_t files_count() const override;
int64_t estimated_row_count() const override;

private:
/// \brief Data file metadata.
std::shared_ptr<DataFile> data_file_;
};

/// \brief Scan context holding snapshot and scan-specific metadata.
struct TableScanContext {
/// \brief Table metadata.
Expand Down Expand Up @@ -185,4 +169,40 @@ class ICEBERG_EXPORT DataTableScan : public TableScan {
Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles() const override;
};

/// \brief Task representing a data file and its corresponding delete files.
class ICEBERG_EXPORT FileScanTask : public ScanTask {
 public:
  explicit FileScanTask(std::shared_ptr<DataFile> data_file);

  /// \brief The data file that should be read by this scan task.
  const std::shared_ptr<DataFile>& data_file() const;

  /// \brief The total size in bytes of the file split to be read.
  int64_t size_bytes() const override;

  /// \brief The number of files that should be read by this scan task.
  int32_t files_count() const override;

  /// \brief The number of rows that should be read by this scan task.
  int64_t estimated_row_count() const override;

  /// \brief Creates and returns an ArrowArrayReader to read the data for this task.
  ///
  /// This acts as a factory to instantiate a file-format-specific reader (e.g.,
  /// Parquet) based on the metadata in this task and the provided context.
  ///
  /// \param context The table scan context, used to configure the reader (e.g.,
  /// with the projected schema).
  /// \param io The FileIO instance for accessing the file data.
  /// \return A Result containing a unique pointer to the reader, or an error on
  /// failure.
  Result<std::unique_ptr<ArrowArrayReader>> ToArrowArrayReader(
      const TableScanContext& context, const std::shared_ptr<FileIO>& io) const;

 private:
  /// \brief Data file metadata.
  std::shared_ptr<DataFile> data_file_;
};

} // namespace iceberg
3 changes: 2 additions & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,5 +122,6 @@ if(ICEBERG_BUILD_BUNDLE)
SOURCES
parquet_data_test.cc
parquet_schema_test.cc
parquet_test.cc)
parquet_test.cc
file_scan_task_test.cc)
endif()
Loading
Loading