Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions src/iceberg/arrow_array_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

#include <optional>

#include "iceberg/arrow_c_data.h"
#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"

namespace iceberg {

/// \brief A reader interface that returns ArrowArray in a streaming fashion.
class ICEBERG_EXPORT ArrowArrayReader {
Comment on lines +30 to +31
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why ArrowArray instead of RecordBatches?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@zeroshade Could you help review #214 which returns ArrowArrayStream?

public:
/// \brief Read next batch of data.
///
/// \return std::nullopt if the reader has no more data, otherwise `ArrowArray`.
virtual Result<std::optional<ArrowArray>> Next() = 0;

/// \brief Get schema of data returned by `Next`.
virtual Result<ArrowSchema> Schema() const = 0;

/// \brief Close this reader and release all resources.
virtual Status Close() = 0;

virtual ~ArrowArrayReader() = default;
};

} // namespace iceberg
8 changes: 4 additions & 4 deletions src/iceberg/avro/avro_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ class AvroReader::Impl {
return {};
}

Result<ArrowSchema> Schema() {
Result<ArrowSchema> Schema() const {
if (!context_) {
ICEBERG_RETURN_UNEXPECTED(InitReadContext());
}
Expand All @@ -174,7 +174,7 @@ class AvroReader::Impl {
}

private:
Status InitReadContext() {
Status InitReadContext() const {
context_ = std::make_unique<ReadContext>();
context_->datum_ = std::make_unique<::avro::GenericDatum>(reader_->readerSchema());

Expand Down Expand Up @@ -232,14 +232,14 @@ class AvroReader::Impl {
// The avro reader to read the data into a datum.
std::unique_ptr<::avro::DataFileReader<::avro::GenericDatum>> reader_;
// The context to keep track of the reading progress.
std::unique_ptr<ReadContext> context_;
mutable std::unique_ptr<ReadContext> context_;
};

// Defined out-of-line so the destructor of the pimpl member is instantiated
// here, where Impl is a complete type (required for std::unique_ptr<Impl>).
AvroReader::~AvroReader() = default;

// Forward to the pimpl implementation.
Result<std::optional<ArrowArray>> AvroReader::Next() {
  return impl_->Next();
}

Result<ArrowSchema> AvroReader::Schema() { return impl_->Schema(); }
// Forward to the pimpl implementation.
Result<ArrowSchema> AvroReader::Schema() const {
  return impl_->Schema();
}

Status AvroReader::Open(const ReaderOptions& options) {
impl_ = std::make_unique<Impl>();
Expand Down
2 changes: 1 addition & 1 deletion src/iceberg/avro/avro_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class ICEBERG_BUNDLE_EXPORT AvroReader : public Reader {

Result<std::optional<ArrowArray>> Next() final;

Result<ArrowSchema> Schema() final;
Result<ArrowSchema> Schema() const final;

private:
class Impl;
Expand Down
15 changes: 2 additions & 13 deletions src/iceberg/file_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <memory>
#include <optional>

#include "iceberg/arrow_array_reader.h"
#include "iceberg/arrow_c_data.h"
#include "iceberg/file_format.h"
#include "iceberg/result.h"
Expand All @@ -34,26 +35,14 @@
namespace iceberg {

/// \brief Base reader class to read data from different file formats.
class ICEBERG_EXPORT Reader {
class ICEBERG_EXPORT Reader : public ArrowArrayReader {
public:
virtual ~Reader() = default;
Reader() = default;
Reader(const Reader&) = delete;
Reader& operator=(const Reader&) = delete;

/// \brief Open the reader.
virtual Status Open(const struct ReaderOptions& options) = 0;

/// \brief Close the reader.
virtual Status Close() = 0;

/// \brief Read next data from file.
///
/// \return std::nullopt if the reader has no more data, otherwise `ArrowArray`.
virtual Result<std::optional<ArrowArray>> Next() = 0;

/// \brief Get the schema of the data.
virtual Result<ArrowSchema> Schema() = 0;
};

/// \brief A split of the file to read.
Expand Down
8 changes: 4 additions & 4 deletions src/iceberg/parquet/parquet_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ class ParquetReader::Impl {
}

// Get the schema of the data
Result<ArrowSchema> Schema() {
Result<ArrowSchema> Schema() const {
if (!context_) {
ICEBERG_RETURN_UNEXPECTED(InitReadContext());
}
Expand All @@ -185,7 +185,7 @@ class ParquetReader::Impl {
}

private:
Status InitReadContext() {
Status InitReadContext() const {
context_ = std::make_unique<ReadContext>();

// Build the output Arrow schema
Expand Down Expand Up @@ -239,14 +239,14 @@ class ParquetReader::Impl {
// Parquet file reader to create RecordBatchReader.
std::unique_ptr<::parquet::arrow::FileReader> reader_;
// The context to keep track of the reading progress.
std::unique_ptr<ReadContext> context_;
mutable std::unique_ptr<ReadContext> context_;
};

// Defined out-of-line so the destructor of the pimpl member is instantiated
// here, where Impl is a complete type (required for std::unique_ptr<Impl>).
ParquetReader::~ParquetReader() = default;

// Forward to the pimpl implementation.
Result<std::optional<ArrowArray>> ParquetReader::Next() {
  return impl_->Next();
}

Result<ArrowSchema> ParquetReader::Schema() { return impl_->Schema(); }
// Forward to the pimpl implementation.
Result<ArrowSchema> ParquetReader::Schema() const {
  return impl_->Schema();
}

Status ParquetReader::Open(const ReaderOptions& options) {
impl_ = std::make_unique<Impl>();
Expand Down
2 changes: 1 addition & 1 deletion src/iceberg/parquet/parquet_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class ICEBERG_BUNDLE_EXPORT ParquetReader : public Reader {

Result<std::optional<ArrowArray>> Next() final;

Result<ArrowSchema> Schema() final;
Result<ArrowSchema> Schema() const final;

private:
class Impl;
Expand Down
19 changes: 16 additions & 3 deletions src/iceberg/table_scan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@

#include "iceberg/table_scan.h"

#include <algorithm>
#include <ranges>

#include "iceberg/file_reader.h"
#include "iceberg/manifest_entry.h"
#include "iceberg/manifest_list.h"
#include "iceberg/manifest_reader.h"
Expand All @@ -45,6 +43,21 @@ int32_t FileScanTask::files_count() const { return 1; }

// The row count recorded in the data file's metadata serves as the estimate.
int64_t FileScanTask::estimated_row_count() const { return data_file_->record_count; }

/// \brief Create an ArrowArrayReader that reads the data for this task.
///
/// \param projected_schema The projected schema for reading the data.
/// \param filter Optional filter expression to apply during reading.
/// \param io The FileIO instance for accessing the file data.
/// \return A Result holding the reader, or an error on failure.
Result<std::unique_ptr<ArrowArrayReader>> FileScanTask::ToArrowArrayReader(
    const std::shared_ptr<Schema>& projected_schema,
    const std::shared_ptr<Expression>& filter, const std::shared_ptr<FileIO>& io) const {
  // Assemble reader options from this task's data-file metadata.
  const ReaderOptions reader_options{.path = data_file_->file_path,
                                     .length = data_file_->file_size_in_bytes,
                                     .io = io,
                                     .projection = projected_schema,
                                     .filter = filter};

  // Open the reader registered for this file's format via the factory registry.
  ICEBERG_ASSIGN_OR_RAISE(
      auto file_reader, ReaderFactoryRegistry::Open(data_file_->file_format, reader_options));
  return file_reader;
}

TableScanBuilder::TableScanBuilder(std::shared_ptr<TableMetadata> table_metadata,
std::shared_ptr<FileIO> file_io)
: file_io_(std::move(file_io)) {
Expand Down
15 changes: 15 additions & 0 deletions src/iceberg/table_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <string>
#include <vector>

#include "iceberg/arrow_array_reader.h"
#include "iceberg/manifest_entry.h"
#include "iceberg/type_fwd.h"

Expand Down Expand Up @@ -51,9 +52,23 @@ class ICEBERG_EXPORT FileScanTask : public ScanTask {
const std::shared_ptr<DataFile>& data_file() const;

int64_t size_bytes() const override;

int32_t files_count() const override;

int64_t estimated_row_count() const override;

/// \brief Returns an ArrowArrayReader to read the data for this task.
///
/// \param projected_schema The projected schema for reading the data.
/// \param filter Optional filter expression to apply during reading.
/// \param io The FileIO instance for accessing the file data.
/// \return A Result containing a unique pointer to the reader, or an error on failure.
Result<std::unique_ptr<ArrowArrayReader>> ToArrowArrayReader(
const std::shared_ptr<Schema>& projected_schema,
const std::shared_ptr<Expression>& filter, const std::shared_ptr<FileIO>& io) const;

private:
/// \brief Data file metadata.
std::shared_ptr<DataFile> data_file_;
Expand Down
3 changes: 3 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,7 @@ if(ICEBERG_BUILD_BUNDLE)
parquet_data_test.cc
parquet_schema_test.cc
parquet_test.cc)

add_iceberg_test(scan_test USE_BUNDLE SOURCES file_scan_task_test.cc)

endif()
Loading
Loading