Commit a5ebaf9

completed with test

1 parent b8435b0 commit a5ebaf9

4 files changed: +218 -30 lines changed
src/iceberg/table_scan.cc

Lines changed: 1 addition & 27 deletions
@@ -22,8 +22,6 @@
 #include <cstring>
 #include <vector>
 
-#include <iceberg/result.h>
-
 #include "iceberg/arrow_c_data.h"
 #include "iceberg/file_reader.h"
 #include "iceberg/manifest_entry.h"

@@ -48,7 +46,7 @@ struct ReaderStreamPrivateData {
 
   ~ReaderStreamPrivateData() {
     if (reader) {
-      reader->Close();
+      std::ignore = reader->Close();
     }
   }
 };
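The destructor change above is worth a note: switching to `std::ignore = reader->Close();` suggests `Close()` returns a status/result type marked `[[nodiscard]]`, and a destructor has nowhere to propagate an error, so the return value is discarded explicitly rather than silently. A minimal sketch of the pattern, with an illustrative `Status` type standing in for the project's actual return type:

#include <tuple>  // for std::ignore

// Illustrative stand-in for the library's nodiscard status type.
struct [[nodiscard]] Status {
  bool ok = true;
};

struct Reader {
  Status Close() { return {}; }
};

struct ReaderHolder {
  Reader* reader = nullptr;

  ~ReaderHolder() {
    if (reader) {
      // A destructor cannot report failure, so the nodiscard return
      // value is discarded explicitly instead of raising a warning.
      std::ignore = reader->Close();
    }
  }
};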
@@ -303,28 +301,4 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() co
   return tasks;
 }
 
-Result<std::vector<ArrowArrayStream>> DataTableScan::ToArrow() const {
-  Result<std::vector<std::shared_ptr<FileScanTask>>> tasks_result = PlanFiles();
-  if (!tasks_result.has_value()) {
-    return InvalidArgument("Failed to plan files: {}", tasks_result.error().message);
-  }
-  auto tasks = tasks_result.value();
-  if (tasks.empty()) {
-    // TODO(Li Feiyang): return a empty arrow stream
-    return NotImplemented("No files to scan");
-  }
-
-  std::vector<ArrowArrayStream> arrow_streams;
-  for (const auto& task : tasks_result.value()) {
-    Result<ArrowArrayStream> arrow_stream_result =
-        task->ToArrow(context_.projected_schema, context_.filter, file_io_);
-    if (!arrow_stream_result.has_value()) {
-      return InvalidArgument("Failed to get arrow stream: {}",
-                             arrow_stream_result.error().message);
-    }
-    arrow_streams.push_back(arrow_stream_result.value());
-  }
-  return std::move(arrow_streams);
-}
-
 }  // namespace iceberg
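With the convenience wrapper removed, callers that want the old behavior can compose the two remaining public pieces themselves: `PlanFiles()` plus a per-task `FileScanTask::ToArrow()`. A sketch of such a caller-side helper, adapted from the deleted body; the `OpenStreams` name is hypothetical and not part of the library's API:

// Hypothetical free function reproducing the removed convenience method.
Result<std::vector<ArrowArrayStream>> OpenStreams(
    const DataTableScan& scan, const std::shared_ptr<Schema>& projected_schema,
    const std::shared_ptr<Expression>& filter, const std::shared_ptr<FileIO>& io) {
  auto tasks_result = scan.PlanFiles();
  if (!tasks_result.has_value()) {
    return InvalidArgument("Failed to plan files: {}", tasks_result.error().message);
  }

  std::vector<ArrowArrayStream> streams;
  for (const auto& task : tasks_result.value()) {
    auto stream_result = task->ToArrow(projected_schema, filter, io);
    if (!stream_result.has_value()) {
      return InvalidArgument("Failed to get arrow stream: {}",
                             stream_result.error().message);
    }
    // Move instead of copy: ArrowArrayStream is a C struct that owns
    // resources through its release callback, so ownership should not
    // be duplicated (the deleted code pushed it by value).
    streams.push_back(std::move(stream_result.value()));
  }
  return streams;
}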

src/iceberg/table_scan.h

Lines changed: 8 additions & 3 deletions
@@ -55,8 +55,15 @@ class ICEBERG_EXPORT FileScanTask : public ScanTask {
   int32_t files_count() const override;
   int64_t estimated_row_count() const override;
 
+  /**
+   * \brief Returns a C-ABI compatible ArrowArrayStream to read the data for this task.
+   *
+   * \param projected_schema The projected schema for reading the data.
+   * \param filter Optional filter expression to apply during reading.
+   * \param io The FileIO instance for accessing the file data.
+   * \return A Result containing an ArrowArrayStream, or an error on failure.
+   */
   Result<ArrowArrayStream> ToArrow(const std::shared_ptr<Schema>& projected_schema,
-
                                    const std::shared_ptr<Expression>& filter,
                                    const std::shared_ptr<FileIO>& io) const;
 

@@ -189,8 +196,6 @@ class ICEBERG_EXPORT DataTableScan : public TableScan {
   /// \brief Plans the scan tasks by resolving manifests and data files.
   /// \return A Result containing scan tasks or an error.
   Result<std::vector<std::shared_ptr<FileScanTask>>> PlanFiles() const override;
-
-  Result<std::vector<ArrowArrayStream>> ToArrow() const;
 };
 
 }  // namespace iceberg

test/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -124,4 +124,7 @@ if(ICEBERG_BUILD_BUNDLE)
                    parquet_data_test.cc
                    parquet_schema_test.cc
                    parquet_test.cc)
+
+  add_iceberg_test(scan_test USE_BUNDLE SOURCES file_scan_task_test.cc)
+
 endif()
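Assuming `add_iceberg_test` follows the pattern of the surrounding targets and registers the binary with CTest, the new suite should be runnable from the build directory with `ctest -R scan_test`.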

test/file_scan_task_test.cc

Lines changed: 206 additions & 0 deletions
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <arrow/array.h>
+#include <arrow/c/bridge.h>
+#include <arrow/json/from_string.h>
+#include <arrow/record_batch.h>
+#include <arrow/table.h>
+#include <arrow/util/key_value_metadata.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <parquet/metadata.h>
+
+#include "iceberg/arrow/arrow_fs_file_io_internal.h"
+#include "iceberg/file_format.h"
+#include "iceberg/manifest_entry.h"
+#include "iceberg/parquet/parquet_register.h"
+#include "iceberg/schema.h"
+#include "iceberg/table_scan.h"
+#include "iceberg/type.h"
+#include "iceberg/util/checked_cast.h"
+#include "matchers.h"
+#include "temp_file_test_base.h"
+
+namespace iceberg {
+
+class FileScanTaskTest : public TempFileTestBase {
+ protected:
+  static void SetUpTestSuite() { parquet::RegisterAll(); }
+
+  void SetUp() override {
+    TempFileTestBase::SetUp();
+    file_io_ = arrow::ArrowFileSystemFileIO::MakeLocalFileIO();
+    temp_parquet_file_ = CreateNewTempFilePathWithSuffix(".parquet");
+    CreateSimpleParquetFile();
+  }
+
+  // Helper method to create a Parquet file with sample data.
+  void CreateSimpleParquetFile(int64_t chunk_size = 1024) {
+    const std::string kParquetFieldIdKey = "PARQUET:field_id";
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("id", ::arrow::int32(), /*nullable=*/false,
+                        ::arrow::KeyValueMetadata::Make({kParquetFieldIdKey}, {"1"})),
+         ::arrow::field("name", ::arrow::utf8(), /*nullable=*/true,
+                        ::arrow::KeyValueMetadata::Make({kParquetFieldIdKey}, {"2"}))});
+    auto table = ::arrow::Table::FromRecordBatches(
+                     arrow_schema, {::arrow::RecordBatch::FromStructArray(
+                                        ::arrow::json::ArrayFromJSONString(
+                                            ::arrow::struct_(arrow_schema->fields()),
+                                            R"([[1, "Foo"], [2, "Bar"], [3, "Baz"]])")
+                                            .ValueOrDie())
+                                        .ValueOrDie()})
+                     .ValueOrDie();
+
+    auto io = internal::checked_cast<arrow::ArrowFileSystemFileIO&>(*file_io_);
+    auto outfile = io.fs()->OpenOutputStream(temp_parquet_file_).ValueOrDie();
+
+    ASSERT_TRUE(::parquet::arrow::WriteTable(*table, ::arrow::default_memory_pool(),
+                                             outfile, chunk_size)
+                    .ok());
+  }
+
+  // Helper to create a valid but empty Parquet file.
+  void CreateEmptyParquetFile() {
+    const std::string kParquetFieldIdKey = "PARQUET:field_id";
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("id", ::arrow::int32(), /*nullable=*/false,
+                        ::arrow::KeyValueMetadata::Make({kParquetFieldIdKey}, {"1"}))});
+    auto empty_table = ::arrow::Table::FromRecordBatches(arrow_schema, {}).ValueOrDie();
+
+    auto io = internal::checked_cast<arrow::ArrowFileSystemFileIO&>(*file_io_);
+    auto outfile = io.fs()->OpenOutputStream(temp_parquet_file_).ValueOrDie();
+    ASSERT_TRUE(::parquet::arrow::WriteTable(*empty_table, ::arrow::default_memory_pool(),
+                                             outfile, 1024)
+                    .ok());
+  }
+
+  // Helper method to verify the content of the next batch from an ArrowArrayStream.
+  void VerifyStreamNextBatch(struct ArrowArrayStream* stream,
+                             std::string_view expected_json) {
+    ASSERT_NE(stream->get_schema, nullptr) << "Stream has been released or is invalid.";
+
+    ArrowSchema c_schema;
+    ASSERT_EQ(stream->get_schema(stream, &c_schema), 0);
+    auto import_schema_result = ::arrow::ImportSchema(&c_schema);
+    ASSERT_TRUE(import_schema_result.ok()) << import_schema_result.status().message();
+    auto arrow_schema = import_schema_result.ValueOrDie();
+
+    ArrowArray c_array;
+    ASSERT_EQ(stream->get_next(stream, &c_array), 0)
+        << "get_next failed. Error: " << stream->get_last_error(stream);
+    ASSERT_NE(c_array.release, nullptr) << "Stream is exhausted but expected more data.";
+
+    auto import_batch_result = ::arrow::ImportRecordBatch(&c_array, arrow_schema);
+    ASSERT_TRUE(import_batch_result.ok()) << import_batch_result.status().message();
+    auto actual_batch = import_batch_result.ValueOrDie();
+
+    auto struct_type = ::arrow::struct_(arrow_schema->fields());
+    auto expected_array =
+        ::arrow::json::ArrayFromJSONString(struct_type, expected_json).ValueOrDie();
+    auto expected_batch =
+        ::arrow::RecordBatch::FromStructArray(expected_array).ValueOrDie();
+
+    ASSERT_TRUE(actual_batch->Equals(*expected_batch))
+        << "Actual batch:\n"
+        << actual_batch->ToString() << "\nExpected batch:\n"
+        << expected_batch->ToString();
+  }
+
+  // Helper method to verify that an ArrowArrayStream is exhausted.
+  void VerifyStreamExhausted(struct ArrowArrayStream* stream) {
+    ASSERT_NE(stream->get_next, nullptr) << "Stream has been released or is invalid.";
+    ArrowArray c_array;
+    ASSERT_EQ(stream->get_next(stream, &c_array), 0);
+    ASSERT_EQ(c_array.release, nullptr) << "Stream was not exhausted as expected.";
+  }
+
+  std::shared_ptr<FileIO> file_io_;
+  std::string temp_parquet_file_;
+};
+
+TEST_F(FileScanTaskTest, ReadFullSchema) {
+  auto data_file = std::make_shared<DataFile>();
+  data_file->file_path = temp_parquet_file_;
+  data_file->file_format = FileFormatType::kParquet;
+
+  auto projected_schema = std::make_shared<Schema>(
+      std::vector<SchemaField>{SchemaField::MakeRequired(1, "id", int32()),
+                               SchemaField::MakeOptional(2, "name", string())});
+
+  FileScanTask task(data_file);
+
+  auto stream_result = task.ToArrow(projected_schema, nullptr, file_io_);
+  ASSERT_THAT(stream_result, IsOk());
+  auto stream = std::move(stream_result.value());
+
+  ASSERT_NO_FATAL_FAILURE(
+      VerifyStreamNextBatch(&stream, R"([[1, "Foo"], [2, "Bar"], [3, "Baz"]])"));
+  ASSERT_NO_FATAL_FAILURE(VerifyStreamExhausted(&stream));
+
+  ASSERT_NE(stream.release, nullptr);
+  stream.release(&stream);
+  ASSERT_EQ(stream.release, nullptr);
+  ASSERT_EQ(stream.private_data, nullptr);
+}
+
+TEST_F(FileScanTaskTest, ReadProjectedAndReorderedSchema) {
+  auto data_file = std::make_shared<DataFile>();
+  data_file->file_path = temp_parquet_file_;
+  data_file->file_format = FileFormatType::kParquet;
+
+  auto projected_schema = std::make_shared<Schema>(
+      std::vector<SchemaField>{SchemaField::MakeOptional(2, "name", string()),
+                               SchemaField::MakeOptional(3, "score", float64())});
+
+  FileScanTask task(data_file);
+
+  auto stream_result = task.ToArrow(projected_schema, nullptr, file_io_);
+  ASSERT_THAT(stream_result, IsOk());
+  auto stream = std::move(stream_result.value());
+
+  ASSERT_NO_FATAL_FAILURE(
+      VerifyStreamNextBatch(&stream, R"([["Foo", null], ["Bar", null], ["Baz", null]])"));
+  ASSERT_NO_FATAL_FAILURE(VerifyStreamExhausted(&stream));
+
+  stream.release(&stream);
+}
+
+TEST_F(FileScanTaskTest, ReadEmptyFile) {
+  CreateEmptyParquetFile();
+  auto data_file = std::make_shared<DataFile>();
+  data_file->file_path = temp_parquet_file_;
+  data_file->file_format = FileFormatType::kParquet;
+
+  auto projected_schema = std::make_shared<Schema>(
+      std::vector<SchemaField>{SchemaField::MakeRequired(1, "id", int32())});
+
+  FileScanTask task(data_file);
+
+  auto stream_result = task.ToArrow(projected_schema, nullptr, file_io_);
+  ASSERT_THAT(stream_result, IsOk());
+  auto stream = std::move(stream_result.value());
+
+  // The stream should be immediately exhausted
+  ASSERT_NO_FATAL_FAILURE(VerifyStreamExhausted(&stream));
+
+  stream.release(&stream);
+}
+
+}  // namespace iceberg
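The two `VerifyStream*` helpers above encode the Arrow C stream interface contract the tests rely on: `get_next` returning zero with `array.release == nullptr` means end-of-stream rather than an error, every produced batch must be released by the consumer, and the stream itself must be released exactly once. A condensed consumer loop showing the same protocol (error handling elided; `DrainStream` is an illustrative name, not part of the library):

#include <arrow/c/abi.h>  // ArrowArray, ArrowArrayStream

// Minimal consumer of an ArrowArrayStream per the Arrow C data interface.
void DrainStream(ArrowArrayStream* stream) {
  while (true) {
    ArrowArray batch;
    if (stream->get_next(stream, &batch) != 0) {
      break;  // non-zero return code signals an error
    }
    if (batch.release == nullptr) {
      break;  // an already-released ArrowArray signals end-of-stream
    }
    // ... consume the batch here ...
    batch.release(&batch);  // the consumer owns every batch it receives
  }
  stream->release(stream);  // release the stream itself exactly once
}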
