Skip to content

Commit 90039fb

Browse files
committed
fix some review comments
1 parent 978dfcf commit 90039fb

File tree

6 files changed

+40
-56
lines changed

6 files changed

+40
-56
lines changed

src/iceberg/parquet/parquet_reader.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,12 +237,12 @@ class ParquetReader::Impl {
237237
std::shared_ptr<::iceberg::Schema> read_schema_;
238238
// The projection result to apply to the read schema.
239239
SchemaProjection projection_;
240+
// The input stream to read Parquet file.
241+
std::shared_ptr<::arrow::io::RandomAccessFile> input_stream_;
240242
// Parquet file reader to create RecordBatchReader.
241243
std::unique_ptr<::parquet::arrow::FileReader> reader_;
242244
// The context to keep track of the reading progress.
243245
std::unique_ptr<ReadContext> context_;
244-
// The input stream to read Parquet file.
245-
std::shared_ptr<::arrow::io::RandomAccessFile> input_stream_;
246246
};
247247

248248
ParquetReader::~ParquetReader() = default;

src/iceberg/parquet/parquet_writer.cc

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,9 @@ class ParquetWriter::Impl {
6666
auto schema_node = std::static_pointer_cast<::parquet::schema::GroupNode>(
6767
schema_descriptor->schema_root());
6868

69-
std::shared_ptr<::arrow::KeyValueMetadata> metadata =
70-
::arrow::key_value_metadata(options.properties);
71-
7269
ICEBERG_ASSIGN_OR_RAISE(output_stream_, OpenOutputStream(options));
7370
auto file_writer = ::parquet::ParquetFileWriter::Open(output_stream_, schema_node,
74-
writer_properties, metadata);
71+
writer_properties);
7572
ICEBERG_ARROW_RETURN_NOT_OK(::parquet::arrow::FileWriter::Make(
7673
pool_, std::move(file_writer), arrow_schema_, arrow_writer_properties, &writer_));
7774

@@ -106,10 +103,10 @@ class ParquetWriter::Impl {
106103
::arrow::MemoryPool* pool_ = ::arrow::default_memory_pool();
107104
// Schema to write from the Parquet file.
108105
std::shared_ptr<::arrow::Schema> arrow_schema_;
109-
// Parquet file writer to write ArrowArray.
110-
std::unique_ptr<::parquet::arrow::FileWriter> writer_;
111106
// The output stream to write Parquet file.
112107
std::shared_ptr<::arrow::io::OutputStream> output_stream_;
108+
// Parquet file writer to write ArrowArray.
109+
std::unique_ptr<::parquet::arrow::FileWriter> writer_;
113110
};
114111

115112
ParquetWriter::~ParquetWriter() = default;

src/iceberg/schema_internal.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <optional>
2525
#include <string>
2626

27+
#include "iceberg/constants.h"
2728
#include "iceberg/schema.h"
2829
#include "iceberg/type.h"
2930
#include "iceberg/util/macros.h"
@@ -45,7 +46,7 @@ ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view n
4546
NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&metadata_buffer, nullptr));
4647
if (field_id.has_value()) {
4748
NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderAppend(
48-
&metadata_buffer, ArrowCharView(std::string(kFieldIdKey).c_str()),
49+
&metadata_buffer, ArrowCharView(std::string(kParquetFieldIdKey).c_str()),
4950
ArrowCharView(std::to_string(field_id.value()).c_str())));
5051
}
5152

@@ -185,8 +186,8 @@ int32_t GetFieldId(const ArrowSchema& schema) {
185186
return kUnknownFieldId;
186187
}
187188

188-
ArrowStringView field_id_key{.data = kFieldIdKey.data(),
189-
.size_bytes = kFieldIdKey.size()};
189+
ArrowStringView field_id_key{.data = kParquetFieldIdKey.data(),
190+
.size_bytes = kParquetFieldIdKey.size()};
190191
ArrowStringView field_id_value;
191192
if (ArrowMetadataGetValue(schema.metadata, field_id_key, &field_id_value) !=
192193
NANOARROW_OK) {

src/iceberg/schema_internal.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,12 @@
2424

2525
#include <nanoarrow/nanoarrow.h>
2626

27-
#include "iceberg/constants.h"
2827
#include "iceberg/iceberg_export.h"
2928
#include "iceberg/result.h"
3029
#include "iceberg/type_fwd.h"
3130

3231
namespace iceberg {
3332

34-
// Apache Arrow C++ uses "PARQUET:field_id" to store field IDs for Parquet.
35-
// Here we follow a similar convention for Iceberg but we might also add
36-
// "PARQUET:field_id" in the future once we implement a Parquet writer.
37-
constexpr std::string_view kFieldIdKey = kParquetFieldIdKey;
38-
3933
/// \brief Convert an Iceberg schema to an Arrow schema.
4034
///
4135
/// \param[in] schema The Iceberg schema to convert.

test/arrow_test.cc

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <gtest/gtest.h>
3030

3131
#include "iceberg/arrow_c_data_internal.h"
32+
#include "iceberg/constants.h"
3233
#include "iceberg/schema.h"
3334
#include "iceberg/schema_internal.h"
3435
#include "matchers.h"
@@ -98,8 +99,8 @@ TEST_P(ToArrowSchemaTest, PrimitiveType) {
9899
ASSERT_TRUE(field->type()->Equals(param.arrow_type));
99100

100101
auto metadata = field->metadata();
101-
ASSERT_TRUE(metadata->Contains(kFieldIdKey));
102-
ASSERT_EQ(metadata->Get(kFieldIdKey), std::to_string(kFieldId));
102+
ASSERT_TRUE(metadata->Contains(kParquetFieldIdKey));
103+
ASSERT_EQ(metadata->Get(kParquetFieldIdKey), std::to_string(kFieldId));
103104
}
104105

105106
INSTANTIATE_TEST_SUITE_P(
@@ -146,8 +147,8 @@ void CheckArrowField(const ::arrow::Field& field, ::arrow::Type::type type_id,
146147

147148
auto metadata = field.metadata();
148149
ASSERT_TRUE(metadata != nullptr);
149-
ASSERT_TRUE(metadata->Contains(kFieldIdKey));
150-
ASSERT_EQ(metadata->Get(kFieldIdKey), std::to_string(field_id));
150+
ASSERT_TRUE(metadata->Contains(kParquetFieldIdKey));
151+
ASSERT_EQ(metadata->Get(kParquetFieldIdKey), std::to_string(field_id));
151152
}
152153

153154
} // namespace
@@ -275,7 +276,7 @@ TEST_P(FromArrowSchemaTest, PrimitiveType) {
275276

276277
auto metadata =
277278
::arrow::key_value_metadata(std::unordered_map<std::string, std::string>{
278-
{std::string(kFieldIdKey), std::to_string(kFieldId)}});
279+
{std::string(kParquetFieldIdKey), std::to_string(kFieldId)}});
279280
auto arrow_schema = ::arrow::schema({::arrow::field(
280281
std::string(kFieldName), param.arrow_type, param.optional, std::move(metadata))});
281282
ArrowSchema exported_schema;
@@ -343,16 +344,16 @@ TEST(FromArrowSchemaTest, StructType) {
343344
auto int_field = ::arrow::field(
344345
std::string(kIntFieldName), ::arrow::int32(), /*nullable=*/false,
345346
::arrow::key_value_metadata(std::unordered_map<std::string, std::string>{
346-
{std::string(kFieldIdKey), std::to_string(kIntFieldId)}}));
347+
{std::string(kParquetFieldIdKey), std::to_string(kIntFieldId)}}));
347348
auto str_field = ::arrow::field(
348349
std::string(kStrFieldName), ::arrow::utf8(), /*nullable=*/true,
349350
::arrow::key_value_metadata(std::unordered_map<std::string, std::string>{
350-
{std::string(kFieldIdKey), std::to_string(kStrFieldId)}}));
351+
{std::string(kParquetFieldIdKey), std::to_string(kStrFieldId)}}));
351352
auto struct_type = ::arrow::struct_({int_field, str_field});
352353
auto struct_field = ::arrow::field(
353354
std::string(kStructFieldName), struct_type, /*nullable=*/false,
354355
::arrow::key_value_metadata(std::unordered_map<std::string, std::string>{
355-
{std::string(kFieldIdKey), std::to_string(kStructFieldId)}}));
356+
{std::string(kParquetFieldIdKey), std::to_string(kStructFieldId)}}));
356357
auto arrow_schema = ::arrow::schema({struct_field});
357358
ArrowSchema exported_schema;
358359
ASSERT_TRUE(::arrow::ExportSchema(*arrow_schema, &exported_schema).ok());
@@ -397,12 +398,12 @@ TEST(FromArrowSchemaTest, ListType) {
397398
auto element_field = ::arrow::field(
398399
std::string(kElemFieldName), ::arrow::int64(), /*nullable=*/true,
399400
::arrow::key_value_metadata(std::unordered_map<std::string, std::string>{
400-
{std::string(kFieldIdKey), std::to_string(kElemFieldId)}}));
401+
{std::string(kParquetFieldIdKey), std::to_string(kElemFieldId)}}));
401402
auto list_type = ::arrow::list(element_field);
402403
auto list_field = ::arrow::field(
403404
std::string(kListFieldName), list_type, /*nullable=*/false,
404405
::arrow::key_value_metadata(std::unordered_map<std::string, std::string>{
405-
{std::string(kFieldIdKey), std::to_string(kListFieldId)}}));
406+
{std::string(kParquetFieldIdKey), std::to_string(kListFieldId)}}));
406407
auto arrow_schema = ::arrow::schema({list_field});
407408

408409
ArrowSchema exported_schema;
@@ -444,16 +445,16 @@ TEST(FromArrowSchemaTest, MapType) {
444445
auto key_field = ::arrow::field(
445446
std::string(kKeyFieldName), ::arrow::utf8(), /*nullable=*/false,
446447
::arrow::key_value_metadata(std::unordered_map<std::string, std::string>{
447-
{std::string(kFieldIdKey), std::to_string(kKeyFieldId)}}));
448+
{std::string(kParquetFieldIdKey), std::to_string(kKeyFieldId)}}));
448449
auto value_field = ::arrow::field(
449450
std::string(kValueFieldName), ::arrow::int32(), /*nullable=*/true,
450451
::arrow::key_value_metadata(std::unordered_map<std::string, std::string>{
451-
{std::string(kFieldIdKey), std::to_string(kValueFieldId)}}));
452+
{std::string(kParquetFieldIdKey), std::to_string(kValueFieldId)}}));
452453
auto map_type = std::make_shared<::arrow::MapType>(key_field, value_field);
453454
auto map_field = ::arrow::field(
454455
std::string(kMapFieldName), map_type, /*nullable=*/true,
455456
::arrow::key_value_metadata(std::unordered_map<std::string, std::string>{
456-
{std::string(kFieldIdKey), std::to_string(kFieldId)}}));
457+
{std::string(kParquetFieldIdKey), std::to_string(kFieldId)}}));
457458
auto arrow_schema = ::arrow::schema({map_field});
458459

459460
ArrowSchema exported_schema;

test/parquet_test.cc

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,19 @@
2626
#include <arrow/table.h>
2727
#include <arrow/type.h>
2828
#include <arrow/util/key_value_metadata.h>
29-
#include <iceberg/arrow_c_data_guard_internal.h>
30-
#include <iceberg/file_reader.h>
31-
#include <iceberg/file_writer.h>
32-
#include <iceberg/result.h>
33-
#include <iceberg/schema_field.h>
34-
#include <iceberg/schema_internal.h>
3529
#include <parquet/arrow/reader.h>
3630
#include <parquet/arrow/writer.h>
3731
#include <parquet/metadata.h>
3832

3933
#include "iceberg/arrow/arrow_error_transform_internal.h"
4034
#include "iceberg/arrow/arrow_fs_file_io_internal.h"
35+
#include "iceberg/file_reader.h"
36+
#include "iceberg/file_writer.h"
4137
#include "iceberg/parquet/parquet_register.h"
38+
#include "iceberg/result.h"
4239
#include "iceberg/schema.h"
40+
#include "iceberg/schema_field.h"
41+
#include "iceberg/schema_internal.h"
4342
#include "iceberg/type.h"
4443
#include "iceberg/util/checked_cast.h"
4544
#include "iceberg/util/macros.h"
@@ -51,15 +50,11 @@ namespace {
5150

5251
Status WriteTable(std::shared_ptr<::arrow::Array> data,
5352
const WriterOptions& writer_options) {
54-
ArrowArray arr;
55-
ICEBERG_ARROW_RETURN_NOT_OK(::arrow::ExportArray(*data, &arr));
56-
5753
ICEBERG_ASSIGN_OR_RAISE(
5854
auto writer, WriterFactoryRegistry::Open(FileFormatType::kParquet, writer_options));
59-
{
60-
internal::ArrowArrayGuard guard(&arr);
61-
ICEBERG_RETURN_UNEXPECTED(writer->Write(arr));
62-
}
55+
ArrowArray arr;
56+
ICEBERG_ARROW_RETURN_NOT_OK(::arrow::ExportArray(*data, &arr));
57+
ICEBERG_RETURN_UNEXPECTED(writer->Write(arr));
6358
return writer->Close();
6459
}
6560

@@ -323,13 +318,10 @@ TEST_F(ParquetReadWrite, SimpleStructRoundTrip) {
323318
ASSERT_THAT(ToArrowSchema(*schema, &arrow_c_schema), IsOk());
324319
auto arrow_schema = ::arrow::ImportType(&arrow_c_schema).ValueOrDie();
325320

326-
auto array =
327-
::arrow::json::ArrayFromJSONString(::arrow::struct_(arrow_schema->fields()),
328-
R"([[{"a1": 1, "a2": "abc"}],
329-
[{"a1": 0}],
330-
[{"a2": "edf"}],
331-
[{}]])")
332-
.ValueOrDie();
321+
auto array = ::arrow::json::ArrayFromJSONString(
322+
::arrow::struct_(arrow_schema->fields()),
323+
R"([[{"a1": 1, "a2": "abc"}], [{"a1": 0}], [{"a2": "edf"}], [{}]])")
324+
.ValueOrDie();
333325

334326
std::shared_ptr<::arrow::Array> out;
335327
DoRoundtrip(array, schema, &out);
@@ -353,12 +345,11 @@ TEST_F(ParquetReadWrite, SimpleTypeRoundTrip) {
353345
ASSERT_THAT(ToArrowSchema(*schema, &arrow_c_schema), IsOk());
354346
auto arrow_schema = ::arrow::ImportType(&arrow_c_schema).ValueOrDie();
355347

356-
auto array = ::arrow::json::ArrayFromJSONString(
357-
::arrow::struct_(arrow_schema->fields()),
358-
R"([[true, 1, 2, 1.1, 1.2, "abc", 44614000, 1756696503000000],
359-
[false, 0, 0, 0, 0, "", 0, 0],
360-
[null, null, null, null, null, null, null, null]])")
361-
.ValueOrDie();
348+
auto array =
349+
::arrow::json::ArrayFromJSONString(
350+
::arrow::struct_(arrow_schema->fields()),
351+
R"([[true, 1, 2, 1.1, 1.2, "abc", 44614000, 1756696503000000], [false, 0, 0, 0, 0, "", 0, 0], [null, null, null, null, null, null, null, null]])")
352+
.ValueOrDie();
362353

363354
std::shared_ptr<::arrow::Array> out;
364355
DoRoundtrip(array, schema, &out);

0 commit comments

Comments
 (0)