@@ -49,23 +49,11 @@ namespace iceberg {
4949 } \
5050 }
5151
52- #define PARSE_ENUM_FIELD (item, array_view, type ) \
53- for (size_t row_idx = 0 ; row_idx < array_view->length; row_idx++) { \
54- if (!ArrowArrayViewIsNull (array_view, row_idx)) { \
55- auto value = ArrowArrayViewGetIntUnsafe (array_view, row_idx); \
56- item = static_cast <type>(value); \
57- } else if (required) { \
58- return InvalidManifestList (" Field {} is required but null at row {}" , field_name, \
59- row_idx); \
60- } \
61- }
62-
6352#define PARSE_STRING_FIELD (item, array_view ) \
6453 for (size_t row_idx = 0 ; row_idx < array_view->length; row_idx++) { \
6554 if (!ArrowArrayViewIsNull (array_view, row_idx)) { \
6655 auto value = ArrowArrayViewGetStringUnsafe (array_view, row_idx); \
67- std::string path_str (value.data , value.size_bytes ); \
68- item = path_str; \
56+ item = std::string (value.data , value.size_bytes ); \
6957 } else if (required) { \
7058 return InvalidManifestList (" Field {} is required but null at row {}" , field_name, \
7159 row_idx); \
@@ -75,16 +63,14 @@ namespace iceberg {
// Copies every non-null row of a binary column into `item`.
//
//   item        - assignable expression; it is re-evaluated for each row and may
//                 mention `row_idx`, the loop counter this macro declares.
//   array_view  - the array view to read. Both the null test and the byte copy use
//                 this argument (previously the null test consulted whatever
//                 `view_of_column` happened to be in the caller's scope).
//
// The expansion also reads `required` and `field_name` from the enclosing scope and
// returns InvalidManifestList(...) from the enclosing function when a required
// field is null.
#define PARSE_BINARY_FIELD(item, array_view) \
  for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \
    if (!ArrowArrayViewIsNull(array_view, row_idx)) { \
      item = ArrowArrayViewGetInt8Vector(array_view, row_idx); \
    } else if (required) { \
      return InvalidManifestList(" Field {} is required but null at row {}", field_name, \
                                 row_idx); \
    } \
  }
8672
87- #define PARSE_PRIMITIVE_VECTOR_FIELD (item, count, array_view, type ) \
73+ #define PARSE_INTEGER_VECTOR_FIELD (item, count, array_view, type ) \
8874 for (int64_t manifest_idx = 0 ; manifest_idx < count; manifest_idx++) { \
8975 auto offset = ArrowArrayViewListChildOffset (array_view, manifest_idx); \
9076 auto next_offset = ArrowArrayViewListChildOffset (array_view, manifest_idx + 1 ); \
@@ -94,50 +80,36 @@ namespace iceberg {
9480 } \
9581 }
9682
// Walks an Arrow map column and stores each entry into `item`, keyed by the map key.
//
//   item        - subscriptable lvalue expression; re-evaluated per entry and allowed
//                 to mention `row_idx`, the row counter declared by this macro.
//   count       - number of rows to visit.
//   array_view  - array view whose storage type must be NANOARROW_TYPE_MAP.
//   key_type    - storage type the key child is checked against.
//   value_type  - storage type the value child is checked against.
//   assignment  - expression yielding the value to store; evaluated once per entry
//                 and allowed to mention `view_of_map_value` and `offset_idx`.
//
// `field_name` must be visible at the expansion site. A non-map column makes the
// enclosing function return InvalidManifest(...); child layout is checked through
// ASSERT_VIEW_TYPE_AND_CHILDREN / ASSERT_VIEW_TYPE.
//
// Row boundaries are taken straight from the int32 offsets buffer
// (buffer_views[1]) rather than from ArrowArrayViewListChildOffset.
#define PARSE_MAP_FIELD(item, count, array_view, key_type, value_type, assignment) \
  do { \
    if (array_view->storage_type != ArrowType::NANOARROW_TYPE_MAP) { \
      return InvalidManifest(" Field:{} should be a map.", field_name); \
    } \
    auto view_of_map = array_view->children[0]; \
    ASSERT_VIEW_TYPE_AND_CHILDREN(view_of_map, ArrowType::NANOARROW_TYPE_STRUCT, 2); \
    auto view_of_map_key = view_of_map->children[0]; \
    ASSERT_VIEW_TYPE(view_of_map_key, key_type); \
    auto view_of_map_value = view_of_map->children[1]; \
    ASSERT_VIEW_TYPE(view_of_map_value, value_type); \
    for (int64_t row_idx = 0; row_idx < count; ++row_idx) { \
      const int32_t entries_begin = array_view->buffer_views[1].data.as_int32[row_idx]; \
      const int32_t entries_end = \
          array_view->buffer_views[1].data.as_int32[row_idx + 1]; \
      for (int32_t offset_idx = entries_begin; offset_idx < entries_end; ++offset_idx) { \
        const auto key = ArrowArrayViewGetIntUnsafe(view_of_map_key, offset_idx); \
        item[key] = assignment; \
      } \
    } \
  } while (0)
118103
119- #define PARSE_BINARY_MAP_FIELD (item, count, array_view ) \
120- do { \
121- if (array_view->storage_type != ArrowType::NANOARROW_TYPE_MAP) { \
122- return InvalidManifest (" Field:{} should be a map." , field_name); \
123- } \
124- auto view_of_map = array_view->children [0 ]; \
125- ASSERT_VIEW_TYPE_AND_CHILDREN (view_of_map, ArrowType::NANOARROW_TYPE_STRUCT, 2 ); \
126- auto view_of_map_key = view_of_map->children [0 ]; \
127- ASSERT_VIEW_TYPE (view_of_map_key, ArrowType::NANOARROW_TYPE_INT32); \
128- auto view_of_map_value = view_of_map->children [1 ]; \
129- ASSERT_VIEW_TYPE (view_of_map_value, ArrowType::NANOARROW_TYPE_BINARY); \
130- for (int64_t row_idx = 0 ; row_idx < count; row_idx++) { \
131- auto offset = array_view->buffer_views [1 ].data .as_int32 [row_idx]; \
132- auto next_offset = array_view->buffer_views [1 ].data .as_int32 [row_idx + 1 ]; \
133- for (int32_t offset_idx = offset; offset_idx < next_offset; offset_idx++) { \
134- auto key = ArrowArrayViewGetIntUnsafe (view_of_map_key, offset_idx); \
135- auto buffer = ArrowArrayViewGetBytesUnsafe (view_of_map_value, offset_idx); \
136- item[key] = std::vector<uint8_t >(buffer.data .as_char , \
137- buffer.data .as_char + buffer.size_bytes ); \
138- } \
139- } \
140- } while (0 )
// Parses a map column with INT32 keys and INT64 values into `item` by delegating to
// PARSE_MAP_FIELD; each value is read with ArrowArrayViewGetIntUnsafe.
//
// The definition deliberately carries no trailing semicolon: the caller supplies
// it, so the expansion stays a single statement and can sit in an unbraced
// if/else branch.
#define PARSE_INT_LONG_MAP_FIELD(item, count, array_view) \
  PARSE_MAP_FIELD(item, count, array_view, ArrowType::NANOARROW_TYPE_INT32, \
                  ArrowType::NANOARROW_TYPE_INT64, \
                  ArrowArrayViewGetIntUnsafe(view_of_map_value, offset_idx))
108+
// Parses a map column with INT32 keys and BINARY values into `item` by delegating to
// PARSE_MAP_FIELD; each value is copied with ArrowArrayViewGetInt8Vector.
//
// The definition deliberately carries no trailing semicolon: the caller supplies
// it, so the expansion stays a single statement and can sit in an unbraced
// if/else branch.
#define PARSE_INT_BINARY_MAP_FIELD(item, count, array_view) \
  PARSE_MAP_FIELD(item, count, array_view, ArrowType::NANOARROW_TYPE_INT32, \
                  ArrowType::NANOARROW_TYPE_BINARY, \
                  ArrowArrayViewGetInt8Vector(view_of_map_value, offset_idx))
141113
142114#define ASSERT_VIEW_TYPE (view, type ) \
143115 if (view->storage_type != type) { \
@@ -153,6 +125,12 @@ namespace iceberg {
153125 field_name, n_child); \
154126 }
155127
128+ std::vector<uint8_t > ArrowArrayViewGetInt8Vector (const ArrowArrayView* view,
129+ int32_t offset_idx) {
130+ auto buffer = ArrowArrayViewGetBytesUnsafe (view, offset_idx);
131+ return {buffer.data .as_char , buffer.data .as_char + buffer.size_bytes };
132+ }
133+
156134Status ParsePartitionFieldSummaryList (ArrowArrayView* view_of_column,
157135 std::vector<ManifestFile>& manifest_files) {
158136 auto manifest_count = view_of_column->length ;
@@ -202,14 +180,12 @@ Status ParsePartitionFieldSummaryList(ArrowArrayView* view_of_column,
202180 ArrowArrayViewGetIntUnsafe (contains_nan, partition_idx);
203181 }
204182 if (!ArrowArrayViewIsNull (lower_bound_list, partition_idx)) {
205- auto buffer = ArrowArrayViewGetBytesUnsafe (lower_bound_list, partition_idx);
206- partition_field_summary.lower_bound = std::vector<uint8_t >(
207- buffer.data .as_char , buffer.data .as_char + buffer.size_bytes );
183+ partition_field_summary.lower_bound =
184+ ArrowArrayViewGetInt8Vector (lower_bound_list, partition_idx);
208185 }
209186 if (!ArrowArrayViewIsNull (upper_bound_list, partition_idx)) {
210- auto buffer = ArrowArrayViewGetBytesUnsafe (upper_bound_list, partition_idx);
211- partition_field_summary.upper_bound = std::vector<uint8_t >(
212- buffer.data .as_char , buffer.data .as_char + buffer.size_bytes );
187+ partition_field_summary.upper_bound =
188+ ArrowArrayViewGetInt8Vector (upper_bound_list, partition_idx);
213189 }
214190
215191 manifest_file.partitions .emplace_back (partition_field_summary);
@@ -264,8 +240,8 @@ Result<std::vector<ManifestFile>> ParseManifestList(ArrowSchema* schema,
264240 int32_t );
265241 break ;
266242 case 3 :
267- PARSE_ENUM_FIELD (manifest_files[row_idx].content , view_of_column,
268- ManifestFile::Content);
243+ PARSE_PRIMITIVE_FIELD (manifest_files[row_idx].content , view_of_column,
244+ ManifestFile::Content);
269245 break ;
270246 case 4 :
271247 PARSE_PRIMITIVE_FIELD (manifest_files[row_idx].sequence_number , view_of_column,
@@ -373,8 +349,8 @@ Status ParseDataFile(const std::shared_ptr<StructType>& data_file_schema,
373349
374350 switch (col_idx) {
375351 case 0 :
376- PARSE_ENUM_FIELD (manifest_entries[row_idx].data_file ->content , view_of_file_field ,
377- DataFile::Content);
352+ PARSE_PRIMITIVE_FIELD (manifest_entries[row_idx].data_file ->content ,
353+ view_of_file_field, DataFile::Content);
378354 break ;
379355 case 1 :
380356 PARSE_STRING_FIELD (manifest_entries[row_idx].data_file ->file_path ,
@@ -415,42 +391,41 @@ Status ParseDataFile(const std::shared_ptr<StructType>& data_file_schema,
415391 // key&value should have the same offset
416392 // HACK(xiao.dong) workaround for arrow bug:
417393 // ArrowArrayViewListChildOffset can not get the correct offset for map
418- PARSE_PRIMITIVE_MAP_FIELD (manifest_entries[row_idx].data_file ->column_sizes ,
419- manifest_entry_count, view_of_file_field);
394+ PARSE_INT_LONG_MAP_FIELD (manifest_entries[row_idx].data_file ->column_sizes ,
395+ manifest_entry_count, view_of_file_field);
420396 break ;
421397 case 7 :
422- PARSE_PRIMITIVE_MAP_FIELD (manifest_entries[row_idx].data_file ->value_counts ,
423- manifest_entry_count, view_of_file_field);
398+ PARSE_INT_LONG_MAP_FIELD (manifest_entries[row_idx].data_file ->value_counts ,
399+ manifest_entry_count, view_of_file_field);
424400 break ;
425401 case 8 :
426- PARSE_PRIMITIVE_MAP_FIELD (manifest_entries[row_idx].data_file ->null_value_counts ,
427- manifest_entry_count, view_of_file_field);
402+ PARSE_INT_LONG_MAP_FIELD (manifest_entries[row_idx].data_file ->null_value_counts ,
403+ manifest_entry_count, view_of_file_field);
428404 break ;
429405 case 9 :
430- PARSE_PRIMITIVE_MAP_FIELD (manifest_entries[row_idx].data_file ->nan_value_counts ,
431- manifest_entry_count, view_of_file_field);
406+ PARSE_INT_LONG_MAP_FIELD (manifest_entries[row_idx].data_file ->nan_value_counts ,
407+ manifest_entry_count, view_of_file_field);
432408 break ;
433409 case 10 :
434- PARSE_BINARY_MAP_FIELD (manifest_entries[row_idx].data_file ->lower_bounds ,
435- manifest_entry_count, view_of_file_field);
410+ PARSE_INT_BINARY_MAP_FIELD (manifest_entries[row_idx].data_file ->lower_bounds ,
411+ manifest_entry_count, view_of_file_field);
436412 break ;
437413 case 11 :
438- PARSE_BINARY_MAP_FIELD (manifest_entries[row_idx].data_file ->upper_bounds ,
439- manifest_entry_count, view_of_file_field);
414+ PARSE_INT_BINARY_MAP_FIELD (manifest_entries[row_idx].data_file ->upper_bounds ,
415+ manifest_entry_count, view_of_file_field);
440416 break ;
441417 case 12 :
442418 PARSE_BINARY_FIELD (manifest_entries[row_idx].data_file ->key_metadata ,
443419 view_of_file_field);
444420 break ;
445421 case 13 :
446- PARSE_PRIMITIVE_VECTOR_FIELD (
422+ PARSE_INTEGER_VECTOR_FIELD (
447423 manifest_entries[manifest_idx].data_file ->split_offsets , manifest_entry_count,
448424 view_of_file_field, int64_t );
449425 break ;
450426 case 14 :
451- PARSE_PRIMITIVE_VECTOR_FIELD (
452- manifest_entries[manifest_idx].data_file ->equality_ids , manifest_entry_count,
453- view_of_file_field, int32_t );
427+ PARSE_INTEGER_VECTOR_FIELD (manifest_entries[manifest_idx].data_file ->equality_ids ,
428+ manifest_entry_count, view_of_file_field, int32_t );
454429 break ;
455430 case 15 :
456431 PARSE_PRIMITIVE_FIELD (manifest_entries[row_idx].data_file ->sort_order_id ,
@@ -518,8 +493,8 @@ Result<std::vector<ManifestEntry>> ParseManifestEntry(ArrowSchema* schema,
518493
519494 switch (idx) {
520495 case 0 :
521- PARSE_ENUM_FIELD (manifest_entries[row_idx].status , view_of_column,
522- ManifestStatus);
496+ PARSE_PRIMITIVE_FIELD (manifest_entries[row_idx].status , view_of_column,
497+ ManifestStatus);
523498 break ;
524499 case 1 :
525500 PARSE_PRIMITIVE_FIELD (manifest_entries[row_idx].snapshot_id , view_of_column,
0 commit comments