2121#include < string>
2222#include < vector>
2323
24+ #include " arrow/util/bit_util.h"
2425#include " arrow/util/compression.h"
2526#include " arrow/util/crc32.h"
2627#include " arrow/util/endian.h"
2728#include " arrow/util/ubsan.h"
2829#include " arrow/util/unreachable.h"
29- #include " parquet/file_writer.h"
3030#include " generated/parquet_types.h"
31+ #include " parquet/file_writer.h"
3132#include " parquet/thrift_internal.h"
3233
3334namespace parquet {
@@ -92,30 +93,35 @@ static_assert(IsEnumEq(format::PageType::DICTIONARY_PAGE,
// Minimum compression ratio threshold.
// NOTE(review): presumably raw_size / compressed_size below which the compressed form is
// not worth keeping -- confirm against the use site, which is not visible here.
constexpr double kMinCompressionRatio = 1.2;
9394
// 16-byte identifier copied verbatim at the end of the packed flatbuffer trailer
// (PackFlatbuffer memcpy's it right after the checksums and lengths).
constexpr uint8_t kExtUUID[16] = {0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
                                  0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef};
9697
// Extended format compression codec (using same values as format3::CompressionCodec).
// Stored in a single byte, hence the uint8_t underlying type.
enum class CompressionCodec : uint8_t {
  UNCOMPRESSED = 0,
  LZ4_RAW = 7,
};
102103
103- auto GetNumChildren (
104+ int32_t GetNumChildren (
104105 const flatbuffers::Vector<flatbuffers::Offset<format3::SchemaElement>>& s, size_t i) {
105106 return s.Get (i)->num_children ();
106107}
107108
108- auto GetNumChildren (const std::vector<format::SchemaElement>& s, size_t i) {
109+ int32_t GetNumChildren (const std::vector<format::SchemaElement>& s, size_t i) {
109110 return s[i].num_children ;
110111}
111112
112- auto GetName ( const flatbuffers::Vector<flatbuffers::Offset<format3::SchemaElement>>& s,
113- size_t i) {
113+ std::string GetName (
114+ const flatbuffers::Vector<flatbuffers::Offset<format3::SchemaElement>>& s, size_t i) {
114115 return s.Get (i)->name ()->str ();
115116}
116117
117- auto GetName (const std::vector<format::SchemaElement>& s, size_t i) { return s[i].name ; }
118+ std::string GetName (const std::vector<format::SchemaElement>& s, size_t i) {
119+ return s[i].name ;
120+ }
118121
122+ // Maps between column chunk indices (leaf columns only) and schema element indices
123+ // (all columns including groups). Also tracks parent relationships for building
124+ // column paths.
119125class ColumnMap {
120126 public:
121127 template <typename Schema>
@@ -126,7 +132,9 @@ class ColumnMap {
126132 BuildParents (s);
127133 }
128134
135+ // Convert a column chunk index to the corresponding schema element index.
129136 size_t ToSchema (size_t cc_idx) const { return colchunk2schema_[cc_idx]; }
137+ // Convert a schema element index to its column chunk index, if it is a leaf column.
130138 std::optional<size_t > ToCc (size_t schema_idx) const {
131139 auto it =
132140 std::lower_bound (colchunk2schema_.begin (), colchunk2schema_.end (), schema_idx);
@@ -177,6 +185,9 @@ class ColumnMap {
177185 std::vector<uint32_t > parents_;
178186};
179187
188+ // Packed representation of min/max statistics for a column chunk.
189+ // Values are split into lo4 (4 bytes), lo8 (8 bytes), and hi8 (8 bytes) parts
190+ // to allow compact flatbuffer encoding.
180191struct MinMax {
181192 struct Packed {
182193 uint32_t lo4 = 0 ;
@@ -1137,80 +1148,62 @@ static std::string PackFlatbuffer(const std::string& in) {
11371148 uint8_t * const p = reinterpret_cast <uint8_t *>(out.data ()) + n + 1 ;
11381149
11391150 // Compute and store checksums and lengths
1140- uint32_t crc32 = ::arrow::internal::crc32 (0 , reinterpret_cast <const uint8_t *>(out.data ()), n + 1 );
1141- StoreLE32 (crc32, p + 0 ); // crc32(data .. compressor)
1142- StoreLE32 (n, p + 4 ); // compressed_len
1143- StoreLE32 (in.size (), p + 8 ); // raw_len
1151+ uint32_t crc32 =
1152+ ::arrow::internal::crc32 (0 , reinterpret_cast <const uint8_t *>(out.data()), n + 1);
1153+ StoreLE32 (crc32, p + 0 ); // crc32(data .. compressor)
1154+ StoreLE32 (n, p + 4 ); // compressed_len
1155+ StoreLE32 (in.size (), p + 8 ); // raw_len
11441156 uint32_t len_crc32 = ::arrow::internal::crc32 (0 , p + 4 , 8 );
1145- StoreLE32 (len_crc32, p + 12 ); // crc32(compressed_len .. raw_len)
1157+ StoreLE32 (len_crc32, p + 12 ); // crc32(compressed_len .. raw_len)
11461158
11471159 // Store UUID identifier
11481160 std::memcpy (p + 16 , kExtUUID , 16 );
11491161 out.resize (n + 33 );
11501162 return out;
11511163}
11521164
// Encodes `v` as unsigned LEB128 starting at `out`: 7 payload bits per byte,
// least-significant group first, with the high bit set on every byte except the last.
// Returns a pointer one past the last byte written. The caller must provide enough
// room; a 64-bit value needs at most 10 bytes.
inline uint8_t* WriteULEB64(uint64_t v, uint8_t* out) {
  while (v >= 0x80) {
    // More groups follow: emit the low 7 bits with the continuation bit set.
    *out++ = static_cast<uint8_t>((v & 0x7F) | 0x80);
    v >>= 7;
  }
  // Final group: continuation bit clear.
  *out++ = static_cast<uint8_t>(v);
  return out;
}
1165-
// Returns the number of zero bits above the most significant set bit of `v`
// (32 when `v` is zero).
inline uint32_t CountLeadingZeros32(uint32_t v) {
  uint32_t count = 0;
  // Walk a single-bit mask down from bit 31; the mask reaching zero covers v == 0.
  for (uint32_t mask = 0x80000000u; mask != 0 && (v & mask) == 0; mask >>= 1) {
    ++count;
  }
  return count;
}
1176-
// Returns the number of bytes the unsigned LEB128 encoding of `v` occupies (1 to 5).
// Counts 7-bit groups directly; zero still takes one byte.
inline int32_t ULEB32Len(uint32_t v) {
  int32_t len = 1;
  while (v >= 0x80) {
    v >>= 7;
    ++len;
  }
  return len;
}
1180-
11811165void AppendFlatbuffer (std::string flatbuffer, std::string* thrift) {
1166+ using ::arrow::bit_util::kMaxLEB128ByteLenFor ;
1167+ using ::arrow::bit_util::WriteLEB128;
1168+
11821169 // Pack the flatbuffer with LZ4 compression and checksums
11831170 std::string packed = PackFlatbuffer (flatbuffer);
11841171
11851172 const uint32_t kFieldId = 32767 ;
1186- int header_size = 1 + ULEB32Len (kFieldId ) + ULEB32Len (packed.length ());
1173+ // Max header: 1 (type byte) + max ULEB for field id + max ULEB for packed length
1174+ constexpr int32_t kMaxHeaderSize =
1175+ 1 + kMaxLEB128ByteLenFor <uint32_t > + kMaxLEB128ByteLenFor <uint32_t >;
11871176
11881177 const size_t old_size = thrift->size ();
1189- thrift->resize (old_size + header_size + packed.size () + 1 ); // +1 for stop field
1178+ thrift->resize (old_size + kMaxHeaderSize + packed.size () + 1 ); // +1 for stop field
11901179
11911180 // Pointer to the new write position
11921181 uint8_t * p = reinterpret_cast <uint8_t *>(&(*thrift)[old_size]);
11931182
11941183 // Write the binary type indicator
11951184 *p++ = 0x08 ;
11961185
1197- // Write field id and size using ULEB64
1198- p = WriteULEB64 (kFieldId , p);
1199- p = WriteULEB64 (static_cast <uint32_t >(packed.size ()), p);
1186+ // Write field id and size using ULEB128
1187+ p += WriteLEB128 (kFieldId , p, kMaxLEB128ByteLenFor <uint32_t >);
1188+ p += WriteLEB128 (static_cast <uint32_t >(packed.size ()), p,
1189+ kMaxLEB128ByteLenFor <uint32_t >);
12001190
12011191 // Copy the packed payload
12021192 std::memcpy (p, packed.data (), packed.size ());
12031193 p += packed.size ();
12041194
12051195 // Add stop field
12061196 *p = 0x00 ;
1207- return ;
1197+
1198+ // Trim to actual size (header may have been smaller than max)
1199+ thrift->resize (p - reinterpret_cast <uint8_t *>(thrift->data ()) + 1 );
12081200}
12091201
1210- ::arrow::Result<int32_t > ExtractFlatbuffer (std::shared_ptr<Buffer> buf, std::string* out_flatbuffer) {
1202+ ::arrow::Result<uint32_t > ExtractFlatbuffer (std::shared_ptr<Buffer> buf,
1203+ std::string* out_flatbuffer) {
12111204 if (buf->size () < 8 ) return 8 ;
12121205 PARQUET_THROW_NOT_OK (CheckMagicNumber (buf->data () + buf->size () - 4 ));
1213- uint32_t md_len = LoadLE32 (buf->data () + buf->size () -8 );
1206+ uint32_t md_len = LoadLE32 (buf->data () + buf->size () - 8 );
12141207 if (md_len < 34 ) return 0 ;
12151208 if (buf->size () < 42 ) return 42 ; // 34 (metadata3 trailer) + 8 (len + PAR1)
12161209
@@ -1235,7 +1228,8 @@ ::arrow::Result<int32_t> ExtractFlatbuffer(std::shared_ptr<Buffer> buf, std::str
12351228 }
12361229
12371230 // Verify data CRC
1238- uint32_t expected_crc = ::arrow::internal::crc32 (0 , p - compressed_len, compressed_len + 1 );
1231+ uint32_t expected_crc =
1232+ ::arrow::internal::crc32 (0 , p - compressed_len, compressed_len + 1 );
12391233 if (crc32_val != expected_crc) {
12401234 return ::arrow::Status::Invalid (" Extended metadata data CRC mismatch" );
12411235 }
@@ -1254,11 +1248,11 @@ ::arrow::Result<int32_t> ExtractFlatbuffer(std::shared_ptr<Buffer> buf, std::str
12541248 return ::arrow::Status::Invalid (" LZ4 length error: raw_len < compressed_len" );
12551249 }
12561250 // Use Arrow's LZ4 codec for decompression
1257- ARROW_ASSIGN_OR_RAISE (auto codec, :: arrow::util::Codec::Create (::arrow::Compression::LZ4));
1258- ARROW_ASSIGN_OR_RAISE (
1259- int64_t actual_size,
1260- codec->Decompress (compressed_len, p - compressed_len, raw_len,
1261- decompressed_data.data ()));
1251+ ARROW_ASSIGN_OR_RAISE (auto codec,
1252+ ::arrow::util::Codec::Create (::arrow::Compression::LZ4));
1253+ ARROW_ASSIGN_OR_RAISE ( int64_t actual_size,
1254+ codec->Decompress (compressed_len, p - compressed_len, raw_len,
1255+ decompressed_data.data ()));
12621256 if (static_cast <uint32_t >(actual_size) != raw_len) {
12631257 return ::arrow::Status::Invalid (" LZ4 decompression failed: expected " , raw_len,
12641258 " bytes but got " , actual_size, " bytes" );
@@ -1276,7 +1270,8 @@ ::arrow::Result<int32_t> ExtractFlatbuffer(std::shared_ptr<Buffer> buf, std::str
12761270 }
12771271
12781272 ARROW_CHECK_NE (out_flatbuffer, nullptr );
1279- out_flatbuffer->assign (reinterpret_cast <const char *>(decompressed_data.data ()), raw_len);
1273+ out_flatbuffer->assign (reinterpret_cast <const char *>(decompressed_data.data ()),
1274+ raw_len);
12801275
12811276 return compressed_len + 42 ;
12821277}
0 commit comments