@@ -467,6 +467,35 @@ struct SchemaElement {
467467 10: optional LogicalType logicalType
468468}
469469
470+ struct SchemaElementV3 { // V3 variant of SchemaElement; CHANGED/REMOVED notes below are relative to v1.
471+ /** Data type for this field. */
472+ 1: optional Type type ;
473+
474+ /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
475+ *
476+ * CHANGED from v1: this must be omitted for other column types.
477+ */
478+ 2: optional i32 type_length ;
479+
480+ /** Repetition of the field. */
481+ 3: optional FieldRepetitionType repetition_type ;
482+
483+ /** Name of the field in the schema. */
484+ 4: required string name ;
485+
486+ /** Number of nested fields (children) of this element. */
487+ 5: optional i32 num_children ;
488+
489+ /** Field id. CHANGED from v1: widened from i32 to i64.
490+ */
491+ 6: optional i64 field_id ;
492+
493+ /** The logical type of this schema element. */
494+ 7: optional LogicalType logicalType
495+
496+ /** REMOVED from v1: converted_type, scale, precision (obsolete) */
497+ }
498+
470499/**
471500 * Encodings supported by Parquet. Not all encodings are valid for all types. These
472501 * enums are also used to specify the encoding of definition and repetition levels.
@@ -835,6 +864,65 @@ struct ColumnMetaData {
835864 16: optional SizeStatistics size_statistics ;
836865}
837866
867+ struct ColumnChunkMetaDataV3 { // V3 metadata for one column chunk; REMOVED/CHANGED notes below are relative to v1.
868+ /** REMOVED from v1: type (redundant with SchemaElementV3) */
869+ /** REMOVED from v1: encodings (unnecessary and non-trivial to get right) */
870+ /** REMOVED from v1: path_in_schema (unnecessary and wasteful) */
871+ /** REMOVED from v1: index_page_offset (unused in practice?) */
872+
873+ /** Compression codec **/
874+ 1: required CompressionCodec codec
875+
876+ /** Number of values in this column chunk **/
877+ 2: required i64 num_values
878+
879+ /** Total byte size of all uncompressed pages in this column chunk (including the headers) **/
880+ 3: required i64 total_uncompressed_size
881+
882+ /** Total byte size of all compressed, and potentially encrypted, pages
883+ * in this column chunk (including the headers) **/
884+ 4: required i64 total_compressed_size
885+
886+ /** Optional key/value metadata for this column chunk.
887+ ** CHANGED from v1: only use this for chunk-specific metadata, otherwise
888+ ** use `FileColumnMetadataV3.key_value_metadata`.
889+ **/
890+ 5: optional list<KeyValue> key_value_metadata
891+
892+ /** Byte offset from beginning of file to first data page **/
893+ 6: required i64 data_page_offset
894+
895+ /** Byte offset from the beginning of file to first (only) dictionary page **/
896+ 7: optional i64 dictionary_page_offset
897+
898+ /** Optional statistics for this column chunk */
899+ 8: optional Statistics statistics ;
900+
901+ /** Set of all encodings used for pages in this column chunk.
902+ * This information can be used to determine if all data pages are
903+ * dictionary encoded for example **/
904+ 9: optional list<PageEncodingStats> encoding_stats ;
905+
906+ /** Byte offset from beginning of file to Bloom filter data. **/
907+ 10: optional i64 bloom_filter_offset ;
908+
909+ /** Size of Bloom filter data including the serialized header, in bytes.
910+ * Added in 2.10 so readers may not read this field from old files and
911+ * it can be obtained after the BloomFilterHeader has been deserialized.
912+ * Writers should write this field so readers can read the bloom filter
913+ * in a single I/O. NOTE(review): the "Added in 2.10" remark looks carried over from v1 - confirm it still applies to V3.
914+ */
915+ 11: optional i32 bloom_filter_length ;
916+
917+ /**
918+ * Optional statistics to help estimate total memory when converted to in-memory
919+ * representations. The histograms contained in these statistics can
920+ * also be useful in some cases for more fine-grained nullability/list length
921+ * filter pushdown.
922+ */
923+ 12: optional SizeStatistics size_statistics ;
924+ }
925+
838926struct EncryptionWithFooterKey { // Empty struct: declares no fields. NOTE(review): presumably a marker for footer-key encryption - confirm against its users.
839927}
840928
@@ -885,6 +973,44 @@ struct ColumnChunk {
885973 9: optional binary encrypted_column_metadata
886974}
887975
976+ struct ColumnChunkV3 { // V3 variant of ColumnChunk; NEW/CHANGED/REMOVED notes below are relative to v1.
977+ /** File where column data is stored. **/
978+ 1: optional string file_path
979+
980+ /** Byte offset in file_path to the ColumnChunkMetaDataV3, optionally encrypted
981+ ** CHANGED from v1: renamed to metadata_file_offset
982+ **/
983+ 2: required i64 metadata_file_offset
984+
985+ /** NEW from v1: Byte length in file_path of ColumnChunkMetaDataV3, optionally encrypted
986+ **/
987+ 3: required i64 metadata_file_length
988+
989+ /** REMOVED from v1: meta_data, encrypted_column_metadata.
990+ ** Use encoded_metadata instead.
991+ **/
992+
993+ /** NEW from v1: Column metadata for this chunk, duplicated here from file_path.
994+ ** This is a Thrift-encoded ColumnChunkMetaDataV3, optionally encrypted.
995+ **/
996+ 4: optional binary encoded_metadata
997+
998+ /** File offset of ColumnChunk's OffsetIndex. CHANGED from v1: this is now required **/
999+ 5: required i64 offset_index_offset
1000+
1001+ /** Size of ColumnChunk's OffsetIndex, in bytes. CHANGED from v1: this is now required **/
1002+ 6: required i32 offset_index_length
1003+
1004+ /** File offset of ColumnChunk's ColumnIndex **/
1005+ 7: optional i64 column_index_offset
1006+
1007+ /** Size of ColumnChunk's ColumnIndex, in bytes **/
1008+ 8: optional i32 column_index_length
1009+
1010+ /** Crypto metadata of encrypted columns **/
1011+ 9: optional ColumnCryptoMetaData crypto_metadata
1012+ }
1013+
8881014struct RowGroup {
8891015 /** Metadata for each column chunk in this row group.
8901016 * This list must have the same order as the SchemaElement list in FileMetaData.
@@ -914,6 +1040,32 @@ struct RowGroup {
9141040 7: optional i16 ordinal
9151041}
9161042
1043+ struct RowGroupV3 { // V3 variant of RowGroup; it no longer lists column chunks (see REMOVED notes below).
1044+ /** REMOVED from v1: columns.
1045+ * Instead, decode each FileColumnMetadataV3 individually as needed.
1046+ */
1047+
1048+ /** Total byte size of all the uncompressed column data in this row group **/
1049+ 1: required i64 total_byte_size
1050+
1051+ /** Number of rows in this row group **/
1052+ 2: required i64 num_rows
1053+
1054+ /** If set, specifies a sort ordering of the rows in this row group. */
1055+ 3: optional list<SortingColumn> sorting_columns
1056+
1057+ /** REMOVED from v1: file_offset.
1058+ * Use the OffsetIndex for each column instead.
1059+ */
1060+
1061+ /** Total byte size of all compressed (and potentially encrypted) column data
1062+ * in this row group **/
1063+ 4: optional i64 total_compressed_size
1064+
1065+ /** Row group ordinal in the file **/
1066+ 5: optional i16 ordinal
1067+ }
1068+
9171069/** Empty struct (no fields) whose presence signals the order defined by the physical or logical type */
9181070struct TypeDefinedOrder {}
9191071
@@ -1165,6 +1317,62 @@ struct FileMetaData {
11651317 9: optional binary footer_signing_key_metadata
11661318}
11671319
1320+ /** Metadata for a single column in this file, covering all of its column chunks. */
1321+ struct FileColumnMetadataV3 {
1322+ /** All column chunks of this column in this file (one per row group) **/
1323+ 1: required list<ColumnChunkV3> columns
1324+
1325+ /** Sort order used for the Statistics min_value and max_value fields
1326+ **/
1327+ 2: optional ColumnOrder column_order ;
1328+
1329+ /** NEW from v1: Optional key/value metadata for this column at the file level
1330+ **/
1331+ 3: optional list<KeyValue> key_value_metadata
1332+ }
1333+
1334+ struct FileMetaDataV3 { // V3 file-level metadata; NEW/REMOVED notes below are relative to v1.
1335+ /** Version of this file **/
1336+ 1: required i32 version
1337+
1338+ /** Parquet schema for this file **/
1339+ 2: required list<SchemaElementV3> schema ;
1340+
1341+ /** Number of rows in this file **/
1342+ 3: required i64 num_rows
1343+
1344+ /** Row groups in this file **/
1345+ 4: required list<RowGroupV3> row_groups
1346+
1347+ /** Optional key/value metadata for this file. **/
1348+ 5: optional list<KeyValue> key_value_metadata
1349+
1350+ /** String for application that wrote this file. **/
1351+ 6: optional string created_by
1352+
1353+ /** NEW from v1: byte offset of FileColumnMetadataV3, for each column. NOTE(review): presumably measured from the beginning of the file - confirm. **/
1354+ 7: required list<i64> file_column_metadata_offset ;
1355+ /** NEW from v1: byte length of FileColumnMetadataV3, for each column. NOTE(review): presumably index-aligned with file_column_metadata_offset - confirm. **/
1356+ 8: required list<i64> file_column_metadata_length ;
1357+
1358+ /** REMOVED from v1: column_orders.
1359+ ** Use `FileColumnMetadataV3.column_order` instead.
1360+ **/
1361+
1362+ /**
1363+ * Encryption algorithm. This field is set only in encrypted files
1364+ * with plaintext footer. Files with encrypted footer store algorithm id
1365+ * in FileCryptoMetaData structure.
1366+ */
1367+ 9: optional EncryptionAlgorithm encryption_algorithm
1368+
1369+ /**
1370+ * Retrieval metadata of key used for signing the footer.
1371+ * Used only in encrypted files with plaintext footer.
1372+ */
1373+ 10: optional binary footer_signing_key_metadata
1374+ }
1375+
11681376/** Crypto metadata for files with encrypted footer **/
11691377struct FileCryptoMetaData {
11701378 /**
0 commit comments