Skip to content

Commit b2caa21

Browse files
committed
[T1] Allow fixed length encoding for min/max and deprecate encoding_stats
1. Add `min8`/`max8` fields for fixed-length binary encoding of min/max for physical types less than or equal to 8 bytes. 2. Deprecate `ColumnMetaData.encoding_stats` and replace it with a bool `ColumnMetaData.is_fully_dict_encoded`.
1 parent 384bedd commit b2caa21

File tree

1 file changed

+19
-19
lines changed

1 file changed

+19
-19
lines changed

src/main/thrift/parquet.thrift

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -242,43 +242,36 @@ struct SizeStatistics {
242242
* All fields are optional.
243243
*/
244244
struct Statistics {
245-
/**
246-
* DEPRECATED: min and max value of the column. Use min_value and max_value.
247-
*
248-
* Values are encoded using PLAIN encoding, except that variable-length byte
249-
* arrays do not include a length prefix.
250-
*
251-
* These fields encode min and max values determined by signed comparison
252-
* only. New files should use the correct order for a column's logical type
253-
* and store the values in the min_value and max_value fields.
254-
*
255-
* To support older readers, these may be set when the column order is
256-
* signed.
257-
*/
245+
/* DEPRECATED: do not use */
258246
1: optional binary max;
259247
2: optional binary min;
260248
/** count of null values in the column */
261249
3: optional i64 null_count;
262250
/** count of distinct values occurring */
263251
4: optional i64 distinct_count;
264252
/**
265-
* Lower and upper bound values for the column, determined by its ColumnOrder.
253+
* Only one of the pairs max_value/min_value, max1/min1, max2/min2, max4/min4, or
254+
* max8/min8 can be set. The pair is determined by the physical type of the
255+
* column. Floating-point values are bit-cast to integers. Variable-length
256+
* values are set in min_value/max_value.
257+
*
258+
* Min and Max are the lower and upper bound values for the column,
259+
* respectively, as determined by its ColumnOrder.
266260
*
267261
* These may be the actual minimum and maximum values found on a page or column
268262
* chunk, but can also be (more compact) values that do not exist on a page or
269263
* column chunk. For example, instead of storing "Blart Versenwald III", a writer
270264
* may set min_value="B", max_value="C". Such more compact values must still be
271265
* valid values within the column's logical type.
272-
*
273-
* Values are encoded using PLAIN encoding, except that variable-length byte
274-
* arrays do not include a length prefix.
275266
*/
276267
5: optional binary max_value;
277268
6: optional binary min_value;
278269
/** If true, max_value is the actual maximum value for a column */
279270
7: optional bool is_max_value_exact;
280271
/** If true, min_value is the actual minimum value for a column */
281272
8: optional bool is_min_value_exact;
273+
9: optional i64 max8;
274+
10: optional i64 min8;
282275
}
283276

284277
/** Empty structs to use as logical type annotations */
@@ -810,9 +803,13 @@ struct ColumnMetaData {
810803
/** optional statistics for this column chunk */
811804
12: optional Statistics statistics;
812805

813-
/** Set of all encodings used for pages in this column chunk.
806+
/**
807+
* DEPRECATED: use is_fully_dict_encoded instead
808+
*
809+
* Set of all encodings used for pages in this column chunk.
814810
* This information can be used to determine if all data pages are
815-
* dictionary encoded for example **/
811+
* dictionary encoded for example
812+
*/
816813
13: optional list<PageEncodingStats> encoding_stats;
817814

818815
/** Byte offset from beginning of file to Bloom filter data. **/
@@ -833,6 +830,9 @@ struct ColumnMetaData {
833830
* filter pushdown.
834831
*/
835832
16: optional SizeStatistics size_statistics;
833+
834+
/** If true, all data pages are dictionary encoded **/
835+
17: optional bool is_fully_dict_encoded;
836836
}
837837

838838
struct EncryptionWithFooterKey {

0 commit comments

Comments
 (0)