2222#include < cstdlib>
2323#include < functional>
2424#include < iostream>
25+ #include < limits>
2526#include < memory>
2627#include < sstream>
2728#include < string>
29+ #include < utility>
2830#include < vector>
2931
3032#include " arrow/array.h"
@@ -49,9 +51,11 @@ using ::arrow::internal::CreateDir;
4951using ::arrow::internal::PlatformFilename;
5052using ::arrow::util::Float16;
5153using ::parquet::ArrowWriterProperties;
54+ using ::parquet::Encoding;
5255using ::parquet::WriterProperties;
5356
5457struct WriteConfig {
58+ std::string name;
5559 std::shared_ptr<WriterProperties> writer_properties;
5660 std::shared_ptr<ArrowWriterProperties> arrow_writer_properties;
5761};
@@ -74,6 +78,13 @@ struct Column {
7478 }
7579};
7680
81+ using EncodingVector = std::vector<Encoding::type>;
82+
83+ struct ColumnWithEncodings {
84+ Column column;
85+ EncodingVector encodings;
86+ };
87+
7788std::shared_ptr<Field> FieldForArray (const std::shared_ptr<Array>& array,
7889 std::string name) {
7990 return field (std::move (name), array->type (), /* nullable=*/ array->null_count () != 0 );
@@ -135,13 +146,13 @@ std::vector<WriteConfig> GetWriteConfigurations() {
135146 // clang-format on
136147
137148 std::vector<WriteConfig> configs;
138- configs.push_back ({w_uncompressed, a_default});
139- configs.push_back ({w_brotli, a_default});
140- configs.push_back ({w_gzip, a_default});
141- configs.push_back ({w_lz4, a_default});
142- configs.push_back ({w_snappy, a_default});
143- configs.push_back ({w_zstd, a_default});
144- configs.push_back ({w_pages_v1, a_default});
149+ configs.push_back ({" uncompressed " , w_uncompressed, a_default});
150+ configs.push_back ({" brotli " , w_brotli, a_default});
151+ configs.push_back ({" gzip " , w_gzip, a_default});
152+ configs.push_back ({" lz4 " , w_lz4, a_default});
153+ configs.push_back ({" snappy " , w_snappy, a_default});
154+ configs.push_back ({" zstd " , w_zstd, a_default});
155+ configs.push_back ({" v1_data_page " , w_pages_v1, a_default});
145156 return configs;
146157}
147158
@@ -158,12 +169,14 @@ std::vector<WriteConfig> GetEncryptedWriteConfigurations(const ::arrow::Schema&
158169 return builder;
159170 };
160171
161- std::vector<std::shared_ptr<FileEncryptionProperties>> file_encryptions;
172+ std::vector<std::tuple<std::string, std::shared_ptr<FileEncryptionProperties>>>
173+ file_encryptions;
162174
163175 // Uniform encryption
164- file_encryptions.push_back (file_encryption_builder ().build ());
176+ file_encryptions.push_back ({ " uniform " , file_encryption_builder ().build ()} );
165177 // Uniform encryption with plaintext footer
166- file_encryptions.push_back (file_encryption_builder ().set_plaintext_footer ()->build ());
178+ file_encryptions.push_back ({" uniform_plaintext_footer" ,
179+ file_encryption_builder ().set_plaintext_footer ()->build ()});
167180 // Columns encrypted with individual keys
168181 {
169182 ColumnPathToEncryptionPropertiesMap column_map;
@@ -174,7 +187,8 @@ std::vector<WriteConfig> GetEncryptedWriteConfigurations(const ::arrow::Schema&
174187 }
175188 ARROW_DCHECK_NE (column_map.size (), 0 );
176189 file_encryptions.push_back (
177- file_encryption_builder ().encrypted_columns (std::move (column_map))->build ());
190+ {" column_keys" ,
191+ file_encryption_builder ().encrypted_columns (std::move (column_map))->build ()});
178192 }
179193 // Unencrypted columns
180194 {
@@ -184,15 +198,16 @@ std::vector<WriteConfig> GetEncryptedWriteConfigurations(const ::arrow::Schema&
184198 }
185199 ARROW_DCHECK_NE (column_map.size (), 0 );
186200 file_encryptions.push_back (
187- file_encryption_builder ().encrypted_columns (std::move (column_map))->build ());
201+ {" unencrypted_columns" ,
202+ file_encryption_builder ().encrypted_columns (std::move (column_map))->build ()});
188203 }
189204
190205 auto a_default = MakeArrowPropertiesBuilder ().build ();
191206
192207 std::vector<WriteConfig> configs;
193- for (const auto & file_encryption : file_encryptions) {
208+ for (auto [name, file_encryption] : file_encryptions) {
194209 auto writer_properties = MakePropertiesBuilder ().encryption (file_encryption)->build ();
195- configs.push_back ({writer_properties, a_default});
210+ configs.push_back ({name, writer_properties, a_default});
196211 }
197212 return configs;
198213}
@@ -369,6 +384,110 @@ Result<std::vector<Column>> ExampleColumns(int32_t length, double null_probabili
369384 return columns;
370385}
371386
// Shorthands for the extreme finite values of an arithmetic type `T`.
// kMin uses lowest() rather than min() so that, for floating-point types, it is
// the most negative finite value instead of the smallest positive one.
template <class T>
constexpr T kMin = std::numeric_limits<T>::lowest();

template <class T>
constexpr T kMax = std::numeric_limits<T>::max();
391+
392+ // Generate columns for physical types along with their supported encodings
393+ Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings (
394+ int32_t length, double null_probability = 0.2 ) {
395+ const EncodingVector kIntEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
396+ Encoding::DELTA_BINARY_PACKED,
397+ Encoding::BYTE_STREAM_SPLIT};
398+ const EncodingVector kFloatEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
399+ Encoding::BYTE_STREAM_SPLIT};
400+ const EncodingVector kBooleanEncodings = {Encoding::PLAIN, Encoding::RLE};
401+ const EncodingVector kByteArrayEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
402+ Encoding::DELTA_LENGTH_BYTE_ARRAY,
403+ Encoding::DELTA_BYTE_ARRAY};
404+ const EncodingVector kFixedLenByteArrayEncodings = {
405+ Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::DELTA_BYTE_ARRAY,
406+ Encoding::BYTE_STREAM_SPLIT};
407+ const EncodingVector kInt96Encodings = {Encoding::PLAIN};
408+
409+ std::vector<ColumnWithEncodings> columns;
410+
411+ random::RandomArrayGenerator gen (42 );
412+ auto name_gen = Column::NameGenerator ();
413+
414+ for (const double true_probability : {0.0 , 0.001 , 0.01 , 0.5 , 0.999 }) {
415+ columns.push_back (
416+ {{name_gen (), gen.Boolean (length, true_probability, null_probability)},
417+ kBooleanEncodings });
418+ }
419+
420+ // Generate integer columns with different ranges to trigger delta encoding modes
421+ columns.push_back (
422+ {{name_gen (), gen.Int32 (length, -100 , 100 , null_probability)}, kIntEncodings });
423+ columns.push_back (
424+ {{name_gen (), gen.Int32 (length, kMin <int32_t >, kMax <int32_t >, null_probability)},
425+ kIntEncodings });
426+ columns.push_back ({{name_gen (), gen.Int64 (length, -100'000 , 100'000 , null_probability)},
427+ kIntEncodings });
428+ columns.push_back (
429+ {{name_gen (), gen.Int64 (length, kMin <int64_t >, kMax <int64_t >, null_probability)},
430+ kIntEncodings });
431+
432+ // This won't necessarily span all 96 bits of precision, but PLAIN encoding allows
433+ // the fuzzer to do its thing on the values.
434+ for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) {
435+ ARROW_ASSIGN_OR_RAISE (
436+ auto array, gen.Int64 (length, kMin <int64_t >, kMax <int64_t >, null_probability)
437+ ->View (timestamp (unit)));
438+ columns.push_back ({{name_gen (), array}, kInt96Encodings });
439+ }
440+
441+ // NOTE: will need to vary NaNs if there are encodings for which that matters (ALP?)
442+ columns.push_back (
443+ {{name_gen (), gen.Float32 (length, -1.0 , 1.0 , null_probability)}, kFloatEncodings });
444+ columns.push_back (
445+ {{name_gen (), gen.Float32 (length, kMin <float >, kMax <float >, null_probability)},
446+ kFloatEncodings });
447+ columns.push_back (
448+ {{name_gen (), gen.Float64 (length, -1.0 , 1.0 , null_probability)}, kFloatEncodings });
449+ columns.push_back (
450+ {{name_gen (), gen.Float64 (length, kMin <double >, kMax <double >, null_probability)},
451+ kFloatEncodings });
452+
453+ // For FLBA
454+ columns.push_back (
455+ {{name_gen (), gen.Float16 (length, Float16 (-1.0 ), Float16 (1.0 ), null_probability)},
456+ kFixedLenByteArrayEncodings });
457+
458+ // For BYTE_ARRAY (vary lengths and repetitions to trigger encoding modes)
459+ columns.push_back ({{name_gen (), gen.String (length, /* min_length=*/ 0 ,
460+ /* max_length=*/ 20 , null_probability)},
461+ kByteArrayEncodings });
462+ columns.push_back ({{name_gen (), gen.String (length, /* min_length=*/ 12 ,
463+ /* max_length=*/ 14 , null_probability)},
464+ kByteArrayEncodings });
465+ columns.push_back ({{name_gen (), gen.String (length, /* min_length=*/ 100 ,
466+ /* max_length=*/ 200 , null_probability)},
467+ kByteArrayEncodings });
468+ columns.push_back ({{name_gen (), gen.StringWithRepeats (
469+ length, /* unique=*/ length / 50 , /* min_length=*/ 0 ,
470+ /* max_length=*/ 20 , null_probability)},
471+ kByteArrayEncodings });
472+ columns.push_back ({{name_gen (), gen.StringWithRepeats (
473+ length, /* unique=*/ length / 100 , /* min_length=*/ 12 ,
474+ /* max_length=*/ 14 , null_probability)},
475+ kByteArrayEncodings });
476+
477+ return columns;
478+ }
479+
480+ Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings () {
481+ std::vector<ColumnWithEncodings> all_columns;
482+
483+ for (const double null_probability : {0.0 , 0.01 , 0.5 , 1.0 }) {
484+ ARROW_ASSIGN_OR_RAISE (auto columns,
485+ AllColumnsWithEncodings (/* length=*/ 1'000 , null_probability));
486+ all_columns.insert (all_columns.end (), columns.begin (), columns.end ());
487+ }
488+ return all_columns;
489+ }
490+
372491Result<std::shared_ptr<RecordBatch>> BatchFromColumns (const std::vector<Column>& columns,
373492 int64_t num_rows) {
374493 FieldVector fields;
@@ -425,13 +544,25 @@ Status DoMain(const std::string& out_dir) {
425544 RETURN_NOT_OK (CreateDir (dir_fn));
426545
427546 int sample_num = 1 ;
428- auto sample_name = [&]() -> std::string {
429- return " pq-table-" + std::to_string (sample_num++);
547+ auto sample_file_name = [&](const std::string& name = " " ) -> std::string {
548+ std::stringstream ss;
549+ if (!name.empty ()) {
550+ ss << name << " -" ;
551+ }
552+ ss << sample_num++ << " .pq" ;
553+ return std::move (ss).str ();
430554 };
431555
432- auto write_sample = [&](const std::shared_ptr<Table>& table,
433- const WriteConfig& config) -> Status {
434- ARROW_ASSIGN_OR_RAISE (auto sample_fn, dir_fn.Join (sample_name ()));
556+ auto write_sample = [&](const std::shared_ptr<Table>& table, const WriteConfig& config,
557+ std::string name = " " ) -> Status {
558+ if (name.empty () && table->num_columns () == 1 ) {
559+ name = table->schema ()->field (0 )->type ()->name ();
560+ }
561+ if (!name.empty ()) {
562+ name += " -" ;
563+ }
564+ name += config.name ;
565+ ARROW_ASSIGN_OR_RAISE (auto sample_fn, dir_fn.Join (sample_file_name (name)));
435566 std::cerr << sample_fn.ToString () << std::endl;
436567 ARROW_ASSIGN_OR_RAISE (auto file, io::FileOutputStream::Open (sample_fn.ToString ()));
437568 // Emit several row groups
@@ -443,7 +574,7 @@ Status DoMain(const std::string& out_dir) {
443574 };
444575
445576 {
446- // 1. Unencrypted files
577+ // 1. Unencrypted files for various write configurations
447578 // Write a cardinal product of example batches x write configurations
448579 ARROW_ASSIGN_OR_RAISE (auto batches, Batches ());
449580 auto write_configs = GetWriteConfigurations ();
@@ -458,7 +589,37 @@ Status DoMain(const std::string& out_dir) {
458589 }
459590 }
460591 {
461- // 2. Encrypted files
592+ // 2. Unencrypted files for various column encodings
593+ // Write one file per (column, encoding) pair.
594+ ARROW_ASSIGN_OR_RAISE (auto columns, AllColumnsWithEncodings ());
595+
596+ for (const auto & column : columns) {
597+ RETURN_NOT_OK (column.column .array ->ValidateFull ());
598+ ARROW_ASSIGN_OR_RAISE (
599+ auto batch, BatchFromColumn (column.column , column.column .array ->length ()));
600+ ARROW_ASSIGN_OR_RAISE (auto table, Table::FromRecordBatches ({batch}));
601+
602+ for (const auto encoding : column.encodings ) {
603+ auto w_props_builder = MakePropertiesBuilder ();
604+ if (encoding == Encoding::RLE_DICTIONARY) {
605+ // RLE_DICTIONARY is enabled through enable_dictionary() rather than
606+ // encoding(), also increase the dictionary page size limit as we
607+ // generate data with a typically high cardinality.
608+ w_props_builder.enable_dictionary ()->dictionary_pagesize_limit (1'000'000 );
609+ } else {
610+ w_props_builder.disable_dictionary ()->encoding (encoding);
611+ }
612+ auto w_props = w_props_builder.build ();
613+ // Ensure that we generate INT96 Parquet data when given a timestamp column
614+ auto a_props =
615+ MakeArrowPropertiesBuilder ().enable_deprecated_int96_timestamps ()->build ();
616+ auto config_name = ::parquet::EncodingToString (encoding);
617+ RETURN_NOT_OK (write_sample (table, WriteConfig{config_name, w_props, a_props}));
618+ }
619+ }
620+ }
621+ {
622+ // 3. Encrypted files
462623 // Use a single batch and write it using different configurations
463624 ARROW_ASSIGN_OR_RAISE (auto batch, BatchForEncryption ());
464625 auto write_configs = GetEncryptedWriteConfigurations (*batch->schema ());
0 commit comments