2222#include < cstdlib>
2323#include < functional>
2424#include < iostream>
25+ #include < limits>
2526#include < memory>
2627#include < sstream>
2728#include < string>
@@ -49,6 +50,7 @@ using ::arrow::internal::CreateDir;
4950using ::arrow::internal::PlatformFilename;
5051using ::arrow::util::Float16;
5152using ::parquet::ArrowWriterProperties;
53+ using ::parquet::Encoding;
5254using ::parquet::WriterProperties;
5355
5456struct WriteConfig {
@@ -74,6 +76,13 @@ struct Column {
7476 }
7577};
7678
79+ using EncodingVector = std::vector<Encoding::type>;
80+
81+ struct ColumnWithEncodings {
82+ Column column;
83+ EncodingVector encodings;
84+ };
85+
7786std::shared_ptr<Field> FieldForArray (const std::shared_ptr<Array>& array,
7887 std::string name) {
7988 return field (std::move (name), array->type (), /* nullable=*/ array->null_count () != 0 );
@@ -369,6 +378,100 @@ Result<std::vector<Column>> ExampleColumns(int32_t length, double null_probabili
369378 return columns;
370379}
371380
// Lowest finite value representable by T. For floating-point types this is the
// most negative finite value, not the smallest positive one.
template <class T>
constexpr T kMin = std::numeric_limits<T>::lowest();

// Largest finite value representable by T.
template <class T>
constexpr T kMax = std::numeric_limits<T>::max();
385+
386+ // Generate columns for physical types along with their supported encodings
387+ Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings (
388+ int32_t length, double null_probability = 0.2 ) {
389+ const EncodingVector kIntEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
390+ Encoding::DELTA_BINARY_PACKED,
391+ Encoding::BYTE_STREAM_SPLIT};
392+ const EncodingVector kFloatEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
393+ Encoding::BYTE_STREAM_SPLIT};
394+ const EncodingVector kBooleanEncodings = {Encoding::PLAIN, Encoding::RLE};
395+ const EncodingVector kByteArrayEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
396+ Encoding::DELTA_LENGTH_BYTE_ARRAY,
397+ Encoding::DELTA_BYTE_ARRAY};
398+ const EncodingVector kFixedLenByteArrayEncodings = {
399+ Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::DELTA_BYTE_ARRAY,
400+ Encoding::BYTE_STREAM_SPLIT};
401+
402+ std::vector<ColumnWithEncodings> columns;
403+
404+ random::RandomArrayGenerator gen (42 );
405+ auto name_gen = Column::NameGenerator ();
406+
407+ for (const double true_probability : {0.0 , 0.001 , 0.01 , 0.5 , 0.999 }) {
408+ columns.push_back (
409+ {{name_gen (), gen.Boolean (length, true_probability, null_probability)},
410+ kBooleanEncodings });
411+ }
412+
413+ columns.push_back (
414+ {{name_gen (), gen.Int32 (length, -100 , 100 , null_probability)}, kIntEncodings });
415+ columns.push_back (
416+ {{name_gen (), gen.Int32 (length, kMin <int32_t >, kMax <int32_t >, null_probability)},
417+ kIntEncodings });
418+ columns.push_back ({{name_gen (), gen.Int64 (length, -100'000 , 100'000 , null_probability)},
419+ kIntEncodings });
420+ columns.push_back (
421+ {{name_gen (), gen.Int64 (length, kMin <int64_t >, kMax <int64_t >, null_probability)},
422+ kIntEncodings });
423+
424+ // XXX should we add INT96? It's deprecated, only supports PLAIN and is featured in
425+ // the parquet-testing files.
426+
427+ // NOTE: will need to vary NaNs if there are encodings for which that matters (ALP?)
428+ columns.push_back (
429+ {{name_gen (), gen.Float32 (length, -1.0 , 1.0 , null_probability)}, kFloatEncodings });
430+ columns.push_back (
431+ {{name_gen (), gen.Float32 (length, kMin <float >, kMax <float >, null_probability)},
432+ kFloatEncodings });
433+ columns.push_back (
434+ {{name_gen (), gen.Float64 (length, -1.0 , 1.0 , null_probability)}, kFloatEncodings });
435+ columns.push_back (
436+ {{name_gen (), gen.Float64 (length, kMin <double >, kMax <double >, null_probability)},
437+ kFloatEncodings });
438+
439+ columns.push_back (
440+ {{name_gen (), gen.Float16 (length, Float16 (-1.0 ), Float16 (1.0 ), null_probability)},
441+ kFixedLenByteArrayEncodings });
442+
443+ columns.push_back ({{name_gen (), gen.String (length, /* min_length=*/ 0 ,
444+ /* max_length=*/ 20 , null_probability)},
445+ kByteArrayEncodings });
446+ columns.push_back ({{name_gen (), gen.String (length, /* min_length=*/ 12 ,
447+ /* max_length=*/ 14 , null_probability)},
448+ kByteArrayEncodings });
449+ columns.push_back ({{name_gen (), gen.String (length, /* min_length=*/ 100 ,
450+ /* max_length=*/ 200 , null_probability)},
451+ kByteArrayEncodings });
452+ columns.push_back ({{name_gen (), gen.StringWithRepeats (
453+ length, /* unique=*/ length / 50 , /* min_length=*/ 0 ,
454+ /* max_length=*/ 20 , null_probability)},
455+ kByteArrayEncodings });
456+ columns.push_back ({{name_gen (), gen.StringWithRepeats (
457+ length, /* unique=*/ length / 100 , /* min_length=*/ 12 ,
458+ /* max_length=*/ 14 , null_probability)},
459+ kByteArrayEncodings });
460+
461+ return columns;
462+ }
463+
464+ Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings () {
465+ std::vector<ColumnWithEncodings> all_columns;
466+
467+ for (const double null_probability : {0.0 , 0.01 , 0.5 , 1.0 }) {
468+ ARROW_ASSIGN_OR_RAISE (auto columns,
469+ AllColumnsWithEncodings (/* length=*/ 1'000 , null_probability));
470+ all_columns.insert (all_columns.end (), columns.begin (), columns.end ());
471+ }
472+ return all_columns;
473+ }
474+
372475Result<std::shared_ptr<RecordBatch>> BatchFromColumns (const std::vector<Column>& columns,
373476 int64_t num_rows) {
374477 FieldVector fields;
@@ -443,7 +546,7 @@ Status DoMain(const std::string& out_dir) {
443546 };
444547
445548 {
446- // 1. Unencrypted files
549+ // 1. Unencrypted files for various write configurations
447550 // Write a Cartesian product of example batches x write configurations
448551 ARROW_ASSIGN_OR_RAISE (auto batches, Batches ());
449552 auto write_configs = GetWriteConfigurations ();
@@ -458,7 +561,34 @@ Status DoMain(const std::string& out_dir) {
458561 }
459562 }
460563 {
461- // 2. Encrypted files
564+ // 2. Unencrypted files for various column encodings
565+ // Write one file per (column, encoding) pair.
566+ ARROW_ASSIGN_OR_RAISE (auto columns, AllColumnsWithEncodings ());
567+
568+ for (const auto & column : columns) {
569+ RETURN_NOT_OK (column.column .array ->ValidateFull ());
570+ ARROW_ASSIGN_OR_RAISE (
571+ auto batch, BatchFromColumn (column.column , column.column .array ->length ()));
572+ ARROW_ASSIGN_OR_RAISE (auto table, Table::FromRecordBatches ({batch}));
573+
574+ for (const auto encoding : column.encodings ) {
575+ auto w_props_builder = MakePropertiesBuilder ();
576+ if (encoding == Encoding::RLE_DICTIONARY) {
577+ // RLE_DICTIONARY is enabled through enable_dictionary() rather than
578+ // encoding(), also increase the dictionary page size limit as we
579+ // generate data with a typically high cardinality.
580+ w_props_builder.enable_dictionary ()->dictionary_pagesize_limit (1'000'000 );
581+ } else {
582+ w_props_builder.disable_dictionary ()->encoding (encoding);
583+ }
584+ auto w_props = w_props_builder.build ();
585+ auto a_props = MakeArrowPropertiesBuilder ().build ();
586+ RETURN_NOT_OK (write_sample (table, WriteConfig{w_props, a_props}));
587+ }
588+ }
589+ }
590+ {
591+ // 3. Encrypted files
462592 // Use a single batch and write it using different configurations
463593 ARROW_ASSIGN_OR_RAISE (auto batch, BatchForEncryption ());
464594 auto write_configs = GetEncryptedWriteConfigurations (*batch->schema ());
0 commit comments