2222#include < cstdlib>
2323#include < functional>
2424#include < iostream>
25+ #include < limits>
2526#include < memory>
2627#include < sstream>
2728#include < string>
@@ -49,6 +50,7 @@ using ::arrow::internal::CreateDir;
4950using ::arrow::internal::PlatformFilename;
5051using ::arrow::util::Float16;
5152using ::parquet::ArrowWriterProperties;
53+ using ::parquet::Encoding;
5254using ::parquet::WriterProperties;
5355
5456struct WriteConfig {
@@ -74,6 +76,11 @@ struct Column {
7476 }
7577};
7678
79+ struct ColumnWithEncodings {
80+ Column column;
81+ std::vector<Encoding::type> encodings;
82+ };
83+
7784std::shared_ptr<Field> FieldForArray (const std::shared_ptr<Array>& array,
7885 std::string name) {
7986 return field (std::move (name), array->type (), /* nullable=*/ array->null_count () != 0 );
@@ -378,6 +385,98 @@ Result<std::vector<Column>> ExampleColumns(int32_t length, double null_probabili
378385 return columns;
379386}
380387
// Lowest finite value representable by T. For floating-point types this is
// the most negative value, not the smallest positive one.
template <typename T>
constexpr T kMin{std::numeric_limits<T>::lowest()};
// Largest finite value representable by T.
template <typename T>
constexpr T kMax{std::numeric_limits<T>::max()};
392+
393+ // Generate columns for physical types along with their supported encodings
394+ Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings (
395+ int32_t length, double null_probability = 0.2 ) {
396+ // TODO add RLE_DICTIONARY?
397+ const std::vector<Encoding::type> kIntEncodings = {
398+ Encoding::PLAIN, Encoding::DELTA_BINARY_PACKED, Encoding::BYTE_STREAM_SPLIT};
399+ const std::vector<Encoding::type> kFloatEncodings = {Encoding::PLAIN,
400+ Encoding::BYTE_STREAM_SPLIT};
401+ const std::vector<Encoding::type> kBooleanEncodings = {Encoding::PLAIN, Encoding::RLE};
402+ const std::vector<Encoding::type> kByteArrayEncodings = {
403+ Encoding::PLAIN, Encoding::DELTA_LENGTH_BYTE_ARRAY, Encoding::DELTA_BYTE_ARRAY};
404+ const std::vector<Encoding::type> kFixedLenByteArrayEncodings = {
405+ Encoding::PLAIN, Encoding::DELTA_BYTE_ARRAY, Encoding::BYTE_STREAM_SPLIT};
406+
407+ std::vector<ColumnWithEncodings> columns;
408+
409+ random::RandomArrayGenerator gen (42 );
410+ auto name_gen = Column::NameGenerator ();
411+
412+ for (const double true_probability : {0.0 , 0.001 , 0.01 , 0.5 , 0.999 }) {
413+ columns.push_back (
414+ {{name_gen (), gen.Boolean (length, true_probability, null_probability)},
415+ kBooleanEncodings });
416+ }
417+
418+ columns.push_back (
419+ {{name_gen (), gen.Int32 (length, -100 , 100 , null_probability)}, kIntEncodings });
420+ columns.push_back (
421+ {{name_gen (), gen.Int32 (length, kMin <int32_t >, kMax <int32_t >, null_probability)},
422+ kIntEncodings });
423+ columns.push_back ({{name_gen (), gen.Int64 (length, -100'000 , 100'000 , null_probability)},
424+ kIntEncodings });
425+ columns.push_back (
426+ {{name_gen (), gen.Int64 (length, kMin <int64_t >, kMax <int64_t >, null_probability)},
427+ kIntEncodings });
428+
429+ // XXX should we add INT96? It's deprecated, only supports PLAIN and is featured in
430+ // the parquet-testing files.
431+
432+ // NOTE: will need to vary NaNs if there are encodings for which that matters (ALP?)
433+ columns.push_back (
434+ {{name_gen (), gen.Float32 (length, -1.0 , 1.0 , null_probability)}, kFloatEncodings });
435+ columns.push_back (
436+ {{name_gen (), gen.Float32 (length, kMin <float >, kMax <float >, null_probability)},
437+ kFloatEncodings });
438+ columns.push_back (
439+ {{name_gen (), gen.Float64 (length, -1.0 , 1.0 , null_probability)}, kFloatEncodings });
440+ columns.push_back (
441+ {{name_gen (), gen.Float64 (length, kMin <double >, kMax <double >, null_probability)},
442+ kFloatEncodings });
443+
444+ columns.push_back (
445+ {{name_gen (), gen.Float16 (length, Float16 (-1.0 ), Float16 (1.0 ), null_probability)},
446+ kFixedLenByteArrayEncodings });
447+
448+ columns.push_back ({{name_gen (), gen.String (length, /* min_length=*/ 0 ,
449+ /* max_length=*/ 20 , null_probability)},
450+ kByteArrayEncodings });
451+ columns.push_back ({{name_gen (), gen.String (length, /* min_length=*/ 12 ,
452+ /* max_length=*/ 14 , null_probability)},
453+ kByteArrayEncodings });
454+ columns.push_back ({{name_gen (), gen.String (length, /* min_length=*/ 100 ,
455+ /* max_length=*/ 200 , null_probability)},
456+ kByteArrayEncodings });
457+ columns.push_back ({{name_gen (), gen.StringWithRepeats (
458+ length, /* unique=*/ length / 50 , /* min_length=*/ 0 ,
459+ /* max_length=*/ 20 , null_probability)},
460+ kByteArrayEncodings });
461+ columns.push_back ({{name_gen (), gen.StringWithRepeats (
462+ length, /* unique=*/ length / 100 , /* min_length=*/ 12 ,
463+ /* max_length=*/ 14 , null_probability)},
464+ kByteArrayEncodings });
465+
466+ return columns;
467+ }
468+
469+ Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings () {
470+ std::vector<ColumnWithEncodings> all_columns;
471+
472+ for (const double null_probability : {0.0 , 0.01 , 0.5 , 1.0 }) {
473+ ARROW_ASSIGN_OR_RAISE (auto columns,
474+ AllColumnsWithEncodings (/* length=*/ 1'000 , null_probability));
475+ all_columns.insert (all_columns.end (), columns.begin (), columns.end ());
476+ }
477+ return all_columns;
478+ }
479+
381480Result<std::shared_ptr<RecordBatch>> BatchFromColumns (const std::vector<Column>& columns,
382481 int64_t num_rows) {
383482 FieldVector fields;
@@ -452,7 +551,7 @@ Status DoMain(const std::string& out_dir) {
452551 };
453552
454553 {
455- // 1. Unencrypted files
554+ // 1. Unencrypted files for various write configurations
456555 // Write a Cartesian product of example batches x write configurations
457556 ARROW_ASSIGN_OR_RAISE (auto batches, Batches ());
458557 auto write_configs = GetWriteConfigurations ();
@@ -467,7 +566,27 @@ Status DoMain(const std::string& out_dir) {
467566 }
468567 }
469568 {
470- // 2. Encrypted files
569+ // 2. Unencrypted files for various column encodings
570+ // Write one file per (column, encoding) pair.
571+ ARROW_ASSIGN_OR_RAISE (auto columns, AllColumnsWithEncodings ());
572+
573+ for (const auto & column : columns) {
574+ RETURN_NOT_OK (column.column .array ->ValidateFull ());
575+ ARROW_ASSIGN_OR_RAISE (
576+ auto batch, BatchFromColumn (column.column , column.column .array ->length ()));
577+ ARROW_ASSIGN_OR_RAISE (auto table, Table::FromRecordBatches ({batch}));
578+
579+ for (const auto encoding : column.encodings ) {
580+ // TODO also support dictionary encoding
581+ auto w_props =
582+ MakePropertiesBuilder ().disable_dictionary ()->encoding (encoding)->build ();
583+ auto a_props = MakeArrowPropertiesBuilder ().build ();
584+ RETURN_NOT_OK (write_sample (table, WriteConfig{w_props, a_props}));
585+ }
586+ }
587+ }
588+ {
589+ // 3. Encrypted files
471590 // Use a single batch and write it using different configurations
472591 ARROW_ASSIGN_OR_RAISE (auto batch, BatchForEncryption ());
473592 auto write_configs = GetEncryptedWriteConfigurations (*batch->schema ());
0 commit comments