Skip to content

Commit 1413191

Browse files
committed
GH-48545: [C++][Parquet][CI] Add more encodings to fuzzing seed corpus
1 parent 4fc9f9e commit 1413191

File tree

1 file changed

+121
-2
lines changed

1 file changed

+121
-2
lines changed

cpp/src/parquet/arrow/generate_fuzz_corpus.cc

Lines changed: 121 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <cstdlib>
2323
#include <functional>
2424
#include <iostream>
25+
#include <limits>
2526
#include <memory>
2627
#include <sstream>
2728
#include <string>
@@ -49,6 +50,7 @@ using ::arrow::internal::CreateDir;
4950
using ::arrow::internal::PlatformFilename;
5051
using ::arrow::util::Float16;
5152
using ::parquet::ArrowWriterProperties;
53+
using ::parquet::Encoding;
5254
using ::parquet::WriterProperties;
5355

5456
struct WriteConfig {
@@ -74,6 +76,11 @@ struct Column {
7476
}
7577
};
7678

79+
struct ColumnWithEncodings {
80+
Column column;
81+
std::vector<Encoding::type> encodings;
82+
};
83+
7784
std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
7885
std::string name) {
7986
return field(std::move(name), array->type(), /*nullable=*/array->null_count() != 0);
@@ -378,6 +385,98 @@ Result<std::vector<Column>> ExampleColumns(int32_t length, double null_probabili
378385
return columns;
379386
}
380387

388+
// Most negative finite value representable by T. Uses `lowest()` rather than
// `min()` because, for floating-point types, `min()` is the smallest positive
// normal value, not the most negative one.
template <typename T>
constexpr auto kMin = std::numeric_limits<T>::lowest();
// Largest finite value representable by T.
template <typename T>
constexpr auto kMax = std::numeric_limits<T>::max();
392+
393+
// Generate columns for physical types along with their supported encodings
394+
Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings(
395+
int32_t length, double null_probability = 0.2) {
396+
// TODO add RLE_DICTIONARY?
397+
const std::vector<Encoding::type> kIntEncodings = {
398+
Encoding::PLAIN, Encoding::DELTA_BINARY_PACKED, Encoding::BYTE_STREAM_SPLIT};
399+
const std::vector<Encoding::type> kFloatEncodings = {Encoding::PLAIN,
400+
Encoding::BYTE_STREAM_SPLIT};
401+
const std::vector<Encoding::type> kBooleanEncodings = {Encoding::PLAIN, Encoding::RLE};
402+
const std::vector<Encoding::type> kByteArrayEncodings = {
403+
Encoding::PLAIN, Encoding::DELTA_LENGTH_BYTE_ARRAY, Encoding::DELTA_BYTE_ARRAY};
404+
const std::vector<Encoding::type> kFixedLenByteArrayEncodings = {
405+
Encoding::PLAIN, Encoding::DELTA_BYTE_ARRAY, Encoding::BYTE_STREAM_SPLIT};
406+
407+
std::vector<ColumnWithEncodings> columns;
408+
409+
random::RandomArrayGenerator gen(42);
410+
auto name_gen = Column::NameGenerator();
411+
412+
for (const double true_probability : {0.0, 0.001, 0.01, 0.5, 0.999}) {
413+
columns.push_back(
414+
{{name_gen(), gen.Boolean(length, true_probability, null_probability)},
415+
kBooleanEncodings});
416+
}
417+
418+
columns.push_back(
419+
{{name_gen(), gen.Int32(length, -100, 100, null_probability)}, kIntEncodings});
420+
columns.push_back(
421+
{{name_gen(), gen.Int32(length, kMin<int32_t>, kMax<int32_t>, null_probability)},
422+
kIntEncodings});
423+
columns.push_back({{name_gen(), gen.Int64(length, -100'000, 100'000, null_probability)},
424+
kIntEncodings});
425+
columns.push_back(
426+
{{name_gen(), gen.Int64(length, kMin<int64_t>, kMax<int64_t>, null_probability)},
427+
kIntEncodings});
428+
429+
// XXX should we add INT96? It's deprecated, only supports PLAIN and is featured in
430+
// the parquet-testing files.
431+
432+
// NOTE: will need to vary NaNs if there are encodings for which that matters (ALP?)
433+
columns.push_back(
434+
{{name_gen(), gen.Float32(length, -1.0, 1.0, null_probability)}, kFloatEncodings});
435+
columns.push_back(
436+
{{name_gen(), gen.Float32(length, kMin<float>, kMax<float>, null_probability)},
437+
kFloatEncodings});
438+
columns.push_back(
439+
{{name_gen(), gen.Float64(length, -1.0, 1.0, null_probability)}, kFloatEncodings});
440+
columns.push_back(
441+
{{name_gen(), gen.Float64(length, kMin<double>, kMax<double>, null_probability)},
442+
kFloatEncodings});
443+
444+
columns.push_back(
445+
{{name_gen(), gen.Float16(length, Float16(-1.0), Float16(1.0), null_probability)},
446+
kFixedLenByteArrayEncodings});
447+
448+
columns.push_back({{name_gen(), gen.String(length, /*min_length=*/0,
449+
/*max_length=*/20, null_probability)},
450+
kByteArrayEncodings});
451+
columns.push_back({{name_gen(), gen.String(length, /*min_length=*/12,
452+
/*max_length=*/14, null_probability)},
453+
kByteArrayEncodings});
454+
columns.push_back({{name_gen(), gen.String(length, /*min_length=*/100,
455+
/*max_length=*/200, null_probability)},
456+
kByteArrayEncodings});
457+
columns.push_back({{name_gen(), gen.StringWithRepeats(
458+
length, /*unique=*/length / 50, /*min_length=*/0,
459+
/*max_length=*/20, null_probability)},
460+
kByteArrayEncodings});
461+
columns.push_back({{name_gen(), gen.StringWithRepeats(
462+
length, /*unique=*/length / 100, /*min_length=*/12,
463+
/*max_length=*/14, null_probability)},
464+
kByteArrayEncodings});
465+
466+
return columns;
467+
}
468+
469+
// Generate the encoding test columns over a range of null densities.
//
// Concatenates, in order, the columns produced by
// AllColumnsWithEncodings(/*length=*/1'000, p) for p in {0.0, 0.01, 0.5, 1.0}
// (no nulls, a few nulls, half nulls, all nulls). Returns the first error
// encountered, if any.
Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings() {
  std::vector<ColumnWithEncodings> all_columns;

  for (const double null_probability : {0.0, 0.01, 0.5, 1.0}) {
    ARROW_ASSIGN_OR_RAISE(auto columns,
                          AllColumnsWithEncodings(/*length=*/1'000, null_probability));
    // `columns` is discarded at the end of this iteration, so hand its
    // elements over instead of copying them.
    for (auto& column : columns) {
      all_columns.push_back(std::move(column));
    }
  }
  return all_columns;
}
479+
381480
Result<std::shared_ptr<RecordBatch>> BatchFromColumns(const std::vector<Column>& columns,
382481
int64_t num_rows) {
383482
FieldVector fields;
@@ -452,7 +551,7 @@ Status DoMain(const std::string& out_dir) {
452551
};
453552

454553
{
455-
// 1. Unencrypted files
554+
// 1. Unencrypted files for various write configurations
456555
// Write a Cartesian product of example batches x write configurations
457556
ARROW_ASSIGN_OR_RAISE(auto batches, Batches());
458557
auto write_configs = GetWriteConfigurations();
@@ -467,7 +566,27 @@ Status DoMain(const std::string& out_dir) {
467566
}
468567
}
469568
{
470-
// 2. Encrypted files
569+
// 2. Unencrypted files for various column encodings
570+
// Write one file per (column, encoding) pair.
571+
ARROW_ASSIGN_OR_RAISE(auto columns, AllColumnsWithEncodings());
572+
573+
for (const auto& column : columns) {
574+
RETURN_NOT_OK(column.column.array->ValidateFull());
575+
ARROW_ASSIGN_OR_RAISE(
576+
auto batch, BatchFromColumn(column.column, column.column.array->length()));
577+
ARROW_ASSIGN_OR_RAISE(auto table, Table::FromRecordBatches({batch}));
578+
579+
for (const auto encoding : column.encodings) {
580+
// TODO also support dictionary encoding
581+
auto w_props =
582+
MakePropertiesBuilder().disable_dictionary()->encoding(encoding)->build();
583+
auto a_props = MakeArrowPropertiesBuilder().build();
584+
RETURN_NOT_OK(write_sample(table, WriteConfig{w_props, a_props}));
585+
}
586+
}
587+
}
588+
{
589+
// 3. Encrypted files
471590
// Use a single batch and write it using different configurations
472591
ARROW_ASSIGN_OR_RAISE(auto batch, BatchForEncryption());
473592
auto write_configs = GetEncryptedWriteConfigurations(*batch->schema());

0 commit comments

Comments
 (0)