Commit 34e293c

GH-48545: [C++][Parquet][CI] Add more encodings to fuzzing seed corpus

1 parent 016e3cf
1 file changed: cpp/src/parquet/arrow/generate_fuzz_corpus.cc (+132, -2)
@@ -22,6 +22,7 @@
 #include <cstdlib>
 #include <functional>
 #include <iostream>
+#include <limits>
 #include <memory>
 #include <sstream>
 #include <string>
@@ -49,6 +50,7 @@ using ::arrow::internal::CreateDir;
 using ::arrow::internal::PlatformFilename;
 using ::arrow::util::Float16;
 using ::parquet::ArrowWriterProperties;
+using ::parquet::Encoding;
 using ::parquet::WriterProperties;
 
 struct WriteConfig {
@@ -74,6 +76,13 @@ struct Column {
   }
 };
 
+using EncodingVector = std::vector<Encoding::type>;
+
+struct ColumnWithEncodings {
+  Column column;
+  EncodingVector encodings;
+};
+
 std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
                                      std::string name) {
   return field(std::move(name), array->type(), /*nullable=*/array->null_count() != 0);
@@ -369,6 +378,100 @@ Result<std::vector<Column>> ExampleColumns(int32_t length, double null_probabili
   return columns;
 }
 
+template <typename T>
+constexpr auto kMin = std::numeric_limits<T>::lowest();
+template <typename T>
+constexpr auto kMax = std::numeric_limits<T>::max();
+
+// Generate columns for physical types along with their supported encodings
+Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings(
+    int32_t length, double null_probability = 0.2) {
+  const EncodingVector kIntEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
+                                        Encoding::DELTA_BINARY_PACKED,
+                                        Encoding::BYTE_STREAM_SPLIT};
+  const EncodingVector kFloatEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
+                                          Encoding::BYTE_STREAM_SPLIT};
+  const EncodingVector kBooleanEncodings = {Encoding::PLAIN, Encoding::RLE};
+  const EncodingVector kByteArrayEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
+                                              Encoding::DELTA_LENGTH_BYTE_ARRAY,
+                                              Encoding::DELTA_BYTE_ARRAY};
+  const EncodingVector kFixedLenByteArrayEncodings = {
+      Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::DELTA_BYTE_ARRAY,
+      Encoding::BYTE_STREAM_SPLIT};
+
+  std::vector<ColumnWithEncodings> columns;
+
+  random::RandomArrayGenerator gen(42);
+  auto name_gen = Column::NameGenerator();
+
+  for (const double true_probability : {0.0, 0.001, 0.01, 0.5, 0.999}) {
+    columns.push_back(
+        {{name_gen(), gen.Boolean(length, true_probability, null_probability)},
+         kBooleanEncodings});
+  }
+
+  columns.push_back(
+      {{name_gen(), gen.Int32(length, -100, 100, null_probability)}, kIntEncodings});
+  columns.push_back(
+      {{name_gen(), gen.Int32(length, kMin<int32_t>, kMax<int32_t>, null_probability)},
+       kIntEncodings});
+  columns.push_back({{name_gen(), gen.Int64(length, -100'000, 100'000, null_probability)},
+                     kIntEncodings});
+  columns.push_back(
+      {{name_gen(), gen.Int64(length, kMin<int64_t>, kMax<int64_t>, null_probability)},
+       kIntEncodings});
+
+  // XXX should we add INT96? It's deprecated, only supports PLAIN and is featured in
+  // the parquet-testing files.
+
+  // NOTE: will need to vary NaNs if there are encodings for which that matters (ALP?)
+  columns.push_back(
+      {{name_gen(), gen.Float32(length, -1.0, 1.0, null_probability)}, kFloatEncodings});
+  columns.push_back(
+      {{name_gen(), gen.Float32(length, kMin<float>, kMax<float>, null_probability)},
+       kFloatEncodings});
+  columns.push_back(
+      {{name_gen(), gen.Float64(length, -1.0, 1.0, null_probability)}, kFloatEncodings});
+  columns.push_back(
+      {{name_gen(), gen.Float64(length, kMin<double>, kMax<double>, null_probability)},
+       kFloatEncodings});
+
+  columns.push_back(
+      {{name_gen(), gen.Float16(length, Float16(-1.0), Float16(1.0), null_probability)},
+       kFixedLenByteArrayEncodings});
+
+  columns.push_back({{name_gen(), gen.String(length, /*min_length=*/0,
+                                             /*max_length=*/20, null_probability)},
+                     kByteArrayEncodings});
+  columns.push_back({{name_gen(), gen.String(length, /*min_length=*/12,
+                                             /*max_length=*/14, null_probability)},
+                     kByteArrayEncodings});
+  columns.push_back({{name_gen(), gen.String(length, /*min_length=*/100,
+                                             /*max_length=*/200, null_probability)},
+                     kByteArrayEncodings});
+  columns.push_back({{name_gen(), gen.StringWithRepeats(
+                                      length, /*unique=*/length / 50, /*min_length=*/0,
+                                      /*max_length=*/20, null_probability)},
+                     kByteArrayEncodings});
+  columns.push_back({{name_gen(), gen.StringWithRepeats(
+                                      length, /*unique=*/length / 100, /*min_length=*/12,
+                                      /*max_length=*/14, null_probability)},
+                     kByteArrayEncodings});
+
+  return columns;
+}
+
+Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings() {
+  std::vector<ColumnWithEncodings> all_columns;
+
+  for (const double null_probability : {0.0, 0.01, 0.5, 1.0}) {
+    ARROW_ASSIGN_OR_RAISE(auto columns,
+                          AllColumnsWithEncodings(/*length=*/1'000, null_probability));
+    all_columns.insert(all_columns.end(), columns.begin(), columns.end());
+  }
+  return all_columns;
+}
+
 Result<std::shared_ptr<RecordBatch>> BatchFromColumns(const std::vector<Column>& columns,
                                                       int64_t num_rows) {
   FieldVector fields;
@@ -443,7 +546,7 @@ Status DoMain(const std::string& out_dir) {
   };
 
   {
-    // 1. Unencrypted files
+    // 1. Unencrypted files for various write configurations
     // Write a cardinal product of example batches x write configurations
     ARROW_ASSIGN_OR_RAISE(auto batches, Batches());
     auto write_configs = GetWriteConfigurations();
@@ -458,7 +561,34 @@ Status DoMain(const std::string& out_dir) {
     }
   }
   {
-    // 2. Encrypted files
+    // 2. Unencrypted files for various column encodings
+    // Write one file per (column, encoding) pair.
+    ARROW_ASSIGN_OR_RAISE(auto columns, AllColumnsWithEncodings());
+
+    for (const auto& column : columns) {
+      RETURN_NOT_OK(column.column.array->ValidateFull());
+      ARROW_ASSIGN_OR_RAISE(
+          auto batch, BatchFromColumn(column.column, column.column.array->length()));
+      ARROW_ASSIGN_OR_RAISE(auto table, Table::FromRecordBatches({batch}));
+
+      for (const auto encoding : column.encodings) {
+        auto w_props_builder = MakePropertiesBuilder();
+        if (encoding == Encoding::RLE_DICTIONARY) {
+          // RLE_DICTIONARY is enabled through enable_dictionary() rather than
+          // encoding(), also increase the dictionary page size limit as we
+          // generate data with a typically high cardinality.
+          w_props_builder.enable_dictionary()->dictionary_pagesize_limit(1'000'000);
+        } else {
+          w_props_builder.disable_dictionary()->encoding(encoding);
+        }
+        auto w_props = w_props_builder.build();
+        auto a_props = MakeArrowPropertiesBuilder().build();
+        RETURN_NOT_OK(write_sample(table, WriteConfig{w_props, a_props}));
+      }
+    }
+  }
+  {
+    // 3. Encrypted files
     // Use a single batch and write it using different configurations
     ARROW_ASSIGN_OR_RAISE(auto batch, BatchForEncryption());
     auto write_configs = GetEncryptedWriteConfigurations(*batch->schema());
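For context on the properties setup in hunk 2 above: dictionary encoding (RLE_DICTIONARY) is requested through WriterProperties::Builder::enable_dictionary() rather than encoding(), while every other encoding is set directly after disabling dictionary encoding. Below is a minimal, self-contained sketch of that pattern outside the corpus generator; it is not part of this commit, and the helper name WriteTableWithEncoding and its path argument are hypothetical.

// Sketch only: write one Arrow table to a Parquet file with a chosen column
// encoding, mirroring how the generator builds WriterProperties per encoding.
#include <memory>
#include <string>

#include "arrow/api.h"
#include "arrow/io/file.h"
#include "parquet/arrow/writer.h"
#include "parquet/properties.h"

arrow::Status WriteTableWithEncoding(const arrow::Table& table,
                                     parquet::Encoding::type encoding,
                                     const std::string& path) {
  parquet::WriterProperties::Builder builder;
  if (encoding == parquet::Encoding::RLE_DICTIONARY) {
    // Dictionary encoding is enabled via enable_dictionary(), not encoding();
    // a larger dictionary page size limit keeps high-cardinality columns
    // dictionary-encoded instead of falling back to PLAIN.
    builder.enable_dictionary()->dictionary_pagesize_limit(1'000'000);
  } else {
    // Other encodings are set as the default column encoding, with dictionary
    // encoding disabled so the requested encoding actually takes effect.
    builder.disable_dictionary()->encoding(encoding);
  }
  std::shared_ptr<parquet::WriterProperties> props = builder.build();

  // Write the whole table as a single row group to the given path.
  ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::FileOutputStream::Open(path));
  return parquet::arrow::WriteTable(table, arrow::default_memory_pool(), sink,
                                    /*chunk_size=*/table.num_rows(), props);
}

In the generator itself, MakePropertiesBuilder() and write_sample() wrap the same choice, so each (column, encoding) pair ends up in its own seed file.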
