Skip to content

Commit 713c57a

Browse files
authored
GH-48545: [C++][Parquet][CI] Add more encodings to fuzzing seed corpus (#48546)
### Rationale for this change

Ensure that the seed corpus contains all supported combinations of physical types and encodings, with varying amounts of nulls and distributions of values.

Also give more meaningful names to seed corpus files, to ease manual inspection (this won't affect the fuzzer since we rename those files to the SHA-256 of their contents).

### Are these changes tested?

Manually, and later by OSS-Fuzz.

### Are there any user-facing changes?

No.

* GitHub Issue: #48545

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 30809c6 commit 713c57a

File tree

1 file changed

+182
-21
lines changed

1 file changed

+182
-21
lines changed

cpp/src/parquet/arrow/generate_fuzz_corpus.cc

Lines changed: 182 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@
2222
#include <cstdlib>
2323
#include <functional>
2424
#include <iostream>
25+
#include <limits>
2526
#include <memory>
2627
#include <sstream>
2728
#include <string>
29+
#include <utility>
2830
#include <vector>
2931

3032
#include "arrow/array.h"
@@ -49,9 +51,11 @@ using ::arrow::internal::CreateDir;
4951
using ::arrow::internal::PlatformFilename;
5052
using ::arrow::util::Float16;
5153
using ::parquet::ArrowWriterProperties;
54+
using ::parquet::Encoding;
5255
using ::parquet::WriterProperties;
5356

5457
struct WriteConfig {
58+
std::string name;
5559
std::shared_ptr<WriterProperties> writer_properties;
5660
std::shared_ptr<ArrowWriterProperties> arrow_writer_properties;
5761
};
@@ -74,6 +78,13 @@ struct Column {
7478
}
7579
};
7680

81+
using EncodingVector = std::vector<Encoding::type>;
82+
83+
struct ColumnWithEncodings {
84+
Column column;
85+
EncodingVector encodings;
86+
};
87+
7788
std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
7889
std::string name) {
7990
return field(std::move(name), array->type(), /*nullable=*/array->null_count() != 0);
@@ -135,13 +146,13 @@ std::vector<WriteConfig> GetWriteConfigurations() {
135146
// clang-format on
136147

137148
std::vector<WriteConfig> configs;
138-
configs.push_back({w_uncompressed, a_default});
139-
configs.push_back({w_brotli, a_default});
140-
configs.push_back({w_gzip, a_default});
141-
configs.push_back({w_lz4, a_default});
142-
configs.push_back({w_snappy, a_default});
143-
configs.push_back({w_zstd, a_default});
144-
configs.push_back({w_pages_v1, a_default});
149+
configs.push_back({"uncompressed", w_uncompressed, a_default});
150+
configs.push_back({"brotli", w_brotli, a_default});
151+
configs.push_back({"gzip", w_gzip, a_default});
152+
configs.push_back({"lz4", w_lz4, a_default});
153+
configs.push_back({"snappy", w_snappy, a_default});
154+
configs.push_back({"zstd", w_zstd, a_default});
155+
configs.push_back({"v1_data_page", w_pages_v1, a_default});
145156
return configs;
146157
}
147158

@@ -158,12 +169,14 @@ std::vector<WriteConfig> GetEncryptedWriteConfigurations(const ::arrow::Schema&
158169
return builder;
159170
};
160171

161-
std::vector<std::shared_ptr<FileEncryptionProperties>> file_encryptions;
172+
std::vector<std::tuple<std::string, std::shared_ptr<FileEncryptionProperties>>>
173+
file_encryptions;
162174

163175
// Uniform encryption
164-
file_encryptions.push_back(file_encryption_builder().build());
176+
file_encryptions.push_back({"uniform", file_encryption_builder().build()});
165177
// Uniform encryption with plaintext footer
166-
file_encryptions.push_back(file_encryption_builder().set_plaintext_footer()->build());
178+
file_encryptions.push_back({"uniform_plaintext_footer",
179+
file_encryption_builder().set_plaintext_footer()->build()});
167180
// Columns encrypted with individual keys
168181
{
169182
ColumnPathToEncryptionPropertiesMap column_map;
@@ -174,7 +187,8 @@ std::vector<WriteConfig> GetEncryptedWriteConfigurations(const ::arrow::Schema&
174187
}
175188
ARROW_DCHECK_NE(column_map.size(), 0);
176189
file_encryptions.push_back(
177-
file_encryption_builder().encrypted_columns(std::move(column_map))->build());
190+
{"column_keys",
191+
file_encryption_builder().encrypted_columns(std::move(column_map))->build()});
178192
}
179193
// Unencrypted columns
180194
{
@@ -184,15 +198,16 @@ std::vector<WriteConfig> GetEncryptedWriteConfigurations(const ::arrow::Schema&
184198
}
185199
ARROW_DCHECK_NE(column_map.size(), 0);
186200
file_encryptions.push_back(
187-
file_encryption_builder().encrypted_columns(std::move(column_map))->build());
201+
{"unencrypted_columns",
202+
file_encryption_builder().encrypted_columns(std::move(column_map))->build()});
188203
}
189204

190205
auto a_default = MakeArrowPropertiesBuilder().build();
191206

192207
std::vector<WriteConfig> configs;
193-
for (const auto& file_encryption : file_encryptions) {
208+
for (auto [name, file_encryption] : file_encryptions) {
194209
auto writer_properties = MakePropertiesBuilder().encryption(file_encryption)->build();
195-
configs.push_back({writer_properties, a_default});
210+
configs.push_back({name, writer_properties, a_default});
196211
}
197212
return configs;
198213
}
@@ -369,6 +384,110 @@ Result<std::vector<Column>> ExampleColumns(int32_t length, double null_probabili
369384
return columns;
370385
}
371386

387+
// Lowest value representable by T, i.e. std::numeric_limits<T>::lowest().
template <class T>
constexpr T kMin{std::numeric_limits<T>::lowest()};

// Largest value representable by T, i.e. std::numeric_limits<T>::max().
template <class T>
constexpr T kMax{std::numeric_limits<T>::max()};
391+
392+
// Generate columns for physical types along with their supported encodings
393+
// Build the columns used for the per-encoding samples.
//
// Each returned entry pairs a randomly generated column of `length` values with
// the list of encodings it should be written with. `null_probability` is
// forwarded to every random generator call. The random generator is created
// with a fixed seed.
//
// Returns an error if viewing the generated INT64 data as timestamps fails.
Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings(
    int32_t length, double null_probability = 0.2) {
  // Encodings to exercise, grouped by the kind of column they are paired with
  const EncodingVector boolean_encodings{Encoding::PLAIN, Encoding::RLE};
  const EncodingVector integer_encodings{Encoding::PLAIN, Encoding::RLE_DICTIONARY,
                                         Encoding::DELTA_BINARY_PACKED,
                                         Encoding::BYTE_STREAM_SPLIT};
  const EncodingVector int96_encodings{Encoding::PLAIN};
  const EncodingVector floating_encodings{Encoding::PLAIN, Encoding::RLE_DICTIONARY,
                                          Encoding::BYTE_STREAM_SPLIT};
  const EncodingVector flba_encodings{Encoding::PLAIN, Encoding::RLE_DICTIONARY,
                                      Encoding::DELTA_BYTE_ARRAY,
                                      Encoding::BYTE_STREAM_SPLIT};
  const EncodingVector byte_array_encodings{Encoding::PLAIN, Encoding::RLE_DICTIONARY,
                                            Encoding::DELTA_LENGTH_BYTE_ARRAY,
                                            Encoding::DELTA_BYTE_ARRAY};

  std::vector<ColumnWithEncodings> result;

  random::RandomArrayGenerator rng(42);
  auto next_name = Column::NameGenerator();

  // Append one column. The values are produced lazily through `make_values` so
  // that the column name is requested first; braced initialization evaluates
  // its elements left to right.
  auto append = [&](auto&& make_values, const EncodingVector& encodings) {
    result.push_back({{next_name(), make_values()}, encodings});
  };

  // Booleans, from all-false to almost-all-true
  for (const double true_probability : {0.0, 0.001, 0.01, 0.5, 0.999}) {
    append([&] { return rng.Boolean(length, true_probability, null_probability); },
           boolean_encodings);
  }

  // Integers: narrow and full value ranges, to trigger different delta encoding modes
  append([&] { return rng.Int32(length, -100, 100, null_probability); },
         integer_encodings);
  append(
      [&] { return rng.Int32(length, kMin<int32_t>, kMax<int32_t>, null_probability); },
      integer_encodings);
  append([&] { return rng.Int64(length, -100'000, 100'000, null_probability); },
         integer_encodings);
  append(
      [&] { return rng.Int64(length, kMin<int64_t>, kMax<int64_t>, null_probability); },
      integer_encodings);

  // Timestamps, one column per unit, paired with the INT96 encoding list.
  // This won't necessarily span all 96 bits of precision, but PLAIN encoding allows
  // the fuzzer to do its thing on the values.
  for (const auto unit :
       {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) {
    auto raw_values =
        rng.Int64(length, kMin<int64_t>, kMax<int64_t>, null_probability);
    ARROW_ASSIGN_OR_RAISE(auto timestamps, raw_values->View(timestamp(unit)));
    result.push_back({{next_name(), timestamps}, int96_encodings});
  }

  // Floating point: unit range and full range, in both precisions.
  // NOTE: will need to vary NaNs if there are encodings for which that matters (ALP?)
  append([&] { return rng.Float32(length, -1.0, 1.0, null_probability); },
         floating_encodings);
  append([&] { return rng.Float32(length, kMin<float>, kMax<float>, null_probability); },
         floating_encodings);
  append([&] { return rng.Float64(length, -1.0, 1.0, null_probability); },
         floating_encodings);
  append(
      [&] { return rng.Float64(length, kMin<double>, kMax<double>, null_probability); },
      floating_encodings);

  // For FLBA
  append(
      [&] {
        return rng.Float16(length, Float16(-1.0), Float16(1.0), null_probability);
      },
      flba_encodings);

  // For BYTE_ARRAY (vary lengths and repetitions to trigger encoding modes)
  append(
      [&] {
        return rng.String(length, /*min_length=*/0, /*max_length=*/20, null_probability);
      },
      byte_array_encodings);
  append(
      [&] {
        return rng.String(length, /*min_length=*/12, /*max_length=*/14,
                          null_probability);
      },
      byte_array_encodings);
  append(
      [&] {
        return rng.String(length, /*min_length=*/100, /*max_length=*/200,
                          null_probability);
      },
      byte_array_encodings);
  // NOTE(review): `length / 50` and `length / 100` are zero when `length` is smaller
  // than the divisor -- confirm StringWithRepeats accepts a zero unique count before
  // calling this function with small lengths.
  append(
      [&] {
        return rng.StringWithRepeats(length, /*unique=*/length / 50, /*min_length=*/0,
                                     /*max_length=*/20, null_probability);
      },
      byte_array_encodings);
  append(
      [&] {
        return rng.StringWithRepeats(length, /*unique=*/length / 100,
                                     /*min_length=*/12,
                                     /*max_length=*/14, null_probability);
      },
      byte_array_encodings);

  return result;
}
479+
480+
// Build the full set of encoding sample columns.
//
// Calls the two-argument overload once per null probability (from no nulls to
// all nulls) with 1000 values per column, and concatenates the results in that
// order. Any error from the overload is returned as-is.
Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings() {
  constexpr int32_t kNumValues = 1'000;

  std::vector<ColumnWithEncodings> merged;
  for (const double null_probability : {0.0, 0.01, 0.5, 1.0}) {
    ARROW_ASSIGN_OR_RAISE(auto generated,
                          AllColumnsWithEncodings(kNumValues, null_probability));
    for (const auto& entry : generated) {
      merged.push_back(entry);
    }
  }
  return merged;
}
490+
372491
Result<std::shared_ptr<RecordBatch>> BatchFromColumns(const std::vector<Column>& columns,
373492
int64_t num_rows) {
374493
FieldVector fields;
@@ -425,13 +544,25 @@ Status DoMain(const std::string& out_dir) {
425544
RETURN_NOT_OK(CreateDir(dir_fn));
426545

427546
int sample_num = 1;
428-
auto sample_name = [&]() -> std::string {
429-
return "pq-table-" + std::to_string(sample_num++);
547+
auto sample_file_name = [&](const std::string& name = "") -> std::string {
548+
std::stringstream ss;
549+
if (!name.empty()) {
550+
ss << name << "-";
551+
}
552+
ss << sample_num++ << ".pq";
553+
return std::move(ss).str();
430554
};
431555

432-
auto write_sample = [&](const std::shared_ptr<Table>& table,
433-
const WriteConfig& config) -> Status {
434-
ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name()));
556+
auto write_sample = [&](const std::shared_ptr<Table>& table, const WriteConfig& config,
557+
std::string name = "") -> Status {
558+
if (name.empty() && table->num_columns() == 1) {
559+
name = table->schema()->field(0)->type()->name();
560+
}
561+
if (!name.empty()) {
562+
name += "-";
563+
}
564+
name += config.name;
565+
ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_file_name(name)));
435566
std::cerr << sample_fn.ToString() << std::endl;
436567
ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString()));
437568
// Emit several row groups
@@ -443,7 +574,7 @@ Status DoMain(const std::string& out_dir) {
443574
};
444575

445576
{
446-
// 1. Unencrypted files
577+
// 1. Unencrypted files for various write configurations
447578
// Write a Cartesian product of example batches x write configurations
448579
ARROW_ASSIGN_OR_RAISE(auto batches, Batches());
449580
auto write_configs = GetWriteConfigurations();
@@ -458,7 +589,37 @@ Status DoMain(const std::string& out_dir) {
458589
}
459590
}
460591
{
461-
// 2. Encrypted files
592+
// 2. Unencrypted files for various column encodings
593+
// Write one file per (column, encoding) pair.
594+
ARROW_ASSIGN_OR_RAISE(auto columns, AllColumnsWithEncodings());
595+
596+
for (const auto& column : columns) {
597+
RETURN_NOT_OK(column.column.array->ValidateFull());
598+
ARROW_ASSIGN_OR_RAISE(
599+
auto batch, BatchFromColumn(column.column, column.column.array->length()));
600+
ARROW_ASSIGN_OR_RAISE(auto table, Table::FromRecordBatches({batch}));
601+
602+
for (const auto encoding : column.encodings) {
603+
auto w_props_builder = MakePropertiesBuilder();
604+
if (encoding == Encoding::RLE_DICTIONARY) {
605+
// RLE_DICTIONARY is enabled through enable_dictionary() rather than
606+
// encoding(), also increase the dictionary page size limit as we
607+
// generate data with a typically high cardinality.
608+
w_props_builder.enable_dictionary()->dictionary_pagesize_limit(1'000'000);
609+
} else {
610+
w_props_builder.disable_dictionary()->encoding(encoding);
611+
}
612+
auto w_props = w_props_builder.build();
613+
// Ensure that we generate INT96 Parquet data when given a timestamp column
614+
auto a_props =
615+
MakeArrowPropertiesBuilder().enable_deprecated_int96_timestamps()->build();
616+
auto config_name = ::parquet::EncodingToString(encoding);
617+
RETURN_NOT_OK(write_sample(table, WriteConfig{config_name, w_props, a_props}));
618+
}
619+
}
620+
}
621+
{
622+
// 3. Encrypted files
462623
// Use a single batch and write it using different configurations
463624
ARROW_ASSIGN_OR_RAISE(auto batch, BatchForEncryption());
464625
auto write_configs = GetEncryptedWriteConfigurations(*batch->schema());

0 commit comments

Comments
 (0)