Skip to content

Commit 1619218

Browse files
committed
apacheGH-48545: [C++][Parquet][CI] Add more encodings to fuzzing seed corpus
1 parent 30809c6 commit 1619218

File tree

1 file changed

+182
-21
lines changed

1 file changed

+182
-21
lines changed

cpp/src/parquet/arrow/generate_fuzz_corpus.cc

Lines changed: 182 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@
2222
#include <cstdlib>
2323
#include <functional>
2424
#include <iostream>
25+
#include <limits>
2526
#include <memory>
2627
#include <sstream>
2728
#include <string>
29+
#include <utility>
2830
#include <vector>
2931

3032
#include "arrow/array.h"
@@ -49,9 +51,11 @@ using ::arrow::internal::CreateDir;
4951
using ::arrow::internal::PlatformFilename;
5052
using ::arrow::util::Float16;
5153
using ::parquet::ArrowWriterProperties;
54+
using ::parquet::Encoding;
5255
using ::parquet::WriterProperties;
5356

5457
struct WriteConfig {
58+
std::string name;
5559
std::shared_ptr<WriterProperties> writer_properties;
5660
std::shared_ptr<ArrowWriterProperties> arrow_writer_properties;
5761
};
@@ -74,6 +78,13 @@ struct Column {
7478
}
7579
};
7680

81+
using EncodingVector = std::vector<Encoding::type>;
82+
83+
struct ColumnWithEncodings {
84+
Column column;
85+
EncodingVector encodings;
86+
};
87+
7788
std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
7889
std::string name) {
7990
return field(std::move(name), array->type(), /*nullable=*/array->null_count() != 0);
@@ -135,13 +146,13 @@ std::vector<WriteConfig> GetWriteConfigurations() {
135146
// clang-format on
136147

137148
std::vector<WriteConfig> configs;
138-
configs.push_back({w_uncompressed, a_default});
139-
configs.push_back({w_brotli, a_default});
140-
configs.push_back({w_gzip, a_default});
141-
configs.push_back({w_lz4, a_default});
142-
configs.push_back({w_snappy, a_default});
143-
configs.push_back({w_zstd, a_default});
144-
configs.push_back({w_pages_v1, a_default});
149+
configs.push_back({"uncompressed", w_uncompressed, a_default});
150+
configs.push_back({"brotli", w_brotli, a_default});
151+
configs.push_back({"gzip", w_gzip, a_default});
152+
configs.push_back({"lz4", w_lz4, a_default});
153+
configs.push_back({"snappy", w_snappy, a_default});
154+
configs.push_back({"zstd", w_zstd, a_default});
155+
configs.push_back({"v1_data_page", w_pages_v1, a_default});
145156
return configs;
146157
}
147158

@@ -158,12 +169,14 @@ std::vector<WriteConfig> GetEncryptedWriteConfigurations(const ::arrow::Schema&
158169
return builder;
159170
};
160171

161-
std::vector<std::shared_ptr<FileEncryptionProperties>> file_encryptions;
172+
std::vector<std::tuple<std::string, std::shared_ptr<FileEncryptionProperties>>>
173+
file_encryptions;
162174

163175
// Uniform encryption
164-
file_encryptions.push_back(file_encryption_builder().build());
176+
file_encryptions.push_back({"uniform", file_encryption_builder().build()});
165177
// Uniform encryption with plaintext footer
166-
file_encryptions.push_back(file_encryption_builder().set_plaintext_footer()->build());
178+
file_encryptions.push_back({"uniform_plaintext_footer",
179+
file_encryption_builder().set_plaintext_footer()->build()});
167180
// Columns encrypted with individual keys
168181
{
169182
ColumnPathToEncryptionPropertiesMap column_map;
@@ -174,7 +187,8 @@ std::vector<WriteConfig> GetEncryptedWriteConfigurations(const ::arrow::Schema&
174187
}
175188
ARROW_DCHECK_NE(column_map.size(), 0);
176189
file_encryptions.push_back(
177-
file_encryption_builder().encrypted_columns(std::move(column_map))->build());
190+
{"column_keys",
191+
file_encryption_builder().encrypted_columns(std::move(column_map))->build()});
178192
}
179193
// Unencrypted columns
180194
{
@@ -184,15 +198,16 @@ std::vector<WriteConfig> GetEncryptedWriteConfigurations(const ::arrow::Schema&
184198
}
185199
ARROW_DCHECK_NE(column_map.size(), 0);
186200
file_encryptions.push_back(
187-
file_encryption_builder().encrypted_columns(std::move(column_map))->build());
201+
{"unencrypted_columns",
202+
file_encryption_builder().encrypted_columns(std::move(column_map))->build()});
188203
}
189204

190205
auto a_default = MakeArrowPropertiesBuilder().build();
191206

192207
std::vector<WriteConfig> configs;
193-
for (const auto& file_encryption : file_encryptions) {
208+
for (auto [name, file_encryption] : file_encryptions) {
194209
auto writer_properties = MakePropertiesBuilder().encryption(file_encryption)->build();
195-
configs.push_back({writer_properties, a_default});
210+
configs.push_back({name, writer_properties, a_default});
196211
}
197212
return configs;
198213
}
@@ -369,6 +384,110 @@ Result<std::vector<Column>> ExampleColumns(int32_t length, double null_probabili
369384
return columns;
370385
}
371386

387+
// Smallest finite value representable by T (for floating-point types this is
// the most negative value, not the smallest positive one).
template <typename T>
constexpr T kMin = std::numeric_limits<T>::lowest();

// Largest finite value representable by T.
template <typename T>
constexpr T kMax = std::numeric_limits<T>::max();
391+
392+
// Generate columns for physical types along with their supported encodings
393+
Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings(
394+
int32_t length, double null_probability = 0.2) {
395+
const EncodingVector kIntEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
396+
Encoding::DELTA_BINARY_PACKED,
397+
Encoding::BYTE_STREAM_SPLIT};
398+
const EncodingVector kFloatEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
399+
Encoding::BYTE_STREAM_SPLIT};
400+
const EncodingVector kBooleanEncodings = {Encoding::PLAIN, Encoding::RLE};
401+
const EncodingVector kByteArrayEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
402+
Encoding::DELTA_LENGTH_BYTE_ARRAY,
403+
Encoding::DELTA_BYTE_ARRAY};
404+
const EncodingVector kFixedLenByteArrayEncodings = {
405+
Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::DELTA_BYTE_ARRAY,
406+
Encoding::BYTE_STREAM_SPLIT};
407+
const EncodingVector kInt96Encodings = {Encoding::PLAIN};
408+
409+
std::vector<ColumnWithEncodings> columns;
410+
411+
random::RandomArrayGenerator gen(42);
412+
auto name_gen = Column::NameGenerator();
413+
414+
for (const double true_probability : {0.0, 0.001, 0.01, 0.5, 0.999}) {
415+
columns.push_back(
416+
{{name_gen(), gen.Boolean(length, true_probability, null_probability)},
417+
kBooleanEncodings});
418+
}
419+
420+
// Generate integer columns with different ranges to trigger delta encoding modes
421+
columns.push_back(
422+
{{name_gen(), gen.Int32(length, -100, 100, null_probability)}, kIntEncodings});
423+
columns.push_back(
424+
{{name_gen(), gen.Int32(length, kMin<int32_t>, kMax<int32_t>, null_probability)},
425+
kIntEncodings});
426+
columns.push_back({{name_gen(), gen.Int64(length, -100'000, 100'000, null_probability)},
427+
kIntEncodings});
428+
columns.push_back(
429+
{{name_gen(), gen.Int64(length, kMin<int64_t>, kMax<int64_t>, null_probability)},
430+
kIntEncodings});
431+
432+
// This won't necessarily span all 96 bits of precision, but PLAIN encoding allows
433+
// the fuzzer to do its thing on the values.
434+
for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) {
435+
ARROW_ASSIGN_OR_RAISE(
436+
auto array, gen.Int64(length, kMin<int64_t>, kMax<int64_t>, null_probability)
437+
->View(timestamp(unit)));
438+
columns.push_back({{name_gen(), array}, kInt96Encodings});
439+
}
440+
441+
// NOTE: will need to vary NaNs if there are encodings for which that matters (ALP?)
442+
columns.push_back(
443+
{{name_gen(), gen.Float32(length, -1.0, 1.0, null_probability)}, kFloatEncodings});
444+
columns.push_back(
445+
{{name_gen(), gen.Float32(length, kMin<float>, kMax<float>, null_probability)},
446+
kFloatEncodings});
447+
columns.push_back(
448+
{{name_gen(), gen.Float64(length, -1.0, 1.0, null_probability)}, kFloatEncodings});
449+
columns.push_back(
450+
{{name_gen(), gen.Float64(length, kMin<double>, kMax<double>, null_probability)},
451+
kFloatEncodings});
452+
453+
// For FLBA
454+
columns.push_back(
455+
{{name_gen(), gen.Float16(length, Float16(-1.0), Float16(1.0), null_probability)},
456+
kFixedLenByteArrayEncodings});
457+
458+
// For BYTE_ARRAY (vary lengths and repetitions to trigger encoding modes)
459+
columns.push_back({{name_gen(), gen.String(length, /*min_length=*/0,
460+
/*max_length=*/20, null_probability)},
461+
kByteArrayEncodings});
462+
columns.push_back({{name_gen(), gen.String(length, /*min_length=*/12,
463+
/*max_length=*/14, null_probability)},
464+
kByteArrayEncodings});
465+
columns.push_back({{name_gen(), gen.String(length, /*min_length=*/100,
466+
/*max_length=*/200, null_probability)},
467+
kByteArrayEncodings});
468+
columns.push_back({{name_gen(), gen.StringWithRepeats(
469+
length, /*unique=*/length / 50, /*min_length=*/0,
470+
/*max_length=*/20, null_probability)},
471+
kByteArrayEncodings});
472+
columns.push_back({{name_gen(), gen.StringWithRepeats(
473+
length, /*unique=*/length / 100, /*min_length=*/12,
474+
/*max_length=*/14, null_probability)},
475+
kByteArrayEncodings});
476+
477+
return columns;
478+
}
479+
480+
// Generate the per-encoding columns for several null densities (no nulls,
// 1% nulls, 50% nulls, all nulls) and return them as a single list, in that
// order. Each batch of columns holds 1000 values per column.
//
// Propagates any error raised while generating a batch of columns.
Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings() {
  std::vector<ColumnWithEncodings> all_columns;

  for (const double null_probability : {0.0, 0.01, 0.5, 1.0}) {
    ARROW_ASSIGN_OR_RAISE(auto columns,
                          AllColumnsWithEncodings(/*length=*/1'000, null_probability));
    // `columns` is dropped at the end of this iteration, so move its elements
    // out instead of copying each name, array handle and encoding list.
    for (auto& column : columns) {
      all_columns.push_back(std::move(column));
    }
  }
  return all_columns;
}
490+
372491
Result<std::shared_ptr<RecordBatch>> BatchFromColumns(const std::vector<Column>& columns,
373492
int64_t num_rows) {
374493
FieldVector fields;
@@ -425,13 +544,25 @@ Status DoMain(const std::string& out_dir) {
425544
RETURN_NOT_OK(CreateDir(dir_fn));
426545

427546
int sample_num = 1;
428-
auto sample_name = [&]() -> std::string {
429-
return "pq-table-" + std::to_string(sample_num++);
547+
auto sample_file_name = [&](const std::string& name = "") -> std::string {
548+
std::stringstream ss;
549+
if (!name.empty()) {
550+
ss << name << "-";
551+
}
552+
ss << sample_num++ << ".pq";
553+
return std::move(ss).str();
430554
};
431555

432-
auto write_sample = [&](const std::shared_ptr<Table>& table,
433-
const WriteConfig& config) -> Status {
434-
ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name()));
556+
auto write_sample = [&](const std::shared_ptr<Table>& table, const WriteConfig& config,
557+
std::string name = "") -> Status {
558+
if (name.empty() && table->num_columns() == 1) {
559+
name = table->schema()->field(0)->type()->name();
560+
}
561+
if (!name.empty()) {
562+
name += "-";
563+
}
564+
name += config.name;
565+
ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_file_name(name)));
435566
std::cerr << sample_fn.ToString() << std::endl;
436567
ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString()));
437568
// Emit several row groups
@@ -443,7 +574,7 @@ Status DoMain(const std::string& out_dir) {
443574
};
444575

445576
{
446-
// 1. Unencrypted files
577+
// 1. Unencrypted files for various write configurations
447578
// Write a Cartesian product of example batches x write configurations
448579
ARROW_ASSIGN_OR_RAISE(auto batches, Batches());
449580
auto write_configs = GetWriteConfigurations();
@@ -458,7 +589,37 @@ Status DoMain(const std::string& out_dir) {
458589
}
459590
}
460591
{
461-
// 2. Encrypted files
592+
// 2. Unencrypted files for various column encodings
593+
// Write one file per (column, encoding) pair.
594+
ARROW_ASSIGN_OR_RAISE(auto columns, AllColumnsWithEncodings());
595+
596+
for (const auto& column : columns) {
597+
RETURN_NOT_OK(column.column.array->ValidateFull());
598+
ARROW_ASSIGN_OR_RAISE(
599+
auto batch, BatchFromColumn(column.column, column.column.array->length()));
600+
ARROW_ASSIGN_OR_RAISE(auto table, Table::FromRecordBatches({batch}));
601+
602+
for (const auto encoding : column.encodings) {
603+
auto w_props_builder = MakePropertiesBuilder();
604+
if (encoding == Encoding::RLE_DICTIONARY) {
605+
// RLE_DICTIONARY is enabled through enable_dictionary() rather than
606+
// encoding(), also increase the dictionary page size limit as we
607+
// generate data with a typically high cardinality.
608+
w_props_builder.enable_dictionary()->dictionary_pagesize_limit(1'000'000);
609+
} else {
610+
w_props_builder.disable_dictionary()->encoding(encoding);
611+
}
612+
auto w_props = w_props_builder.build();
613+
// Ensure that we generate INT96 Parquet data when given a timestamp column
614+
auto a_props =
615+
MakeArrowPropertiesBuilder().enable_deprecated_int96_timestamps()->build();
616+
auto config_name = ::parquet::EncodingToString(encoding);
617+
RETURN_NOT_OK(write_sample(table, WriteConfig{config_name, w_props, a_props}));
618+
}
619+
}
620+
}
621+
{
622+
// 3. Encrypted files
462623
// Use a single batch and write it using different configurations
463624
ARROW_ASSIGN_OR_RAISE(auto batch, BatchForEncryption());
464625
auto write_configs = GetEncryptedWriteConfigurations(*batch->schema());

0 commit comments

Comments
 (0)