Skip to content

Commit 35bf488

Browse files
authored
GH-47891: [C++][Parquet] Generate a separate fuzz seed file for each column (#47892)
### Rationale for this change We currently generate a single file with all columns for the Parquet seed corpus. However, mutations will probably only touch a single column most of the time. Therefore, putting all columns in the same file makes individual fuzzing iterations much longer for no obvious benefit. ### Are these changes tested? By the OSS-Fuzz CI job. The actual efficiency of these changes will have to be verified later in the OSS-Fuzz [fuzzer stats](https://oss-fuzz.com/fuzzer-stats). ### Are there any user-facing changes? No. * GitHub Issue: #47891 Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent fb51888 commit 35bf488

File tree

1 file changed

+70
-59
lines changed

1 file changed

+70
-59
lines changed

cpp/src/parquet/arrow/generate_fuzz_corpus.cc

Lines changed: 70 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,11 @@ struct Column {
7575
}
7676
};
7777

78+
std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
79+
std::string name) {
80+
return field(std::move(name), array->type(), /*nullable=*/array->null_count() != 0);
81+
}
82+
7883
std::vector<WriteConfig> GetWriteConfigurations() {
7984
// clang-format off
8085
auto w_brotli = WriterProperties::Builder()
@@ -98,59 +103,53 @@ std::vector<WriteConfig> GetWriteConfigurations() {
98103
return configs;
99104
}
100105

101-
Result<std::shared_ptr<RecordBatch>> ExampleBatch1() {
102-
constexpr double kNullProbability = 0.2;
106+
Result<std::vector<Column>> ExampleColumns(int32_t length,
107+
double null_probability = 0.2) {
108+
std::vector<Column> columns;
103109

104110
random::RandomArrayGenerator gen(42);
105111
auto name_gen = Column::NameGenerator();
106-
107-
auto field_for_array_named = [&](const std::shared_ptr<Array>& array,
108-
std::string name) {
109-
return field(std::move(name), array->type(), /*nullable=*/array->null_count() != 0);
110-
};
111112
auto field_for_array = [&](const std::shared_ptr<Array>& array) {
112-
return field_for_array_named(array, name_gen());
113+
return FieldForArray(array, name_gen());
113114
};
114115

115-
std::vector<Column> columns;
116-
117-
auto int16_array = gen.Int16(kBatchSize, -30000, 30000, kNullProbability);
118-
auto int32_array = gen.Int32(kBatchSize, -2000000000, 2000000000, kNullProbability);
119-
auto int64_array = gen.Int64(kBatchSize, -9000000000000000000LL, 9000000000000000000LL,
120-
kNullProbability);
116+
auto int16_array = gen.Int16(length, -30000, 30000, null_probability);
117+
auto int32_array = gen.Int32(length, -2000000000, 2000000000, null_probability);
118+
auto int64_array =
119+
gen.Int64(length, -9000000000000000000LL, 9000000000000000000LL, null_probability);
121120
auto non_null_float64_array =
122-
gen.Float64(kBatchSize, -1e10, 1e10, /*null_probability=*/0.0);
123-
auto tiny_strings_array = gen.String(kBatchSize, 0, 3, kNullProbability);
121+
gen.Float64(length, -1e10, 1e10, /*null_probability=*/0.0);
122+
auto tiny_strings_array = gen.String(length, 0, 3, null_probability);
124123
auto large_strings_array =
125-
gen.LargeString(kBatchSize, /*min_length=*/0, /*max_length=*/20, kNullProbability);
124+
gen.LargeString(length, /*min_length=*/0, /*max_length=*/20, null_probability);
126125
auto string_view_array =
127-
gen.StringView(kBatchSize, /*min_length=*/8, /*max_length=*/30, kNullProbability);
128-
ARROW_ASSIGN_OR_RAISE(auto null_array, MakeArrayOfNull(null(), kBatchSize));
126+
gen.StringView(length, /*min_length=*/8, /*max_length=*/30, null_probability);
127+
ARROW_ASSIGN_OR_RAISE(auto null_array, MakeArrayOfNull(null(), length));
129128

130129
// Null
131130
columns.push_back({name_gen(), null_array});
132131
// Numerics
133132
columns.push_back({name_gen(), int16_array});
134133
columns.push_back({name_gen(), non_null_float64_array});
135134
columns.push_back(
136-
{name_gen(), gen.Float16(kBatchSize, Float16::FromDouble(-1e4),
137-
Float16::FromDouble(1e4), kNullProbability)});
135+
{name_gen(), gen.Float16(length, Float16::FromDouble(-1e4),
136+
Float16::FromDouble(1e4), null_probability)});
138137
columns.push_back({name_gen(), int64_array});
139138
// Decimals
140139
columns.push_back(
141-
{name_gen(), gen.Decimal128(decimal128(24, 7), kBatchSize, kNullProbability)});
140+
{name_gen(), gen.Decimal128(decimal128(24, 7), length, null_probability)});
142141
columns.push_back(
143-
{name_gen(), gen.Decimal256(decimal256(43, 7), kBatchSize, kNullProbability)});
142+
{name_gen(), gen.Decimal256(decimal256(43, 7), length, null_probability)});
144143
columns.push_back(
145-
{name_gen(), gen.Decimal64(decimal64(12, 3), kBatchSize, kNullProbability)});
144+
{name_gen(), gen.Decimal64(decimal64(12, 3), length, null_probability)});
146145
columns.push_back(
147-
{name_gen(), gen.Decimal32(decimal32(7, 3), kBatchSize, kNullProbability)});
146+
{name_gen(), gen.Decimal32(decimal32(7, 3), length, null_probability)});
148147

149148
// Timestamp
150149
// (Parquet doesn't have seconds timestamps so the values are going to be
151150
// multiplied by 1000)
152151
auto int64_timestamps_array =
153-
gen.Int64(kBatchSize, -9000000000000000LL, 9000000000000000LL, kNullProbability);
152+
gen.Int64(length, -9000000000000000LL, 9000000000000000LL, null_probability);
154153
for (auto unit : TimeUnit::values()) {
155154
ARROW_ASSIGN_OR_RAISE(auto timestamps,
156155
int64_timestamps_array->View(timestamp(unit, "UTC")));
@@ -159,71 +158,71 @@ Result<std::shared_ptr<RecordBatch>> ExampleBatch1() {
159158
// Time32, time64
160159
ARROW_ASSIGN_OR_RAISE(
161160
auto time32_s,
162-
gen.Int32(kBatchSize, 0, 86399, kNullProbability)->View(time32(TimeUnit::SECOND)));
161+
gen.Int32(length, 0, 86399, null_probability)->View(time32(TimeUnit::SECOND)));
163162
columns.push_back({name_gen(), time32_s});
164-
ARROW_ASSIGN_OR_RAISE(auto time32_ms,
165-
gen.Int32(kBatchSize, 0, 86399999, kNullProbability)
166-
->View(time32(TimeUnit::MILLI)));
163+
ARROW_ASSIGN_OR_RAISE(
164+
auto time32_ms,
165+
gen.Int32(length, 0, 86399999, null_probability)->View(time32(TimeUnit::MILLI)));
167166
columns.push_back({name_gen(), time32_ms});
168167
ARROW_ASSIGN_OR_RAISE(auto time64_us,
169-
gen.Int64(kBatchSize, 0, 86399999999LL, kNullProbability)
168+
gen.Int64(length, 0, 86399999999LL, null_probability)
170169
->View(time64(TimeUnit::MICRO)));
171170
columns.push_back({name_gen(), time64_us});
172171
ARROW_ASSIGN_OR_RAISE(auto time64_ns,
173-
gen.Int64(kBatchSize, 0, 86399999999999LL, kNullProbability)
172+
gen.Int64(length, 0, 86399999999999LL, null_probability)
174173
->View(time64(TimeUnit::NANO)));
175174
columns.push_back({name_gen(), time64_ns});
176175
// Date32, date64
177176
ARROW_ASSIGN_OR_RAISE(
178177
auto date32_array,
179-
gen.Int32(kBatchSize, -1000 * 365, 1000 * 365, kNullProbability)->View(date32()));
178+
gen.Int32(length, -1000 * 365, 1000 * 365, null_probability)->View(date32()));
180179
columns.push_back({name_gen(), date32_array});
181180
columns.push_back(
182-
{name_gen(), gen.Date64(kBatchSize, -1000 * 365, 1000 * 365, kNullProbability)});
181+
{name_gen(), gen.Date64(length, -1000 * 365, 1000 * 365, null_probability)});
183182

184183
// A column of tiny strings that will hopefully trigger dict encoding
185184
columns.push_back({name_gen(), tiny_strings_array});
186185
columns.push_back({name_gen(), large_strings_array});
187186
columns.push_back({name_gen(), string_view_array});
188187
columns.push_back(
189-
{name_gen(), gen.FixedSizeBinary(kBatchSize, /*byte_width=*/7, kNullProbability)});
188+
{name_gen(), gen.FixedSizeBinary(length, /*byte_width=*/7, null_probability)});
190189

191190
// A column of lists/large lists
192191
{
193-
auto values = gen.Int64(kBatchSize * 10, -10000, 10000, kNullProbability);
194-
auto offsets = gen.Offsets(kBatchSize + 1, 0, static_cast<int32_t>(values->length()));
192+
auto values = gen.Int64(length * 10, -10000, 10000, null_probability);
193+
auto offsets = gen.Offsets(length + 1, 0, static_cast<int32_t>(values->length()));
195194
ARROW_ASSIGN_OR_RAISE(auto lists, ListArray::FromArrays(*offsets, *values));
196195
columns.push_back({name_gen(), lists});
197-
auto large_offsets = gen.LargeOffsets(kBatchSize + 1, 0, values->length());
196+
auto large_offsets = gen.LargeOffsets(length + 1, 0, values->length());
198197
ARROW_ASSIGN_OR_RAISE(auto large_lists,
199198
LargeListArray::FromArrays(*large_offsets, *values));
200199
columns.push_back({name_gen(), large_lists});
201200
}
202201
// A column of a repeated constant that will hopefully trigger RLE encoding
203202
{
204-
ARROW_ASSIGN_OR_RAISE(auto values, MakeArrayFromScalar(Int16Scalar(42), kBatchSize));
203+
ARROW_ASSIGN_OR_RAISE(auto values, MakeArrayFromScalar(Int16Scalar(42), length));
205204
columns.push_back({name_gen(), values});
206205
}
207206
// A column of lists of lists
208207
{
209-
auto inner_values = gen.Int64(kBatchSize * 9, -10000, 10000, kNullProbability);
208+
auto inner_values = gen.Int64(length * 9, -10000, 10000, null_probability);
210209
auto inner_offsets =
211-
gen.Offsets(kBatchSize * 3 + 1, 0, static_cast<int32_t>(inner_values->length()),
212-
kNullProbability);
210+
gen.Offsets(length * 3 + 1, 0, static_cast<int32_t>(inner_values->length()),
211+
null_probability);
213212
ARROW_ASSIGN_OR_RAISE(auto inner_lists,
214213
ListArray::FromArrays(*inner_offsets, *inner_values));
215-
auto offsets = gen.Offsets(
216-
kBatchSize + 1, 0, static_cast<int32_t>(inner_lists->length()), kNullProbability);
214+
auto offsets = gen.Offsets(length + 1, 0, static_cast<int32_t>(inner_lists->length()),
215+
null_probability);
217216
ARROW_ASSIGN_OR_RAISE(auto lists, ListArray::FromArrays(*offsets, *inner_lists));
218217
columns.push_back({name_gen(), lists});
219218
}
220219
// A column of maps
221220
{
222-
constexpr auto kChildSize = kBatchSize * 3;
221+
const auto kChildSize = length * 3;
223222
auto keys = gen.String(kChildSize, /*min_length=*/4, /*max_length=*/7,
224223
/*null_probability=*/0);
225-
auto values = gen.Float32(kChildSize, -1e10, 1e10, kNullProbability);
226-
columns.push_back({name_gen(), gen.Map(keys, values, kBatchSize, kNullProbability)});
224+
auto values = gen.Float32(kChildSize, -1e10, 1e10, null_probability);
225+
columns.push_back({name_gen(), gen.Map(keys, values, length, null_probability)});
227226
}
228227
// A column of nested non-nullable structs
229228
{
@@ -239,13 +238,13 @@ Result<std::shared_ptr<RecordBatch>> ExampleBatch1() {
239238
}
240239
// A column of nested nullable structs
241240
{
242-
auto null_bitmap = gen.NullBitmap(kBatchSize, kNullProbability);
241+
auto null_bitmap = gen.NullBitmap(length, null_probability);
243242
ARROW_ASSIGN_OR_RAISE(auto inner_a,
244243
StructArray::Make({int16_array, non_null_float64_array},
245244
{field_for_array(int16_array),
246245
field_for_array(non_null_float64_array)},
247246
std::move(null_bitmap)));
248-
null_bitmap = gen.NullBitmap(kBatchSize, kNullProbability);
247+
null_bitmap = gen.NullBitmap(length, null_probability);
249248
ARROW_ASSIGN_OR_RAISE(
250249
auto structs,
251250
StructArray::Make({inner_a, tiny_strings_array},
@@ -257,25 +256,37 @@ Result<std::shared_ptr<RecordBatch>> ExampleBatch1() {
257256
// TODO extension types: UUID, JSON, GEOMETRY, GEOGRAPHY
258257

259258
// A non-dict-encoded column (see GetWriteConfigurations)
260-
columns.push_back({"no_dict", gen.String(kBatchSize, 0, 30, kNullProbability)});
259+
columns.push_back({"no_dict", gen.String(length, 0, 30, null_probability)});
261260
// A column that should be quite compressible (see GetWriteConfigurations)
262-
columns.push_back({"compressed", gen.Int64(kBatchSize, -10, 10, kNullProbability)});
261+
columns.push_back({"compressed", gen.Int64(length, -10, 10, null_probability)});
262+
263+
return columns;
264+
}
265+
266+
Result<std::shared_ptr<RecordBatch>> BatchFromColumn(const Column& col) {
267+
FieldVector fields{FieldForArray(col.array, col.name)};
268+
ArrayVector arrays{col.array};
263269

264-
FieldVector fields;
265-
ArrayVector arrays;
266-
for (const auto& col : columns) {
267-
fields.push_back(field_for_array_named(col.array, col.name));
268-
arrays.push_back(col.array);
269-
}
270270
auto md = key_value_metadata({"key1", "key2"}, {"value1", ""});
271271
auto schema = ::arrow::schema(std::move(fields), std::move(md));
272272
return RecordBatch::Make(std::move(schema), kBatchSize, std::move(arrays));
273273
}
274274

275275
Result<std::vector<std::shared_ptr<RecordBatch>>> Batches() {
276+
ARROW_ASSIGN_OR_RAISE(auto columns,
277+
ExampleColumns(kBatchSize, /*null_probability=*/0.2));
276278
std::vector<std::shared_ptr<RecordBatch>> batches;
277-
ARROW_ASSIGN_OR_RAISE(auto batch, ExampleBatch1());
278-
batches.push_back(batch);
279+
for (const auto& col : columns) {
280+
// Since Parquet columns are laid out and read independently of each other,
281+
// we estimate that fuzzing is more efficient if we submit multiple one-column
282+
// files than one single file with all columns. The fuzzer should indeed be able
283+
// to test many more variations per unit of time.
284+
// This has to be verified in the OSS-Fuzz fuzzer statistics
285+
// (https://oss-fuzz.com/fuzzer-stats) by looking at the `avg_exec_per_sec`
286+
// column.
287+
ARROW_ASSIGN_OR_RAISE(auto batch, BatchFromColumn(col));
288+
batches.push_back(batch);
289+
}
279290
return batches;
280291
}
281292

0 commit comments

Comments
 (0)