@@ -75,6 +75,11 @@ struct Column {
   }
 };
 
+std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
+                                     std::string name) {
+  return field(std::move(name), array->type(), /*nullable=*/array->null_count() != 0);
+}
+
 std::vector<WriteConfig> GetWriteConfigurations() {
   // clang-format off
   auto w_brotli = WriterProperties::Builder()
@@ -98,59 +103,53 @@ std::vector<WriteConfig> GetWriteConfigurations() {
   return configs;
 }
 
-Result<std::shared_ptr<RecordBatch>> ExampleBatch1() {
-  constexpr double kNullProbability = 0.2;
+Result<std::vector<Column>> ExampleColumns(int32_t length,
+                                           double null_probability = 0.2) {
+  std::vector<Column> columns;
 
   random::RandomArrayGenerator gen(42);
   auto name_gen = Column::NameGenerator();
-
-  auto field_for_array_named = [&](const std::shared_ptr<Array>& array,
-                                   std::string name) {
-    return field(std::move(name), array->type(), /*nullable=*/array->null_count() != 0);
-  };
   auto field_for_array = [&](const std::shared_ptr<Array>& array) {
-    return field_for_array_named(array, name_gen());
+    return FieldForArray(array, name_gen());
   };
 
-  std::vector<Column> columns;
-
-  auto int16_array = gen.Int16(kBatchSize, -30000, 30000, kNullProbability);
-  auto int32_array = gen.Int32(kBatchSize, -2000000000, 2000000000, kNullProbability);
-  auto int64_array = gen.Int64(kBatchSize, -9000000000000000000LL, 9000000000000000000LL,
-                               kNullProbability);
+  auto int16_array = gen.Int16(length, -30000, 30000, null_probability);
+  auto int32_array = gen.Int32(length, -2000000000, 2000000000, null_probability);
+  auto int64_array =
+      gen.Int64(length, -9000000000000000000LL, 9000000000000000000LL, null_probability);
   auto non_null_float64_array =
-      gen.Float64(kBatchSize, -1e10, 1e10, /*null_probability=*/0.0);
-  auto tiny_strings_array = gen.String(kBatchSize, 0, 3, kNullProbability);
+      gen.Float64(length, -1e10, 1e10, /*null_probability=*/0.0);
+  auto tiny_strings_array = gen.String(length, 0, 3, null_probability);
   auto large_strings_array =
-      gen.LargeString(kBatchSize, /*min_length=*/0, /*max_length=*/20, kNullProbability);
+      gen.LargeString(length, /*min_length=*/0, /*max_length=*/20, null_probability);
   auto string_view_array =
-      gen.StringView(kBatchSize, /*min_length=*/8, /*max_length=*/30, kNullProbability);
-  ARROW_ASSIGN_OR_RAISE(auto null_array, MakeArrayOfNull(null(), kBatchSize));
+      gen.StringView(length, /*min_length=*/8, /*max_length=*/30, null_probability);
+  ARROW_ASSIGN_OR_RAISE(auto null_array, MakeArrayOfNull(null(), length));
 
   // Null
   columns.push_back({name_gen(), null_array});
   // Numerics
   columns.push_back({name_gen(), int16_array});
   columns.push_back({name_gen(), non_null_float64_array});
   columns.push_back(
-      {name_gen(), gen.Float16(kBatchSize, Float16::FromDouble(-1e4),
-                               Float16::FromDouble(1e4), kNullProbability)});
+      {name_gen(), gen.Float16(length, Float16::FromDouble(-1e4),
+                               Float16::FromDouble(1e4), null_probability)});
   columns.push_back({name_gen(), int64_array});
   // Decimals
   columns.push_back(
-      {name_gen(), gen.Decimal128(decimal128(24, 7), kBatchSize, kNullProbability)});
+      {name_gen(), gen.Decimal128(decimal128(24, 7), length, null_probability)});
   columns.push_back(
-      {name_gen(), gen.Decimal256(decimal256(43, 7), kBatchSize, kNullProbability)});
+      {name_gen(), gen.Decimal256(decimal256(43, 7), length, null_probability)});
   columns.push_back(
-      {name_gen(), gen.Decimal64(decimal64(12, 3), kBatchSize, kNullProbability)});
+      {name_gen(), gen.Decimal64(decimal64(12, 3), length, null_probability)});
   columns.push_back(
-      {name_gen(), gen.Decimal32(decimal32(7, 3), kBatchSize, kNullProbability)});
+      {name_gen(), gen.Decimal32(decimal32(7, 3), length, null_probability)});
 
   // Timestamp
   // (Parquet doesn't have seconds timestamps so the values are going to be
   // multiplied by 10)
   auto int64_timestamps_array =
-      gen.Int64(kBatchSize, -9000000000000000LL, 9000000000000000LL, kNullProbability);
+      gen.Int64(length, -9000000000000000LL, 9000000000000000LL, null_probability);
   for (auto unit : TimeUnit::values()) {
     ARROW_ASSIGN_OR_RAISE(auto timestamps,
                           int64_timestamps_array->View(timestamp(unit, "UTC")));
@@ -159,71 +158,71 @@ Result<std::shared_ptr<RecordBatch>> ExampleBatch1() {
   // Time32, time64
   ARROW_ASSIGN_OR_RAISE(
       auto time32_s,
-      gen.Int32(kBatchSize, 0, 86399, kNullProbability)->View(time32(TimeUnit::SECOND)));
+      gen.Int32(length, 0, 86399, null_probability)->View(time32(TimeUnit::SECOND)));
   columns.push_back({name_gen(), time32_s});
-  ARROW_ASSIGN_OR_RAISE(auto time32_ms,
-                        gen.Int32(kBatchSize, 0, 86399999, kNullProbability)
-                            ->View(time32(TimeUnit::MILLI)));
+  ARROW_ASSIGN_OR_RAISE(
+      auto time32_ms,
+      gen.Int32(length, 0, 86399999, null_probability)->View(time32(TimeUnit::MILLI)));
   columns.push_back({name_gen(), time32_ms});
   ARROW_ASSIGN_OR_RAISE(auto time64_us,
-                        gen.Int64(kBatchSize, 0, 86399999999LL, kNullProbability)
+                        gen.Int64(length, 0, 86399999999LL, null_probability)
                             ->View(time64(TimeUnit::MICRO)));
   columns.push_back({name_gen(), time64_us});
   ARROW_ASSIGN_OR_RAISE(auto time64_ns,
-                        gen.Int64(kBatchSize, 0, 86399999999999LL, kNullProbability)
+                        gen.Int64(length, 0, 86399999999999LL, null_probability)
                             ->View(time64(TimeUnit::NANO)));
   columns.push_back({name_gen(), time64_ns});
   // Date32, date64
   ARROW_ASSIGN_OR_RAISE(
       auto date32_array,
-      gen.Int32(kBatchSize, -1000 * 365, 1000 * 365, kNullProbability)->View(date32()));
+      gen.Int32(length, -1000 * 365, 1000 * 365, null_probability)->View(date32()));
   columns.push_back({name_gen(), date32_array});
   columns.push_back(
-      {name_gen(), gen.Date64(kBatchSize, -1000 * 365, 1000 * 365, kNullProbability)});
+      {name_gen(), gen.Date64(length, -1000 * 365, 1000 * 365, null_probability)});
 
   // A column of tiny strings that will hopefully trigger dict encoding
   columns.push_back({name_gen(), tiny_strings_array});
   columns.push_back({name_gen(), large_strings_array});
   columns.push_back({name_gen(), string_view_array});
   columns.push_back(
-      {name_gen(), gen.FixedSizeBinary(kBatchSize, /*byte_width=*/7, kNullProbability)});
+      {name_gen(), gen.FixedSizeBinary(length, /*byte_width=*/7, null_probability)});
 
   // A column of lists/large lists
   {
-    auto values = gen.Int64(kBatchSize * 10, -10000, 10000, kNullProbability);
-    auto offsets = gen.Offsets(kBatchSize + 1, 0, static_cast<int32_t>(values->length()));
+    auto values = gen.Int64(length * 10, -10000, 10000, null_probability);
+    auto offsets = gen.Offsets(length + 1, 0, static_cast<int32_t>(values->length()));
     ARROW_ASSIGN_OR_RAISE(auto lists, ListArray::FromArrays(*offsets, *values));
     columns.push_back({name_gen(), lists});
-    auto large_offsets = gen.LargeOffsets(kBatchSize + 1, 0, values->length());
+    auto large_offsets = gen.LargeOffsets(length + 1, 0, values->length());
     ARROW_ASSIGN_OR_RAISE(auto large_lists,
                           LargeListArray::FromArrays(*large_offsets, *values));
     columns.push_back({name_gen(), large_lists});
   }
   // A column of a repeated constant that will hopefully trigger RLE encoding
   {
-    ARROW_ASSIGN_OR_RAISE(auto values, MakeArrayFromScalar(Int16Scalar(42), kBatchSize));
+    ARROW_ASSIGN_OR_RAISE(auto values, MakeArrayFromScalar(Int16Scalar(42), length));
     columns.push_back({name_gen(), values});
   }
   // A column of lists of lists
   {
-    auto inner_values = gen.Int64(kBatchSize * 9, -10000, 10000, kNullProbability);
+    auto inner_values = gen.Int64(length * 9, -10000, 10000, null_probability);
     auto inner_offsets =
-        gen.Offsets(kBatchSize * 3 + 1, 0, static_cast<int32_t>(inner_values->length()),
-                    kNullProbability);
+        gen.Offsets(length * 3 + 1, 0, static_cast<int32_t>(inner_values->length()),
+                    null_probability);
     ARROW_ASSIGN_OR_RAISE(auto inner_lists,
                           ListArray::FromArrays(*inner_offsets, *inner_values));
-    auto offsets = gen.Offsets(
-        kBatchSize + 1, 0, static_cast<int32_t>(inner_lists->length()), kNullProbability);
+    auto offsets = gen.Offsets(length + 1, 0, static_cast<int32_t>(inner_lists->length()),
+                               null_probability);
     ARROW_ASSIGN_OR_RAISE(auto lists, ListArray::FromArrays(*offsets, *inner_lists));
     columns.push_back({name_gen(), lists});
   }
   // A column of maps
   {
-    constexpr auto kChildSize = kBatchSize * 3;
+    const auto kChildSize = length * 3;
     auto keys = gen.String(kChildSize, /*min_length=*/4, /*max_length=*/7,
                            /*null_probability=*/0);
-    auto values = gen.Float32(kChildSize, -1e10, 1e10, kNullProbability);
-    columns.push_back({name_gen(), gen.Map(keys, values, kBatchSize, kNullProbability)});
+    auto values = gen.Float32(kChildSize, -1e10, 1e10, null_probability);
+    columns.push_back({name_gen(), gen.Map(keys, values, length, null_probability)});
   }
   // A column of nested non-nullable structs
   {
@@ -239,13 +238,13 @@ Result<std::shared_ptr<RecordBatch>> ExampleBatch1() {
   }
   // A column of nested nullable structs
   {
-    auto null_bitmap = gen.NullBitmap(kBatchSize, kNullProbability);
+    auto null_bitmap = gen.NullBitmap(length, null_probability);
     ARROW_ASSIGN_OR_RAISE(auto inner_a,
                           StructArray::Make({int16_array, non_null_float64_array},
                                             {field_for_array(int16_array),
                                              field_for_array(non_null_float64_array)},
                                             std::move(null_bitmap)));
-    null_bitmap = gen.NullBitmap(kBatchSize, kNullProbability);
+    null_bitmap = gen.NullBitmap(length, null_probability);
     ARROW_ASSIGN_OR_RAISE(
         auto structs,
         StructArray::Make({inner_a, tiny_strings_array},
@@ -257,25 +256,37 @@ Result<std::shared_ptr<RecordBatch>> ExampleBatch1() {
   // TODO extension types: UUID, JSON, GEOMETRY, GEOGRAPHY
 
   // A non-dict-encoded column (see GetWriteConfigurations)
-  columns.push_back({"no_dict", gen.String(kBatchSize, 0, 30, kNullProbability)});
+  columns.push_back({"no_dict", gen.String(length, 0, 30, null_probability)});
   // A column that should be quite compressible (see GetWriteConfigurations)
-  columns.push_back({"compressed", gen.Int64(kBatchSize, -10, 10, kNullProbability)});
+  columns.push_back({"compressed", gen.Int64(length, -10, 10, null_probability)});
+
+  return columns;
+}
+
+Result<std::shared_ptr<RecordBatch>> BatchFromColumn(const Column& col) {
+  FieldVector fields{FieldForArray(col.array, col.name)};
+  ArrayVector arrays{col.array};
 
-  FieldVector fields;
-  ArrayVector arrays;
-  for (const auto& col : columns) {
-    fields.push_back(field_for_array_named(col.array, col.name));
-    arrays.push_back(col.array);
-  }
   auto md = key_value_metadata({"key1", "key2"}, {"value1", ""});
   auto schema = ::arrow::schema(std::move(fields), std::move(md));
   return RecordBatch::Make(std::move(schema), kBatchSize, std::move(arrays));
 }
 
 Result<std::vector<std::shared_ptr<RecordBatch>>> Batches() {
+  ARROW_ASSIGN_OR_RAISE(auto columns,
+                        ExampleColumns(kBatchSize, /*null_probability=*/0.2));
   std::vector<std::shared_ptr<RecordBatch>> batches;
-  ARROW_ASSIGN_OR_RAISE(auto batch, ExampleBatch1());
-  batches.push_back(batch);
+  for (const auto& col : columns) {
+    // Since Parquet columns are laid out and read independently of each other,
+    // we estimate that fuzzing is more efficient if we submit multiple one-column
+    // files rather than a single file with all columns. The fuzzer should indeed
+    // be able to test many more variations per unit of time.
+    // This has to be verified in the OSS-Fuzz fuzzer statistics
+    // (https://oss-fuzz.com/fuzzer-stats) by looking at the `avg_exec_per_sec`
+    // column.
+    ARROW_ASSIGN_OR_RAISE(auto batch, BatchFromColumn(col));
+    batches.push_back(batch);
+  }
   return batches;
 }
 
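For context, below is a minimal standalone sketch (not part of the patch) of how a corpus-generation driver could serialize each single-column RecordBatch produced by Batches() into its own in-memory Parquet file, the one-file-per-column layout that the comment in Batches() argues is more efficient to fuzz. The helper name SerializeBatch, the use of BufferOutputStream, and the writer-properties parameter are illustrative assumptions; only public Arrow/Parquet C++ APIs are used.

// Hypothetical sketch: write one RecordBatch as a complete Parquet file in memory.
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/writer.h>
#include <parquet/properties.h>

arrow::Result<std::shared_ptr<arrow::Buffer>> SerializeBatch(
    const std::shared_ptr<arrow::RecordBatch>& batch,
    const std::shared_ptr<parquet::WriterProperties>& properties) {
  // parquet::arrow::WriteTable takes a Table, so wrap the single batch first.
  ARROW_ASSIGN_OR_RAISE(auto table, arrow::Table::FromRecordBatches({batch}));
  ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::BufferOutputStream::Create());
  // Write the whole batch as a single row group using the given writer properties.
  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(),
                                                 sink, /*chunk_size=*/batch->num_rows(),
                                                 properties));
  return sink->Finish();
}

Each resulting buffer would presumably be dumped as one seed file per (column, write configuration) pair, so the fuzzer exercises every column type and encoding independently.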