2 changes: 1 addition & 1 deletion cpp/build-support/fuzzing/generate_corpuses.sh
@@ -71,7 +71,7 @@ rm -rf ${PANDAS_DIR}
git clone --depth=1 https://github.com/pandas-dev/pandas ${PANDAS_DIR}

rm -rf ${CORPUS_DIR}
mkdir -p ${CORPUS_DIR}
${OUT}/arrow-csv-generate-fuzz-corpus ${CORPUS_DIR}
# Add examples from arrow-testing repo
cp ${ARROW_ROOT}/testing/data/csv/*.csv ${CORPUS_DIR}
# Add examples from Pandas test suite
21 changes: 11 additions & 10 deletions cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -3275,15 +3275,8 @@ void CheckVarStd(const Datum& array, const VarianceOptions& options,
auto var = checked_cast<const DoubleScalar*>(out_var.scalar().get());
auto std = checked_cast<const DoubleScalar*>(out_std.scalar().get());
ASSERT_TRUE(var->is_valid && std->is_valid);
// Near zero these macros don't work as well
// (and MinGW can give results slightly off from zero)
if (std::abs(expected_var) < 1e-20) {
ASSERT_NEAR(std->value * std->value, var->value, 1e-20);
ASSERT_NEAR(var->value, expected_var, 1e-20);
} else {
ASSERT_DOUBLE_EQ(std->value * std->value, var->value);
ASSERT_DOUBLE_EQ(var->value, expected_var); // < 4ULP
}
AssertWithinUlp(std->value * std->value, var->value, /*n_ulps=*/2);
AssertWithinUlp(var->value, expected_var, /*n_ulps=*/5);
}

template <typename ArrowType>
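A note on the assertion change above: comparing doubles by ULP (units in the last place) distance counts how many representable values separate two numbers, giving a scale-aware tolerance that behaves more uniformly than a fixed epsilon. Below is a minimal standalone sketch of such a check (illustrative only; AssertWithinUlp is Arrow's own test helper and may treat edge cases such as zeros and NaNs differently):

#include <cstdint>
#include <cstring>
#include <limits>

// Map a double onto a monotonic integer scale so that adjacent
// representable doubles differ by exactly 1 (IEEE 754 bit trick).
inline int64_t ToOrderedBits(double d) {
  int64_t bits;
  std::memcpy(&bits, &d, sizeof(bits));
  return bits < 0 ? std::numeric_limits<int64_t>::min() - bits : bits;
}

// ULP distance between x and y; assumes neither value is NaN.
inline uint64_t UlpDistance(double x, double y) {
  const int64_t a = ToOrderedBits(x);
  const int64_t b = ToOrderedBits(y);
  return a > b ? static_cast<uint64_t>(a) - static_cast<uint64_t>(b)
               : static_cast<uint64_t>(b) - static_cast<uint64_t>(a);
}

// A check equivalent in spirit to AssertWithinUlp(x, y, /*n_ulps=*/5)
// would then be: ASSERT_LE(UlpDistance(x, y), 5);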
@@ -4159,6 +4152,14 @@ class TestRandomQuantileKernel : public TestPrimitiveQuantileKernel<ArrowType> {

void VerifyTDigest(const std::shared_ptr<ChunkedArray>& chunked,
std::vector<double>& quantiles) {
// For some reason, TDigest computations with libc++ seem much less accurate.
// A possible explanation is that libc++ has less precise implementations
// of std::sin and std::asin, used in the TDigest implementation.
# ifdef _LIBCPP_VERSION
constexpr double kRelativeTolerance = 0.09;
# else
constexpr double kRelativeTolerance = 0.05;
# endif
TDigestOptions options(quantiles);
ASSERT_OK_AND_ASSIGN(Datum out, TDigest(chunked, options));
const auto& out_array = out.make_array();
@@ -4173,7 +4174,7 @@ class TestRandomQuantileKernel : public TestPrimitiveQuantileKernel<ArrowType> {
const double* approx = out_array->data()->GetValues<double>(1);
for (size_t i = 0; i < quantiles.size(); ++i) {
const auto& exact_scalar = checked_pointer_cast<DoubleScalar>(exact[i][0].scalar());
const double tolerance = std::fabs(exact_scalar->value) * 0.05;
const double tolerance = std::fabs(exact_scalar->value) * kRelativeTolerance;
EXPECT_NEAR(approx[i], exact_scalar->value, tolerance) << quantiles[i];
}
}
6 changes: 6 additions & 0 deletions cpp/src/arrow/csv/CMakeLists.txt
@@ -30,6 +30,12 @@ add_arrow_benchmark(converter_benchmark PREFIX "arrow-csv")
add_arrow_benchmark(parser_benchmark PREFIX "arrow-csv")
add_arrow_benchmark(writer_benchmark PREFIX "arrow-csv")

if(ARROW_BUILD_FUZZING_UTILITIES)
add_executable(arrow-csv-generate-fuzz-corpus generate_fuzz_corpus.cc)
target_link_libraries(arrow-csv-generate-fuzz-corpus ${ARROW_UTIL_LIB}
${ARROW_TEST_LINK_LIBS})
endif()

add_arrow_fuzz_target(fuzz PREFIX "arrow-csv")

arrow_install_all_headers("arrow/csv")
8 changes: 6 additions & 2 deletions cpp/src/arrow/csv/fuzz.cc
@@ -41,11 +41,15 @@ Status FuzzCsvReader(const uint8_t* data, int64_t size) {
auto io_context = arrow::io::default_io_context();

auto read_options = ReadOptions::Defaults();
// Make chunking more likely
read_options.block_size = 4096;
// Make chunking more likely to exercise chunked reading and optional parallelization.
// Most files in the seed corpus are currently in the 4-10 kB range.
read_options.block_size = 1000;
auto parse_options = ParseOptions::Defaults();
auto convert_options = ConvertOptions::Defaults();
convert_options.auto_dict_encode = true;
// This is the default value, but we might want to turn this knob to have a better
// mix of dict-encoded and non-dict-encoded columns when reading.
convert_options.auto_dict_max_cardinality = 50;
Comment on lines 46 to 52

Contributor: Why do we need these changes?

Member (author): The block_size one is to increase the likelihood of chunking and the number of chunks, to exercise chunked reading and parallelization more. The auto_dict_max_cardinality just explicitly sets the default value, so it's really a no-op, but it signals a knob that we might want to turn.

Member (author): For the record, most files generated by this PR are 5-10 kB in size.

auto input_stream =
std::make_shared<::arrow::io::BufferReader>(std::make_shared<Buffer>(data, size));
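To make the effect of these knobs concrete, here is a sketch of reading a CSV buffer with the same small block size outside the fuzz harness. It assumes the public arrow::csv::TableReader API; the wrapper function itself is hypothetical:

#include <memory>

#include "arrow/csv/api.h"
#include "arrow/io/memory.h"
#include "arrow/result.h"
#include "arrow/table.h"

// Sketch: force many small parse blocks so that chunked reading (and,
// with a threaded reader, parallel conversion) gets exercised.
arrow::Result<std::shared_ptr<arrow::Table>> ReadWithSmallBlocks(
    std::shared_ptr<arrow::Buffer> csv_data) {
  auto input = std::make_shared<arrow::io::BufferReader>(std::move(csv_data));

  auto read_options = arrow::csv::ReadOptions::Defaults();
  read_options.block_size = 1000;  // small blocks -> more chunks

  auto parse_options = arrow::csv::ParseOptions::Defaults();

  auto convert_options = arrow::csv::ConvertOptions::Defaults();
  convert_options.auto_dict_encode = true;  // dict-encode low-cardinality columns

  ARROW_ASSIGN_OR_RAISE(
      auto reader,
      arrow::csv::TableReader::Make(arrow::io::default_io_context(), input,
                                    read_options, parse_options, convert_options));
  return reader->Read();
}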
204 changes: 204 additions & 0 deletions cpp/src/arrow/csv/generate_fuzz_corpus.cc
@@ -0,0 +1,204 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// A command line executable that generates a bunch of valid CSV files
// containing example record batches. Those are used as fuzzing seeds
// to make fuzzing more efficient.

#include <cstdlib>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include "arrow/array.h"
#include "arrow/array/util.h"
#include "arrow/compute/cast.h"
#include "arrow/csv/options.h"
#include "arrow/csv/writer.h"
#include "arrow/io/file.h"
#include "arrow/io/memory.h"
#include "arrow/ipc/writer.h"
#include "arrow/json/from_string.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/testing/random.h"
#include "arrow/util/io_util.h"

namespace arrow::csv {

using ::arrow::internal::CreateDir;
using ::arrow::internal::PlatformFilename;
using ::arrow::json::ArrayFromJSONString;

Result<std::shared_ptr<Buffer>> WriteRecordBatch(
const std::shared_ptr<RecordBatch>& batch, const WriteOptions& options) {
ARROW_ASSIGN_OR_RAISE(auto sink, io::BufferOutputStream::Create(1024));
ARROW_ASSIGN_OR_RAISE(auto writer, MakeCSVWriter(sink.get(), batch->schema(), options));
RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
RETURN_NOT_OK(writer->Close());
return sink->Finish();
}

Result<std::shared_ptr<RecordBatch>> MakeBatch(
std::function<Result<std::shared_ptr<Array>>(int64_t length, double null_probability)>
array_factory,
int64_t length) {
ArrayVector columns;
FieldVector fields;

struct ColumnSpec {
std::string name;
double null_probability;
};
for (auto spec : {ColumnSpec{"with_nulls", 0.2}, ColumnSpec{"without_nulls", 0.0}}) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> column,
array_factory(length, spec.null_probability));
columns.push_back(column);
fields.push_back(field(spec.name, column->type()));
}
return RecordBatch::Make(schema(std::move(fields)), length, std::move(columns));
}

Result<RecordBatchVector> Batches() {
::arrow::random::RandomArrayGenerator gen(/*seed=*/42);
RecordBatchVector batches;

auto append_batch = [&](auto array_factory, int64_t length) -> Status {
ARROW_ASSIGN_OR_RAISE(auto batch, MakeBatch(array_factory, length));
batches.push_back(batch);
return Status::OK();
};

// Ideally, we should exercise all possible inference kinds (see inference_internal.h)
auto make_nulls = [&](int64_t length, double null_probability) {
return MakeArrayOfNull(null(), length);
};
auto make_ints = [&](int64_t length, double null_probability) {
return gen.Int64(length, /*min=*/-1'000'000, /*max=*/1'000'000, null_probability);
};
auto make_floats = [&](int64_t length, double null_probability) {
return gen.Float64(length, /*min=*/-100.0, /*max=*/100.0, null_probability);
};
auto make_booleans = [&](int64_t length, double null_probability) {
return gen.Boolean(length, /*true_probability=*/0.8, null_probability);
};
auto make_dates = [&](int64_t length, double null_probability) {
return gen.Date64(length, /*min=*/1, /*max=*/365 * 60, null_probability);
};
auto make_times = [&](int64_t length, double null_probability) {
return gen.Int32(length, /*min=*/0, /*max=*/86399, null_probability)
->View(time32(TimeUnit::SECOND));
};

std::string timezone;
auto make_timestamps = [&](int64_t length, double null_probability) {
return gen.Int64(length, /*min=*/1, /*max=*/1764079190, null_probability)
->View(timestamp(TimeUnit::SECOND, timezone));
};
auto make_timestamps_ns = [&](int64_t length, double null_probability) {
return gen
.Int64(length, /*min=*/1, /*max=*/1764079190LL * 1'000'000'000, null_probability)
->View(timestamp(TimeUnit::NANO, timezone));
};

auto make_strings = [&](int64_t length, double null_probability) {
return gen.String(length, /*min_length=*/3, /*max_length=*/15, null_probability);
};
auto make_string_with_repeats = [&](int64_t length, double null_probability) {
// `unique` should be less than `auto_dict_max_cardinality` in fuzz target
return gen.StringWithRepeats(length, /*unique=*/10, /*min_length=*/3,
/*max_length=*/15, null_probability);
};

RETURN_NOT_OK(append_batch(make_nulls, /*length=*/2000));
RETURN_NOT_OK(append_batch(make_ints, /*length=*/500));
RETURN_NOT_OK(append_batch(make_floats, /*length=*/150));
RETURN_NOT_OK(append_batch(make_booleans, /*length=*/500));

RETURN_NOT_OK(append_batch(make_dates, /*length=*/200));
RETURN_NOT_OK(append_batch(make_times, /*length=*/400));
timezone = "";
RETURN_NOT_OK(append_batch(make_timestamps, /*length=*/200));
RETURN_NOT_OK(append_batch(make_timestamps_ns, /*length=*/100));
// Will generate timestamps with a "Z" suffix
timezone = "UTC";
RETURN_NOT_OK(append_batch(make_timestamps, /*length=*/200));
RETURN_NOT_OK(append_batch(make_timestamps_ns, /*length=*/100));
// Will generate timestamps with a "+0100" or "+0200" suffix
timezone = "Europe/Paris";
RETURN_NOT_OK(append_batch(make_timestamps, /*length=*/200));
RETURN_NOT_OK(append_batch(make_timestamps_ns, /*length=*/100));

RETURN_NOT_OK(append_batch(make_strings, /*length=*/300));
RETURN_NOT_OK(append_batch(make_string_with_repeats, /*length=*/300));
// XXX Cannot add non-UTF8 binary as the CSV writer doesn't support writing it

return batches;
}

Status DoMain(const std::string& out_dir) {
ARROW_ASSIGN_OR_RAISE(auto dir_fn, PlatformFilename::FromString(out_dir));
RETURN_NOT_OK(CreateDir(dir_fn));

int sample_num = 1;
auto sample_name = [&]() -> std::string {
return "csv-file-" + std::to_string(sample_num++);
};

ARROW_ASSIGN_OR_RAISE(auto batches, Batches());

auto options = WriteOptions::Defaults();
RETURN_NOT_OK(options.Validate());

for (const auto& batch : batches) {
RETURN_NOT_OK(batch->ValidateFull());
ARROW_ASSIGN_OR_RAISE(auto buffer, WriteRecordBatch(batch, options));

ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name()));
std::cerr << sample_fn.ToString() << std::endl;
Contributor: Why use standard error rather than standard out?

Member (author): No precise reason, this is the same thing we're doing in other fuzz corpus generators.

ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString()));
RETURN_NOT_OK(file->Write(buffer));
RETURN_NOT_OK(file->Close());
}
return Status::OK();
}

ARROW_NORETURN void Usage() {
std::cerr << "Usage: arrow-csv-generate-fuzz-corpus "
<< "<output directory>" << std::endl;
std::exit(2);
}

int Main(int argc, char** argv) {
if (argc != 2) {
Usage();
}
auto out_dir = std::string(argv[1]);

Status st = DoMain(out_dir);
if (!st.ok()) {
std::cerr << st.ToString() << std::endl;
return 1;
}
return 0;
}

} // namespace arrow::csv

int main(int argc, char** argv) { return arrow::csv::Main(argc, argv); }
18 changes: 9 additions & 9 deletions cpp/src/arrow/ipc/read_write_test.cc
@@ -952,23 +952,23 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) {
}

TEST_F(TestWriteRecordBatch, RoundtripPreservesBufferSizes) {
// ARROW-7975
// ARROW-7975: deserialized buffers should have logically exact size (no padding)
random::RandomArrayGenerator rg(/*seed=*/0);
constexpr int64_t kLength = 30;

int64_t length = 15;
auto arr = rg.String(length, 0, 10, 0.1);
auto batch = RecordBatch::Make(::arrow::schema({field("f0", utf8())}), length, {arr});
auto arr =
rg.String(kLength, /*min_length=*/0, /*max_length=*/10, /*null_probability=*/0.3);
ASSERT_NE(arr->null_count(), 0); // required for validity bitmap size assertion below

auto batch = RecordBatch::Make(::arrow::schema({field("f0", utf8())}), kLength, {arr});

ASSERT_OK_AND_ASSIGN(
mmap_, io::MemoryMapFixture::InitMemoryMap(
/*buffer_size=*/1 << 20, TempFile("test-roundtrip-buffer-sizes")));
DictionaryMemo dictionary_memo;
ASSERT_OK_AND_ASSIGN(
auto result,
DoStandardRoundTrip(*batch, IpcWriteOptions::Defaults(), &dictionary_memo));

// Make sure that the validity bitmap is size 2 as expected
ASSERT_EQ(2, arr->data()->buffers[0]->size());
// Make sure that the validity bitmap has expected size
ASSERT_EQ(bit_util::BytesForBits(kLength), arr->data()->buffers[0]->size());

for (size_t i = 0; i < arr->data()->buffers.size(); ++i) {
ASSERT_EQ(arr->data()->buffers[i]->size(),
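For reference on the updated assertion, the validity bitmap size is simply the element count rounded up to whole bytes. A standalone sketch of that arithmetic (arrow::bit_util::BytesForBits is the helper the test uses; this constexpr version is illustrative):

#include <cstdint>

// Bytes needed to hold `bits` validity bits, rounded up to a whole byte.
constexpr int64_t BytesForBits(int64_t bits) { return (bits + 7) / 8; }

static_assert(BytesForBits(30) == 4, "kLength = 30 -> 4-byte bitmap");
static_assert(BytesForBits(15) == 2, "the old test length of 15 -> 2 bytes");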