Skip to content

Commit 3d84d83

Browse files
add benchmark for scanning parquet files with cast
1 parent 4862a14 commit 3d84d83

File tree

2 files changed

+92
-0
lines changed

2 files changed

+92
-0
lines changed

cpp/src/arrow/dataset/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,3 +241,7 @@ endfunction()
241241

242242
add_arrow_dataset_benchmark(file_benchmark)
243243
add_arrow_dataset_benchmark(scanner_benchmark)
244+
245+
# parquet_scan_benchmark includes the Parquet writer and the Parquet dataset file
# format headers, so it is only registered when ARROW_PARQUET is enabled.
if(ARROW_PARQUET)
246+
add_arrow_dataset_benchmark(parquet_scan_benchmark)
247+
endif()
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "arrow/testing/gtest_util.h"
19+
#include "benchmark/benchmark.h"
20+
21+
#include "arrow/api.h"
22+
#include "arrow/compute/initialize.h"
23+
#include "arrow/dataset/dataset.h"
24+
#include "arrow/dataset/file_parquet.h"
25+
#include "arrow/dataset/scanner.h"
26+
#include "arrow/io/memory.h"
27+
#include "parquet/arrow/writer.h"
28+
29+
namespace arrow {
30+
namespace dataset {
31+
32+
using parquet::arrow::WriteTable;
33+
34+
/// \brief Write a single-column string table to an in-memory Parquet file.
///
/// The table has one utf8 column named "my_string_col"; the value stored in row i
/// is "row_<i>" for every i in [0, nrows).
///
/// \param[in] nrows number of rows to generate
/// \return the buffer produced by finishing the output stream, or the first error
///         reported while appending values, finishing the array, creating the
///         stream or writing the table
Result<std::shared_ptr<Buffer>> WriteStringColParquetBuffer(int64_t nrows) {
  // Populate the column values first.
  arrow::StringBuilder values;
  int64_t row = 0;
  while (row < nrows) {
    ARROW_RETURN_NOT_OK(values.Append("row_" + std::to_string(row)));
    ++row;
  }
  std::shared_ptr<arrow::Array> column;
  ARROW_RETURN_NOT_OK(values.Finish(&column));

  // Wrap the column in a table whose schema declares it as utf8.
  auto string_field = arrow::field("my_string_col", arrow::utf8());
  auto table_schema = arrow::schema({string_field});
  auto string_table = arrow::Table::Make(table_schema, {column});

  // Serialize the table into a growable in-memory stream and hand back its buffer.
  ARROW_ASSIGN_OR_RAISE(auto out_stream, arrow::io::BufferOutputStream::Create());
  ARROW_RETURN_NOT_OK(
      WriteTable(*string_table, arrow::default_memory_pool(), out_stream));
  return out_stream->Finish();
}
49+
50+
/// Benchmark for GH-43660: scan in-memory Parquet data holding a String column
/// through a dataset whose schema declares that column as LargeString, so the scan
/// has to cast the column.
///
/// Each benchmark iteration materializes the whole scan with Scanner::ToTable();
/// items processed is reported as iterations * number of rows.
static void ParquetScanToTableCastStrings(benchmark::State& state) {
  constexpr int64_t kNumRows = 100'000;
  constexpr int64_t kBatchSize = 100;
  constexpr bool kUseThreads = false;

  // NOTE(review): this file includes arrow/compute/initialize.h but nothing from it
  // is called in this function -- confirm whether explicit initialization is needed.

  // Write the String column to a Parquet buffer and expose it as a FileFragment.
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> parquet_data,
                       WriteStringColParquetBuffer(kNumRows));
  auto parquet_format = std::make_shared<ParquetFileFormat>();
  auto reader = std::make_shared<arrow::io::BufferReader>(parquet_data);
  FileSource file_source(reader, parquet_data->size());
  ASSERT_OK_AND_ASSIGN(auto file_fragment, parquet_format->MakeFragment(file_source));
  std::vector<std::shared_ptr<FileFragment>> fragment_list;
  fragment_list.push_back(file_fragment);

  // Build the dataset with a LargeString schema so that scanning requires a cast.
  auto large_string_schema =
      arrow::schema({field("my_string_col", arrow::large_utf8())});
  ASSERT_OK_AND_ASSIGN(
      auto dataset,
      FileSystemDataset::Make(large_string_schema, compute::literal(true),
                              parquet_format,
                              /*filesystem=*/nullptr, std::move(fragment_list)));

  // Configure the scanner once; only ToTable() runs inside the benchmark loop.
  ASSERT_OK_AND_ASSIGN(auto scan_builder, dataset->NewScan());
  ASSERT_OK(scan_builder->BatchSize(kBatchSize));
  ASSERT_OK(scan_builder->UseThreads(kUseThreads));
  ASSERT_OK_AND_ASSIGN(auto scanner, scan_builder->Finish());

  for (auto _ : state) {
    ASSERT_OK_AND_ASSIGN(auto scanned, scanner->ToTable());
    benchmark::DoNotOptimize(scanned);
  }

  state.SetItemsProcessed(state.iterations() * kNumRows);
}
84+
85+
BENCHMARK(ParquetScanToTableCastStrings);
86+
87+
} // namespace dataset
88+
} // namespace arrow

0 commit comments

Comments
 (0)