Skip to content

Commit 8e199da

Browse files
committed
Merge branch 'main' into flakytest-spillpool-19058
2 parents c77f588 + cf9d078 commit 8e199da

File tree

86 files changed

+3386
-1929
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

86 files changed

+3386
-1929
lines changed

.github/actions/setup-builder/action.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,17 @@ runs:
4646
# https://github.com/actions/checkout/issues/766
4747
shell: bash
4848
run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
49+
- name: Remove unnecessary preinstalled software
50+
shell: bash
51+
run: |
52+
echo "Disk space before cleanup:"
53+
df -h
54+
apt-get clean
55+
# remove tool cache: about 8.5GB (GitHub mounts the host's /opt/hostedtoolcache as /__t)
56+
rm -rf /__t/* || true
57+
# remove Haskell runtime: about 6.3GB (host /usr/local/.ghcup)
58+
rm -rf /host/usr/local/.ghcup || true
59+
# remove Android library: about 7.8GB (host /usr/local/lib/android)
60+
rm -rf /host/usr/local/lib/android || true
61+
echo "Disk space after cleanup:"
62+
df -h

.github/workflows/audit.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ jobs:
4242
steps:
4343
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
4444
- name: Install cargo-audit
45-
uses: taiki-e/install-action@92e6dd1c202153a204d471a3c509bf1e03dcfa44 # v2.62.61
45+
uses: taiki-e/install-action@493d7f216ecab2af0602481ce809ab2c72836fa1 # v2.62.62
4646
with:
4747
tool: cargo-audit
4848
- name: Run audit check

.github/workflows/rust.yml

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -272,18 +272,6 @@ jobs:
272272
volumes:
273273
- /usr/local:/host/usr/local
274274
steps:
275-
- name: Remove unnecessary preinstalled software
276-
run: |
277-
echo "Disk space before cleanup:"
278-
df -h
279-
# remove tool cache: about 8.5GB (github has host /opt/hostedtoolcache mounted as /__t)
280-
rm -rf /__t/* || true
281-
# remove Haskell runtime: about 6.3GB (host /usr/local/.ghcup)
282-
rm -rf /host/usr/local/.ghcup || true
283-
# remove Android library: about 7.8GB (host /usr/local/lib/android)
284-
rm -rf /host/usr/local/lib/android || true
285-
echo "Disk space after cleanup:"
286-
df -h
287275
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
288276
with:
289277
submodules: true
@@ -374,19 +362,6 @@ jobs:
374362
with:
375363
save-if: ${{ github.ref_name == 'main' }}
376364
shared-key: "amd-ci-linux-test-example"
377-
- name: Remove unnecessary preinstalled software
378-
run: |
379-
echo "Disk space before cleanup:"
380-
df -h
381-
apt-get clean
382-
rm -rf /__t/CodeQL
383-
rm -rf /__t/PyPy
384-
rm -rf /__t/Java_Temurin-Hotspot_jdk
385-
rm -rf /__t/Python
386-
rm -rf /__t/go
387-
rm -rf /__t/Ruby
388-
echo "Disk space after cleanup:"
389-
df -h
390365
- name: Run examples
391366
run: |
392367
# test datafusion-sql examples
@@ -446,7 +421,7 @@ jobs:
446421
sudo apt-get update -qq
447422
sudo apt-get install -y -qq clang
448423
- name: Setup wasm-pack
449-
uses: taiki-e/install-action@92e6dd1c202153a204d471a3c509bf1e03dcfa44 # v2.62.61
424+
uses: taiki-e/install-action@493d7f216ecab2af0602481ce809ab2c72836fa1 # v2.62.62
450425
with:
451426
tool: wasm-pack
452427
- name: Run tests with headless mode
@@ -749,7 +724,7 @@ jobs:
749724
- name: Setup Rust toolchain
750725
uses: ./.github/actions/setup-builder
751726
- name: Install cargo-msrv
752-
uses: taiki-e/install-action@92e6dd1c202153a204d471a3c509bf1e03dcfa44 # v2.62.61
727+
uses: taiki-e/install-action@493d7f216ecab2af0602481ce809ab2c72836fa1 # v2.62.62
753728
with:
754729
tool: cargo-msrv
755730

benchmarks/README.md

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -243,28 +243,11 @@ See the help for more details.
243243
You can enable `mimalloc` or `snmalloc` (to use either the mimalloc or snmalloc allocator) as features by passing them in as `--features`. For example:
244244

245245
```shell
246-
cargo run --release --features "mimalloc" --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
247-
```
248-
249-
The benchmark program also supports CSV and Parquet input file formats and a utility is provided to convert from `tbl`
250-
(generated by the `dbgen` utility) to CSV and Parquet.
251-
252-
```bash
253-
cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parquet --format parquet
246+
cargo run --release --features "mimalloc" --bin dfbench tpch --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
254247
```
255248

256249
Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.
257250

258-
#### Sorted Conversion
259-
260-
The TPCH tables generated by the dbgen utility are sorted by their first column (their primary key for most tables, the `l_orderkey` column for the `lineitem` table.)
261-
262-
To preserve this sorted order information during conversion (useful for benchmarking execution on pre-sorted data) include the `--sort` flag:
263-
264-
```bash
265-
cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-sorted-parquet --format parquet --sort
266-
```
267-
268251
### Comparing results between runs
269252

270253
Any `dfbench` execution with `-o <dir>` argument will produce a

benchmarks/bench.sh

Lines changed: 51 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -189,8 +189,8 @@ main() {
189189
echo "***************************"
190190
case "$BENCHMARK" in
191191
all)
192-
data_tpch "1"
193-
data_tpch "10"
192+
data_tpch "1" "parquet"
193+
data_tpch "10" "parquet"
194194
data_h2o "SMALL"
195195
data_h2o "MEDIUM"
196196
data_h2o "BIG"
@@ -203,26 +203,22 @@ main() {
203203
# nlj uses range() function, no data generation needed
204204
;;
205205
tpch)
206-
data_tpch "1"
206+
data_tpch "1" "parquet"
207207
;;
208208
tpch_mem)
209-
# same data as for tpch
210-
data_tpch "1"
209+
data_tpch "1" "parquet"
211210
;;
212211
tpch_csv)
213-
# same data as for tpch
214-
data_tpch "1"
212+
data_tpch "1" "csv"
215213
;;
216214
tpch10)
217-
data_tpch "10"
215+
data_tpch "10" "parquet"
218216
;;
219217
tpch_mem10)
220-
# same data as for tpch10
221-
data_tpch "10"
218+
data_tpch "10" "parquet"
222219
;;
223220
tpch_csv10)
224-
# same data as for tpch10
225-
data_tpch "10"
221+
data_tpch "10" "csv"
226222
;;
227223
clickbench_1)
228224
data_clickbench_1
@@ -297,19 +293,19 @@ main() {
297293
;;
298294
external_aggr)
299295
# same data as for tpch
300-
data_tpch "1"
296+
data_tpch "1" "parquet"
301297
;;
302298
sort_tpch)
303299
# same data as for tpch
304-
data_tpch "1"
300+
data_tpch "1" "parquet"
305301
;;
306302
sort_tpch10)
307303
# same data as for tpch10
308-
data_tpch "10"
304+
data_tpch "10" "parquet"
309305
;;
310306
topk_tpch)
311307
# same data as for tpch
312-
data_tpch "1"
308+
data_tpch "1" "parquet"
313309
;;
314310
nlj)
315311
# nlj uses range() function, no data generation needed
@@ -320,7 +316,7 @@ main() {
320316
echo "HJ benchmark does not require data generation"
321317
;;
322318
compile_profile)
323-
data_tpch "1"
319+
data_tpch "1" "parquet"
324320
;;
325321
*)
326322
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
@@ -537,7 +533,7 @@ main() {
537533
# Creates TPCH data at a certain scale factor, if it doesn't already
538534
# exist
539535
#
540-
# call like: data_tpch($scale_factor)
536+
# call like: data_tpch($scale_factor, $format)
541537
#
542538
# Creates data in $DATA_DIR/tpch_sf1 for scale factor 1
543539
# Creates data in $DATA_DIR/tpch_sf10 for scale factor 10
@@ -548,20 +544,23 @@ data_tpch() {
548544
echo "Internal error: Scale factor not specified"
549545
exit 1
550546
fi
547+
FORMAT=$2
548+
if [ -z "$FORMAT" ] ; then
549+
echo "Internal error: Format not specified"
550+
exit 1
551+
fi
551552

552553
TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
553-
echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."
554+
echo "Creating tpch $FORMAT dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."
554555

555556
# Ensure the target data directory exists
556557
mkdir -p "${TPCH_DIR}"
557558

558-
# Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist
559-
FILE="${TPCH_DIR}/supplier.tbl"
560-
if test -f "${FILE}"; then
561-
echo " tbl files exist ($FILE exists)."
562-
else
563-
echo " creating tbl files with tpch_dbgen..."
564-
docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s "${SCALE_FACTOR}"
559+
# check if tpchgen-cli is installed
560+
if ! command -v tpchgen-cli &> /dev/null
561+
then
562+
echo "tpchgen-cli could not be found, please install it via 'cargo install tpchgen-cli'"
563+
exit 1
565564
fi
566565

567566
# Copy expected answers into the ./data/answers directory if it does not already exist
@@ -574,27 +573,32 @@ data_tpch() {
574573
docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
575574
fi
576575

577-
# Create 'parquet' files from tbl
578-
FILE="${TPCH_DIR}/supplier"
579-
if test -d "${FILE}"; then
580-
echo " parquet files exist ($FILE exists)."
581-
else
582-
echo " creating parquet files using benchmark binary ..."
583-
pushd "${SCRIPT_DIR}" > /dev/null
584-
$CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet
585-
popd > /dev/null
576+
if [ "$FORMAT" = "parquet" ]; then
577+
# Create 'parquet' files, one directory per file
578+
FILE="${TPCH_DIR}/supplier"
579+
if test -d "${FILE}"; then
580+
echo " parquet files exist ($FILE exists)."
581+
else
582+
echo " creating parquet files using tpchgen-cli ..."
583+
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format parquet --parquet-compression='ZSTD(1)' --parts=1 --output-dir "${TPCH_DIR}"
584+
fi
585+
return
586586
fi
587587

588-
# Create 'csv' files from tbl
589-
FILE="${TPCH_DIR}/csv/supplier"
590-
if test -d "${FILE}"; then
591-
echo " csv files exist ($FILE exists)."
592-
else
593-
echo " creating csv files using benchmark binary ..."
594-
pushd "${SCRIPT_DIR}" > /dev/null
595-
$CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}/csv" --format csv
596-
popd > /dev/null
588+
# Create 'csv' files, one directory per file
589+
if [ "$FORMAT" = "csv" ]; then
590+
FILE="${TPCH_DIR}/csv/supplier"
591+
if test -d "${FILE}"; then
592+
echo " csv files exist ($FILE exists)."
593+
else
594+
echo " creating csv files using tpchgen-cli binary ..."
595+
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format csv --parts=1 --output-dir "${TPCH_DIR}/csv"
596+
fi
597+
return
597598
fi
599+
600+
echo "Error: unknown format '$FORMAT' for tpch data generation, expected 'parquet' or 'csv'"
601+
exit 1
598602
}
599603

600604
# Runs the tpch benchmark
@@ -611,10 +615,10 @@ run_tpch() {
611615
echo "Running tpch benchmark..."
612616

613617
FORMAT=$2
614-
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
618+
debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
615619
}
616620

617-
# Runs the tpch in memory
621+
# Runs the tpch benchmark in memory (needs tpch parquet data)
618622
run_tpch_mem() {
619623
SCALE_FACTOR=$1
620624
if [ -z "$SCALE_FACTOR" ] ; then
@@ -627,7 +631,7 @@ run_tpch_mem() {
627631
echo "RESULTS_FILE: ${RESULTS_FILE}"
628632
echo "Running tpch_mem benchmark..."
629633
# -m means in memory
630-
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
634+
debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
631635
}
632636

633637
# Runs the compile profile benchmark helper

benchmarks/src/bin/dfbench.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ enum Options {
4848
Nlj(nlj::RunOpt),
4949
SortTpch(sort_tpch::RunOpt),
5050
Tpch(tpch::RunOpt),
51-
TpchConvert(tpch::ConvertOpt),
5251
}
5352

5453
// Main benchmark runner entrypoint
@@ -65,6 +64,5 @@ pub async fn main() -> Result<()> {
6564
Options::Nlj(opt) => opt.run().await,
6665
Options::SortTpch(opt) => opt.run().await,
6766
Options::Tpch(opt) => Box::pin(opt.run()).await,
68-
Options::TpchConvert(opt) => opt.run().await,
6967
}
7068
}

benchmarks/src/bin/external_aggr.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ use datafusion::datasource::listing::{
3434
use datafusion::datasource::{MemTable, TableProvider};
3535
use datafusion::error::Result;
3636
use datafusion::execution::memory_pool::FairSpillPool;
37-
use datafusion::execution::memory_pool::{human_readable_size, units};
3837
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
3938
use datafusion::execution::SessionStateBuilder;
4039
use datafusion::physical_plan::display::DisplayableExecutionPlan;
@@ -44,6 +43,7 @@ use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt, QueryResult};
4443
use datafusion_common::instant::Instant;
4544
use datafusion_common::utils::get_available_parallelism;
4645
use datafusion_common::{exec_err, DEFAULT_PARQUET_EXTENSION};
46+
use datafusion_common::{human_readable_size, units};
4747

4848
#[derive(Debug, StructOpt)]
4949
#[structopt(

0 commit comments

Comments
 (0)