Skip to content

Commit a5a5728

Browse files
committed
Merge remote-tracking branch 'apache/main' into union_extract
2 parents 90ef4ad + ee2dc83 commit a5a5728

File tree

128 files changed

+2975
-1066
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

128 files changed

+2975
-1066
lines changed

.github/workflows/rust.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ on:
4141
jobs:
4242
# Check license header
4343
license-header-check:
44-
runs-on: ubuntu-20.04
44+
runs-on: ubuntu-latest
4545
name: Check License Header
4646
steps:
4747
- uses: actions/checkout@v4

Cargo.lock

Lines changed: 10 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,21 @@
2525
![Commit Activity][commit-activity-badge]
2626
[![Open Issues][open-issues-badge]][open-issues-url]
2727
[![Discord chat][discord-badge]][discord-url]
28+
[![Linkedin][linkedin-badge]][linkedin-url]
2829

2930
[crates-badge]: https://img.shields.io/crates/v/datafusion.svg
3031
[crates-url]: https://crates.io/crates/datafusion
3132
[license-badge]: https://img.shields.io/badge/license-Apache%20v2-blue.svg
3233
[license-url]: https://github.com/apache/datafusion/blob/main/LICENSE.txt
3334
[actions-badge]: https://github.com/apache/datafusion/actions/workflows/rust.yml/badge.svg
3435
[actions-url]: https://github.com/apache/datafusion/actions?query=branch%3Amain
35-
[discord-badge]: https://img.shields.io/discord/885562378132000778.svg?logo=discord&style=flat-square
36+
[discord-badge]: https://img.shields.io/badge/Chat-Discord-purple
3637
[discord-url]: https://discord.com/invite/Qw5gKqHxUM
3738
[commit-activity-badge]: https://img.shields.io/github/commit-activity/m/apache/datafusion
3839
[open-issues-badge]: https://img.shields.io/github/issues-raw/apache/datafusion
3940
[open-issues-url]: https://github.com/apache/datafusion/issues
41+
[linkedin-badge]: https://img.shields.io/badge/Follow-Linkedin-blue
42+
[linkedin-url]: https://www.linkedin.com/company/apache-datafusion/
4043

4144
[Website](https://datafusion.apache.org/) |
4245
[API Docs](https://docs.rs/datafusion/latest/datafusion/) |

benchmarks/lineprotocol.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#!/usr/bin/env python
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing,
13+
# software distributed under the License is distributed on an
14+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
# KIND, either express or implied. See the License for the
16+
# specific language governing permissions and limitations
17+
# under the License.
18+
19+
20+
"""
21+
Converts a given json to LineProtocol format that can be
22+
visualised by grafana/other systems that support LineProtocol.
23+
24+
Usage example:
25+
$ python3 lineprotocol.py sort.json
26+
benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=0,row_count=10838832,elapsed_ms=85626006 1691105678000000000
27+
benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=1,row_count=10838832,elapsed_ms=68694468 1691105678000000000
28+
benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=2,row_count=10838832,elapsed_ms=63392883 1691105678000000000
29+
benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=3,row_count=10838832,elapsed_ms=66388367 1691105678000000000
30+
"""
31+
32+
# sort.json
33+
"""
34+
{
35+
"queries": [
36+
{
37+
"iterations": [
38+
{
39+
"elapsed": 85626.006132,
40+
"row_count": 10838832
41+
},
42+
{
43+
"elapsed": 68694.467851,
44+
"row_count": 10838832
45+
},
46+
{
47+
"elapsed": 63392.883406,
48+
"row_count": 10838832
49+
},
50+
{
51+
"elapsed": 66388.367387,
52+
"row_count": 10838832
53+
},
54+
],
55+
"query": "sort utf8",
56+
"start_time": 1691105678
57+
},
58+
],
59+
"context": {
60+
"arguments": [
61+
"sort",
62+
"--path",
63+
"benchmarks/data",
64+
"--scale-factor",
65+
"1.0",
66+
"--iterations",
67+
"4",
68+
"-o",
69+
"sort.json"
70+
],
71+
"benchmark_version": "28.0.0",
72+
"datafusion_version": "28.0.0",
73+
"num_cpus": 8,
74+
"start_time": 1691105678
75+
}
76+
}
77+
"""
78+
79+
from __future__ import annotations
80+
81+
import json
82+
from dataclasses import dataclass
83+
from typing import Dict, List, Any
84+
from pathlib import Path
85+
from argparse import ArgumentParser
86+
import sys
87+
print = sys.stdout.write
88+
89+
90+
@dataclass
class QueryResult:
    """A single benchmark iteration: elapsed time (ms) and rows produced."""

    elapsed: float
    row_count: int

    @classmethod
    def load_from(cls, data: Dict[str, Any]) -> QueryResult:
        """Build a QueryResult from one entry of the ``iterations`` JSON list."""
        return cls(
            elapsed=data["elapsed"],
            row_count=data["row_count"],
        )
98+
99+
100+
@dataclass
class QueryRun:
    """All iterations of one benchmark query plus its start timestamp."""

    # Fixed annotation: the JSON carries a string label such as "sort utf8"
    # (see the sample document above); the original said `int`.
    query: str
    iterations: List[QueryResult]
    start_time: int

    @classmethod
    def load_from(cls, data: Dict[str, Any]) -> QueryRun:
        """Build a QueryRun from one entry of the ``queries`` JSON list."""
        return cls(
            query=data["query"],
            iterations=[QueryResult(**iteration) for iteration in data["iterations"]],
            start_time=data["start_time"],
        )

    @property
    def execution_time(self) -> float:
        """Minimum elapsed time across all iterations.

        The minimum (rather than the mean) is used to damp out variation
        caused by other work the system was doing during the run.
        """
        assert len(self.iterations) >= 1
        return min(iteration.elapsed for iteration in self.iterations)
121+
122+
123+
@dataclass
class Context:
    """Benchmark-wide metadata taken from the ``context`` JSON object."""

    benchmark_version: str
    datafusion_version: str
    num_cpus: int
    start_time: int
    arguments: List[str]
    name: str

    @classmethod
    def load_from(cls, data: Dict[str, Any]) -> Context:
        """Build a Context from the ``context`` JSON object.

        The benchmark name is derived from the first CLI argument.
        """
        args = data["arguments"]
        return cls(
            benchmark_version=data["benchmark_version"],
            datafusion_version=data["datafusion_version"],
            num_cpus=data["num_cpus"],
            start_time=data["start_time"],
            arguments=args,
            name=args[0],
        )
142+
143+
144+
@dataclass
class BenchmarkRun:
    """A fully parsed benchmark result file: context plus per-query runs."""

    context: Context
    queries: List[QueryRun]

    @classmethod
    def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun:
        """Build a BenchmarkRun from the top-level JSON document."""
        runs = [QueryRun.load_from(result) for result in data["queries"]]
        return cls(context=Context.load_from(data["context"]), queries=runs)

    @classmethod
    def load_from_file(cls, path: Path) -> BenchmarkRun:
        """Read ``path`` as JSON and parse it into a BenchmarkRun."""
        with open(path, "r") as f:
            return cls.load_from(json.load(f))
160+
161+
162+
def lineformat(
    baseline: Path,
) -> None:
    """Print the benchmark file at ``baseline`` in InfluxDB line-protocol form.

    One line is emitted per iteration:
    ``benchmark,<tags> <fields> <timestamp-ns>``
    """
    # Keep the parameter as the file path; parse into a separate name instead
    # of rebinding ``baseline`` (the original reused the Path-annotated param).
    run = BenchmarkRun.load_from_file(baseline)
    context = run.context
    # Tag set shared by every emitted line (fixed typo: was "benchamrk_str").
    benchmark_str = (
        f"benchmark,name={context.name}"
        f",version={context.benchmark_version}"
        f",datafusion_version={context.datafusion_version}"
        f",num_cpus={context.num_cpus}"
    )
    for query in run.queries:
        query_str = f"query=\"{query.query}\""
        # Line protocol timestamps are nanoseconds since the Unix epoch.
        timestamp = f"{query.start_time*10**9}"
        for iter_num, result in enumerate(query.iterations):
            # ``print`` is bound to sys.stdout.write at module level, so the
            # trailing newline must be explicit.
            print(
                f"{benchmark_str} {query_str},iteration={iter_num}"
                f",row_count={result.row_count}"
                f",elapsed_ms={result.elapsed*1000:.0f} {timestamp}\n"
            )
173+
174+
def main() -> None:
    """CLI entry point: parse the benchmark-file path and emit line protocol."""
    parser = ArgumentParser()
    parser.add_argument(
        "path",
        type=Path,
        help="Path to the benchmark file.",
    )
    options = parser.parse_args()

    # Bug fix: the positional argument is registered as "path", but the
    # original code read ``options.baseline_path``, which raises
    # AttributeError on every invocation.
    lineformat(options.path)
184+
185+
186+
187+
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

datafusion-examples/examples/advanced_parquet_index.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ impl TableProvider for IndexTableProvider {
504504
.with_file(partitioned_file);
505505

506506
// Finally, put it all together into a DataSourceExec
507-
Ok(file_scan_config.new_exec())
507+
Ok(file_scan_config.build())
508508
}
509509

510510
/// Tell DataFusion to push filters down to the scan method

datafusion-examples/examples/advanced_udaf.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -423,11 +423,11 @@ impl AggregateUDFImpl for SimplifiedGeoMeanUdaf {
423423
// In real-world scenarios, you might create UDFs from built-in expressions.
424424
Ok(Expr::AggregateFunction(AggregateFunction::new_udf(
425425
Arc::new(AggregateUDF::from(GeoMeanUdaf::new())),
426-
aggregate_function.args,
427-
aggregate_function.distinct,
428-
aggregate_function.filter,
429-
aggregate_function.order_by,
430-
aggregate_function.null_treatment,
426+
aggregate_function.params.args,
427+
aggregate_function.params.distinct,
428+
aggregate_function.params.filter,
429+
aggregate_function.params.order_by,
430+
aggregate_function.params.null_treatment,
431431
)))
432432
};
433433
Some(Box::new(simplify))

0 commit comments

Comments
 (0)