
Commit 9377d17

Merge branch 'main' of https://github.com/datafuselabs/databend into refactor/const
2 parents: 1eda119 + 4e1e864

349 files changed: +8321 additions, -4522 deletions


.github/actions/benchmark_cloud/action.yml

Lines changed: 24 additions & 14 deletions
@@ -22,6 +22,14 @@ inputs:
   size:
     description: "Small/Medium/Large"
     required: true
+  cache_size:
+    description: "Warehouse cache size"
+    required: false
+    default: "0"
+  tries:
+    description: "Number of attempts per query (1-3)"
+    required: false
+    default: ""
   version:
     description: "Databend version"
     required: true
@@ -38,37 +46,39 @@ inputs:
 runs:
   using: "composite"
   steps:
-    - name: Install script dependencies
-      shell: bash
-      run: |
-        sudo apt-get update -yq
-        sudo apt-get install -yq python3
-
     - name: Prepare
-      working-directory: benchmark/clickbench
+      working-directory: benchmark
       shell: bash
       id: prepare
       env:
         BENDSQL_DSN: "databend://${{ inputs.cloud_user }}:${{ inputs.cloud_password }}@${{ inputs.cloud_gateway }}:443"
       run: |
         if [[ "${{ inputs.dataset }}" == "load" ]]; then
           echo "database=load_test_${{ inputs.run_id }}" >> $GITHUB_OUTPUT
-          echo "tries=1" >> $GITHUB_OUTPUT
         else
           database="${{ inputs.database }}"
           if [[ -z "$database" ]]; then
-            database="clickbench"
+            database="benchmark"
           fi
           echo "database=$database" >> $GITHUB_OUTPUT
-          echo "tries=3" >> $GITHUB_OUTPUT
         fi
+        tries="${{ inputs.tries }}"
+        if [[ -z "$tries" ]]; then
+          if [[ "${{ inputs.dataset }}" == "load" ]]; then
+            tries=1
+          else
+            tries=3
+          fi
+        fi
+        echo "tries=$tries" >> $GITHUB_OUTPUT

     - name: Run Benchmark
-      working-directory: benchmark/clickbench
+      working-directory: benchmark
       env:
         BENCHMARK_ID: ${{ inputs.run_id }}
         BENCHMARK_DATASET: ${{ inputs.dataset }}
         BENCHMARK_SIZE: ${{ inputs.size }}
+        BENCHMARK_CACHE_SIZE: ${{ inputs.cache_size }}
         BENCHMARK_VERSION: ${{ inputs.version }}
         BENCHMARK_DATABASE: ${{ steps.prepare.outputs.database }}
         BENCHMARK_TRIES: ${{ steps.prepare.outputs.tries }}
@@ -86,10 +96,10 @@ runs:
     - name: Upload artifact
       uses: actions/upload-artifact@v4
       with:
-        name: benchmark-${{ inputs.dataset }}-${{ inputs.size }}
+        name: benchmark-${{ inputs.dataset }}-${{ inputs.size }}-cache-${{ inputs.cache_size }}
         path: |
-          benchmark/clickbench/result-${{ inputs.dataset }}-cloud-${{ inputs.size }}.json
-          benchmark/clickbench/result-${{ inputs.dataset }}-cloud-${{ inputs.size }}-*.ndjson
+          benchmark/result-${{ inputs.dataset }}-cloud-${{ inputs.size }}-cache-${{ inputs.cache_size }}.json
+          benchmark/result-${{ inputs.dataset }}-cloud-${{ inputs.size }}-cache-${{ inputs.cache_size }}-*.ndjson

     - name: Remove warehouse
       if: always()
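
A caller wires the new knobs through the action's `with:` block; when `tries` is left empty, the Prepare step above falls back to 1 for the `load` dataset and 3 for everything else. A minimal sketch of a calling step (the step name, chosen dataset, and secret names are illustrative assumptions, not taken from this commit):

    - name: Benchmark tpch1000 with warm cache   # hypothetical caller step
      uses: ./.github/actions/benchmark_cloud
      with:
        run_id: ${{ github.run_id }}
        dataset: tpch1000
        size: Large
        cache_size: "300"   # new input; defaults to "0" (no warehouse cache)
        tries: ""           # new input; empty means "derive from dataset"
        version: ${{ inputs.version }}
        cloud_user: ${{ secrets.BENCHMARK_CLOUD_USER }}
        cloud_password: ${{ secrets.BENCHMARK_CLOUD_PASSWORD }}
        cloud_gateway: ${{ secrets.BENCHMARK_CLOUD_GATEWAY }}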

.github/actions/benchmark_local/action.yml

Lines changed: 3 additions & 9 deletions
@@ -20,14 +20,8 @@ inputs:
 runs:
   using: "composite"
   steps:
-    - name: Install script dependencies
-      shell: bash
-      run: |
-        sudo apt-get update -yq
-        sudo apt-get install -yq python3
-
     - name: Run Benchmark
-      working-directory: benchmark/clickbench
+      working-directory: benchmark
       env:
         BENCHMARK_ID: ${{ inputs.run_id }}
         BENCHMARK_DATASET: ${{ inputs.dataset }}
@@ -42,7 +36,7 @@ runs:
         name: benchmark_local

     - name: Prepare Metadata
-      working-directory: benchmark/clickbench
+      working-directory: benchmark
       shell: bash
       run: |
         case ${{ inputs.source }} in
@@ -63,4 +57,4 @@ runs:
       uses: actions/upload-artifact@v4
       with:
         name: benchmark-${{ inputs.dataset }}-local
-        path: benchmark/clickbench/result-${{ inputs.dataset }}-local.json
+        path: benchmark/result-${{ inputs.dataset }}-local.json

.github/actions/publish_debug_symbols/action.yml

Lines changed: 5 additions & 9 deletions
@@ -32,17 +32,13 @@ runs:
       shell: bash
       run: |
         publish_name="databend-debug-${{ inputs.category }}-${{ inputs.version }}-${{ inputs.target }}.tar.gz"
-        tar -C distro/bin -czvf ${publish_name} databend-query.debug
-        echo "name=$publish_name" >> $GITHUB_OUTPUT
+        symbol_name="databend-query-${{ inputs.category }}-${{ inputs.version }}-${{ inputs.target }}.debug"
+        mv "distro/bin/databend-query.debug" "distro/bin/${symbol_name}"
+        tar -C distro/bin -czvf "${publish_name}" "${symbol_name}"
+        rm "distro/bin/${symbol_name}"
+        echo "name=${publish_name}" >> $GITHUB_OUTPUT

     - name: Update debug symbols to github
       shell: bash
       run: |
         gh release upload ${{ inputs.version }} ${{ steps.prepare.outputs.name }} --clobber
-
-    - name: Sync debug symbols to R2
-      shell: bash
-      continue-on-error: true
-      if: inputs.category == 'default'
-      run: |
-        aws s3 cp ${{ steps.prepare.outputs.name }} s3://repo/databend/${{ inputs.version }}/${{ steps.prepare.outputs.name }} --no-progress --checksum-algorithm=CRC32
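With the symbol file now renamed to carry the category, version, and target, archives for several builds can sit side by side after extraction. A hypothetical consumer session (the file names below are illustrative, not produced by this commit):

    # download the release asset, unpack it, and point gdb at the renamed symbol file
    tar -xzf databend-debug-default-v1.2.700-x86_64-unknown-linux-gnu.tar.gz
    gdb --exec=./databend-query \
        --symbols=./databend-query-default-v1.2.700-x86_64-unknown-linux-gnu.debug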

.github/workflows/cloud.yml

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ jobs:
           ref: "refs/pull/${{ github.event.number }}/merge"
       - name: Build Release
         uses: ./.github/actions/build_linux
-        timeout-minutes: 60
+        timeout-minutes: 120
         env:
           DATABEND_ENTERPRISE_LICENSE_PUBLIC_KEY: ${{ secrets.DATABEND_ENTERPRISE_LICENSE_PUBLIC_KEY }}
           DATABEND_TELEMETRY_ENDPOINT: ${{ secrets.DATABEND_TELEMETRY_ENDPOINT}}

.github/workflows/release.yml

Lines changed: 0 additions & 4 deletions
@@ -313,10 +313,6 @@ jobs:
         uses: ./.github/actions/publish_debug_symbols
         env:
           GH_TOKEN: ${{ github.token }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
-          AWS_DEFAULT_REGION: auto
-          AWS_ENDPOINT_URL: ${{ secrets.R2_ENDPOINT_URL }}
         with:
           version: ${{ needs.create_release.outputs.version }}
           target: ${{ matrix.target }}

.github/workflows/reuse.benchmark.yml

Lines changed: 23 additions & 22 deletions
@@ -115,6 +115,8 @@ jobs:
           source: ${{ inputs.source }}
           source_id: ${{ inputs.source_id }}
           size: Small
+          cache_size: "0"
+          tries: 1
           version: ${{ inputs.version }}
           cloud_user: ${{ secrets.BENCHMARK_CLOUD_USER }}
           cloud_password: ${{ secrets.BENCHMARK_CLOUD_PASSWORD }}
@@ -137,11 +139,12 @@ jobs:
     strategy:
       matrix:
         include:
-          - { dataset: hits, size: Small, database: hits, timeout: 10 }
-          - { dataset: hits, size: Large, database: hits, timeout: 10 }
-          - { dataset: tpch100, size: Small, database: tpch_100, timeout: 20 }
-          - { dataset: tpch100, size: Large, database: tpch_100, timeout: 20 }
-          - { dataset: tpch1000, size: Large, database: tpch_1000, timeout: 60 }
+          - { dataset: hits, size: Small, database: hits, timeout: 10, cache_size: 0, tries: 3 }
+          - { dataset: hits, size: Large, database: hits, timeout: 10, cache_size: 0, tries: 3 }
+          - { dataset: tpch100, size: Small, database: tpch_100, timeout: 20, cache_size: 0, tries: 3 }
+          - { dataset: tpch100, size: Large, database: tpch_100, timeout: 20, cache_size: 0, tries: 3 }
+          - { dataset: tpch1000, size: Large, database: tpch_1000, timeout: 60, cache_size: 0, tries: 1 }
+          - { dataset: tpch1000, size: Large, database: tpch_1000, timeout: 60, cache_size: 300, tries: 1 }
     fail-fast: true
     max-parallel: 1
     steps:
@@ -165,6 +168,8 @@ jobs:
           source: ${{ inputs.source }}
           source_id: ${{ inputs.source_id }}
           size: ${{ matrix.size }}
+          cache_size: ${{ matrix.cache_size }}
+          tries: ${{ matrix.tries }}
           version: ${{ inputs.version }}
           cloud_user: ${{ secrets.BENCHMARK_CLOUD_USER }}
           cloud_password: ${{ secrets.BENCHMARK_CLOUD_PASSWORD }}
@@ -184,17 +189,14 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - name: Install Dependencies
-        run: |
-          sudo apt-get update -yq
-          sudo apt-get install -yq python3-jinja2
+      - uses: astral-sh/setup-uv@v5
       - uses: actions/download-artifact@v4
         with:
-          path: benchmark/clickbench/results
+          path: benchmark/results
           pattern: benchmark-*
           merge-multiple: true
       - name: Get Report Prefix
-        working-directory: benchmark/clickbench
+        working-directory: benchmark
         run: |
           shopt -s nullglob
           for result in results/*.json; do
@@ -215,16 +217,17 @@ jobs:
           AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
           AWS_DEFAULT_REGION: auto
           AWS_ENDPOINT_URL: ${{ secrets.R2_ENDPOINT_URL }}
-        working-directory: benchmark/clickbench
+        working-directory: benchmark
         run: |
           echo -e "## ClickBench Report\n" > /tmp/body
           shopt -s nullglob
+          uv sync
           for p in results/*; do
             [ -d "$p" ] || continue
             dataset=$(basename $p)
             aws s3 sync results/$dataset/ ${REPORT_S3_PREFIX}/ --include "*.json" --no-progress --checksum-algorithm=CRC32
             aws s3 sync "s3://benchmark/clickbench/release/${dataset}/latest/" ./results/${dataset}/ --exclude "*" --include "*.json" || true
-            ./update_results.py --dataset $dataset --pr ${{ inputs.source_id }}
+            uv run update_results.py --dataset $dataset --pr ${{ inputs.source_id }}
             aws s3 cp ./results/${dataset}.html ${REPORT_S3_PREFIX}/${dataset}.html --no-progress --checksum-algorithm=CRC32
             echo "* **${dataset}**: https://benchmark.databend.com/clickbench/pr/${{ inputs.source_id }}/${{ inputs.run_id }}/${dataset}.html" >> /tmp/body
           done
@@ -249,19 +252,17 @@ jobs:
         # - "internal"
     steps:
       - uses: actions/checkout@v4
-      - name: Install Dependencies
-        run: |
-          sudo apt-get update -yq
-          sudo apt-get install -yq python3-jinja2
+      - uses: astral-sh/setup-uv@v5
       - uses: actions/download-artifact@v4
         with:
-          path: benchmark/clickbench/results
+          path: benchmark/results
           pattern: benchmark-${{ matrix.dataset }}-*
           merge-multiple: true
       - name: Prepare results directory
-        working-directory: benchmark/clickbench
+        working-directory: benchmark
        run: |
           shopt -s nullglob
+          uv sync
           for result in results/*.json; do
             dataset=$(echo $result | sed -E 's/.*result-(\w+)-.*\.json/\1/')
             mkdir -p results/${dataset}/
@@ -274,7 +275,7 @@ jobs:
             mv $ndjson ndjsons/${dataset}/$(basename $ndjson)
           done
       - name: Generate report and upload to R2
-        working-directory: benchmark/clickbench
+        working-directory: benchmark
         env:
           AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
@@ -283,7 +284,7 @@ jobs:
         run: |
           aws s3 sync s3://benchmark/clickbench/release/${{ matrix.dataset }}/$(date --date='-1 month' -u +%Y)/$(date --date='-1 month' -u +%m)/ ./results/${{ matrix.dataset }}/
           aws s3 sync s3://benchmark/clickbench/release/${{ matrix.dataset }}/$(date -u +%Y)/$(date -u +%m)/ ./results/${{ matrix.dataset }}/
-          ./update_results.py --dataset ${{ matrix.dataset }} --release ${{ inputs.source_id }}
+          uv run update_results.py --dataset ${{ matrix.dataset }} --release ${{ inputs.source_id }}

           RESULT_PREFIX="s3://benchmark/clickbench/release/${{ matrix.dataset }}/$(date -u +%Y)/$(date -u +%m)/$(date -u +%Y-%m-%d)/${{ inputs.source_id }}"
           LATEST_PREFIX="s3://benchmark/clickbench/release/${{ matrix.dataset }}/latest/latest"
@@ -294,7 +295,7 @@ jobs:

           aws s3 cp ./results/${{ matrix.dataset }}.html s3://benchmark/clickbench/release/${{ matrix.dataset }}.html --no-progress --checksum-algorithm=CRC32
       - name: Upload NDJSON archives to R2
-        working-directory: benchmark/clickbench
+        working-directory: benchmark
         env:
           AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
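
The report jobs now rely on uv (installed via astral-sh/setup-uv@v5) instead of apt-installed python3-jinja2, so the benchmark directory is expected to carry a project file that `uv sync` can resolve. Roughly the same flow can be reproduced locally; a sketch, assuming a pyproject.toml in `benchmark/` that declares the report script's dependencies (e.g. jinja2) and an illustrative PR number:

    cd benchmark
    uv sync                                              # create the project venv from pyproject.toml
    uv run update_results.py --dataset hits --pr 12345   # render the report the same way the workflow does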

README.md

Lines changed: 8 additions & 7 deletions
@@ -1,6 +1,6 @@
 <h1 align="center">Databend</h1>
-<h3 align="center">Unified Multimodal Database for Any Data at Any Scale.</h3>
-<p align="center">A <strong>next-generation</strong> cloud-native warehouse built in <strong>Rust</strong>. Open-source, Snowflake-compatible, and unifying BI, AI, Search, Geo, and Stream.</p>
+<h3 align="center">The All-in-One Cloud Data Warehouse for Analytics & AI</h3>
+<p align="center">Built in <strong>Rust</strong> for blazing fast, cost-efficient analytics.<br> Open-source, <strong>Snowflake-compatible</strong>, and designed to unify BI, Search, and AI on object storage.</p>

 <div align="center">

@@ -24,12 +24,13 @@

 ## 💡 Why Databend?

-Databend is an open-source **unified multimodal database** built in Rust. It empowers **Analytics**, **AI**, **Search**, and **Geo** workloads on a single platform directly from object storage.
+Databend is an open-source, **All-in-One multimodal database** built in Rust. It seamlessly unifies **Analytics**, **AI**, **Search**, and **Geo** workloads into a single platform, enabling high-performance processing directly on top of object storage.

-- **Unified Engine**: One optimizer and runtime for all data types (Structured, Semi-structured, Vector).
-- **Native Pipelines**: Built-in **Stream** and **Task** for automated data cleaning and transformation.
-- **Cloud Native**: Stateless compute nodes over object storage (S3, GCS, Azure) with full ACID support.
-- **High Performance**: Vectorized execution and Zero-Copy processing.
+| | |
+| :--- | :--- |
+| **📊 BI & Analytics**<br>Supercharge your analytics with a high-performance, vectorized SQL query engine. | **✨ Vector Search**<br>Power AI and RAG applications with built-in, high-speed vector similarity search. |
+| **📄 JSON Search**<br>Seamlessly query and analyze semi-structured data with powerful JSON optimization. | **🌍 Geo Search**<br>Efficiently store, index, and query geospatial data for location intelligence. |
+| **🔄 ETL Pipeline**<br>Streamline data ingestion and transformation with built-in Streams and Tasks. | **🌿 Branching**<br>Create isolated Copy-on-Write branches instantly for dev, test, or experiments. |

 ![Databend Architecture](https://github.com/user-attachments/assets/288dea8d-0243-4c45-8d18-d4d402b08075)

Lines changed: 4 additions & 5 deletions
@@ -1,24 +1,23 @@
 # Benchmark Directory

-This directory contains subdirectories dedicated to various performance tests, 
+This directory contains subdirectories dedicated to various performance tests,

 specifically for TPCH tests, Hits tests, and internal query performance tests. Below is a brief overview of each subdirectory:

 ## 1. tpch

-This subdirectory includes performance evaluation tools and scripts related to TPCH tests. 
+This subdirectory includes performance evaluation tools and scripts related to TPCH tests.

 TPCH tests are designed to simulate complex query scenarios to assess the system's performance when handling large datasets. In this directory, you can find testing scripts, configuration files, and documentation for test results.

 ## 2. hits

-Hits tests focus on specific queries or operations for performance testing. 
+Hits tests focus on specific queries or operations for performance testing.

 In this subdirectory, you'll find scripts for Hits tests, sample queries, and performance analysis tools.

 ## 3. internal

-The internal subdirectory contains testing tools and scripts dedicated to ensuring the performance of internal queries. 
+The internal subdirectory contains testing tools and scripts dedicated to ensuring the performance of internal queries.

 These tests may be conducted to ensure the system performs well when handling internal queries specific.
-
