Skip to content

Commit 7171610

Browse files
authored
Merge pull request #34058 from ggevay/qps-cluster-spec-sheet
Add QPS measurements to Cluster Spec Sheet
2 parents 13e7ec6 + 45ee138 commit 7171610

File tree

13 files changed

+1162
-216
lines changed

13 files changed

+1162
-216
lines changed

ci/plugins/mzcompose/hooks/command

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ cleanup() {
342342
&& [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Benchmarking Staging Environment" ] \
343343
&& [[ ! "$BUILDKITE_LABEL" =~ Terraform\ .* ]] \
344344
&& [[ ! "$BUILDKITE_LABEL" =~ Orchestratord\ test\ .* ]] \
345-
&& [ "$BUILDKITE_LABEL" != "Cluster spec sheet" ]; then
345+
&& [[ ! "$BUILDKITE_LABEL" =~ Cluster\ spec\ sheet.* ]]; then
346346
echo "+++ services.log is empty, failing"
347347
exit 1
348348
fi

ci/release-qualification/pipeline.template.yml

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -530,16 +530,33 @@ steps:
530530
agents:
531531
queue: hetzner-x86-64-dedi-48cpu-192gb # 1 TB disk
532532

533-
- id: cluster-spec-sheet
534-
label: Cluster spec sheet
535-
depends_on: build-aarch64
536-
timeout_in_minutes: 3600
537-
concurrency: 1
538-
concurrency_group: 'cluster-spec-sheet'
539-
plugins:
540-
- ./ci/plugins/mzcompose:
541-
composition: cluster-spec-sheet
542-
run: default
543-
args: [--cleanup]
544-
agents:
545-
queue: linux-aarch64-small
533+
- group: Cluster spec sheet
534+
key: cluster-spec-sheet
535+
steps:
536+
- id: cluster-spec-sheet-cluster
537+
label: "Cluster spec sheet: Cluster (against Production)"
538+
depends_on: build-aarch64
539+
timeout_in_minutes: 3600
540+
concurrency: 1
541+
concurrency_group: 'cluster-spec-sheet'
542+
plugins:
543+
- ./ci/plugins/mzcompose:
544+
composition: cluster-spec-sheet
545+
run: default
546+
args: [--cleanup, --target=cloud-production, cluster]
547+
agents:
548+
queue: linux-aarch64-small
549+
550+
- id: cluster-spec-sheet-environmentd
551+
label: "Cluster spec sheet: Environmentd (against Staging)"
552+
depends_on: build-aarch64
553+
timeout_in_minutes: 3600
554+
concurrency: 1
555+
concurrency_group: 'cluster-spec-sheet-cluster'
556+
plugins:
557+
- ./ci/plugins/mzcompose:
558+
composition: cluster-spec-sheet
559+
run: default
560+
args: [--cleanup, --target=cloud-staging, environmentd]
561+
agents:
562+
queue: linux-aarch64-small

misc/python/materialize/mzcompose/composition.py

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1677,16 +1677,57 @@ def promote_mz(self, mz_service: str = "materialized") -> None:
16771677
)
16781678
assert result["result"] == "Success", f"Unexpected result {result}"
16791679

1680-
def cloud_hostname(self, quiet: bool = False) -> str:
1681-
"""Uses the mz command line tool to get the hostname of the cloud instance"""
1680+
def cloud_hostname(
1681+
self, quiet: bool = False, timeout_secs: int = 180, poll_interval: float = 2.0
1682+
) -> str:
1683+
"""Uses the mz command line tool to get the hostname of the cloud instance, waiting until the region is ready."""
16821684
if not quiet:
16831685
print("Obtaining hostname of cloud instance ...")
1684-
region_status = self.run("mz", "region", "show", capture=True, rm=True)
1685-
sql_line = region_status.stdout.split("\n")[2]
1686-
cloud_url = sql_line.split("\t")[1].strip()
1687-
# It is necessary to append the 'https://' protocol; otherwise, urllib can't parse it correctly.
1688-
cloud_hostname = urllib.parse.urlparse("https://" + cloud_url).hostname
1689-
return str(cloud_hostname)
1686+
1687+
deadline = time.time() + timeout_secs
1688+
last_msg = ""
1689+
1690+
while time.time() < deadline:
1691+
proc = self.run(
1692+
"mz",
1693+
"region",
1694+
"show",
1695+
capture=True,
1696+
capture_stderr=True,
1697+
rm=True,
1698+
check=False,
1699+
silent=True,
1700+
)
1701+
out = proc.stdout or ""
1702+
err = proc.stderr or ""
1703+
1704+
if proc.returncode == 0:
1705+
lines = out.splitlines()
1706+
if len(lines) >= 3:
1707+
line = lines[2]
1708+
parts = line.split("\t")
1709+
if len(parts) >= 2:
1710+
cloud_url = parts[1].strip()
1711+
# It is necessary to append the 'https://' protocol; otherwise, urllib can't parse it correctly.
1712+
hostname = urllib.parse.urlparse(
1713+
"https://" + cloud_url
1714+
).hostname
1715+
if hostname:
1716+
return str(hostname)
1717+
else:
1718+
last_msg = f"failed to parse hostname from URL: {cloud_url}"
1719+
else:
1720+
last_msg = f"unexpected region show output (no tab in line 3): {line!r}"
1721+
else:
1722+
last_msg = f"unexpected region show output (too few lines): {out!r}"
1723+
else:
1724+
last_msg = (out + "\n" + err).strip()
1725+
1726+
time.sleep(poll_interval)
1727+
1728+
raise UIError(
1729+
f"failed to obtain cloud hostname within {timeout_secs}s: {last_msg}"
1730+
)
16901731

16911732
T = TypeVar("T")
16921733

misc/python/materialize/test_analytics/data/cluster_spec_sheet/cluster_spec_sheet_result_storage.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,19 @@ class ClusterSpecSheetResultEntry:
2828
time_ms: int | None
2929

3030

31+
@dataclass
32+
class ClusterSpecSheetEnvironmentdResultEntry:
33+
scenario: str
34+
scenario_version: str
35+
scale: int
36+
mode: str
37+
category: str
38+
test_name: str
39+
envd_cpus: int
40+
repetition: int
41+
qps: float | None
42+
43+
3144
class ClusterSpecSheetResultStorage(BaseDataStorage):
3245

3346
def add_result(
@@ -76,3 +89,51 @@ def add_result(
7689
)
7790

7891
self.database_connector.add_update_statements(sql_statements)
92+
93+
94+
class ClusterSpecSheetEnvironmentdResultStorage(BaseDataStorage):
95+
96+
def add_result(
97+
self,
98+
framework_version: str,
99+
results: list[ClusterSpecSheetEnvironmentdResultEntry],
100+
) -> None:
101+
job_id = buildkite.get_var(BuildkiteEnvVar.BUILDKITE_JOB_ID)
102+
103+
sql_statements = []
104+
105+
for result_entry in results:
106+
# TODO: remove NULL castings when database-issues#8100 is resolved
107+
sql_statements.append(
108+
f"""
109+
INSERT INTO cluster_spec_sheet_environmentd_result
110+
(
111+
build_job_id,
112+
framework_version,
113+
scenario,
114+
scenario_version,
115+
scale,
116+
mode,
117+
category,
118+
test_name,
119+
envd_cpus,
120+
repetition,
121+
qps
122+
)
123+
SELECT
124+
{as_sanitized_literal(job_id)},
125+
{as_sanitized_literal(framework_version)},
126+
{as_sanitized_literal(result_entry.scenario)},
127+
{as_sanitized_literal(result_entry.scenario_version)},
128+
{result_entry.scale},
129+
{as_sanitized_literal(result_entry.mode)},
130+
{as_sanitized_literal(result_entry.category)},
131+
{as_sanitized_literal(result_entry.test_name)},
132+
{result_entry.envd_cpus},
133+
{result_entry.repetition},
134+
{result_entry.qps or 'NULL::FLOAT'}
135+
;
136+
"""
137+
)
138+
139+
self.database_connector.add_update_statements(sql_statements)

misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ DELETE FROM scalability_framework_result WHERE build_job_id IN (SELECT build_id
1414
DELETE FROM parallel_benchmark_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
1515
DELETE FROM product_limits_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
1616
DELETE FROM cluster_spec_sheet_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
17+
DELETE FROM cluster_spec_sheet_environmentd_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
1718
DELETE FROM build_annotation_error WHERE build_job_id IN (SELECT build_job_id FROM build_annotation WHERE build_id IN (%build-ids%));
1819
DELETE FROM build_annotation WHERE build_id IN (%build-ids%);
1920
DELETE FROM build_job WHERE build_id IN (%build-ids%);
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
-- Copyright Materialize, Inc. and contributors. All rights reserved.
2+
--
3+
-- Use of this software is governed by the Business Source License
4+
-- included in the LICENSE file at the root of this repository.
5+
--
6+
-- As of the Change Date specified in that file, in accordance with
7+
-- the Business Source License, use of this software will be governed
8+
-- by the Apache License, Version 2.0.
9+
10+
11+
-- result of individual cluster spec sheet environmentd scenarios
12+
CREATE TABLE cluster_spec_sheet_environmentd_result (
13+
build_job_id TEXT NOT NULL,
14+
framework_version TEXT NOT NULL,
15+
scenario TEXT NOT NULL,
16+
scenario_version TEXT NOT NULL,
17+
scale INT NOT NULL,
18+
mode TEXT NOT NULL,
19+
category TEXT NOT NULL,
20+
test_name TEXT NOT NULL,
21+
envd_cpus INT NOT NULL,
22+
repetition INT NOT NULL,
23+
qps FLOAT
24+
);
25+
26+
ALTER TABLE cluster_spec_sheet_environmentd_result OWNER TO qa;
27+
GRANT SELECT, INSERT, UPDATE ON TABLE cluster_spec_sheet_environmentd_result TO "hetzner-ci";

misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ CREATE OR REPLACE VIEW v_data_integrity (table_name, own_item_key, referenced_it
3333
FROM cluster_spec_sheet_result
3434
WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)
3535
UNION
36+
SELECT 'cluster_spec_sheet_environmentd_result', build_job_id, build_job_id, 'cluster spec sheet environmentd result references missing build job'
37+
FROM cluster_spec_sheet_environmentd_result
38+
WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)
39+
UNION
3640
SELECT 'build_annotation', build_job_id, build_job_id, 'build annotation references missing build job'
3741
FROM build_annotation
3842
WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)

misc/python/materialize/test_analytics/test_analytics_db.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
BuildAnnotationStorage,
2727
)
2828
from materialize.test_analytics.data.cluster_spec_sheet.cluster_spec_sheet_result_storage import (
29+
ClusterSpecSheetEnvironmentdResultStorage,
2930
ClusterSpecSheetResultStorage,
3031
)
3132
from materialize.test_analytics.data.feature_benchmark.feature_benchmark_result_storage import (
@@ -79,6 +80,9 @@ def __init__(self, config: MzDbConfig):
7980
self.cluster_spec_sheet_results = ClusterSpecSheetResultStorage(
8081
self.database_connector
8182
)
83+
self.cluster_spec_sheet_environmentd_results = (
84+
ClusterSpecSheetEnvironmentdResultStorage(self.database_connector)
85+
)
8286

8387
def _create_database_connector(self, config: MzDbConfig) -> DatabaseConnector:
8488
if config.enabled:

misc/shlib/shlib.bash

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ trufflehog_jq_filter_common() {
243243
jq -c '
244244
select(
245245
(.Raw | contains("user1:password") | not) and
246+
(.Raw | contains("[email protected]:XXX") | not) and
246247
.Raw != "postgres://mz_system:materialize@materialized:5432" and
247248
.Raw != "postgres://materialize:materialize@materialized:6875" and
248249
.Raw != "postgres://mz_system:materialize@materialized:6877" and

test/cluster-spec-sheet/README.md

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ Reproduce data for the cluster spec sheet effort.
88

99
This will run all scenarios currently defined for the cluster spec sheet.
1010

11-
The test expects a default cluster.
12-
1311
Pass `--cleanup` to disable the region after the test.
1412

1513
# Running
@@ -21,17 +19,9 @@ The workload runs as part of the release qualification pipeline in Buildkite.
2119

2220
## Running manually in Cloud
2321

24-
To run the cloud canary test manually, a set of environment variables need to be made available locally:
25-
26-
```
27-
export NIGHTLY_MZ_USERNAME=...
28-
export MZ_CLI_APP_PASSWORD=mzp_...
29-
export ENVIRONMENT=...
30-
export REGION=...
31-
```
22+
To run the cluster spec sheet test manually, specify either `--target=cloud-production` (which is hardcoded to aws/us-east-1) or `--target=cloud-staging` (which is hardcoded to aws/eu-west-1). For production, you need to set the environment variables `NIGHTLY_MZ_USERNAME` and `MZ_CLI_APP_PASSWORD`. For staging, you need to set the environment variables `NIGHTLY_CANARY_USERNAME` and `NIGHTLY_CANARY_APP_PASSWORD`.
3223

33-
The username is an email address, the app password is a password generated in the cloud console.
34-
The environment is either `production` or `staging`, and the region is one of the supported regions, e.g. `aws/us-east-1`.
24+
The username is an email address; the app password is a password generated in the cloud console (it looks something like `mzp_...`).
3525

3626
Once the environment variables have been set, you can run:
3727

@@ -48,3 +38,27 @@ In this case, the environment variables are not required.
4838
```
4939
bin/mzcompose --find cluster-spec-sheet run default --target=docker
5040
```
41+
42+
## Scenarios
43+
44+
There are two kinds of scenarios:
45+
- cluster scaling: These measure run times and arrangement sizes.
46+
- envd scaling: These measure QPS.
47+
48+
Currently, the envd scaling scenarios can't be run in Production, because changing envd's CPU cores using `mz` is not allowed there. Therefore, these scenarios need to be run with `--target=cloud-staging`.
49+
50+
You can run only one kind of scenario by passing its group name from `SCENARIO_GROUPS`. For example:
51+
```
52+
bin/mzcompose --find cluster-spec-sheet run default environmentd --target=cloud-staging
53+
```
54+
or
55+
```
56+
bin/mzcompose --find cluster-spec-sheet run default cluster
57+
```
58+
59+
You can also specify a specific scenario by name.
60+
61+
For testing just the scaffolding of the cluster spec sheet itself, you can make the run much faster by using the various scaling options, e.g.:
62+
```
63+
--scale-tpch=0.01 --scale-tpch-queries=0.01 --scale-auction=1 --max-scale=4
64+
```

0 commit comments

Comments
 (0)