Merge pull request #34058 from ggevay/qps-cluster-spec-sheet

ggevay · web-flow · commit 7171610d99c4 · 2025-11-19T16:21:02.000+01:00
Add QPS measurements to Cluster Spec Sheet
diff --git a/ci/plugins/mzcompose/hooks/command b/ci/plugins/mzcompose/hooks/command
@@ -342,7 +342,7 @@ cleanup() {
     && [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Benchmarking Staging Environment" ] \
     && [[ ! "$BUILDKITE_LABEL" =~ Terraform\ .* ]] \
     && [[ ! "$BUILDKITE_LABEL" =~ Orchestratord\ test\ .* ]] \
-    && [ "$BUILDKITE_LABEL" != "Cluster spec sheet" ]; then
+    && [[ ! "$BUILDKITE_LABEL" =~ Cluster\ spec\ sheet.* ]]; then
       echo "+++ services.log is empty, failing"
       exit 1
   fi
diff --git a/ci/release-qualification/pipeline.template.yml b/ci/release-qualification/pipeline.template.yml
@@ -530,16 +530,33 @@ steps:
         agents:
           queue: hetzner-x86-64-dedi-48cpu-192gb # 1 TB disk
 
-  - id: cluster-spec-sheet
-    label: Cluster spec sheet
-    depends_on: build-aarch64
-    timeout_in_minutes: 3600
-    concurrency: 1
-    concurrency_group: 'cluster-spec-sheet'
-    plugins:
-      - ./ci/plugins/mzcompose:
-          composition: cluster-spec-sheet
-          run: default
-          args: [--cleanup]
-    agents:
-      queue: linux-aarch64-small
+  - group: Cluster spec sheet
+    key: cluster-spec-sheet
+    steps:
+      - id: cluster-spec-sheet-cluster
+        label: "Cluster spec sheet: Cluster (against Production)"
+        depends_on: build-aarch64
+        timeout_in_minutes: 3600
+        concurrency: 1
+        concurrency_group: 'cluster-spec-sheet'
+        plugins:
+          - ./ci/plugins/mzcompose:
+              composition: cluster-spec-sheet
+              run: default
+              args: [--cleanup, --target=cloud-production, cluster]
+        agents:
+          queue: linux-aarch64-small
+
+      - id: cluster-spec-sheet-environmentd
+        label: "Cluster spec sheet: Environmentd (against Staging)"
+        depends_on: build-aarch64
+        timeout_in_minutes: 3600
+        concurrency: 1
+        # Concurrency group named after this step (not after the cluster step).
+        concurrency_group: 'cluster-spec-sheet-environmentd'
+        plugins:
+          - ./ci/plugins/mzcompose:
+              composition: cluster-spec-sheet
+              run: default
+              args: [--cleanup, --target=cloud-staging, environmentd]
+        agents:
+          queue: linux-aarch64-small
diff --git a/misc/python/materialize/mzcompose/composition.py b/misc/python/materialize/mzcompose/composition.py
@@ -1677,16 +1677,57 @@ def promote_mz(self, mz_service: str = "materialized") -> None:
         )
         assert result["result"] == "Success", f"Unexpected result {result}"
 
-    def cloud_hostname(self, quiet: bool = False) -> str:
-        """Uses the mz command line tool to get the hostname of the cloud instance"""
+    def cloud_hostname(
+        self, quiet: bool = False, timeout_secs: int = 180, poll_interval: float = 2.0
+    ) -> str:
+        """Uses the mz command line tool to get the hostname of the cloud instance, waiting until the region is ready."""
         if not quiet:
             print("Obtaining hostname of cloud instance ...")
-        region_status = self.run("mz", "region", "show", capture=True, rm=True)
-        sql_line = region_status.stdout.split("\n")[2]
-        cloud_url = sql_line.split("\t")[1].strip()
-        # It is necessary to append the 'https://' protocol; otherwise, urllib can't parse it correctly.
-        cloud_hostname = urllib.parse.urlparse("https://" + cloud_url).hostname
-        return str(cloud_hostname)
+
+        deadline = time.time() + timeout_secs
+        last_msg = ""
+
+        while time.time() < deadline:
+            proc = self.run(
+                "mz",
+                "region",
+                "show",
+                capture=True,
+                capture_stderr=True,
+                rm=True,
+                check=False,
+                silent=True,
+            )
+            out = proc.stdout or ""
+            err = proc.stderr or ""
+
+            if proc.returncode == 0:
+                lines = out.splitlines()
+                if len(lines) >= 3:
+                    line = lines[2]
+                    parts = line.split("\t")
+                    if len(parts) >= 2:
+                        cloud_url = parts[1].strip()
+                        # It is necessary to append the 'https://' protocol; otherwise, urllib can't parse it correctly.
+                        hostname = urllib.parse.urlparse(
+                            "https://" + cloud_url
+                        ).hostname
+                        if hostname:
+                            return str(hostname)
+                        else:
+                            last_msg = f"failed to parse hostname from URL: {cloud_url}"
+                    else:
+                        last_msg = f"unexpected region show output (no tab in line 3): {line!r}"
+                else:
+                    last_msg = f"unexpected region show output (too few lines): {out!r}"
+            else:
+                last_msg = (out + "\n" + err).strip()
+
+            time.sleep(poll_interval)
+
+        raise UIError(
+            f"failed to obtain cloud hostname within {timeout_secs}s: {last_msg}"
+        )
 
     T = TypeVar("T")
 
diff --git a/misc/python/materialize/test_analytics/data/cluster_spec_sheet/cluster_spec_sheet_result_storage.py b/misc/python/materialize/test_analytics/data/cluster_spec_sheet/cluster_spec_sheet_result_storage.py
@@ -28,6 +28,19 @@ class ClusterSpecSheetResultEntry:
     time_ms: int | None
 
 
+@dataclass
+class ClusterSpecSheetEnvironmentdResultEntry:
+    """One environmentd (QPS) result of the cluster spec sheet.
+
+    Each entry is written as one row of the
+    `cluster_spec_sheet_environmentd_result` table.
+    """
+
+    # Name of the scenario and the version of its definition.
+    scenario: str
+    scenario_version: str
+    # NOTE(review): presumably the data scale the scenario was run with -- confirm against the producer.
+    scale: int
+    mode: str
+    category: str
+    test_name: str
+    # NOTE(review): presumably the number of CPU cores given to environmentd -- confirm against the producer.
+    envd_cpus: int
+    repetition: int
+    # Measured queries per second; a missing value is written as SQL NULL.
+    qps: float | None
+
+
 class ClusterSpecSheetResultStorage(BaseDataStorage):
 
     def add_result(
@@ -76,3 +89,51 @@ def add_result(
             )
 
         self.database_connector.add_update_statements(sql_statements)
+
+
+class ClusterSpecSheetEnvironmentdResultStorage(BaseDataStorage):
+
+    def add_result(
+        self,
+        framework_version: str,
+        results: list[ClusterSpecSheetEnvironmentdResultEntry],
+    ) -> None:
+        job_id = buildkite.get_var(BuildkiteEnvVar.BUILDKITE_JOB_ID)
+
+        sql_statements = []
+
+        for result_entry in results:
+            # TODO: remove NULL castings when database-issues#8100 is resolved
+            sql_statements.append(
+                f"""
+                INSERT INTO cluster_spec_sheet_environmentd_result
+                (
+                    build_job_id,
+                    framework_version,
+                    scenario,
+                    scenario_version,
+                    scale,
+                    mode,
+                    category,
+                    test_name,
+                    envd_cpus,
+                    repetition,
+                    qps
+                )
+                SELECT
+                    {as_sanitized_literal(job_id)},
+                    {as_sanitized_literal(framework_version)},
+                    {as_sanitized_literal(result_entry.scenario)},
+                    {as_sanitized_literal(result_entry.scenario_version)},
+                    {result_entry.scale},
+                    {as_sanitized_literal(result_entry.mode)},
+                    {as_sanitized_literal(result_entry.category)},
+                    {as_sanitized_literal(result_entry.test_name)},
+                    {result_entry.envd_cpus},
+                    {result_entry.repetition},
+                    {result_entry.qps or 'NULL::FLOAT'}
+                ;
+                """
+            )
+
+        self.database_connector.add_update_statements(sql_statements)
diff --git a/misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql b/misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql
@@ -14,6 +14,7 @@ DELETE FROM scalability_framework_result WHERE build_job_id IN (SELECT build_id
 DELETE FROM parallel_benchmark_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
 DELETE FROM product_limits_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
 DELETE FROM cluster_spec_sheet_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
+DELETE FROM cluster_spec_sheet_environmentd_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
 DELETE FROM build_annotation_error WHERE build_job_id IN (SELECT build_job_id FROM build_annotation WHERE build_id IN (%build-ids%));
 DELETE FROM build_annotation WHERE build_id IN (%build-ids%);
 DELETE FROM build_job WHERE build_id IN (%build-ids%);
diff --git a/misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql b/misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql
@@ -0,0 +1,27 @@
+-- Copyright Materialize, Inc. and contributors. All rights reserved.
+--
+-- Use of this software is governed by the Business Source License
+-- included in the LICENSE file at the root of this repository.
+--
+-- As of the Change Date specified in that file, in accordance with
+-- the Business Source License, use of this software will be governed
+-- by the Apache License, Version 2.0.
+
+
+-- results of individual cluster spec sheet environmentd (QPS) scenarios
+CREATE TABLE cluster_spec_sheet_environmentd_result (
+   build_job_id TEXT NOT NULL,
+   framework_version TEXT NOT NULL,
+   scenario TEXT NOT NULL,
+   scenario_version TEXT NOT NULL,
+   scale INT NOT NULL,
+   mode TEXT NOT NULL,
+   category TEXT NOT NULL,
+   test_name TEXT NOT NULL,
+   envd_cpus INT NOT NULL,
+   repetition INT NOT NULL,
+   qps FLOAT
+);
+
+ALTER TABLE cluster_spec_sheet_environmentd_result OWNER TO qa;
+GRANT SELECT, INSERT, UPDATE ON TABLE cluster_spec_sheet_environmentd_result TO "hetzner-ci";
diff --git a/misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql b/misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql
@@ -33,6 +33,10 @@ CREATE OR REPLACE VIEW v_data_integrity (table_name, own_item_key, referenced_it
     FROM cluster_spec_sheet_result
     WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)
     UNION
+    SELECT 'cluster_spec_sheet_environmentd_result', build_job_id, build_job_id, 'cluster spec sheet environmentd result references missing build job'
+    FROM cluster_spec_sheet_environmentd_result
+    WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)
+    UNION
     SELECT 'build_annotation', build_job_id, build_job_id, 'build annotation references missing build job'
     FROM build_annotation
     WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)
diff --git a/misc/python/materialize/test_analytics/test_analytics_db.py b/misc/python/materialize/test_analytics/test_analytics_db.py
@@ -26,6 +26,7 @@
     BuildAnnotationStorage,
 )
 from materialize.test_analytics.data.cluster_spec_sheet.cluster_spec_sheet_result_storage import (
+    ClusterSpecSheetEnvironmentdResultStorage,
     ClusterSpecSheetResultStorage,
 )
 from materialize.test_analytics.data.feature_benchmark.feature_benchmark_result_storage import (
@@ -79,6 +80,9 @@ def __init__(self, config: MzDbConfig):
         self.cluster_spec_sheet_results = ClusterSpecSheetResultStorage(
             self.database_connector
         )
+        self.cluster_spec_sheet_environmentd_results = (
+            ClusterSpecSheetEnvironmentdResultStorage(self.database_connector)
+        )
 
     def _create_database_connector(self, config: MzDbConfig) -> DatabaseConnector:
         if config.enabled:
diff --git a/misc/shlib/shlib.bash b/misc/shlib/shlib.bash
@@ -243,6 +243,7 @@ trufflehog_jq_filter_common() {
   jq -c '
     select(
       (.Raw | contains("user1:password") | not) and
+      (.Raw | contains("infra+nightly-canary@materialize.com:XXX") | not) and
       .Raw != "postgres://mz_system:materialize@materialized:5432" and
       .Raw != "postgres://materialize:materialize@materialized:6875" and
       .Raw != "postgres://mz_system:materialize@materialized:6877" and
diff --git a/test/cluster-spec-sheet/README.md b/test/cluster-spec-sheet/README.md
@@ -8,8 +8,6 @@ Reproduce data for the cluster spec sheet effort.
 
 This will run all scenarios currently defined for the cluster spec sheet.
 
-The test expects a default cluster.
-
 Pass `--cleanup` to disable the region after the test.
 
 # Running
@@ -21,17 +19,9 @@ The workload runs as part of the release qualification pipeline in Buildkite.
 
 ## Running manually in Cloud
 
-To run the cloud canary test manually, a set of environment variables need to be made available locally:
-
-```
-export NIGHTLY_MZ_USERNAME=...
-export MZ_CLI_APP_PASSWORD=mzp_...
-export ENVIRONMENT=...
-export REGION=...
-```
+To run the cloud canary test manually, you can specify either `--target=cloud-production` (which is hardcoded to aws/us-east-1) or `--target=cloud-staging` (which is hardcoded to aws/eu-west-1). For production, you need to set the environment variables `NIGHTLY_MZ_USERNAME` and `MZ_CLI_APP_PASSWORD`. For staging, you need to set the environment variables `NIGHTLY_CANARY_USERNAME` and `NIGHTLY_CANARY_APP_PASSWORD`.
 
-The username is an email address, the app password is a password generated in the cloud console.
-The environment is either `production` or `staging`, and the region is one of the supported regions, e.g. `aws/us-east-1`.
+The username is an email address, the app password is a password generated in the cloud console (something like `mzp_...`).
 
 Once the environment variables have been set, you can run:
 
@@ -48,3 +38,27 @@ In this case, the environment variables are not required.
 ```
 bin/mzcompose --find cluster-spec-sheet run default --target=docker
 ```
+
+## Scenarios
+
+There are two kinds of scenarios:
+- cluster scaling: These measure run times and arrangement sizes.
+- envd scaling: These measure QPS.
+
+Currently, the envd scaling scenarios can't be run in Production, because changing envd's CPU cores using `mz` is not allowed there. Therefore, these scenarios need to be run with `--target=cloud-staging`.
+
+You can invoke only one kind of scenario by using its group name from `SCENARIO_GROUPS`. For example:
+```
+bin/mzcompose --find cluster-spec-sheet run default environmentd  --target=cloud-staging
+```
+or
+```
+bin/mzcompose --find cluster-spec-sheet run default cluster
+```
+
+You can also specify a specific scenario by name.
+
+For testing just the scaffolding of the cluster spec sheet itself, you can make the run much faster by using the various scaling options, e.g.:
+```
+--scale-tpch=0.01 --scale-tpch-queries=0.01 --scale-auction=1 --max-scale=4
+```
diff --git a/test/cluster-spec-sheet/mzcompose.py b/test/cluster-spec-sheet/mzcompose.py
diff --git a/test/dbbench/Dockerfile b/test/dbbench/Dockerfile
diff --git a/test/dbbench/mzbuild.yml b/test/dbbench/mzbuild.yml

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@`
`26`	`26`	`BuildAnnotationStorage,`
`27`	`27`	`)`
`28`	`28`	`from materialize.test_analytics.data.cluster_spec_sheet.cluster_spec_sheet_result_storage import (`
	`29`	`+ ClusterSpecSheetEnvironmentdResultStorage,`
`29`	`30`	`ClusterSpecSheetResultStorage,`
`30`	`31`	`)`
`31`	`32`	`from materialize.test_analytics.data.feature_benchmark.feature_benchmark_result_storage import (`
`@@ -79,6 +80,9 @@ def __init__(self, config: MzDbConfig):`
`79`	`80`	`self.cluster_spec_sheet_results = ClusterSpecSheetResultStorage(`
`80`	`81`	`self.database_connector`
`81`	`82`	`)`
	`83`	`+ self.cluster_spec_sheet_environmentd_results = (`
	`84`	`+ ClusterSpecSheetEnvironmentdResultStorage(self.database_connector)`
	`85`	`+ )`
`82`	`86`
`83`	`87`	`def _create_database_connector(self, config: MzDbConfig) -> DatabaseConnector:`
`84`	`88`	`if config.enabled:`