From 67db41ab89dc917f3343e0e15a25e82d180255e6 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 3 Dec 2025 22:40:52 +0000 Subject: [PATCH] orchestratord test: Upload upgrade downtimes to test analytics As requested by Jon, for later analysis --- ci/nightly/pipeline.template.yml | 3 +- .../upgrade_downtime_result_storage.py | 60 +++++++++++++++ .../setup/cleanup/remove-build.sql | 1 + .../setup/tables/15-cluster-spec-sheet.sql | 1 - .../16-cluster-spec-sheet-environmentd.sql | 1 - .../setup/tables/17-upgrade-downtime.sql | 21 +++++ .../setup/views/100-data-integrity.sql | 4 + .../test_analytics/test_analytics_db.py | 6 ++ test/orchestratord/mzcompose.py | 76 +++++++++++++++++-- 9 files changed, 164 insertions(+), 9 deletions(-) create mode 100644 misc/python/materialize/test_analytics/data/upgrade_downtime/upgrade_downtime_result_storage.py create mode 100644 misc/python/materialize/test_analytics/setup/tables/17-upgrade-downtime.sql diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 98267f4222c88..c3d807182bf35 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -2412,7 +2412,8 @@ steps: run: upgrade-downtime ci-builder: stable agents: - queue: hetzner-aarch64-16cpu-32gb + # More stable results for recording benchmarks + queue: hetzner-x86-64-dedi-16cpu-64gb - id: orchestratord-default-properties label: "Orchestratord + defaults for properties" diff --git a/misc/python/materialize/test_analytics/data/upgrade_downtime/upgrade_downtime_result_storage.py b/misc/python/materialize/test_analytics/data/upgrade_downtime/upgrade_downtime_result_storage.py new file mode 100644 index 0000000000000..473851ad4536c --- /dev/null +++ b/misc/python/materialize/test_analytics/data/upgrade_downtime/upgrade_downtime_result_storage.py @@ -0,0 +1,60 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. 
+# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. +from dataclasses import dataclass + +from materialize import buildkite +from materialize.buildkite import BuildkiteEnvVar +from materialize.test_analytics.data.base_data_storage import BaseDataStorage +from materialize.test_analytics.util.mz_sql_util import as_sanitized_literal + + +@dataclass +class UpgradeDowntimeResultEntry: + scenario: str + scenario_version: str + downtime_initial: float + downtime_upgrade: float + + +class UpgradeDowntimeResultStorage(BaseDataStorage): + + def add_result( + self, + framework_version: str, + results: list[UpgradeDowntimeResultEntry], + ) -> None: + job_id = buildkite.get_var(BuildkiteEnvVar.BUILDKITE_JOB_ID) + + sql_statements = [] + + for result_entry in results: + # TODO: remove NULL castings when database-issues#8100 is resolved + sql_statements.append( + f""" + INSERT INTO upgrade_downtime_result + ( + build_job_id, + framework_version, + scenario, + scenario_version, + downtime_initial, + downtime_upgrade + ) + SELECT + {as_sanitized_literal(job_id)}, + {as_sanitized_literal(framework_version)}, + {as_sanitized_literal(result_entry.scenario)}, + {as_sanitized_literal(result_entry.scenario_version)}, + {result_entry.downtime_initial}, + {result_entry.downtime_upgrade} + ; + """ + ) + + self.database_connector.add_update_statements(sql_statements) diff --git a/misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql b/misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql index 5e65d8c5c4266..265bf8536fc34 100644 --- a/misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql +++ b/misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql @@ -15,6 +15,7 @@ DELETE 
FROM parallel_benchmark_result WHERE build_job_id IN (SELECT build_id FRO DELETE FROM product_limits_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%)); DELETE FROM cluster_spec_sheet_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%)); DELETE FROM cluster_spec_sheet_environmentd_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%)); +DELETE FROM upgrade_downtime_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%)); DELETE FROM build_annotation_error WHERE build_job_id IN (SELECT build_job_id FROM build_annotation WHERE build_id IN (%build-ids%)); DELETE FROM build_annotation WHERE build_id IN (%build-ids%); DELETE FROM build_job WHERE build_id IN (%build-ids%); diff --git a/misc/python/materialize/test_analytics/setup/tables/15-cluster-spec-sheet.sql b/misc/python/materialize/test_analytics/setup/tables/15-cluster-spec-sheet.sql index b1854bc56e472..5b3044165203f 100644 --- a/misc/python/materialize/test_analytics/setup/tables/15-cluster-spec-sheet.sql +++ b/misc/python/materialize/test_analytics/setup/tables/15-cluster-spec-sheet.sql @@ -8,7 +8,6 @@ -- by the Apache License, Version 2.0. --- result of individual product limits scenarios CREATE TABLE cluster_spec_sheet_result ( build_job_id TEXT NOT NULL, framework_version TEXT NOT NULL, diff --git a/misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql b/misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql index 9b4399fb64263..d9ee25c3e1b3a 100644 --- a/misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql +++ b/misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql @@ -8,7 +8,6 @@ -- by the Apache License, Version 2.0. 
--- result of individual product limits scenarios CREATE TABLE cluster_spec_sheet_environmentd_result ( build_job_id TEXT NOT NULL, framework_version TEXT NOT NULL, diff --git a/misc/python/materialize/test_analytics/setup/tables/17-upgrade-downtime.sql b/misc/python/materialize/test_analytics/setup/tables/17-upgrade-downtime.sql new file mode 100644 index 0000000000000..c6b8bbde73fc0 --- /dev/null +++ b/misc/python/materialize/test_analytics/setup/tables/17-upgrade-downtime.sql @@ -0,0 +1,21 @@ +-- Copyright Materialize, Inc. and contributors. All rights reserved. +-- +-- Use of this software is governed by the Business Source License +-- included in the LICENSE file at the root of this repository. +-- +-- As of the Change Date specified in that file, in accordance with +-- the Business Source License, use of this software will be governed +-- by the Apache License, Version 2.0. + + +CREATE TABLE upgrade_downtime_result ( + build_job_id TEXT NOT NULL, + framework_version TEXT NOT NULL, + scenario TEXT NOT NULL, + scenario_version TEXT NOT NULL, + downtime_initial TEXT NOT NULL, + downtime_upgrade TEXT NOT NULL +); + +ALTER TABLE upgrade_downtime_result OWNER TO qa; +GRANT SELECT, INSERT, UPDATE ON TABLE upgrade_downtime_result TO "hetzner-ci"; diff --git a/misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql b/misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql index 56b93309c5c94..f00ae42a04437 --- a/misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql +++ b/misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql @@ -37,6 +37,10 @@ CREATE OR REPLACE VIEW v_data_integrity (table_name, own_item_key, referenced_it FROM cluster_spec_sheet_environmentd_result WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job) UNION + SELECT 'upgrade_downtime_result', build_job_id, build_job_id, 'upgrade downtime result references missing build job' + FROM 
upgrade_downtime_result + WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job) + UNION SELECT 'build_annotation', build_job_id, build_job_id, 'build annotation references missing build job' FROM build_annotation WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job) diff --git a/misc/python/materialize/test_analytics/test_analytics_db.py b/misc/python/materialize/test_analytics/test_analytics_db.py index ccbdc076a4d97..aec3907813708 100644 --- a/misc/python/materialize/test_analytics/test_analytics_db.py +++ b/misc/python/materialize/test_analytics/test_analytics_db.py @@ -47,6 +47,9 @@ from materialize.test_analytics.data.scalability_framework.scalability_framework_result_storage import ( ScalabilityFrameworkResultStorage, ) +from materialize.test_analytics.data.upgrade_downtime.upgrade_downtime_result_storage import ( + UpgradeDowntimeResultStorage, +) TEST_ANALYTICS_DATA_VERSION: int = 21 @@ -83,6 +86,9 @@ def __init__(self, config: MzDbConfig): self.cluster_spec_sheet_environmentd_results = ( ClusterSpecSheetEnvironmentdResultStorage(self.database_connector) ) + self.upgrade_downtime_results = UpgradeDowntimeResultStorage( + self.database_connector + ) def _create_database_connector(self, config: MzDbConfig) -> DatabaseConnector: if config.enabled: diff --git a/test/orchestratord/mzcompose.py b/test/orchestratord/mzcompose.py index c29e0f219d3d7..1377c4d1dffc5 100644 --- a/test/orchestratord/mzcompose.py +++ b/test/orchestratord/mzcompose.py @@ -31,7 +31,7 @@ import yaml from semver.version import Version -from materialize import MZ_ROOT, ci_util, git, spawn +from materialize import MZ_ROOT, buildkite, ci_util, git, spawn from materialize.mz_version import MzVersion from materialize.mzcompose.composition import ( Composition, @@ -41,9 +41,17 @@ from materialize.mzcompose.services.balancerd import Balancerd from materialize.mzcompose.services.clusterd import Clusterd from materialize.mzcompose.services.environmentd import Environmentd +from 
materialize.mzcompose.services.mz import Mz from materialize.mzcompose.services.mz_debug import MzDebug from materialize.mzcompose.services.orchestratord import Orchestratord from materialize.mzcompose.services.testdrive import Testdrive +from materialize.test_analytics.config.test_analytics_db_config import ( + create_test_analytics_config, +) +from materialize.test_analytics.data.upgrade_downtime import ( + upgrade_downtime_result_storage, +) +from materialize.test_analytics.test_analytics_db import TestAnalyticsDb from materialize.util import PropagatingThread, all_subclasses from materialize.version_list import ( get_all_self_managed_versions, @@ -57,6 +65,7 @@ Clusterd(), Balancerd(), MzDebug(), + Mz(app_password=""), ] @@ -1594,6 +1603,46 @@ def make_mod_source( raise ValueError(f"Unhandled properties: {properties}") +# Bump this version if the upgrade-downtime workflow is changed in a way that changes the results uploaded to test analytics +UPGRADE_DOWNTIME_SCENARIO_VERSION = "1.0.0" +# Used for uploading test analytics results +ORCHESTRATORD_TEST_VERSION = "1.0.0" + + +def upload_upgrade_downtime_to_test_analytics( + composition: Composition, + downtime_initial: float, + downtime_upgrade: float, + was_successful: bool, +) -> None: + if not buildkite.is_in_buildkite(): + return + + test_analytics = TestAnalyticsDb(create_test_analytics_config(composition)) + test_analytics.builds.add_build_job(was_successful=was_successful) + + result_entries = [ + upgrade_downtime_result_storage.UpgradeDowntimeResultEntry( + scenario="upgrade-downtime", + scenario_version=UPGRADE_DOWNTIME_SCENARIO_VERSION, + downtime_initial=downtime_initial, + downtime_upgrade=downtime_upgrade, + ) + ] + + test_analytics.upgrade_downtime_results.add_result( + framework_version=ORCHESTRATORD_TEST_VERSION, + results=result_entries, + ) + + try: + test_analytics.submit_updates() + print("Uploaded results.") + except Exception as e: + # An error during an upload must never cause the build to 
fail + test_analytics.on_upload_failed(e) + + def workflow_upgrade_downtime(c: Composition, parser: WorkflowArgumentParser) -> None: parser.add_argument( "--recreate-cluster", @@ -1614,6 +1663,7 @@ def workflow_upgrade_downtime(c: Composition, parser: WorkflowArgumentParser) -> args = parser.parse_args() running = True + downtimes: list[float] = [] def measure_downtime() -> None: port_forward_process = None @@ -1650,6 +1700,7 @@ def measure_downtime() -> None: preexec_fn=os.setpgrp, ) connect_port_forward = False + time.sleep(1) try: with psycopg.connect( "postgres://materialize@127.0.0.1:6875/materialize", @@ -1660,11 +1711,10 @@ def measure_downtime() -> None: except psycopg.OperationalError: connect_port_forward = True continue - runtime = time.time() - start_time - if runtime > 2: - print(f"Downtime: {runtime}s") - assert runtime < 15, f"SELECT 1 took more than 15s: {runtime}s" - time.sleep(10) + runtime = time.time() - start_time - 1 + print(f"Time: {runtime}s") + if runtime > 1: + downtimes.append(runtime) start_time = time.time() finally: if port_forward_process: @@ -1684,6 +1734,20 @@ def measure_downtime() -> None: running = False thread.join() + assert len(downtimes) == 2, f"Wrong number of downtimes: {downtimes}" + + test_failed = False + max_downtime = 15 + for downtime in downtimes: + if downtime > max_downtime: + print(f"SELECT 1 took more than {max_downtime}s: {downtime}s") + test_failed = True + + upload_upgrade_downtime_to_test_analytics( + c, downtimes[0], downtimes[1], not test_failed + ) + assert not test_failed + def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: parser.add_argument(