From 67db41ab89dc917f3343e0e15a25e82d180255e6 Mon Sep 17 00:00:00 2001 From: Dennis Felsing Date: Wed, 3 Dec 2025 22:40:52 +0000 Subject: [PATCH] orchestratord test: Upload upgrade downtimes to test analytics As requested by Jon, for later analysis --- ci/nightly/pipeline.template.yml | 3 +- .../upgrade_downtime_result_storage.py | 60 +++++++++++++++ .../setup/cleanup/remove-build.sql | 1 + .../setup/tables/15-cluster-spec-sheet.sql | 1 - .../16-cluster-spec-sheet-environmentd.sql | 1 - .../setup/tables/17-upgrade-downtime.sql | 21 +++++ .../setup/views/100-data-integrity.sql | 4 + .../test_analytics/test_analytics_db.py | 6 ++ test/orchestratord/mzcompose.py | 76 +++++++++++++++++-- 9 files changed, 164 insertions(+), 9 deletions(-) create mode 100644 misc/python/materialize/test_analytics/data/upgrade_downtime/upgrade_downtime_result_storage.py create mode 100644 misc/python/materialize/test_analytics/setup/tables/17-upgrade-downtime.sql diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 98267f4222c88..c3d807182bf35 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -2412,7 +2412,8 @@ steps: run: upgrade-downtime ci-builder: stable agents: - queue: hetzner-aarch64-16cpu-32gb + # More stable results for recording benchmarks + queue: hetzner-x86-64-dedi-16cpu-64gb - id: orchestratord-default-properties label: "Orchestratord + defaults for properties" diff --git a/misc/python/materialize/test_analytics/data/upgrade_downtime/upgrade_downtime_result_storage.py b/misc/python/materialize/test_analytics/data/upgrade_downtime/upgrade_downtime_result_storage.py new file mode 100644 index 0000000000000..473851ad4536c --- /dev/null +++ b/misc/python/materialize/test_analytics/data/upgrade_downtime/upgrade_downtime_result_storage.py @@ -0,0 +1,60 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. 
+# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. +from dataclasses import dataclass + +from materialize import buildkite +from materialize.buildkite import BuildkiteEnvVar +from materialize.test_analytics.data.base_data_storage import BaseDataStorage +from materialize.test_analytics.util.mz_sql_util import as_sanitized_literal + + +@dataclass +class UpgradeDowntimeResultEntry: + scenario: str + scenario_version: str + downtime_initial: float + downtime_upgrade: float + + +class UpgradeDowntimeResultStorage(BaseDataStorage): + + def add_result( + self, + framework_version: str, + results: list[UpgradeDowntimeResultEntry], + ) -> None: + job_id = buildkite.get_var(BuildkiteEnvVar.BUILDKITE_JOB_ID) + + sql_statements = [] + + for result_entry in results: + # TODO: remove NULL castings when database-issues#8100 is resolved + sql_statements.append( + f""" + INSERT INTO upgrade_downtime_result + ( + build_job_id, + framework_version, + scenario, + scenario_version, + downtime_initial, + downtime_upgrade + ) + SELECT + {as_sanitized_literal(job_id)}, + {as_sanitized_literal(framework_version)}, + {as_sanitized_literal(result_entry.scenario)}, + {as_sanitized_literal(result_entry.scenario_version)}, + {result_entry.downtime_initial}, + {result_entry.downtime_upgrade} + ; + """ + ) + + self.database_connector.add_update_statements(sql_statements) diff --git a/misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql b/misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql index 5e65d8c5c4266..265bf8536fc34 100644 --- a/misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql +++ b/misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql @@ -15,6 +15,7 @@ DELETE 
FROM parallel_benchmark_result WHERE build_job_id IN (SELECT build_id FRO DELETE FROM product_limits_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%)); DELETE FROM cluster_spec_sheet_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%)); DELETE FROM cluster_spec_sheet_environmentd_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%)); +DELETE FROM upgrade_downtime_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%)); DELETE FROM build_annotation_error WHERE build_job_id IN (SELECT build_job_id FROM build_annotation WHERE build_id IN (%build-ids%)); DELETE FROM build_annotation WHERE build_id IN (%build-ids%); DELETE FROM build_job WHERE build_id IN (%build-ids%); diff --git a/misc/python/materialize/test_analytics/setup/tables/15-cluster-spec-sheet.sql b/misc/python/materialize/test_analytics/setup/tables/15-cluster-spec-sheet.sql index b1854bc56e472..5b3044165203f 100644 --- a/misc/python/materialize/test_analytics/setup/tables/15-cluster-spec-sheet.sql +++ b/misc/python/materialize/test_analytics/setup/tables/15-cluster-spec-sheet.sql @@ -8,7 +8,6 @@ -- by the Apache License, Version 2.0. --- result of individual product limits scenarios CREATE TABLE cluster_spec_sheet_result ( build_job_id TEXT NOT NULL, framework_version TEXT NOT NULL, diff --git a/misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql b/misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql index 9b4399fb64263..d9ee25c3e1b3a 100644 --- a/misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql +++ b/misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql @@ -8,7 +8,6 @@ -- by the Apache License, Version 2.0. 
--- result of individual product limits scenarios CREATE TABLE cluster_spec_sheet_environmentd_result ( build_job_id TEXT NOT NULL, framework_version TEXT NOT NULL, diff --git a/misc/python/materialize/test_analytics/setup/tables/17-upgrade-downtime.sql b/misc/python/materialize/test_analytics/setup/tables/17-upgrade-downtime.sql new file mode 100644 index 0000000000000..c6b8bbde73fc0 --- /dev/null +++ b/misc/python/materialize/test_analytics/setup/tables/17-upgrade-downtime.sql @@ -0,0 +1,21 @@ +-- Copyright Materialize, Inc. and contributors. All rights reserved. +-- +-- Use of this software is governed by the Business Source License +-- included in the LICENSE file at the root of this repository. +-- +-- As of the Change Date specified in that file, in accordance with +-- the Business Source License, use of this software will be governed +-- by the Apache License, Version 2.0. + + +CREATE TABLE upgrade_downtime_result ( + build_job_id TEXT NOT NULL, + framework_version TEXT NOT NULL, + scenario TEXT NOT NULL, + scenario_version TEXT NOT NULL, + downtime_initial TEXT NOT NULL, + downtime_upgrade TEXT NOT NULL +); + +ALTER TABLE upgrade_downtime_result OWNER TO qa; +GRANT SELECT, INSERT, UPDATE ON TABLE upgrade_downtime_result TO "hetzner-ci"; diff --git a/misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql b/misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql index 56b93309c5c94..f00ae42a04437 --- a/misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql +++ b/misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql @@ -37,6 +37,10 @@ CREATE OR REPLACE VIEW v_data_integrity (table_name, own_item_key, referenced_it FROM cluster_spec_sheet_environmentd_result WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job) UNION + SELECT 'upgrade_downtime_result', build_job_id, build_job_id, 'upgrade downtime result references missing build job' + FROM 
upgrade_downtime_result + WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job) + UNION SELECT 'build_annotation', build_job_id, build_job_id, 'build annotation references missing build job' FROM build_annotation WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job) diff --git a/misc/python/materialize/test_analytics/test_analytics_db.py b/misc/python/materialize/test_analytics/test_analytics_db.py index ccbdc076a4d97..aec3907813708 100644 --- a/misc/python/materialize/test_analytics/test_analytics_db.py +++ b/misc/python/materialize/test_analytics/test_analytics_db.py @@ -47,6 +47,9 @@ from materialize.test_analytics.data.scalability_framework.scalability_framework_result_storage import ( ScalabilityFrameworkResultStorage, ) +from materialize.test_analytics.data.upgrade_downtime.upgrade_downtime_result_storage import ( + UpgradeDowntimeResultStorage, +) TEST_ANALYTICS_DATA_VERSION: int = 21 @@ -83,6 +86,9 @@ def __init__(self, config: MzDbConfig): self.cluster_spec_sheet_environmentd_results = ( ClusterSpecSheetEnvironmentdResultStorage(self.database_connector) ) + self.upgrade_downtime_results = UpgradeDowntimeResultStorage( + self.database_connector + ) def _create_database_connector(self, config: MzDbConfig) -> DatabaseConnector: if config.enabled: diff --git a/test/orchestratord/mzcompose.py b/test/orchestratord/mzcompose.py index c29e0f219d3d7..1377c4d1dffc5 100644 --- a/test/orchestratord/mzcompose.py +++ b/test/orchestratord/mzcompose.py @@ -31,7 +31,7 @@ import yaml from semver.version import Version -from materialize import MZ_ROOT, ci_util, git, spawn +from materialize import MZ_ROOT, buildkite, ci_util, git, spawn from materialize.mz_version import MzVersion from materialize.mzcompose.composition import ( Composition, @@ -41,9 +41,17 @@ from materialize.mzcompose.services.balancerd import Balancerd from materialize.mzcompose.services.clusterd import Clusterd from materialize.mzcompose.services.environmentd import Environmentd +from 
materialize.mzcompose.services.mz import Mz from materialize.mzcompose.services.mz_debug import MzDebug from materialize.mzcompose.services.orchestratord import Orchestratord from materialize.mzcompose.services.testdrive import Testdrive +from materialize.test_analytics.config.test_analytics_db_config import ( + create_test_analytics_config, +) +from materialize.test_analytics.data.upgrade_downtime import ( + upgrade_downtime_result_storage, +) +from materialize.test_analytics.test_analytics_db import TestAnalyticsDb from materialize.util import PropagatingThread, all_subclasses from materialize.version_list import ( get_all_self_managed_versions, @@ -57,6 +65,7 @@ Clusterd(), Balancerd(), MzDebug(), + Mz(app_password=""), ] @@ -1594,6 +1603,46 @@ def make_mod_source( raise ValueError(f"Unhandled properties: {properties}") +# Bump this version if the upgrade-downtime workflow is changed in a way that changes the results uploaded to test analytics +UPGRADE_DOWNTIME_SCENARIO_VERSION = "1.0.0" +# Used for uploading test analytics results +ORCHESTRATORD_TEST_VERSION = "1.0.0" + + +def upload_upgrade_downtime_to_test_analytics( + composition: Composition, + downtime_initial: float, + downtime_upgrade: float, + was_successful: bool, +) -> None: + if not buildkite.is_in_buildkite(): + return + + test_analytics = TestAnalyticsDb(create_test_analytics_config(composition)) + test_analytics.builds.add_build_job(was_successful=was_successful) + + result_entries = [ + upgrade_downtime_result_storage.UpgradeDowntimeResultEntry( + scenario="upgrade-downtime", + scenario_version=UPGRADE_DOWNTIME_SCENARIO_VERSION, + downtime_initial=downtime_initial, + downtime_upgrade=downtime_upgrade, + ) + ] + + test_analytics.upgrade_downtime_results.add_result( + framework_version=ORCHESTRATORD_TEST_VERSION, + results=result_entries, + ) + + try: + test_analytics.submit_updates() + print("Uploaded results.") + except Exception as e: + # An error during an upload must never cause the build to 
fail + test_analytics.on_upload_failed(e) + + def workflow_upgrade_downtime(c: Composition, parser: WorkflowArgumentParser) -> None: parser.add_argument( "--recreate-cluster", @@ -1614,6 +1663,7 @@ def workflow_upgrade_downtime(c: Composition, parser: WorkflowArgumentParser) -> args = parser.parse_args() running = True + downtimes: list[float] = [] def measure_downtime() -> None: port_forward_process = None @@ -1650,6 +1700,7 @@ def measure_downtime() -> None: preexec_fn=os.setpgrp, ) connect_port_forward = False + time.sleep(1) try: with psycopg.connect( "postgres://materialize@127.0.0.1:6875/materialize", @@ -1660,11 +1711,10 @@ def measure_downtime() -> None: except psycopg.OperationalError: connect_port_forward = True continue - runtime = time.time() - start_time - if runtime > 2: - print(f"Downtime: {runtime}s") - assert runtime < 15, f"SELECT 1 took more than 15s: {runtime}s" - time.sleep(10) + runtime = time.time() - start_time - 1 + print(f"Time: {runtime}s") + if runtime > 1: + downtimes.append(runtime) start_time = time.time() finally: if port_forward_process: @@ -1684,6 +1734,20 @@ def measure_downtime() -> None: running = False thread.join() + assert len(downtimes) == 2, f"Wrong number of downtimes: {downtimes}" + + test_failed = False + max_downtime = 15 + for downtime in downtimes: + if downtime > max_downtime: + print(f"SELECT 1 took more than {max_downtime}s: {downtime}s") + test_failed = True + + upload_upgrade_downtime_to_test_analytics( + c, downtimes[0], downtimes[1], not test_failed + ) + assert not test_failed + def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: parser.add_argument(