Skip to content

Commit 3d2199a

Browse files
committed
orchestratord test: Upload upgrade downtimes to test analytics
As requested by Jon, for later analysis
1 parent f1e5a40 commit 3d2199a

File tree

9 files changed

+157
-9
lines changed

9 files changed

+157
-9
lines changed

ci/nightly/pipeline.template.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2412,7 +2412,8 @@ steps:
24122412
run: upgrade-downtime
24132413
ci-builder: stable
24142414
agents:
2415-
queue: hetzner-aarch64-16cpu-32gb
2415+
# More stable results for recording benchmarks
2416+
queue: hetzner-x86-64-dedi-16cpu-64gb
24162417

24172418
- id: orchestratord-default-properties
24182419
label: "Orchestratord + defaults for properties"
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Copyright Materialize, Inc. and contributors. All rights reserved.
2+
#
3+
# Use of this software is governed by the Business Source License
4+
# included in the LICENSE file at the root of this repository.
5+
#
6+
# As of the Change Date specified in that file, in accordance with
7+
# the Business Source License, use of this software will be governed
8+
# by the Apache License, Version 2.0.
9+
from dataclasses import dataclass
10+
11+
from materialize import buildkite
12+
from materialize.buildkite import BuildkiteEnvVar
13+
from materialize.test_analytics.data.base_data_storage import BaseDataStorage
14+
from materialize.test_analytics.util.mz_sql_util import as_sanitized_literal
15+
16+
17+
@dataclass
18+
class UpgradeDowntimeResultEntry:
19+
scenario: str
20+
scenario_version: str
21+
downtime_initial: float
22+
downtime_upgrade: float
23+
24+
25+
class UpgradeDowntimeResultStorage(BaseDataStorage):
26+
27+
def add_result(
28+
self,
29+
framework_version: str,
30+
results: list[UpgradeDowntimeResultEntry],
31+
) -> None:
32+
job_id = buildkite.get_var(BuildkiteEnvVar.BUILDKITE_JOB_ID)
33+
34+
sql_statements = []
35+
36+
for result_entry in results:
37+
# TODO: remove NULL castings when database-issues#8100 is resolved
38+
sql_statements.append(
39+
f"""
40+
INSERT INTO upgrade_downtime_result
41+
(
42+
build_job_id,
43+
framework_version,
44+
scenario,
45+
scenario_version,
46+
downtime_initial,
47+
downtime_upgrade
48+
)
49+
SELECT
50+
{as_sanitized_literal(job_id)},
51+
{as_sanitized_literal(framework_version)},
52+
{as_sanitized_literal(result_entry.scenario)},
53+
{as_sanitized_literal(result_entry.scenario_version)},
54+
{result_entry.downtime_initial},
55+
{result_entry.downtime_upgrade}
56+
;
57+
"""
58+
)
59+
60+
self.database_connector.add_update_statements(sql_statements)

misc/python/materialize/test_analytics/setup/cleanup/remove-build.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ DELETE FROM parallel_benchmark_result WHERE build_job_id IN (SELECT build_id FRO
1515
DELETE FROM product_limits_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
1616
DELETE FROM cluster_spec_sheet_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
1717
DELETE FROM cluster_spec_sheet_environmentd_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
18+
DELETE FROM upgrade_downtime_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
1819
DELETE FROM build_annotation_error WHERE build_job_id IN (SELECT build_job_id FROM build_annotation WHERE build_id IN (%build-ids%));
1920
DELETE FROM build_annotation WHERE build_id IN (%build-ids%);
2021
DELETE FROM build_job WHERE build_id IN (%build-ids%);

misc/python/materialize/test_analytics/setup/tables/15-cluster-spec-sheet.sql

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
-- by the Apache License, Version 2.0.
99

1010

11-
-- result of individual product limits scenarios
1211
CREATE TABLE cluster_spec_sheet_result (
1312
build_job_id TEXT NOT NULL,
1413
framework_version TEXT NOT NULL,

misc/python/materialize/test_analytics/setup/tables/16-cluster-spec-sheet-environmentd.sql

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
-- by the Apache License, Version 2.0.
99

1010

11-
-- result of individual product limits scenarios
1211
CREATE TABLE cluster_spec_sheet_environmentd_result (
1312
build_job_id TEXT NOT NULL,
1413
framework_version TEXT NOT NULL,
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
-- Copyright Materialize, Inc. and contributors. All rights reserved.
2+
--
3+
-- Use of this software is governed by the Business Source License
4+
-- included in the LICENSE file at the root of this repository.
5+
--
6+
-- As of the Change Date specified in that file, in accordance with
7+
-- the Business Source License, use of this software will be governed
8+
-- by the Apache License, Version 2.0.
9+
10+
11+
CREATE TABLE upgrade_downtime_result (
12+
build_job_id TEXT NOT NULL,
13+
framework_version TEXT NOT NULL,
14+
scenario TEXT NOT NULL,
15+
scenario_version TEXT NOT NULL,
16+
downtime_initial TEXT NOT NULL,
17+
downtime_upgrade TEXT NOT NULL
18+
);
19+
20+
ALTER TABLE upgrade_downtime_result OWNER TO qa;
21+
GRANT SELECT, INSERT, UPDATE ON TABLE upgrade_downtime_result TO "hetzner-ci";

misc/python/materialize/test_analytics/setup/views/100-data-integrity.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ CREATE OR REPLACE VIEW v_data_integrity (table_name, own_item_key, referenced_it
3737
FROM cluster_spec_sheet_environmentd_result
3838
WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)
3939
UNION
40+
SELECT 'upgrade_downtime_environmentd_result', build_job_id, build_job_id, 'upgrade downtime result references missing build job'
41+
FROM upgrade_downtime_result
42+
WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)
43+
UNION
4044
SELECT 'build_annotation', build_job_id, build_job_id, 'build annotation references missing build job'
4145
FROM build_annotation
4246
WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)

misc/python/materialize/test_analytics/test_analytics_db.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@
4747
from materialize.test_analytics.data.scalability_framework.scalability_framework_result_storage import (
4848
ScalabilityFrameworkResultStorage,
4949
)
50+
from materialize.test_analytics.data.upgrade_downtime.upgrade_downtime_result_storage import (
51+
UpgradeDowntimeResultStorage,
52+
)
5053

5154
TEST_ANALYTICS_DATA_VERSION: int = 21
5255

@@ -83,6 +86,9 @@ def __init__(self, config: MzDbConfig):
8386
self.cluster_spec_sheet_environmentd_results = (
8487
ClusterSpecSheetEnvironmentdResultStorage(self.database_connector)
8588
)
89+
self.upgrade_downtime_results = UpgradeDowntimeResultStorage(
90+
self.database_connector
91+
)
8692

8793
def _create_database_connector(self, config: MzDbConfig) -> DatabaseConnector:
8894
if config.enabled:

test/orchestratord/mzcompose.py

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
import yaml
3232
from semver.version import Version
3333

34-
from materialize import MZ_ROOT, ci_util, git, spawn
34+
from materialize import MZ_ROOT, buildkite, ci_util, git, spawn
3535
from materialize.mz_version import MzVersion
3636
from materialize.mzcompose.composition import (
3737
Composition,
@@ -41,9 +41,17 @@
4141
from materialize.mzcompose.services.balancerd import Balancerd
4242
from materialize.mzcompose.services.clusterd import Clusterd
4343
from materialize.mzcompose.services.environmentd import Environmentd
44+
from materialize.mzcompose.services.mz import Mz
4445
from materialize.mzcompose.services.mz_debug import MzDebug
4546
from materialize.mzcompose.services.orchestratord import Orchestratord
4647
from materialize.mzcompose.services.testdrive import Testdrive
48+
from materialize.test_analytics.config.test_analytics_db_config import (
49+
create_test_analytics_config,
50+
)
51+
from materialize.test_analytics.data.upgrade_downtime import (
52+
upgrade_downtime_result_storage,
53+
)
54+
from materialize.test_analytics.test_analytics_db import TestAnalyticsDb
4755
from materialize.util import PropagatingThread, all_subclasses
4856
from materialize.version_list import (
4957
get_all_self_managed_versions,
@@ -57,6 +65,7 @@
5765
Clusterd(),
5866
Balancerd(),
5967
MzDebug(),
68+
Mz(app_password=""),
6069
]
6170

6271

@@ -1594,6 +1603,39 @@ def make_mod_source(
15941603
raise ValueError(f"Unhandled properties: {properties}")
15951604

15961605

1606+
# Bump this version if the upgrade-downtime workflow is changed in a way that changes the results uploaded to test analytics
1607+
UPGRADE_DOWNTIME_SCENARIO_VERSION = "1.0.0"
1608+
# Used for uploading test analytics results
1609+
ORCHESTRATORD_TEST_VERSION = "1.0.0"
1610+
1611+
1612+
def upload_upgrade_downtime_to_test_analytics(
1613+
composition: Composition,
1614+
downtime_initial: float,
1615+
downtime_upgrade: float,
1616+
was_successful: bool,
1617+
) -> None:
1618+
if not buildkite.is_in_buildkite():
1619+
return
1620+
1621+
test_analytics = TestAnalyticsDb(create_test_analytics_config(composition))
1622+
test_analytics.builds.add_build_job(was_successful=was_successful)
1623+
1624+
result_entries = [
1625+
upgrade_downtime_result_storage.UpgradeDowntimeResultEntry(
1626+
scenario="upgrade-downtime",
1627+
scenario_version=UPGRADE_DOWNTIME_SCENARIO_VERSION,
1628+
downtime_initial=downtime_initial,
1629+
downtime_upgrade=downtime_upgrade,
1630+
)
1631+
]
1632+
1633+
test_analytics.upgrade_downtime_results.add_result(
1634+
framework_version=ORCHESTRATORD_TEST_VERSION,
1635+
results=result_entries,
1636+
)
1637+
1638+
15971639
def workflow_upgrade_downtime(c: Composition, parser: WorkflowArgumentParser) -> None:
15981640
parser.add_argument(
15991641
"--recreate-cluster",
@@ -1614,6 +1656,7 @@ def workflow_upgrade_downtime(c: Composition, parser: WorkflowArgumentParser) ->
16141656
args = parser.parse_args()
16151657

16161658
running = True
1659+
downtimes: list[float] = []
16171660

16181661
def measure_downtime() -> None:
16191662
port_forward_process = None
@@ -1650,6 +1693,7 @@ def measure_downtime() -> None:
16501693
preexec_fn=os.setpgrp,
16511694
)
16521695
connect_port_forward = False
1696+
time.sleep(1)
16531697
try:
16541698
with psycopg.connect(
16551699
"postgres://[email protected]:6875/materialize",
@@ -1660,11 +1704,10 @@ def measure_downtime() -> None:
16601704
except psycopg.OperationalError:
16611705
connect_port_forward = True
16621706
continue
1663-
runtime = time.time() - start_time
1664-
if runtime > 2:
1665-
print(f"Downtime: {runtime}s")
1666-
assert runtime < 15, f"SELECT 1 took more than 15s: {runtime}s"
1667-
time.sleep(10)
1707+
runtime = time.time() - start_time - 1
1708+
print(f"Time: {runtime}s")
1709+
if runtime > 1:
1710+
downtimes.append(runtime)
16681711
start_time = time.time()
16691712
finally:
16701713
if port_forward_process:
@@ -1684,6 +1727,20 @@ def measure_downtime() -> None:
16841727
running = False
16851728
thread.join()
16861729

1730+
assert len(downtimes) == 2, f"Wrong number of downtimes: {downtimes}"
1731+
1732+
test_failed = False
1733+
max_downtime = 15
1734+
for downtime in downtimes:
1735+
if downtime > max_downtime:
1736+
print(f"SELECT 1 took more than {max_downtime}s: {downtime}s")
1737+
test_failed = True
1738+
1739+
upload_upgrade_downtime_to_test_analytics(
1740+
c, downtimes[0], downtimes[1], test_failed
1741+
)
1742+
assert not test_failed
1743+
16871744

16881745
def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
16891746
parser.add_argument(

0 commit comments

Comments
 (0)