From 48761d76541ca70f2b72bad81738353a26473858 Mon Sep 17 00:00:00 2001 From: Sang Jun Bak Date: Tue, 14 Oct 2025 13:18:20 -0400 Subject: [PATCH 1/6] Turn off persist incompatibility check --- src/catalog/src/durable/persist.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/catalog/src/durable/persist.rs b/src/catalog/src/durable/persist.rs index 9d7d8e5fba50b..cee33b1ccda36 100644 --- a/src/catalog/src/durable/persist.rs +++ b/src/catalog/src/durable/persist.rs @@ -1008,10 +1008,7 @@ impl UnopenedPersistCatalogState { if mz_persist_client::cfg::check_data_version(&version_in_upgrade_shard, &version) .is_err() { - return Err(DurableCatalogError::IncompatiblePersistVersion { - found_version: version_in_upgrade_shard, - catalog_version: version, - }); + tracing::info!("optimistically ignoring persist version error"); } } From 45c59e9866b209c90814e417f37aad04e610ddd6 Mon Sep 17 00:00:00 2001 From: Sang Jun Bak Date: Tue, 14 Oct 2025 16:45:08 -0400 Subject: [PATCH 2/6] Dynamically add multiversion upgrades - Created a factory function to create Scenarios - For the platform-checks CLI, converted scenario argument to regex search to capture multiple versions --- .../checks/scenarios_zero_downtime.py | 70 +++++++++++++++++++ test/platform-checks/mzcompose.py | 29 +++++++- 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/misc/python/materialize/checks/scenarios_zero_downtime.py b/misc/python/materialize/checks/scenarios_zero_downtime.py index a62f81b234e4b..fbdddb7aa7a8a 100644 --- a/misc/python/materialize/checks/scenarios_zero_downtime.py +++ b/misc/python/materialize/checks/scenarios_zero_downtime.py @@ -16,6 +16,7 @@ Manipulate, Validate, ) +from materialize.checks.all_checks.drop_index import DropIndex from materialize.checks.checks import Check from materialize.checks.executors import Executor from materialize.checks.features import Features @@ -34,6 +35,7 @@ ) from materialize.mz_version import MzVersion from
materialize.mzcompose import get_default_system_parameters +from materialize.version_list import VersionsFromDocs def wait_ready_and_promote(mz_service: str) -> list[MzcomposeAction]: @@ -315,3 +317,71 @@ def actions(self) -> list[Action]: *wait_ready_and_promote("mz_5"), Validate(self, mz_service="mz_5"), ] + + +# class ZeroDowntimeBasic(Scenario): +# """0dt upgrade of the entire Mz instance from the last released version.""" + +# def __init__( +# self, +# checks: list[type[Check]], +# executor: Executor, +# features: Features, +# seed: str | None = None, +# ): +# super().__init__([DropIndex], executor, features, seed) + +# def base_version(self) -> MzVersion: +# # TODO (SangJunBak): Create a factory function and replace this with the input version from the factory +# # 146, 155 do not work. +# return MzVersion.parse_mz("v0.130.0") + + +def create_zero_downtime_basic( + name: str, + base_version: MzVersion, +) -> type[Scenario]: + + def actions(self) -> list[Action]: + return [ + StartMz( + self, + tag=self.base_version(), + mz_service="mz_1", + ), + Initialize(self, mz_service="mz_1"), + Manipulate(self, phase=1, mz_service="mz_1"), + Manipulate(self, phase=2, mz_service="mz_1"), + start_mz_read_only( + self, + tag=None, + deploy_generation=1, + mz_service="mz_2", + ), + *wait_ready_and_promote("mz_2"), + Validate(self, mz_service="mz_2"), + ] + + return type( + name, + (Scenario,), + {"base_version": lambda self: base_version, "actions": actions}, + ) + + +versions_from_docs = sorted( + [ + version + for version in VersionsFromDocs(respect_released_tag=True).all_versions() + if version >= MzVersion.parse_mz("v0.140.0") + ] +) + + +zero_downtime_basic_scenarios = [ + create_zero_downtime_basic( + name=f"ZeroDowntimeBasic_{version}", + base_version=version, + ) + for version in versions_from_docs +] diff --git a/test/platform-checks/mzcompose.py b/test/platform-checks/mzcompose.py index 3b09e0c88b4ec..78d5d6907c458 100644 --- a/test/platform-checks/mzcompose.py 
+++ b/test/platform-checks/mzcompose.py @@ -15,6 +15,7 @@ import argparse import os +import re from enum import Enum from materialize import buildkite @@ -261,8 +262,32 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: features = Features(args.features) if args.scenario: - assert args.scenario in globals(), f"scenario {args.scenario} does not exist" - scenarios = [globals()[args.scenario]] + # Get all available scenarios + base_scenarios = {SystemVarChange} + all_scenarios = all_subclasses(Scenario) - base_scenarios + + # Create a mapping of scenario names to scenario classes + scenario_map = {scenario.__name__: scenario for scenario in all_scenarios} + + # Compile the regex pattern + try: + pattern = re.compile(args.scenario) + except re.error as e: + raise ValueError(f"Invalid regex pattern '{args.scenario}': {e}") + + # Filter scenarios by regex match + scenarios = [ + scenario for name, scenario in scenario_map.items() if pattern.search(name) + ] + + if not scenarios: + available = sorted(scenario_map.keys()) + raise ValueError( + f"No scenarios matched pattern '{args.scenario}'. 
" + f"Available scenarios: {', '.join(available)}" + ) + + print(f"Matched scenarios: {[s.__name__ for s in scenarios]}") else: base_scenarios = {SystemVarChange} scenarios = all_subclasses(Scenario) - base_scenarios From 98eabe8aee4023931c323b9942495f6404c5d692 Mon Sep 17 00:00:00 2001 From: Sang Jun Bak Date: Tue, 14 Oct 2025 18:45:24 -0400 Subject: [PATCH 3/6] Add multi version upgrade test to CI - Add pipeline to CI - Enhanced error handling by wrapping scenario execution in try-except blocks to capture and print exceptions during runtime - Add debug log filter - Add teardown for each scenario run - Allow failures to not end the pipeline --- ci/nightly/pipeline.template.yml | 12 +++++ .../checks/scenarios_zero_downtime.py | 28 +++-------- misc/python/materialize/mzcompose/__init__.py | 1 + test/platform-checks/mzcompose.py | 47 ++++++++++++------- 4 files changed, 50 insertions(+), 38 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 3d85b8561971a..62b1d41327127 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -1094,6 +1094,18 @@ steps: composition: platform-checks args: [--scenario=ZeroDowntimeUpgradeEntireMzFourVersions, "--seed=$BUILDKITE_JOB_ID"] + - id: checks-0dt-upgrade-for-previous-versions-to-current + label: "Checks 0dt upgrade for previous versions to current" + depends_on: build-x86_64 + timeout_in_minutes: 120 + parallelism: 3 + agents: + queue: hetzner-x86-64-16cpu-32gb + plugins: + - ./ci/plugins/mzcompose: + composition: platform-checks + args: [--scenario=MultiVersionZeroDowntimeBasic, "--seed=$BUILDKITE_JOB_ID", "--teardown=True"] + - id: checks-0dt-bump-version label: "Checks 0dt upgrade to a bumped version" depends_on: build-x86_64 diff --git a/misc/python/materialize/checks/scenarios_zero_downtime.py b/misc/python/materialize/checks/scenarios_zero_downtime.py index fbdddb7aa7a8a..2f00a98900049 100644 --- 
a/misc/python/materialize/checks/scenarios_zero_downtime.py +++ b/misc/python/materialize/checks/scenarios_zero_downtime.py @@ -16,7 +16,6 @@ Manipulate, Validate, ) -from materialize.checks.all_checks.drop_index import DropIndex from materialize.checks.checks import Check from materialize.checks.executors import Executor from materialize.checks.features import Features @@ -319,24 +318,6 @@ def actions(self) -> list[Action]: ] -# class ZeroDowntimeBasic(Scenario): -# """0dt upgrade of the entire Mz instance from the last released version.""" - -# def __init__( -# self, -# checks: list[type[Check]], -# executor: Executor, -# features: Features, -# seed: str | None = None, -# ): -# super().__init__([DropIndex], executor, features, seed) - -# def base_version(self) -> MzVersion: -# # TODO (SangJunBak): Create a factory function and replace this with the input version from the factory -# # 146, 155 do not work. -# return MzVersion.parse_mz("v0.130.0") - - def create_zero_downtime_basic( name: str, base_version: MzVersion, @@ -365,7 +346,10 @@ def actions(self) -> list[Action]: return type( name, (Scenario,), - {"base_version": lambda self: base_version, "actions": actions}, + { + "base_version": lambda self: base_version, + "actions": actions, + }, ) @@ -373,14 +357,14 @@ def actions(self) -> list[Action]: [ version for version in VersionsFromDocs(respect_released_tag=True).all_versions() - if version >= MzVersion.parse_mz("v0.140.0") + if version >= MzVersion.parse_mz("v0.107.0") ] ) zero_downtime_basic_scenarios = [ create_zero_downtime_basic( - name=f"ZeroDowntimeBasic_{version}", + name=f"MultiVersionZeroDowntimeBasic_{version}", base_version=version, ) for version in versions_from_docs diff --git a/misc/python/materialize/mzcompose/__init__.py b/misc/python/materialize/mzcompose/__init__.py index cb4bb396907d2..810c2d680f5d3 100644 --- a/misc/python/materialize/mzcompose/__init__.py +++ b/misc/python/materialize/mzcompose/__init__.py @@ -117,6 +117,7 @@ def 
get_minimal_system_parameters( "ore_overflowing_behavior": "panic", "unsafe_enable_table_keys": "true", "with_0dt_deployment_max_wait": "1800s", + "log_filter": "debug", # End of list (ordered by name) } diff --git a/test/platform-checks/mzcompose.py b/test/platform-checks/mzcompose.py index 78d5d6907c458..c2e003a4cc763 100644 --- a/test/platform-checks/mzcompose.py +++ b/test/platform-checks/mzcompose.py @@ -258,6 +258,13 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: "--external-blob-store", action=argparse.BooleanOptionalAction, default=True ) + parser.add_argument( + "--teardown", + default="False", + choices=["True", "False"], + help="Teardown the environment per scenario ran", + ) + args = parser.parse_args() features = Features(args.features) @@ -340,14 +347,17 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: execution_mode = args.execution_mode if execution_mode in [ExecutionMode.SEQUENTIAL, ExecutionMode.PARALLEL]: - setup(c, args.external_blob_store) - scenario = scenario_class( - checks=checks, - executor=executor, - features=features, - seed=args.seed, - ) - scenario.run() + try: + setup(c, args.external_blob_store) + scenario = scenario_class( + checks=checks, + executor=executor, + features=features, + seed=args.seed, + ) + scenario.run() + except Exception as e: + print(e) elif execution_mode is ExecutionMode.ONEATATIME: for check in checks: print( @@ -356,13 +366,18 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: c.override_current_testcase_name( f"Check '{check}' with scenario '{scenario_class}'" ) - setup(c, args.external_blob_store) - scenario = scenario_class( - checks=[check], - executor=executor, - features=features, - seed=args.seed, - ) - scenario.run() + try: + setup(c, args.external_blob_store) + scenario = scenario_class( + checks=[check], + executor=executor, + features=features, + seed=args.seed, + ) + scenario.run() + except Exception as 
e: + print(e) else: raise RuntimeError(f"Unsupported execution mode: {execution_mode}") + if args.teardown == "True": + teardown(c) From cda1e4f6b0dbf239b9b35c74709308e320b08a5c Mon Sep 17 00:00:00 2001 From: Sang Jun Bak Date: Tue, 14 Oct 2025 21:20:06 -0400 Subject: [PATCH 4/6] Run pipeline for CreateCluster check - Targets only minor versions rather than all patch releases - Rather than all checks, runs against CreateCluster check - Adds additional logging for pipeline - Hard code timeout of 60 seconds --- ci/nightly/pipeline.template.yml | 2 +- .../materialize/checks/mzcompose_actions.py | 6 ++++- .../checks/scenarios_zero_downtime.py | 12 ++++------ test/platform-checks/mzcompose.py | 24 +++++++++++++++++-- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 62b1d41327127..1d720538327a5 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -1104,7 +1104,7 @@ steps: plugins: - ./ci/plugins/mzcompose: composition: platform-checks - args: [--scenario=MultiVersionZeroDowntimeBasic, "--seed=$BUILDKITE_JOB_ID", "--teardown=True"] + args: [--scenario=MultiVersionZeroDowntimeBasic, "--seed=$BUILDKITE_JOB_ID", "--teardown=True", "--check=CreateCluster"] - id: checks-0dt-bump-version label: "Checks 0dt upgrade to a bumped version" diff --git a/misc/python/materialize/checks/mzcompose_actions.py b/misc/python/materialize/checks/mzcompose_actions.py index 77babafb18c28..6c183e806a7ad 100644 --- a/misc/python/materialize/checks/mzcompose_actions.py +++ b/misc/python/materialize/checks/mzcompose_actions.py @@ -94,7 +94,11 @@ def execute(self, e: Executor) -> None: # Don't fail since we are careful to explicitly kill and collect logs # of the services thus started with c.override(mz, fail_on_new_service=False): - c.up("materialized" if self.mz_service is None else self.mz_service) + c.up( + "materialized" if self.mz_service is None else self.mz_service, +
wait=False, + max_tries=2, + ) # If we start up Materialize with a deploy-generation , then it # stays in a stuck state when the preflight-check is completed. So diff --git a/misc/python/materialize/checks/scenarios_zero_downtime.py b/misc/python/materialize/checks/scenarios_zero_downtime.py index 2f00a98900049..3f1c1c112e01b 100644 --- a/misc/python/materialize/checks/scenarios_zero_downtime.py +++ b/misc/python/materialize/checks/scenarios_zero_downtime.py @@ -353,13 +353,11 @@ def actions(self) -> list[Action]: ) -versions_from_docs = sorted( - [ - version - for version in VersionsFromDocs(respect_released_tag=True).all_versions() - if version >= MzVersion.parse_mz("v0.107.0") - ] -) +versions_from_docs = [ + version + for version in VersionsFromDocs(respect_released_tag=True).minor_versions() + if version >= MzVersion.parse_mz("v0.107.0") +] zero_downtime_basic_scenarios = [ diff --git a/test/platform-checks/mzcompose.py b/test/platform-checks/mzcompose.py index c2e003a4cc763..d31b6b640ebea 100644 --- a/test/platform-checks/mzcompose.py +++ b/test/platform-checks/mzcompose.py @@ -293,6 +293,7 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: f"No scenarios matched pattern '{args.scenario}'. 
" f"Available scenarios: {', '.join(available)}" ) + scenarios.sort(key=lambda s: s.__name__) print(f"Matched scenarios: {[s.__name__ for s in scenarios]}") else: @@ -357,7 +358,16 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: ) scenario.run() except Exception as e: - print(e) + c.invoke( + "logs", + "--no-color", + "--timestamps", + "--tail", + "20", + "mz_1", + "mz_2", + ) + print("Error in scenario", e) elif execution_mode is ExecutionMode.ONEATATIME: for check in checks: print( @@ -376,8 +386,18 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: ) scenario.run() except Exception as e: - print(e) + c.invoke( + "logs", + "--no-color", + "--timestamps", + "--tail", + "20", + "mz_1", + "mz_2", + ) + print("Error in scenario", e) else: raise RuntimeError(f"Unsupported execution mode: {execution_mode}") if args.teardown == "True": + teardown(c) From 17c482d2c1139c28c60e9ebbeaf88138e44a2e87 Mon Sep 17 00:00:00 2001 From: Sang Jun Bak Date: Thu, 16 Oct 2025 09:31:46 -0400 Subject: [PATCH 5/6] Update CI pipeline and mzcompose actions for zero downtime checks - Increased timeout for 0dt upgrade checks from 120 to 240 minutes - Updated minimum version requirement for zero downtime scenarios to v0.126.0 since that's the earliest version can upgrade successfully - Run for all checks instead of one - Remove skipping of healthchecks --- ci/nightly/pipeline.template.yml | 6 +++--- misc/python/materialize/checks/mzcompose_actions.py | 2 -- misc/python/materialize/checks/scenarios_zero_downtime.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 1d720538327a5..fd65ebc1e62e9 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -1097,14 +1097,14 @@ steps: - id: checks-0dt-upgrade-for-previous-versions-to-current label: "Checks 0dt upgrade for previous versions to current" depends_on: 
build-x86_64 - timeout_in_minutes: 120 - parallelism: 3 + timeout_in_minutes: 240 + parallelism: 2 agents: queue: hetzner-x86-64-16cpu-32gb plugins: - ./ci/plugins/mzcompose: composition: platform-checks - args: [--scenario=MultiVersionZeroDowntimeBasic, "--seed=$BUILDKITE_JOB_ID", "--teardown=True", "--check=CreateCluster"] + args: [--scenario=MultiVersionZeroDowntimeBasic, "--seed=$BUILDKITE_JOB_ID", "--teardown=True"] - id: checks-0dt-bump-version label: "Checks 0dt upgrade to a bumped version" diff --git a/misc/python/materialize/checks/mzcompose_actions.py b/misc/python/materialize/checks/mzcompose_actions.py index 6c183e806a7ad..6db202c6370be 100644 --- a/misc/python/materialize/checks/mzcompose_actions.py +++ b/misc/python/materialize/checks/mzcompose_actions.py @@ -96,8 +96,6 @@ def execute(self, e: Executor) -> None: with c.override(mz, fail_on_new_service=False): c.up( "materialized" if self.mz_service is None else self.mz_service, - wait=False, - max_tries=2, ) # If we start up Materialize with a deploy-generation , then it diff --git a/misc/python/materialize/checks/scenarios_zero_downtime.py b/misc/python/materialize/checks/scenarios_zero_downtime.py index 3f1c1c112e01b..0a9a6f0cc09e7 100644 --- a/misc/python/materialize/checks/scenarios_zero_downtime.py +++ b/misc/python/materialize/checks/scenarios_zero_downtime.py @@ -356,7 +356,7 @@ def actions(self) -> list[Action]: versions_from_docs = [ version for version in VersionsFromDocs(respect_released_tag=True).minor_versions() - if version >= MzVersion.parse_mz("v0.107.0") + if version >= MzVersion.parse_mz("v0.126.0") ] From 0f782c907f2d9a0d2350df87ade3f9e7549a8c50 Mon Sep 17 00:00:00 2001 From: Sang Jun Bak Date: Thu, 16 Oct 2025 10:49:07 -0400 Subject: [PATCH 6/6] Enhance zero downtime checks and teardown process - Added execution mode option to the CI pipeline for one-at-a-time execution. - Introduced a fast teardown function to streamline resource cleanup during tests. 
- Don't stop process on failure of a check --- ci/nightly/pipeline.template.yml | 2 +- .../checks/scenarios_zero_downtime.py | 4 +++ test/platform-checks/mzcompose.py | 35 +++++++++++++++++-- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index fd65ebc1e62e9..89f21dac92da8 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -1104,7 +1104,7 @@ steps: plugins: - ./ci/plugins/mzcompose: composition: platform-checks - args: [--scenario=MultiVersionZeroDowntimeBasic, "--seed=$BUILDKITE_JOB_ID", "--teardown=True"] + args: [--scenario=MultiVersionZeroDowntimeBasic, "--seed=$BUILDKITE_JOB_ID", "--teardown=True", "--execution-mode=oneatatime"] - id: checks-0dt-bump-version label: "Checks 0dt upgrade to a bumped version" diff --git a/misc/python/materialize/checks/scenarios_zero_downtime.py b/misc/python/materialize/checks/scenarios_zero_downtime.py index 0a9a6f0cc09e7..49abfc93ca73d 100644 --- a/misc/python/materialize/checks/scenarios_zero_downtime.py +++ b/misc/python/materialize/checks/scenarios_zero_downtime.py @@ -329,6 +329,9 @@ def actions(self) -> list[Action]: self, tag=self.base_version(), mz_service="mz_1", + system_parameter_defaults=get_default_system_parameters( + self.base_version() + ), ), Initialize(self, mz_service="mz_1"), Manipulate(self, phase=1, mz_service="mz_1"), @@ -338,6 +341,7 @@ def actions(self) -> list[Action]: tag=None, deploy_generation=1, mz_service="mz_2", + system_parameter_defaults=get_default_system_parameters(None), ), *wait_ready_and_promote("mz_2"), Validate(self, mz_service="mz_2"), diff --git a/test/platform-checks/mzcompose.py b/test/platform-checks/mzcompose.py index d31b6b640ebea..6c91179af4827 100644 --- a/test/platform-checks/mzcompose.py +++ b/test/platform-checks/mzcompose.py @@ -206,6 +206,25 @@ def teardown(c: Composition) -> None: c.rm_volumes("mzdata", "tmp", force=True) +def fast_teardown(c: 
Composition) -> None: + c.rm( + *[ + s.name + for s in SERVICES + if s.name != "debezium" + and s.name != "kafka" + and s.name != "schema-registry" + and s.name != "azurite" + and s.name != "mysql" + and s.name != "ssh-bastion-host" + and s.name != "zookeeper" + ], + stop=True, + destroy_volumes=True, + ) + c.rm_volumes("mzdata", "tmp", force=True) + + def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: # c.silent = True parser.add_argument( @@ -395,9 +414,19 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None: "mz_1", "mz_2", ) - print("Error in scenario", e) + print( + f"Error: {e}" + + ( + f" (version: {executor.current_mz_version})" + if hasattr(executor, "current_mz_version") + else "" + ) + ) + finally: + if args.teardown == "True": + fast_teardown(c) + else: raise RuntimeError(f"Unsupported execution mode: {execution_mode}") if args.teardown == "True": - - teardown(c) + fast_teardown(c)