Skip to content

Commit cb0f075

Browse files
authored
feat: add upgrade window check for environmentd image ref (#34327)
This commit introduces a new method to check if the current environmentd image reference is within the upgrade window of the last successful rollout. We default the image ref to the current version if status is not set. Not sure if this is the best way to solve this! One nice thing is we'll see the last version everytime we upgrade which can be useful. An edge case is if the user's materialize CR status is already set, then it won't have `last_completed_rollout_environmentd_image_ref` until the next successful rollout. So hypothetically, users on `v26.0.0` can jump straight to `v28.0.0` if they didn't upgrade between then. We solve this by relaxing the check as much as possible, also in case of other false positives. <img width="1023" height="294" alt="Screenshot 2025-12-02 at 2 01 09 PM" src="https://github.com/user-attachments/assets/b128a351-3aba-42db-9b39-ececa701ac68" /> <!-- Describe the contents of the PR briefly but completely. If you write detailed commit messages, it is acceptable to copy/paste them here, or write "see commit messages for details." If there is only one commit in the PR, GitHub will have already added its commit message above. --> ### Motivation * This PR adds a known-desirable feature. * MaterializeInc/database-issues#9860 <!-- Which of the following best describes the motivation behind this PR? * This PR fixes a recognized bug. [Ensure issue is linked somewhere.] [Ensure issue is linked somewhere.] * This PR fixes a previously unreported bug. [Describe the bug in detail, as if you were filing a bug report.] * This PR adds a feature that has not yet been specified. [Write a brief specification for the feature, including justification for its inclusion in Materialize, as if you were writing the original feature specification.] * This PR refactors existing code. [Describe what was wrong with the existing code, if it is not obvious.] --> ### Tips for reviewer <!-- Leave some tips for your reviewer, like: * The diff is much smaller if viewed with whitespace hidden. * [Some function/module/file] deserves extra attention. * [Some function/module/file] is pure code movement and only needs a skim. Delete this section if no tips. --> ### Checklist - [ ] This PR has adequate test coverage / QA involvement has been duly considered. ([trigger-ci for additional test/nightly runs](https://trigger-ci.dev.materialize.com/)) - [ ] This PR has an associated up-to-date [design doc](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/README.md), is a design doc ([template](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/00000000_template.md)), or is sufficiently small to not require a design. <!-- Reference the design in the description. --> - [ ] If this PR evolves [an existing `$T ⇔ Proto$T` mapping](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/command-and-response-binary-encoding.md) (possibly in a backwards-incompatible way), then it is tagged with a `T-proto` label. - [ ] If this PR will require changes to cloud orchestration or tests, there is a companion cloud PR to account for those changes that is tagged with the release-blocker label ([example](MaterializeInc/cloud#5021)). <!-- Ask in #team-cloud on Slack if you need help preparing the cloud PR. --> - [ ] If this PR includes major [user-facing behavior changes](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/guide-changes.md#what-changes-require-a-release-note), I have pinged the relevant PM to schedule a changelog post.
1 parent cea2162 commit cb0f075

File tree

2 files changed

+205
-1
lines changed

2 files changed

+205
-1
lines changed

src/cloud-resources/src/crd/materialize.rs

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,71 @@ pub mod v1alpha1 {
479479
}
480480
}
481481

482+
/// This check isn't strictly required since environmentd will still be able to determine
483+
/// if the upgrade is allowed or not. However, doing this check allows us to provide
484+
/// the error as soon as possible and in a more user friendly way.
485+
pub fn is_valid_upgrade_version(active_version: &Version, next_version: &Version) -> bool {
486+
// Don't allow rolling back
487+
// Note: semver comparison handles RC versions correctly:
488+
// v26.0.0-rc.1 < v26.0.0-rc.2 < v26.0.0
489+
if next_version < active_version {
490+
return false;
491+
}
492+
493+
if active_version.major == 0 {
494+
// Self managed 25.2 to 26.0
495+
if next_version.major != active_version.major {
496+
if next_version.major == 26
497+
&& active_version.major == 0
498+
&& active_version.minor == 164
499+
{
500+
return true;
501+
} else {
502+
return false;
503+
}
504+
}
505+
// Self managed 25.1 to 25.2
506+
if next_version.minor == 147 && active_version.minor == 130 {
507+
return true;
508+
}
509+
// only allow upgrading a single minor version at a time
510+
return next_version.minor <= active_version.minor + 1;
511+
} else if active_version.major >= 26 {
512+
// For versions 26.X.X and onwards, we deny upgrades past 1 major version of the active version
513+
return next_version.major <= active_version.major + 1;
514+
}
515+
516+
true
517+
}
518+
519+
/// Checks if the current environmentd image ref is within the upgrade window of the last
520+
/// successful rollout.
521+
pub fn within_upgrade_window(&self) -> bool {
522+
let active_environmentd_version = self
523+
.status
524+
.as_ref()
525+
.and_then(|status| {
526+
status
527+
.last_completed_rollout_environmentd_image_ref
528+
.as_ref()
529+
})
530+
.and_then(|image_ref| parse_image_ref(image_ref));
531+
532+
if let (Some(next_environmentd_version), Some(active_environmentd_version)) = (
533+
parse_image_ref(&self.spec.environmentd_image_ref),
534+
active_environmentd_version,
535+
) {
536+
Self::is_valid_upgrade_version(
537+
&active_environmentd_version,
538+
&next_environmentd_version,
539+
)
540+
} else {
541+
// If we fail to parse either version,
542+
// we still allow the upgrade since environmentd will still error if the upgrade is not allowed.
543+
true
544+
}
545+
}
546+
482547
pub fn managed_resource_meta(&self, name: String) -> ObjectMeta {
483548
ObjectMeta {
484549
namespace: Some(self.namespace()),
@@ -516,6 +581,11 @@ pub mod v1alpha1 {
516581
.expect("valid int generation");
517582
}
518583

584+
// Initialize the last completed rollout environmentd image ref to
585+
// the current image ref if not already set.
586+
status.last_completed_rollout_environmentd_image_ref =
587+
Some(self.spec.environmentd_image_ref.clone());
588+
519589
status
520590
})
521591
}
@@ -530,6 +600,10 @@ pub mod v1alpha1 {
530600
pub active_generation: u64,
531601
/// The UUID of the last successfully completed rollout.
532602
pub last_completed_rollout_request: Uuid,
603+
/// The image ref of the environmentd image that was last successfully rolled out.
604+
/// Used to deny upgrades past 1 major version from the last successful rollout.
605+
/// When None, we upgrade anyways.
606+
pub last_completed_rollout_environmentd_image_ref: Option<String>,
533607
/// A hash calculated from the spec of resources to be created based on this Materialize
534608
/// spec. This is used for detecting when the existing resources are up to date.
535609
/// If you want to trigger a rollout without making other changes that would cause this
@@ -630,4 +704,89 @@ mod tests {
630704
mz.spec.environmentd_image_ref = "my.private.registry:5000:v0.33.3".to_owned();
631705
assert!(!mz.meets_minimum_version(&Version::parse("0.34.0").unwrap()));
632706
}
707+
708+
#[mz_ore::test]
709+
fn within_upgrade_window() {
710+
use super::v1alpha1::MaterializeStatus;
711+
712+
let mut mz = Materialize {
713+
spec: MaterializeSpec {
714+
environmentd_image_ref: "materialize/environmentd:v26.0.0".to_owned(),
715+
..Default::default()
716+
},
717+
metadata: ObjectMeta {
718+
..Default::default()
719+
},
720+
status: Some(MaterializeStatus {
721+
last_completed_rollout_environmentd_image_ref: Some(
722+
"materialize/environmentd:v26.0.0".to_owned(),
723+
),
724+
..Default::default()
725+
}),
726+
};
727+
728+
// Pass: upgrading from 26.0.0 to 27.7.3 (within 1 major version)
729+
mz.spec.environmentd_image_ref = "materialize/environmentd:v27.7.3".to_owned();
730+
assert!(mz.within_upgrade_window());
731+
732+
// Pass: upgrading from 26.0.0 to 27.7.8-dev.0 (within 1 major version, pre-release)
733+
mz.spec.environmentd_image_ref = "materialize/environmentd:v27.7.8-dev.0".to_owned();
734+
assert!(mz.within_upgrade_window());
735+
736+
// Fail: upgrading from 26.0.0 to 28.0.1 (more than 1 major version)
737+
mz.spec.environmentd_image_ref = "materialize/environmentd:v28.0.1".to_owned();
738+
assert!(!mz.within_upgrade_window());
739+
740+
// Pass: upgrading from 26.0.0 to 28.0.1.not_a_valid_version (invalid version, defaults to true)
741+
mz.spec.environmentd_image_ref =
742+
"materialize/environmentd:v28.0.1.not_a_valid_version".to_owned();
743+
assert!(mz.within_upgrade_window());
744+
745+
// Pass: upgrading from 0.164.0 to 26.1.0 (self managed 25.2 to 26.0)
746+
mz.status
747+
.as_mut()
748+
.unwrap()
749+
.last_completed_rollout_environmentd_image_ref =
750+
Some("materialize/environmentd:v0.164.0".to_owned());
751+
mz.spec.environmentd_image_ref = "materialize/environmentd:v26.1.0".to_owned();
752+
assert!(mz.within_upgrade_window());
753+
}
754+
755+
#[mz_ore::test]
756+
fn is_valid_upgrade_version() {
757+
let success_tests = [
758+
(Version::new(0, 83, 0), Version::new(0, 83, 0)),
759+
(Version::new(0, 83, 0), Version::new(0, 84, 0)),
760+
(Version::new(0, 9, 0), Version::new(0, 10, 0)),
761+
(Version::new(0, 99, 0), Version::new(0, 100, 0)),
762+
(Version::new(0, 83, 0), Version::new(0, 83, 1)),
763+
(Version::new(0, 83, 0), Version::new(0, 83, 2)),
764+
(Version::new(0, 83, 2), Version::new(0, 83, 10)),
765+
(Version::new(0, 164, 0), Version::new(26, 0, 0)),
766+
(Version::new(26, 0, 0), Version::new(26, 1, 0)),
767+
(Version::new(26, 5, 3), Version::new(26, 10, 0)),
768+
(Version::new(0, 130, 0), Version::new(0, 147, 0)),
769+
];
770+
for (active_version, next_version) in success_tests {
771+
assert!(
772+
Materialize::is_valid_upgrade_version(&active_version, &next_version),
773+
"v{active_version} can upgrade to v{next_version}"
774+
);
775+
}
776+
777+
let failure_tests = [
778+
(Version::new(0, 83, 0), Version::new(0, 82, 0)),
779+
(Version::new(0, 83, 3), Version::new(0, 83, 2)),
780+
(Version::new(0, 83, 3), Version::new(1, 83, 3)),
781+
(Version::new(0, 83, 0), Version::new(0, 85, 0)),
782+
(Version::new(26, 0, 0), Version::new(28, 0, 0)),
783+
(Version::new(0, 130, 0), Version::new(26, 1, 0)),
784+
];
785+
for (active_version, next_version) in failure_tests {
786+
assert!(
787+
!Materialize::is_valid_upgrade_version(&active_version, &next_version),
788+
"v{active_version} can't upgrade to v{next_version}"
789+
);
790+
}
791+
}
633792
}

src/orchestratord/src/controller/materialize.rs

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,9 @@ impl Context {
360360
MaterializeStatus {
361361
active_generation: desired_generation,
362362
last_completed_rollout_request: mz.requested_reconciliation_id(),
363+
last_completed_rollout_environmentd_image_ref: Some(
364+
mz.spec.environmentd_image_ref.clone(),
365+
),
363366
resource_id: mz.status().resource_id,
364367
resources_hash,
365368
conditions: vec![Condition {
@@ -591,7 +594,7 @@ impl k8s_controller::Context for Context {
591594
)
592595
.await
593596
}
594-
// There are changes pending, and we want to appy them.
597+
// There are changes pending, and we want to apply them.
595598
(false, true, true) => {
596599
// we remove the environment resources hash annotation here
597600
// because if we fail halfway through applying the resources,
@@ -615,6 +618,8 @@ impl k8s_controller::Context for Context {
615618
// we fail later on, we want to ensure that the
616619
// rollout gets retried.
617620
last_completed_rollout_request: status.last_completed_rollout_request,
621+
last_completed_rollout_environmentd_image_ref: status
622+
.last_completed_rollout_environmentd_image_ref,
618623
resource_id: status.resource_id,
619624
resources_hash: String::new(),
620625
conditions: vec![Condition {
@@ -634,6 +639,38 @@ impl k8s_controller::Context for Context {
634639
let mz = &mz;
635640
let status = mz.status();
636641

642+
if !mz.within_upgrade_window() {
643+
let last_completed_rollout_environmentd_image_ref =
644+
status.last_completed_rollout_environmentd_image_ref;
645+
646+
self.update_status(
647+
&mz_api,
648+
mz,
649+
MaterializeStatus {
650+
active_generation,
651+
last_completed_rollout_request: status.last_completed_rollout_request,
652+
last_completed_rollout_environmentd_image_ref: last_completed_rollout_environmentd_image_ref.clone(),
653+
resource_id: status.resource_id,
654+
resources_hash: status.resources_hash,
655+
conditions: vec![Condition {
656+
type_: "UpToDate".into(),
657+
status: "False".into(),
658+
last_transition_time: Time(chrono::offset::Utc::now()),
659+
message: format!(
660+
"Refusing to upgrade from {} to {}. More than one major version from last successful rollout.",
661+
last_completed_rollout_environmentd_image_ref.expect("should be set if upgrade window check fails"),
662+
&mz.spec.environmentd_image_ref,
663+
),
664+
observed_generation: mz.meta().generation,
665+
reason: "FailedDeploy".into(),
666+
}],
667+
},
668+
active_generation != desired_generation,
669+
)
670+
.await?;
671+
return Ok(None);
672+
}
673+
637674
if mz.spec.rollout_strategy
638675
== MaterializeRolloutStrategy::ImmediatelyPromoteCausingDowntime
639676
{
@@ -673,6 +710,8 @@ impl k8s_controller::Context for Context {
673710
// rollout gets retried.
674711
last_completed_rollout_request: status
675712
.last_completed_rollout_request,
713+
last_completed_rollout_environmentd_image_ref: status
714+
.last_completed_rollout_environmentd_image_ref,
676715
resource_id: status.resource_id,
677716
resources_hash: resources_hash.clone(),
678717
conditions: vec![Condition {
@@ -710,6 +749,8 @@ impl k8s_controller::Context for Context {
710749
// the rollout and we want to ensure it gets
711750
// retried.
712751
last_completed_rollout_request: status.last_completed_rollout_request,
752+
last_completed_rollout_environmentd_image_ref: status
753+
.last_completed_rollout_environmentd_image_ref,
713754
resource_id: status.resource_id,
714755
resources_hash: status.resources_hash,
715756
conditions: vec![Condition {
@@ -746,6 +787,8 @@ impl k8s_controller::Context for Context {
746787
MaterializeStatus {
747788
active_generation,
748789
last_completed_rollout_request: mz.requested_reconciliation_id(),
790+
last_completed_rollout_environmentd_image_ref: status
791+
.last_completed_rollout_environmentd_image_ref,
749792
resource_id: status.resource_id,
750793
resources_hash: status.resources_hash,
751794
conditions: vec![Condition {
@@ -786,6 +829,8 @@ impl k8s_controller::Context for Context {
786829
MaterializeStatus {
787830
active_generation,
788831
last_completed_rollout_request: mz.requested_reconciliation_id(),
832+
last_completed_rollout_environmentd_image_ref: status
833+
.last_completed_rollout_environmentd_image_ref,
789834
resource_id: status.resource_id,
790835
resources_hash: status.resources_hash,
791836
conditions: vec![Condition {

0 commit comments

Comments
 (0)