diff --git a/doc/user/content/releases/previews.md b/doc/user/content/releases/previews.md index 382efd8f71128..e5b5ed2b49e57 100644 --- a/doc/user/content/releases/previews.md +++ b/doc/user/content/releases/previews.md @@ -35,12 +35,14 @@ contact your Materialize support. {{}} -### Graceful cluster resizing +### Zero-downtime cluster resizing -For clusters that do not contain sources or sinks, Materialize supports altering -the cluster size with no downtime (i.e., graceful cluster resizing). +For clusters that do not contain sources or sinks, Materialize supports +altering the cluster size with no downtime (i.e., zero-downtime cluster +resizing). -For more information, see [Graceful cluster resizing](/sql/alter-cluster/#graceful-cluster-resizing). +For more information, see [Zero-downtime cluster +resizing](/sql/alter-cluster/#zero-downtime-cluster-resizing). ### Real-time recency diff --git a/doc/user/content/sql/alter-cluster.md b/doc/user/content/sql/alter-cluster.md index 9ffd38d17d69b..9752d576f964f 100644 --- a/doc/user/content/sql/alter-cluster.md +++ b/doc/user/content/sql/alter-cluster.md @@ -140,17 +140,18 @@ Depending on the type of objects in a cluster, a resizing operation might incur all objects in the cluster to hydrate. * For clusters that **do not contain sources or sinks**, it's possible to avoid - downtime by performing a [graceful cluster resizing](#graceful-cluster-resizing). + downtime by performing a [zero-downtime cluster + resizing](#zero-downtime-cluster-resizing). -#### Graceful cluster resizing +#### Zero-downtime cluster resizing {{< private-preview />}} For clusters that do not contain sources or sinks, you can use the `WAIT UNTIL -READY` option to perform a graceful resizing, which incurs **no downtime**. -Instead of restarting the cluster, this approach spins up an additional cluster -replica under the covers with the desired new size, waits for the replica to be -hydrated, and then replaces the original replica. +READY` option to perform a zero-downtime resizing, which incurs **no +downtime**. Instead of restarting the cluster, this approach spins up an +additional cluster replica under the covers with the desired new size, waits +for the replica to be hydrated, and then replaces the original replica. ```sql ALTER CLUSTER c1 @@ -255,9 +256,9 @@ or `1`. ### Resizing - For clusters **without any sources or sinks**, you can alter the cluster size - with **no downtime** (i.e., [graceful cluster - resizing](#graceful-cluster-resizing)) by running the `ALTER CLUSTER` command - with the `WAIT UNTIL READY` [option](#with-options): + with **no downtime** (i.e., [zero-downtime cluster + resizing](#zero-downtime-cluster-resizing)) by running the `ALTER CLUSTER` + command with the `WAIT UNTIL READY` [option](#with-options): ```mzsql ALTER CLUSTER c1 @@ -265,9 +266,9 @@ or `1`. ```` - For clusters **with sources or sinks**, it's not yet possible to perform - graceful cluster resizing. This means that resizing clusters with sources or - sinks requires a cluster **restart**, which incurs **downtime**. You can - alter the cluster size by running the `ALTER CLUSTER` command: + zero-downtime cluster resizing. This means that resizing clusters with + sources or sinks requires a cluster **restart**, which incurs **downtime**. 
+ You can alter the cluster size by running the `ALTER CLUSTER` command: ```mzsql ALTER CLUSTER c1 SET (SIZE '100cc'); diff --git a/doc/user/content/sql/create-cluster.md b/doc/user/content/sql/create-cluster.md index 066c2d1b272c5..3fbaeb7bfdc51 100644 --- a/doc/user/content/sql/create-cluster.md +++ b/doc/user/content/sql/create-cluster.md @@ -109,8 +109,9 @@ You can change the size of a cluster to respond to changes in your workload using [`ALTER CLUSTER`](/sql/alter-cluster). Depending on the type of objects the cluster is hosting, this operation **might incur downtime**. -See the reference documentation for [`ALTER CLUSTER`](/sql/alter-cluster#graceful-cluster-resizing) -for more details on cluster resizing. +See the reference documentation for [`ALTER +CLUSTER`](/sql/alter-cluster#zero-downtime-cluster-resizing) for more details +on cluster resizing. #### Legacy sizes diff --git a/src/adapter/src/coord/sequencer/inner/cluster.rs b/src/adapter/src/coord/sequencer/inner/cluster.rs index a59bff59b9114..a87f882407ef3 100644 --- a/src/adapter/src/coord/sequencer/inner/cluster.rs +++ b/src/adapter/src/coord/sequencer/inner/cluster.rs @@ -282,8 +282,8 @@ impl Coordinator { ) .await?; if alter_followup == NeedsFinalization::Yes { - // For non backgrounded graceful alters, store the cluster_id in the ConnMeta - // to allow for cancellation. + // For non backgrounded zero-downtime alters, store the + // cluster_id in the ConnMeta to allow for cancellation. self.active_conns .get_mut(session.conn_id()) .expect("There must be an active connection") @@ -1185,10 +1185,9 @@ impl Coordinator { || new_disk != disk { self.ensure_valid_azs(new_availability_zones.iter())?; - // If we're not doing a graceful reconfig - // tear down all replicas, create new ones - // else create the pending replicas and return - // early asking for finalization + // If we're not doing a zero-downtime reconfig tear down all + // replicas, create new ones else create the pending replicas and + // return early asking for finalization match strategy { AlterClusterPlanStrategy::None => { let replica_ids_and_reasons = (0..*replication_factor) diff --git a/src/sql/src/plan/statement/ddl.rs b/src/sql/src/plan/statement/ddl.rs index adec210c3faab..4100e0c026091 100644 --- a/src/sql/src/plan/statement/ddl.rs +++ b/src/sql/src/plan/statement/ddl.rs @@ -5895,7 +5895,7 @@ pub fn plan_alter_cluster( AlterClusterPlanStrategy::None => {} _ => { scx.require_feature_flag( - &crate::session::vars::ENABLE_GRACEFUL_CLUSTER_RECONFIGURATION, + &crate::session::vars::ENABLE_ZERO_DOWNTIME_CLUSTER_RECONFIGURATION, )?; } } diff --git a/src/sql/src/session/vars/definitions.rs b/src/sql/src/session/vars/definitions.rs index a3116a99b2418..d647ecce22313 100644 --- a/src/sql/src/session/vars/definitions.rs +++ b/src/sql/src/session/vars/definitions.rs @@ -2126,8 +2126,8 @@ feature_flags!( enable_for_item_parsing: false, }, { - name: enable_graceful_cluster_reconfiguration, - desc: "Enable graceful reconfiguration for alter cluster", + name: enable_zero_downtime_cluster_reconfiguration, + desc: "Enable zero-downtime reconfiguration for alter cluster", default: false, enable_for_item_parsing: false, }, diff --git a/test/cloudtest/test_managed_cluster.py b/test/cloudtest/test_managed_cluster.py index d99ef9419534b..d3036e64456bb 100644 --- a/test/cloudtest/test_managed_cluster.py +++ b/test/cloudtest/test_managed_cluster.py @@ -115,10 +115,10 @@ def test_managed_cluster_sizing(mz: MaterializeApplication) -> None: ) -def test_graceful_reconfiguration(mz: 
MaterializeApplication) -> None: +def test_zero_downtime_reconfiguration(mz: MaterializeApplication) -> None: mz.environmentd.sql( """ - ALTER SYSTEM SET enable_graceful_cluster_reconfiguration = true; + ALTER SYSTEM SET enable_zero_downtime_cluster_reconfiguration = true; """, port="internal", user="mz_system", @@ -130,7 +130,7 @@ def assert_replica_names(names, allow_pending=False): SELECT mz_cluster_replicas.name FROM mz_cluster_replicas, mz_clusters WHERE mz_cluster_replicas.cluster_id = mz_clusters.id - AND mz_clusters.name = 'gracefulatlertest'; + AND mz_clusters.name = 'zdtaltertest'; """ ) assert [replica[0] for replica in replicas] == names @@ -143,14 +143,14 @@ def assert_replica_names(names, allow_pending=False): FROM mz_internal.mz_pending_cluster_replicas ur INNER join mz_cluster_replicas cr ON cr.id=ur.id INNER join mz_clusters c ON c.id=cr.cluster_id - WHERE c.name = 'gracefulatlertest'; + WHERE c.name = 'zdtaltertest'; """ ) ) == 0 ), "There should be no pending replicas" - # Basic Graceful reocnfig test cases matrix + # Basic zero-downtime reconfig test cases matrix # - size change, no replica change # - replica size up, no other change # - replica size down, with size change @@ -161,16 +161,16 @@ def assert_replica_names(names, allow_pending=False): # - names should match r# patter, not end with `-pending` # - cancelled statements correctly roll back # - timedout until ready queries take the appropriate action - # - Fails to gracefully alter cluster with source + # - Fails to zero-downtime alter cluster with source mz.environmentd.sql( - 'CREATE CLUSTER gracefulatlertest ( SIZE = "1" )', + 'CREATE CLUSTER zdtaltertest ( SIZE = "1" )', port="internal", user="mz_system", ) mz.environmentd.sql( """ - ALTER CLUSTER gracefulatlertest SET ( SIZE = '2' ) WITH ( WAIT FOR '1ms' ) + ALTER CLUSTER zdtaltertest SET ( SIZE = '2' ) WITH ( WAIT FOR '1ms' ) """, port="internal", user="mz_system", @@ -179,7 +179,7 @@ def assert_replica_names(names, allow_pending=False): mz.environmentd.sql( """ - ALTER CLUSTER gracefulatlertest SET ( SIZE = '1', REPLICATION FACTOR 2 ) WITH ( WAIT FOR '1ms' ) + ALTER CLUSTER zdtaltertest SET ( SIZE = '1', REPLICATION FACTOR 2 ) WITH ( WAIT FOR '1ms' ) """, port="internal", user="mz_system", @@ -188,7 +188,7 @@ def assert_replica_names(names, allow_pending=False): mz.environmentd.sql( """ - ALTER CLUSTER gracefulatlertest SET ( SIZE = '1', REPLICATION FACTOR 1 ) WITH ( WAIT FOR '1ms' ) + ALTER CLUSTER zdtaltertest SET ( SIZE = '1', REPLICATION FACTOR 1 ) WITH ( WAIT FOR '1ms' ) """, port="internal", user="mz_system", @@ -197,7 +197,7 @@ def assert_replica_names(names, allow_pending=False): mz.environmentd.sql( """ - ALTER CLUSTER gracefulatlertest SET ( SIZE = '2', REPLICATION FACTOR 2 ) WITH ( WAIT FOR '1ms' ) + ALTER CLUSTER zdtaltertest SET ( SIZE = '2', REPLICATION FACTOR 2 ) WITH ( WAIT FOR '1ms' ) """, port="internal", user="mz_system", @@ -206,7 +206,7 @@ def assert_replica_names(names, allow_pending=False): mz.environmentd.sql( """ - ALTER CLUSTER gracefulatlertest SET ( SIZE = '1', REPLICATION FACTOR 1 ) WITH ( WAIT FOR '1ms' ) + ALTER CLUSTER zdtaltertest SET ( SIZE = '1', REPLICATION FACTOR 1 ) WITH ( WAIT FOR '1ms' ) """, port="internal", user="mz_system", @@ -217,34 +217,34 @@ def assert_replica_names(names, allow_pending=False): # replica checks during alter mz.environmentd.sql( """ - DROP CLUSTER IF EXISTS gracefulatlertest CASCADE; + DROP CLUSTER IF EXISTS zdtaltertest CASCADE; DROP TABLE IF EXISTS t CASCADE; - CREATE CLUSTER 
gracefulatlertest ( SIZE = '1'); + CREATE CLUSTER zdtaltertest ( SIZE = '1'); - SET CLUSTER = gracefulatlertest; + SET CLUSTER = zdtaltertest; -- now let's give it another go with user-defined objects CREATE TABLE t (a int); CREATE DEFAULT INDEX ON t; INSERT INTO t VALUES (42); - GRANT ALL ON CLUSTER gracefulatlertest TO materialize; + GRANT ALL ON CLUSTER zdtaltertest TO materialize; """, port="internal", user="mz_system", ) # Valudate replicas are correct during an ongoing alter - def gracefully_alter(): + def zero_downtime_alter(): mz.environmentd.sql( """ - ALTER CLUSTER gracefulatlertest SET (SIZE = '2') WITH ( WAIT FOR '5s') + ALTER CLUSTER zdtaltertest SET (SIZE = '2') WITH ( WAIT FOR '5s') """, port="internal", user="mz_system", ) - thread = Thread(target=gracefully_alter) + thread = Thread(target=zero_downtime_alter) thread.start() time.sleep(1) @@ -252,7 +252,7 @@ def gracefully_alter(): assert ( mz.environmentd.sql_query( """ - SELECT size FROM mz_clusters WHERE name='gracefulatlertest'; + SELECT size FROM mz_clusters WHERE name='zdtaltertest'; """ ) == (["1"],) @@ -264,7 +264,7 @@ def gracefully_alter(): assert ( mz.environmentd.sql_query( """ - SELECT size FROM mz_clusters WHERE name='gracefulatlertest'; + SELECT size FROM mz_clusters WHERE name='zdtaltertest'; """ ) == (["2"],) @@ -332,11 +332,11 @@ def query_with_conn( == (["1"],) ), "Cluster should not have updated if canceled during alter" - # Test graceful reconfig wait until ready + # Test zero-downtime reconfig wait until ready mz.environmentd.sql( """ DROP CLUSTER IF EXISTS cluster1 CASCADE; - DROP CLUSTER IF EXISTS gracefulaltertest CASCADE; + DROP CLUSTER IF EXISTS zdtaltertest CASCADE; """, port="internal", user="mz_system", diff --git a/test/cluster/mzcompose.py b/test/cluster/mzcompose.py index 91bb01b047a4c..bbc5fe5f29bca 100644 --- a/test/cluster/mzcompose.py +++ b/test/cluster/mzcompose.py @@ -4934,11 +4934,11 @@ def workflow_test_unified_introspection_during_replica_disconnect(c: Composition ) -def workflow_test_graceful_reconfigure( +def workflow_test_zero_downtime_reconfigure( c: Composition, parser: WorkflowArgumentParser ) -> None: """ - Tests gracefully reconfiguring a managed cluster + Tests reconfiguring a managed cluster with zero downtime """ c.down(destroy_volumes=True) with c.override( @@ -4949,7 +4949,7 @@ def workflow_test_graceful_reconfigure( c.up("clusterd1") c.sql( """ - ALTER SYSTEM SET enable_graceful_cluster_reconfiguration = true; + ALTER SYSTEM SET enable_zero_downtime_cluster_reconfiguration = true; DROP CLUSTER IF EXISTS cluster1 CASCADE; DROP TABLE IF EXISTS t CASCADE; @@ -4991,7 +4991,7 @@ def workflow_test_graceful_reconfigure( len(replicas) == 0 ), f"Cluster should only have no pending replica prior to alter, found {replicas}" - def gracefully_alter(): + def zero_downtime_alter(): try: c.sql( """ @@ -5005,7 +5005,7 @@ def gracefully_alter(): pass # Run a reconfigure - thread = Thread(target=gracefully_alter) + thread = Thread(target=zero_downtime_alter) thread.start() time.sleep(3) @@ -5059,7 +5059,7 @@ def gracefully_alter(): ) c.sql( """ - ALTER SYSTEM RESET enable_graceful_cluster_reconfiguration; + ALTER SYSTEM RESET enable_zero_downtime_cluster_reconfiguration; """, port=6877, user="mz_system", diff --git a/test/sqllogictest/managed_cluster.slt b/test/sqllogictest/managed_cluster.slt index 3d29b336aed73..41289abd15bcb 100644 --- a/test/sqllogictest/managed_cluster.slt +++ b/test/sqllogictest/managed_cluster.slt @@ -446,7 +446,7 @@ DROP CLUSTER foo simple 
conn=mz_system,user=mz_system -ALTER SYSTEM SET enable_graceful_cluster_reconfiguration = true; +ALTER SYSTEM SET enable_zero_downtime_cluster_reconfiguration = true; ---- COMPLETE 0
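Putting the renamed pieces above together, the intended flow is: enable the feature flag, then issue `ALTER CLUSTER ... WITH (WAIT UNTIL READY ...)` so the cluster is resized by hydrating a pending replica in the background instead of restarting. The sketch below is assembled from statements shown in this diff; the `TIMEOUT` and `ON TIMEOUT` values are illustrative assumptions, not options introduced or shown by this change.

```mzsql
-- Enable the renamed feature flag (the tests above run this as mz_system on
-- the internal port).
ALTER SYSTEM SET enable_zero_downtime_cluster_reconfiguration = true;

-- Zero-downtime resize: a pending replica with the new size is created and
-- hydrated under the covers, then swapped in for the original replica.
-- The TIMEOUT / ON TIMEOUT values here are illustrative placeholders.
ALTER CLUSTER c1 SET (SIZE '2')
  WITH (WAIT UNTIL READY (TIMEOUT = '10m', ON TIMEOUT = 'COMMIT'));
```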
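While such a statement is in flight, the replacement replica is visible as a pending replica, which is what the cloudtest above asserts. A query in the same shape as that assertion (the `zdtaltertest` cluster name comes from the test and is only an example):

```mzsql
-- List pending replicas created by an in-progress zero-downtime alter.
SELECT cr.name
FROM mz_internal.mz_pending_cluster_replicas ur
JOIN mz_cluster_replicas cr ON cr.id = ur.id
JOIN mz_clusters c ON c.id = cr.cluster_id
WHERE c.name = 'zdtaltertest';
```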