Skip to content

Commit 965a391

Browse files
himanshujindal authored and jaydeep1984 committed
Stop repair scheduler if two major versions are detected
patch by Himanshu Jindal; reviewed by Jaydeepkumar Chovatia, Andy Tolbert for CASSANDRA-20048
1 parent ecaa650 commit 965a391

File tree

14 files changed

+143
-6
lines changed

14 files changed

+143
-6
lines changed

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
5.1
2+
* Stop repair scheduler if two major versions are detected (CASSANDRA-20048)
23
* Optimize audit logic for batch operations especially when audit is not enabled for DML (CASSANDRA-20885)
34
* Implement nodetool history (CASSANDRA-20851)
45
* Expose StorageService.dropPreparedStatements via JMX (CASSANDRA-20870)

conf/cassandra.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2782,6 +2782,9 @@ storage_compatibility_mode: NONE
27822782
# # The scheduler needs to adjust its order when nodes leave the ring. Deleted hosts are tracked in metadata
27832783
# # for a specified duration to ensure they are indeed removed before adjustments are made to the schedule.
27842784
# history_clear_delete_hosts_buffer_interval: 2h
2785+
# # By default repair is disabled if there are mixed major versions detected - which would happen
2786+
# # if a major version upgrade is being performed on the cluster, but a user can enable it using this flag
2787+
# mixed_major_version_repair_enabled: false
27852788
# # NOTE: Each of the below settings can be overridden per repair type under repair_type_overrides
27862789
# global_settings:
27872790
# # If true, attempts to group tables in the same keyspace into one repair; otherwise, each table is repaired

conf/cassandra_latest.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2467,6 +2467,9 @@ storage_compatibility_mode: NONE
24672467
# # The scheduler needs to adjust its order when nodes leave the ring. Deleted hosts are tracked in metadata
24682468
# # for a specified duration to ensure they are indeed removed before adjustments are made to the schedule.
24692469
# history_clear_delete_hosts_buffer_interval: 2h
2470+
# # By default repair is disabled if there are mixed major versions detected - which would happen
2471+
# # if a major version upgrade is being performed on the cluster, but a user can enable it using this flag
2472+
# mixed_major_version_repair_enabled: false
24702473
# # NOTE: Each of the below settings can be overridden per repair type under repair_type_overrides
24712474
# global_settings:
24722475
# # If true, attempts to group tables in the same keyspace into one repair; otherwise, each table is repaired

doc/modules/cassandra/pages/managing/operating/auto_repair.adoc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,10 @@ is time to schedule repairs.
167167
| history_clear_delete_hosts_buffer_interval | 2h | The scheduler needs to adjust its order when nodes leave the ring.
168168
Deleted hosts are tracked in metadata for a specified duration to ensure they are indeed removed before adjustments
169169
are made to the schedule.
170+
| mixed_major_version_repair_enabled | false | Enable/Disable running repairs on the cluster when there are mixed
171+
major versions detected, which usually occurs when the cluster is being upgraded. Repairs between nodes of
172+
different major versions are not tested, so this may lead to data compatibility issues.
173+
It is strongly discouraged to set this to true without doing extensive testing beforehand.
170174
|===
171175

172176

src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,11 @@ public void repair(AutoRepairConfig.RepairType repairType)
165165
logger.debug("Auto-repair is disabled for repair type {}", repairType);
166166
return;
167167
}
168+
if (!config.isMixedMajorVersionRepairEnabled() && AutoRepairUtils.hasMultipleLiveMajorVersions())
169+
{
170+
logger.info("Auto-repair is disabled when nodes in the cluster have different major versions");
171+
return;
172+
}
168173
AutoRepairService.instance.checkCanRun(repairType);
169174
AutoRepairState repairState = repairStates.get(repairType);
170175
try

src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ public class AutoRepairConfig implements Serializable
5959
// Minimum duration for the execution of a single repair task. This prevents the scheduler from overwhelming
6060
// the node by scheduling too many repair tasks in a short period of time.
6161
public volatile DurationSpec.LongSecondsBound repair_task_min_duration = new DurationSpec.LongSecondsBound("5s");
62+
// by default repair is disabled if there are mixed major versions detected, but you can enable it using this flag
63+
public volatile boolean mixed_major_version_repair_enabled = false;
6264

6365
// global_settings overrides Options.defaultOptions for all repair types
6466
public volatile Options global_settings;
@@ -149,6 +151,11 @@ public void setAutoRepairSchedulingEnabled(boolean enabled)
149151
this.enabled = enabled;
150152
}
151153

154+
public boolean isMixedMajorVersionRepairEnabled()
155+
{
156+
return mixed_major_version_repair_enabled;
157+
}
158+
152159
public DurationSpec.IntSecondsBound getAutoRepairHistoryClearDeleteHostsBufferInterval()
153160
{
154161
return history_clear_delete_hosts_buffer_interval;
@@ -366,6 +373,16 @@ public void setRepairRetryBackoff(RepairType repairType, String interval)
366373
getOptions(repairType).repair_retry_backoff = new DurationSpec.LongSecondsBound(interval);
367374
}
368375

376+
public boolean getMixedMajorVersionRepairEnabled()
377+
{
378+
return this.mixed_major_version_repair_enabled;
379+
}
380+
381+
public void setMixedMajorVersionRepairEnabled(boolean enabled)
382+
{
383+
this.mixed_major_version_repair_enabled = enabled;
384+
}
385+
369386
@VisibleForTesting
370387
static IAutoRepairTokenRangeSplitter newAutoRepairTokenRangeSplitter(RepairType repairType, ParameterizedClass parameterizedClass) throws ConfigurationException
371388
{

src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,21 @@ public static CurrentRepairStatus getCurrentRepairStatus(RepairType repairType,
425425
return null;
426426
}
427427

428+
/**
429+
* Checks whether the cluster has multiple major versions
430+
* @return
431+
* true if more than one major versions are detected
432+
* false if only one major version is detected
433+
*
434+
*/
435+
public static boolean hasMultipleLiveMajorVersions()
436+
{
437+
ClusterMetadata metadata = ClusterMetadata.current();
438+
int maxMajorVersion = ClusterMetadata.current().directory.clusterMaxVersion.cassandraVersion.major;
439+
int minMajorVersion = ClusterMetadata.current().directory.clusterMinVersion.cassandraVersion.major;
440+
return maxMajorVersion != minMajorVersion;
441+
}
442+
428443
@VisibleForTesting
429444
protected static TreeSet<UUID> getHostIdsInCurrentRing(RepairType repairType, Collection<NodeAddresses> allNodesInRing)
430445
{

src/java/org/apache/cassandra/service/AutoRepairService.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ public String getAutoRepairConfiguration()
102102
appendConfig(sb, "repair_check_interval", config.getRepairCheckInterval());
103103
appendConfig(sb, "repair_task_min_duration", config.getRepairTaskMinDuration());
104104
appendConfig(sb, "history_clear_delete_hosts_buffer_interval", config.getAutoRepairHistoryClearDeleteHostsBufferInterval());
105+
appendConfig(sb, "mixed_major_version_repair_enabled", config.getMixedMajorVersionRepairEnabled());
105106
for (RepairType repairType : RepairType.values())
106107
{
107108
sb.append(formatRepairTypeConfig(repairType, config));
@@ -271,6 +272,12 @@ public void setAutoRepairRetryBackoff(String repairType, String interval)
271272
config.setRepairRetryBackoff(RepairType.parse(repairType), interval);
272273
}
273274

275+
@Override
276+
public void setMixedMajorVersionRepairEnabled(boolean enabled)
277+
{
278+
config.setMixedMajorVersionRepairEnabled(enabled);
279+
}
280+
274281
private String formatRepairTypeConfig(RepairType repairType, AutoRepairConfig config)
275282
{
276283
StringBuilder sb = new StringBuilder();

src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,6 @@ public interface AutoRepairServiceMBean
7474
public void setAutoRepairMaxRetriesCount(String repairType, int retries);
7575

7676
public void setAutoRepairRetryBackoff(String repairType, String interval);
77+
78+
/**
 * Sets the {@code mixed_major_version_repair_enabled} auto-repair option.
 *
 * @param enabled the new value for the option
 */
public void setMixedMajorVersionRepairEnabled(boolean enabled);
7779
}

src/java/org/apache/cassandra/tools/NodeProbe.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2677,6 +2677,11 @@ public GuardrailsMBean getGuardrailsMBean()
26772677
{
26782678
return grProxy;
26792679
}
2680+
2681+
public void setMixedMajorVersionRepairEnabled(boolean enabled)
2682+
{
2683+
autoRepairProxy.setMixedMajorVersionRepairEnabled(enabled);
2684+
}
26802685
}
26812686

26822687
class ColumnFamilyStoreMBeanIterator implements Iterator<Map.Entry<String, ColumnFamilyStoreMBean>>

0 commit comments

Comments
 (0)