Skip to content

Commit bca4cd6

Browse files
committed
SOLR-17652: Fix a bug that could cause long leader elections to leave PULL replicas in DOWN state forever
1 parent 5e328ee commit bca4cd6

File tree

2 files changed

+17
-9
lines changed

2 files changed

+17
-9
lines changed

solr/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,8 @@ Bug Fixes
185185
* SOLR-17637: LBHttp2SolrClient can fail to complete async requests in certain error scenarios.
186186
This can cause the HttpShardHandler to indefinitely wait on a completed response that will never come. (Houston Putman)
187187

188+
* SOLR-17652: Fix a bug that could cause long leader elections to leave PULL replicas in DOWN state forever. (hossman)
189+
188190
Dependency Upgrades
189191
---------------------
190192
* SOLR-17471: Upgrade Lucene to 9.12.1. (Pierre Salagnac, Christine Poerschke)

solr/core/src/java/org/apache/solr/cloud/ZkController.java

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1315,16 +1315,21 @@ public String register(
13151315
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
13161316
}
13171317

1318-
// in this case, we want to wait for the leader as long as the leader might
1319-
// wait for a vote, at least - but also long enough that a large cluster has
1320-
// time to get its act together
1321-
String leaderUrl = getLeader(cloudDesc, leaderVoteWait + 600000);
1318+
final String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
13221319

1323-
String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
1324-
log.debug("We are {} and leader is {}", ourUrl, leaderUrl);
1325-
boolean isLeader = leaderUrl.equals(ourUrl);
1326-
assert !isLeader || replica.getType().leaderEligible
1327-
: replica.getType().name() + " replica became leader!";
1320+
// Check if we are the (new) leader before deciding if/what type of recovery to do
1321+
boolean isLeader = false;
1322+
if (replica.getType().leaderEligible) {
1323+
// if we are eligible to be a leader, then we might currently be participating in leader
1324+
// election.
1325+
1326+
// in this case, we want to wait for the leader as long as the leader might
1327+
// wait for a vote, at least - but also long enough that a large cluster has
1328+
// time to get its act together
1329+
String leaderUrl = getLeader(cloudDesc, leaderVoteWait + 600000);
1330+
log.debug("We are {} and leader is {}", ourUrl, leaderUrl);
1331+
isLeader = leaderUrl.equals(ourUrl);
1332+
}
13281333

13291334
try (SolrCore core = cc.getCore(desc.getName())) {
13301335

@@ -1368,6 +1373,7 @@ public String register(
13681373
}
13691374
}
13701375
}
1376+
13711377
boolean didRecovery =
13721378
checkRecovery(
13731379
recoverReloadedCores,

0 commit comments

Comments
 (0)