Skip to content

Commit b657ec0

Browse files
authored
SOLR-17453: Leverage waitForState() instead of busy waiting (#2737)
Leverage waitForState() instead of busy waiting in CREATE, MIGRATE, REINDEXCOLLECTION, MOVEREPLICA commands, and in some tests.
1 parent 2550e7e commit b657ec0

24 files changed

+419
-598
lines changed

solr/CHANGES.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ Improvements
143143
when PKI is used between nodes. (Jason Gerlowski)
144144

145145
* SOLR-17383: Resolved overlapping arguments in the Solr CLI. Removed duplicative but differing arguments,
146-
consolidated use of short form arguments -v to not have differing meanings based on tool. Provide deprecation warning
146+
consolidated use of short form arguments -v to not have differing meanings based on tool. Provide deprecation warning
147147
in command line when deprecated arguments are used. (Eric Pugh, Christos Malliaridis)
148148

149149
* SOLR-17256: Deprecate SolrRequest `setBasePath` and `getBasePath` methods. SolrJ users wishing to temporarily
@@ -181,6 +181,8 @@ Optimizations
181181

182182
* SOLR-16503: Switched from HTTP1 to HTTP2 in SolrClientCloudManager by replacing CloudLegacySolrClient with CloudHttp2SolrClient. (Sanjay Dutt, David Smiley)
183183

184+
* SOLR-17453: Leverage waitForState() instead of busy waiting in CREATE, MIGRATE, REINDEXCOLLECTION, MOVEREPLICA commands, and in some tests. (Pierre Salagnac)
185+
184186
Bug Fixes
185187
---------------------
186188
* SOLR-12429: Uploading a configset with a symbolic link produces a IOException. Now a error message to user generated instead. (Eric Pugh)

solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,11 @@
3636
import java.util.List;
3737
import java.util.Map;
3838
import java.util.NoSuchElementException;
39+
import java.util.Objects;
3940
import java.util.Properties;
4041
import java.util.concurrent.ConcurrentHashMap;
4142
import java.util.concurrent.TimeUnit;
43+
import java.util.concurrent.TimeoutException;
4244
import org.apache.solr.client.solrj.cloud.AlreadyExistsException;
4345
import org.apache.solr.client.solrj.cloud.BadVersionException;
4446
import org.apache.solr.client.solrj.cloud.DelegatingCloudManager;
@@ -221,24 +223,19 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList<Objec
221223
}
222224

223225
// wait for a while until we see the collection
224-
TimeOut waitUntil =
225-
new TimeOut(30, TimeUnit.SECONDS, ccc.getSolrCloudManager().getTimeSource());
226-
boolean created = false;
227-
while (!waitUntil.hasTimedOut()) {
228-
waitUntil.sleep(100);
229-
created = ccc.getSolrCloudManager().getClusterState().hasCollection(collectionName);
230-
if (created) break;
231-
}
232-
if (!created) {
226+
try {
227+
newColl =
228+
zkStateReader.waitForState(collectionName, 30, TimeUnit.SECONDS, Objects::nonNull);
229+
} catch (TimeoutException e) {
233230
throw new SolrException(
234231
SolrException.ErrorCode.SERVER_ERROR,
235-
"Could not fully create collection: " + collectionName);
232+
"Could not fully create collection: " + collectionName,
233+
e);
236234
}
237235

238236
// refresh cluster state (value read below comes from Zookeeper watch firing following the
239237
// update done previously, be it by Overseer or by this thread when updates are distributed)
240238
clusterState = ccc.getSolrCloudManager().getClusterState();
241-
newColl = clusterState.getCollection(collectionName);
242239
}
243240

244241
final List<ReplicaPosition> replicaPositions;

solr/core/src/java/org/apache/solr/cloud/api/collections/MigrateCmd.java

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import java.util.HashMap;
3232
import java.util.Map;
3333
import java.util.concurrent.TimeUnit;
34+
import java.util.concurrent.TimeoutException;
3435
import org.apache.solr.client.solrj.request.CoreAdminRequest;
3536
import org.apache.solr.cloud.DistributedClusterStateUpdater;
3637
import org.apache.solr.cloud.Overseer;
@@ -52,7 +53,6 @@
5253
import org.apache.solr.common.util.NamedList;
5354
import org.apache.solr.common.util.Utils;
5455
import org.apache.solr.handler.component.ShardHandler;
55-
import org.apache.solr.util.TimeOut;
5656
import org.slf4j.Logger;
5757
import org.slf4j.LoggerFactory;
5858

@@ -272,27 +272,26 @@ private void migrateKey(
272272

273273
// wait for a while until we see the new rule
274274
log.info("Waiting to see routing rule updated in clusterstate");
275-
TimeOut waitUntil =
276-
new TimeOut(60, TimeUnit.SECONDS, ccc.getSolrCloudManager().getTimeSource());
277-
boolean added = false;
278-
while (!waitUntil.hasTimedOut()) {
279-
waitUntil.sleep(100);
280-
sourceCollection = zkStateReader.getClusterState().getCollection(sourceCollection.getName());
281-
sourceSlice = sourceCollection.getSlice(sourceSlice.getName());
282-
Map<String, RoutingRule> rules = sourceSlice.getRoutingRules();
283-
if (rules != null) {
284-
RoutingRule rule = rules.get(sourceRouter.getRouteKeyNoSuffix(splitKey) + "!");
285-
if (rule != null && rule.getRouteRanges().contains(splitRange)) {
286-
added = true;
287-
break;
288-
}
289-
}
290-
}
291-
if (!added) {
275+
276+
try {
277+
sourceCollection =
278+
zkStateReader.waitForState(
279+
sourceCollection.getName(),
280+
60,
281+
TimeUnit.SECONDS,
282+
c -> {
283+
Slice s = c.getSlice(sourceSlice.getName());
284+
Map<String, RoutingRule> rules = s.getRoutingRules();
285+
if (rules != null) {
286+
RoutingRule rule = rules.get(sourceRouter.getRouteKeyNoSuffix(splitKey) + "!");
287+
return rule != null && rule.getRouteRanges().contains(splitRange);
288+
}
289+
return false;
290+
});
291+
} catch (TimeoutException e) {
292292
throw new SolrException(
293-
SolrException.ErrorCode.SERVER_ERROR, "Could not add routing rule: " + m);
293+
SolrException.ErrorCode.SERVER_ERROR, "Could not add routing rule: " + m, e);
294294
}
295-
296295
log.info("Routing rule added successfully");
297296

298297
// Create temp core on source shard

solr/core/src/java/org/apache/solr/cloud/api/collections/MoveReplicaCmd.java

Lines changed: 19 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import java.util.List;
3434
import java.util.Locale;
3535
import java.util.concurrent.TimeUnit;
36+
import java.util.concurrent.TimeoutException;
3637
import org.apache.solr.cloud.ActiveReplicaWatcher;
3738
import org.apache.solr.common.SolrCloseableLatch;
3839
import org.apache.solr.common.SolrException;
@@ -46,7 +47,6 @@
4647
import org.apache.solr.common.params.CoreAdminParams;
4748
import org.apache.solr.common.util.NamedList;
4849
import org.apache.solr.common.util.Utils;
49-
import org.apache.solr.util.TimeOut;
5050
import org.slf4j.Logger;
5151
import org.slf4j.LoggerFactory;
5252

@@ -161,9 +161,7 @@ private void moveReplica(
161161
dataDir.toString(),
162162
targetNode,
163163
async,
164-
coll,
165164
replica,
166-
slice,
167165
timeout,
168166
waitForFinalState);
169167
} else {
@@ -187,9 +185,7 @@ private void moveHdfsReplica(
187185
String dataDir,
188186
String targetNode,
189187
String async,
190-
DocCollection coll,
191188
Replica replica,
192-
Slice slice,
193189
int timeout,
194190
boolean waitForFinalState)
195191
throws Exception {
@@ -198,8 +194,8 @@ private void moveHdfsReplica(
198194
skipCreateReplicaInClusterState = "false";
199195
ZkNodeProps removeReplicasProps =
200196
new ZkNodeProps(
201-
COLLECTION_PROP, coll.getName(),
202-
SHARD_ID_PROP, slice.getName(),
197+
COLLECTION_PROP, replica.getCollection(),
198+
SHARD_ID_PROP, replica.getShard(),
203199
REPLICA_PROP, replica.getName());
204200
removeReplicasProps.getProperties().put(CoreAdminParams.DELETE_DATA_DIR, false);
205201
removeReplicasProps.getProperties().put(CoreAdminParams.DELETE_INDEX, false);
@@ -217,26 +213,23 @@ private void moveHdfsReplica(
217213
String.format(
218214
Locale.ROOT,
219215
"Failed to cleanup replica collection=%s shard=%s name=%s, failure=%s",
220-
coll.getName(),
221-
slice.getName(),
216+
replica.getCollection(),
217+
replica.getShard(),
222218
replica.getName(),
223219
deleteResult.get("failure"));
224220
log.warn(errorString);
225221
results.add("failure", errorString);
226222
return;
227223
}
228224

229-
TimeOut timeOut =
230-
new TimeOut(20L, TimeUnit.SECONDS, ccc.getSolrCloudManager().getTimeSource());
231-
while (!timeOut.hasTimedOut()) {
232-
coll = ccc.getZkStateReader().getClusterState().getCollection(coll.getName());
233-
if (coll.getReplica(replica.getName()) != null) {
234-
timeOut.sleep(100);
235-
} else {
236-
break;
237-
}
238-
}
239-
if (timeOut.hasTimedOut()) {
225+
try {
226+
ccc.getZkStateReader()
227+
.waitForState(
228+
replica.getCollection(),
229+
20L,
230+
TimeUnit.SECONDS,
231+
c -> c.getReplica(replica.getName()) != null);
232+
} catch (TimeoutException e) {
240233
results.add("failure", "Still see deleted replica in clusterstate!");
241234
return;
242235
}
@@ -246,9 +239,9 @@ private void moveHdfsReplica(
246239
ZkNodeProps addReplicasProps =
247240
new ZkNodeProps(
248241
COLLECTION_PROP,
249-
coll.getName(),
242+
replica.getCollection(),
250243
SHARD_ID_PROP,
251-
slice.getName(),
244+
replica.getShard(),
252245
CoreAdminParams.NODE,
253246
targetNode,
254247
CoreAdminParams.CORE_NODE_NAME,
@@ -277,8 +270,8 @@ private void moveHdfsReplica(
277270
String.format(
278271
Locale.ROOT,
279272
"Failed to create replica for collection=%s shard=%s" + " on node=%s, failure=%s",
280-
coll.getName(),
281-
slice.getName(),
273+
replica.getCollection(),
274+
replica.getShard(),
282275
targetNode,
283276
addResult.get("failure"));
284277
results.add("failure", errorString);
@@ -302,8 +295,8 @@ private void moveHdfsReplica(
302295
String.format(
303296
Locale.ROOT,
304297
"Failed to create replica for collection=%s shard=%s" + " on node=%s, failure=%s",
305-
coll.getName(),
306-
slice.getName(),
298+
replica.getCollection(),
299+
replica.getShard(),
307300
targetNode,
308301
addResult.get("failure"));
309302
log.warn(errorString);

solr/core/src/java/org/apache/solr/cloud/api/collections/ReindexCollectionCmd.java

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@
2525
import java.util.List;
2626
import java.util.Locale;
2727
import java.util.Map;
28+
import java.util.Objects;
2829
import java.util.TreeMap;
2930
import java.util.concurrent.TimeUnit;
31+
import java.util.concurrent.TimeoutException;
3032
import java.util.concurrent.atomic.AtomicInteger;
3133
import java.util.function.Function;
3234
import java.util.stream.Collectors;
@@ -360,22 +362,17 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList<Objec
360362
CollectionHandlingUtils.checkResults(
361363
"creating checkpoint collection " + chkCollection, cmdResults, true);
362364
// wait for a while until we see both collections
363-
TimeOut waitUntil =
364-
new TimeOut(30, TimeUnit.SECONDS, ccc.getSolrCloudManager().getTimeSource());
365-
boolean created = false;
366-
while (!waitUntil.hasTimedOut()) {
367-
waitUntil.sleep(100);
368-
// this also refreshes our local var clusterState
369-
clusterState = ccc.getSolrCloudManager().getClusterState();
370-
created =
371-
clusterState.hasCollection(targetCollection)
372-
&& clusterState.hasCollection(chkCollection);
373-
if (created) break;
374-
}
375-
if (!created) {
365+
try {
366+
for (String col : List.of(targetCollection, chkCollection)) {
367+
ccc.getZkStateReader().waitForState(col, 30, TimeUnit.SECONDS, Objects::nonNull);
368+
}
369+
} catch (TimeoutException e) {
376370
throw new SolrException(
377371
SolrException.ErrorCode.SERVER_ERROR, "Could not fully create temporary collection(s)");
378372
}
373+
374+
clusterState = ccc.getSolrCloudManager().getClusterState();
375+
379376
if (maybeAbort(collection)) {
380377
aborted = true;
381378
return;

solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
import java.util.Random;
5454
import java.util.Set;
5555
import java.util.concurrent.TimeUnit;
56+
import java.util.concurrent.TimeoutException;
5657
import java.util.function.Supplier;
5758
import javax.servlet.http.HttpServletRequest;
5859
import javax.servlet.http.HttpServletResponse;
@@ -96,7 +97,6 @@
9697
import org.apache.solr.common.util.SimpleOrderedMap;
9798
import org.apache.solr.common.util.StrUtils;
9899
import org.apache.solr.common.util.SuppressForbidden;
99-
import org.apache.solr.common.util.TimeSource;
100100
import org.apache.solr.common.util.Utils;
101101
import org.apache.solr.common.util.ValidatingJsonMap;
102102
import org.apache.solr.core.CoreContainer;
@@ -125,7 +125,6 @@
125125
import org.apache.solr.servlet.cache.Method;
126126
import org.apache.solr.update.processor.DistributingUpdateProcessorFactory;
127127
import org.apache.solr.util.RTimerTree;
128-
import org.apache.solr.util.TimeOut;
129128
import org.apache.solr.util.tracing.TraceUtils;
130129
import org.apache.zookeeper.KeeperException;
131130
import org.slf4j.Logger;
@@ -388,18 +387,16 @@ protected void autoCreateSystemColl(String corename) throws Exception {
388387
+ " collection: "
389388
+ Utils.toJSONString(rsp.getValues()));
390389
}
391-
TimeOut timeOut = new TimeOut(3, TimeUnit.SECONDS, TimeSource.NANO_TIME);
392-
for (; ; ) {
393-
if (cores.getZkController().getClusterState().getCollectionOrNull(SYSTEM_COLL) != null) {
394-
break;
395-
} else {
396-
if (timeOut.hasTimedOut()) {
397-
throw new SolrException(
398-
ErrorCode.SERVER_ERROR,
399-
"Could not find " + SYSTEM_COLL + " collection even after 3 seconds");
400-
}
401-
timeOut.sleep(50);
402-
}
390+
391+
try {
392+
cores
393+
.getZkController()
394+
.getZkStateReader()
395+
.waitForState(SYSTEM_COLL, 3, TimeUnit.SECONDS, Objects::nonNull);
396+
} catch (TimeoutException e) {
397+
throw new SolrException(
398+
ErrorCode.SERVER_ERROR,
399+
"Could not find " + SYSTEM_COLL + " collection even after 3 seconds");
403400
}
404401

405402
action = RETRY;

solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPullReplicasTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,7 @@ public void test() throws Exception {
343343
List<Integer> numShardsNumReplicas = new ArrayList<>(2);
344344
numShardsNumReplicas.add(1);
345345
numShardsNumReplicas.add(1 + getPullReplicaCount());
346-
checkForCollection("testcollection", numShardsNumReplicas, null);
346+
checkForCollection("testcollection", numShardsNumReplicas);
347347

348348
testSuccessful = true;
349349
} finally {

solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPullReplicasTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ public void test() throws Exception {
255255
List<Integer> numShardsNumReplicas = new ArrayList<>(2);
256256
numShardsNumReplicas.add(1);
257257
numShardsNumReplicas.add(1 + getPullReplicaCount());
258-
checkForCollection("testcollection", numShardsNumReplicas, null);
258+
checkForCollection("testcollection", numShardsNumReplicas);
259259
}
260260

261261
private void tryDelete() throws Exception {

solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -488,11 +488,13 @@ private JettySolrRunner getJettyForReplica(Replica replica) {
488488
}
489489

490490
private void waitForNodeLeave(String lostNodeName) throws InterruptedException {
491-
ZkStateReader reader = cluster.getZkStateReader();
492-
TimeOut timeOut = new TimeOut(20, TimeUnit.SECONDS, TimeSource.NANO_TIME);
493-
while (reader.getClusterState().getLiveNodes().contains(lostNodeName)) {
494-
Thread.sleep(100);
495-
if (timeOut.hasTimedOut()) fail("Wait for " + lostNodeName + " to leave failed!");
491+
492+
try {
493+
cluster
494+
.getZkStateReader()
495+
.waitForLiveNodes(20, TimeUnit.SECONDS, (o, n) -> !n.contains(lostNodeName));
496+
} catch (TimeoutException e) {
497+
fail("Wait for " + lostNodeName + " to leave failed!");
496498
}
497499
}
498500

0 commit comments

Comments
 (0)