diff --git a/changelog/unreleased/SOLR-18002-add-unresponsive-servers-to-zombie-list.yml b/changelog/unreleased/SOLR-18002-add-unresponsive-servers-to-zombie-list.yml new file mode 100644 index 00000000000..f0e529ff639 --- /dev/null +++ b/changelog/unreleased/SOLR-18002-add-unresponsive-servers-to-zombie-list.yml @@ -0,0 +1,8 @@ +# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc +title: Add unresponsive servers to zombie list +type: fixed # added, changed, fixed, deprecated, removed, dependency_update, security, other +authors: + - name: James Vanneman +links: + - name: SOLR-18002 + url: https://issues.apache.org/jira/browse/SOLR-18002 diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java index 89ffbd707a0..b7af88c2f7e 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBHttp2SolrClient.java @@ -24,6 +24,7 @@ import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicReference; import org.apache.solr.client.solrj.ResponseParser; import org.apache.solr.client.solrj.SolrClient; @@ -282,7 +283,7 @@ private void onFailedRequest( } } catch (SolrServerException e) { Throwable rootCause = e.getRootCause(); - if (!isNonRetryable && rootCause instanceof IOException) { + if (!isNonRetryable && (rootCause instanceof IOException || rootCause instanceof TimeoutException)) { listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e, true); } else if (isNonRetryable && rootCause instanceof ConnectException) { listener.onFailure((!isZombie) ? 
makeServerAZombie(endpoint, e) : e, true); diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java index b2560bc7eae..76bb79e3c80 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java @@ -38,6 +38,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import org.apache.solr.client.solrj.ResponseParser; @@ -554,7 +555,7 @@ protected Exception doRequest( } } catch (SolrServerException e) { Throwable rootCause = e.getRootCause(); - if (!isNonRetryable && rootCause instanceof IOException) { + if (!isNonRetryable && (rootCause instanceof IOException || rootCause instanceof TimeoutException)) { ex = (!isZombie) ? makeServerAZombie(baseUrl, e) : e; } else if (isNonRetryable && rootCause instanceof ConnectException) { ex = (!isZombie) ? 
makeServerAZombie(baseUrl, e) : e; diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LBHttp2SolrClientIntegrationTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LBHttp2SolrClientIntegrationTest.java index 3f24984f90e..f085ce68613 100644 --- a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LBHttp2SolrClientIntegrationTest.java +++ b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LBHttp2SolrClientIntegrationTest.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.lang.invoke.MethodHandles; +import java.net.ServerSocket; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -32,6 +33,7 @@ import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.request.QueryRequest; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.SolrResponseBase; import org.apache.solr.common.SolrInputDocument; @@ -206,6 +208,29 @@ public void testTwoServers() throws Exception { } } + public void testTimeoutExceptionMarksServerAsZombie() throws Exception { + try (ZombieTestContext ctx = new ZombieTestContext()) { + LBSolrClient.Req lbReq = ctx.createQueryRequest(); + + try { + ctx.lbClient.request(lbReq); + } catch (Exception e) { + } + + ctx.assertZombieState(); + } + } + + public void testTimeoutExceptionMarksServerAsZombieAsyncRequest() throws Exception { + try (ZombieTestContext ctx = new ZombieTestContext()) { + LBSolrClient.Req lbReq = ctx.createQueryRequest(); + + ctx.lbClient.requestAsync(lbReq).exceptionally(e -> null).get(); + + ctx.assertZombieState(); + } + } + private LBSolrClient.Endpoint[] bootstrapBaseSolrEndpoints(int max) { LBSolrClient.Endpoint[] solrUrls = new LBSolrClient.Endpoint[max]; for (int i = 0; i < max; i++) { @@ -334,4 +359,60 @@ public void close() { } } } + 
+  /** Fixture: an LB client whose only endpoint accepts connections but never responds. */
+  private class ZombieTestContext implements AutoCloseable {
+    final ServerSocket blackhole;
+    final LBSolrClient.Endpoint nonRoutableEndpoint;
+    final Http2SolrClient delegateClient;
+    final LBHttp2SolrClient lbClient;
+
+    ZombieTestContext() throws Exception {
+      // Create a socket that lets a client connect but leaves it hanging until idleTimeout fires.
+      blackhole = new ServerSocket(0);
+      nonRoutableEndpoint =
+          new LBSolrClient.Endpoint("http://localhost:" + blackhole.getLocalPort() + "/solr");
+
+      delegateClient =
+          new Http2SolrClient.Builder()
+              .withConnectionTimeout(1000, TimeUnit.MILLISECONDS)
+              .withIdleTimeout(100, TimeUnit.MILLISECONDS)
+              .build();
+
+      lbClient =
+          new LBHttp2SolrClient.Builder<>(delegateClient)
+              .setAliveCheckInterval(500, TimeUnit.MILLISECONDS)
+              .build();
+    }
+
+    /** Builds a match-all query request whose only target is the blackhole endpoint. */
+    LBSolrClient.Req createQueryRequest() {
+      QueryRequest queryRequest = new QueryRequest(new SolrQuery("*:*"));
+      List<LBSolrClient.Endpoint> endpoints =
+          List.of(
+              new LBSolrClient.Endpoint(
+                  nonRoutableEndpoint.getBaseUrl(), solr[0].getDefaultCollection()));
+      return new LBSolrClient.Req(queryRequest, endpoints);
+    }
+
+    /** Asserts that the blackhole endpoint's "baseUrl/collection" key is in the zombie list. */
+    void assertZombieState() {
+      assertTrue(
+          "Non-routable endpoint should be marked as zombie due to timeout",
+          lbClient.zombieServers.containsKey(
+              nonRoutableEndpoint.getBaseUrl() + "/" + solr[0].getDefaultCollection()));
+    }
+
+    @Override
+    public void close() {
+      lbClient.close();
+      delegateClient.close();
+      try {
+        blackhole.close();
+      } catch (IOException ignored) {
+        // Deliberately ignored: closing the test-only socket is best-effort cleanup, so a
+        // failure here is not allowed to fail the test.
+      }
+    }
+  }
 }