Skip to content

Commit 8484b71

Browse files
authored
ES|QL: Make skip_unavailable catch all errors (#128163)
* Make skip_unavailable catch all errors
1 parent 7532ad5 commit 8484b71

File tree

12 files changed

+455
-80
lines changed

12 files changed

+455
-80
lines changed

docs/changelog/128163.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
pr: 128163
2+
summary: Make `skip_unavailable` catch all errors
3+
area: ES|QL
4+
type: breaking
5+
issues: [ ]
6+
breaking:
7+
title: Cluster setting "skip_unavailable" catches all runtime errors
8+
area: ES|QL
9+
details: "If `skip_unavailable` is set to `true`, the runtime errors from this cluster\
10+
\ do not lead to a failure of the query. Instead, the cluster is set to `skipped`\
11+
\ or `partial` status, and the query execution continues. This is a breaking change\
12+
\ from previous versions, where `skip_unavailable` only applied to errors related\
13+
\ to a cluster being unavailable."
14+
impact: "The errors on remote clusters, e.g. missing indices, will not lead to a\
15+
\ failure of the query. Instead, the cluster is set to `skipped` or `partial` status\
16+
\ in the response metadata."
17+
notable: false

server/src/test/java/org/elasticsearch/action/admin/cluster/stats/CCSTelemetrySnapshotTests.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ public void testToXContent() throws IOException {
325325
XContentHelper.toXContent(snapshot, XContentType.JSON, randomBoolean()),
326326
XContentType.JSON
327327
);
328+
assertToXContentEquivalent(new BytesArray(expectedJson), new BytesArray(snapshot.toString()), XContentType.JSON);
328329
}
329330

330331
private String readJSONFromResource(String fileName) throws IOException {

x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/RequestIndexFilteringIT.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
1111

1212
import org.apache.http.HttpHost;
13+
import org.elasticsearch.Version;
1314
import org.elasticsearch.client.Request;
1415
import org.elasticsearch.client.ResponseException;
1516
import org.elasticsearch.client.RestClient;
@@ -31,9 +32,11 @@
3132
import java.util.List;
3233
import java.util.Map;
3334

35+
import static org.elasticsearch.test.ListMatcher.matchesList;
3436
import static org.elasticsearch.test.MapMatcher.assertMap;
3537
import static org.elasticsearch.test.MapMatcher.matchesMap;
3638
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
39+
import static org.hamcrest.Matchers.hasSize;
3740
import static org.hamcrest.Matchers.instanceOf;
3841

3942
@ThreadLeakFilters(filters = TestClustersThreadFilter.class)
@@ -138,4 +141,33 @@ protected void assertQueryResult(Map<String, Object> result, Matcher<?> columnMa
138141
assertMap(result, matcher);
139142
}
140143

144+
private static boolean checkVersion(org.elasticsearch.Version version) {
145+
return version.onOrAfter(Version.fromString("9.1.0"));
146+
// TODO: enable this when ported to 8.x
147+
// || (version.onOrAfter(Version.fromString("8.19.0")) && version.before(Version.fromString("9.0.0")));
148+
}
149+
150+
// We need a separate test since remote missing indices and local missing indices now work differently
151+
public void testIndicesDontExistRemote() throws IOException {
152+
// Exclude old versions
153+
assumeTrue("Only works with latest support_unavailable logic", checkVersion(Clusters.localClusterVersion()));
154+
int docsTest1 = randomIntBetween(1, 5);
155+
indexTimestampData(docsTest1, "test1", "2024-11-26", "id1");
156+
157+
Map<String, Object> result = runEsql(
158+
timestampFilter("gte", "2020-01-01").query("FROM *:foo,*:test1 METADATA _index | SORT id1 | KEEP _index, id*")
159+
);
160+
@SuppressWarnings("unchecked")
161+
var columns = (List<List<Object>>) result.get("columns");
162+
assertThat(
163+
columns,
164+
matchesList().item(matchesMap().entry("name", "_index").entry("type", "keyword"))
165+
.item(matchesMap().entry("name", "id1").entry("type", "integer"))
166+
);
167+
@SuppressWarnings("unchecked")
168+
var values = (List<List<Object>>) result.get("values");
169+
// TODO: for now, we return empty result, but eventually it should return records from test1
170+
assertThat(values, hasSize(0));
171+
172+
}
141173
}

x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/RequestIndexFilteringTestCase.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ public void testFieldNameTypo() throws IOException {
195195
}
196196

197197
public void testIndicesDontExist() throws IOException {
198-
int docsTest1 = 0; // we are interested only in the created index, not necessarily that it has data
198+
int docsTest1 = randomIntBetween(1, 5);
199199
indexTimestampData(docsTest1, "test1", "2024-11-26", "id1");
200200

201201
ResponseException e = expectThrows(ResponseException.class, () -> runEsql(timestampFilter("gte", "2020-01-01").query(from("foo"))));
@@ -208,7 +208,7 @@ public void testIndicesDontExist() throws IOException {
208208
assertThat(e.getMessage(), containsString("verification_exception"));
209209
assertThat(e.getMessage(), anyOf(containsString("Unknown index [foo*]"), containsString("Unknown index [remote_cluster:foo*]")));
210210

211-
e = expectThrows(ResponseException.class, () -> runEsql(timestampFilter("gte", "2020-01-01").query(from("foo", "test1"))));
211+
e = expectThrows(ResponseException.class, () -> runEsql(timestampFilter("gte", "2020-01-01").query("FROM foo, test1")));
212212
assertEquals(404, e.getResponse().getStatusLine().getStatusCode());
213213
assertThat(e.getMessage(), containsString("index_not_found_exception"));
214214
assertThat(e.getMessage(), anyOf(containsString("no such index [foo]"), containsString("no such index [remote_cluster:foo]")));
@@ -231,7 +231,7 @@ public void testIndicesDontExist() throws IOException {
231231
}
232232
}
233233

234-
private static RestEsqlTestCase.RequestObjectBuilder timestampFilter(String op, String date) throws IOException {
234+
protected static RestEsqlTestCase.RequestObjectBuilder timestampFilter(String op, String date) throws IOException {
235235
return requestObjectBuilder().filter(b -> {
236236
b.startObject("range");
237237
{

x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/CrossClusterQueryIT.java

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import org.elasticsearch.transport.TransportService;
3030
import org.elasticsearch.xcontent.XContentBuilder;
3131
import org.elasticsearch.xcontent.json.JsonXContent;
32-
import org.elasticsearch.xpack.esql.EsqlTestUtils;
3332
import org.elasticsearch.xpack.esql.VerificationException;
3433
import org.elasticsearch.xpack.esql.plugin.QueryPragmas;
3534

@@ -260,6 +259,7 @@ public void testSearchesAgainstNonMatchingIndices() throws Exception {
260259
Map<String, Object> testClusterInfo = setupClusters(numClusters);
261260
int localNumShards = (Integer) testClusterInfo.get("local.num_shards");
262261
int remote1NumShards = (Integer) testClusterInfo.get("remote1.num_shards");
262+
int remote2NumShards = (Integer) testClusterInfo.get("remote2.num_shards");
263263
String localIndex = (String) testClusterInfo.get("local.index");
264264
String remote1Index = (String) testClusterInfo.get("remote1.index");
265265
String remote2Index = (String) testClusterInfo.get("remote2.index");
@@ -281,7 +281,23 @@ public void testSearchesAgainstNonMatchingIndices() throws Exception {
281281
{
282282
String q = "FROM logs*,cluster-a:nomatch";
283283
String expectedError = "Unknown index [cluster-a:nomatch]";
284+
setSkipUnavailable(REMOTE_CLUSTER_1, false);
284285
expectVerificationExceptionForQuery(q, expectedError, requestIncludeMeta);
286+
setSkipUnavailable(REMOTE_CLUSTER_1, true);
287+
try (EsqlQueryResponse resp = runQuery(q, requestIncludeMeta)) {
288+
assertThat(getValuesList(resp).size(), greaterThanOrEqualTo(1));
289+
EsqlExecutionInfo executionInfo = resp.getExecutionInfo();
290+
assertThat(executionInfo.isCrossClusterSearch(), is(true));
291+
assertThat(executionInfo.includeCCSMetadata(), equalTo(responseExpectMeta));
292+
assertExpectedClustersForMissingIndicesTests(
293+
executionInfo,
294+
List.of(
295+
new ExpectedCluster(LOCAL_CLUSTER, "logs*", EsqlExecutionInfo.Cluster.Status.SUCCESSFUL, localNumShards),
296+
new ExpectedCluster(REMOTE_CLUSTER_1, "nomatch", EsqlExecutionInfo.Cluster.Status.SKIPPED, 0)
297+
)
298+
);
299+
300+
}
285301
}
286302

287303
// No error since local non-matching index has wildcard and the remote cluster index expression matches
@@ -411,7 +427,34 @@ public void testSearchesAgainstNonMatchingIndices() throws Exception {
411427
String remote2IndexName = randomFrom(remote2Index, IDX_ALIAS, FILTERED_IDX_ALIAS);
412428
String q = Strings.format("FROM %s*,cluster-a:nomatch,%s:%s*", localIndexName, REMOTE_CLUSTER_2, remote2IndexName);
413429
String expectedError = "Unknown index [cluster-a:nomatch]";
430+
setSkipUnavailable(REMOTE_CLUSTER_1, false);
414431
expectVerificationExceptionForQuery(q, expectedError, requestIncludeMeta);
432+
setSkipUnavailable(REMOTE_CLUSTER_1, true);
433+
try (EsqlQueryResponse resp = runQuery(q, requestIncludeMeta)) {
434+
assertThat(getValuesList(resp).size(), greaterThanOrEqualTo(1));
435+
EsqlExecutionInfo executionInfo = resp.getExecutionInfo();
436+
assertThat(executionInfo.isCrossClusterSearch(), is(true));
437+
assertThat(executionInfo.includeCCSMetadata(), equalTo(responseExpectMeta));
438+
assertExpectedClustersForMissingIndicesTests(
439+
executionInfo,
440+
List.of(
441+
new ExpectedCluster(
442+
LOCAL_CLUSTER,
443+
localIndexName + "*",
444+
EsqlExecutionInfo.Cluster.Status.SUCCESSFUL,
445+
localNumShards
446+
),
447+
new ExpectedCluster(REMOTE_CLUSTER_1, "nomatch", EsqlExecutionInfo.Cluster.Status.SKIPPED, 0),
448+
new ExpectedCluster(
449+
REMOTE_CLUSTER_2,
450+
remote2IndexName + "*",
451+
EsqlExecutionInfo.Cluster.Status.SUCCESSFUL,
452+
remote2NumShards
453+
)
454+
)
455+
);
456+
457+
}
415458
}
416459
}
417460

@@ -454,8 +497,8 @@ public void assertExpectedClustersForMissingIndicesTests(EsqlExecutionInfo execu
454497
assertThat(msg, cluster.getSkippedShards(), equalTo(expectedCluster.totalShards()));
455498
assertThat(msg, cluster.getFailures().size(), equalTo(1));
456499
assertThat(msg, cluster.getFailures().get(0).getCause(), instanceOf(VerificationException.class));
457-
String expectedMsg = "Unknown index [" + expectedCluster.indexExpression() + "]";
458-
assertThat(msg, cluster.getFailures().get(0).getCause().getMessage(), containsString(expectedMsg));
500+
assertThat(msg, cluster.getFailures().get(0).getCause().getMessage(), containsString("Unknown index"));
501+
assertThat(msg, cluster.getFailures().get(0).getCause().getMessage(), containsString(expectedCluster.indexExpression()));
459502
hasSkipped = true;
460503
}
461504
// currently failed shards is always zero - change this once we start allowing partial data for individual shard failures
@@ -809,18 +852,30 @@ public void testWarnings() throws Exception {
809852
assertTrue(latch.await(30, TimeUnit.SECONDS));
810853
}
811854

812-
// Non-disconnect remote failures still fail the request even if skip_unavailable is true
855+
// Non-disconnect remote failures lead to skipping if skip_unavailable is true
813856
public void testRemoteFailureSkipUnavailableTrue() throws IOException {
814857
Map<String, Object> testClusterInfo = setupFailClusters();
815858
String localIndex = (String) testClusterInfo.get("local.index");
816859
String remote1Index = (String) testClusterInfo.get("remote.index");
817860
String q = Strings.format("FROM %s,cluster-a:%s*", localIndex, remote1Index);
818861

819-
Exception error = expectThrows(Exception.class, () -> runQuery(q, false));
820-
error = EsqlTestUtils.unwrapIfWrappedInRemoteException(error);
862+
try (EsqlQueryResponse resp = runQuery(q, randomBoolean())) {
863+
EsqlExecutionInfo executionInfo = resp.getExecutionInfo();
864+
assertNotNull(executionInfo);
865+
assertThat(executionInfo.isCrossClusterSearch(), is(true));
866+
assertThat(executionInfo.overallTook().millis(), greaterThanOrEqualTo(0L));
821867

822-
assertThat(error, instanceOf(IllegalStateException.class));
823-
assertThat(error.getMessage(), containsString("Accessing failing field"));
868+
EsqlExecutionInfo.Cluster remoteCluster = executionInfo.getCluster(REMOTE_CLUSTER_1);
869+
assertThat(remoteCluster.getIndexExpression(), equalTo("logs-2*"));
870+
assertThat(remoteCluster.getStatus(), equalTo(EsqlExecutionInfo.Cluster.Status.SKIPPED));
871+
assertThat(remoteCluster.getTook().millis(), greaterThanOrEqualTo(0L));
872+
assertThat(remoteCluster.getTotalShards(), equalTo(0));
873+
assertThat(remoteCluster.getSuccessfulShards(), equalTo(0));
874+
assertThat(remoteCluster.getSkippedShards(), equalTo(0));
875+
assertThat(remoteCluster.getFailedShards(), equalTo(0));
876+
assertThat(remoteCluster.getFailures(), hasSize(1));
877+
assertThat(remoteCluster.getFailures().getFirst().reason(), containsString("Accessing failing field"));
878+
}
824879
}
825880

826881
private static void assertClusterMetadataInResponse(EsqlQueryResponse resp, boolean responseExpectMeta) {

x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/CrossClusterQueryWithPartialResultsIT.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ public void sendResponse(Exception exception) {
227227
assertNotNull(unwrapped);
228228
assertThat(unwrapped.getMessage(), equalTo(simulatedFailure.getMessage()));
229229
}
230+
// The failure leads to skipped regardless of allowPartialResults
230231
request.allowPartialResults(true);
231232
try (var resp = runQuery(request)) {
232233
assertTrue(resp.isPartial());
@@ -289,7 +290,7 @@ public void testFailToStartRequestOnRemoteCluster() throws Exception {
289290
assertThat(returnedIds, equalTo(local.okIds));
290291
assertClusterSuccess(resp, LOCAL_CLUSTER, local.okShards);
291292
EsqlExecutionInfo.Cluster remoteInfo = resp.getExecutionInfo().getCluster(REMOTE_CLUSTER_1);
292-
assertThat(remoteInfo.getStatus(), equalTo(EsqlExecutionInfo.Cluster.Status.PARTIAL));
293+
assertThat(remoteInfo.getStatus(), equalTo(EsqlExecutionInfo.Cluster.Status.SKIPPED));
293294
assertClusterFailure(resp, REMOTE_CLUSTER_1, simulatedFailure.getMessage());
294295
}
295296
} finally {

x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/CrossClusterUsageTelemetryIT.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -118,24 +118,23 @@ public void testFailed() throws Exception {
118118

119119
assertThat(telemetry.getTotalCount(), equalTo(1L));
120120
assertThat(telemetry.getSuccessCount(), equalTo(0L));
121-
assertThat(telemetry.getByRemoteCluster().size(), equalTo(0));
121+
assertThat(telemetry.getByRemoteCluster().size(), equalTo(1));
122122
assertThat(telemetry.getRemotesPerSearchAvg(), equalTo(2.0));
123123
assertThat(telemetry.getRemotesPerSearchMax(), equalTo(2L));
124-
assertThat(telemetry.getSearchCountWithSkippedRemotes(), equalTo(0L));
124+
assertThat(telemetry.getSearchCountWithSkippedRemotes(), equalTo(1L));
125125
Map<String, Long> expectedFailure = Map.of(CCSUsageTelemetry.Result.NOT_FOUND.getName(), 1L);
126126
assertThat(telemetry.getFailureReasons(), equalTo(expectedFailure));
127127

128-
// this is only for cluster-a so no skipped remotes
128+
// this is only for cluster-a now
129129
telemetry = getTelemetryFromFailedQuery("from logs-*,cluster-a:no_such_index | stats sum (v)");
130130
assertThat(telemetry.getTotalCount(), equalTo(2L));
131131
assertThat(telemetry.getSuccessCount(), equalTo(0L));
132-
assertThat(telemetry.getByRemoteCluster().size(), equalTo(0));
132+
assertThat(telemetry.getByRemoteCluster().size(), equalTo(1));
133133
assertThat(telemetry.getRemotesPerSearchAvg(), equalTo(2.0));
134134
assertThat(telemetry.getRemotesPerSearchMax(), equalTo(2L));
135-
assertThat(telemetry.getSearchCountWithSkippedRemotes(), equalTo(0L));
135+
assertThat(telemetry.getSearchCountWithSkippedRemotes(), equalTo(1L));
136136
expectedFailure = Map.of(CCSUsageTelemetry.Result.NOT_FOUND.getName(), 2L);
137137
assertThat(telemetry.getFailureReasons(), equalTo(expectedFailure));
138-
assertThat(telemetry.getByRemoteCluster().size(), equalTo(0));
139138
}
140139

141140
public void testRemoteOnly() throws Exception {

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plugin/ClusterComputeHandler.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,14 @@ void startComputeOnRemoteCluster(
8585
final AtomicReference<ComputeResponse> finalResponse = new AtomicReference<>();
8686
listener = listener.delegateResponse((l, e) -> {
8787
final boolean receivedResults = finalResponse.get() != null || pagesFetched.get();
88-
if (receivedResults == false && EsqlCCSUtils.shouldIgnoreRuntimeError(executionInfo, clusterAlias, e)) {
89-
EsqlCCSUtils.markClusterWithFinalStateAndNoShards(executionInfo, clusterAlias, EsqlExecutionInfo.Cluster.Status.SKIPPED, e);
90-
l.onResponse(DriverCompletionInfo.EMPTY);
91-
} else if (configuration.allowPartialResults() && EsqlCCSUtils.canAllowPartial(e)) {
92-
EsqlCCSUtils.markClusterWithFinalStateAndNoShards(executionInfo, clusterAlias, EsqlExecutionInfo.Cluster.Status.PARTIAL, e);
88+
if (EsqlCCSUtils.shouldIgnoreRuntimeError(executionInfo, clusterAlias, e)
89+
|| (configuration.allowPartialResults() && EsqlCCSUtils.canAllowPartial(e))) {
90+
EsqlCCSUtils.markClusterWithFinalStateAndNoShards(
91+
executionInfo,
92+
clusterAlias,
93+
receivedResults ? EsqlExecutionInfo.Cluster.Status.PARTIAL : EsqlExecutionInfo.Cluster.Status.SKIPPED,
94+
e
95+
);
9396
l.onResponse(DriverCompletionInfo.EMPTY);
9497
} else {
9598
l.onFailure(e);

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlCCSUtils.java

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -221,15 +221,21 @@ static void updateExecutionInfoWithClustersWithNoMatchingIndices(
221221
"Unknown index [%s]",
222222
(c.equals(RemoteClusterAware.LOCAL_CLUSTER_GROUP_KEY) ? indexExpression : c + ":" + indexExpression)
223223
);
224-
if (fatalErrorMessage == null) {
225-
fatalErrorMessage = error;
226-
} else {
227-
fatalErrorMessage += "; " + error;
224+
if (executionInfo.isSkipUnavailable(c) == false) {
225+
if (fatalErrorMessage == null) {
226+
fatalErrorMessage = error;
227+
} else {
228+
fatalErrorMessage += "; " + error;
229+
}
228230
}
229231
if (filter == null) {
230-
// Not very useful since we don't send metadata on errors now, but may be useful in the future
231232
// We check for filter since the filter may be the reason why the index is missing, and then it's ok
232-
markClusterWithFinalStateAndNoShards(executionInfo, c, Cluster.Status.FAILED, new VerificationException(error));
233+
markClusterWithFinalStateAndNoShards(
234+
executionInfo,
235+
c,
236+
executionInfo.isSkipUnavailable(c) ? Cluster.Status.SKIPPED : Cluster.Status.FAILED,
237+
new VerificationException(error)
238+
);
233239
}
234240
} else {
235241
if (indexResolution.isValid()) {
@@ -364,11 +370,7 @@ public static void markClusterWithFinalStateAndNoShards(
364370
* We will ignore the error if it's remote unavailable and the cluster is marked to skip unavailable.
365371
*/
366372
public static boolean shouldIgnoreRuntimeError(EsqlExecutionInfo executionInfo, String clusterAlias, Exception e) {
367-
if (executionInfo.isSkipUnavailable(clusterAlias) == false) {
368-
return false;
369-
}
370-
371-
return ExceptionsHelper.isRemoteUnavailableException(e);
373+
return executionInfo.isSkipUnavailable(clusterAlias);
372374
}
373375

374376
/**

0 commit comments

Comments
 (0)