Skip to content

Commit 149a674

Browse files
authored
[8.19] ES|QL: Make skip_unavailable catch all errors (elastic#128163) (elastic#128533)
* ES|QL: Make skip_unavailable catch all errors (elastic#128163) * Make skip_unavailable catch all errors (cherry picked from commit 8484b71) # Conflicts: # x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plugin/ClusterComputeHandler.java * Fix test * Test fixes
1 parent b0d3f52 commit 149a674

File tree

14 files changed

+474
-80
lines changed

14 files changed

+474
-80
lines changed

docs/changelog/128163.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
pr: 128163
2+
summary: Make `skip_unavailable` catch all errors
3+
area: ES|QL
4+
type: breaking
5+
issues: [ ]
6+
breaking:
7+
title: Cluster setting "skip_unavailable" catches all runtime errors
8+
area: ES|QL
9+
details: "If `skip_unavailable` is set to `true`, the runtime errors from this cluster\
10+
\ do not lead to a failure of the query. Instead, the cluster is set to `skipped`\
11+
\ or `partial` status, and the query execution continues. This is a breaking change\
12+
\ from previous versions, where `skip_unavailable` only applied to errors related\
13+
\ to a cluster being unavailable."
14+
impact: "The errors on remote clusters, e.g. missing indices, will not lead to a\
15+
\ failure of the query. Instead, the cluster is set to `skipped` or `partial` status\
16+
\ in the response metadata."
17+
notable: false

server/src/test/java/org/elasticsearch/action/admin/cluster/stats/CCSTelemetrySnapshotTests.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ public void testToXContent() throws IOException {
325325
XContentHelper.toXContent(snapshot, XContentType.JSON, randomBoolean()),
326326
XContentType.JSON
327327
);
328+
assertToXContentEquivalent(new BytesArray(expectedJson), new BytesArray(snapshot.toString()), XContentType.JSON);
328329
}
329330

330331
private String readJSONFromResource(String fileName) throws IOException {

x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/EsqlRestValidationIT.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,22 @@ private RestClient remoteClusterClient() throws IOException {
8181
return remoteClient;
8282
}
8383

84+
protected boolean isSkipUnavailable() {
85+
return true;
86+
}
87+
88+
@Override
89+
public void testAlias() throws IOException {
90+
assumeFalse("expecting skip_unavailable to be false", isSkipUnavailable());
91+
super.testAlias();
92+
}
93+
94+
@Override
95+
public void testExistentIndexWithoutWildcard() throws IOException {
96+
assumeFalse("expecting skip_unavailable to be false", isSkipUnavailable());
97+
super.testExistentIndexWithoutWildcard();
98+
}
99+
84100
@Before
85101
public void skipTestOnOldVersions() {
86102
assumeTrue("skip on old versions", Clusters.localClusterVersion().equals(Version.V_8_19_0));

x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/EsqlRestValidationSkipUnFalseIT.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,9 @@ public class EsqlRestValidationSkipUnFalseIT extends EsqlRestValidationIT {
2727
protected String getTestRestCluster() {
2828
return localCluster.getHttpAddresses();
2929
}
30+
31+
@Override
32+
protected boolean isSkipUnavailable() {
33+
return false;
34+
}
3035
}

x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/RequestIndexFilteringIT.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,11 @@
3333
import java.util.List;
3434
import java.util.Map;
3535

36+
import static org.elasticsearch.test.ListMatcher.matchesList;
3637
import static org.elasticsearch.test.MapMatcher.assertMap;
3738
import static org.elasticsearch.test.MapMatcher.matchesMap;
3839
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
40+
import static org.hamcrest.Matchers.hasSize;
3941
import static org.hamcrest.Matchers.instanceOf;
4042

4143
@ThreadLeakFilters(filters = TestClustersThreadFilter.class)
@@ -145,4 +147,32 @@ protected void assertQueryResult(Map<String, Object> result, Matcher<?> columnMa
145147
assertMap(result, matcher);
146148
}
147149

150+
private static boolean checkVersion(org.elasticsearch.Version version) {
151+
return version.onOrAfter(Version.fromString("9.1.0"))
152+
|| (version.onOrAfter(Version.fromString("8.19.0")) && version.before(Version.fromString("9.0.0")));
153+
}
154+
155+
// We need a separate test since remote missing indices and local missing indices now work differently
156+
public void testIndicesDontExistRemote() throws IOException {
157+
// Exclude old versions
158+
assumeTrue("Only works with latest support_unavailable logic", checkVersion(Clusters.localClusterVersion()));
159+
int docsTest1 = randomIntBetween(1, 5);
160+
indexTimestampData(docsTest1, "test1", "2024-11-26", "id1");
161+
162+
Map<String, Object> result = runEsql(
163+
timestampFilter("gte", "2020-01-01").query("FROM *:foo,*:test1 METADATA _index | SORT id1 | KEEP _index, id*")
164+
);
165+
@SuppressWarnings("unchecked")
166+
var columns = (List<List<Object>>) result.get("columns");
167+
assertThat(
168+
columns,
169+
matchesList().item(matchesMap().entry("name", "_index").entry("type", "keyword"))
170+
.item(matchesMap().entry("name", "id1").entry("type", "integer"))
171+
);
172+
@SuppressWarnings("unchecked")
173+
var values = (List<List<Object>>) result.get("values");
174+
// TODO: for now, we return empty result, but eventually it should return records from test1
175+
assertThat(values, hasSize(0));
176+
177+
}
148178
}

x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/RequestIndexFilteringTestCase.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ public void testFieldNameTypo() throws IOException {
195195
}
196196

197197
public void testIndicesDontExist() throws IOException {
198-
int docsTest1 = 0; // we are interested only in the created index, not necessarily that it has data
198+
int docsTest1 = randomIntBetween(1, 5);
199199
indexTimestampData(docsTest1, "test1", "2024-11-26", "id1");
200200

201201
ResponseException e = expectThrows(ResponseException.class, () -> runEsql(timestampFilter("gte", "2020-01-01").query(from("foo"))));
@@ -208,7 +208,7 @@ public void testIndicesDontExist() throws IOException {
208208
assertThat(e.getMessage(), containsString("verification_exception"));
209209
assertThat(e.getMessage(), anyOf(containsString("Unknown index [foo*]"), containsString("Unknown index [remote_cluster:foo*]")));
210210

211-
e = expectThrows(ResponseException.class, () -> runEsql(timestampFilter("gte", "2020-01-01").query(from("foo", "test1"))));
211+
e = expectThrows(ResponseException.class, () -> runEsql(timestampFilter("gte", "2020-01-01").query("FROM foo, test1")));
212212
assertEquals(404, e.getResponse().getStatusLine().getStatusCode());
213213
assertThat(e.getMessage(), containsString("index_not_found_exception"));
214214
assertThat(e.getMessage(), anyOf(containsString("no such index [foo]"), containsString("no such index [remote_cluster:foo]")));
@@ -231,7 +231,7 @@ public void testIndicesDontExist() throws IOException {
231231
}
232232
}
233233

234-
private static RestEsqlTestCase.RequestObjectBuilder timestampFilter(String op, String date) throws IOException {
234+
protected static RestEsqlTestCase.RequestObjectBuilder timestampFilter(String op, String date) throws IOException {
235235
return requestObjectBuilder().filter(b -> {
236236
b.startObject("range");
237237
{

x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/CrossClusterQueryIT.java

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import org.elasticsearch.transport.TransportService;
3030
import org.elasticsearch.xcontent.XContentBuilder;
3131
import org.elasticsearch.xcontent.json.JsonXContent;
32-
import org.elasticsearch.xpack.esql.EsqlTestUtils;
3332
import org.elasticsearch.xpack.esql.VerificationException;
3433
import org.elasticsearch.xpack.esql.plugin.QueryPragmas;
3534

@@ -260,6 +259,7 @@ public void testSearchesAgainstNonMatchingIndices() throws Exception {
260259
Map<String, Object> testClusterInfo = setupClusters(numClusters);
261260
int localNumShards = (Integer) testClusterInfo.get("local.num_shards");
262261
int remote1NumShards = (Integer) testClusterInfo.get("remote1.num_shards");
262+
int remote2NumShards = (Integer) testClusterInfo.get("remote2.num_shards");
263263
String localIndex = (String) testClusterInfo.get("local.index");
264264
String remote1Index = (String) testClusterInfo.get("remote1.index");
265265
String remote2Index = (String) testClusterInfo.get("remote2.index");
@@ -281,7 +281,23 @@ public void testSearchesAgainstNonMatchingIndices() throws Exception {
281281
{
282282
String q = "FROM logs*,cluster-a:nomatch";
283283
String expectedError = "Unknown index [cluster-a:nomatch]";
284+
setSkipUnavailable(REMOTE_CLUSTER_1, false);
284285
expectVerificationExceptionForQuery(q, expectedError, requestIncludeMeta);
286+
setSkipUnavailable(REMOTE_CLUSTER_1, true);
287+
try (EsqlQueryResponse resp = runQuery(q, requestIncludeMeta)) {
288+
assertThat(getValuesList(resp).size(), greaterThanOrEqualTo(1));
289+
EsqlExecutionInfo executionInfo = resp.getExecutionInfo();
290+
assertThat(executionInfo.isCrossClusterSearch(), is(true));
291+
assertThat(executionInfo.includeCCSMetadata(), equalTo(responseExpectMeta));
292+
assertExpectedClustersForMissingIndicesTests(
293+
executionInfo,
294+
List.of(
295+
new ExpectedCluster(LOCAL_CLUSTER, "logs*", EsqlExecutionInfo.Cluster.Status.SUCCESSFUL, localNumShards),
296+
new ExpectedCluster(REMOTE_CLUSTER_1, "nomatch", EsqlExecutionInfo.Cluster.Status.SKIPPED, 0)
297+
)
298+
);
299+
300+
}
285301
}
286302

287303
// No error since local non-matching index has wildcard and the remote cluster index expression matches
@@ -411,7 +427,34 @@ public void testSearchesAgainstNonMatchingIndices() throws Exception {
411427
String remote2IndexName = randomFrom(remote2Index, IDX_ALIAS, FILTERED_IDX_ALIAS);
412428
String q = Strings.format("FROM %s*,cluster-a:nomatch,%s:%s*", localIndexName, REMOTE_CLUSTER_2, remote2IndexName);
413429
String expectedError = "Unknown index [cluster-a:nomatch]";
430+
setSkipUnavailable(REMOTE_CLUSTER_1, false);
414431
expectVerificationExceptionForQuery(q, expectedError, requestIncludeMeta);
432+
setSkipUnavailable(REMOTE_CLUSTER_1, true);
433+
try (EsqlQueryResponse resp = runQuery(q, requestIncludeMeta)) {
434+
assertThat(getValuesList(resp).size(), greaterThanOrEqualTo(1));
435+
EsqlExecutionInfo executionInfo = resp.getExecutionInfo();
436+
assertThat(executionInfo.isCrossClusterSearch(), is(true));
437+
assertThat(executionInfo.includeCCSMetadata(), equalTo(responseExpectMeta));
438+
assertExpectedClustersForMissingIndicesTests(
439+
executionInfo,
440+
List.of(
441+
new ExpectedCluster(
442+
LOCAL_CLUSTER,
443+
localIndexName + "*",
444+
EsqlExecutionInfo.Cluster.Status.SUCCESSFUL,
445+
localNumShards
446+
),
447+
new ExpectedCluster(REMOTE_CLUSTER_1, "nomatch", EsqlExecutionInfo.Cluster.Status.SKIPPED, 0),
448+
new ExpectedCluster(
449+
REMOTE_CLUSTER_2,
450+
remote2IndexName + "*",
451+
EsqlExecutionInfo.Cluster.Status.SUCCESSFUL,
452+
remote2NumShards
453+
)
454+
)
455+
);
456+
457+
}
415458
}
416459
}
417460

@@ -454,8 +497,8 @@ public void assertExpectedClustersForMissingIndicesTests(EsqlExecutionInfo execu
454497
assertThat(msg, cluster.getSkippedShards(), equalTo(expectedCluster.totalShards()));
455498
assertThat(msg, cluster.getFailures().size(), equalTo(1));
456499
assertThat(msg, cluster.getFailures().get(0).getCause(), instanceOf(VerificationException.class));
457-
String expectedMsg = "Unknown index [" + expectedCluster.indexExpression() + "]";
458-
assertThat(msg, cluster.getFailures().get(0).getCause().getMessage(), containsString(expectedMsg));
500+
assertThat(msg, cluster.getFailures().get(0).getCause().getMessage(), containsString("Unknown index"));
501+
assertThat(msg, cluster.getFailures().get(0).getCause().getMessage(), containsString(expectedCluster.indexExpression()));
459502
hasSkipped = true;
460503
}
461504
// currently failed shards is always zero - change this once we start allowing partial data for individual shard failures
@@ -809,18 +852,30 @@ public void testWarnings() throws Exception {
809852
assertTrue(latch.await(30, TimeUnit.SECONDS));
810853
}
811854

812-
// Non-disconnect remote failures still fail the request even if skip_unavailable is true
855+
// Non-disconnect remote failures lead to skipping if skip_unavailable is true
813856
public void testRemoteFailureSkipUnavailableTrue() throws IOException {
814857
Map<String, Object> testClusterInfo = setupFailClusters();
815858
String localIndex = (String) testClusterInfo.get("local.index");
816859
String remote1Index = (String) testClusterInfo.get("remote.index");
817860
String q = Strings.format("FROM %s,cluster-a:%s*", localIndex, remote1Index);
818861

819-
Exception error = expectThrows(Exception.class, () -> runQuery(q, false));
820-
error = EsqlTestUtils.unwrapIfWrappedInRemoteException(error);
862+
try (EsqlQueryResponse resp = runQuery(q, randomBoolean())) {
863+
EsqlExecutionInfo executionInfo = resp.getExecutionInfo();
864+
assertNotNull(executionInfo);
865+
assertThat(executionInfo.isCrossClusterSearch(), is(true));
866+
assertThat(executionInfo.overallTook().millis(), greaterThanOrEqualTo(0L));
821867

822-
assertThat(error, instanceOf(IllegalStateException.class));
823-
assertThat(error.getMessage(), containsString("Accessing failing field"));
868+
EsqlExecutionInfo.Cluster remoteCluster = executionInfo.getCluster(REMOTE_CLUSTER_1);
869+
assertThat(remoteCluster.getIndexExpression(), equalTo("logs-2*"));
870+
assertThat(remoteCluster.getStatus(), equalTo(EsqlExecutionInfo.Cluster.Status.SKIPPED));
871+
assertThat(remoteCluster.getTook().millis(), greaterThanOrEqualTo(0L));
872+
assertThat(remoteCluster.getTotalShards(), equalTo(0));
873+
assertThat(remoteCluster.getSuccessfulShards(), equalTo(0));
874+
assertThat(remoteCluster.getSkippedShards(), equalTo(0));
875+
assertThat(remoteCluster.getFailedShards(), equalTo(0));
876+
assertThat(remoteCluster.getFailures(), hasSize(1));
877+
assertThat(remoteCluster.getFailures().get(0).reason(), containsString("Accessing failing field"));
878+
}
824879
}
825880

826881
private static void assertClusterMetadataInResponse(EsqlQueryResponse resp, boolean responseExpectMeta) {

x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/CrossClusterQueryWithPartialResultsIT.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ public void sendResponse(Exception exception) {
227227
assertNotNull(unwrapped);
228228
assertThat(unwrapped.getMessage(), equalTo(simulatedFailure.getMessage()));
229229
}
230+
// The failure leads to skipped regardless of allowPartialResults
230231
request.allowPartialResults(true);
231232
try (var resp = runQuery(request)) {
232233
assertTrue(resp.isPartial());
@@ -289,7 +290,7 @@ public void testFailToStartRequestOnRemoteCluster() throws Exception {
289290
assertThat(returnedIds, equalTo(local.okIds));
290291
assertClusterSuccess(resp, LOCAL_CLUSTER, local.okShards);
291292
EsqlExecutionInfo.Cluster remoteInfo = resp.getExecutionInfo().getCluster(REMOTE_CLUSTER_1);
292-
assertThat(remoteInfo.getStatus(), equalTo(EsqlExecutionInfo.Cluster.Status.PARTIAL));
293+
assertThat(remoteInfo.getStatus(), equalTo(EsqlExecutionInfo.Cluster.Status.SKIPPED));
293294
assertClusterFailure(resp, REMOTE_CLUSTER_1, simulatedFailure.getMessage());
294295
}
295296
} finally {

x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/CrossClusterUsageTelemetryIT.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -118,24 +118,23 @@ public void testFailed() throws Exception {
118118

119119
assertThat(telemetry.getTotalCount(), equalTo(1L));
120120
assertThat(telemetry.getSuccessCount(), equalTo(0L));
121-
assertThat(telemetry.getByRemoteCluster().size(), equalTo(0));
121+
assertThat(telemetry.getByRemoteCluster().size(), equalTo(1));
122122
assertThat(telemetry.getRemotesPerSearchAvg(), equalTo(2.0));
123123
assertThat(telemetry.getRemotesPerSearchMax(), equalTo(2L));
124-
assertThat(telemetry.getSearchCountWithSkippedRemotes(), equalTo(0L));
124+
assertThat(telemetry.getSearchCountWithSkippedRemotes(), equalTo(1L));
125125
Map<String, Long> expectedFailure = Map.of(CCSUsageTelemetry.Result.NOT_FOUND.getName(), 1L);
126126
assertThat(telemetry.getFailureReasons(), equalTo(expectedFailure));
127127

128-
// this is only for cluster-a so no skipped remotes
128+
// this is only for cluster-a now
129129
telemetry = getTelemetryFromFailedQuery("from logs-*,cluster-a:no_such_index | stats sum (v)");
130130
assertThat(telemetry.getTotalCount(), equalTo(2L));
131131
assertThat(telemetry.getSuccessCount(), equalTo(0L));
132-
assertThat(telemetry.getByRemoteCluster().size(), equalTo(0));
132+
assertThat(telemetry.getByRemoteCluster().size(), equalTo(1));
133133
assertThat(telemetry.getRemotesPerSearchAvg(), equalTo(2.0));
134134
assertThat(telemetry.getRemotesPerSearchMax(), equalTo(2L));
135-
assertThat(telemetry.getSearchCountWithSkippedRemotes(), equalTo(0L));
135+
assertThat(telemetry.getSearchCountWithSkippedRemotes(), equalTo(1L));
136136
expectedFailure = Map.of(CCSUsageTelemetry.Result.NOT_FOUND.getName(), 2L);
137137
assertThat(telemetry.getFailureReasons(), equalTo(expectedFailure));
138-
assertThat(telemetry.getByRemoteCluster().size(), equalTo(0));
139138
}
140139

141140
public void testRemoteOnly() throws Exception {

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plugin/ClusterComputeHandler.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,14 @@ void startComputeOnRemoteCluster(
8585
final AtomicReference<ComputeResponse> finalResponse = new AtomicReference<>();
8686
listener = listener.delegateResponse((l, e) -> {
8787
final boolean receivedResults = finalResponse.get() != null || pagesFetched.get();
88-
if (receivedResults == false && EsqlCCSUtils.shouldIgnoreRuntimeError(executionInfo, clusterAlias, e)) {
89-
EsqlCCSUtils.markClusterWithFinalStateAndNoShards(executionInfo, clusterAlias, EsqlExecutionInfo.Cluster.Status.SKIPPED, e);
90-
l.onResponse(List.of());
91-
} else if (configuration.allowPartialResults() && EsqlCCSUtils.canAllowPartial(e)) {
92-
EsqlCCSUtils.markClusterWithFinalStateAndNoShards(executionInfo, clusterAlias, EsqlExecutionInfo.Cluster.Status.PARTIAL, e);
88+
if (EsqlCCSUtils.shouldIgnoreRuntimeError(executionInfo, clusterAlias, e)
89+
|| (configuration.allowPartialResults() && EsqlCCSUtils.canAllowPartial(e))) {
90+
EsqlCCSUtils.markClusterWithFinalStateAndNoShards(
91+
executionInfo,
92+
clusterAlias,
93+
receivedResults ? EsqlExecutionInfo.Cluster.Status.PARTIAL : EsqlExecutionInfo.Cluster.Status.SKIPPED,
94+
e
95+
);
9396
l.onResponse(List.of());
9497
} else {
9598
l.onFailure(e);

0 commit comments

Comments
 (0)