Skip to content

Commit 82aaa70

Browse files
authored
Handle failures that occur during field-caps (#130840) (#131255)
Currently, errors from the field-caps phase are not always handled properly, leading to cases where the final response is not marked as partial correctly. For example: FROM ok*,unavailable_index* should return a partial result, as unavailable_index* is skipped after the resolution phase. This change tracks failures that occur during field-caps and reports them in the final response. Since this only affects cases with allow_partial_results=true, I am labeling this as a non-issue and will backport the change to 9.1 and 8.19. (cherry picked from commit a699655)
1 parent 5040e9c commit 82aaa70

File tree

10 files changed

+230
-108
lines changed

10 files changed

+230
-108
lines changed

x-pack/plugin/esql/src/internalClusterTest/java/org/elasticsearch/xpack/esql/action/CrossClusterQueryWithPartialResultsIT.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import org.elasticsearch.ExceptionsHelper;
1111
import org.elasticsearch.ResourceNotFoundException;
1212
import org.elasticsearch.action.search.ShardSearchFailure;
13+
import org.elasticsearch.action.support.ActiveShardCount;
1314
import org.elasticsearch.client.internal.Client;
1415
import org.elasticsearch.common.breaker.CircuitBreaker;
1516
import org.elasticsearch.common.breaker.CircuitBreakingException;
@@ -41,6 +42,7 @@
4142
import static org.hamcrest.Matchers.empty;
4243
import static org.hamcrest.Matchers.equalTo;
4344
import static org.hamcrest.Matchers.greaterThan;
45+
import static org.hamcrest.Matchers.hasSize;
4446
import static org.hamcrest.Matchers.in;
4547
import static org.hamcrest.Matchers.instanceOf;
4648
import static org.hamcrest.Matchers.is;
@@ -62,12 +64,15 @@ private static class ClusterSetup {
6264
void populateIndices() throws Exception {
6365
local.okIds = populateIndex(LOCAL_CLUSTER, "ok-local", local.okShards, between(1, 100));
6466
populateIndexWithFailingFields(LOCAL_CLUSTER, "fail-local", local.failingShards);
67+
createUnavailableIndex(LOCAL_CLUSTER, "unavailable-local");
6568

6669
remote1.okIds = populateIndex(REMOTE_CLUSTER_1, "ok-cluster1", remote1.okShards, between(1, 100));
6770
populateIndexWithFailingFields(REMOTE_CLUSTER_1, "fail-cluster1", remote1.failingShards);
71+
createUnavailableIndex(REMOTE_CLUSTER_1, "unavailable-cluster1");
6872

6973
remote2.okIds = populateIndex(REMOTE_CLUSTER_2, "ok-cluster2", remote2.okShards, between(1, 100));
7074
populateIndexWithFailingFields(REMOTE_CLUSTER_2, "fail-cluster2", remote2.failingShards);
75+
createUnavailableIndex(REMOTE_CLUSTER_2, "unavailable-cluster2");
7176
}
7277

7378
private void assertClusterPartial(EsqlQueryResponse resp, String clusterAlias, ClusterSetup cluster) {
@@ -356,6 +361,42 @@ private static Exception randomFailure() {
356361
);
357362
}
358363

364+
public void testResolutionFailures() throws Exception {
365+
populateIndices();
366+
EsqlQueryRequest request = new EsqlQueryRequest();
367+
request.allowPartialResults(true);
368+
request.query("FROM ok*,unavailable* | LIMIT 1000");
369+
try (var resp = runQuery(request)) {
370+
assertThat(EsqlTestUtils.getValuesList(resp), hasSize(local.okIds.size()));
371+
assertTrue(resp.isPartial());
372+
EsqlExecutionInfo executionInfo = resp.getExecutionInfo();
373+
var localCluster = executionInfo.getCluster(LOCAL_CLUSTER);
374+
assertThat(localCluster.getFailures(), not(empty()));
375+
assertThat(localCluster.getFailures().get(0).reason(), containsString("index [unavailable-local] has no active shard copy"));
376+
}
377+
request.query("FROM *:ok*,unavailable* | LIMIT 1000");
378+
try (var resp = runQuery(request)) {
379+
assertThat(EsqlTestUtils.getValuesList(resp), hasSize(remote1.okIds.size() + remote2.okIds.size()));
380+
assertTrue(resp.isPartial());
381+
var executionInfo = resp.getExecutionInfo();
382+
var localCluster = executionInfo.getCluster(LOCAL_CLUSTER);
383+
assertThat(localCluster.getStatus(), equalTo(EsqlExecutionInfo.Cluster.Status.SKIPPED));
384+
assertThat(localCluster.getFailures(), not(empty()));
385+
assertThat(localCluster.getFailures().get(0).reason(), containsString("index [unavailable-local] has no active shard copy"));
386+
assertThat(executionInfo.getCluster(REMOTE_CLUSTER_1).getStatus(), equalTo(EsqlExecutionInfo.Cluster.Status.SUCCESSFUL));
387+
assertThat(executionInfo.getCluster(REMOTE_CLUSTER_2).getStatus(), equalTo(EsqlExecutionInfo.Cluster.Status.SUCCESSFUL));
388+
}
389+
request.query("FROM ok*,cluster-a:unavailable* | LIMIT 1000");
390+
try (var resp = runQuery(request)) {
391+
assertThat(EsqlTestUtils.getValuesList(resp), hasSize(local.okIds.size()));
392+
assertTrue(resp.isPartial());
393+
var remote1 = resp.getExecutionInfo().getCluster(REMOTE_CLUSTER_1);
394+
assertThat(remote1.getFailures(), not(empty()));
395+
assertThat(remote1.getFailures().get(0).reason(), containsString("index [unavailable-cluster1] has no active shard copy"));
396+
assertThat(resp.getExecutionInfo().getCluster(LOCAL_CLUSTER).getStatus(), equalTo(EsqlExecutionInfo.Cluster.Status.SUCCESSFUL));
397+
}
398+
}
399+
359400
private Set<String> populateIndexWithFailingFields(String clusterAlias, String indexName, int numShards) throws IOException {
360401
Client client = client(clusterAlias);
361402
XContentBuilder mapping = JsonXContent.contentBuilder().startObject();
@@ -398,4 +439,15 @@ private Set<String> populateIndexWithFailingFields(String clusterAlias, String i
398439
}
399440
return ids;
400441
}
442+
443+
private void createUnavailableIndex(String clusterAlias, String indexName) throws IOException {
444+
Client client = client(clusterAlias);
445+
assertAcked(
446+
client.admin()
447+
.indices()
448+
.prepareCreate(indexName)
449+
.setSettings(Settings.builder().put("index.routing.allocation.include._name", "no_such_node"))
450+
.setWaitForActiveShards(ActiveShardCount.NONE)
451+
);
452+
}
401453
}

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlExecutionInfo.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.elasticsearch.xcontent.XContentBuilder;
2828

2929
import java.io.IOException;
30+
import java.util.ArrayList;
3031
import java.util.Collections;
3132
import java.util.Iterator;
3233
import java.util.List;
@@ -545,8 +546,14 @@ public Cluster.Builder setFailedShards(int failedShards) {
545546
return this;
546547
}
547548

548-
public Cluster.Builder setFailures(List<ShardSearchFailure> failures) {
549-
this.failures = failures;
549+
public Cluster.Builder addFailures(List<ShardSearchFailure> failures) {
550+
if (failures.isEmpty()) {
551+
return this;
552+
}
553+
if (this.failures == null) {
554+
this.failures = new ArrayList<>(original.failures);
555+
}
556+
this.failures.addAll(failures);
550557
return this;
551558
}
552559

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/index/IndexResolution.java

Lines changed: 17 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
*/
77
package org.elasticsearch.xpack.esql.index;
88

9-
import org.elasticsearch.action.NoShardAvailableActionException;
109
import org.elasticsearch.action.fieldcaps.FieldCapabilitiesFailure;
1110
import org.elasticsearch.core.Nullable;
1211

12+
import java.util.List;
1313
import java.util.Map;
1414
import java.util.Objects;
1515
import java.util.Set;
@@ -19,33 +19,26 @@ public final class IndexResolution {
1919
/**
2020
* @param index EsIndex encapsulating requested index expression, resolved mappings and index modes from field-caps.
2121
* @param resolvedIndices Set of concrete indices resolved by field-caps. (This information is not always present in the EsIndex).
22-
* @param unavailableShards Set of shards that were unavailable during index resolution
23-
* @param unavailableClusters Remote clusters that could not be contacted during planning
22+
* @param failures failures occurred during field-caps.
2423
* @return valid IndexResolution
2524
*/
26-
public static IndexResolution valid(
27-
EsIndex index,
28-
Set<String> resolvedIndices,
29-
Set<NoShardAvailableActionException> unavailableShards,
30-
Map<String, FieldCapabilitiesFailure> unavailableClusters
31-
) {
25+
public static IndexResolution valid(EsIndex index, Set<String> resolvedIndices, Map<String, List<FieldCapabilitiesFailure>> failures) {
3226
Objects.requireNonNull(index, "index must not be null if it was found");
3327
Objects.requireNonNull(resolvedIndices, "resolvedIndices must not be null");
34-
Objects.requireNonNull(unavailableShards, "unavailableShards must not be null");
35-
Objects.requireNonNull(unavailableClusters, "unavailableClusters must not be null");
36-
return new IndexResolution(index, null, resolvedIndices, unavailableShards, unavailableClusters);
28+
Objects.requireNonNull(failures, "failures must not be null");
29+
return new IndexResolution(index, null, resolvedIndices, failures);
3730
}
3831

3932
/**
4033
* Use this method only if the set of concrete resolved indices is the same as EsIndex#concreteIndices().
4134
*/
4235
public static IndexResolution valid(EsIndex index) {
43-
return valid(index, index.concreteIndices(), Set.of(), Map.of());
36+
return valid(index, index.concreteIndices(), Map.of());
4437
}
4538

4639
public static IndexResolution invalid(String invalid) {
4740
Objects.requireNonNull(invalid, "invalid must not be null to signal that the index is invalid");
48-
return new IndexResolution(null, invalid, Set.of(), Set.of(), Map.of());
41+
return new IndexResolution(null, invalid, Set.of(), Map.of());
4942
}
5043

5144
public static IndexResolution notFound(String name) {
@@ -59,22 +52,19 @@ public static IndexResolution notFound(String name) {
5952

6053
// all indices found by field-caps
6154
private final Set<String> resolvedIndices;
62-
private final Set<NoShardAvailableActionException> unavailableShards;
63-
// remote clusters included in the user's index expression that could not be connected to
64-
private final Map<String, FieldCapabilitiesFailure> unavailableClusters;
55+
// map from cluster alias to failures that occurred during field-caps.
56+
private final Map<String, List<FieldCapabilitiesFailure>> failures;
6557

6658
private IndexResolution(
6759
EsIndex index,
6860
@Nullable String invalid,
6961
Set<String> resolvedIndices,
70-
Set<NoShardAvailableActionException> unavailableShards,
71-
Map<String, FieldCapabilitiesFailure> unavailableClusters
62+
Map<String, List<FieldCapabilitiesFailure>> failures
7263
) {
7364
this.index = index;
7465
this.invalid = invalid;
7566
this.resolvedIndices = resolvedIndices;
76-
this.unavailableShards = unavailableShards;
77-
this.unavailableClusters = unavailableClusters;
67+
this.failures = failures;
7868
}
7969

8070
public boolean matches(String indexName) {
@@ -101,11 +91,10 @@ public boolean isValid() {
10191
}
10292

10393
/**
104-
* @return Map of unavailable clusters (could not be connected to during field-caps query). Key of map is cluster alias,
105-
* value is the {@link FieldCapabilitiesFailure} describing the issue.
94+
* @return Map from cluster alias to failures that occurred during field-caps.
10695
*/
107-
public Map<String, FieldCapabilitiesFailure> unavailableClusters() {
108-
return unavailableClusters;
96+
public Map<String, List<FieldCapabilitiesFailure>> failures() {
97+
return failures;
10998
}
11099

111100
/**
@@ -115,13 +104,6 @@ public Set<String> resolvedIndices() {
115104
return resolvedIndices;
116105
}
117106

118-
/**
119-
* @return set of unavailable shards during index resolution
120-
*/
121-
public Set<NoShardAvailableActionException> getUnavailableShards() {
122-
return unavailableShards;
123-
}
124-
125107
@Override
126108
public boolean equals(Object obj) {
127109
if (obj == null || obj.getClass() != getClass()) {
@@ -131,12 +113,12 @@ public boolean equals(Object obj) {
131113
return Objects.equals(index, other.index)
132114
&& Objects.equals(invalid, other.invalid)
133115
&& Objects.equals(resolvedIndices, other.resolvedIndices)
134-
&& Objects.equals(unavailableClusters, other.unavailableClusters);
116+
&& Objects.equals(failures, other.failures);
135117
}
136118

137119
@Override
138120
public int hashCode() {
139-
return Objects.hash(index, invalid, resolvedIndices, unavailableClusters);
121+
return Objects.hash(index, invalid, resolvedIndices, failures);
140122
}
141123

142124
@Override
@@ -152,7 +134,7 @@ public String toString() {
152134
+ ", resolvedIndices="
153135
+ resolvedIndices
154136
+ ", unavailableClusters="
155-
+ unavailableClusters
137+
+ failures
156138
+ '}';
157139
}
158140
}

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plugin/ClusterComputeHandler.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ private void updateExecutionInfo(EsqlExecutionInfo executionInfo, String cluster
166166
builder.setTook(executionInfo.tookSoFar());
167167
}
168168
if (v.getStatus() == EsqlExecutionInfo.Cluster.Status.RUNNING) {
169-
builder.setFailures(resp.failures);
169+
builder.addFailures(resp.failures);
170170
if (executionInfo.isStopped() || resp.failedShards > 0 || resp.failures.isEmpty() == false) {
171171
builder.setStatus(EsqlExecutionInfo.Cluster.Status.PARTIAL);
172172
} else {

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plugin/ComputeService.java

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -246,9 +246,13 @@ public void execute(
246246
var builder = new EsqlExecutionInfo.Cluster.Builder(v).setTook(tookTime);
247247
if (v.getStatus() == EsqlExecutionInfo.Cluster.Status.RUNNING) {
248248
final Integer failedShards = execInfo.getCluster(LOCAL_CLUSTER).getFailedShards();
249-
var status = localClusterWasInterrupted.get() || (failedShards != null && failedShards > 0)
250-
? EsqlExecutionInfo.Cluster.Status.PARTIAL
251-
: EsqlExecutionInfo.Cluster.Status.SUCCESSFUL;
249+
// Set the local cluster status (including the final driver) to partial if the query was stopped
250+
// or encountered resolution or execution failures.
251+
var status = localClusterWasInterrupted.get()
252+
|| (failedShards != null && failedShards > 0)
253+
|| v.getFailures().isEmpty() == false
254+
? EsqlExecutionInfo.Cluster.Status.PARTIAL
255+
: EsqlExecutionInfo.Cluster.Status.SUCCESSFUL;
252256
builder.setStatus(status);
253257
}
254258
return builder.build();
@@ -296,7 +300,7 @@ public void execute(
296300
.setSuccessfulShards(r.getSuccessfulShards())
297301
.setSkippedShards(r.getSkippedShards())
298302
.setFailedShards(r.getFailedShards())
299-
.setFailures(r.failures)
303+
.addFailures(r.failures)
300304
.build()
301305
);
302306
dataNodesListener.onResponse(r.getCompletionInfo());
@@ -306,7 +310,7 @@ public void execute(
306310
LOCAL_CLUSTER,
307311
(k, v) -> new EsqlExecutionInfo.Cluster.Builder(v).setStatus(
308312
EsqlExecutionInfo.Cluster.Status.PARTIAL
309-
).setFailures(List.of(new ShardSearchFailure(e))).build()
313+
).addFailures(List.of(new ShardSearchFailure(e))).build()
310314
);
311315
dataNodesListener.onResponse(DriverCompletionInfo.EMPTY);
312316
} else {

0 commit comments

Comments
 (0)