Skip to content

Commit 896422b

Browse files
authored
Async Search: correct shards counting (#56272)
Async search allows users to retrieve partial results for a running search. For partial results, the number of successful shards does not include the skipped shards, while the response returned to users should. Also, we recently had a bug where async search would miss tracking shard failures, which would have been caught if we had assertions in place that verified that whenever we get the last response, the number of failures included in it is the same as the failures that were tracked through the listener notifications.
1 parent 9a4fbbe commit 896422b

File tree

4 files changed

+69
-25
lines changed

4 files changed

+69
-25
lines changed

x-pack/plugin/async-search/src/main/java/org/elasticsearch/xpack/search/AsyncSearchTask.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,7 @@ public void onFinalReduce(List<SearchShard> shards, TotalHits totalHits, Interna
383383

384384
@Override
385385
public void onResponse(SearchResponse response) {
386-
searchResponse.get().updateFinalResponse(response.getSuccessfulShards(), response.getInternalResponse());
386+
searchResponse.get().updateFinalResponse(response);
387387
executeCompletionListeners();
388388
}
389389

x-pack/plugin/async-search/src/main/java/org/elasticsearch/xpack/search/MutableSearchResponse.java

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ synchronized void updatePartialResponse(int successfulShards, SearchResponseSect
9191
throw new IllegalStateException("received partial response out of order: "
9292
+ newSections.getNumReducePhases() + " < " + sections.getNumReducePhases());
9393
}
94-
this.successfulShards = successfulShards;
94+
//when we get partial results skipped shards are not included in the provided number of successful shards
95+
this.successfulShards = successfulShards + skippedShards;
9596
this.sections = newSections;
9697
this.isPartial = true;
9798
this.isFinalReduce = isFinalReduce;
@@ -101,12 +102,20 @@ synchronized void updatePartialResponse(int successfulShards, SearchResponseSect
101102
* Updates the response with the final {@link SearchResponseSections} merged from #<code>successfulShards</code>
102103
* shards.
103104
*/
104-
synchronized void updateFinalResponse(int successfulShards, SearchResponseSections newSections) {
105+
synchronized void updateFinalResponse(SearchResponse searchResponse) {
105106
failIfFrozen();
107+
assert searchResponse.getTotalShards() == totalShards : "received number of total shards differs from the one " +
108+
"notified through onListShards";
109+
assert searchResponse.getSkippedShards() == skippedShards : "received number of skipped shards differs from the one " +
110+
"notified through onListShards";
111+
assert searchResponse.getFailedShards() == buildShardFailures().length : "number of tracked failures differs from failed shards";
106112
// copy the response headers from the current context
107113
this.responseHeaders = threadContext.getResponseHeaders();
108-
this.successfulShards = successfulShards;
109-
this.sections = newSections;
114+
//we take successful from the final response, which overrides whatever value we set when we received the last partial results.
115+
//This is important for cases where e.g. aggs work fine and then fetch fails on some of the shards but not all.
116+
//The shards where fetch has failed should not be counted as successful.
117+
this.successfulShards = searchResponse.getSuccessfulShards();
118+
this.sections = searchResponse.getInternalResponse();
110119
this.isPartial = false;
111120
this.isFinalReduce = true;
112121
this.frozen = true;
@@ -121,6 +130,8 @@ synchronized void updateWithFailure(Exception exc) {
121130
// copy the response headers from the current context
122131
this.responseHeaders = threadContext.getResponseHeaders();
123132
this.isPartial = true;
133+
//note that when search fails, we may have gotten partial results before the failure. In that case async
134+
// search will return an error plus the last partial results that were collected.
124135
this.failure = ElasticsearchException.guessRootCauses(exc)[0];
125136
this.frozen = true;
126137
}

x-pack/plugin/async-search/src/test/java/org/elasticsearch/xpack/search/AsyncSearchResponseTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ static SearchResponse randomSearchResponse() {
121121
long tookInMillis = randomNonNegativeLong();
122122
int totalShards = randomIntBetween(1, Integer.MAX_VALUE);
123123
int successfulShards = randomIntBetween(0, totalShards);
124-
int skippedShards = totalShards - successfulShards;
124+
int skippedShards = randomIntBetween(0, successfulShards);
125125
InternalSearchResponse internalSearchResponse = InternalSearchResponse.empty();
126126
return new SearchResponse(internalSearchResponse, null, totalShards,
127127
successfulShards, skippedShards, tookInMillis, ShardSearchFailure.EMPTY_ARRAY, SearchResponse.Clusters.EMPTY);

x-pack/plugin/async-search/src/test/java/org/elasticsearch/xpack/search/AsyncSearchTaskTests.java

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -133,26 +133,24 @@ public void testWaitForCompletion() throws InterruptedException {
133133
for (int i = 0; i < numSkippedShards; i++) {
134134
skippedShards.add(new SearchShard(null, new ShardId("0", "0", 1)));
135135
}
136-
137-
int numShardFailures = 0;
136+
int totalShards = numShards + numSkippedShards;
138137
task.getSearchProgressActionListener().onListShards(shards, skippedShards, SearchResponse.Clusters.EMPTY, false);
139138
for (int i = 0; i < numShards; i++) {
140139
task.getSearchProgressActionListener().onPartialReduce(shards.subList(i, i+1),
141140
new TotalHits(0, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), null, 0);
142-
assertCompletionListeners(task, numShards+numSkippedShards, numSkippedShards, numShardFailures, true);
141+
assertCompletionListeners(task, totalShards, 1 + numSkippedShards, numSkippedShards, 0, true);
143142
}
144143
task.getSearchProgressActionListener().onFinalReduce(shards,
145144
new TotalHits(0, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), null, 0);
146-
assertCompletionListeners(task, numShards+numSkippedShards, numSkippedShards, numShardFailures, true);
145+
assertCompletionListeners(task, totalShards, totalShards, numSkippedShards, 0, true);
147146
((AsyncSearchTask.Listener)task.getProgressListener()).onResponse(
148-
newSearchResponse(numShards+numSkippedShards, numShards, numSkippedShards));
149-
assertCompletionListeners(task, numShards+numSkippedShards,
150-
numSkippedShards, numShardFailures, false);
147+
newSearchResponse(totalShards, totalShards, numSkippedShards));
148+
assertCompletionListeners(task, totalShards, totalShards, numSkippedShards, 0, false);
151149
}
152150

153151
public void testWithFetchFailures() throws InterruptedException {
154152
AsyncSearchTask task = createAsyncSearchTask();
155-
int numShards = randomIntBetween(0, 10);
153+
int numShards = randomIntBetween(2, 10);
156154
List<SearchShard> shards = new ArrayList<>();
157155
for (int i = 0; i < numShards; i++) {
158156
shards.add(new SearchShard(null, new ShardId("0", "0", 1)));
@@ -162,38 +160,72 @@ public void testWithFetchFailures() throws InterruptedException {
162160
for (int i = 0; i < numSkippedShards; i++) {
163161
skippedShards.add(new SearchShard(null, new ShardId("0", "0", 1)));
164162
}
165-
163+
int totalShards = numShards + numSkippedShards;
166164
task.getSearchProgressActionListener().onListShards(shards, skippedShards, SearchResponse.Clusters.EMPTY, false);
167165
for (int i = 0; i < numShards; i++) {
168166
task.getSearchProgressActionListener().onPartialReduce(shards.subList(i, i+1),
169167
new TotalHits(0, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), null, 0);
170-
assertCompletionListeners(task, numShards+numSkippedShards, numSkippedShards, 0, true);
168+
assertCompletionListeners(task, totalShards, 1 + numSkippedShards, numSkippedShards, 0, true);
171169
}
172170
task.getSearchProgressActionListener().onFinalReduce(shards,
173171
new TotalHits(0, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), null, 0);
174-
int numFetchFailures = randomIntBetween(0, numShards);
172+
int numFetchFailures = randomIntBetween(1, numShards - 1);
173+
ShardSearchFailure[] shardSearchFailures = new ShardSearchFailure[numFetchFailures];
175174
for (int i = 0; i < numFetchFailures; i++) {
175+
IOException failure = new IOException("boum");
176176
task.getSearchProgressActionListener().onFetchFailure(i,
177177
new SearchShardTarget("0", new ShardId("0", "0", 1), null, OriginalIndices.NONE),
178-
new IOException("boum"));
179-
178+
failure);
179+
shardSearchFailures[i] = new ShardSearchFailure(failure);
180180
}
181-
assertCompletionListeners(task, numShards+numSkippedShards, numSkippedShards, numFetchFailures, true);
181+
assertCompletionListeners(task, totalShards, totalShards, numSkippedShards, numFetchFailures, true);
182182
((AsyncSearchTask.Listener)task.getProgressListener()).onResponse(
183-
newSearchResponse(numShards+numSkippedShards, numShards, numSkippedShards));
184-
assertCompletionListeners(task, numShards+numSkippedShards,
185-
numSkippedShards, numFetchFailures, false);
183+
newSearchResponse(totalShards, totalShards - numFetchFailures, numSkippedShards, shardSearchFailures));
184+
assertCompletionListeners(task, totalShards, totalShards - numFetchFailures, numSkippedShards, numFetchFailures, false);
185+
}
186+
187+
public void testFatalFailureDuringFetch() throws InterruptedException {
188+
AsyncSearchTask task = createAsyncSearchTask();
189+
int numShards = randomIntBetween(0, 10);
190+
List<SearchShard> shards = new ArrayList<>();
191+
for (int i = 0; i < numShards; i++) {
192+
shards.add(new SearchShard(null, new ShardId("0", "0", 1)));
193+
}
194+
List<SearchShard> skippedShards = new ArrayList<>();
195+
int numSkippedShards = randomIntBetween(0, 10);
196+
for (int i = 0; i < numSkippedShards; i++) {
197+
skippedShards.add(new SearchShard(null, new ShardId("0", "0", 1)));
198+
}
199+
int totalShards = numShards + numSkippedShards;
200+
task.getSearchProgressActionListener().onListShards(shards, skippedShards, SearchResponse.Clusters.EMPTY, false);
201+
for (int i = 0; i < numShards; i++) {
202+
task.getSearchProgressActionListener().onPartialReduce(shards.subList(0, i+1),
203+
new TotalHits(0, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), null, 0);
204+
assertCompletionListeners(task, totalShards, i + 1 + numSkippedShards, numSkippedShards, 0, true);
205+
}
206+
task.getSearchProgressActionListener().onFinalReduce(shards,
207+
new TotalHits(0, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), null, 0);
208+
for (int i = 0; i < numShards; i++) {
209+
task.getSearchProgressActionListener().onFetchFailure(i,
210+
new SearchShardTarget("0", new ShardId("0", "0", 1), null, OriginalIndices.NONE),
211+
new IOException("boum"));
212+
}
213+
assertCompletionListeners(task, totalShards, totalShards, numSkippedShards, numShards, true);
214+
((AsyncSearchTask.Listener)task.getProgressListener()).onFailure(new IOException("boum"));
215+
assertCompletionListeners(task, totalShards, totalShards, numSkippedShards, numShards, true);
186216
}
187217

188-
private static SearchResponse newSearchResponse(int totalShards, int successfulShards, int skippedShards) {
218+
private static SearchResponse newSearchResponse(int totalShards, int successfulShards, int skippedShards,
219+
ShardSearchFailure... shardFailures) {
189220
InternalSearchResponse response = new InternalSearchResponse(SearchHits.empty(),
190221
InternalAggregations.EMPTY, null, null, false, null, 1);
191222
return new SearchResponse(response, null, totalShards, successfulShards, skippedShards,
192-
100, ShardSearchFailure.EMPTY_ARRAY, SearchResponse.Clusters.EMPTY);
223+
100, shardFailures, SearchResponse.Clusters.EMPTY);
193224
}
194225

195226
private void assertCompletionListeners(AsyncSearchTask task,
196227
int expectedTotalShards,
228+
int expectedSuccessfulShards,
197229
int expectedSkippedShards,
198230
int expectedShardFailures,
199231
boolean isPartial) throws InterruptedException {
@@ -204,6 +236,7 @@ private void assertCompletionListeners(AsyncSearchTask task,
204236
@Override
205237
public void onResponse(AsyncSearchResponse resp) {
206238
assertThat(resp.getSearchResponse().getTotalShards(), equalTo(expectedTotalShards));
239+
assertThat(resp.getSearchResponse().getSuccessfulShards(), equalTo(expectedSuccessfulShards));
207240
assertThat(resp.getSearchResponse().getSkippedShards(), equalTo(expectedSkippedShards));
208241
assertThat(resp.getSearchResponse().getFailedShards(), equalTo(expectedShardFailures));
209242
assertThat(resp.isPartial(), equalTo(isPartial));

0 commit comments

Comments
 (0)