Skip to content

Commit 5e9f907

Browse files
pawankartik-elasticelasticsearchmachine
andauthored
Propagate status codes from shard failures appropriately (#118016) (#119205)
* Return 502 if the underlying error is `NodeNotConnectedException` * Traverse through the cause stack trace and check for `NodeNotConnectedException` and undo `status()` override in `NodeDisconnectedException` * Rewrite `while` condition * Fix: precommit * Let status codes propagate rather than walking the stacktrace explicitly Walking the stacktrace explicitly and looking for a specific error (node connection-related errors in this case) is a workaround rather than a proper fix. Instead, let the status codes propagate all the way to the top so that they can be reported as-is. * Fix: unused import * Fix null deref * Do not map descendants of `ConnectTransportException` to `502` * Fix: precommit * Do not account for 4xx status codes 4xx status codes are not likely to appear along with 5xx status codes. As a result, we do not need to account for them when looking at shard failures' status codes. * Remove unnecessary `switch` case In reference to the previous commit: this case is no longer needed. * Rewrite code comment * Address review comments * [CI] Auto commit changes from spotless * Update docs/changelog/118016.yaml --------- Co-authored-by: elasticsearchmachine <[email protected]>
1 parent 013f363 commit 5e9f907

File tree

3 files changed

+74
-7
lines changed

3 files changed

+74
-7
lines changed

docs/changelog/118016.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 118016
2+
summary: Propagate status codes from shard failures appropriately
3+
area: Search
4+
type: enhancement
5+
issues:
6+
- 118482

server/src/main/java/org/elasticsearch/action/search/SearchPhaseExecutionException.java

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,21 @@ public RestStatus status() {
7373
// on coordinator node. so get the status from cause instead of returning SERVICE_UNAVAILABLE blindly
7474
return getCause() == null ? RestStatus.SERVICE_UNAVAILABLE : ExceptionsHelper.status(getCause());
7575
}
76-
RestStatus status = shardFailures[0].status();
77-
if (shardFailures.length > 1) {
78-
for (int i = 1; i < shardFailures.length; i++) {
79-
if (shardFailures[i].status().getStatus() >= RestStatus.INTERNAL_SERVER_ERROR.getStatus()) {
80-
status = shardFailures[i].status();
81-
}
76+
RestStatus status = null;
77+
for (ShardSearchFailure shardFailure : shardFailures) {
78+
RestStatus shardStatus = shardFailure.status();
79+
int statusCode = shardStatus.getStatus();
80+
81+
// Return if it's an error that can be retried.
82+
// These currently take precedence over other status code(s).
83+
if (statusCode >= 502 && statusCode <= 504) {
84+
return shardStatus;
85+
} else if (statusCode >= 500) {
86+
status = shardStatus;
8287
}
8388
}
84-
return status;
89+
90+
return status == null ? shardFailures[0].status() : status;
8591
}
8692

8793
public ShardSearchFailure[] shardFailures() {

server/src/test/java/org/elasticsearch/action/search/SearchPhaseExecutionExceptionTests.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.elasticsearch.rest.RestStatus;
2323
import org.elasticsearch.search.SearchShardTarget;
2424
import org.elasticsearch.test.ESTestCase;
25+
import org.elasticsearch.transport.NodeDisconnectedException;
2526
import org.elasticsearch.xcontent.ToXContent;
2627
import org.elasticsearch.xcontent.XContent;
2728
import org.elasticsearch.xcontent.XContentParser;
@@ -31,6 +32,7 @@
3132

3233
import static org.hamcrest.CoreMatchers.hasItem;
3334
import static org.hamcrest.Matchers.hasSize;
35+
import static org.hamcrest.Matchers.is;
3436

3537
public class SearchPhaseExecutionExceptionTests extends ESTestCase {
3638

@@ -168,4 +170,57 @@ public void testPhaseFailureWithSearchShardFailure() {
168170

169171
assertEquals(actual.status(), RestStatus.BAD_REQUEST);
170172
}
173+
174+
public void testOnlyWithCodesThatDoNotRequirePrecedence() {
175+
int pickedIndex = randomIntBetween(0, 1);
176+
177+
// Pick one of these exceptions randomly.
178+
var searchExceptions = new ElasticsearchException[] {
179+
new ElasticsearchException("simulated"),
180+
new NodeDisconnectedException(null, "unused message", "unused action", null) };
181+
182+
// Status codes that map to searchExceptions.
183+
var expectedStatusCodes = new RestStatus[] { RestStatus.INTERNAL_SERVER_ERROR, RestStatus.BAD_GATEWAY };
184+
185+
ShardSearchFailure shardFailure1 = new ShardSearchFailure(
186+
searchExceptions[pickedIndex],
187+
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 1), null)
188+
);
189+
190+
ShardSearchFailure shardFailure2 = new ShardSearchFailure(
191+
searchExceptions[pickedIndex],
192+
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 2), null)
193+
);
194+
195+
SearchPhaseExecutionException ex = new SearchPhaseExecutionException(
196+
"search",
197+
"all shards failed",
198+
new ShardSearchFailure[] { shardFailure1, shardFailure2 }
199+
);
200+
201+
assertThat(ex.status(), is(expectedStatusCodes[pickedIndex]));
202+
}
203+
204+
public void testWithRetriableCodesThatTakePrecedence() {
205+
// Maps to a 500.
206+
ShardSearchFailure shardFailure1 = new ShardSearchFailure(
207+
new ElasticsearchException("simulated"),
208+
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 1), null)
209+
);
210+
211+
// Maps to a 502.
212+
ShardSearchFailure shardFailure2 = new ShardSearchFailure(
213+
new NodeDisconnectedException(null, "unused message", "unused action", null),
214+
new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 2), null)
215+
);
216+
217+
SearchPhaseExecutionException ex = new SearchPhaseExecutionException(
218+
"search",
219+
"all shards failed",
220+
new ShardSearchFailure[] { shardFailure1, shardFailure2 }
221+
);
222+
223+
// The 502 takes precedence over 500.
224+
assertThat(ex.status(), is(RestStatus.BAD_GATEWAY));
225+
}
171226
}

0 commit comments

Comments
 (0)