Skip to content

Commit 849ae09

Browse files
committed
Summarize cause when replacing no shard available exception in search
When a shard fails during search, the error handling logic looks at the root cause and in case it is in the availability bucket, the exception is replaced with a more generic NoShardAvailableException without a shard_id and cause, to prevent spamming the coord node with many of the same errors. This is quite lossy, and makes it very hard to track actual problems if they arise, especially as the message used for the new exception does not include the cause in any way. This commit reuses the toString of both the exception and its cause, which is going to concatenate their class name and message in a single reason field.
1 parent 99897b1 commit 849ae09

File tree

4 files changed

+26
-15
lines changed

4 files changed

+26
-15
lines changed

server/src/main/java/org/elasticsearch/action/NoShardAvailableActionException.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ public final class NoShardAvailableActionException extends ElasticsearchExceptio
2323
// It isn't necessary to serialize this field over the wire as the empty stack trace is serialized instead.
2424
private final boolean onShardFailureWrapper;
2525

26-
public static NoShardAvailableActionException forOnShardFailureWrapper(String msg) {
27-
return new NoShardAvailableActionException(null, msg, null, true);
26+
public static NoShardAvailableActionException forOnShardFailureWrapper(Exception e, Throwable cause) {
27+
return new NoShardAvailableActionException(null, e + ", cause: " + cause, null, true);
2828
}
2929

3030
public NoShardAvailableActionException(ShardId shardId) {

server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -442,10 +442,11 @@ protected void onShardGroupFailure(int shardIndex, SearchShardTarget shardTarget
442442
* @param e the failure reason
443443
*/
444444
void onShardFailure(final int shardIndex, SearchShardTarget shardTarget, Exception e) {
445-
if (TransportActions.isShardNotAvailableException(e)) {
445+
Throwable cause = ExceptionsHelper.unwrapCause(e);
446+
if (TransportActions.isShardNotAvailableException(cause)) {
446447
// Groups shard not available exceptions under a generic exception that returns a SERVICE_UNAVAILABLE(503)
447448
// temporary error.
448-
e = NoShardAvailableActionException.forOnShardFailureWrapper(e.getMessage());
449+
e = NoShardAvailableActionException.forOnShardFailureWrapper(e, cause);
449450
}
450451
// we don't aggregate shard on failures due to the internal cancellation,
451452
// but do keep the header counts right

server/src/test/java/org/elasticsearch/action/search/ShardSearchFailureTests.java

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
package org.elasticsearch.action.search;
1111

12+
import org.apache.lucene.store.AlreadyClosedException;
1213
import org.elasticsearch.action.NoShardAvailableActionException;
1314
import org.elasticsearch.cluster.metadata.IndexMetadata;
1415
import org.elasticsearch.common.ParsingException;
@@ -116,17 +117,22 @@ public void testToXContent() throws IOException {
116117
public void testToXContentForNoShardAvailable() throws IOException {
117118
ShardId shardId = new ShardId(new Index("indexName", "indexUuid"), 123);
118119
ShardSearchFailure failure = new ShardSearchFailure(
119-
NoShardAvailableActionException.forOnShardFailureWrapper("shard unassigned"),
120+
NoShardAvailableActionException.forOnShardFailureWrapper(new AlreadyClosedException("shard unassigned"), null),
120121
new SearchShardTarget("nodeId", shardId, null)
121122
);
122123
BytesReference xContent = toXContent(failure, XContentType.JSON, randomBoolean());
123-
assertEquals(XContentHelper.stripWhitespace("""
124-
{
125-
"shard": 123,
126-
"index": "indexName",
127-
"node": "nodeId",
128-
"reason":{"type":"no_shard_available_action_exception","reason":"shard unassigned"}
129-
}"""), xContent.utf8ToString());
124+
assertEquals(
125+
XContentHelper.stripWhitespace(
126+
"""
127+
{
128+
"shard": 123,
129+
"index": "indexName",
130+
"node": "nodeId",
131+
"reason":{"type":"no_shard_available_action_exception","reason":"org.apache.lucene.store.AlreadyClosedException: shard unassigned, cause: null"}
132+
}"""
133+
),
134+
xContent.utf8ToString()
135+
);
130136
}
131137

132138
public void testToXContentWithClusterAlias() throws IOException {
@@ -156,7 +162,10 @@ public void testSerialization() throws IOException {
156162
testItem = createTestItem(randomAlphaOfLength(12));
157163
} else {
158164
SearchShardTarget target = randomShardTarget(randomAlphaOfLength(12));
159-
testItem = new ShardSearchFailure(NoShardAvailableActionException.forOnShardFailureWrapper("unavailable"), target);
165+
testItem = new ShardSearchFailure(
166+
NoShardAvailableActionException.forOnShardFailureWrapper(new AlreadyClosedException("unavailable"), null),
167+
target
168+
);
160169
}
161170
ShardSearchFailure deserializedInstance = copyWriteable(
162171
testItem,

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plugin/DataNodeRequestSender.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,9 @@ interface NodeListener {
202202

203203
private static Exception unwrapFailure(Exception e) {
204204
e = e instanceof TransportException te ? FailureCollector.unwrapTransportException(te) : e;
205-
if (TransportActions.isShardNotAvailableException(e)) {
206-
return NoShardAvailableActionException.forOnShardFailureWrapper(e.getMessage());
205+
Throwable cause = ExceptionsHelper.unwrapCause(e);
206+
if (TransportActions.isShardNotAvailableException(cause)) {
207+
return NoShardAvailableActionException.forOnShardFailureWrapper(e, cause);
207208
} else {
208209
return e;
209210
}

0 commit comments

Comments
 (0)