99
1010import org .elasticsearch .ElasticsearchException ;
1111import org .elasticsearch .ExceptionsHelper ;
12- import org .elasticsearch .common . util . concurrent . ConcurrentCollections ;
12+ import org .elasticsearch .action . support . TransportActions ;
1313import org .elasticsearch .tasks .TaskCancelledException ;
1414import org .elasticsearch .transport .TransportException ;
1515
16+ import java .util .EnumMap ;
17+ import java .util .List ;
18+ import java .util .Map ;
1619import java .util .Queue ;
17- import java .util .concurrent .Semaphore ;
20+ import java .util .concurrent .ArrayBlockingQueue ;
1821
1922/**
2023 * {@code FailureCollector} is responsible for collecting exceptions that occur in the compute engine.
21- * The collected exceptions are categorized into task-cancelled and non-task-cancelled exceptions.
22- * To limit memory usage, this class collects only the first 10 exceptions in each category by default.
23- * When returning the accumulated failure to the caller, this class prefers non-task-cancelled exceptions
24- * over task-cancelled ones as they are more useful for diagnosing issues.
24+ * The collected exceptions are categorized into client (4xx), server (5xx), shard-unavailable errors,
25+ * and cancellation errors. To limit memory usage, this class collects only the first 10 exceptions in
26+ * each category by default. When returning the accumulated failures to the caller, this class prefers
27+ * client (4xx) errors over server (5xx) errors, shard-unavailable errors, and cancellation errors,
28+ * as they are more useful for diagnosing issues.
2529 */
2630public final class FailureCollector {
27- private final Queue <Exception > cancelledExceptions = ConcurrentCollections .newQueue ();
28- private final Semaphore cancelledExceptionsPermits ;
2931
30- private final Queue <Exception > nonCancelledExceptions = ConcurrentCollections .newQueue ();
31- private final Semaphore nonCancelledExceptionsPermits ;
32+ private enum Category {
33+ CLIENT ,
34+ SERVER ,
35+ SHARD_UNAVAILABLE ,
36+ CANCELLATION
37+ }
38+
39+ private final Map <Category , Queue <Exception >> categories ;
40+ private final int maxExceptions ;
3241
3342 private volatile boolean hasFailure = false ;
3443 private Exception finalFailure = null ;
@@ -41,8 +50,11 @@ public FailureCollector(int maxExceptions) {
4150 if (maxExceptions <= 0 ) {
4251 throw new IllegalArgumentException ("maxExceptions must be at least one" );
4352 }
44- this .cancelledExceptionsPermits = new Semaphore (maxExceptions );
45- this .nonCancelledExceptionsPermits = new Semaphore (maxExceptions );
53+ this .maxExceptions = maxExceptions ;
54+ this .categories = new EnumMap <>(Category .class );
55+ for (Category c : Category .values ()) {
56+ this .categories .put (c , new ArrayBlockingQueue <>(maxExceptions ));
57+ }
4658 }
4759
4860 public static Exception unwrapTransportException (TransportException te ) {
@@ -56,16 +68,24 @@ public static Exception unwrapTransportException(TransportException te) {
5668 }
5769 }
5870
59- public void unwrapAndCollect (Exception e ) {
60- e = e instanceof TransportException te ? unwrapTransportException (te ) : e ;
71+ private static Category getErrorCategory (Exception e ) {
6172 if (ExceptionsHelper .unwrap (e , TaskCancelledException .class ) != null ) {
62- if (nonCancelledExceptions .isEmpty () && cancelledExceptionsPermits .tryAcquire ()) {
63- cancelledExceptions .add (e );
73+ return Category .CANCELLATION ;
74+ } else if (TransportActions .isShardNotAvailableException (e )) {
75+ return Category .SHARD_UNAVAILABLE ;
76+ } else {
77+ final int status = ExceptionsHelper .status (e ).getStatus ();
78+ if (400 <= status && status < 500 ) {
79+ return Category .CLIENT ;
80+ } else {
81+ return Category .SERVER ;
6482 }
65- } else if (nonCancelledExceptionsPermits .tryAcquire ()) {
66- nonCancelledExceptions .add (e );
67- cancelledExceptions .clear ();
6883 }
84+ }
85+
86+ public void unwrapAndCollect (Exception e ) {
87+ e = e instanceof TransportException te ? unwrapTransportException (te ) : e ;
88+ categories .get (getErrorCategory (e )).offer (e );
6989 hasFailure = true ;
7090 }
7191
@@ -77,8 +97,8 @@ public boolean hasFailure() {
7797 }
7898
7999 /**
80- * Returns the accumulated failure, preferring non-task-cancelled exceptions over task-cancelled ones.
81- * Once this method builds the failure, incoming failures are discarded.
100+ * Returns the accumulated failure, preferring client (4xx) errors over server (5xx), shard-unavailable, and cancellation errors,
101+ * as they are more useful for diagnosing issues. Once this method builds the failure, incoming failures are discarded.
82102 *
83103 * @return the accumulated failure, or {@code null} if no failure has been collected
84104 */
@@ -98,21 +118,19 @@ private Exception buildFailure() {
98118 assert hasFailure ;
99119 assert Thread .holdsLock (this );
100120 Exception first = null ;
101- for (Exception e : nonCancelledExceptions ) {
102- if (first == null ) {
103- first = e ;
104- } else if (first != e ) {
105- first .addSuppressed (e );
121+ int collected = 0 ;
122+ for (Category category : List .of (Category .CLIENT , Category .SERVER , Category .SHARD_UNAVAILABLE , Category .CANCELLATION )) {
123+ if (first != null && category == Category .CANCELLATION ) {
124+ continue ; // do not add cancellation errors if other errors present
106125 }
107- }
108- if (first != null ) {
109- return first ;
110- }
111- for (Exception e : cancelledExceptions ) {
112- if (first == null ) {
113- first = e ;
114- } else if (first != e ) {
115- first .addSuppressed (e );
126+ for (Exception e : categories .get (category )) {
127+ if (++collected <= maxExceptions ) {
128+ if (first == null ) {
129+ first = e ;
130+ } else if (first != e ) {
131+ first .addSuppressed (e );
132+ }
133+ }
116134 }
117135 }
118136 assert first != null ;
0 commit comments