99
1010import org .elasticsearch .ElasticsearchException ;
1111import org .elasticsearch .ExceptionsHelper ;
12+ import org .elasticsearch .action .support .TransportActions ;
1213import org .elasticsearch .common .util .concurrent .ConcurrentCollections ;
1314import org .elasticsearch .tasks .TaskCancelledException ;
1415import org .elasticsearch .transport .TransportException ;
1516
17+ import java .util .EnumMap ;
18+ import java .util .List ;
19+ import java .util .Map ;
1620import java .util .Queue ;
1721import java .util .concurrent .Semaphore ;
1822
1923/**
2024 * {@code FailureCollector} is responsible for collecting exceptions that occur in the compute engine.
21- * The collected exceptions are categorized into task-cancelled and non-task-cancelled exceptions.
22- * To limit memory usage, this class collects only the first 10 exceptions in each category by default.
23- * When returning the accumulated failure to the caller, this class prefers non-task-cancelled exceptions
24- * over task-cancelled ones as they are more useful for diagnosing issues.
25+ * The collected exceptions are categorized into client (4xx), server (5xx), shard-unavailable errors,
26+ * and cancellation errors. To limit memory usage, this class collects only the first 10 exceptions in
27+ * each category by default. When returning the accumulated failures to the caller, this class prefers
28+ * client (4xx) errors over server (5xx) errors, shard-unavailable errors, and cancellation errors,
29+ * as they are more useful for diagnosing issues.
2530 */
2631public final class FailureCollector {
27- private final Queue <Exception > cancelledExceptions = ConcurrentCollections .newQueue ();
28- private final Semaphore cancelledExceptionsPermits ;
2932
30- private final Queue <Exception > nonCancelledExceptions = ConcurrentCollections .newQueue ();
31- private final Semaphore nonCancelledExceptionsPermits ;
33+ private enum Category {
34+ CLIENT ,
35+ SERVER ,
36+ SHARD_UNAVAILABLE ,
37+ CANCELLATION
38+ }
39+
40+ private static final class CategorizedErrors {
41+ final Queue <Exception > exceptions = ConcurrentCollections .newQueue ();
42+ final Semaphore permits ;
43+
44+ CategorizedErrors (int permits ) {
45+ this .permits = new Semaphore (permits );
46+ }
47+
48+ void maybeCollect (Exception e ) {
49+ if (permits .tryAcquire ()) {
50+ exceptions .add (e );
51+ }
52+ }
53+ }
54+
55+ private final Map <Category , CategorizedErrors > categories ;
56+ private final int maxExceptions ;
3257
3358 private volatile boolean hasFailure = false ;
3459 private Exception finalFailure = null ;
@@ -41,8 +66,11 @@ public FailureCollector(int maxExceptions) {
4166 if (maxExceptions <= 0 ) {
4267 throw new IllegalArgumentException ("maxExceptions must be at least one" );
4368 }
44- this .cancelledExceptionsPermits = new Semaphore (maxExceptions );
45- this .nonCancelledExceptionsPermits = new Semaphore (maxExceptions );
69+ this .maxExceptions = maxExceptions ;
70+ this .categories = new EnumMap <>(Category .class );
71+ for (Category c : Category .values ()) {
72+ this .categories .put (c , new CategorizedErrors (maxExceptions ));
73+ }
4674 }
4775
4876 public static Exception unwrapTransportException (TransportException te ) {
@@ -56,16 +84,24 @@ public static Exception unwrapTransportException(TransportException te) {
5684 }
5785 }
5886
59- public void unwrapAndCollect (Exception e ) {
60- e = e instanceof TransportException te ? unwrapTransportException (te ) : e ;
87+ private static Category getErrorCategory (Exception e ) {
6188 if (ExceptionsHelper .unwrap (e , TaskCancelledException .class ) != null ) {
62- if (nonCancelledExceptions .isEmpty () && cancelledExceptionsPermits .tryAcquire ()) {
63- cancelledExceptions .add (e );
89+ return Category .CANCELLATION ;
90+ } else if (TransportActions .isShardNotAvailableException (e )) {
91+ return Category .SHARD_UNAVAILABLE ;
92+ } else {
93+ final int status = ExceptionsHelper .status (e ).getStatus ();
94+ if (400 <= status && status < 500 ) {
95+ return Category .CLIENT ;
96+ } else {
97+ return Category .SERVER ;
6498 }
65- } else if (nonCancelledExceptionsPermits .tryAcquire ()) {
66- nonCancelledExceptions .add (e );
67- cancelledExceptions .clear ();
6899 }
100+ }
101+
102+ public void unwrapAndCollect (Exception e ) {
103+ e = e instanceof TransportException te ? unwrapTransportException (te ) : e ;
104+ categories .get (getErrorCategory (e )).maybeCollect (e );
69105 hasFailure = true ;
70106 }
71107
@@ -77,8 +113,8 @@ public boolean hasFailure() {
77113 }
78114
79115 /**
80- * Returns the accumulated failure, preferring non-task-cancelled exceptions over task-cancelled ones.
81- * Once this method builds the failure, incoming failures are discarded.
116+ * Returns the accumulated failure, preferring client (4xx) errors over server (5xx), shard-unavailable, and cancellation errors,
117+ * as they are more useful for diagnosing issues. Once this method builds the failure, incoming failures are discarded.
82118 *
83119 * @return the accumulated failure, or {@code null} if no failure has been collected
84120 */
@@ -98,21 +134,19 @@ private Exception buildFailure() {
98134 assert hasFailure ;
99135 assert Thread .holdsLock (this );
100136 Exception first = null ;
101- for (Exception e : nonCancelledExceptions ) {
102- if (first == null ) {
103- first = e ;
104- } else if (first != e ) {
105- first .addSuppressed (e );
137+ int collected = 0 ;
138+ for (Category category : List .of (Category .CLIENT , Category .SERVER , Category .SHARD_UNAVAILABLE , Category .CANCELLATION )) {
139+ if (first != null && category == Category .CANCELLATION ) {
140+ continue ; // do not add cancellation errors if other errors present
106141 }
107- }
108- if (first != null ) {
109- return first ;
110- }
111- for (Exception e : cancelledExceptions ) {
112- if (first == null ) {
113- first = e ;
114- } else if (first != e ) {
115- first .addSuppressed (e );
142+ for (Exception e : categories .get (category ).exceptions ) {
143+ if (++collected <= maxExceptions ) {
144+ if (first == null ) {
145+ first = e ;
146+ } else if (first != e ) {
147+ first .addSuppressed (e );
148+ }
149+ }
116150 }
117151 }
118152 assert first != null ;
0 commit comments