Skip to content

Commit 7460156

Browse files
authored
Add application err category (#2485)
* update protos * add ApplicationErrorCategory enum * Add category field to ApplicationFailure * add category proto converter logic * add logging/metrics checks for benign application failures * fixes, added test for workflow failure metric * fixes for activity failures, added test for activity failures * address PR review * only check immediate failure * define code enum, convert to/from proto representation * cleanup
1 parent 2807771 commit 7460156

File tree

11 files changed

+618
-22
lines changed

11 files changed

+618
-22
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright (C) 2022 Temporal Technologies, Inc. All Rights Reserved.
3+
*
4+
* Copyright (C) 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
*
6+
* Modifications copyright (C) 2017 Uber Technologies, Inc.
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this material except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
package io.temporal.failure;
22+
23+
/**
 * Category attached to an application failure, for example to tell benign errors apart from
 * others.
 *
 * @see io.temporal.api.enums.v1.ApplicationErrorCategory
 */
public enum ApplicationErrorCategory {
  /** No category was specified. */
  UNSPECIFIED,

  /** Expected application error with little/no severity. */
  BENIGN
}

temporal-sdk/src/main/java/io/temporal/failure/ApplicationFailure.java

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,15 @@
5151
* <li>nonRetryable is set to false
5252
* <li>details are set to null
5353
* <li>stack trace is copied from the original exception
54+
* <li>category is set to ApplicationErrorCategory.UNSPECIFIED
5455
* </ul>
5556
*/
5657
public final class ApplicationFailure extends TemporalFailure {
5758
private final String type;
5859
private final Values details;
5960
private boolean nonRetryable;
6061
private Duration nextRetryDelay;
62+
private final ApplicationErrorCategory category;
6163

6264
/**
6365
* New ApplicationFailure with {@link #isNonRetryable()} flag set to false.
@@ -92,7 +94,14 @@ public static ApplicationFailure newFailure(String message, String type, Object.
9294
*/
9395
public static ApplicationFailure newFailureWithCause(
9496
String message, String type, @Nullable Throwable cause, Object... details) {
95-
return new ApplicationFailure(message, type, false, new EncodedValues(details), cause, null);
97+
return new ApplicationFailure(
98+
message,
99+
type,
100+
false,
101+
new EncodedValues(details),
102+
cause,
103+
null,
104+
ApplicationErrorCategory.UNSPECIFIED);
96105
}
97106

98107
/**
@@ -118,7 +127,13 @@ public static ApplicationFailure newFailureWithCauseAndDelay(
118127
Duration nextRetryDelay,
119128
Object... details) {
120129
return new ApplicationFailure(
121-
message, type, false, new EncodedValues(details), cause, nextRetryDelay);
130+
message,
131+
type,
132+
false,
133+
new EncodedValues(details),
134+
cause,
135+
nextRetryDelay,
136+
ApplicationErrorCategory.UNSPECIFIED);
122137
}
123138

124139
/**
@@ -153,7 +168,40 @@ public static ApplicationFailure newNonRetryableFailure(
153168
*/
154169
public static ApplicationFailure newNonRetryableFailureWithCause(
155170
String message, String type, @Nullable Throwable cause, Object... details) {
156-
return new ApplicationFailure(message, type, true, new EncodedValues(details), cause, null);
171+
return new ApplicationFailure(
172+
message,
173+
type,
174+
true,
175+
new EncodedValues(details),
176+
cause,
177+
null,
178+
ApplicationErrorCategory.UNSPECIFIED);
179+
}
180+
181+
/**
182+
* New ApplicationFailure with a specified category and {@link #isNonRetryable()} flag set to
183+
* false.
184+
*
185+
* <p>Note that this exception still may not be retried by the service if its type is included in
186+
* the doNotRetry property of the correspondent retry policy.
187+
*
188+
* @param message optional error message
189+
* @param type error type
190+
* @param category the category of the application failure.
191+
* @param cause failure cause. Each element of the cause chain will be converted to
192+
* ApplicationFailure for network transmission across network if it doesn't extend {@link
193+
* TemporalFailure}
194+
* @param details optional details about the failure. They are serialized using the same approach
195+
* as arguments and results.
196+
*/
197+
public static ApplicationFailure newFailureWithCategory(
198+
String message,
199+
String type,
200+
ApplicationErrorCategory category,
201+
@Nullable Throwable cause,
202+
Object... details) {
203+
return new ApplicationFailure(
204+
message, type, false, new EncodedValues(details), cause, null, category);
157205
}
158206

159207
static ApplicationFailure newFromValues(
@@ -162,8 +210,10 @@ static ApplicationFailure newFromValues(
162210
boolean nonRetryable,
163211
Values details,
164212
Throwable cause,
165-
Duration nextRetryDelay) {
166-
return new ApplicationFailure(message, type, nonRetryable, details, cause, nextRetryDelay);
213+
Duration nextRetryDelay,
214+
ApplicationErrorCategory category) {
215+
return new ApplicationFailure(
216+
message, type, nonRetryable, details, cause, nextRetryDelay, category);
167217
}
168218

169219
ApplicationFailure(
@@ -172,12 +222,14 @@ static ApplicationFailure newFromValues(
172222
boolean nonRetryable,
173223
Values details,
174224
Throwable cause,
175-
Duration nextRetryDelay) {
225+
Duration nextRetryDelay,
226+
ApplicationErrorCategory category) {
176227
super(getMessage(message, Objects.requireNonNull(type), nonRetryable), message, cause);
177228
this.type = type;
178229
this.details = details;
179230
this.nonRetryable = nonRetryable;
180231
this.nextRetryDelay = nextRetryDelay;
232+
this.category = category;
181233
}
182234

183235
public String getType() {
@@ -210,6 +262,10 @@ public void setNextRetryDelay(Duration nextRetryDelay) {
210262
this.nextRetryDelay = nextRetryDelay;
211263
}
212264

265+
public ApplicationErrorCategory getApplicationErrorCategory() {
266+
return category;
267+
}
268+
213269
private static String getMessage(String message, String type, boolean nonRetryable) {
214270
return (Strings.isNullOrEmpty(message) ? "" : "message='" + message + "', ")
215271
+ "type='"

temporal-sdk/src/main/java/io/temporal/failure/DefaultFailureConverter.java

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import io.temporal.common.converter.EncodedValues;
3535
import io.temporal.common.converter.FailureConverter;
3636
import io.temporal.internal.activity.ActivityTaskHandlerImpl;
37+
import io.temporal.internal.common.FailureUtils;
3738
import io.temporal.internal.common.ProtobufTimeUtils;
3839
import io.temporal.internal.sync.POJOWorkflowImplementationFactory;
3940
import io.temporal.serviceclient.CheckedExceptionWrapper;
@@ -106,7 +107,8 @@ private RuntimeException failureToExceptionImpl(Failure failure, DataConverter d
106107
cause,
107108
info.hasNextRetryDelay()
108109
? ProtobufTimeUtils.toJavaDuration(info.getNextRetryDelay())
109-
: null);
110+
: null,
111+
FailureUtils.categoryFromProto(info.getCategory()));
110112
}
111113
case TIMEOUT_FAILURE_INFO:
112114
{
@@ -146,13 +148,14 @@ private RuntimeException failureToExceptionImpl(Failure failure, DataConverter d
146148
info.hasLastHeartbeatDetails()
147149
? Optional.of(info.getLastHeartbeatDetails())
148150
: Optional.empty();
149-
return new ApplicationFailure(
151+
return ApplicationFailure.newFromValues(
150152
failure.getMessage(),
151153
"ResetWorkflow",
152154
false,
153155
new EncodedValues(details, dataConverter),
154156
cause,
155-
null);
157+
null,
158+
ApplicationErrorCategory.UNSPECIFIED);
156159
}
157160
case ACTIVITY_FAILURE_INFO:
158161
{
@@ -214,7 +217,8 @@ private RuntimeException failureToExceptionImpl(Failure failure, DataConverter d
214217
false,
215218
new EncodedValues(Optional.empty(), dataConverter),
216219
cause,
217-
null);
220+
null,
221+
ApplicationErrorCategory.UNSPECIFIED);
218222
}
219223
}
220224

@@ -260,7 +264,8 @@ private Failure exceptionToFailure(Throwable throwable) {
260264
ApplicationFailureInfo.Builder info =
261265
ApplicationFailureInfo.newBuilder()
262266
.setType(ae.getType())
263-
.setNonRetryable(ae.isNonRetryable());
267+
.setNonRetryable(ae.isNonRetryable())
268+
.setCategory(FailureUtils.categoryToProto(ae.getApplicationErrorCategory()));
264269
Optional<Payloads> details = ((EncodedValues) ae.getDetails()).toPayloads();
265270
if (details.isPresent()) {
266271
info.setDetails(details.get());
@@ -352,7 +357,10 @@ private Failure exceptionToFailure(Throwable throwable) {
352357
ApplicationFailureInfo.Builder info =
353358
ApplicationFailureInfo.newBuilder()
354359
.setType(throwable.getClass().getName())
355-
.setNonRetryable(false);
360+
.setNonRetryable(false)
361+
.setCategory(
362+
io.temporal.api.enums.v1.ApplicationErrorCategory
363+
.APPLICATION_ERROR_CATEGORY_UNSPECIFIED);
356364
failure.setApplicationFailureInfo(info);
357365
}
358366
return failure.build();

temporal-sdk/src/main/java/io/temporal/internal/activity/ActivityTaskExecutors.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import io.temporal.common.interceptors.ActivityInboundCallsInterceptor.ActivityOutput;
3737
import io.temporal.common.interceptors.Header;
3838
import io.temporal.common.interceptors.WorkerInterceptor;
39+
import io.temporal.internal.common.FailureUtils;
3940
import io.temporal.internal.worker.ActivityTaskHandler;
4041
import io.temporal.payload.context.ActivitySerializationContext;
4142
import io.temporal.serviceclient.CheckedExceptionWrapper;
@@ -122,6 +123,14 @@ public ActivityTaskHandler.Result execute(ActivityInfoInternal info, Scope metri
122123
info.getActivityId(),
123124
info.getActivityType(),
124125
info.getAttempt());
126+
} else if (FailureUtils.isBenignApplicationFailure(ex)) {
127+
log.debug(
128+
"{} failure. ActivityId={}, activityType={}, attempt={}",
129+
local ? "Local activity" : "Activity",
130+
info.getActivityId(),
131+
info.getActivityType(),
132+
info.getAttempt(),
133+
ex);
125134
} else {
126135
log.warn(
127136
"{} failure. ActivityId={}, activityType={}, attempt={}",

temporal-sdk/src/main/java/io/temporal/internal/activity/ActivityTaskHandlerImpl.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import io.temporal.common.metadata.POJOActivityImplMetadata;
3737
import io.temporal.common.metadata.POJOActivityMethodMetadata;
3838
import io.temporal.internal.activity.ActivityTaskExecutors.ActivityTaskExecutor;
39+
import io.temporal.internal.common.FailureUtils;
3940
import io.temporal.internal.common.env.ReflectionUtils;
4041
import io.temporal.internal.worker.ActivityTask;
4142
import io.temporal.internal.worker.ActivityTaskHandler;
@@ -209,11 +210,13 @@ static ActivityTaskHandler.Result mapToActivityFailure(
209210
Scope ms =
210211
metricsScope.tagged(
211212
ImmutableMap.of(MetricsTag.EXCEPTION, exception.getClass().getSimpleName()));
212-
if (isLocalActivity) {
213-
ms.counter(MetricsType.LOCAL_ACTIVITY_EXEC_FAILED_COUNTER).inc(1);
214-
ms.counter(MetricsType.LOCAL_ACTIVITY_FAILED_COUNTER).inc(1);
215-
} else {
216-
ms.counter(MetricsType.ACTIVITY_EXEC_FAILED_COUNTER).inc(1);
213+
if (!FailureUtils.isBenignApplicationFailure(exception)) {
214+
if (isLocalActivity) {
215+
ms.counter(MetricsType.LOCAL_ACTIVITY_EXEC_FAILED_COUNTER).inc(1);
216+
ms.counter(MetricsType.LOCAL_ACTIVITY_FAILED_COUNTER).inc(1);
217+
} else {
218+
ms.counter(MetricsType.ACTIVITY_EXEC_FAILED_COUNTER).inc(1);
219+
}
217220
}
218221
Failure failure = dataConverter.exceptionToFailure(exception);
219222
RespondActivityTaskFailedRequest.Builder result =
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Copyright (C) 2022 Temporal Technologies, Inc. All Rights Reserved.
3+
*
4+
* Copyright (C) 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
*
6+
* Modifications copyright (C) 2017 Uber Technologies, Inc.
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this material except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
package io.temporal.internal.common;
22+
23+
import io.temporal.api.failure.v1.Failure;
24+
import io.temporal.failure.ApplicationErrorCategory;
25+
import io.temporal.failure.ApplicationFailure;
26+
import javax.annotation.Nullable;
27+
28+
public class FailureUtils {
29+
private FailureUtils() {}
30+
31+
public static boolean isBenignApplicationFailure(@Nullable Throwable t) {
32+
if (t instanceof ApplicationFailure
33+
&& ((ApplicationFailure) t).getApplicationErrorCategory()
34+
== ApplicationErrorCategory.BENIGN) {
35+
return true;
36+
}
37+
return false;
38+
}
39+
40+
public static boolean isBenignApplicationFailure(@Nullable Failure failure) {
41+
if (failure != null
42+
&& failure.getApplicationFailureInfo() != null
43+
&& FailureUtils.categoryFromProto(failure.getApplicationFailureInfo().getCategory())
44+
== ApplicationErrorCategory.BENIGN) {
45+
return true;
46+
}
47+
return false;
48+
}
49+
50+
public static ApplicationErrorCategory categoryFromProto(
51+
io.temporal.api.enums.v1.ApplicationErrorCategory protoCategory) {
52+
if (protoCategory == null) {
53+
return ApplicationErrorCategory.UNSPECIFIED;
54+
}
55+
switch (protoCategory) {
56+
case APPLICATION_ERROR_CATEGORY_BENIGN:
57+
return ApplicationErrorCategory.BENIGN;
58+
case APPLICATION_ERROR_CATEGORY_UNSPECIFIED:
59+
case UNRECOGNIZED:
60+
default:
61+
// Fallback unrecognized or unspecified proto values as UNSPECIFIED
62+
return ApplicationErrorCategory.UNSPECIFIED;
63+
}
64+
}
65+
66+
public static io.temporal.api.enums.v1.ApplicationErrorCategory categoryToProto(
67+
io.temporal.failure.ApplicationErrorCategory category) {
68+
switch (category) {
69+
case BENIGN:
70+
return io.temporal.api.enums.v1.ApplicationErrorCategory.APPLICATION_ERROR_CATEGORY_BENIGN;
71+
case UNSPECIFIED:
72+
default:
73+
// Fallback to UNSPECIFIED for unknown values
74+
return io.temporal.api.enums.v1.ApplicationErrorCategory
75+
.APPLICATION_ERROR_CATEGORY_UNSPECIFIED;
76+
}
77+
}
78+
}

temporal-sdk/src/main/java/io/temporal/internal/replay/ReplayWorkflowExecutor.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import io.temporal.api.update.v1.Input;
3535
import io.temporal.api.update.v1.Request;
3636
import io.temporal.failure.CanceledFailure;
37+
import io.temporal.internal.common.FailureUtils;
3738
import io.temporal.internal.common.ProtobufTimeUtils;
3839
import io.temporal.internal.common.UpdateMessage;
3940
import io.temporal.internal.statemachines.WorkflowStateMachines;
@@ -153,7 +154,9 @@ private void completeWorkflow(@Nullable WorkflowExecutionException failure) {
153154
metricsScope.counter(MetricsType.WORKFLOW_CANCELED_COUNTER).inc(1);
154155
} else if (failure != null) {
155156
workflowStateMachines.failWorkflow(failure.getFailure());
156-
metricsScope.counter(MetricsType.WORKFLOW_FAILED_COUNTER).inc(1);
157+
if (!FailureUtils.isBenignApplicationFailure(failure.getFailure())) {
158+
metricsScope.counter(MetricsType.WORKFLOW_FAILED_COUNTER).inc(1);
159+
}
157160
} else {
158161
ContinueAsNewWorkflowExecutionCommandAttributes attributes =
159162
context.getContinueAsNewOnCompletion();

0 commit comments

Comments
 (0)