Skip to content

Commit 1f0bb7f

Browse files
authored
feat: add bulk exit cause list reporting for compute stages (#653)
1 parent 2fe69d0 commit 1f0bb7f

File tree

8 files changed

+259
-171
lines changed

8 files changed

+259
-171
lines changed

src/main/java/com/iexec/worker/compute/ComputeController.java

Lines changed: 22 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
package com.iexec.worker.compute;
1818

1919

20+
import com.iexec.common.replicate.ReplicateStatusCause;
2021
import com.iexec.common.result.ComputedFile;
2122
import com.iexec.common.worker.api.ExitMessage;
2223
import com.iexec.worker.chain.WorkerpoolAuthorizationService;
@@ -25,6 +26,7 @@
2526
import org.springframework.http.ResponseEntity;
2627
import org.springframework.web.bind.annotation.*;
2728

29+
import java.util.List;
2830
import java.util.NoSuchElementException;
2931

3032
import static org.springframework.http.ResponseEntity.ok;
@@ -44,12 +46,27 @@ public ComputeController(final ComputeExitCauseService computeStageExitService,
4446
this.workerpoolAuthorizationService = workerpoolAuthorizationService;
4547
}
4648

49+
/**
50+
* @deprecated Use {@link #sendExitCausesForGivenComputeStage(String, ComputeStage, String, List)}
51+
* for bulk exit cause reporting instead
52+
*/
53+
@Deprecated(forRemoval = true) // TODO: Add version when releasing next one
4754
@PostMapping("/compute/{stage}/{chainTaskId}/exit")
4855
public ResponseEntity<Void> sendExitCauseForGivenComputeStage(
4956
@RequestHeader("Authorization") String authorization,
5057
@PathVariable ComputeStage stage,
5158
@PathVariable String chainTaskId,
5259
@RequestBody ExitMessage exitMessage) {
60+
List<ReplicateStatusCause> causes = exitMessage != null && exitMessage.cause() != null ? List.of(exitMessage.cause()) : List.of();
61+
return sendExitCausesForGivenComputeStage(authorization, stage, chainTaskId, causes);
62+
}
63+
64+
@PostMapping("/compute/{stage}/{chainTaskId}/exit-causes")
65+
public ResponseEntity<Void> sendExitCausesForGivenComputeStage(
66+
@RequestHeader("Authorization") String authorization,
67+
@PathVariable ComputeStage stage,
68+
@PathVariable String chainTaskId,
69+
@RequestBody List<ReplicateStatusCause> causes) {
5370
try {
5471
if (!workerpoolAuthorizationService.isSignedWithEnclaveChallenge(chainTaskId, authorization)) {
5572
return ResponseEntity
@@ -62,14 +79,15 @@ public ResponseEntity<Void> sendExitCauseForGivenComputeStage(
6279
.build();
6380
}
6481

65-
if (exitMessage.cause() == null) {
82+
if (causes == null || causes.isEmpty()) {
6683
return ResponseEntity
6784
.status(HttpStatus.BAD_REQUEST.value())
6885
.build();
6986
}
70-
if (!computeStageExitService.setExitCause(stage,
71-
chainTaskId,
72-
exitMessage.cause())) {
87+
88+
final boolean stored = computeStageExitService.setExitCausesForGivenComputeStage(stage, chainTaskId, causes);
89+
90+
if (!stored) {
7391
return ResponseEntity
7492
.status(HttpStatus.ALREADY_REPORTED.value())
7593
.build();
@@ -103,5 +121,4 @@ public ResponseEntity<String> sendComputedFileForTee(@RequestHeader("Authorizati
103121
}
104122
return ok(chainTaskId);
105123
}
106-
107124
}

src/main/java/com/iexec/worker/compute/ComputeExitCauseService.java

Lines changed: 40 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020 IEXEC BLOCKCHAIN TECH
2+
* Copyright 2020-2025 IEXEC BLOCKCHAIN TECH
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -21,73 +21,65 @@
2121
import org.springframework.stereotype.Service;
2222

2323
import java.util.HashMap;
24+
import java.util.List;
2425

2526

2627
@Slf4j
2728
@Service
2829
public class ComputeExitCauseService {
2930

30-
private final HashMap<String, ReplicateStatusCause> exitCauseMap = new HashMap<>();
31+
private final HashMap<String, List<ReplicateStatusCause>> exitCauseMap = new HashMap<>();
3132

3233
/**
33-
* Report failure exit cause from pre-compute or post-compute enclave.
34+
* Report failure exit causes from pre-compute or post-compute enclave.
35+
* Exit causes are accepted only once per compute stage and task until they are retrieved and pruned; the existence check and the insertion are separate, unsynchronized steps.
3436
*
3537
* @param computeStage pre-compute or post-compute-stage label
3638
* @param chainTaskId task ID
37-
* @param exitCause root cause of the failure
38-
* @return true if exit cause is reported
39+
* @param causes list of root causes of the failure
40+
* @return true if exit causes are reported, false if already reported
3941
*/
40-
boolean setExitCause(ComputeStage computeStage,
41-
String chainTaskId,
42-
ReplicateStatusCause exitCause) {
43-
String key = buildKey(computeStage, chainTaskId);
42+
boolean setExitCausesForGivenComputeStage(final ComputeStage computeStage,
43+
final String chainTaskId,
44+
final List<ReplicateStatusCause> causes) {
45+
final String key = buildKey(computeStage, chainTaskId);
46+
4447
if (exitCauseMap.containsKey(key)) {
45-
log.info("Cannot set exit cause since already set " +
46-
"[computeStage:{}, chainTaskId:{}, exitCause:{}]",
47-
computeStage, chainTaskId, exitCause);
48+
log.warn("Exit causes already reported for compute stage [computeStage:{}, chainTaskId:{}]",
49+
computeStage, chainTaskId);
4850
return false;
4951
}
50-
exitCauseMap.put(key, exitCause);
51-
log.info("Added exit cause [computeStage:{}, chainTaskId:{}, exitCause:{}]",
52-
computeStage, chainTaskId, exitCause);
53-
return true;
54-
}
55-
56-
/**
57-
* Get exit cause for pre-compute or post-compute enclave.
58-
*
59-
* @param chainTaskId task ID
60-
* @return exit cause
61-
*/
62-
ReplicateStatusCause getReplicateStatusCause(ComputeStage computeStage,
63-
String chainTaskId) {
64-
return exitCauseMap.get(buildKey(computeStage, chainTaskId));
65-
}
6652

67-
/**
68-
* Get pre-compute exit cause.
69-
*
70-
* @param chainTaskId task ID
71-
* @return exit cause
72-
*/
73-
public ReplicateStatusCause getPreComputeExitCauseAndPrune(String chainTaskId) {
74-
ComputeStage stage = ComputeStage.PRE;
75-
ReplicateStatusCause cause = getReplicateStatusCause(stage, chainTaskId);
76-
pruneExitCause(stage, chainTaskId);
77-
return cause != null ? cause : ReplicateStatusCause.PRE_COMPUTE_FAILED_UNKNOWN_ISSUE;
53+
exitCauseMap.put(key, List.copyOf(causes));
54+
log.info("Added exit causes [computeStage:{}, chainTaskId:{}, causeCount:{}]",
55+
computeStage, chainTaskId, causes.size());
56+
return true;
7857
}
7958

8059
/**
81-
* Get post-compute exit cause.
60+
* Get exit causes for a specific compute stage and prune them.
61+
* Returns a single-element list containing {@code fallbackCause} when no causes were reported.
8262
*
83-
* @param chainTaskId task ID
84-
* @return exit cause
63+
* @param computeStage compute stage
64+
* @param chainTaskId task ID
65+
* @param fallbackCause default cause to return if no specific causes are found
66+
* @return the reported exit causes, or a single-element list containing {@code fallbackCause} if none were found
8567
*/
86-
public ReplicateStatusCause getPostComputeExitCauseAndPrune(String chainTaskId) {
87-
ComputeStage stage = ComputeStage.POST;
88-
ReplicateStatusCause cause = getReplicateStatusCause(stage, chainTaskId);
89-
pruneExitCause(stage, chainTaskId);
90-
return cause != null ? cause : ReplicateStatusCause.POST_COMPUTE_FAILED_UNKNOWN_ISSUE;
68+
public List<ReplicateStatusCause> getExitCausesAndPruneForGivenComputeStage(
69+
final ComputeStage computeStage,
70+
final String chainTaskId,
71+
final ReplicateStatusCause fallbackCause) {
72+
final String key = buildKey(computeStage, chainTaskId);
73+
final List<ReplicateStatusCause> causes = exitCauseMap.remove(key);
74+
if (causes != null) {
75+
log.info("Retrieved and pruned exit causes [computeStage:{}, chainTaskId:{}, causeCount:{}]",
76+
computeStage, chainTaskId, causes.size());
77+
return causes;
78+
} else {
79+
log.info("No exit causes found, returning fallback cause [computeStage:{}, chainTaskId:{}]",
80+
computeStage, chainTaskId);
81+
return List.of(fallbackCause);
82+
}
9183
}
9284

9385
/**
@@ -100,15 +92,4 @@ public ReplicateStatusCause getPostComputeExitCauseAndPrune(String chainTaskId)
10092
private String buildKey(ComputeStage prefix, String chainTaskId) {
10193
return prefix + "_" + chainTaskId;
10294
}
103-
104-
/**
105-
* Prune exit cause.
106-
*
107-
* @param computeStage compute stage
108-
* @param chainTaskId task ID
109-
*/
110-
private void pruneExitCause(ComputeStage computeStage, String chainTaskId) {
111-
exitCauseMap.remove(buildKey(computeStage, chainTaskId));
112-
}
113-
114-
}
95+
}

src/main/java/com/iexec/worker/compute/post/PostComputeService.java

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@
3030
import com.iexec.sms.api.config.TeeAppProperties;
3131
import com.iexec.sms.api.config.TeeServicesProperties;
3232
import com.iexec.worker.compute.ComputeExitCauseService;
33+
import com.iexec.worker.compute.ComputeStage;
3334
import com.iexec.worker.config.WorkerConfigurationService;
3435
import com.iexec.worker.docker.DockerService;
3536
import com.iexec.worker.metric.ComputeDurationsService;
@@ -237,7 +238,8 @@ private ReplicateStatusCause getExitCause(String chainTaskId, Integer exitCode)
237238
if (exitCode != null && exitCode != 0) {
238239
switch (exitCode) {
239240
case 1:
240-
cause = computeExitCauseService.getPostComputeExitCauseAndPrune(chainTaskId);
241+
// Use first cause from bulk processing for now
242+
cause = computeExitCauseService.getExitCausesAndPruneForGivenComputeStage(ComputeStage.POST, chainTaskId, POST_COMPUTE_FAILED_UNKNOWN_ISSUE).get(0);
241243
break;
242244
case 2:
243245
cause = ReplicateStatusCause.POST_COMPUTE_EXIT_REPORTING_FAILED;

src/main/java/com/iexec/worker/compute/pre/PreComputeService.java

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020-2024 IEXEC BLOCKCHAIN TECH
2+
* Copyright 2020-2025 IEXEC BLOCKCHAIN TECH
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -30,6 +30,7 @@
3030
import com.iexec.sms.api.config.TeeAppProperties;
3131
import com.iexec.sms.api.config.TeeServicesProperties;
3232
import com.iexec.worker.compute.ComputeExitCauseService;
33+
import com.iexec.worker.compute.ComputeStage;
3334
import com.iexec.worker.config.WorkerConfigurationService;
3435
import com.iexec.worker.docker.DockerService;
3536
import com.iexec.worker.metric.ComputeDurationsService;
@@ -173,7 +174,8 @@ private ReplicateStatusCause getExitCause(String chainTaskId, Integer exitCode)
173174
} else {
174175
switch (exitCode) {
175176
case 1:
176-
cause = computeExitCauseService.getPreComputeExitCauseAndPrune(chainTaskId);
177+
// Use first cause from bulk processing for now
178+
cause = computeExitCauseService.getExitCausesAndPruneForGivenComputeStage(ComputeStage.PRE, chainTaskId, PRE_COMPUTE_FAILED_UNKNOWN_ISSUE).get(0);
177179
break;
178180
case 2:
179181
cause = ReplicateStatusCause.PRE_COMPUTE_EXIT_REPORTING_FAILED;

0 commit comments

Comments
 (0)