Skip to content

Commit 97327ef

Browse files
committed
feat: add bulk exit cause reporting for compute stages
1 parent 2fe69d0 commit 97327ef

File tree

8 files changed

+809
-91
lines changed

8 files changed

+809
-91
lines changed

src/main/java/com/iexec/worker/compute/ComputeController.java

Lines changed: 61 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,24 @@
1717
package com.iexec.worker.compute;
1818

1919

20+
import static org.springframework.http.ResponseEntity.ok;
21+
22+
import java.util.List;
23+
import java.util.NoSuchElementException;
24+
25+
import org.springframework.http.HttpStatus;
26+
import org.springframework.http.ResponseEntity;
27+
import org.springframework.web.bind.annotation.PathVariable;
28+
import org.springframework.web.bind.annotation.PostMapping;
29+
import org.springframework.web.bind.annotation.RequestBody;
30+
import org.springframework.web.bind.annotation.RequestHeader;
31+
import org.springframework.web.bind.annotation.RestController;
32+
33+
import com.iexec.common.replicate.ReplicateStatusCause;
2034
import com.iexec.common.result.ComputedFile;
2135
import com.iexec.common.worker.api.ExitMessage;
2236
import com.iexec.worker.chain.WorkerpoolAuthorizationService;
2337
import com.iexec.worker.result.ResultService;
24-
import org.springframework.http.HttpStatus;
25-
import org.springframework.http.ResponseEntity;
26-
import org.springframework.web.bind.annotation.*;
27-
28-
import java.util.NoSuchElementException;
29-
30-
import static org.springframework.http.ResponseEntity.ok;
3138

3239
@RestController
3340
public class ComputeController {
@@ -44,6 +51,18 @@ public ComputeController(final ComputeExitCauseService computeStageExitService,
4451
this.workerpoolAuthorizationService = workerpoolAuthorizationService;
4552
}
4653

54+
/**
55+
* Send a single exit cause for a given compute stage.
56+
*
57+
* @param authorization authorization header
58+
* @param stage compute stage (PRE or POST)
59+
* @param chainTaskId task ID
60+
* @param exitMessage exit message containing the cause
61+
* @return response entity
62+
* @deprecated Use {@link #sendExitCausesForGivenComputeStage(String, ComputeStage, String, List)}
63+
* for bulk exit cause reporting instead
64+
*/
65+
@Deprecated(since = "v9.0.1", forRemoval = true)
4766
@PostMapping("/compute/{stage}/{chainTaskId}/exit")
4867
public ResponseEntity<Void> sendExitCauseForGivenComputeStage(
4968
@RequestHeader("Authorization") String authorization,
@@ -104,4 +123,39 @@ public ResponseEntity<String> sendComputedFileForTee(@RequestHeader("Authorizati
104123
return ok(chainTaskId);
105124
}
106125

126+
@PostMapping("/compute/{stage}/{chainTaskId}/exit-causes")
127+
public ResponseEntity<Void> sendExitCausesForGivenComputeStage(
128+
@RequestHeader("Authorization") String authorization,
129+
@PathVariable ComputeStage stage,
130+
@PathVariable String chainTaskId,
131+
@RequestBody List<ReplicateStatusCause> causes) {
132+
133+
try {
134+
if (!workerpoolAuthorizationService.isSignedWithEnclaveChallenge(chainTaskId, authorization)) {
135+
return ResponseEntity
136+
.status(HttpStatus.UNAUTHORIZED.value())
137+
.build();
138+
}
139+
} catch (NoSuchElementException e) {
140+
return ResponseEntity
141+
.status(HttpStatus.NOT_FOUND.value())
142+
.build();
143+
}
144+
145+
if (causes == null || causes.isEmpty()) {
146+
return ResponseEntity
147+
.status(HttpStatus.BAD_REQUEST.value())
148+
.build();
149+
}
150+
151+
final boolean stored = computeStageExitService.setBulkExitCausesForGivenComputeStage(stage, chainTaskId, causes);
152+
153+
if (!stored) {
154+
return ResponseEntity
155+
.status(HttpStatus.ALREADY_REPORTED.value())
156+
.build();
157+
}
158+
return ok().build();
159+
}
160+
107161
}

src/main/java/com/iexec/worker/compute/ComputeExitCauseService.java

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,27 +16,41 @@
1616

1717
package com.iexec.worker.compute;
1818

19-
import com.iexec.common.replicate.ReplicateStatusCause;
20-
import lombok.extern.slf4j.Slf4j;
19+
import java.util.ArrayList;
20+
import java.util.HashMap;
21+
import java.util.List;
22+
import java.util.Map;
23+
2124
import org.springframework.stereotype.Service;
2225

23-
import java.util.HashMap;
26+
import com.iexec.common.replicate.ReplicateStatusCause;
27+
28+
import lombok.extern.slf4j.Slf4j;
2429

2530

2631
@Slf4j
2732
@Service
2833
public class ComputeExitCauseService {
2934

35+
/**
36+
* @deprecated Use {@link #bulkExitCauseMap} instead
37+
*/
38+
@Deprecated(since = "v9.0.1", forRemoval = true)
3039
private final HashMap<String, ReplicateStatusCause> exitCauseMap = new HashMap<>();
3140

41+
private final Map<String, List<ReplicateStatusCause>> bulkExitCauseMap = new HashMap<>();
42+
3243
/**
3344
* Report failure exit cause from pre-compute or post-compute enclave.
3445
*
3546
* @param computeStage pre-compute or post-compute-stage label
3647
* @param chainTaskId task ID
3748
* @param exitCause root cause of the failure
3849
* @return true if exit cause is reported
50+
* @deprecated Use {@link #setBulkExitCausesForGivenComputeStage(ComputeStage, String, List)}
51+
* for bulk exit cause reporting instead
3952
*/
53+
@Deprecated(since = "v9.0.1", forRemoval = true)
4054
boolean setExitCause(ComputeStage computeStage,
4155
String chainTaskId,
4256
ReplicateStatusCause exitCause) {
@@ -58,7 +72,9 @@ boolean setExitCause(ComputeStage computeStage,
5872
*
5973
* @param chainTaskId task ID
6074
* @return exit cause
75+
* @deprecated Use {@link #getBulkExitCausesAndPruneForGivenComputeStage(ComputeStage, String)} instead
6176
*/
77+
@Deprecated(since = "v9.0.1", forRemoval = true)
6278
ReplicateStatusCause getReplicateStatusCause(ComputeStage computeStage,
6379
String chainTaskId) {
6480
return exitCauseMap.get(buildKey(computeStage, chainTaskId));
@@ -69,7 +85,10 @@ ReplicateStatusCause getReplicateStatusCause(ComputeStage computeStage,
6985
*
7086
* @param chainTaskId task ID
7187
* @return exit cause
88+
* @deprecated Use {@link #getBulkExitCausesAndPruneForGivenComputeStage(ComputeStage, String)}
89+
* with ComputeStage.PRE instead
7290
*/
91+
@Deprecated(since = "v9.0.1", forRemoval = true)
7392
public ReplicateStatusCause getPreComputeExitCauseAndPrune(String chainTaskId) {
7493
ComputeStage stage = ComputeStage.PRE;
7594
ReplicateStatusCause cause = getReplicateStatusCause(stage, chainTaskId);
@@ -82,7 +101,10 @@ public ReplicateStatusCause getPreComputeExitCauseAndPrune(String chainTaskId) {
82101
*
83102
* @param chainTaskId task ID
84103
* @return exit cause
104+
* @deprecated Use {@link #getBulkExitCausesAndPruneForGivenComputeStage(ComputeStage, String)}
105+
* with ComputeStage.POST instead
85106
*/
107+
@Deprecated(since = "v9.0.1", forRemoval = true)
86108
public ReplicateStatusCause getPostComputeExitCauseAndPrune(String chainTaskId) {
87109
ComputeStage stage = ComputeStage.POST;
88110
ReplicateStatusCause cause = getReplicateStatusCause(stage, chainTaskId);
@@ -111,4 +133,53 @@ private void pruneExitCause(ComputeStage computeStage, String chainTaskId) {
111133
exitCauseMap.remove(buildKey(computeStage, chainTaskId));
112134
}
113135

114-
}
136+
/**
137+
* Store bulk exit causes for a specific compute stage.
138+
* If causes already exist for this compute stage and task, the new causes will be added to the existing list.
139+
*
140+
* @param computeStage compute stage
141+
* @param chainTaskId task ID
142+
* @param causes list of exit causes
143+
* @return true if causes were stored successfully
144+
*/
145+
public boolean setBulkExitCausesForGivenComputeStage(ComputeStage computeStage, String chainTaskId, List<ReplicateStatusCause> causes) {
146+
if (causes == null || causes.isEmpty()) {
147+
log.error("Cannot set bulk exit causes with null or empty list [computeStage:{}, chainTaskId:{}]",
148+
computeStage, chainTaskId);
149+
return false;
150+
}
151+
152+
final String key = buildKey(computeStage, chainTaskId);
153+
bulkExitCauseMap.compute(key, (k, existingCauses) -> {
154+
if (existingCauses == null) {
155+
log.info("Added bulk exit causes [computeStage:{}, chainTaskId:{}, causeCount:{}]",
156+
computeStage, chainTaskId, causes.size());
157+
return List.copyOf(causes);
158+
} else {
159+
log.info("Appended bulk exit causes to existing list [computeStage:{}, chainTaskId:{}, newCauseCount:{}, totalCauseCount:{}]",
160+
computeStage, chainTaskId, causes.size(), existingCauses.size() + causes.size());
161+
final List<ReplicateStatusCause> combinedCauses = new ArrayList<>(existingCauses);
162+
combinedCauses.addAll(causes);
163+
return combinedCauses;
164+
}
165+
});
166+
return true;
167+
}
168+
169+
/**
170+
* Get bulk exit causes for a specific compute stage and prune them.
171+
*
172+
* @param computeStage compute stage
173+
* @param chainTaskId task ID
174+
* @return list of exit causes, or null if not found
175+
*/
176+
public List<ReplicateStatusCause> getBulkExitCausesAndPruneForGivenComputeStage(ComputeStage computeStage, String chainTaskId) {
177+
final String key = buildKey(computeStage, chainTaskId);
178+
final List<ReplicateStatusCause> causes = bulkExitCauseMap.remove(key);
179+
if (causes != null) {
180+
log.debug("Retrieved and pruned bulk exit causes [computeStage:{}, chainTaskId:{}, causeCount:{}]",
181+
computeStage, chainTaskId, causes.size());
182+
}
183+
return causes;
184+
}
185+
}

src/main/java/com/iexec/worker/compute/post/PostComputeService.java

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,26 @@
1616

1717
package com.iexec.worker.compute.post;
1818

19+
import static com.iexec.common.replicate.ReplicateStatusCause.POST_COMPUTE_FAILED_UNKNOWN_ISSUE;
20+
import static com.iexec.common.replicate.ReplicateStatusCause.POST_COMPUTE_TOO_LONG_RESULT_FILE_NAME;
21+
22+
import java.io.IOException;
23+
import java.nio.file.FileVisitResult;
24+
import java.nio.file.Files;
25+
import java.nio.file.Path;
26+
import java.nio.file.Paths;
27+
import java.nio.file.SimpleFileVisitor;
28+
import java.nio.file.attribute.BasicFileAttributes;
29+
import java.time.Duration;
30+
import java.util.Collection;
31+
import java.util.Collections;
32+
import java.util.List;
33+
import java.util.Optional;
34+
import java.util.concurrent.atomic.AtomicBoolean;
35+
import java.util.stream.Stream;
36+
37+
import org.springframework.stereotype.Service;
38+
1939
import com.github.dockerjava.api.model.Bind;
2040
import com.github.dockerjava.api.model.HostConfig;
2141
import com.iexec.common.replicate.ReplicateStatusCause;
@@ -30,29 +50,16 @@
3050
import com.iexec.sms.api.config.TeeAppProperties;
3151
import com.iexec.sms.api.config.TeeServicesProperties;
3252
import com.iexec.worker.compute.ComputeExitCauseService;
53+
import com.iexec.worker.compute.ComputeStage;
3354
import com.iexec.worker.config.WorkerConfigurationService;
3455
import com.iexec.worker.docker.DockerService;
3556
import com.iexec.worker.metric.ComputeDurationsService;
3657
import com.iexec.worker.sgx.SgxService;
3758
import com.iexec.worker.tee.TeeService;
3859
import com.iexec.worker.tee.TeeServicesManager;
3960
import com.iexec.worker.tee.TeeServicesPropertiesService;
40-
import lombok.extern.slf4j.Slf4j;
41-
import org.springframework.stereotype.Service;
4261

43-
import java.io.IOException;
44-
import java.nio.file.*;
45-
import java.nio.file.attribute.BasicFileAttributes;
46-
import java.time.Duration;
47-
import java.util.Collection;
48-
import java.util.Collections;
49-
import java.util.List;
50-
import java.util.Optional;
51-
import java.util.concurrent.atomic.AtomicBoolean;
52-
import java.util.stream.Stream;
53-
54-
import static com.iexec.common.replicate.ReplicateStatusCause.POST_COMPUTE_FAILED_UNKNOWN_ISSUE;
55-
import static com.iexec.common.replicate.ReplicateStatusCause.POST_COMPUTE_TOO_LONG_RESULT_FILE_NAME;
62+
import lombok.extern.slf4j.Slf4j;
5663

5764

5865
@Slf4j
@@ -237,7 +244,13 @@ private ReplicateStatusCause getExitCause(String chainTaskId, Integer exitCode)
237244
if (exitCode != null && exitCode != 0) {
238245
switch (exitCode) {
239246
case 1:
240-
cause = computeExitCauseService.getPostComputeExitCauseAndPrune(chainTaskId);
247+
// Check for bulk exit causes first, use default if none found
248+
List<ReplicateStatusCause> bulkCauses = computeExitCauseService.getBulkExitCausesAndPruneForGivenComputeStage(ComputeStage.POST, chainTaskId);
249+
if (bulkCauses != null && !bulkCauses.isEmpty()) {
250+
cause = bulkCauses.get(0); // Use first cause from bulk processing
251+
} else {
252+
cause = ReplicateStatusCause.POST_COMPUTE_FAILED_UNKNOWN_ISSUE; // Default cause
253+
}
241254
break;
242255
case 2:
243256
cause = ReplicateStatusCause.POST_COMPUTE_EXIT_REPORTING_FAILED;

src/main/java/com/iexec/worker/compute/pre/PreComputeService.java

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,21 @@
1616

1717
package com.iexec.worker.compute.pre;
1818

19+
import static com.iexec.common.replicate.ReplicateStatusCause.PRE_COMPUTE_IMAGE_MISSING;
20+
import static com.iexec.common.replicate.ReplicateStatusCause.PRE_COMPUTE_INVALID_ENCLAVE_CONFIGURATION;
21+
import static com.iexec.common.replicate.ReplicateStatusCause.PRE_COMPUTE_INVALID_ENCLAVE_HEAP_CONFIGURATION;
22+
import static com.iexec.common.replicate.ReplicateStatusCause.PRE_COMPUTE_MISSING_ENCLAVE_CONFIGURATION;
23+
import static com.iexec.common.replicate.ReplicateStatusCause.PRE_COMPUTE_TIMEOUT;
24+
import static com.iexec.sms.api.TeeSessionGenerationError.UNKNOWN_ISSUE;
25+
26+
import java.time.Duration;
27+
import java.util.Collections;
28+
import java.util.List;
29+
import java.util.concurrent.TimeoutException;
30+
31+
import org.springframework.stereotype.Service;
32+
import org.springframework.util.unit.DataSize;
33+
1934
import com.github.dockerjava.api.model.Bind;
2035
import com.github.dockerjava.api.model.HostConfig;
2136
import com.iexec.common.replicate.ReplicateStatusCause;
@@ -30,6 +45,7 @@
3045
import com.iexec.sms.api.config.TeeAppProperties;
3146
import com.iexec.sms.api.config.TeeServicesProperties;
3247
import com.iexec.worker.compute.ComputeExitCauseService;
48+
import com.iexec.worker.compute.ComputeStage;
3349
import com.iexec.worker.config.WorkerConfigurationService;
3450
import com.iexec.worker.docker.DockerService;
3551
import com.iexec.worker.metric.ComputeDurationsService;
@@ -38,17 +54,8 @@
3854
import com.iexec.worker.sms.TeeSessionGenerationException;
3955
import com.iexec.worker.tee.TeeServicesManager;
4056
import com.iexec.worker.tee.TeeServicesPropertiesService;
41-
import lombok.extern.slf4j.Slf4j;
42-
import org.springframework.stereotype.Service;
43-
import org.springframework.util.unit.DataSize;
44-
45-
import java.time.Duration;
46-
import java.util.Collections;
47-
import java.util.List;
48-
import java.util.concurrent.TimeoutException;
4957

50-
import static com.iexec.common.replicate.ReplicateStatusCause.*;
51-
import static com.iexec.sms.api.TeeSessionGenerationError.UNKNOWN_ISSUE;
58+
import lombok.extern.slf4j.Slf4j;
5259

5360
@Slf4j
5461
@Service
@@ -173,7 +180,13 @@ private ReplicateStatusCause getExitCause(String chainTaskId, Integer exitCode)
173180
} else {
174181
switch (exitCode) {
175182
case 1:
176-
cause = computeExitCauseService.getPreComputeExitCauseAndPrune(chainTaskId);
183+
// Check for bulk exit causes first, use default if none found
184+
List<ReplicateStatusCause> bulkCauses = computeExitCauseService.getBulkExitCausesAndPruneForGivenComputeStage(ComputeStage.PRE, chainTaskId);
185+
if (bulkCauses != null && !bulkCauses.isEmpty()) {
186+
cause = bulkCauses.get(0); // Use first cause from bulk processing
187+
} else {
188+
cause = ReplicateStatusCause.PRE_COMPUTE_FAILED_UNKNOWN_ISSUE; // Default cause
189+
}
177190
break;
178191
case 2:
179192
cause = ReplicateStatusCause.PRE_COMPUTE_EXIT_REPORTING_FAILED;

0 commit comments

Comments
 (0)