Skip to content

Commit 40f55de

Browse files
authored
Compute alive worker metrics in WorkerService#updateMetrics scheduled job (#739)
These metrics are then used in `MetricService#getPlatformMetrics` to build responses to queries on `/metrics` endpoint. Those queries no more trigger reads on MongoDB.
1 parent b36dda7 commit 40f55de

File tree

9 files changed

+199
-234
lines changed

9 files changed

+199
-234
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file.
1313
- Configuration server is now optional by default. (#728)
1414
- Improve switch statements after Java 17 migration. (#729)
1515
- Remove redundant blockchain calls to diminish pressure on Ethereum JSON-RPC API. (#734)
16+
- Compute alive workers metrics in `WorkerService#updateMetrics` scheduled job. (#739)
1617

1718
### Breaking API changes
1819

@@ -22,6 +23,7 @@ All notable changes to this project will be documented in this file.
2223
- Move `TaskAbortCause` from `iexec-commons-poco` to `iexec-core-library`. (#736)
2324
- Remove deprecated methods in `iexec-core-library`. (#737)
2425
- Remove unused `ContributionUtils` class. (#738)
26+
- Rework metrics to expose count of computing CPUs or GPUs instead of available ones. (#739)
2527

2628
### Dependency Upgrades
2729

iexec-task-api/src/main/java/com/iexec/core/metric/PlatformMetric.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020-2023 IEXEC BLOCKCHAIN TECH
2+
* Copyright 2020-2025 IEXEC BLOCKCHAIN TECH
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -30,10 +30,10 @@
3030
@JsonDeserialize(builder = PlatformMetric.PlatformMetricBuilder.class)
3131
public class PlatformMetric {
3232
int aliveWorkers;
33-
int aliveTotalCpu;
34-
int aliveAvailableCpu;
35-
int aliveTotalGpu;
36-
int aliveAvailableGpu;
33+
int aliveComputingCpu;
34+
int aliveRegisteredCpu;
35+
int aliveComputingGpu;
36+
int aliveRegisteredGpu;
3737
LinkedHashMap<TaskStatus, Long> currentTaskStatusesCount;
3838
long dealEventsCount;
3939
long dealsCount;

iexec-task-api/src/test/java/com/iexec/core/metric/PlatformMetricTests.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2023-2023 IEXEC BLOCKCHAIN TECH
2+
* Copyright 2023-2025 IEXEC BLOCKCHAIN TECH
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -33,10 +33,10 @@ class PlatformMetricTests {
3333
void shouldSerializeAndDeserialize() throws JsonProcessingException {
3434
final PlatformMetric platformMetric = PlatformMetric.builder()
3535
.aliveWorkers(4)
36-
.aliveTotalCpu(12)
37-
.aliveAvailableCpu(7)
38-
.aliveTotalGpu(0)
39-
.aliveAvailableGpu(0)
36+
.aliveComputingCpu(5)
37+
.aliveRegisteredCpu(12)
38+
.aliveComputingGpu(0)
39+
.aliveRegisteredGpu(0)
4040
.currentTaskStatusesCount(createCurrentTaskStatusesCount())
4141
.dealEventsCount(3000)
4242
.dealsCount(1100)

src/main/java/com/iexec/core/metric/MetricService.java

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020 IEXEC BLOCKCHAIN TECH
2+
* Copyright 2020-2025 IEXEC BLOCKCHAIN TECH
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
1919
import com.iexec.core.chain.DealWatcherService;
2020
import com.iexec.core.task.TaskStatus;
2121
import com.iexec.core.task.event.TaskStatusesCountUpdatedEvent;
22+
import com.iexec.core.worker.AliveWorkerMetrics;
2223
import com.iexec.core.worker.WorkerService;
2324
import org.springframework.context.event.EventListener;
2425
import org.springframework.stereotype.Service;
@@ -31,21 +32,22 @@ public class MetricService {
3132
private final WorkerService workerService;
3233
private LinkedHashMap<TaskStatus, Long> currentTaskStatusesCount;
3334

34-
public MetricService(DealWatcherService dealWatcherService,
35-
WorkerService workerService) {
35+
public MetricService(final DealWatcherService dealWatcherService,
36+
final WorkerService workerService) {
3637
this.dealWatcherService = dealWatcherService;
3738
this.workerService = workerService;
3839

3940
this.currentTaskStatusesCount = new LinkedHashMap<>();
4041
}
4142

4243
public PlatformMetric getPlatformMetrics() {
44+
final AliveWorkerMetrics aliveWorkerMetrics = workerService.getAliveWorkerMetrics();
4345
return PlatformMetric.builder()
44-
.aliveWorkers(workerService.getAliveWorkers().size())
45-
.aliveTotalCpu(workerService.getAliveTotalCpu())
46-
.aliveAvailableCpu(workerService.getAliveAvailableCpu())
47-
.aliveTotalGpu(workerService.getAliveTotalGpu())
48-
.aliveAvailableGpu(workerService.getAliveAvailableGpu())
46+
.aliveWorkers(aliveWorkerMetrics.aliveWorkers())
47+
.aliveComputingCpu(aliveWorkerMetrics.aliveComputingCpu())
48+
.aliveRegisteredCpu(aliveWorkerMetrics.aliveRegisteredCpu())
49+
.aliveComputingGpu(aliveWorkerMetrics.aliveComputingGpu())
50+
.aliveRegisteredGpu(aliveWorkerMetrics.aliveRegisteredGpu())
4951
.currentTaskStatusesCount(currentTaskStatusesCount)
5052
.dealEventsCount(dealWatcherService.getDealEventsCount())
5153
.dealsCount(dealWatcherService.getDealsCount())
@@ -55,7 +57,7 @@ public PlatformMetric getPlatformMetrics() {
5557
}
5658

5759
@EventListener
58-
void onTaskStatusesCountUpdateEvent(TaskStatusesCountUpdatedEvent event) {
60+
void onTaskStatusesCountUpdateEvent(final TaskStatusesCountUpdatedEvent event) {
5961
this.currentTaskStatusesCount = event.getCurrentTaskStatusesCount();
6062
}
6163
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
* Copyright 2025 IEXEC BLOCKCHAIN TECH
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.iexec.core.worker;
18+
19+
import lombok.Builder;
20+
21+
@Builder
22+
public record AliveWorkerMetrics(int aliveWorkers,
23+
int aliveComputingCpu,
24+
int aliveRegisteredCpu,
25+
int aliveComputingGpu,
26+
int aliveRegisteredGpu) {
27+
}

src/main/java/com/iexec/core/worker/WorkerService.java

Lines changed: 44 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import com.iexec.core.configuration.WorkerConfiguration;
2121
import com.mongodb.client.result.UpdateResult;
2222
import io.micrometer.core.instrument.Metrics;
23-
import jakarta.annotation.PostConstruct;
2423
import lombok.Data;
2524
import lombok.Getter;
2625
import lombok.extern.slf4j.Slf4j;
@@ -50,15 +49,19 @@ public class WorkerService {
5049

5150
private static final String WALLET_ADDRESS_FIELD = "walletAddress";
5251
public static final String METRIC_WORKERS_GAUGE = "iexec.core.workers";
53-
public static final String METRIC_CPU_TOTAL_GAUGE = "iexec.core.cpu.total";
54-
public static final String METRIC_CPU_AVAILABLE_GAUGE = "iexec.core.cpu.available";
52+
public static final String METRIC_CPU_COMPUTING_GAUGE = "iexec.core.cpu.computing";
53+
public static final String METRIC_CPU_REGISTERED_GAUGE = "iexec.core.cpu.registered";
54+
public static final String METRIC_GPU_COMPUTING_GAUGE = "iexec.core.gpu.computing";
55+
public static final String METRIC_GPU_REGISTERED_GAUGE = "iexec.core.gpu.registered";
5556
private final MongoTemplate mongoTemplate;
5657
private final WorkerRepository workerRepository;
5758
private final WorkerConfiguration workerConfiguration;
5859
private final ContextualLockRunner<String> contextualLockRunner;
59-
private AtomicInteger aliveWorkersGauge;
60-
private AtomicInteger aliveTotalCpuGauge;
61-
private AtomicInteger aliveAvailableCpuGauge;
60+
private final AtomicInteger aliveWorkersGauge;
61+
private final AtomicInteger aliveComputingCpuGauge;
62+
private final AtomicInteger aliveRegisteredCpuGauge;
63+
private final AtomicInteger aliveComputingGpuGauge;
64+
private final AtomicInteger aliveRegisteredGpuGauge;
6265
@Getter
6366
private final ConcurrentHashMap<String, WorkerStats> workerStatsMap = new ConcurrentHashMap<>();
6467

@@ -80,38 +83,52 @@ public WorkerService(MongoTemplate mongoTemplate,
8083
this.workerRepository = workerRepository;
8184
this.workerConfiguration = workerConfiguration;
8285
this.contextualLockRunner = new ContextualLockRunner<>();
83-
}
8486

85-
@PostConstruct
86-
void init() {
87-
aliveWorkersGauge = Metrics.gauge(METRIC_WORKERS_GAUGE, new AtomicInteger(getAliveWorkers().size()));
88-
aliveTotalCpuGauge = Metrics.gauge(METRIC_CPU_TOTAL_GAUGE, new AtomicInteger(getAliveTotalCpu()));
89-
aliveAvailableCpuGauge = Metrics.gauge(METRIC_CPU_AVAILABLE_GAUGE, new AtomicInteger(getAliveAvailableCpu()));
87+
this.aliveWorkersGauge = Metrics.gauge(METRIC_WORKERS_GAUGE, new AtomicInteger(0));
88+
this.aliveComputingCpuGauge = Metrics.gauge(METRIC_CPU_COMPUTING_GAUGE, new AtomicInteger(0));
89+
this.aliveRegisteredCpuGauge = Metrics.gauge(METRIC_CPU_REGISTERED_GAUGE, new AtomicInteger(0));
90+
this.aliveComputingGpuGauge = Metrics.gauge(METRIC_GPU_COMPUTING_GAUGE, new AtomicInteger(0));
91+
this.aliveRegisteredGpuGauge = Metrics.gauge(METRIC_GPU_REGISTERED_GAUGE, new AtomicInteger(0));
9092
}
9193

9294
/**
9395
* updateMetrics is used to update all workers metrics
9496
*/
95-
@Scheduled(fixedDelayString = "${cron.metrics.refresh.period}", initialDelayString = "${cron.metrics.refresh.period}")
97+
@Scheduled(fixedDelayString = "${cron.metrics.refresh.period}")
9698
void updateMetrics() {
9799
// Fusion of methods getAliveTotalCpu and getAliveAvailableCpu to prevent making 3 calls to getAliveWorkers
98-
int availableCpus = 0;
99-
int totalCpus = 0;
100-
List<Worker> workers = getAliveWorkers();
101-
for (Worker worker : workers) {
100+
int computingCpus = 0;
101+
int registeredCpus = 0;
102+
int computingGpus = 0;
103+
int registeredGpus = 0;
104+
final List<Worker> workers = getAliveWorkers();
105+
for (final Worker worker : workers) {
102106
if (worker.isGpuEnabled()) {
103-
continue;
107+
registeredGpus++;
108+
if (!worker.getComputingChainTaskIds().isEmpty()) {
109+
computingGpus++;
110+
}
111+
} else {
112+
registeredCpus += worker.getCpuNb();
113+
computingCpus += worker.getComputingChainTaskIds().size();
104114
}
105-
int workerCpuNb = worker.getCpuNb();
106-
int computingReplicateNb = worker.getComputingChainTaskIds().size();
107-
int availableCpu = workerCpuNb - computingReplicateNb;
108-
totalCpus += workerCpuNb;
109-
availableCpus += availableCpu;
110115
}
111116

112117
aliveWorkersGauge.set(workers.size());
113-
aliveTotalCpuGauge.set(totalCpus);
114-
aliveAvailableCpuGauge.set(availableCpus);
118+
aliveComputingCpuGauge.set(computingCpus);
119+
aliveRegisteredCpuGauge.set(registeredCpus);
120+
aliveComputingGpuGauge.set(computingGpus);
121+
aliveRegisteredGpuGauge.set(registeredGpus);
122+
}
123+
124+
public AliveWorkerMetrics getAliveWorkerMetrics() {
125+
return AliveWorkerMetrics.builder()
126+
.aliveWorkers(aliveWorkersGauge.get())
127+
.aliveComputingCpu(aliveComputingCpuGauge.get())
128+
.aliveRegisteredCpu(aliveRegisteredCpuGauge.get())
129+
.aliveComputingGpu(aliveComputingGpuGauge.get())
130+
.aliveRegisteredGpu(aliveRegisteredGpuGauge.get())
131+
.build();
115132
}
116133

117134
// region Read methods
@@ -120,12 +137,9 @@ public Optional<Worker> getWorker(String walletAddress) {
120137
}
121138

122139
public boolean isAllowedToJoin(String workerAddress) {
123-
List<String> whitelist = workerConfiguration.getWhitelist();
140+
final List<String> whitelist = workerConfiguration.getWhitelist();
124141
// if the whitelist is empty, there is no restriction on the workers
125-
if (whitelist.isEmpty()) {
126-
return true;
127-
}
128-
return whitelist.contains(workerAddress);
142+
return whitelist.isEmpty() || whitelist.contains(workerAddress);
129143
}
130144

131145
public boolean isWorkerAllowedToAskReplicate(String walletAddress) {
@@ -190,56 +204,6 @@ public boolean canAcceptMoreWorks(Worker worker) {
190204

191205
return true;
192206
}
193-
194-
public int getAliveAvailableCpu() {
195-
int availableCpus = 0;
196-
for (Worker worker : getAliveWorkers()) {
197-
if (worker.isGpuEnabled()) {
198-
continue;
199-
}
200-
201-
int workerCpuNb = worker.getCpuNb();
202-
int computingReplicateNb = worker.getComputingChainTaskIds().size();
203-
int availableCpu = workerCpuNb - computingReplicateNb;
204-
availableCpus += availableCpu;
205-
}
206-
return availableCpus;
207-
}
208-
209-
public int getAliveTotalCpu() {
210-
int totalCpus = 0;
211-
for (Worker worker : getAliveWorkers()) {
212-
if (worker.isGpuEnabled()) {
213-
continue;
214-
}
215-
totalCpus += worker.getCpuNb();
216-
}
217-
return totalCpus;
218-
}
219-
220-
// We suppose for now that 1 Gpu enabled worker has only one GPU
221-
public int getAliveTotalGpu() {
222-
int totalGpus = 0;
223-
for (Worker worker : getAliveWorkers()) {
224-
if (worker.isGpuEnabled()) {
225-
totalGpus++;
226-
}
227-
}
228-
return totalGpus;
229-
}
230-
231-
public int getAliveAvailableGpu() {
232-
int availableGpus = getAliveTotalGpu();
233-
for (Worker worker : getAliveWorkers()) {
234-
if (worker.isGpuEnabled()) {
235-
boolean isWorking = !worker.getComputingChainTaskIds().isEmpty();
236-
if (isWorking) {
237-
availableGpus = availableGpus - 1;
238-
}
239-
}
240-
}
241-
return availableGpus;
242-
}
243207
// endregion
244208

245209
// region Read-and-write methods
Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020 IEXEC BLOCKCHAIN TECH
2+
* Copyright 2020-2025 IEXEC BLOCKCHAIN TECH
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -16,16 +16,17 @@
1616

1717
package com.iexec.core.metric;
1818

19-
import org.assertj.core.api.Assertions;
20-
import org.junit.jupiter.api.BeforeEach;
2119
import org.junit.jupiter.api.Test;
20+
import org.junit.jupiter.api.extension.ExtendWith;
2221
import org.mockito.InjectMocks;
2322
import org.mockito.Mock;
24-
import org.mockito.MockitoAnnotations;
23+
import org.mockito.junit.jupiter.MockitoExtension;
2524
import org.springframework.http.HttpStatus;
2625

26+
import static org.assertj.core.api.Assertions.assertThat;
2727
import static org.mockito.Mockito.when;
2828

29+
@ExtendWith(MockitoExtension.class)
2930
class MetricControllerTests {
3031

3132
@Mock
@@ -34,23 +35,16 @@ class MetricControllerTests {
3435
@InjectMocks
3536
private MetricController metricController;
3637

37-
@BeforeEach
38-
void init() {
39-
MockitoAnnotations.openMocks(this);
40-
}
41-
4238
@Test
4339
void shouldGetMetrics() {
4440
PlatformMetric metric = PlatformMetric.builder()
45-
.aliveAvailableCpu(1)
46-
.aliveAvailableGpu(1)
41+
.aliveRegisteredCpu(1)
42+
.aliveRegisteredCpu(1)
4743
.build();
4844
when(metricService.getPlatformMetrics()).thenReturn(metric);
49-
Assertions.assertThat(
50-
metricController.getPlatformMetric().getStatusCode())
51-
.isEqualTo(HttpStatus.OK);
52-
Assertions.assertThat(
53-
metricController.getPlatformMetric().getBody())
54-
.isEqualTo(metric);
45+
assertThat(metricController.getPlatformMetric().getStatusCode())
46+
.isEqualTo(HttpStatus.OK);
47+
assertThat(metricController.getPlatformMetric().getBody())
48+
.isEqualTo(metric);
5549
}
5650
}

0 commit comments

Comments
 (0)