Skip to content

Commit 7e63bde

Browse files
zuston and Junfan Zhang authored
[#2350] improvement(coordinator): Add metrics of active/lost server number (#2351)
### What changes were proposed in this pull request?

Add metrics of active/lost server number for #2350

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Needn't

---------

Co-authored-by: Junfan Zhang <zhangjunfan@qiyi.com>
1 parent deb5de3 commit 7e63bde

File tree

2 files changed

+17
-2
lines changed

2 files changed

+17
-2
lines changed

coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import java.io.InputStreamReader;
2626
import java.io.OutputStreamWriter;
2727
import java.nio.charset.StandardCharsets;
28+
import java.util.HashSet;
2829
import java.util.List;
2930
import java.util.Map;
3031
import java.util.Set;
@@ -171,6 +172,16 @@ void nodesCheck() {
171172

172173
CoordinatorMetrics.gaugeUnhealthyServerNum.set(unhealthyNodes.size());
173174
CoordinatorMetrics.gaugeTotalServerNum.set(servers.size());
175+
CoordinatorMetrics.gaugeLostServerNum.set(lostNodes.size());
176+
177+
// get the active server num.
178+
Set<String> allServers = new HashSet<>(servers.keySet());
179+
allServers.removeAll(excludedNodes);
180+
for (ServerNode unhealthyNode : unhealthyNodes) {
181+
allServers.remove(unhealthyNode.getId());
182+
}
183+
CoordinatorMetrics.gaugeActiveServerNum.set(allServers.size());
184+
174185
} catch (Exception e) {
175186
LOG.warn("Error happened in nodesCheck", e);
176187
}

coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@
3333
import org.apache.uniffle.common.util.RssUtils;
3434

3535
public class CoordinatorMetrics {
36-
36+
private static final String ACTIVE_SERVER_NUM = "active_server_num";
37+
private static final String LOST_SERVER_NUM = "lost_server_num";
3738
private static final String TOTAL_SERVER_NUM = "total_server_num";
3839
private static final String RUNNING_APP_NUM = "running_app_num";
3940
private static final String TOTAL_APP_NUM = "total_app_num";
@@ -46,7 +47,8 @@ public class CoordinatorMetrics {
4647
public static final String REMOTE_STORAGE_IN_USED_PREFIX = "remote_storage_in_used_";
4748
public static final String APP_NUM_TO_USER = "app_num";
4849
public static final String USER_LABEL = "user_name";
49-
50+
public static Gauge gaugeLostServerNum;
51+
public static Gauge gaugeActiveServerNum;
5052
public static Gauge gaugeTotalServerNum;
5153
public static Gauge gaugeExcludeServerNum;
5254
public static Gauge gaugeUnhealthyServerNum;
@@ -107,6 +109,8 @@ public static void updateDynamicGaugeForRemoteStorage(String storageHost, double
107109
}
108110

109111
private static void setUpMetrics() {
112+
gaugeLostServerNum = metricsManager.addGauge(LOST_SERVER_NUM);
113+
gaugeActiveServerNum = metricsManager.addGauge(ACTIVE_SERVER_NUM);
110114
gaugeTotalServerNum = metricsManager.addGauge(TOTAL_SERVER_NUM);
111115
gaugeExcludeServerNum = metricsManager.addGauge(EXCLUDE_SERVER_NUM);
112116
gaugeUnhealthyServerNum = metricsManager.addGauge(UNHEALTHY_SERVER_NUM);

0 commit comments

Comments
 (0)