Skip to content

Commit 0c83a63

Browse files
committed
Add Get Alarm Runtime Status API.
Add `lock` when query the Alarm metrics window values.
1 parent b7e961b commit 0c83a63

File tree

8 files changed

+364
-1
lines changed

8 files changed

+364
-1
lines changed

docs/en/changes/changes.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@
7070
* Add type descriptor when converting Envoy logs to JSON for persistence, to avoid conversion error.
7171
* Baseline: Support querying baseline with MQE and using it in the Alarm Rule.
7272
* Bump up netty to 4.1.118 to fix CVE-2025-24970.
73+
* Add `Get Alarm Runtime Status` API.
74+
* Add `lock` when querying the Alarm metrics window values.
7375

7476
#### UI
7577

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
# Get Alarm Runtime Status
2+
3+
OAP calculates the alarm conditions in memory based on the alarm rules and the metrics data.
4+
The following APIs are used to get the running rules and running contexts of the alarm calculation.
5+
6+
## Get Alarm Running Rules
7+
8+
Return the list of alarm running rules.
9+
10+
- URL, `http://{core restHost}:{core restPort}/status/alarm/rules`
11+
- HTTP GET method.
12+
13+
```json
14+
{
15+
"ruleNames": [
16+
"service_percentile_rule",
17+
"service_resp_time_rule"
18+
]
19+
}
20+
```
21+
22+
## Get Alarm Running Rule Info
23+
24+
Return the detailed information of the alarm running rule.
25+
26+
- URL, `http://{core restHost}:{core restPort}/status/alarm/rules/{ruleName}`
27+
- HTTP GET method.
28+
29+
```json
30+
{
31+
"ruleName": "service_resp_time_rule",
32+
"expression": "sum(service_resp_time > baseline(service_resp_time,upper)) >= 1",
33+
"period": 10,
34+
"silentPeriod": 10,
35+
"additonalPeriod": 0,
36+
"includeNames": [
37+
"mock_a_service",
38+
"mock_b_service",
39+
"mock_c_service"
40+
],
41+
"excludeNames": [],
42+
"includeNamesRegex": "",
43+
"excludeNamesRegex": "",
44+
"affectedEntities": [],
45+
"tags": [
46+
{
47+
"key": "level",
48+
"value": "WARNING"
49+
}
50+
],
51+
"hooks": [
52+
"webhook.default",
53+
"wechat.default"
54+
],
55+
"includeMetrics": [
56+
"service_resp_time"
57+
],
58+
"messageFormatter": [
59+
[
60+
"Response time of service ",
61+
" is more than upper baseline in 1 minutes of last 10 minutes."
62+
],
63+
[
64+
"NAME"
65+
]
66+
]
67+
}
68+
```
69+
70+
## Get Alarm Running Context
71+
72+
Return the running context of the alarm rule.
73+
74+
- URL, `http://{core restHost}:{core restPort}/status/alarm/{ruleName}/{entityName}`
75+
- HTTP GET method.
76+
77+
```json
78+
{
79+
"expression": "sum(service_resp_time > baseline(service_resp_time,upper)) >= 1",
80+
"endTime": "2025-02-12T14:39:00.000",
81+
"additionalPeriod": 0,
82+
"size": 10,
83+
"silenceCountdown": 10,
84+
"windowValues": [
85+
{
86+
"index": 0,
87+
"metrics": []
88+
},
89+
{
90+
"index": 1,
91+
"metrics": []
92+
},
93+
{
94+
"index": 2,
95+
"metrics": []
96+
},
97+
{
98+
"index": 3,
99+
"metrics": []
100+
},
101+
{
102+
"index": 4,
103+
"metrics": []
104+
},
105+
{
106+
"index": 5,
107+
"metrics": []
108+
},
109+
{
110+
"index": 6,
111+
"metrics": []
112+
},
113+
{
114+
"index": 7,
115+
"metrics": [
116+
{
117+
"timeBucket": 202502121437,
118+
"name": "service_resp_time",
119+
"value": "6000"
120+
}
121+
]
122+
},
123+
{
124+
"index": 8,
125+
"metrics": []
126+
},
127+
{
128+
"index": 9,
129+
"metrics": []
130+
}
131+
],
132+
"mqeMetricsSnapshot": {
133+
"service_resp_time": "[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202502121430\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121431\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121432\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121433\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121434\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121435\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121436\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121437\",\"doubleValue\":6000.0,\"isEmptyValue\":false},{\"id\":\"202502121438\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121439\",\"doubleValue\":0.0,\"isEmptyValue\":true}]}]",
134+
"baseline(service_resp_time,upper)": "[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202502121430\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121431\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121432\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121433\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121434\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121435\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121436\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121437\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121438\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121439\",\"doubleValue\":10.0,\"isEmptyValue\":false}]}]"
135+
}
136+
}
137+
```
138+
139+
`windowValues` is the original metrics data held in the alarm window. The `index` is the position of each entry in the window, starting from 0.
140+
`mqeMetricsSnapshot` is the metrics data in the MQE format. When the alarm conditions are checked, this data is evaluated according to the expression.

docs/en/status/status_apis.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ logs and self-observability solutions.
1111
- [Tracing Query Execution APIs](../debugging/query-tracing.md)
1212
- [Get Effective TTL Configurations API](query_ttl_setup.md)
1313
- [Query Cluster Nodes API](query_cluster_nodes.md)
14+
- [Get Alarm Runtime Status API](query_alarm_runtime_status.md)
1415

1516
If you have a proposal about new status API, please don't hesitate
1617
to [create a discussion](https://github.com/apache/skywalking/discussions/new?category=ideas).

oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmMessageFormatter.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import java.util.ArrayList;
2222
import java.util.List;
23+
import lombok.Getter;
2324

2425
/**
2526
* This is a formatter especially for alarm message.
@@ -28,6 +29,7 @@
2829
* <p>
2930
* - Successful rate of endpoint {name} is lower than 75%
3031
*/
32+
@Getter
3133
public class AlarmMessageFormatter {
3234
private List<String> formatSegments;
3335
private List<ValueFrom> valueFroms;
@@ -88,7 +90,7 @@ public String format(AlarmEntity alarmEntity) {
8890
return message.toString();
8991
}
9092

91-
private enum ValueFrom {
93+
public enum ValueFrom {
9294
ID, NAME
9395
}
9496
}

oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmModuleProvider.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import java.io.FileNotFoundException;
2222
import java.io.Reader;
23+
import lombok.Getter;
2324
import org.apache.skywalking.oap.server.configuration.api.ConfigurationModule;
2425
import org.apache.skywalking.oap.server.configuration.api.DynamicConfigurationService;
2526
import org.apache.skywalking.oap.server.core.CoreModule;
@@ -35,6 +36,7 @@
3536
public class AlarmModuleProvider extends ModuleProvider {
3637

3738
private NotifyHandler notifyHandler;
39+
@Getter
3840
private AlarmRulesWatcher alarmRulesWatcher;
3941

4042
@Override

oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@
2929
import java.util.Set;
3030
import java.util.concurrent.ConcurrentHashMap;
3131
import java.util.concurrent.locks.ReentrantLock;
32+
import java.util.function.Consumer;
3233
import java.util.regex.Pattern;
3334
import java.util.stream.Collectors;
35+
import lombok.Getter;
3436
import lombok.RequiredArgsConstructor;
3537
import lombok.ToString;
3638
import lombok.extern.slf4j.Slf4j;
@@ -68,6 +70,7 @@
6870
* RunningRule represents each rule in running status. Based on the {@link AlarmRule} definition,
6971
*/
7072
@Slf4j
73+
@Getter
7174
public class RunningRule {
7275
private static DateTimeFormatter TIME_BUCKET_FORMATTER = DateTimeFormat.forPattern("yyyyMMddHHmm");
7376

@@ -243,12 +246,17 @@ public List<AlarmMessage> check() {
243246
* buckets.
244247
*/
245248
public class Window {
249+
@Getter
246250
private LocalDateTime endTime;
251+
@Getter
247252
private final int additionalPeriod;
253+
@Getter
248254
private final int size;
255+
@Getter
249256
private int silenceCountdown;
250257
private LinkedList<Map<String, Metrics>> values;
251258
private ReentrantLock lock = new ReentrantLock();
259+
@Getter
252260
private JsonObject mqeMetricsSnapshot;
253261
private AlarmEntity entity;
254262

@@ -356,6 +364,7 @@ public Optional<AlarmMessage> checkAlarm() {
356364
}
357365

358366
private boolean isMatch() {
367+
this.lock.lock();
359368
int isMatch = 0;
360369
try {
361370
TRACE_CONTEXT.set(new DebuggingTraceContext(expression, false, false));
@@ -407,6 +416,7 @@ private boolean isMatch() {
407416
this.mqeMetricsSnapshot = visitor.getMqeMetricsSnapshot();
408417
return isMatch == 1;
409418
} finally {
419+
this.lock.unlock();
410420
TRACE_CONTEXT.remove();
411421
}
412422
}
@@ -422,6 +432,15 @@ public boolean isExpired() {
422432
return true;
423433
}
424434

435+
public void scanWindowValues(Consumer<LinkedList<Map<String, Metrics>>> scanFunction) {
436+
lock.lock();
437+
try {
438+
scanFunction.accept(values);
439+
} finally {
440+
lock.unlock();
441+
}
442+
}
443+
425444
private void init() {
426445
values = new LinkedList<>();
427446
for (int i = 0; i < size; i++) {

0 commit comments

Comments
 (0)