Skip to content

Commit 0cd4354

Browse files
authored
Add Get Alarm Runtime Status API. (#13028)
1 parent b7e961b commit 0cd4354

File tree

10 files changed

+389
-1
lines changed

10 files changed

+389
-1
lines changed

docs/en/changes/changes.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@
7070
* Add type descriptor when converting Envoy logs to JSON for persistence, to avoid conversion error.
7171
* Baseline: Support query baseline with MQE and use in the Alarm Rule.
7272
* Bump up netty to 4.1.118 to fix CVE-2025-24970.
73+
* Add `Get Alarm Runtime Status` API.
74+
* Add `lock` when querying the Alarm metrics window values.
7375

7476
#### UI
7577

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# Get Alarm Runtime Status
2+
3+
OAP calculates the alarm conditions in memory, based on the alarm rules and the metrics data.
4+
The following APIs are exposed to make the running state of the alerting kernel visible.
5+
6+
## Get Alarm Running Rules
7+
8+
Return the list of alarm running rules.
9+
10+
- URL, `http://{core restHost}:{core restPort}/status/alarm/rules`
11+
- HTTP GET method.
12+
13+
```json
14+
{
15+
"ruleNames": [
16+
"service_percentile_rule",
17+
"service_resp_time_rule"
18+
]
19+
}
20+
```
21+
22+
## Get Alarm Running Rule Info
23+
24+
Return the detailed information of the alarm running rule.
25+
26+
- URL, `http://{core restHost}:{core restPort}/status/alarm/rules/{ruleName}`
27+
- HTTP GET method.
28+
29+
```json
30+
{
31+
"ruleName": "service_resp_time_rule",
32+
"expression": "sum(service_resp_time > baseline(service_resp_time,upper)) >= 1",
33+
"period": 10,
34+
"silentPeriod": 10,
35+
"additonalPeriod": 0,
36+
"includeNames": [
37+
"mock_a_service",
38+
"mock_b_service",
39+
"mock_c_service"
40+
],
41+
"excludeNames": [],
42+
"includeNamesRegex": "",
43+
"excludeNamesRegex": "",
44+
"affectedEntities": [
45+
{
46+
"scope": "SERVICE",
47+
"name": "mock_b_service"
48+
},
49+
{
50+
"scope": "SERVICE",
51+
"name": "mock_a_service"
52+
},
53+
{
54+
"scope": "SERVICE",
55+
"name": "mock_c_service"
56+
}
57+
],
58+
"tags": [
59+
{
60+
"key": "level",
61+
"value": "WARNING"
62+
}
63+
],
64+
"hooks": [
65+
"webhook.default",
66+
"wechat.default"
67+
],
68+
"includeMetrics": [
69+
"service_resp_time"
70+
],
71+
"formattedMessages": [
72+
{
73+
"mock_b_service": "Response time of service mock_b_service is more than upper baseline in 1 minutes of last 10 minutes."
74+
},
75+
{
76+
"mock_a_service": "Response time of service mock_a_service is more than upper baseline in 1 minutes of last 10 minutes."
77+
},
78+
{
79+
"mock_c_service": "Response time of service mock_c_service is more than upper baseline in 1 minutes of last 10 minutes."
80+
}
81+
]
82+
}
83+
```
84+
85+
- `additonalPeriod` is the additional period if the expression includes the [increase/rate function](../api/metrics-query-expression.md#trend-operation).
86+
This additional period is used to enlarge the window size for calculating the trend value.
87+
- `affectedEntities` are the entities that have metrics data and are being calculated by the alarm rule.
88+
- `formattedMessages` are the result messages generated from the message template for the affected entities.
89+
90+
## Get Alarm Running Context
91+
92+
Return the running context of the alarm rule.
93+
94+
- URL, `http://{core restHost}:{core restPort}/status/alarm/{ruleName}/{entityName}`
95+
- HTTP GET method.
96+
97+
```json
98+
{
99+
"expression": "sum(service_resp_time > baseline(service_resp_time,upper)) >= 1",
100+
"endTime": "2025-02-12T13:39:00.000",
101+
"additionalPeriod": 0,
102+
"size": 10,
103+
"silenceCountdown": 10,
104+
"windowValues": [
105+
{
106+
"index": 0,
107+
"metrics": []
108+
},
109+
{
110+
"index": 1,
111+
"metrics": []
112+
},
113+
{
114+
"index": 2,
115+
"metrics": []
116+
},
117+
{
118+
"index": 3,
119+
"metrics": []
120+
},
121+
{
122+
"index": 4,
123+
"metrics": []
124+
},
125+
{
126+
"index": 5,
127+
"metrics": []
128+
},
129+
{
130+
"index": 6,
131+
"metrics": []
132+
},
133+
{
134+
"index": 7,
135+
"metrics": [
136+
{
137+
"timeBucket": 202502121437,
138+
"name": "service_resp_time",
139+
"value": "6000"
140+
}
141+
]
142+
},
143+
{
144+
"index": 8,
145+
"metrics": []
146+
},
147+
{
148+
"index": 9,
149+
"metrics": []
150+
}
151+
],
152+
"mqeMetricsSnapshot": {
153+
"service_resp_time": "[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202502121430\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121431\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121432\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121433\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121434\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121435\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121436\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121437\",\"doubleValue\":6000.0,\"isEmptyValue\":false},{\"id\":\"202502121438\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121439\",\"doubleValue\":0.0,\"isEmptyValue\":true}]}]",
154+
"baseline(service_resp_time,upper)": "[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202502121430\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121431\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121432\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121433\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121434\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121435\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121436\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121437\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121438\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121439\",\"doubleValue\":10.0,\"isEmptyValue\":false}]}]"
155+
}
156+
}
157+
```
158+
`size` is the window size, equal to `period + additionalPeriod`.
159+
`silenceCountdown` is the countdown of the silence period. `-1` means the silence countdown is not running.
160+
`windowValues` is the original metrics data. The `index` is the index of the window, starting from 0.
161+
`mqeMetricsSnapshot` is the metrics data in the MQE format. When checking conditions, this data is evaluated according to the expression.

docs/en/status/status_apis.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ logs and self-observability solutions.
1111
- [Tracing Query Execution APIs](../debugging/query-tracing.md)
1212
- [Get Effective TTL Configurations API](query_ttl_setup.md)
1313
- [Query Cluster Nodes API](query_cluster_nodes.md)
14+
- [Get Alarm Runtime Status API](query_alarm_runtime_status.md)
1415

1516
If you have a proposal about new status API, please don't hesitate
1617
to [create a discussion](https://github.com/apache/skywalking/discussions/new?category=ideas).

docs/menu.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,8 @@ catalog:
346346
path: "/en/status/query_ttl_setup"
347347
- name: "Get Node List in the Cluster"
348348
path: "/en/status/query_cluster_nodes"
349+
- name: "Get Alarm Runtime Status"
350+
path: "/en/status/query_alarm_runtime_status"
349351
- name: "Customization"
350352
catalog:
351353
- name: "Overview"

oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmMessageFormatter.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import java.util.ArrayList;
2222
import java.util.List;
23+
import lombok.Getter;
2324

2425
/**
2526
* This is a formatter especially for alarm message.
@@ -28,6 +29,7 @@
2829
* <p>
2930
* - Successful rate of endpoint {name} is lower than 75%
3031
*/
32+
@Getter
3133
public class AlarmMessageFormatter {
3234
private List<String> formatSegments;
3335
private List<ValueFrom> valueFroms;
@@ -88,7 +90,7 @@ public String format(AlarmEntity alarmEntity) {
8890
return message.toString();
8991
}
9092

91-
private enum ValueFrom {
93+
public enum ValueFrom {
9294
ID, NAME
9395
}
9496
}

oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmModuleProvider.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import java.io.FileNotFoundException;
2222
import java.io.Reader;
23+
import lombok.Getter;
2324
import org.apache.skywalking.oap.server.configuration.api.ConfigurationModule;
2425
import org.apache.skywalking.oap.server.configuration.api.DynamicConfigurationService;
2526
import org.apache.skywalking.oap.server.core.CoreModule;
@@ -35,6 +36,7 @@
3536
public class AlarmModuleProvider extends ModuleProvider {
3637

3738
private NotifyHandler notifyHandler;
39+
@Getter
3840
private AlarmRulesWatcher alarmRulesWatcher;
3941

4042
@Override

oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@
2929
import java.util.Set;
3030
import java.util.concurrent.ConcurrentHashMap;
3131
import java.util.concurrent.locks.ReentrantLock;
32+
import java.util.function.Consumer;
3233
import java.util.regex.Pattern;
3334
import java.util.stream.Collectors;
35+
import lombok.Getter;
3436
import lombok.RequiredArgsConstructor;
3537
import lombok.ToString;
3638
import lombok.extern.slf4j.Slf4j;
@@ -68,6 +70,7 @@
6870
* RunningRule represents each rule in running status. Based on the {@link AlarmRule} definition,
6971
*/
7072
@Slf4j
73+
@Getter
7174
public class RunningRule {
7275
private static DateTimeFormatter TIME_BUCKET_FORMATTER = DateTimeFormat.forPattern("yyyyMMddHHmm");
7376

@@ -243,12 +246,17 @@ public List<AlarmMessage> check() {
243246
* buckets.
244247
*/
245248
public class Window {
249+
@Getter
246250
private LocalDateTime endTime;
251+
@Getter
247252
private final int additionalPeriod;
253+
@Getter
248254
private final int size;
255+
@Getter
249256
private int silenceCountdown;
250257
private LinkedList<Map<String, Metrics>> values;
251258
private ReentrantLock lock = new ReentrantLock();
259+
@Getter
252260
private JsonObject mqeMetricsSnapshot;
253261
private AlarmEntity entity;
254262

@@ -356,6 +364,7 @@ public Optional<AlarmMessage> checkAlarm() {
356364
}
357365

358366
private boolean isMatch() {
367+
this.lock.lock();
359368
int isMatch = 0;
360369
try {
361370
TRACE_CONTEXT.set(new DebuggingTraceContext(expression, false, false));
@@ -407,6 +416,7 @@ private boolean isMatch() {
407416
this.mqeMetricsSnapshot = visitor.getMqeMetricsSnapshot();
408417
return isMatch == 1;
409418
} finally {
419+
this.lock.unlock();
410420
TRACE_CONTEXT.remove();
411421
}
412422
}
@@ -422,6 +432,15 @@ public boolean isExpired() {
422432
return true;
423433
}
424434

435+
public void scanWindowValues(Consumer<LinkedList<Map<String, Metrics>>> scanFunction) {
436+
lock.lock();
437+
try {
438+
scanFunction.accept(values);
439+
} finally {
440+
lock.unlock();
441+
}
442+
}
443+
425444
private void init() {
426445
values = new LinkedList<>();
427446
for (int i = 0; i < size; i++) {

oap-server/server-query-plugin/status-query-plugin/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,5 +44,10 @@
4444
<artifactId>zipkin-query-plugin</artifactId>
4545
<version>${project.version}</version>
4646
</dependency>
47+
<dependency>
48+
<groupId>org.apache.skywalking</groupId>
49+
<artifactId>server-alarm-plugin</artifactId>
50+
<version>${project.version}</version>
51+
</dependency>
4752
</dependencies>
4853
</project>

0 commit comments

Comments
 (0)