Skip to content

Commit cfbb00d

Browse files
authored
feat: Enhance the alarm kernel with recovered status notification capability for alarm rules (#13539)
1 parent b762831 commit cfbb00d

File tree

133 files changed

+2020
-409
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

133 files changed

+2020
-409
lines changed

.github/workflows/skywalking.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -780,7 +780,7 @@ jobs:
780780
if: matrix.test.docker != null
781781
run: docker build -t ${{ matrix.test.docker.name }} -f ${{ matrix.test.docker.base }}/${{ matrix.test.docker.file }} ${{ matrix.test.docker.base }}
782782
- name: ${{ matrix.test.name }}
783-
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
783+
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
784784
with:
785785
e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }}
786786
- if: ${{ failure() }}
@@ -844,7 +844,7 @@ jobs:
844844
username: ${{ github.repository_owner }}
845845
password: ${{ secrets.GITHUB_TOKEN }}
846846
- name: ${{ matrix.test.name }}
847-
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
847+
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
848848
env:
849849
ISTIO_VERSION: ${{ matrix.versions.istio }}
850850
KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }}
@@ -905,7 +905,7 @@ jobs:
905905
username: ${{ github.repository_owner }}
906906
password: ${{ secrets.GITHUB_TOKEN }}
907907
- name: ${{ matrix.test.name }}
908-
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
908+
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
909909
env:
910910
ISTIO_VERSION: ${{ matrix.versions.istio }}
911911
KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }}
@@ -968,7 +968,7 @@ jobs:
968968
shell: bash
969969
run: ./mvnw -B -q -f test/e2e-v2/java-test-service/pom.xml clean package
970970
- name: Java version ${{ matrix.java-version }}
971-
uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
971+
uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
972972
env:
973973
SW_AGENT_JDK_VERSION: ${{ matrix.java-version }}
974974
with:
@@ -1064,7 +1064,7 @@ jobs:
10641064
# fi
10651065
# docker compose -f ${BANYANDB_DATA_GENERATE_ROOT}/docker-compose.yml down -v
10661066
# - name: ${{ matrix.test.name }}
1067-
# uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180
1067+
# uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80
10681068
# with:
10691069
# e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }}
10701070
# - if: ${{ failure() }}

dist-material/alarm-settings.yml

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ rules:
2323
expression: sum(service_resp_time > 1000) >= 3
2424
period: 10
2525
silence-period: 5
26+
# Number of periods to wait before considering the alarm recovered; defaults to 0.
27+
recovery-observation-period: 3
2628
message: Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes.
2729
# service_resp_time_rule:
2830
# expression: avg(service_resp_time) > 1000
@@ -35,16 +37,20 @@ rules:
3537
period: 10
3638
# Number of checks for which the alarm stays silent after being triggered; defaults to the same value as period.
3739
silence-period: 3
40+
# Number of periods to wait before considering the alarm recovered; defaults to 0.
41+
recovery-observation-period: 2
3842
message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes
3943
service_resp_time_percentile_rule:
4044
expression: sum(service_percentile{p='50,75,90,95,99'} > 1000) >= 3
4145
period: 10
4246
silence-period: 5
47+
recovery-observation-period: 3
4348
message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
4449
service_instance_resp_time_rule:
4550
expression: sum(service_instance_resp_time > 1000) >= 2
4651
period: 10
4752
silence-period: 5
53+
recovery-observation-period: 2
4854
message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes
4955
database_access_resp_time_rule:
5056
expression: sum(database_access_resp_time > 1000) >= 2
@@ -63,11 +69,36 @@ rules:
6369
# silence-period: 5
6470
# message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes
6571

72+
6673
#hooks:
6774
# webhook:
6875
# default:
6976
# is-default: true
7077
# urls:
71-
# - http://127.0.0.1/notify/
72-
# - http://127.0.0.1/go-wechat/
73-
78+
# - http://127.0.0.1/default/alarm
79+
# recovery-urls:
80+
# - http://127.0.0.1/default/alarm-recovery
81+
# custom1:
82+
# urls:
83+
# - http://127.0.0.1/custom1/alarm
84+
# recovery-urls:
85+
# - http://127.0.0.1/custom1/alarm-recovery
86+
# wechat:
87+
# default:
88+
# is-default: true
89+
# text-template: |-
90+
# {
91+
# "msgtype": "text",
92+
# "text": {
93+
# "content": "Apache SkyWalking Alarm: \n %s."
94+
# }
95+
# }
96+
# recovery-text-template: |-
97+
# {
98+
# "msgtype": "text",
99+
# "text": {
100+
# "content": "Apache SkyWalking Alarm Recovered: \n %s."
101+
# }
102+
# }
103+
# webhooks:
104+
# - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key

dist-material/config-examples/alarm-settings.yml

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ rules:
2323
period: 10
2424
# Number of checks for which the alarm stays silent after being triggered; defaults to the same value as period.
2525
silence-period: 10
26+
# Number of periods to wait before considering the alarm recovered; defaults to 0.
27+
recovery-observation-period: 3
2628
message: Successful rate of endpoint {name} is lower than 75%
2729
tags:
2830
level: WARNING
@@ -43,7 +45,35 @@ rules:
4345
silence-period: 5
4446
message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes
4547

46-
#webhooks:
47-
# - http://127.0.0.1/notify/
48-
# - http://127.0.0.1/go-wechat/
49-
48+
#hooks:
49+
# webhook:
50+
# default:
51+
# is-default: true
52+
# urls:
53+
# - http://127.0.0.1/default/alarm
54+
# recovery-urls:
55+
# - http://127.0.0.1/default/alarm-recovery
56+
# custom1:
57+
# urls:
58+
# - http://127.0.0.1/custom1/alarm
59+
# recovery-urls:
60+
# - http://127.0.0.1/custom1/alarm-recovery
61+
# wechat:
62+
# default:
63+
# is-default: true
64+
# text-template: |-
65+
# {
66+
# "msgtype": "text",
67+
# "text": {
68+
# "content": "Apache SkyWalking Alarm: \n %s."
69+
# }
70+
# }
71+
# recovery-text-template: |-
72+
# {
73+
# "msgtype": "text",
74+
# "text": {
75+
# "content": "Apache SkyWalking Alarm Recovered: \n %s."
76+
# }
77+
# }
78+
# webhooks:
79+
# - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key

docs/en/changes/changes.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@
55
#### OAP Server
66

77
* KubernetesCoordinator: make self instance return real pod IP address instead of `127.0.0.1`.
8+
* Enhance the alarm kernel with recovered status notification capability.
89

910
#### UI
11+
* Fix the missing icon in new native trace view.
12+
* Enhance the alert page to show the recovery time of resolved alerts.
1013

1114
#### Documentation
1215

0 commit comments

Comments
 (0)