Skip to content

Commit 0142f05

Browse files
Fix functional tests failures for STF tests (#166)
* test_alerts
  * Add meaningful changed_when for `oc patch`
  * Replace grep with failed_when condition
  * Add retries to the check for alarm creation
* test_sensubility
  * Increase retry count
* test_snmp_traps
  * Add meaningful changed_when for `oc patch`
  * Remove grep and update failed_when
  * Wait up to two minutes for the snmp logs to show the alarm
  * Undo update to stf object post-test
* test_verify_email
  * Add quotes to the receiver name

    This avoids some errors in parsing the alertmanager config, which caused delays and the alarm not to appear in the logs

---------

Co-authored-by: Alex Yefimov <[email protected]>
1 parent 6c7d652 commit 0142f05

File tree

5 files changed

+45
-19
lines changed

5 files changed

+45
-19
lines changed

roles/test_alerts/tasks/test_create_an_alert.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,12 @@
3434
cmd: |
3535
curl -k {{ prom_auth_string }} https://{{ prom_url }}/api/v1/rules
3636
register: cmd_output
37+
retries: 30
38+
delay: 10
3739
changed_when: true
40+
# when there are no rules, there is still a response and rc == 0
41+
# e.g. {\"status\":\"success\",\"data\":{\"groups\":[]}}
42+
until: '"FVT_TESTING Collectd metrics receive rate is zero" in cmd_output.stdout'
3843

3944
always:
4045
- name: "Delete the PrometheusRule"

roles/test_alerts/tasks/test_creating_a_standard_alert_route_in_alert_manager.yml

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
ansible.builtin.shell:
1818
cmd: |
1919
oc patch stf default --type merge -p '{"spec": {"alertmanagerConfigManifest": "apiVersion: v1\nkind: Secret\nmetadata:\n name: 'alertmanager-default'\n namespace: 'service-telemetry'\ntype: Opaque\nstringData:\n alertmanager.yaml: |-\n global:\n resolve_timeout: 10m\n route:\n group_by: ['job']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 12h\n receiver: 'null'\n receivers:\n - name: 'null'\n"}}'
20-
changed_when: false
20+
# Compare the command's stdout (not the whole registered result) so "changed" reflects a real patch.
changed_when: 'cmd_output.stdout == "servicetelemetry.infra.watch/default patched"'
2121
register: cmd_output
2222
failed_when: cmd_output.rc != 0
2323

@@ -27,11 +27,10 @@
2727

2828
# oc get secret alertmanager-default -o go-template='{{index .data "alertmanager.yaml" | base64decode }}'
2929
# Can't use -o go-template because of the "{{" and "}}", which are mistaken for templating syntax.
30-
# The alertmanager.yaml key needed to be surrounded by [".."] because of the period in the key name.
3130
- name: "Get the updated secret"
3231
ansible.builtin.shell:
3332
cmd: |
34-
oc get secret alertmanager-default -ojson | jq '.data | .["alertmanager.yaml"]'
33+
oc get secret alertmanager-default -ojsonpath="{ .data.alertmanager\.yaml }"
3534
register: cmd_output
3635
changed_when: false
3736

@@ -78,18 +77,27 @@
7877
cmd: >-
7978
oc exec -it prometheus-default-0 -c prometheus -- /bin/sh -c 'curl -k -H \
8079
"Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
81-
https://default-alertmanager-proxy:9095/api/v1/alerts' | grep 'active' | grep 'FVT_TESTING Collectd metrics receive rate is zero'
80+
https://default-alertmanager-proxy:9095/api/v1/alerts' | grep 'active'
8281
register: cmd_output
82+
retries: 30
83+
delay: 10
8384
changed_when: false
84-
failed_when: cmd_output.stdout_lines | length == 0
85+
until: '"FVT_TESTING Collectd metrics receive rate is zero" in cmd_output.stdout'
8586

8687
- name: "RHELOSP-148699 Verify that the alert is firing in Prometheus"
8788
ansible.builtin.shell:
8889
cmd: >-
89-
/usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/alerts | grep 'firing' | grep 'FVT_TESTING Collectd metrics receive rate is zero'
90+
curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/alerts | jq '.data.alerts[] | select(.state == "firing") | .labels.alertname'
91+
register: cmd_output
92+
changed_when: false
93+
failed_when: '"FVT_TESTING Collectd metrics receive rate is zero" not in cmd_output.stdout'
94+
95+
- name: "Check what alerts are firing in prometheus"
96+
ansible.builtin.command:
97+
cmd: >-
98+
curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/alerts
9099
register: cmd_output
91100
changed_when: false
92-
failed_when: cmd_output.stdout_lines | length == 0
93101

94102
always:
95103
- name: "Delete the PrometheusRule"
@@ -117,7 +125,6 @@
117125
register: output
118126
until: output.stdout_lines | length == expected_pods.stdout_lines | length
119127
changed_when: false
120-
121128

122129
- name: "RHELOSP-176039 Remove alertmanagerConfigManifest from the ServiceTelemetry object"
123130
ansible.builtin.shell:

roles/test_sensubility/tasks/test_health_status.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,18 @@
3232
changed_when: false
3333
failed_when: container_nodes.stdout_lines|length != 0
3434

35+
- name: Check what metrics are available to prometheus that relate to sensubility
36+
ansible.builtin.shell:
37+
cmd: |
38+
curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/label/__name__/values | jq | grep sensubility
39+
changed_when: false
40+
3541
- name: RHELOSP-176036 Check that health status of container changed to 0
3642
ansible.builtin.shell:
3743
cmd: /usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/query? --data-urlencode 'query=last_over_time(sensubility_container_health_status{process="logrotate_crond",host="ceph-0.redhat.local"}[10m])' | grep -oP '(?<="value":).*' | awk -F, '{ print $2 }' | grep -o '[0-9]\+' | grep 0
3844
register: output
3945
changed_when: false
40-
retries: 12
46+
retries: 20
4147
delay: 10
4248
until: "output.stdout_lines | length == 1"
4349

@@ -60,7 +66,7 @@
6066
| grep -oP '(?<="value":).*' | awk -F, '{ print $2 }' | grep -o '[0-9]\+' | grep 1
6167
register: output
6268
changed_when: false
63-
retries: 12
69+
retries: 20
6470
delay: 10
6571
until: "output.stdout_lines | length == 1"
6672

roles/test_snmp_traps/tasks/main.yml

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
ansible.builtin.shell:
3232
cmd: |
3333
oc patch stf/default --type merge -p '{"spec": {"alerting": {"alertmanager": {"receivers": {"snmpTraps": {"enabled": true, "target": "10.10.10.10" }}}}}}'
34-
changed_when: false
34+
# Compare the command's stdout (not the whole registered result) so "changed" reflects a real patch.
changed_when: 'cmd_output.stdout == "servicetelemetry.infra.watch/default patched"'
3535
register: cmd_output
3636
failed_when: cmd_output.rc != 0
3737

@@ -69,17 +69,17 @@
6969
- name: "RHELOSP-144481 Check for snmpTraps logs"
7070
ansible.builtin.shell:
7171
cmd: |
72-
oc logs -l "app=default-snmp-webhook" | grep "Sending SNMP trap"
72+
oc logs -l "app=default-snmp-webhook"
7373
register: cmd_output
7474
changed_when: false
75-
failed_when: "cmd_output.stdout_lines | length == 0"
75+
retries: 12
76+
delay: 10
77+
until: "'Sending SNMP trap' in cmd_output.stdout"
7678

7779
rescue:
78-
- name: "Get the snmp traps logs"
79-
ansible.builtin.shell:
80-
cmd: |
81-
oc logs -l "app=default-snmp-webhook"
82-
changed_when: false
80+
- name: "Show the snmp traps logs"
81+
ansible.builtin.debug:
82+
var: cmd_output.stdout
8383

8484
always:
8585
- name: "Delete the PrometheusRule"
@@ -98,6 +98,14 @@
9898
until: 'not "FVT_TESTING Collectd metrics receive rate is zero" in cmd_output.stdout'
9999
changed_when: false
100100

101+
- name: "Disable the snmpTraps receiver in the ServiceTelemetry object"
102+
ansible.builtin.shell:
103+
cmd: |
104+
oc patch stf/default --type='json' -p '[{"op": "replace", "path": "/spec/alerting/alertmanager/receivers/snmpTraps/enabled", "value": false }]'
105+
# Compare the command's stdout (not the whole registered result) so "changed" reflects a real patch.
changed_when: 'cmd_output.stdout == "servicetelemetry.infra.watch/default patched"'
106+
register: cmd_output
107+
failed_when: cmd_output.rc != 0
108+
101109
- name: "Wait up to 2 minutes to make sure all default-interconnect pods are back"
102110
ansible.builtin.command:
103111
cmd: |

roles/test_verify_email/tasks/main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
- name: "RHELOSP-176043 Patch the ServiceTelemetry object for the STF deployment"
4646
ansible.builtin.shell:
4747
cmd: |
48-
oc patch stf default --type merge -p '{"spec": {"alertmanagerConfigManifest": "apiVersion: v1\nkind: Secret\nmetadata:\n name: 'alertmanager-default'\n namespace: 'service-telemetry'\ntype: Opaque\nstringData:\n alertmanager.yaml: |-\n global:\n resolve_timeout: 10m\n smtp_smarthost: localhost:25\n smtp_from: [email protected]\n smtp_auth_username: alertmanager\n smtp_auth_password: password\n route:\n group_by: ['job']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 12h\n receiver: 'email'\n receivers:\n - name: 'email'\n email_configs:\n - to: [email protected]"}}'
48+
oc patch stf default --type merge -p '{"spec": {"alertmanagerConfigManifest": "apiVersion: v1\nkind: Secret\nmetadata:\n name: 'alertmanager-default'\n namespace: 'service-telemetry'\ntype: Opaque\nstringData:\n alertmanager.yaml: |-\n global:\n resolve_timeout: 10m\n smtp_smarthost: localhost:25\n smtp_from: [email protected]\n smtp_auth_username: alertmanager\n smtp_auth_password: password\n route:\n group_by: ['job']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 12h\n receiver: 'email'\n receivers:\n - name: \"email\"\n email_configs:\n - to: [email protected]"}}'
4949
changed_when: false
5050

5151
- name: "RHELOSP-176044 Interrupt metrics flow by preventing the QDR from running"

0 commit comments

Comments
 (0)