Skip to content

Commit cfb1b95

Browse files
committed
Fixed bugs, added first documentation iteration.
1 parent 4d8b2f5 commit cfb1b95

File tree

18 files changed

+386
-42
lines changed

18 files changed

+386
-42
lines changed

infra/ansible/README.md

Lines changed: 94 additions & 17 deletions
Large diffs are not rendered by default.
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
- name: Install Alert Manager
2+
hosts: "{{ host }}"
3+
4+
tasks:
5+
- name: Install Alert Manager
6+
get_url:
7+
url: https://github.com/prometheus/alertmanager/releases/download/v{{ alert_manager_version }}/alertmanager-{{ alert_manager_version }}.linux-amd64.tar.gz
8+
dest: /tmp/alert_manager-{{ alert_manager_version }}-linux-amd64.tar.gz
9+
mode: '0644'
10+
11+
- name: Install Alert Manager package
12+
become: true
13+
unarchive:
14+
src: "/tmp/alert_manager-{{ alert_manager_version }}-linux-amd64.tar.gz"
15+
dest: "/usr/local/bin/"
16+
remote_src: yes
17+
extra_opts:
18+
- --strip-components=1
19+
vars:
20+
ansible_ssh_user: "{{ admin_user }}"
21+
22+
- name: Clean up Alert Manager tarball package
23+
file:
24+
path: /tmp/alert_manager-{{ alert_manager_version }}-linux-amd64.tar.gz
25+
state: absent
26+
27+
- name: Make sure /etc/alertmanager directory exists
28+
become: true
29+
file:
30+
path: /etc/alertmanager
31+
state: directory
32+
vars:
33+
ansible_ssh_user: "{{ admin_user }}"
34+
35+
- name: Create Alert Manager config file
36+
become: true
37+
template:
38+
src: alert_manager/alertmanager.yml.j2
39+
dest: /etc/alertmanager/alertmanager.yml
40+
vars:
41+
ansible_ssh_user: "{{ admin_user }}"
42+
pagerduty_routing_key: "{{ lookup('ini', 'pagerduty_routing_key', file=ini_file) }}"
43+
44+
- name: Create Alert Manager systemd service
45+
become: true
46+
template:
47+
src: services/alert_manager.service.j2
48+
dest: /etc/systemd/system/alert_manager.service
49+
vars:
50+
ansible_ssh_user: "{{ admin_user }}"
51+
52+
- name: Start Alert Manager
53+
become: true
54+
systemd_service:
55+
name: alert_manager
56+
state: started
57+
daemon_reload: true
58+
enabled: true
59+
vars:
60+
ansible_ssh_user: "{{ admin_user }}"

infra/ansible/playbooks/caddy.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,17 @@
22
hosts: "{{ host }}"
33

44
tasks:
5+
- name: Allow http/https traffic on UFW
6+
become: true
7+
ufw:
8+
rule: allow
9+
state: enabled
10+
port: '{{ item }}'
11+
loop:
12+
- http
13+
- https
14+
vars:
15+
ansible_ssh_user: "{{ admin_user }}"
516

617
- name: Install dependencies for Caddy
718
become: true

infra/ansible/playbooks/elixir.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@
4949
deb: "{{ download_libssl.dest }}"
5050
when: libssl_check.rc != 0
5151

52-
########## Install Erlang 26.2.1-1 ##########
5352
- name: Check if Erlang 26.2.1-1 is installed
5453
become: true
5554
ansible.builtin.shell:
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
groups:
2+
3+
- name: AllInstances
4+
rules:
5+
- alert: InstanceDown
6+
# Condition for alerting
7+
expr: up == 0
8+
for: 1m
9+
# Annotation - additional informational labels to store more information
10+
annotations:
11+
title: 'Instance {{ $labels.job }} down'
12+
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute'
13+
# Labels - additional labels to be attached to the alert
14+
labels:
15+
severity: 'critical'
16+
text: 'Instance {{ $labels.job }} down'
17+
18+
- alert: TaskDifferenceAggregatorBatcher
19+
# Condition for alerting
20+
expr: floor(increase(aligned_aggregator_received_tasks{job="aligned-aggregator"}[15m])) - on() floor(increase(sent_batches{job="aligned-batcher"}[15m])) > 1
21+
for: 30s
22+
# Annotation - additional informational labels to store more information
23+
annotations:
24+
title: 'Tasks not being received by the aggregator'
25+
description: 'The difference between aggregator received tasks and batcher sent batches is greater than 1 for more than 30 seconds'
26+
# Labels - additional labels to be attached to the alert
27+
labels:
28+
severity: 'critical'
29+
text: 'Tasks not being received by the aggregator'
30+
31+
- alert: TaskDifferenceAggregator
32+
# Condition for alerting
33+
expr: floor(increase(aligned_aggregator_received_tasks{job="aligned-aggregator"}[15m])) - floor(increase(aligned_aggregated_responses{job="aligned-aggregator"}[15m])) > 1
34+
for: 30s
35+
# Annotation - additional informational labels to store more information
36+
annotations:
37+
title: 'Tasks not being verified'
38+
description: 'The difference between aggregator received tasks and verified tasks is greater than 1 for more than 30 seconds'
39+
# Labels - additional labels to be attached to the alert
40+
labels:
41+
severity: 'critical'
42+
text: 'Tasks not being verified'
43+
44+
- alert: UserErrorRate
45+
# Condition for alerting
46+
expr: rate(user_errors[5m]) > 2
47+
for: 1m
48+
# Annotation - additional informational labels to store more information
49+
annotations:
50+
title: 'High error rate {{ $labels.error_type }}'
51+
description: 'User error rate is greater than 2 for more than 1 minute'
52+
labels:
53+
severity: 'critical'
54+
text: 'High error rate {{ $labels.error_type }}'

infra/ansible/playbooks/grafana.yaml

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@
3636
file:
3737
path: /etc/grafana/
3838
state: directory
39-
owner: root
40-
group: root
39+
owner: grafana
40+
group: grafana
4141
mode: '0755'
4242
vars:
4343
ansible_ssh_user: "{{ admin_user }}"
@@ -47,27 +47,33 @@
4747
template:
4848
src: grafana/grafana.ini.j2
4949
dest: /etc/grafana/grafana.ini
50-
owner: root
51-
group: root
50+
owner: grafana
51+
group: grafana
5252
mode: '0755'
5353
vars:
5454
ansible_ssh_user: "{{ admin_user }}"
55-
grafana_http_port: "{{ lookup('ini', 'grafana_http_port', file=ini_file) }}"
55+
grafana_domain: "{{ lookup('ini', 'grafana_domain', file=ini_file) }}"
56+
grafana_oath_client_id: "{{ lookup('ini', 'grafana_oath_client_id', file=ini_file) }}"
57+
grafana_oath_client_secret: "{{ lookup('ini', 'grafana_oath_client_secret', file=ini_file) }}"
58+
grafana_oath_auth_url: "{{ lookup('ini', 'grafana_oath_auth_url', file=ini_file) }}"
59+
grafana_oath_token_url: "{{ lookup('ini', 'grafana_oath_token_url', file=ini_file) }}"
60+
grafana_oath_api_url: "{{ lookup('ini', 'grafana_oath_api_url', file=ini_file) }}"
5661

5762
- name: Clone Aligned repository
5863
git:
5964
repo: https://github.com/yetanotherco/aligned_layer.git
6065
dest: /home/{{ ansible_user }}/repos/telemetry/aligned_layer
61-
version: v0.10.2
66+
#version: v0.12.0
67+
version: staging
6268
recursive: false
6369

6470
- name: Ensure /etc/grafana/provisioning directory exists
6571
become: true
6672
file:
6773
path: /etc/grafana/provisioning/
6874
state: directory
69-
owner: root
70-
group: root
75+
owner: grafana
76+
group: grafana
7177
mode: '0755'
7278
vars:
7379
ansible_ssh_user: "{{ admin_user }}"
@@ -77,8 +83,8 @@
7783
copy:
7884
src: /home/{{ ansible_user }}/repos/telemetry/aligned_layer/grafana/provisioning/
7985
dest: /etc/grafana/provisioning/
80-
owner: root
81-
group: root
86+
owner: grafana
87+
group: grafana
8288
mode: '0755'
8389
remote_src: yes
8490
vars:
@@ -89,11 +95,43 @@
8995
template:
9096
src: grafana/datasource.yaml.j2
9197
dest: "/etc/grafana/provisioning/datasources/datasource.yaml"
98+
owner: grafana
99+
group: grafana
92100
mode: '0755'
93101
vars:
94102
ansible_ssh_user: "{{ admin_user }}"
95103
grafana_prometheus_datasource: "{{ lookup('ini', 'grafana_prometheus_datasource', file=ini_file) }}"
96104

105+
- name: Change admin password for grafana
106+
shell:
107+
cmd: sudo grafana-cli admin reset-admin-password {{ lookup('ini', 'grafana_admin_password', file=ini_file) }}
108+
vars:
109+
ansible_ssh_user: "{{ admin_user }}"
110+
111+
- name: Ensure /etc/grafana/ directory is owned by user grafana
112+
become: true
113+
file:
114+
path: /etc/grafana/
115+
recurse: true
116+
state: directory
117+
owner: grafana
118+
group: grafana
119+
mode: '0755'
120+
vars:
121+
ansible_ssh_user: "{{ admin_user }}"
122+
123+
- name: Ensure /var/lib/grafana/ directory is owned by user grafana
124+
become: true
125+
file:
126+
path: /var/lib/grafana/
127+
recurse: true
128+
state: directory
129+
owner: grafana
130+
group: grafana
131+
mode: '0755'
132+
vars:
133+
ansible_ssh_user: "{{ admin_user }}"
134+
97135
- name: Restart Grafana
98136
become: true
99137
service:

infra/ansible/playbooks/ini/config-telemetry.ini.example

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,14 @@ caddy_metrics_url=
33
caddy_telemetry_url=
44
caddy_jaeger_url=
55

6+
grafana_admin_password=
67
grafana_prometheus_datasource=
7-
grafana_http_port=
8+
grafana_domain=
9+
grafana_oath_client_id=
10+
grafana_oath_client_secret=
11+
grafana_oath_auth_url=
12+
grafana_oath_token_url=
13+
grafana_oath_api_url=
814

915
prometheus_aggregator_ip=
1016
prometheus_operator_ip=
@@ -15,10 +21,12 @@ postgresql_telemetry_db_name=
1521
postgresql_telemetry_user=
1622
postgresql_telemetry_pass=
1723

18-
cassandra_telemetry_user=admidn
24+
cassandra_telemetry_user=
1925
cassandra_telemetry_pass=
2026

2127
telemetry_aligned_rpc=
2228
telemetry_api_phx_host=
2329
telemetry_api_elixir_hostname=
2430
telemetry_api_secret_key_base=
31+
32+
pagerduty_routing_key=

infra/ansible/playbooks/open_telemetry.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
become: true
2525
template:
2626
src: open_telemetry/otel-collector.yaml
27-
dest: /etc/otel-collector.yaml
27+
dest: /etc/otelcol/config.yaml
2828
vars:
2929
ansible_ssh_user: "{{ admin_user }}"
3030

infra/ansible/playbooks/prometheus.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,17 @@
111111
prometheus_batcher_ip: "{{ lookup('ini', 'prometheus_batcher_ip', file='ini/config-telemetry.ini') }}"
112112
prometheus_tracker_ip: "{{ lookup('ini', 'prometheus_tracker_ip', file='ini/config-telemetry.ini') }}"
113113

114+
- name: Add prometheus rules file
115+
become: true
116+
copy:
117+
src: prometheus/rules.yml
118+
dest: /etc/prometheus/rules.yml
119+
owner: prometheus
120+
group: prometheus
121+
mode: '0755'
122+
vars:
123+
ansible_ssh_user: "{{ admin_user }}"
124+
114125
- name: Create Prometheus systemd service
115126
become: true
116127
template:

infra/ansible/playbooks/telemetry.yaml

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@
1616
prometheus_version: "2.53.2"
1717
ini_file: ini/config-telemetry.ini
1818

19+
- name: Run Alert Manager playbook
20+
ansible.builtin.import_playbook: alert_manager.yaml
21+
vars:
22+
host: telemetry
23+
alert_manager_version: 0.27.0
24+
ini_file: ini/config-telemetry.ini
25+
1926
- name: Run Grafana playbook
2027
ansible.builtin.import_playbook: grafana.yaml
2128
vars:
@@ -53,10 +60,23 @@
5360
vars:
5461
host: telemetry
5562

63+
- name: Run Go playbook
64+
ansible.builtin.import_playbook: go.yaml
65+
vars:
66+
host: telemetry
67+
5668
- name: Setup Telemetry
5769
hosts: telemetry
5870

5971
tasks:
72+
- name: Make sure /etc/default/tailscaled exists
73+
become: true
74+
file:
75+
path: /etc/default/tailscaled
76+
state: touch
77+
vars:
78+
ansible_ssh_user: "{{ admin_user }}"
79+
6080
- name: Add permit to tailscale for caddy
6181
become: true
6282
lineinfile:
@@ -73,9 +93,17 @@
7393
ansible.builtin.git:
7494
repo: https://github.com/yetanotherco/aligned_layer.git
7595
dest: /home/{{ ansible_user }}/repos/telemetry/aligned_layer
76-
version: v0.10.2
96+
#version: v0.12.0
97+
version: staging
7798
recursive: false
7899

100+
- name: Run telemetry_compile_bls_verifier target
101+
make:
102+
target: telemetry_compile_bls_verifier
103+
chdir: /home/{{ ansible_user }}/repos/telemetry/aligned_layer
104+
environment:
105+
PATH: "{{ ansible_env.PATH }}:/usr/local/go/bin"
106+
79107
- name: Add environment file for Telemetry API
80108
template:
81109
src: telemetry_api/telemetry_env.j2
@@ -102,14 +130,19 @@
102130

103131
- name: Build release for Telemetry API
104132
shell:
105-
cmd: |
106-
source .env && mix release
133+
cmd: source .env && mix release
107134
chdir: /home/{{ ansible_user }}/repos/telemetry/aligned_layer/telemetry_api
108135
executable: /bin/bash
109136
creates: /home/{{ ansible_user }}/repos/telemetry/aligned_layer/telemetry_api/_build/prod/rel/telemetry_api/bin/
110137
environment:
111138
MIX_ENV: prod
112139

140+
- name: Run migrations for Telemetry API
141+
shell:
142+
cmd: source .env && _build/prod/rel/telemetry_api/bin/migrate
143+
chdir: /home/{{ ansible_user }}/repos/telemetry/aligned_layer/telemetry_api
144+
executable: /bin/bash
145+
113146
- name: Ensure ~/.config/systemd/user/ directory exists
114147
file:
115148
path: /home/{{ ansible_user }}/.config/systemd/user/

0 commit comments

Comments
 (0)