Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/chatops.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ jobs:
pip install ansible
pip install ansible-lint
cd chatops_deployment
ansible-galaxy install -r ansible/requirements.yml
ansible-lint --project-dir ansible

- name: Run ShellCheck
Expand Down
3 changes: 1 addition & 2 deletions chatops_deployment/ansible/configure.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,8 @@

- name: Set up systemd exporters
hosts: stack
gather_facts: false
roles:
- role: systemd_exporter
- role: prometheus.prometheus.systemd_exporter
tags:
- systemd_exporter

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Extra command-line arguments for Prometheus: load the web (TLS / basic-auth)
# configuration from /etc/prometheus/prometheus-web.yml.
# NOTE(review): presumably deployed to /etc/default/prometheus by the
# "Copy systemd arguments" task and read by the packaged unit - confirm.
ARGS="--web.config.file=/etc/prometheus/prometheus-web.yml"
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
---
- type: filestream
id: prometheus
enabled: true
paths:
- /opt/prometheus/prometheus.log
fields:
service.name: prometheus
fields_under_root: true
# Filebeat input that collects Prometheus logs from the systemd journal.
# NOTE(review): if this file is loaded through `filebeat.config.inputs`
# (external config), Filebeat expects a bare list of inputs without the
# `filebeat.inputs:` key - confirm how this file is included.
filebeat.inputs:
- type: journald
id: prometheus
# Restrict collection to journal entries from the prometheus.service unit.
include_matches.match:
- _SYSTEMD_UNIT=prometheus.service
# Attach the service name to every event; fields_under_root places it at the
# top level of the event rather than under a `fields` sub-object.
fields:
service.name: prometheus
fields_under_root: true

This file was deleted.

7 changes: 5 additions & 2 deletions chatops_deployment/ansible/roles/prometheus/files/rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,13 @@ groups:
description: "Container has been not running for more than 30 seconds."

- alert: SystemdServiceDown
expr: systemd_unit_state{name=~"grafana-server.service|haproxy.service",state=~"failed|inactive"} == 1
expr: |
systemd_unit_state{
name=~"grafana-server.service|haproxy.service|kibana.service|elasticsearch.service|logstash.service|filebeat.service",state=~"failed|inactive"
} == 1
for: 30s
labels:
severity: critical
annotations:
summary: "Systemd service {{ $labels.name }} on host {{ $labels.instance }} is in state {{ $labels.state }}."
description: "Systemd service has been in failed state for 30s."
# The rule matches both "failed" and "inactive" units, so report the actual
# state label instead of hard-coding "failed" in front of it.
description: "Systemd service has been in state {{ $labels.state }} for 30s."
24 changes: 0 additions & 24 deletions chatops_deployment/ansible/roles/prometheus/handlers/main.yml

This file was deleted.

88 changes: 31 additions & 57 deletions chatops_deployment/ansible/roles/prometheus/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -1,41 +1,9 @@
---
- name: Create prometheus group
- name: Install Prometheus
become: true
ansible.builtin.group:
ansible.builtin.apt:
name: prometheus
state: present

- name: Add ubuntu to prometheus group
become: true
ansible.builtin.user:
name: ubuntu
group: prometheus

- name: Reset connection for group changes
ansible.builtin.meta: reset_connection

- name: Create a prometheus user
become: true
ansible.builtin.user:
name: prometheus
create_home: false
group: prometheus
system: true

- name: Download and extract Prometheus
become: true
ansible.builtin.unarchive:
src: " https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-amd64.tar.gz "
dest: /tmp
remote_src: true
creates: "/opt/prometheus"
mode: "0774"
notify:
- Move Prometheus binaries
- Start Prometheus

- name: Flush handlers to move binaries
ansible.builtin.meta: flush_handlers
state: latest # noqa: package-latest

- name: Set permissions on volume
become: true
Expand All @@ -47,55 +15,61 @@
mode: "0774"
recurse: true

- name: Copy prometheus service file
- name: Copy prometheus rules file
become: true
ansible.builtin.copy:
src: prometheus.service
dest: /etc/systemd/system/prometheus.service
src: rules.yml
dest: /etc/prometheus/rules.yml
owner: prometheus
group: prometheus
mode: "0774"
notify:
- Start Prometheus

- name: Copy prometheus rules file
- name: Copy systemd arguments
become: true
ansible.builtin.copy:
src: rules.yml
dest: /opt/prometheus/rules.yml
src: prometheus
dest: "/etc/default/prometheus"
owner: prometheus
group: prometheus
mode: "0774"
notify:
- Restart Prometheus
mode: "0644"

- name: Template prometheus config
become: true
ansible.builtin.template:
src: "{{ item }}"
dest: "/opt/prometheus/{{ item[:-3] }}"
dest: "/etc/prometheus/{{ item[:-3] }}"
owner: prometheus
group: prometheus
# The rendered configs embed basic-auth credentials, so keep them readable by
# the prometheus owner/group only; config files do not need the execute bit.
mode: "0640"
notify:
- Restart Prometheus
loop:
- prometheus.yml.j2
- web.yml.j2
- prometheus-web.yml.j2

- name: Create Prometheus log directory
ansible.builtin.file:
path: /var/log/prometheus
state: directory
- name: Copy certificate and key
become: true
ansible.builtin.copy:
src: "./{{ env }}_ssl/{{ item }}"
dest: "/etc/prometheus/{{ item }}"
owner: prometheus
group: prometheus
mode: "0770"
mode: "0440"
loop:
- prometheus.key
- prometheus.crt
- alertmanager.crt

- name: Restart Prometheus
become: true
ansible.builtin.systemd_service:
name: prometheus.service
state: restarted
daemon_reload: true

- name: Copy filebeat external config
become: true
ansible.builtin.copy:
src: prometheus.filebeat.yml
dest: /var/filebeat/prometheus.filebeat.yml
owner: root
group: root
owner: prometheus
group: prometheus
mode: "0640"
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Prometheus web configuration: basic-auth users and the TLS server key pair.
# The templated key and value are quoted so that a username or hash containing
# YAML-significant sequences (": ", " #", a leading indicator character) still
# renders as a plain string.
# NOTE(review): password_hash is called without a fixed salt, so the rendered
# hash presumably changes on every run - confirm that this churn is acceptable.
basic_auth_users:
  "{{ prometheus_username }}": "{{ prometheus_password | ansible.builtin.password_hash(hashtype='bcrypt') }}"

# Certificate and private key used to serve the Prometheus endpoint over HTTPS.
tls_server_config:
  cert_file: /etc/prometheus/prometheus.crt
  key_file: /etc/prometheus/prometheus.key
Original file line number Diff line number Diff line change
@@ -1,42 +1,48 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
monitor: 'chatops-monitor'

scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
scheme: https
basic_auth:
username: "{{ prometheus_username }}"
password: "{{ prometheus_password }}"
tls_config:
ca_file: /etc/prometheus/prometheus.crt

- job_name: 'alertmanager'
static_configs:
- targets: ['localhost:9093']
basic_auth:
username: "{{ alertmanager_username }}"
password: "{{ alertmanager_password }}"
scheme: https
tls_config:
ca_file: /etc/prometheus/alertmanager.crt

- job_name: 'load-balancer-metrics'
static_configs:
- targets: ['{{ loadbalancer_private_ip }}:8405']
- targets: ['localhost:8405']

- job_name: 'chatops_cadvisor'
static_configs:
{% for host in groups['chatops'] %}
- targets: ['{{ host }}:8080']
{% endfor %}
- targets: ['localhost:8080']

- job_name: 'systemd_exporter'
static_configs:
{% for host in groups['private'] %}
- targets: ['{{ host }}:9558']
{% endfor %}
- targets: ['{{ loadbalancer_private_ip }}:9558']
- targets: ['localhost:9558']

- job_name: 'node_exporter'
static_configs:
- targets: ['localhost:9100']

rule_files:
- '/opt/prometheus/rules.yml'
- '/etc/prometheus/rules.yml'

alerting:
alertmanagers:
Expand All @@ -47,3 +53,5 @@ alerting:
basic_auth:
username: "{{ alertmanager_username }}"
password: "{{ alertmanager_password }}"
tls_config:
ca_file: /etc/prometheus/alertmanager.crt

This file was deleted.

This file was deleted.

Loading