Merge pull request #205 from stackhpc/fix/grafana859

sjpb · web-flow · commit 82208d14971b · 2022-08-17T12:43:43.000+01:00
Update grafana and fix control image build grafana issues
diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
@@ -163,6 +163,15 @@ jobs:
           ANSIBLE_FORCE_COLOR: True
           OS_CLOUD: openstack
 
+      - name: Check MPI-based tests are shown in Grafana
+        run: |
+          . venv/bin/activate
+          . environments/${{ matrix.cloud }}/activate
+          ansible-playbook -vv ansible/ci/check_grafana.yml
+        env:
+          ANSIBLE_FORCE_COLOR: True
+          OS_CLOUD: openstack
+
       - name: Delete infrastructure
         run: |
           . venv/bin/activate
diff --git a/ansible/.gitignore b/ansible/.gitignore
@@ -10,8 +10,6 @@ roles/*
 !roles/podman/**
 !roles/grafana-dashboards/
 !roles/grafana-dashboards/**
-!roles/grafana-datasources/
-!roles/grafana-datasources/**
 !roles/passwords/
 !roles/passwords/**
 !roles/fail2ban/
diff --git a/ansible/ci/check_grafana.yml b/ansible/ci/check_grafana.yml
@@ -0,0 +1,26 @@
+# Checks Slurm jobs from hpctests are shown in Grafana.
+# Can't actually check the dashboard programmatically so this queries the datasource used by the dashboard instead.
+
+- hosts: control  # so proxying etc is irrelevant
+  gather_facts: false
+  become: false
+  tasks:
+    - name: Wait for slurm-stats file to exist (run by cron)
+      ansible.builtin.wait_for:
+        path: /var/log/slurm-stats/finished_jobs.json
+        timeout: 315  # slurm stats cron job runs every 5 mins
+
+    - name: Query grafana for expected hpctests jobs
+      grafana_elasticsearch_query:
+        grafana_url: "http://{{ grafana_api_address }}:{{ grafana_port }}"
+        grafana_username: grafana
+        grafana_password: "{{ vault_grafana_admin_password }}"
+        datasource: slurmstats
+        index_pattern: 'filebeat-*'
+      register: _slurm_stats_jobs
+      until: _expected_jobs | difference(_found_jobs) == []  # i.e. every expected job name has been found
+      retries: 60  # with the delay below, polls for up to 5 mins
+      delay: 5
+      vars:
+        _found_jobs: "{{ _slurm_stats_jobs.docs | map(attribute='JobName', default='(json error in slurmstats data)') | list }}"
+        _expected_jobs: ['hpl-solo.sh', 'pingpong.sh', 'pingmatrix.sh']
diff --git a/ansible/ci/library/grafana_elasticsearch_query.py b/ansible/ci/library/grafana_elasticsearch_query.py
@@ -0,0 +1,88 @@
+#!/usr/bin/python
+
+# Copyright: (c) 2022 Steve Brasier steve@stackhpc.com
+from __future__ import (absolute_import, division, print_function)
+__metaclass__ = type
+
+DOCUMENTATION = r'''
+---
+module: grafana_elasticsearch_query
+
+short_description: Get elasticsearch hits via grafana
+
+version_added: "1.0.0"
+
+description: Returns hits from selected datasource and indices.
+
+author:
+    - Steve Brasier
+'''
+
+EXAMPLES = r'''
+- name: Get elasticsearch hits
+  grafana_elasticsearch_query:
+    grafana_url: http://{{ grafana_api_address }}:{{ grafana_port }}
+    grafana_username: grafana
+    grafana_password: "{{ vault_grafana_admin_password }}"
+    datasource: slurmstats
+    index_pattern: 'filebeat-*'
+'''
+
+RETURN = r'''
+# Values returned by this module on success.
+docs:
+  description: List of dicts with the original json in each document.
+  returned: always
+  type: list
+'''
+
+from ansible.module_utils.basic import AnsibleModule
+import requests
+import json
+
+def run_module():
+    """Return the documents found via Grafana's proxy to an elasticsearch datasource.
+
+    Looks up the datasource named `datasource` through the Grafana HTTP API, then
+    runs `_search` on `index_pattern` through the datasource proxy. Exits with `docs`,
+    the `_source.json` value of every hit; fails the task on any HTTP/lookup error.
+    """
+    module_args = dict(
+        grafana_url=dict(type="str", required=True),
+        grafana_username=dict(type="str", required=True),
+        grafana_password=dict(type="str", required=True, no_log=True),  # keep secret out of logs
+        datasource=dict(type="str", required=True),
+        index_pattern=dict(type="str", required=True),
+    )
+    module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)
+    auth = (module.params['grafana_username'], module.params['grafana_password'])
+    base_url = module.params["grafana_url"].rstrip('/')
+
+    def get_json(url):
+        # Fail the task with a readable message rather than a traceback on HTTP errors.
+        try:
+            r = requests.get(url, auth=auth, timeout=30)
+            r.raise_for_status()
+            return json.loads(r.text)
+        except (requests.RequestException, ValueError) as e:
+            module.fail_json(msg="Request to %s failed: %s" % (url, e))
+
+    # select required datasource from all those defined:
+    matches = [s for s in get_json(base_url + '/api/datasources') if s['name'] == module.params["datasource"]]
+    if not matches:
+        module.fail_json(msg="No grafana datasource named %r" % module.params["datasource"])
+
+    # get documents:
+    search = get_json('%s/api/datasources/proxy/%s/%s/_search' % (base_url, matches[0]['id'], module.params['index_pattern']))
+    # see https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html#search-api-response-body:
+    docs = [h['_source']['json'] for h in search['hits']['hits']]
+
+    module.exit_json(changed=False, docs=docs)
+
+
+def main():
+    run_module()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml
@@ -106,23 +106,10 @@
     - assert:
         that: vault_grafana_admin_password is defined
         fail_msg: "Must define vault_grafana_admin_password - use `ansible-playbook generate-passwords.yml` to generate a set of passwords"
-    - import_role:
+    - include_role:
         name: cloudalchemy.grafana
       vars:
-        # We use internal roles to register the datasources and dashboards as the roles
-        # does not support all options that we require.
+        # We use internal roles to register the dashboards as the role does not support all options that we require.
         grafana_dashboards: []
-        grafana_datasources: []
-
-- name: Initialise grafana
-  hosts: grafana
-  tags:
-    - grafana-init
-  tasks:
-    - assert:
-        that: vault_grafana_admin_password is defined
-        fail_msg: "Must define vault_grafana_admin_password - use `ansible-playbook generate-passwords.yml` to generate a set of passwords"
-    - import_role:
-        name: grafana-datasources
-    - import_role:
+    - import_role: # done in same play so it can use handlers from cloudalchemy.grafana
         name: grafana-dashboards
diff --git a/ansible/roles/grafana-dashboards/defaults/main.yml b/ansible/roles/grafana-dashboards/defaults/main.yml
@@ -9,4 +9,7 @@ grafana_api_url: "{{ grafana_url }}"
 
 grafana_security:
   admin_user: admin
-  admin_password: ""
+  admin_password: ""
+
+grafana_data_dir: "/var/lib/grafana"
+grafana_dashboards_dir: "dashboards"
diff --git a/ansible/roles/grafana-dashboards/tasks/main.yml b/ansible/roles/grafana-dashboards/tasks/main.yml
@@ -2,6 +2,7 @@
 # The MIT License (MIT)
 
 # Copyright (c) 2017-2018 Pawel Krupa, Roman Demachkovych
+# Copyright (c) 2022 Steve Brasier
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -22,7 +23,6 @@
 # SOFTWARE.
 
 - become: false
-  run_once: true
   block:
     - name: Create local grafana dashboard directory
       tempfile:
@@ -108,17 +108,51 @@
       when:
         - grafana_dashboards | length > 0
 
-    - name: import grafana dashboards
-      community.grafana.grafana_dashboard:
-        grafana_url: "{{ grafana_api_url }}"
-        grafana_user: "{{ grafana_security.admin_user }}"
-        grafana_password: "{{ grafana_security.admin_password }}"
-        path: "{{ _tmp_dashboards.path }}/{{ ((item.dashboard_id | string) + '.json') if 'dashboard_id' in item else item.dashboard_file }}"
-        # message renamed to commit_message (Be aware if using old ansible)
-        # https://github.com/ansible/ansible/pull/60051
-        #commit_message: Updated by ansible
-        state: present
-        overwrite: true
-      #no_log: true
-      with_items: "{{ grafana_dashboards }}"
-      when: grafana_state | default('started') != 'stopped'
+- name: Create/Update dashboards file (provisioning)
+  become: true
+  copy:
+    dest: "/etc/grafana/provisioning/dashboards/ansible.yml"  # file-based dashboard provider definition
+    content: |
+      apiVersion: 1
+      providers:
+        - name: 'default'
+          orgId: 1
+          folder: ''
+          type: file
+          options:
+            path: "{{ grafana_data_dir }}/dashboards"
+    backup: false
+    owner: root
+    group: grafana
+    mode: "0640"  # quoted so the octal mode cannot be re-typed by the YAML parser
+  notify: restart grafana
+
+- name: Register previously copied dashboards
+  become: true
+  find:
+    paths: "{{ grafana_data_dir }}/dashboards"
+    hidden: true
+    patterns:
+      - "*.json"
+  register: _dashboards_present
+
+- name: Import grafana dashboards
+  become: true
+  copy:
+    remote_src: yes
+    src: "{{ _tmp_dashboards.path }}/" # Note trailing / to only copy contents, not directory itself
+    dest: "{{ grafana_data_dir }}/dashboards/"
+  register: _dashboards_copied
+  notify: "provisioned dashboards changed"
+
+- name: Get dashboard lists  # paths found before the copy vs. destinations reported by the copy
+  set_fact:
+    _dashboards_present_list: "{{ _dashboards_present | json_query('files[*].path') | default([]) }}"
+    _dashboards_copied_list: "{{ _dashboards_copied | json_query('results[*].dest') | default([]) }}" # NOTE(review): _dashboards_copied is registered from a non-looped copy, so it presumably has no 'results' key - confirm this list is not always empty
+
+- name: Remove installed dashboards not defined through this role
+  become: true
+  file:
+    path: "{{ item }}"
+    state: absent
+  with_items: "{{ _dashboards_present_list | difference( _dashboards_copied_list ) }}"
diff --git a/ansible/roles/grafana-datasources/defaults/main.yml b/ansible/roles/grafana-datasources/defaults/main.yml
diff --git a/ansible/roles/grafana-datasources/tasks/main.yml b/ansible/roles/grafana-datasources/tasks/main.yml
diff --git a/environments/arcus/builder.pkrvars.hcl b/environments/arcus/builder.pkrvars.hcl
@@ -1,6 +1,6 @@
 flavor = "vm.alaska.cpu.general.small"
 networks = ["a262aabd-e6bf-4440-a155-13dbc1b5db0e"] # WCDC-iLab-60
-source_image_name = "openhpc-220808-1510.qcow2"
+source_image_name = "openhpc-220810-0839.qcow2"
 ssh_keypair_name = "slurm-app-ci"
 security_groups = ["default", "SSH"]
 ssh_bastion_host = "128.232.222.183"
diff --git a/environments/arcus/hooks/post.yml b/environments/arcus/hooks/post.yml
@@ -5,6 +5,24 @@
     - name: Check slurm up after direct deploy
       import_tasks: check_slurm.yml
 
+- hosts: control
+  become: true
+  gather_facts: false
+  tasks:
+    - name: Write CI-generated inventory and secrets for debugging
+      ansible.builtin.copy:
+        dest: /etc/ci-config/
+        src: "{{ item }}"
+        directory_mode: "0400"
+        mode: "0400"  # quoted octal; files readable by root only
+        owner: root
+        group: root
+      no_log: true  # the looped files include secrets.yml
+      loop:
+        - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts"
+        - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml"
+        - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/basic_users/defaults.yml"
+
 - hosts: localhost
   become: false
   tags: build
diff --git a/environments/arcus/terraform/main.tf b/environments/arcus/terraform/main.tf
@@ -18,18 +18,18 @@ module "cluster" {
     key_pair = "slurm-app-ci"
     control_node = {
         flavor: "vm.alaska.cpu.general.small"
-        image: "openhpc-220808-1510.qcow2"
+        image: "openhpc-220810-0839.qcow2"
     }
     login_nodes = {
         login-0: {
             flavor: "vm.alaska.cpu.general.small"
-            image: "openhpc-220808-1510.qcow2"
+            image: "openhpc-220810-0839.qcow2"
         }
     }
     compute_types = {
         small: {
             flavor: "vm.alaska.cpu.general.small"
-            image: "openhpc-220808-1510.qcow2"
+            image: "openhpc-220810-0839.qcow2"
         }
     }
     compute_nodes = {
diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml
diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml