From e4a47b24e509172278d8194bd4a7725447cef505 Mon Sep 17 00:00:00 2001 From: Roman Safronov Date: Mon, 25 Aug 2025 22:07:06 +0300 Subject: [PATCH] Update kepler version and container template Update kepler image default version to v0.10.2. The new kepler (starting from 0.10) has a completely different set of config options, so kepler's template was updated accordingly. - Removed the '-v' option as it's no longer supported. - Added the --config.file option to specify a file with settings. - There is no longer a need to specify desired metrics using environment variables as all metrics are enabled by default. Also changed kepler healthcheck script: - The 'healthz' URL path no longer exists, so use 'metrics' instead. - Fetch only headers and status code for healthcheck in order to minimize load. - Removed the check for 'ok' in the response as it is no longer relevant. --- .../defaults/main.yml | 2 +- .../files/healthchecks/exporter/healthcheck | 12 +++--------- .../tasks/configure.yml | 8 ++++++++ .../templates/kepler-config.yaml.j2 | 10 ++++++++++ .../templates/kepler.json.j2 | 11 ++--------- 5 files changed, 24 insertions(+), 19 deletions(-) create mode 100644 roles/edpm_telemetry_power_monitoring/templates/kepler-config.yaml.j2 diff --git a/roles/edpm_telemetry_power_monitoring/defaults/main.yml b/roles/edpm_telemetry_power_monitoring/defaults/main.yml index ad3a1a16e..eacc96dfc 100644 --- a/roles/edpm_telemetry_power_monitoring/defaults/main.yml +++ b/roles/edpm_telemetry_power_monitoring/defaults/main.yml @@ -38,7 +38,7 @@ edpm_telemetry_image_download_retries: "{{ edpm_download_retries | default(5) }} edpm_telemetry_old_tripleo_compute_sevices: - tripleo_ceilometer_agent_ipmi.service # Image to use for kepler -edpm_telemetry_kepler_image: "quay.io/sustainable_computing_io/kepler:release-0.7.12" +edpm_telemetry_kepler_image: "quay.io/sustainable_computing_io/kepler:v0.10.2" # Instruction for distribution of container health check scripts 
edpm_telemetry_power_monitoring_healthcheck_sources: ceilometer_agent_ipmi: ceilometer_agent diff --git a/roles/edpm_telemetry_power_monitoring/files/healthchecks/exporter/healthcheck b/roles/edpm_telemetry_power_monitoring/files/healthchecks/exporter/healthcheck index 53a669f60..bfd5031ab 100644 --- a/roles/edpm_telemetry_power_monitoring/files/healthchecks/exporter/healthcheck +++ b/roles/edpm_telemetry_power_monitoring/files/healthchecks/exporter/healthcheck @@ -15,11 +15,11 @@ # License for the specific language governing permissions and limitations # under the License. -URL="http://0.0.0.0:8888/healthz" +URL="http://0.0.0.0:8888/metrics" TIMEOUT=5 # Timeout in seconds # Get the HTTP status code and response body using curl -RESPONSE=$(curl -s -w "%{http_code}" $URL --max-time $TIMEOUT) +RESPONSE=$(curl -I -s -w "%{http_code}" $URL --max-time $TIMEOUT) BODY=${RESPONSE:0:-3} # Extract the body (all but the last 3 characters) HTTP_CODE=${RESPONSE: -3} # Extract the last 3 characters as the HTTP status code @@ -29,11 +29,5 @@ if [ "$HTTP_CODE" -ne 200 ]; then exit 1 fi -# Check if the response body contains "ok" -if [[ "$BODY" != *"ok"* ]]; then - echo "$1 Health check failed: Response body does not contain 'ok'" - exit 1 -fi - -echo "$1 Health check passed: HTTP status code $HTTP_CODE, Health response 'ok'" +echo "$1 Health check passed: HTTP status code $HTTP_CODE" exit 0 diff --git a/roles/edpm_telemetry_power_monitoring/tasks/configure.yml b/roles/edpm_telemetry_power_monitoring/tasks/configure.yml index 3bd68650c..79499246b 100644 --- a/roles/edpm_telemetry_power_monitoring/tasks/configure.yml +++ b/roles/edpm_telemetry_power_monitoring/tasks/configure.yml @@ -98,6 +98,13 @@ path: "{{ edpm_telemetry_certs }}/tls.key" register: tls_key_stat +- name: Gather virtualization fact + ansible.builtin.setup: + gather_subset: + - "!all" + - "!min" + - "virtual" + - name: Render container config templates ansible.builtin.template: src: "{{ item }}" @@ -108,6 +115,7 @@ 
vars: ca_bundle_exists: "{{ ca_bundle_stat_res.stat.exists }}" tls_cert_exists: "{{ tls_crt_stat.stat.exists and tls_key_stat.stat.exists }}" + running_in_vm: "{{ ansible_facts['virtualization_role'] == 'guest' }}" - name: Configure tls if present when: diff --git a/roles/edpm_telemetry_power_monitoring/templates/kepler-config.yaml.j2 b/roles/edpm_telemetry_power_monitoring/templates/kepler-config.yaml.j2 new file mode 100644 index 000000000..90ba29d69 --- /dev/null +++ b/roles/edpm_telemetry_power_monitoring/templates/kepler-config.yaml.j2 @@ -0,0 +1,10 @@ +web: + listenAddresses: + - :8888 +{% if running_in_vm|bool %} +# WARN DO NOT ENABLE THIS IN PRODUCTION - for CI testing only +dev: + fake-cpu-meter: + enabled: true + zones: ["package", "core", "dram"] +{% endif %} diff --git a/roles/edpm_telemetry_power_monitoring/templates/kepler.json.j2 b/roles/edpm_telemetry_power_monitoring/templates/kepler.json.j2 index 603058d50..b782c4dc6 100644 --- a/roles/edpm_telemetry_power_monitoring/templates/kepler.json.j2 +++ b/roles/edpm_telemetry_power_monitoring/templates/kepler.json.j2 @@ -4,16 +4,8 @@ "restart": "always", "ports": ["8888:8888"], "net": "host", - "command": "-v=2", + "command": "--config.file=/etc/kepler/kepler-config.yaml", "recreate": true, - "environment": { - "ENABLE_GPU": "true", - "EXPOSE_CONTAINER_METRICS": "true", - "ENABLE_PROCESS_METRICS": "true", - "EXPOSE_VM_METRICS": "true", - "EXPOSE_ESTIMATED_IDLE_POWER_METRICS": "false", - "LIBVIRT_METADATA_URI": "http://openstack.org/xmlns/libvirt/nova/1.1" - }, {% if edpm_telemetry_power_monitoring_healthcheck %} "healthcheck": { "test": "/openstack/healthcheck kepler", @@ -21,6 +13,7 @@ }, {% endif %} "volumes": [ + "{{ edpm_telemetry_config_dest }}/kepler-config.yaml:/etc/kepler/kepler-config.yaml:ro", "/lib/modules:/lib/modules:ro", "/run/libvirt:/run/libvirt:shared,ro", "/sys:/sys",