diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml index cae50d566..caa702e09 100644 --- a/.github/workflows/stackhpc-all-in-one.yml +++ b/.github/workflows/stackhpc-all-in-one.yml @@ -423,7 +423,8 @@ jobs: -v $(pwd)/sct-results:/stack/sct-results \ -e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \ $KAYOBE_IMAGE \ - /stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/stackhpc-cloud-tests.yml' -e sct_version=${{ inputs.stackhpc_cloud_tests_version }} + /stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/stackhpc-cloud-tests.yml' \ + -e sct_version=${{ inputs.stackhpc_cloud_tests_version }} env: KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }} diff --git a/.github/workflows/stackhpc-container-image-build.yml b/.github/workflows/stackhpc-container-image-build.yml index 60426f243..595ab9cd3 100644 --- a/.github/workflows/stackhpc-container-image-build.yml +++ b/.github/workflows/stackhpc-container-image-build.yml @@ -92,17 +92,17 @@ jobs: - name: Generate build matrix id: set-matrix run: | - comma="" - echo -n "matrix={\"distro\": [" >> $GITHUB_OUTPUT + output="{'distro': [" if [[ ${{ inputs.rocky-linux-9 }} == 'true' ]]; then - echo -n "$comma\"rocky\"" >> $GITHUB_OUTPUT - comma=", " + output+="{'name': 'rocky', 'release': 9}," fi if [[ ${{ inputs.ubuntu-noble }} == 'true' ]]; then - echo -n "$comma\"ubuntu\"" >> $GITHUB_OUTPUT - comma=", " + output+="{'name': 'ubuntu', 'release': 'noble'}," fi - echo "]}" >> $GITHUB_OUTPUT + # remove trailing comma + output="${output%,}" + output+="]}" + echo "matrix=$output" >> $GITHUB_OUTPUT - name: Display container datetime tag run: | @@ -173,7 +173,7 @@ jobs: - name: Get Kolla tag id: write-kolla-tag - run: echo "kolla-tag=${{ needs.generate-tag.outputs.openstack_release }}-${{ matrix.distro }}-${{ 
matrix.distro == 'rocky' && '9' || 'noble' }}-${{ needs.generate-tag.outputs.datetime_tag }}" >> $GITHUB_OUTPUT + run: echo "kolla-tag=${{ needs.generate-tag.outputs.openstack_release }}-${{ matrix.distro.name }}-${{ matrix.distro.release }}-${{ needs.generate-tag.outputs.datetime_tag }}" >> $GITHUB_OUTPUT - name: Configure localhost as a seed run: | @@ -203,7 +203,8 @@ jobs: continue-on-error: true run: | args="${{ inputs.regexes }}" - args="$args -e kolla_base_distro=${{ matrix.distro }}" + args="$args -e kolla_base_distro=${{ matrix.distro.name }}" + args="$args -e kolla_base_distro_version=${{ matrix.distro.release }}" args="$args -e kolla_tag=${{ steps.write-kolla-tag.outputs.kolla-tag }}" args="$args -e stackhpc_repo_mirror_auth_proxy_enabled=true" source venvs/kayobe/bin/activate && @@ -221,7 +222,8 @@ jobs: id: build_seed_images continue-on-error: true run: | - args="-e kolla_base_distro=${{ matrix.distro }}" + args="-e kolla_base_distro=${{ matrix.distro.name }}" + args="$args -e kolla_base_distro_version=${{ matrix.distro.release }}" args="$args -e kolla_tag=${{ steps.write-kolla-tag.outputs.kolla-tag }}" args="$args -e stackhpc_repo_mirror_auth_proxy_enabled=true" source venvs/kayobe/bin/activate && @@ -236,13 +238,13 @@ jobs: if: inputs.seed - name: Get built container images - run: docker image ls --filter "reference=ark.stackhpc.com/stackhpc-dev/*:${{ steps.write-kolla-tag.outputs.kolla-tag }}" > ${{ matrix.distro }}-container-images + run: docker image ls --filter "reference=ark.stackhpc.com/stackhpc-dev/*:${{ steps.write-kolla-tag.outputs.kolla-tag }}" > ${{ matrix.distro.name }}-${{ matrix.distro.release }}-container-images - name: Fail if no images have been built - run: if [ $(wc -l < ${{ matrix.distro }}-container-images) -le 1 ]; then exit 1; fi + run: if [ $(wc -l < ${{ matrix.distro.name }}-${{ matrix.distro.release }}-container-images) -le 1 ]; then exit 1; fi - name: Scan built container images - run: src/kayobe-config/tools/scan-images.sh 
${{ matrix.distro }} ${{ steps.write-kolla-tag.outputs.kolla-tag }} + run: src/kayobe-config/tools/scan-images.sh ${{ matrix.distro.name }}-${{ matrix.distro.release }} ${{ steps.write-kolla-tag.outputs.kolla-tag }} - name: Move image scan logs to output artifact run: mv image-scan-output image-build-logs/image-scan-output @@ -297,7 +299,7 @@ jobs: - name: Upload output artifact uses: actions/upload-artifact@v4 with: - name: ${{ matrix.distro }}-logs + name: ${{ matrix.distro.name }}-${{ matrix.distro.release }}-logs path: image-build-logs retention-days: 7 if: ${{ !cancelled() }} diff --git a/.github/workflows/stackhpc-pull-request.yml b/.github/workflows/stackhpc-pull-request.yml index 0a62c2fd0..3a6625006 100644 --- a/.github/workflows/stackhpc-pull-request.yml +++ b/.github/workflows/stackhpc-pull-request.yml @@ -18,6 +18,9 @@ jobs: pull-requests: read name: Check changed files if: github.repository == 'stackhpc/stackhpc-kayobe-config' + needs: + - lint + - tox outputs: aio: ${{ steps.changes.outputs.aio }} build-kayobe-image: ${{ steps.changes.outputs.build-kayobe-image }} diff --git a/.github/workflows/stackhpc-update-kolla.yml b/.github/workflows/stackhpc-update-kolla.yml new file mode 100644 index 000000000..0f5d3e62e --- /dev/null +++ b/.github/workflows/stackhpc-update-kolla.yml @@ -0,0 +1,29 @@ +name: Update Kolla versions + +on: + # Allow manual executions + workflow_dispatch: + # Run weekly on Tuesday + schedule: + - cron: '0 0 * * 2' + +jobs: + update-from-branch: + name: Update dependencies + strategy: + matrix: + include: + - version: stackhpc/2023.1 + codename: Antelope + - version: stackhpc/2024.1 + codename: Caracal + - version: stackhpc/master + codename: master + uses: ./.github/workflows/update-dependencies.yml + with: + openstack_version: ${{ matrix.version }} + openstack_codename: ${{ matrix.codename }} + permissions: + contents: write + pull-requests: write + if: github.repository == 'stackhpc/stackhpc-kayobe-config' diff --git 
a/.github/workflows/update-dependencies.yml b/.github/workflows/update-dependencies.yml index 2b9c8bda7..dea909df5 100644 --- a/.github/workflows/update-dependencies.yml +++ b/.github/workflows/update-dependencies.yml @@ -1,11 +1,16 @@ name: Update dependencies on: - # Allow manual executions - workflow_dispatch: - # Run nightly - schedule: - - cron: '0 0 * * *' + workflow_call: + inputs: + openstack_version: + description: OpenStack version + type: string + required: true + openstack_codename: + description: OpenStack codename + type: string + required: true jobs: propose_github_release_updates: @@ -38,20 +43,14 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: + ref: ${{ inputs.openstack_version }} path: ${{ github.workspace }}/src/kayobe-config - - name: Determine OpenStack release - id: openstack_release - run: | - BRANCH=$(awk -F'=' '/defaultbranch/ {print $2}' .gitreview) - echo "openstack_release=${BRANCH}" | sed -E "s,(stable|unmaintained)/,," >> $GITHUB_OUTPUT - working-directory: ${{ github.workspace }}/src/kayobe-config - - name: Checkout the dependency repo uses: actions/checkout@v4 with: repository: ${{ matrix.repository }} - ref: stackhpc/${{ steps.openstack_release.outputs.openstack_release }} + ref: ${{ inputs.openstack_version }} fetch-tags: true path: ${{ github.workspace }}/src/${{ matrix.key }} @@ -78,10 +77,17 @@ jobs: path: ${{ github.workspace }}/src/kayobe-config commit-message: >- Bump ${{ matrix.key }} to ${{ steps.latest_tag.outputs.latest_tag }} - branch: update-dependency/${{ matrix.key }} + author: stackhpc-ci <22933334+stackhpc-ci@users.noreply.github.com> + branch: update-dependency/${{ matrix.key }}/${{ inputs.openstack_version }} delete-branch: true title: >- Bump ${{ matrix.key }} to ${{ steps.latest_tag.outputs.latest_tag }} body: > - This PR was created automatically to update + This PR was created automatically to update ${{ inputs.openstack_version }} ${{ matrix.key }} to ${{ steps.latest_tag.outputs.latest_tag }}. 
+ + GitHub Release Changelog: + https://github.com/stackhpc/${{ matrix.key }}/releases/tag/${{ steps.latest_tag.outputs.latest_tag }} + labels: | + automated + ${{ inputs.openstack_codename }} diff --git a/doc/source/configuration/cephadm.rst b/doc/source/configuration/cephadm.rst index f0e6af15a..c4da93c81 100644 --- a/doc/source/configuration/cephadm.rst +++ b/doc/source/configuration/cephadm.rst @@ -206,8 +206,8 @@ Pools ~~~~~ The following example pools should be sufficient to work with the -default `external Ceph -configuration `__ +default :kolla-ansible-doc:`external Ceph +configuration ` for Cinder, Cinder backup, Glance, and Nova in Kolla Ansible. .. code:: yaml @@ -234,8 +234,8 @@ Keys ~~~~ The following example keys should be sufficient to work with the default -`external Ceph -configuration `__ +:kolla-ansible-doc:`external Ceph +configuration ` for Cinder, Cinder backup, Glance, and Nova in Kolla Ansible. .. code:: yaml @@ -351,7 +351,7 @@ RADOS Gateways RADOS Gateway integration is described in the :kolla-ansible-doc:`Kolla Ansible documentation -`. +`. RADOS Gateways (RGWs) are defined with the following: @@ -390,7 +390,7 @@ The set of commands below configure all of these. 
- "config set client.rgw rgw_keystone_admin_user 'ceph_rgw'" - "config set client.rgw rgw_keystone_api_version '3'" - "config set client.rgw rgw_keystone_token_cache_size '10000'" - - "config set client.rgw rgw_keystone_url https://{{ kolla_internal_fqdn }}:5000" + - "config set client.rgw rgw_keystone_url {{ 'https' if kolla_enable_tls_internal | bool else 'http' }}://{{ kolla_internal_fqdn }}:5000" - "config set client.rgw rgw_keystone_verify_ssl false" - "config set client.rgw rgw_max_attr_name_len '1000'" - "config set client.rgw rgw_max_attr_size '1000'" diff --git a/doc/source/configuration/ci-cd.rst b/doc/source/configuration/ci-cd.rst index dcf86350e..435c114f7 100644 --- a/doc/source/configuration/ci-cd.rst +++ b/doc/source/configuration/ci-cd.rst @@ -118,7 +118,7 @@ Runner Deployment The repository permissions for a fine-grained personal access token should be; :code:`Actions: R/W, Administration: R/W, Metadata: R` Once the key has been obtained, add it to :code:`secrets.yml` under :code:`secrets_github_access_token` -7. If the host is an actual Infra VM then please refer to upstream `Infrastructure VMs `__ documentation for additional configuration and steps. +7. If the host is an actual Infra VM then please refer to upstream :kayobe-doc:`Infrastructure VMs ` documentation for additional configuration and steps. 8. Run :code:`kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/deploy-github-runner.yml` diff --git a/doc/source/configuration/cloudkitty.rst b/doc/source/configuration/cloudkitty.rst index 83f31205f..084fe45e4 100644 --- a/doc/source/configuration/cloudkitty.rst +++ b/doc/source/configuration/cloudkitty.rst @@ -8,8 +8,8 @@ Configuring in kayobe-config By default, CloudKitty uses Gnocchi and Ceilometer as the collector and fetcher backends. Unless the system has a specific reason not to, we recommend instead using Prometheus as the backend for both. The following instructions explain -how to do this. 
Also, see the `Kolla Ansible docs on CloudKitty -`__ +how to do this. Also, see the :kolla-ansible-doc:`Kolla Ansible docs on CloudKitty +` for more details. Enable CloudKitty and disable InfluxDB, as we are using OpenSearch as the diff --git a/doc/source/configuration/index.rst b/doc/source/configuration/index.rst index 136cf13a1..576dfe63d 100644 --- a/doc/source/configuration/index.rst +++ b/doc/source/configuration/index.rst @@ -13,6 +13,7 @@ the various features provided. lvm cephadm monitoring + openbao vault wazuh walled-garden diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst index 77c5e47f7..7414a5398 100644 --- a/doc/source/configuration/monitoring.rst +++ b/doc/source/configuration/monitoring.rst @@ -81,8 +81,8 @@ on the overcloud hosts: SMART reporting should now be enabled along with a Prometheus alert for unhealthy disks and a Grafana dashboard called ``Hardware Overview``. -Alertmanager and Slack -====================== +Alertmanager, Slack and Microsoft Teams +======================================= StackHPC Kayobe configuration comes bundled with an array of alerts but does not enable any receivers for notifications by default. Various receivers can be @@ -119,6 +119,17 @@ available `here `__. They simply need to be added to one of the ``*.rules`` files in the prometheus configuration directory. +If however you are using Microsoft Teams instead of Slack, you can use Prometheus +Alertmanager's built-in support for the new message format based on Power Automate flows. +You will need an incoming webhook URL for your Teams channel. +This can be done by following `these instructions `__. +To set up a receiver, create a ``prometheus-alertmanager.yml`` file under +``etc/kayobe/kolla/config/prometheus/``. +An example config is stored in this directory known as ``prometheus-alertmanager.msteamvs2.yml.example``. +The example configuration uses two Slack channels. 
+One channel receives all alerts while the other only receives alerts tagged as critical. +Feel free to modify the example configuration to suit your needs. + Ceph Monitoring =============== diff --git a/doc/source/configuration/openbao.rst b/doc/source/configuration/openbao.rst new file mode 100644 index 000000000..87d1c1435 --- /dev/null +++ b/doc/source/configuration/openbao.rst @@ -0,0 +1,441 @@ +.. _openbao: + +======================== +OpenBao for internal PKI +======================== + +This document describes how to deploy OpenBao for +internal PKI purposes using the +`StackHPC Hashicorp collection `_ + +OpenBao may be used as a Certificate Authority to generate certificates for: + +* OpenStack internal API +* OpenStack backend APIs +* RabbitMQ + +TLS support is described in the :kolla-ansible-doc:`Kolla Ansible documentation +` and the :kayobe-doc:`Kayobe documentation +`. + +OpenBao may also be used as the secret store for Barbican. + +Background +========== + +Our OpenStack environment employs two separate OpenBao instances. +These instances manage the Public Key Infrastructure (PKI) by handling the +creation and issuance of certificates. + +- The first OpenBao instance is located on the seed host. + It handles infrastructure-level certificates, generating the root + Certificate Authority (CA) and intermediate CA for the second OpenBao. + The ``openbao-deploy-seed.yml`` playbook sets up this instance. + +- The second OpenBao instance is within the OpenStack + overcloud, located on the controller nodes. This instance uses the + intermediate CA from the seed OpenBao to issue application-specific + certificates. The ``vault-openbao-overcloud.yml`` playbook is used + for its setup. It ensures that all controller nodes trust the + intermediate CA from the root OpenBao. + +The dual OpenBao setup enhances security by protecting the root CA's key. 
The more +exposed overcloud OpenBao only possesses the intermediate key, ensuring that +the root key remains secure even if the overcloud OpenBao instance is compromised. + +Prerequisites +============= + +Before beginning the deployment of OpenBao for openstack internal TLS and backend TLS you should ensure that you have the following. + + * Seed Node or a host to run the vault container on + * Overcloud controller hosts to install second vault on + * Ansible Galaxy dependencies installed: ``kayobe control host bootstrap`` + * Python dependencies installed: ``pip install -r kayobe-config/requirements.txt`` + +By default OpenBao image is not synced from Docker Hub to the local +Pulp. To sync this image, set ``stackhpc_sync_openbao_images`` to ``true``. +The OpenBao deployment configuration will be automatically updated to pull images +from Pulp. + +Deployment +========== + +Setup OpenBao on the seed node +------------------------------ + +1. Run openbao-deploy-seed.yml custom playbook + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/openbao-deploy-seed.yml + +2. Encrypt generated certs/keys with ansible-vault (use proper location of vault password file) + + .. code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/openbao/OS-TLS-INT.pem + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/openbao/seed-openbao-keys.json + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/openbao/overcloud.key + + Or if environments are being used + + .. 
code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/openbao/OS-TLS-INT.pem + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/openbao/seed-openbao-keys.json + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/openbao/overcloud.key + +Setup OpenBao HA on the overcloud hosts +--------------------------------------- + +1. Run openbao-deploy-overcloud.yml custom playbook + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/openbao-deploy-overcloud.yml + +2. Encrypt overcloud openbao keys (use proper location of vault password file) + + .. code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/openbao/overcloud-openbao-keys.json + + Or if environments are being used + + .. code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/openbao/overcloud-openbao-keys.json + +Rotating OpenBao certificate on the overcloud hosts +--------------------------------------------------- + +The certificate for the overcloud OpenBao has an expiry time of one year. While +the cloud won't break if this expires, it will need rotating before new +certificates can be generated for internal PKI. If a OpenBao becomes sealed, it +cannot be unsealed with an expired certificate. + +1. Delete the old certificate: + + .. code-block:: + + rm $KAYOBE_CONFIG_PATH/openbao/overcloud.crt + + Or if environments are being used + + .. code-block:: + + rm $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/openbao/overcloud.crt + +2. Generate a new certificate (and key): + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/openbao-deploy-seed.yml + +3. Encrypt generated key with ansible-vault (use proper location of vault password file) + + .. 
code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/openbao/overcloud.key + + Or if environments are being used + + .. code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/openbao/overcloud.key + +4. Copy the new certificate to the overcloud hosts. Note, if the old + certificate has expired this will fail on the unseal step. + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/openbao-deploy-overcloud.yml + +5. Restart the containers to use the new certificate: + + .. code-block:: + + kayobe overcloud host command run --command "docker restart openbao" -l controllers + +6. If sealed, unseal OpenBao: + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/openbao-unseal-overcloud.yml + +Certificates generation +======================= + +.. note:: + + Generating certificates will fail if the OpenBao on the overcloud is sealed. This will happen whenever the openbao containers are restarted. To unseal the + overcloud OpenBao, run: + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/openbao-unseal-overcloud.yml + +Create the external TLS certificates (testing only) +--------------------------------------------------- + +This method should only be used for testing. For external TLS on production systems, +See `Installing External TLS Certificates `__. + +Typically external API TLS certificates should be generated by a organisation's trusted internal or third-party CA. +For test and development purposes it is possible to use OpenBao as a CA for the external API. + +1. Run the playbook + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/openbao-generate-test-external-tls.yml + +2. Use ansible-vault to encrypt the PEM bundle in $KAYOBE_CONFIG_PATH/kolla/certificates/haproxy.pem. Commit the PEM bundle to the kayobe configuration. + + .. 
code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/kolla/certificates/haproxy.pem + + Or if environments are being used + + .. code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/kolla/certificates/haproxy.pem + +Create the internal TLS certificates +------------------------------------ + +1. Run the playbook + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/openbao-generate-internal-tls.yml + +2. Use ansible-vault to encrypt the PEM bundle in $KAYOBE_CONFIG_PATH/kolla/certificates/haproxy-internal.pem. Commit the PEM bundle and root CA to the kayobe configuration. + + .. code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/kolla/certificates/haproxy-internal.pem + + Or if environments are being used + + .. code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/kolla/certificates/haproxy-internal.pem + +Create the backend TLS and RabbitMQ TLS certificates +---------------------------------------------------- + +1. Run the playbook + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/openbao-generate-backend-tls.yml + +2. Use ansible-vault to encrypt the keys in $KAYOBE_CONFIG_PATH/kolla/certificates/-key.pem. Commit the certificates and keys to the kayobe configuration. + + .. code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/kolla/certificates/-key.pem + + Or if environments are being used + + .. code-block:: + + ansible-vault encrypt --vault-password-file ~/vault.pass $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/kolla/certificates/-key.pem + +.. _openbao-haproxy: + +HAProxy integration +=================== + +It is possible to expose the overcloud OpenBao service via the Kolla Ansible HAProxy load balancer. 
+This provides a single highly available API endpoint, as well as monitoring of the OpenBao backends when combined with Prometheus. +HAProxy integration is no longer required for generating OpenStack control plane certificates, making it possible to deploy OpenBao and generate certificates before any containers have been deployed by Kolla Ansible. + +1. Create the HAProxy config to reverse proxy the OpenBao HA container + + Set the openbao_front to the external VIP address or internal VIP address depending on the installation. Set the openbao_back to the IPs of the control nodes. + + Set the following in etc/kayobe/kolla/config/haproxy/services.d/openbao.cfg or if environments are being used etc/kayobe/environments/$KAYOBE_ENVIRONMENT/kolla/config/haproxy/services.d/openbao.cfg + + .. code-block:: + + # Delete "verify none" if not using self-signed/unknown issuer + {% raw %} + frontend openbao_front + mode tcp + option tcplog + bind {{ kolla_internal_vip_address }}:8200 + default_backend openbao_back + + backend openbao_back + mode tcp + option httpchk GET /v1/sys/health + # https://openbao.org/api-docs/system/health/ + # 200: initialized, unsealed, and active + # 429: standby + http-check expect rstatus (200|429) + + {% for host in groups['control'] %} + {% set host_name = hostvars[host].ansible_facts.hostname %} + {% set host_ip = 'api' | kolla_address(host) %} + server {{ host_name }} {{ host_ip }}:8200 check check-ssl verify none inter 2000 rise 2 fall 5 + {% endfor %} + {% endraw %} + +2. If HAProxy has not yet been deployed, continue to :ref:`certificates deployment `. + If HAProxy has been deployed, it may be redeployed with the new OpenBao service configuration: + + .. code-block:: + + kayobe overcloud service deploy -kt haproxy + +.. _openbao-certificates: + +Certificates deployment +======================= + +.. 
warning:: + + The switch from HTTP to HTTPS during the deployment of internal/backend TLS certificates can temporarily disrupt service availability and necessitates a restart of all services. During this transition, endpoints may become unreachable following the HAProxy restart, persisting until the endpoint catalogue and client have been reconfigured to use HTTPS. + +Enable the required TLS variables in kayobe and kolla +----------------------------------------------------- + +1. If using OpenBao as a CA for the external API, set the following in kayobe-config/etc/kayobe/kolla.yml or if environments are being used etc/kayobe/environments/$KAYOBE_ENVIRONMENT/kolla.yml + + .. code-block:: + + # Whether TLS is enabled for the external API endpoints. Default is 'no'. + kolla_enable_tls_external: yes + kolla_public_openrc_cacert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if os_distribution in ['centos', 'rocky'] else '/etc/ssl/certs/ca-certificates.crt' }}" + + See :ref:`tempest-cacert` for information on adding CA certificates to the trust store when running Tempest. + +2. Set the following in kayobe-config/etc/kayobe/kolla.yml or if environments are being used etc/kayobe/environments/$KAYOBE_ENVIRONMENT/kolla.yml + + .. code-block:: + + # Whether TLS is enabled for the internal API endpoints. Default is 'no'. + kolla_enable_tls_internal: yes + kolla_admin_openrc_cacert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if os_distribution in ['centos', 'rocky'] else '/etc/ssl/certs/ca-certificates.crt' }}" + + See :ref:`os-capacity` for information on adding CA certificates to the trust store when deploying the OpenStack Capacity exporter. + +3. Set the following in etc/kayobe/kolla/globals.yml or if environments are being used etc/kayobe/environments/$KAYOBE_ENVIRONMENT/kolla/globals.yml + + .. 
code-block:: + + # Internal TLS configuration + # Copy the self-signed CA into the kolla containers + kolla_copy_ca_into_containers: "yes" + # Use the following trust store within the container + openstack_cacert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if os_distribution == 'rocky' else '/etc/ssl/certs/ca-certificates.crt' }}" + + # Backend TLS config + # Enable backend TLS + kolla_enable_tls_backend: "yes" + + # If using RabbitMQ TLS: + rabbitmq_enable_tls: "yes" + +4. Deploy OpenStack + + .. warning:: + + It is important that you are only using admin endpoints for keystone. If + any admin endpoints exist for other services, they must be deleted e.g. + + .. code-block:: + + openstack endpoint list --interface admin -f value | \ + awk '!/keystone/ {print $1}' | xargs openstack endpoint delete + + .. code-block:: + + kayobe overcloud service deploy + + If VM provisioning fails with an error with this format: + + .. code-block:: + + Unable to establish connection to http://:9696/v2.0/ports/some-sort-of-uuid: Connection aborted + + Restart the nova-compute container on all hypervisors: + + .. code-block:: + + kayobe overcloud host command run --command "systemctl restart kolla-nova_compute-container.service" --become --show-output -l compute + +Barbican integration +==================== + +Barbican integration depends on :ref:`HAProxy integration `. + +Enable Barbican in kayobe +------------------------- + +1. Set the following in kayobe-config/etc/kayobe/kolla.yml or if environments are being used etc/kayobe/environments/$KAYOBE_ENVIRONMENT/kolla.yml + + .. code-block:: + + kolla_enable_barbican: yes + +Generate secrets_barbican_approle_secret_id +------------------------------------------- + +1. Run ``uuidgen`` to generate secret id +2. Insert into secrets.yml or if environments are being used etc/kayobe/environments/$KAYOBE_ENVIRONMENT/secrets.yml + + .. 
code-block:: + + secrets_barbican_approle_secret_id: "YOUR-SECRET-GOES-HERE" + +Create required configuration in OpenBao +---------------------------------------- + +1. Run openbao-deploy-barbican.yml custom playbook + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/openbao-deploy-barbican.yml + +Add secrets_barbican_approle_id to secrets +------------------------------------------ + +1. Note the role id from playbook output and insert into secrets.yml or if environments are being used etc/kayobe/environments/$KAYOBE_ENVIRONMENT/secrets.yml + + .. code-block:: + + secrets_barbican_approle_role_id: "YOUR-APPROLE-ID-GOES-HERE" + +Configure Barbican +------------------ + +1. Put required configuration in kayobe-config/etc/kayobe/kolla/config/barbican.conf or if environments are being used etc/kayobe/environments/$KAYOBE_ENVIRONMENT/kolla/config/barbican.conf + + .. code-block:: + + [secretstore] + namespace=barbican.secretstore.plugin + enable_multiple_secret_stores=false + enabled_secretstore_plugins=vault_plugin + + [vault_plugin] + vault_url = https://{{ kolla_internal_fqdn }}:8200 + use_ssl = True + {% raw %} + ssl_ca_crt_file = {{ openstack_cacert }} + {% endraw %} + approle_role_id = {{ secrets_barbican_approle_role_id }} + approle_secret_id = {{ secrets_barbican_approle_secret_id }} + kv_mountpoint = barbican + +Deploy Barbican +--------------- + + .. code-block:: + + kayobe overcloud service deploy -kt barbican diff --git a/doc/source/configuration/vault.rst b/doc/source/configuration/vault.rst index 8754f0bd7..126f5adc8 100644 --- a/doc/source/configuration/vault.rst +++ b/doc/source/configuration/vault.rst @@ -4,6 +4,13 @@ Hashicorp Vault for internal PKI ================================ +.. warning:: + + Deployment of Hashicorp Vault is deprecated and will be removed in a future release. + New deployments should use OpenBao and existing deployments will be migrated once a + method for migration is available. 
+ See :ref:`here ` for more information. + This document describes how to deploy Hashicorp Vault for internal PKI purposes using the `StackHPC Hashicorp collection `_ @@ -167,6 +174,15 @@ cannot be unsealed with an expired certificate. Certificates generation ======================= +.. note:: + + Generating certificates will fail if the Vault on the overcloud is sealed. This will happen whenever the vault containers are restarted. To unseal the + overcloud Vault, run: + + .. code-block:: + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/vault-unseal-overcloud.yml + Create the external TLS certificates (testing only) --------------------------------------------------- diff --git a/doc/source/contributor/ofed.rst b/doc/source/contributor/ofed.rst index c993f6748..cbd57b421 100644 --- a/doc/source/contributor/ofed.rst +++ b/doc/source/contributor/ofed.rst @@ -78,12 +78,12 @@ a package update, which can also be limited to hosts in the ``mlnx`` group. kayobe overcloud host package update --packages "*" --limit mlnx -To ensure the latest kernel is the default on boot, the bootloader entires will need +To ensure the latest kernel is the default on boot, the bootloader entries will need to be reset before rebooting. .. code-block:: console - kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reset-bls-entires.yml -e reset_bls_host=mlnx + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reset-bls-entries.yml -e reset_bls_host=mlnx The hosts can now be rebooted to use the latest kernel, a rolling reboot may be applicable here to reduce distruptions. See the `package updates documentation `. diff --git a/doc/source/index.rst b/doc/source/index.rst index be1320948..85d3c4697 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -12,8 +12,8 @@ Overview ======== This documentation covers the StackHPC Kayobe configuration. 
It is intended to -complement, rather than replace, the upstream :kayobe-doc:`Kayobe `, -:kolla-doc:`Kolla ` and :kolla-ansible-doc:`Kolla Ansible ` +complement, rather than replace, the upstream :kayobe-doc:`Kayobe <>`, +:kolla-doc:`Kolla <>` and :kolla-ansible-doc:`Kolla Ansible<>` documentation. The configuration includes various things, such as: diff --git a/doc/source/operations/gpu-in-openstack.rst b/doc/source/operations/gpu-in-openstack.rst index 1fd99d30d..6198edcaa 100644 --- a/doc/source/operations/gpu-in-openstack.rst +++ b/doc/source/operations/gpu-in-openstack.rst @@ -2,6 +2,132 @@ Support for GPUs in OpenStack ============================= +PCI Passthrough +############### + +Prerequisite - BIOS Configuration +--------------------------------- + +On an Intel system: + +* Enable ``VT-x`` in the BIOS for virtualisation support. +* Enable ``VT-d`` in the BIOS for IOMMU support. + +On an AMD system: + +* Enable ``AMD-v`` in the BIOS for virtualisation support. +* Enable ``AMD-Vi`` (also just called ``IOMMU`` on older hardware) in the BIOS + for IOMMU support. + +It may be possible to configure passthrough without these settings, though +stability or performance may be affected. + +Host and Service Configuration +------------------------------ + +PCI passthrough GPU variables can be found in the +``etc/kayobe/stackhpc-compute.yml`` file. + +The ``gpu_group_map`` is a dictionary mapping inventory groups to GPU types. +This is used to determine which GPU types each compute node should pass through +to OpenStack. The keys are group names, the values are a list of GPU types. + +Possible GPU types are defined in the ``stackhpc_gpu_data`` dictionary. It +contains data for many common GPUs. If you have a GPU that is not included, +extend the dictionary following the same pattern. + +The ``resource_name`` is the name that will be used in the flavor extra specs. +These can be overridden e.g. ``a100_80_resource_name: "big_gpu"``. 
+ +Example configuration for three groups containing A100s, V100s, and both: + +.. code-block:: yaml + :caption: $KAYOBE_CONFIG_PATH/stackhpc-compute.yml + + gpu_group_map: + compute_a100: + - a100_80 + compute_v100: + - v100_32 + compute_multi_gpu: + - a100_80 + - v100_32 + +All groups in the ``gpu_group_map`` must also be added to +``kolla_overcloud_inventory_top_level_group_map`` in ``etc/kayobe/kolla.yml``. +Always include the Kayobe defaults unless you know what you are doing. + +When ``gpu_group_map`` is populated, the ``pci-passthrough.yml`` playbook will +be added as a pre-hook to ``kayobe overcloud host configure``. Either run host +configuration or trigger the playbook manually: + +.. code-block:: console + + kayobe overcloud host configure --limit compute_a100,compute_v100,compute_multi_gpu + # OR + kayobe playbook run --playbook $KAYOBE_CONFIG_PATH/ansible/pci-passthrough.yml --limit compute_a100,compute_v100,compute_multi_gpu + +The playbook will apply the necessary configuration and reboot the hosts if +required. + +Once host configuration is complete, deploy Nova: + +.. code-block:: console + + kayobe overcloud service deploy -kt nova + +Create a flavor +--------------- + +For example, to request two of the GPUs with alias **v100_32** + +.. code-block:: text + + openstack flavor set m1.medium-gpu --property "pci_passthrough:alias"="v100_32:2" + +This can also be defined in the openstack-config repository. + +Add extra_specs to flavor in etc/openstack-config/openstack-config.yml: + +.. code-block:: console + + cd src/openstack-config + vim etc/openstack-config/openstack-config.yml + + name: "m1.medium-gpu" + ram: 4096 + disk: 40 + vcpus: 2 + extra_specs: + "pci_passthrough:alias": "v100_32:2" + +Invoke configuration playbooks afterwards: + +..
code-block:: console + + source src/kayobe-config/etc/kolla/public-openrc.sh + source venvs/openstack/bin/activate + tools/openstack-config --vault-password-file + +Create instance with GPU passthrough +------------------------------------ + +.. code-block:: text + + openstack server create --flavor m1.medium-gpu --image ubuntu22.04 --wait test-pci + +Testing GPU in a Guest VM +------------------------- + +The Nvidia drivers must be installed first. For example, on an Ubuntu guest: + +.. code-block:: text + + sudo apt install nvidia-headless-440 nvidia-utils-440 nvidia-compute-utils-440 + +The ``nvidia-smi`` command will generate detailed output if the driver has +loaded successfully. + + Virtual GPUs ############ @@ -147,7 +273,7 @@ hosts can automatically be mapped to these groups by configuring .. _NVIDIA Role Configuration: Role Configuration -^^^^^^^^^^^^^^^^^^ +------------------ Configure the VGPU devices: @@ -193,7 +319,7 @@ Configure the VGPU devices: .. _NVIDIA Kolla Ansible Configuration: Kolla-Ansible configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +--------------------------- See upstream documentation: `Kolla Ansible configuration `__ then follow the rest. @@ -241,12 +367,12 @@ You will need to reconfigure nova for this change to be applied: kayobe overcloud service deploy -kt nova --kolla-limit compute_vgpu Openstack flavors -^^^^^^^^^^^^^^^^^ +----------------- See upstream documentation: `OpenStack flavors `__ NVIDIA License Server -^^^^^^^^^^^^^^^^^^^^^ +--------------------- The Nvidia delegated license server is a virtual machine based appliance. You simply need to boot an instance using the image supplied on the NVIDIA Licensing portal. This can be done on the OpenStack cloud itself. The @@ -323,7 +449,7 @@ Booting the VM: Manual VM driver and licence configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------------------------ vGPU client VMs need to be configured with Nvidia drivers to run GPU workloads. 
The host drivers should already be applied to the hypervisor. @@ -393,7 +519,7 @@ includes the drivers and licencing token. Alternatively, an image can be created using Diskimage Builder. Disk image builder recipe to automatically license VGPU on boot -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +--------------------------------------------------------------- `stackhpc-image-elements `__ provides a ``nvidia-vgpu`` element to configure the nvidia-gridd service in VGPU mode. This allows you to boot VMs that automatically license themselves. @@ -471,7 +597,7 @@ into your openstack-config repository and vault encrypt it. The ``file`` lookup the file (as shown in the example above). Testing vGPU VMs -^^^^^^^^^^^^^^^^ +---------------- vGPU VMs can be validated using the following test workload. The test should succeed if the VM is correctly licenced and drivers are correctly installed for @@ -531,266 +657,10 @@ Example output: Test passed Changing VGPU device types -^^^^^^^^^^^^^^^^^^^^^^^^^^ +-------------------------- See upstream documentation: `Changing VGPU device types `__ -PCI Passthrough -############### - -This guide has been developed for Nvidia GPUs and CentOS 8. - -See `Kayobe Ops `_ for -a playbook implementation of host setup for GPU. - -BIOS Configuration Requirements -------------------------------- - -On an Intel system: - -* Enable `VT-x` in the BIOS for virtualisation support. -* Enable `VT-d` in the BIOS for IOMMU support. - -Hypervisor Configuration Requirements -------------------------------------- - -Find the GPU device IDs -^^^^^^^^^^^^^^^^^^^^^^^ - -From the host OS, use ``lspci -nn`` to find the PCI vendor ID and -device ID for the GPU device and supporting components. These are -4-digit hex numbers. - -For example: - -.. 
code-block:: text - - 01:00.0 VGA compatible controller [0300]: NVIDIA Corporation GM204M [GeForce GTX 980M] [10de:13d7] (rev a1) (prog-if 00 [VGA controller]) - 01:00.1 Audio device [0403]: NVIDIA Corporation GM204 High Definition Audio Controller [10de:0fbb] (rev a1) - -In this case the vendor ID is ``10de``, display ID is ``13d7`` and audio ID is ``0fbb``. - -Alternatively, for an Nvidia Quadro RTX 6000: - -.. code-block:: yaml - - # NVIDIA Quadro RTX 6000/8000 PCI device IDs - vendor_id: "10de" - display_id: "1e30" - audio_id: "10f7" - usba_id: "1ad6" - usba_class: "0c0330" - usbc_id: "1ad7" - usbc_class: "0c8000" - -These parameters will be used for device-specific configuration. - -Kernel Ramdisk Reconfiguration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ramdisk loaded during kernel boot can be extended to include the -vfio PCI drivers and ensure they are loaded early in system boot. - -.. code-block:: yaml - - - name: Template dracut config - blockinfile: - path: /etc/dracut.conf.d/gpu-vfio.conf - block: | - add_drivers+="vfio vfio_iommu_type1 vfio_pci vfio_virqfd" - owner: root - group: root - mode: 0660 - create: true - become: true - notify: - - Regenerate initramfs - - reboot - -The handler for regenerating the Dracut initramfs is: - -.. code-block:: yaml - - - name: Regenerate initramfs - shell: |- - #!/bin/bash - set -eux - dracut -v -f /boot/initramfs-$(uname -r).img $(uname -r) - become: true - -Kernel Boot Parameters -^^^^^^^^^^^^^^^^^^^^^^ - -Set the following kernel parameters by adding to -``GRUB_CMDLINE_LINUX_DEFAULT`` or ``GRUB_CMDLINE_LINUX`` in -``/etc/default/grub.conf``. We can use the -`stackhpc.grubcmdline `_ -role from Ansible Galaxy: - -.. 
code-block:: yaml - - - name: Add vfio-pci.ids kernel args - include_role: - name: stackhpc.grubcmdline - vars: - kernel_cmdline: - - intel_iommu=on - - iommu=pt - - "vfio-pci.ids={{ vendor_id }}:{{ display_id }},{{ vendor_id }}:{{ audio_id }}" - kernel_cmdline_remove: - - iommu - - intel_iommu - - vfio-pci.ids - -Kernel Device Management -^^^^^^^^^^^^^^^^^^^^^^^^ - -In the hypervisor, we must prevent kernel device initialisation of -the GPU and prevent drivers from loading for binding the GPU in the -host OS. We do this using ``udev`` rules: - -.. code-block:: yaml - - - name: Template udev rules to blacklist GPU usb controllers - blockinfile: - # We want this to execute as soon as possible - path: /etc/udev/rules.d/99-gpu.rules - block: | - #Remove NVIDIA USB xHCI Host Controller Devices, if present - ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x{{ vendor_id }}", ATTR{class}=="0x{{ usba_class }}", ATTR{remove}="1" - #Remove NVIDIA USB Type-C UCSI devices, if present - ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x{{ vendor_id }}", ATTR{class}=="0x{{ usbc_class }}", ATTR{remove}="1" - owner: root - group: root - mode: 0644 - create: true - become: true - -Kernel Drivers -^^^^^^^^^^^^^^ - -Prevent the ``nouveau`` kernel driver from loading by -blacklisting the module: - -.. code-block:: yaml - - - name: Blacklist nouveau - blockinfile: - path: /etc/modprobe.d/blacklist-nouveau.conf - block: | - blacklist nouveau - options nouveau modeset=0 - mode: 0664 - owner: root - group: root - create: true - become: true - notify: - - reboot - - Regenerate initramfs - -Ensure that the ``vfio`` drivers are loaded into the kernel on boot: - -.. 
code-block:: yaml - - - name: Add vfio to modules-load.d - blockinfile: - path: /etc/modules-load.d/vfio.conf - block: | - vfio - vfio_iommu_type1 - vfio_pci - vfio_virqfd - owner: root - group: root - mode: 0664 - create: true - become: true - notify: reboot - -Once this code has taken effect (after a reboot), the VFIO kernel drivers should be loaded on boot: - -.. code-block:: text - - # lsmod | grep vfio - vfio_pci 49152 0 - vfio_virqfd 16384 1 vfio_pci - vfio_iommu_type1 28672 0 - vfio 32768 2 vfio_iommu_type1,vfio_pci - irqbypass 16384 5 vfio_pci,kvm - - # lspci -nnk -s 3d:00.0 - 3d:00.0 VGA compatible controller [0300]: NVIDIA Corporation GM107GL [Tesla M10] [10de:13bd] (rev a2) - Subsystem: NVIDIA Corporation Tesla M10 [10de:1160] - Kernel driver in use: vfio-pci - Kernel modules: nouveau - -IOMMU should be enabled at kernel level as well - we can verify that on the compute host: - -.. code-block:: text - - # docker exec -it nova_libvirt virt-host-validate | grep IOMMU - QEMU: Checking for device assignment IOMMU support : PASS - QEMU: Checking if IOMMU is enabled by kernel : PASS - -OpenStack Nova configuration ----------------------------- - -See upsteram Nova documentation: `Attaching physical PCI devices to guests `__ - -Configure a flavor -^^^^^^^^^^^^^^^^^^ - -For example, to request two of the GPUs with alias **a1** - -.. code-block:: text - - openstack flavor set m1.medium --property "pci_passthrough:alias"="a1:2" - - -This can be also defined in the openstack-config repository - -add extra_specs to flavor in etc/openstack-config/openstack-config.yml: - -.. code-block:: console - - cd src/openstack-config - vim etc/openstack-config/openstack-config.yml - - name: "m1.medium-gpu" - ram: 4096 - disk: 40 - vcpus: 2 - extra_specs: - "pci_passthrough:alias": "a1:2" - -Invoke configuration playbooks afterwards: - -.. 
code-block:: console - - source src/kayobe-config/etc/kolla/public-openrc.sh - source venvs/openstack/bin/activate - tools/openstack-config --vault-password-file - -Create instance with GPU passthrough -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: text - - openstack server create --flavor m1.medium-gpu --image ubuntu22.04 --wait test-pci - -Testing GPU in a Guest VM -------------------------- - -The Nvidia drivers must be installed first. For example, on an Ubuntu guest: - -.. code-block:: text - - sudo apt install nvidia-headless-440 nvidia-utils-440 nvidia-compute-utils-440 - -The ``nvidia-smi`` command will generate detailed output if the driver has loaded -successfully. - Further Reference ----------------- diff --git a/doc/source/operations/upgrading-openstack.rst b/doc/source/operations/upgrading-openstack.rst index b21834b34..891829e05 100644 --- a/doc/source/operations/upgrading-openstack.rst +++ b/doc/source/operations/upgrading-openstack.rst @@ -820,6 +820,24 @@ the change: kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reboot.yml -l +.. warning:: + + Take extra care when updating packages on Ceph hosts. Docker live-restore + does not work until the Squid version of Ceph, so a reload of docker will + restart all Ceph containers. Set the hosts to maintenance mode before + updating packages, and unset when done: + + .. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ceph-enter-maintenance.yml --limit + kayobe overcloud host package update --packages "*" --limit + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reboot.yml -l + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ceph-exit-maintenance.yml --limit + + **Always** reconfigure hosts in small batches or one-by-one. Check the Ceph + state after each host configuration. Ensure all warnings and errors are + resolved before moving on. + If the host is a hypervisor, enable the Nova compute service. .. 
code-block:: console diff --git a/doc/source/usage.rst b/doc/source/usage.rst index 0760e0dd9..dc2a22a81 100644 --- a/doc/source/usage.rst +++ b/doc/source/usage.rst @@ -57,3 +57,51 @@ configuration: The intention is to avoid merge conflicts where possible, but there may be cases where this is difficult. We are open to discussion on how best to approach this on both sides. + +Beokay +------ + +`Beokay ` is a tool to manage Kayobe +environments. This can create new StackHPC Kayobe environments and +ensure StackHPC Kayobe Configuration dependencies are from the correct repositories and +are up-to-date: + +To create a Beokay environment using the base configuration, for the latest release: + +.. code-block:: console + + beokay.py create \ + --base-path skc-environment \ + --kayobe-config-repo https://github.com/stackhpc/stackhpc-kayobe-config.git \ + --kayobe-config-branch |current_release_git_branch_name| \ + --kayobe-in-requirements + +Kayobe environments can also be specified, for example, to create an AIO environment: + +.. code-block:: console + + beokay.py create \ + --base-path skc-aio-environment \ + --kayobe-config-repo https://github.com/stackhpc/stackhpc-kayobe-config.git \ + --kayobe-config-branch |current_release_git_branch_name| \ + --kayobe-config-env-name ci-aio \ + --vault-password-file ~/vault-pw \ + --kayobe-in-requirements + +When Beokay environments are no longer required, they can be deleted by running: + +.. code-block:: console + + beokay.py destroy \ + --base-path skc-environment + +Specific Kayobe commands can also be run via Beokay, for example, to run a Kolla +service deployment on overcloud hosts: + +.. 
code-block:: console + + beokay.py run \ + 'kayobe overcloud service deploy' \ + --base-path skc-aio-environment \ + --kayobe-config-env-name ci-aio \ + --vault-password-file ~/vault-pw diff --git a/etc/kayobe/ansible/deploy-radosgw-usage-exporter.yml b/etc/kayobe/ansible/deploy-radosgw-usage-exporter.yml index df8340419..c70e54194 100644 --- a/etc/kayobe/ansible/deploy-radosgw-usage-exporter.yml +++ b/etc/kayobe/ansible/deploy-radosgw-usage-exporter.yml @@ -66,7 +66,7 @@ vars: ansible_host: "{{ hostvars[groups['controllers'][0]].ansible_host }}" run_once: true - when: credential_check.stdout == [] + when: credential_check.stdout | from_json == [] - name: Query ec2 credential for ceph_rgw ansible.builtin.command: > @@ -115,6 +115,7 @@ ACCESS_KEY: "{{ ec2.Access }}" SECRET_KEY: "{{ ec2.Secret }}" VIRTUAL_PORT: "{{ stackhpc_radosgw_usage_exporter_port | string }}" + REQUESTS_CA_BUNDLE: "/etc/ssl/certs/ca-certificates.crt" entrypoint: "{{ ['python', '-u', './radosgw_usage_exporter.py', '--insecure'] if not stackhpc_radosgw_usage_exporter_verify else omit }}" vars: ec2: "{{ credential.stdout | from_json | first }}" diff --git a/etc/kayobe/ansible/openbao-deploy-barbican.yml b/etc/kayobe/ansible/openbao-deploy-barbican.yml new file mode 100644 index 000000000..50d55aa22 --- /dev/null +++ b/etc/kayobe/ansible/openbao-deploy-barbican.yml @@ -0,0 +1,107 @@ +--- +- name: Configure AppRole + any_errors_fatal: true + gather_facts: true + hosts: controllers[0] + vars: + openbao_api_addr: https://{{ internal_net_name | net_ip }}:8200 + openbao_ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" + tasks: + - name: Assert that secrets_barbican_approle_secret_id is defined + ansible.builtin.assert: + that: + - secrets_barbican_approle_secret_id is defined + fail_msg: Please define secrets_barbican_approle_secret_id in your secrets.yml + + - name: Include OpenBao keys + 
ansible.builtin.include_vars: + file: "{{ kayobe_env_config_path }}/openbao/overcloud-openbao-keys.json" + name: openbao_keys + + - name: Ensure hvac is installed + ansible.builtin.pip: + name: hvac + state: present + extra_args: "{% if pip_upper_constraints_file %}-c {{ pip_upper_constraints_file }}{% endif %}" + virtualenv: "{{ virtualenv_path }}/kayobe" + + - name: Ensure AppRole is configured + environment: + https_proxy: "" + block: + - name: Enable AppRole auth module + hashivault_auth_method: # noqa: fqcn + url: "{{ openbao_api_addr }}" + ca_cert: "{{ openbao_ca_cert }}" + token: "{{ openbao_keys.root_token }}" + method_type: approle + state: enabled + + - name: Enable barbican kv store + hashivault_secret_engine: # noqa: fqcn + url: "{{ openbao_api_addr }}" + ca_cert: "{{ openbao_ca_cert }}" + token: "{{ openbao_keys.root_token }}" + name: barbican + backend: kv + description: Barbican kv store + + - name: Ensure barbican policy is defined + hashivault_policy: # noqa: fqcn + url: "{{ openbao_api_addr }}" + ca_cert: "{{ openbao_ca_cert }}" + token: "{{ openbao_keys.root_token }}" + name: barbican-policy + state: present + rules: | + path "barbican/*" { + capabilities = ["create", "read", "update", "delete", "list"] + } + + - name: Ensure barbican AppRole is defined + hashivault_approle_role: # noqa: fqcn + url: "{{ openbao_api_addr }}" + ca_cert: "{{ openbao_ca_cert }}" + token: "{{ openbao_keys.root_token }}" + bind_secret_id: true + secret_id_bound_cidrs: "{{ internal_net_name | net_cidr }}" + secret_id_ttl: 0 + token_policies: barbican-policy + name: barbican + + - name: Get barbican Approle ID + hashivault_approle_role_id: # noqa: fqcn + url: "{{ openbao_api_addr }}" + ca_cert: "{{ openbao_ca_cert }}" + token: "{{ openbao_keys.root_token }}" + name: barbican + register: barbican_role_id + + - name: Print barbican Approle ID + ansible.builtin.debug: + msg: barbican role id is {{ barbican_role_id.id }} + + - name: Write barbican Approle ID to file if 
requested + delegate_to: localhost + ansible.builtin.copy: + content: "{{ barbican_role_id.id }}" + dest: "{{ stackhpc_barbican_role_id_file_path | default('~/barbican-role-id') }}" + when: stackhpc_write_barbican_role_id_to_file | default(false) | bool + + - name: Check if barbican Approle Secret ID is defined + hashivault_approle_role_secret_get: # noqa: fqcn + url: "{{ openbao_api_addr }}" + ca_cert: "{{ openbao_ca_cert }}" + token: "{{ openbao_keys.root_token }}" + secret: "{{ secrets_barbican_approle_secret_id }}" + name: barbican + register: barbican_approle_secret_get + + - name: Ensure barbican AppRole Secret ID is defined + hashivault_approle_role_secret: # noqa: fqcn + url: "{{ openbao_api_addr }}" + ca_cert: "{{ openbao_ca_cert }}" + token: "{{ openbao_keys.root_token }}" + secret: "{{ secrets_barbican_approle_secret_id }}" + name: barbican + when: barbican_approle_secret_get.status == "absent" diff --git a/etc/kayobe/ansible/openbao-deploy-overcloud.yml b/etc/kayobe/ansible/openbao-deploy-overcloud.yml new file mode 100644 index 000000000..811e9b1f0 --- /dev/null +++ b/etc/kayobe/ansible/openbao-deploy-overcloud.yml @@ -0,0 +1,116 @@ +--- +# Required for uri module to work with self-signed certificates and for systems to trust +# the self-signed CA +- name: Install CA on controllers + hosts: controllers + tasks: + - name: Copy the intermediate CA + ansible.builtin.copy: + src: "{{ kayobe_env_config_path }}/openbao/OS-TLS-ROOT.pem" + dest: "{{ '/etc/pki/ca-trust/source/anchors/OS-TLS-ROOT.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' + }}" + mode: "0644" + become: true + + - name: Update system CA + become: true + ansible.builtin.command: "{{ 'update-ca-trust' if ansible_facts.os_family == 'RedHat' else 'update-ca-certificates' }}" + +- name: Deploy OpenBao on the overcloud + any_errors_fatal: true + gather_facts: true + hosts: controllers + vars: + openbao_bind_address: "{{ internal_net_name | net_ip 
}}" + tasks: + - name: Set a fact about the virtualenv on the remote system + ansible.builtin.set_fact: + virtualenv: "{{ ansible_python_interpreter | dirname | dirname }}" + when: + - ansible_python_interpreter is defined + - not ansible_python_interpreter.startswith('/bin/') + - not ansible_python_interpreter.startswith('/usr/bin/') + + - name: Ensure Python hvac module is installed + ansible.builtin.pip: + name: hvac + state: latest + extra_args: "{% if pip_upper_constraints_file %}-c {{ pip_upper_constraints_file }}{% endif %}" + virtualenv: "{{ virtualenv is defined | ternary(virtualenv, omit) }}" + become: "{{ virtualenv is not defined }}" + + - name: Ensure /opt/kayobe/openbao exists + ansible.builtin.file: + path: /opt/kayobe/openbao + state: directory + + - name: Template out TLS key and cert + ansible.builtin.copy: + # Within the OpenBao container these uids & gids map to the vault user + src: "{{ kayobe_env_config_path }}/openbao/{{ item }}" + dest: /opt/kayobe/openbao/{{ item }} + owner: 100 + group: 1000 + mode: "0600" + loop: + - "{% if kolla_internal_fqdn != kolla_internal_vip_address %}{{ kolla_internal_fqdn }}{% else %}overcloud{% endif %}.crt" + - "{% if kolla_internal_fqdn != kolla_internal_vip_address %}{{ kolla_internal_fqdn }}{% else %}overcloud{% endif %}.key" + become: true + + - name: Apply OpenBao role + ansible.builtin.import_role: + name: stackhpc.hashicorp.openbao + vars: + openbao_registry_url: "{{ overcloud_openbao_registry_url }}" + openbao_registry_username: "{{ overcloud_openbao_registry_username }}" + openbao_registry_password: "{{ overcloud_openbao_registry_password }}" + openbao_config_dir: /opt/kayobe/openbao + openbao_cluster_name: overcloud + openbao_ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" + openbao_docker_image: "{{ overcloud_openbao_docker_image }}" + openbao_docker_tag: "{{ overcloud_openbao_docker_tag }}" + 
openbao_tls_cert: "{% if kolla_internal_fqdn != kolla_internal_vip_address %}{{ kolla_internal_fqdn }}{% else %}overcloud{% endif %}.crt" + openbao_tls_key: "{% if kolla_internal_fqdn != kolla_internal_vip_address %}{{ kolla_internal_fqdn }}{% else %}overcloud{% endif %}.key" + copy_self_signed_ca: true + openbao_api_addr: https://{{ internal_net_name | net_ip }}:8200 + openbao_write_keys_file: true + openbao_write_keys_file_path: "{{ kayobe_env_config_path }}/openbao/overcloud-openbao-keys.json" + + - name: Include OpenBao keys + ansible.builtin.include_vars: + file: "{{ kayobe_env_config_path }}/openbao/overcloud-openbao-keys.json" + name: openbao_keys + + - name: Unseal OpenBao + ansible.builtin.import_role: + name: stackhpc.hashicorp.vault_unseal + vars: + vault_api_addr: https://{{ internal_net_name | net_ip }}:8200 + vault_unseal_token: "{{ openbao_keys.root_token }}" + vault_unseal_ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" + vault_unseal_keys: "{{ openbao_keys.keys_base64 }}" + environment: + https_proxy: "" + +- name: Configure PKI + any_errors_fatal: true + gather_facts: true + hosts: controllers[0] + tasks: + - name: Apply OpenBao pki role + ansible.builtin.import_role: + name: stackhpc.hashicorp.vault_pki + vars: + vault_token: "{{ openbao_keys.root_token }}" + vault_api_addr: https://{{ internal_net_name | net_ip }}:8200 + vault_ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" + vault_pki_root_create: false + vault_pki_intermediate_import: true + vault_pki_intermediate_ca_name: OS-TLS-INT + vault_pki_intermediate_ca_bundle: "{{ lookup('file', kayobe_env_config_path + '/openbao/OS-TLS-INT.pem') }}" + vault_pki_intermediate_ca_cert: "{{ lookup('file', kayobe_env_config_path + '/openbao/OS-TLS-INT.crt') }}" + vault_pki_intermediate_roles: "{{ 
overcloud_openbao_pki_roles }}" + vault_pki_write_certificate_files: true + vault_pki_certificates_directory: "{{ kayobe_env_config_path }}/openbao" + environment: + https_proxy: "" diff --git a/etc/kayobe/ansible/openbao-deploy-seed.yml b/etc/kayobe/ansible/openbao-deploy-seed.yml new file mode 100644 index 000000000..2f276cbf6 --- /dev/null +++ b/etc/kayobe/ansible/openbao-deploy-seed.yml @@ -0,0 +1,78 @@ +--- +- name: Deploy OpenBao on the seed + any_errors_fatal: true + gather_facts: true + hosts: seed + vars: + openbao_bind_address: "{{ ansible_facts['lo'].ipv4.address }}" + openbao_api_addr: "http://{{ openbao_bind_address }}:8200" + tasks: + - name: Set a fact about the virtualenv on the remote system + ansible.builtin.set_fact: + virtualenv: "{{ ansible_python_interpreter | dirname | dirname }}" + when: + - ansible_python_interpreter is defined + - not ansible_python_interpreter.startswith('/bin/') + - not ansible_python_interpreter.startswith('/usr/bin/') + + - name: Ensure Python PyYAML and hvac modules are installed + ansible.builtin.pip: + name: + - PyYAML + - hvac + state: latest + extra_args: "{% if pip_upper_constraints_file %}-c {{ pip_upper_constraints_file }}{% endif %}" + virtualenv: "{{ virtualenv is defined | ternary(virtualenv, omit) }}" + become: "{{ virtualenv is not defined }}" + + - name: Ensure OpenBao directory exists in Kayobe configuration + ansible.builtin.file: + path: "{{ kayobe_env_config_path }}/openbao/" + state: directory + delegate_to: localhost + run_once: true + + - name: Apply OpenBao role + ansible.builtin.import_role: + name: stackhpc.hashicorp.openbao + vars: + openbao_registry_url: "{{ seed_openbao_registry_url }}" + openbao_registry_username: "{{ seed_openbao_registry_username }}" + openbao_registry_password: "{{ seed_openbao_registry_password }}" + openbao_config_dir: /opt/kayobe/openbao + openbao_cluster_name: seed + openbao_docker_image: "{{ seed_openbao_docker_image }}" + openbao_docker_tag: "{{ 
seed_openbao_docker_tag }}" + openbao_write_keys_file: true + openbao_write_keys_file_path: "{{ kayobe_env_config_path }}/openbao/seed-openbao-keys.json" + + - name: Include Vault keys + ansible.builtin.include_vars: + file: "{{ kayobe_env_config_path }}/openbao/seed-openbao-keys.json" + name: openbao_keys + + - name: Unseal OpenBao + ansible.builtin.import_role: + name: stackhpc.hashicorp.vault_unseal + vars: + vault_api_addr: "{{ openbao_api_addr }}" + vault_unseal_keys: "{{ openbao_keys.keys_base64 }}" + + - name: Apply PKI role + ansible.builtin.import_role: + name: stackhpc.hashicorp.vault_pki + vars: + vault_api_addr: "{{ openbao_api_addr }}" + vault_token: "{{ openbao_keys.root_token }}" + vault_pki_root_ca_name: OS-TLS-ROOT + vault_pki_write_root_ca_to_file: true + vault_pki_intermediate_ca_name: OS-TLS-INT + vault_pki_intermediate_export: true + vault_pki_intermediate_roles: "{{ seed_openbao_pki_roles }}" + vault_pki_certificates_directory: "{{ kayobe_env_config_path }}/openbao" + vault_pki_generate_certificates: true + vault_pki_write_certificates: true + vault_pki_certificate_subject: "{{ seed_openbao_pki_certificate_subject }}" + vault_pki_write_certificate_files: true + vault_pki_write_pem_bundle: false + vault_pki_write_int_ca_to_file: true diff --git a/etc/kayobe/ansible/openbao-generate-backend-tls.yml b/etc/kayobe/ansible/openbao-generate-backend-tls.yml new file mode 100644 index 000000000..f43513ff1 --- /dev/null +++ b/etc/kayobe/ansible/openbao-generate-backend-tls.yml @@ -0,0 +1,83 @@ +--- +# Required for uri module to work with self-signed certificates and for systems to trust +# the self-signed CA +- name: Install CA + hosts: controllers:network + tasks: + - name: Copy the intermediate CA + ansible.builtin.copy: + src: "{{ kayobe_env_config_path }}/openbao/OS-TLS-ROOT.pem" + dest: "{{ '/etc/pki/ca-trust/source/anchors/OS-TLS-ROOT.crt' if ansible_facts.os_family == 'RedHat' \ + else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" + 
mode: "0644" + become: true + + - name: Update system CA + become: true + ansible.builtin.command: "{{ 'update-ca-trust' if ansible_facts.os_family == 'RedHat' else 'update-ca-certificates' }}" + +- name: Generate backend API certificates + hosts: controllers:network + vars: + openbao_api_addr: https://{{ internal_net_name | net_ip(groups['controllers'][0]) }}:8200 + openbao_intermediate_ca_name: OS-TLS-INT + tasks: + - name: Set a fact about the virtualenv on the remote system + ansible.builtin.set_fact: + virtualenv: "{{ ansible_python_interpreter | dirname | dirname }}" + when: + - ansible_python_interpreter is defined + - not ansible_python_interpreter.startswith('/bin/') + - not ansible_python_interpreter.startswith('/usr/bin/') + + - name: Ensure Python hvac module is installed + ansible.builtin.pip: + name: hvac + state: latest + extra_args: "{% if pip_upper_constraints_file %}-c {{ pip_upper_constraints_file }}{% endif %}" + virtualenv: "{{ virtualenv is defined | ternary(virtualenv, omit) }}" + become: "{{ virtualenv is not defined }}" + + - name: Include OpenBao keys + ansible.builtin.include_vars: + file: "{{ kayobe_env_config_path }}/openbao/overcloud-openbao-keys.json" + name: openbao_keys + + - name: Issue a certificate for backend TLS + hashivault_pki_cert_issue: # noqa: fqcn + url: "{{ openbao_api_addr }}" + ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" + token: "{{ openbao_keys.root_token }}" + mount_point: "{{ openbao_intermediate_ca_name }}" + role: "{{ overcloud_openbao_pki_backend_tls_role_name }}" + common_name: "" + extra_params: + ip_sans: "{{ internal_net_name | net_ip }}" + register: backend_cert + environment: + https_proxy: "" + + - name: Ensure certificates directory exists + ansible.builtin.file: + path: "{{ kayobe_env_config_path }}/kolla/certificates" + state: directory + delegate_to: localhost + + - name: Copy backend cert + no_log: 
true + ansible.builtin.copy: + dest: "{{ kayobe_env_config_path }}/kolla/certificates/{{ inventory_hostname }}-cert.pem" + content: | + {{ backend_cert.data.certificate }} + {{ backend_cert.data.issuing_ca }} + mode: "0600" + delegate_to: localhost + + - name: Copy backend key + no_log: true + ansible.builtin.copy: + dest: "{{ kayobe_env_config_path }}/kolla/certificates/{{ inventory_hostname }}-key.pem" + content: | + {{ backend_cert.data.private_key }} + mode: "0600" + delegate_to: localhost diff --git a/etc/kayobe/ansible/openbao-generate-internal-tls.yml b/etc/kayobe/ansible/openbao-generate-internal-tls.yml new file mode 100644 index 000000000..2cc9e841a --- /dev/null +++ b/etc/kayobe/ansible/openbao-generate-internal-tls.yml @@ -0,0 +1,56 @@ +--- +- name: Generate internal API certificate + hosts: controllers + run_once: true + vars: + openbao_api_addr: https://{{ internal_net_name | net_ip }}:8200 + openbao_intermediate_ca_name: OS-TLS-INT + tasks: + - name: Include OpenBao keys + ansible.builtin.include_vars: + file: "{{ kayobe_env_config_path }}/openbao/overcloud-openbao-keys.json" + name: openbao_keys + + - name: Issue a certificate for internal TLS + hashivault_pki_cert_issue: # noqa: fqcn + url: "{{ openbao_api_addr }}" + ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" + token: "{{ openbao_keys.root_token }}" + mount_point: "{{ openbao_intermediate_ca_name }}" + role: "{{ overcloud_openbao_pki_internal_tls_role_name }}" + common_name: "{% if kolla_internal_fqdn != kolla_internal_vip_address %}{{ kolla_internal_fqdn }}{% endif %}" + extra_params: + ip_sans: "{{ kolla_internal_vip_address }}" + register: internal_cert + environment: + https_proxy: "" + + - name: Ensure certificates directory exists + ansible.builtin.file: + path: "{{ kayobe_env_config_path }}/kolla/certificates" + state: directory + delegate_to: localhost + + - name: Ensure CA certificates 
directory exists + ansible.builtin.file: + path: "{{ kayobe_env_config_path }}/kolla/certificates/ca" + state: directory + delegate_to: localhost + + - name: Copy internal API PEM bundle + no_log: true + ansible.builtin.copy: + dest: "{{ kayobe_env_config_path }}/kolla/certificates/haproxy-internal.pem" + content: | + {{ internal_cert.data.certificate }} + {{ internal_cert.data.issuing_ca }} + {{ internal_cert.data.private_key }} + mode: "0600" + delegate_to: localhost + + - name: Copy root CA + ansible.builtin.copy: + src: "{{ kayobe_env_config_path }}/openbao/OS-TLS-ROOT.pem" + dest: "{{ kayobe_env_config_path }}/kolla/certificates/ca/openbao.crt" + mode: "0600" + delegate_to: localhost diff --git a/etc/kayobe/ansible/openbao-generate-test-external-tls.yml b/etc/kayobe/ansible/openbao-generate-test-external-tls.yml new file mode 100644 index 000000000..e7150fed1 --- /dev/null +++ b/etc/kayobe/ansible/openbao-generate-test-external-tls.yml @@ -0,0 +1,57 @@ +--- +- name: Generate external API certificate (for testing only) + hosts: controllers + run_once: true + vars: + openbao_api_addr: https://{{ internal_net_name | net_ip }}:8200 + # NOTE: Using the same CA as internal TLS. 
+ openbao_intermediate_ca_name: OS-TLS-INT + tasks: + - name: Include OpenBao keys + ansible.builtin.include_vars: + file: "{{ kayobe_env_config_path }}/openbao/overcloud-openbao-keys.json" + name: openbao_keys + + - name: Issue a certificate for external TLS + hashivault_pki_cert_issue: # noqa: fqcn + url: "{{ openbao_api_addr }}" + ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" + token: "{{ openbao_keys.root_token }}" + mount_point: "{{ openbao_intermediate_ca_name }}" + role: "{{ overcloud_openbao_pki_external_tls_role_name }}" + common_name: "{% if kolla_external_fqdn != kolla_external_vip_address %}{{ kolla_external_fqdn }}{% endif %}" + extra_params: + ip_sans: "{{ kolla_external_vip_address }}" + register: external_cert + environment: + https_proxy: "" + + - name: Ensure certificates directory exists + ansible.builtin.file: + path: "{{ kayobe_env_config_path }}/kolla/certificates" + state: directory + delegate_to: localhost + + - name: Ensure CA certificates directory exists + ansible.builtin.file: + path: "{{ kayobe_env_config_path }}/kolla/certificates/ca" + state: directory + delegate_to: localhost + + - name: Copy external API PEM bundle + no_log: true + ansible.builtin.copy: + dest: "{{ kayobe_env_config_path }}/kolla/certificates/haproxy.pem" + content: | + {{ external_cert.data.certificate }} + {{ external_cert.data.issuing_ca }} + {{ external_cert.data.private_key }} + mode: "0600" + delegate_to: localhost + + - name: Copy root CA + ansible.builtin.copy: + src: "{{ kayobe_env_config_path }}/openbao/OS-TLS-ROOT.pem" + dest: "{{ kayobe_env_config_path }}/kolla/certificates/ca/openbao.crt" + mode: "0600" + delegate_to: localhost diff --git a/etc/kayobe/ansible/openbao-unseal-overcloud.yml b/etc/kayobe/ansible/openbao-unseal-overcloud.yml new file mode 100644 index 000000000..0a631c598 --- /dev/null +++ b/etc/kayobe/ansible/openbao-unseal-overcloud.yml @@ 
-0,0 +1,37 @@ +--- +- name: Unseal OpenBao on the overcloud + any_errors_fatal: true + gather_facts: true + hosts: controllers + tasks: + - name: Set a fact about the virtualenv on the remote system + ansible.builtin.set_fact: + virtualenv: "{{ ansible_python_interpreter | dirname | dirname }}" + when: + - ansible_python_interpreter is defined + - not ansible_python_interpreter.startswith('/bin/') + - not ansible_python_interpreter.startswith('/usr/bin/') + + - name: Ensure Python hvac module is installed + ansible.builtin.pip: + name: hvac + state: latest + extra_args: "{% if pip_upper_constraints_file %}-c {{ pip_upper_constraints_file }}{% endif %}" + virtualenv: "{{ virtualenv is defined | ternary(virtualenv, omit) }}" + become: "{{ virtualenv is not defined }}" + + - name: Include OpenBao keys + ansible.builtin.include_vars: + file: "{{ kayobe_env_config_path }}/openbao/overcloud-openbao-keys.json" + name: openbao_keys + + - name: Apply OpenBao unseal role + ansible.builtin.import_role: + name: stackhpc.hashicorp.vault_unseal + vars: + vault_api_addr: https://{{ internal_net_name | net_ip }}:8200 + vault_unseal_token: "{{ openbao_keys.root_token }}" + vault_unseal_ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" + vault_unseal_keys: "{{ openbao_keys.keys_base64 }}" + environment: + https_proxy: "" diff --git a/etc/kayobe/ansible/openbao-unseal-seed.yml b/etc/kayobe/ansible/openbao-unseal-seed.yml new file mode 100644 index 000000000..82f9b8fa7 --- /dev/null +++ b/etc/kayobe/ansible/openbao-unseal-seed.yml @@ -0,0 +1,34 @@ +--- +- name: Unseal OpenBao on the seed + any_errors_fatal: true + gather_facts: true + hosts: seed + vars: + vault_api_addr: http://127.0.0.1:8200 + tasks: + - name: Set a fact about the virtualenv on the remote system + ansible.builtin.set_fact: + virtualenv: "{{ ansible_python_interpreter | dirname | dirname }}" + when: + - 
ansible_python_interpreter is defined + - not ansible_python_interpreter.startswith('/bin/') + - not ansible_python_interpreter.startswith('/usr/bin/') + + - name: Ensure Python hvac module is installed + ansible.builtin.pip: + name: hvac + state: latest + extra_args: "{% if pip_upper_constraints_file %}-c {{ pip_upper_constraints_file }}{% endif %}" + virtualenv: "{{ virtualenv is defined | ternary(virtualenv, omit) }}" + become: "{{ virtualenv is not defined }}" + + - name: Include OpenBao keys + ansible.builtin.include_vars: + file: "{{ kayobe_env_config_path }}/openbao/seed-openbao-keys.json" + name: openbao_keys + + - name: Apply OpenBao unseal role + ansible.builtin.import_role: + name: stackhpc.hashicorp.vault_unseal + vars: + vault_unseal_keys: "{{ openbao_keys.keys_base64 }}" diff --git a/etc/kayobe/ansible/pci-passthrough.yml b/etc/kayobe/ansible/pci-passthrough.yml new file mode 100644 index 000000000..59803ccf3 --- /dev/null +++ b/etc/kayobe/ansible/pci-passthrough.yml @@ -0,0 +1,142 @@ +--- +- name: Enable GPU passthrough + hosts: "{{ (gpu_group_map | default({})).keys() }}" + vars: + # This playbook will execute after nodes are deployed + # and before overcloud host configure - we can't assume + # users and venvs exist.
+ ansible_user: "{{ bootstrap_user }}" + ansible_ssh_common_args: "-o StrictHostKeyChecking=no" + ansible_python_interpreter: "/usr/bin/python3" + vfio_pci_ids: |- + {% set gpu_list = [] %} + {% set output = [] %} + {% for gpu_group in gpu_group_map | dict2items | default([]) %} + {% if gpu_group.key in group_names %} + {% set _ = gpu_list.append(gpu_group.value) %} + {% endif %} + {% endfor %} + {% for item in gpu_list | flatten | unique %} + {% set _ = output.append(stackhpc_gpu_data[item]['vendor_id'] + ':' + stackhpc_gpu_data[item]['product_id']) %} + {% endfor %} + {{ output | join(',') }} + reboot_timeout_s: "{{ 20 * 60 }}" + tasks: + - name: Template dracut config + ansible.builtin.blockinfile: + path: /etc/dracut.conf.d/gpu-vfio.conf + block: | + add_drivers+="vfio vfio_iommu_type1 vfio_pci vfio_virqfd" + owner: root + group: root + mode: 0660 + create: true + become: true + notify: + - Regenerate initramfs + - reboot + + - name: Add vfio to modules-load.d + ansible.builtin.blockinfile: + path: /etc/modules-load.d/vfio.conf + block: | + vfio + vfio_iommu_type1 + vfio_pci + vfio_virqfd + owner: root + group: root + mode: 0664 + create: true + become: true + notify: reboot + + - name: Blacklist nouveau + ansible.builtin.blockinfile: + path: /etc/modprobe.d/blacklist-nouveau.conf + block: | + blacklist nouveau + options nouveau modeset=0 + mode: 0664 + owner: root + group: root + create: true + become: true + notify: + - reboot + - Regenerate initramfs + + - name: Ignore unsupported model specific registers + # Occasionally, applications running in the VM may crash unexpectedly, + # whereas they would run normally on a physical machine. If, while + # running dmesg -wH, you encounter an error mentioning MSR, the reason + # for those crashes is that KVM injects a General protection fault (GPF) + # when the guest tries to access unsupported Model-specific registers + # (MSRs) - this often results in guest applications/OS crashing. 
A + # number of those issues can be solved by passing the ignore_msrs=1 + # option to the KVM module, which will ignore unimplemented MSRs. + # source: https://wiki.archlinux.org/index.php/QEMU + ansible.builtin.blockinfile: + path: /etc/modprobe.d/kvm.conf + block: | + options kvm ignore_msrs=Y + # This option is not available in centos 7 as the kernel is too old, + # but it can help with dmesg spam in newer kernels (centos8?). Sample + # dmesg log message: + # [ +0.000002] kvm [8348]: vcpu0, guest rIP: 0xffffffffb0a767fa ignored rdmsr: 0x619 + # options kvm report_ignored_msrs=N + mode: 0664 + owner: root + group: root + create: true + become: true + notify: reboot + + - name: Add vfio-pci.ids kernel args + ansible.builtin.include_role: + name: stackhpc.linux.grubcmdline + vars: + kernel_cmdline: + - intel_iommu=on + - iommu=pt + - "vfio-pci.ids={{ vfio_pci_ids }}" + kernel_cmdline_remove: + - iommu + - intel_iommu + - vfio-pci.ids + + handlers: + - name: Regenerate initramfs (RedHat) + listen: Regenerate initramfs + ansible.builtin.shell: |- + #!/bin/bash + set -eux + dracut -v -f /boot/initramfs-$(uname -r).img $(uname -r) + become: true + changed_when: true + when: ansible_facts.os_family == 'RedHat' + + - name: Regenerate initramfs (Debian) + listen: Regenerate initramfs + ansible.builtin.shell: |- + #!/bin/bash + set -eux + update-initramfs -u -k $(uname -r) + become: true + changed_when: true + when: ansible_facts.os_family == 'Debian' + + - name: Reboot + listen: reboot + become: true + ansible.builtin.reboot: + reboot_timeout: "{{ reboot_timeout_s }}" + search_paths: + # Systems running molly-guard hang waiting for confirmation before rebooting without this. 
+ - /lib/molly-guard + # Default list: + - /sbin + - /bin + - /usr/sbin + - /usr/bin + - /usr/local/sbin diff --git a/etc/kayobe/ansible/rabbitmq-reset.yml b/etc/kayobe/ansible/rabbitmq-reset.yml index b0235ab44..1d8d8cd1a 100644 --- a/etc/kayobe/ansible/rabbitmq-reset.yml +++ b/etc/kayobe/ansible/rabbitmq-reset.yml @@ -86,7 +86,8 @@ ansible.builtin.shell: cmd: >- set -o pipefail && - systemctl -a | egrep 'kolla-(barbican|blazar|cinder|cloudkitty|designate|heat|ironic|keystone|magnum|manila|neutron|nova|octavia)' | - awk '{ print $NF }' | - xargs systemctl restart + systemctl list-units --type=service --all --no-legend --plain | + egrep 'kolla-(barbican|blazar|cinder|cloudkitty|designate|heat|ironic|keystone|magnum|manila|neutron|nova|octavia)' | + awk '{ print $1 }' | + xargs -r systemctl restart executable: "/bin/bash" diff --git a/etc/kayobe/ansible/reboot.yml b/etc/kayobe/ansible/reboot.yml index 1af22e7f7..aa73a2617 100644 --- a/etc/kayobe/ansible/reboot.yml +++ b/etc/kayobe/ansible/reboot.yml @@ -9,9 +9,26 @@ ansible_user: "{{ bootstrap_user if reboot_with_bootstrap_user | bool else kayobe_ansible_user }}" ansible_ssh_common_args: "{{ '-o StrictHostKeyChecking=no' if reboot_with_bootstrap_user | bool else '' }}" ansible_python_interpreter: /usr/bin/python3 + confirm_reboot: false tags: - reboot tasks: + - name: Prompt to confirm reboot + ansible.builtin.pause: + prompt: > + The following hosts will be rebooted: + {{ play_hosts | join(', ') }} + If you want to proceed type: yes + register: pause_prompt + when: not confirm_reboot + + - name: Fail if reboot is not confirmed + ansible.builtin.assert: + that: confirm_reboot | bool or pause_prompt.user_input == 'yes' + msg: > + Reboot has not been confirmed. You must either type 'yes' when + prompted, or set ``confirm_reboot: true``. 
+ - name: Reboot and wait become: true ansible.builtin.reboot: diff --git a/etc/kayobe/ansible/requirements.yml b/etc/kayobe/ansible/requirements.yml index 569ec172f..a81decfc1 100644 --- a/etc/kayobe/ansible/requirements.yml +++ b/etc/kayobe/ansible/requirements.yml @@ -9,11 +9,12 @@ collections: - name: stackhpc.pulp version: 0.5.5 - name: stackhpc.hashicorp - version: 2.5.1 + version: 2.6.1 - name: stackhpc.kayobe_workflows version: 1.1.0 roles: - src: stackhpc.vxlan + version: 1.1.0 - name: ansible-lockdown.ubuntu22_cis src: https://github.com/ansible-lockdown/UBUNTU22-CIS version: 1.4.1 @@ -29,3 +30,4 @@ roles: version: 1.18.5 - src: https://github.com/stackhpc/ansible-role-docker.git name: geerlingguy.docker + version: stackhpc/7.0.1.1 diff --git a/etc/kayobe/ansible/scripts/generate_fixtures.py b/etc/kayobe/ansible/scripts/generate_fixtures.py new file mode 100644 index 000000000..5f8f7cc64 --- /dev/null +++ b/etc/kayobe/ansible/scripts/generate_fixtures.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +import json +import re +from pySMART import DeviceList + +SMARTMON_ATTRS = { + "airflow_temperature_cel", + "command_timeout", + "current_pending_sector", + "end_to_end_error", + "erase_fail_count", + "g_sense_error_rate", + "hardware_ecc_recovered", + "host_reads_32mib", + "host_reads_mib", + "host_writes_32mib", + "host_writes_mib", + "load_cycle_count", + "media_wearout_indicator", + "nand_writes_1gib", + "offline_uncorrectable", + "power_cycle_count", + "power_on_hours", + "program_fail_cnt_total", + "program_fail_count", + "raw_read_error_rate", + "reallocated_event_count", + "reallocated_sector_ct", + "reported_uncorrect", + "runtime_bad_block", + "sata_downshift_count", + "seek_error_rate", + "spin_retry_count", + "spin_up_time", + "start_stop_count", + "temperature_case", + "temperature_celsius", + "temperature_internal", + "total_lbas_read", + "total_lbas_written", + "udma_crc_error_count", + "unsafe_shutdown_count", + "unused_rsvd_blk_cnt_tot", + 
"wear_leveling_count", + "workld_host_reads_perc", + "workld_media_wear_indic", + "workload_minutes", + "critical_warning", + "temperature", + "available_spare", + "available_spare_threshold", + "percentage_used", + "data_units_read", + "data_units_written", + "host_reads", + "host_writes", + "controller_busy_time", + "power_cycles", + "unsafe_shutdowns", + "media_errors", + "num_err_log_entries", + "warning_temp_time", + "critical_comp_time", +} + +DISK_INFO = { + "name", + "interface", + "vendor", + "family", + "model", + "serial", + "firmware", + "smart_capable", + "smart_enabled", + "assessment", +} + +def camel_to_snake(name): + """ + Convert a CamelCase string to snake_case. + + Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case + """ + return re.sub(r'(? (c) 2012 -# source at: http://devel.dob.sk/collectd-scripts/ - -# TODO: This probably needs to be a little more complex. The raw numbers can have more -# data in them than you'd think. 
-# http://arstechnica.com/civis/viewtopic.php?p=22062211 - -# Formatting done via shfmt -i 2 -# https://github.com/mvdan/sh - -parse_smartctl_attributes_awk="$( - cat <<'SMARTCTLAWK' -$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ { - gsub(/-/, "_"); - printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4 - printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5 - printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6 - printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10 -} -SMARTCTLAWK -)" - -smartmon_attrs="$( - cat <<'SMARTMONATTRS' -airflow_temperature_cel -command_timeout -current_pending_sector -end_to_end_error -erase_fail_count -g_sense_error_rate -hardware_ecc_recovered -host_reads_32mib -host_reads_mib -host_writes_32mib -host_writes_mib -load_cycle_count -media_wearout_indicator -nand_writes_1gib -offline_uncorrectable -power_cycle_count -power_on_hours -program_fail_cnt_total -program_fail_count -raw_read_error_rate -reallocated_event_count -reallocated_sector_ct -reported_uncorrect -runtime_bad_block -sata_downshift_count -seek_error_rate -spin_retry_count -spin_up_time -start_stop_count -temperature_case -temperature_celsius -temperature_internal -total_lbas_read -total_lbas_written -udma_crc_error_count -unsafe_shutdown_count -unused_rsvd_blk_cnt_tot -wear_leveling_count -workld_host_reads_perc -workld_media_wear_indic -workload_minutes -SMARTMONATTRS -)" -smartmon_attrs="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')" - -parse_smartctl_attributes() { - local disk="$1" - local disk_type="$2" - local serial="$3" - local labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" - sed 's/^ \+//g' | - awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null | - tr '[:upper:]' '[:lower:]' | - grep -E "(${smartmon_attrs})" -} - -parse_smartctl_scsi_attributes() { - local disk="$1" - local disk_type="$2" - local serial="$3" - local 
labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" - while read -r line; do - attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')" - attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')" - case "${attr_type}" in - number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Current_Drive_Temperature) temp_cel="$(echo "${attr_value}" | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;; - Blocks_sent_to_initiator_) lbas_read="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Blocks_received_from_initiator_) lbas_written="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Accumulated_start-stop_cycles) power_cycle="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Elements_in_grown_defect_list) grown_defects="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - esac - done - [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}" - [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}" - [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}" - [ -n "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"241\"} ${lbas_written}" - [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}" - [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"-1\"} ${grown_defects}" -} - -parse_smartctl_info() { - shopt -s nocasematch - local -i smart_available=0 smart_enabled=0 smart_healthy= - local disk="$1" disk_type="$2" - local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id='' - while read -r line; do - info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')" - info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/')" - case "${info_type}" 
in - Model_Family) model_family="${info_value}" ;; - Device_Model) device_model="${info_value}" ;; - Serial_Number) serial_number="$(echo ${info_value} | tr '[:upper:]' '[:lower:]')" ;; - Firmware_Version) fw_version="${info_value}" ;; - Vendor) vendor="${info_value}" ;; - Product) product="${info_value}" ;; - Revision) revision="${info_value}" ;; - Logical_Unit_id) lun_id="${info_value}" ;; - esac - if [[ "${info_type}" == 'SMART_support_is' ]]; then - case "${info_value:0:7}" in - Enabled) smart_available=1; smart_enabled=1 ;; - Availab) smart_available=1; smart_enabled=0 ;; - Unavail) smart_available=0; smart_enabled=0 ;; - esac - fi - if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then - case "${info_value:0:6}" in - PASSED) smart_healthy=1 ;; - *) smart_healthy=0 ;; - esac - elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then - case "${info_value:0:2}" in - OK) smart_healthy=1 ;; - *) smart_healthy=0 ;; - esac - fi - done - echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1" - echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_available}" - [[ "${smart_available}" == "1" ]] && echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_enabled}" - [[ "${smart_available}" == "1" ]] && [[ "${smart_healthy}" != "" ]] && echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_healthy}" -} - -output_format_awk="$( - cat <<'OUTPUTAWK' -BEGIN { v = "" } -v != $1 { - print "# HELP smartmon_" $1 " SMART metric " $1; - print "# TYPE smartmon_" $1 " gauge"; - v = $1 -} -{print "smartmon_" $0} -OUTPUTAWK -)" - -format_output() { - sort | - awk 
-F'{' "${output_format_awk}" -} - -smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')" - -echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output - -if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then - exit -fi - -device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')" - -for device in ${device_list}; do - disk="$(echo "${device}" | cut -f1 -d'|')" - type="$(echo "${device}" | cut -f2 -d'|')" - # Use REGEX to extract the serial number from the parsed information and save that to a variable - serial_number="$(/usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"| sed -E ':a;N;$!ba;s/.*serial_number=\"([^"]+)\".*/\1/g' | sed -E 's/^device_info\{.*//g')" - active=1 - echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')" - # Check if the device is in a low-power mode - /usr/sbin/smartctl -n standby -d "${type}" "${disk}" > /dev/null || active=0 - echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}" - # Skip further metrics to prevent the disk from spinning up - test ${active} -eq 0 && continue - # Get the SMART information and health - /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}" - # Get the SMART attributes - case ${type} in - sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; - sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; - scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${serial_number}" ;; - megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${serial_number}" ;; - *) - (>&2 echo "disk type is not sat, scsi or megaraid but ${type}") - exit - ;; - esac -done | format_output diff 
--git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py new file mode 100644 index 000000000..4749808a5 --- /dev/null +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -0,0 +1,278 @@ +import glob +import json +import os +import unittest +import tempfile +import math +from time import sleep + +from unittest.mock import patch, MagicMock +from smartmon import ( + parse_device_info, + parse_if_attributes, + main, + SMARTMON_ATTRS, + camel_to_snake, + write_metrics_to_textfile, +) + +def load_json_fixture(filename): + """ + Load a JSON file from the 'tests' subfolder. + """ + path = os.path.join(os.path.dirname(__file__), "tests", filename) + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +class TestSmartMon(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Collect all *.json files from ./tests/ + data_folder = os.path.join(os.path.dirname(__file__), "tests") + cls.fixture_files = glob.glob(os.path.join(data_folder, "*.json")) + + def create_mock_device_from_json(self, device_info, if_attributes=None): + """ + Given a 'device_info' dict and optional 'if_attributes', build + a MagicMock that mimics a pySMART Device object. 
+ """ + device = MagicMock() + device.name = device_info.get("name", "") + device.interface = device_info.get("interface", "") + device.vendor = device_info.get("vendor", "") + device.family = device_info.get("family", "") + device.model = device_info.get("model", "") + device.serial = device_info.get("serial", "") + device.firmware = device_info.get("firmware", "") + device.smart_capable = device_info.get("smart_capable", False) + device.smart_enabled = device_info.get("smart_enabled", False) + device.assessment = device_info.get("assessment", "") + + if if_attributes: + class IfAttributesMock: + pass + + if_mock = IfAttributesMock() + for key, val in if_attributes.items(): + setattr(if_mock, key, val) + device.if_attributes = if_mock + else: + device.if_attributes = None + + return device + + def _test_parse_device_info(self, fixture_name): + """ + Helper method to test parse_device_info() for a single JSON fixture. + """ + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + + device = self.create_mock_device_from_json(device_info) + metrics = parse_device_info(device) + + dev_name = device_info["name"] + dev_iface = device_info["interface"] + dev_serial = device_info["serial"].lower() + + # The device_info line should exist for every device + device_info_found = any( + line.startswith("smartmon_device_info{") and + f'disk="{dev_name}"' in line and + f'type="{dev_iface}"' in line and + f'serial_number="{dev_serial}"' in line + for line in metrics + ) + self.assertTrue( + device_info_found, + f"Expected a smartmon_device_info metric line for {dev_name} but didn't find it." 
+ ) + + # If smart_capable is true, we expect device_smart_available = 1 + if device_info.get("smart_capable"): + smart_available_found = any( + line.startswith("smartmon_device_smart_available{") and + f'disk="{dev_name}"' in line and + f'serial_number="{dev_serial}"' in line and + line.endswith(" 1.0") + for line in metrics + ) + self.assertTrue( + smart_available_found, + f"Expected smartmon_device_smart_available=1.0 for {dev_name}, not found." + ) + + # If smart_enabled is true, we expect device_smart_enabled = 1 + if device_info.get("smart_enabled"): + smart_enabled_found = any( + line.startswith("smartmon_device_smart_enabled{") and + f'disk="{dev_name}"' in line and + line.endswith(" 1.0") + for line in metrics + ) + self.assertTrue( + smart_enabled_found, + f"Expected smartmon_device_smart_enabled=1.0 for {dev_name}, not found." + ) + + # device_smart_healthy if assessment in [PASS, WARN, FAIL] + # PASS => 1, otherwise => 0 + assessment = device_info.get("assessment", "").upper() + if assessment in ["PASS", "WARN", "FAIL"]: + expected_val = float(1) if assessment == "PASS" else float(0) + smart_healthy_found = any( + line.startswith("smartmon_device_smart_healthy{") and + f'disk="{dev_name}"' in line and + line.endswith(f" {expected_val}") + for line in metrics + ) + self.assertTrue( + smart_healthy_found, + f"Expected smartmon_device_smart_healthy={expected_val} for {dev_name}, not found." + ) + + def test_parse_device_info(self): + """ + Test parse_device_info() for every JSON fixture in ./tests/. + Each fixture is tested individually with clear error reporting. + """ + for fixture_path in self.fixture_files: + fixture_name = os.path.basename(fixture_path) + with self.subTest(fixture=fixture_name): + self._test_parse_device_info(fixture_name) + + def _test_parse_if_attributes(self, fixture_name): + """ + Helper method to test parse_if_attributes() for a single JSON fixture. 
+ """ + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + if_attrs = data.get("if_attributes", {}) + + device = self.create_mock_device_from_json(device_info, if_attrs) + metrics = parse_if_attributes(device) + + dev_name = device_info["name"] + dev_iface = device_info["interface"] + dev_serial = device_info["serial"].lower() + + # For each numeric attribute in JSON, if it's in SMARTMON_ATTRS, + # we expect a line in the script's output. + for attr_key, attr_val in if_attrs.items(): + snake_key = camel_to_snake(attr_key) + + if isinstance(attr_val, (int, float)) and snake_key in SMARTMON_ATTRS: + expected_line = ( + f"smartmon_{snake_key}{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}} {float(attr_val)}" + ) + self.assertIn( + expected_line, + metrics, + f"Expected metric '{expected_line}' for attribute '{attr_key}' not found." + ) + else: + # If it's not in SMARTMON_ATTRS or not numeric, + # we do NOT expect a line with that name+value + unexpected_line = ( + f"smartmon_{snake_key}{{disk=\"{dev_name}\",serial_number=\"{dev_serial}\",type=\"{dev_iface}\"}} {float(attr_val)}" + ) + self.assertNotIn( + unexpected_line, + metrics, + f"Unexpected metric '{unexpected_line}' found for {attr_key}." + ) + + # Also ensure that non-numeric or disallowed attributes do not appear + # For instance "notInSmartmonAttrs" should never appear. + for line in metrics: + self.assertNotIn( + "not_in_smartmon_attrs", + line, + f"'notInSmartmonAttrs' attribute unexpectedly found in metric line: {line}" + ) + + def test_parse_if_attributes(self): + """ + Test parse_if_attributes() for every JSON fixture in ./tests/. + Each fixture is tested individually with clear error reporting. 
+ """ + for fixture_path in self.fixture_files: + fixture_name = os.path.basename(fixture_path) + with self.subTest(fixture=fixture_name): + self._test_parse_if_attributes(fixture_name) + + @patch("smartmon.run_command") + @patch("smartmon.DeviceList") + @patch("smartmon.write_metrics_to_textfile", wraps=write_metrics_to_textfile) + def test_main(self, mock_write_metrics, mock_devicelist_class, mock_run_cmd): + """ + End-to-end test of main() for every JSON fixture in ./tests/. + This ensures we can handle multiple disks (multiple fixture files). + Checks metrics written to a temp file, and that write_metrics_to_textfile is called once. + """ + + # Patch run_command to return a version & "active" power_mode + def run_command_side_effect(cmd, parse_json=False): + if "--version" in cmd: + return "smartctl 7.3 5422 [x86_64-linux-5.15.0]\n..." + if "-n" in cmd and "standby" in cmd and parse_json: + return {"power_mode": "active"} + return "" + + mock_run_cmd.side_effect = run_command_side_effect + + for fixture_path in self.fixture_files: + fixture_name = os.path.basename(fixture_path) + with self.subTest(msg=f"Testing main() with {fixture_name}"): + mock_write_metrics.reset_mock() + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + if_attrs = data.get("if_attributes", {}) + + # Mock a single device from the fixture + device_mock = self.create_mock_device_from_json(device_info, if_attrs) + + # Make DeviceList() return our single mock device + mock_dev_list = MagicMock() + mock_dev_list.devices = [device_mock] + mock_devicelist_class.return_value = mock_dev_list + + with tempfile.NamedTemporaryFile(mode="r+", delete_on_close=False) as tmpfile: + path= tmpfile.name + main(output_path=path) + tmpfile.close() + + # Ensure write_metrics_to_textfile was called once + self.assertEqual(mock_write_metrics.call_count, 1) + + with open(path, "r") as f: + # Read the metrics from the file + metrics_lines = [line.strip() for line in f.readlines() if 
line.strip() and not line.startswith('#')] + print(f"Metrics lines: {metrics_lines}") + + # Generate expected metrics using the parse functions + expected_metrics = [] + expected_metrics.extend(parse_device_info(device_mock)) + expected_metrics.extend(parse_if_attributes(device_mock)) + + # Check that all expected metrics are present in the file + for expected in expected_metrics: + exp_metric, exp_val_str = expected.rsplit(" ", 1) + exp_val = float(exp_val_str) + found = any( + (exp_metric in line) and + math.isclose(float(line.rsplit(" ", 1)[1]), exp_val) + for line in metrics_lines + ) + self.assertTrue(found, f"Expected metric '{expected}' not found") + + # Check that smartctl_version metric is present + version_found = any(line.startswith("smartmon_smartctl_version{") for line in metrics_lines) + self.assertTrue(version_found, "Expected 'smartmon_smartctl_version' metric not found in output file.") + + # Check that the output file is not empty + self.assertTrue(metrics_lines, "Metrics output file is empty.") + +if __name__ == "__main__": + unittest.main() diff --git a/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json b/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json new file mode 100644 index 000000000..d867910ae --- /dev/null +++ b/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json @@ -0,0 +1,26 @@ +{ + "device_info": { + "assessment": "PASS", + "firmware": "2.1.8", + "interface": "nvme", + "model": "Dell Ent NVMe CM6 RI 7.68TB", + "name": "nvme8", + "serial": "Y2Q0A0BPTCF8", + "smart_capable": true, + "smart_enabled": true, + "vendor": "Dell" + }, + "if_attributes": { + "availableSpare": 100, + "availableSpareThreshold": 10, + "controllerBusyTime": 2478, + "criticalWarning": 0, + "dataUnitsRead": 177817765, + "dataUnitsWritten": 127992843, + "percentageUsed": 1, + "powerCycles": 750, + "powerOnHours": 17427, + "temperature": 36, + "unsafeShutdowns": 37 + } +} diff --git a/etc/kayobe/ansible/scripts/tests/nvme.json 
b/etc/kayobe/ansible/scripts/tests/nvme.json new file mode 100644 index 000000000..bbff19ec0 --- /dev/null +++ b/etc/kayobe/ansible/scripts/tests/nvme.json @@ -0,0 +1,24 @@ +{ + "device_info": { + "name": "/dev/nvme0", + "interface": "nvme", + "vendor": "AcmeCorp", + "family": "Acme NVMe Family", + "model": "Acme NVMe 1TB", + "serial": "ABCD1234", + "firmware": "3.0.1", + "smart_capable": true, + "smart_enabled": true, + "assessment": "PASS" + }, + "if_attributes": { + "criticalWarning": 0, + "temperature": 36, + "availableSpare": 100, + "availableSpareThreshold": 10, + "percentageUsed": 0, + "dataUnitsRead": 117446405, + "dataUnitsWritten": 84630284, + "notInSmartmonAttrs": 999 + } +} diff --git a/etc/kayobe/ansible/smartmon-tools.yml b/etc/kayobe/ansible/smartmon-tools.yml index 00cdfa495..351ce0325 100644 --- a/etc/kayobe/ansible/smartmon-tools.yml +++ b/etc/kayobe/ansible/smartmon-tools.yml @@ -13,6 +13,30 @@ state: present become: true + - name: Ensure Python 3, venv, and pip are installed + ansible.builtin.package: + name: + - python3 + - python3-venv + - python3-pip + state: present + become: true + + - name: Create smartmon Python virtual environment + ansible.builtin.command: + cmd: python3 -m venv /opt/smartmon-venv + creates: /opt/smartmon-venv/bin/activate + become: true + + - name: Install prometheus_client and pySMART in venv + ansible.builtin.pip: + name: + - prometheus_client + - pySMART + virtualenv: /opt/smartmon-venv + virtualenv_python: python3 + become: true + - name: Ensure the cron/crond service is running ansible.builtin.service: name: "{{ 'cron' if ansible_facts['distribution'] == 'Ubuntu' else 'crond' }}" @@ -20,15 +44,15 @@ enabled: true become: true - - name: Copy smartmon.sh and nvmemon.sh from scripts folder + - name: Copy smartmon.py and nvmemon.sh from scripts folder ansible.builtin.copy: src: scripts/{{ item }} - dest: /usr/local/bin/ + dest: /usr/local/bin/{{ item }} owner: root group: root mode: "0700" loop: - - smartmon.sh + - 
smartmon.py - nvmemon.sh become: true @@ -40,16 +64,39 @@ job: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin become: true - - name: Schedule cronjob to run both scripts every 5 minutes and save output to file + - name: Schedule cronjob to run smartmon.py every 5 minutes and save output to file ansible.builtin.cron: - name: SMART metrics for drive monitoring using {{ item }} + name: SMART metrics for drive monitoring using smartmon.py + user: root + minute: "*/5" + job: >- + umask 0022 && /opt/smartmon-venv/bin/python /usr/local/bin/smartmon.py --output /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp && + mv -f /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp /var/lib/docker/volumes/textfile/_data/smartmon.prom + become: true + + - name: Schedule cronjob to run nvmemon.sh every 5 minutes and save output to file + ansible.builtin.cron: + name: SMART metrics for drive monitoring using nvmemon.sh user: root minute: "*/5" job: >- - umask 0022 && /usr/local/bin/{{ item }}.sh > - /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp && - mv -f /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp /var/lib/docker/volumes/textfile/_data/{{ item }}.prom + umask 0022 && /usr/local/bin/nvmemon.sh > + /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp && + mv -f /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp /var/lib/docker/volumes/textfile/_data/nvmemon.prom + become: true + + - name: Remove old cronjobs if present + ansible.builtin.cron: + name: SMART metrics for drive monitoring using {{ item }} + user: root + state: absent + become: true loop: - smartmon - nvmemon + + - name: Remove old smartmon.sh if present + ansible.builtin.file: + path: /usr/local/bin/smartmon.sh + state: absent become: true diff --git a/etc/kayobe/ansible/stackhpc-cloud-tests.yml b/etc/kayobe/ansible/stackhpc-cloud-tests.yml index b2fed995d..cdc94c2cd 100644 --- a/etc/kayobe/ansible/stackhpc-cloud-tests.yml +++ 
b/etc/kayobe/ansible/stackhpc-cloud-tests.yml @@ -142,7 +142,7 @@ # Inclusive min sct_docker_version_min: "24.0.0" # Exclusive max - sct_docker_version_max: "28.0.0" + sct_docker_version_max: "28.1.0" sct_selinux_state: "{{ selinux_state }}" failed_when: host_results.rc not in [0, 1] register: host_results diff --git a/etc/kayobe/ansible/ubuntu-upgrade.yml b/etc/kayobe/ansible/ubuntu-upgrade.yml index cc0faf3a5..1a3ec3dc0 100644 --- a/etc/kayobe/ansible/ubuntu-upgrade.yml +++ b/etc/kayobe/ansible/ubuntu-upgrade.yml @@ -1,11 +1,13 @@ --- # To prevent Ansible role dependency errors, this playbook requires that environment variable # ANSIBLE_ROLES_PATH is defined and includes '$KAYOBE_PATH/ansible/roles' on the Ansible control host. -- name: Migrate hosts from Ubuntu Jammy 22.04 to Noble 24.04 +# Where KAYOBE_PATH is the path to the source of kayobe that the environment uses. +- name: Prepare upgrade from Ubuntu Jammy 22.04 to Noble 24.04 hosts: overcloud:infra-vms:seed:seed-hypervisor vars: ansible_python_interpreter: /usr/bin/python3 reboot_timeout_s: "{{ 20 * 60 }}" + tags: pre tasks: - name: Assert that hosts are running Ubuntu Jammy ansible.builtin.assert: @@ -63,10 +65,67 @@ deb {{ stackhpc_repo_ubuntu_noble_security_url }} noble-security main restricted universe multiverse become: true - - name: Do release upgrade - ansible.builtin.command: do-release-upgrade -f DistUpgradeViewNonInteractive + - name: Enusre /tmp is mounted with exec + ansible.posix.mount: + path: /tmp + opts: exec + state: remounted become: true + - name: Ensure /var/lib/cephadm directory exists + ansible.builtin.file: + path: /var/lib/cephadm + state: directory + owner: root + group: root + become: true + when: inventory_hostname in groups['ceph'] + +- name: Upgrade hosts from Ubuntu Jammy 22.04 to Noble 24.04 + hosts: overcloud:infra-vms:seed:seed-hypervisor + vars: + ansible_python_interpreter: /usr/bin/python3 + reboot_timeout_s: "{{ 20 * 60 }}" + tags: upgrade + tasks: + - name: Perform 
in-place Ubuntu upgrade + block: + - name: Run do-release-upgrade + ansible.builtin.command: do-release-upgrade -f DistUpgradeViewNonInteractive + become: true + rescue: + - name: Ensure Noble repo definitions do not exist in sources.list + ansible.builtin.blockinfile: + path: /etc/apt/sources.list + state: absent + become: true + + - name: Ensure Kolla Ansible Docker repo definition does not exist + ansible.builtin.file: + path: /etc/apt/sources.list.d/docker.list + state: absent + become: true + when: apt_repositories | selectattr('url', 'match', '.*docker-ce.*') | list | length > 0 + + - name: Display recommanded action on upgrade failure + ansible.builtin.debug: + msg: > + Ubuntu upgrade failed. You can check the upgrade logs at /var/log/dist-upgrade + on the failed host. + It is likely due to packages with broken dependency. You can find broken packages + by running following command from the host. + cat /var/log/dist-upgrade/apt.log | grep "Holding Back" | awk '{print $3}' + + - name: Fail fast when upgrade fails + ansible.builtin.meta: end_host + +- name: Post upgrade of Ubuntu Jammy 22.04 to Noble 24.04 + hosts: overcloud:infra-vms:seed:seed-hypervisor + vars: + ansible_python_interpreter: /usr/bin/python3 + reboot_timeout_s: "{{ 20 * 60 }}" + tags: post + tasks: - name: Ensure old venvs do not exist ansible.builtin.file: path: /opt/kayobe/venvs/{{ item }} @@ -83,15 +142,45 @@ - name: Run the Kayobe kayobe-target-venv playbook to ensure kayobe venv exists on remote host import_playbook: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}/share/kayobe/ansible/kayobe-target-venv.yml" + tags: post + +- name: Run the Kayobe apt playbook to ensure Noble repositories are set on remote host + import_playbook: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}/share/kayobe/ansible/apt.yml" + tags: post + +- name: Fix broken packages after upgrade + hosts: overcloud:infra-vms:seed:seed-hypervisor + tags: post + tasks: + - name: Ensure iproute2 is installed + 
ansible.builtin.apt: + name: iproute2 + state: present + become: true + - name: Ensure cephadm dependencies are installed + ansible.builtin.apt: + name: + - python3-yaml + - python3-jinja2 + state: present + become: true + when: inventory_hostname in groups['ceph'] + + - name: Update Python and current user facts before running Kayobe network playbook + ansible.builtin.setup: + filter: "{{ kayobe_ansible_setup_filter }}" + gather_subset: "{{ kayobe_ansible_setup_gather_subset }}" - name: Run the Kayobe network configuration playbook, to ensure definitions are not lost on reboot import_playbook: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}/share/kayobe/ansible/network.yml" + tags: post - name: Reboot and confirm the host is upgraded to Noble 24.04 hosts: overcloud:infra-vms:seed:seed-hypervisor vars: ansible_python_interpreter: /usr/bin/python3 reboot_timeout_s: "{{ 20 * 60 }}" + tags: post tasks: - name: Ensure Noble repo definitions do not exist in sources.list ansible.builtin.blockinfile: @@ -121,6 +210,33 @@ - /usr/local/sbin become: true + # Make a backup, in case of having broken apt configuration. 
+ - name: Backup upstream ubuntu.sources + ansible.builtin.copy: + src: /etc/apt/sources.list.d/ubuntu.sources + dest: /etc/apt/ubuntu.sources.bak + backup: true + remote_src: true + become: true + when: hostvars[inventory_hostname].stackhpc_repos_enabled + + - name: Ensure only Kayobe defined apt repositories are defined + ansible.builtin.file: + path: "/etc/apt/{{ item }}" + state: absent + loop: + - sources.list.distUpgrade + - sources.list.d/third-party.sources + - sources.list.d/ubuntu.sources + become: true + when: hostvars[inventory_hostname].stackhpc_repos_enabled + + - name: Ensure all packages are in Noble version + ansible.builtin.apt: + upgrade: full + update_cache: true + become: true + - name: Update distribution facts ansible.builtin.setup: filter: "{{ kayobe_ansible_setup_filter }}" diff --git a/etc/kayobe/ansible/vault-deploy-barbican.yml b/etc/kayobe/ansible/vault-deploy-barbican.yml index 0270dc8d0..54f17bb35 100644 --- a/etc/kayobe/ansible/vault-deploy-barbican.yml +++ b/etc/kayobe/ansible/vault-deploy-barbican.yml @@ -30,7 +30,7 @@ https_proxy: "" block: - name: Enable AppRole auth module - hashivault_auth_method: + hashivault_auth_method: # noqa: fqcn url: "{{ vault_api_addr }}" ca_cert: "{{ vault_ca_cert }}" token: "{{ vault_keys.root_token }}" @@ -38,7 +38,7 @@ state: enabled - name: Enable barbican kv store - hashivault_secret_engine: + hashivault_secret_engine: # noqa: fqcn url: "{{ vault_api_addr }}" ca_cert: "{{ vault_ca_cert }}" token: "{{ vault_keys.root_token }}" @@ -47,7 +47,7 @@ description: Barbican kv store - name: Ensure barbican policy is defined - hashivault_policy: + hashivault_policy: # noqa: fqcn url: "{{ vault_api_addr }}" ca_cert: "{{ vault_ca_cert }}" token: "{{ vault_keys.root_token }}" @@ -59,7 +59,7 @@ } - name: Ensure barbican AppRole is defined - hashivault_approle_role: + hashivault_approle_role: # noqa: fqcn url: "{{ vault_api_addr }}" ca_cert: "{{ vault_ca_cert }}" token: "{{ vault_keys.root_token }}" @@ -70,7 
+70,7 @@ name: barbican - name: Get barbican Approle ID - hashivault_approle_role_id: + hashivault_approle_role_id: # noqa: fqcn url: "{{ vault_api_addr }}" ca_cert: "{{ vault_ca_cert }}" token: "{{ vault_keys.root_token }}" @@ -89,7 +89,7 @@ when: stackhpc_write_barbican_role_id_to_file | default(false) | bool - name: Check if barbican Approle Secret ID is defined - hashivault_approle_role_secret_get: + hashivault_approle_role_secret_get: # noqa: fqcn url: "{{ vault_api_addr }}" ca_cert: "{{ vault_ca_cert }}" token: "{{ vault_keys.root_token }}" @@ -98,7 +98,7 @@ register: barbican_approle_secret_get - name: Ensure barbican AppRole Secret ID is defined - hashivault_approle_role_secret: + hashivault_approle_role_secret: # noqa: fqcn url: "{{ vault_api_addr }}" ca_cert: "{{ vault_ca_cert }}" token: "{{ vault_keys.root_token }}" diff --git a/etc/kayobe/ansible/vault-generate-backend-tls.yml b/etc/kayobe/ansible/vault-generate-backend-tls.yml index 71f243c85..32e502fbe 100644 --- a/etc/kayobe/ansible/vault-generate-backend-tls.yml +++ b/etc/kayobe/ansible/vault-generate-backend-tls.yml @@ -43,7 +43,7 @@ name: vault_keys - name: Issue a certificate for backend TLS - hashivault_pki_cert_issue: + hashivault_pki_cert_issue: # noqa: fqcn url: "{{ vault_api_addr }}" ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" token: "{{ vault_keys.root_token }}" diff --git a/etc/kayobe/ansible/vault-generate-internal-tls.yml b/etc/kayobe/ansible/vault-generate-internal-tls.yml index d5d4e6068..a585d1bc9 100644 --- a/etc/kayobe/ansible/vault-generate-internal-tls.yml +++ b/etc/kayobe/ansible/vault-generate-internal-tls.yml @@ -12,7 +12,7 @@ name: vault_keys - name: Issue a certificate for internal TLS - hashivault_pki_cert_issue: + hashivault_pki_cert_issue: # noqa: fqcn url: "{{ vault_api_addr }}" ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else 
'/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" token: "{{ vault_keys.root_token }}" diff --git a/etc/kayobe/ansible/vault-generate-test-external-tls.yml b/etc/kayobe/ansible/vault-generate-test-external-tls.yml index 37841ad0d..de02ddb03 100644 --- a/etc/kayobe/ansible/vault-generate-test-external-tls.yml +++ b/etc/kayobe/ansible/vault-generate-test-external-tls.yml @@ -13,7 +13,7 @@ name: vault_keys - name: Issue a certificate for external TLS - hashivault_pki_cert_issue: + hashivault_pki_cert_issue: # noqa: fqcn url: "{{ vault_api_addr }}" ca_cert: "{{ '/etc/pki/tls/certs/ca-bundle.crt' if ansible_facts.os_family == 'RedHat' else '/usr/local/share/ca-certificates/OS-TLS-ROOT.crt' }}" token: "{{ vault_keys.root_token }}" diff --git a/etc/kayobe/environments/aufn-ceph/globals.yml b/etc/kayobe/environments/aufn-ceph/globals.yml index 2e3d26996..88c3f37b9 100644 --- a/etc/kayobe/environments/aufn-ceph/globals.yml +++ b/etc/kayobe/environments/aufn-ceph/globals.yml @@ -13,3 +13,9 @@ os_distribution: "{{ lookup('pipe', '. /etc/os-release && echo $ID') | trim }}" os_release: >- {{ (lookup('pipe', '. /etc/os-release && echo $VERSION_CODENAME') | trim) if os_distribution == 'ubuntu' else (lookup('pipe', '. /etc/os-release && echo $VERSION_ID') | trim | split('.') | first) if os_distribution == 'rocky' }} + +############################################################################### +# Extra vars. + +# Don't prompt when rebooting hosts. +confirm_reboot: true diff --git a/etc/kayobe/environments/ci-aio/globals.yml b/etc/kayobe/environments/ci-aio/globals.yml index b7fda2fb8..1fe92cfea 100644 --- a/etc/kayobe/environments/ci-aio/globals.yml +++ b/etc/kayobe/environments/ci-aio/globals.yml @@ -55,6 +55,12 @@ os_release: >- {{ (lookup('pipe', '. /etc/os-release && echo $VERSION_CODENAME') | trim) if os_distribution == 'ubuntu' else (lookup('pipe', '. 
/etc/os-release && echo $VERSION_ID') | trim | split('.') | first) if os_distribution == 'rocky' }} +############################################################################### +# Extra vars. + +# Don't prompt when rebooting hosts. +confirm_reboot: true + ############################################################################### # Dummy variable to allow Ansible to accept this file. workaround_ansible_issue_8743: yes diff --git a/etc/kayobe/environments/ci-aio/inventory/group_vars/cis-hardening/cis b/etc/kayobe/environments/ci-aio/inventory/group_vars/cis-hardening/cis index 50084c72f..4565c96f2 100644 --- a/etc/kayobe/environments/ci-aio/inventory/group_vars/cis-hardening/cis +++ b/etc/kayobe/environments/ci-aio/inventory/group_vars/cis-hardening/cis @@ -7,10 +7,11 @@ rhel9cis_rule_5_4_3_2: false ############################################################################## -# Ubuntu Jammy CIS Hardening Configuration +# Ubuntu Noble CIS Hardening Configuration +# TODO: Test CIS rules for Ubuntu Noble # Disable shell timeout for inactivity which can be disruptive to # development work. -ubtu22cis_rule_5_4_3_2: false +ubtu24cis_rule_5_4_3_2: false ############################################################################## diff --git a/etc/kayobe/environments/ci-builder/globals.yml b/etc/kayobe/environments/ci-builder/globals.yml index 9852cbd4d..f33fd5e05 100644 --- a/etc/kayobe/environments/ci-builder/globals.yml +++ b/etc/kayobe/environments/ci-builder/globals.yml @@ -7,3 +7,9 @@ # OS distribution name. Valid options are "rocky", "ubuntu". Default is # "rocky". os_distribution: "{{ lookup('pipe', '. /etc/os-release && echo $ID') | trim }}" + +############################################################################### +# Extra vars. + +# Don't prompt when rebooting hosts. 
+confirm_reboot: true diff --git a/etc/kayobe/environments/ci-builder/stackhpc-ci.yml b/etc/kayobe/environments/ci-builder/stackhpc-ci.yml index 755804d86..0cdd49bd2 100644 --- a/etc/kayobe/environments/ci-builder/stackhpc-ci.yml +++ b/etc/kayobe/environments/ci-builder/stackhpc-ci.yml @@ -28,6 +28,7 @@ kolla_enable_octavia: true kolla_enable_opensearch: true kolla_enable_prometheus: true kolla_enable_redis: true +kolla_enable_skyline: true kolla_build_neutron_ovs: true ############################################################################### diff --git a/etc/kayobe/environments/ci-doca-builder/globals.yml b/etc/kayobe/environments/ci-doca-builder/globals.yml new file mode 100644 index 000000000..f08f5e2e4 --- /dev/null +++ b/etc/kayobe/environments/ci-doca-builder/globals.yml @@ -0,0 +1,6 @@ +--- +############################################################################### +# Extra vars. + +# Don't prompt when rebooting hosts. +confirm_reboot: true diff --git a/etc/kayobe/environments/ci-multinode/globals.yml b/etc/kayobe/environments/ci-multinode/globals.yml index a9157f07d..0976d362c 100644 --- a/etc/kayobe/environments/ci-multinode/globals.yml +++ b/etc/kayobe/environments/ci-multinode/globals.yml @@ -63,6 +63,12 @@ stackhpc_barbican_role_id_file_path: "/tmp/barbican-role-id" # Enable rebooting to update SELinux state selinux_do_reboot: true +############################################################################### +# Extra vars. + +# Don't prompt when rebooting hosts. +confirm_reboot: true + ############################################################################### # Dummy variable to allow Ansible to accept this file. 
workaround_ansible_issue_8743: yes diff --git a/etc/kayobe/environments/ci-multinode/inventory/group_vars/cis-hardening/cis b/etc/kayobe/environments/ci-multinode/inventory/group_vars/cis-hardening/cis index 50084c72f..4565c96f2 100644 --- a/etc/kayobe/environments/ci-multinode/inventory/group_vars/cis-hardening/cis +++ b/etc/kayobe/environments/ci-multinode/inventory/group_vars/cis-hardening/cis @@ -7,10 +7,11 @@ rhel9cis_rule_5_4_3_2: false ############################################################################## -# Ubuntu Jammy CIS Hardening Configuration +# Ubuntu Noble CIS Hardening Configuration +# TODO: Test CIS rules for Ubuntu Noble # Disable shell timeout for inactivity which can be disruptive to # development work. -ubtu22cis_rule_5_4_3_2: false +ubtu24cis_rule_5_4_3_2: false ############################################################################## diff --git a/etc/kayobe/hooks/overcloud-host-configure/pre.d/pci-passthrough.yml b/etc/kayobe/hooks/overcloud-host-configure/pre.d/pci-passthrough.yml new file mode 120000 index 000000000..ffdf55f6a --- /dev/null +++ b/etc/kayobe/hooks/overcloud-host-configure/pre.d/pci-passthrough.yml @@ -0,0 +1 @@ +../../../ansible/pci-passthrough.yml \ No newline at end of file diff --git a/etc/kayobe/inventory/group_vars/all/openbao.yml b/etc/kayobe/inventory/group_vars/all/openbao.yml new file mode 100644 index 000000000..172cc3cfc --- /dev/null +++ b/etc/kayobe/inventory/group_vars/all/openbao.yml @@ -0,0 +1,80 @@ +--- +############################################################################### +# Openbao deployment configuration. + +# Registry information for seed. 
+seed_openbao_registry_url: "{{ stackhpc_docker_registry if stackhpc_sync_openbao_images | bool else '' }}" +seed_openbao_registry_username: "{{ stackhpc_docker_registry_username if stackhpc_sync_openbao_images | bool else '' }}" +seed_openbao_registry_password: "{{ stackhpc_docker_registry_password if stackhpc_sync_openbao_images | bool else '' }}" + +# Seed OpenBao container image. +seed_openbao_docker_image: "{{ stackhpc_docker_registry ~ '/' if stackhpc_sync_openbao_images | bool else '' }}openbao/openbao" + +# Seed OpenBao container image tag. +seed_openbao_docker_tag: "2.2.1" + +# Seed OpenBao PKI Role name +seed_openbao_pki_role_name: "ServerCert" + +# Seed OpenBao PKI Roles definition +seed_openbao_pki_roles: + - name: "{{ seed_openbao_pki_role_name }}" + config: + max_ttl: 8760h + ttl: 8760h + allow_any_name: true + allow_ip_sans: true + require_cn: false + server_flag: true + key_type: rsa + key_bits: 4096 + country: ["UK"] + locality: ["Bristol"] + organization: ["StackHPC"] + ou: ["OpenStack"] + +# Registry information for overcloud. +overcloud_openbao_registry_url: "{{ stackhpc_docker_registry if stackhpc_sync_openbao_images | bool else '' }}" +overcloud_openbao_registry_username: "{{ stackhpc_docker_registry_username if stackhpc_sync_openbao_images | bool else '' }}" +overcloud_openbao_registry_password: "{{ stackhpc_docker_registry_password if stackhpc_sync_openbao_images | bool else '' }}" + +# Overcloud OpenBao container image. +overcloud_openbao_docker_image: "{{ stackhpc_docker_registry ~ '/' if stackhpc_sync_openbao_images | bool else '' }}openbao/openbao" + +# Overcloud OpenBao container image tag. 
+overcloud_openbao_docker_tag: "2.2.1" + +# Overcloud OpenBao PKI Default Role name +overcloud_openbao_pki_default_role_name: "ServerCert" + +# Overcloud OpenBao PKI Internal TLS Role name +overcloud_openbao_pki_internal_tls_role_name: "{{ overcloud_openbao_pki_default_role_name }}" + +# Overcloud OpenBao PKI Backend TLS Role name +overcloud_openbao_pki_backend_tls_role_name: "{{ overcloud_openbao_pki_default_role_name }}" + +# Overcloud OpenBao PKI External TLS Role name (for testing only) +overcloud_openbao_pki_external_tls_role_name: "{{ overcloud_openbao_pki_default_role_name }}" + +# Overcloud OpenBao PKI Roles definition +overcloud_openbao_pki_roles: + - name: "{{ overcloud_openbao_pki_default_role_name }}" + config: + max_ttl: 8760h + ttl: 8760h + allow_any_name: true + allow_ip_sans: true + require_cn: false + server_flag: true + key_type: rsa + key_bits: 4096 + country: ["UK"] + locality: ["Bristol"] + organization: ["StackHPC"] + ou: ["OpenStack"] + +seed_openbao_pki_certificate_subject: + - common_name: "{% if kolla_internal_fqdn != kolla_internal_vip_address %}{{ kolla_internal_fqdn }}{% else %}overcloud{% endif %}" + role: "{{ seed_openbao_pki_role_name }}" + extra_params: + ip_sans: "{% for host in groups['controllers'] %}{{ internal_net_name | net_ip(host) }}{% if not loop.last %},{% endif %}{% endfor %},{{ kolla_internal_vip_address }}" diff --git a/etc/kayobe/inventory/group_vars/cis-hardening/cis b/etc/kayobe/inventory/group_vars/cis-hardening/cis index 9c9c48fbe..3953adc42 100644 --- a/etc/kayobe/inventory/group_vars/cis-hardening/cis +++ b/etc/kayobe/inventory/group_vars/cis-hardening/cis @@ -192,4 +192,3 @@ ubtu24cis_rule_5_5_1_5: false # Also matches RHEL hardening behavior. 
ubtu24cis_ipv6_required: true -############################################################################## diff --git a/etc/kayobe/inventory/group_vars/wazuh-manager/wazuh-manager b/etc/kayobe/inventory/group_vars/wazuh-manager/wazuh-manager index 76ce10766..5d95f6d3b 100644 --- a/etc/kayobe/inventory/group_vars/wazuh-manager/wazuh-manager +++ b/etc/kayobe/inventory/group_vars/wazuh-manager/wazuh-manager @@ -28,7 +28,7 @@ indexer_node_name: "{{ inventory_hostname }}" indexer_network_host: "{{ provision_oc_net_name | net_ip }}" # Even in a single node setup this must be defined. If not defaults to 127.0.0.1 -indexer_cluster_nodes: +indexer_cluster_nodes: - "{{ indexer_network_host }}" instances: diff --git a/etc/kayobe/kolla-image-tags.yml b/etc/kayobe/kolla-image-tags.yml index df3123275..7bec69128 100644 --- a/etc/kayobe/kolla-image-tags.yml +++ b/etc/kayobe/kolla-image-tags.yml @@ -19,3 +19,6 @@ kolla_image_tags: rabbitmq: rocky-9: master-rocky-9-20250502T080944 ubuntu-noble: master-ubuntu-noble-20250502T080944 + skyline: + rocky-9: master-rocky-9-20250425T091159 + ubuntu-noble: master-ubuntu-noble-20250425T091159 diff --git a/etc/kayobe/kolla.yml b/etc/kayobe/kolla.yml index 8c1021e89..7491ee242 100644 --- a/etc/kayobe/kolla.yml +++ b/etc/kayobe/kolla.yml @@ -470,6 +470,24 @@ kolla_build_args: {} # * groups: A list of kayobe ansible groups to map to this kolla-ansible group. # * vars: A dict mapping variable names to values for hosts in this # kolla-ansible group. +# NOTE(Alex-Welsh): If you want to extend the map rather than replace it, you +# must include the Kayobe defaults in the mapping. +# Standard Kayobe defaults: +# compute: +# groups: +# - "compute" +# control: +# groups: +# - "controllers" +# monitoring: +# groups: +# - "controllers" +# network: +# groups: +# - "controllers" +# storage: +# groups: +# - "controllers" #kolla_overcloud_inventory_top_level_group_map: # List of names of top level kolla-ansible groups. 
Any of these groups which @@ -484,7 +502,9 @@ kolla_build_args: {} # List of names of additional host variables to pass through from kayobe hosts # to kolla-ansible hosts, if set. See also # kolla_overcloud_inventory_pass_through_host_vars_map. -#kolla_overcloud_inventory_pass_through_host_vars_extra: +kolla_overcloud_inventory_pass_through_host_vars_extra: + - stackhpc_gpu_data + - gpu_group_map # List of names of host variables to pass through from kayobe hosts to # kolla-ansible hosts, if set. See also @@ -624,7 +644,7 @@ kolla_enable_heat: false #kolla_enable_influxdb: #kolla_enable_ironic: #kolla_enable_ironic_neutron_agent: -#kolla_enable_ironic_prometheus_exporter: +kolla_enable_ironic_prometheus_exporter: false #kolla_enable_iscsid: #kolla_enable_keepalived: #kolla_enable_keystone: diff --git a/etc/kayobe/kolla/config/bifrost/bifrost.yml b/etc/kayobe/kolla/config/bifrost/bifrost.yml index 16ea3ac3b..6310d4931 100644 --- a/etc/kayobe/kolla/config/bifrost/bifrost.yml +++ b/etc/kayobe/kolla/config/bifrost/bifrost.yml @@ -8,3 +8,12 @@ cirros_deploy_image_upstream_url: "{{ stackhpc_overcloud_host_image_url }}" # Disable debug logging to avoid generating large log files ironic_debug: false + +# Use prebuilt release train IPA images from Ark. 
+{% if stackhpc_ipa_image_bifrost_enabled | bool %} +ipa_download_url_username: "{{ stackhpc_release_pulp_username }}" +ipa_download_url_password: "{{ stackhpc_release_pulp_password }}" +ipa_download_force_basic_auth: true +ipa_download_unredirected_headers: + - Authorization +{% endif %} diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json index 92001f842..6ba67ad27 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json @@ -20,12 +20,38 @@ "fiscalYearStartMonth": 0, "gnetId": 12403, "graphTooltip": 0, - "id": 28, + "id": 91, "links": [], - "liveNow": false, "panels": [ + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 31, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Overview", + "type": "row" + }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -52,7 +78,7 @@ "h": 3, "w": 4, "x": 0, - "y": 0 + "y": 1 }, "id": 53, "options": { @@ -60,6 +86,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -71,7 +98,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -79,7 +106,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(up{job=~\"redfish-exporter.*\"} == 1)", + "expr": "count(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"})", "format": "table", "hide": false, "instant": true, @@ -101,11 +128,12 @@ "refId": "B" } ], - "title": "iDRAC Up", + "title": "Redfish Up", "type": "stat" }, { "datasource": { + "default": 
false, "type": "prometheus", "uid": "${datasource}" }, @@ -132,14 +160,16 @@ "h": 3, "w": 4, "x": 4, - "y": 0 + "y": 1 }, "id": 54, + "interval": "30m", "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -151,14 +181,15 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_power_state == 1)", + "editorMode": "code", + "expr": "count(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"} == 1)", "format": "table", "hide": false, "instant": true, @@ -172,14 +203,13 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, - "description": "", "fieldConfig": { "defaults": { "mappings": [], - "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ @@ -200,7 +230,7 @@ "h": 3, "w": 4, "x": 8, - "y": 0 + "y": 1 }, "id": 55, "options": { @@ -208,6 +238,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -219,23 +250,21 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_power_state != 1)", + "editorMode": "code", + "expr": "count(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"} != 1)", "format": "table", - "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, "instant": true, "interval": "", - "legendFormat": "__auto", - "refId": "A", - "useBackend": false + "legendFormat": "", + "refId": "A" } ], "title": "Powered Off", @@ -243,13 +272,13 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": 
"${datasource}" }, "fieldConfig": { "defaults": { "mappings": [], - "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ @@ -259,7 +288,7 @@ }, { "color": "red", - "value": 80 + "value": 1 } ] } @@ -270,7 +299,7 @@ "h": 3, "w": 4, "x": 12, - "y": 0 + "y": 1 }, "id": 56, "options": { @@ -278,6 +307,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -289,14 +319,15 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_chassis_health != 1)", + "editorMode": "code", + "expr": "count(redfish_chassis_health{group=\"$group\", job!=\"redfish-exporter-collectlog\" } != 1)", "format": "table", "hide": false, "instant": true, @@ -310,14 +341,37 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, "description": "", "fieldConfig": { "defaults": { - "mappings": [], - "noValue": "0", + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "1": { + "color": "dark-green", + "index": 1, + "text": "On" + }, + "2": { + "color": "dark-red", + "index": 0, + "text": "Off" + } + }, + "type": "value" + } + ], "thresholds": { "mode": "absolute", "steps": [ @@ -326,328 +380,36 @@ "value": null }, { - "color": "red", - "value": 80 + "color": "#EAB839", + "value": 2 } ] } }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 16, - "y": 0 - }, - "id": 57, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": true - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.0.0", - "targets": [ - { - "datasource": { - "type": 
"prometheus", - "uid": "${datasource}" - }, - "expr": "count(redfish_logservices_entry_count{name=\"SEL Log Service\", severity!=\"OK\"} != 0)", - "format": "table", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Nodes with SEL Logs", - "type": "stat" - }, - { - "collapsed": false, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 3 - }, - "id": 31, - "panels": [], - "targets": [ - { - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "refId": "A" - } - ], - "title": "Overview", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ + "properties": [ { - "color": "green", - "value": null + "id": "displayName", + "value": "Time" }, { - "color": "red", - "value": 80 + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" } ] }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 15, - "w": 5, - "x": 0, - "y": 4 - }, - "id": 36, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - 
"showLegend": true - }, - "tooltip": { - "maxHeight": 600, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "sum(redfish_chassis_power_average_consumed_watts)", - "legendFormat": "Rack power consumption", - "refId": "A" - } - ], - "title": "Power Consumption", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 15, - "w": 6, - "x": 5, - "y": 4 - }, - "id": 44, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "maxHeight": 600, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "sum(redfish_chassis_power_average_consumed_watts) by (env)", - "interval": "", - "legendFormat": "{{ env }}", - "refId": "A" - } - ], - "title": "Power Consumption", - "type": "timeseries" - }, - { - 
"datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Time" - }, - "properties": [ - { - "id": "displayName", - "value": "Time" - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "instance" + { + "matcher": { + "id": "byName", + "options": "instance" }, "properties": [ { @@ -672,7 +434,7 @@ { "targetBlank": true, "title": "", - "url": "https://${__cell}" + "url": "https://${__data.fields.instance}" } ] }, @@ -717,14 +479,6 @@ { "color": "#73BF69", "value": null - }, - { - "color": "#73BF69", - "value": 0 - }, - { - "color": "#C4162A", - "value": 1 } ] } @@ -866,7 +620,7 @@ { "targetBlank": true, "title": "", - "url": "https://${__cell_3}" + "url": "https://${__data.fields.instance}" } ] }, @@ -897,16 +651,28 @@ "id": "custom.align" } ] + }, + { + "matcher": { + "id": "byName", + "options": "Power state" + }, + "properties": [ + { + "id": "custom.width", + "value": 197 + } + ] } ] }, "gridPos": { - "h": 15, - "w": 5, - "x": 11, + "h": 8, + "w": 12, + "x": 0, "y": 4 }, - "id": 38, + "id": 59, "options": { "cellHeight": "sm", "footer": { @@ -917,16 +683,23 @@ ], "show": false }, - "showHeader": true + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Power state" + } + ] }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "sort_desc(redfish_system_power_state)", + "editorMode": "code", + "expr": "sort_desc(redfish_system_power_state{group=\"$group\", 
job!=\"redfish-exporter-collectlog\"})", "format": "table", "instant": true, "interval": "", @@ -947,9 +720,11 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, + "description": "", "fieldConfig": { "defaults": { "custom": { @@ -959,7 +734,25 @@ }, "inspect": false }, - "mappings": [], + "mappings": [ + { + "options": { + "1": { + "index": 1, + "text": "Healthy" + }, + "2": { + "index": 2, + "text": "Unknown" + }, + "3": { + "index": 0, + "text": "Unhealthy" + } + }, + "type": "value" + } + ], "thresholds": { "mode": "absolute", "steps": [ @@ -1006,11 +799,7 @@ }, { "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 + "value": "" }, { "id": "custom.cellOptions", @@ -1031,12 +820,12 @@ "value": null }, { - "color": "rgba(237, 129, 40, 0.89)", - "value": 1 + "color": "#EAB839", + "value": 2 }, { "color": "rgba(245, 54, 54, 0.9)", - "value": 2 + "value": 3 } ] } @@ -1063,7 +852,7 @@ { "targetBlank": true, "title": "", - "url": "https://$__cell_4" + "url": "https://${__data.fields.instance}" } ] }, @@ -1075,12 +864,12 @@ ] }, "gridPos": { - "h": 15, - "w": 8, - "x": 16, + "h": 8, + "w": 12, + "x": 12, "y": 4 }, - "id": 33, + "id": 61, "interval": "", "options": { "cellHeight": "sm", @@ -1092,16 +881,23 @@ ], "show": false }, - "showHeader": true + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Status" + } + ] }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "sort(redfish_chassis_health)", + "editorMode": "code", + "expr": "sort(redfish_chassis_health{group=\"$group\", job!=\"redfish-exporter-collectlog\"})", "format": "table", "hide": false, "instant": true, @@ -1123,112 +919,225 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, + "description": "", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - 
"axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "align": "auto", + "cellOptions": { + "type": "auto" }, - "thresholdsStyle": { - "mode": "off" - } + "inspect": false }, - "links": [], - "mappings": [], - "min": 16, + "mappings": [ + { + "options": { + "1": { + "index": 1, + "text": "Healthy" + }, + "2": { + "index": 2, + "text": "Warning" + }, + "3": { + "index": 0, + "text": "Critical" + } + }, + "type": "value" + } + ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/__name__|chassis_id|Time|env|job|resource|instance/" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" }, { - "color": "red", - "value": 80 + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" } ] }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 11, - "w": 5, - "x": 0, - "y": 19 - }, - "id": 39, - "interval": "5m", - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "maxHeight": 600, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\"} or redfish_chassis_temperature_celsius{sensor=~\".*Inlet.*\"}) by (env)", - "hide": false, - "interval": "", - 
"legendFormat": "max inlet {{ env }}", - "range": true, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Status" + }, + { + "id": "unit", + "value": "" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + }, + { + "id": "custom.align" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "#EAB839", + "value": 2 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 3 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "server" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://${__data.fields.instance}" + } + ] + }, + { + "id": "custom.align" + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 62, + "interval": "", + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Status" + } + ] + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "max(last_over_time({__name__=~\"redfish_chassis_log_entry_severity_state|redfish_manager_log_entry_severity_state\", group=\"$group\"}[1h])) by (group, server, log_service_id, instance)", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "", "refId": "A" } ], - "title": "Max Inlet Temp", - "type": "timeseries" + "title": "Log severity summary", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + 
"indexByName": { + "Time": 0, + "Value": 4, + "group": 1, + "instance": 5, + "log_service_id": 3, + "server": 2 + }, + "renameByName": { + "Value": "Status", + "log_service_id": "Log Service", + "server": "Server" + } + } + } + ], + "type": "table" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, - "description": "", "fieldConfig": { "defaults": { "color": { @@ -1241,8 +1150,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1256,7 +1166,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -1268,7 +1178,6 @@ }, "links": [], "mappings": [], - "min": 40, "thresholds": { "mode": "absolute", "steps": [ @@ -1282,32 +1191,34 @@ } ] }, - "unit": "celsius" + "unit": "watt" }, "overrides": [] }, "gridPos": { "h": 11, "w": 6, - "x": 5, - "y": 19 + "x": 12, + "y": 12 }, - "id": 40, - "interval": "5m", + "id": 44, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -1315,19 +1226,19 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor_id=~\".*CPU1Temp\"} or redfish_chassis_temperature_celsius{sensor=~\".*CPU1.*\"}) by (env)", - "hide": false, + "expr": "sum(redfish_chassis_power_average_consumed_watts{group=\"$group\", job!=\"redfish-exporter-collectlog\"}) by (env)", "interval": "", "legendFormat": "{{ env }}", "range": true, "refId": "A" } ], - "title": "Max CPU1 Temp", + "title": "Power consumption", "type": 
"timeseries" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -1343,8 +1254,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1358,7 +1270,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -1370,7 +1282,6 @@ }, "links": [], "mappings": [], - "min": 40, "thresholds": { "mode": "absolute", "steps": [ @@ -1384,32 +1295,33 @@ } ] }, - "unit": "celsius" + "unit": "short" }, "overrides": [] }, "gridPos": { "h": 11, - "w": 5, - "x": 11, - "y": 19 + "w": 6, + "x": 18, + "y": 12 }, - "id": 41, - "interval": "5m", + "id": 42, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -1417,19 +1329,33 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor_id=~\".*CPU2Temp\"} or redfish_chassis_temperature_celsius{sensor=~\"CPU2.*\"}) by (env)", + "expr": "count(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"} == 1) by (env)", "hide": false, "interval": "", - "legendFormat": "{{ env }}", + "legendFormat": "Powered up {{ env }}", "range": true, "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "count(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"} == 2) by (env) * -1", + "hide": true, + "interval": "", + "legendFormat": "Powered down {{ env }}", + "range": true, + "refId": "B" } 
], - "title": "Max CPU2 Temp", + "title": "Powered status", "type": "timeseries" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -1445,8 +1371,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1460,7 +1387,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -1472,6 +1399,7 @@ }, "links": [], "mappings": [], + "min": 16, "thresholds": { "mode": "absolute", "steps": [ @@ -1485,63 +1413,60 @@ } ] }, - "unit": "short" + "unit": "celsius" }, "overrides": [] }, "gridPos": { "h": 11, - "w": 4, - "x": 16, - "y": 19 + "w": 7, + "x": 0, + "y": 23 }, - "id": 42, + "id": 39, + "interval": "5m", "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_power_state == 1) by (env)", + "editorMode": "code", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\", job!=\"redfish-exporter-collectlog\"}) by (env)", "hide": false, "interval": "", - "legendFormat": "Powered up {{ env }}", + "legendFormat": "max inlet {{ env }}", + "range": true, "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "count(redfish_system_power_state == 2) by (env) * -1", - "hide": true, - "interval": "", - "legendFormat": "Powered down {{ env }}", - "refId": "B" } ], - "title": "Powered ON by Rack", + "title": "Max 
Inlet Temp", "type": "timeseries" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, + "description": "", "fieldConfig": { "defaults": { "color": { @@ -1554,8 +1479,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1569,7 +1495,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -1581,6 +1507,7 @@ }, "links": [], "mappings": [], + "min": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -1594,31 +1521,76 @@ } ] }, - "unit": "short" + "unit": "celsius" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsNull", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] }, "gridPos": { "h": 11, - "w": 4, - "x": 20, - "y": 19 - }, - "id": 43, + "w": 8, + "x": 7, + "y": 23 + }, + "id": 40, + "interval": "5m", "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -1626,26 +1598,163 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state == 1) by (env)", - "hide": true, + "expr": 
"max(redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job!=\"redfish-exporter-collectlog\"}) by (env)", + "hide": false, "interval": "", - "legendFormat": "Powered up {{ env }}", + "legendFormat": "{{ env }}", "range": true, "refId": "A" + } + ], + "title": "Max CPU1 Temp", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 20, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsNull", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 9, + "x": 15, + "y": 23 + }, + "id": 41, + "interval": "5m", + "options": { + "legend": { + 
"calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "7.1.5", + "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_power_state == 2) by (env)", + "editorMode": "code", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\"CPU2 Temp|.*CPU.2.*Temp\", group=\"$group\", job!=\"redfish-exporter-collectlog\"}) by (env)", "hide": false, "interval": "", - "legendFormat": "Powered down {{ env }}", - "refId": "B" + "legendFormat": "{{ env }}", + "range": true, + "refId": "A" } ], - "title": "Powered OFF by Rack", + "title": "Max CPU2 Temp", "type": "timeseries" }, { @@ -1659,6 +1768,7 @@ }, "dataFormat": "timeseries", "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -1679,9 +1789,9 @@ }, "gridPos": { "h": 10, - "w": 7, + "w": 6, "x": 0, - "y": 30 + "y": 34 }, "heatmap": {}, "hideZeroBuckets": false, @@ -1719,7 +1829,6 @@ }, "showValue": "never", "tooltip": { - "maxHeight": 600, "mode": "single", "showColorScale": false, "yHistogram": true @@ -1730,7 +1839,7 @@ "unit": "short" } }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "reverseYBuckets": false, "targets": [ { @@ -1739,7 +1848,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\"} or redfish_chassis_temperature_celsius{sensor=~\".*Inlet.*\"}", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\", job!=\"redfish-exporter-collectlog\"}", "hide": false, "interval": "", "intervalFactor": 1, @@ -1775,6 +1884,7 @@ }, "dataFormat": "timeseries", "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -1796,14 +1906,14 @@ }, "gridPos": { "h": 10, - "w": 5, - "x": 7, - "y": 30 + "w": 6, + "x": 6, + "y": 34 }, 
"heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, - "id": 51, + "id": 48, "interval": "1m", "legend": { "show": false @@ -1836,18 +1946,19 @@ }, "showValue": "never", "tooltip": { - "maxHeight": 600, "mode": "single", "showColorScale": false, "yHistogram": true }, "yAxis": { "axisPlacement": "left", + "max": "95", + "min": "25", "reverse": false, "unit": "short" } }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "reverseYBuckets": false, "targets": [ { @@ -1855,16 +1966,17 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "max(redfish_chassis_fan_rpm_percentage) by (server) > 0", + "editorMode": "code", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job!=\"redfish-exporter-collectlog\"} != 0", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "{{ env }}", "range": true, "refId": "A" } ], - "title": "Max server fan speed", + "title": "CPU1 Temp", "tooltip": { "show": true, "showHistogram": true @@ -1876,6 +1988,8 @@ "yAxis": { "format": "short", "logBase": 1, + "max": "95", + "min": "25", "show": true }, "yBucketBound": "auto" @@ -1891,6 +2005,7 @@ }, "dataFormat": "timeseries", "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -1914,12 +2029,12 @@ "h": 10, "w": 6, "x": 12, - "y": 30 + "y": 34 }, "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, - "id": 48, + "id": 50, "interval": "1m", "legend": { "show": false @@ -1952,7 +2067,6 @@ }, "showValue": "never", "tooltip": { - "maxHeight": 600, "mode": "single", "showColorScale": false, "yHistogram": true @@ -1965,7 +2079,7 @@ "unit": "short" } }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "reverseYBuckets": false, "targets": [ { @@ -1974,7 +2088,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "(redfish_chassis_temperature_celsius{sensor_id=~\".*CPU1Temp\"} or redfish_chassis_temperature_celsius{sensor=~\"CPU1.*\"}) 
!= 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*2.*Temp\", group=\"$group\", job!=\"redfish-exporter-collectlog\"} != 0", "hide": false, "interval": "", "intervalFactor": 1, @@ -1983,7 +2097,7 @@ "refId": "A" } ], - "title": "CPU1 Temp", + "title": "CPU2 Temp", "tooltip": { "show": true, "showHistogram": true @@ -2012,6 +2126,7 @@ }, "dataFormat": "timeseries", "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -2035,12 +2150,12 @@ "h": 10, "w": 6, "x": 18, - "y": 30 + "y": 34 }, "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, - "id": 50, + "id": 51, "interval": "1m", "legend": { "show": false @@ -2073,20 +2188,17 @@ }, "showValue": "never", "tooltip": { - "maxHeight": 600, "mode": "single", "showColorScale": false, "yHistogram": true }, "yAxis": { "axisPlacement": "left", - "max": "95", - "min": "25", "reverse": false, "unit": "short" } }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "reverseYBuckets": false, "targets": [ { @@ -2095,16 +2207,16 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "(redfish_chassis_temperature_celsius{sensor_id=~\".*CPU2Temp\"} or redfish_chassis_temperature_celsius{sensor=~\"CPU2.*\"}) != 0", + "expr": "max(redfish_chassis_fan_rpm_percentage{group=\"$group\", job!=\"redfish-exporter-collectlog\"}) by (server) > 0", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{ env }}", + "legendFormat": "", "range": true, "refId": "A" } ], - "title": "CPU2 Temp", + "title": "Max server fan speed", "tooltip": { "show": true, "showHistogram": true @@ -2116,14 +2228,13 @@ "yAxis": { "format": "short", "logBase": 1, - "max": "95", - "min": "25", "show": true }, "yBucketBound": "auto" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -2139,6 +2250,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, 
"gradientMode": "none", @@ -2170,8 +2282,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2185,9 +2296,9 @@ }, "gridPos": { "h": 10, - "w": 13, + "w": 12, "x": 0, - "y": 40 + "y": 44 }, "id": 47, "interval": "5m", @@ -2203,12 +2314,11 @@ "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -2216,7 +2326,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\"} or redfish_chassis_temperature_celsius{sensor=~\"CPU2.*\"}", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\", job!=\"redfish-exporter-collectlog\"} or redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\", job!=\"redfish-exporter-collectlog\"} or redfish_chassis_temperature_celsius{sensor=~\".*Inlet.*\", job!=\"redfish-exporter-collectlog\"}", "hide": false, "interval": "", "legendFormat": "{{ env }} {{ server }}", @@ -2229,6 +2339,7 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -2245,6 +2356,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -2276,8 +2388,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2291,9 +2402,9 @@ }, "gridPos": { "h": 10, - "w": 11, - "x": 13, - "y": 40 + "w": 12, + "x": 12, + "y": 44 }, "id": 49, "interval": "1m", @@ -2308,12 +2419,11 @@ "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -2321,7 +2431,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "(redfish_chassis_temperature_celsius{sensor_id=~\".*CPU1Temp\"} or 
redfish_chassis_temperature_celsius{sensor=~\"CPU1.*\"}) != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job!=\"redfish-exporter-collectlog\"} != 0", "hide": false, "interval": "", "legendFormat": "{{ env }} {{ server }}", @@ -2332,252 +2442,96 @@ "title": "Max CPU1 Temp", "type": "timeseries" }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 54 + }, + "id": 29, + "panels": [], + "repeat": "server", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "$server", + "type": "row" + }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, "mappings": [], + "max": 500, + "min": 0, "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Time" - }, - "properties": [ - { - "id": "displayName", - "value": "Time" - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" + "color": "green" } ] }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Count" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-background" - } - }, - { - "id": "custom.align" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 0 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 1 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", 
- "options": "instance" - }, - "properties": [ - { - "id": "displayName", - "value": "BMC" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "", - "url": "https://${__cell:raw}" - } - ] - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "__name__" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "job" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - } - ] + "unit": "watt" + }, + "overrides": [] }, "gridPos": { - "h": 7, - "w": 24, + "h": 4, + "w": 8, "x": 0, - "y": 50 + "y": 55 }, - "id": 34, + "id": 19, "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" ], - "show": false + "fields": "", + "values": false }, - "showHeader": true + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "sort(redfish_logservices_entry_count{name=\"SEL Log Service\", severity!=\"OK\"})", - "format": "table", - "instant": true, + "editorMode": "code", + "expr": "redfish_chassis_power_average_consumed_watts{server=\"$server\", job!=\"redfish-exporter-collectlog\"}", + "hide": false, "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "title": "Errors in event log", - "transformations": [ - { - "id": "merge", - "options": { - 
"reducers": [] - } - } - ], - "type": "table" - }, - { - "collapsed": false, - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 57 - }, - "id": 29, - "panels": [], - "repeat": "server", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, + "legendFormat": "{{power_voltage}}", + "range": true, "refId": "A" } ], - "title": "$server", - "type": "row" + "title": "Power consumption", + "type": "gauge" }, { "datasource": { @@ -2586,33 +2540,31 @@ }, "fieldConfig": { "defaults": { + "displayName": "Disks", "mappings": [], - "max": 500, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", - "value": 80 + "value": 1 } ] - }, - "unit": "watt" + } }, "overrides": [] }, "gridPos": { "h": 4, - "w": 8, - "x": 0, - "y": 58 + "w": 2, + "x": 8, + "y": 55 }, - "id": 19, + "id": 24, "options": { "minVizHeight": 75, "minVizWidth": 75, @@ -2628,21 +2580,22 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\"}", - "hide": false, + "expr": "count(redfish_system_storage_drive_state{server=~\"$server\", job!=\"redfish-exporter-collectlog\"} != 1) or vector(0)", + "format": "table", + "instant": true, "interval": "", - "legendFormat": "{{type}}", + "legendFormat": "", "refId": "A" } ], - "title": "Power consumption", + "title": "Disks with errors", "type": "gauge" }, { @@ -2652,759 +2605,54 @@ }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - 
"gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 3, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "line+area" - } - }, - "decimals": 1, - "links": [], + "displayName": "Controllers", "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "red", - "value": null + "color": "green" }, { - "color": "transparent", - "value": 10000 + "color": "red", + "value": 1 } ] - }, - "unit": "rpm" - }, - "overrides": [ - { - "matcher": { - "id": "byValue", - "options": { - "op": "gte", - "reducer": "allIsZero", - "value": 0 - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": true, - "tooltip": true, - "viz": false - } - } - ] - }, - { - "matcher": { - "id": "byValue", - "options": { - "op": "gte", - "reducer": "allIsNull", - "value": 0 - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": true, - "tooltip": true, - "viz": false - } - } - ] - } - ] - }, - "gridPos": { - "h": 14, - "w": 8, - "x": 8, - "y": 58 - }, - "id": 4, - "interval": "", - "options": { - "legend": { - "calcs": [ - "max" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "maxHeight": 600, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "redfish_chassis_fan_rpm_percentage{server=~\"$server\"}", - "format": "time_series", - "instant": false, - "interval": "", - "legendFormat": "{{fan}}", - "refId": "A" - } - ], - "title": "Fans", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - 
"mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 58 - }, - "id": 13, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "maxHeight": 600, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\"}", - "interval": "", - "legendFormat": "{{resource}}", - "refId": "A" - } - ], - "title": "Power consumption", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "1": { - "text": "Up" - }, - "2": { - "text": "Down" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#d44a3a", - "value": null - }, - { - "color": "#299c46", - "value": 0 - }, - { - "color": "#299c46", - "value": 2 - } - ] - }, - 
"unit": "none" + } }, "overrides": [] }, "gridPos": { - "h": 2, + "h": 4, "w": 2, - "x": 0, - "y": 62 + "x": 10, + "y": 55 }, - "id": 6, - "maxDataPoints": 100, + "id": 25, "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "redfish_system_power_state{server=~\"$server\"}", - "format": "time_series", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Power Status", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "1": { - "text": "OK" - }, - "2": { - "text": "WARNING" - }, - "3": { - "text": "CRITICAL" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#299c46", - "value": null - }, - { - "color": "#299c46", - "value": 1 - }, - { - "color": "#d44a3a", - "value": 2 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 2, - "y": 62 - }, - "id": 7, - "maxDataPoints": 100, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "vertical", - "reduceOptions": { - "calcs": [ - "max" - ], - "fields": "/^Value$/", - "values": false - }, - "showPercentChange": false, - "textMode": "value", - "wideLayout": true - }, - "pluginVersion": "11.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "redfish_chassis_health{server=~\"$server\"}", - "format": "table", - 
"instant": true, - "interval": "", - "legendFormat": "{{chassis_id}}", - "range": false, - "refId": "A" - } - ], - "title": "General Health", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "1": { - "text": "Healthy" - }, - "2": { - "text": "Warning" - }, - "3": { - "text": "Critical" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#d44a3a", - "value": null - }, - { - "color": "#299c46", - "value": 1 - }, - { - "color": "#299c46", - "value": 2 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 6, - "y": 62 - }, - "id": 8, - "maxDataPoints": 100, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "redfish_manager_health_state{server=~\"$server\"}", - "format": "time_series", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "BMC Health", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Time" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", 
- "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "__name__" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "instance" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "job" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "env" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Status" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-background" - } - }, - { - "id": "custom.align" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(245, 54, 54, 0.9)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 0 - }, - { - "color": "rgba(50, 172, 45, 0.97)", - "value": 1 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "resource" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - 
} - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 8, - "x": 0, - "y": 64 - }, - "id": 2, - "interval": "", - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" ], - "show": false + "fields": "", + "values": false }, - "showHeader": true + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_system_storage_drive_state{server=~\"$server\"}", + "expr": "count(redfish_system_pcie_device_health_state{server=~\"$server\", job!=\"redfish-exporter-collectlog\"} != 1) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -3412,37 +2660,29 @@ "refId": "A" } ], - "title": "Disk states / health", - "transformations": [ - { - "id": "merge", - "options": { - "reducers": [] - } - } - ], - "type": "table" + "title": "PCI-E with errors", + "type": "gauge" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "displayName": "Disks", + "displayName": "Sensor", "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", - "value": 80 + "value": 1 } ] } @@ -3452,10 +2692,10 @@ "gridPos": { "h": 4, "w": 2, - "x": 8, - "y": 72 + "x": 12, + "y": 55 }, - "id": 24, + "id": 26, "options": { "minVizHeight": 75, "minVizWidth": 75, @@ -3471,14 +2711,15 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_storage_drive_state{server=~\"$server\"} != 1) or vector(0)", + "editorMode": "code", + "expr": 
"count(redfish_chassis_temperature_sensor_state{server=~\"$server\", job!=\"redfish-exporter-collectlog\"} > 2) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -3486,7 +2727,7 @@ "refId": "A" } ], - "title": "Disk with errors", + "title": "Sensors with errors", "type": "gauge" }, { @@ -3496,19 +2737,18 @@ }, "fieldConfig": { "defaults": { - "displayName": "Controllers", + "displayName": "Power Supply", "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", - "value": 80 + "value": 1 } ] } @@ -3518,10 +2758,10 @@ "gridPos": { "h": 4, "w": 2, - "x": 10, - "y": 72 + "x": 14, + "y": 55 }, - "id": 25, + "id": 27, "options": { "minVizHeight": 75, "minVizWidth": 75, @@ -3537,14 +2777,14 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_pcie_device_health_state{server=~\"$server\"} != 1) or vector(0)", + "expr": "count(redfish_chassis_power_powersupply_health{server=~\"$server\", job!=\"redfish-exporter-collectlog\"} > 1) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -3552,74 +2792,308 @@ "refId": "A" } ], - "title": "PCI-E with errors", + "title": "PS with errors", "type": "gauge" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "displayName": "Fans", + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, "mappings": [], + "max": 80, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", "value": 80 } ] + }, + "unit": "degree" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Temperature" + }, + "properties": 
[ + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + } + ] + } + ] + }, + "gridPos": { + "h": 18, + "w": 8, + "x": 16, + "y": 55 + }, + "id": 17, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Temperature" + } + ] + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (sensor) (sum_over_time(redfish_chassis_temperature_celsius{server=\"$server\", job!=\"redfish-exporter-collectlog\"}[15m])) / sum by (sensor) (count_over_time(redfish_chassis_temperature_celsius{server=\"$server\", job!=\"redfish-exporter-collectlog\"}[15m])) > 0", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "{{sensor}}", + "range": false, + "refId": "A" + } + ], + "title": "Temperatures", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "Temperature", + "sensor": "Sensor" + } } + } + ], + "type": "table" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "1": { + "text": "Up" + } + }, + "type": "value" + }, + { + "options": { + "2": { + "text": "Down" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a" + }, + { + "color": "#299c46", + "value": 0 + }, + { + "color": "#299c46", + "value": 2 + } + ] + }, + "unit": "none" }, "overrides": [] }, "gridPos": { "h": 4, - "w": 2, - "x": 12, - "y": 72 + "w": 3, + "x": 0, + "y": 59 }, - "id": 26, + "id": 6, + "maxDataPoints": 100, "options": { - 
"minVizHeight": 75, - "minVizWidth": 75, + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "redfish_system_power_state{server=\"$server\", job!=\"redfish-exporter-collectlog\"}", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Power Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "1": { + "text": "OK" + }, + "2": { + "text": "WARNING" + }, + "3": { + "text": "CRITICAL" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46" + }, + { + "color": "#299c46", + "value": 1 + }, + { + "color": "#d44a3a", + "value": 2 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 3, + "y": 59 + }, + "id": 66, + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "vertical", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ - "mean" + "max" ], - "fields": "", + "fields": "/^Value$/", "values": false }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" + "showPercentChange": false, + "textMode": "value", + "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": 
"count(redfish_chassis_temperature_sensor_state{server=~\"$server\"} != 1) or vector(0)", + "editorMode": "code", + "exemplar": false, + "expr": "redfish_chassis_health{server=~\"$server\"}", "format": "table", "instant": true, "interval": "", - "legendFormat": "", + "legendFormat": "{{chassis_id}}", + "range": false, "refId": "A" } ], - "title": "Sensors with errors", - "type": "gauge" + "title": "General Health", + "type": "stat" }, { "datasource": { @@ -3628,132 +3102,242 @@ }, "fieldConfig": { "defaults": { - "displayName": "Power Supply", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, "mappings": [], + "max": 80, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "orange", + "value": 50 }, { "color": "red", - "value": 80 + "value": 70 } ] - } + }, + "unit": "degree" }, "overrides": [] }, "gridPos": { - "h": 4, - "w": 2, - "x": 14, - "y": 72 + "h": 7, + "w": 8, + "x": 8, + "y": 59 }, - "id": 27, + "id": 45, "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { + "legend": { "calcs": [ - "mean" + "lastNotNull", + "max", + "min" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" + 
"tooltip": { + "mode": "multi", + "sort": "none" + } }, - "pluginVersion": "11.0.0", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_chassis_power_powersupply_health{server=~\"$server\"} > 1) or vector(0)", - "format": "table", - "instant": true, + "expr": "redfish_chassis_temperature_celsius{server=~\"$server\", job!=\"redfish-exporter-collectlog\"}", "interval": "", - "legendFormat": "", + "legendFormat": "{{sensor}}", "refId": "A" } ], - "title": "PS with errors", - "type": "gauge" + "title": "Temperatures", + "type": "timeseries" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { + "color": { + "mode": "fixed" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "color-background" + }, + "filterable": false, + "inspect": false + }, "mappings": [], - "max": 80, - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "#299c46" }, { - "color": "red", - "value": 80 + "color": "#299c46", + "value": 1 + }, + { + "color": "#d44a3a", + "value": 2 } ] }, - "unit": "degree" + "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "1": { + "color": "dark-green", + "index": 0, + "text": "OK" + }, + "2": { + "color": "dark-yellow", + "index": 1, + "text": "WARNING" + }, + "3": { + "color": "dark-red", + "index": 2, + "text": "CRITICAL" + } + }, + "type": "value" + } + ] + } + ] + } + ] }, "gridPos": { - "h": 4, - "w": 24, + "h": 10, + "w": 8, "x": 0, - "y": 76 + "y": 63 }, - "id": 17, + "id": 7, + "maxDataPoints": 100, "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "vertical", - "reduceOptions": { - "calcs": [ - "mean" - ], + "cellHeight": "sm", + "footer": { + "countRows": false, + 
"enablePagination": true, "fields": "", - "values": false + "reducer": [ + "sum" + ], + "show": false }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" + "showHeader": true, + "sortBy": [] }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_chassis_temperature_celsius{server=~\"$server\"}", - "interval": "", - "legendFormat": "{{sensor}}", + "editorMode": "code", + "expr": "redfish_chassis_health{server=\"$server\", job!=\"redfish-exporter-collectlog\"}", + "format": "table", + "instant": true, + "interval": "30m", + "legendFormat": "Chassis {{chassis_id}}", "refId": "A" } ], - "title": "Temperatures", - "type": "gauge" + "title": "Health by Chassis", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "env": true, + "group": true, + "instance": true, + "job": true, + "resource": true, + "server": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "table" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -3769,8 +3353,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -3784,7 +3369,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -3794,14 +3379,14 @@ "mode": "off" } }, + "decimals": 0, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3809,44 +3394,49 @@ } ] }, - "unit": "degree" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 10, - "w": 24, - "x": 0, - "y": 80 + "h": 7, 
+ "w": 8, + "x": 8, + "y": 66 }, - "id": 45, + "id": 13, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_chassis_temperature_celsius{server=~\"$server\"}", + "editorMode": "code", + "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\", job!=\"redfish-exporter-collectlog\"}", "interval": "", - "legendFormat": "{{sensor}}", + "legendFormat": "{{power_voltage}}", + "range": true, "refId": "A" } ], - "title": "Temperatures", + "title": "Power consumption", "type": "timeseries" } ], @@ -3877,24 +3467,89 @@ }, { "current": { - "isNone": true, "selected": false, - "text": "None", - "value": "" + "text": "production", + "value": "production" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(redfish_exporter_collector_duration_seconds,env)", + "hide": 0, + "includeAll": false, + "label": "env", + "multi": false, + "name": "env", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(redfish_exporter_collector_duration_seconds,env)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "compute", + "value": "compute" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(redfish_exporter_collector_duration_seconds{env=\"$env\"},group)", + "hide": 0, + "includeAll": false, + "label": "group", + "multi": false, + "name": "group", + "options": [], + "query": { + 
"qryType": 1, + "query": "label_values(redfish_exporter_collector_duration_seconds{env=\"$env\"},group)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "svn1-dr07-u10", + "value": "svn1-dr07-u10" }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(redfish_exporter_collector_duration_seconds, server)", + "definition": "label_values(redfish_exporter_collector_duration_seconds{group=\"$group\", env=\"$env\"},server)", "hide": 0, "includeAll": false, "label": "server", "multi": false, "name": "server", "options": [], - "query": "label_values(redfish_exporter_collector_duration_seconds, server)", - "refresh": 1, + "query": { + "qryType": 1, + "query": "label_values(redfish_exporter_collector_duration_seconds{group=\"$group\", env=\"$env\"},server)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -3909,7 +3564,6 @@ "from": "now-30m", "to": "now-5m" }, - "timeRangeUpdatedDuringEditOrView": false, "timepicker": { "nowDelay": "1m", "refresh_intervals": [ @@ -3927,8 +3581,8 @@ }, "timezone": "", "title": "Redfish exporter", - "uid": "b02mElQGX", - "version": 1, + "uid": "redfish", + "version": 3, "weekStart": "" } {% endraw %} diff --git a/etc/kayobe/kolla/config/nova/nova-api.conf b/etc/kayobe/kolla/config/nova/nova-api.conf new file mode 100644 index 000000000..59e3a6102 --- /dev/null +++ b/etc/kayobe/kolla/config/nova/nova-api.conf @@ -0,0 +1,4 @@ +[pci] +{% for item in gpu_group_map | dict2items | map(attribute='value') | flatten | unique | list %} +alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", 
"name":"{{ stackhpc_gpu_data[item].resource_name }}" } +{% endfor %} diff --git a/etc/kayobe/kolla/config/nova/nova-compute.conf b/etc/kayobe/kolla/config/nova/nova-compute.conf new file mode 100644 index 000000000..5f8593dde --- /dev/null +++ b/etc/kayobe/kolla/config/nova/nova-compute.conf @@ -0,0 +1,13 @@ +[pci] +{% raw %} +{% set gpu_list = [] %} +{% for gpu_group in gpu_group_map | dict2items | default([]) %} +{% if gpu_group.key in group_names %} +{% set _ = gpu_list.append(gpu_group.value) %} +{% endif %} +{% endfor %} +{% for item in gpu_list | flatten | unique %} +device_spec = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}" } +alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" } +{% endfor %} +{% endraw %} diff --git a/etc/kayobe/kolla/config/nova/nova-scheduler.conf b/etc/kayobe/kolla/config/nova/nova-scheduler.conf new file mode 100644 index 000000000..f41bd8548 --- /dev/null +++ b/etc/kayobe/kolla/config/nova/nova-scheduler.conf @@ -0,0 +1,7 @@ +[filter_scheduler] +# Default list plus PciPassthroughFilter +# NOTE(Upgrade): defaults may change in each release. 
Default values can be +# checked here: +# https://docs.openstack.org/nova/latest/configuration/sample-config.html +enabled_filters = ComputeFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter,ServerGroupAntiAffinityFilter,ServerGroupAffinityFilter,PciPassthroughFilter +available_filters = nova.scheduler.filters.all_filters diff --git a/etc/kayobe/kolla/config/prometheus/ceph.rules b/etc/kayobe/kolla/config/prometheus/ceph.rules index 58698cdaf..88b04f1e6 100644 --- a/etc/kayobe/kolla/config/prometheus/ceph.rules +++ b/etc/kayobe/kolla/config/prometheus/ceph.rules @@ -139,7 +139,7 @@ groups: # alert on nic packet errors and drops rates > alertmanager_packet_drop_threshold packet/s - alert: NetworkPacketsDropped - expr: irate(node_network_receive_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) + irate(node_network_transmit_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) > {% endraw %}{{ alertmanager_packet_drop_threshold }}{% raw %} + expr: irate(node_network_receive_drop_total{device!~"lo|br.*|.*-ovs|tap.*|ha.*|qc.*|qr.*|qg.*"}[5m]) + irate(node_network_transmit_drop_total{device!~"lo|br.*|.*-ovs|tap.*|ha.*|qc.*|qr.*|qg.*"}[5m]) > {% endraw %}{{ alertmanager_packet_drop_threshold }}{% raw %} labels: severity: warning annotations: diff --git a/etc/kayobe/kolla/config/prometheus/prometheus-alertmanager.msteamsv2.yml.example b/etc/kayobe/kolla/config/prometheus/prometheus-alertmanager.msteamsv2.yml.example new file mode 100644 index 000000000..e4575f831 --- /dev/null +++ b/etc/kayobe/kolla/config/prometheus/prometheus-alertmanager.msteamsv2.yml.example @@ -0,0 +1,29 @@ +--- +global: + resolve_timeout: 5m + smtp_require_tls: true + +route: + receiver: 'msteamsv2-notifications' + group_by: [alertname] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + + routes: + - matchers: + - severity=~"critical|alert" + receiver: 'msteamvs2-critical-notifications' + +receivers: + - name: 'msteamsv2-notifications' + msteamsv2_configs: + - webhook_url: '{{ 
secrets_msteams_notification_channel_url | default('https://prod-01.westeurope.logic.azure.com/workflows/') }}' + send_resolved: true + - name: 'msteamsv2-critical-notifications' + msteamsv2_configs: + - webhook_url: '{{ secrets_msteams_notification_critical_channel_url | default('https://prod-01.westeurope.logic.azure.com/workflows/') }}' + send_resolved: true + +templates: + - '/etc/prometheus/*.tmpl' diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml index 6f234e5a0..54a8ae2e5 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml @@ -2,10 +2,12 @@ --- {% if seed_redfish_exporter_container_enabled | bool %} scrape_configs: - - job_name: redfish-exporter-seed + - job_name: redfish-exporter + params: + collectlogs: ['false'] metrics_path: /redfish - scrape_timeout: 120s - scrape_interval: {{ [8 * groups['redfish_exporter_targets'] | length, 120] | max }}s + scrape_timeout: 300s + scrape_interval: {{ stackhpc_redfish_exporter_scrape_interval }}s relabel_configs: - source_labels: [__address__] target_label: __param_target diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml index f886734a4..77e224bbc 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml @@ -7,8 +7,10 @@ scrape_configs: static_configs: - targets: - '{{ kolla_internal_fqdn | put_address_in_context('url') }}:9090' - scrape_interval: 15m +{% endraw %} + scrape_interval: "{{ stackhpc_os_capacity_scrape_interval }}s" scrape_timeout: 10m +{% raw %} {% if kolla_enable_tls_internal | bool %} scheme: https {% endif %} diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index 
c36b659b5..85778eaac 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -56,3 +56,5 @@ prometheus_blackbox_exporter_endpoints_kayobe: - endpoints: - "pulp:http_2xx:{{ pulp_url }}/pulp/api/v3/status/" enabled: "{{ seed_pulp_container_enabled | bool }}" + +prometheus_openstack_exporter_interval: "{{ stackhpc_prometheus_openstack_exporter_interval }}s" diff --git a/etc/kayobe/openbao.yml b/etc/kayobe/openbao.yml new file mode 100644 index 000000000..0a545eced --- /dev/null +++ b/etc/kayobe/openbao.yml @@ -0,0 +1,47 @@ +--- +############################################################################### +# OpenBao deployment configuration. + +# Registry information for seed. +# seed_openbao_registry_url: +# seed_openbao_registry_username: +# seed_openbao_registry_password: + +# Seed OpenBao container image. +# seed_openbao_docker_image: + +# Seed OpenBao container image tag. +seed_openbao_docker_tag: 2.2.1 + +# Seed OpenBao PKI Role name +# seed_openbao_pki_role_name: + +# Seed OpenBao PKI Roles definition +# seed_openbao_pki_roles: [] + +# Registry information for overcloud. +# overcloud_openbao_registry_url: +# overcloud_openbao_registry_username: +# overcloud_openbao_registry_password: + +# Overcloud OpenBao container image. +# overcloud_openbao_docker_image: + +# Overcloud OpenBao container image tag. +overcloud_openbao_docker_tag: 2.2.1 + +# Overcloud OpenBao PKI Default Role name +# overcloud_openbao_pki_default_role_name: + +# Overcloud OpenBao PKI Internal TLS Role name +# overcloud_openbao_pki_internal_tls_role_name: + +# Overcloud OpenBao PKI Backend TLS Role name +# overcloud_openbao_pki_backend_tls_role_name: + +# Overcloud OpenBao PKI Roles definition +# overcloud_openbao_pki_roles: [] + +############################################################################### +# Dummy variable to allow Ansible to accept this file. 
+workaround_ansible_issue_8743: yes diff --git a/etc/kayobe/pulp.yml b/etc/kayobe/pulp.yml index ac52dd44f..f29bf4a4b 100644 --- a/etc/kayobe/pulp.yml +++ b/etc/kayobe/pulp.yml @@ -513,8 +513,8 @@ stackhpc_pulp_images_kolla: - ovn-controller - ovn-nb-db-server - ovn-northd - - ovn-sb-db-server - ovn-sb-db-relay + - ovn-sb-db-server - placement-api - prometheus-alertmanager - prometheus-blackbox-exporter @@ -531,6 +531,8 @@ stackhpc_pulp_images_kolla: - rabbitmq - redis - redis-sentinel + - skyline-apiserver + - skyline-console # List of images for each base distribution which should not/cannot be built. stackhpc_kolla_unbuildable_images: @@ -559,10 +561,11 @@ stackhpc_pulp_repository_container_repos_kolla_common: # List of Kolla container image repositories. stackhpc_pulp_repository_container_repos_kolla: >- {%- set repos = [] -%} + {%- set image_tags = lookup('pipe', 'python3 ' ~ kayobe_config_path ~ '/../../tools/kolla-images.py list-tags') | from_yaml -%} {%- for image in stackhpc_pulp_images_kolla_filtered -%} - {%- if image not in stackhpc_kolla_unbuildable_images[kolla_base_distro_and_version]-%} + {%- if image not in stackhpc_kolla_unbuildable_images[kolla_base_distro_and_version] -%} {%- set image_repo = kolla_docker_namespace ~ "/" ~ image -%} - {%- set repo = {"name": image_repo} -%} + {%- set repo = {"name": image_repo, "include_tags": image_tags[image]} -%} {%- set _ = repos.append(stackhpc_pulp_repository_container_repos_kolla_common | combine(repo)) -%} {%- endif -%} {%- endfor -%} @@ -663,6 +666,18 @@ stackhpc_pulp_distribution_container_hashicorp: state: present required: "{{ stackhpc_sync_hashicorp_images | bool }}" +stackhpc_sync_openbao_images: false + +# List of OpenBao container image repositories. 
+stackhpc_pulp_repository_container_repos_openbao: + - name: "openbao/openbao" + url: "https://registry-1.docker.io" + policy: on_demand + proxy_url: "{{ pulp_proxy_url }}" + state: present + include_tags: "{{ overcloud_vault_docker_tag }}" + required: "{{ stackhpc_sync_openbao_images | bool }}" + # List of extra container image repositories. stackhpc_pulp_repository_container_repos_extra: [] @@ -674,6 +689,7 @@ stackhpc_pulp_repository_container_repos: >- {{ (stackhpc_pulp_repository_container_repos_kolla + stackhpc_pulp_repository_container_repos_ceph + stackhpc_pulp_repository_container_repos_hashicorp + + stackhpc_pulp_repository_container_repos_openbao + stackhpc_pulp_repository_container_repos_extra) | selectattr('required') }} # List of container image distributions. @@ -681,4 +697,5 @@ stackhpc_pulp_distribution_container: >- {{ (stackhpc_pulp_distribution_container_kolla + stackhpc_pulp_distribution_container_ceph + stackhpc_pulp_distribution_container_hashicorp + + stackhpc_pulp_distribution_container_openbao + stackhpc_pulp_distribution_container_extra) | selectattr('required') }} diff --git a/etc/kayobe/seed.yml b/etc/kayobe/seed.yml index c76b82f8a..4542c9bbc 100644 --- a/etc/kayobe/seed.yml +++ b/etc/kayobe/seed.yml @@ -145,9 +145,9 @@ seed_redfish_exporter_container: image: ghcr.io/stackhpc/redfish-exporter pre: "{{ kayobe_config_path }}/containers/redfish_exporter/pre.yml" post: "{{ kayobe_config_path }}/containers/redfish_exporter/post.yml" - tag: "v1.0.2" + tag: "v2.1.1-stackhpc" network_mode: host - command: ./main --config.file /redfish_exporter.yml + command: redfish_exporter --config.file /redfish_exporter.yml volumes: "/opt/kayobe/containers/redfish_exporter/redfish_exporter.yml:/redfish_exporter.yml:ro" restart_policy: unless-stopped diff --git a/etc/kayobe/stackhpc-compute.yml b/etc/kayobe/stackhpc-compute.yml new file mode 100644 index 000000000..5e86b0030 --- /dev/null +++ b/etc/kayobe/stackhpc-compute.yml @@ -0,0 +1,103 @@ +--- +# 
StackHPC compute node configuration + +# Map of inventory groups to GPU types. +# This is used to determine which GPU types each compute node should pass +# through to OpenStack. +# Keys are group names, values are a list of GPU types. +# Groups must be added to kolla_overcloud_inventory_top_level_group_map +# GPU types must be keys in stackhpc_gpu_data. +# Example GPU group map: +# gpu_group_map: +# compute_a100: +# - a100_80 +# compute_v100: +# - v100_32 +# compute_multi_gpu: +# - a100_80 +# - v100_32 +gpu_group_map: {} + +# Dict mapping GPUs to PCI data. +# Resource names are used to identify the device in placement, and can be +# edited to match deployment-specific naming conventions +# The default list covers many common GPUs, but can be extended as needed. +stackhpc_gpu_data: + # Nvidia H100 SXM5 80GB + h100_80_sxm: + resource_name: "{{ h100_80_sxm_resource_name | default('h100_80_sxm')}}" + vendor_id: "10de" + product_id: "2330" + device_type: "type-PF" + # Nvidia A100 SXM5 80GB + a100_80_sxm: + resource_name: "{{ a100_80_sxm_resource_name | default('a100_80_sxm')}}" + vendor_id: "10de" + product_id: "20b2" + device_type: "type-PF" + # Nvidia A100 SXM5 40GB + a100_40_sxm: + resource_name: "{{ a100_40_sxm_resource_name | default('a100_40_sxm')}}" + vendor_id: "10de" + product_id: "20b0" + device_type: "type-PF" + # Nvidia A100 PCI 80GB + a100_80: + resource_name: "{{ a100_80_resource_name | default('a100_80')}}" + vendor_id: "10de" + product_id: "20b5" + device_type: "type-PF" + # Nvidia A100 PCI 40GB + a100_40: + resource_name: "{{ a100_40_resource_name | default('a100_40')}}" + vendor_id: "10de" + product_id: "20f1" + device_type: "type-PF" + # Nvidia V100 SXM3 32GB + v100_32_sxm3: + resource_name: "{{ v100_32_sxm3_resource_name | default('v100_32_sxm3')}}" + vendor_id: "10de" + product_id: "1db8" + device_type: "type-PCI" + # Nvidia V100 SXM2 32GB + v100_32_sxm2: + resource_name: "{{ v100_32_sxm2_resource_name | default('v100_32_sxm2')}}" + vendor_id: 
"10de" + product_id: "1db5" + device_type: "type-PCI" + # Nvidia V100 PCI 32GB + v100_32: + resource_name: "{{ v100_32_resource_name | default('v100_32')}}" + vendor_id: "10de" + product_id: "1db6" + device_type: "type-PCI" + # Nvidia RTX A6000 + a6000: + resource_name: "{{ a6000_resource_name | default('a6000')}}" + vendor_id: "10de" + product_id: "2230" + device_type: "type-PCI" + # Nvidia A40 + a40: + resource_name: "{{ a40_resource_name | default('a40')}}" + vendor_id: "10de" + product_id: "2235" + device_type: "type-PF" + # Nvidia T4 + t4: + resource_name: "{{ t4_resource_name | default('t4')}}" + vendor_id: "10de" + product_id: "1eb8" + device_type: "type-PF" + # Nvidia L40 + l40: + resource_name: "{{ l40_resource_name | default('l40')}}" + vendor_id: "10de" + product_id: "26b5" + device_type: "type-PF" + # Nvidia L40s + l40s: + resource_name: "{{ l40s_resource_name | default('l40s')}}" + vendor_id: "10de" + product_id: "26b9" + device_type: "type-PF" diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index 5eee4b19c..831486d10 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -45,9 +45,15 @@ stackhpc_os_capacity_openstack_cacert: "" stackhpc_os_capacity_openstack_verify: true # Redfish exporter +# How often to scrape the os capacity exporter in seconds. +stackhpc_os_capacity_scrape_interval: 900 + # Whether the redfish exporter is enabled. stackhpc_enable_redfish_exporter: false +# How often to scrape the BMCs in seconds. +stackhpc_redfish_exporter_scrape_interval: "{{ [8 * groups['redfish_exporter_targets'] | length, 300] | max }}" + # Credentials redfish_exporter_default_username: "{{ ipmi_username }}" redfish_exporter_default_password: "{{ ipmi_password }}" @@ -55,6 +61,9 @@ redfish_exporter_default_password: "{{ ipmi_password }}" # The address of the BMC that is used to query redfish metrics. 
redfish_exporter_target_address: "{{ ipmi_address }}" +# How often to scrape OpenStack Exporter in seconds. +stackhpc_prometheus_openstack_exporter_interval: 300 + ############################################################################### # Whether the RADOS gateway usage exporter is enabled. diff --git a/etc/kayobe/trivy/allowed-vulnerabilities.yml b/etc/kayobe/trivy/allowed-vulnerabilities.yml index adf2aad82..caf8c7021 100644 --- a/etc/kayobe/trivy/allowed-vulnerabilities.yml +++ b/etc/kayobe/trivy/allowed-vulnerabilities.yml @@ -36,6 +36,9 @@ prometheus_cadvisor_allowed_vulnerabilities: - CVE-2024-41110 - CVE-2024-45337 +skyline_apiserver_allowed_vulnerabilities: + - CVE-2024-33663 + ############################################################################### # Dummy variable to allow Ansible to accept this file. workaround_ansible_issue_8743: yes diff --git a/releasenotes/notes/add-confirmation-prompt-to-reboot.yml-4fd1ae8e8d360e57.yaml b/releasenotes/notes/add-confirmation-prompt-to-reboot.yml-4fd1ae8e8d360e57.yaml new file mode 100644 index 000000000..b696ac7c6 --- /dev/null +++ b/releasenotes/notes/add-confirmation-prompt-to-reboot.yml-4fd1ae8e8d360e57.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + A confirmation prompt has been added to ``reboot.yml`` to help avoid + rebooting the wrong hosts by mistake. This check can be skipped by setting + ``confirm_reboot: true``. diff --git a/releasenotes/notes/add-openbao-for-tls-698ae3834ed5c67f.yaml b/releasenotes/notes/add-openbao-for-tls-698ae3834ed5c67f.yaml new file mode 100644 index 000000000..c6c760b40 --- /dev/null +++ b/releasenotes/notes/add-openbao-for-tls-698ae3834ed5c67f.yaml @@ -0,0 +1,12 @@ +--- +features: + - | + Add support for deploying ``OpenBao`` across the ``seed`` and ``overcloud`` hosts + for the purpose of internal and backend TLS generation. +deprecations: + - | + Hashicorp Vault for TLS generation is deprecated in favour of OpenBao. 
+ The ``openbao`` role is now used to deploy OpenBao on the seed and overcloud hosts. + New deployments should use OpenBao for TLS generation. + Existing deployments using Hashicorp Vault for TLS generation should be migrated + to OpenBao once migration steps are available. diff --git a/releasenotes/notes/add-ubuntu-noble-support-caracal-1b9e64d4aa2e1ff7.yaml b/releasenotes/notes/add-ubuntu-noble-support-caracal-1b9e64d4aa2e1ff7.yaml new file mode 100644 index 000000000..12079d9af --- /dev/null +++ b/releasenotes/notes/add-ubuntu-noble-support-caracal-1b9e64d4aa2e1ff7.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + Added support for Ubuntu 24.04 Noble Numbat as a host operating system. + Repositories and configuration for Ubuntu Noble have been added. diff --git a/releasenotes/notes/bifrost-fix-ipa-auth-9d0e0c3b948b5850.yaml b/releasenotes/notes/bifrost-fix-ipa-auth-9d0e0c3b948b5850.yaml new file mode 100644 index 000000000..87bb66c90 --- /dev/null +++ b/releasenotes/notes/bifrost-fix-ipa-auth-9d0e0c3b948b5850.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixes an issue where the IPA images in Ark could not be downloaded by + Bifrost, due to missing authentication parameters. diff --git a/releasenotes/notes/bump-alertmanager-958f90fa2bc9b562.yaml b/releasenotes/notes/bump-alertmanager-958f90fa2bc9b562.yaml new file mode 100644 index 000000000..22b60e653 --- /dev/null +++ b/releasenotes/notes/bump-alertmanager-958f90fa2bc9b562.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + Prometheus Alertmanager has been updated to ``0.28.1``. This release + includes support for Microsoft Teams notifications. 
diff --git a/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml new file mode 100644 index 000000000..ddf1d58eb --- /dev/null +++ b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml @@ -0,0 +1,15 @@ +--- +features: + - Upgrades the redfish exporter container image to the v2.x series. + - Adds support for Lenovo hardware to the redfish exporter dashboard. + - | + Adds the ``stackhpc_redfish_exporter_scrape_interval``, + ``stackhpc_os_capacity_scrape_interval``, and + ``stackhpc_prometheus_openstack_exporter_interval`` + configuration variables. +fixes: + - Fixes various issues with the redfish exporter dashboard. +upgrade: + - | + Increases default ``os_capacity_scrape_interval`` to ``5m``. If you already customise + this please move to the new ``stackhpc_os_capacity_scrape_interval`` variable. diff --git a/releasenotes/notes/disable-ironic-prometheus-exporter-894d98022a1e926d.yaml b/releasenotes/notes/disable-ironic-prometheus-exporter-894d98022a1e926d.yaml new file mode 100644 index 000000000..7b94110ce --- /dev/null +++ b/releasenotes/notes/disable-ironic-prometheus-exporter-894d98022a1e926d.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + The Ironic Prometheus Exporter is now disabled by default as redfish exporter + is preferred. If you need to use the Ironic Prometheus Exporter, you can + enable it by setting the `kolla_enable_ironic_prometheus_exporter` + option to `true`. diff --git a/releasenotes/notes/enable-building-skyline-61a41c13cfcd54a1.yaml b/releasenotes/notes/enable-building-skyline-61a41c13cfcd54a1.yaml new file mode 100644 index 000000000..681c338e6 --- /dev/null +++ b/releasenotes/notes/enable-building-skyline-61a41c13cfcd54a1.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + Enable building of ``Skyline`` an alternative to ``Horizon``.
+ diff --git a/releasenotes/notes/fix-2024.1-kolla-image-build-f78a5524381fa4da.yaml b/releasenotes/notes/fix-2024.1-kolla-image-build-f78a5524381fa4da.yaml new file mode 100644 index 000000000..89f02fb87 --- /dev/null +++ b/releasenotes/notes/fix-2024.1-kolla-image-build-f78a5524381fa4da.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fix Kolla container image build workflow failing to find default + sources.list.ubuntu. + The default sources.list for ubuntu now has each for Ubuntu Jammy and + Noble. + This upstream change was brought by `Ubuntu 24.04 support for Caracal + `__. diff --git a/releasenotes/notes/fix-broken-links-dcd128c8e211b2b8.yaml b/releasenotes/notes/fix-broken-links-dcd128c8e211b2b8.yaml new file mode 100644 index 000000000..8d65779e0 --- /dev/null +++ b/releasenotes/notes/fix-broken-links-dcd128c8e211b2b8.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + Fix some broken links in the docs. diff --git a/releasenotes/notes/fix-ceph-rgw-blackbox-endpoint-7af02679b3fd093d.yaml b/releasenotes/notes/fix-ceph-rgw-blackbox-endpoint-7af02679b3fd093d.yaml new file mode 100644 index 000000000..fc8cce6b5 --- /dev/null +++ b/releasenotes/notes/fix-ceph-rgw-blackbox-endpoint-7af02679b3fd093d.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixes an issue where the ``ceph-rgw`` endpoints were added to the + Prometheus blackbox exporter when the Kolla loadbalancer was not in use for + RGWs. diff --git a/releasenotes/notes/fix-cephadm-facts-2ee6dc9a1c617944.yaml b/releasenotes/notes/fix-cephadm-facts-2ee6dc9a1c617944.yaml new file mode 100644 index 000000000..98f0e1779 --- /dev/null +++ b/releasenotes/notes/fix-cephadm-facts-2ee6dc9a1c617944.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + The Ceph version is now determined by ``os_release``, rather + than Ansible facts. Using Ansible facts caused playbooks to fail when + facts are not gathered. 
diff --git a/releasenotes/notes/fix-fluentd-opensearch-template-14bde9b6d5a723f8.yaml b/releasenotes/notes/fix-fluentd-opensearch-template-14bde9b6d5a723f8.yaml new file mode 100644 index 000000000..b5e98506a --- /dev/null +++ b/releasenotes/notes/fix-fluentd-opensearch-template-14bde9b6d5a723f8.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fix missing bracket in fluentd OpenSearch configuration + causing container crashes when caso is enabled. diff --git a/releasenotes/notes/fix-ssh-in-ubuntu-overcloud-host-image-b395c8afb6d5b820.yaml b/releasenotes/notes/fix-ssh-in-ubuntu-overcloud-host-image-b395c8afb6d5b820.yaml new file mode 100644 index 000000000..d11b74e4e --- /dev/null +++ b/releasenotes/notes/fix-ssh-in-ubuntu-overcloud-host-image-b395c8afb6d5b820.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Bumps the ``stackhpc_ubuntu_jammy_overcloud_host_image_version`` to fix an + issue where ``authorized_keys`` for the ``ubuntu`` user was not populated + on newly provisioned nodes. Ubuntu package snapshots are also bumped to + match those used in the new host image. diff --git a/releasenotes/notes/kolla-update-fixes-5ff55225ce85441f.yaml b/releasenotes/notes/kolla-update-fixes-5ff55225ce85441f.yaml new file mode 100644 index 000000000..1d0b34d4b --- /dev/null +++ b/releasenotes/notes/kolla-update-fixes-5ff55225ce85441f.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Allow Kolla dependency updates on non-default branches. diff --git a/releasenotes/notes/pci-passthrough-support-0c7e62585aaf2c23.yaml b/releasenotes/notes/pci-passthrough-support-0c7e62585aaf2c23.yaml new file mode 100644 index 000000000..eae2d774b --- /dev/null +++ b/releasenotes/notes/pci-passthrough-support-0c7e62585aaf2c23.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Added templates and a playbook to simplify configuration of PCI passthrough + GPUs. GPU types can be mapped to inventory groups with the + ``gpu_group_map`` variable, which will configure the host and Nova + automatically. 
A list of supported GPUs can be found in + ``etc/kayobe/stackhpc-compute.yml`` under ``stackhpc_gpu_data``. diff --git a/releasenotes/notes/pin-deps-dbe52c49e945daf5.yaml b/releasenotes/notes/pin-deps-dbe52c49e945daf5.yaml new file mode 100644 index 000000000..db4e5383f --- /dev/null +++ b/releasenotes/notes/pin-deps-dbe52c49e945daf5.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + All Ansible dependencies are now pinned to specific versions. The + ``stackhpc.vxlan`` role is pinned to 1.1.0, and ``ansible-role-docker`` + is pinned to stackhpc/7.0.1.1. diff --git a/releasenotes/notes/rabbitmq-queue-migration-script-b6d3abebbebf8087.yaml b/releasenotes/notes/rabbitmq-queue-migration-script-b6d3abebbebf8087.yaml new file mode 100644 index 000000000..a475dbc7a --- /dev/null +++ b/releasenotes/notes/rabbitmq-queue-migration-script-b6d3abebbebf8087.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + Added a new script, ``rabbitmq-queue-migration.sh``, which will migrate to + the new RabbitMQ durable queues. This is intended for use prior to an + upgrade to Epoxy. diff --git a/releasenotes/notes/redfish-dashboard-fix-inlet-temp-91a7018adb2e1763.yaml b/releasenotes/notes/redfish-dashboard-fix-inlet-temp-91a7018adb2e1763.yaml new file mode 100644 index 000000000..73f587172 --- /dev/null +++ b/releasenotes/notes/redfish-dashboard-fix-inlet-temp-91a7018adb2e1763.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixes "Max Inlet Temp" time series chart in Redfish dashboard. This chart + could wrongly display CPU2 temperature instead of inlet temperature. 
diff --git a/releasenotes/notes/rgw-usage-exporter-deployment-fixes-0196167326dbe456.yaml b/releasenotes/notes/rgw-usage-exporter-deployment-fixes-0196167326dbe456.yaml new file mode 100644 index 000000000..46a591480 --- /dev/null +++ b/releasenotes/notes/rgw-usage-exporter-deployment-fixes-0196167326dbe456.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixed RADOS gateway usage exporter deployment failing + to generate ec2 credentials for the ceph_rgw user. + - | + Fixed RADOS gateway usage exporter not using the system + trust root as its CA bundle. diff --git a/releasenotes/notes/use-include-tags-for-client-pulp-cf46d328b30162be.yaml b/releasenotes/notes/use-include-tags-for-client-pulp-cf46d328b30162be.yaml new file mode 100644 index 000000000..7103ef604 --- /dev/null +++ b/releasenotes/notes/use-include-tags-for-client-pulp-cf46d328b30162be.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + Restrict the content that is synced to the client by using include tags. + This feature ensures that the tags as defined within + ``kolla-image-tags.yml`` are synced. diff --git a/tools/rabbitmq-queue-migration.sh b/tools/rabbitmq-queue-migration.sh new file mode 100755 index 000000000..089b197eb --- /dev/null +++ b/tools/rabbitmq-queue-migration.sh @@ -0,0 +1,72 @@ +#! /usr/bin/bash + +set -ex + +RED='\033[0;31m' +GREEN='\033[0;32m' + +RABBITMQ_SERVICES_TO_RESTART=barbican,blazar,cinder,cloudkitty,designate,heat,ironic,keystone,magnum,manila,neutron,nova,octavia +RABBITMQ_CONTAINER_NAME=rabbitmq + +if [[ ! $KAYOBE_CONFIG_PATH ]]; then + echo "${RED}Environment variable \$KAYOBE_CONFIG_PATH is not defined" + echo "${RED}Ensure your environment is set up to run kayobe commands" + exit 2 +fi + +if [[ ! "$1" = "--skip-checks" ]]; then + # Fail if clocks are not synced + if ! 
( kayobe overcloud host command run -l controllers -b --command "timedatectl status | grep 'synchronized: yes'" ); then + echo "${RED}Failed precheck: Time not synced on controllers" + echo "${RED}Use 'timedatectl status' to check sync state" + echo "${RED}Either wait for sync or use 'chronyc makestep'" + exit 1 + fi + kayobe overcloud service configuration generate --node-config-dir /tmp/rabbit-migration --kolla-tags none + # Fail if any new feature flags are not set + if ! ( grep 'om_enable_queue_manager: true' $KOLLA_CONFIG_PATH/globals.yml && \ + grep 'om_enable_rabbitmq_quorum_queues: true' $KOLLA_CONFIG_PATH/globals.yml && \ + grep 'om_enable_rabbitmq_transient_quorum_queue: true' $KOLLA_CONFIG_PATH/globals.yml && \ + grep 'om_enable_rabbitmq_stream_fanout: true' $KOLLA_CONFIG_PATH/globals.yml ); then + echo "${RED}Failed precheck: The following must be enabled: om_enable_queue_manager, om_enable_rabbitmq_quorum_queues, om_enable_rabbitmq_transient_quorum_queue, om_enable_rabbitmq_stream_fanout" + exit 1 + fi +fi + +# Generate new config, stop services using rabbit, and reset rabbit state +kayobe overcloud service configuration generate --node-config-dir /etc/kolla --kolla-skip-tags rabbitmq-ha-precheck +kayobe kolla ansible run "stop --yes-i-really-really-mean-it" -kt $RABBITMQ_SERVICES_TO_RESTART +kayobe kolla ansible run rabbitmq-reset-state + +if [[ ! 
"$1" = "--skip-checks" ]]; then + # Fail if any queues still exist + sleep 20 + # Note(mattcrees): We turn the text grey here so the failed Ansible calls don't freak anyone out + CURRENTTERM=${TERM} + export TERM=xterm-mono + if ( kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_queues name --silent | grep -v '^$'" ); then + export TERM=${CURRENTTERM} + echo -e "${RED}Failed check: RabbitMQ has not stopped properly, queues still exist" + exit 1 + fi + # Fail if any exchanges still exist (excluding those starting with 'amq.') + if ( kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_exchanges name --silent | grep -v '^$' | grep -v '^amq.'" ); then + export TERM=${CURRENTTERM} + echo -e "${RED}Failed check: RabbitMQ has not stopped properly, exchanges still exist" + exit 1 + fi + export TERM=${CURRENTTERM} +fi + +# Redeploy with all durable-type queues enabled +kayobe kolla ansible run deploy-containers -kt $RABBITMQ_SERVICES_TO_RESTART + +if [[ ! "$1" = "--skip-checks" ]]; then + sleep 60 + # Assert that all queues are durable + if ! 
( kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_queues durable --silent | grep false" > /dev/null 2>&1 ); then + echo -e "${GREEN}Queues migrated successfully" + else + echo -e "${RED}Failed post-check: A controller has non-durable queues" + fi +fi diff --git a/tools/ubuntu-upgrade-overcloud.sh b/tools/ubuntu-upgrade-overcloud.sh index 14271c3d1..05d50068b 100755 --- a/tools/ubuntu-upgrade-overcloud.sh +++ b/tools/ubuntu-upgrade-overcloud.sh @@ -33,4 +33,4 @@ kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ubuntu-upgrade.yml -e os_release kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ovn-fix-chassis-priorities.yml -kayobe overcloud host configure --limit $1 --kolla-limit $1 -e os_release=noble +kayobe overcloud host configure --limit $1 -e os_release=noble diff --git a/tools/ubuntu-upgrade-seed-hypervisor.sh b/tools/ubuntu-upgrade-seed-hypervisor.sh index d34e58b26..259d0da5d 100755 --- a/tools/ubuntu-upgrade-seed-hypervisor.sh +++ b/tools/ubuntu-upgrade-seed-hypervisor.sh @@ -26,4 +26,4 @@ set -x kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ubuntu-upgrade.yml -e os_release=noble --limit seed-hypervisor -kayobe seed hypervisor host configure +kayobe seed hypervisor host configure -e os_release=noble diff --git a/tools/ubuntu-upgrade-seed.sh b/tools/ubuntu-upgrade-seed.sh index d4191da46..96553225b 100755 --- a/tools/ubuntu-upgrade-seed.sh +++ b/tools/ubuntu-upgrade-seed.sh @@ -26,4 +26,4 @@ set -x kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ubuntu-upgrade.yml -e os_release=noble --limit seed -kayobe seed host configure +kayobe seed host configure -e os_release=noble