From a71334973cd5fe15fc7a63b8206b81c4da0f332c Mon Sep 17 00:00:00 2001 From: pullan1 Date: Fri, 24 Apr 2026 13:44:48 +0530 Subject: [PATCH 01/17] Added inputs and artifacts for k8s 1.35.1 and version-specific service_k8s.json files for upgrade flow Signed-off-by: Super User --- .../input_validation/common_utils/config.py | 2 +- .../validation_flows/common_validation.py | 12 +- .../validation_flows/local_repo_validation.py | 21 +++- .../module_utils/local_repo/software_utils.py | 14 ++- .../modules/image_package_collector.py | 26 ++++- common/library/modules/prepare_tasklist.py | 9 +- examples/rhel_software_config.json | 2 +- ..._rhel_10.0_multi_arch_software_config.json | 2 +- ...late_rhel_10.0_x86-64_software_config.json | 2 +- .../x86_64/rhel/10.0/service_k8s_v1.34.1.json | 108 ++++++++++++++++++ .../x86_64/rhel/10.0/service_k8s_v1.35.1.json | 107 +++++++++++++++++ input/local_repo_config.yml | 15 ++- input/software_config.json | 2 +- .../roles/parse_and_download/tasks/main.yml | 1 + .../tasks/validate_software_config_json.yml | 2 +- local_repo/roles/validation/vars/main.yml | 13 ++- .../tasks/check_k8s_support.yml | 4 +- .../prepare_oim_validation/vars/main.yml | 13 ++- .../tasks/create_k8s_config_nfs.yml | 2 +- provision/roles/k8s_config/vars/main.yml | 3 +- .../telemetry/tasks/load_service_images.yml | 2 +- provision/roles/telemetry/tasks/main.yml | 2 +- .../telemetry/tasks/read_software_config.yml | 4 +- provision/roles/telemetry/vars/main.yml | 9 +- 24 files changed, 333 insertions(+), 44 deletions(-) create mode 100644 input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json create mode 100644 input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 7f26f692e4..22b7942ecf 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ 
b/common/library/module_utils/input_validation/common_utils/config.py @@ -129,7 +129,7 @@ "openmpi": "5.0.8", "csi_driver_powerscale": "v2.15.0", "rocm": "6.3.1", - "service_k8s": "1.34.1" + "service_k8s": "1.35.1" } # All of the passwords fields diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index cf4e74d4f9..87652709ff 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -252,15 +252,23 @@ def validate_software_config( for software_pkg in data['softwares']: software = software_pkg['name'] arch_list = software_pkg.get('arch') + # Get software version for versioned JSON files (e.g., service_k8s_v1.35.1.json) + software_version = software_pkg.get('version') for arch in arch_list: json_path = get_json_file_path( - software, cluster_os_type, cluster_os_version, input_file_path, arch) + software, cluster_os_type, cluster_os_version, input_file_path, arch, + software_version=software_version) # Check if json_path is None or if the JSON syntax is invalid if not json_path: + # Construct expected filename for error message + if software == "service_k8s" and software_version: + expected_file = f"{software}_v{software_version}.json" + else: + expected_file = f"{software}.json" errors.append( create_error_msg( "Validation Error: ", software, - f"is present in software_config.json. JSON file not found: {software}.json" + f"is present in software_config.json. 
JSON file not found: {expected_file}" ) ) else: diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index 8254faca1e..e81ecd8be7 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -21,7 +21,7 @@ from ansible.module_utils.input_validation.common_utils import validation_utils from ansible.module_utils.input_validation.common_utils import config from ansible.module_utils.input_validation.common_utils import en_us_validation_msg -from ansible.module_utils.local_repo.software_utils import load_yaml, load_json +from ansible.module_utils.local_repo.software_utils import load_yaml, load_json, get_json_file_path file_names = config.files create_error_msg = validation_utils.create_error_msg @@ -239,13 +239,22 @@ def validate_local_repo_config(input_file_path, data, for software in software_config_json["softwares"]: sw = software["name"] arch_list = software.get("arch") + # Get software version for versioned JSON files (e.g., service_k8s_v1.35.1.json) + software_version = software.get("version") for arch in arch_list: - json_path = create_file_path( - input_file_path, - f"config/{arch}{os_ver_path}" + sw +".json") - if not os.path.exists(json_path): + # Use get_json_file_path for proper versioned JSON file resolution + json_path = get_json_file_path( + sw, cluster_os_type, cluster_os_version, + software_config_file_path, arch, + software_version=software_version) + if not json_path or not os.path.exists(json_path): + # Construct expected filename for error message + if sw == "service_k8s" and software_version: + expected_file = f"{sw}_v{software_version}.json" + else: + expected_file = f"{sw}.json" errors.append( - create_error_msg(sw + '/' + arch, f"{sw} JSON file not found for architecture 
{arch}.", json_path)) + create_error_msg(sw + '/' + arch, f"{sw} JSON file not found for architecture {arch}.", expected_file)) else: curr_json = load_json(json_path) pkg_list = curr_json[sw]['cluster'] diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index bc5da2876a..d3306d58b8 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -118,7 +118,7 @@ def load_yaml(file_path): return yaml.safe_load(file) def get_json_file_path(software_name, cluster_os_type, - cluster_os_version, user_json_path, arch): + cluster_os_version, user_json_path, arch, software_version=None): """ Generate the file path for a JSON file based on the provided software name, cluster OS type, cluster OS version, and user JSON path. @@ -129,13 +129,23 @@ def get_json_file_path(software_name, cluster_os_type, cluster_os_version (str): The version of the cluster operating system. user_json_path (str): The path to the user JSON file. arch: Architecture for a particular software + software_version (str, optional): Version of the software for versioned JSON files. + Used for software like service_k8s that have versioned JSON files + (e.g., service_k8s_v1.35.1.json). Returns: str or None: The file path for the JSON file if it exists, otherwise None. 
""" base_path = os.path.dirname(os.path.abspath(user_json_path)) + + # Handle versioned JSON files (e.g., service_k8s_v1.35.1.json) + if software_name == "service_k8s" and software_version: + json_filename = f"{software_name}_v{software_version}.json" + else: + json_filename = f"{software_name}.json" + json_path = os.path.join(base_path, - f'{SOFTWARE_CONFIG_SUBDIR}/{arch}/{cluster_os_type}/{cluster_os_version}/{software_name}.json' + f'{SOFTWARE_CONFIG_SUBDIR}/{arch}/{cluster_os_type}/{cluster_os_version}/{json_filename}' ) return json_path diff --git a/common/library/modules/image_package_collector.py b/common/library/modules/image_package_collector.py index 77ff67b50d..90a315d750 100644 --- a/common/library/modules/image_package_collector.py +++ b/common/library/modules/image_package_collector.py @@ -151,7 +151,12 @@ def process_functional_group(fg_name, arch, os_version, input_project_dir, packages = [] for json_file in json_files: + # Extract software name from json file + # Handle versioned files like service_k8s_v1.35.1.json -> service_k8s sw_name = json_file.replace(".json", "") + # Remove version suffix for versioned files (e.g., service_k8s_v1.35.1 -> service_k8s) + if sw_name.startswith("service_k8s_v"): + sw_name = "service_k8s" if sw_name not in allowed_softwares: continue @@ -170,7 +175,8 @@ def process_functional_group(fg_name, arch, os_version, input_project_dir, sw_data, fg_name=fg_name, slurm_defined=True ) ) - elif json_file == "service_k8s.json": + elif json_file.startswith("service_k8s_v"): + # Handle versioned service_k8s_v.json files packages.extend( collect_packages_from_json( sw_data, fg_name=fg_name, service_k8s_defined=True @@ -194,6 +200,7 @@ def run_module(): software_config_file=dict(type="str", required=True), input_project_dir=dict(type="str", required=True), additional_json_path=dict(type="str", required=False, default=""), + service_k8s_version=dict(type="str", required=False, default=""), ) result = dict( @@ -212,6 +219,7 @@ def 
run_module(): software_config_file = module.params["software_config_file"] input_project_dir = module.params["input_project_dir"] additional_json_path = module.params["additional_json_path"] + service_k8s_version = module.params["service_k8s_version"] software_config = load_json_file(software_config_file, module) if not software_config: @@ -221,6 +229,13 @@ def run_module(): if not os_version: module.fail_json(msg="cluster_os_version not found in software_config.json") + # Extract service_k8s version from software_config if not provided + if not service_k8s_version: + for sw in software_config.get("softwares", []): + if sw.get("name") == "service_k8s" and sw.get("version"): + service_k8s_version = sw["version"] + break + allowed_softwares = { sw["name"] for sw in software_config.get("softwares", []) } @@ -229,14 +244,17 @@ def run_module(): additional_enabled = is_additional_packages_enabled(software_config) allowed_additional_subgroups = get_allowed_additional_subgroups(software_config) if additional_enabled else [] + # Versioned JSON file for service_k8s: service_k8s_v<version>.json + service_k8s_json = f"service_k8s_v{service_k8s_version}.json" if service_k8s_version else "service_k8s.json" + # pylint: disable=line-too-long # Functional group → json files mapping software_map = { "os_x86_64": ["default_packages.json", "ldms.json"], "os_aarch64": ["default_packages.json", "ldms.json"], - "service_kube_node_x86_64": ["service_k8s.json"], - "service_kube_control_plane_first_x86_64": ["service_k8s.json"], - "service_kube_control_plane_x86_64": ["service_k8s.json"], + "service_kube_node_x86_64": [service_k8s_json], + "service_kube_control_plane_first_x86_64": [service_k8s_json], + "service_kube_control_plane_x86_64": [service_k8s_json], "slurm_control_node_x86_64": ["slurm_custom.json", "openldap.json", "ldms.json"], "slurm_node_x86_64": ["slurm_custom.json", "openldap.json", "ldms.json"], "login_node_x86_64": ["slurm_custom.json", "openldap.json", "ldms.json"], diff --git
a/common/library/modules/prepare_tasklist.py b/common/library/modules/prepare_tasklist.py index 9714c7aaf0..688774cdd7 100644 --- a/common/library/modules/prepare_tasklist.py +++ b/common/library/modules/prepare_tasklist.py @@ -123,8 +123,15 @@ def main(): logger.info("Preparing package lists...") for software in software_list[arch]: logger.info(f"Processing software: {software}") + # Get software version for versioned JSON files (e.g., service_k8s_v1.35.1.json) + software_version = None + for sw in user_data.get("softwares", []): + if sw.get("name") == software and sw.get("version"): + software_version = sw["version"] + break json_path[arch] = get_json_file_path(software, cluster_os_type, - cluster_os_version, user_json_file, arch) + cluster_os_version, user_json_file, arch, + software_version=software_version) status_csv_path[arch] = get_csv_file_path(software, log_dir, arch) logger.info(f"json_path: {json_path}") logger.info(f"status_csv_path: {status_csv_path}") diff --git a/examples/rhel_software_config.json b/examples/rhel_software_config.json index 394ef53120..b9f60b3f3d 100644 --- a/examples/rhel_software_config.json +++ b/examples/rhel_software_config.json @@ -6,7 +6,7 @@ {"name": "default_packages", "arch": ["x86_64","aarch64"]}, {"name": "admin_debug_packages", "arch": ["x86_64","aarch64"]}, {"name": "openldap", "arch": ["x86_64","aarch64"]}, - {"name": "service_k8s","version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s","version": "1.35.1", "arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, {"name": "ucx", "version": "1.19.0", "arch": ["x86_64","aarch64"]}, {"name": "openmpi", "version": "5.0.8", "arch": ["x86_64","aarch64"]}, diff --git a/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json b/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json index 83eaa12a8c..69bc80c84f 100644 --- 
a/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json +++ b/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json @@ -7,7 +7,7 @@ {"name": "admin_debug_packages", "arch": ["x86_64","aarch64"]}, {"name": "openldap", "arch": ["x86_64","aarch64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, - {"name": "service_k8s", "version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s", "version": "1.35.1", "arch": ["x86_64"]}, {"name": "ucx", "version": "1.19.0", "arch": ["x86_64","aarch64"]}, {"name": "openmpi", "version": "5.0.8", "arch": ["x86_64","aarch64"]}, {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, diff --git a/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json b/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json index 907958e590..650e912b78 100644 --- a/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json +++ b/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json @@ -7,7 +7,7 @@ {"name": "admin_debug_packages", "arch": ["x86_64"]}, {"name": "openldap", "arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64"]}, - {"name": "service_k8s", "version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s", "version": "1.35.1", "arch": ["x86_64"]}, {"name": "ucx", "version": "1.19.0", "arch": ["x86_64"]}, {"name": "openmpi", "version": "5.0.8", "arch": ["x86_64"]}, {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json new file mode 100644 index 0000000000..6deed2309b --- /dev/null +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json @@ -0,0 +1,108 @@ +{ + "service_k8s": { + "cluster": [ + { "package": "docker.io/library/busybox", "type": "image", "tag": "1.36" }, + { "package": 
"firewalld", "type": "rpm", "repo_name": "baseos" }, + { "package": "python3-firewall", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "vim-enhanced", "type": "rpm", "repo_name": "appstream"}, + { "package": "fuse-overlayfs", "type": "rpm", "repo_name": "appstream"}, + { "package": "podman", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubeadm-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "kubelet-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "container-selinux", "type": "rpm", "repo_name": "appstream"}, + { "package": "cri-o-1.34.1", "type": "rpm", "repo_name": "cri-o"}, + { "package": "docker.io/victoriametrics/victoria-metrics", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmagent", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vminsert", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vmselect", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, + { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, + { "package": "docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, + { "package": "docker.io/library/mysql", "type": "image", "tag": "9.3.0" }, + { "package": "docker.io/dellhpcomniaaisolution/idrac_telemetry_receiver", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/kafkapump", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.2" }, + { "package": "cryptography==45.0.7", "type": "pip_module" }, + { "package": "omsdk==1.2.518", "type": "pip_module" }, + { "package": "cffi==1.17.1", "type": "pip_module" }, + { "package": 
"quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, + { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, + { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, + { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + ] + }, + "service_kube_control_plane": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.12.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.4-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.30.3", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, + { "package": "kubectl-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", 
"repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==33.1.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + "service_kube_control_plane_first": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.12.1", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.4-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.30.3", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, + { + "package": "calico-v3.30.3", + "type": "manifest", + "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.30.3/manifests/calico.yaml" + }, + { + "package": "metallb-native-v0.15.2", + "type": "manifest", + "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.2/config/manifests/metallb-native.yaml" + }, + { "package": "helm-v3.19.0-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.19.0-linux-amd64.tar.gz" }, + { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": 
"https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, + { "package": "kubectl-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==33.1.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + + "service_kube_node": { + "cluster": [ + { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, + { "package": "quay.io/metallb/controller", "tag": "v0.15.2", "type": "image" } + ] + } +} + diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json new file mode 100644 index 0000000000..2950a4fd79 --- /dev/null +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json @@ -0,0 +1,107 @@ +{ + "service_k8s": { + "cluster": [ + { "package": "docker.io/library/busybox", "type": "image", "tag": "1.36" }, + { "package": "firewalld", "type": "rpm", "repo_name": "baseos" }, + { "package": "python3-firewall", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "vim-enhanced", "type": "rpm", "repo_name": "appstream"}, + { "package": "fuse-overlayfs", "type": "rpm", "repo_name": "appstream"}, + { "package": "podman", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubeadm-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "kubelet-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "container-selinux", "type": "rpm", "repo_name": "appstream"}, + { "package": "cri-o-1.35.1", "type": "rpm", 
"repo_name": "cri-o-v1-35"}, + { "package": "docker.io/victoriametrics/victoria-metrics", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmagent", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vminsert", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vmselect", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, + { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, + { "package": "docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, + { "package": "docker.io/library/mysql", "type": "image", "tag": "9.3.0" }, + { "package": "docker.io/dellhpcomniaaisolution/idrac_telemetry_receiver", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/kafkapump", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.2" }, + { "package": "cryptography==45.0.7", "type": "pip_module" }, + { "package": "omsdk==1.2.518", "type": "pip_module" }, + { "package": "cffi==1.17.1", "type": "pip_module" }, + { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, + { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, + { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, + { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" 
} + ] + }, + "service_kube_control_plane": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.13.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.6-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, + { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==35.0.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + "service_kube_control_plane_first": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": 
"v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.13.1", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.6-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, + { + "package": "calico-v3.31.4", + "type": "manifest", + "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.31.4/manifests/calico.yaml" + }, + { + "package": "metallb-native-v0.15.3", + "type": "manifest", + "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.3/config/manifests/metallb-native.yaml" + }, + { "package": "helm-v3.20.1-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.20.1-linux-amd64.tar.gz" }, + { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, + { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==35.0.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + + "service_kube_node": { + "cluster": [ + { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": 
"v0.15.3", "type": "image" }, + { "package": "quay.io/metallb/controller", "tag": "v0.15.3", "type": "image" } + ] + } +} diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 7eca8c9346..73aaa09b51 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -50,7 +50,7 @@ # Notes: # - Do not use Jinja variables in this configuration. # - Omit SSL fields entirely if SSL is not in use. -# - Its a mandatory field in case of slurm_custom with name as 'slurm_custom' +# - Version-specific naming (e.g., 'kubernetes-v1-35', 'cri-o-v1-35') is used only for service_k8s # # 3. user_repo_url_aarch64 #--------------------------- @@ -180,11 +180,20 @@ rhel_os_url_aarch64: rhel_subscription_repo_config_x86_64: rhel_subscription_repo_config_aarch64: # Making incorrect changes to this variable can cause omnia failure. Please edit cautiously. +# ============================================================================ +# VERSIONED REPOSITORY NAMING CONVENTION (Omnia 2.2+) +# ============================================================================ +# Starting from Omnia 2.2, repositories use versioned naming: +# - kubernetes-v- (e.g., kubernetes-v1-35) +# - cri-o-v- (e.g., cri-o-v1-35) +# Version-specific naming is used only for service_k8s components (kubernetes, cri-o) +# Other components (doca, cuda, slurm_custom) use non-versioned naming +# ============================================================================ omnia_repo_url_rhel_x86_64: - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"} - - { url: 
"https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "cri-o"} + - { url: "https://pkgs.k8s.io/core:/stable:/v1.35/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.35/rpm/repodata/repomd.xml.key", name: "kubernetes-v1-35"} + - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.35/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.35/rpm/repodata/repomd.xml.key", name: "cri-o-v1-35"} - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/repodata/repomd.xml.key", name: "doca"} - { url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/", gpgkey: "https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/repodata/repomd.xml.key", name: "cuda"} omnia_repo_url_rhel_aarch64: diff --git a/input/software_config.json b/input/software_config.json index 8fa558bf28..70e3d679ee 100644 --- a/input/software_config.json +++ b/input/software_config.json @@ -6,7 +6,7 @@ {"name": "default_packages", "arch": ["x86_64","aarch64"]}, {"name": "admin_debug_packages", "arch": ["x86_64","aarch64"]}, {"name": "openldap", "arch": ["x86_64","aarch64"]}, - {"name": "service_k8s","version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s","version": "1.35.1", "arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, {"name": "ldms", "arch": ["x86_64","aarch64"]}, diff --git a/local_repo/roles/parse_and_download/tasks/main.yml b/local_repo/roles/parse_and_download/tasks/main.yml index 3b203aef79..4e3d3be21b 100644 --- a/local_repo/roles/parse_and_download/tasks/main.yml +++ b/local_repo/roles/parse_and_download/tasks/main.yml @@ -18,6 +18,7 @@ project_input_path: "{{ 
hostvars['localhost']['input_project_dir'] }}" update_metadata: false show_softwares_status: false + sub_final_repo_urls: {} - name: Include oim metadata vars ansible.builtin.include_vars: "{{ omnia_metadata_file }}" diff --git a/local_repo/roles/validation/tasks/validate_software_config_json.yml b/local_repo/roles/validation/tasks/validate_software_config_json.yml index 0a221ebfee..190904eb95 100644 --- a/local_repo/roles/validation/tasks/validate_software_config_json.yml +++ b/local_repo/roles/validation/tasks/validate_software_config_json.yml @@ -91,4 +91,4 @@ msg: "{{ fail_msg }}" when: - service_k8s_support - - service_k8s_version != default_k8s_version + - service_k8s_version not in supported_k8s_versions diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml index 88cceea868..8720bec1de 100644 --- a/local_repo/roles/validation/vars/main.yml +++ b/local_repo/roles/validation/vars/main.yml @@ -106,11 +106,14 @@ specific_softwares: - 'intelgaudi' - 'openmpi' - 'bcm_roce_libraries' -default_k8s_version: "1.34.1" +default_k8s_version: "1.35.1" +supported_k8s_versions: + - "1.34.1" + - "1.35.1" fail_msg: >- service_k8s is not supported for version: {{ service_k8s_version }}. - Please update the service_k8s version in software_config.json to {{ default_k8s_version }} - and rerun the playbook. + Please update the service_k8s version in software_config.json to a supported version + ({{ supported_k8s_versions | join(', ') }}) and rerun the playbook. versions_fail_msg: "Versions were not defined for the following softwares: {{ failed_softwares | join(', ') }} in software_config.json. 
Refer examples/template_{{ cluster_os_type }}_software_config.json and define version details accordingly in {{ project_input_path }}/software_config.json" @@ -175,6 +178,10 @@ http_key: http # Usage: validate_metadata.yml meta_dest: "{{ nfs_shared_path }}/offline_repo/.data" metadata_file_path: "{{ meta_dest }}/localrepo_metadata.yml" +metadata_identical_msg: "Metadata is identical. No changes detected." +metadata_warn_msg: | + WARNING: Metadata has changed since last run. + This may indicate changes in software_config.json or local_repo_config.yml. build_stream_auto_accept_metadata_msg: "Build stream is enabled, automatically accepting metadata changes." # Usage: remove_k8s_line.yml diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml b/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml index 9bbda37138..3bf8ec5fc5 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml @@ -31,10 +31,10 @@ k8s_versions: "{{ software_config.softwares | selectattr('name', 'in', ['compute_k8s', 'service_k8s']) | map(attribute='version') | list | unique }}" # noqa: yaml[line-length] k8s_arch: "{{ (software_config.softwares | selectattr('name', 'in', ['compute_k8s', 'service_k8s']) | first).get('arch', default_archs) }}" - - name: Set k8s_support_check to false if any k8s version is not default_k8s_version + - name: Set k8s_support_check to false if any k8s version is not in supported_k8s_versions ansible.builtin.set_fact: k8s_support_check: false - when: (k8s_versions | select('ne', default_k8s_version) | list | length) > 0 + when: (k8s_versions | select('notin', supported_k8s_versions) | list | length) > 0 - name: Fail if unsupported service_k8s version is detected ansible.builtin.fail: diff --git a/prepare_oim/roles/prepare_oim_validation/vars/main.yml b/prepare_oim/roles/prepare_oim_validation/vars/main.yml index 79bd5f5b4d..d9365e9a66 
100644 --- a/prepare_oim/roles/prepare_oim_validation/vars/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/vars/main.yml @@ -32,13 +32,16 @@ software_config_syntax_fail_msg: "Failed. Syntax errors present in software_conf file_permission: "0755" # Usage: check_k8s_support.yml -fail_msg_k8s_version: "Failed. Kubernetes Version is unsupported or incorrect in software_config.json. Update software_config.json with a supported Kubernetes versions and re-run the playbook.Supported versions are - {{ supported_k8s_version }}" # noqa: yaml[line-length] -invalid_k8s_versions: "{{ k8s_versions | select('ne', default_k8s_version) | list }}" +fail_msg_k8s_version: "Failed. Kubernetes Version is unsupported or incorrect in software_config.json. Update software_config.json with a supported Kubernetes versions and re-run the playbook.Supported versions are - {{ supported_k8s_versions }}" # noqa: yaml[line-length] +invalid_k8s_versions: "{{ k8s_versions | select('notin', supported_k8s_versions) | list }}" fail_msg: >- service_k8s is not supported for version: {{ invalid_k8s_versions }}. - Please update the service_k8s version in software_config.json to {{ default_k8s_version }} - and rerun the playbook. -default_k8s_version: "1.34.1" + Please update the service_k8s version in software_config.json to a supported version + ({{ supported_k8s_versions | join(', ') }}) and rerun the playbook. 
+default_k8s_version: "1.35.1" +supported_k8s_versions: + - "1.34.1" + - "1.35.1" # Usage: validate_network_spec.yml network_spec: "{{ input_project_dir }}/network_spec.yml" diff --git a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml index 40e9328cdd..71e995020c 100644 --- a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -196,7 +196,7 @@ - name: Include local repo access variable file ansible.builtin.include_vars: "{{ local_repo_access_config_file }}" -- name: Load service_k8s.json +- name: Load service_k8s_.json ansible.builtin.set_fact: k8s_packages_json: "{{ lookup('file', k8s_packages_file) | from_json }}" diff --git a/provision/roles/k8s_config/vars/main.yml b/provision/roles/k8s_config/vars/main.yml index f3f06a75de..6ca89acf40 100644 --- a/provision/roles/k8s_config/vars/main.yml +++ b/provision/roles/k8s_config/vars/main.yml @@ -15,7 +15,8 @@ local_repo_access_config_file: "/opt/omnia/provision/local_repo_access.yml" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" -k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s.json" # noqa: yaml[line-length] +# Versioned JSON file: service_k8s_v.json (e.g., service_k8s_v1.35.1.json) +k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s_v{{ hostvars['localhost']['service_k8s_version'] }}.json" # noqa: yaml[line-length] calico_manifest_yaml_url: "{{ offline_manifest_path }}/{{ calico_package }}/{{ calico_package }}.yml" metallb_manifest_yaml_url: "{{ offline_manifest_path }}/{{ metallb_package }}/{{ metallb_package }}.yml" multus_manifest_yaml_url: "{{ offline_manifest_path }}/{{ multus_package }}/{{ multus_package }}.yml" diff --git 
a/provision/roles/telemetry/tasks/load_service_images.yml b/provision/roles/telemetry/tasks/load_service_images.yml index 893b830fb2..654c73c9a7 100644 --- a/provision/roles/telemetry/tasks/load_service_images.yml +++ b/provision/roles/telemetry/tasks/load_service_images.yml @@ -13,7 +13,7 @@ # limitations under the License. --- -- name: Extract image packages from service_k8s.json +- name: Extract image packages from service_k8s_.json ansible.builtin.set_fact: service_k8s_image_list: "{{ telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'image') | list }}" diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index 2e9c3ac0da..e965181024 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -21,7 +21,7 @@ - name: Read telemetry packages from software config ansible.builtin.include_tasks: read_software_config.yml -- name: Load service images from service_k8s.json +- name: Load service images from service_k8s_.json ansible.builtin.include_tasks: load_service_images.yml - name: Configure of k8s telemetry service diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml index 005f9e65a2..3bc1a52637 100644 --- a/provision/roles/telemetry/tasks/read_software_config.yml +++ b/provision/roles/telemetry/tasks/read_software_config.yml @@ -35,11 +35,11 @@ ansible.builtin.set_fact: cluster_os_version: "{{ software_config['cluster_os_version'] }}" -- name: Load service_k8s.json +- name: Load service_k8s_.json ansible.builtin.set_fact: telemetry_packages: "{{ lookup('file', k8s_packages_file) | from_json }}" -- name: Extract service_k8s.json and set facts for pip_modules and python_version +- name: Extract service_k8s_.json and set facts for pip_modules and python_version ansible.builtin.set_fact: k8s_pip_packages: >- {{ telemetry_packages['service_kube_control_plane']['cluster'] diff --git 
a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index b393423728..5d3748c69a 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -20,7 +20,8 @@ service_cluster_metadata_path: "/opt/omnia/.data/service_cluster_metadata.yml" metadata_perm: "0644" # Usage: read_software_config.yml -k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s.json" +# Versioned JSON file: service_k8s_v.json (e.g., service_k8s_v1.35.1.json) +k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s_v{{ hostvars['localhost']['service_k8s_version'] }}.json" # Usage: secrets_creation.yml mysqldb_secrets_name: mysqldb-credentials @@ -68,7 +69,7 @@ kafka: service_name: "kafka-headless" lb_service_name: "kafka-loadbalancer" container_port1: 9093 - # Kafka images from service_k8s.json + # Kafka images from service_k8s_.json operator_image: "{{ telemetry_images['strimzi/operator'] | default('quay.io/strimzi/operator:0.48.0') }}" kafka_image: "{{ telemetry_images['strimzi/kafka'] | default('quay.io/strimzi/kafka:0.48.0-kafka-4.1.0') }}" bridge_image: "{{ telemetry_images['strimzi/kafka-bridge'] | default('quay.io/strimzi/kafka-bridge:0.33.1') }}" @@ -85,8 +86,8 @@ kafka: name: "ldms" consumer_group: "ldms-consumer-group" -# Dynamic image configuration from service_k8s.json -# Images and versions are read dynamically from input/config/x86_64/rhel/10.0/service_k8s.json +# Dynamic image configuration from service_k8s_.json +# Images and versions are read dynamically from input/config/x86_64/rhel/10.0/service_k8s_.json telemetry_images: "{{ service_k8s_images | default({}) }}" # Usage: victoriametric_deployment.yml From 5912e03597e4a6a1ffe18129c0b3a76774c1720b Mon Sep 17 00:00:00 2001 From: pullan1 Date: Fri, 24 Apr 2026 17:42:39 +0530 
Subject: [PATCH 02/17] updated service_k8s json Signed-off-by: pullan1 --- .../x86_64/rhel/10.0/service_k8s_v1.35.1.json | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json index 2950a4fd79..f5db677ab3 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json @@ -17,6 +17,8 @@ { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, { "package": "docker.io/victoriametrics/vminsert", "type": "image", "tag": "v1.128.0-cluster" }, { "package": "docker.io/victoriametrics/vmselect", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/victoria-logs", "type": "image", "tag": "v1.49.0" }, + { "package": "docker.io/victoriametrics/vlagent", "type": "image", "tag": "v1.49.0" }, { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, { "package": "docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, @@ -29,9 +31,18 @@ { "package": "cffi==1.17.1", "type": "pip_module" }, { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, - { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.11.0", "type": "image" }, + { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.148.0", "type": "image" }, + { "package": "docker.io/nginxinc/nginx-unprivileged", "tag": "1.29", "type": "image" }, + { "package": 
"karavi-observability", "type": "git", "url": "https://github.com/dell/karavi-observability.git", "version": "v1.12.0" }, + { "package": "helm-charts", "type": "git", "url": "https://github.com/dell/helm-charts.git", "version": "container-storage-modules-1.9.2" }, + { "package": "cert-manager-v1.10.0", "type": "tarball", "url": "https://charts.jetstack.io/charts/cert-manager-v1.10.0.tgz" }, { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "docker.io/victoriametrics/operator", "tag": "v0.68.3", "type": "image" }, + { "package": "docker.io/victoriametrics/operator", "tag": "config-reloader-v0.68.3", "type": "image" }, + { "package": "victoria-metrics-operator-0.59.3", "type": "tarball", "url": "https://github.com/VictoriaMetrics/helm-charts/releases/download/victoria-metrics-operator-0.59.3/victoria-metrics-operator-0.59.3.tgz" }, { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] From d9fdbe5807e85f6ae8a378014341acc29c5af8c1 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Fri, 24 Apr 2026 21:22:31 +0530 Subject: [PATCH 03/17] Fix for k8s 1.35.1 fresh install support Signed-off-by: pullan1 --- local_repo/roles/parse_and_download/tasks/main.yml | 1 - .../roles/prepare_oim_validation/tasks/check_k8s_support.yml | 2 +- prepare_oim/roles/prepare_oim_validation/vars/main.yml | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/local_repo/roles/parse_and_download/tasks/main.yml b/local_repo/roles/parse_and_download/tasks/main.yml index 4e3d3be21b..3b203aef79 100644 --- a/local_repo/roles/parse_and_download/tasks/main.yml +++ b/local_repo/roles/parse_and_download/tasks/main.yml @@ -18,7 +18,6 @@ project_input_path: "{{ 
hostvars['localhost']['input_project_dir'] }}" update_metadata: false show_softwares_status: false - sub_final_repo_urls: {} - name: Include oim metadata vars ansible.builtin.include_vars: "{{ omnia_metadata_file }}" diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml b/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml index 3bf8ec5fc5..b3b4c76fb9 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml @@ -34,7 +34,7 @@ - name: Set k8s_support_check to false if any k8s version is not in supported_k8s_versions ansible.builtin.set_fact: k8s_support_check: false - when: (k8s_versions | select('notin', supported_k8s_versions) | list | length) > 0 + when: (k8s_versions | reject('in', supported_k8s_versions) | list | length) > 0 - name: Fail if unsupported service_k8s version is detected ansible.builtin.fail: diff --git a/prepare_oim/roles/prepare_oim_validation/vars/main.yml b/prepare_oim/roles/prepare_oim_validation/vars/main.yml index d9365e9a66..7ee5cfd5a9 100644 --- a/prepare_oim/roles/prepare_oim_validation/vars/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/vars/main.yml @@ -33,7 +33,7 @@ file_permission: "0755" # Usage: check_k8s_support.yml fail_msg_k8s_version: "Failed. Kubernetes Version is unsupported or incorrect in software_config.json. Update software_config.json with a supported Kubernetes versions and re-run the playbook.Supported versions are - {{ supported_k8s_versions }}" # noqa: yaml[line-length] -invalid_k8s_versions: "{{ k8s_versions | select('notin', supported_k8s_versions) | list }}" +invalid_k8s_versions: "{{ k8s_versions | reject('in', supported_k8s_versions) | list }}" fail_msg: >- service_k8s is not supported for version: {{ invalid_k8s_versions }}. 
Please update the service_k8s version in software_config.json to a supported version From f7fae97e579b4bee5addb532e7168a2646cfe5d6 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Mon, 27 Apr 2026 11:13:27 +0530 Subject: [PATCH 04/17] Feature branch sync - pub/q2_dev to pub/q2_upgrade (#4319) * Merge pull request #4294 from mithileshreddy04/pub/q2_dev OpenCHAMI upgrade changes in prepare_oim and oim_cleanup * Feature branch sync - pub/telemetry to pub/q2_dev (#4293) * Update openchami git version (#4251) Co-authored-by: mithileshreddy04 Co-authored-by: priti-parate <140157516+priti-parate@users.noreply.github.com> * powerscale teleemtry support with direct authentication mode * use existing vmagent * update messages in vars * merge Pub/q2 dev to pub/telemetry (#4254) * removing input template * Fix for pulp remote RemoteArtifacts is 0 after repo migration Signed-off-by: pullan1 --------- Signed-off-by: pullan1 Co-authored-by: pullan1 Co-authored-by: snarthan * Powerscale teleemtry support using helm * deploy powerscale telemetry using cloud-init * offline deployment of powerscale telemetry * fix for cert-manager failure * fix for cert manager failure * powerscale telemetry deployment with telemetry namespace * sync q2_dev changes (#4263) * removing input template * Fix for pulp remote RemoteArtifacts is 0 after repo migration Signed-off-by: pullan1 * Feature/ome discovery pxe mapping enhancements (#4245) * feat(discovery): OME static group extraction, PXE mapping IP/SU/parent tag enhancements ome_server_inventory.py: - Fix static group extraction: find 'Static Groups' container by name and select only direct children via ParentId; avoids picking system/nested groups - Emit module.warn() for static groups that exist but have no devices assigned - Fix idrac_hostname: read InstrumentationName/DnsName from DeviceManagement ManagementType==2 entry instead of DeviceName which returns the IP address generate_pxe_mapping.py: - ADMIN_IP: derive from first 2 octets of 
admin_network.subnet + last 2 of BMC IP - IB_IP: derive from first 2 octets of ib_network.subnet + last 2 of BMC IP - Skip IB_IP/IB_MAC when server has no IB NIC (ib_nic_mac is empty) - Add extract_su_from_hostname() with regex (SU[A-Z]?\d+)(?=R\d+) to parse Scalable Unit from BMC hostname; rejects service-tag-only hostnames (idrac-JCGT033) and falls back to grp0 when no SU pattern is found - Set GROUP_NAME to extracted SU identifier (fallback: grp0) - Post-process rows to assign PARENT_SERVICE_TAG from the service_kube_control_plane_x86_64 node within the same SU group - Remove BMC_HOSTNAME from CSV headers and output rows - Lint: remove dead try/except in calculate_admin_ip/calculate_ib_ip, reuse ib_mac variable, suppress broad-except pylint warning generate_pxe_mapping.yml: - Load network_spec.yml via include_vars - Set admin_subnet and ib_subnet using selectattr on Networks list - Pass both subnets as parameters to the generate_pxe_mapping module defaults/main.yml: - Add admin_subnet and ib_subnet default variables (empty string) provision_validation.py: - Comment out validate_admin_ips_against_network_spec function and its call site; ADMIN_IPs are now derived from subnet octets + BMC IP and will not necessarily fall within primary_oim_admin_ip/netmask_bits range * refactor: rename discovery directory to provision, update network_spec.yml - Renamed discovery/ to provision/ (git detected as rename, no content loss) - Updated input/network_spec.yml with latest network configuration changes * Update discovery.yml * refactor: unify OME credentials into get_config_credentials flow - Added ome_ip, ome_username, ome_password to omnia_credential.j2 template - Added 'discovery' service entry to omnia_credentials in update_config/vars/main.yml - Added 'discovery' to the hardcoded service key trigger list in fetch_credentials.yml - Replaced custom vault logic in get_ome_credentials.yml with unified decrypt_include_encrypt.yml call against omnia_config_credentials.yml - 
Updated ome_discovery/vars/main.yml to reference omnia_config_credentials_file and omnia_config_credentials_vault_key instead of the separate .vault/ paths - Deleted .vault/ome_credentials.yml and .vault/.vault_password (no longer needed) * chore: update copyright year from 2025 to 2026 in modified files Updated copyright header in all ome_discovery files modified during this feature branch: - library/generate_pxe_mapping.py - library/ome_server_inventory.py - tasks/generate_pxe_mapping.yml - tasks/get_ome_credentials.yml - defaults/main.yml - vars/main.yml * fix: restore discovery_validations role missed during discovery-to-provision rename discovery/roles/discovery_validations/ was accidentally dropped when renaming the discovery/ directory to provision/. Add it back under provision/roles/discovery_validations/ to resolve the PR merge conflict. * chore: update copyright year to 2026 in provision/roles/discovery_validations files * fix: remove duplicate discovery_validations role (provision_validations already exists) provision/roles/provision_validations/ is the correct renamed equivalent of discovery/roles/discovery_validations/. The discovery_validations copy added to provision/ was redundant. 
* feat: apply upstream telemetry upgrade changes from dell/omnia pub/q2_dev - Replace kubectl command with kubernetes.core.k8s module for iDRAC StatefulSet - Preserve existing replica count during iDRAC StatefulSet upgrade - Add LDMS store daemon check, restart, and readiness wait tasks * fix: quote build_stream_job_id_absent message in provision_validations vars * feat: add discovery/roles/discovery_validations and telemetry files - Add discovery/roles/discovery_validations/vars/main.yml with task definitions for validation flow - Add discovery/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml with upstream telemetry upgrade logic (replica preservation + LDMS store) * fix: wrap long line in fetch_credentials.yml to satisfy yaml[line-length] lint * refactor: move ome_ip from credentials to discovery_config.yml - Create input/discovery_config.yml for non-credential discovery settings (ome_ip, future Magellan config) - Remove ome_ip from omnia_credential.j2 and credential update vars - Load ome_ip via include_vars from discovery_config.yml in get_ome_credentials.yml - Add discovery_config.yml to provision_validations discovery_inputs - Remove redundant ib_subnet/admin_subnet defaults from ome_discovery * fix: add newline at end of ome_discovery/defaults/main.yml * fix: override role_path to absolute path for decrypt_include_encrypt.yml role_path resolves to ome_discovery role path, causing encrypt_files_vars.yml to be looked up incorrectly. Override to playbook_dir dirname (/opt/omnia/omnia). * fix: inline credential loading to avoid role_path resolution issue role_path cannot be overridden in include_tasks vars. Replace the call to decrypt_include_encrypt.yml with direct include_vars using stat checks for encrypted vs unencrypted credential file handling. 
* fix: skip load-failure rule in ansible-lint to avoid CI false positives ansible-lint fails to resolve role_path relative paths during static analysis in GitHub Actions, causing false load-failure errors for files that exist and work at runtime. * Update ansible.cfg * Update ansible.cfg * refactor: rename discovery references to provision and add discovery_config variable - Rename discover_mapping_nodes.yml to provision_mapping_nodes.yml - Replace "discovery" terminology with "provision" across playbooks, vars, READMEs, and task names in provision roles - Add subnet as required field with IP pattern validation in network_spec schema - Define discovery_config variable in ome_discovery vars and use it in get_ome_credentials.yml (consistent with provision_config pattern) - Rename discovery_inputs to provision_inputs in validation vars - Rename discovery_mech_mapping to provision_mech_mapping - Update user-facing messages to reference provision.yml Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * fix: credential rules, vault handling, GROUP_NAME validation, and discovery playbook improvements - Add ome_username and ome_password validation rules to credential_rules.json - Add 'discovery' tag to prepare_oim omnia_run_tags so OME credentials are prompted - Fix vault-encrypted credential loading in get_ome_credentials.yml (use decrypt-include-reencrypt pattern instead of unsupported vault_password_file) - Add include_input_dir.yml import to discovery.yml so input_project_dir is set - Accept SU1-SU100 (case-insensitive) in addition to grp0-grp100 for GROUP_NAME - Fix Magellan message to use list format (avoids \n in debug output) - Remove escaped quotes from discovery usage examples Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * fix: extend SU group name support to build_image validation and 
schemas - Add build_aarch_image tag to input_file_inventory so build_image_aarch64.yml runs provision_config validation (was missing, causing no validation to run for aarch64 builds) - Update GROUP_NAME patterns in functional_groups_config.json and omnia_config.json schemas to accept SU1-SU100 format alongside grp0-grp100 - Update INVALID_GROUP_NAME_MSG to reflect both accepted formats Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --------- Signed-off-by: Sujit Jadhav Co-authored-by: Super User Co-authored-by: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * Cleanup discovery roles: move library modules, remove unused roles (#4261) * Cleanup discovery roles: move library modules, remove unused roles - Move ome_server_inventory.py and generate_pxe_mapping.py from discovery/roles/ome_discovery/library/ to common/library/modules/ so they are shared via the common module search path already configured in discovery/ansible.cfg - Remove unused discovery/roles/telemetry/ directory - Remove unused discovery/roles/discovery_validations/ directory - Load discovery_config.yml at playbook level in discovery.yml (consistent with how build_stream_config.yml is loaded in provision.yml) - Fix discovery_complete_msg formatting for readable Ansible output Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * Remove unused discovery_validations role Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --------- Co-authored-by: Super User Co-authored-by: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> * fix for set_pxe_boot.yml when custom inventory given (#4260) * Update generate_bmc_inventory.yml Signed-off-by: SOWJANYAJAGADISH123 * Update pre_checks.yml Signed-off-by: SOWJANYAJAGADISH123 * 
lint issue Signed-off-by: SOWJANYAJAGADISH123 --------- Signed-off-by: SOWJANYAJAGADISH123 --------- Signed-off-by: pullan1 Signed-off-by: Sujit Jadhav Signed-off-by: SOWJANYAJAGADISH123 Co-authored-by: pullan1 Co-authored-by: snarthan Co-authored-by: Sujit Jadhav Co-authored-by: Super User Co-authored-by: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: SOWJANYAJAGADISH123 * resolving merge conflict * revert openchami commit id * resolving review comments * addressing review comments * fix for vmagent scraping powerscale metrics * cleanup script correction for powerscale telemetry cleanup * victoria operator and victoria log input validation * vitoria log input and input validation * remving L2 vslidation for victoria log which is not required * input validation and review comment addressing * change idrac_telemetry_collection_type to telemetry_collection_type * Remove invisible Unicode LRM (U+200E) characters from victoria-operator template filenames * VictoriaLogs container image references and default variable * port check * resolve merge conflict * correction for schema * Update telemetry_config.json * Update validate_input.py * merge conflict telemetry_prereq.yml * change victoria_configurations to victoria_metrics_configurations * remove deployment mode input variable * update for upgrade scenarios * update comments * update comment * resolving issues due to merge conflict * vitoria log changes * victoria log cluster component and VLAgent deployment * updating pod name * removing the changes of adding cert * victoria log changes * remivng victoria log pod calidation playbook * cleanup changes for victoria log * Update ansible-lint.yml and pylint for pub/telemetry (#4296) * Update ansible-lint.yml Signed-off-by: Kratika Patidar * Update pylint.yml Signed-off-by: Kratika Patidar * fixing ansible-lint * lint * line-lenght --------- Signed-off-by: Kratika Patidar --------- Signed-off-by: pullan1 Signed-off-by: Sujit Jadhav 
Signed-off-by: SOWJANYAJAGADISH123 Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> Signed-off-by: Kratika Patidar Co-authored-by: mithileshreddy04 Co-authored-by: priti-parate <140157516+priti-parate@users.noreply.github.com> Co-authored-by: pullan1 Co-authored-by: snarthan Co-authored-by: Sujit Jadhav Co-authored-by: Super User Co-authored-by: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: SOWJANYAJAGADISH123 Co-authored-by: Kratika_Patidar * IB nic ip assignment * update MinIO and registry images to fixed tagged versions, omnia core container tag and version to 2.2 and v2.2.0.0 (#4309) * Minimal OS-only functional group enablement for x86_64 and aarch64 * Update image_package_collector.py * Update provision_validation.py * Minimal OS functional group updates in provision * Minimal OS functional group upgrade * Fix os_* package cross-contamination and remove stale discovery templates * OpenCHAMI upgrade changes * Update openchami container tags * Update main.yml * Update main.yml * Update main.yml * Update omnia version and core tag --------- Signed-off-by: pullan1 Signed-off-by: Sujit Jadhav Signed-off-by: SOWJANYAJAGADISH123 Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> Signed-off-by: Kratika Patidar Co-authored-by: Mithilesh Reddy Co-authored-by: priti-parate <140157516+priti-parate@users.noreply.github.com> Co-authored-by: pullan1 Co-authored-by: snarthan Co-authored-by: Sujit Jadhav Co-authored-by: Super User Co-authored-by: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: SOWJANYAJAGADISH123 Co-authored-by: Kratika_Patidar Co-authored-by: Nagachandan-P --- .github/workflows/ansible-lint.yml | 1 + .github/workflows/pylint.yml | 1 + .../common_utils/en_us_validation_msg.py | 71 ++++- .../schema/telemetry_config.json | 100 +++++-- .../validation_flows/common_validation.py | 265 +++++++++++++++++- 
.../validation_flows/provision_validation.py | 67 +++++ common/library/modules/validate_input.py | 2 +- .../pxe_mapping_file.csv | 12 +- .../catalog_rhel_json/pxe_mapping_file.csv | 24 +- .../pxe_mapping_file.csv | 22 +- .../pxe_mapping_file.csv | 12 +- examples/pxe_mapping_file.csv | 22 +- .../rhel/10.0/csi_driver_powerscale.json | 2 +- .../config/x86_64/rhel/10.0/service_k8s.json | 36 +-- input/pxe_mapping_file.csv | 27 +- input/telemetry_config.yml | 157 +++++++++-- omnia.sh | 39 ++- .../deploy_containers/openchami/vars/main.yml | 18 +- .../ci-group-default_x86_64.yaml.j2 | 18 ++ ...-group-login_compiler_node_aarch64.yaml.j2 | 3 +- ...i-group-login_compiler_node_x86_64.yaml.j2 | 3 +- .../ci-group-login_node_aarch64.yaml.j2 | 3 +- .../ci-group-login_node_x86_64.yaml.j2 | 3 +- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 7 +- ...-service_kube_control_plane_x86_64.yaml.j2 | 3 +- .../ci-group-service_kube_node_x86_64.yaml.j2 | 3 +- ...ci-group-slurm_control_node_x86_64.yaml.j2 | 3 +- .../ci-group-slurm_node_aarch64.yaml.j2 | 3 +- .../ci-group-slurm_node_x86_64.yaml.j2 | 5 +- .../doca-ofed/configure-ib-network.sh.j2 | 58 ++-- .../deploy_powerscale_telemetry.sh.j2 | 178 ++++++++++++ .../verify_powerscale_telemetry.sh.j2 | 182 ++++++++++++ .../templates/telemetry/telemetry.sh.j2 | 17 +- .../tasks/check_kube_vip_reachability.yml | 52 ++++ .../tasks/deploy_powerscale_metrics.yml | 124 ++++++++ .../tasks/generate_telemetry_deployments.yml | 29 +- .../get_powerscale_telemetry_dependencies.yml | 162 +++++++++++ provision/roles/telemetry/tasks/main.yml | 17 +- .../telemetry/tasks/telemetry_prereq.yml | 6 +- .../telemetry/cleanup_telemetry.sh.j2 | 263 +++++++++++++---- .../common/telemetry_secret_creation.yaml.j2 | 2 +- .../idrac_telemetry_statefulset.yaml.j2 | 2 +- .../kafka/kafka.kafkapump_user.yaml.j2 | 2 +- .../kafka/kafka.tls_test_job.yaml.j2 | 6 +- .../templates/telemetry/kustomization.yaml.j2 | 41 +-- .../csm-metrics-deployment-direct.yaml.j2 | 82 ++++++ 
.../victoria/gen_victoria_certs.sh.j2 | 65 +++-- .../victoria-agent-deployment.yaml.j2 | 8 + .../victoria-cluster-vminsert.yaml.j2 | 161 ----------- .../victoria-cluster-vmselect.yaml.j2 | 160 ----------- .../victoria-cluster-vmstorage.yaml.j2 | 183 ------------ .../victoria-operator-vmagent.yaml.j2 | 72 +++++ .../victoria-operator-vmcluster.yaml.j2 | 241 ++++++++++++++++ .../victoria-operator-vmpodscrape.yaml.j2 | 46 +++ .../victoria-operator-vmsingle.yaml.j2 | 86 ++++++ .../victoria/victoria-statefulset.yaml.j2 | 4 +- .../victoria/victoria-tls-test-job.yaml.j2 | 20 +- .../victoria/victoria-vmagent-rbac.yaml.j2 | 6 +- .../victorialogs-operator-vlagent.yaml.j2 | 208 ++++++++++++++ .../victorialogs-operator-vlcluster.yaml.j2 | 231 +++++++++++++++ .../victorialogs-vlagent-config.yaml.j2 | 155 ++++++++++ .../victoria/vmagent-scrape-config.yaml.j2 | 17 +- provision/roles/telemetry/vars/main.yml | 249 ++++++++++++++-- .../tasks/transform_telemetry_config.yml | 8 +- .../templates/telemetry_config.j2 | 31 +- .../tasks/main.yml | 34 +-- .../vars/main.yml | 11 +- .../oim_container_cleanup/vars/main.yml | 4 + 68 files changed, 3275 insertions(+), 880 deletions(-) create mode 100644 provision/roles/configure_ochami/templates/cloud_init/ci-group-default_x86_64.yaml.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/verify_powerscale_telemetry.sh.j2 create mode 100644 provision/roles/telemetry/tasks/check_kube_vip_reachability.yml create mode 100644 provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml create mode 100644 provision/roles/telemetry/tasks/get_powerscale_telemetry_dependencies.yml create mode 100644 provision/roles/telemetry/templates/telemetry/powerscale/csm-metrics-deployment-direct.yaml.j2 delete mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vminsert.yaml.j2 delete mode 100644 
provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmselect.yaml.j2 delete mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmstorage.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 9d08d236a9..621bd0b930 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -8,6 +8,7 @@ on: - release_1.7.1 - pub/build_stream - pub/q2_dev + - pub/telemetry jobs: build: diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index be748d1fe1..c979ce72ca 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -8,6 +8,7 @@ on: - release_1.7.1 - pub/build_stream - pub/q2_dev + - pub/telemetry jobs: build: diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 18694dcbec..fe1baa69e2 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -70,7 +70,7 @@ "and Ports fields.") 
SWITCH_DETAILS_NO_BMC_DETAILS_MSG = ("If switch details are provided then bmc_detail's " "static_range must also be provided.") -INVALID_GROUP_NAME_MSG = "Groups must be defined in the form of grp<n> where n is 0-100, or SU<n> where n is 1-100." +INVALID_GROUP_NAME_MSG = "Groups must be defined in the form of grp<n> where n is 0-99." INVALID_LOCATION_ID_MSG = ("location_id must follow the format SU-<n>.RACK-<n> where n is 0-99. " "This input is case-sensitive. Please use uppercase letters only.") INVALID_ATTRIBUTES_ROLE_MSG = ("Please provide valid attributes for the role, " @@ -264,6 +264,75 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length): "service k8s/slurm roles in the mapping file or remove ldms from " "software_config.json and rerun the playbook.") +# PowerScale telemetry validation messages +POWERSCALE_VICTORIA_REQUIRED_MSG = ( + "PowerScale telemetry requires VictoriaMetrics to be deployed. " + "When powerscale_configurations.powerscale_telemetry_support is true, 'victoria' must be included in " + "telemetry_collection_type (e.g., 'victoria' or 'victoria,kafka')." +) +POWERSCALE_CSI_DRIVER_MISSING_MSG = ( + "csi_driver_powerscale is not configured in software_config.json. " + "PowerScale telemetry requires the CSI driver for PowerScale to be configured." +) +POWERSCALE_SERVICE_CLUSTER_MISSING_MSG = ( + "service cluster is not defined in functional_groups_config.yml. " + "PowerScale telemetry requires a service cluster." +) +POWERSCALE_CONFIGURATIONS_MISSING_MSG = ( + "powerscale_configurations section is required and must contain powerscale_telemetry_support." +) +POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG = ( + "must be a non-empty string in format 'XGi' (e.g., '5Gi')" +) +POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG = ( + "csm_observability_values_file_path is required when powerscale_configurations.powerscale_telemetry_support is true. " + "Please provide the path to the CSM Observability values.yaml file." 
+) +POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG = ( + "karaviMetricsPowerscale.authorization.proxyHost is required in the CSM Observability values file " + "when karaviMetricsPowerscale.authorization.enabled is true. " + "Please provide the hostname or IP of the CSM Authorization Proxy server." +) +def powerscale_csm_values_not_found_msg(path): + """Returns error message when CSM Observability values.yaml file is not found.""" + return ( + f"CSM Observability values.yaml file not found at '{path}'. " + "Please verify the file path is correct." + ) +POWERSCALE_CSM_VALUES_INVALID_YAML_MSG = ( + "CSM Observability values.yaml must contain a valid YAML dictionary." +) +def powerscale_csm_values_parse_error_msg(error): + """Returns error message when CSM Observability values.yaml fails to parse.""" + return f"Failed to parse CSM Observability values.yaml: {error}" +POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG = ( + "CSM Observability values.yaml is missing 'karaviMetricsPowerscale' section." +) +POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG = ( + "CSM Metrics PowerScale image is required in CSM Observability values.yaml." +) +POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG = ( + "OTEL Collector image is required in CSM Observability values.yaml." +) +POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG = ( + "Each additional_remote_write_endpoint must have a non-empty 'url' field." +) +POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG = ( + "URL must start with 'http://' or 'https://'." +) +def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_image): + """Returns error message when CSM values.yaml image version doesn't match service_k8s.json.""" + return ( + f"Image version mismatch for '{image_name}': " + f"CSM Observability values.yaml has '{values_image}' but " + f"service_k8s.json has '{service_k8s_image}'. " + f"Please update service_k8s.json to match the values.yaml version " + f"and re-run local_repo.yml to mirror the correct image to Pulp." 
+ ) +POWERSCALE_SERVICE_K8S_JSON_NOT_FOUND_MSG = ( + "service_k8s.json not found. Cannot validate PowerScale telemetry image versions. " + "Please ensure local_repo.yml has been executed." +) def boolean_fail_msg(value): """Returns a formatted message indicating boolean_fail_msg.""" return f"{value} must be set to either true or false." diff --git a/common/library/module_utils/input_validation/schema/telemetry_config.json b/common/library/module_utils/input_validation/schema/telemetry_config.json index a6523462e8..6b511a5f12 100644 --- a/common/library/module_utils/input_validation/schema/telemetry_config.json +++ b/common/library/module_utils/input_validation/schema/telemetry_config.json @@ -10,7 +10,7 @@ "type": "boolean", "description": "Enable or disable NVIDIA DCGM (Data Center GPU Manager) on GPU compute nodes. When true, nvidia-dcgm.service is started during cloud-init provisioning. Default: true" }, - "idrac_telemetry_collection_type": { + "telemetry_collection_type": { "anyOf": [ { "type": "string", @@ -43,6 +43,53 @@ "default": 10001, "description": "LDMS sampler port on compute nodes. Valid range: 10001-10100. Default: 10001" }, + "powerscale_configurations": { + "type": "object", + "properties": { + "powerscale_telemetry_support": { + "type": "boolean", + "default": true, + "description": "Enable or disable PowerScale telemetry support. Requires csi_driver_powerscale in software_config.json." + }, + "powerscale_log_enabled": { + "type": "boolean", + "default": false, + "description": "Enable or disable PowerScale log collection (syslog to VictoriaLogs). Requires powerscale_telemetry_support: true." + }, + "otel_collector_storage_size": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", + "default": "5Gi", + "description": "PVC size for OTEL Collector metric batching and buffering." + }, + "csm_observability_values_file_path": { + "type": "string", + "description": "Path to the user-provided Helm values file for karavi-observability chart. 
Required when powerscale_telemetry_support is true." + }, + "additional_remote_write_endpoints": { + "type": "array", + "default": [], + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "pattern": "^https?://", + "description": "VictoriaMetrics remote_write endpoint URL." + }, + "tls_insecure_skip_verify": { + "type": "boolean", + "default": false, + "description": "Skip TLS certificate verification for this endpoint." + } + }, + "required": ["url"] + }, + "description": "Additional VictoriaMetrics remote_write endpoints. vmagent writes to all configured endpoints." + } + }, + "required": ["powerscale_telemetry_support", "otel_collector_storage_size", "csm_observability_values_file_path"] + }, "ldms_sampler_configurations": { "anyOf": [ { @@ -126,7 +173,7 @@ ] } }, - "required": ["idrac_telemetry_support", "dcgm_support", "idrac_telemetry_collection_type", "ldms_sampler_configurations", "ldms_agg_port", "ldms_store_port", "ldms_sampler_port" ], + "required": ["idrac_telemetry_support", "dcgm_support", "telemetry_collection_type", "ldms_sampler_configurations", "ldms_agg_port", "ldms_store_port", "ldms_sampler_port", "powerscale_configurations" ], "$defs": { "kafka_configurations": { "type": "object", @@ -177,7 +224,7 @@ } }, "uniqueItems": true, - "description": "IMPORTANT: At least one Kafka topic must be defined. Topic names 'idrac' and 'ldms' are CONSTANTS. 'idrac' is required if idrac_telemetry_support is true and kafka is in idrac_telemetry_collection_type. 'ldms' is required if LDMS software is configured in software_config.json (automatic detection). Only partition counts can be changed.", + "description": "IMPORTANT: At least one Kafka topic must be defined. Topic names 'idrac' and 'ldms' are CONSTANTS. 'idrac' is required if idrac_telemetry_support is true and kafka is in telemetry_collection_type. 'ldms' is required if LDMS software is configured in software_config.json (automatic detection). 
Only partition counts can be changed.", "errorMessage": { "minItems": "At least 1 Kafka topic must be defined. Configure based on enabled features.", "maxItems": "Maximum 2 topics allowed: 'idrac' and 'ldms'", @@ -194,18 +241,9 @@ ], "additionalProperties": false }, - "victoria_configurations": { + "victoria_metrics_configurations": { "type": "object", "properties": { - "deployment_mode": { - "type": "string", - "enum": ["single-node", "cluster"], - "default": "cluster", - "description": "VictoriaMetrics deployment mode. 'single-node' for simple deployment (1 pod), 'cluster' for high-availability deployment (7 pods). Default: 'cluster'", - "errorMessage": { - "enum": "deployment_mode must be either 'single-node' or 'cluster'" - } - }, "persistence_size": { "type": "string", "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$" @@ -216,11 +254,29 @@ } }, "required": [ - "deployment_mode", "persistence_size", "retention_period" ], "additionalProperties": false + }, + "victoria_logs_configurations": { + "type": "object", + "description": "VictoriaLogs cluster mode configuration. Deployed alongside VictoriaMetrics when 'victoria' is in collection type.", + "properties": { + "storage_size": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", + "default": "8Gi", + "description": "Storage size per vlstorage replica PVC. 
Total = storage_size x 3 replicas.", + "errorMessage": "storage_size must be a valid Kubernetes PVC size (e.g., '8Gi', '50Gi', '1Ti')" + }, + "retention_period": { + "type": "integer", + "minimum": 24 + } + }, + "required": ["storage_size", "retention_period"], + "additionalProperties": false } }, "allOf": [ @@ -228,7 +284,7 @@ "if": { "properties": { "idrac_telemetry_support": { "const": true }, - "idrac_telemetry_collection_type": { "pattern": "(?i)^kafka$" } + "telemetry_collection_type": { "pattern": "(?i)^kafka$" } } }, "then": { @@ -242,13 +298,14 @@ "if": { "properties": { "idrac_telemetry_support": { "const": true }, - "idrac_telemetry_collection_type": { "pattern": "(?i)^victoria$" } + "telemetry_collection_type": { "pattern": "(?i)^victoria$" } } }, "then": { - "required": ["victoria_configurations"], + "required": ["victoria_metrics_configurations", "victoria_logs_configurations"], "properties": { - "victoria_configurations": { "$ref": "#/$defs/victoria_configurations" } + "victoria_metrics_configurations": { "$ref": "#/$defs/victoria_metrics_configurations" }, + "victoria_logs_configurations": { "$ref": "#/$defs/victoria_logs_configurations" } } } }, @@ -256,16 +313,17 @@ "if": { "properties": { "idrac_telemetry_support": { "const": true }, - "idrac_telemetry_collection_type": { + "telemetry_collection_type": { "pattern": "(?i)^(victoria,kafka|kafka,victoria)$" } } }, "then": { - "required": ["kafka_configurations", "victoria_configurations"], + "required": ["kafka_configurations", "victoria_metrics_configurations", "victoria_logs_configurations"], "properties": { "kafka_configurations": { "$ref": "#/$defs/kafka_configurations" }, - "victoria_configurations": { "$ref": "#/$defs/victoria_configurations" } + "victoria_metrics_configurations": { "$ref": "#/$defs/victoria_metrics_configurations" }, + "victoria_logs_configurations": { "$ref": "#/$defs/victoria_logs_configurations" } } } } diff --git 
a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 87652709ff..da3d0d73bf 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -21,6 +21,7 @@ import ipaddress import json import os +import re from collections import Counter import yaml @@ -1484,7 +1485,7 @@ def validate_telemetry_config( # Validate topic_partitions configuration kafka_config = data.get("kafka_configurations", {}) topic_partitions = kafka_config.get("topic_partitions", []) - idrac_telemetry_collection_type = data.get("idrac_telemetry_collection_type", "") + telemetry_collection_type = data.get("telemetry_collection_type", "") # Check if LDMS software is configured but kafka_configurations is missing entirely if ldms_support_from_software_config and not kafka_config: @@ -1545,12 +1546,12 @@ def validate_telemetry_config( # Validate required topics based on feature flags # If iDRAC telemetry is enabled with Kafka, idrac topic is required - if idrac_telemetry_support and 'kafka' in idrac_telemetry_collection_type.split(','): + if idrac_telemetry_support and 'kafka' in telemetry_collection_type.split(','): if 'idrac' not in present_topics: errors.append(create_error_msg( "kafka_configurations.topic_partitions", "missing 'idrac' topic", - "idrac topic is required when idrac_telemetry_support is true and 'kafka' is in idrac_telemetry_collection_type" + "idrac topic is required when idrac_telemetry_support is true and 'kafka' is in telemetry_collection_type" )) # If LDMS software is configured in software_config.json, ldms topic is required @@ -1605,7 +1606,263 @@ def validate_telemetry_config( f"'{plugin_name}'", "plugin_name cannot be empty. 
Must be one of: meminfo, procstat2, vmstat, loadavg, slurm_sampler, procnetdev2" )) - + + # Validate PowerScale telemetry configuration + powerscale_config = data.get("powerscale_configurations") + if not powerscale_config: + errors.append(create_error_msg( + "powerscale_configurations", + "not defined", + en_us_validation_msg.POWERSCALE_CONFIGURATIONS_MISSING_MSG + )) + else: + powerscale_telemetry_support = powerscale_config.get("powerscale_telemetry_support", False) + + if powerscale_telemetry_support: + logger.info("PowerScale telemetry support is enabled, performing PowerScale validation") + + # Check victoria is in telemetry_collection_type + # PowerScale telemetry pipeline requires VictoriaMetrics (writes to vminsert via shared vmagent) + collection_types = [t.strip() for t in telemetry_collection_type.split(',')] + if 'victoria' not in collection_types: + errors.append(create_error_msg( + "telemetry_collection_type", + telemetry_collection_type, + en_us_validation_msg.POWERSCALE_VICTORIA_REQUIRED_MSG + )) + + # Check CSI driver PowerScale is in software_config.json + csi_powerscale_found = False + if os.path.exists(software_config_file_path): + try: + with open(software_config_file_path, 'r', encoding='utf-8') as f: + software_config = json.load(f) + softwares = software_config.get("softwares", []) + csi_powerscale_found = any( + software.get("name") == "csi_driver_powerscale" for software in softwares + ) + except (json.JSONDecodeError, IOError) as e: + logger.warn(f"Could not load software_config.json for PowerScale validation: {e}") + + if not csi_powerscale_found: + errors.append(create_error_msg( + "powerscale_configurations.powerscale_telemetry_support", + powerscale_telemetry_support, + en_us_validation_msg.POWERSCALE_CSI_DRIVER_MISSING_MSG + )) + + # Check service cluster is defined + if not is_service_cluster_defined: + errors.append(create_error_msg( + "powerscale_configurations.powerscale_telemetry_support", + powerscale_telemetry_support, + 
en_us_validation_msg.POWERSCALE_SERVICE_CLUSTER_MISSING_MSG + )) + + # Validate otel_collector_storage_size + otel_storage = powerscale_config.get("otel_collector_storage_size", "") + if not otel_storage or not isinstance(otel_storage, str): + errors.append(create_error_msg( + "powerscale_configurations.otel_collector_storage_size", + otel_storage, + en_us_validation_msg.POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG + )) + + # Validate csm_observability_values_file_path + csm_values_path = powerscale_config.get("csm_observability_values_file_path", "") + if not csm_values_path or not isinstance(csm_values_path, str) or csm_values_path.strip() == "": + errors.append(create_error_msg( + "powerscale_configurations.csm_observability_values_file_path", + csm_values_path, + en_us_validation_msg.POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG + )) + elif not os.path.exists(csm_values_path): + errors.append(create_error_msg( + "powerscale_configurations.csm_observability_values_file_path", + csm_values_path, + en_us_validation_msg.powerscale_csm_values_not_found_msg(csm_values_path) + )) + else: + # Validate the CSM Observability values.yaml content + try: + with open(csm_values_path, 'r', encoding='utf-8') as f: + csm_values = yaml.safe_load(f) + if not isinstance(csm_values, dict): + errors.append(create_error_msg( + "powerscale_configurations.csm_observability_values_file_path", + csm_values_path, + en_us_validation_msg.POWERSCALE_CSM_VALUES_INVALID_YAML_MSG + )) + else: + # Validate required keys + karavi_metrics = csm_values.get("karaviMetricsPowerscale", {}) + if not karavi_metrics: + errors.append(create_error_msg( + "csm_observability_values_file_path", + csm_values_path, + en_us_validation_msg.POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG + )) + else: + # Validate image reference exists + if not karavi_metrics.get("image"): + errors.append(create_error_msg( + "karaviMetricsPowerscale.image", + "not defined", + en_us_validation_msg.POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG + 
)) + + otel_config = csm_values.get("otelCollector", {}) + if not otel_config or not otel_config.get("image"): + errors.append(create_error_msg( + "otelCollector.image", + "not defined", + en_us_validation_msg.POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG + )) + + # Validate Karavi Authorization config in Helm values + karavi_auth = karavi_metrics.get("authorization", {}) if karavi_metrics else {} + if karavi_auth.get("enabled", False): + proxy_host = karavi_auth.get("proxyHost", "") + if not proxy_host or not isinstance(proxy_host, str) or proxy_host.strip() == "": + errors.append(create_error_msg( + "karaviMetricsPowerscale.authorization.proxyHost", + proxy_host, + en_us_validation_msg.POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG + )) + + # Cross-validate image versions between values.yaml and service_k8s.json + service_k8s_json_path = os.path.join( + input_dir, "config", "x86_64", + data.get("cluster_os_type", "rhel") if "cluster_os_type" in data else "rhel", + data.get("cluster_os_version", "10.0") if "cluster_os_version" in data else "10.0", + "service_k8s.json" + ) + # Try reading cluster_os_type/version from software_config.json + if os.path.exists(software_config_file_path): + try: + with open(software_config_file_path, 'r', encoding='utf-8') as scf: + sc_data = json.load(scf) + sc_os_type = sc_data.get("cluster_os_type", "rhel") + sc_os_version = sc_data.get("cluster_os_version", "10.0") + service_k8s_json_path = os.path.join( + input_dir, "config", "x86_64", + sc_os_type, sc_os_version, "service_k8s.json" + ) + except (json.JSONDecodeError, IOError): + pass + + if os.path.exists(service_k8s_json_path): + try: + with open(service_k8s_json_path, 'r', encoding='utf-8') as sk8s_f: + service_k8s_data = json.load(sk8s_f) + + # Build lookup: package -> tag from service_k8s.json + sk8s_images = {} + for entry in service_k8s_data.get("service_k8s", {}).get("cluster", []): + if entry.get("type") == "image" and "tag" in entry: + sk8s_images[entry["package"]] = entry["tag"] 
+ + # Images to cross-validate: (description, values.yaml image, service_k8s package key) + images_to_check = [] + + if karavi_metrics and karavi_metrics.get("image"): + images_to_check.append(( + "csm-metrics-powerscale", + karavi_metrics["image"], + "quay.io/dell/container-storage-modules/csm-metrics-powerscale" + )) + if otel_config and otel_config.get("image"): + images_to_check.append(( + "opentelemetry-collector", + otel_config["image"], + "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector" + )) + karavi_auth = karavi_metrics.get("authorization", {}) if karavi_metrics else {} + sidecar_proxy = karavi_auth.get("sidecarProxy", {}) + if sidecar_proxy and sidecar_proxy.get("image"): + # csm-authorization-sidecar is in csi_driver_powerscale.json, not service_k8s.json + csi_ps_json_path = os.path.join( + os.path.dirname(service_k8s_json_path), "csi_driver_powerscale.json" + ) + if os.path.exists(csi_ps_json_path): + try: + with open(csi_ps_json_path, 'r', encoding='utf-8') as csi_f: + csi_ps_data = json.load(csi_f) + for entry in csi_ps_data.get("csi_driver_powerscale", {}).get("cluster", []): + if (entry.get("type") == "image" and + entry.get("package") == "quay.io/dell/container-storage-modules/csm-authorization-sidecar"): + sidecar_values_tag = sidecar_proxy["image"].split(":")[-1] if ":" in sidecar_proxy["image"] else "" + if sidecar_values_tag and sidecar_values_tag != entry["tag"]: + errors.append(create_error_msg( + "powerscale image: csm-authorization-sidecar", + sidecar_proxy["image"], + en_us_validation_msg.powerscale_image_version_mismatch_msg( + "csm-authorization-sidecar", + sidecar_proxy["image"], + f"{entry['package']}:{entry['tag']}" + ) + )) + else: + logger.info(f"Image version match for csm-authorization-sidecar: {sidecar_values_tag}") + break + except (json.JSONDecodeError, IOError) as csi_err: + logger.warn(f"Could not read csi_driver_powerscale.json: {csi_err}") + + for img_name, values_image, sk8s_key in 
images_to_check: + if sk8s_key in sk8s_images: + # Extract tag from values.yaml image (format: registry/repo:tag) + values_tag = values_image.split(":")[-1] if ":" in values_image else "" + sk8s_tag = sk8s_images[sk8s_key] + if values_tag and values_tag != sk8s_tag: + sk8s_full = f"{sk8s_key}:{sk8s_tag}" + errors.append(create_error_msg( + f"powerscale image: {img_name}", + values_image, + en_us_validation_msg.powerscale_image_version_mismatch_msg( + img_name, values_image, sk8s_full + ) + )) + else: + logger.info(f"Image version match for {img_name}: {values_tag}") + else: + logger.warn(f"Image {sk8s_key} not found in service_k8s.json, skipping version check") + + except (json.JSONDecodeError, IOError) as sk8s_err: + logger.warn(f"Could not read service_k8s.json for image version validation: {sk8s_err}") + else: + logger.warn(f"service_k8s.json not found at {service_k8s_json_path}, skipping image version validation") + + logger.info("CSM Observability values.yaml validation passed") + except (yaml.YAMLError, IOError) as e: + errors.append(create_error_msg( + "powerscale_configurations.csm_observability_values_file_path", + csm_values_path, + en_us_validation_msg.powerscale_csm_values_parse_error_msg(str(e)) + )) + + # Validate additional_remote_write_endpoints + additional_endpoints = powerscale_config.get("additional_remote_write_endpoints", []) + if additional_endpoints and isinstance(additional_endpoints, list): + if len(additional_endpoints) > 5: + logger.warn(f"More than 5 additional_remote_write_endpoints configured ({len(additional_endpoints)}). 
" + "This may impact performance.") + for idx, endpoint in enumerate(additional_endpoints): + if not isinstance(endpoint, dict): + continue + url = endpoint.get("url", "") + if not url or not isinstance(url, str): + errors.append(create_error_msg( + f"powerscale_configurations.additional_remote_write_endpoints[{idx}].url", + url, + en_us_validation_msg.POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG + )) + elif not url.startswith("http://") and not url.startswith("https://"): + errors.append(create_error_msg( + f"powerscale_configurations.additional_remote_write_endpoints[{idx}].url", + url, + en_us_validation_msg.POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG + )) + return errors def validate_additional_software( diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 16263c7b48..48e40a16cf 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -30,6 +30,7 @@ file_names = config.files create_error_msg = validation_utils.create_error_msg create_file_path = validation_utils.create_file_path +ib_mac_re = re.compile(r"^([0-9A-Fa-f]{2}:){7}[0-9A-Fa-f]{2}$") # Expected header columns (case-insensitive) required_headers = [ @@ -271,6 +272,52 @@ def validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path): raise ValueError(f"Duplicate ADMIN_IP found in PXE mapping file: {'; '.join(duplicates)}") +def validate_duplicate_ib_ips_in_mapping_file(pxe_mapping_file_path): + """Validates that IB_IP values in the mapping file are unique.""" + if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): + raise ValueError(f"PXE mapping file not found: {pxe_mapping_file_path}") + + with open(pxe_mapping_file_path, "r", encoding="utf-8") as fh: + raw_lines = fh.readlines() + + 
non_comment_lines = [ln for ln in raw_lines if ln.strip()] + reader = csv.DictReader(non_comment_lines) + + fieldname_map = {fn.strip().upper(): fn for fn in reader.fieldnames} + ib_ip_col = fieldname_map.get("IB_IP") + hostname_col = fieldname_map.get("HOSTNAME") + + if not ib_ip_col: + return + + seen_ib_ips = {} + duplicates = [] + + for row_idx, row in enumerate(reader, start=2): + ib_ip = row.get(ib_ip_col, "").strip() if row.get(ib_ip_col) else "" + hostname = "" + if hostname_col: + hostname = row.get(hostname_col, "").strip() if row.get(hostname_col) else "" + + if not ib_ip: + continue + + if ib_ip in seen_ib_ips: + first_row = seen_ib_ips[ib_ip]["row"] + first_host = seen_ib_ips[ib_ip]["hostname"] + dup_host = hostname or "" + first_host_disp = first_host or "" + duplicates.append( + f"'{ib_ip}' at CSV rows {first_row} ({first_host_disp}) and {row_idx} ({dup_host})" + ) + continue + + seen_ib_ips[ib_ip] = {"row": row_idx, "hostname": hostname} + + if duplicates: + raise ValueError(f"Duplicate IB_IP found in PXE mapping file: {'; '.join(duplicates)}") + + def validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path): """Validates that GROUP_NAME has a consistent PARENT_SERVICE_TAG across the mapping file.""" if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): @@ -421,6 +468,25 @@ def validate_mapping_file_entries(mapping_file_path): if bmc_ip and not validation_utils.validate_ipv4(bmc_ip): raise ValueError(f"Invalid BMC_IP: '{bmc_ip}' at CSV row {row_idx} in mapping file.") + ib_mac_col = fieldname_map.get("IB_MAC") + ib_ip_col = fieldname_map.get("IB_IP") + ib_mac = row.get(ib_mac_col, "").strip() if ib_mac_col and row.get(ib_mac_col) else "" + ib_ip = row.get(ib_ip_col, "").strip() if ib_ip_col and row.get(ib_ip_col) else "" + + if bool(ib_mac) != bool(ib_ip): + raise ValueError( + f"IB_MAC and IB_IP must both be provided or both be empty at CSV row {row_idx} in mapping file." 
+ ) + + if ib_mac and not ib_mac_re.match(ib_mac): + raise ValueError( + f"Invalid IB_MAC: '{ib_mac}' at CSV row {row_idx} in mapping file. " + "Expected format: xx:xx:xx:xx:xx:xx:xx:xx." + ) + + if ib_ip and not validation_utils.validate_ipv4(ib_ip): + raise ValueError(f"Invalid IB_IP: '{ib_ip}' at CSV row {row_idx} in mapping file.") + if not row_seen: raise ValueError("Please provide details in mapping file.") @@ -859,6 +925,7 @@ def validate_provision_config( validate_duplicate_service_tags_in_mapping_file(pxe_mapping_file_path) validate_duplicate_hostnames_in_mapping_file(pxe_mapping_file_path) validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path) + validate_duplicate_ib_ips_in_mapping_file(pxe_mapping_file_path) validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path) validate_functional_groups_separation(pxe_mapping_file_path) validate_parent_service_tag_hierarchy(pxe_mapping_file_path) diff --git a/common/library/modules/validate_input.py b/common/library/modules/validate_input.py index 022a87aaee..21ce2920c6 100644 --- a/common/library/modules/validate_input.py +++ b/common/library/modules/validate_input.py @@ -139,7 +139,7 @@ def main(): if input_file_path is None: error_message = ( - f"file not found in directory: {omnia_base_dir}/{project_name}" + f"{fname} file not found in directory: {omnia_base_dir}/{project_name}" ) logger.error(error_message) module.fail_json(msg=error_message) diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv index 0a350bc72d..295e7615af 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv @@ -1,6 +1,6 @@ 
-FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 \ No newline at end of file diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv index 6e3e4c6e63..5226b0a19e 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv +++ 
b/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv @@ -1,11 +1,13 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 
+slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 +os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60,94:6d:ae:03:00:8c:12:ae,192.168.0.110 +os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61,94:6d:ae:03:00:8c:12:bf,192.168.0.111 diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv index 6e3e4c6e63..01360b424b 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv +++ 
b/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv @@ -1,11 +1,11 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 
+slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv index 98ad5ab134..65ceac6ada 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv @@ -1,6 +1,6 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP 
-slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_x86_64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_x86_64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index f4d41e2a77..01360b424b 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,11 +1,11 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 
-slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 
+login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 diff --git a/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json b/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json index ac7953a0dc..15ed7a3bed 100644 --- a/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json +++ b/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json @@ -66,7 +66,7 @@ }, { "package": "quay.io/dell/container-storage-modules/csm-authorization-sidecar", - "tag": "v2.3.0", + "tag": "v2.4.0", "type": "image" }, { diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index 6deed2309b..64f4c8bdff 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -3,7 +3,7 @@ "cluster": [ { "package": "docker.io/library/busybox", "type": "image", "tag": "1.36" }, { "package": "firewalld", "type": "rpm", "repo_name": "baseos" }, - { "package": "python3-firewall", "type": "rpm", "repo_name": "baseos" }, + { "package": "python3-firewall", "type": "rpm", 
"repo_name": "baseos" }, { "package": "git", "type": "rpm", "repo_name": "appstream"}, { "package": "vim-enhanced", "type": "rpm", "repo_name": "appstream"}, { "package": "fuse-overlayfs", "type": "rpm", "repo_name": "appstream"}, @@ -17,6 +17,8 @@ { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, { "package": "docker.io/victoriametrics/vminsert", "type": "image", "tag": "v1.128.0-cluster" }, { "package": "docker.io/victoriametrics/vmselect", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/victoria-logs", "type": "image", "tag": "v1.49.0" }, + { "package": "docker.io/victoriametrics/vlagent", "type": "image", "tag": "v1.49.0" }, { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, { "package": "docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, @@ -29,11 +31,20 @@ { "package": "cffi==1.17.1", "type": "pip_module" }, { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, - { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.11.0", "type": "image" }, + { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.148.0", "type": "image" }, + { "package": "docker.io/nginxinc/nginx-unprivileged", "tag": "1.29", "type": "image" }, + { "package": "karavi-observability", "type": "git", "url": "https://github.com/dell/karavi-observability.git", "version": "v1.12.0" }, + { "package": "helm-charts", "type": "git", "url": "https://github.com/dell/helm-charts.git", "version": 
"container-storage-modules-1.9.2" }, + { "package": "cert-manager-v1.10.0", "type": "tarball", "url": "https://charts.jetstack.io/charts/cert-manager-v1.10.0.tgz" }, { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "docker.io/victoriametrics/operator", "tag": "v0.68.3", "type": "image" }, + { "package": "docker.io/victoriametrics/operator", "tag": "config-reloader-v0.68.3", "type": "image" }, + { "package": "victoria-metrics-operator-0.59.3", "type": "tarball", "url": "https://github.com/VictoriaMetrics/helm-charts/releases/download/victoria-metrics-operator-0.59.3/victoria-metrics-operator-0.59.3.tgz" }, { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, - { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "service_kube_control_plane": { @@ -57,7 +68,6 @@ { "package": "git", "type": "rpm", "repo_name": "appstream"}, { "package": "kubernetes==33.1.0", "type": "pip_module" }, { "package": "PyMySQL==1.1.2", "type": "pip_module" } - ] }, "service_kube_control_plane_first": { @@ -75,33 +85,23 @@ { "package": "docker.io/calico/kube-controllers", "tag": "v3.30.3", "type": "image" }, { "package": "docker.io/calico/node", "tag": "v3.30.3", "type": "image" }, { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, - { - "package": "calico-v3.30.3", - "type": "manifest", - "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.30.3/manifests/calico.yaml" - }, - { - "package": "metallb-native-v0.15.2", - "type": "manifest", - "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.2/config/manifests/metallb-native.yaml" - }, + { "package": "calico-v3.30.3", "type": "manifest", 
"url": "https://raw.githubusercontent.com/projectcalico/calico/v3.30.3/manifests/calico.yaml" }, + { "package": "metallb-native-v0.15.2", "type": "manifest", "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.2/config/manifests/metallb-native.yaml" }, { "package": "helm-v3.19.0-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.19.0-linux-amd64.tar.gz" }, - { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, + { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, { "package": "kubectl-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, { "package": "prettytable==3.14.0", "type": "pip_module" }, { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, { "package": "git", "type": "rpm", "repo_name": "appstream"}, { "package": "kubernetes==33.1.0", "type": "pip_module" }, { "package": "PyMySQL==1.1.2", "type": "pip_module" } - ] }, - "service_kube_node": { "cluster": [ { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" }, { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, - { "package": "quay.io/metallb/controller", "tag": "v0.15.2", "type": "image" } + { "package": "quay.io/metallb/controller", "tag": "v0.15.2", "type": "image" } ] } } diff --git a/input/pxe_mapping_file.csv b/input/pxe_mapping_file.csv index abb6fc5fe8..e9b5a893f2 100644 --- a/input/pxe_mapping_file.csv +++ b/input/pxe_mapping_file.csv @@ -1,13 +1,14 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP 
-slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 -os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60 -os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61 +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,, +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,, +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,, 
+login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,, +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,, +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,, +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,, +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,, +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,, +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,, +os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60,, +os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61,, + diff --git a/input/telemetry_config.yml b/input/telemetry_config.yml index 397806c594..36eb0844ec 100644 --- a/input/telemetry_config.yml +++ b/input/telemetry_config.yml @@ -38,8 +38,6 @@ # ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ # │ Deployment Mode │ Per-Pod Storage │ Number of Pods │ Total Storage │ # ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ -# │ Single-node │ persistence_size │ 1 pod │ 1× storage │ -# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ # │ Cluster │ persistence_size │ 3 vmstorage │ 3× storage │ # └─────────────────┴──────────────────┴─────────────────┴──────────────────┘ # Example: 8Gi per pod → Single-node: 8Gi total, Cluster: 24Gi total @@ -58,7 +56,6 @@ # # COMBINED STORAGE EXAMPLES: # Default (8Gi each): VictoriaMetrics Cluster (24Gi) + Kafka (48Gi) = 
72Gi total -# Single-node mode: VictoriaMetrics Single (8Gi) + Kafka (48Gi) = 56Gi total # # STORAGE OPTIONS: # - VictoriaMetrics: Store iDRAC telemetry in time-series database @@ -83,7 +80,7 @@ idrac_telemetry_support: true # - "kafka" : Store in Kafka only # - "victoria,kafka" : Store in both (recommended) # Default: "victoria,kafka" -idrac_telemetry_collection_type: "victoria,kafka" +telemetry_collection_type: "victoria,kafka" # ============================================================================ # NVIDIA DCGM (Data Center GPU Manager) CONFIGURATION @@ -111,34 +108,14 @@ dcgm_support: true # VICTORIAMETRICS CONFIGURATION # ============================================================================ # VictoriaMetrics is a time-series database for storing telemetry metrics. -# Used for iDRAC telemetry when 'victoria' is enabled in idrac_telemetry_collection_type. +# Used for iDRAC telemetry when 'victoria' is enabled in telemetry_collection_type. # # DEPLOYMENT MODES: -# - single-node: Simple deployment with one pod (suitable for small deployments) # - cluster: High-availability deployment with multiple components # (recommended for production and large-scale deployments) -victoria_configurations: - # VictoriaMetrics deployment mode - # Supported values: - # - "single-node" : Simple deployment (1 pod, suitable for dev/test) - # - "cluster" : High-availability deployment (7 pods, recommended for production) - # Default: "cluster" - # - # Cluster Mode Benefits: - # - High availability (no single point of failure) - # - Horizontal scalability (scale components independently) - # - Better performance (4x ingestion, 2x query speed) - # - Production-ready architecture - # - # Single-Node Benefits: - # - Simple setup (fewer resources) - # - Suitable for small deployments (<10 nodes) - # - Lower resource usage (~4Gi memory vs ~10Gi for cluster) - deployment_mode: "cluster" - +victoria_metrics_configurations: # The amount of storage allocated for EACH 
VictoriaMetrics persistent volume. # IMPORTANT: Total VictoriaMetrics storage depends on deployment mode: - # - Single-node mode: Total storage = persistence_size × 1 pod # - Cluster mode: Total storage = persistence_size × 3 vmstorage pods # - Example (cluster): 8Gi × 3 = 24Gi total VictoriaMetrics storage # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" @@ -149,11 +126,43 @@ victoria_configurations: # Default: 168 (7 days) retention_period: 168 +# ============================================================================ +# VICTORIALOGS CONFIGURATION +# ============================================================================ +# VictoriaLogs provides centralized log storage and querying (cluster mode only). +# Deployed alongside VictoriaMetrics when 'victoria' is in telemetry_collection_type. +# +# DEPLOYMENT: +# - Always cluster mode (vlstorage, vlinsert, vlselect, VLAgent) +# - Co-deployed with VictoriaMetrics — same deployment gate +# - Shares TLS infrastructure with VictoriaMetrics +# +# STORAGE REQUIREMENTS: +# ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ +# │ Component │ Per-Pod Storage │ Number of Pods │ Total Storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ vlstorage │ storage_size │ 3 pods │ 3× storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ VLAgent buffer │ 5Gi (fixed) │ 1 pod │ 5Gi │ +# └─────────────────┴──────────────────┴─────────────────┴──────────────────┘ +# Example: 8Gi × 3 vlstorage = 24Gi + 5Gi VLAgent = 29Gi total +victoria_logs_configurations: + # Storage size per vlstorage replica PVC + # IMPORTANT: Total VictoriaLogs storage = storage_size × 3 vlstorage pods + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: 8Gi (results in 24Gi total storage) + storage_size: "8Gi" + + # Log retention period, in hours + # Logs older than this period are automatically purged by vlstorage. 
+ # Default: 168 (7 days) + retention_period: 168 + # ============================================================================ # KAFKA CONFIGURATION # ============================================================================ # Apache Kafka is a distributed streaming platform for storing telemetry data. -# Used for iDRAC telemetry when 'kafka' is enabled in idrac_telemetry_collection_type. +# Used for iDRAC telemetry when 'kafka' is enabled in telemetry_collection_type. # Also used for LDMS telemetry when LDMS software is configured. # # NOTE: Kafka topics are auto-generated based on enabled features: @@ -281,3 +290,97 @@ ldms_sampler_configurations: - plugin_name: procnetdev2 config_parameters: "" # Monitor all interfaces activation_parameters: "interval=30000000 offset=0" # interval=30000000 microseconds (30 seconds), offset=0 + +# ============================================================================ +# POWERSCALE TELEMETRY CONFIGURATION +# ============================================================================ +# PowerScale telemetry collects storage metrics from Dell PowerScale (OneFS) +# clusters using the CSM (Container Storage Modules) Metrics PowerScale exporter. +# +# DATA PIPELINE: +# CSM Metrics PowerScale → OTEL Collector → vmagent-powerscale → VictoriaMetrics +# +# Metrics collected: capacity, performance, topology, and quota metrics via +# OneFS REST API. Metrics are processed through an OpenTelemetry Collector +# and ingested into the shared VictoriaMetrics cluster (same as iDRAC telemetry). 
+# +# AUTHENTICATION MODES (auto-detected from CSM Observability values.yaml): +# - Direct Authentication: CSM Metrics connects directly to PowerScale using +# credentials from the CSI driver secret.yaml (isilon-creds) +# - Karavi Authorization: CSM Metrics connects via Karavi Authorization Proxy +# sidecar with token-based authentication +# +# PREREQUISITES: +# - csi_driver_powerscale must be configured in software_config.json +# - Service cluster must be defined in functional_groups_config.yml +# - VictoriaMetrics must be enabled: 'victoria' must be included in +# telemetry_collection_type (e.g., "victoria" or "victoria,kafka") +# - CSI driver secret.yaml with valid isilonClusters credentials +# - CSM Observability values.yaml must be provided +# +# STORAGE REQUIREMENTS: +# ┌─────────────────────┬──────────────┬──────────────┬──────────────────┐ +# │ Component │ Per-Pod PVC │ Pods │ Total Storage │ +# ├─────────────────────┼──────────────┼──────────────┼──────────────────┤ +# │ OTEL Collector │ 5Gi │ 1 per cluster│ 5Gi per cluster │ +# └─────────────────────┴──────────────┴──────────────┴──────────────────┘ + +powerscale_configurations: + # Enable or disable PowerScale telemetry support + # Accepted values: true or false + # Default: true + powerscale_telemetry_support: true + + # Enable or disable PowerScale log collection (syslog → VictoriaLogs) + # Requires powerscale_telemetry_support: true + # Accepted values: true or false + # Default: false + powerscale_log_enabled: false + + # PVC size for OTEL Collector metric batching and buffering. + # Adjust based on cluster scale and metric volume. + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: "5Gi" + otel_collector_storage_size: "5Gi" + + # Path to the CSM Observability (Karavi Observability) values.yaml file. + # This file provides: + # - Container image versions for CSM Metrics, OTEL Collector, and Karavi sidecar + # - Authentication settings (Direct vs. 
Karavi Authorization) + # - Poll frequencies for each metric domain + # - ISI client options for PowerScale OneFS API + # + # At minimum, configure the karaviMetricsPowerscale section (set enabled: true, + # disable other storage backends like powerflex/powerstore/powermax): + # karaviMetricsPowerscale: + # enabled: true + # ... + # + # AUTHENTICATION MODE (configured in the Helm values file): + # Mode A - Direct Authentication (default): + # karaviMetricsPowerscale.authorization.enabled: false (or omit) + # Only the isilon-creds secret is copied to the telemetry namespace. + # + # Mode B - Karavi Authorization: + # karaviMetricsPowerscale.authorization.enabled: true + # karaviMetricsPowerscale.authorization.proxyHost: "<csm-authorization-proxy-hostname>" + # Requires the CSI driver to be deployed with CSM Authorization enabled. + # Additional resources are automatically copied to the telemetry namespace: + # - isilon-config-params ConfigMap + # - isilon-proxy-server-root-certificate Secret + # - isilon-proxy-authz-tokens Secret + # + # Required when powerscale_configurations.powerscale_telemetry_support: true + # Reference: https://github.com/dell/helm-charts/blob/main/charts/karavi-observability/values.yaml + csm_observability_values_file_path: "" + + # Additional VictoriaMetrics remote_write endpoints (optional) + # vmagent will write the same PowerScale metrics to ALL configured endpoints. + # Each endpoint receives an identical copy of all metrics. + # The primary Omnia VictoriaMetrics endpoint (vminsert) is always included automatically. 
+ # Default: [] (empty — only the primary Omnia VictoriaMetrics endpoint is used) + # Example: + # additional_remote_write_endpoints: + # - url: "https://external-victoria.example.com:8480/insert/0/prometheus/api/v1/write" + # tls_insecure_skip_verify: true + additional_remote_write_endpoints: [] diff --git a/omnia.sh b/omnia.sh index 85736f9427..c4290b922f 100755 --- a/omnia.sh +++ b/omnia.sh @@ -105,7 +105,7 @@ get_metadata_version() { fi } -omnia_release=2.1.0.0 +omnia_release=2.2.0.0 core_container_status=false omnia_path="" @@ -128,7 +128,7 @@ is_local_ip() { } # Version configuration variables -OMNIA_CORE_CONTAINER_TAG="2.1" # Default container tag +OMNIA_CORE_CONTAINER_TAG="2.2" # Default container tag OMNIA_VERSION="" # Will be read from metadata TARGET_OMNIA_VERSION="" # Target version for upgrade TARGET_CONTAINER_TAG="" # Target container tag for upgrade @@ -136,10 +136,8 @@ TARGET_CONTAINER_TAG="" # Target container tag for upgrade # Centralized version list (in chronological order) # Note: Include RC milestones so upgrades from RC to RC/GA appear ALL_OMNIA_VERSIONS=( - "2.0.0.0" - "2.1.0.0-rc1" - "2.1.0.0-rc2" "2.1.0.0" + "2.2.0.0" ) # Container-side paths (used inside podman exec commands) @@ -315,13 +313,13 @@ validate_container_image() { echo -e "${BLUE}Build the required image using the following commands:${NC}" echo "" echo -e "git clone https://github.com/dell/omnia-artifactory.git -b omnia-container-" - echo -e "${YELLOW}Note: Replace with the target Omnia version (e.g., v2.1.0.0)${NC}" + echo -e "${YELLOW}Note: Replace with the target Omnia version (e.g., v2.2.0.0)${NC}" echo "" echo -e "cd omnia-artifactory" echo "" echo -e "./build_images.sh core core_tag= omnia_branch=" - echo -e "${YELLOW}Note: Replace with the target Omnia branch (e.g., v2.1.0.0)${NC}" - echo -e "${YELLOW}Note: core_tag will be the first 2 digits of the target Omnia version (e.g., 2.1 for v2.1.0.0)${NC}" + echo -e "${YELLOW}Note: Replace with the target Omnia branch (e.g., 
v2.2.0.0)${NC}" + echo -e "${YELLOW}Note: core_tag will be the first 2 digits of the target Omnia version (e.g., 2.2 for v2.2.0.0)${NC}" echo "" echo -e "${BLUE}After the image is built successfully, re-run:${NC}" echo -e "./omnia.sh --$operation" @@ -338,18 +336,15 @@ validate_container_image() { get_container_tag_from_version() { local version="$1" - # Explicit mapping: 2.1.0.0-rc1 stays on pre-GA tag 1.0 - if [[ "$version" == "2.1.0.0-rc1" ]]; then - echo "1.0" - return - fi - case "$version" in - 2.0.*) - echo "1.0" + 2.1.*) + echo "2.1" + ;; + 2.2.*) + echo "2.2" ;; *) - # All other versions (including rc2/GA) use major.minor as tag + # All other versions use major.minor as tag echo "$(echo "$version" | awk -F. '{print $1"."$2}')" ;; esac @@ -1167,7 +1162,7 @@ Description=${container_name^} Container [Container] ContainerName=${container_name} HostName=${container_name} -Image=${container_name}:2.1 +Image=${container_name}:${OMNIA_CORE_CONTAINER_TAG} Network=host # Capabilities @@ -1384,19 +1379,19 @@ show_help() { } install_omnia_core() { - # Detect existing Omnia 2.0 installation + # Check for existing installation if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then # Read version from metadata inside container current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') - if [ "$current_version" = "2.0.0.0" ]; then - echo -e "${RED}ERROR: Existing Omnia 2.0 installation detected.${NC}" + if [ "$current_version" = "2.1.0.0" ]; then + echo -e "${RED}ERROR: Existing Omnia 2.1 installation detected.${NC}" echo -e "${YELLOW}To upgrade, run: $0 --upgrade${NC}" echo -e "${YELLOW}For a fresh install, first run: $0 --uninstall${NC}" exit 1 fi fi - local omnia_core_tag="2.1" + local omnia_core_tag="$OMNIA_CORE_CONTAINER_TAG" local omnia_core_registry="" # Check if local omnia_core image exists using validate function diff --git 
a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 2d7db2ca85..9f6254a0b3 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -17,7 +17,7 @@ openchami_git_repo: https://github.com/OpenCHAMI/deployment-recipes.git openchami_share_dir: /opt/omnia/openchami openchami_clone_path: "{{ openchami_share_dir }}/deployment-recipes" -openchami_git_version: main +openchami_git_version: bf1f6dfdc7f6107a4227568987faedb1b79b95fa clone_retry: "5" clone_delay: "10" dir_permissions_755: "0755" @@ -43,16 +43,16 @@ pull_image_delay: 10 # OpenCHAMI image tags openchami_local_ca_tag: "v0.2.2" openchami_opaal_tag: "v0.3.10" -openchami_smd_tag: "v2.18.0" -openchami_bss_tag: "v1.32.0" -openchami_cloud_init_tag: "v1.2.3" -openchami_coredhcp_tag: "v0.3.0" +openchami_smd_tag: "v2.19.0" +openchami_bss_tag: "v1.32.1" +openchami_cloud_init_tag: "v1.3.0" +openchami_coresmd_tag: "v0.4.0" # Third-party image tags for OpenCHAMI -minio_tag: "latest" +minio_release_tag: "RELEASE.2026-04-17T00-00-00Z" postgres_tag: "11.5-alpine" hydra_tag: "v2.3" haproxy_tag: "latest" -registry_tag: "latest" +registry_tag: "3.1.0" curl_tag: "latest" acme_tag: "3.1.1" @@ -63,8 +63,8 @@ openchami_images: - "ghcr.io/openchami/smd:{{ openchami_smd_tag }}" - "ghcr.io/openchami/bss:{{ openchami_bss_tag }}" - "ghcr.io/openchami/cloud-init:{{ openchami_cloud_init_tag }}" - - "ghcr.io/openchami/coredhcp:{{ openchami_coredhcp_tag }}" - - "docker.io/minio/minio:{{ minio_tag }}" + - "ghcr.io/openchami/coresmd:{{ openchami_coresmd_tag }}" + - "docker.io/pgsty/minio:{{ minio_release_tag }}" - "docker.io/library/postgres:{{ postgres_tag }}" - "docker.io/oryd/hydra:{{ hydra_tag }}" - "cgr.dev/chainguard/haproxy:{{ haproxy_tag }}" diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-default_x86_64.yaml.j2 
b/provision/roles/configure_ochami/templates/cloud_init/ci-group-default_x86_64.yaml.j2 new file mode 100644 index 0000000000..383c3f3506 --- /dev/null +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-default_x86_64.yaml.j2 @@ -0,0 +1,18 @@ +- name: {{ functional_group_name }} + description: "{{ functional_group_name }} config" + file: + encoding: plain + content: | + ## template: jinja + #cloud-config + merge_how: + - name: list + settings: [append] + - name: dict + settings: [no_replace, recurse_list] + users: + - name: root + ssh_authorized_keys: "{{ read_ssh_key.stdout }}" + lock_passwd: false + hashed_passwd: "{{ hashed_password_output.stdout }}" + disable_root: false diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index fe6966c4be..303baf5743 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -289,7 +289,8 @@ - /root/ldms_sampler.sh {% endif %} - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 1ee1fce5e1..fbf39d348c 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ 
b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -291,7 +291,8 @@ - /root/ldms_sampler.sh {% endif %} - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index cdea0cd340..406a50a5a0 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -145,7 +145,8 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index b744859381..50f85187b1 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -147,7 +147,8 @@ - mount -a - cp 
/cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index b98df53d7d..7eb3c72cc1 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -424,7 +424,8 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service @@ -1041,6 +1042,10 @@ systemctl restart nfs-client.target systemctl restart rpcbind +{% if hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool %} +{% include 'powerscale/deploy_powerscale_telemetry.sh.j2' %} +{% endif %} + {% if hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support'] %} echo "Applying Telemetry Kubernetes deployments" /root/telemetry.sh diff 
--git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index 922f63f852..71f8be3033 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -332,7 +332,8 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index df98035baa..e363187b58 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -234,7 +234,8 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service diff --git 
a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index d5f9ef9ba6..d72541d774 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -494,7 +494,8 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh {% if powervault_config is defined %} - /usr/local/bin/setup_iscsi_storage.sh {% endif %} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 49e5322195..145f79190d 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -560,7 +560,8 @@ - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index ccffc5cd9e..65ef5a8b0c 
100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -565,8 +565,9 @@ - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh @@ -623,4 +624,4 @@ - /usr/local/bin/export_nvhpc_env.sh - systemctl restart slurmd - - echo "Cloud-Init has completed successfully." \ No newline at end of file + - echo "Cloud-Init has completed successfully." diff --git a/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 index 249b90b6a5..43a000c561 100644 --- a/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 +++ b/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 @@ -7,34 +7,54 @@ if ! lspci | grep -i 'mellanox'; then exit 0 fi +# Ensure IPoIB + Mellanox IB kernel modules are loaded before interface detection. +# This avoids boot-time races where the IB device exists (lspci) but no ib* link is present yet. +modprobe mlx5_ib || true +modprobe ib_ipoib || true +modprobe ib_umad || true +modprobe ib_uverbs || true + ADMIN_NIC_IP="{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}" NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" -IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" - -ip_to_int() { - local IFS=. 
- read -r a b c d <<< "$1" - echo $(( (a << 24) + (b << 16) + (c << 8) + d )) -} +declare -A IB_IP_MAP=( +{% for mac, node in hostvars['localhost']['read_mapping_file']['dict'].items() -%} +{% if node.IB_IP is defined and node.IB_IP | trim | length > 0 %} + ["{{ node.ADMIN_IP }}"]="{{ node.IB_IP }}" +{%- endif %} +{%- endfor %} +) + +IB_IP="${IB_IP_MAP[$ADMIN_NIC_IP]:-}" + +if [ -n "$IB_IP" ]; then + echo "Using explicit IB IP : $IB_IP/$NETMASK_BITS" +else + IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" -int_to_ip() { - local ip=$1 - echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" -} + ip_to_int() { + local IFS=. + read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } -ADMIN_IP_INT=$(ip_to_int "$ADMIN_NIC_IP") -IB_NET_INT=$(ip_to_int "$IB_NETWORK_SUBNET") + ADMIN_IP_INT=$(ip_to_int "$ADMIN_NIC_IP") + IB_NET_INT=$(ip_to_int "$IB_NETWORK_SUBNET") -HOST_BITS=$(( 32 - NETMASK_BITS )) -HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) -HOST_OFFSET=$(( ADMIN_IP_INT & HOST_MASK )) -IB_IP_INT=$(( IB_NET_INT + HOST_OFFSET )) + HOST_OFFSET=$(( ADMIN_IP_INT & HOST_MASK )) + IB_IP_INT=$(( IB_NET_INT + HOST_OFFSET )) -IB_IP=$(int_to_ip "$IB_IP_INT") + IB_IP=$(int_to_ip "$IB_IP_INT") -echo "Derived IB IP : $IB_IP/$NETMASK_BITS" + echo "Derived IB IP : $IB_IP/$NETMASK_BITS" +fi MAX_WAIT=120 # total wait time in seconds (2 minutes) INTERVAL=10 # check every 10 seconds diff --git a/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 new file mode 100644 index 0000000000..d1017bc76c --- /dev/null +++ 
b/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 @@ -0,0 +1,178 @@ +{# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#} +{# PowerScale Telemetry - CSM Observability Helm Deployment (cloud-init fragment) + This template is included by ci-group-service_kube_control_plane_first_x86_64.yaml.j2 + when powerscale_telemetry_support is enabled. +#} +{% set csm_ns = hostvars['localhost']['csm_observability_namespace'] | default('telemetry') %} + # ===== PowerScale Telemetry - CSM Observability Helm Deployment ===== + echo "===== Starting PowerScale Telemetry (CSM Observability) deployment =====" + PS_TEL_FAILED=0 + CSM_NS="{{ csm_ns }}" + + # Step 1: Ensure namespace exists (shared with iDRAC telemetry) + echo "Ensuring ${CSM_NS} namespace exists..." + if kubectl get namespace "${CSM_NS}" >/dev/null 2>&1; then + echo "${CSM_NS} namespace already exists." + else + kubectl create namespace "${CSM_NS}" || { + echo "ERROR: Failed to create ${CSM_NS} namespace." + PS_TEL_FAILED=1 + } + fi + + # Step 2: Apply cert-manager CRDs (required before Helm install per official guide) + if [ "$PS_TEL_FAILED" -eq 0 ]; then + HELM_CHART_PATH="{{ k8s_client_mount_path }}/karavi-observability/helm-charts/charts/karavi-observability" + CRDS_FILE="${HELM_CHART_PATH}/crds/cert-manager.crds.yaml" + if [ -f "$CRDS_FILE" ]; then + echo "Applying cert-manager CRDs from chart crds/ directory..." 
+ kubectl apply --validate=false -f "$CRDS_FILE" || { + echo "WARNING: Failed to apply cert-manager CRDs from crds/ directory." + } + else + echo "No cert-manager CRDs file found at ${CRDS_FILE}, Helm will handle CRDs." + fi + fi + + # Step 3: Copy isilon-creds secret from CSI driver namespace to ${CSM_NS} namespace + if [ "$PS_TEL_FAILED" -eq 0 ]; then + echo "Copying isilon-creds secret to ${CSM_NS} namespace..." + kubectl delete secret isilon-creds -n "${CSM_NS}" --ignore-not-found=true 2>/dev/null + kubectl get secret isilon-creds -n isilon -o json \ + | jq 'del(.metadata.resourceVersion,.metadata.uid,.metadata.creationTimestamp,.metadata.annotations,.metadata.managedFields)' \ + | jq ".metadata.namespace = \"${CSM_NS}\"" \ + | kubectl create -f - || { + echo "ERROR: Failed to copy isilon-creds secret to ${CSM_NS} namespace." + PS_TEL_FAILED=1 + } + fi + + # Step 4: Copy Karavi Authorization resources (if authorization enabled in Helm values) + if [ "$PS_TEL_FAILED" -eq 0 ]; then + HELM_VALUES_FILE="{{ k8s_client_mount_path }}/karavi-observability/csm_metrics_values.yaml" + KARAVI_AUTH_ENABLED="false" + if [ -f "$HELM_VALUES_FILE" ]; then + # Parse authorization.enabled from the Helm values YAML + KARAVI_AUTH_ENABLED=$(grep -A5 'authorization:' "$HELM_VALUES_FILE" | grep 'enabled:' | head -1 | awk '{print $2}' | tr -d ' "') + fi + if [ "$KARAVI_AUTH_ENABLED" = "true" ]; then + echo "Karavi Authorization enabled (Mode B) - copying authorization resources..." + kubectl get configmap isilon-config-params -n isilon -o yaml \ + | sed "s/namespace: isilon/namespace: ${CSM_NS}/" \ + | kubectl apply -f - || echo "WARNING: Failed to copy isilon-config-params." 
+ + kubectl get secret proxy-server-root-certificate proxy-authz-tokens -n isilon -o yaml \ + | sed "s/namespace: isilon/namespace: ${CSM_NS}/" \ + | sed 's/name: proxy-server-root-certificate/name: isilon-proxy-server-root-certificate/' \ + | sed 's/name: proxy-authz-tokens/name: isilon-proxy-authz-tokens/' \ + | kubectl apply -f - || echo "WARNING: Failed to copy proxy secrets." + else + echo "Direct Authentication (Mode A) - skipping Karavi authorization resources." + fi + fi + + # Step 5: Install karavi-observability Helm chart directly from NFS share + if [ "$PS_TEL_FAILED" -eq 0 ]; then + HELM_CHART_PATH="{{ k8s_client_mount_path }}/karavi-observability/helm-charts/charts/karavi-observability" + HELM_VALUES_FILE="{{ k8s_client_mount_path }}/karavi-observability/csm_metrics_values.yaml" + + # Verify cert-manager subchart is available (directory or archive) + if [ -d "${HELM_CHART_PATH}/charts/cert-manager" ]; then + echo "cert-manager subchart found as directory in charts/." + elif ls "${HELM_CHART_PATH}/charts/cert-manager"*.tgz 1>/dev/null 2>&1 || \ + ls "${HELM_CHART_PATH}/charts/cert-manager"*.tar.gz 1>/dev/null 2>&1; then + echo "cert-manager subchart found as archive in charts/." + else + echo "WARNING: cert-manager subchart not found in ${HELM_CHART_PATH}/charts/." + fi + + if [ -d "$HELM_CHART_PATH" ] && [ -f "$HELM_VALUES_FILE" ]; then + echo "Installing karavi-observability Helm chart from NFS share..." + if helm list -n "${CSM_NS}" --filter karavi-observability -q 2>/dev/null | grep -q karavi-observability; then + echo "Upgrading existing karavi-observability release..." + helm upgrade karavi-observability "$HELM_CHART_PATH" \ + -n "${CSM_NS}" \ + -f "$HELM_VALUES_FILE" \ + --wait --timeout 10m || { + echo "ERROR: Helm upgrade failed." + PS_TEL_FAILED=1 + } + else + echo "Fresh install of karavi-observability..." 
+ helm install karavi-observability "$HELM_CHART_PATH" \ + -n "${CSM_NS}" \ + -f "$HELM_VALUES_FILE" \ + --wait --timeout 10m || { + echo "ERROR: Helm install failed." + PS_TEL_FAILED=1 + } + fi + else + echo "ERROR: Helm chart or values file not found on NFS share." + echo " Chart path: $HELM_CHART_PATH" + echo " Values file: $HELM_VALUES_FILE" + PS_TEL_FAILED=1 + fi + fi + + if [ "$PS_TEL_FAILED" -eq 0 ]; then + echo "===== PowerScale Telemetry (CSM Observability) deployed successfully =====" + + # Step 6: Patch OTEL Collector service to expose Prometheus metrics port + echo "Patching OTEL Collector service to expose port 8889 for Prometheus metrics..." + kubectl patch svc otel-collector -n "${CSM_NS}" --patch '{"spec":{"ports":[{"name":"prometheus","port":8889,"targetPort":8889,"protocol":"TCP"}]}}' || { + echo "WARNING: Failed to patch OTEL Collector service for Prometheus metrics." + } + + # Step 7: Create PVC for OTEL Collector persistent buffering + OTEL_PVC_SIZE="{{ hostvars['localhost']['telemetry_config']['powerscale_configurations']['otel_collector_storage_size'] | default('5Gi') }}" + echo "Creating OTEL Collector PVC (${OTEL_PVC_SIZE}) for metric buffering..." 
+ OTEL_PVC_FILE=$(mktemp /tmp/otel-pvc-XXXXXX.yaml) + echo "apiVersion: v1" > "$OTEL_PVC_FILE" + echo "kind: PersistentVolumeClaim" >> "$OTEL_PVC_FILE" + echo "metadata:" >> "$OTEL_PVC_FILE" + echo " name: otel-collector-data" >> "$OTEL_PVC_FILE" + echo " namespace: ${CSM_NS}" >> "$OTEL_PVC_FILE" + echo " labels:" >> "$OTEL_PVC_FILE" + echo " app.kubernetes.io/name: otel-collector" >> "$OTEL_PVC_FILE" + echo " app.kubernetes.io/instance: karavi-observability" >> "$OTEL_PVC_FILE" + echo "spec:" >> "$OTEL_PVC_FILE" + echo " accessModes:" >> "$OTEL_PVC_FILE" + echo " - ReadWriteOnce" >> "$OTEL_PVC_FILE" + echo " resources:" >> "$OTEL_PVC_FILE" + echo " requests:" >> "$OTEL_PVC_FILE" + echo " storage: ${OTEL_PVC_SIZE}" >> "$OTEL_PVC_FILE" + kubectl apply -f "$OTEL_PVC_FILE" + rm -f "$OTEL_PVC_FILE" + + echo "Waiting for OTEL Collector PVC to be bound..." + kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/otel-collector-data -n "${CSM_NS}" --timeout=120s || { + echo "WARNING: OTEL Collector PVC not bound yet. Continuing..." + } + + # Step 8: Patch OTEL Collector deployment to mount the PVC + echo "Patching OTEL Collector deployment with persistent volume..." + kubectl patch deployment otel-collector -n "${CSM_NS}" --type='json' -p='[{"op":"add","path":"/spec/template/spec/volumes/-","value":{"name":"otel-collector-data","persistentVolumeClaim":{"claimName":"otel-collector-data"}}},{"op":"add","path":"/spec/template/spec/containers/0/volumeMounts/-","value":{"name":"otel-collector-data","mountPath":"/data"}}]' || { + echo "WARNING: Failed to patch OTEL Collector with PVC. Metrics will use in-memory only." + } + + echo "Waiting for OTEL Collector rollout..." + kubectl rollout status deployment/otel-collector -n "${CSM_NS}" --timeout=120s || { + echo "WARNING: OTEL Collector rollout not complete yet." 
+ }
+ else
+ echo "===== PowerScale Telemetry deployment had errors (see above) ====="
+ fi
diff --git a/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_telemetry.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_telemetry.sh.j2
new file mode 100644
index 0000000000..e6f3412462
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_telemetry.sh.j2
@@ -0,0 +1,188 @@
+#!/bin/bash
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# PowerScale Telemetry Verification Script
+# Generated by Omnia provision playbook
+# Validates Helm-based CSM Observability (karavi-observability) deployment
+
+NAMESPACE="{{ telemetry_namespace }}"
+HELM_RELEASE="karavi-observability"
+PASS=0  # result counters; this script does not use "set -e", so ((PASS++)) returning non-zero at 0 is harmless
+FAIL=0
+WARN=0
+
+echo "=============================================="
+echo " PowerScale Telemetry Verification"
+echo "=============================================="
+echo ""
+
+# 1. Check Kubernetes connectivity
+echo "[1/9] Checking Kubernetes connectivity..."
+if kubectl cluster-info &>/dev/null; then
+    echo " PASS: Kubernetes cluster is reachable"
+    ((PASS++))
+else
+    echo " FAIL: Cannot connect to Kubernetes cluster"
+    ((FAIL++))
+fi
+
+# 2. Check namespace
+echo "[2/9] Checking telemetry namespace..."
+if kubectl get namespace "$NAMESPACE" &>/dev/null; then
+    echo " PASS: Namespace '$NAMESPACE' exists"
+    ((PASS++))
+else
+    echo " FAIL: Namespace '$NAMESPACE' does not exist"
+    ((FAIL++))
+fi
+
+# 3. Check Helm release
+echo "[3/9] Checking Helm release..."
+HELM_STATUS=$(helm status "$HELM_RELEASE" -n "$NAMESPACE" -o json 2>/dev/null | grep -o '"status":"[^"]*"' | head -1)
+if echo "$HELM_STATUS" | grep -q "deployed"; then
+    echo " PASS: Helm release '$HELM_RELEASE' is deployed"
+    ((PASS++))
+else
+    echo " FAIL: Helm release '$HELM_RELEASE' not found or not deployed"
+    ((FAIL++))
+fi
+
+# 4. Check PowerScale CSI secret (copied from isilon namespace)
+echo "[4/9] Checking PowerScale credentials secret..."
+if kubectl get secret isilon-creds -n "$NAMESPACE" &>/dev/null; then
+    echo " PASS: Secret 'isilon-creds' exists in '$NAMESPACE'"
+    ((PASS++))
+else
+    echo " FAIL: Secret 'isilon-creds' not found in namespace '$NAMESPACE'"
+    ((FAIL++))
+fi
+
+# 5. Check CSM Metrics PowerScale deployment (Helm-managed)
+echo "[5/9] Checking CSM Metrics PowerScale deployment..."
+CSM_DEPLOY=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/name=karavi-metrics-powerscale -o name 2>/dev/null)
+if [ -n "$CSM_DEPLOY" ]; then
+    for dep in $CSM_DEPLOY; do
+        DEP_NAME=$(echo "$dep" | sed 's|deployment.apps/||')
+        READY=$(kubectl get "$dep" -n "$NAMESPACE" -o jsonpath='{.status.readyReplicas}' 2>/dev/null)
+        EXPECTED=$(kubectl get "$dep" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}' 2>/dev/null)
+        if [ "${READY:-0}" == "$EXPECTED" ]; then
+            echo " PASS: $DEP_NAME is ready ($READY/$EXPECTED)"
+            ((PASS++))
+        else
+            echo " FAIL: $DEP_NAME is not ready (${READY:-0}/$EXPECTED)"
+            ((FAIL++))
+        fi
+    done
+else
+    echo " FAIL: No CSM Metrics PowerScale deployment found"
+    ((FAIL++))
+fi
+
+# 6. Check OTEL Collector deployment (Helm-managed)
+echo "[6/9] Checking OTEL Collector deployment..."
+OTEL_DEPLOY=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/name=otel-collector -o name 2>/dev/null)
+if [ -n "$OTEL_DEPLOY" ]; then
+    for dep in $OTEL_DEPLOY; do
+        DEP_NAME=$(echo "$dep" | sed 's|deployment.apps/||')
+        READY=$(kubectl get "$dep" -n "$NAMESPACE" -o jsonpath='{.status.readyReplicas}' 2>/dev/null)
+        EXPECTED=$(kubectl get "$dep" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}' 2>/dev/null)
+        if [ "${READY:-0}" == "$EXPECTED" ]; then
+            echo " PASS: $DEP_NAME is ready ($READY/$EXPECTED)"
+            ((PASS++))
+        else
+            echo " FAIL: $DEP_NAME is not ready (${READY:-0}/$EXPECTED)"
+            ((FAIL++))
+        fi
+    done
+else
+    echo " FAIL: No OTEL Collector deployment found"
+    ((FAIL++))
+fi
+
+# 7. Check cert-manager pods (Helm sub-chart)
+echo "[7/9] Checking cert-manager pods..."
+CM_PODS=$(kubectl get pods -n "$NAMESPACE" -l app=cert-manager --no-headers 2>/dev/null | wc -l)
+if [ "$CM_PODS" -ge 1 ]; then
+    # A pod counts as ready only when it is Running and its READY column reads n/n; PASS requires every pod to be ready.
+    CM_READY=$(kubectl get pods -n "$NAMESPACE" -l app=cert-manager --no-headers 2>/dev/null | awk '$3=="Running" && split($2, r, "/")==2 && r[1]==r[2]' | wc -l)
+    if [ "$CM_READY" -eq "$CM_PODS" ]; then
+        echo " PASS: cert-manager pods running ($CM_READY/$CM_PODS)"
+        ((PASS++))
+    else
+        echo " FAIL: cert-manager pods not ready ($CM_READY/$CM_PODS)"
+        ((FAIL++))
+    fi
+else
+    echo " WARN: No cert-manager pods found (may be disabled in values)"
+    ((WARN++))
+fi
+
+# 8. Check OTEL Collector PVC
+echo "[8/9] Checking OTEL Collector PVC..."
+PVC_STATUS=$(kubectl get pvc otel-collector-data -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null)
+if [ "$PVC_STATUS" == "Bound" ]; then
+    PVC_SC=$(kubectl get pvc otel-collector-data -n "$NAMESPACE" -o jsonpath='{.spec.storageClassName}' 2>/dev/null)
+    PVC_SIZE=$(kubectl get pvc otel-collector-data -n "$NAMESPACE" -o jsonpath='{.spec.resources.requests.storage}' 2>/dev/null)
+    echo " PASS: PVC 'otel-collector-data' is Bound (${PVC_SIZE}, StorageClass: ${PVC_SC:-default})"
+    ((PASS++))
+elif [ -n "$PVC_STATUS" ]; then
+    echo " WARN: PVC 'otel-collector-data' status: $PVC_STATUS (not yet Bound)"
+    ((WARN++))
+else
+    echo " WARN: PVC 'otel-collector-data' not found (OTEL Collector using in-memory only)"
+    ((WARN++))
+fi
+
+# 9. Check metrics flow
+echo "[9/9] Checking metrics flow..."
+METRICS_LOG=$(kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/name=karavi-metrics-powerscale --tail=5 2>/dev/null | grep -c "function duration")
+OTEL_LOG=$(kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/name=otel-collector --all-containers --tail=5 2>/dev/null | grep -c "Metrics")
+if [ "$METRICS_LOG" -gt 0 ] && [ "$OTEL_LOG" -gt 0 ]; then
+    echo " PASS: Metrics flowing (CSM Metrics -> OTEL Collector)"
+    ((PASS++))
+elif [ "$METRICS_LOG" -gt 0 ]; then
+    echo " WARN: CSM Metrics collecting but OTEL Collector not logging metrics"
+    ((WARN++))
+else
+    echo " FAIL: No metrics activity detected"
+    ((FAIL++))
+fi
+
+# Summary
+echo ""
+echo "=============================================="
+echo " Verification Summary"
+echo "=============================================="
+echo " PASSED: $PASS"
+echo " FAILED: $FAIL"
+echo " WARNINGS: $WARN"
+echo ""
+
+if [ $FAIL -eq 0 ]; then
+    echo " STATUS: ALL CHECKS PASSED"
+else
+    echo " STATUS: SOME CHECKS FAILED"
+    echo ""
+    echo " Debug Commands:"
+    echo " kubectl get pods -n $NAMESPACE -l app.kubernetes.io/instance=$HELM_RELEASE"
+    echo " kubectl logs -n $NAMESPACE -l app.kubernetes.io/name=karavi-metrics-powerscale --tail=50"
+    echo " kubectl logs -n $NAMESPACE -l app.kubernetes.io/name=otel-collector --all-containers --tail=50"
+    echo " helm status $HELM_RELEASE -n $NAMESPACE"
+    echo " kubectl get events -n $NAMESPACE --sort-by='.lastTimestamp' | tail -20"
+fi
+
+echo ""
+echo "=============================================="
+exit $FAIL  # exit status is the number of failed checks (0 means success)
diff --git a/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 b/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2
index fd3ccbacfb..352671ad4c 100644
--- a/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2
+++ b/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2
@@ -2,9 +2,29 @@ kubectl apply -f {{ k8s_client_mount_path }}/telemetry/deployments/telemetry_nam
 {% if kafka_support %}
 helm -n telemetry install strimzi-cluster-operator {{ k8s_client_mount_path }}/telemetry/{{ strimzi_kafka_pkg }}.tar.gz
 {% endif %}
+{% if 'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',') %}
+helm -n telemetry install victoria-metrics-operator {{ k8s_client_mount_path }}/telemetry/{{ victoria_operator_pkg }}.tar.gz
+echo "Waiting for victoria-metrics-operator to be ready..."
+kubectl wait --for=condition=available --timeout=300s deployment -l app.kubernetes.io/name=victoria-metrics-operator -n telemetry || true
+echo "Waiting for VictoriaLogs CRDs to be registered..."
+CRDS_READY=false  # set to true once both VictoriaLogs CRDs are visible
+for i in {1..30}; do
+  if kubectl get crd vlclusters.operator.victoriametrics.com >/dev/null 2>&1 && \
+     kubectl get crd vlagents.operator.victoriametrics.com >/dev/null 2>&1; then
+    echo "VictoriaLogs CRDs are ready"
+    CRDS_READY=true
+    break
+  fi
+  echo "Waiting for VictoriaLogs CRDs... (attempt $i/30)"
+  sleep 2
+done
+# The wait stays best-effort (the script continues), but a timeout is reported so a later apply failure can be traced to it.
+if [ "$CRDS_READY" != true ]; then
+  echo "WARNING: VictoriaLogs CRDs were not registered after 30 attempts; VictoriaLogs resources may fail to apply" >&2
+fi
+{% endif %}
 kubectl apply -k {{ k8s_client_mount_path }}/telemetry/deployments/.
 {% if hostvars['localhost']['ldms_support'] %}
 kubectl create secret generic nersc-ldms-ovis-auth --from-file=ldmsauth.conf={{ k8s_client_mount_path }}/telemetry/ldms/ldmsauth.conf --dry-run=client -o yaml | kubectl apply -f - -n telemetry
 kubectl create secret generic nersc-munge-key --from-file=munge.key={{ k8s_client_mount_path }}/telemetry/ldms/munge.key --dry-run=client -o yaml | kubectl apply -f - -n telemetry
 cd {{ k8s_client_mount_path }}/telemetry/ldms/nersc-ldms-aggr && helm install -n telemetry nersc-ldms-aggr nersc-ldms-aggr --values values.yaml
-{% endif %}
\ No newline at end of file
+{% endif %}
diff --git a/provision/roles/telemetry/tasks/check_kube_vip_reachability.yml b/provision/roles/telemetry/tasks/check_kube_vip_reachability.yml
new file mode 100644
index 0000000000..015150abc6
--- /dev/null
+++ b/provision/roles/telemetry/tasks/check_kube_vip_reachability.yml
@@ -0,0 +1,52 @@
+# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check kube_vip reachability  # sets kube_vip_reachable (true/false) from a TCP probe of port 22
+  when:
+    - kube_vip is defined
+    - kube_vip | length > 0
+  tags: telemetry_deployment
+  block:
+    - name: Set kube_vip reachability fact to false initially
+      ansible.builtin.set_fact:
+        kube_vip_reachable: false  # default; flipped to true only after a successful probe below
+
+    - name: Test SSH connectivity to kube_vip
+      ansible.builtin.wait_for:
+        host: "{{ kube_vip }}"
+        port: 22  # the SSH port is used as the reachability signal
+        timeout: 5  # seconds to wait before the probe counts as failed
+        state: started
+      register: kube_vip_ssh_check
+      ignore_errors: true  # a failed probe must not stop the play; the outcome is recorded in kube_vip_reachable
+      changed_when: false
+
+    - name: Set kube_vip reachable fact if SSH successful
+      ansible.builtin.set_fact:
+        kube_vip_reachable: true
+      when:
+        - kube_vip_ssh_check is defined
+        - kube_vip_ssh_check.state is defined
+        - kube_vip_ssh_check.state == 'started'
+
+    - name: Log kube_vip reachability status
+      ansible.builtin.debug:
+        msg: |
+          kube_vip Reachability Check:
+          - Host: {{ kube_vip }}
+          - Port: 22
+          - Reachable: {{ kube_vip_reachable }}
+          - Status: {% if kube_vip_reachable %}✅ REACHABLE{% else %}❌ NOT REACHABLE{% endif %}
+      tags: telemetry_deployment
diff --git a/provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml b/provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml
new file mode 100644
index 0000000000..463152977f
--- /dev/null
+++ b/provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml
@@ -0,0 +1,124 @@
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Validate CSI driver PowerScale is configured
+  ansible.builtin.assert:
+    that:
+      - hostvars['localhost']['csi_driver_powerscale_support'] | default(false) | bool
+    fail_msg: "{{ ps_csi_driver_not_configured_msg }}"
+
+- name: Set PowerScale configuration facts
+  ansible.builtin.set_fact:
+    ps_csi_secret_path: "{{ hostvars['localhost']['service_cluster_info'].csi_powerscale_driver_secret_file_path | default('') }}"
+    powerscale_configurations: "{{ telemetry_config.powerscale_configurations }}"
+    ps_helm_values_file: "{{ telemetry_config.powerscale_configurations.csm_observability_values_file_path }}"
+    csm_observability_namespace: "{{ csm_namespace }}"
+
+- name: Validate user-provided Helm values file path
+  ansible.builtin.assert:
+    that:
+      - ps_helm_values_file | length > 0
+    fail_msg: "{{ ps_helm_values_path_missing_msg }}"
+
+- name: Verify user-provided Helm values file exists
+  ansible.builtin.stat:
+    path: "{{ ps_helm_values_file }}"
+  register: helm_values_stat
+  delegate_to: localhost  # the existence check runs on localhost
+
+- name: Fail if user-provided Helm values file does not exist
+  ansible.builtin.fail:
+    msg: "{{ ps_helm_values_file_not_found_msg }}"
+  when: not helm_values_stat.stat.exists
+
+# --- Read user values file to detect auth mode for secret copying ---
+
+- name: Load user-provided Helm values to detect auth mode
+  block:
+    - name: Read user Helm values file
+      ansible.builtin.include_vars:
+        file: "{{ ps_helm_values_file }}"
+        name: user_helm_values  # parsed values; read by the auth-mode and cert-manager checks below
+      no_log: true  # keep the parsed values out of task output
+  rescue:
+    - name: Failed to read user Helm values file
+      ansible.builtin.fail:
+        msg: "{{ ps_helm_values_parse_fail_msg }}"
+
+- name: Detect authentication mode from CSM Observability values
+  ansible.builtin.set_fact:
+    karavi_enabled: >-
+      {{ user_helm_values.karaviMetricsPowerscale.authorization.enabled | default(false) | bool }}
+
+- name: Display authentication mode
+  ansible.builtin.debug:
+    msg: "{% if karavi_enabled | bool %}{{ ps_auth_mode_karavi_msg }}{% else %}{{ ps_auth_mode_direct_msg }}{% endif %}"
+    verbosity: 2
+
+# --- Validate cert-manager is enabled (required for TLS) ---
+
+- name: Validate cert-manager is enabled in Helm values
+  ansible.builtin.assert:
+    that:
+      - user_helm_values['cert-manager']['enabled'] | default(false) | bool
+    fail_msg: "{{ ps_cert_manager_disabled_msg }}"
+
+# --- Load CSI PowerScale credentials from vault ---
+
+- name: Load CSI PowerScale credentials from vault
+  block:
+    - name: Read CSI driver secret.yaml
+      ansible.builtin.include_vars:
+        file: "{{ ps_csi_secret_path }}"
+        name: csi_powerscale_secret
+      no_log: true  # the secret contents must not appear in task output
+
+    - name: Extract isilon clusters from secret
+      ansible.builtin.set_fact:
+        ps_clusters: "{{ csi_powerscale_secret.isilonClusters | default([]) }}"
+      no_log: true
+  rescue:
+    - name: Failed to load CSI PowerScale credentials
+      ansible.builtin.fail:
+        msg: "{{ ps_csi_secret_read_fail_msg }}"
+
+- name: Fail if no PowerScale clusters defined
+  ansible.builtin.fail:
+    msg: "{{ ps_no_clusters_found_msg }}"
+  when: ps_clusters | length == 0
+
+- name: Display PowerScale clusters found
+  ansible.builtin.debug:
+    msg: "{{ ps_clusters_found_msg }}"
+    verbosity: 2
+
+# --- Air-gapped: Download and extract karavi-observability dependencies from Pulp to NFS ---
+
+- name: Get PowerScale telemetry offline dependencies
+  ansible.builtin.include_tasks: get_powerscale_telemetry_dependencies.yml
+
+# --- Copy user-provided values file to NFS share ---
+# Actual helm install happens in cloud-init during PXE boot.
+
+- name: Copy user-provided Helm values file to NFS share
+  ansible.builtin.copy:
+    src: "{{ ps_helm_values_file }}"
+    dest: "{{ k8s_client_mount_path }}/karavi-observability/csm_metrics_values.yaml"
+    mode: '0600'  # readable and writable by the owner only
+
+- name: Display PowerScale telemetry preparation status
+  ansible.builtin.debug:
+    msg: "{{ ps_telemetry_prepared_msg }}"
+    verbosity: 2
diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml
index 72ce7f8707..d1247287dd 100644
--- a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml
+++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml
@@ -33,12 +33,22 @@
     dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/{{ item.dest }}"
     mode: "{{ hostvars['localhost']['file_permissions_644'] }}"
   loop: "{{ victoria_templates }}"
-  when: "'victoria' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',')"
+  when: "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')"
   tags: telemetry_deployment
-  # NOTE: victoria_templates is automatically set based on deployment_mode in telemetry_config.yml
+  # NOTE: victoria_templates is automatically set based on victoria_deployment_mode in vars/main.yml
   # - cluster mode: includes vmstorage, vminsert, vmselect templates
   # - single-node mode: includes victoria-statefulset template
 
+- name: Populate VictoriaLogs deployment configs
+  ansible.builtin.template:
+    src: "{{ item.src }}"
+    dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/{{ item.dest }}"
+    mode: "{{ hostvars['localhost']['file_permissions_644'] }}"
+  loop: "{{ victorialogs_templates }}"
+  when: "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')"
+  tags: telemetry_deployment
+  # NOTE: victorialogs_templates includes VLCluster CR, VLAgent CR, and VLAgent ConfigMap
+
 - name: Kafka configurations
   when: kafka_support
   block:
@@ -72,7 +82,7 @@ }] }} when: - hostvars['localhost']['idrac_telemetry_support'] - - "'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',')" + - "'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',')" - "kafka.topics.idrac.name in kafka_topic_partitions" - name: Add ldms topic if enabled @@ -115,6 +125,19 @@ dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/{{ strimzi_kafka_pkg }}.tar.gz" mode: "{{ hostvars['localhost']['file_permissions_644'] }}" +- name: Victoria Metrics operator configuration + when: "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')" + block: + - name: Extract and set facts for tarball URLs for victoria metrics operator + ansible.builtin.set_fact: + victoria_operator_pkg: "{{ k8s_packages_json['service_k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'victoria-metrics-operator') | map(attribute='package') | join }}" # noqa: yaml[line-length] + + - name: Download victoria metrics operator tarball + ansible.builtin.get_url: + url: "{{ victoria_operator_tarball_url }}" + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/{{ victoria_operator_pkg }}.tar.gz" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + - name: Populate common telemetry deployment configs ansible.builtin.template: src: "{{ item.src }}" diff --git a/provision/roles/telemetry/tasks/get_powerscale_telemetry_dependencies.yml b/provision/roles/telemetry/tasks/get_powerscale_telemetry_dependencies.yml new file mode 100644 index 0000000000..b5f1bdd008 --- /dev/null +++ b/provision/roles/telemetry/tasks/get_powerscale_telemetry_dependencies.yml @@ -0,0 +1,162 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# get_powerscale_telemetry_dependencies.yml +# Downloads and extracts karavi-observability git repository, Helm chart, +# and cert-manager Helm chart dependency from Pulp local mirror for +# air-gapped deployment. +# All files are downloaded to the NFS share at {{ k8s_client_mount_path }}/karavi-observability/ +# +# cert-manager CRDs are included inside the karavi-observability repo +# (installer/cert-manager.crds.yaml) -- no separate download needed. +# +# cert-manager Helm chart (from Jetstack) is pre-downloaded to Pulp via +# service_k8s.json tarball entry. The package name and version are read +# dynamically from service_k8s.json -- not hardcoded. 
+
+- name: Extract cert-manager package name from service_k8s.json
+  ansible.builtin.set_fact:
+    cert_manager_package: >-
+      {{ telemetry_packages['service_k8s']['cluster']
+      | selectattr('type', 'equalto', 'tarball')
+      | selectattr('package', 'search', 'cert-manager')
+      | map(attribute='package')
+      | first }}
+
+- name: Set cert-manager chart tarball filename (Pulp stores tarballs as .tar.gz)
+  ansible.builtin.set_fact:
+    cert_manager_chart_tgz: "{{ cert_manager_package }}.tar.gz"
+
+- name: Display cert-manager package read from service_k8s.json
+  ansible.builtin.debug:
+    msg: "{{ ps_cert_manager_pkg_msg }}"
+    verbosity: 2
+
+- name: Get karavi-observability offline dependencies from Pulp to NFS share
+  block:
+    - name: Create karavi-observability directory on NFS share
+      ansible.builtin.file:
+        path: "{{ k8s_client_mount_path }}/karavi-observability"
+        state: directory
+        mode: '0755'
+
+    - name: Get karavi-observability git tar from Pulp
+      ansible.builtin.get_url:
+        url: "{{ offline_git_path }}/karavi-observability/{{ karavi_observability_git }}"
+        dest: "{{ k8s_client_mount_path }}/karavi-observability/{{ karavi_observability_git }}"
+        mode: "{{ permission_644 }}"
+
+    - name: Extract karavi-observability tar file on NFS share
+      ansible.builtin.unarchive:
+        src: "{{ k8s_client_mount_path }}/karavi-observability/{{ karavi_observability_git }}"
+        dest: "{{ k8s_client_mount_path }}/karavi-observability/"
+        remote_src: true
+
+    - name: Get dell/helm-charts git tar from Pulp
+      ansible.builtin.get_url:
+        url: "{{ offline_git_path }}/helm-charts/{{ karavi_helm_charts_git }}"
+        dest: "{{ k8s_client_mount_path }}/karavi-observability/{{ karavi_helm_charts_git }}"
+        mode: "{{ permission_644 }}"
+
+    - name: Extract dell/helm-charts tar file on NFS share
+      ansible.builtin.unarchive:
+        src: "{{ k8s_client_mount_path }}/karavi-observability/{{ karavi_helm_charts_git }}"
+        dest: "{{ k8s_client_mount_path }}/karavi-observability/"
+        remote_src: true
+
+    - name: Set karavi-observability Helm chart path on NFS share
+      ansible.builtin.set_fact:
+        karavi_helm_chart_path: "{{ k8s_client_mount_path }}/karavi-observability/helm-charts/charts/karavi-observability"
+
+    - name: Create charts/ directory for Helm dependencies
+      ansible.builtin.file:
+        path: "{{ karavi_helm_chart_path }}/charts"
+        state: directory
+        mode: '0755'
+
+    # cert-manager counts as disabled only when the parsed Helm values set
+    # cert-manager.enabled to false explicitly. The parsed user_helm_values
+    # (loaded by deploy_powerscale_metrics.yml before this file is included)
+    # is used instead of a raw text search, because a search for
+    # "enabled: false" also matches other components in the same values file.
+    # When user_helm_values or the key is absent, cert-manager is treated as
+    # required, which matches the previous fallback.
+    - name: Check if cert-manager is disabled in values file
+      ansible.builtin.set_fact:
+        cert_manager_disabled: "{{ not (((user_helm_values | default({}))['cert-manager'] | default({}, true)).enabled | default(true) | bool) }}"  # noqa: yaml[line-length]
+
+    - name: Display cert-manager dependency status
+      ansible.builtin.debug:
+        msg: "{% if cert_manager_disabled %}{{ ps_cert_manager_skipped_msg }}{% else %}{{ ps_cert_manager_required_msg }}{% endif %}"
+        verbosity: 2
+
+    - name: Download cert-manager Helm chart from Pulp (Jetstack repo pre-staged offline)
+      ansible.builtin.get_url:
+        url: "{{ offline_tarball_path }}/{{ cert_manager_package }}/{{ cert_manager_chart_tgz }}"
+        dest: "{{ karavi_helm_chart_path }}/charts/{{ cert_manager_chart_tgz }}"
+        mode: "{{ permission_644 }}"
+      when: not cert_manager_disabled
+
+    - name: Verify cert-manager chart archive was downloaded
+      ansible.builtin.stat:
+        path: "{{ karavi_helm_chart_path }}/charts/{{ cert_manager_chart_tgz }}"
+      register: cert_manager_chart_stat
+      when: not cert_manager_disabled
+
+    - name: Fail if cert-manager chart archive is missing
+      ansible.builtin.fail:
+        msg: "{{ ps_cert_manager_missing_msg }}"
+      when: not cert_manager_disabled and not cert_manager_chart_stat.stat.exists
+
+    - name: Extract cert-manager chart as directory for Helm compatibility
+      ansible.builtin.unarchive:
+        src: "{{ karavi_helm_chart_path }}/charts/{{ cert_manager_chart_tgz }}"
+        dest: "{{ karavi_helm_chart_path }}/charts/"
+        remote_src: true
+      when: not cert_manager_disabled and cert_manager_chart_stat.stat.exists
+
+    - name: Remove cert-manager archive after extraction
+      ansible.builtin.file:
+        path: "{{ karavi_helm_chart_path }}/charts/{{ cert_manager_chart_tgz }}"
+        state: absent
+      when: not cert_manager_disabled and cert_manager_chart_stat.stat.exists
+
+    - name: Verify cert-manager chart directory was extracted
+      ansible.builtin.stat:
+        path: "{{ karavi_helm_chart_path }}/charts/cert-manager/Chart.yaml"
+      register: cert_manager_dir_stat
+      when: not cert_manager_disabled
+
+    - name: Fail if cert-manager chart directory extraction failed
+      ansible.builtin.fail:
+        msg: "{{ ps_cert_manager_extract_fail_msg }}"
+      when: not cert_manager_disabled and not cert_manager_dir_stat.stat.exists
+
+    - name: Display cert-manager dependency staged successfully
+      ansible.builtin.debug:
+        msg: "{{ ps_cert_manager_staged_msg }}"
+        verbosity: 2
+      when: not cert_manager_disabled and cert_manager_dir_stat.stat.exists
+
+    - name: Display cert-manager dependency skipped
+      ansible.builtin.debug:
+        msg: "{{ ps_cert_manager_skipped_detail_msg }}"
+        verbosity: 2
+      when: cert_manager_disabled
+
+  rescue:
+    - name: Handle dependency download failure
+      ansible.builtin.fail:
+        msg: "{{ ps_dependency_fail_msg }}"
diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml
index e965181024..346dac07db 100644
--- a/provision/roles/telemetry/tasks/main.yml
+++ b/provision/roles/telemetry/tasks/main.yml
@@ -24,9 +24,19 @@
 - name: Load service images from service_k8s_.json
   ansible.builtin.include_tasks: load_service_images.yml
 
+- name: Check kube_vip reachability for validation
+  ansible.builtin.include_tasks: check_kube_vip_reachability.yml
+  when:
+    - "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')"
+    - kube_vip is defined
+    - kube_vip | length > 0
+
 - name: Configure of k8s telemetry service
   when:
-    - hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support']
+    - >-
+      hostvars['localhost']['idrac_telemetry_support'] or
+      hostvars['localhost']['ldms_support'] or
+      hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool
   block:
     - name: Set NFS info fact
       ansible.builtin.set_fact:
@@ -35,6 +45,11 @@
     - name: Service cluster prerequisite
       ansible.builtin.include_tasks: telemetry_prereq.yml
 
+    - name: Deploy PowerScale telemetry metrics
+      ansible.builtin.include_tasks: deploy_powerscale_metrics.yml
+      when:
+        - hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool
+
     - name: Generate telemetry deployments
       ansible.builtin.include_tasks: generate_telemetry_deployments.yml
 
diff --git a/provision/roles/telemetry/tasks/telemetry_prereq.yml b/provision/roles/telemetry/tasks/telemetry_prereq.yml
index 7eb45a89ab..c41462c609 100644
--- a/provision/roles/telemetry/tasks/telemetry_prereq.yml
+++ b/provision/roles/telemetry/tasks/telemetry_prereq.yml
@@ -69,7 +69,7 @@
 - name: Set kafka_support to true
   ansible.builtin.set_fact:
     kafka_support: true
-  when: "'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') or hostvars['localhost']['ldms_support']"
+  when: "'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') or hostvars['localhost']['ldms_support']"
 
 - name: Configure TLS certificate and secrets for kafka
   when: kafka_support
@@ -100,7 +100,9 @@
       when: not cluster_id_present | default(false)
 
 - name: Configure TLS certificate for VictoriaMetrics
-  when: "'victoria' in hostvars['localhost']['idrac_telemetry_collection_type']"
+  when:
+    - "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')"
+    - victoria_cluster.tls_enabled | default(false) | bool
   block:
     - name: Create VictoriaMetrics certificate directory
       ansible.builtin.file:
diff --git
a/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 b/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 index bdfc894bef..a4b391519f 100644 --- a/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 +++ b/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 @@ -18,12 +18,14 @@ # Telemetry Stack Cleanup Script # Removes Kafka, LDMS, iDRAC telemetry, and monitoring resources from the {{ telemetry_namespace }} namespace # -# Usage: ./cleanup_telemetry.sh [kafka] [ldms] [idrac] [victoria] [all] -# kafka - Delete Kafka cluster, users, and bridge -# ldms - Delete LDMS aggregator and store -# idrac - Delete iDRAC telemetry -# victoria - Delete VictoriaMetrics monitoring -# all - Delete everything (default if no arguments) +# Usage: ./cleanup_telemetry.sh [kafka] [ldms] [idrac] [victoria] [victorialogs] [powerscale] [all] +# kafka - Delete Kafka cluster, users, and bridge +# ldms - Delete LDMS aggregator and store +# idrac - Delete iDRAC telemetry +# victoria - Delete VictoriaMetrics monitoring (vmcluster, vmagent) +# victorialogs - Delete VictoriaLogs only (vlagent, vlcluster) without affecting VictoriaMetrics +# powerscale - Delete PowerScale telemetry (karavi-observability Helm release, CSM Metrics, OTEL Collector) +# all - Delete everything (default if no arguments) # set -e @@ -35,6 +37,8 @@ CLEAN_KAFKA=false CLEAN_LDMS=false CLEAN_IDRAC=false CLEAN_VICTORIA=false +CLEAN_VICTORIALOGS=false +CLEAN_POWERSCALE=false CLEAN_ALL=false if [ $# -eq 0 ]; then @@ -54,24 +58,34 @@ else victoria) CLEAN_VICTORIA=true ;; + victorialogs) + CLEAN_VICTORIALOGS=true + ;; + powerscale) + CLEAN_POWERSCALE=true + ;; all) CLEAN_ALL=true ;; -h|--help) - echo "Usage: $0 [kafka] [ldms] [idrac] [victoria] [all]" + echo "Usage: $0 [kafka] [ldms] [idrac] [victoria] [victorialogs] [powerscale] [all]" echo "" echo "Options:" - echo " kafka - Delete Kafka cluster, users, and bridge" - echo " ldms - Delete LDMS aggregator and 
store" - echo " idrac - Delete iDRAC telemetry" - echo " victoria - Delete VictoriaMetrics monitoring" - echo " all - Delete everything (default if no arguments)" + echo " kafka - Delete Kafka cluster, users, and bridge" + echo " ldms - Delete LDMS aggregator and store" + echo " idrac - Delete iDRAC telemetry" + echo " victoria - Delete VictoriaMetrics monitoring (vmcluster, vmagent)" + echo " victorialogs - Delete VictoriaLogs only (vlagent, vlcluster) without affecting VictoriaMetrics" + echo " powerscale - Delete PowerScale telemetry (karavi-observability Helm release)" + echo " all - Delete everything (default if no arguments)" echo "" echo "Examples:" echo " $0 # Delete everything" echo " $0 all # Delete everything" echo " $0 kafka ldms # Delete only Kafka and LDMS" - echo " $0 idrac victoria # Delete only iDRAC and Victoria" + echo " $0 idrac victoria # Delete only iDRAC and VictoriaMetrics" + echo " $0 victorialogs # Delete only VictoriaLogs (keeps VictoriaMetrics running)" + echo " $0 powerscale # Delete only PowerScale telemetry" exit 0 ;; *) @@ -89,6 +103,8 @@ if [ "$CLEAN_ALL" = true ]; then CLEAN_LDMS=true CLEAN_IDRAC=true CLEAN_VICTORIA=true + CLEAN_VICTORIALOGS=true + CLEAN_POWERSCALE=true fi echo "==========================================" @@ -96,11 +112,13 @@ echo " Telemetry Stack Cleanup" echo "==========================================" echo "" echo "Components to clean:" -echo " Kafka Bridge: $([ "$CLEAN_KAFKA" = true ] && echo "YES" || echo "NO")" -echo " Kafka Cluster: $([ "$CLEAN_KAFKA" = true ] && echo "YES" || echo "NO")" -echo " LDMS: $([ "$CLEAN_LDMS" = true ] && echo "YES" || echo "NO")" -echo " iDRAC Telemetry: $([ "$CLEAN_IDRAC" = true ] && echo "YES" || echo "NO")" -echo " Victoria Metrics:$([ "$CLEAN_VICTORIA" = true ] && echo "YES" || echo "NO")" +echo " Kafka Bridge: $([ "$CLEAN_KAFKA" = true ] && echo "YES" || echo "NO")" +echo " Kafka Cluster: $([ "$CLEAN_KAFKA" = true ] && echo "YES" || echo "NO")" +echo " LDMS: $([ 
"$CLEAN_LDMS" = true ] && echo "YES" || echo "NO")" +echo " iDRAC Telemetry: $([ "$CLEAN_IDRAC" = true ] && echo "YES" || echo "NO")" +echo " VictoriaMetrics: $([ "$CLEAN_VICTORIA" = true ] && echo "YES" || echo "NO")" +echo " VictoriaLogs: $([ "$CLEAN_VICTORIALOGS" = true ] && echo "YES" || echo "NO")" +echo " PowerScale Tel.: $([ "$CLEAN_POWERSCALE" = true ] && echo "YES" || echo "NO")" echo "" read -p "Continue? (y/N): " -n 1 -r echo @@ -220,7 +238,7 @@ if [ "$CLEAN_KAFKA" = true ]; then echo "" fi -if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = true ] || [ "$CLEAN_VICTORIA" = true ]; then +if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = true ] || [ "$CLEAN_VICTORIA" = true ] || [ "$CLEAN_VICTORIALOGS" = true ] || [ "$CLEAN_POWERSCALE" = true ]; then echo "Step 7: Delete Persistent Volume Claims" echo "----------------------------------------" if [ "$CLEAN_KAFKA" = true ]; then @@ -236,12 +254,26 @@ if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = t # Delete single-node PVCs delete_all pvc "app=victoria-metric" delete_resource pvc victoria-metrics-pvc-victoria-metric-0 - # Delete cluster mode PVCs (vmstorage StatefulSet PVCs) - delete_all pvc "app=vmstorage" + # Delete cluster mode PVCs (operator-managed vmstorage StatefulSet PVCs) + delete_all pvc "app.kubernetes.io/instance=victoria-cluster" + for i in {0..9}; do + delete_resource pvc vmstorage-data-vmstorage-victoria-cluster-$i + done + fi + if [ "$CLEAN_VICTORIALOGS" = true ]; then + # Delete VictoriaLogs PVCs (vlstorage StatefulSet PVCs + vlagent PVC) + delete_all pvc "app.kubernetes.io/name=vlstorage" + delete_all pvc "app.kubernetes.io/name=vlagent" for i in {0..9}; do - delete_resource pvc vmstorage-data-vmstorage-$i + delete_resource pvc vlstorage-data-vlstorage-victoria-logs-cluster-$i + delete_resource pvc vlagent-data-vlagent-vlagent-$i done fi + if [ "$CLEAN_POWERSCALE" = true ]; then + # Delete OTEL 
Collector PVC (created by cloud-init post-Helm-install) + delete_resource pvc otel-collector-data + delete_all pvc "app.kubernetes.io/name=otel-collector" + fi sleep 2 echo "" fi @@ -277,7 +309,7 @@ if [ "$CLEAN_IDRAC" = true ]; then echo "" fi -if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = true ] || [ "$CLEAN_VICTORIA" = true ]; then +if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = true ] || [ "$CLEAN_VICTORIA" = true ] || [ "$CLEAN_VICTORIALOGS" = true ]; then echo "Step 9: Delete ConfigMaps" echo "-------------------------" if [ "$CLEAN_KAFKA" = true ]; then @@ -293,6 +325,9 @@ if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = t if [ "$CLEAN_VICTORIA" = true ]; then delete_resource configmap victoria-tls-test-script fi + if [ "$CLEAN_VICTORIALOGS" = true ]; then + delete_resource configmap vlagent-config + fi sleep 2 echo "" fi @@ -320,30 +355,77 @@ if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_VICTORIA" = true ]; then fi if [ "$CLEAN_VICTORIA" = true ]; then delete_resource service victoria-metric - delete_resource service vmselect - delete_resource service vminsert - delete_resource service vmstorage - delete_resource service vmagent + # Operator-managed cluster services + delete_resource service vmselect-victoria-cluster + delete_resource service vminsert-victoria-cluster + delete_resource service vmstorage-victoria-cluster + delete_resource service vmagent-victoria-cluster fi sleep 2 echo "" fi +if [ "$CLEAN_VICTORIALOGS" = true ]; then + echo "Step 12: Delete VictoriaLogs Resources" + echo "---------------------------------------" + + # Delete VictoriaLogs operator CRD resources (operator cascades deletion) + echo "Deleting VictoriaLogs operator CRD resources..." 
+ kubectl -n $NAMESPACE delete vlcluster victoria-logs-cluster --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete vlagent --all --ignore-not-found=true 2>/dev/null || true + sleep 5 + + # Delete VictoriaLogs cluster components (operator-managed) + echo "Deleting VictoriaLogs cluster components..." + delete_resource deployment vlinsert-victoria-logs-cluster + delete_resource deployment vlselect-victoria-logs-cluster + delete_resource statefulset vlstorage-victoria-logs-cluster + delete_resource service vlinsert-victoria-logs-cluster + delete_resource service vlselect-victoria-logs-cluster + delete_resource service vlstorage-victoria-logs-cluster + delete_all pod "app.kubernetes.io/component=vlinsert" + delete_all pod "app.kubernetes.io/component=vlselect" + delete_all pod "app.kubernetes.io/component=vlstorage" + + # Delete VLAgent components + echo "Deleting VLAgent..." + delete_resource statefulset vlagent-vlagent + delete_resource service vlagent-vlagent + delete_all pod "app.kubernetes.io/name=vlagent" + + # Delete VictoriaLogs shared resources + echo "Deleting VictoriaLogs shared resources..." + delete_resource configmap vlagent-config + # Note: victoria-tls-certs secret is shared with VictoriaMetrics - only delete if VictoriaMetrics is also being removed + if [ "$CLEAN_VICTORIA" != true ]; then + echo " Keeping victoria-tls-certs secret (shared with VictoriaMetrics)" + fi + + sleep 2 + echo "" +fi + if [ "$CLEAN_VICTORIA" = true ]; then - echo "Step 12: Delete Monitoring Resources" - echo "-------------------------------------" + echo "Step 13: Delete VictoriaMetrics Resources" + echo "------------------------------------------" - # Delete VictoriaMetrics cluster components (if cluster mode is deployed) + # Delete VictoriaMetrics operator CRD resources (operator cascades deletion) + echo "Deleting VictoriaMetrics operator CRD resources..." 
+ kubectl -n $NAMESPACE delete vmcluster victoria-cluster --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete vmagent --all --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete vmpodscrape --all --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete vmsingle --all --ignore-not-found=true 2>/dev/null || true + sleep 5 + + # Delete any remaining operator-managed cluster components echo "Deleting VictoriaMetrics cluster components..." - delete_resource deployment vmselect - delete_resource deployment vminsert - delete_resource statefulset vmstorage - delete_resource service vmselect - delete_resource service vminsert - delete_resource service vmstorage - delete_all pod "app=vmselect" - delete_all pod "app=vminsert" - delete_all pod "app=vmstorage" + delete_resource deployment vmselect-victoria-cluster + delete_resource deployment vminsert-victoria-cluster + delete_resource statefulset vmstorage-victoria-cluster + delete_resource service vmselect-victoria-cluster + delete_resource service vminsert-victoria-cluster + delete_resource service vmstorage-victoria-cluster + delete_all pod "app.kubernetes.io/instance=victoria-cluster" # Delete VictoriaMetrics single-node components (if single-node mode is deployed) echo "Deleting VictoriaMetrics single-node components..." @@ -357,6 +439,7 @@ if [ "$CLEAN_VICTORIA" = true ]; then delete_resource deployment vmagent delete_resource service vmagent delete_all pod "app=vmagent" + delete_all pod "app.kubernetes.io/name=vmagent" # Delete shared resources echo "Deleting VictoriaMetrics shared resources..." @@ -372,8 +455,56 @@ if [ "$CLEAN_VICTORIA" = true ]; then echo "" fi +if [ "$CLEAN_POWERSCALE" = true ]; then + echo "Step: Delete PowerScale Telemetry" + echo "----------------------------------" + + # Uninstall karavi-observability Helm release (primary deployment method) + echo "Uninstalling karavi-observability Helm release..." 
+ if helm list -n $NAMESPACE --filter karavi-observability -q 2>/dev/null | grep -q karavi-observability; then + helm uninstall karavi-observability -n $NAMESPACE --wait --timeout 5m 2>/dev/null || true + echo "Helm release karavi-observability uninstalled." + sleep 5 + else + echo "No karavi-observability Helm release found." + fi + + # Delete Helm-managed resources by label (karavi-observability Helm chart) + echo "Deleting CSM Metrics PowerScale (Helm-managed)..." + delete_all deployment "app.kubernetes.io/name=karavi-metrics-powerscale" + delete_all service "app.kubernetes.io/name=karavi-metrics-powerscale" + delete_all configmap "app.kubernetes.io/name=karavi-metrics-powerscale" + delete_all pod "app.kubernetes.io/name=karavi-metrics-powerscale" + + echo "Deleting OTEL Collector (Helm-managed)..." + delete_all deployment "app.kubernetes.io/name=otel-collector" + delete_all service "app.kubernetes.io/name=otel-collector" + delete_all configmap "app.kubernetes.io/name=otel-collector" + delete_all pod "app.kubernetes.io/name=otel-collector" + + # Delete cert-manager resources deployed by karavi-observability sub-chart + echo "Deleting cert-manager resources (Helm sub-chart)..." + delete_all deployment "app.kubernetes.io/instance=karavi-observability,app.kubernetes.io/name=cert-manager" + delete_all pod "app.kubernetes.io/instance=karavi-observability,app.kubernetes.io/name=cert-manager" + + # Note: vmagent is shared with iDRAC telemetry - not deleted here. + # PowerScale scrape targets are removed from vmagent config on next deployment. + + # Delete Karavi-specific resources (if deployed) + echo "Deleting Karavi Authorization resources..." + delete_resource configmap karavi-authorization-config + delete_resource secret karavi-authorization-ca-cert + + # Delete PowerScale credentials + echo "Deleting PowerScale credentials..." 
+ delete_resource secret isilon-creds + + sleep 2 + echo "" +fi + echo "" -echo "Step 13: Force Delete Any Remaining Component Pods" +echo "Step 14: Force Delete Any Remaining Component Pods" echo "---------------------------------------------------" # Only force delete pods from components being cleaned if [ "$CLEAN_KAFKA" = true ]; then @@ -387,18 +518,28 @@ fi if [ "$CLEAN_IDRAC" = true ]; then kubectl -n $NAMESPACE delete pod -l app=idrac-telemetry --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true fi +if [ "$CLEAN_VICTORIALOGS" = true ]; then + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/component=vlinsert --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/component=vlselect --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/component=vlstorage --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/name=vlagent --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true +fi if [ "$CLEAN_VICTORIA" = true ]; then kubectl -n $NAMESPACE delete pod -l app=victoria-metric --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true - kubectl -n $NAMESPACE delete pod -l app=vmselect --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true - kubectl -n $NAMESPACE delete pod -l app=vminsert --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true - kubectl -n $NAMESPACE delete pod -l app=vmstorage --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/instance=victoria-cluster --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/name=vmagent --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true kubectl -n $NAMESPACE delete pod -l 
app=vmagent --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true kubectl -n $NAMESPACE delete pod -l app=victoria-tls-test --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true fi +if [ "$CLEAN_POWERSCALE" = true ]; then + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/name=karavi-metrics-powerscale --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/name=otel-collector --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true + kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/instance=karavi-observability,app.kubernetes.io/name=cert-manager --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true +fi sleep 5 echo "" -echo "Step 14: Check for Remaining Resources" +echo "Step 15: Check for Remaining Resources" echo "---------------------------------------" if [ "$CLEAN_KAFKA" = true ]; then echo "Remaining Kafka resources:" @@ -415,20 +556,40 @@ if [ "$CLEAN_IDRAC" = true ]; then kubectl -n $NAMESPACE get statefulset,pod,configmap -l app=idrac-telemetry 2>/dev/null || echo " None" echo "" fi +if [ "$CLEAN_VICTORIALOGS" = true ]; then + echo "Remaining VictoriaLogs resources:" + echo " VLCluster CR:" + kubectl -n $NAMESPACE get vlcluster 2>/dev/null || echo " None" + echo " VLAgent CR:" + kubectl -n $NAMESPACE get vlagent 2>/dev/null || echo " None" + echo " Pods:" + kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=vlinsert 2>/dev/null || echo " None" + kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=vlselect 2>/dev/null || echo " None" + kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=vlstorage 2>/dev/null || echo " None" + kubectl -n $NAMESPACE get pod -l app.kubernetes.io/name=vlagent 2>/dev/null || echo " None" + echo "" +fi if [ "$CLEAN_VICTORIA" = true ]; then - echo "Remaining Victoria Metrics resources:" + echo "Remaining VictoriaMetrics resources:" echo " Single-node:" kubectl -n 
$NAMESPACE get statefulset,deployment,pod,configmap -l app=victoria-metric 2>/dev/null || echo " None" - echo " Cluster (vmselect):" - kubectl -n $NAMESPACE get deployment,pod -l app=vmselect 2>/dev/null || echo " None" - echo " Cluster (vminsert):" - kubectl -n $NAMESPACE get deployment,pod -l app=vminsert 2>/dev/null || echo " None" - echo " Cluster (vmstorage):" - kubectl -n $NAMESPACE get statefulset,pod -l app=vmstorage 2>/dev/null || echo " None" + echo " Operator-managed cluster:" + kubectl -n $NAMESPACE get vmcluster,deployment,statefulset,pod -l app.kubernetes.io/instance=victoria-cluster 2>/dev/null || echo " None" echo " vmagent:" + kubectl -n $NAMESPACE get deployment,pod -l app.kubernetes.io/name=vmagent 2>/dev/null || echo " None" kubectl -n $NAMESPACE get deployment,pod -l app=vmagent 2>/dev/null || echo " None" echo "" fi +if [ "$CLEAN_POWERSCALE" = true ]; then + echo "Remaining PowerScale telemetry resources:" + echo " CSM Metrics:" + kubectl -n $NAMESPACE get deployment,pod -l app.kubernetes.io/name=karavi-metrics-powerscale 2>/dev/null || echo " None" + echo " OTEL Collector:" + kubectl -n $NAMESPACE get deployment,pod -l app.kubernetes.io/name=otel-collector 2>/dev/null || echo " None" + echo " Helm release:" + helm list -n $NAMESPACE --filter karavi-observability 2>/dev/null || echo " None" + echo "" +fi echo "Remaining PVCs:" kubectl -n $NAMESPACE get pvc 2>/dev/null || echo " None" echo "" diff --git a/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 b/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 index 7b817c7a22..0135593cc0 100644 --- a/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 @@ -9,7 +9,7 @@ data: mysqldb_password: "{{ hostvars['localhost']['mysqldb_password'] | b64encode }}" mysqldb_root_password: "{{ 
hostvars['localhost']['mysqldb_root_password'] | b64encode }}" -{% set types = hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% set types = hostvars['localhost']['telemetry_collection_type'].split(',') %} {% if 'kafka' in types %} --- apiVersion: v1 diff --git a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 index b300029920..80994fc5da 100644 --- a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 @@ -47,7 +47,7 @@ spec: app: {{ idrac_telemetry_k8s_name }} spec: volumes: -{% set types = hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% set types = hostvars['localhost']['telemetry_collection_type'].split(',') %} {% if 'kafka' in types %} # Mount Kafka cluster CA certificate for TLS verification - name: kafka-cluster-ca-cert diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 index 01ef142084..70790b75a4 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 @@ -59,7 +59,7 @@ spec: - Describe host: "*" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} # Producer and consumer permissions for idrac topic - resource: type: topic diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 
b/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 index 0a80304e89..1b58cd5811 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 @@ -32,7 +32,7 @@ data: echo "Bootstrap Server: kafka-kafka-bootstrap:9093" echo "Certificates: kafkapump (for all TLS topics)" echo "Testing topics based on enabled telemetry support:" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} echo " - iDRAC telemetry topic ({{ kafka.topics.idrac.name }})" {% endif %} {% if hostvars['localhost']['ldms_support'] %} @@ -80,7 +80,7 @@ data: echo "✓ mTLS connection successful" echo "" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} # Test iDRAC telemetry topic consumer echo "Step 5: Testing consumer on {{ kafka.topics.idrac.name }} topic (kafkapump user)..." 
timeout 30 /opt/kafka/bin/kafka-console-consumer.sh \ @@ -114,7 +114,7 @@ data: echo " ✓ kafkapump keystore created" echo " ✓ mTLS connection established" echo " ✓ Topics listed successfully" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} echo " ✓ {{ kafka.topics.idrac.name }} topic tested (kafkapump user)" {% endif %} {% if hostvars['localhost']['ldms_support'] %} diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index 19c722fb7a..e25d434918 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -1,29 +1,36 @@ resources: - telemetry_secret_creation.yaml -{% set types = hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% set types = hostvars['localhost']['telemetry_collection_type'].split(',') %} {% if 'victoria' in types %} - # VictoriaMetrics Common Resources - - victoria-tls-secret.yaml +# VictoriaMetrics Common Resources (RBAC) - victoria-vmagent-rbac.yaml - - vmagent-scrape-config.yaml - - victoria-agent-deployment.yaml - # VictoriaMetrics Deployment (mode: {{ hostvars['localhost']['victoria_configurations']['deployment_mode'] }}) -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' %} - # Cluster Mode: High-availability deployment - - victoria-cluster-vmstorage.yaml - - victoria-cluster-vminsert.yaml - - victoria-cluster-vmselect.yaml +{% if victoria_cluster.tls_enabled | default(false) %} + # TLS secret for VictoriaMetrics cluster components + - victoria-tls-secret.yaml +{% endif %} + # VictoriaMetrics Operator-based Deployment (mode: {{ victoria_deployment_mode }}) +{% if 
victoria_deployment_mode == 'cluster' %} + # Cluster Mode: VMCluster CR (operator manages StatefulSets) + - victoria-operator-vmcluster.yaml {% else %} - # Single-Node Mode: Simple deployment - - victoria-statefulset.yaml + # Single-Node Mode: VMSingle CR (operator manages StatefulSet) + - victoria-operator-vmsingle.yaml {% endif %} - # Uncomment to deploy VictoriaMetrics TLS test job - # - test/victoria-tls-test-job.yaml + # VMAgent CR (operator-managed scraper) + - victoria-operator-vmagent.yaml + # VMPodScrape CR (native operator-based pod discovery) + - victoria-operator-vmpodscrape.yaml + # VictoriaLogs Cluster Mode: VLCluster CR (operator manages vlstorage StatefulSet, vlinsert/vlselect Deployments) + # VLAgent CR (operator-managed log collection agent) + # VLAgent ConfigMap (syslog receiver and remoteWrite configuration) + - victorialogs-operator-vlcluster.yaml + - victorialogs-operator-vlagent.yaml + - victorialogs-vlagent-config.yaml {% endif %} {% if kafka_support %} - kafka.kafka.yaml - kafka.kafkapump_user.yaml -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %} +{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} - kafka.topic_idrac.yaml {% endif %} {% if hostvars['localhost']['ldms_support'] %} @@ -38,4 +45,4 @@ resources: - idrac_telemetry_statefulset.yaml - telemetry_cleaner_rbac.yaml - telemetry_pod_cleanup.yaml -{% endif %} \ No newline at end of file +{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/powerscale/csm-metrics-deployment-direct.yaml.j2 b/provision/roles/telemetry/templates/telemetry/powerscale/csm-metrics-deployment-direct.yaml.j2 new file mode 100644 index 0000000000..c08bbd2f46 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/powerscale/csm-metrics-deployment-direct.yaml.j2 @@ -0,0 +1,82 @@ +# Copyright 2026 Dell Inc. 
or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: csm-metrics-powerscale-cluster{{ cluster_index }} + namespace: {{ telemetry_namespace }} + labels: + app: csm-metrics-powerscale + cluster-index: "{{ cluster_index }}" +spec: + replicas: 1 + selector: + matchLabels: + app: csm-metrics-powerscale + cluster-index: "{{ cluster_index }}" + template: + metadata: + labels: + app: csm-metrics-powerscale + cluster-index: "{{ cluster_index }}" + spec: + containers: + - name: csm-metrics-powerscale + image: {{ csm_metrics_powerscale_image }} + envFrom: + - configMapRef: + name: csm-metrics-powerscale-config-cluster{{ cluster_index }} + env: + - name: POWERSCALE_CLUSTER_ENDPOINT + valueFrom: + secretKeyRef: + name: isilon-creds + key: endpoint + - name: POWERSCALE_CLUSTER_USERNAME + valueFrom: + secretKeyRef: + name: isilon-creds + key: username + - name: POWERSCALE_CLUSTER_PASSWORD + valueFrom: + secretKeyRef: + name: isilon-creds + key: password + - name: POWERSCALE_CLUSTER_NAME + valueFrom: + secretKeyRef: + name: isilon-creds + key: clusterName + resources: + requests: + cpu: {{ csm_metrics_powerscale_resources.requests.cpu }} + memory: {{ csm_metrics_powerscale_resources.requests.memory }} + limits: + cpu: {{ csm_metrics_powerscale_resources.limits.cpu }} + memory: {{ csm_metrics_powerscale_resources.limits.memory }} + volumeMounts: + - name: isilon-creds + mountPath: 
/etc/isilon-creds + readOnly: true + - name: csm-metrics-config + mountPath: /etc/config/karavi-metrics-powerscale.yaml + subPath: karavi-metrics-powerscale.yaml + volumes: + - name: isilon-creds + secret: + secretName: isilon-creds + - name: csm-metrics-config + configMap: + name: csm-metrics-powerscale-config-cluster{{ cluster_index }} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 b/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 index ef2086c831..bfb894d58d 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 @@ -52,26 +52,57 @@ DNS.5 = victoria-metric-0 DNS.6 = victoria-metric-0.{{ telemetry_namespace }} DNS.7 = victoria-metric-0.{{ telemetry_namespace }}.svc DNS.8 = victoria-metric-0.{{ telemetry_namespace }}.svc.cluster.local -# Cluster deployment names -DNS.9 = vminsert -DNS.10 = vminsert.{{ telemetry_namespace }} -DNS.11 = vminsert.{{ telemetry_namespace }}.svc -DNS.12 = vminsert.{{ telemetry_namespace }}.svc.cluster.local -DNS.13 = vmselect -DNS.14 = vmselect.{{ telemetry_namespace }} -DNS.15 = vmselect.{{ telemetry_namespace }}.svc -DNS.16 = vmselect.{{ telemetry_namespace }}.svc.cluster.local -DNS.17 = vmstorage -DNS.18 = vmstorage.{{ telemetry_namespace }} -DNS.19 = vmstorage.{{ telemetry_namespace }}.svc -DNS.20 = vmstorage.{{ telemetry_namespace }}.svc.cluster.local -# VMStorage StatefulSet pods -DNS.21 = vmstorage-0.vmstorage.{{ telemetry_namespace }}.svc.cluster.local -DNS.22 = vmstorage-1.vmstorage.{{ telemetry_namespace }}.svc.cluster.local -DNS.23 = vmstorage-2.vmstorage.{{ telemetry_namespace }}.svc.cluster.local +# Cluster deployment names (operator-managed) +DNS.9 = vminsert-victoria-cluster +DNS.10 = vminsert-victoria-cluster.{{ telemetry_namespace }} +DNS.11 = vminsert-victoria-cluster.{{ telemetry_namespace }}.svc +DNS.12 = 
vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.13 = vmselect-victoria-cluster +DNS.14 = vmselect-victoria-cluster.{{ telemetry_namespace }} +DNS.15 = vmselect-victoria-cluster.{{ telemetry_namespace }}.svc +DNS.16 = vmselect-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.17 = vmstorage-victoria-cluster +DNS.18 = vmstorage-victoria-cluster.{{ telemetry_namespace }} +DNS.19 = vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc +DNS.20 = vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +# VMStorage StatefulSet pods (operator-managed) +DNS.21 = vmstorage-victoria-cluster-0.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.22 = vmstorage-victoria-cluster-1.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.23 = vmstorage-victoria-cluster-2.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +# VictoriaLogs cluster deployment names (operator-managed) +DNS.24 = vlinsert-victoria-logs-cluster +DNS.25 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }} +DNS.26 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc +DNS.27 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.28 = vlselect-victoria-logs-cluster +DNS.29 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }} +DNS.30 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }}.svc +DNS.31 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.32 = vlstorage-victoria-logs-cluster +DNS.33 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }} +DNS.34 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc +DNS.35 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +# vlstorage StatefulSet pod FQDNs (operator-managed, 3 replicas) +DNS.36 = vlstorage-victoria-logs-cluster-0.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.37 = 
vlstorage-victoria-logs-cluster-1.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.38 = vlstorage-victoria-logs-cluster-2.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local IP.1 = 127.0.0.1 EOF +# Check if existing cert has the required operator-managed SANs (VictoriaMetrics and VictoriaLogs) +# If SANs are stale (missing either operator-managed name), force server cert regeneration +# CA is preserved so external clients do not need to re-import it +if [ -f "$CERT_FILE" ]; then + REQUIRED_SAN="vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local" + REQUIRED_VL_SAN="vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local" + if ! openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_SAN" || \ + ! openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_VL_SAN"; then + echo "Existing certificate missing required SAN(s): $REQUIRED_SAN and/or $REQUIRED_VL_SAN" + echo "Removing stale server cert/key/csr to force regeneration..." + rm -f "$CERT_KEY" "$CSR_FILE" "$CERT_FILE" + fi +fi + # Generate CA key if [ ! -f "$CA_KEY" ]; then echo "Generating CA key..." 
diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 index 48296183c2..dafce0aada 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 @@ -43,6 +43,14 @@ spec: - -remoteWrite.url={{ vmagent.remote_write_url }} - -remoteWrite.tlsCAFile=/etc/victoria/certs/ca.crt - -remoteWrite.tlsInsecureSkipVerify=false +{% endif %} +{% if hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool %} +{% for endpoint in telemetry_config.powerscale_configurations.additional_remote_write_endpoints | default([]) %} + - -remoteWrite.url={{ endpoint.url }} +{% if endpoint.tls_insecure_skip_verify | default(false) %} + - -remoteWrite.tlsInsecureSkipVerify=true +{% endif %} +{% endfor %} {% endif %} volumeMounts: - name: scrape-config diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vminsert.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vminsert.yaml.j2 deleted file mode 100644 index b39dda8e39..0000000000 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vminsert.yaml.j2 +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# VMInsert - Insert component for VictoriaMetrics cluster -# Accepts data ingestion and routes to vmstorage nodes - -apiVersion: v1 -kind: Service -metadata: - name: vminsert - namespace: {{ telemetry_namespace }} - labels: - app: vminsert -spec: - type: LoadBalancer - selector: - app: vminsert - ports: - - port: 8480 - targetPort: 8480 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vminsert - namespace: {{ telemetry_namespace }} - labels: - app: vminsert -spec: - replicas: {{ victoria_cluster.vminsert.replicas }} - selector: - matchLabels: - app: vminsert - template: - metadata: - labels: - app: vminsert - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - vminsert - topologyKey: "kubernetes.io/hostname" - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 5 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 5 -{% if victoria_cluster.tls_enabled %} - volumes: - - name: victoria-tls-certs - secret: - secretName: victoria-tls-certs - items: - - key: tls.crt - path: server.crt - - key: tls.key - path: server.key - - key: ca.crt - path: ca.crt -{% endif %} - containers: - - name: vminsert - image: {{ victoria_cluster.vminsert.image }} - imagePullPolicy: IfNotPresent - args: - - --storageNode=vmstorage-0.vmstorage.{{ telemetry_namespace }}.svc.cluster.local:8400 -{% for i in range(1, victoria_cluster.vmstorage.replicas) %} - - --storageNode=vmstorage-{{ i }}.vmstorage.{{ telemetry_namespace }}.svc.cluster.local:8400 -{% endfor %} - - --httpListenAddr=:8480 -{% if 
victoria_cluster.tls_enabled %} - - -tls - - -tlsCertFile=/etc/victoria/certs/server.crt - - -tlsKeyFile=/etc/victoria/certs/server.key -{% endif %} - - --maxLabelsPerTimeseries=60 - ports: - - containerPort: 8480 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} - startupProbe: - httpGet: - path: /health - port: 8480 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 10 - periodSeconds: 5 - timeoutSeconds: 3 - failureThreshold: 30 - livenessProbe: - httpGet: - path: /health - port: 8480 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - readinessProbe: - httpGet: - path: /health - port: 8480 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 5 - periodSeconds: 15 - resources: - requests: - memory: {{ victoria_cluster.vminsert.resources.requests.memory }} - cpu: {{ victoria_cluster.vminsert.resources.requests.cpu }} - limits: - memory: {{ victoria_cluster.vminsert.resources.limits.memory }} - cpu: {{ victoria_cluster.vminsert.resources.limits.cpu }} -{% if victoria_cluster.tls_enabled %} - volumeMounts: - - name: victoria-tls-certs - mountPath: /etc/victoria/certs - readOnly: true -{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmselect.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmselect.yaml.j2 deleted file mode 100644 index 63649b1068..0000000000 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmselect.yaml.j2 +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# VMSelect - Query component for VictoriaMetrics cluster -# Performs queries against vmstorage nodes and returns results - -apiVersion: v1 -kind: Service -metadata: - name: vmselect - namespace: {{ telemetry_namespace }} - labels: - app: vmselect -spec: - type: LoadBalancer - selector: - app: vmselect - ports: - - port: 8481 - targetPort: 8481 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vmselect - namespace: {{ telemetry_namespace }} - labels: - app: vmselect -spec: - replicas: {{ victoria_cluster.vmselect.replicas }} - selector: - matchLabels: - app: vmselect - template: - metadata: - labels: - app: vmselect - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - vmselect - topologyKey: "kubernetes.io/hostname" - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 5 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 5 - volumes: -{% if victoria_cluster.tls_enabled %} - - name: victoria-tls-certs - secret: - secretName: victoria-tls-certs - items: - - key: tls.crt - path: server.crt - - key: tls.key - path: server.key - - key: ca.crt - path: ca.crt -{% endif %} -{% if victoria_cluster.vmselect.cache_data_path %} - - name: cache - emptyDir: {} -{% endif %} - 
containers: - - name: vmselect - image: {{ victoria_cluster.vmselect.image }} - imagePullPolicy: IfNotPresent - args: - - --storageNode=vmstorage-0.vmstorage.{{ telemetry_namespace }}.svc.cluster.local:8401 -{% for i in range(1, victoria_cluster.vmstorage.replicas) %} - - --storageNode=vmstorage-{{ i }}.vmstorage.{{ telemetry_namespace }}.svc.cluster.local:8401 -{% endfor %} - - --httpListenAddr=:8481 -{% if victoria_cluster.tls_enabled %} - - -tls - - -tlsCertFile=/etc/victoria/certs/server.crt - - -tlsKeyFile=/etc/victoria/certs/server.key -{% endif %} - - --search.maxQueryDuration={{ victoria_cluster.vmselect.max_query_duration }} - - --search.maxConcurrentRequests={{ victoria_cluster.vmselect.max_concurrent_requests }} -{% if victoria_cluster.vmselect.cache_data_path %} - - --cacheDataPath=/cache -{% endif %} - ports: - - containerPort: 8481 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} - livenessProbe: - httpGet: - path: /health - port: 8481 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - readinessProbe: - httpGet: - path: /health - port: 8481 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 5 - periodSeconds: 15 - resources: - requests: - memory: {{ victoria_cluster.vmselect.resources.requests.memory }} - cpu: {{ victoria_cluster.vmselect.resources.requests.cpu }} - limits: - memory: {{ victoria_cluster.vmselect.resources.limits.memory }} - cpu: {{ victoria_cluster.vmselect.resources.limits.cpu }} - volumeMounts: -{% if victoria_cluster.tls_enabled %} - - name: victoria-tls-certs - mountPath: /etc/victoria/certs - readOnly: true -{% endif %} -{% if victoria_cluster.vmselect.cache_data_path %} - - name: cache - mountPath: /cache -{% endif %} diff --git 
a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmstorage.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmstorage.yaml.j2 deleted file mode 100644 index 9d79cda60b..0000000000 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-cluster-vmstorage.yaml.j2 +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# VMStorage - Storage component for VictoriaMetrics cluster -# Stores raw data and returns query results to vmselect - -apiVersion: v1 -kind: Service -metadata: - name: vmstorage - namespace: {{ telemetry_namespace }} - labels: - app: vmstorage -spec: - clusterIP: None # Headless service for StatefulSet - selector: - app: vmstorage - ports: - - port: 8482 - targetPort: 8482 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} - - port: 8400 - targetPort: 8400 - name: vminsert - - port: 8401 - targetPort: 8401 - name: vmselect ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: vmstorage - namespace: {{ telemetry_namespace }} - labels: - app: vmstorage -spec: - serviceName: vmstorage - replicas: {{ victoria_cluster.vmstorage.replicas }} - selector: - matchLabels: - app: vmstorage - template: - metadata: - labels: - app: vmstorage - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - 
podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - vmstorage - topologyKey: "kubernetes.io/hostname" - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 5 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 5 -{% if victoria_cluster.tls_enabled %} - volumes: - - name: victoria-tls-certs - secret: - secretName: victoria-tls-certs - items: - - key: tls.crt - path: server.crt - - key: tls.key - path: server.key -{% endif %} - initContainers: - # Clean up stale VictoriaMetrics lock files from previous ungraceful shutdowns - - name: cleanup-victoria-locks - image: {{ victoria_cluster.vmstorage.image }} - command: - - /bin/sh - - -c - - | - echo "Checking for stale VictoriaMetrics lock files..." - rm -f /vmstorage-data/flock.lock 2>/dev/null || true - echo "Lock file cleanup complete" - volumeMounts: - - name: vmstorage-data - mountPath: /vmstorage-data - containers: - - name: vmstorage - image: {{ victoria_cluster.vmstorage.image }} - imagePullPolicy: IfNotPresent - args: - - --storageDataPath=/vmstorage-data - - --retentionPeriod={{ hostvars['localhost']['victoria_configurations']['retention_period'] }} - - --httpListenAddr=:8482 -{% if victoria_cluster.tls_enabled %} - - -tls - - -tlsCertFile=/etc/victoria/certs/server.crt - - -tlsKeyFile=/etc/victoria/certs/server.key -{% endif %} - - --vminsertAddr=:8400 - - --vmselectAddr=:8401 -{% if victoria_cluster.vmstorage.dedup_min_scrape_interval %} - - --dedup.minScrapeInterval={{ victoria_cluster.vmstorage.dedup_min_scrape_interval }} -{% endif %} - ports: - - containerPort: 8482 -{% if victoria_cluster.tls_enabled %} - name: https -{% else %} - name: http -{% endif %} - - containerPort: 8400 - name: vminsert - - containerPort: 8401 - name: vmselect - livenessProbe: - httpGet: - path: /health - port: 8482 -{% if 
victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - readinessProbe: - httpGet: - path: /health - port: 8482 -{% if victoria_cluster.tls_enabled %} - scheme: HTTPS -{% else %} - scheme: HTTP -{% endif %} - initialDelaySeconds: 5 - periodSeconds: 15 - resources: - requests: - memory: {{ victoria_cluster.vmstorage.resources.requests.memory }} - cpu: {{ victoria_cluster.vmstorage.resources.requests.cpu }} - limits: - memory: {{ victoria_cluster.vmstorage.resources.limits.memory }} - cpu: {{ victoria_cluster.vmstorage.resources.limits.cpu }} - volumeMounts: - - name: vmstorage-data - mountPath: /vmstorage-data -{% if victoria_cluster.tls_enabled %} - - name: victoria-tls-certs - mountPath: /etc/victoria/certs - readOnly: true -{% endif %} - volumeClaimTemplates: - - metadata: - name: vmstorage-data - spec: - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: {{ hostvars['localhost']['victoria_configurations']['persistence_size'] }} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 new file mode 100644 index 0000000000..970c9b20aa --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 @@ -0,0 +1,72 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# VMAgent - VictoriaMetrics agent for scraping metrics via operator +# Managed by victoria-metrics-operator + +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMAgent +metadata: + name: vmagent + namespace: {{ telemetry_namespace }} +spec: + # Service account for kubernetes service discovery + serviceAccountName: {{ vmagent.service_account_name }} + + # Replica count + replicaCount: 1 + + # Image configuration + image: + repository: {{ vmagent.image.split(':')[0] }} + tag: {{ vmagent.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Remote write configuration - depends on deployment mode + remoteWrite: +{% if victoria_cluster.enabled %} + - url: {{ vmagent.remote_write_url_cluster }} +{% if victoria_cluster.tls_enabled %} + tlsConfig: + ca: + secret: + name: victoria-tls-certs + key: ca.crt + insecureSkipVerify: false +{% endif %} +{% else %} + - url: {{ vmagent.remote_write_url }} + tlsConfig: + insecureSkipVerify: true +{% endif %} + + # Resource limits + resources: + requests: + memory: "{{ victoria_cluster.vmagent.resources.requests.memory}}" + cpu: "{{ victoria_cluster.vmagent.resources.requests.cpu}}" + limits: + memory: "{{ victoria_cluster.vmagent.resources.limits.memory}}" + cpu: "{{ victoria_cluster.vmagent.resources.limits.cpu}}" + + # Service discovery configs - operator uses VMServiceScrape/VMPodScrape CRDs + serviceScrapeNamespaceSelector: {} + serviceScrapeSelector: {} + podScrapeNamespaceSelector: {} + podScrapeSelector: {} + + # Extra args + extraArgs: + promscrape.streamParse: "true" + promscrape.maxScrapeSize: "16MB" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 new file mode 100644 index 0000000000..b986ae0af8 --- /dev/null +++ 
b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 @@ -0,0 +1,241 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# VMCluster - VictoriaMetrics cluster deployment via operator +# Managed by victoria-metrics-operator + +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMCluster +metadata: + name: victoria-cluster + namespace: {{ telemetry_namespace }} +spec: + # Retention period from telemetry_config.yml + retentionPeriod: "{{ hostvars['localhost']['victoria_metrics_configurations']['retention_period'] }}h" + + # VMStorage configuration + vmstorage: + replicaCount: {{ victoria_cluster.vmstorage.replicas }} + image: + repository: {{ victoria_cluster.vmstorage.image.split(':')[0] }} + tag: {{ victoria_cluster.vmstorage.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Storage configuration per pod + storageDataPath: /vmstorage-data + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ hostvars['localhost']['victoria_metrics_configurations']['persistence_size'] }} + + # Resource limits + resources: + requests: + memory: {{ victoria_cluster.vmstorage.resources.requests.memory }} + cpu: {{ victoria_cluster.vmstorage.resources.requests.cpu }} + limits: + memory: {{ victoria_cluster.vmstorage.resources.limits.memory }} + cpu: {{ victoria_cluster.vmstorage.resources.limits.cpu }} + + # Pod 
anti-affinity + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vmstorage + topologyKey: "kubernetes.io/hostname" + + # Tolerations + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 5 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 5 +{% if victoria_cluster.vmstorage.dedup_min_scrape_interval or victoria_cluster.tls_enabled %} + + extraArgs: +{% if victoria_cluster.vmstorage.dedup_min_scrape_interval %} + dedup.minScrapeInterval: {{ victoria_cluster.vmstorage.dedup_min_scrape_interval }} +{% endif %} +{% if victoria_cluster.tls_enabled %} + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" +{% endif %} +{% endif %} +{% if victoria_cluster.tls_enabled %} + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} + + # VMSelect configuration + vmselect: + replicaCount: {{ victoria_cluster.vmselect.replicas }} + image: + repository: {{ victoria_cluster.vmselect.image.split(':')[0] }} + tag: {{ victoria_cluster.vmselect.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # External access via LoadBalancer (useAsDefault merges into the main service) + serviceSpec: + useAsDefault: true + spec: + type: LoadBalancer + + # Resource limits + resources: + requests: + memory: {{ victoria_cluster.vmselect.resources.requests.memory }} + cpu: {{ victoria_cluster.vmselect.resources.requests.cpu }} + limits: + memory: {{ victoria_cluster.vmselect.resources.limits.memory }} + cpu: {{ victoria_cluster.vmselect.resources.limits.cpu }} + + # 
Pod anti-affinity + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vmselect + topologyKey: "kubernetes.io/hostname" + + # Extra args for query optimization + extraArgs: +{% if victoria_cluster.vmselect.max_query_duration %} + search.maxQueryDuration: {{ victoria_cluster.vmselect.max_query_duration }} +{% endif %} +{% if victoria_cluster.vmselect.max_concurrent_requests %} + search.maxConcurrentRequests: "{{ victoria_cluster.vmselect.max_concurrent_requests }}" +{% endif %} +{% if victoria_cluster.vmselect.cache_data_path %} + cacheDataPath: /cache +{% endif %} +{% if victoria_cluster.tls_enabled %} + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" +{% endif %} +{% if victoria_cluster.tls_enabled %} + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + - key: ca.crt + path: ca.crt + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} + + # VMInsert configuration + vminsert: + replicaCount: {{ victoria_cluster.vminsert.replicas }} + image: + repository: {{ victoria_cluster.vminsert.image.split(':')[0] }} + tag: {{ victoria_cluster.vminsert.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Resource limits + resources: + requests: + memory: {{ victoria_cluster.vminsert.resources.requests.memory }} + cpu: {{ victoria_cluster.vminsert.resources.requests.cpu }} + limits: + memory: {{ victoria_cluster.vminsert.resources.limits.memory }} + cpu: {{ victoria_cluster.vminsert.resources.limits.cpu }} + + # Pod anti-affinity + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: 
app.kubernetes.io/name + operator: In + values: + - vminsert + topologyKey: "kubernetes.io/hostname" + + extraArgs: + maxLabelsPerTimeseries: "60" # not TLS-specific: always applied +{% if victoria_cluster.tls_enabled %} + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + - key: ca.crt + path: ca.crt + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} +{% if victoria_cluster.vminsert.external_access %} + + # External access via LoadBalancer (useAsDefault merges into the main service) + serviceSpec: + useAsDefault: true + spec: + type: LoadBalancer +{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 new file mode 100644 index 0000000000..4ed5c9c72d --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 @@ -0,0 +1,46 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +# VMPodScrape - Native operator-based pod discovery for idrac-telemetry +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMPodScrape +metadata: + name: idrac-telemetry-scrape + namespace: {{ telemetry_namespace }} +spec: + # Target pod selector + selector: + matchLabels: + app: {{ vmagent.target_pod_label }} + + # Namespace selector + namespaceSelector: + matchNames: + - {{ vmagent.kubernetes_sd_namespace }} + + # Pod metrics endpoints + podMetricsEndpoints: + - port: "victoriapump" + interval: {{ vmagent.global.scrape_interval }} + honorLabels: true + + # Only scrape the metrics container + relabelConfigs: + - sourceLabels: [__meta_kubernetes_pod_container_name] + regex: {{ vmagent.metrics_container_name }} + action: keep + + # Add pod IP label + - sourceLabels: [__meta_kubernetes_pod_ip] + targetLabel: pod_ip diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 new file mode 100644 index 0000000000..41af43489e --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 @@ -0,0 +1,86 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# VMSingle - VictoriaMetrics single-node deployment via operator +# Managed by victoria-metrics-operator + +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMSingle +metadata: + name: victoria-single + namespace: {{ telemetry_namespace }} +spec: + # Replica count for single-node (always 1) + replicaCount: 1 + + # Retention period from telemetry_config.yml + retentionPeriod: "{{ hostvars['localhost']['victoria_metrics_configurations']['retention_period'] }}h" + + # Storage configuration + storageDataPath: /victoria-metrics-data + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ hostvars['localhost']['victoria_metrics_configurations']['persistence_size'] }} + + # Image configuration + image: + repository: {{ victoria.image.split(':')[0] }} + tag: {{ victoria.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Port configuration + port: "8428" + + # Resource limits + resources: + requests: + memory: "2Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "2000m" + + # Pod anti-affinity for HA + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vmsingle + topologyKey: "kubernetes.io/hostname" + + # Tolerations for node failures + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 5 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 5 + + # Service configuration + extraArgs: + selfScrapeInterval: "5s" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 index 90344c3dd9..8c8af09972 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 +++ 
b/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 @@ -99,7 +99,7 @@ spec: args: - "--selfScrapeInterval=5s" - "--storageDataPath=/victoria-metrics-data" - - "--retentionPeriod={{ hostvars['localhost']['victoria_configurations']['retention_period'] }}" + - "--retentionPeriod={{ hostvars['localhost']['victoria_metrics_configurations']['retention_period'] }}" - "--httpListenAddr=:8443" - "-tls" - "-tlsCertFile=/etc/victoria/certs/server.crt" @@ -121,4 +121,4 @@ spec: accessModes: ["ReadWriteOnce"] resources: requests: - storage: "{{ hostvars['localhost']['victoria_configurations']['persistence_size'] }}" + storage: "{{ hostvars['localhost']['victoria_metrics_configurations']['persistence_size'] }}" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-tls-test-job.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-tls-test-job.yaml.j2 index ad4241f135..6de4c42882 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-tls-test-job.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-tls-test-job.yaml.j2 @@ -26,18 +26,18 @@ data: echo "==========================================" echo " VictoriaMetrics TLS Connection Test" echo "==========================================" -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' %} +{% if victoria_deployment_mode == 'cluster' %} echo "Deployment Mode: Cluster" {% if victoria_cluster.tls_enabled %} - echo "VictoriaMetrics URL: https://vmselect:8481" + echo "VictoriaMetrics URL: https://vmselect-victoria-cluster:8481" echo "Testing with CA certificate verification" - VICTORIA_URL="https://vmselect:8481" + VICTORIA_URL="https://vmselect-victoria-cluster:8481" CA_CERT="/etc/victoria/certs/ca.crt" USE_TLS="true" {% else %} - echo "VictoriaMetrics URL: http://vmselect:8481" + echo "VictoriaMetrics URL: http://vmselect-victoria-cluster:8481" echo "Testing cluster without 
TLS (HTTP)" - VICTORIA_URL="http://vmselect:8481" + VICTORIA_URL="http://vmselect-victoria-cluster:8481" CA_CERT="" USE_TLS="false" {% endif %} @@ -125,7 +125,7 @@ data: echo "" # Test 5: Test API query endpoint -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' %} +{% if victoria_deployment_mode == 'cluster' %} echo "Step 5: Testing /select/0/prometheus/api/v1/query endpoint..." if [ "$USE_TLS" = "true" ]; then QUERY_RESPONSE=$(curl -s --max-time 30 --cacert "$CA_CERT" "${VICTORIA_URL}/select/0/prometheus/api/v1/query?query=up" || echo "failed") @@ -316,8 +316,8 @@ data: if [ "$USE_TLS" = "true" ]; then echo "Step 7: Checking server certificate details..." if command -v openssl > /dev/null 2>&1; then -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' %} - echo | openssl s_client -connect vmselect:8481 -CAfile "$CA_CERT" 2>/dev/null | \ +{% if victoria_deployment_mode == 'cluster' %} + echo | openssl s_client -connect vmselect-victoria-cluster:8481 -CAfile "$CA_CERT" 2>/dev/null | \ openssl x509 -noout -subject -issuer -dates 2>/dev/null | sed 's/^/ /' || \ echo " ⚠ Could not retrieve server certificate details" {% else %} @@ -374,7 +374,7 @@ spec: spec: restartPolicy: Never volumes: -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'single-node' or victoria_cluster.tls_enabled %} +{% if victoria_deployment_mode == 'single-node' or victoria_cluster.tls_enabled %} - name: victoria-tls-certs secret: secretName: victoria-tls-certs @@ -388,7 +388,7 @@ spec: image: curlimages/curl:8.17.0 imagePullPolicy: IfNotPresent volumeMounts: -{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'single-node' or victoria_cluster.tls_enabled %} +{% if victoria_deployment_mode == 'single-node' or victoria_cluster.tls_enabled %} - mountPath: /etc/victoria/certs name: victoria-tls-certs readOnly: true diff --git 
a/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 index 501328c1c8..e84877af56 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 @@ -27,6 +27,10 @@ rules: - apiGroups: [""] resources: ["pods", "services", "endpoints"] verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch"] + --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding @@ -40,4 +44,4 @@ subjects: roleRef: kind: Role name: "{{ vmagent.role_name }}" - apiGroup: rbac.authorization.k8s.io \ No newline at end of file + apiGroup: rbac.authorization.k8s.io diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 new file mode 100644 index 0000000000..22b4ecef11 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 @@ -0,0 +1,208 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# VLAgent - VictoriaLogs log collection agent via operator +# Managed by victoria-metrics-operator (>= v0.59.0) +# +# Purpose: Platform-managed log forwarding agent providing: +# - Syslog reception (RFC 3164/5424) on ports 514 (plaintext) and 6514 (TLS) +# - JSON Lines forwarding to vlinsert ingestion endpoint +# - Client-side buffering with PVC-backed persistence for retry during vlinsert unavailability +# +# Design: Source-neutral base configuration with no source-specific relabel rules. +# Downstream capabilities extend this configuration for specific log sources. + +apiVersion: operator.victoriametrics.com/v1 +kind: VLAgent +metadata: + name: vlagent + namespace: {{ telemetry_namespace }} + labels: + app: vlagent + component: victorialogs +spec: + # ======================================== + # Container Image Configuration + # ======================================== + image: + repository: {{ victoria_logs_cluster.vlagent.image.split(':')[0] }} + tag: {{ victoria_logs_cluster.vlagent.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # ======================================== + # Replica Configuration + # ======================================== + # Single replica (DD-A1): Platform-managed log forwarding agent + # HA provided by PVC buffer persistence and pod restart + replicaCount: {{ victoria_logs_cluster.vlagent.replicas }} + + # ======================================== + # Configuration Management + # ======================================== + # ConfigMap containing VLAgent syslog receiver configuration + configSecret: vlagent-config + + # ======================================== + # Remote Write Configuration + # ======================================== + # Forward logs to VictoriaLogs vlinsert endpoint + # Supports JSON Lines format with optional TLS + remoteWrite: +{% if victoria_logs_cluster.tls_enabled %} + - url: https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/insert/jsonline + tlsConfig: + ca: 
/etc/victoria/certs/ca.crt +{% else %} + - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/insert/jsonline +{% endif %} + + # ======================================== + # Resource Allocation + # ======================================== + # CPU: 50m request, 250m limit (I/O-bound, low CPU usage) + # Memory: 128Mi request, 512Mi limit (in-memory batch buffers + disk-backed WAL) + resources: + requests: + memory: {{ victoria_logs_cluster.vlagent.resources.requests.memory }} + cpu: {{ victoria_logs_cluster.vlagent.resources.requests.cpu }} + limits: + memory: {{ victoria_logs_cluster.vlagent.resources.limits.memory }} + cpu: {{ victoria_logs_cluster.vlagent.resources.limits.cpu }} + + # ======================================== + # Persistent Storage Configuration + # ======================================== + # PVC buffer for client-side log persistence and retry during vlinsert unavailability + # Default: 5Gi (sufficient for ~24-48 hours of log accumulation at typical syslog rates) + # Configurable via victoria_logs_cluster.vlagent.pvc_size in vars/main.yml + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ victoria_logs_cluster.vlagent.pvc_size | default('5Gi') }} + + # ======================================== + # TLS Certificate Configuration + # ======================================== + # Shared with VictoriaMetrics cluster + # Used for: + # - Syslog TLS receiver (:6514) — server certificate + # - remoteWrite to vlinsert — CA certificate validation +{% if victoria_logs_cluster.tls_enabled %} + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + - key: ca.crt + path: ca.crt + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} + + # ======================================== + # Service Exposure 
Configuration + # ======================================== + # Service type: LoadBalancer (MetalLB) or NodePort (fallback) + # Exposes syslog receivers (:514 TCP+UDP, :6514 TLS) for external log sources +{% if metalLB_deployed | default(false) %} + serviceSpec: + useAsDefault: true + spec: + type: LoadBalancer +{% else %} + serviceSpec: + useAsDefault: true + spec: + type: NodePort +{% endif %} + + # ======================================== + # Port Configuration + # ======================================== + # Syslog receivers (platform-provided scrape targets) + # - :514 TCP+UDP — plaintext syslog (RFC 3164/5424) + # - :6514 TCP — TLS syslog (RFC 5425) + # Health check endpoint + # - :9429 — HTTP health checks (distinct from vmagent 8429) + ports: + - name: syslog + port: 514 + targetPort: 514 + protocol: TCP + - name: syslog-udp + port: 514 + targetPort: 514 + protocol: UDP + - name: syslog-tls + port: 6514 + targetPort: 6514 + protocol: TCP + - name: health + port: 9429 + targetPort: 9429 + protocol: TCP + + # ======================================== + # Health Probes + # ======================================== + # Liveness probe: Restart pod if health check fails + # Readiness probe: Route traffic only to ready pods + livenessProbe: + httpGet: + path: /health + port: 9429 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 3 + + readinessProbe: + httpGet: + path: /health + port: 9429 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + + # ======================================== + # Pod Scheduling and Affinity + # ======================================== + # No pod anti-affinity required (single replica) + # Tolerations allow scheduling on nodes with taints + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 5 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 5 + + # 
======================================== + # Termination Grace Period + # ======================================== + # Allow time for graceful shutdown and buffer flush + terminationGracePeriodSeconds: 30 diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 new file mode 100644 index 0000000000..ec289f4e5e --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 @@ -0,0 +1,231 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# VLCluster - VictoriaLogs cluster deployment via operator +# Managed by victoria-metrics-operator (>= v0.59.0) + +apiVersion: operator.victoriametrics.com/v1 +kind: VLCluster +metadata: + name: victoria-logs-cluster + namespace: {{ telemetry_namespace }} +spec: + # Single image version for all cluster components + # Operator determines component role (vlstorage, vlinsert, vlselect) internally + clusterVersion: {{ victoria_logs_cluster.vlstorage.image.split(':')[1] }} + + # ======================== + # vlstorage — Persistent log storage (StatefulSet, 3 replicas) + # ======================== + vlstorage: + replicaCount: {{ victoria_logs_cluster.vlstorage.replicas }} + image: + repository: {{ victoria_logs_cluster.vlstorage.image.split(':')[0] }} + tag: {{ victoria_logs_cluster.vlstorage.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Ports are managed by operator defaults (9491, 9400, 9401) + + storageDataPath: /vlstorage-data + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ hostvars['localhost']['victoria_logs_configurations']['storage_size'] }} + + resources: + requests: + memory: {{ victoria_logs_cluster.vlstorage.resources.requests.memory }} + cpu: {{ victoria_logs_cluster.vlstorage.resources.requests.cpu }} + limits: + memory: {{ victoria_logs_cluster.vlstorage.resources.limits.memory }} + cpu: {{ victoria_logs_cluster.vlstorage.resources.limits.cpu }} + + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vlstorage + topologyKey: "kubernetes.io/hostname" + + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 5 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 5 + + extraArgs: + retentionPeriod: "{{ 
hostvars['localhost']['victoria_logs_configurations']['retention_period'] }}h" +{% if victoria_logs_cluster.tls_enabled %} + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" +{% endif %} +{% if victoria_logs_cluster.tls_enabled %} + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} + + # ======================== + # vlinsert — Log ingestion gateway (Deployment, 2 replicas) + # ======================== + vlinsert: + replicaCount: {{ victoria_logs_cluster.vlinsert.replicas }} + image: + repository: {{ victoria_logs_cluster.vlinsert.image.split(':')[0] }} + tag: {{ victoria_logs_cluster.vlinsert.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Ports are managed by operator defaults (9481) + + resources: + requests: + memory: {{ victoria_logs_cluster.vlinsert.resources.requests.memory }} + cpu: {{ victoria_logs_cluster.vlinsert.resources.requests.cpu }} + limits: + memory: {{ victoria_logs_cluster.vlinsert.resources.limits.memory }} + cpu: {{ victoria_logs_cluster.vlinsert.resources.limits.cpu }} + + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vlinsert + topologyKey: "kubernetes.io/hostname" +{% if victoria_logs_cluster.tls_enabled %} + + extraArgs: + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" + storageNode.tls: "true" + storageNode.tlsCAFile: "/etc/victoria/certs/ca.crt" + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + - key: ca.crt + path: 
ca.crt + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} +{% if victoria_logs_cluster.vlinsert.external_access %} + + serviceSpec: + useAsDefault: true + spec: + type: LoadBalancer +{% endif %} + + # ======================== + # vlselect — Log query gateway (Deployment, 2 replicas) + # ======================== + vlselect: + replicaCount: {{ victoria_logs_cluster.vlselect.replicas }} + image: + repository: {{ victoria_logs_cluster.vlselect.image.split(':')[0] }} + tag: {{ victoria_logs_cluster.vlselect.image.split(':')[1] }} + pullPolicy: IfNotPresent + + # Ports are managed by operator defaults (9471) + + serviceSpec: + useAsDefault: true + spec: + type: LoadBalancer + + resources: + requests: + memory: {{ victoria_logs_cluster.vlselect.resources.requests.memory }} + cpu: {{ victoria_logs_cluster.vlselect.resources.requests.cpu }} + limits: + memory: {{ victoria_logs_cluster.vlselect.resources.limits.memory }} + cpu: {{ victoria_logs_cluster.vlselect.resources.limits.cpu }} + + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - vlselect + topologyKey: "kubernetes.io/hostname" +{% if victoria_logs_cluster.tls_enabled %} + + extraArgs: + tls: "true" + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" + storageNode.tls: "true" + storageNode.tlsCAFile: "/etc/victoria/certs/ca.crt" + + volumes: + - name: victoria-tls-certs + secret: + secretName: victoria-tls-certs + items: + - key: tls.crt + path: server.crt + - key: tls.key + path: server.key + - key: ca.crt + path: ca.crt + volumeMounts: + - name: victoria-tls-certs + mountPath: /etc/victoria/certs + readOnly: true +{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 
b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 new file mode 100644 index 0000000000..819ca03670 --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 @@ -0,0 +1,155 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# VLAgent Configuration ConfigMap +# Mounted as vlagent.yml in the VLAgent pod +# Defines syslog receivers and log forwarding pipeline to vlinsert + +apiVersion: v1 +kind: ConfigMap +metadata: + name: vlagent-config + namespace: {{ telemetry_namespace }} + labels: + app: vlagent + component: victorialogs +data: + vlagent.yml: | + # ============================================================================ + # VLAgent Platform Base Configuration + # ============================================================================ + # Source-neutral: no source-specific relabel rules or external labels + # Downstream capabilities extend this configuration for specific log sources + # + # Pipeline: + # 1. RECEIVE: Syslog messages on :514 (plaintext) and :6514 (TLS) + # 2. PARSE: Convert syslog to JSON Lines format + # 3. FORWARD: Send to vlinsert via remoteWrite (HTTPS when tls_enabled, HTTP otherwise) + # 4. 
BUFFER: PVC-backed persistence for retry during vlinsert unavailability + + # ============================================================================ + # SYSLOG RECEIVERS (Platform-provided scrape targets) + # ============================================================================ + # Listens for incoming syslog messages from external sources + # Supports RFC 3164 (BSD) and RFC 5424 (structured) formats + # + # Plaintext receiver (:514): + # - TCP and UDP support + # - Standard syslog port + # - Suitable for trusted internal networks + # + # TLS receiver (:6514): + # - TCP only (RFC 5425) + # - Encrypted syslog transport + # - Requires server certificate from victoria-tls-certs secret + syslog: + # Plaintext syslog receiver (RFC 3164/5424) + # Listens on all interfaces, TCP and UDP + listenAddr: "0.0.0.0:514" + + # TLS syslog receiver (RFC 5425) + # Listens on all interfaces, TCP only + tlsListenAddr: "0.0.0.0:6514" + +{% if victoria_logs_cluster.tls_enabled %} + # TLS certificate and key for syslog TLS receiver + # Sourced from shared victoria-tls-certs secret + tlsCertFile: "/etc/victoria/certs/server.crt" + tlsKeyFile: "/etc/victoria/certs/server.key" +{% endif %} + + # ============================================================================ + # LOG FORWARDING PIPELINE (remoteWrite to vlinsert) + # ============================================================================ + # Forwards parsed logs to vlinsert ingestion endpoint + # Format: JSON Lines (NDJSON) over HTTPS + # Stream identification: hostname and app_name fields for consistent-hash sharding + # + # Behavior: + # - Batches logs and sends HTTP POST to vlinsert + # - Batch size: ~1 MB or 10,000 entries (whichever comes first) + # - Flush interval: 1 second (default) + # - Retry: Exponential backoff (1s → 60s) on vlinsert errors + # - Persistence: Unsent batches buffered to PVC on vlinsert unavailability + remoteWrite: + # vlinsert endpoint (in-cluster FQDN) + # Operator creates service 
as vlinsert-victoria-logs-cluster + # Port 9481: ingestion endpoint (HTTPS when tls_enabled, HTTP otherwise) + # Path: /insert/jsonline — primary VictoriaLogs ingestion format + # Query params: + # _stream_fields=hostname,app_name — stream identification for sharding + url: "{{ 'https' if victoria_logs_cluster.tls_enabled else 'http' }}://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/insert/jsonline?_stream_fields=hostname,app_name" + +{% if victoria_logs_cluster.tls_enabled %} + # TLS configuration for remoteWrite client + # Validates vlinsert's server certificate using CA cert from victoria-tls-certs secret + tls_config: + ca_file: "/etc/victoria/certs/ca.crt" + insecure_skip_verify: false +{% endif %} + + # ============================================================================ + # PERSISTENT QUEUE CONFIGURATION (Client-side buffering) + # ============================================================================ + # Disk-backed write-ahead log (WAL) for log persistence + # Prevents log loss during vlinsert unavailability (pod restart, rolling update) + # + # Behavior: + # - VLAgent writes incoming logs to disk before forwarding + # - On vlinsert error: Batches persisted to PVC, retried with exponential backoff + # - On VLAgent restart: Unsent batches read from PVC and retried + # - On PVC full: Oldest buffered entries evicted (FIFO) to make room + # + # Sizing: + # - Default: 5Gi (sufficient for ~24-48 hours of log accumulation) + # - Configurable via victoria_logs_cluster.vlagent.pvc_size in vars/main.yml + persistentQueue: + # Mount path for PVC buffer storage + # Corresponds to storage.volumeClaimTemplate in VLAgent CR + dir: "/vlagent-data" + + # Maximum buffer size before eviction + # Matches PVC size (5Gi default) + maxPendingBytes: "5GiB" + + # ============================================================================ + # EXTENSION POINTS FOR DOWNSTREAM CAPABILITIES + # 
============================================================================ + # This platform base configuration is source-neutral. + # Downstream capabilities (separate epics) extend this configuration with: + # + # 1. Source-specific relabel rules + # Example: Add labels for PowerScale syslog sources + # relabeling: + # - source_labels: [hostname] + # regex: "powerscale-.*" + # target_label: source + # replacement: powerscale + # + # 2. External labels for source identification + # Example: Add cluster identifier + # external_labels: + # cluster: "production" + # environment: "prod" + # + # 3. Additional remoteWrite destinations + # Example: Dual-write to secondary vlinsert + # remoteWrite: + # - url: "https://secondary-vlinsert:9480/insert/jsonline" + # + # 4. Parsing and enrichment rules + # Example: Extract fields from syslog message + # parsing: + # - type: json + # field: message diff --git a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 index d653bbcef6..caa70954a6 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 @@ -59,4 +59,19 @@ data: # Add Pod IP label - source_labels: [__meta_kubernetes_pod_ip] - target_label: pod_ip \ No newline at end of file + target_label: pod_ip +{% if hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool %} + + # PowerScale OTEL Collector scrape targets (per cluster) +{% for cluster in ps_clusters %} + - job_name: "otel-collector-powerscale-cluster{{ loop.index0 }}" + honor_labels: true + scrape_interval: {{ vmagent.global.scrape_interval }} + static_configs: + - targets: ['otel-collector.{{ telemetry_namespace }}.svc.cluster.local:8889'] + labels: + source: powerscale + cluster: "{{ cluster.clusterName }}" + 
cluster_endpoint: "{{ cluster.endpoint }}" +{% endfor %} +{% endif %} diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 5d3748c69a..443c362222 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -100,12 +100,13 @@ victoria: image: "{{ telemetry_images['victoriametrics/victoria-metrics'] | default('victoriametrics/victoria-metrics:v1.128.0') }}" # VictoriaMetrics Cluster Configuration -# Deployment mode is controlled by victoria_configurations.deployment_mode in telemetry_config.yml +# Deployment mode is controlled by victoria_deployment_mode variable (default: cluster) # Supported modes: "single-node" or "cluster" +victoria_deployment_mode: "cluster" # Default deployment mode for VictoriaMetrics victoria_cluster: - # Auto-configured based on telemetry_config.yml + # Auto-configured based on victoria_deployment_mode variable # true = cluster mode, false = single-node mode - enabled: "{{ true if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' else false }}" + enabled: "{{ true if victoria_deployment_mode == 'cluster' else false }}" tls_enabled: true # Set to true to enable TLS for cluster components # VMStorage: Stores raw data and returns query results vmstorage: @@ -147,6 +148,89 @@ victoria_cluster: memory: "1Gi" cpu: "1000m" + vmagent: + replicas: 1 + image: "{{ telemetry_images['victoriametrics/vmagent'] | default('victoriametrics/vmagent:v1.128.0') }}" + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "512Mi" + cpu: "250m" + +# ============================================================================ +# VictoriaLogs Cluster Configuration +# ============================================================================ +# Deployed alongside VictoriaMetrics when 'victoria' is in telemetry_collection_type. +# Managed by the same VictoriaMetrics operator via VLCluster and VLAgent CRs. 
+# See VL_cluster_component_spec.md and VL_Agent_component_spec.md for full design. +victoria_logs_cluster: + tls_enabled: true # Enable TLS for all inter-component communication (shared victoria-tls-certs secret) + + # vlstorage: Persistent log storage nodes (StatefulSet managed by operator via VLCluster CR) + vlstorage: + replicas: 3 + image: "{{ telemetry_images['victoriametrics/victoria-logs'] | default('docker.io/victoriametrics/victoria-logs:v1.49.0') }}" + resources: + requests: + memory: "1Gi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + + # vlinsert: Log ingestion gateway (Deployment managed by operator via VLCluster CR) + vlinsert: + replicas: 2 + image: "{{ telemetry_images['victoriametrics/victoria-logs'] | default('docker.io/victoriametrics/victoria-logs:v1.49.0') }}" + external_access: true # Expose vlinsert via LoadBalancer service for external log ingestion + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "1000m" + + # vlselect: Log query gateway (Deployment managed by operator via VLCluster CR) + vlselect: + replicas: 2 + image: "{{ telemetry_images['victoriametrics/victoria-logs'] | default('docker.io/victoriametrics/victoria-logs:v1.49.0') }}" + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "1000m" + + # VLAgent: Platform-managed log forwarding agent (Deployment managed by operator via VLAgent CR) + vlagent: + replicas: 1 + image: "{{ telemetry_images['victoriametrics/vlagent'] | default('docker.io/victoriametrics/vlagent:v1.49.0') }}" + pvc_size: "5Gi" # Buffer storage for retry during vlinsert unavailability + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "512Mi" + cpu: "250m" + +# VictoriaLogs ports (operator v0.66.1 defaults) +# Note: VictoriaMetrics operator uses these default ports and ignores custom port specifications +# TODO: uncomment after verifying that the Victoria operator does not fall back to its default ports +# 
victoria_logs_ports: +# vlinsert_http: 9481 # Ingestion clients → vlinsert (HTTPS, LoadBalancer) - operator default +# vlselect_http: 9471 # Query clients → vlselect (HTTPS, LoadBalancer) - operator default +# vlstorage_http: 9491 # Health checks and admin API (internal) - operator default +# vlstorage_insert: 9400 # vlinsert → vlstorage data sharding (HTTPS, internal) +# vlstorage_select: 9401 # vlselect → vlstorage query fan-out (HTTPS, internal) +# vlagent_syslog: 514 # Syslog receiver plaintext (TCP+UDP) +# vlagent_syslog_tls: 6514 # Syslog receiver TLS (TCP, RFC 5425) +# vlagent_http: 9429 # VLAgent health checks (distinct from vmagent 8429) + # Telemetry shared path configuration telemetry_share_path: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/telemetry" @@ -174,12 +258,20 @@ vmagent: # Single-node URL remote_write_url: "https://victoria-loadbalancer.telemetry.svc.cluster.local:8443/api/v1/write" # Cluster URL (used when victoria_cluster.enabled: true) - remote_write_url_cluster: > - {% if victoria_cluster.tls_enabled %}https{% else %} - http{% endif %}://vminsert.{{ telemetry_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write + # Operator creates service as vminsert-victoria-cluster (not vminsert) + # Protocol depends on tls_enabled: https when TLS is on, http otherwise + remote_write_url_cluster: >- + {{ 'https' if victoria_cluster.tls_enabled else 'http' }}://vminsert-victoria-cluster.{{ + telemetry_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write strmzi_kafka_tarball_url: "{{ offline_tarball_path }}/{{ strimzi_kafka_pkg }}/{{ strimzi_kafka_pkg }}.tar.gz" +# Victoria Metrics operator tarball configuration +# Version must match the Helm chart entry in service_k8s.json (victoria-metrics-operator-0.59.3) +# Required for VLCluster and VLAgent CRD support (minimum v0.59.0) +victoria_operator_pkg: "victoria-metrics-operator-0.59.3" +victoria_operator_tarball_url: "{{ offline_tarball_path }}/{{ 
victoria_operator_pkg }}/{{ victoria_operator_pkg }}.tar.gz" + # Usage: validate_idrac_inventory.yml bmc_group_data_filename: "/opt/omnia/telemetry/bmc_group_data.csv" bmc_group_data_headers: "BMC_IP,GROUP_NAME,PARENT" @@ -194,30 +286,42 @@ common_mode: "0755" # Usage: generate_telemetry_deployments.yml - Template lists for different components # Victoria templates - conditional based on victoria_cluster.enabled victoria_templates_common: - - src: 'telemetry/victoria/victoria-tls-secret.yaml.j2' - dest: 'victoria-tls-secret.yaml' - src: 'telemetry/victoria/victoria-vmagent-rbac.yaml.j2' dest: 'victoria-vmagent-rbac.yaml' - - src: 'telemetry/victoria/vmagent-scrape-config.yaml.j2' - dest: 'vmagent-scrape-config.yaml' - - src: 'telemetry/victoria/victoria-agent-deployment.yaml.j2' - dest: 'victoria-agent-deployment.yaml' + +# Operator-based templates (new default) +# Single-node operator template (used when victoria_cluster.enabled: false) +victoria_templates_operator_single: + - src: 'telemetry/victoria/victoria-operator-vmsingle.yaml.j2' + dest: 'victoria-operator-vmsingle.yaml' + - src: 'telemetry/victoria/victoria-operator-vmagent.yaml.j2' + dest: 'victoria-operator-vmagent.yaml' + - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' + dest: 'victoria-operator-vmpodscrape.yaml' + +# Cluster operator template (used when victoria_cluster.enabled: true) +victoria_templates_operator_cluster: + - src: 'telemetry/victoria/victoria-operator-vmcluster.yaml.j2' + dest: 'victoria-operator-vmcluster.yaml' + - src: 'telemetry/victoria/victoria-operator-vmagent.yaml.j2' + dest: 'victoria-operator-vmagent.yaml' + - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' + dest: 'victoria-operator-vmpodscrape.yaml' + +# Legacy manual deployment templates (removed - use operator-based templates above) +# Raw victoria-cluster-vminsert/vmselect/vmstorage.yaml.j2 files have been removed +# in favor of the operator-managed victoria-operator-vmcluster.yaml.j2 
idrac_telemetry_statefulset_path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/idrac_telemetry_statefulset.yaml" # Single-node templates (used when victoria_cluster.enabled: false) victoria_templates_single: + - src: 'telemetry/victoria/victoria-tls-secret.yaml.j2' + dest: 'victoria-tls-secret.yaml' - src: 'telemetry/victoria/victoria-statefulset.yaml.j2' dest: 'victoria-statefulset.yaml' - -# Cluster templates (used when victoria_cluster.enabled: true) -victoria_templates_cluster: - - src: 'telemetry/victoria/victoria-cluster-vmstorage.yaml.j2' - dest: 'victoria-cluster-vmstorage.yaml' - - src: 'telemetry/victoria/victoria-cluster-vminsert.yaml.j2' - dest: 'victoria-cluster-vminsert.yaml' - - src: 'telemetry/victoria/victoria-cluster-vmselect.yaml.j2' - dest: 'victoria-cluster-vmselect.yaml' + - src: 'telemetry/victoria/victoria-agent-deployment.yaml.j2' + dest: 'victoria-agent-deployment.yaml' # Test job template (optional) victoria_templates_test: @@ -228,9 +332,22 @@ victoria_templates_test: # Note: victoria_templates_test is commented out by default in kustomization.yaml.j2 victoria_templates: > {{ victoria_templates_common + - (victoria_templates_cluster if victoria_cluster.enabled else victoria_templates_single) + + (victoria_templates_operator_cluster if victoria_cluster.enabled else victoria_templates_operator_single) + victoria_templates_test }} +# ============================================================================ +# VictoriaLogs Template Lists +# ============================================================================ +# VictoriaLogs operator CR templates (used when 'victoria' in telemetry_collection_type) +# These are applied alongside VictoriaMetrics templates in the same kustomize deployment. 
+victorialogs_templates: + - src: 'telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2' + dest: 'victorialogs-operator-vlcluster.yaml' + - src: 'telemetry/victoria/victorialogs-operator-vlagent.yaml.j2' + dest: 'victorialogs-operator-vlagent.yaml' + - src: 'telemetry/victoria/victorialogs-vlagent-config.yaml.j2' + dest: 'victorialogs-vlagent-config.yaml' + kafka_templates: - src: 'telemetry/kafka/kafka.kafka.yaml.j2' dest: 'kafka.kafka.yaml' @@ -279,3 +396,91 @@ ldms_pod_not_ready_msg: "WARNING: LDMS aggregator pod did not become ready withi ldms_store_pod_ready_msg: "LDMS store daemon pod restarted successfully and is ready" ldms_store_pod_not_ready_msg: "LDMS store daemon pod restart failed or not ready within timeout" ldms_store_restart_wait_seconds: 10 + +# ============================================================================ +# PowerScale Telemetry Configuration +# ============================================================================ +# Usage: deploy_powerscale_metrics.yml +# PowerScale telemetry is deployed via Helm chart (karavi-observability). +# The Helm chart deploys CSM Metrics PowerScale, OTEL Collector, and all +# associated Kubernetes resources. No per-cluster Kustomize manifests needed. +# PowerScale metrics are scraped by the existing shared vmagent. 
+ +# Karavi Observability offline installer variables +# All files are on the NFS share at {{ k8s_client_mount_path }}/karavi-observability/ +karavi_observability_git: "karavi-observability.tar.gz" +karavi_helm_charts_git: "helm-charts.tar.gz" +karavi_helm_chart_path: "{{ k8s_client_mount_path }}/karavi-observability/helm-charts/charts/karavi-observability" +permission_644: "0644" + +# Internal namespace constants (not user-configurable) +csm_namespace: "telemetry" + +# CSM Metrics PowerScale resource limits +csm_metrics_powerscale_resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "512Mi" + +# Usage: deploy_powerscale_metrics.yml - messages +ps_csi_driver_not_configured_msg: >- + PowerScale telemetry requires csi_driver_powerscale to be configured in software_config.json. + Please add csi_driver_powerscale to software_config.json and re-run. +ps_helm_values_path_missing_msg: >- + csm_observability_values_file_path is required in telemetry_config.yml when powerscale_configurations.powerscale_telemetry_support is true. + Provide the path to your customized karavi-observability Helm values file. +ps_helm_values_file_not_found_msg: >- + Helm values file not found at '{{ ps_helm_values_file | default('') }}'. + Please create a values file following the Dell CSM Observability documentation + and set the path in telemetry_config.yml (csm_observability_values_file_path). +ps_helm_values_parse_fail_msg: >- + Failed to parse Helm values file at '{{ ps_helm_values_file | default('') }}'. + Please verify the file contains valid YAML. +ps_cert_manager_disabled_msg: >- + cert-manager must be enabled in the CSM Observability Helm values file. + Set 'cert-manager.enabled: true' in {{ ps_helm_values_file | default('') }}. + cert-manager is required for automatic TLS certificate management (otel-collector-tls secret). 
+ps_auth_mode_direct_msg: "PowerScale telemetry authentication mode: Direct Authentication (Mode A)" +ps_auth_mode_karavi_msg: "PowerScale telemetry authentication mode: Karavi Authorization (Mode B)" +ps_csi_secret_read_fail_msg: >- + Failed to load CSI PowerScale driver secret from '{{ ps_csi_secret_path | default('') }}'. + Please verify the file exists and contains valid isilonClusters configuration. +ps_no_clusters_found_msg: >- + No PowerScale clusters found in CSI driver secret.yaml. + Please define at least one cluster in isilonClusters. +ps_clusters_found_msg: >- + Found {{ ps_clusters | default([]) | length }} PowerScale cluster(s) for telemetry deployment. +ps_telemetry_prepared_msg: >- + PowerScale telemetry files prepared on NFS share. + Helm chart: {{ karavi_helm_chart_path }} + Values file: {{ k8s_client_mount_path }}/karavi-observability/csm_metrics_values.yaml + Deployment will occur during cloud-init (PXE boot) on the control plane node. +ps_cert_manager_pkg_msg: >- + cert-manager package from service_k8s.json: {{ cert_manager_package | default('') }} ({{ cert_manager_chart_tgz | default('') }}) +ps_cert_manager_required_msg: "cert-manager dependency download: REQUIRED" +ps_cert_manager_skipped_msg: "cert-manager dependency download: SKIPPED (disabled in values file)" +ps_cert_manager_staged_msg: >- + cert-manager ({{ cert_manager_package | default('') }}) extracted to + {{ karavi_helm_chart_path }}/charts/cert-manager/. + Helm will resolve this dependency during install on the control plane node. +ps_cert_manager_skipped_detail_msg: >- + cert-manager dependency skipped (disabled in values file). + CRDs are bundled in karavi-observability chart and will be applied automatically. +ps_cert_manager_extract_fail_msg: >- + cert-manager chart was not extracted correctly to + {{ karavi_helm_chart_path }}/charts/cert-manager/. + The archive {{ cert_manager_chart_tgz | default('') }} may be corrupt. 
+ps_cert_manager_missing_msg: >- + cert-manager Helm chart ({{ cert_manager_chart_tgz | default('') }}) was not found in + {{ karavi_helm_chart_path }}/charts/. This chart is required for TLS certificate + management. Ensure the Jetstack cert-manager entry exists in service_k8s.json + and run local_repo.yml to download it to Pulp. +ps_dependency_fail_msg: >- + Failed to get karavi-observability dependencies from Pulp. + Ensure the following entries exist in service_k8s.json and run local_repo.yml: + - karavi-observability (git) + - helm-charts (git) + - {{ cert_manager_package | default('cert-manager') }} (tarball from Jetstack Helm repo) diff --git a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml index 1aa095e66b..9e431f6671 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml @@ -50,12 +50,11 @@ - name: Normalize telemetry_config.yml values ansible.builtin.set_fact: telemetry_idrac_telemetry_support: "{{ backup_telemetry_config.idrac_telemetry_support | default(true) }}" - telemetry_idrac_telemetry_collection_type: >- + telemetry_telemetry_collection_type: >- {{ - backup_telemetry_config.idrac_telemetry_collection_type + backup_telemetry_config.telemetry_collection_type | default('victoria,kafka') }} - telemetry_victoria_deployment_mode: "{{ backup_telemetry_victoria_config.deployment_mode | default('cluster') }}" telemetry_victoria_persistence_size: "{{ backup_telemetry_victoria_config.persistence_size | default('8Gi') }}" telemetry_victoria_retention_period: "{{ backup_telemetry_victoria_config.retention_period | default(168) }}" telemetry_kafka_persistence_size: "{{ backup_telemetry_kafka_config.persistence_size | default('8Gi') }}" @@ -112,8 +111,7 @@ mode: "{{ default_file_mode }}" vars: telemetry_idrac_telemetry_support: "{{ 
telemetry_idrac_telemetry_support }}" - telemetry_idrac_telemetry_collection_type: "{{ telemetry_idrac_telemetry_collection_type }}" - telemetry_victoria_deployment_mode: "{{ telemetry_victoria_deployment_mode }}" + telemetry_telemetry_collection_type: "{{ telemetry_telemetry_collection_type }}" telemetry_victoria_persistence_size: "{{ telemetry_victoria_persistence_size }}" telemetry_victoria_retention_period: "{{ telemetry_victoria_retention_period }}" telemetry_kafka_persistence_size: "{{ telemetry_kafka_persistence_size }}" diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 index cb89944e1c..ae57457882 100644 --- a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 @@ -82,40 +82,17 @@ idrac_telemetry_support: {{ telemetry_idrac_telemetry_support | default(true) | # - "kafka" : Store in Kafka only # - "victoria,kafka" : Store in both (recommended) # Default: "victoria,kafka" -idrac_telemetry_collection_type: {{ telemetry_idrac_telemetry_collection_type | default('victoria,kafka') | to_json }} +telemetry_collection_type: {{ telemetry_telemetry_collection_type | default('victoria,kafka') | to_json }} # ============================================================================ # VICTORIAMETRICS CONFIGURATION # ============================================================================ # VictoriaMetrics is a time-series database for storing telemetry metrics. -# Used for iDRAC telemetry when 'victoria' is enabled in idrac_telemetry_collection_type. +# Used for iDRAC telemetry when 'victoria' is enabled in telemetry_collection_type. 
# -# DEPLOYMENT MODES: -# - single-node: Simple deployment with one pod (suitable for small deployments) -# - cluster: High-availability deployment with multiple components -# (recommended for production and large-scale deployments) -victoria_configurations: - # VictoriaMetrics deployment mode - # Supported values: - # - "single-node" : Simple deployment (1 pod, suitable for dev/test) - # - "cluster" : High-availability deployment (7 pods, recommended for production) - # Default: "cluster" - # - # Cluster Mode Benefits: - # - High availability (no single point of failure) - # - Horizontal scalability (scale components independently) - # - Better performance (4x ingestion, 2x query speed) - # - Production-ready architecture - # - # Single-Node Benefits: - # - Simple setup (fewer resources) - # - Suitable for small deployments (<10 nodes) - # - Lower resource usage (~4Gi memory vs ~10Gi for cluster) - deployment_mode: {{ telemetry_victoria_deployment_mode | default('cluster') | to_json }} - +victoria_metrics_configurations: # The amount of storage allocated for EACH VictoriaMetrics persistent volume. # IMPORTANT: Total VictoriaMetrics storage depends on deployment mode: - # - Single-node mode: Total storage = persistence_size × 1 pod # - Cluster mode: Total storage = persistence_size × 3 vmstorage pods # - Example (cluster): 8Gi × 3 = 24Gi total VictoriaMetrics storage # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" @@ -130,7 +107,7 @@ victoria_configurations: # KAFKA CONFIGURATION # ============================================================================ # Apache Kafka is a distributed streaming platform for storing telemetry data. -# Used for iDRAC telemetry when 'kafka' is enabled in idrac_telemetry_collection_type. +# Used for iDRAC telemetry when 'kafka' is enabled in telemetry_collection_type. # Also used for LDMS telemetry when LDMS software is configured. 
# # NOTE: Kafka topics are auto-generated based on enabled features: diff --git a/utils/roles/external_victoria_connect_details/tasks/main.yml b/utils/roles/external_victoria_connect_details/tasks/main.yml index 260c8376fd..c7a19b3210 100644 --- a/utils/roles/external_victoria_connect_details/tasks/main.yml +++ b/utils/roles/external_victoria_connect_details/tasks/main.yml @@ -33,8 +33,8 @@ ansible.builtin.command: >- kubectl get svc {{ item }} -n {{ victoria_namespace }} -o name loop: - - vminsert - - vmselect + - "{{ victoria_vminsert_svc }}" + - "{{ victoria_vmselect_svc }}" register: victoria_cluster_svcs changed_when: false failed_when: false @@ -63,7 +63,7 @@ - name: Get Victoria pods status ansible.builtin.command: >- kubectl get pods -n {{ victoria_namespace }} - -l "app in (vminsert,vmselect,vmstorage,victoriametrics)" + -l "app.kubernetes.io/instance=victoria-cluster" -o wide register: victoria_pods_wide changed_when: false @@ -111,7 +111,7 @@ - name: Get vminsert service LoadBalancer IP ansible.builtin.command: >- - kubectl get svc vminsert -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vminsert_svc }} -n {{ victoria_namespace }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}' register: vminsert_lb_ip changed_when: false @@ -119,7 +119,7 @@ - name: Get vminsert service LoadBalancer hostname ansible.builtin.command: >- - kubectl get svc vminsert -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vminsert_svc }} -n {{ victoria_namespace }} -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' register: vminsert_lb_hostname changed_when: false @@ -127,7 +127,7 @@ - name: Get vminsert service external port ansible.builtin.command: >- - kubectl get svc vminsert -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vminsert_svc }} -n {{ victoria_namespace }} -o jsonpath='{.spec.ports[0].port}' register: vminsert_lb_port changed_when: false @@ -135,7 +135,7 @@ - name: Get vmselect service LoadBalancer IP ansible.builtin.command: >- - 
kubectl get svc vmselect -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vmselect_svc }} -n {{ victoria_namespace }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}' register: vmselect_lb_ip changed_when: false @@ -143,7 +143,7 @@ - name: Get vmselect service LoadBalancer hostname ansible.builtin.command: >- - kubectl get svc vmselect -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vmselect_svc }} -n {{ victoria_namespace }} -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' register: vmselect_lb_hostname changed_when: false @@ -151,7 +151,7 @@ - name: Get vmselect service external port ansible.builtin.command: >- - kubectl get svc vmselect -n {{ victoria_namespace }} + kubectl get svc {{ victoria_vmselect_svc }} -n {{ victoria_namespace }} -o jsonpath='{.spec.ports[0].port}' register: vmselect_lb_port changed_when: false @@ -185,7 +185,7 @@ ansible.builtin.set_fact: victoria_sfm_hosts_entry: >- {{ - 'echo ' ~ (vminsert_lb_ip.stdout | trim) ~ ' vminsert.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts' + 'echo ' ~ (vminsert_lb_ip.stdout | trim) ~ ' ' ~ victoria_vminsert_svc ~ '.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts' if (vminsert_lb_ip.stdout | trim | length) > 0 else '' }} @@ -194,7 +194,7 @@ ansible.builtin.set_fact: victoria_sfm_hosts_entry_vmselect: >- {{ - 'echo ' ~ (vmselect_lb_ip.stdout | trim) ~ ' vmselect.' ~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts' + 'echo ' ~ (vmselect_lb_ip.stdout | trim) ~ ' ' ~ victoria_vmselect_svc ~ '.' 
~ victoria_namespace ~ '.svc.cluster.local >> /etc/hosts' if (vmselect_lb_ip.stdout | trim | length) > 0 else '' }} @@ -202,11 +202,11 @@ - name: Set endpoint urls and SFM note strings ansible.builtin.set_fact: victoria_vminsert_write_url: >- - https://vminsert.{{ victoria_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write + {{ victoria_url_scheme }}://{{ victoria_vminsert_svc }}.{{ victoria_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write victoria_vmselect_query_url: >- - https://vmselect.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/prometheus/api/v1/query + {{ victoria_url_scheme }}://{{ victoria_vmselect_svc }}.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/prometheus/api/v1/query victoria_vmselect_ui_url: >- - https://vmselect.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/vmui + {{ victoria_url_scheme }}://{{ victoria_vmselect_svc }}.{{ victoria_namespace }}.svc.cluster.local:8481/select/0/vmui victoria_sfm_hosts_entry_vminsert_display: >- {{ victoria_sfm_hosts_entry @@ -234,7 +234,7 @@ namespace: "{{ victoria_namespace }}" deployment_mode: "{{ victoria_deployment_mode }}" pod_status: "{{ victoria_pods_wide.stdout }}" - base_url: "https://{{ vminsert_host }}:{{ vminsert_port }}" + base_url: "{{ victoria_url_scheme }}://{{ vminsert_host }}:{{ vminsert_port }}" endpoints: vminsert: host: "{{ vminsert_host }}" @@ -243,8 +243,8 @@ vmselect: host: "{{ vmselect_host }}" port: "{{ vmselect_port | int }}" - query_endpoint: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query" - ui_url: "https://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui" + query_endpoint: "{{ victoria_url_scheme }}://{{ vmselect_host }}:{{ vmselect_port }}/select/0/prometheus/api/v1/query" + ui_url: "{{ victoria_url_scheme }}://{{ vmselect_host }}:{{ vmselect_port }}/select/0/vmui" tls: ca_crt: "{{ victoria_tls_ca }}" notes: diff --git 
a/utils/roles/external_victoria_connect_details/vars/main.yml b/utils/roles/external_victoria_connect_details/vars/main.yml index f9a1fb72dd..38833d807a 100644 --- a/utils/roles/external_victoria_connect_details/vars/main.yml +++ b/utils/roles/external_victoria_connect_details/vars/main.yml @@ -17,6 +17,15 @@ victoria_namespace: "telemetry" victoria_output_file: "/opt/omnia/telemetry/external_victoria_connect_details.yml" victoria_tls_cert_dir: "/opt/omnia/telemetry/victoria-certs" +# Operator-managed service names (derived from VMCluster CR name "victoria-cluster") +victoria_vminsert_svc: "vminsert-victoria-cluster" +victoria_vmselect_svc: "vmselect-victoria-cluster" +victoria_vmstorage_svc: "vmstorage-victoria-cluster" + +# TLS configuration (must match victoria_cluster.tls_enabled in telemetry role) +victoria_tls_enabled: true +victoria_url_scheme: "{{ 'https' if victoria_tls_enabled else 'http' }}" + victoria_err_mode_not_supported: >- Victoria deployment mode detected: {{ victoria_deployment_mode }}. External integration is supported only for Victoria cluster mode (vminsert/vmselect/vmstorage). @@ -27,7 +36,7 @@ victoria_err_pods_not_running: "One or more Victoria pods are not in Running sta victoria_err_pods_not_ready: "One or more Victoria pods are not Ready." victoria_err_lb_missing: >- - Failed to fetch Victoria LoadBalancer IP(s). Ensure services 'vminsert' and 'vmselect' + Failed to fetch Victoria LoadBalancer IP(s). Ensure services '{{ victoria_vminsert_svc }}' and '{{ victoria_vmselect_svc }}' exist in namespace '{{ victoria_namespace }}' and have external IPs assigned. 
victoria_preflight_err_ha_config_missing: >- diff --git a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml index be275fd870..ae2a86d511 100644 --- a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml +++ b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml @@ -59,6 +59,8 @@ openchami_containers: - cloud-init-server - haproxy - coresmd + - coresmd-coredhcp + - coresmd-coredns openchami_volumes: - haproxy-certs @@ -78,6 +80,7 @@ openchami_secrets: - bss_postgres_password tcp_ports: + - 53 - 9000 - 9001 - 5000 @@ -88,6 +91,7 @@ tcp_ports: - 8443 udp_ports: + - 53 - 69 - 67 - 68 From 74e331fc3a5a6dc598fba9475cfa101309517842 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Mon, 27 Apr 2026 15:25:48 +0530 Subject: [PATCH 05/17] updated kubectl to 1.35.1 in telemetry_pod_cleanup Signed-off-by: pullan1 --- .../templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 index 3709759f78..acd8d35029 100644 --- a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 @@ -25,7 +25,7 @@ spec: tolerationSeconds: 30 # Evict after 30s if node is unreachable containers: - name: kubectl-cleanup - image: docker.io/alpine/kubectl:1.34.1 + image: docker.io/alpine/kubectl:1.35.1 command: - /bin/sh - -c From 007e80be53deab8cc6767d9f5a963a8597a6458c Mon Sep 17 00:00:00 2001 From: pullan1 Date: Tue, 28 Apr 2026 11:58:33 +0530 Subject: [PATCH 06/17] localrepo support for k8s upgrade Signed-off-by: pullan1 --- .../library/module_utils/upgrade/__init__.py | 22 + .../upgrade/upgrade_hop_calculator_lib.py | 456 ++++++++++++++++++ 
.../library/modules/calculate_upgrade_hops.py | 75 +++ .../library/modules/create_upgrade_staging.py | 346 +++++++++++++ .../modules/update_component_json_repos.py | 74 +++ .../library/modules/update_software_config.py | 70 +++ .../tasks/backup_configs.yml | 58 +++ .../tasks/calculate_hop_chain.yml | 24 + .../tasks/display_summary.yml | 24 + .../tasks/load_upgrade_manifest.yml | 57 +++ .../manage_upgrade_inputs/tasks/main.yml | 49 ++ .../tasks/update_component_json_repos.yml | 47 ++ .../tasks/update_software_config.yml | 19 + .../tasks/validate_current_deployment.yml | 82 ++++ .../tasks/validate_hop_chains.yml | 69 +++ .../roles/manage_upgrade_inputs/vars/main.yml | 6 + upgrade/roles/prep_local_repo/README.md | 90 ++++ .../prep_local_repo/tasks/create_staging.yml | 79 +++ .../tasks/load_upgrade_manifest.yml | 87 ++++ upgrade/roles/prep_local_repo/tasks/main.yml | 39 ++ .../prep_local_repo/tasks/sync_local_repo.yml | 92 ++++ .../tasks/validate_prerequisites.yml | 89 ++++ upgrade/roles/prep_local_repo/vars/main.yml | 8 + upgrade/roles/upgrade_oim/tasks/main.yml | 32 +- upgrade/upgrade_manifest.yml | 87 ++++ upgrade/upgrade_oim.yml | 3 + 26 files changed, 2083 insertions(+), 1 deletion(-) create mode 100644 common/library/module_utils/upgrade/__init__.py create mode 100644 common/library/module_utils/upgrade/upgrade_hop_calculator_lib.py create mode 100644 common/library/modules/calculate_upgrade_hops.py create mode 100644 common/library/modules/create_upgrade_staging.py create mode 100644 common/library/modules/update_component_json_repos.py create mode 100644 common/library/modules/update_software_config.py create mode 100644 upgrade/roles/manage_upgrade_inputs/tasks/backup_configs.yml create mode 100644 upgrade/roles/manage_upgrade_inputs/tasks/calculate_hop_chain.yml create mode 100644 upgrade/roles/manage_upgrade_inputs/tasks/display_summary.yml create mode 100644 upgrade/roles/manage_upgrade_inputs/tasks/load_upgrade_manifest.yml create mode 100644 
upgrade/roles/manage_upgrade_inputs/tasks/main.yml create mode 100644 upgrade/roles/manage_upgrade_inputs/tasks/update_component_json_repos.yml create mode 100644 upgrade/roles/manage_upgrade_inputs/tasks/update_software_config.yml create mode 100644 upgrade/roles/manage_upgrade_inputs/tasks/validate_current_deployment.yml create mode 100644 upgrade/roles/manage_upgrade_inputs/tasks/validate_hop_chains.yml create mode 100644 upgrade/roles/manage_upgrade_inputs/vars/main.yml create mode 100644 upgrade/roles/prep_local_repo/README.md create mode 100644 upgrade/roles/prep_local_repo/tasks/create_staging.yml create mode 100644 upgrade/roles/prep_local_repo/tasks/load_upgrade_manifest.yml create mode 100644 upgrade/roles/prep_local_repo/tasks/main.yml create mode 100644 upgrade/roles/prep_local_repo/tasks/sync_local_repo.yml create mode 100644 upgrade/roles/prep_local_repo/tasks/validate_prerequisites.yml create mode 100644 upgrade/roles/prep_local_repo/vars/main.yml create mode 100644 upgrade/upgrade_manifest.yml diff --git a/common/library/module_utils/upgrade/__init__.py b/common/library/module_utils/upgrade/__init__.py new file mode 100644 index 0000000000..ea6e83ceec --- /dev/null +++ b/common/library/module_utils/upgrade/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Upgrade Module Utilities + +This package contains utilities for managing upgrade operations: +- Hop chain calculations +- Software configuration updates +- Component JSON repository updates +""" diff --git a/common/library/module_utils/upgrade/upgrade_hop_calculator_lib.py b/common/library/module_utils/upgrade/upgrade_hop_calculator_lib.py new file mode 100644 index 0000000000..e7b77fea58 --- /dev/null +++ b/common/library/module_utils/upgrade/upgrade_hop_calculator_lib.py @@ -0,0 +1,456 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Upgrade Input Management Module + +This module provides functionality for managing upgrade input configurations: +- Calculate upgrade hop chains from upgrade_manifest.yml +- Update software_config.json with target versions +- Update component JSON files with version-specific repo names + +Production Design (upgrade_manifest.yml): + omnia_upgrade_paths (top-level): defines the Omnia version upgrade sequence. + Each entry specifies the software versions for the NEXT Omnia version. + + components: defines each software component and its valid version sequence. + supported_versions: ordered list used for automatic intermediate hop + generation when a K8s version gap is detected. 
+ +Automatic intermediate hop detection: + When the target K8s version skips one or more entries in supported_versions, + the system auto-generates one K8s hop per intermediate version. + + Example: Omnia 2.1.0.0 (K8s 1.34.1) -> Omnia 2.3.0.0 (K8s 1.37.1) + Omnia path : 2.1.0.0 -> 2.2.0.0 -> 2.3.0.0 + K8s hops : 1.34.1 -> 1.35.1 (Omnia 2.1->2.2, direct) + 1.35.1 -> 1.36.1 (auto-generated, within 2.2->2.3) + 1.36.1 -> 1.37.1 (Omnia 2.2->2.3, final) + Total hops : 3 +""" + +import json +import os +from typing import Dict, List, Any + + +def find_hop_chain( + component_name: str, + component_config: Dict[str, Any], + omnia_upgrade_paths: Dict[str, Any], + current_omnia_version: str, + target_omnia_version: str, + current_software_version: str +) -> List[Dict[str, str]]: + """ + Calculate hop chain for a component from current to target Omnia version. + + Traverses the top-level omnia_upgrade_paths and, for each Omnia hop, + checks whether the K8s version transition requires intermediate steps + based on the component's supported_versions list. 
+ + Args: + component_name: Component name (e.g., 'service_k8s') + component_config: Component config from upgrade_manifest.yml + omnia_upgrade_paths: Top-level omnia_upgrade_paths from upgrade_manifest.yml + current_omnia_version: Current Omnia version (from oim_metadata.yml) + target_omnia_version: Target Omnia version (from oim_metadata.yml) + current_software_version: Current software version (from software_config.json) + + Returns: + List of hop dicts, each containing: + hop_id, software, from_omnia_version, to_omnia_version, + from_version, to_version, json_file, omnia_version, auto_generated + """ + supported_versions = component_config.get('supported_versions', []) + + hops = [] + current_omnia = current_omnia_version + current_sw_ver = current_software_version + hop_id = 1 + + while current_omnia != target_omnia_version: + if current_omnia not in omnia_upgrade_paths: + break + if hop_id > 20: + break + + path_info = omnia_upgrade_paths[current_omnia] + next_omnia = path_info.get('next_omnia_version') + software_versions = path_info.get('software_versions', {}) + target_sw_ver = software_versions.get(component_name) + + if not next_omnia or not target_sw_ver: + break + + # Detect whether intermediate K8s hops are needed + if (supported_versions + and current_sw_ver in supported_versions + and target_sw_ver in supported_versions): + + current_idx = supported_versions.index(current_sw_ver) + target_idx = supported_versions.index(target_sw_ver) + + if target_idx > current_idx + 1: + # Gap detected: auto-generate one hop per intermediate version + for i in range(current_idx, target_idx): + from_ver = supported_versions[i] + to_ver = supported_versions[i + 1] + is_final = (i == target_idx - 1) + + hops.append({ + 'hop_id': f"hop_{hop_id}", + 'software': component_name, + 'from_omnia_version': current_omnia, + 'to_omnia_version': next_omnia if is_final else current_omnia, + 'from_version': from_ver, + 'to_version': to_ver, + 'json_file': 
f"{component_name}_v{to_ver}.json", + 'omnia_version': next_omnia, + 'auto_generated': not is_final + }) + hop_id += 1 + else: + # Direct K8s hop — no gap in supported_versions + hops.append({ + 'hop_id': f"hop_{hop_id}", + 'software': component_name, + 'from_omnia_version': current_omnia, + 'to_omnia_version': next_omnia, + 'from_version': current_sw_ver, + 'to_version': target_sw_ver, + 'json_file': f"{component_name}_v{target_sw_ver}.json", + 'omnia_version': next_omnia, + 'auto_generated': False + }) + hop_id += 1 + else: + # No supported_versions defined or version not in list: direct hop + hops.append({ + 'hop_id': f"hop_{hop_id}", + 'software': component_name, + 'from_omnia_version': current_omnia, + 'to_omnia_version': next_omnia, + 'from_version': current_sw_ver, + 'to_version': target_sw_ver, + 'json_file': f"{component_name}_v{target_sw_ver}.json", + 'omnia_version': next_omnia, + 'auto_generated': False + }) + hop_id += 1 + + current_omnia = next_omnia + current_sw_ver = target_sw_ver + + return hops + + +def calculate_all_hop_chains( + upgrade_config: Dict[str, Any], + current_software_config: Dict[str, Any], + current_omnia_version: str, + target_omnia_version: str +) -> Dict[str, Any]: + """ + Calculate hop chains for all enabled components. + + Reads the top-level omnia_upgrade_paths from upgrade_config and, for each + enabled component, calls find_hop_chain which automatically generates + intermediate K8s hops when version gaps are detected in supported_versions. 
+ + Args: + upgrade_config: Full upgrade configuration from upgrade_manifest.yml + current_software_config: Current software_config.json content + current_omnia_version: Current Omnia version from oim_metadata.yml + target_omnia_version: Target Omnia version from oim_metadata.yml + + Returns: + Dictionary containing: + - hop_chains: List of all hop dictionaries + - total_hops: Total number of hops + - upgrade_mode: 'multi_hop' if total_hops > 1, else 'single_hop' + - warnings: List of warning messages + """ + all_hop_chains = [] + warnings = [] + + components = upgrade_config.get('components', {}) + omnia_upgrade_paths = upgrade_config.get('omnia_upgrade_paths', {}) + + if not omnia_upgrade_paths: + warnings.append("No omnia_upgrade_paths defined in upgrade_manifest.yml") + return { + 'hop_chains': [], + 'total_hops': 0, + 'upgrade_mode': 'single_hop', + 'warnings': warnings + } + + # Build current software version map from software_config.json + current_software_versions = {} + for sw in current_software_config.get('softwares', []): + if sw.get('name') and sw.get('version'): + current_software_versions[sw['name']] = sw['version'] + + for component_name, component_config in components.items(): + if not component_config.get('enabled', False): + continue + + current_software_version = current_software_versions.get(component_name) + if not current_software_version: + warnings.append( + f"Current version not found for {component_name} in software_config.json" + ) + continue + + hops = find_hop_chain( + component_name, + component_config, + omnia_upgrade_paths, + current_omnia_version, + target_omnia_version, + current_software_version + ) + + all_hop_chains.extend(hops) + + return { + 'hop_chains': all_hop_chains, + 'total_hops': len(all_hop_chains), + 'upgrade_mode': 'multi_hop' if len(all_hop_chains) > 1 else 'single_hop', + 'warnings': warnings + } + + +def update_software_config( + input_file: str, + hop_chains: List[Dict[str, str]], + upgrade_mode: str +) -> 
Dict[str, Any]: + """ + Update software_config.json with target versions from hop chains. + + Args: + input_file: Path to software_config.json + hop_chains: List of hop dictionaries from calculate_all_hop_chains + upgrade_mode: 'multi_hop' or 'single_hop' + + Returns: + Dictionary containing: + - updated: List of updated software entries + - mode: Upgrade mode + - total_hops: Total number of hops + """ + # Load current config + with open(input_file) as f: + config = json.load(f) + + # Find final version for each software (last hop in chain) + updated = [] + software_final_versions = {} + + if upgrade_mode == 'multi_hop': + # Group hops by software and find final version + software_hops = {} + for hop in hop_chains: + software = hop['software'] + if software not in software_hops: + software_hops[software] = [] + software_hops[software].append(hop) + + # Find final version for each software + # Use integer sort on hop number to avoid string comparison issues (e.g. "hop_10" < "hop_2") + for software, hops in software_hops.items(): + final_hop = max(hops, key=lambda h: int(h['hop_id'].split('_')[1])) + software_final_versions[software] = final_hop['to_version'] + else: + # Single hop - use the target version directly + for hop in hop_chains: + software_final_versions[hop['software']] = hop['to_version'] + + # Update versions to final targets + for sw in config.get('softwares', []): + sw_name = sw.get('name') + if sw_name in software_final_versions: + old_version = sw.get('version', 'none') + new_version = software_final_versions[sw_name] + sw['version'] = new_version + updated.append({ + 'name': sw_name, + 'from': old_version, + 'to': new_version + }) + + # Write updated config + with open(input_file, 'w') as f: + json.dump(config, f, indent=4) + + # Output result + result = { + 'updated': updated, + 'mode': upgrade_mode, + 'total_hops': len(hop_chains) + } + + return result + + +def update_component_json_repos( + input_dir: str, + calculated_hop_chains: List[Dict[str, 
str]], + architectures: List[str], + versioned_repo_components: Dict[str, str] +) -> Dict[str, Any]: + """ + Update component JSON files with version-specific repo names. + + Args: + input_dir: Path to input project directory + calculated_hop_chains: List of hop dictionaries + architectures: List of architectures (e.g., ['x86_64', 'aarch64']) + versioned_repo_components: Mapping of component to base repo name + (e.g., {'slurm_custom': 'slurm_custom'}) + + Returns: + Dictionary containing: + - success: Boolean indicating overall success + - updated_files: List of updated file paths + - messages: List of status messages + """ + messages = [] + updated_files = [] + success = True + + print("=== Updating Component JSON Files with Version-Specific repo_names ===") + + # Process each calculated hop + for hop in calculated_hop_chains: + component_name = hop['software'] + target_version = hop['to_version'] + json_filename = hop['json_file'] + + # Skip if component doesn't support versioned repositories + if component_name not in versioned_repo_components: + msg = f"Skipping {component_name} - no versioned repo support" + messages.append(msg) + print(msg) + continue + + repo_base_name = versioned_repo_components[component_name] + versioned_repo_name = f"{repo_base_name}-v{target_version}" + + msg = f"\nProcessing: {component_name} v{target_version}" + messages.append(msg) + print(msg) + msg = f" JSON file: {json_filename}" + messages.append(msg) + print(msg) + msg = f" Repo name: {repo_base_name} -> {versioned_repo_name}" + messages.append(msg) + print(msg) + + # Process each architecture + for arch in architectures: + json_path = os.path.join(input_dir, 'config', arch, 'rhel', '10.0', json_filename) + + if not os.path.exists(json_path): + msg = f" Warning: JSON file not found: {json_path}" + messages.append(msg) + print(msg) + continue + + # Read JSON file + try: + with open(json_path, 'r') as f: + json_data = json.load(f) + except Exception as e: + msg = f" Error reading 
{json_path}: {e}" + messages.append(msg) + print(msg) + success = False + continue + + # Update repo_name entries + updated = False + for section_name, section_data in json_data.items(): + if isinstance(section_data, dict) and 'cluster' in section_data: + for package in section_data['cluster']: + if package.get('repo_name') == repo_base_name: + package['repo_name'] = versioned_repo_name + updated = True + msg = f" Updated: {package.get('package')} -> {versioned_repo_name}" + messages.append(msg) + print(msg) + + # Write updated JSON file + if updated: + try: + with open(json_path, 'w') as f: + json.dump(json_data, f, indent=4) + msg = f" Success: Updated {json_path}" + messages.append(msg) + print(msg) + updated_files.append(json_path) + except Exception as e: + msg = f" Error writing {json_path}: {e}" + messages.append(msg) + print(msg) + success = False + else: + msg = f" No updates needed for {json_path}" + messages.append(msg) + print(msg) + + msg = "\n=== JSON repo_name Update Complete ===" + messages.append(msg) + print(msg) + + return { + 'success': success, + 'updated_files': updated_files, + 'messages': messages + } + + +def main(): + """ + Main function for command-line usage. + + This allows the module to be used directly from Ansible playbooks + via the ansible.builtin.shell or ansible.builtin.script modules. 
+ """ + import sys + + # Read input from stdin (expected JSON format) + input_data = json.loads(sys.stdin.read()) + + upgrade_config = input_data.get('upgrade_config', {}) + current_software_config = input_data.get('current_software_config', {}) + current_omnia_version = input_data.get('current_omnia_version') + target_omnia_version = input_data.get('target_omnia_version') + + # Calculate hop chains + result = calculate_all_hop_chains( + upgrade_config, + current_software_config, + current_omnia_version, + target_omnia_version + ) + + # Output result as JSON + print(json.dumps(result)) + + +if __name__ == '__main__': + main() diff --git a/common/library/modules/calculate_upgrade_hops.py b/common/library/modules/calculate_upgrade_hops.py new file mode 100644 index 0000000000..33eb09924d --- /dev/null +++ b/common/library/modules/calculate_upgrade_hops.py @@ -0,0 +1,75 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=import-error,no-name-in-module +#!/usr/bin/python + +"""Ansible module to calculate upgrade hop chains from upgrade_manifest.yml for multi-hop upgrades.""" + +from ansible.module_utils.basic import AnsibleModule +from ansible.module_utils.upgrade import upgrade_hop_calculator_lib + + +def run_module(): + """ + Run the Ansible module. + + Calculates upgrade hop chains from upgrade_manifest.yml and returns + the calculated hop chains for all enabled components. 
+ """ + module_args = dict( + upgrade_config=dict(type="dict", required=True), + current_software_config=dict(type="dict", required=True), + current_omnia_version=dict(type="str", required=True), + target_omnia_version=dict(type="str", required=True), + ) + + result = dict( + changed=False, + hop_chains=[], + total_hops=0, + upgrade_mode='single_hop', + warnings=[] + ) + + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) + + upgrade_config = module.params["upgrade_config"] + current_software_config = module.params["current_software_config"] + current_omnia_version = module.params["current_omnia_version"] + target_omnia_version = module.params["target_omnia_version"] + + # Calculate hop chains using the library + hop_result = upgrade_hop_calculator_lib.calculate_all_hop_chains( + upgrade_config, + current_software_config, + current_omnia_version, + target_omnia_version + ) + + result["hop_chains"] = hop_result["hop_chains"] + result["total_hops"] = hop_result["total_hops"] + result["upgrade_mode"] = hop_result["upgrade_mode"] + result["warnings"] = hop_result["warnings"] + + module.exit_json(**result) + + +def main(): + """Main entry point.""" + run_module() + + +if __name__ == "__main__": + main() diff --git a/common/library/modules/create_upgrade_staging.py b/common/library/modules/create_upgrade_staging.py new file mode 100644 index 0000000000..2ec80b98d4 --- /dev/null +++ b/common/library/modules/create_upgrade_staging.py @@ -0,0 +1,346 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# Components always carried into the staged software_config.json, even when they
# are not part of the upgrade (upgrade components may depend on them).
_ESSENTIAL_COMPONENTS = frozenset(
    {'default_packages', 'additional_packages', 'admin_debug_packages'}
)


def _collect_hop_versions(calculated_hop_chains):
    """Map each upgraded component to its target versions, in hop order, without duplicates."""
    versions_by_component = {}
    for hop in calculated_hop_chains or []:
        component_name = hop.get('software')
        to_version = hop.get('to_version')
        if component_name and to_version:
            versions = versions_by_component.setdefault(component_name, [])
            if to_version not in versions:
                versions.append(to_version)
    return versions_by_component


def _collect_omnia_versions(calculated_hop_chains, target_omnia_version):
    """Return the Omnia versions whose repos must be staged, in hop order, without duplicates."""
    if not calculated_hop_chains:
        # No hops calculated: only the target version's repositories apply.
        return [target_omnia_version]

    omnia_versions = []
    for hop in calculated_hop_chains:
        # NOTE(review): the role's display tasks read hop.to_omnia_version, so that
        # key is accepted when 'omnia_version' is absent - confirm against the
        # hop calculator's output schema.
        hop_omnia_version = hop.get('omnia_version') or hop.get('to_omnia_version')
        if hop_omnia_version and hop_omnia_version not in omnia_versions:
            omnia_versions.append(hop_omnia_version)
            print(f"Adding repositories for Omnia version: {hop_omnia_version}")
    return omnia_versions


def _merge_repo_entries(entries, merged, seen, label):
    """Append repo entries whose ``name`` has not been seen yet to ``merged`` (first one wins)."""
    for entry in entries or []:
        if not isinstance(entry, dict):
            print(f"Ignoring malformed repo entry (expected a mapping): {entry!r}")
            continue
        repo_name = entry.get('name', '')
        if repo_name not in seen:
            seen.add(repo_name)
            merged.append(entry)
            print(f"{label}: {repo_name}")


def create_staging(
    staging_dir,
    input_dir,
    repos_file,
    enabled_components,
    current_sw_config,
    architectures,
    target_omnia_version,
    calculated_hop_chains
):
    """
    Create staging directory with modified configs for upgrade.

    The staging directory receives:
      1. ``software_config.json`` reduced to the components being upgraded (one
         entry per hop target version) plus the essential shared components.
      2. ``local_repo_config.yml`` holding only the upgrade repos from
         ``repos_file`` (version-specific ``omnia_versions`` layout, or the
         legacy flat layout).
      3. The vault key / credentials files from ``input_dir`` when present.
      4. Every ``*.json`` file from ``config/<arch>/<os_type>/<os_version>``
         that is not already staged.

    Args:
        staging_dir: Path to staging directory (created if it does not exist)
        input_dir: Path to input project directory
        repos_file: Path to repos.yml file
        enabled_components: List of enabled components (mappings with a ``key``)
        current_sw_config: Current software_config.json content (not mutated)
        architectures: List of architectures to process
        target_omnia_version: Target Omnia version
        calculated_hop_chains: List of calculated hop chains

    Returns:
        Dictionary containing staging summary with the keys ``staging_dir``,
        ``software_config_updated``, ``repos_merged``, ``json_files_copied``,
        ``vault_key_copied``, ``vault_credentials_copied`` and
        ``enabled_components``.

    Raises:
        OSError: if a staging file or directory cannot be read or written.
    """
    # NOTE(review): progress is reported with print(); when this runs inside an
    # Ansible module that text lands on stdout ahead of the JSON result -
    # confirm this is acceptable or route it through the module result instead.

    # Nothing below can be written unless the staging root exists.
    os.makedirs(staging_dir, exist_ok=True)

    # --- 1. Create software_config.json with all hop versions for multi-hop support ---
    # Shallow copy: only the top-level 'softwares' key is rebound, so the
    # caller's dict is left untouched.
    sw_config = current_sw_config.copy()

    if calculated_hop_chains:
        print(f"Multi-hop upgrade detected with {len(calculated_hop_chains)} hops")
        kept_label = "Kept essential component"
    else:
        # No hop chains were supplied: versions are whatever software_config.json
        # already carries, and only the essential components are staged.
        print("Single-hop upgrade: using target version from software_config.json")
        kept_label = "Kept component"

    # Hop order is preserved so the staged file is identical from run to run.
    versions_by_component = _collect_hop_versions(calculated_hop_chains)

    updated_softwares = []
    for sw in sw_config.get('softwares') or []:
        component_name = sw.get('name')
        if component_name in versions_by_component:
            # One entry per hop version so local_repo downloads the packages
            # of every intermediate version as well.
            for version in versions_by_component[component_name]:
                new_entry = sw.copy()
                new_entry['version'] = version
                updated_softwares.append(new_entry)
                print(f"Added {component_name} version {version} to software_config.json")
        elif component_name in _ESSENTIAL_COMPONENTS:
            updated_softwares.append(sw)
            print(f"{kept_label}: {component_name}")
        else:
            print(f"Skipped non-essential component: {component_name}")

    sw_config['softwares'] = updated_softwares

    with open(os.path.join(staging_dir, 'software_config.json'), 'w', encoding='utf-8') as f:
        json.dump(sw_config, f, indent=4)

    # --- 2. Create local_repo_config.yml with only repos from repos.yml ---
    # Base repos (docker-ce, epel, doca, cuda) are already synced from initial
    # installation and are not included to avoid unnecessary re-syncing.
    merged_x86 = []
    merged_aarch64 = []
    seen_x86 = set()
    seen_aarch64 = set()

    if os.path.exists(repos_file):
        with open(repos_file, encoding='utf-8') as f:
            repos = yaml.safe_load(f) or {}

        print(f"Target Omnia version: {target_omnia_version}")

        if 'omnia_versions' in repos:
            print("Using Omnia version-specific repository structure (upgrade-specific repos only)")
            versioned_repos = repos['omnia_versions'] or {}

            # Versions are walked in hop order: when two versions define a repo
            # with the same name, the earliest hop's entry is the one kept.
            for omnia_version in _collect_omnia_versions(calculated_hop_chains, target_omnia_version):
                # 'or {}' also covers a version key that is present but empty.
                target_repos = versioned_repos.get(omnia_version) or {}

                for repo_section_name, repo_entries in target_repos.items():
                    if not isinstance(repo_entries, list):
                        continue
                    # The architecture is inferred from the section name.
                    section_key = str(repo_section_name).lower()
                    if 'x86_64' in section_key:
                        _merge_repo_entries(
                            repo_entries, merged_x86, seen_x86,
                            f"Added x86_64 repo from {repo_section_name} for Omnia {omnia_version}"
                        )
                    elif 'aarch64' in section_key:
                        _merge_repo_entries(
                            repo_entries, merged_aarch64, seen_aarch64,
                            f"Added aarch64 repo from {repo_section_name} for Omnia {omnia_version}"
                        )
        else:
            # Fallback to legacy flat structure for backward compatibility
            print("Using legacy flat repository structure (upgrade-specific repos only)")
            _merge_repo_entries(
                repos.get('omnia_repo_url_rhel_x86_64'), merged_x86, seen_x86,
                "Added x86_64 repo (legacy)"
            )
            _merge_repo_entries(
                repos.get('omnia_repo_url_rhel_aarch64'), merged_aarch64, seen_aarch64,
                "Added aarch64 repo (legacy)"
            )

    local_repo_config = {
        'omnia_repo_url_rhel_x86_64': merged_x86,
        'omnia_repo_url_rhel_aarch64': merged_aarch64,
    }

    with open(os.path.join(staging_dir, 'local_repo_config.yml'), 'w', encoding='utf-8') as f:
        yaml.dump(local_repo_config, f, default_flow_style=False, sort_keys=False)

    # --- 2.5. Copy vault credentials files if they exist ---
    for vault_filename, label in (
        ('.omnia_config_credentials_key', 'key'),
        ('omnia_config_credentials.yml', 'file'),
    ):
        vault_src = os.path.join(input_dir, vault_filename)
        if os.path.exists(vault_src):
            shutil.copy2(vault_src, os.path.join(staging_dir, vault_filename))
            print(f"Copied vault credentials {label}: {vault_filename}")
        else:
            print(f"No vault credentials {label} found in input directory")

    # --- 3. Copy JSON files from input directory for enabled upgrades ---
    os_type = sw_config.get('cluster_os_type', 'rhel')
    os_version = sw_config.get('cluster_os_version', '10.0')

    json_files_copied = []

    for arch in architectures:
        src_config_dir = os.path.join(input_dir, 'config', arch, os_type, os_version)
        dst_config_dir = os.path.join(staging_dir, 'config', arch, os_type, os_version)

        if not os.path.isdir(src_config_dir):
            print(f"Skipping architecture {arch}: source config dir not found: {src_config_dir}")
            continue

        os.makedirs(dst_config_dir, exist_ok=True)

        # Copy ALL JSON files (including default_packages, admin_debug_packages,
        # etc.) so they are available for the local_repo sync. Sorted so the
        # reported list is stable across runs.
        for json_file in sorted(os.listdir(src_config_dir)):
            if not json_file.endswith('.json'):
                continue
            src_json = os.path.join(src_config_dir, json_file)
            dst_json = os.path.join(dst_config_dir, json_file)

            # Files already present in staging are deliberately left as they are.
            if os.path.isfile(src_json) and not os.path.exists(dst_json):
                shutil.copy2(src_json, dst_json)
                json_files_copied.append(f"{arch}/{os_type}/{os_version}/{json_file}")
                print(f"Copied: {json_file} ({arch})")

    # Reported as "present in staging", which also covers files staged earlier.
    vault_key_copied = os.path.exists(os.path.join(staging_dir, '.omnia_config_credentials_key'))
    vault_creds_copied = os.path.exists(os.path.join(staging_dir, 'omnia_config_credentials.yml'))

    return {
        'staging_dir': staging_dir,
        'software_config_updated': True,
        'repos_merged': len(merged_x86) + len(merged_aarch64),
        'json_files_copied': json_files_copied,
        'vault_key_copied': vault_key_copied,
        'vault_credentials_copied': vault_creds_copied,
        'enabled_components': [c.get('key') for c in enabled_components or []]
    }
run_module(): + """ + Run the Ansible module. + + Creates a staging directory with modified configs for upgrade. + """ + module_args = dict( + staging_dir=dict(type="str", required=True), + input_dir=dict(type="str", required=True), + repos_file=dict(type="str", required=True), + enabled_components=dict(type="list", required=True), + current_software_config=dict(type="dict", required=True), + architectures=dict(type="list", required=True), + target_omnia_version=dict(type="str", required=True), + calculated_hop_chains=dict(type="list", required=False, default=[]), + ) + + result = dict( + changed=False, + staging_dir='', + software_config_updated=False, + repos_merged=0, + json_files_copied=[], + vault_key_copied=False, + vault_credentials_copied=False, + enabled_components=[] + ) + + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) + + staging_dir = module.params["staging_dir"] + input_dir = module.params["input_dir"] + repos_file = module.params["repos_file"] + enabled_components = module.params["enabled_components"] + current_software_config = module.params["current_software_config"] + architectures = module.params["architectures"] + target_omnia_version = module.params["target_omnia_version"] + calculated_hop_chains = module.params["calculated_hop_chains"] + + # Create staging using the library function + staging_result = create_staging( + staging_dir, + input_dir, + repos_file, + enabled_components, + current_software_config, + architectures, + target_omnia_version, + calculated_hop_chains + ) + + result["changed"] = True + result["staging_dir"] = staging_result["staging_dir"] + result["software_config_updated"] = staging_result["software_config_updated"] + result["repos_merged"] = staging_result["repos_merged"] + result["json_files_copied"] = staging_result["json_files_copied"] + result["vault_key_copied"] = staging_result["vault_key_copied"] + result["vault_credentials_copied"] = staging_result["vault_credentials_copied"] + 
def run_module():
    """
    Run the Ansible module.

    Updates component JSON files with version-specific repo names for components
    that support versioned repositories.

    Module arguments:
        input_dir: Path to the input project directory.
        calculated_hop_chains: List of calculated hop chains.
        architectures: List of architectures to process.
        versioned_repo_components: Mapping passed through to the library as
            the set of components that use versioned repositories.

    Result keys (via ``exit_json``):
        changed: True when at least one JSON file was rewritten.
        success, updated_files, messages: taken from the library result.

    In check mode nothing is touched: the library routine writes the updated
    JSON back to disk, so it is not called, and the default (unchanged) result
    is returned with an explanatory message.
    """
    module_args = dict(
        input_dir=dict(type="str", required=True),
        calculated_hop_chains=dict(type="list", required=True),
        architectures=dict(type="list", required=True),
        versioned_repo_components=dict(type="dict", required=True),
    )

    result = dict(
        changed=False,
        success=True,
        updated_files=[],
        messages=[]
    )

    module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)

    # supports_check_mode=True is a promise not to modify the system in check
    # mode. exit_json() terminates the module, so nothing below runs.
    if module.check_mode:
        result["messages"].append(
            "Check mode: component JSON files were not inspected or modified"
        )
        module.exit_json(**result)

    input_dir = module.params["input_dir"]
    calculated_hop_chains = module.params["calculated_hop_chains"]
    architectures = module.params["architectures"]
    versioned_repo_components = module.params["versioned_repo_components"]

    # Update component JSON files using the library
    update_result = upgrade_hop_calculator_lib.update_component_json_repos(
        input_dir,
        calculated_hop_chains,
        architectures,
        versioned_repo_components
    )

    result["changed"] = len(update_result["updated_files"]) > 0
    # 'success' is reported, not enforced: the calling task decides how to react.
    result["success"] = update_result["success"]
    result["updated_files"] = update_result["updated_files"]
    result["messages"] = update_result["messages"]

    module.exit_json(**result)


def main():
    """Main entry point."""
    run_module()


if __name__ == "__main__":
    main()
def run_module():
    """
    Entry routine of the update_software_config Ansible module.

    Takes the path of software_config.json (``input_file``), the calculated
    ``hop_chains`` and the ``upgrade_mode``, hands them to
    ``upgrade_hop_calculator_lib.update_software_config`` and reports that
    routine's ``updated``, ``mode`` and ``total_hops`` values back to Ansible.
    ``changed`` is true exactly when the ``updated`` list is non-empty.
    """
    argument_spec = {
        "input_file": {"type": "str", "required": True},
        "hop_chains": {"type": "list", "required": True},
        "upgrade_mode": {"type": "str", "required": True},
    }

    module = AnsibleModule(argument_spec=argument_spec, supports_check_mode=True)
    params = module.params

    # All of the actual work is delegated to the shared upgrade library.
    outcome = upgrade_hop_calculator_lib.update_software_config(
        params["input_file"],
        params["hop_chains"],
        params["upgrade_mode"],
    )

    updated_entries = outcome["updated"]
    module.exit_json(
        changed=len(updated_entries) > 0,
        updated=updated_entries,
        mode=outcome["mode"],
        total_hops=outcome["total_hops"],
    )


def main():
    """Main entry point."""
    run_module()


if __name__ == "__main__":
    main()
+--- + +- name: "backup — Set backup directory" + ansible.builtin.set_fact: + upgrade_backup_dir: "/opt/omnia/.data/upgrade_backup/{{ upgrade_id }}" + +- name: "backup — Create backup directory" + ansible.builtin.file: + path: "{{ upgrade_backup_dir }}" + state: directory + mode: '0755' + +- name: "backup — Backup software_config.json" + ansible.builtin.copy: + src: "{{ input_project_dir }}/software_config.json" + dest: "{{ upgrade_backup_dir }}/software_config.json" + remote_src: true + mode: '0644' + +- name: "backup — Backup local_repo_config.yml" + ansible.builtin.copy: + src: "{{ input_project_dir }}/local_repo_config.yml" + dest: "{{ upgrade_backup_dir }}/local_repo_config.yml" + remote_src: true + mode: '0644' + +- name: "backup — Backup upgrade_manifest.yml" + ansible.builtin.copy: + src: "{{ role_path }}/../../upgrade_manifest.yml" + dest: "{{ upgrade_backup_dir }}/upgrade_manifest.yml" + mode: '0644' + +- name: "backup — Create backup manifest" + ansible.builtin.copy: + content: | + # Upgrade Backup Manifest + upgrade_id: {{ upgrade_id }} + timestamp: {{ ansible_date_time.iso8601 }} + source_omnia_version: {{ upgrade_source_version }} + target_omnia_version: {{ upgrade_target_version }} + + enabled_components: + {% for component in enabled_components %} + - name: {{ component.key }} + {% endfor %} + + files_backed_up: + - software_config.json + - local_repo_config.yml + - upgrade_manifest.yml + dest: "{{ upgrade_backup_dir }}/manifest.yml" + mode: '0644' + +- name: "backup — Display backup info" + ansible.builtin.debug: + msg: + - "✓ Backup created: {{ upgrade_backup_dir }}" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/calculate_hop_chain.yml b/upgrade/roles/manage_upgrade_inputs/tasks/calculate_hop_chain.yml new file mode 100644 index 0000000000..b9b2fd8e4c --- /dev/null +++ b/upgrade/roles/manage_upgrade_inputs/tasks/calculate_hop_chain.yml @@ -0,0 +1,24 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+--- + +- name: "calculate_hops — Calculate hop chain from upgrade paths" + calculate_upgrade_hops: + upgrade_config: "{{ upgrade_config }}" + current_software_config: "{{ current_software_config }}" + current_omnia_version: "{{ upgrade_source_version }}" + target_omnia_version: "{{ upgrade_target_version }}" + register: _hop_calculation + +- name: "calculate_hops — Parse hop chain result" + ansible.builtin.set_fact: + calculated_hop_chains: "{{ _hop_calculation.hop_chains }}" + total_upgrade_hops: "{{ _hop_calculation.total_hops }}" + upgrade_mode: "{{ _hop_calculation.upgrade_mode }}" + +- name: "calculate_hops — Display calculated hop chain" + ansible.builtin.debug: + msg: + - "Calculated Hop Chain:" + - " Mode: {{ upgrade_mode }}" + - " Total hops: {{ total_upgrade_hops }}" + - "{% for hop in calculated_hop_chains %} - {{ hop.software }}/{{ hop.hop_id }}: {{ hop.from_version }} → {{ hop.to_version }} (Omnia {{ hop.from_omnia_version }} → {{ hop.to_omnia_version }}){% endfor %}" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/display_summary.yml b/upgrade/roles/manage_upgrade_inputs/tasks/display_summary.yml new file mode 100644 index 0000000000..022261bffe --- /dev/null +++ b/upgrade/roles/manage_upgrade_inputs/tasks/display_summary.yml @@ -0,0 +1,24 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+--- + +- name: "summary — Display upgrade input summary" + ansible.builtin.debug: + msg: + - "==========================================" + - "UPGRADE INPUT MANAGEMENT COMPLETE" + - "==========================================" + - "Upgrade ID: {{ upgrade_id }}" + - "Mode: {{ upgrade_mode | default('single_hop') }}" + - "Omnia: {{ upgrade_source_version }} → {{ upgrade_target_version }}" + - "" + - "{% if upgrade_mode == 'multi_hop' %}Multi-Hop Upgrade Path:{% else %}Upgrade Path:{% endif %}" + - "{% for hop in calculated_hop_chains %} - {{ hop.software }}/{{ hop.hop_id }}: {{ hop.from_version }} → {{ hop.to_version }} (Omnia {{ hop.from_omnia_version }} → {{ hop.to_omnia_version }}){% endfor %}" + - "" + - "Total Hops: {{ total_upgrade_hops }}" + - "Backup: {{ upgrade_backup_dir }}" + - "==========================================" + +- name: "summary — Set upgrade facts for downstream roles" + ansible.builtin.set_fact: + upgrade_inputs_complete: true + upgrade_backup_location: "{{ upgrade_backup_dir }}" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/load_upgrade_manifest.yml b/upgrade/roles/manage_upgrade_inputs/tasks/load_upgrade_manifest.yml new file mode 100644 index 0000000000..7ad73c033e --- /dev/null +++ b/upgrade/roles/manage_upgrade_inputs/tasks/load_upgrade_manifest.yml @@ -0,0 +1,57 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +--- + +- name: "load — Check oim_metadata.yml exists" + ansible.builtin.stat: + path: "/opt/omnia/.data/oim_metadata.yml" + register: _oim_metadata_stat + +- name: "load — Fail if oim_metadata.yml not found" + ansible.builtin.fail: + msg: | + oim_metadata.yml not found at /opt/omnia/.data/oim_metadata.yml + This file contains Omnia version information after omnia_core execution. 
+ when: not _oim_metadata_stat.stat.exists + +- name: "load — Load oim_metadata.yml" + ansible.builtin.include_vars: + file: "/opt/omnia/.data/oim_metadata.yml" + name: oim_metadata + +- name: "load — Check upgrade_manifest.yml exists" + ansible.builtin.stat: + path: "{{ role_path }}/../../upgrade_manifest.yml" + register: _upgrade_config_stat + +- name: "load — Fail if upgrade_manifest.yml not found" + ansible.builtin.fail: + msg: | + upgrade_manifest.yml not found at {{ role_path }}/../../upgrade_manifest.yml + This file is the source of truth for upgrade paths and components. + when: not _upgrade_config_stat.stat.exists + +- name: "load — Load upgrade_manifest.yml" + ansible.builtin.include_vars: + file: "{{ role_path }}/../../upgrade_manifest.yml" + name: upgrade_config + +- name: "load — Extract enabled components" + ansible.builtin.set_fact: + enabled_components: >- + {{ upgrade_config.components | dict2items + | selectattr('value.enabled', 'equalto', true) + | list }} + +- name: "load — Set upgrade metadata facts from oim_metadata.yml" + ansible.builtin.set_fact: + upgrade_source_version: "{{ oim_metadata.omnia_previous_version }}" + upgrade_target_version: "{{ oim_metadata.omnia_version }}" + upgrade_id: "{{ lookup('pipe', 'date +%Y%m%d_%H%M%S') }}_{{ 99999 | random }}" + +- name: "load — Display loaded configuration" + ansible.builtin.debug: + msg: + - "Upgrade Config Loaded:" + - " Source: Omnia {{ upgrade_source_version }}" + - " Target: Omnia {{ upgrade_target_version }}" + - " Enabled: {{ enabled_components | map(attribute='key') | list | join(', ') }}" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/main.yml b/upgrade/roles/manage_upgrade_inputs/tasks/main.yml new file mode 100644 index 0000000000..dbaa373a4b --- /dev/null +++ b/upgrade/roles/manage_upgrade_inputs/tasks/main.yml @@ -0,0 +1,49 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# ============================================================================ +# manage_upgrade_inputs Role +# ============================================================================ +# Manages input configuration for upgrade: +# 1. Loads upgrade_manifest.yml (source of truth for components) +# 2. Reads oim_metadata.yml (source of truth for version metadata) +# 3. Validates current deployment versions +# 4. Updates software_config.json with target versions +# 5. 
Backs up current configurations +# +# Key: Reads JSON from actual input directory (versioned files) +# No JSON artifacts - only repos.yml for upgrade repos +# ============================================================================ + +- name: Load upgrade configuration + ansible.builtin.include_tasks: load_upgrade_manifest.yml + +- name: Validate current deployment + ansible.builtin.include_tasks: validate_current_deployment.yml + +- name: Calculate hop chain from upgrade paths + ansible.builtin.include_tasks: calculate_hop_chain.yml + +- name: Backup current configurations + ansible.builtin.include_tasks: backup_configs.yml + +- name: Update software_config.json with target versions + ansible.builtin.include_tasks: update_software_config.yml + +- name: Update component JSON files with version-specific repo names + ansible.builtin.include_tasks: update_component_json_repos.yml + +- name: Display upgrade input summary + ansible.builtin.include_tasks: display_summary.yml diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/update_component_json_repos.yml b/upgrade/roles/manage_upgrade_inputs/tasks/update_component_json_repos.yml new file mode 100644 index 0000000000..f916271fb3 --- /dev/null +++ b/upgrade/roles/manage_upgrade_inputs/tasks/update_component_json_repos.yml @@ -0,0 +1,47 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +# ============================================================================ +# Update Component JSON Files with Version-Specific repo_name +# ============================================================================ +# This task updates JSON files to use version-specific repo names for components +# that support versioned repositories (like slurm_custom). +# ============================================================================ + +- name: "json_repos - Update component JSON files with version-specific repo names" + update_component_json_repos: + input_dir: "{{ input_project_dir }}" + calculated_hop_chains: "{{ calculated_hop_chains | default([]) }}" + architectures: "{{ upgrade_active_architectures }}" + versioned_repo_components: + slurm_custom: "slurm_custom" + # Add other components here that support versioned repositories + register: _json_repo_update_result + +- name: "json_repos - Parse update result" + ansible.builtin.set_fact: + _json_repo_update_summary: "{{ _json_repo_update_result }}" + +- name: "json_repos - Display update summary" + ansible.builtin.debug: + msg: + - "==========================================" + - "Component JSON repo_name Update Summary" + - "==========================================" + - "{% for msg in _json_repo_update_summary.messages %}{{ msg }}{% endfor %}" + - "==========================================" + - "Updated files: {{ _json_repo_update_summary.updated_files | default([]) }}" + - "Success: {{ _json_repo_update_summary.success }}" + - "==========================================" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/update_software_config.yml b/upgrade/roles/manage_upgrade_inputs/tasks/update_software_config.yml new file mode 100644 index 0000000000..c110277605 --- /dev/null +++ b/upgrade/roles/manage_upgrade_inputs/tasks/update_software_config.yml @@ -0,0 +1,19 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+--- + +- name: "update — Update software_config.json with target versions" + update_software_config: + input_file: "{{ input_project_dir }}/software_config.json" + hop_chains: "{{ calculated_hop_chains }}" + upgrade_mode: "{{ upgrade_mode }}" + register: _update_result + +- name: "update — Parse update result" + ansible.builtin.set_fact: + _update_summary: "{{ _update_result }}" + +- name: "update — Display update results" + ansible.builtin.debug: + msg: + - "✓ software_config.json updated" + - "{% for u in _update_summary.updated %} - {{ u.name }}: {{ u.from }} → {{ u.to }}{% endfor %}" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/validate_current_deployment.yml b/upgrade/roles/manage_upgrade_inputs/tasks/validate_current_deployment.yml new file mode 100644 index 0000000000..ce4f10048b --- /dev/null +++ b/upgrade/roles/manage_upgrade_inputs/tasks/validate_current_deployment.yml @@ -0,0 +1,82 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +--- + +- name: "validate — Load current software_config.json" + ansible.builtin.slurp: + path: "{{ input_project_dir }}/software_config.json" + register: _sw_config_content + +- name: "validate — Parse software_config.json" + ansible.builtin.set_fact: + current_software_config: "{{ _sw_config_content.content | b64decode | from_json }}" + +- name: "validate — Extract current versions" + ansible.builtin.set_fact: + current_versions: >- + {{ current_software_config.softwares + | selectattr('version', 'defined') + | items2dict(key_name='name', value_name='version') }} + +- name: "validate — Check oim_metadata for deployment info" + ansible.builtin.slurp: + path: /opt/omnia/.data/oim_metadata.yml + register: _oim_metadata_content + failed_when: false + +- name: "validate — Parse oim_metadata" + ansible.builtin.set_fact: + oim_metadata: "{{ _oim_metadata_content.content | b64decode | from_yaml }}" + when: _oim_metadata_content is succeeded + +- name: "validate - Verify current versions exist for 
enabled components" + ansible.builtin.assert: + that: + - current_versions[item.key] is defined + fail_msg: | + Current version not found for {{ item.key }}! + In software_config.json: {{ current_versions[item.key] | default('not found') }} + Component must be deployed before upgrade. + success_msg: " {{ item.key }}: {{ current_versions[item.key] }}" + loop: "{{ enabled_components }}" + loop_control: + label: "{{ item.key }}" + +- name: "validate - Check Omnia upgrade paths exist for current version" + ansible.builtin.assert: + that: + - upgrade_config.omnia_upgrade_paths[upgrade_source_version] is defined + fail_msg: | + No Omnia upgrade path found from current version {{ upgrade_source_version }}! + Available Omnia upgrade paths: {{ upgrade_config.omnia_upgrade_paths.keys() | list | join(', ') }} + success_msg: " Omnia upgrade path exists from {{ upgrade_source_version }}" + when: upgrade_source_version is defined + run_once: true + +- name: "validate - Check JSON files exist for current version" + ansible.builtin.stat: + path: "{{ input_project_dir }}/config/{{ item.1 }}/{{ current_software_config.cluster_os_type }}/{{ current_software_config.cluster_os_version }}/{{ item.0.key }}_v{{ current_versions[item.0.key] }}.json" + register: _json_exists + loop: "{{ enabled_components | product(upgrade_active_architectures) | list }}" + loop_control: + label: "{{ item.0.key }}/{{ item.1 }}" + when: current_versions[item.0.key] is defined + +- name: "validate - Fail if JSON files missing" + ansible.builtin.fail: + msg: | + JSON file not found for {{ item.item.0.key }} version {{ current_versions[item.item.0.key] }}: {{ item.invocation.module_args.path }} + Ensure the JSON file exists in input/config directory. 
+ Expected naming: {{ item.item.0.key }}_v{{ current_versions[item.item.0.key] }}.json + loop: "{{ _json_exists.results }}" + loop_control: + label: "{{ item.item.0.key }}" + when: + - item.stat is defined + - not item.stat.exists + +- name: "validate — Display validation results" + ansible.builtin.debug: + msg: + - " Current deployment validated" + - " OS: {{ current_software_config.cluster_os_type }} {{ current_software_config.cluster_os_version }}" + - " Versions verified for: {{ enabled_components | map(attribute='key') | list | join(', ') }}" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/validate_hop_chains.yml b/upgrade/roles/manage_upgrade_inputs/tasks/validate_hop_chains.yml new file mode 100644 index 0000000000..fdf9ede037 --- /dev/null +++ b/upgrade/roles/manage_upgrade_inputs/tasks/validate_hop_chains.yml @@ -0,0 +1,69 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +--- + +- name: "validate_hops - Process hop chains for multi-hop upgrades" + ansible.builtin.set_fact: + _processed_hops: >- + {%- set hops = [] -%} + {%- for software_name, software_config in upgrade_config.hop_chains.items() -%} + {%- for hop in software_config.hops -%} + {%- if hop.enabled | default(false) -%} + {%- set _ = hops.append({ + 'software': software_name, + 'hop_id': hop.hop_id, + 'from_version': hop.from_version, + 'to_version': hop.to_version, + 'omnia_version': hop.omnia_version, + 'json_file': hop.json_file + }) -%} + {%- endif -%} + {%- endfor -%} + {%- endfor -%} + {{ hops }} + +- name: "validate_hops - Display processed hops" + ansible.builtin.debug: + msg: + - "Processed {{ _processed_hops | length }} hop(s):" + - "{% for hop in _processed_hops %} - {{ hop.software }}/{{ hop.hop_id }}: {{ hop.from_version }} -> {{ hop.to_version }} (Omnia {{ hop.omnia_version }}){% endfor %}" + +- name: "validate_hops - Validate hop chain sequence" + ansible.builtin.assert: + that: + - item.from_version == current_versions.get(item.software, 
'not_found') + fail_msg: | + Hop validation failed for {{ item.software }}/{{ item.hop_id }}! + Expected from_version: {{ item.from_version }} + Current in software_config.json: {{ current_versions.get(item.software, 'not_found') }} + success_msg: "Hop {{ item.software }}/{{ item.hop_id }}: {{ item.from_version }} -> {{ item.to_version }}" + loop: "{{ _processed_hops }}" + loop_control: + label: "{{ item.software }}/{{ item.hop_id }}" + when: upgrade_mode == 'multi_hop' + +- name: "validate_hops - Check JSON files exist for all hop targets" + ansible.builtin.stat: + path: "{{ input_project_dir }}/config/{{ item.1 }}/{{ current_software_config.cluster_os_type }}/{{ current_software_config.cluster_os_version }}/{{ item.0.json_file }}" + register: _hop_json_check + loop: "{{ _processed_hops | product(upgrade_active_architectures) | list }}" + loop_control: + label: "{{ item.0.software }}/{{ item.0.hop_id }}/{{ item.1 }}" + +- name: "validate_hops - Fail if hop JSON files missing" + ansible.builtin.fail: + msg: | + JSON file not found for hop {{ item.item.0.software }}/{{ item.item.0.hop_id }}: {{ item.invocation.module_args.path }} + Ensure the JSON file exists in input/config directory. + loop: "{{ _hop_json_check.results }}" # each registered result nests the original (hop, arch) pair under .item + loop_control: + label: "{{ item.item.0.software }}/{{ item.item.0.hop_id }}" + when: + - item.stat is defined + - not item.stat.exists + - upgrade_mode == 'multi_hop' + +- name: "validate_hops - Set hop chain facts for downstream roles" + ansible.builtin.set_fact: + upgrade_hop_chain: "{{ _processed_hops }}" + total_upgrade_hops: "{{ _processed_hops | length }}" + when: upgrade_mode == 'multi_hop' diff --git a/upgrade/roles/manage_upgrade_inputs/vars/main.yml b/upgrade/roles/manage_upgrade_inputs/vars/main.yml new file mode 100644 index 0000000000..c5002f3c0e --- /dev/null +++ b/upgrade/roles/manage_upgrade_inputs/vars/main.yml @@ -0,0 +1,6 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+--- + +# Default architectures +upgrade_active_architectures: + - x86_64 diff --git a/upgrade/roles/prep_local_repo/README.md b/upgrade/roles/prep_local_repo/README.md new file mode 100644 index 0000000000..b282a7d70b --- /dev/null +++ b/upgrade/roles/prep_local_repo/README.md @@ -0,0 +1,90 @@ +# prep_local_repo Role + +## Purpose + +Prepares local repository for software upgrades by creating a staging directory with modified configurations and syncing packages for target versions. + +## Key Design Principles + +1. **No JSON Artifacts** - Reads JSON files from actual `/opt/omnia/input/project_default/config/` directory +2. **Staging Directory** - Creates temporary copies of already-updated `software_config.json` and merged `local_repo_config.yml` +3. **Source of Truth** - `upgrade_manifest.yml` defines upgrade paths and enabled components +4. **Only repos.yml** - Maintains upgrade-specific repositories in `upgrade/artifacts/repos.yml` +5. **Read-Only Input** - Does NOT modify files in `/opt/omnia/input/project_default/` (those are updated by `manage_upgrade_inputs` role) + +## Workflow + +``` +1. Validate prerequisites + - Verify software_config.json exists in /opt/omnia/input/project_default/ + - Verify JSON files exist for all hop target versions +2. Create staging directory (/tmp/upgrade_local_repo_XXXXX/) + - Copy software_config.json from /opt/omnia/input/project_default/ (already updated with target versions) + - Copy local_repo_config.yml from /opt/omnia/input/project_default/ and merge with upgrade repos from repos.yml + - Copy versioned JSON files from /opt/omnia/input/project_default/config/ for all hop targets +3. Sync local repository + - Temporarily set input_project_dir to staging directory + - Run validation role + - Run parse_and_download role + - Restore original input_project_dir +4. Cleanup staging directory +``` + +**Note**: The `manage_upgrade_inputs` role updates `software_config.json` with target versions BEFORE this role executes. 
This role only copies the already-updated configuration to staging. + +## Files + +- `tasks/main.yml` - Entry point +- `tasks/load_upgrade_manifest.yml` - Load and validate upgrade_manifest.yml +- `tasks/validate_prerequisites.yml` - Validate versions and JSON files +- `tasks/create_staging.yml` - Create staging directory with merged configs +- `tasks/sync_local_repo.yml` - Run local_repo roles + +## Required Variables + +- `input_project_dir` - Path to input directory +- `upgrade_active_architectures` - List of architectures (default: ['x86_64']) + +## Usage + +```yaml +- name: Prepare local repository for upgrade + ansible.builtin.include_role: + name: prep_local_repo + vars: + input_project_dir: "{{ playbook_dir }}/../input" +``` + +## Staging Directory Structure + +``` +/tmp/upgrade_local_repo_XXXXX/ +├── software_config.json # Copied from /opt/omnia/input/project_default/ (already contains target versions) +├── local_repo_config.yml # Copied from /opt/omnia/input/project_default/ and merged with repos.yml +└── config/ + └── x86_64/ + └── rhel/ + └── 10.0/ + ├── service_k8s_v1.35.1.json # Copied from /opt/omnia/input/project_default/config/ + ├── default_packages.json # Copied from /opt/omnia/input/project_default/config/ + └── ... +``` + +**Important**: All files in staging are COPIES from `/opt/omnia/input/project_default/`. The staging directory is temporary and deleted after package synchronization. + +## JSON File Naming Convention + +JSON files for versioned components use the format: `<component>_v<version>.json` + +Examples: +- `service_k8s_v1.35.1.json` - Kubernetes service packages for version 1.35.1 +- `service_k8s_v1.36.1.json` - Kubernetes service packages for version 1.36.1 + +## Difference from Reference Codebase + +The reference codebase (`sudha_k8s-upgrade_new`) uses JSON artifacts stored in `upgrade/artifacts/service_k8s/v1.35.1/`.
This implementation: + +- **Does NOT use JSON artifacts** - reads from actual input directory +- **Only maintains repos.yml** - for upgrade-specific repository URLs +- **Uses staging directory** - actual copies, no symlinks +- **Simpler structure** - no platform-encoded filenames needed diff --git a/upgrade/roles/prep_local_repo/tasks/create_staging.yml b/upgrade/roles/prep_local_repo/tasks/create_staging.yml new file mode 100644 index 0000000000..78d142715a --- /dev/null +++ b/upgrade/roles/prep_local_repo/tasks/create_staging.yml @@ -0,0 +1,79 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +--- + +# ============================================================================ +# Create staging directory with modified configs +# ============================================================================ +# Creates a temp directory with: +# - software_config.json (copied, version updated to target) +# - local_repo_config.yml (contains only repos from repos.yml) +# - config/ directory structure (copied from input for enabled upgrades) +# +# NO symlinks - actual copies for isolation +# Other files read directly from input_project_dir +# +# NOTE: local_repo_config.yml in staging contains ONLY upgrade-specific repos +# from repos.yml. Base repos (docker-ce, epel, doca, cuda) are already synced +# from initial installation and are not included in staging to avoid +# unnecessary re-syncing. 
+# ============================================================================ + +- name: "staging — Create staging directory" + ansible.builtin.tempfile: + state: directory + prefix: "upgrade_local_repo_" + register: _staging_dir + +- name: "staging — Set staging path fact" + ansible.builtin.set_fact: + upgrade_staging_dir: "{{ _staging_dir.path }}" + +- name: "staging — Extract architectures from software_config.json" + ansible.builtin.set_fact: + _all_architectures: >- + {{ + _current_software_config.softwares + | selectattr('arch', 'defined') + | map(attribute='arch') + | flatten + | unique + | list + }} + +- name: "staging — Set upgrade_active_architectures from software_config.json" + ansible.builtin.set_fact: + upgrade_active_architectures: "{{ _all_architectures }}" + +- name: "staging — Display detected architectures" + ansible.builtin.debug: + msg: + - "Architectures detected from software_config.json: {{ _all_architectures | join(', ') }}" + - "Will process all detected architectures for upgrade" + +- name: "staging — Create staging with modified configs" + create_upgrade_staging: + staging_dir: "{{ _staging_dir.path }}" + input_dir: "{{ input_project_dir }}" + repos_file: "{{ role_path }}/../../artifacts/repos.yml" + enabled_components: "{{ _enabled_components }}" + current_software_config: "{{ _current_software_config }}" + architectures: "{{ upgrade_active_architectures }}" + target_omnia_version: "{{ upgrade_target_version }}" + calculated_hop_chains: "{{ calculated_hop_chains | default([]) }}" + register: _staging_result + +- name: "staging — Parse staging result" + ansible.builtin.set_fact: + _staging_summary: "{{ _staging_result }}" + +- name: "staging — Display staging summary" + ansible.builtin.debug: + msg: + - "==========================================" + - "Staging Directory Created" + - "==========================================" + - "Path: {{ upgrade_staging_dir }}" + - "Repos merged: {{ _staging_summary.repos_merged }}" + - "JSON files: 
{{ _staging_summary.json_files_copied | length }}" + - "Components: {{ _staging_summary.enabled_components | join(', ') }}" + - "==========================================" diff --git a/upgrade/roles/prep_local_repo/tasks/load_upgrade_manifest.yml b/upgrade/roles/prep_local_repo/tasks/load_upgrade_manifest.yml new file mode 100644 index 0000000000..2b45fa7c3c --- /dev/null +++ b/upgrade/roles/prep_local_repo/tasks/load_upgrade_manifest.yml @@ -0,0 +1,87 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +--- + +# ============================================================================ +# Load upgrade configuration (with optimization for integrated flow) +# ============================================================================ +# When called from upgrade_oim.yml, manage_upgrade_inputs has already loaded +# the upgrade_config. This task file checks if config is already loaded and +# skips redundant loading. +# ============================================================================ + +- name: "load_config — Check if upgrade_config already loaded" + ansible.builtin.set_fact: + _config_already_loaded: "{{ upgrade_config is defined and upgrade_config.components is defined }}" + +- name: "load_config — Skip loading if already available" + ansible.builtin.debug: + msg: "Upgrade config already loaded from manage_upgrade_inputs, skipping reload" + when: _config_already_loaded | bool + +- name: "load_config — Check if upgrade_manifest.yml exists" + ansible.builtin.stat: + path: "{{ role_path }}/../../upgrade_manifest.yml" + register: _upgrade_config_stat + when: not (_config_already_loaded | bool) + +- name: "load_config — Fail if upgrade_manifest.yml not found" + ansible.builtin.fail: + msg: | + upgrade_manifest.yml not found at {{ role_path }}/../../upgrade_manifest.yml + This file is the source of truth for upgrade paths. + Please create it before running upgrade. 
+ when: + - not (_config_already_loaded | bool) + - not _upgrade_config_stat.stat.exists + +- name: "load_config — Load upgrade_manifest.yml" + ansible.builtin.include_vars: + file: "{{ role_path }}/../../upgrade_manifest.yml" + name: upgrade_config + when: not (_config_already_loaded | bool) + +- name: "load_config — Validate upgrade_config structure" + ansible.builtin.assert: + that: + - upgrade_config.components is defined + fail_msg: "upgrade_manifest.yml is missing required sections (components)" + when: not (_config_already_loaded | bool) + +- name: "load_config — Check if oim_metadata.yml exists" + ansible.builtin.stat: + path: "/opt/omnia/.data/oim_metadata.yml" + register: _oim_metadata_stat + +- name: "load_config — Fail if oim_metadata.yml not found" + ansible.builtin.fail: + msg: | + oim_metadata.yml not found at /opt/omnia/.data/oim_metadata.yml + This file contains Omnia version information after omnia_core execution. + when: not _oim_metadata_stat.stat.exists + +- name: "load_config — Load oim_metadata.yml" + ansible.builtin.include_vars: + file: "/opt/omnia/.data/oim_metadata.yml" + name: oim_metadata + +- name: "load_config — Extract enabled components" + ansible.builtin.set_fact: + _enabled_components: >- + {{ upgrade_config.components | dict2items + | selectattr('value.enabled', 'equalto', true) + | list }} + +- name: "load_config — Set upgrade target version from oim_metadata.yml" + ansible.builtin.set_fact: + upgrade_target_version: "{{ oim_metadata.omnia_version }}" + +- name: "load_config — Display upgrade configuration" + ansible.builtin.debug: + msg: + - "==========================================" + - "Upgrade Configuration {{ 'Verified' if (_config_already_loaded | bool) else 'Loaded' }}" + - "==========================================" + - "Source Omnia: {{ oim_metadata.omnia_previous_version }}" + - "Target Omnia: {{ oim_metadata.omnia_version }}" + - "Enabled components: {{ _enabled_components | map(attribute='key') | list | join(', ') 
}}" + - "==========================================" diff --git a/upgrade/roles/prep_local_repo/tasks/main.yml b/upgrade/roles/prep_local_repo/tasks/main.yml new file mode 100644 index 0000000000..c17ad54e29 --- /dev/null +++ b/upgrade/roles/prep_local_repo/tasks/main.yml @@ -0,0 +1,39 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# ============================================================================ +# prep_local_repo Role +# ============================================================================ +# Prepares local repository for upgrade by: +# 1. Creating staging directory with modified configs +# 2. Merging repos.yml into local_repo_config.yml +# 3. Updating software_config.json with target versions +# 4. Reading JSON files from actual input directory (versioned files) +# 5. 
Running local_repo roles to sync packages +# +# Key difference from reference: NO JSON artifacts - reads from input directly +# ============================================================================ + +- name: Load upgrade configuration + ansible.builtin.include_tasks: load_upgrade_manifest.yml + +- name: Validate upgrade prerequisites + ansible.builtin.include_tasks: validate_prerequisites.yml + +- name: Create staging directory and merge configs + ansible.builtin.include_tasks: create_staging.yml + +- name: Sync local repository for upgrade + ansible.builtin.include_tasks: sync_local_repo.yml diff --git a/upgrade/roles/prep_local_repo/tasks/sync_local_repo.yml b/upgrade/roles/prep_local_repo/tasks/sync_local_repo.yml new file mode 100644 index 0000000000..c57812ad6c --- /dev/null +++ b/upgrade/roles/prep_local_repo/tasks/sync_local_repo.yml @@ -0,0 +1,92 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +--- + +# ============================================================================ +# Sync local repository using staging directory +# ============================================================================ +# Runs local_repo roles (validation + parse_and_download) using the +# staging directory as input_project_dir. +# +# The parse_and_download role expects variables normally set by earlier roles +# in the local_repo.yml flow (e.g. validate_subscription, pulp_validation). +# We initialise those variables here so the role works standalone. 
+# ============================================================================ + +- name: "sync — Set input_project_dir to staging for local_repo roles" + ansible.builtin.set_fact: + _original_input_project_dir: "{{ input_project_dir }}" + input_project_dir: "{{ upgrade_staging_dir }}" + +- name: "sync — Set variables for local_repo roles" + ansible.builtin.set_fact: + sw_config_json_path: "{{ upgrade_staging_dir }}/software_config.json" + local_repo_config_file: "{{ upgrade_staging_dir }}/local_repo_config.yml" + local_repo_config_path: "{{ upgrade_staging_dir }}/local_repo_config.yml" + project_input_path: "{{ upgrade_staging_dir }}" + default_archs: "{{ upgrade_active_architectures }}" + software_names: "{{ _enabled_components | map(attribute='key') | list }}" + playbook_start_time: "{{ ansible_date_time.epoch }}" + +# Initialise sub_final_repo_urls (normally set by validate_subscription role) +- name: "sync — Initialise subscription repo URLs" + ansible.builtin.set_fact: + sub_final_repo_urls: "{{ sub_final_repo_urls | default({}) }}" + +# Get actual Pulp URL from pulp status command (same as pulp_validation role) +- name: "sync — Get Pulp status" + ansible.builtin.command: /usr/local/bin/pulp status + delegate_to: localhost + changed_when: false + register: _pulp_status_output + +- name: "sync — Set Pulp connection variables from pulp status" + ansible.builtin.set_fact: + pulp_content_origin: "{{ (_pulp_status_output.stdout | from_json).content_settings.content_origin }}" + +- name: "sync — Parse Pulp connection details" + ansible.builtin.set_fact: + pulp_protocol: "{{ pulp_content_origin | urlsplit('scheme') | lower }}" + pulp_server_ip: "{{ pulp_content_origin | urlsplit('hostname') }}" + pulp_server_port: "{{ pulp_content_origin | urlsplit('port') }}" + +- name: "sync — Add oim host to inventory" + ansible.builtin.add_host: + name: oim + pulp_protocol: "{{ pulp_protocol }}" + pulp_server_port: "{{ pulp_server_port }}" + ansible_connection: local + +- name:
"sync — Display sync configuration" + ansible.builtin.debug: + msg: + - "Starting local repo sync..." + - "Staging dir: {{ upgrade_staging_dir }}" + - "Softwares: {{ software_names | join(', ') }}" + - "Pulp: {{ pulp_content_origin }}" + +- name: "sync — Run validation role" + ansible.builtin.include_role: + name: "{{ role_path }}/../../../local_repo/roles/validation" + +- name: "sync — Run parse_and_download role" + ansible.builtin.include_role: + name: "{{ role_path }}/../../../local_repo/roles/parse_and_download" + +- name: "sync — Restore original input_project_dir" + ansible.builtin.set_fact: + input_project_dir: "{{ _original_input_project_dir }}" + +- name: "sync — Display sync completion" + ansible.builtin.debug: + msg: + - "==========================================" + - "Local Repository Sync Completed" + - "==========================================" + - "Components synced: {{ software_names | join(', ') }}" + - "==========================================" + +- name: "sync — Cleanup staging directory" # removes the temp dir created in create_staging.yml unless upgrade_cleanup_staging is false + ansible.builtin.file: + path: "{{ upgrade_staging_dir }}" + state: absent + when: upgrade_cleanup_staging | default(true) | bool diff --git a/upgrade/roles/prep_local_repo/tasks/validate_prerequisites.yml b/upgrade/roles/prep_local_repo/tasks/validate_prerequisites.yml new file mode 100644 index 0000000000..e8149c746e --- /dev/null +++ b/upgrade/roles/prep_local_repo/tasks/validate_prerequisites.yml @@ -0,0 +1,89 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+# ============================================================================ + +- name: "validate — Check if software config already loaded" + ansible.builtin.set_fact: + _sw_config_already_loaded: "{{ current_software_config is defined and current_software_config.softwares is defined }}" + +- name: "validate — Reuse existing software config" + ansible.builtin.set_fact: + _current_software_config: "{{ current_software_config }}" + when: _sw_config_already_loaded | bool + +- name: "validate — Load current software_config.json" + ansible.builtin.slurp: + path: "{{ input_project_dir }}/software_config.json" + register: _current_sw_config + when: not (_sw_config_already_loaded | bool) + +- name: "validate — Parse software_config.json" + ansible.builtin.set_fact: + _current_software_config: "{{ _current_sw_config.content | b64decode | from_json }}" + when: not (_sw_config_already_loaded | bool) + +- name: "validate - Extract current versions for enabled components" # entries without a version are dropped here so the assert below reports them instead of items2dict erroring + ansible.builtin.set_fact: + _current_versions: >- + {{ _current_software_config.softwares + | selectattr('name', 'in', _enabled_components | map(attribute='key') | list) + | selectattr('version', 'defined') | items2dict(key_name='name', value_name='version') }} + +- name: "validate - Verify current versions exist for enabled components" + ansible.builtin.assert: + that: + - _current_versions[item.key] is defined + fail_msg: | + Current version not found for {{ item.key }}! + In software_config.json: {{ _current_versions[item.key] | default('not found') }} + Component must be deployed before upgrade.
+ success_msg: " {{ item.key }}: {{ _current_versions[item.key] }}" + loop: "{{ _enabled_components }}" + loop_control: + label: "{{ item.key }}" + +- name: "validate - Check current version JSON files exist in input directory" + ansible.builtin.stat: + path: "{{ input_project_dir }}/config/{{ item.1 }}/{{ _current_software_config.cluster_os_type }}/{{ _current_software_config.cluster_os_version }}/{{ item.0.key }}_v{{ _current_versions[item.0.key] }}.json" + register: _json_file_check + loop: "{{ _enabled_components | product(upgrade_active_architectures) | list }}" + loop_control: + label: "{{ item.0.key }} ({{ item.1 }})" + when: _current_versions[item.0.key] is defined + +- name: "validate - Verify JSON files exist" + ansible.builtin.assert: + that: + - item.stat.exists + fail_msg: | + JSON file not found: {{ item.invocation.module_args.path }} + Expected naming: {{ item.item.0.key }}_v{{ _current_versions[item.item.0.key] }}.json + Ensure the versioned JSON file exists in input/config directory. 
+ loop: "{{ _json_file_check.results }}" + loop_control: + label: "{{ item.invocation.module_args.path | basename }}" + when: item.stat is defined + +- name: "validate — Check repos.yml exists" + ansible.builtin.stat: + path: "{{ role_path }}/../../artifacts/repos.yml" + register: _repos_yml_stat + +- name: "validate — Warn if repos.yml not found" + ansible.builtin.debug: + msg: "Warning: repos.yml not found at {{ role_path }}/../../artifacts/repos.yml - upgrade repos may not be available" + when: not _repos_yml_stat.stat.exists + +- name: "validate - Display validation summary" + ansible.builtin.debug: + msg: + - " Prerequisites validated" + - " - Current versions verified for: {{ _enabled_components | map(attribute='key') | list | join(', ') }}" + - " - JSON files exist in input directory" + - " - repos.yml: {{ 'found' if _repos_yml_stat.stat.exists else 'not found' }}" + - " - Config source: {{ 'reused from manage_upgrade_inputs' if (_sw_config_already_loaded | bool) else 'loaded fresh' }}" diff --git a/upgrade/roles/prep_local_repo/vars/main.yml b/upgrade/roles/prep_local_repo/vars/main.yml new file mode 100644 index 0000000000..a1225501bb --- /dev/null +++ b/upgrade/roles/prep_local_repo/vars/main.yml @@ -0,0 +1,8 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +--- + +# Architectures will be dynamically determined from software_config.json +# upgrade_active_architectures: [] # Set dynamically in create_staging.yml + +# Cleanup staging directory after sync +upgrade_cleanup_staging: true diff --git a/upgrade/roles/upgrade_oim/tasks/main.yml b/upgrade/roles/upgrade_oim/tasks/main.yml index 196366870b..eb51f1ee9b 100644 --- a/upgrade/roles/upgrade_oim/tasks/main.yml +++ b/upgrade/roles/upgrade_oim/tasks/main.yml @@ -13,6 +13,36 @@ # limitations under the License. 
--- -- name: Include import input parameters +- name: Verify upgrade_manifest.yml exists + ansible.builtin.stat: + path: "{{ role_path }}/../../upgrade_manifest.yml" + register: _upgrade_config_check + +- name: Fail if upgrade_manifest.yml missing + ansible.builtin.fail: + msg: | + upgrade_manifest.yml not found at {{ role_path }}/../../upgrade_manifest.yml + + This file is the SOURCE OF TRUTH for upgrade paths. + Please create it with the required upgrade configuration. + when: not _upgrade_config_check.stat.exists + +# Phase 1: Import and transform input parameters +- name: "Phase 1 - Import Input Parameters" ansible.builtin.include_role: name: import_input_parameters + +# Phase 2: Manage upgrade inputs (load config, validate, calculate hops) +- name: "Phase 2 - Manage Upgrade Inputs" + ansible.builtin.include_role: + name: manage_upgrade_inputs + +# Phase 3: Prepare local repository for upgrade +- name: "Phase 3 - Prepare Local Repository" + ansible.builtin.include_role: + name: prep_local_repo + +# Phase 4: Execute OIM upgrade (this would be the actual OIM upgrade tasks) +- name: "Phase 4 - Execute OIM Upgrade Tasks" + ansible.builtin.debug: + msg: "OIM upgrade tasks would be executed here" diff --git a/upgrade/upgrade_manifest.yml b/upgrade/upgrade_manifest.yml new file mode 100644 index 0000000000..96a36bbda1 --- /dev/null +++ b/upgrade/upgrade_manifest.yml @@ -0,0 +1,87 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# ============================================================================ +# OMNIA UPGRADE CONFIGURATION (Production Recommended) +# ============================================================================ +# Upgrade metadata (source and target Omnia versions) is read automatically +# from /opt/omnia/.data/oim_metadata.yml after omnia_core execution. +# +# Design: +# omnia_upgrade_paths — defines the Omnia version upgrade sequence. +# Each entry specifies the software versions for the NEXT Omnia version. +# +# components — defines each software component and its valid version sequence. +# supported_versions — ordered list used for automatic intermediate hop +# generation when a K8s version gap is detected. +# +# Automatic intermediate hop detection: +# When the target K8s version skips one or more entries in supported_versions, +# the system auto-generates one K8s hop per intermediate version. 
+# +# Example: Omnia 2.1.0.0 (K8s 1.34.1) → Omnia 2.3.0.0 (K8s 1.37.1) +# Omnia path : 2.1.0.0 → 2.2.0.0 → 2.3.0.0 +# K8s hops : 1.34.1 → 1.35.1 (Omnia 2.1→2.2, direct) +# 1.35.1 → 1.36.1 (auto-generated, within 2.2→2.3) +# 1.36.1 → 1.37.1 (Omnia 2.2→2.3, final) +# ============================================================================ + +# ============================================================================ +# OMNIA VERSION UPGRADE PATHS +# ============================================================================ +# Each entry: +# "<current_omnia_version>": +# next_omnia_version: "<next_omnia_version>" +# software_versions: +# <software_name>: "<target_version>" +# ============================================================================ +omnia_upgrade_paths: + "2.1.0.0": + next_omnia_version: "2.2.0.0" + software_versions: + service_k8s: "1.35.1" + # Uncomment to enable multi-hop upgrade to Omnia 2.3.0.0: + # K8s 1.35.1 -> 1.37.1 will auto-generate intermediate hop via 1.36.1 + # "2.2.0.0": + # next_omnia_version: "2.3.0.0" + # software_versions: + # service_k8s: "1.37.1" + +# ============================================================================ +# COMPONENT CONFIGURATION +# ============================================================================ +# Each component: +# json_file — base name for versioned JSON files +# (e.g., "service_k8s" → service_k8s_v1.35.1.json) +# enabled — whether this component participates in upgrade +# supported_versions — ordered list of all valid software versions. +# When target skips versions, intermediate hops are +# auto-generated in sequence order.
+# ============================================================================ +components: + service_k8s: + json_file: "service_k8s" + enabled: true + supported_versions: + - "1.34.1" + - "1.35.1" + # Additional components (placeholders) + # slurm_custom: + # json_file: "slurm_custom" + # enabled: false + # supported_versions: + # - "24.05" + # - "25.11" + # - "26.05" diff --git a/upgrade/upgrade_oim.yml b/upgrade/upgrade_oim.yml index aa6e6fb5fc..a2e29d7885 100644 --- a/upgrade/upgrade_oim.yml +++ b/upgrade/upgrade_oim.yml @@ -16,6 +16,9 @@ - name: Upgrade OIM tasks hosts: localhost connection: local + gather_facts: true + vars: + input_project_dir: "/opt/omnia/input/project_default" roles: - role: ../utils/roles/include_input_dir - role: upgrade_oim From 24760fd0aaa709a16b0bf5da36dcc9c5ea01092a Mon Sep 17 00:00:00 2001 From: pullan1 Date: Tue, 28 Apr 2026 15:55:54 +0530 Subject: [PATCH 07/17] updated vars Signed-off-by: pullan1 --- upgrade/roles/prep_local_repo/vars/main.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/upgrade/roles/prep_local_repo/vars/main.yml b/upgrade/roles/prep_local_repo/vars/main.yml index a1225501bb..021fbbf524 100644 --- a/upgrade/roles/prep_local_repo/vars/main.yml +++ b/upgrade/roles/prep_local_repo/vars/main.yml @@ -1,8 +1,9 @@ # Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
--- -# Architectures will be dynamically determined from software_config.json -# upgrade_active_architectures: [] # Set dynamically in create_staging.yml +# Architectures - default value, can be overridden by playbook or dynamically from software_config.json +upgrade_active_architectures: + - x86_64 # Default architecture, will be updated in create_staging.yml if needed # Cleanup staging directory after sync upgrade_cleanup_staging: true From 64f777aec678e5ac8bd0704dc696b0f8307d9bbd Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Thu, 7 May 2026 18:12:41 +0530 Subject: [PATCH 08/17] provision update --- .../tasks/provision_mapping_nodes.yml | 79 ++++- ...-group-login_compiler_node_aarch64.yaml.j2 | 128 ++------ ...i-group-login_compiler_node_x86_64.yaml.j2 | 124 ++----- .../ci-group-login_node_aarch64.yaml.j2 | 14 +- .../ci-group-login_node_x86_64.yaml.j2 | 14 +- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 19 +- ...ci-group-slurm_control_node_x86_64.yaml.j2 | 4 +- .../ci-group-slurm_node_aarch64.yaml.j2 | 165 +++++++--- .../ci-group-slurm_node_x86_64.yaml.j2 | 177 +++++++--- .../doca-ofed/configure-ib-network.sh.j2 | 41 +-- .../hpc_tools/cuda_lock_manager.sh.j2 | 78 +++++ .../hpc_tools/generate_install_uuid.sh.j2 | 10 + .../hpc_tools/install_cuda_driver.sh.j2 | 42 +++ .../hpc_tools/install_cuda_toolkit.sh.j2 | 166 ++++++++++ .../templates/hpc_tools/install_dcgm.sh.j2 | 98 ++++++ .../hpc_tools/install_nvidia_peermem.sh.j2 | 141 ++++++++ .../hpc_tools/slurm_cuda_coordinator.sh.j2 | 50 +++ .../configure_powerscale_syslog.sh.j2 | 308 ++++++++++++++++++ .../deploy_powerscale_telemetry.sh.j2 | 8 +- .../verify_powerscale_syslog_ut.sh.j2 | 130 ++++++++ .../templates/telemetry/telemetry.sh.j2 | 10 +- .../vast/configure_vast_installation.sh.j2 | 94 ++++++ .../roles/configure_ochami/vars/main.yml | 39 ++- .../tasks/create_k8s_config_nfs.yml | 2 +- provision/roles/k8s_config/vars/main.yml | 3 +- .../tasks/include_software_config.yml | 9 +- 
.../provision_validations/tasks/main.yml | 5 +- .../tasks/validate_telemetry_config.yml | 12 +- .../roles/provision_validations/vars/main.yml | 6 +- .../slurm_config/files/pull_benchmarks.sh | 149 +++++++++ .../slurm_config/tasks/create_slurm_dir.yml | 16 - .../roles/slurm_config/tasks/hpc_tools.yml | 17 +- .../tasks/read_slurm_hostnames.yml | 8 + .../templates/benchmark_tools.list.j2 | 14 + .../templates/pull_benchmarks.sh.j2 | 164 ++++++++++ provision/roles/slurm_config/vars/main.yml | 14 +- .../tasks/apply_telemetry_on_upgrade.yml | 8 +- .../tasks/deploy_powerscale_logs.yml | 49 +++ .../tasks/deploy_powerscale_metrics.yml | 1 - .../tasks/derive_sink_support_flags.yml | 90 +++++ .../generate_service_cluster_metadata.yml | 2 +- .../tasks/generate_telemetry_deployments.yml | 39 +-- .../telemetry/tasks/load_service_images.yml | 2 +- provision/roles/telemetry/tasks/main.yml | 51 ++- .../telemetry/tasks/read_software_config.yml | 4 +- .../telemetry/tasks/telemetry_prereq.yml | 27 +- .../telemetry/cleanup_telemetry.sh.j2 | 4 + .../common/telemetry_pod_cleanup.yaml.j2 | 2 +- .../common/telemetry_secret_creation.yaml.j2 | 3 +- .../idrac_telemetry_statefulset.yaml.j2 | 2 +- .../telemetry/kafka/kafka.kafka.yaml.j2 | 10 +- .../kafka/kafka.kafkapump_user.yaml.j2 | 4 +- .../kafka/kafka.tls_test_job.yaml.j2 | 12 +- .../templates/telemetry/kustomization.yaml.j2 | 34 +- .../ldms/ldms_machine_config.json.j2 | 6 +- .../telemetry/ldms/ldmsd.sampler.env.j2 | 2 +- .../templates/telemetry/ldms/sampler.conf.j2 | 4 +- .../templates/telemetry/ldms/values.yaml.j2 | 4 +- .../victoria/gen_victoria_certs.sh.j2 | 64 ++-- .../victoria-agent-deployment.yaml.j2 | 2 +- .../victoria-operator-vmcluster.yaml.j2 | 6 +- ...perator-vmservicescrape-powerscale.yaml.j2 | 50 +++ .../victoria-operator-vmsingle.yaml.j2 | 4 +- .../victoria/victoria-statefulset.yaml.j2 | 4 +- .../victorialogs-operator-vlagent.yaml.j2 | 99 +++--- .../victorialogs-operator-vlcluster.yaml.j2 | 7 +- 
.../victorialogs-vlagent-config.yaml.j2 | 6 +- .../vlagent-syslog-tls-secret.yaml.j2 | 23 ++ .../victoria/vmagent-scrape-config.yaml.j2 | 2 +- provision/roles/telemetry/vars/main.yml | 156 ++++++++- 70 files changed, 2569 insertions(+), 572 deletions(-) create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/verify_powerscale_syslog_ut.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/vast/configure_vast_installation.sh.j2 create mode 100644 provision/roles/slurm_config/files/pull_benchmarks.sh create mode 100644 provision/roles/slurm_config/templates/benchmark_tools.list.j2 create mode 100644 provision/roles/slurm_config/templates/pull_benchmarks.sh.j2 create mode 100644 provision/roles/telemetry/tasks/deploy_powerscale_logs.yml create mode 100644 provision/roles/telemetry/tasks/derive_sink_support_flags.yml create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/vlagent-syslog-tls-secret.yaml.j2 diff --git a/provision/roles/configure_ochami/tasks/provision_mapping_nodes.yml 
b/provision/roles/configure_ochami/tasks/provision_mapping_nodes.yml index 24314209aa..7abf250957 100644 --- a/provision/roles/configure_ochami/tasks/provision_mapping_nodes.yml +++ b/provision/roles/configure_ochami/tasks/provision_mapping_nodes.yml @@ -56,37 +56,96 @@ - name: Delete smd configuration ansible.builtin.include_tasks: delete_smd_config.yml - - name: Restart the cloud-init service to clear node instance data + - name: Restart the cloud-init service to refresh node instance data ansible.builtin.service: name: cloud-init-server state: restarted + - name: Check if cloud-init-server is running + ansible.builtin.service: + name: cloud-init-server + state: started + register: cloud_init_server_status + retries: "{{ service_retries }}" + delay: "{{ service_retry_interval }}" + until: cloud_init_server_status is success + + - name: Wait for cloud-init-server to be ready + ansible.builtin.command: /usr/bin/ochami cloud-init service status + register: cloud_init_status + retries: "{{ service_retries }}" + delay: "{{ service_retry_interval }}" + until: cloud_init_status.rc == 0 + changed_when: false + + - name: Fail if cloud-init-server is not running + ansible.builtin.fail: + msg: "{{ cloud_init_failed_msg }}" + when: + - cloud_init_server_status is defined + - cloud_init_server_status is not success + - name: Check whether openchami.target is up ansible.builtin.service: name: openchami.target state: started register: openchami_target_status - retries: 4 - delay: 5 + retries: "{{ service_retries }}" + delay: "{{ service_retry_interval }}" until: openchami_target_status is success - name: Fail if openchami.target is not up ansible.builtin.fail: - msg: "openchami.target is not up" + msg: "{{ openchami_target_failed_msg }}" when: - openchami_target_status is defined - openchami_target_status is not success - - name: Provision ochami nodes + - name: Discover ochami nodes with retries block: - - name: Provision ochami nodes + - name: Discover ochami nodes 
ansible.builtin.command: /usr/bin/ochami discover static -f yaml -d @"{{ openchami_nodes_vars_path }}" --overwrite - changed_when: true register: openchami_provision + changed_when: openchami_provision.rc == 0 + failed_when: openchami_provision.rc != 0 rescue: - - name: Failed to provision nodes - ansible.builtin.debug: - msg: "{{ provision_fail_msg }}. Error: {{ openchami_provision.stderr_lines }}" + - name: Check if smd service is running + ansible.builtin.service: + name: smd + state: started + register: smd_service_status + retries: "{{ service_retries }}" + delay: "{{ service_retry_interval }}" + until: smd_service_status is success + + - name: Fail if smd service is not running + ansible.builtin.fail: + msg: "smd service is not running" + when: + - smd_service_status is defined + - smd_service_status is not success + + - name: Wait for SMD API to be ready + ansible.builtin.command: /usr/bin/ochami smd service status + register: smd_api_ready + retries: "{{ service_retries }}" + delay: "{{ service_retry_interval }}" + until: smd_api_ready.rc == 0 + changed_when: false + + - name: Retry discover ochami nodes + ansible.builtin.command: /usr/bin/ochami discover static -f yaml -d @"{{ openchami_nodes_vars_path }}" --overwrite + register: openchami_provision + changed_when: openchami_provision.rc == 0 + failed_when: openchami_provision.rc != 0 + retries: 2 + delay: "{{ service_retry_interval }}" + until: openchami_provision.rc == 0 + + - name: Fail if ochami discovery still unsuccessful + ansible.builtin.fail: + msg: "{{ ochami_discovery_failed_msg }}" + when: openchami_provision.rc != 0 - name: Verify node created in smd ansible.builtin.shell: | diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 303baf5743..1750e50be0 100644 --- 
a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -77,96 +77,25 @@ IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes - - path: /usr/local/bin/install_cuda_toolkit.sh - permissions: '0755' +{% if login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/cuda_toolkit_install.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "===== Starting CUDA Toolkit installation =====" - - # Check if CUDA toolkit is already installed - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." - exit 0 - fi - - echo "[INFO] Mounting NFS runfile directory for CUDA toolkit..." - mkdir -p /cuda-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /cuda-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Setting up shared CUDA directory..." - # Create and mount shared directory for compute nodes - mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS cuda share. Exiting." - umount /cuda-runfile 2>/dev/null - exit 1 - fi - - echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." - if [ -f "/cuda-runfile/{{ cuda_runfile_aarch64 }}" ]; then - mkdir -p /shared-cuda-toolkit/tmp - # Install toolkit directly to the NFS-mounted shared location - bash /cuda-runfile/{{ cuda_runfile_aarch64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override - - if [ $? 
-eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully to shared location." - - # Set up environment variables pointing to shared location - cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - ENDOFFILE - - # Apply environment variables for current session - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - - echo "[INFO] CUDA environment configured" - else - echo "[ERROR] CUDA toolkit installation failed." - fi - else - echo "[ERROR] CUDA toolkit runfile not found in /cuda-runfile/" - fi - - echo "[INFO] Verifying CUDA toolkit installation..." - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION" - echo "[INFO] CUDA installation path: $(which nvcc)" - else - echo "[ERROR] CUDA toolkit (nvcc) not found after installation." - fi - - echo "[INFO] Setting up shared CUDA directory for compute nodes..." - # Create shared directory for compute nodes to mount - mkdir -p /shared-cuda-toolkit - # Mount the shared NFS location where compute nodes will access the toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} - echo "[INFO] Copying CUDA toolkit to shared location..." - # Copy the installed CUDA toolkit to the shared location for compute nodes - #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/' - cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true - - echo "[INFO] Cleaning up temporary mounts..." 
- umount /cuda-runfile 2>/dev/null - rmdir /cuda-runfile 2>/dev/null + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} - echo "===== CUDA Toolkit installation completed =====" + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf @@ -182,7 +111,7 @@ {{ lookup('template', 'templates/openldap/update_ldap_conf.sh.j2') | indent(12) }} {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - path: /root/ldms_sampler.sh owner: root:root permissions: '0755' @@ -221,6 +150,12 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -240,7 +175,6 @@ runcmd: - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_cuda_toolkit.sh # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools @@ -254,11 +188,19 @@ - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/configure_vast_installation.sh + - mount -a + + +{% if login_compiler_node_present %} + - /usr/local/bin/generate_install_uuid.sh + - /usr/local/bin/install_cuda_toolkit.sh +{% endif %} + +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or ldms_support %} -{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} # Add NFS entry and mount - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -284,7 +226,7 @@ # - echo "NFS must be mounted at {{ client_mount_path }} before running." {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - /root/ldms_sampler.sh {% endif %} @@ -390,4 +332,4 @@ # nvidia sdk install - /usr/local/bin/install_nvhpc_sdk.sh - /usr/local/bin/configure_nvhpc_env.sh - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." 
\ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index fbf39d348c..7ee7580733 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -77,96 +77,25 @@ IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes - - path: /usr/local/bin/install_cuda_toolkit.sh - permissions: '0755' +{% if login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' content: | - #!/bin/bash - LOGFILE="/var/log/cuda_toolkit_install.log" - exec > >(tee -a "$LOGFILE") 2>&1 - - echo "===== Starting CUDA Toolkit installation =====" - - # Check if CUDA toolkit is already installed - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[INFO] CUDA toolkit already installed (version: ${CUDA_VERSION}). Exiting." - exit 0 - fi - - echo "[INFO] Mounting NFS runfile directory for CUDA toolkit..." - mkdir -p /cuda-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /cuda-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Setting up shared CUDA directory..." - # Create and mount shared directory for compute nodes - mkdir -p /shared-cuda-toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS cuda share. Exiting." - umount /cuda-runfile 2>/dev/null - exit 1 - fi - - echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." 
- if [ -f "/cuda-runfile/{{ cuda_runfile_x86_64 }}" ]; then - mkdir -p /shared-cuda-toolkit/tmp - # Install toolkit directly to the NFS-mounted shared location - bash /cuda-runfile/{{ cuda_runfile_x86_64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override - - if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully to shared location." - - # Set up environment variables pointing to shared location - cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - ENDOFFILE - - # Apply environment variables for current session - export PATH=/shared-cuda-toolkit/bin:$PATH - export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/shared-cuda-toolkit - - echo "[INFO] CUDA environment configured" - else - echo "[ERROR] CUDA toolkit installation failed." - fi - else - echo "[ERROR] CUDA toolkit runfile not found in /cuda-runfile/" - fi - - echo "[INFO] Verifying CUDA toolkit installation..." - if command -v nvcc &>/dev/null; then - CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | sed 's/,//') - echo "[SUCCESS] CUDA toolkit verified: version $CUDA_VERSION" - echo "[INFO] CUDA installation path: $(which nvcc)" - else - echo "[ERROR] CUDA toolkit (nvcc) not found after installation." - fi + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} - echo "[INFO] Setting up shared CUDA directory for compute nodes..." - # Create shared directory for compute nodes to mount - mkdir -p /shared-cuda-toolkit - # Mount the shared NFS location where compute nodes will access the toolkit - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit - - echo "[INFO] Copying CUDA toolkit to shared location..." 
- # Copy the installed CUDA toolkit to the shared location for compute nodes - #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/' - cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true - - echo "[INFO] Cleaning up temporary mounts..." - umount /cuda-runfile 2>/dev/null - rmdir /cuda-runfile 2>/dev/null + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} - echo "===== CUDA Toolkit installation completed =====" + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf @@ -182,7 +111,7 @@ {{ lookup('template', 'templates/openldap/update_ldap_conf.sh.j2') | indent(12) }} {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - path: /root/ldms_sampler.sh owner: root:root permissions: '0755' @@ -221,6 +150,12 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -240,7 +175,7 @@ runcmd: - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_cuda_toolkit.sh + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh @@ -259,8 +194,14 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/configure_vast_installation.sh -{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} +{% if login_compiler_node_present %} + - /usr/local/bin/generate_install_uuid.sh + - /usr/local/bin/install_cuda_toolkit.sh +{% endif %} + +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or ldms_support %} # Add NFS entry and mount - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -286,7 +227,7 @@ # - echo "NFS must be mounted at {{ client_mount_path }} before running." {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - /root/ldms_sampler.sh {% endif %} @@ -361,6 +302,7 @@ - systemctl restart sshd - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - mkdir -p /etc/containers/registries.conf.d - mv /tmp/apptainer_mirror.conf /etc/containers/registries.conf.d/apptainer_mirror.conf diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 406a50a5a0..176e5e6325 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -94,7 +94,7 @@ {{ lookup('template', 'templates/openldap/update_ldap_conf.sh.j2') | indent(12) }} {% endif %} -{% if 
hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - path: /root/ldms_sampler.sh owner: root:root permissions: '0755' @@ -121,6 +121,12 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -138,13 +144,15 @@ - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/configure_vast_installation.sh + - mount -a - bash /usr/local/bin/doca-install.sh || true - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh @@ -243,7 +251,7 @@ {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log # Add NFS entry and mount diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 
b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 50f85187b1..c7c2036a4b 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -93,7 +93,7 @@ content: | {{ lookup('template', 'templates/openldap/update_ldap_conf.sh.j2') | indent(12) }} {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - path: /root/ldms_sampler.sh owner: root:root permissions: '0755' @@ -120,6 +120,12 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -138,15 +144,17 @@ - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/configure_vast_installation.sh + - mount -a - bash 
/usr/local/bin/doca-install.sh || true - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh @@ -246,7 +254,7 @@ {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log # Add NFS entry and mount diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 7eb3c72cc1..0d01edee47 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -332,7 +332,7 @@ ipAddressPools: - first-pool -{% if hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support'] %} +{% if idrac_telemetry_support or ldms_support %} - path: /root/telemetry.sh owner: root:root permissions: '0755' @@ -1042,14 +1042,27 @@ systemctl restart nfs-client.target systemctl restart rpcbind -{% if hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool %} +{% if powerscale_metrics_enabled | default(false) | bool %} {% include 'powerscale/deploy_powerscale_telemetry.sh.j2' %} {% endif %} -{% if hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support'] %} +{% if idrac_telemetry_support or ldms_support %} echo "Applying Telemetry Kubernetes deployments" /root/telemetry.sh {% endif %} + +{% if powerscale_log_enabled | default(false) | bool %} + echo "===== Configuring PowerScale Syslog Forwarding =====" + if [ -f "{{ k8s_client_mount_path }}/telemetry/deployments/configure_powerscale_syslog.sh" ]; then + echo "Running PowerScale syslog configuration script..." 
+ bash "{{ k8s_client_mount_path }}/telemetry/deployments/configure_powerscale_syslog.sh" 2>&1 + echo "PowerScale syslog configuration completed (exit code: $?)" + else + echo "WARNING: PowerScale syslog configuration script not found." + echo "Ensure deploy_powerscale_logs.yml ran successfully during provisioning." + fi +{% endif %} + echo "Rollout and Restart coredns" kubectl rollout restart deployment coredns -n kube-system sleep 30 diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index d72541d774..2f0c16b577 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -287,7 +287,7 @@ {{ lookup('template', 'templates/openldap/update_ldap_conf.sh.j2') | indent(12) }} {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - path: /root/ldms_sampler.sh owner: root:root permissions: '0755' @@ -549,7 +549,7 @@ - systemctl restart sshd {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log # Add NFS entry and mount diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 145f79190d..41b0df8707 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -95,37 +95,21 @@ exit 0 fi - echo "[INFO] NVIDIA GPU detected. Proceeding with setup." + echo "[INFO] NVIDIA GPU detected. Proceeding with setup and CUDA installation." 
# Check if NVIDIA driver is already installed if command -v nvidia-smi &>/dev/null; then echo "[INFO] NVIDIA driver already installed. Skipping driver installation." else - echo "[INFO] Mounting NFS runfile directory for driver installation..." - mkdir -p /gpu-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /gpu-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Installing NVIDIA driver..." - if [ -f "/gpu-runfile/{{ cuda_runfile_aarch64 }}" ]; then - bash /gpu-runfile/{{ cuda_runfile_aarch64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build - if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then - echo "[SUCCESS] NVIDIA driver installed successfully." - nvidia-smi -pm 1 - else - echo "[ERROR] NVIDIA driver installation failed." - fi + echo "[INFO] Installing NVIDIA driver (proprietary kernel module)..." + dnf install -y cuda-drivers + if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then + echo "[SUCCESS] NVIDIA driver installed successfully." + nvidia-smi -pm 1 else - echo "[ERROR] NVIDIA driver runfile not found in /gpu-runfile/" + echo "[ERROR] NVIDIA driver installation failed." + exit 1 fi - - echo "[INFO] Cleaning up temporary NFS mount..." - umount /gpu-runfile 2>/dev/null - rmdir /gpu-runfile 2>/dev/null fi echo "[INFO] Setting up CUDA toolkit mount..." @@ -214,7 +198,6 @@ echo "===== NVIDIA GPU setup completed =====" -{% if dcgm_support %} - path: /usr/local/bin/setup_dcgm.sh permissions: '0755' content: | @@ -236,12 +219,50 @@ exit 0 fi echo "[INFO] NVIDIA driver prerequisite satisfied." + + # Display nvidia-smi output for verification + echo "========== NVIDIA Driver & GPU Information ==========" + nvidia-smi 2>&1 + echo "=====================================================" + + # Detect CUDA major version for DCGM package selection + echo "[INFO] Detecting CUDA version for DCGM package compatibility..." 
+ # Try to get CUDA version from nvidia-smi + CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1) + + # Fallback: Try to get CUDA version from nvcc if available + if [ -z "$CUDA_VERSION" ]; then + if command -v nvcc &>/dev/null; then + CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1) + echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION" + else + echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc." + echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup." + exit 1 + fi + else + echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION" + fi - # Check if datacenter-gpu-manager package is installed - if ! rpm -q datacenter-gpu-manager-4-core &>/dev/null; then - echo "[ERROR] datacenter-gpu-manager-4-core RPM not installed. Skipping DCGM setup." + # Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies + echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..." + if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then + echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup." exit 1 fi + echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully." + + # Install multinode diagnostic plugin for CUDA 12+ (optional but recommended for HPC) + if [ "$CUDA_VERSION" -ge "12" ]; then + echo "[INFO] Installing DCGM multinode diagnostic plugin for HPC cluster support..." + if dnf install -y --nogpgcheck datacenter-gpu-manager-4-multinode-cuda${CUDA_VERSION}; then + echo "[INFO] DCGM multinode plugin installed successfully." + else + echo "[WARN] Failed to install multinode plugin. Continuing without it." + fi + else + echo "[INFO] Multinode plugin requires CUDA 12+. Current version: $CUDA_VERSION. Skipping." 
+ fi # Enable and start DCGM daemon (SB-003) echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..." @@ -274,16 +295,17 @@ # GPU discovery (SB-004) echo "[INFO] Enumerating GPUs via dcgmi discovery..." if command -v dcgmi &>/dev/null; then - dcgmi discovery -l - echo "[SUCCESS] GPU discovery completed." + echo "========== GPU Discovery Output ==========" + dcgmi discovery -l 2>&1 + GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | grep -c "GPU") + echo "==========================================" + echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)." else echo "[WARN] dcgmi command not found. Skipping GPU enumeration." fi echo "===== NVIDIA DCGM setup completed =====" -{% endif %} - {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf owner: root:root @@ -298,13 +320,66 @@ {{ lookup('template', 'templates/openldap/update_ldap_conf.sh.j2') | indent(12) }} {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - path: /root/ldms_sampler.sh owner: root:root permissions: '0755' content: | {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} + +{% if slurm_node_present %} + - path: /usr/local/bin/slurm_cuda_coordinator.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/slurm_cuda_coordinator.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_driver.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_driver.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_nvidia_peermem.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvidia_peermem.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_dcgm.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 
'templates/hpc_tools/install_dcgm.sh.j2') | indent(12) }} + +{% if not login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} + + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} +{% endif %} + + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' content: | @@ -315,7 +390,7 @@ echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge (aarch64) =====" mkdir -p {{ client_mount_path }}/slurm/ssh echo "[INFO] Creating base directories for Slurm and Munge" - mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + mkdir -pv {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} {% for d in slurm_prolog_dirs_all %}{{ d }} {% endfor %}/etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts /hpc_tools/cuda echo "[INFO] Updating /etc/fstab with NFS 
entries for Pulp cert, Slurm and Munge paths" echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -326,6 +401,8 @@ echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path}}/hpc_tools/cuda /hpc_tools/cuda nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path}}/hpc_tools/ /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab @@ -552,11 +629,9 @@ runcmd: - rm -rf /var/lib/cloud/instance - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_nvidia_driver.sh -{% if dcgm_support %} - - /usr/local/bin/setup_dcgm.sh -{% endif %} + # slurm user and group created in the users module + - /usr/local/bin/configure_vast_installation.sh - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf @@ -593,7 +668,7 @@ {% endif %} -{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or ldms_support %} # Add NFS entry and mount - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -606,7 +681,7 @@ {% endif %} -{% if 
hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - /root/ldms_sampler.sh @@ -614,6 +689,16 @@ - /usr/local/bin/setup_nvhpc_sdk.sh - /usr/local/bin/export_nvhpc_env.sh +{% if slurm_node_present %} + - | + set -e + /usr/local/bin/slurm_cuda_coordinator.sh + /usr/local/bin/install_cuda_driver.sh +{% if dcgm_support %} + /usr/local/bin/install_dcgm.sh +{% endif %} + /usr/local/bin/install_nvidia_peermem.sh +{% endif %} - systemctl restart slurmd - echo "Cloud-Init has completed successfully." \ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 65ef5a8b0c..401108acae 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -96,37 +96,21 @@ exit 0 fi - echo "[INFO] NVIDIA GPU detected. Proceeding with setup." + echo "[INFO] NVIDIA GPU detected. Proceeding with setup and CUDA installation." # Check if NVIDIA driver is already installed if command -v nvidia-smi &>/dev/null; then echo "[INFO] NVIDIA driver already installed. Skipping driver installation." else - echo "[INFO] Mounting NFS runfile directory for driver installation..." - mkdir -p /gpu-runfile - mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/runfile /gpu-runfile - - if [ $? -ne 0 ]; then - echo "[ERROR] Failed to mount NFS runfile share. Exiting." - exit 1 - fi - - echo "[INFO] Installing NVIDIA driver..." - if [ -f "/gpu-runfile/{{ cuda_runfile_x86_64 }}" ]; then - bash /gpu-runfile/{{ cuda_runfile_x86_64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build - if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then - echo "[SUCCESS] NVIDIA driver installed successfully." 
- nvidia-smi -pm 1 - else - echo "[ERROR] NVIDIA driver installation failed." - fi + echo "[INFO] Installing NVIDIA driver (proprietary kernel module)..." + dnf install -y cuda-drivers + if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then + echo "[SUCCESS] NVIDIA driver installed successfully." + nvidia-smi -pm 1 else - echo "[ERROR] NVIDIA driver runfile not found in /gpu-runfile/" + echo "[ERROR] NVIDIA driver installation failed." + exit 1 fi - - echo "[INFO] Cleaning up temporary NFS mount..." - umount /gpu-runfile 2>/dev/null - rmdir /gpu-runfile 2>/dev/null fi echo "[INFO] Setting up CUDA toolkit mount..." @@ -143,12 +127,12 @@ if [ $? -eq 0 ]; then echo "[SUCCESS] CUDA toolkit NFS mount successful" - + # Add to fstab for persistence grep -q "$cuda_nfs_share" /etc/fstab || echo "$cuda_nfs_share /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab - + echo "[INFO] Configuring persistent CUDA environment..." - + # System-wide profile for login shells cat > /etc/profile.d/cuda.sh << 'EOF' export PATH=/usr/local/cuda/bin:$PATH @@ -156,7 +140,7 @@ export CUDA_HOME=/usr/local/cuda EOF chmod +x /etc/profile.d/cuda.sh - + # Bashrc for non-login shells cat > /etc/bashrc.cuda << 'EOF' if [ -d "/usr/local/cuda/bin" ]; then @@ -166,7 +150,7 @@ fi EOF grep -q "bashrc.cuda" /etc/bashrc || echo "source /etc/bashrc.cuda" >> /etc/bashrc - + # Slurm prolog for job environment mkdir -p /etc/slurm/prolog.d cat > /etc/slurm/prolog.d/cuda.sh << 'EOF' @@ -176,12 +160,12 @@ export CUDA_HOME=/usr/local/cuda EOF chmod +x /etc/slurm/prolog.d/cuda.sh - + # Apply immediately for current session export PATH=/usr/local/cuda/bin:$PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export CUDA_HOME=/usr/local/cuda - + echo "[SUCCESS] Persistent CUDA environment configured" else echo "[ERROR] Failed to mount CUDA toolkit NFS share" @@ -215,8 +199,6 @@ echo "===== NVIDIA GPU setup completed =====" - -{% if dcgm_support %} - path: /usr/local/bin/setup_dcgm.sh 
permissions: '0755' content: | @@ -238,12 +220,50 @@ exit 0 fi echo "[INFO] NVIDIA driver prerequisite satisfied." + + # Display nvidia-smi output for verification + echo "========== NVIDIA Driver & GPU Information ==========" + nvidia-smi 2>&1 + echo "=====================================================" + + # Detect CUDA major version for DCGM package selection + echo "[INFO] Detecting CUDA version for DCGM package compatibility..." + # Try to get CUDA version from nvidia-smi + CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1) + + # Fallback: Try to get CUDA version from nvcc if available + if [ -z "$CUDA_VERSION" ]; then + if command -v nvcc &>/dev/null; then + CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1) + echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION" + else + echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc." + echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup." + exit 1 + fi + else + echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION" + fi - # Check if datacenter-gpu-manager package is installed - if ! rpm -q datacenter-gpu-manager-4-core &>/dev/null; then - echo "[ERROR] datacenter-gpu-manager-4-core RPM not installed. Skipping DCGM setup." + # Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies + echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..." + if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then + echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup." exit 1 fi + echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully." 
+ + # Install multinode diagnostic plugin for CUDA 12+ (optional but recommended for HPC) + if [ "$CUDA_VERSION" -ge "12" ]; then + echo "[INFO] Installing DCGM multinode diagnostic plugin for HPC cluster support..." + if dnf install -y --nogpgcheck datacenter-gpu-manager-4-multinode-cuda${CUDA_VERSION}; then + echo "[INFO] DCGM multinode plugin installed successfully." + else + echo "[WARN] Failed to install multinode plugin. Continuing without it." + fi + else + echo "[INFO] Multinode plugin requires CUDA 12+. Current version: $CUDA_VERSION. Skipping." + fi # Enable and start DCGM daemon (SB-003) echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..." @@ -276,16 +296,17 @@ # GPU discovery (SB-004) echo "[INFO] Enumerating GPUs via dcgmi discovery..." if command -v dcgmi &>/dev/null; then - dcgmi discovery -l - echo "[SUCCESS] GPU discovery completed." + echo "========== GPU Discovery Output ==========" + dcgmi discovery -l 2>&1 + GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | grep -c "GPU") + echo "==========================================" + echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)." else echo "[WARN] dcgmi command not found. Skipping GPU enumeration." 
fi echo "===== NVIDIA DCGM setup completed =====" -{% endif %} - {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf owner: root:root @@ -300,7 +321,7 @@ {{ lookup('template', 'templates/openldap/update_ldap_conf.sh.j2') | indent(12) }} {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - path: /root/ldms_sampler.sh owner: root:root permissions: '0755' @@ -308,6 +329,52 @@ {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} +{% if slurm_node_present %} + - path: /usr/local/bin/slurm_cuda_coordinator.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/slurm_cuda_coordinator.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_driver.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_cuda_driver.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_nvidia_peermem.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_nvidia_peermem.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_dcgm.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/install_dcgm.sh.j2') | indent(12) }} + +{% if not login_compiler_node_present %} + - path: /usr/local/bin/generate_install_uuid.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/generate_install_uuid.sh.j2') | indent(12) }} + + - path: /usr/local/bin/cuda_lock_manager.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/hpc_tools/cuda_lock_manager.sh.j2') | indent(12) }} + + - path: /usr/local/bin/install_cuda_toolkit.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 
'templates/hpc_tools/install_cuda_toolkit.sh.j2') | indent(12) }} +{% endif %} +{% endif %} + - path: /etc/hosts append: true content: | @@ -320,6 +387,12 @@ content: | SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' content: | @@ -343,6 +416,7 @@ echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path}}/hpc_tools/scripts /hpc_tools/scripts nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path}}/hpc_tools/ /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -556,12 +630,9 @@ runcmd: - rm -rf /var/lib/cloud/instance - /usr/local/bin/set-ssh.sh - - /usr/local/bin/install_nvidia_driver.sh -{% if dcgm_support %} - - /usr/local/bin/setup_dcgm.sh -{% endif %} + # slurm user and group created in the users module - + - /usr/local/bin/configure_vast_installation.sh - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf @@ -599,7 +670,7 @@ {% endif %} -{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} +{% if hostvars['localhost']['ucx_support'] or 
hostvars['localhost']['openmpi_support'] or ldms_support %} # Add NFS entry and mount - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -615,13 +686,23 @@ {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - /root/ldms_sampler.sh {% endif %} - /usr/local/bin/setup_nvhpc_sdk.sh - /usr/local/bin/export_nvhpc_env.sh +{% if slurm_node_present %} + - | + set -e + /usr/local/bin/slurm_cuda_coordinator.sh + /usr/local/bin/install_cuda_driver.sh +{% if dcgm_support %} + /usr/local/bin/install_dcgm.sh +{% endif %} + /usr/local/bin/install_nvidia_peermem.sh +{% endif %} - systemctl restart slurmd - echo "Cloud-Init has completed successfully." diff --git a/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 index 43a000c561..d87720e495 100644 --- a/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 +++ b/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 @@ -29,31 +29,10 @@ IB_IP="${IB_IP_MAP[$ADMIN_NIC_IP]:-}" if [ -n "$IB_IP" ]; then echo "Using explicit IB IP : $IB_IP/$NETMASK_BITS" else - IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" - - ip_to_int() { - local IFS=. 
- read -r a b c d <<< "$1" - echo $(( (a << 24) + (b << 16) + (c << 8) + d )) - } - - int_to_ip() { - local ip=$1 - echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" - } - - ADMIN_IP_INT=$(ip_to_int "$ADMIN_NIC_IP") - IB_NET_INT=$(ip_to_int "$IB_NETWORK_SUBNET") - - HOST_BITS=$(( 32 - NETMASK_BITS )) - HOST_MASK=$(( (1 << HOST_BITS) - 1 )) - - HOST_OFFSET=$(( ADMIN_IP_INT & HOST_MASK )) - IB_IP_INT=$(( IB_NET_INT + HOST_OFFSET )) - - IB_IP=$(int_to_ip "$IB_IP_INT") - - echo "Derived IB IP : $IB_IP/$NETMASK_BITS" + echo "INFO: No explicit IB IP found in mapping file for node with ADMIN_IP: $ADMIN_NIC_IP" + echo "INFO: Skipping IB IP assignment. If IB networking is required for this node, please add IB_IP to the PXE mapping file." + echo "INFO: IB network interface will remain unconfigured." + exit 0 fi MAX_WAIT=120 # total wait time in seconds (2 minutes) @@ -95,4 +74,16 @@ else fi echo "SUCCESS: Assigned $IB_IP/$NETMASK_BITS to $IB_NIC" + +# Configure DNS for InfiniBand network +if [ -n "$IB_IP" ]; then + echo "Configuring DNS for InfiniBand interface" + + # Add VAST DNS servers (completely safe - handles empty arrays) + {% for dns_server in hostvars['localhost']['ib_network_dns'] %} + echo "nameserver {{ dns_server }}" >> /etc/resolv.conf + {% endfor %} + + echo "SUCCESS: DNS configured for IB network" +fi diff --git a/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2 new file mode 100644 index 0000000000..c037204a28 --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/cuda_lock_manager.sh.j2 @@ -0,0 +1,78 @@ +#!/bin/bash +# Distributed lock manager for CUDA toolkit install on shared NFS. +# Backed by atomic mkdir on /hpc_tools/cuda/.nfs_lock_cuda. 
+# Exposes: acquire | release | wait | is_stale +set -euo pipefail + +LOCK_ROOT="/hpc_tools/cuda" +LOCK_DIR="$LOCK_ROOT/.nfs_lock_cuda" +OWNER_FILE="$LOCK_DIR/owner.txt" +DONE_FILE="$LOCK_ROOT/.done_cuda" +STATUS_LOG="$LOCK_ROOT/.cuda_install_status.log" +HOSTNAME_FILE="/var/run/cuda_install_hostname" + +INSTALL_TIMEOUT="${INSTALL_TIMEOUT:-1800}" +POLL_INTERVAL="${POLL_INTERVAL:-5}" +TAKEOVER_MIN="${TAKEOVER_MIN:-5}" +TAKEOVER_MAX="${TAKEOVER_MAX:-15}" +GLOBAL_WAIT_TIMEOUT="${GLOBAL_WAIT_TIMEOUT:-$((INSTALL_TIMEOUT * 2))}" + +log_status() { + # ts host hostname role result + printf '%s %s %s %s %s\n' \ + "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \ + "$(cat "$HOSTNAME_FILE" 2>/dev/null || echo UNKNOWN)" \ + "$1" "$2" >> "$STATUS_LOG" +} + +acquire() { + # Fast path: already done + [ -f "$DONE_FILE" ] && { log_status waiter skip_done; return 2; } + if mkdir "$LOCK_DIR" 2>/dev/null; then + cat "$HOSTNAME_FILE" > "$OWNER_FILE" + log_status installer lock_acquired + return 0 # we are installer + fi + return 1 # we are waiter +} + +release() { rm -rf "$LOCK_DIR"; } + +is_stale() { + # Owner hostname → hostname from status log → ping + local owner_hostname host + owner_hostname=$(cat "$OWNER_FILE" 2>/dev/null || echo "") + [ -z "$owner_hostname" ] && return 1 + host=$(awk -v h="$owner_hostname" '$3==h {print $2; exit}' "$STATUS_LOG") + [ -z "$host" ] && return 1 + ping -c1 -W2 "$host" >/dev/null 2>&1 && return 1 + return 0 # host unreachable → stale +} + +wait_for_done_or_takeover() { + local started; started=$(date +%s) + while true; do + [ -f "$DONE_FILE" ] && { log_status waiter skip_done; return 0; } + if [ ! 
-d "$LOCK_DIR" ]; then + sleep $(( RANDOM % (TAKEOVER_MAX - TAKEOVER_MIN + 1) + TAKEOVER_MIN )) + return 10 # caller should retry acquire + fi + if is_stale; then + log_status waiter crash_detected + release + continue + fi + (( $(date +%s) - started > GLOBAL_WAIT_TIMEOUT )) && { + log_status timeout_waiter fail; return 1; + } + sleep "$POLL_INTERVAL" + done +} + +case "${1:-}" in + acquire) acquire ;; + release) release ;; + wait) wait_for_done_or_takeover ;; + is_stale) is_stale ;; + *) echo "usage: $0 {acquire|release|wait|is_stale}" >&2; exit 64 ;; +esac diff --git a/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2 new file mode 100644 index 0000000000..be8fb867b3 --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/generate_install_uuid.sh.j2 @@ -0,0 +1,10 @@ +#!/bin/bash +# Generate hostname for lock ownership identity. +# Idempotent: uses hostname directly. +set -euo pipefail + +HOSTNAME_FILE="/var/run/cuda_install_hostname" + +hostname > "$HOSTNAME_FILE" + +echo "[INFO] CUDA install hostname for this node: $(cat "$HOSTNAME_FILE")" diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2 new file mode 100644 index 0000000000..ba2cde8f3d --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_driver.sh.j2 @@ -0,0 +1,42 @@ +#!/bin/bash +# Local NVIDIA driver install. Always runs on Slurm nodes. Idempotent. +# Never touches NFS lock artifacts. Never touches /hpc_tools/cuda contents. +set -euo pipefail + +LOGFILE="/var/log/nvidia_install.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== NVIDIA driver install =====" + +if ! lspci | grep -qi nvidia; then + echo "[INFO] No NVIDIA GPU detected. Exiting." 
+    exit 0
+fi
+
+if command -v nvidia-smi >/dev/null 2>&1; then
+    echo "[INFO] NVIDIA driver already installed. Skipping."
+else
+    echo "[INFO] Installing NVIDIA driver via dnf..."
+    dnf install -y cuda-drivers
+    command -v nvidia-smi >/dev/null 2>&1 || { echo "[ERROR] Driver install failed."; exit 1; }
+fi
+
+nvidia-smi -pm 1 || true
+
+# Mount the shared toolkit export at /usr/local/cuda (skipped if already a mountpoint; a mount failure is tolerated) and persist it in fstab
+mkdir -p /usr/local/cuda
+CUDA_NFS="{{ cloud_init_nfs_path }}/hpc_tools/cuda"
+if ! mountpoint -q /usr/local/cuda; then
+    mount -t nfs "$CUDA_NFS" /usr/local/cuda || true
+fi
+grep -q "$CUDA_NFS /usr/local/cuda" /etc/fstab || \
+    echo "$CUDA_NFS /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab
+
+cat > /etc/profile.d/cuda.sh <<'EOF'
+export PATH=/usr/local/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+export CUDA_HOME=/usr/local/cuda
+EOF
+chmod +x /etc/profile.d/cuda.sh
+
+echo "===== NVIDIA driver install completed ====="
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2
new file mode 100644
index 0000000000..471c3be291
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/install_cuda_toolkit.sh.j2
@@ -0,0 +1,166 @@
+#!/bin/bash
+# Lock-aware CUDA toolkit installer. Publishes to /hpc_tools/cuda on NFS.
+# Exits 0 if toolkit is already present (.done_cuda), if this node installed it, or if another node holds the install lock; exits 1 on failure.
+set -euo pipefail
+
+LOGFILE="/var/log/cuda_toolkit_install.log"
+exec > >(tee -a "$LOGFILE") 2>&1
+
+LOCK_ROOT="/hpc_tools/cuda"
+DONE_FILE="$LOCK_ROOT/.done_cuda"
+LOCK_MGR="/usr/local/bin/cuda_lock_manager.sh"
+HOSTNAME_FILE="/var/run/cuda_install_hostname"
+
+# Write /etc/profile.d/cuda.sh so login shells pick up the toolkit under /hpc_tools/cuda
+setup_cuda_env() {
+    echo "[INFO] Setting up CUDA environment variables for shared location..."
+    cat > /etc/profile.d/cuda.sh <<'EOF'
+export PATH=/hpc_tools/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/hpc_tools/cuda/lib64:$LD_LIBRARY_PATH
+export CUDA_HOME=/hpc_tools/cuda
+EOF
+    chmod +x /etc/profile.d/cuda.sh
+    echo "[INFO] CUDA environment configured successfully"
+}
+
+# Write this node's hostname to $HOSTNAME_FILE for lock ownership (idempotent)
+/usr/local/bin/generate_install_uuid.sh
+
+# Fast-path: .done_cuda already published, only the environment needs configuring
+[ -f "$DONE_FILE" ] && {
+    echo "[INFO] CUDA toolkit already installed on shared storage by another node."
+    echo "[INFO] This node will use the existing CUDA installation."
+    setup_cuda_env
+    echo "[INFO] CUDA environment configured successfully."
+    exit 0
+}
+
+# Manual mode is opted into with CUDA_INSTALL_MANUAL=true; the default is cloud-init mode
+MANUAL_MODE="${CUDA_INSTALL_MANUAL:-false}"
+if [ "$MANUAL_MODE" = "true" ]; then
+    echo "[INFO] Running in manual mode - will force acquire lock if held"
+    FORCE_LOCK=true
+else
+    echo "[INFO] Running in cloud-init mode - will proceed without waiting if lock held"
+    FORCE_LOCK=false
+fi
+
+# Attempt lock acquisition; errexit is suspended because a non-zero rc is an expected answer (0=installer, 1=lock held, 2=already done)
+set +e; "$LOCK_MGR" acquire; rc=$?; set -e
+
+# In manual mode, if lock is held, release it explicitly then acquire again
+if [ "$FORCE_LOCK" = "true" ] && [ "$rc" = "1" ]; then
+    echo "[WARN] Lock is held by another node. In manual mode, releasing lock first..."
+    "$LOCK_MGR" release
+    echo "[INFO] Lock released. Now acquiring lock..."
+    set +e; "$LOCK_MGR" acquire; rc=$?; set -e
+fi
+case $rc in
+    0) # installer
+        echo "[INFO] Acquired lock. Installing toolkit..."
+        mkdir -p /shared-cuda-toolkit
+        mount_rc=0; mount -t nfs {{ cloud_init_nfs_path }}/hpc_tools/cuda/ /shared-cuda-toolkit || mount_rc=$?  # capture status: under errexit a bare failing mount would abort before the lock is released
+
+        if [ "$mount_rc" -ne 0 ]; then
+            echo "[ERROR] Failed to mount NFS cuda share."
+            echo "[ERROR] CUDA toolkit installation failed on this node."
+ echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh" + "$LOCK_MGR" release + exit 1 + fi + + # Check if CUDA toolkit is already installed on NFS + if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then + echo "[INFO] CUDA toolkit already installed on NFS. Exiting." + "$LOCK_MGR" release + exit 0 + fi + + # Install CUDA toolkit to local location using dnf + echo "[INFO] Installing CUDA toolkit to local location using dnf..." + mkdir -p /cuda + # Copy host repository configuration to installroot (only for manual mode) + if [ "$MANUAL_MODE" = "true" ]; then + mkdir -p /cuda/etc/yum.repos.d + cp -r /etc/yum.repos.d/* /cuda/etc/yum.repos.d/ 2>/dev/null || true + fi + if timeout "${INSTALL_TIMEOUT:-1800}" dnf install -y --installroot=/cuda --releasever=10 --setopt=install_weak_deps=False cuda-toolkit; then + echo "[SUCCESS] CUDA toolkit installed successfully." + + # Clean up repository configuration from installroot (if copied for manual mode) + if [ "$MANUAL_MODE" = "true" ]; then + rm -rf /cuda/etc/yum.repos.d + fi + + # Copy CUDA toolkit to shared location + echo "[INFO] Copying CUDA toolkit to shared location..." + CUDA_SRC_DIR=$(find /cuda/usr/local/ -maxdepth 1 -type d -name "cuda-*" | head -n1) + if [ -z "$CUDA_SRC_DIR" ]; then + echo "[ERROR] Could not find CUDA installation directory in /cuda/usr/local/" + echo "[ERROR] CUDA toolkit installation failed on this node." + echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh" + "$LOCK_MGR" release + exit 1 + fi + + echo "[INFO] Found CUDA at: $CUDA_SRC_DIR" + echo "[INFO] Copying contents directly to /shared-cuda-toolkit..." + cp -r "$CUDA_SRC_DIR"/* /shared-cuda-toolkit/ 2>/dev/null || true + + # Verify CUDA toolkit installation + echo "[INFO] Verifying CUDA toolkit installation..." + if [ -f "/shared-cuda-toolkit/bin/nvcc" ]; then + echo "[SUCCESS] CUDA toolkit verified." 
+ else + echo "[ERROR] CUDA toolkit (nvcc) not found after installation." + echo "[ERROR] CUDA toolkit installation failed on this node." + echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh" + "$LOCK_MGR" release + exit 1 + fi + + # Atomic publish of .done_cuda (see §4.4). Never use `touch`. + TMP="$LOCK_ROOT/.done_cuda.tmp.$(cat $HOSTNAME_FILE)" + printf 'installed_by=%s\nts=%s\n' \ + "$(hostname -s)" "$(date -Iseconds)" > "$TMP" + sync -f "$TMP" 2>/dev/null || sync + mv -f -- "$TMP" "$DONE_FILE" + "$LOCK_MGR" release + # log pass + printf '%s %s %s installer pass\n' \ + "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \ + "$(cat $HOSTNAME_FILE)" \ + >> "$LOCK_ROOT/.cuda_install_status.log" + + setup_cuda_env + + umount /shared-cuda-toolkit 2>/dev/null + exit 0 + else + result=$? + "$LOCK_MGR" release + [ "$result" = "124" ] && st="timeout_killed" || st="fail" + printf '%s %s %s installer %s\n' \ + "$(date '+%Y-%m-%d %H:%M:%S')" "$(hostname -s)" \ + "$(cat $HOSTNAME_FILE)" "$st" \ + >> "$LOCK_ROOT/.cuda_install_status.log" + echo "[ERROR] CUDA toolkit installation failed on this node." + echo "[INFO] To manually install CUDA toolkit, run: CUDA_INSTALL_MANUAL=true /usr/local/bin/install_cuda_toolkit.sh" + exit 1 + fi + ;; + 1) # waiter - another node is installing + echo "[INFO] Another node is installing CUDA toolkit. Proceeding with cloud-init without waiting." + echo "[INFO] This node will use the shared CUDA toolkit once installation completes." + setup_cuda_env + echo "[INFO] CUDA environment configured (will work once installation completes)" + + exit 0 + ;; + 2) # already done + echo "[INFO] CUDA toolkit already installed on shared storage." 
+ setup_cuda_env + exit 0 + ;; + *) echo "[ERROR] acquire rc=$rc"; exit 1 ;; +esac diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 new file mode 100644 index 0000000000..158e089805 --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/install_dcgm.sh.j2 @@ -0,0 +1,98 @@ +#!/bin/bash +LOGFILE="/var/log/dcgm_setup.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== Starting NVIDIA DCGM setup =====" + +# GPU detection gate - DCGM requires NVIDIA GPU hardware +if ! lspci | grep -qi nvidia; then + echo "[INFO] No NVIDIA GPU detected. Skipping DCGM setup." + exit 0 +fi + +# CUDA prerequisite gate +echo "[INFO] Validating NVIDIA driver prerequisite..." +if ! command -v nvidia-smi &>/dev/null; then + echo "[WARN] nvidia-smi not found. NVIDIA driver not installed. Skipping DCGM setup." + exit 0 +fi + +if ! nvidia-smi &>/dev/null; then + echo "[WARN] nvidia-smi failed to communicate with the driver. Skipping DCGM setup." + exit 0 +fi +echo "[INFO] NVIDIA driver prerequisite satisfied." + +# Display nvidia-smi output for verification +echo "========== NVIDIA Driver & GPU Information ==========" +nvidia-smi 2>&1 +echo "=====================================================" + +# Detect CUDA major version for DCGM package selection +echo "[INFO] Detecting CUDA version for DCGM package compatibility..." +# Try to get CUDA version from nvidia-smi +CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}' | cut -d'.' -f1) + +# Fallback: Try to get CUDA version from nvcc if available +if [ -z "$CUDA_VERSION" ]; then + if command -v nvcc &>/dev/null; then + CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}' | cut -d',' -f1 | cut -d'.' -f1) + echo "[INFO] CUDA version detected from nvcc: $CUDA_VERSION" + else + echo "[ERROR] Could not detect CUDA version from nvidia-smi or nvcc." 
+ echo "[ERROR] CUDA toolkit is required for DCGM package version detection. Skipping DCGM setup." + exit 1 + fi +else + echo "[INFO] CUDA major version detected from nvidia-smi: $CUDA_VERSION" +fi + +# Install datacenter-gpu-manager-4-cuda${CUDA_VERSION} via dnf with weak dependencies +echo "[INFO] Installing datacenter-gpu-manager-4-cuda${CUDA_VERSION} package..." +if ! dnf install -y --setopt=install_weak_deps=True datacenter-gpu-manager-4-cuda${CUDA_VERSION}; then + echo "[ERROR] Failed to install datacenter-gpu-manager-4-cuda${CUDA_VERSION}. Skipping DCGM setup." + exit 1 +fi +echo "[INFO] datacenter-gpu-manager-4-cuda${CUDA_VERSION} installed successfully." + +# Enable and start DCGM daemon (SB-003) +echo "[INFO] Enabling and starting {{ dcgm_service_name }}.service..." +systemctl enable {{ dcgm_service_name }} + +RETRIES={{ dcgm_health_check_retries }} +ATTEMPT=0 +DCGM_STARTED=false + +while [ $ATTEMPT -lt $RETRIES ]; do + ATTEMPT=$((ATTEMPT + 1)) + echo "[INFO] Starting {{ dcgm_service_name }} (attempt $ATTEMPT/$RETRIES)..." + systemctl start {{ dcgm_service_name }} + sleep 3 + + if systemctl is-active --quiet {{ dcgm_service_name }}; then + DCGM_STARTED=true + echo "[SUCCESS] {{ dcgm_service_name }}.service is active." + break + else + echo "[WARN] {{ dcgm_service_name }} failed to start on attempt $ATTEMPT." + fi +done + +if [ "$DCGM_STARTED" != "true" ]; then + echo "[ERROR] {{ dcgm_service_name }} failed to start after $RETRIES attempts. Service will stay down (BL-002)." + exit 1 +fi + +# GPU discovery (SB-004) +echo "[INFO] Enumerating GPUs via dcgmi discovery..." +if command -v dcgmi &>/dev/null; then + echo "========== GPU Discovery Output ==========" + dcgmi discovery -l 2>&1 + GPU_COUNT=$(dcgmi discovery -l 2>/dev/null | grep -c "GPU") + echo "==========================================" + echo "[SUCCESS] GPU discovery completed. Found $GPU_COUNT GPU(s)." +else + echo "[WARN] dcgmi command not found. Skipping GPU enumeration." 
+fi + +echo "===== NVIDIA DCGM setup completed =====" diff --git a/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 new file mode 100644 index 0000000000..4a51c179ae --- /dev/null +++ b/provision/roles/configure_ochami/templates/hpc_tools/install_nvidia_peermem.sh.j2 @@ -0,0 +1,141 @@ +#!/bin/bash +# NVIDIA Peer Memory (nvidia-peermem) DKMS installation for GPUDirect RDMA support. +# SHALL be installed on all compute nodes where GPU hardware is detected. +# Required on RDMA-capable GPU nodes only. +# Idempotent: skips installation if module is already loaded. +set -euo pipefail + +LOGFILE="/var/log/nvidia_peermem_install.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "===== Starting NVIDIA Peer Memory (nvidia-peermem) setup =====" + +# GPU detection gate - only proceed if NVIDIA GPU is present +echo "[INFO] Checking for NVIDIA GPU hardware..." +if ! lspci | grep -qi nvidia; then + echo "[INFO] No NVIDIA GPU detected. Skipping nvidia-peermem installation." + exit 0 +fi + +# NVIDIA driver prerequisite gate +echo "[INFO] Validating NVIDIA driver prerequisite..." +if ! command -v nvidia-smi &>/dev/null; then + echo "[WARN] nvidia-smi not found. NVIDIA driver not installed. Skipping nvidia-peermem." + exit 0 +fi + +if ! nvidia-smi &>/dev/null; then + echo "[WARN] nvidia-smi failed to communicate with the driver. Skipping nvidia-peermem." + exit 0 +fi +echo "[INFO] NVIDIA driver prerequisite satisfied." + +# Check if nvidia-peermem module is already loaded +echo "[INFO] Checking if nvidia-peermem module is already loaded..." +if lsmod | grep -qE 'nv_peer_mem|nvidia_peermem'; then + echo "[INFO] nvidia-peermem module is already loaded. Skipping installation." + # Verify module metadata + if modinfo nvidia-peermem &>/dev/null; then + echo "[INFO] nvidia-peermem module metadata verified." + else + echo "[WARN] nvidia-peermem module loaded but modinfo failed. 
This may indicate a corrupted module." + fi + exit 0 +fi + +# Check running kernel +KERNEL_VERSION=$(uname -r) +echo "[INFO] Running kernel version: $KERNEL_VERSION" + +# Check if kernel headers are available (required for DKMS) +if [ ! -d "/lib/modules/$KERNEL_VERSION/build" ]; then + echo "[ERROR] Kernel headers not found for kernel $KERNEL_VERSION." + echo "[ERROR] Required for DKMS build. Please install kernel-devel package." + exit 1 +fi + +# Get NVIDIA driver version from nvidia-smi +NVIDIA_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n1 | tr -d ' ') +if [ -z "$NVIDIA_VERSION" ]; then + echo "[ERROR] Could not determine NVIDIA driver version from nvidia-smi." + exit 1 +fi +echo "[INFO] NVIDIA driver version: $NVIDIA_VERSION" + +# Check current DKMS status +echo "[INFO] Checking current DKMS status..." +dkms status || true + +# Add NVIDIA driver to DKMS if not already added +if ! dkms status | grep -q "nvidia/$NVIDIA_VERSION"; then + echo "[INFO] Adding NVIDIA driver $NVIDIA_VERSION to DKMS..." + if ! dkms add -m nvidia -v "$NVIDIA_VERSION"; then + echo "[ERROR] Failed to add NVIDIA driver to DKMS." + exit 1 + fi + echo "[INFO] NVIDIA driver added to DKMS successfully." +else + echo "[INFO] NVIDIA driver $NVIDIA_VERSION already in DKMS." +fi + +# Build NVIDIA module for the running kernel +echo "[INFO] Building NVIDIA module for kernel $KERNEL_VERSION..." +if ! dkms build -m nvidia -v "$NVIDIA_VERSION" -k "$KERNEL_VERSION" --force; then + echo "[ERROR] Failed to build NVIDIA module for kernel $KERNEL_VERSION." + echo "[ERROR] Check kernel logs for build errors." + exit 1 +fi +echo "[INFO] NVIDIA module built successfully." + +# Install the built module +echo "[INFO] Installing NVIDIA module for kernel $KERNEL_VERSION..." +if ! dkms install -m nvidia -v "$NVIDIA_VERSION" -k "$KERNEL_VERSION" --force; then + echo "[ERROR] Failed to install NVIDIA module for kernel $KERNEL_VERSION." 
+ exit 1 +fi +echo "[INFO] NVIDIA module installed successfully." + +# Verify nvidia-peermem module metadata +echo "[INFO] Verifying nvidia-peermem module metadata..." +if modinfo nvidia-peermem &>/dev/null; then + echo "[INFO] nvidia-peermem module metadata verified." + modinfo nvidia-peermem +else + echo "[ERROR] nvidia-peermem module metadata not found after DKMS install." + echo "[ERROR] This may indicate the module was not built or installed correctly." + exit 1 +fi + +# Ensure base NVIDIA modules are loaded first + echo "Loading base NVIDIA modules..." + modprobe nvidia 2>/dev/null || echo "nvidia module not available or failed to load" + modprobe nvidia-uvm 2>/dev/null || echo "nvidia-uvm module not available or failed to load" + modprobe nvidia-modeset 2>/dev/null || echo "nvidia-modeset module not available or failed to load" + modprobe nvidia-drm 2>/dev/null || echo "nvidia-drm module not available or failed to load" + +# Load the nvidia-peermem module +echo "[INFO] Loading nvidia-peermem module..." +if modprobe nvidia-peermem; then + echo "[SUCCESS] nvidia-peermem module loaded successfully." +else + echo "[WARN] Failed to load nvidia-peermem module with modprobe." + echo "[WARN] This may not be critical if RDMA is not required on this node." + echo "[WARN] Check kernel logs for detailed error information." + dmesg | grep -i peermem || true + # Continue with warning unless RDMA dependency exists + # (RDMA dependency check would be environment-specific) +fi + +# Confirm module is loaded +if lsmod | grep -q nvidia_peermem; then + echo "[SUCCESS] nvidia_peermem is loaded in kernel." +else + echo "[WARN] nvidia_peermem not found in lsmod output." + echo "[WARN] Module may have failed to load or may not be required for this configuration." +fi + +# Check kernel logs for peer memory messages or errors +echo "[INFO] Checking kernel logs for peer memory messages..." +dmesg | grep -i peermem || echo "[INFO] No peermem messages found in recent kernel logs." 
+
+echo "===== NVIDIA Peer Memory (nvidia-peermem) setup completed ====="
diff --git a/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2 b/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2
new file mode 100644
index 0000000000..79d72db10b
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/hpc_tools/slurm_cuda_coordinator.sh.j2
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Slurm-node entry point. Decides toolkit path based on login_compiler_node_present.
+# GPU detection gate ensures CUDA operations only run on nodes with NVIDIA hardware.
+
+set -euo pipefail
+
+LOGIN_COMPILER_PRESENT="{{ login_compiler_node_present | lower }}"
+SLURM_NODE_PRESENT="{{ slurm_node_present | lower }}"
+
+[ "$SLURM_NODE_PRESENT" = "true" ] || { echo "[INFO] Not a Slurm node."; exit 0; }
+
+# GPU detection gate - if no GPU present, skip CUDA toolkit and driver installation.
+# grep must read all of lspci's output: with `pipefail`, `grep -q` exits on the first
+# match, lspci can then die of SIGPIPE, and a GPU node would be reported as GPU-less.
+if ! lspci | grep -i nvidia >/dev/null; then
+    echo "[INFO] No NVIDIA GPU detected. Skipping CUDA toolkit and driver installation."
+    exit 0
+fi
+
+if [ "$LOGIN_COMPILER_PRESENT" = "true" ]; then
+    echo "[INFO] Login/compiler nodes present → mounting shared toolkit from NFS."
+    # Mount shared toolkit at /usr/local/cuda
+    mkdir -p /usr/local/cuda
+    CUDA_NFS="{{ cloud_init_nfs_path }}/hpc_tools/cuda"
+    if ! mountpoint -q /usr/local/cuda; then
+        # Best-effort: a failed mount does not abort the script, but it is logged.
+        mount -t nfs "$CUDA_NFS" /usr/local/cuda || \
+            echo "[WARN] Failed to mount $CUDA_NFS on /usr/local/cuda; fstab entry is still written."
+    fi
+    # Fixed-string match so characters in the NFS path are never interpreted as a regex.
+    grep -qF -- "$CUDA_NFS /usr/local/cuda" /etc/fstab || \
+        echo "$CUDA_NFS /usr/local/cuda nfs defaults,_netdev 0 0" >> /etc/fstab
+    # Export CUDA environment variables
+    cat > /etc/profile.d/cuda.sh <<'EOF'
+export PATH=/usr/local/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+export CUDA_HOME=/usr/local/cuda
+EOF
+    chmod +x /etc/profile.d/cuda.sh
+    echo "[INFO] CUDA environment configured from shared NFS toolkit."
+else
+    echo "[INFO] No login/compiler nodes → participating in lock."
+    # install_cuda_toolkit.sh is lock-aware:
+    #   - if this node wins the lock, it runs the install and publishes .done_cuda
+    #   - if this node loses the lock, it returns immediately without waiting
+    if ! /usr/local/bin/install_cuda_toolkit.sh; then
+        echo "[ERROR] install_cuda_toolkit.sh returned non-zero."
+        exit 1
+    fi
+    echo "[INFO] CUDA toolkit installation handled by another node or completed by this node."
+    echo "[INFO] Proceeding with driver, DCGM, and nvidia-peermem installation."
+fi
+
+echo "[SUCCESS] CUDA coordinator completed."
+exit 0
diff --git a/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2
new file mode 100644
index 0000000000..1919018cd3
--- /dev/null
+++ b/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2
@@ -0,0 +1,308 @@
+#!/bin/bash
+# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +# PowerScale Syslog Collection — rsyslog relay + VLAgent +# +# DATA PIPELINE: +# PowerScale (UDP/TCP:514) -> rsyslog on K8s nodes (KUBE_VIP:514) -> VLAgent (LoadBalancer IP:514) -> VictoriaLogs + +set -euo pipefail + +NAMESPACE="{{ telemetry_namespace }}" +KUBE_VIP="{{ kube_vip }}" + +POWERSCALE_IPS=( +{% for ip in powerscale_management_ips %} + "{{ ip }}" +{% endfor %} +) + +echo "==========================================" +echo "PowerScale Syslog Collection Setup" +echo "==========================================" +echo "Pipeline: PowerScale (UDP/TCP:514) -> rsyslog (KUBE_VIP:514) -> VLAgent (ClusterIP:514) -> VictoriaLogs" +{% raw %} +echo "PowerScale Syslog Source IPs: ${POWERSCALE_IPS[*]}" +{% endraw %} +echo "" + +# ============================================================================ +# Phase 1: Get VLAgent ClusterIP for internal forwarding +# ============================================================================ +echo "===== Phase 1: Getting VLAgent ClusterIP =====" + +VLAGENT_SERVICE=$(kubectl get svc vlagent-vlagent -n "$NAMESPACE" -o json 2>/dev/null) || { + echo "ERROR: VLAgent service not found in namespace $NAMESPACE" + exit 0 +} + +SERVICE_TYPE=$(echo "$VLAGENT_SERVICE" | jq -r '.spec.type') +echo "VLAgent service type: $SERVICE_TYPE" + +# rsyslog runs in host network - use ClusterIP for internal access +VLAGENT_CLUSTERIP=$(echo "$VLAGENT_SERVICE" | jq -r '.spec.clusterIP') + +if [ -z "$VLAGENT_CLUSTERIP" ] || [ "$VLAGENT_CLUSTERIP" == "null" ]; then + echo "ERROR: Could not determine VLAgent ClusterIP." + echo "Ensure VLAgent service has a ClusterIP assigned." 
+ exit 0 +fi + +# Use ClusterIP for rsyslog forwarding +VLAGENT_IP="$VLAGENT_CLUSTERIP" +VLAGENT_PORT=514 + +echo "VLAgent forwarding target: ${VLAGENT_IP}:${VLAGENT_PORT}" +echo "" + +# ============================================================================ +# Phase 2: Generate rsyslog configuration +# ============================================================================ +echo "===== Phase 2: Generating rsyslog Configuration =====" + +RSYSLOG_CONF="/etc/rsyslog.d/60-omnia-powerscale-syslog.conf" + +generate_rsyslog_config() { + # Static templates — QUOTED heredoc (no Bash interpretation) + cat <<'RSYSLOG_STATIC' +# Omnia PowerScale Syslog Relay Configuration +# Auto-generated — DO NOT EDIT manually +# Pipeline: PowerScale (UDP/TCP:514) -> rsyslog on K8s nodes -> VLAgent (ClusterIP:514) -> VictoriaLogs + +# === UDP/TCP Listeners on port 514 === +$ModLoad imudp +input(type="imudp" port="514") + +$ModLoad imtcp +input(type="imtcp" port="514") + +# === JSON template for VLAgent === +template(name="VLAgentJSON" type="list") { + constant(value="{\"_time\":\"") + property(name="timereported" dateFormat="rfc3339") + constant(value="\",\"host\":\"") + property(name="hostname") + constant(value="\",\"app\":\"") + property(name="programname") + constant(value="\",\"facility\":\"") + property(name="syslogfacility") + constant(value="\",\"severity\":\"") + property(name="syslogseverity") + constant(value="\",\"msg\":\"") + property(name="msg" format="json") + constant(value="\"}\n") +} +RSYSLOG_STATIC + +{% raw %} + if [ ${#POWERSCALE_IPS[@]} -eq 0 ]; then +{% endraw %} + # Action block — UNQUOTED heredoc (Bash expands ${VLAGENT_IP} etc.) 
+ cat < "$RSYSLOG_CONF" + + # Open firewall ports + if command -v firewall-cmd &>/dev/null; then + firewall-cmd --add-port=514/udp --permanent 2>/dev/null || true + firewall-cmd --add-port=514/tcp --permanent 2>/dev/null || true + firewall-cmd --reload 2>/dev/null || true + echo " Firewall: ports 514/udp and 514/tcp opened" + fi + + # Validate config — capture output for diagnostics + VALIDATION_OUTPUT=$(rsyslogd -N 1 2>&1) || { + echo " ERROR: rsyslog config validation failed!" + echo " --- rsyslogd -N 1 output ---" + echo "$VALIDATION_OUTPUT" + echo " --- end output ---" + return 1 + } + echo " rsyslog config validation passed" + + systemctl restart rsyslog 2>/dev/null || { + echo " WARNING: Failed to restart rsyslog on local node" + return 1 + } + echo " rsyslog restarted successfully on local node" + else + echo " Configuring remote node: ${NODE_IP}..." + + ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "$NODE_IP" \ + "cat > $RSYSLOG_CONF" <<< "$RSYSLOG_CONFIG" 2>/dev/null || { + echo " WARNING: Failed to write rsyslog config on ${NODE_IP}" + return 1 + } + + # Setup log file, firewall, validate, restart + ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "$NODE_IP" " \ + if command -v firewall-cmd &>/dev/null; then \ + firewall-cmd --add-port=514/udp --permanent 2>/dev/null || true && \ + firewall-cmd --add-port=514/tcp --permanent 2>/dev/null || true && \ + firewall-cmd --reload 2>/dev/null || true; \ + fi && \ + rsyslogd -N 1 2>/dev/null && \ + systemctl restart rsyslog" 2>/dev/null || { + echo " WARNING: Failed to configure rsyslog on ${NODE_IP}" + return 1 + } + echo " rsyslog configured on ${NODE_IP}" + fi + return 0 +} + +configure_rsyslog_on_node "localhost" "true" + +# ============================================================================ +# Phase 4: Configure rsyslog on all other K8s nodes +# ============================================================================ +echo "" +echo "===== Phase 4: Configuring rsyslog on Other K8s Nodes 
=====" + +LOCAL_IP=$(hostname -I | awk '{print $1}') +NODE_IPS=$(kubectl get nodes -o wide --no-headers 2>/dev/null | awk '{print $6}') +REMOTE_SUCCESS=0 +REMOTE_FAIL=0 + +for NODE_IP in $NODE_IPS; do + if [ "$NODE_IP" == "$LOCAL_IP" ] || [ "$NODE_IP" == "$KUBE_VIP" ]; then + continue + fi + if configure_rsyslog_on_node "$NODE_IP" "false"; then + REMOTE_SUCCESS=$((REMOTE_SUCCESS + 1)) + else + REMOTE_FAIL=$((REMOTE_FAIL + 1)) + fi +done + +TOTAL=$((REMOTE_SUCCESS + 1)) +echo "" +echo "rsyslog configured on ${TOTAL} node(s) total (1 local + ${REMOTE_SUCCESS} remote), ${REMOTE_FAIL} failure(s)" + +# ============================================================================ +# Phase 5: Verification +# ============================================================================ +echo "" +echo "===== Phase 5: Verification =====" + +VLAGENT_PODS=$(kubectl get pods -n "$NAMESPACE" -l app=vlagent --no-headers 2>/dev/null | wc -l) +echo "VLAgent pods running: $VLAGENT_PODS" + +echo "Checking rsyslog status on local node..." +systemctl is-active rsyslog && echo " rsyslog is active" || echo " WARNING: rsyslog is not active" +echo " Config file: $RSYSLOG_CONF" + +# ============================================================================ +# Phase 6: PowerScale Configuration Instructions +# ============================================================================ +echo "" +echo "==========================================" +echo "NEXT STEPS: Configure PowerScale OneFS" +echo "==========================================" + +echo "Configure PowerScale to send syslog to: ${KUBE_VIP}:514" +echo "Kubernetes kube-vIP: ${KUBE_VIP}" +echo "Syslog Port: 514 (UDP/TCP)" +echo "" +echo "Steps for each PowerScale cluster:" +echo "" +echo "1. SSH to PowerScale:" +echo " ssh @" +echo "" +echo "2. 
Enable audit syslog forwarding:" +echo "" +echo " # For protocol events (file access):" +echo " isi audit settings global modify \\" +echo " --protocol-syslog-enabled=1 \\" +echo " --protocol-syslog-servers=${KUBE_VIP}:514 \\" +echo " --protocol-syslog-tls-enabled=0" +echo "" +echo " # For config events:" +echo " isi audit settings global modify \\" +echo " --config-syslog-enabled=1 \\" +echo " --config-syslog-servers=${KUBE_VIP}:514 \\" +echo " --config-syslog-tls-enabled=0" +echo "" +echo " # For system events:" +echo " isi audit settings global modify \\" +echo " --system-syslog-enabled=1 \\" +echo " --system-syslog-servers=${KUBE_VIP}:514 \\" +echo " --system-syslog-tls-enabled=0" +echo "" +echo "3. Verify configuration:" +echo " isi audit settings global view" +echo "" +echo "==========================================" +echo "PowerScale syslog collection setup completed!" +echo "==========================================" +exit 0 diff --git a/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 index d1017bc76c..6f7332e566 100644 --- a/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 +++ b/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 @@ -131,11 +131,17 @@ if [ "$PS_TEL_FAILED" -eq 0 ]; then echo "===== PowerScale Telemetry (CSM Observability) deployed successfully =====" - # Step 6: Patch OTEL Collector service to expose Prometheus metrics port + # Step 6: Patch OTEL Collector service to expose Prometheus metrics port and add labels for vmagent discovery echo "Patching OTEL Collector service to expose port 8889 for Prometheus metrics..." 
kubectl patch svc otel-collector -n "${CSM_NS}" --patch '{"spec":{"ports":[{"name":"prometheus","port":8889,"targetPort":8889,"protocol":"TCP"}]}}' || { echo "WARNING: Failed to patch OTEL Collector service for Prometheus metrics." } + + # Step 6b: Add labels to OTEL Collector service for VMServiceScrape discovery + echo "Adding labels to OTEL Collector service for vmagent discovery..." + kubectl label svc otel-collector -n "${CSM_NS}" app.kubernetes.io/name=otel-collector app.kubernetes.io/component=collector --overwrite || { + echo "WARNING: Failed to add labels to OTEL Collector service." + } # Step 7: Create PVC for OTEL Collector persistent buffering OTEL_PVC_SIZE="{{ hostvars['localhost']['telemetry_config']['powerscale_configurations']['otel_collector_storage_size'] | default('5Gi') }}" diff --git a/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_syslog_ut.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_syslog_ut.sh.j2 new file mode 100644 index 0000000000..5c3ca6db22 --- /dev/null +++ b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_syslog_ut.sh.j2 @@ -0,0 +1,130 @@ +#!/bin/bash +# PowerScale Syslog UT Verification Script +# Run on Kubernetes control plane node to verify syslog ingestion pipeline + +NAMESPACE="telemetry" +VLAGENT_SVC="vlagent-vlagent" +PS_HOSTNAME="{{ powerscale_syslog_source_ips[0] | default('bdcdap-1') }}" # PowerScale hostname or IP for query + +echo "==========================================" +echo "PowerScale Syslog Feature UT Verification" +echo "==========================================" +echo "" + +# ============================================================================ +# Test 1: VLAgent Service Status +# ============================================================================ +echo "Test 1: VLAgent Service Status" +kubectl get svc "$VLAGENT_SVC" -n "$NAMESPACE" +VLAGENT_CLUSTERIP=$(kubectl get svc "$VLAGENT_SVC" -n "$NAMESPACE" -o 
jsonpath='{.spec.clusterIP}') +VLAGENT_LBIP=$(kubectl get svc "$VLAGENT_SVC" -n "$NAMESPACE" -o jsonpath='{.status.loadBalancer.ingress[0].ip}') +echo "✓ VLAgent ClusterIP: $VLAGENT_CLUSTERIP" +echo "✓ VLAgent LoadBalancer IP: $VLAGENT_LBIP" +echo "" + +# ============================================================================ +# Test 2: VLAgent Pod Status +# ============================================================================ +echo "Test 2: VLAgent Pod Status" +kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/name=vlagent +POD_STATUS=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/name=vlagent -o jsonpath='{.items[0].status.phase}') +if [ "$POD_STATUS" == "Running" ]; then + echo "✓ VLAgent pod is Running" +else + echo "✗ VLAgent pod status: $POD_STATUS" +fi +echo "" + +# ============================================================================ +# Test 3: rsyslog Service Status on All K8s Nodes +# ============================================================================ +echo "Test 3: rsyslog Service Status on K8s Nodes" +NODE_IPS=$(kubectl get nodes -o wide --no-headers | awk '{print $6}') +RSYSLOG_ACTIVE_COUNT=0 +for NODE_IP in $NODE_IPS; do + if ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "$NODE_IP" \ + "systemctl is-active rsyslog" >/dev/null 2>&1; then + echo " ✓ rsyslog active on $NODE_IP" + ((RSYSLOG_ACTIVE_COUNT++)) + else + echo " ✗ rsyslog not active on $NODE_IP" + fi +done +echo "Total: $RSYSLOG_ACTIVE_COUNT/$NODE_IPS nodes with active rsyslog" +echo "" + +# ============================================================================ +# Test 4: rsyslog Configuration +# ============================================================================ +echo "Test 4: rsyslog Configuration on Local Node" +if [ -f /etc/rsyslog.d/60-omnia-powerscale-syslog.conf ]; then + echo "✓ rsyslog config file exists" + if grep -q "target=\"$VLAGENT_CLUSTERIP\"" /etc/rsyslog.d/60-omnia-powerscale-syslog.conf; then + echo "✓ rsyslog 
forwarding to VLAgent ClusterIP: $VLAGENT_CLUSTERIP" + else + echo "✗ rsyslog forwarding target mismatch" + echo " Expected: target=\"$VLAGENT_CLUSTERIP\"" + echo " Actual: $(grep 'target=' /etc/rsyslog.d/60-omnia-powerscale-syslog.conf | head -1)" + fi +else + echo "✗ rsyslog config file missing" +fi +echo "" + +# ============================================================================ +echo "" + +# ============================================================================ +# Test 5: VLAgent Binary Format Errors (should be 0 after rsyslog restart) +# ============================================================================ +echo "Test 6: VLAgent Binary Format Errors (last 2 minutes)" +ERROR_COUNT=$(kubectl logs -n telemetry vlagent-vlagent-0 --since=2m 2>&1 | grep -c "cannot parse JSON" 2>/dev/null || echo "0") +ERROR_COUNT=$(echo "$ERROR_COUNT" | tr -d '[:space:]') +if [ "$ERROR_COUNT" -eq "0" ]; then + echo "✓ No binary format errors (count: $ERROR_COUNT)" +else + echo "✗ Binary format errors detected (count: $ERROR_COUNT)" + echo " Run: systemctl restart rsyslog on all K8s nodes" +fi +echo "" + +# ============================================================================ +# Test 6: kube-proxy iptables Rules for VLAgent ClusterIP +# ============================================================================ +echo "Test 7: kube-proxy iptables Rules for VLAgent ClusterIP" +if iptables -t nat -L KUBE-SERVICES -n 2>/dev/null | grep -q "$VLAGENT_CLUSTERIP"; then + echo "✓ iptables rules exist for ClusterIP routing" + iptables -t nat -L KUBE-SERVICES -n | grep "$VLAGENT_CLUSTERIP" | head -3 +else + echo "✗ No iptables rules found for ClusterIP: $VLAGENT_CLUSTERIP" +fi +echo "" + +# ============================================================================ +# Test 7: Port 514 Firewall Rules +# ============================================================================ +echo "Test 8: Port 514 Firewall Rules" +if command -v firewall-cmd &>/dev/null; then + 
if firewall-cmd --list-ports 2>/dev/null | grep -q 514; then + echo "✓ Port 514 open in firewalld" + firewall-cmd --list-ports + else + echo "⚠ Port 514 may not be open in firewalld" + fi +else + echo "⚠ firewalld not configured, assuming iptables/ufw" +fi +echo "" + +# ============================================================================ +# Summary +# ============================================================================ +echo "==========================================" +echo "UT Verification Complete" +echo "==========================================" +echo "" +echo "If any tests failed:" +echo "1. Restart rsyslog on all K8s nodes: systemctl restart rsyslog" +echo "2. Wait 30 seconds for rsyslog to reconnect to VLAgent" +echo "3. Re-run this script" +echo "" diff --git a/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 b/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 index 352671ad4c..d6bc5b1e30 100644 --- a/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 +++ b/provision/roles/configure_ochami/templates/telemetry/telemetry.sh.j2 @@ -2,23 +2,23 @@ kubectl apply -f {{ k8s_client_mount_path }}/telemetry/deployments/telemetry_nam {% if kafka_support %} helm -n telemetry install strimzi-cluster-operator {{ k8s_client_mount_path }}/telemetry/{{ strimzi_kafka_pkg }}.tar.gz {% endif %} -{% if 'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',') %} +{% if victoria_metrics_support or victoria_logs_support %} helm -n telemetry install victoria-metrics-operator {{ k8s_client_mount_path }}/telemetry/{{ victoria_operator_pkg }}.tar.gz echo "Waiting for victoria-metrics-operator to be ready..." kubectl wait --for=condition=available --timeout=300s deployment -l app.kubernetes.io/name=victoria-metrics-operator -n telemetry || true -echo "Waiting for VictoriaLogs CRDs to be registered..." +echo "Waiting for victoria_logs CRDs to be registered..." 
for i in {1..30}; do if kubectl get crd vlclusters.operator.victoriametrics.com >/dev/null 2>&1 && \ kubectl get crd vlagents.operator.victoriametrics.com >/dev/null 2>&1; then - echo "VictoriaLogs CRDs are ready" + echo "victoria_logs CRDs are ready" break fi - echo "Waiting for VictoriaLogs CRDs... (attempt $i/30)" + echo "Waiting for victoria_logs CRDs... (attempt $i/30)" sleep 2 done {% endif %} kubectl apply -k {{ k8s_client_mount_path }}/telemetry/deployments/. -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} kubectl create secret generic nersc-ldms-ovis-auth --from-file=ldmsauth.conf={{ k8s_client_mount_path }}/telemetry/ldms/ldmsauth.conf --dry-run=client -o yaml | kubectl apply -f - -n telemetry kubectl create secret generic nersc-munge-key --from-file=munge.key={{ k8s_client_mount_path }}/telemetry/ldms/munge.key --dry-run=client -o yaml | kubectl apply -f - -n telemetry cd {{ k8s_client_mount_path }}/telemetry/ldms/nersc-ldms-aggr && helm install -n telemetry nersc-ldms-aggr nersc-ldms-aggr --values values.yaml diff --git a/provision/roles/configure_ochami/templates/vast/configure_vast_installation.sh.j2 b/provision/roles/configure_ochami/templates/vast/configure_vast_installation.sh.j2 new file mode 100644 index 0000000000..e454ff8b02 --- /dev/null +++ b/provision/roles/configure_ochami/templates/vast/configure_vast_installation.sh.j2 @@ -0,0 +1,94 @@ +#!/bin/bash +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# VAST NFS Client Installation Script +# This script installs and configures the VAST NFS client for high-performance NFS mounts + +LOGFILE="/var/log/configure_vast_installation.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +echo "[INFO] ===== Starting VAST NFS client installation =====" + +echo "[INFO] Checking for Mellanox/InfiniBand hardware..." + +if ! lspci | grep -i 'mellanox'; then + echo "[INFO] No Mellanox RDMA hardware detected. Skipping VAST installation." + echo "[INFO] VAST client requires InfiniBand hardware for optimal performance." + exit 0 +fi + +echo "[INFO] Mellanox RDMA hardware detected. Proceeding with VAST installation." + +echo "[INFO] Disabling GPG check for dnf package installation" +sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf +if [ $? -eq 0 ]; then + echo "[SUCCESS] GPG check disabled in dnf.conf" +else + echo "[ERROR] Failed to disable GPG check in dnf.conf" + exit 1 +fi + +echo "[INFO] Installing vastnfs package with SSL verification disabled" +dnf install -y --setopt=sslverify=0 vastnfs +if [ $? -eq 0 ]; then + echo "[SUCCESS] vastnfs package installed successfully" +else + echo "[ERROR] vastnfs package installation failed" + exit 1 +fi + +echo "[INFO] Running depmod to update module dependencies" +depmod -a +if [ $? 
-eq 0 ]; then + echo "[SUCCESS] Module dependencies updated successfully" +else + echo "[ERROR] Failed to update module dependencies" + exit 1 +fi + +echo "[INFO] Stopping gssproxy and rpcbind services" +systemctl stop gssproxy 2>/dev/null && echo "[INFO] gssproxy stopped" || echo "[WARN] gssproxy was not running or failed to stop" +systemctl stop rpcbind 2>/dev/null && echo "[INFO] rpcbind stopped" || echo "[WARN] rpcbind was not running or failed to stop" + +echo "[INFO] Unmounting RPC pipefs and removing NFS modules" +umount /var/lib/nfs/rpc_pipefs 2>/dev/null && echo "[INFO] RPC pipefs unmounted" || echo "[WARN] RPC pipefs was not mounted or failed to unmount" +rmmod compat_nfs_ssc 2>/dev/null && echo "[INFO] compat_nfs_ssc module removed" || echo "[WARN] compat_nfs_ssc module was not loaded or failed to remove" +rmmod sunrpc 2>/dev/null && echo "[INFO] sunrpc module removed" || echo "[WARN] sunrpc module was not loaded or failed to remove" + +echo "[INFO] Loading sunrpc module and starting rpcbind service" +modprobe sunrpc +if [ $? -eq 0 ]; then + echo "[SUCCESS] sunrpc module loaded successfully" +else + echo "[ERROR] Failed to load sunrpc module" + exit 1 +fi + +systemctl start rpcbind +if [ $? 
-eq 0 ]; then + echo "[SUCCESS] rpcbind service started successfully" +else + echo "[ERROR] Failed to start rpcbind service" + exit 1 +fi + +echo "[INFO] Verifying vastnfs installation" +if lsmod | grep -q vastnfs; then + echo "[SUCCESS] vastnfs module is loaded" +else + echo "[WARN] vastnfs module not found in lsmod, may load on demand" +fi + +echo "[INFO] ===== VAST NFS client installation completed successfully =====" diff --git a/provision/roles/configure_ochami/vars/main.yml b/provision/roles/configure_ochami/vars/main.yml index 40590bd1a6..198092aced 100644 --- a/provision/roles/configure_ochami/vars/main.yml +++ b/provision/roles/configure_ochami/vars/main.yml @@ -102,17 +102,46 @@ k8s_control_ssh_patterns: "{{ hostvars['oim']['k8s_ssh_patterns'] | default('*') # Passwordless SSH mode flag derived from nodes.yaml (set on OIM by passwordless_ssh role) all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}" -# CUDA/NVIDIA runfile names (extracted from slurm_custom.json in slurm_config role) -cuda_runfile_x86_64: "{{ hostvars['oim']['cuda_runfile_x86_64'] | default('cuda_13.0.2_580.95.05_linux.run') }}" -cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cuda_13.0.2_580.95.05_linux_sbsa.run') }}" - +# Login/compiler node presence flag (set by slurm_config role) +login_compiler_node_present: "{{ hostvars['oim']['login_compiler_node_present'] | default(false) }}" +slurm_node_present: "{{ hostvars['oim']['slurm_node_present'] | default(false) }}" # Usage: ci-group-slurm_node_x86_64.yaml.j2, ci-group-slurm_node_aarch64.yaml.j2 +dcgm_support: "{{ hostvars['localhost'].get('telemetry_sources', {}).get('dcgm', {}).get('metrics_enabled', true) | bool }}" # NVIDIA DCGM (Data Center GPU Manager) configuration dcgm_service_name: "nvidia-dcgm" dcgm_health_check_retries: 3 -dcgm_support: "{{ hostvars['localhost']['dcgm_support'] | default(true) }}" # Usage: fetch_additional_images.yml input_project_dir: "{{ 
hostvars['localhost']['input_project_dir'] }}" software_config_file_path: "{{ input_project_dir }}/software_config.json" local_repo_config_path: "{{ input_project_dir }}/local_repo_config.yml" + +# Usage: provision_mapping_nodes.yml +service_retries: 12 # 2 minutes +service_retry_interval: 10 # seconds +cloud_init_failed_msg: | + cloud-init-server is not running after {{ service_retries }} retries. + Next steps: + 1. Check service status: systemctl status cloud-init-server + 2. View logs: journalctl -u cloud-init-server -n 50 + 3. Restart service: systemctl restart cloud-init-server + 4. Rerun provision.yml: ansible-playbook provision/provision.yml + +openchami_target_failed_msg: | + openchami.target is not up after {{ service_retries }} retries. + Next steps: + 1. Check target status: systemctl status openchami.target + 2. View logs: journalctl -u openchami.target -n 50 + 3. Check individual services: systemctl status smd bss cloud-init + 4. Restart target: systemctl restart openchami.target + 5. Rerun provision.yml: ansible-playbook provision/provision.yml + +ochami_discovery_failed_msg: | + Failed to discover ochami nodes after retries. + Next steps: + 1. Verify nodes.yaml is valid: cat {{ openchami_nodes_vars_path }} + 2. Check SMD connectivity + 3. View SMD logs + 4. Check ochami CLI: /usr/bin/ochami smd service status + 5. Restart openchami.target: systemctl restart openchami.target + 6. 
Rerun provision.yml: ansible-playbook provision/provision.yml diff --git a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml index 71e995020c..40e9328cdd 100644 --- a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -196,7 +196,7 @@ - name: Include local repo access variable file ansible.builtin.include_vars: "{{ local_repo_access_config_file }}" -- name: Load service_k8s_.json +- name: Load service_k8s.json ansible.builtin.set_fact: k8s_packages_json: "{{ lookup('file', k8s_packages_file) | from_json }}" diff --git a/provision/roles/k8s_config/vars/main.yml b/provision/roles/k8s_config/vars/main.yml index 6ca89acf40..f3f06a75de 100644 --- a/provision/roles/k8s_config/vars/main.yml +++ b/provision/roles/k8s_config/vars/main.yml @@ -15,8 +15,7 @@ local_repo_access_config_file: "/opt/omnia/provision/local_repo_access.yml" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" -# Versioned JSON file: service_k8s_v.json (e.g., service_k8s_v1.35.1.json) -k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s_v{{ hostvars['localhost']['service_k8s_version'] }}.json" # noqa: yaml[line-length] +k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s.json" # noqa: yaml[line-length] calico_manifest_yaml_url: "{{ offline_manifest_path }}/{{ calico_package }}/{{ calico_package }}.yml" metallb_manifest_yaml_url: "{{ offline_manifest_path }}/{{ metallb_package }}/{{ metallb_package }}.yml" multus_manifest_yaml_url: "{{ offline_manifest_path }}/{{ multus_package }}/{{ multus_package }}.yml" diff --git a/provision/roles/provision_validations/tasks/include_software_config.yml 
b/provision/roles/provision_validations/tasks/include_software_config.yml index f4b8b40466..b2480d2c6e 100644 --- a/provision/roles/provision_validations/tasks/include_software_config.yml +++ b/provision/roles/provision_validations/tasks/include_software_config.yml @@ -42,6 +42,7 @@ admin_nic: "{{ network_data.admin_network.oim_nic_name }}" admin_netmask_bits: "{{ network_data.admin_network.netmask_bits }}" ib_network_subnet: "{{ network_data.ib_network.subnet }}" + ib_network_dns: "{{ network_data.ib_network.dns | default([]) }}" dns: "{{ network_data.admin_network.dns }}" - name: Initialise variables @@ -84,11 +85,3 @@ - name: Check if openmpi support is true ansible.builtin.set_fact: openmpi_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'openmpi') | list | length > 0 }}" - -- name: Initialise ldms support variables - ansible.builtin.set_fact: - ldms_support: false - -- name: Check if ldms support is true - ansible.builtin.set_fact: - ldms_support: "{{ software_config.softwares | selectattr('name', 'in', 'ldms') | list | length > 0 }}" diff --git a/provision/roles/provision_validations/tasks/main.yml b/provision/roles/provision_validations/tasks/main.yml index d9679e9697..503792ee99 100644 --- a/provision/roles/provision_validations/tasks/main.yml +++ b/provision/roles/provision_validations/tasks/main.yml @@ -55,5 +55,6 @@ - name: Validate telemetry config ansible.builtin.include_tasks: validate_telemetry_config.yml when: - - idrac_telemetry_support | lower == 'true' | default('false') or - ldms_support | default('false') + - (telemetry_sources.idrac.metrics_enabled | default(false) | bool) or + (telemetry_sources.ldms.metrics_enabled | default(false) | bool) or + (ldms_support | default(false) | bool) diff --git a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml index 8f906aa4d3..3aa0669e30 100644 --- 
a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml +++ b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml @@ -13,21 +13,17 @@ # limitations under the License. --- -- name: Set support values - ansible.builtin.set_fact: - idrac_telemetry_support: "{{ idrac_telemetry_support | lower }}" - -- name: Warning for idrac_telemetry_support is currently set to false +- name: Warning for iDRAC telemetry is currently disabled ansible.builtin.pause: seconds: "{{ pause_time_15 }}" prompt: "{{ warning_idrac_telemetry_support_false }}" - when: not idrac_telemetry_support + when: not (telemetry_sources.idrac.metrics_enabled | default(false) | bool) -- name: Warning for idrac_telemetry_support is currently set to true +- name: Warning for iDRAC telemetry is currently enabled ansible.builtin.pause: seconds: "{{ pause_time_15 }}" prompt: "{{ warning_idrac_telemetry_support_true }}" - when: idrac_telemetry_support + when: telemetry_sources.idrac.metrics_enabled | default(false) | bool - name: Get k8s cluster details ansible.builtin.set_fact: diff --git a/provision/roles/provision_validations/vars/main.yml b/provision/roles/provision_validations/vars/main.yml index afa987a229..ceee665ce2 100644 --- a/provision/roles/provision_validations/vars/main.yml +++ b/provision/roles/provision_validations/vars/main.yml @@ -63,11 +63,11 @@ image_missing_fail_msg: | # Usage: validate_telemetry.yml warning_idrac_telemetry_support_false: | - "[WARNING] idrac_telemetry_support is set to false in telemetry_config.yml. This means iDRAC telemetry will not be activated. - To use telemetry, set idrac_telemetry_support to true in telemetry_config.yml." + "[WARNING] telemetry_sources.idrac.metrics_enabled is set to false in telemetry_config.yml. This means iDRAC telemetry will not be activated. + To use telemetry, set telemetry_sources.idrac.metrics_enabled to true in telemetry_config.yml." 
warning_idrac_telemetry_support_true: | - "[WARNING] idrac_telemetry_support is set to true in telemetry_config.yml. + "[WARNING] telemetry_sources.idrac.metrics_enabled is set to true in telemetry_config.yml. iDRAC telemetry will be activated for all BMC IPs listed in mapping file. Confirm that all BMC IPs are reachable from the respective service cluster nodes for telemetry to function properly. Make sure that Redfish is enabled and the iDRAC has a datacenter license. diff --git a/provision/roles/slurm_config/files/pull_benchmarks.sh b/provision/roles/slurm_config/files/pull_benchmarks.sh new file mode 100644 index 0000000000..6bd444b944 --- /dev/null +++ b/provision/roles/slurm_config/files/pull_benchmarks.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# +# pull_benchmarks.sh - Pull and organize HPC benchmark artifacts from local repository +# Usage: ./pull_benchmarks.sh [config_path] +# arch: x86_64 or aarch64 +# config_path: Optional path to slurm_custom.json (default: /opt/omnia/config) +# + +set -e + +ARCH="${1:-x86_64}" +CONFIG_PATH="${2:-/opt/omnia/config}" +HPC_TOOLS_BASE="/hpc_tools" +LOCAL_REPO_BASE="/var/lib/pulp/content" + +# Color output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Validate architecture +if [[ "${ARCH}" != "x86_64" && "${ARCH}" != "aarch64" ]]; then + log_error "Invalid architecture: ${ARCH}. Must be x86_64 or aarch64." + exit 1 +fi + +# Check if hpc_tools directory exists +if [[ ! -d "${HPC_TOOLS_BASE}" ]]; then + log_error "hpc_tools base directory does not exist: ${HPC_TOOLS_BASE}" + log_error "Ensure NFS mount for hpc_tools is available." 
+ exit 1 +fi + +# Find slurm_custom.json +SLURM_CUSTOM_FILE="" +for path in "${CONFIG_PATH}/slurm_custom.json" "/etc/omnia/slurm_custom.json" "/opt/omnia/slurm_custom.json"; do + if [[ -f "${path}" ]]; then + SLURM_CUSTOM_FILE="${path}" + break + fi +done + +if [[ -z "${SLURM_CUSTOM_FILE}" ]]; then + log_error "slurm_custom.json not found in standard locations." + exit 1 +fi + +log_info "Using slurm_custom.json: ${SLURM_CUSTOM_FILE}" + +# Parse benchmark packages from slurm_custom.json +# Look for packages with type "tarball" or "source" +BENCHMARK_PACKAGES=$(jq -r '.packages[]? | select(.type == "tarball" or .type == "source") | .package' "${SLURM_CUSTOM_FILE}" 2>/dev/null || echo "") + +if [[ -z "${BENCHMARK_PACKAGES}" ]]; then + log_warn "No benchmark packages found in slurm_custom.json." + exit 0 +fi + +log_info "Found benchmark packages: ${BENCHMARK_PACKAGES}" + +# Function to pull a single benchmark +pull_benchmark() { + local pkg_name="$1" + local pkg_info + local pkg_url + local pkg_type + local dest_dir + + pkg_info=$(jq -r ".packages[]? 
| select(.package == \"${pkg_name}\")" "${SLURM_CUSTOM_FILE}") + pkg_url=$(echo "${pkg_info}" | jq -r '.url // empty') + pkg_type=$(echo "${pkg_info}" | jq -r '.type // "source"') + + dest_dir="${HPC_TOOLS_BASE}/${pkg_name}" + + # Create destination directory + log_info "Creating directory: ${dest_dir}" + mkdir -p "${dest_dir}" + + # Check if artifact exists in local repo + # Search in offline_repo structure + local artifact_path="" + for search_path in "/var/lib/pulp/content/offline_repo/cluster/${ARCH}/rhel/10.0/source/${pkg_name}" \ + "/var/lib/pulp/content/offline_repo/cluster/${ARCH}/rhel/10.0/tarball/${pkg_name}" \ + "${LOCAL_REPO_BASE}/offline_repo/cluster/${ARCH}/rhel/10.0/source/${pkg_name}" \ + "${LOCAL_REPO_BASE}/offline_repo/cluster/${ARCH}/rhel/10.0/tarball/${pkg_name}"; do + if [[ -d "${search_path}" ]]; then + artifact_path="${search_path}" + break + fi + done + + if [[ -z "${artifact_path}" ]]; then + log_warn "Artifact not found in local repository for ${pkg_name}, skipping." + return 1 + fi + + # Copy artifacts to destination + log_info "Copying artifacts from ${artifact_path} to ${dest_dir}" + cp -r "${artifact_path}"/* "${dest_dir}/" 2>/dev/null || true + + # If URL is provided and local copy failed, attempt direct pull + if [[ -n "${pkg_url}" && ! 
-f "${dest_dir}"/* ]]; then + log_info "Attempting direct pull from URL: ${pkg_url}" + cd "${dest_dir}" + if command -v wget &>/dev/null; then + wget -q "${pkg_url}" -O "${pkg_name}.tar.gz" || log_warn "Failed to download ${pkg_url}" + elif command -v curl &>/dev/null; then + curl -sSL "${pkg_url}" -o "${pkg_name}.tar.gz" || log_warn "Failed to download ${pkg_url}" + fi + fi + + # Verify files were copied + if [[ -n "$(ls -A ${dest_dir})" ]]; then + log_info "Successfully staged ${pkg_name}" + return 0 + else + log_warn "No files staged for ${pkg_name}" + return 1 + fi +} + +# Pull each benchmark +SUCCESS_COUNT=0 +FAIL_COUNT=0 + +for pkg in ${BENCHMARK_PACKAGES}; do + if pull_benchmark "${pkg}"; then + ((SUCCESS_COUNT++)) + else + ((FAIL_COUNT++)) + fi +done + +log_info "Benchmark staging complete: ${SUCCESS_COUNT} succeeded, ${FAIL_COUNT} failed" + +exit 0 diff --git a/provision/roles/slurm_config/tasks/create_slurm_dir.yml b/provision/roles/slurm_config/tasks/create_slurm_dir.yml index b68bcbbded..a89b33aeb3 100644 --- a/provision/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/provision/roles/slurm_config/tasks/create_slurm_dir.yml @@ -30,22 +30,6 @@ name: slurm_custom_aarch64 failed_when: false -- name: Extract CUDA runfile name for x86_64 from slurm_custom.json - ansible.builtin.set_fact: - cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: - - slurm_custom_x86_64 is defined - - slurm_custom_x86_64.slurm_node is defined - - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - -- name: Extract CUDA runfile name for aarch64 from slurm_custom.json - ansible.builtin.set_fact: - cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: - - slurm_custom_aarch64 is defined - - slurm_custom_aarch64.slurm_node is defined - - 
slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - - name: Set facts for slurm ansible.builtin.set_fact: nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" diff --git a/provision/roles/slurm_config/tasks/hpc_tools.yml b/provision/roles/slurm_config/tasks/hpc_tools.yml index 46260da267..940a71137f 100644 --- a/provision/roles/slurm_config/tasks/hpc_tools.yml +++ b/provision/roles/slurm_config/tasks/hpc_tools.yml @@ -22,7 +22,6 @@ mode: "{{ common_mode }}" loop: - cuda - - runfile - scripts - container_images - nvidia_sdk @@ -43,6 +42,22 @@ group: "{{ root_group }}" mode: "0644" +- name: Deploy pull_benchmarks.sh to NFS share + ansible.builtin.template: + src: "pull_benchmarks.sh.j2" + dest: "{{ pull_benchmarks_script_path }}" + owner: "{{ root_user }}" + group: "{{ root_group }}" + mode: "0755" + +- name: Deploy benchmark_tools.list to NFS share + ansible.builtin.template: + src: "benchmark_tools.list.j2" + dest: "{{ benchmark_tools_list_path }}" + owner: "{{ root_user }}" + group: "{{ root_group }}" + mode: "0644" + - name: Set fact for pulp mirror ansible.builtin.set_fact: pulp_mirror: "{{ hostvars['localhost']['admin_nic_ip'] }}:2225" diff --git a/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml b/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml index c61e8d92a9..5b99d35c30 100644 --- a/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml +++ b/provision/roles/slurm_config/tasks/read_slurm_hostnames.yml @@ -92,3 +92,11 @@ ansible.builtin.set_fact: controller_ip: "{{ ip_name_map[ctld_list | first] }}" when: ctld_list | length > 0 + +- name: Set login_compiler_node_present flag + ansible.builtin.set_fact: + login_compiler_node_present: "{{ compiler_login_list | length > 0 }}" + +- name: Set slurm_node_present flag + ansible.builtin.set_fact: + slurm_node_present: "{{ cmpt_list | length > 0 }}" diff --git a/provision/roles/slurm_config/templates/benchmark_tools.list.j2 
b/provision/roles/slurm_config/templates/benchmark_tools.list.j2 new file mode 100644 index 0000000000..1fe5865ed2 --- /dev/null +++ b/provision/roles/slurm_config/templates/benchmark_tools.list.j2 @@ -0,0 +1,14 @@ +# Benchmark Tools List +# Lists HPC benchmark tools to pull from the local offline repository to /hpc_tools/TOOL_NAME/ +# Format: TOOL_NAME (one per line) +# Lines starting with # are ignored. Empty lines are ignored. +# Architecture is auto-detected at runtime (uname -m). +# Note: msr-safe is x86_64 only — automatically skipped on aarch64. + +osu-micro-benchmarks +imb +likwid +papi +geopm +sionlib +msr-safe diff --git a/provision/roles/slurm_config/templates/pull_benchmarks.sh.j2 b/provision/roles/slurm_config/templates/pull_benchmarks.sh.j2 new file mode 100644 index 0000000000..8d5b505713 --- /dev/null +++ b/provision/roles/slurm_config/templates/pull_benchmarks.sh.j2 @@ -0,0 +1,164 @@ +#!/bin/bash +# HPC benchmark tarball pull script (Pulp only) +# Deployed via NFS share for all nodes +# Reads benchmark tool names from benchmark_tools.list file and downloads tarballs from Pulp +# Downloads from Pulp mirror only (no internet fallback) +# Usage: pull_benchmarks.sh + +LOGFILE="/var/log/pull_benchmarks.log" +exec > >(tee -a "$LOGFILE") 2>&1 + +# Configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BENCHMARK_TOOLS_LIST="${SCRIPT_DIR}/benchmark_tools.list" +HPC_TOOLS_DIR="/hpc_tools" +ARCH="$(uname -m)" +OS_VERSION="{{ hostvars['localhost']['cluster_os_version'] }}" +PULP_SERVER="{{ hostvars['localhost']['admin_nic_ip'] }}:2225" +PULP_CONTENT_BASE="https://${PULP_SERVER}/pulp/content{{ hostvars['localhost']['oim_shared_path'] }}/omnia/offline_repo/cluster" + +echo "===== Starting Benchmark Tarball Pull Script (Pulp Only) =====" +echo "[INFO] Timestamp: $(date)" +echo "[INFO] Architecture: $ARCH" +echo "[INFO] Pulp mirror: $PULP_SERVER" + +# Check prerequisites +if [[ "$ARCH" != "x86_64" && "$ARCH" != "aarch64" ]]; then + echo "[ERROR] 
Unsupported 
architecture: $ARCH" + exit 1 +fi + +if [ ! -f "$BENCHMARK_TOOLS_LIST" ]; then + echo "[ERROR] Benchmark tools list not found: $BENCHMARK_TOOLS_LIST" + echo "[INFO] Please create the file with one tool name per line." + echo "[INFO] Expected location: ${SCRIPT_DIR}/benchmark_tools.list" + exit 1 +fi + +if [ ! -d "$HPC_TOOLS_DIR" ]; then + echo "[ERROR] /hpc_tools is not mounted or does not exist" + exit 1 +fi + +# Function to download all files from a Pulp content directory into a local directory. +# Uses wget (preferred) or curl as fallback. +# Arguments: $1 = Pulp directory URL, $2 = local destination directory +pull_from_pulp() { + local url="$1" + local dest="$2" + + if command -v wget &>/dev/null; then + wget -q -r -np -nd -R "index.html*" --no-check-certificate \ + -P "$dest" "${url}/" 2>&1 + return $? + elif command -v curl &>/dev/null; then + local page + page="$(curl -ksfL "${url}/" 2>/dev/null)" || return 1 + local -a files + local files_count + mapfile -t files < <(echo "$page" \ + | grep -oP 'href="\K[^"]+' \ + | grep -vE '^\.\.?/?$|index\.html') + files_count=0 + for _file in "${files[@]}"; do + [ -n "$_file" ] && ((files_count++)) + done + [ "$files_count" -eq 0 ] && return 1 + local rc=0 + for f in "${files[@]}"; do + f="${f%/}" + [ -z "$f" ] && continue + curl -ksfL "${url}/${f}" -o "${dest}/${f}" || rc=1 + done + return $rc + else + echo "[ERROR] Neither wget nor curl is available." 
+ return 1 + fi +} + +echo "[INFO] Reading benchmark tools from: $BENCHMARK_TOOLS_LIST" + +TOTAL=0 +SUCCESS_COUNT=0 +FAILED_COUNT=0 +SKIPPED_COUNT=0 +FAILED_TOOLS="" + +while IFS= read -r tool || [ -n "$tool" ]; do + # Skip empty lines and comments + [[ -z "$tool" || "$tool" =~ ^[[:space:]]*# ]] && continue + + # Trim whitespace + tool=$(echo "$tool" | xargs) + [[ -z "$tool" ]] && continue + + ((TOTAL++)) + + echo "" + echo "===== Processing Tool $TOTAL: $tool =====" + + # Architecture-specific skip + if [[ "$tool" == "msr-safe" && "$ARCH" != "x86_64" ]]; then + echo "[WARN] $tool is x86_64 only. Skipping on $ARCH." + ((SKIPPED_COUNT++)) + continue + fi + + PULP_URL="${PULP_CONTENT_BASE}/${ARCH}/rhel/${OS_VERSION}/tarball/${tool}" + DEST_DIR="${HPC_TOOLS_DIR}/${tool}" + + echo "[INFO] Pulp URL: $PULP_URL" + echo "[INFO] Destination: $DEST_DIR" + + # Skip if already staged + if [ -d "$DEST_DIR" ] && [ -n "$(ls -A "$DEST_DIR" 2>/dev/null)" ]; then + echo "[WARN] $tool already present at $DEST_DIR. Skipping." + echo "[INFO] Remove the directory to re-download." + ((SKIPPED_COUNT++)) + continue + fi + + mkdir -p "$DEST_DIR" + + echo "[INFO] Pulling from Pulp mirror..." + + if pull_from_pulp "$PULP_URL" "$DEST_DIR"; then + if [ -n "$(ls -A "$DEST_DIR" 2>/dev/null)" ]; then + echo "[SUCCESS] $tool staged at $DEST_DIR" + echo "[SOURCE] Downloaded from: PULP MIRROR ($PULP_SERVER)" + ls -lh "$DEST_DIR" + ((SUCCESS_COUNT++)) + else + echo "[ERROR] Pull returned success but no files found for $tool" + rmdir "$DEST_DIR" 2>/dev/null + ((FAILED_COUNT++)) + FAILED_TOOLS="${FAILED_TOOLS}\n - ${tool} (no files downloaded)" + fi + else + echo "[ERROR] Failed to pull $tool from Pulp mirror." + echo "[INFO] Tool may not be available in Pulp or download was interrupted." 
+ rmdir "$DEST_DIR" 2>/dev/null + ((FAILED_COUNT++)) + FAILED_TOOLS="${FAILED_TOOLS}\n - ${tool}" + fi + +done < "$BENCHMARK_TOOLS_LIST" + +echo "" +echo "===== Benchmark Pull Summary =====" +echo "[INFO] Total tools processed: $TOTAL" +echo "[INFO] Successful: $SUCCESS_COUNT" +echo "[INFO] Skipped: $SKIPPED_COUNT" +echo "[INFO] Failed: $FAILED_COUNT" + +if [ $FAILED_COUNT -gt 0 ]; then + echo -e "[ERROR] Failed tools:$FAILED_TOOLS" + EXIT_CODE=1 +else + EXIT_CODE=0 +fi + +echo "" +echo "===== Benchmark Pull Completed =====" +exit ${EXIT_CODE:-0} diff --git a/provision/roles/slurm_config/vars/main.yml b/provision/roles/slurm_config/vars/main.yml index 580d776d92..1c47bc3b50 100644 --- a/provision/roles/slurm_config/vars/main.yml +++ b/provision/roles/slurm_config/vars/main.yml @@ -162,6 +162,10 @@ nvhpc_tarball_aarch64_relpath: "offline_repo/cluster/aarch64/rhel/{{ hostvars['l nvhpc_nfs_rel_dir: "hpc_tools/nvidia_sdk" +# Benchmark pull script path +pull_benchmarks_script_path: "{{ slurm_config_path }}/hpc_tools/scripts/pull_benchmarks.sh" +benchmark_tools_list_path: "{{ slurm_config_path }}/hpc_tools/scripts/benchmark_tools.list" + # parallel file copy parallel_copy_max_workers: 4 @@ -171,16 +175,6 @@ parallel_copy_max_workers: 4 parallel_copy_candidates: - # CUDA Runfile (aarch64 repo path) - - name: cuda_runfile_aarch64 - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - - # CUDA Runfile (x86_64 repo path) - - name: cuda_runfile_x86_64 - src: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/iso/cuda-run/" - dest: "{{ slurm_config_path }}/hpc_tools/runfile/" - # NVIDIA HPC SDK (x86_64 tarball extracted dir) - name: nvhpc_sdk_x86_64 src: "{{ oim_shared_path }}/omnia/{{ nvhpc_tarball_x86_64_relpath | dirname }}/" diff --git 
a/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml b/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml index 9d8319b990..0cdb4bd2cb 100644 --- a/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml +++ b/provision/roles/telemetry/tasks/apply_telemetry_on_upgrade.yml @@ -17,7 +17,7 @@ when: - kube_vip is defined - kube_vip | length > 0 - - hostvars['localhost']['idrac_telemetry_support'] | default(false) | bool + - idrac_telemetry_support | default(false) | bool block: - name: Check if telemetry deployment file exists ansible.builtin.stat: @@ -91,20 +91,20 @@ register: idrac_telemetry_receiver_ready failed_when: false when: - - hostvars['localhost']['idrac_telemetry_support'] | default(false) | bool + - idrac_telemetry_support | default(false) | bool - name: Display idrac telemetry receiver ready status ansible.builtin.debug: msg: "{{ idrac_telemetry_receiver_ready }}" when: - - hostvars['localhost']['idrac_telemetry_support'] | default(false) | bool + - idrac_telemetry_support | default(false) | bool - idrac_telemetry_receiver_ready is defined - name: Apply LDMS configurations for upgrade when: - kube_vip is defined - kube_vip | length > 0 - - hostvars['localhost']['ldms_support'] | default(false) | bool + - ldms_support | default(false) | bool block: - name: Check if LDMS aggregator is running on service k8s cluster kubernetes.core.k8s_info: diff --git a/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml b/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml new file mode 100644 index 0000000000..8a523a8486 --- /dev/null +++ b/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml @@ -0,0 +1,49 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Configure PowerScale syslog collection via rsyslog relay + shared VLAgent +# Gated by: telemetry_sources.powerscale.logs_enabled +# +# DATA PIPELINE (UDP mode only): +# PowerScale (UDP:514) → K8s node rsyslog → VLAgent LB (TCP:514) → VictoriaLogs +# +# What Omnia does: +# - Uses syslog_source_ips from user config (syslog source != CSI endpoint) +# - Configures rsyslog on ALL K8s nodes (UDP listener, filter by PS IP, forward to VLAgent) +# - Opens firewall ports 514/udp and 514/tcp on all K8s nodes +# - VLAgent listens on TCP:514 (rsyslog relay) +# +# What Omnia does NOT do: +# - Omnia does NOT configure PowerScale directly +# - User must configure PowerScale to send UDP syslog to K8s node IPs + +- name: Configure PowerScale syslog collection + when: powerscale_log_enabled | default(false) | bool + block: + - name: Validate VictoriaLogs is in collection_targets + ansible.builtin.assert: + that: + - "'victoria_logs' in telemetry_config.telemetry_sources.powerscale.collection_targets | default([])" + fail_msg: "{{ powerscale_victoria_logs_validation_fail_msg }}" + + - name: Set PowerScale syslog source IPs + ansible.builtin.set_fact: + powerscale_management_ips: "{{ telemetry_config.powerscale_configurations.syslog_source_ips | default([]) }}" + + - name: Populate PowerScale syslog configuration script + ansible.builtin.template: + src: "{{ configure_ps_syslog_template }}" + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/configure_powerscale_syslog.sh" + mode: "{{ hostvars['localhost']['file_permissions_755'] }}" 
diff --git a/provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml b/provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml index 463152977f..5f21637b3d 100644 --- a/provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml +++ b/provision/roles/telemetry/tasks/deploy_powerscale_metrics.yml @@ -22,7 +22,6 @@ - name: Set PowerScale configuration facts ansible.builtin.set_fact: ps_csi_secret_path: "{{ hostvars['localhost']['service_cluster_info'].csi_powerscale_driver_secret_file_path | default('') }}" - powerscale_configurations: "{{ telemetry_config.powerscale_configurations }}" ps_helm_values_file: "{{ telemetry_config.powerscale_configurations.csm_observability_values_file_path }}" csm_observability_namespace: "{{ csm_namespace }}" diff --git a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml new file mode 100644 index 0000000000..de7a40e36d --- /dev/null +++ b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml @@ -0,0 +1,90 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# ============================================================================= +# DERIVE SINK SUPPORT FLAGS AND LEGACY VARIABLE MAPPINGS +# ============================================================================= +# This task analyzes all telemetry sources and their collection_targets to +# determine which sinks (victoria_metrics, victoria_logs, Kafka) are in use. +# Also maps new telemetry_config structure to legacy variables for backward compatibility. +# +# LEGACY VARIABLE MAPPINGS: +# ldms_support = telemetry_config.telemetry_sources.ldms.metrics_enabled +# idrac_telemetry_support = telemetry_config.telemetry_sources.idrac.metrics_enabled +# dcgm_support = telemetry_config.telemetry_sources.dcgm.metrics_enabled +# powerscale_configurations = merged dict of powerscale source flags + detailed config +# +# SINK SUPPORT FLAGS: +# victoria_metrics_support = true if ANY source has 'victoria_metrics' in collection_targets +# victoria_logs_support = true if ANY source has 'victoria_logs' in collection_targets +# kafka_support = true if ANY source has 'kafka' in collection_targets +# ============================================================================= + +- name: Initialize sink support flags + ansible.builtin.set_fact: + victoria_metrics_support: false + victoria_logs_support: false + kafka_support: false + +- name: Set ldms_support based on telemetry_config.yml + ansible.builtin.set_fact: + ldms_support: "{{ telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool }}" + +- name: Map telemetry_sources to legacy feature flags + ansible.builtin.set_fact: + idrac_telemetry_support: "{{ telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool }}" + powerscale_metrics_enabled: "{{ telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool }}" + powerscale_log_enabled: "{{ telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool }}" + dcgm_support: "{{ 
telemetry_config.telemetry_sources.dcgm.metrics_enabled | default(true) | bool }}" + +- name: Map powerscale source + configurations to legacy powerscale_configurations + ansible.builtin.set_fact: + powerscale_configurations: + powerscale_telemetry_support: "{{ telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(true) | bool }}" + powerscale_log_enabled: "{{ telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool }}" + syslog_source_ips: "{{ telemetry_config.powerscale_configurations.syslog_source_ips | default([]) }}" + otel_collector_storage_size: "{{ telemetry_config.powerscale_configurations.otel_collector_storage_size | default('5Gi') }}" + csm_observability_values_file_path: "{{ telemetry_config.powerscale_configurations.csm_observability_values_file_path | default('') }}" + additional_remote_write_endpoints: "{{ telemetry_config.powerscale_configurations.additional_remote_write_endpoints | default([]) }}" + when: telemetry_config.powerscale_configurations is defined + +- name: Check if any source targets victoria_metrics + ansible.builtin.set_fact: + victoria_metrics_support: true + when: >- + 'victoria_metrics' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or + 'victoria_metrics' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) + +- name: Check if any source targets victoria_logs + ansible.builtin.set_fact: + victoria_logs_support: true + when: >- + 'victoria_logs' in (telemetry_config.telemetry_sources.powerscale.collection_targets | default([])) or + 'victoria_logs' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) + +- name: Check if any source targets Kafka + ansible.builtin.set_fact: + kafka_support: true + when: >- + 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) or + 'kafka' in (telemetry_config.telemetry_sources.ldms.collection_targets | default([])) + +- name: Log 
derived sink support flags + ansible.builtin.debug: + msg: > + Sink support flags derived — + victoria_metrics_support={{ victoria_metrics_support }}, + victoria_logs_support={{ victoria_logs_support }}, + kafka_support={{ kafka_support }} + verbosity: 1 diff --git a/provision/roles/telemetry/tasks/generate_service_cluster_metadata.yml b/provision/roles/telemetry/tasks/generate_service_cluster_metadata.yml index ec1a0f046d..329838f667 100644 --- a/provision/roles/telemetry/tasks/generate_service_cluster_metadata.yml +++ b/provision/roles/telemetry/tasks/generate_service_cluster_metadata.yml @@ -15,7 +15,7 @@ - name: Pre-requisites for service_cluster k8s when: - - hostvars['localhost']['idrac_telemetry_support'] + - idrac_telemetry_support block: - name: Include service_cluster metadata if already exists ansible.builtin.include_vars: "{{ service_cluster_metadata_path }}" diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml index d1247287dd..729a1db05c 100644 --- a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -33,40 +33,31 @@ dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/{{ item.dest }}" mode: "{{ hostvars['localhost']['file_permissions_644'] }}" loop: "{{ victoria_templates }}" - when: "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')" + when: victoria_metrics_support | default(false) | bool tags: telemetry_deployment - # NOTE: victoria_templates is automatically set based on victoria_deployment_mode in vars/main.yml - # - cluster mode: includes vmstorage, vminsert, vmselect templates - # - single-node mode: includes victoria-statefulset template -- name: Populate VictoriaLogs deployment configs +- name: Populate victoria_logs deployment configs ansible.builtin.template: src: "{{ item.src }}" dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/{{ item.dest }}" mode: "{{ hostvars['localhost']['file_permissions_644'] }}" loop: "{{ victorialogs_templates }}" - when: "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')" + loop_control: + label: "{{ item.dest }}" + when: victoria_logs_support | default(false) | bool tags: telemetry_deployment - # NOTE: victorialogs_templates includes VLCluster CR, VLAgent CR, and VLAgent ConfigMap - - name: Kafka configurations when: kafka_support block: - name: Set Kafka configuration variables from telemetry_config ansible.builtin.set_fact: - kafka_log_retention_hours: "{{ telemetry_config.kafka_configurations.log_retention_hours }}" - kafka_log_retention_bytes: "{{ telemetry_config.kafka_configurations.log_retention_bytes }}" - kafka_log_segment_bytes: "{{ telemetry_config.kafka_configurations.log_segment_bytes }}" + kafka_log_retention_hours: "{{ telemetry_config.telemetry_sinks.kafka.log_retention_hours | default(168) }}" + kafka_log_retention_bytes: "{{ telemetry_config.telemetry_sinks.kafka.log_retention_bytes | default(-1) }}" + kafka_log_segment_bytes: "{{ telemetry_config.telemetry_sinks.kafka.log_segment_bytes | 
default(1073741824) }}" - name: Create kafka_topic_partitions dictionary from telemetry_config ansible.builtin.set_fact: - kafka_topic_partitions: >- - {{ - dict( - telemetry_config.kafka_configurations.topic_partitions | map(attribute='name') - | zip(telemetry_config.kafka_configurations.topic_partitions | map(attribute='partitions')) - ) - }} + kafka_topic_partitions: "{{ telemetry_config.telemetry_sinks.kafka.topic_partitions | default({}) }}" - name: Build list of Kafka topics to create ansible.builtin.set_fact: @@ -81,8 +72,8 @@ 'filename': 'kafka.topic_idrac.yaml' }] }} when: - - hostvars['localhost']['idrac_telemetry_support'] - - "'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',')" + - telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool + - "'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([]))" - "kafka.topics.idrac.name in kafka_topic_partitions" - name: Add ldms topic if enabled @@ -94,7 +85,7 @@ 'filename': 'kafka.topic_ldms.yaml' }] }} when: - - hostvars['localhost']['ldms_support'] + - ldms_support - "kafka.topics.ldms.name in kafka_topic_partitions" @@ -126,7 +117,7 @@ mode: "{{ hostvars['localhost']['file_permissions_644'] }}" - name: Victoria Metrics operator configuration - when: "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')" + when: victoria_metrics_support | default(false) | bool block: - name: Extract and set facts for tarball URLs for victoria metrics operator ansible.builtin.set_fact: @@ -152,7 +143,7 @@ src: 'telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2' dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/idrac_telemetry_statefulset.yaml" mode: "{{ hostvars['localhost']['file_permissions_644'] }}" - when: hostvars['localhost']['idrac_telemetry_support'] + when: telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool tags: telemetry_deployment - name: Deploy 
telemetry cleanup script diff --git a/provision/roles/telemetry/tasks/load_service_images.yml b/provision/roles/telemetry/tasks/load_service_images.yml index 654c73c9a7..893b830fb2 100644 --- a/provision/roles/telemetry/tasks/load_service_images.yml +++ b/provision/roles/telemetry/tasks/load_service_images.yml @@ -13,7 +13,7 @@ # limitations under the License. --- -- name: Extract image packages from service_k8s_.json +- name: Extract image packages from service_k8s.json ansible.builtin.set_fact: service_k8s_image_list: "{{ telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'image') | list }}" diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index 346dac07db..a3436c6a62 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -21,22 +21,26 @@ - name: Read telemetry packages from software config ansible.builtin.include_tasks: read_software_config.yml -- name: Load service images from service_k8s_.json +- name: Derive sink support flags from collection_targets + ansible.builtin.include_tasks: derive_sink_support_flags.yml + +- name: Load service images from service_k8s.json ansible.builtin.include_tasks: load_service_images.yml - name: Check kube_vip reachability for validation ansible.builtin.include_tasks: check_kube_vip_reachability.yml when: - - "'victoria' in hostvars['localhost']['telemetry_collection_type'].split(',')" + - victoria_metrics_support | default(false) | bool - kube_vip is defined - kube_vip | length > 0 -- name: Configure of k8s telemetry service +- name: Configure of k8s telemetry service prerequisites when: - >- - hostvars['localhost']['idrac_telemetry_support'] or - hostvars['localhost']['ldms_support'] or - hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool + (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + ldms_support | default(false) | bool block: - name: Set NFS info fact ansible.builtin.set_fact: @@ -45,17 +49,28 @@ - name: Service cluster prerequisite ansible.builtin.include_tasks: telemetry_prereq.yml - - name: Deploy PowerScale telemetry metrics - ansible.builtin.include_tasks: deploy_powerscale_metrics.yml - when: - - hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool +- name: Deploy PowerScale telemetry metrics + when: + - telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool + ansible.builtin.include_tasks: deploy_powerscale_metrics.yml - - name: Generate telemetry deployments - 
ansible.builtin.include_tasks: generate_telemetry_deployments.yml +- name: Deploy PowerScale syslog log collection + when: + - telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool + ansible.builtin.include_tasks: deploy_powerscale_logs.yml + +- name: Generate telemetry deployments (idrac/ldms/powerscale) + when: + - >- + (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + ldms_support | default(false) | bool + ansible.builtin.include_tasks: generate_telemetry_deployments.yml - name: Configure of k8s telemetry service when: - - hostvars['localhost']['idrac_telemetry_support'] + - telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool block: - name: Validate idrac telemetry config ansible.builtin.include_tasks: validate_idrac_inventory.yml @@ -65,20 +80,20 @@ - name: Include update_ldms_sampler.yml ansible.builtin.include_tasks: update_ldms_sampler.yml - when: hostvars['localhost']['ldms_support'] + when: ldms_support - name: Update ldms agg configuration ansible.builtin.include_tasks: update_ldms_agg_config.yml - when: hostvars['localhost']['ldms_support'] + when: ldms_support - name: Check if PXE mapping has changed since last run ansible.builtin.include_tasks: check_pxe_changes.yml - when: hostvars['localhost']['ldms_support'] + when: ldms_support - name: Restart LDMS configs for node addition and deletion ansible.builtin.include_tasks: restart_ldms_configs.yml when: - - hostvars['localhost']['ldms_support'] + - ldms_support - pxe_changed | default(false) | bool - name: Apply telemetry configurations on upgrade diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml index 3bc1a52637..005f9e65a2 100644 --- 
a/provision/roles/telemetry/tasks/read_software_config.yml +++ b/provision/roles/telemetry/tasks/read_software_config.yml @@ -35,11 +35,11 @@ ansible.builtin.set_fact: cluster_os_version: "{{ software_config['cluster_os_version'] }}" -- name: Load service_k8s_.json +- name: Load service_k8s.json ansible.builtin.set_fact: telemetry_packages: "{{ lookup('file', k8s_packages_file) | from_json }}" -- name: Extract service_k8s_.json and set facts for pip_modules and python_version +- name: Extract service_k8s.json and set facts for pip_modules and python_version ansible.builtin.set_fact: k8s_pip_packages: >- {{ telemetry_packages['service_kube_control_plane']['cluster'] diff --git a/provision/roles/telemetry/tasks/telemetry_prereq.yml b/provision/roles/telemetry/tasks/telemetry_prereq.yml index c41462c609..3def1610ee 100644 --- a/provision/roles/telemetry/tasks/telemetry_prereq.yml +++ b/provision/roles/telemetry/tasks/telemetry_prereq.yml @@ -13,10 +13,6 @@ # limitations under the License. --- -- name: Initialize kafka_support - ansible.builtin.set_fact: - kafka_support: false - - name: Count entries with FUNCTIONAL_GROUP_NAME containing 'service_kube_node' ansible.builtin.set_fact: kube_node_count: >- @@ -66,11 +62,6 @@ ansible.builtin.fail: msg: "{{ idrac_telemetry_scripting_copy_fail_msg.splitlines() | join(' ') }}" -- name: Set kafka_support to true - ansible.builtin.set_fact: - kafka_support: true - when: "'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') or hostvars['localhost']['ldms_support']" - - name: Configure TLS certificate and secrets for kafka when: kafka_support block: @@ -99,41 +90,41 @@ no_log: true when: not cluster_id_present | default(false) -- name: Configure TLS certificate for VictoriaMetrics +- name: Configure TLS certificate for victoria_metrics when: - - "'victoria' in hostvars['localhost']['telemetry_collection_type']" + - victoria_metrics_support | default(false) | bool - victoria_cluster.tls_enabled | default(false) 
| bool block: - - name: Create VictoriaMetrics certificate directory + - name: Create victoria_metrics certificate directory ansible.builtin.file: path: "{{ victoria_cert_dir }}" state: directory mode: "{{ dir_permissions_755 }}" - - name: Deploy VictoriaMetrics certificate generation script + - name: Deploy victoria_metrics certificate generation script ansible.builtin.template: src: telemetry/victoria/gen_victoria_certs.sh.j2 dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/gen_victoria_certs.sh" mode: "{{ hostvars['localhost']['file_permissions_755'] }}" - - name: Generate VictoriaMetrics TLS certificates + - name: Generate victoria_metrics TLS certificates ansible.builtin.command: cmd: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/gen_victoria_certs.sh" changed_when: false - - name: Read VictoriaMetrics server certificate + - name: Read victoria_metrics server certificate ansible.builtin.slurp: src: "{{ victoria_cert_dir }}/server.crt" register: victoria_server_cert no_log: true - - name: Read VictoriaMetrics server key + - name: Read victoria_metrics server key ansible.builtin.slurp: src: "{{ victoria_cert_dir }}/server.key" register: victoria_server_key no_log: true - - name: Read VictoriaMetrics CA certificate + - name: Read victoria_metrics CA certificate ansible.builtin.slurp: src: "{{ victoria_cert_dir }}/ca.crt" register: victoria_ca_cert @@ -146,7 +137,7 @@ victoria_ca_cert_b64: "{{ victoria_ca_cert.content }}" no_log: true - - name: Create VictoriaMetrics TLS secret manifest with actual certificates + - name: Create victoria_metrics TLS secret manifest with actual certificates ansible.builtin.template: src: telemetry/victoria/victoria-tls-secret.yaml.j2 dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/victoria-tls-secret.yaml" diff --git a/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 
b/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 index a4b391519f..7f5f34beed 100644 --- a/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 +++ b/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 @@ -482,6 +482,10 @@ if [ "$CLEAN_POWERSCALE" = true ]; then delete_all configmap "app.kubernetes.io/name=otel-collector" delete_all pod "app.kubernetes.io/name=otel-collector" + # Delete VMServiceScrape for PowerScale OTEL Collector + echo "Deleting VMServiceScrape for PowerScale OTEL Collector..." + delete_resource vmservicescrape otel-collector-powerscale-scrape + # Delete cert-manager resources deployed by karavi-observability sub-chart echo "Deleting cert-manager resources (Helm sub-chart)..." delete_all deployment "app.kubernetes.io/instance=karavi-observability,app.kubernetes.io/name=cert-manager" diff --git a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 index acd8d35029..3709759f78 100644 --- a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 @@ -25,7 +25,7 @@ spec: tolerationSeconds: 30 # Evict after 30s if node is unreachable containers: - name: kubectl-cleanup - image: docker.io/alpine/kubectl:1.35.1 + image: docker.io/alpine/kubectl:1.34.1 command: - /bin/sh - -c diff --git a/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 b/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 index 0135593cc0..2dc831ba25 100644 --- a/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/common/telemetry_secret_creation.yaml.j2 @@ -9,8 +9,7 @@ data: mysqldb_password: "{{ hostvars['localhost']['mysqldb_password'] | 
b64encode }}" mysqldb_root_password: "{{ hostvars['localhost']['mysqldb_root_password'] | b64encode }}" -{% set types = hostvars['localhost']['telemetry_collection_type'].split(',') %} -{% if 'kafka' in types %} +{% if kafka_support %} --- apiVersion: v1 kind: Secret diff --git a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 index 80994fc5da..e449c3e2e3 100644 --- a/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/idrac_telemetry/idrac_telemetry_statefulset.yaml.j2 @@ -47,7 +47,7 @@ spec: app: {{ idrac_telemetry_k8s_name }} spec: volumes: -{% set types = hostvars['localhost']['telemetry_collection_type'].split(',') %} +{% set types = telemetry_config.telemetry_sources.idrac.collection_targets | default([]) %} {% if 'kafka' in types %} # Mount Kafka cluster CA certificate for TLS verification - name: kafka-cluster-ca-cert diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 index c4e5733ae7..16ebc44610 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafka.yaml.j2 @@ -14,7 +14,7 @@ spec: volumes: - id: 0 type: persistent-claim - size: "{{ hostvars['localhost']['kafka_configurations']['persistence_size'] }}" + size: "{{ telemetry_config.telemetry_sinks.kafka.persistence_size }}" kraftMetadata: shared deleteClaim: false --- @@ -35,7 +35,7 @@ spec: volumes: - id: 0 type: persistent-claim - size: "{{ hostvars['localhost']['kafka_configurations']['persistence_size'] }}" + size: "{{ telemetry_config.telemetry_sinks.kafka.persistence_size }}" kraftMetadata: shared deleteClaim: false --- @@ -79,9 +79,9 @@ spec: 
transaction.state.log.min.isr: 2 default.replication.factor: 3 min.insync.replicas: 2 - log.retention.hours: {{ hostvars['localhost']['kafka_configurations']['log_retention_hours'] }} - log.segment.bytes: {{ hostvars['localhost']['kafka_configurations']['log_segment_bytes'] }} - log.retention.bytes: {{ hostvars['localhost']['kafka_configurations']['log_retention_bytes'] }} + log.retention.hours: {{ kafka_log_retention_hours }} + log.segment.bytes: {{ kafka_log_segment_bytes }} + log.retention.bytes: {{ kafka_log_retention_bytes }} log.retention.check.interval.ms: 300000 # Enable topic auto-creation for external clients auto.create.topics.enable: true diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 index 70790b75a4..413a7fe72d 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kafka/kafka.kafkapump_user.yaml.j2 @@ -59,7 +59,7 @@ spec: - Describe host: "*" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} +{% if telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) and 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) %} # Producer and consumer permissions for idrac topic - resource: type: topic @@ -80,7 +80,7 @@ spec: - Read host: "*" {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} # Producer and consumer permissions for ldms topic - resource: type: topic diff --git a/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 index 1b58cd5811..007cc04d3b 100644 --- a/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 +++ 
b/provision/roles/telemetry/templates/telemetry/kafka/kafka.tls_test_job.yaml.j2 @@ -32,10 +32,10 @@ data: echo "Bootstrap Server: kafka-kafka-bootstrap:9093" echo "Certificates: kafkapump (for all TLS topics)" echo "Testing topics based on enabled telemetry support:" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} +{% if telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) and 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) %} echo " - iDRAC telemetry topic ({{ kafka.topics.idrac.name }})" {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} echo " - LDMS telemetry topic ({{ kafka.topics.ldms.name }})" {% endif %} echo "Note: All topics use port 9093 with mTLS for testing" @@ -80,7 +80,7 @@ data: echo "✓ mTLS connection successful" echo "" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} +{% if telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) and 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) %} # Test iDRAC telemetry topic consumer echo "Step 5: Testing consumer on {{ kafka.topics.idrac.name }} topic (kafkapump user)..." timeout 30 /opt/kafka/bin/kafka-console-consumer.sh \ @@ -93,7 +93,7 @@ data: echo "" {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} # Test LDMS topic consumer echo "Step 6: Testing consumer on {{ kafka.topics.ldms.name }} topic (kafkapump user via TLS)..." 
timeout 30 /opt/kafka/bin/kafka-console-consumer.sh \ @@ -114,10 +114,10 @@ data: echo " ✓ kafkapump keystore created" echo " ✓ mTLS connection established" echo " ✓ Topics listed successfully" -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} +{% if telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) and 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) %} echo " ✓ {{ kafka.topics.idrac.name }} topic tested (kafkapump user)" {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} echo " ✓ {{ kafka.topics.ldms.name }} topic tested via TLS (kafkapump user)" {% endif %} echo "" diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index e25d434918..54753cf0ce 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -1,14 +1,13 @@ resources: - telemetry_secret_creation.yaml -{% set types = hostvars['localhost']['telemetry_collection_type'].split(',') %} -{% if 'victoria' in types %} -# VictoriaMetrics Common Resources (RBAC) +{% if victoria_metrics_support | default(false) %} +# victoria_metrics Resources (Metrics Only) - victoria-vmagent-rbac.yaml {% if victoria_cluster.tls_enabled | default(false) %} - # TLS secret for VictoriaMetrics cluster components + # TLS secret for Victoria components (shared by metrics and logs) - victoria-tls-secret.yaml {% endif %} - # VictoriaMetrics Operator-based Deployment (mode: {{ victoria_deployment_mode }}) + # victoria_metrics Operator-based Deployment (mode: {{ victoria_deployment_mode }}) {% if victoria_deployment_mode == 'cluster' %} # Cluster Mode: VMCluster CR (operator manages StatefulSets) - victoria-operator-vmcluster.yaml @@ -16,11 +15,22 @@ resources: # 
Single-Node Mode: VMSingle CR (operator manages StatefulSet) - victoria-operator-vmsingle.yaml {% endif %} - # VMAgent CR (operator-managed scraper) + # VMAgent CR (operator-managed scraper for metrics) - victoria-operator-vmagent.yaml - # VMPodScrape CR (native operator-based pod discovery) + # VMPodScrape CR (native operator-based pod discovery for metrics) - victoria-operator-vmpodscrape.yaml - # VictoriaLogs Cluster Mode: VLCluster CR (operator manages vlstorage StatefulSet, vlinsert/vlselect Deployments) +{% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} + # VMServiceScrape CR for PowerScale OTEL Collector + - victoria-operator-vmservicescrape-powerscale.yaml +{% endif %} +{% endif %} +{% if victoria_logs_support | default(false) %} +# victoria_logs Resources (Logs Only) +{% if victoria_cluster.tls_enabled | default(false) and not victoria_metrics_support | default(false) %} + # TLS secret for Victoria components (only if not already included by victoria_metrics) + - victoria-tls-secret.yaml +{% endif %} + # victoria_logs Cluster Mode: VLCluster CR (operator manages vlstorage StatefulSet, vlinsert/vlselect Deployments) # VLAgent CR (operator-managed log collection agent) # VLAgent ConfigMap (syslog receiver and remoteWrite configuration) - victorialogs-operator-vlcluster.yaml @@ -28,12 +38,13 @@ resources: - victorialogs-vlagent-config.yaml {% endif %} {% if kafka_support %} +# Kafka Resources - kafka.kafka.yaml - kafka.kafkapump_user.yaml -{% if hostvars['localhost']['idrac_telemetry_support'] and 'kafka' in hostvars['localhost']['telemetry_collection_type'].split(',') %} +{% if telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) and 'kafka' in (telemetry_config.telemetry_sources.idrac.collection_targets | default([])) %} - kafka.topic_idrac.yaml {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) and 'kafka' in 
(telemetry_config.telemetry_sources.ldms.collection_targets | default([])) %} - kafka.topic_ldms.yaml {% endif %} - kafka.kafka_bridge.yaml @@ -41,7 +52,8 @@ resources: # Uncomment to deploy TLS test job # - test/kafka.tls_test_job.yaml {% endif %} -{% if hostvars['localhost']['idrac_telemetry_support'] %} +{% if telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) %} +# iDRAC Telemetry Resources - idrac_telemetry_statefulset.yaml - telemetry_cleaner_rbac.yaml - telemetry_pod_cleanup.yaml diff --git a/provision/roles/telemetry/templates/telemetry/ldms/ldms_machine_config.json.j2 b/provision/roles/telemetry/templates/telemetry/ldms/ldms_machine_config.json.j2 index 59afeda158..391971f717 100644 --- a/provision/roles/telemetry/templates/telemetry/ldms/ldms_machine_config.json.j2 +++ b/provision/roles/telemetry/templates/telemetry/ldms/ldms_machine_config.json.j2 @@ -2,8 +2,8 @@ "sys_opts": { "system" : "dell", "namespace": "telemetry", - "agg_port": {{ telemetry_config.ldms_agg_port }}, - "store_port": {{ telemetry_config.ldms_store_port }}, + "agg_port": {{ telemetry_config.ldms_configurations.agg_port }}, + "store_port": {{ telemetry_config.ldms_configurations.store_port }}, "imagePullSecretsOption": { "imagePullSecrets": [ ] @@ -17,7 +17,7 @@ "store_split": 999999, "sampler": { "name": "nersc", - "port": {{ telemetry_config.ldms_sampler_port }}, + "port": {{ telemetry_config.ldms_configurations.sampler_port }}, "auth_type": "ovis", "auth_secret": "nersc-ldms-ovis-auth", "auth_secret_file": "ldmsauth.conf" diff --git a/provision/roles/telemetry/templates/telemetry/ldms/ldmsd.sampler.env.j2 b/provision/roles/telemetry/templates/telemetry/ldms/ldmsd.sampler.env.j2 index bc230fc797..f3d7887785 100644 --- a/provision/roles/telemetry/templates/telemetry/ldms/ldmsd.sampler.env.j2 +++ b/provision/roles/telemetry/templates/telemetry/ldms/ldmsd.sampler.env.j2 @@ -4,7 +4,7 @@ # LDMS transport option (sock, rdma, or ugni) LDMSD_XPRT=sock # LDMS Daemon 
service port -LDMSD_PORT={{ telemetry_config.ldms_sampler_port }} +LDMSD_PORT={{ telemetry_config.ldms_configurations.sampler_port }} # LDMS memory allocation LDMSD_MEM=512K diff --git a/provision/roles/telemetry/templates/telemetry/ldms/sampler.conf.j2 b/provision/roles/telemetry/templates/telemetry/ldms/sampler.conf.j2 index f93007e035..37cedc687a 100644 --- a/provision/roles/telemetry/templates/telemetry/ldms/sampler.conf.j2 +++ b/provision/roles/telemetry/templates/telemetry/ldms/sampler.conf.j2 @@ -1,6 +1,6 @@ # Auto-generated LDMS sampler configuration -# Sampler port: {{ telemetry_config.ldms_sampler_port }} -{% for sampler in telemetry_config.ldms_sampler_configurations %} +# Sampler port: {{ telemetry_config.ldms_configurations.sampler_port }} +{% for sampler in telemetry_config.ldms_configurations.sampler_plugins %} load name={{ sampler.plugin_name }} config name={{ sampler.plugin_name }} producer=${HOSTNAME} instance=${HOSTNAME}/{{ sampler.plugin_name }}{% if sampler.config_parameters is defined %} {{ sampler.config_parameters }} {% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/ldms/values.yaml.j2 b/provision/roles/telemetry/templates/telemetry/ldms/values.yaml.j2 index c8bd45e2aa..9941db9154 100644 --- a/provision/roles/telemetry/templates/telemetry/ldms/values.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/ldms/values.yaml.j2 @@ -7,7 +7,7 @@ aggs: - conf: /ldms_conf/ldmsd.nersc-ldms-aggr.slurm-cluster-0.conf env: /ldms_conf/ldms-env.nersc-ldms-aggr.slurm-cluster-0.sh name: slurm-cluster-0 - port: {{ telemetry_config.ldms_agg_port }} + port: {{ telemetry_config.ldms_configurations.agg_port }} authVolMountOption: - mountPath: /nersc-munge-key name: nersc-munge-key @@ -35,7 +35,7 @@ statefulSet: - name: slurm-cluster replicas: 1 store: - port: {{ telemetry_config.ldms_store_port }} + port: {{ telemetry_config.ldms_configurations.store_port }} resources: limits: cpu: 1 diff --git 
a/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 b/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 index bfb894d58d..6a56bd1a1d 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/gen_victoria_certs.sh.j2 @@ -69,23 +69,39 @@ DNS.20 = vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local DNS.21 = vmstorage-victoria-cluster-0.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local DNS.22 = vmstorage-victoria-cluster-1.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local DNS.23 = vmstorage-victoria-cluster-2.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local +# VMAgent service (operator-managed) +DNS.24 = vmagent-vmagent +DNS.25 = vmagent-vmagent.{{ telemetry_namespace }} +DNS.26 = vmagent-vmagent.{{ telemetry_namespace }}.svc +DNS.27 = vmagent-vmagent.{{ telemetry_namespace }}.svc.cluster.local # VictoriaLogs cluster deployment names (operator-managed) -DNS.24 = vlinsert-victoria-logs-cluster -DNS.25 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }} -DNS.26 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc -DNS.27 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local -DNS.28 = vlselect-victoria-logs-cluster -DNS.29 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }} -DNS.30 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }}.svc -DNS.31 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local -DNS.32 = vlstorage-victoria-logs-cluster -DNS.33 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }} -DNS.34 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc -DNS.35 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.28 = vlinsert-victoria-logs-cluster +DNS.29 = vlinsert-victoria-logs-cluster.{{ 
telemetry_namespace }} +DNS.30 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc +DNS.31 = vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.32 = vlselect-victoria-logs-cluster +DNS.33 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }} +DNS.34 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }}.svc +DNS.35 = vlselect-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.36 = vlstorage-victoria-logs-cluster +DNS.37 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }} +DNS.38 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc +DNS.39 = vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local # vlstorage StatefulSet pod FQDNs (operator-managed, 3 replicas) -DNS.36 = vlstorage-victoria-logs-cluster-0.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local -DNS.37 = vlstorage-victoria-logs-cluster-1.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local -DNS.38 = vlstorage-victoria-logs-cluster-2.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.40 = vlstorage-victoria-logs-cluster-0.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.41 = vlstorage-victoria-logs-cluster-1.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +DNS.42 = vlstorage-victoria-logs-cluster-2.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local +# vlstorage StatefulSet pod short names (without .svc.cluster.local) +DNS.43 = vlstorage-victoria-logs-cluster-0.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }} +DNS.44 = vlstorage-victoria-logs-cluster-1.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }} +DNS.45 = vlstorage-victoria-logs-cluster-2.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }} +# VLAgent service (operator-managed) +DNS.46 = vlagent-vlagent +DNS.47 = vlagent-vlagent.{{ telemetry_namespace }} +DNS.48 = 
vlagent-vlagent.{{ telemetry_namespace }}.svc +DNS.49 = vlagent-vlagent.{{ telemetry_namespace }}.svc.cluster.local +# VLAgent StatefulSet pod FQDN (operator-managed, 1 replica) +DNS.50 = vlagent-vlagent-0.vlagent-vlagent.{{ telemetry_namespace }}.svc.cluster.local IP.1 = 127.0.0.1 EOF @@ -93,13 +109,19 @@ EOF # If SANs are stale (missing operator-managed names), force server cert regeneration # CA is preserved so external clients do not need to re-import it if [ -f "$CERT_FILE" ]; then - REQUIRED_SAN="vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local" + REQUIRED_VM_SAN="vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local" + REQUIRED_VMAGENT_SAN="vmagent-vmagent.{{ telemetry_namespace }}.svc.cluster.local" REQUIRED_VL_SAN="vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local" - if ! openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_SAN" || \ - ! openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_VL_SAN"; then - echo "Existing certificate missing required SAN: $REQUIRED_SAN" - echo "Removing stale server cert/key/csr to force regeneration..." - rm -f "$CERT_KEY" "$CSR_FILE" "$CERT_FILE" + REQUIRED_VLAGENT_SAN="vlagent-vlagent.{{ telemetry_namespace }}.svc.cluster.local" + REQUIRED_VLSTORAGE_POD_SAN="vlstorage-victoria-logs-cluster-0.vlstorage-victoria-logs-cluster.{{ telemetry_namespace }}" + if ! openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_VM_SAN" || \ + ! openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_VMAGENT_SAN" || \ + ! openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_VL_SAN" || \ + ! openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_VLAGENT_SAN" || \ + ! 
openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null | grep -q "$REQUIRED_VLSTORAGE_POD_SAN"; then + echo "Existing certificate missing required SANs (VictoriaMetrics/VMAgent/VictoriaLogs/VLAgent/VLStorage pods)" + echo "Removing stale server cert/key/csr/serial file to force regeneration..." + rm -f "$CERT_KEY" "$CSR_FILE" "$CERT_FILE" "$CERT_DIR/ca.srl" fi fi diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 index dafce0aada..3fb8ad791f 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 @@ -44,7 +44,7 @@ spec: - -remoteWrite.tlsCAFile=/etc/victoria/certs/ca.crt - -remoteWrite.tlsInsecureSkipVerify=false {% endif %} -{% if hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool %} +{% if hostvars['localhost']['telemetry_config']['telemetry_sources']['powerscale']['metrics_enabled'] | default(false) | bool %} {% for endpoint in telemetry_config.powerscale_configurations.additional_remote_write_endpoints | default([]) %} - -remoteWrite.url={{ endpoint.url }} {% if endpoint.tls_insecure_skip_verify | default(false) %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 index b986ae0af8..8d181a8225 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# VMCluster - VictoriaMetrics cluster deployment via operator +# VMCluster - victoria_metrics cluster deployment via operator # Managed by victoria-metrics-operator apiVersion: operator.victoriametrics.com/v1beta1 @@ -22,7 +22,7 @@ metadata: namespace: {{ telemetry_namespace }} spec: # Retention period from telemetry_config.yml - retentionPeriod: "{{ hostvars['localhost']['victoria_metrics_configurations']['retention_period'] }}h" + retentionPeriod: "{{ telemetry_config.telemetry_sinks.victoria_metrics.retention_period }}h" # VMStorage configuration vmstorage: @@ -41,7 +41,7 @@ spec: - ReadWriteOnce resources: requests: - storage: {{ hostvars['localhost']['victoria_metrics_configurations']['persistence_size'] }} + storage: {{ telemetry_config.telemetry_sinks.victoria_metrics.persistence_size }} # Resource limits resources: diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 new file mode 100644 index 0000000000..20a4b209ce --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 @@ -0,0 +1,50 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# VMServiceScrape - Native operator-based service discovery for PowerScale OTEL Collector +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: otel-collector-powerscale-scrape + namespace: {{ telemetry_namespace }} +spec: + # Target service selector + selector: + matchLabels: + app.kubernetes.io/name: otel-collector + + # Namespace selector + namespaceSelector: + matchNames: + - {{ telemetry_namespace }} + + # Service metrics endpoints + endpoints: + - port: prometheus + interval: {{ vmagent.global.scrape_interval }} + honorLabels: true + + # Add PowerScale-specific labels + relabelConfigs: + - sourceLabels: [__meta_kubernetes_service_name] + targetLabel: source + replacement: powerscale + + - sourceLabels: [__meta_kubernetes_service_name] + targetLabel: job + replacement: otel-collector-powerscale + + # Add namespace label + - sourceLabels: [__meta_kubernetes_namespace] + targetLabel: namespace diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 index 41af43489e..db5d35fe0a 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmsingle.yaml.j2 @@ -25,7 +25,7 @@ spec: replicaCount: 1 # Retention period from telemetry_config.yml - retentionPeriod: "{{ hostvars['localhost']['victoria_metrics_configurations']['retention_period'] }}h" + retentionPeriod: "{{ telemetry_config.telemetry_sinks.victoria_metrics.retention_period }}h" # Storage configuration storageDataPath: /victoria-metrics-data @@ -36,7 +36,7 @@ spec: - ReadWriteOnce resources: requests: - storage: {{ hostvars['localhost']['victoria_metrics_configurations']['persistence_size'] }} + storage: {{ telemetry_config.telemetry_sinks.victoria_metrics.persistence_size }} # Image configuration image: diff --git
a/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 index 8c8af09972..570c45460f 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-statefulset.yaml.j2 @@ -99,7 +99,7 @@ spec: args: - "--selfScrapeInterval=5s" - "--storageDataPath=/victoria-metrics-data" - - "--retentionPeriod={{ hostvars['localhost']['victoria_metrics_configurations']['retention_period'] }}" + - "--retentionPeriod={{ telemetry_config.telemetry_sinks.victoria_metrics.retention_period }}" - "--httpListenAddr=:8443" - "-tls" - "-tlsCertFile=/etc/victoria/certs/server.crt" @@ -121,4 +121,4 @@ spec: accessModes: ["ReadWriteOnce"] resources: requests: - storage: "{{ hostvars['localhost']['victoria_metrics_configurations']['persistence_size'] }}" + storage: "{{ telemetry_config.telemetry_sinks.victoria_metrics.persistence_size }}" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 index 22b4ecef11..44e15bc708 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
@@ -50,8 +50,19 @@ spec: # ======================================== # Configuration Management # ======================================== - # ConfigMap containing VLAgent syslog receiver configuration - configSecret: vlagent-config + # The VictoriaMetrics operator does NOT translate configSecret into + # syslog CLI flags. Syslog listeners MUST be enabled via extraArgs. + # These map directly to VLAgent CLI flags: + # -syslog.listenAddr.tcp → plaintext syslog TCP receiver (rsyslog relay forwards here) + # -syslog.listenAddr.udp → plaintext syslog UDP receiver + # Pipeline: + # UDP mode: PowerScale → K8s node rsyslog (UDP:514) → VLAgent (TCP:514) → VictoriaLogs + extraArgs: + syslog.listenAddr.tcp: ":514" + syslog.listenAddr.udp: ":514" +{% if victoria_logs_cluster.tls_enabled %} + remoteWrite.tlsCAFile: "/etc/victoria/certs/ca.crt" +{% endif %} # ======================================== # Remote Write Configuration @@ -60,11 +71,9 @@ spec: # Supports JSON Lines format with optional TLS remoteWrite: {% if victoria_logs_cluster.tls_enabled %} - - url: https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/insert/jsonline - tlsConfig: - ca: /etc/victoria/certs/ca.crt + - url: https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert {% else %} - - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/insert/jsonline + - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert {% endif %} # ======================================== @@ -80,6 +89,28 @@ spec: memory: {{ victoria_logs_cluster.vlagent.resources.limits.memory }} cpu: {{ victoria_logs_cluster.vlagent.resources.limits.cpu }} + # ======================================== + # Init Container: Stale Lock File Cleanup + # ======================================== + # VLAgent uses flock for exclusive access to its persistent queue. 
+ # On NFS-backed PVCs (e.g., PowerScale StorageClass), POSIX flock is + # not reliably released on pod termination. This causes the new pod + # to panic with "resource temporarily unavailable" on the lock file. + # The init container removes stale flock.lock files before vlagent starts. + initContainers: + - name: cleanup-stale-locks + image: {{ victoria_logs_cluster.vlagent.image }} + command: + - /bin/sh + - -c + - | + echo "Cleaning up stale flock files..." + find /tmp-data -name "flock.lock" -delete 2>/dev/null || true + echo "Lock file cleanup complete" + volumeMounts: + - name: tmp-data + mountPath: /tmp-data + # ======================================== # Persistent Storage Configuration # ======================================== @@ -98,10 +129,8 @@ spec: # ======================================== # TLS Certificate Configuration # ======================================== - # Shared with VictoriaMetrics cluster - # Used for: - # - Syslog TLS receiver (:6514) — server certificate - # - remoteWrite to vlinsert — CA certificate validation + # Volume mounts for TLS certificates: + # - victoria-tls-certs: remoteWrite to vlinsert (CA certificate validation) {% if victoria_logs_cluster.tls_enabled %} volumes: - name: victoria-tls-certs @@ -114,6 +143,7 @@ spec: path: server.key - key: ca.crt path: ca.crt + volumeMounts: - name: victoria-tls-certs mountPath: /etc/victoria/certs @@ -123,45 +153,22 @@ spec: # ======================================== # Service Exposure Configuration # ======================================== - # Service type: LoadBalancer (MetalLB) or NodePort (fallback) - # Exposes syslog receivers (:514 TCP+UDP, :6514 TLS) for external log sources -{% if metalLB_deployed | default(false) %} + # Service type: LoadBalancer (MetalLB for external access) + # Provides single external IP for all log sources (OME, SFM, PowerScale) + # PowerScale syslog flows: PowerScale → VLAgent
(LoadBalancer IP:514) → VictoriaLogs serviceSpec: useAsDefault: true spec: type: LoadBalancer -{% else %} - serviceSpec: - useAsDefault: true - spec: - type: NodePort -{% endif %} - - # ======================================== - # Port Configuration - # ======================================== - # Syslog receivers (platform-provided scrape targets) - # - :514 TCP+UDP — plaintext syslog (RFC 3164/5424) - # - :6514 TCP — TLS syslog (RFC 5425) - # Health check endpoint - # - :9429 — HTTP health checks (distinct from vmagent 8429) - ports: - - name: syslog - port: 514 - targetPort: 514 - protocol: TCP - - name: syslog-udp - port: 514 - targetPort: 514 - protocol: UDP - - name: syslog-tls - port: 6514 - targetPort: 6514 - protocol: TCP - - name: health - port: 9429 - targetPort: 9429 - protocol: TCP + ports: + - name: syslog + port: 514 + targetPort: 514 + protocol: TCP + - name: syslog-udp + port: 514 + targetPort: 514 + protocol: UDP # ======================================== # Health Probes diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 index ec289f4e5e..0e8c893e1c 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 @@ -30,6 +30,7 @@ spec: # ======================== vlstorage: replicaCount: {{ victoria_logs_cluster.vlstorage.replicas }} + replicationFactor: 2 # Data redundancy: each log written to 2 of 3 nodes image: repository: {{ victoria_logs_cluster.vlstorage.image.split(':')[0] }} tag: {{ victoria_logs_cluster.vlstorage.image.split(':')[1] }} @@ -45,7 +46,7 @@ spec: - ReadWriteOnce resources: requests: - storage: {{ hostvars['localhost']['victoria_logs_configurations']['storage_size'] }} + storage: {{ 
telemetry_config.telemetry_sinks.victoria_logs.storage_size }} resources: requests: @@ -79,7 +80,7 @@ spec: tolerationSeconds: 5 extraArgs: - retentionPeriod: "{{ hostvars['localhost']['victoria_logs_configurations']['retention_period'] }}h" + retentionPeriod: "{{ telemetry_config.telemetry_sinks.victoria_logs.retention_period }}h" {% if victoria_logs_cluster.tls_enabled %} tls: "true" tlsCertFile: "/etc/victoria/certs/server.crt" @@ -96,6 +97,8 @@ spec: path: server.crt - key: tls.key path: server.key + - key: ca.crt + path: ca.crt volumeMounts: - name: victoria-tls-certs mountPath: /etc/victoria/certs diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 index 819ca03670..ab7e1c53b1 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-vlagent-config.yaml.j2 @@ -86,10 +86,10 @@ data: # vlinsert endpoint (in-cluster FQDN) # Operator creates service as vlinsert-victoria-logs-cluster # Port 9481: ingestion endpoint (HTTPS when tls_enabled, HTTP otherwise) - # Path: /insert/jsonline — primary VictoriaLogs ingestion format + # Path: /internal/insert — VictoriaLogs native ingestion endpoint used by vlagent remoteWrite # Query params: # _stream_fields=hostname,app_name — stream identification for sharding - url: "{{ 'https' if victoria_logs_cluster.tls_enabled else 'http' }}://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/insert/jsonline?_stream_fields=hostname,app_name" + url: "{{ 'https' if victoria_logs_cluster.tls_enabled else 'http' }}://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert?_stream_fields=hostname,app_name" {% if victoria_logs_cluster.tls_enabled %} # TLS configuration for remoteWrite client @@ -146,7 +146,7 @@ data: # 3.
Additional remoteWrite destinations # Example: Dual-write to secondary vlinsert # remoteWrite: - # - url: "https://secondary-vlinsert:9480/insert/jsonline" + # - url: "https://secondary-vlinsert:9480/internal/insert" # # 4. Parsing and enrichment rules # Example: Extract fields from syslog message diff --git a/provision/roles/telemetry/templates/telemetry/victoria/vlagent-syslog-tls-secret.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/vlagent-syslog-tls-secret.yaml.j2 new file mode 100644 index 0000000000..983a1aa5cf --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/vlagent-syslog-tls-secret.yaml.j2 @@ -0,0 +1,23 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Secret +metadata: + name: vlagent-syslog-tls-certs + namespace: {{ telemetry_namespace }} +type: kubernetes.io/tls +data: + tls.crt: {{ syslog_server_cert_b64 }} + tls.key: {{ syslog_server_key_b64 }} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 index caa70954a6..fe8f086c22 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 @@ -60,7 +60,7 @@ data: # Add Pod IP label - source_labels: [__meta_kubernetes_pod_ip] target_label: pod_ip -{% if hostvars['localhost']['powerscale_configurations']['powerscale_telemetry_support'] | default(false) | bool %} +{% if hostvars['localhost']['telemetry_config']['telemetry_sources']['powerscale']['metrics_enabled'] | default(false) | bool %} # PowerScale OTEL Collector scrape targets (per cluster) {% for cluster in ps_clusters %} diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 443c362222..a1c8f2fad9 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -20,8 +20,7 @@ service_cluster_metadata_path: "/opt/omnia/.data/service_cluster_metadata.yml" metadata_perm: "0644" # Usage: read_software_config.yml -# Versioned JSON file: service_k8s_v.json (e.g., service_k8s_v1.35.1.json) -k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s_v{{ hostvars['localhost']['service_k8s_version'] }}.json" +k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s.json" # Usage: secrets_creation.yml mysqldb_secrets_name: mysqldb-credentials @@ -69,7 +68,7 @@ kafka: 
service_name: "kafka-headless" lb_service_name: "kafka-loadbalancer" container_port1: 9093 - # Kafka images from service_k8s_.json + # Kafka images from service_k8s.json operator_image: "{{ telemetry_images['strimzi/operator'] | default('quay.io/strimzi/operator:0.48.0') }}" kafka_image: "{{ telemetry_images['strimzi/kafka'] | default('quay.io/strimzi/kafka:0.48.0-kafka-4.1.0') }}" bridge_image: "{{ telemetry_images['strimzi/kafka-bridge'] | default('quay.io/strimzi/kafka-bridge:0.33.1') }}" @@ -86,8 +85,8 @@ kafka: name: "ldms" consumer_group: "ldms-consumer-group" -# Dynamic image configuration from service_k8s_.json -# Images and versions are read dynamically from input/config/x86_64/rhel/10.0/service_k8s_.json +# Dynamic image configuration from service_k8s.json +# Images and versions are read dynamically from input/config/x86_64/rhel/10.0/service_k8s.json telemetry_images: "{{ service_k8s_images | default({}) }}" # Usage: victoriametric_deployment.yml @@ -237,6 +236,32 @@ telemetry_share_path: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/tele # VictoriaMetrics TLS Configuration victoria_tls_cert_days: 3650 victoria_cert_dir: "{{ telemetry_share_path }}/victoria-certs" +syslog_tls_cert_dir: "{{ telemetry_share_path }}/syslog-tls-certs" + +# PowerScale syslog source IPs requirement message +powerscale_syslog_source_ips_msg: >- + powerscale_configurations.syslog_source_ips is required when logs_enabled is true. + Provide the IP(s) from which PowerScale sends syslog. + This is often the data pool IP (e.g., 40gige-1 interface), NOT the CSI management IP. + Check OneFS Network Configuration → External Network → Pool IPs. + +# PowerScale syslog source IPs display message +powerscale_syslog_source_ips_display_msg: >- + PowerScale syslog source IP(s): {{ powerscale_management_ips | join(', ') }}. + rsyslog $fromhost-ip filter will match these IPs and forward to VLAgent. 
+ +# PowerScale log configuration status message +powerscale_log_config_status_msg: >- + PowerScale syslog configuration script staged on NFS share. + During cloud-init, the script will configure rsyslog on all K8s nodes, + open firewall port 514/udp, and forward PowerScale syslog to VLAgent. + For complete PowerScale syslog configuration details, including manual + PowerScale setup steps, refer to Omnia documentation. + +# PowerScale VictoriaLogs validation fail message +powerscale_victoria_logs_validation_fail_msg: >- + PowerScale log collection requires 'victoria_logs' in + telemetry_sources.powerscale.collection_targets. # Usage: vmagent-scrape-config.yaml vmagent: @@ -298,6 +323,8 @@ victoria_templates_operator_single: dest: 'victoria-operator-vmagent.yaml' - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' dest: 'victoria-operator-vmpodscrape.yaml' + - src: 'telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2' + dest: 'victoria-operator-vmservicescrape-powerscale.yaml' # Cluster operator template (used when victoria_cluster.enabled: true) victoria_templates_operator_cluster: @@ -307,6 +334,8 @@ victoria_templates_operator_cluster: dest: 'victoria-operator-vmagent.yaml' - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' dest: 'victoria-operator-vmpodscrape.yaml' + - src: 'telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2' + dest: 'victoria-operator-vmservicescrape-powerscale.yaml' # Legacy manual deployment templates (removed - use operator-based templates above) # Raw victoria-cluster-vminsert/vmselect/vmstorage.yaml.j2 files have been removed @@ -340,6 +369,7 @@ victoria_templates: > # ============================================================================ # VictoriaLogs operator CR templates (used when 'victoria' in telemetry_collection_type) # These are applied alongside VictoriaMetrics templates in the same kustomize deployment. 
+# NOTE: PowerScale syslog is deployed in UDP relay mode only. victorialogs_templates: - src: 'telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2' dest: 'victorialogs-operator-vlcluster.yaml' @@ -430,7 +460,7 @@ ps_csi_driver_not_configured_msg: >- PowerScale telemetry requires csi_driver_powerscale to be configured in software_config.json. Please add csi_driver_powerscale to software_config.json and re-run. ps_helm_values_path_missing_msg: >- - csm_observability_values_file_path is required in telemetry_config.yml when powerscale_configurations.powerscale_telemetry_support is true. + csm_observability_values_file_path is required in telemetry_config.yml when telemetry_sources.powerscale.metrics_enabled is true. Provide the path to your customized karavi-observability Helm values file. ps_helm_values_file_not_found_msg: >- Helm values file not found at '{{ ps_helm_values_file | default('') }}'. @@ -484,3 +514,117 @@ ps_dependency_fail_msg: >- - karavi-observability (git) - helm-charts (git) - {{ cert_manager_package | default('cert-manager') }} (tarball from Jetstack Helm repo) + +# ============================================================================ +# PowerScale Log Pipeline Configuration (rsyslog relay + Shared VLAgent) +# ============================================================================ +# Usage: deploy_powerscale_logs.yml +# Gated by: telemetry_sources.powerscale.logs_enabled +# Pipeline: PowerScale (UDP:514) → K8s rsyslog → VLAgent LB (TCP:514) → VictoriaLogs +# PS syslog source IPs from user config (powerscale_configurations.syslog_source_ips) +# NOTE: Syslog source IP != CSI endpoint (mgmt IP). Often the data pool IP (40gige-1). +configure_ps_syslog_template: "{{ role_path }}/../configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2" +ps_log_enabled_msg: >- + PowerScale log collection enabled (telemetry_sources.powerscale.logs_enabled: true). 
+ rsyslog will be configured on all K8s nodes to relay PowerScale audit syslog to VLAgent. + Syslog source IPs from powerscale_configurations.syslog_source_ips. +ps_log_disabled_msg: >- + PowerScale log collection disabled (telemetry_sources.powerscale.logs_enabled: false). + rsyslog relay will not be configured. VLAgent will not include PowerScale labels. +ps_log_deployed_msg: >- + PowerScale syslog configuration script staged on NFS share. + During cloud-init, the script will configure rsyslog on all K8s nodes, + open firewall port 514/udp, and forward PowerScale syslog to VLAgent. + For complete PowerScale syslog configuration details, including manual + PowerScale setup steps, refer to Omnia documentation. +# Vector Kafka-to-Victoria Ingestion Pipeline Configuration +# ============================================================================ +# Usage: deploy_vector_ldms.yml, deploy_vector_ome.yml +# Vector pods consume from Kafka topics and route to VictoriaMetrics/VictoriaLogs +# via dedicated write-buffer agents (vmagent-vector, vlagent-vector). 
+# +# Architecture: +# Kafka topics → Vector pods → vmagent-vector/vlagent-vector → Victoria sinks +# Vector image (shared by all Vector pods: vector-ldms, vector-ome) +# Registered in service_k8s.json +vector: + image: "{{ telemetry_images['timberio/vector'] | default('docker.io/timberio/vector:0.54.0-alpine') }}" + + # Vector-LDMS configuration + ldms: + app_name: "vector-ldms" + service_name: "vector-ldms" + kafka_topic: "ldms" + consumer_group: "vector-ldms-group" + kafka_user: "kafkapump" # Shared with LDMS store_avro_kafka + health_port: 8687 + metrics_port: 9599 + replicas: 1 + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + + # Vector-OME configuration + ome: + app_name: "vector-ome" + service_name: "vector-ome" + # Dynamic pattern based on ome_identifier from telemetry_config.yml + # Example: if ome_identifier="ome", pattern="^ome\\..*$" (matches ome.events, ome.alerts, etc.) + # Example: if ome_identifier="dell_ome", pattern="^dell_ome\\..*$" (matches dell_ome.events, etc.) 
+ kafka_topics_pattern: "^{{ telemetry_bridges.vector_ome.ome_identifier | default('ome') | regex_escape }}\\..*$" + consumer_group: "vector-ome-group" + kafka_user: "vector-ome-user" # Dedicated KafkaUser for OME + health_port: 8688 + metrics_port: 9600 + replicas: 1 + resources: + requests: + cpu: "250m" + memory: "512Mi" + limits: + cpu: "1000m" + memory: "1Gi" + + # vmagent-vector: Write-buffer for Vector → VictoriaMetrics + # Accepts prometheus_remote_write from Vector pods on port 8429 + vmagent_vector: + app_name: "vmagent-vector" + service_name: "vmagent-vector" + port: 8429 # prometheus_remote_write receiver + metrics_port: 8429 # vmagent self-metrics + image: "{{ telemetry_images['victoriametrics/vmagent'] | default('docker.io/victoriametrics/vmagent:v1.128.0') }}" + replicas: 1 + pvc_size: "5Gi" # Disk WAL buffer + remote_write_url: "http://vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write" + tmp_data_path: "/vmagent-buffer" + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + + # vlagent-vector: Write-buffer for Vector → VictoriaLogs + # Accepts JSON Lines from Vector pods on port 9427 + vlagent_vector: + app_name: "vlagent-vector" + service_name: "vlagent-vector" + port: 9427 # JSON Lines receiver + metrics_port: 9427 # vlagent self-metrics + image: "{{ telemetry_images['victoriametrics/vlagent'] | default('docker.io/victoriametrics/vlagent:v1.49.0') }}" + replicas: 1 + pvc_size: "5Gi" # Disk buffer + remote_write_url: "http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9428/internal/insert" + tmp_data_path: "/vlagent-buffer" + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" From 055c13b83ef77fe07ea715bbf669df6b9f3d15cf Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Thu, 7 May 2026 18:14:11 +0530 Subject: [PATCH 09/17] update input validation --- 
.../schema/telemetry_config.json | 515 ++++++++------- .../validation_flows/common_validation.py | 609 +++--------------- 2 files changed, 376 insertions(+), 748 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/telemetry_config.json b/common/library/module_utils/input_validation/schema/telemetry_config.json index 6b511a5f12..124aaaa543 100644 --- a/common/library/module_utils/input_validation/schema/telemetry_config.json +++ b/common/library/module_utils/input_validation/schema/telemetry_config.json @@ -1,104 +1,271 @@ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "Telemetry Configuration", + "description": "Three-layer telemetry configuration: Sources → Bridges → Sinks, with source-specific configurations.", "type": "object", "properties": { - "idrac_telemetry_support": { - "type": "boolean" - }, - "dcgm_support": { - "type": "boolean", - "description": "Enable or disable NVIDIA DCGM (Data Center GPU Manager) on GPU compute nodes. When true, nvidia-dcgm.service is started during cloud-init provisioning. Default: true" - }, - "telemetry_collection_type": { - "anyOf": [ - { - "type": "string", - "enum": ["kafka", "victoria"] + "telemetry_sources": { + "type": "object", + "description": "Data collectors — each source can be independently enabled/disabled.", + "properties": { + "idrac": { + "type": "object", + "description": "iDRAC hardware metrics from Dell PowerEdge servers.", + "properties": { + "metrics_enabled": { + "type": "boolean", + "default": true, + "description": "Enable or disable iDRAC metrics collection." + }, + "collection_targets": { + "type": "array", + "items": { + "type": "string", + "enum": ["victoria_metrics", "kafka"] + }, + "minItems": 1, + "uniqueItems": true, + "default": ["victoria_metrics", "kafka"], + "description": "Where iDRAC data is sent. Supported: 'victoria_metrics', 'kafka'." 
+ } + }, + "required": ["metrics_enabled", "collection_targets"] }, - { - "type": "string", - "pattern": "(?i)^(kafka|victoria)(,(kafka|victoria))*$" + "ldms": { + "type": "object", + "description": "Lightweight Distributed Metric Service for compute node metrics.", + "properties": { + "metrics_enabled": { + "type": "boolean", + "default": true, + "description": "Enable or disable LDMS metrics collection." + }, + "collection_targets": { + "type": "array", + "items": { + "type": "string", + "enum": ["kafka"] + }, + "minItems": 1, + "maxItems": 1, + "uniqueItems": true, + "default": ["kafka"], + "description": "LDMS only supports Kafka collection. Vector-LDMS bridge routes to victoria_metrics." + } + }, + "required": ["metrics_enabled", "collection_targets"] + }, + "dcgm": { + "type": "object", + "description": "NVIDIA Data Center GPU Manager telemetry.", + "properties": { + "metrics_enabled": { + "type": "boolean", + "default": true, + "description": "Enable or disable NVIDIA DCGM (Data Center GPU Manager) on GPU compute nodes. When true, nvidia-dcgm.service is started during cloud-init provisioning. Default: true" + } + }, + "required": ["metrics_enabled"] + }, + "powerscale": { + "type": "object", + "description": "Dell PowerScale (OneFS) storage telemetry.", + "properties": { + "metrics_enabled": { + "type": "boolean", + "default": true, + "description": "Enable or disable PowerScale metrics collection." + }, + "logs_enabled": { + "type": "boolean", + "default": false, + "description": "Enable or disable PowerScale logs collection." + }, + "collection_targets": { + "type": "array", + "items": { + "type": "string", + "enum": ["victoria_metrics", "victoria_logs"] + }, + "minItems": 1, + "uniqueItems": true, + "default": ["victoria_metrics"], + "description": "PowerScale supports victoria_metrics (metrics) and victoria_logs (logs)." 
+ } + }, + "required": ["metrics_enabled", "collection_targets"] } - ] - }, - "ldms_agg_port": { - "type": "integer", - "minimum": 6001, - "maximum": 6100, - "default": 6001, - "description": "LDMS Aggregator port on service k8s cluster. Valid range: 6001-6100. Default: 6001" - }, - "ldms_store_port": { - "type": "integer", - "minimum": 6001, - "maximum": 6100, - "default": 6001, - "description": "LDMS store daemon port on service k8s cluster. Valid range: 6001-6100. Can be the same as ldms_agg_port (isolated by pod). Default: 6001" - }, - "ldms_sampler_port": { - "type": "integer", - "minimum": 10001, - "maximum": 10100, - "default": 10001, - "description": "LDMS sampler port on compute nodes. Valid range: 10001-10100. Default: 10001" + }, + "required": ["idrac", "ldms", "dcgm", "powerscale"] }, - "powerscale_configurations": { + "telemetry_bridges": { "type": "object", + "description": "Data routers — Vector pods that consume from Kafka and produce to Victoria sinks.", "properties": { - "powerscale_telemetry_support": { - "type": "boolean", - "default": true, - "description": "Enable or disable PowerScale telemetry support. Requires csi_driver_powerscale in software_config.json." - }, - "powerscale_log_enabled": { - "type": "boolean", - "default": false, - "description": "Enable or disable PowerScale log collection (syslog to VictoriaLogs). Requires powerscale_telemetry_support: true." + "vector_ldms": { + "type": "object", + "description": "Vector-LDMS: Kafka-to-victoria_metrics bridge for LDMS metrics.", + "properties": { + "metrics_enabled": { + "type": "boolean", + "default": true, + "description": "Enable or disable Vector-LDMS bridge." + } + }, + "required": ["metrics_enabled"] }, - "otel_collector_storage_size": { - "type": "string", - "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", - "default": "5Gi", - "description": "PVC size for OTEL Collector metric batching and buffering." 
+ "vector_ome": { + "type": "object", + "description": "Vector-OME: Kafka-to-Victoria bridge for OME metrics and logs.", + "properties": { + "metrics_enabled": { + "type": "boolean", + "default": true, + "description": "Enable or disable Vector-OME metrics routing." + }, + "logs_enabled": { + "type": "boolean", + "default": true, + "description": "Enable or disable Vector-OME logs routing." + }, + "ome_identifier": { + "type": "string", + "default": "ome", + "minLength": 1, + "description": "Identifier for OME topic matching (e.g., regex '^ome\\\\..*$'). Change only if OME Kafka topics use a different prefix." + } + }, + "required": ["metrics_enabled", "logs_enabled", "ome_identifier"] + } + }, + "required": ["vector_ldms", "vector_ome"] + }, + "telemetry_sinks": { + "type": "object", + "description": "Storage backends — auto-enabled when at least one source targets them.", + "properties": { + "victoria_metrics": { + "type": "object", + "description": "victoria_metrics time-series database for metrics.", + "properties": { + "persistence_size": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", + "default": "8Gi", + "description": "Storage per vmstorage pod PVC. Cluster total = persistence_size × 3 pods." + }, + "retention_period": { + "type": "integer", + "minimum": 24, + "default": 168, + "description": "Metric retention period in hours. Default: 168 (7 days)." + } + }, + "required": ["persistence_size", "retention_period"] }, - "csm_observability_values_file_path": { - "type": "string", - "description": "Path to the user-provided Helm values file for karavi-observability chart. Required when powerscale_telemetry_support is true." + "victoria_logs": { + "type": "object", + "description": "victoria_logs centralized log storage.", + "properties": { + "storage_size": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", + "default": "8Gi", + "description": "Storage per vlstorage pod PVC. 
Total = storage_size × 3 pods.", + "errorMessage": "storage_size must be a valid Kubernetes PVC size (e.g., '8Gi', '50Gi', '1Ti')" + }, + "retention_period": { + "type": "integer", + "minimum": 24, + "default": 168, + "description": "Log retention period in hours. Default: 168 (7 days)." + } + }, + "required": ["storage_size", "retention_period"] }, - "additional_remote_write_endpoints": { - "type": "array", - "default": [], - "items": { - "type": "object", - "properties": { - "url": { - "type": "string", - "pattern": "^https?://", - "description": "VictoriaMetrics remote_write endpoint URL." - }, - "tls_insecure_skip_verify": { - "type": "boolean", - "default": false, - "description": "Skip TLS certificate verification for this endpoint." - } + "kafka": { + "type": "object", + "description": "Apache Kafka distributed streaming platform.", + "properties": { + "persistence_size": { + "type": "string", + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", + "default": "8Gi", + "description": "Storage per Kafka pod PVC. Total = persistence_size × 6 pods (3 brokers + 3 controllers)." }, - "required": ["url"] + "log_retention_hours": { + "type": "integer", + "minimum": 1, + "default": 168, + "description": "Kafka log retention in hours. Default: 168 (7 days)." + }, + "log_retention_bytes": { + "type": "integer", + "default": -1, + "description": "Maximum size of Kafka logs (bytes) before deletion. Default: -1 (unlimited)." + }, + "log_segment_bytes": { + "type": "integer", + "minimum": 1, + "default": 1073741824, + "description": "Maximum size of Kafka log segments (bytes). Default: 1073741824 (1 GB)." + }, + "topic_partitions": { + "type": "object", + "description": "Topic partition counts keyed by source name. Only sources with kafka in collection_targets get topics.", + "properties": { + "idrac": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "default": 1, + "description": "Number of partitions for the 'idrac' topic." 
+ }, + "ldms": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "default": 2, + "description": "Number of partitions for the 'ldms' topic." + } + }, + "additionalProperties": false, + "errorMessage": "Only 'idrac' and 'ldms' topic partitions are supported." + } }, - "description": "Additional VictoriaMetrics remote_write endpoints. vmagent writes to all configured endpoints." + "required": ["persistence_size", "log_retention_hours", "log_retention_bytes", "log_segment_bytes", "topic_partitions"] } }, - "required": ["powerscale_telemetry_support", "otel_collector_storage_size", "csm_observability_values_file_path"] + "required": ["victoria_metrics", "victoria_logs", "kafka"] }, - "ldms_sampler_configurations": { - "anyOf": [ - { - "type": "null", - "description": "LDMS sampler configurations can be null if no LDMS monitoring is needed" + "ldms_configurations": { + "type": "object", + "description": "LDMS-specific configurations for aggregator, store, and sampler.", + "properties": { + "agg_port": { + "type": "integer", + "minimum": 6001, + "maximum": 6100, + "default": 6001, + "description": "LDMS Aggregator port on service K8s cluster. Valid range: 6001-6100." + }, + "store_port": { + "type": "integer", + "minimum": 6001, + "maximum": 6100, + "default": 6001, + "description": "LDMS store daemon port. Valid range: 6001-6100." }, - { + "sampler_port": { + "type": "integer", + "minimum": 10001, + "maximum": 10100, + "default": 10001, + "description": "LDMS sampler port on compute nodes. Valid range: 10001-10100." + }, + "sampler_plugins": { "type": "array", - "description": "LDMS-specific sampler configurations (string-based)", + "description": "LDMS sampler plugin configurations.", "items": { "type": "object", "properties": { @@ -113,21 +280,21 @@ "slurm_sampler", "procnetdev2" ], - "description": "Name of the LDMS sampler plugin. 
Must be one of the 6 supported plugin types: meminfo (memory usage), procstat2 (process statistics), vmstat (virtual memory), loadavg (system load), slurm_sampler (HPC workload monitoring), procnetdev2 (network interface statistics). Cannot be empty.", + "description": "Name of the LDMS sampler plugin.", "errorMessage": { "enum": "Invalid plugin_name. Only 6 plugins are supported: meminfo, procstat2, vmstat, loadavg, slurm_sampler, procnetdev2", - "minLength": "plugin_name cannot be empty. Must be one of: meminfo, procstat2, vmstat, loadavg, slurm_sampler, procnetdev2" + "minLength": "plugin_name cannot be empty." } }, "config_parameters": { "type": "string", - "description": "Plugin-specific configuration parameters represented as a single string (e.g., 'component_id=2 stream=slurm job_count=8 task_count=8')" + "description": "Plugin-specific configuration parameters as a single string." }, "activation_parameters": { "type": "string", - "description": "Activation parameters as a string (e.g., 'interval=1000000 offset=0'). Format: 'interval=' with optional 'offset=' separated by space.", + "description": "Activation parameters (e.g., 'interval=30000000 offset=0').", "pattern": "^interval=[1-9][0-9]*(?:\\s+offset=[0-9]+)?$", - "errorMessage": "Must be in format 'interval=' or 'interval= offset='. Example: 'interval=1000000' or 'interval=1000000 offset=0'" + "errorMessage": "Must be in format 'interval=' or 'interval= offset='." } }, "required": ["plugin_name", "activation_parameters"], @@ -144,7 +311,7 @@ "config_parameters": { "type": "string", "pattern": "^(?=.*\\bcomponent_id=\\b)(?=.*\\bstream=\\b)(?=.*\\bjob_count=\\b)(?=.*\\btask_count=\\b).*$", - "description": "Must include component_id, stream, job_count, and task_count in the string" + "description": "Must include component_id, stream, job_count, and task_count." 
} } } @@ -162,7 +329,7 @@ "config_parameters": { "type": "string", "pattern": "^(|.*\\bifaces=[a-zA-Z0-9_,]+\\b.*)$", - "description": "Optional comma-separated list of network interfaces (e.g., 'ifaces=eth0,eth1')" + "description": "Optional comma-separated list of network interfaces." } } } @@ -170,162 +337,56 @@ ] } } - ] - } - }, - "required": ["idrac_telemetry_support", "dcgm_support", "telemetry_collection_type", "ldms_sampler_configurations", "ldms_agg_port", "ldms_store_port", "ldms_sampler_port", "powerscale_configurations" ], - "$defs": { - "kafka_configurations": { + }, + "required": ["agg_port", "store_port", "sampler_port", "sampler_plugins"] + }, + "powerscale_configurations": { "type": "object", + "description": "PowerScale telemetry detailed configurations.", "properties": { - "persistence_size": { + "otel_collector_storage_size": { "type": "string", - "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$" - }, - "log_retention_hours": { - "type": "integer", - "minimum": 1 - }, - "log_retention_bytes": { - "type": "integer" + "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", + "default": "5Gi", + "description": "PVC size for OTEL Collector metric batching and buffering." }, - "log_segment_bytes": { - "type": "integer" + "csm_observability_values_file_path": { + "type": "string", + "description": "Path to the CSM Observability (Karavi) values.yaml file." }, - "topic_partitions": { + "additional_remote_write_endpoints": { "type": "array", - "minItems": 1, - "maxItems": 2, + "default": [], "items": { "type": "object", "properties": { - "name": { + "url": { "type": "string", - "enum": ["idrac", "ldms"], - "description": "CONSTANT: Fixed topic names that cannot be changed. Only 'idrac' and 'ldms' are allowed.", - "errorMessage": { - "enum": "Invalid topic name. Only 'idrac' and 'ldms' are allowed as Kafka topic names. Custom topic names are not supported." - } + "pattern": "^https?://", + "description": "victoria_metrics remote_write endpoint URL." 
}, - "partitions": { - "type": "integer", - "minimum": 1, - "maximum": 100, - "description": "Number of partitions for the topic (1-100). This is the only configurable parameter." + "tls_insecure_skip_verify": { + "type": "boolean", + "default": false, + "description": "Skip TLS certificate verification for this endpoint." } }, - "required": ["name", "partitions"], - "additionalProperties": false, - "errorMessage": { - "required": { - "name": "Topic 'name' is required and must be one of: 'idrac', 'ldms'", - "partitions": "Topic 'partitions' is required and must be between 1-100" - } - } + "required": ["url"] }, - "uniqueItems": true, - "description": "IMPORTANT: At least one Kafka topic must be defined. Topic names 'idrac' and 'ldms' are CONSTANTS. 'idrac' is required if idrac_telemetry_support is true and kafka is in telemetry_collection_type. 'ldms' is required if LDMS software is configured in software_config.json (automatic detection). Only partition counts can be changed.", - "errorMessage": { - "minItems": "At least 1 Kafka topic must be defined. Configure based on enabled features.", - "maxItems": "Maximum 2 topics allowed: 'idrac' and 'ldms'", - "uniqueItems": "Each topic (idrac, ldms) must appear only once" - } - } - }, - "required": [ - "persistence_size", - "log_retention_hours", - "log_retention_bytes", - "log_segment_bytes", - "topic_partitions" - ], - "additionalProperties": false - }, - "victoria_metrics_configurations": { - "type": "object", - "properties": { - "persistence_size": { - "type": "string", - "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$" - }, - "retention_period": { - "type": "integer", - "minimum": 24 - } - }, - "required": [ - "persistence_size", - "retention_period" - ], - "additionalProperties": false - }, - "victoria_logs_configurations": { - "type": "object", - "description": "VictoriaLogs cluster mode configuration. 
Deployed alongside VictoriaMetrics when 'victoria' is in collection type.", - "properties": { - "storage_size": { - "type": "string", - "pattern": "^[0-9]+(Ki|Mi|Gi|Ti|Pi|Ei)$", - "default": "8Gi", - "description": "Storage size per vlstorage replica PVC. Total = storage_size x 3 replicas.", - "errorMessage": "storage_size must be a valid Kubernetes PVC size (e.g., '8Gi', '50Gi', '1Ti')" + "description": "Additional victoria_metrics remote_write endpoints." }, - "retention_period": { - "type": "integer", - "minimum": 24 + "syslog_source_ips": { + "type": "array", + "default": [], + "items": { + "type": "string", + "format": "ipv4" + }, + "description": "PowerScale IP address(es) from which syslog packets arrive. Optional: if empty, rsyslog accepts syslog from any source IP. If provided, rsyslog filters by these IPs for security." } }, - "required": ["storage_size", "retention_period"], - "additionalProperties": false + "required": ["otel_collector_storage_size", "csm_observability_values_file_path"] } }, - "allOf": [ - { - "if": { - "properties": { - "idrac_telemetry_support": { "const": true }, - "telemetry_collection_type": { "pattern": "(?i)^kafka$" } - } - }, - "then": { - "required": ["kafka_configurations"], - "properties": { - "kafka_configurations": { "$ref": "#/$defs/kafka_configurations" } - } - } - }, - { - "if": { - "properties": { - "idrac_telemetry_support": { "const": true }, - "telemetry_collection_type": { "pattern": "(?i)^victoria$" } - } - }, - "then": { - "required": ["victoria_metrics_configurations", "victoria_logs_configurations"], - "properties": { - "victoria_metrics_configurations": { "$ref": "#/$defs/victoria_metrics_configurations" }, - "victoria_logs_configurations": { "$ref": "#/$defs/victoria_logs_configurations" } - } - } - }, - { - "if": { - "properties": { - "idrac_telemetry_support": { "const": true }, - "telemetry_collection_type": { - "pattern": "(?i)^(victoria,kafka|kafka,victoria)$" - } - } - }, - "then": { - "required": 
["kafka_configurations", "victoria_metrics_configurations", "victoria_logs_configurations"], - "properties": { - "kafka_configurations": { "$ref": "#/$defs/kafka_configurations" }, - "victoria_metrics_configurations": { "$ref": "#/$defs/victoria_metrics_configurations" }, - "victoria_logs_configurations": { "$ref": "#/$defs/victoria_logs_configurations" } - } - } - } - ] + "required": ["telemetry_sources", "telemetry_bridges", "telemetry_sinks", "ldms_configurations", "powerscale_configurations"] } diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index da3d0d73bf..e2e53925c0 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -21,12 +21,12 @@ import ipaddress import json import os -import re from collections import Counter import yaml import ansible.module_utils.input_validation.common_utils.data_fetch as fetch from ansible.module_utils.input_validation.validation_flows import csi_driver_validation +from ansible.module_utils.input_validation.validation_flows import powerscale_authorization_validation import ansible.module_utils.input_validation.common_utils.data_validation as validate from ansible.module_utils.input_validation.common_utils import ( config, @@ -253,23 +253,15 @@ def validate_software_config( for software_pkg in data['softwares']: software = software_pkg['name'] arch_list = software_pkg.get('arch') - # Get software version for versioned JSON files (e.g., service_k8s_v1.35.1.json) - software_version = software_pkg.get('version') for arch in arch_list: json_path = get_json_file_path( - software, cluster_os_type, cluster_os_version, input_file_path, arch, - software_version=software_version) + software, cluster_os_type, cluster_os_version, input_file_path, arch) # Check if json_path is None or if the JSON syntax is invalid if not json_path: - # Construct expected filename for error message - if software == "service_k8s" and software_version: - expected_file = f"{software}_v{software_version}.json" - else: - expected_file = f"{software}.json" errors.append( create_error_msg( "Validation Error: ", software, - f"is present in software_config.json. JSON file not found: {expected_file}" + f"is present in software_config.json. JSON file not found: {software}.json" ) ) else: @@ -330,7 +322,7 @@ def validate_software_config( "Please resolve the issues first before proceeding.", ) ) - + if additional_packages_warnings: logger.info( "[INFO] Additional packages validation completed with warnings. 
" @@ -923,8 +915,8 @@ def is_ip_in_range(ip_str, ip_range_str): return False -def validate_k8s(data, admin_networks, softwares, ha_config, tag_names, errors, - st_config, module, input_file_path): +def validate_k8s(data, admin_networks, softwares, ha_config, tag_names, errors, + st_config, module, input_file_path, logger): """ Validates Kubernetes cluster configurations. @@ -933,6 +925,7 @@ def validate_k8s(data, admin_networks, softwares, ha_config, tag_names, errors, admin_networks (dict): A dictionary containing admin network information. softwares (list): A list of software name sin software_config. errors (list): A list to store error messages. + logger (object): Logger object for logging. """ admin_dynamic_range = admin_networks["admin_network"]["dynamic_range"] primary_oim_admin_ip = admin_networks["admin_network"]["primary_oim_admin_ip"] @@ -1027,7 +1020,7 @@ def validate_k8s(data, admin_networks, softwares, ha_config, tag_names, errors, csi_secret_file_path = kluster.get("csi_powerscale_driver_secret_file_path") csi_values_file_path = kluster.get("csi_powerscale_driver_values_file_path") - + # Validate secret file path if not csi_secret_file_path or \ not csi_secret_file_path.strip() or \ @@ -1053,6 +1046,15 @@ def validate_k8s(data, admin_networks, softwares, ha_config, tag_names, errors, ) csi_driver_validation.validate_powerscale_secret_and_values_file(csi_secret_file_path,csi_values_file_path, errors, input_file_path) + # PowerScale Authorization validation + input_dir = os.path.dirname(input_file_path) + software_config_file_path = os.path.join(input_dir, "software_config.json") + config_paths = get_config_file_paths(input_dir, data, software_config_file_path) + + powerscale_authorization_validation.validate_powerscale_authorization( + kluster, softwares, input_file_path, config_paths, logger, errors + ) + def validate_omnia_config( input_file_path, data, @@ -1111,9 +1113,9 @@ def validate_omnia_config( for k in ["service_k8s_cluster_ha"]: 
ha_config[k] = [xha["cluster_name"] for xha in ha_config.get(k, [])] validate_k8s(data, admin_networks, sw_list, ha_config, tag_names, - errors, st_config, module, input_file_path) + errors, st_config, module, input_file_path, logger) # slurm L2 - if (("slurm" in sw_list or "slurm_custom" in sw_list) and "slurm" in tag_names): + if (("slurm" in sw_list or "slurm_custom" in sw_list) and "slurm" in tag_names): slurm_nfs = [clst.get('nfs_storage_name') for clst in data.get('slurm_cluster')] nfs_names = [st.get('nfs_name') for st in st_config.get('nfs_client_params')] @@ -1125,16 +1127,16 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - + # Validate node_hardware_defaults requires node_discovery_mode=homogeneous for clst in data.get('slurm_cluster', []): node_hardware_defaults = clst.get('node_hardware_defaults') node_discovery_mode = clst.get('node_discovery_mode') - + # Normalize mode to lowercase for case-insensitive comparison if node_discovery_mode and isinstance(node_discovery_mode, str): node_discovery_mode = node_discovery_mode.lower() - + if node_hardware_defaults and len(node_hardware_defaults) > 0: if not node_discovery_mode or node_discovery_mode != 'homogeneous': group_names = list(node_hardware_defaults.keys()) @@ -1147,7 +1149,7 @@ def validate_omnia_config( f"Either set 'node_discovery_mode: \"homogeneous\"' to use the hardware specifications, " f"or remove 'node_hardware_defaults' to use heterogeneous discovery." 
)) - + cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] @@ -1201,7 +1203,7 @@ def check_is_service_cluster_functional_groups_defined( # Get the directory containing the input file input_dir = os.path.dirname(input_file_path) provision_config_path = os.path.join(input_dir, "provision_config.yml") - + # Check if provision_config.yml exists if not os.path.exists(provision_config_path): errors.append( @@ -1212,14 +1214,14 @@ def check_is_service_cluster_functional_groups_defined( ) ) return False - + try: # Load provision_config.yml to get pxe_mapping_file_path with open(provision_config_path, 'r', encoding='utf-8') as f: provision_config = yaml.safe_load(f) - + pxe_mapping_file_path = provision_config.get('pxe_mapping_file_path', '') - + if not pxe_mapping_file_path or not os.path.exists(pxe_mapping_file_path): errors.append( create_error_msg( @@ -1229,14 +1231,14 @@ def check_is_service_cluster_functional_groups_defined( ) ) return False - + # Read the mapping file and check for service_kube_node functional groups with open(pxe_mapping_file_path, 'r', encoding='utf-8') as fh: raw_lines = fh.readlines() - + # Remove blank lines non_comment_lines = [ln for ln in raw_lines if ln.strip()] - + if not non_comment_lines: errors.append( create_error_msg( @@ -1246,15 +1248,15 @@ def check_is_service_cluster_functional_groups_defined( ) ) return False - + # Use csv.DictReader to parse the mapping file reader = csv.DictReader(non_comment_lines) - + # Check if all required service cluster functional groups are present # Required: service_kube_node_, service_kube_control_plane_ has_kube_node = False has_control_plane = False - + for row in reader: functional_group = row.get('FUNCTIONAL_GROUP_NAME', '').strip() if functional_group.startswith('service_kube_node_'): @@ -1263,10 
+1265,10 @@ def check_is_service_cluster_functional_groups_defined( elif functional_group.startswith('service_kube_control_plane_'): has_control_plane = True logger.info(f"Service cluster functional group found: {functional_group}") - + # Both must be present for a complete service cluster service_cluster_found = has_kube_node and has_control_plane - + if not service_cluster_found: missing = [] if not has_kube_node: @@ -1274,9 +1276,9 @@ def check_is_service_cluster_functional_groups_defined( if not has_control_plane: missing.append('service_kube_control_plane_*') logger.info(f"Service cluster incomplete. Missing functional groups: {', '.join(missing)}") - + return service_cluster_found - + except (yaml.YAMLError, IOError, csv.Error) as e: errors.append( create_error_msg( @@ -1287,6 +1289,42 @@ def check_is_service_cluster_functional_groups_defined( ) return False +def get_config_file_paths(input_dir, data, software_config_file_path): + """ + Dynamically resolves config file paths based on cluster OS type and version. + + Args: + input_dir (str): Input directory path. + data (dict): Configuration data (may contain cluster_os_type, cluster_os_version). + software_config_file_path (str): Path to software_config.json. 
+ + Returns: + dict: Dictionary containing resolved file paths: + - service_k8s_json_path: Path to service_k8s.json + - csi_driver_powerscale_json_path: Path to csi_driver_powerscale.json + """ + # Try reading cluster_os_type/version from data first, then from software_config.json + cluster_os_type = data.get("cluster_os_type", "rhel") + cluster_os_version = data.get("cluster_os_version", "10.0") + + if os.path.exists(software_config_file_path): + try: + with open(software_config_file_path, 'r', encoding='utf-8') as scf: + sc_data = json.load(scf) + cluster_os_type = sc_data.get("cluster_os_type", cluster_os_type) + cluster_os_version = sc_data.get("cluster_os_version", cluster_os_version) + except (json.JSONDecodeError, IOError): + pass + + config_base_path = os.path.join(input_dir, "config", "x86_64", cluster_os_type, cluster_os_version) + service_k8s_json_path = os.path.join(config_base_path, "service_k8s.json") + csi_driver_powerscale_json_path = os.path.join(config_base_path, "csi_driver_powerscale.json") + + return { + "service_k8s_json_path": service_k8s_json_path, + "csi_driver_powerscale_json_path": csi_driver_powerscale_json_path + } + def check_is_slurm_cluster_functional_groups_defined( errors, input_file_path, omnia_base_dir, project_name, logger, module ): @@ -1307,7 +1345,7 @@ def check_is_slurm_cluster_functional_groups_defined( # Get the directory containing the input file input_dir = os.path.dirname(input_file_path) provision_config_path = os.path.join(input_dir, "provision_config.yml") - + # Check if provision_config.yml exists if not os.path.exists(provision_config_path): errors.append( @@ -1318,14 +1356,14 @@ def check_is_slurm_cluster_functional_groups_defined( ) ) return False - + try: # Load provision_config.yml to get pxe_mapping_file_path with open(provision_config_path, 'r', encoding='utf-8') as f: provision_config = yaml.safe_load(f) - + pxe_mapping_file_path = provision_config.get('pxe_mapping_file_path', '') - + if not 
pxe_mapping_file_path or not os.path.exists(pxe_mapping_file_path): errors.append( create_error_msg( @@ -1335,14 +1373,14 @@ def check_is_slurm_cluster_functional_groups_defined( ) ) return False - + # Read the mapping file and check for slurm functional groups with open(pxe_mapping_file_path, 'r', encoding='utf-8') as fh: raw_lines = fh.readlines() - + # Remove blank lines non_comment_lines = [ln for ln in raw_lines if ln.strip()] - + if not non_comment_lines: errors.append( create_error_msg( @@ -1352,15 +1390,15 @@ def check_is_slurm_cluster_functional_groups_defined( ) ) return False - + # Use csv.DictReader to parse the mapping file reader = csv.DictReader(non_comment_lines) - + # Check if all required slurm cluster functional groups are present # Required: slurm_control_node_, slurm_node has_slurm_control = False has_slurm_node = False - + for row in reader: functional_group = row.get('FUNCTIONAL_GROUP_NAME', '').strip() if functional_group.startswith('slurm_control_node_'): @@ -1369,10 +1407,10 @@ def check_is_slurm_cluster_functional_groups_defined( elif functional_group.startswith('slurm_node_'): has_slurm_node = True logger.info(f"Slurm cluster functional group found: {functional_group}") - + # Both must be present for a complete slurm cluster slurm_cluster_found = has_slurm_control and has_slurm_node - + if not slurm_cluster_found: missing = [] if not has_slurm_control: @@ -1380,9 +1418,9 @@ def check_is_slurm_cluster_functional_groups_defined( if not has_slurm_node: missing.append('slurm_node_') logger.info(f"Slurm cluster incomplete. 
Missing functional groups: {', '.join(missing)}") - + return slurm_cluster_found - + except (yaml.YAMLError, IOError, csv.Error) as e: errors.append( create_error_msg( @@ -1393,477 +1431,6 @@ def check_is_slurm_cluster_functional_groups_defined( ) return False -def validate_telemetry_config( - input_file_path, - data, - logger, - module, - omnia_base_dir, - _module_utils_base, - project_name -): - - """ - Validates the telemetry configuration data. - - This function checks the telemetry configuration data for validity and consistency. - It verifies that the iDRAC telemetry support and federated iDRAC telemetry collection - settings are correctly configured. - - Args: - input_file_path (str): The path to the input file. - data (dict): The telemetry configuration data. - logger (object): The logger object. - module (object): The module object. - omnia_base_dir (str): The base directory of the Omnia project. - _module_utils_base (str): The base directory of the module utilities. - project_name (str): The name of the project. 
- - Returns: - None - - Raises: - None - - """ - errors = [] - - idrac_telemetry_support = data.get("idrac_telemetry_support") - is_service_cluster_defined = check_is_service_cluster_functional_groups_defined(errors, - input_file_path, - omnia_base_dir, - project_name, - logger, - module) - if idrac_telemetry_support and not is_service_cluster_defined: - errors.append(create_error_msg( - "idrac_telemetry_support can be", - idrac_telemetry_support, - en_us_validation_msg.TELEMETRY_SERVICE_CLUSTER_ENTRY_MISSING_ROLES_CONFIG_MSG - ) - ) - - is_slurm_cluster_defined = check_is_slurm_cluster_functional_groups_defined(errors, - input_file_path, - omnia_base_dir, - project_name, - logger, - module) - - # Determine LDMS support from software_config.json - # software_config.json is in the same directory as telemetry_config.yml - ldms_support_from_software_config = False - input_dir = os.path.dirname(input_file_path) - software_config_file_path = os.path.join(input_dir, "software_config.json") - - logger.info(f"Checking for LDMS software in: {software_config_file_path}") - - if os.path.exists(software_config_file_path): - try: - with open(software_config_file_path, 'r', encoding='utf-8') as f: - software_config = json.load(f) - softwares = software_config.get("softwares", []) - ldms_support_from_software_config = any( - software.get("name") == "ldms" for software in softwares - ) - logger.info(f"LDMS software detected in software_config.json: {ldms_support_from_software_config}") - if ldms_support_from_software_config: - logger.info("LDMS software found - 'ldms' topic will be required in kafka_configurations.topic_partitions") - except (json.JSONDecodeError, IOError) as e: - logger.warn(f"Could not load software_config.json: {e}") - else: - logger.info(f"software_config.json not found at: {software_config_file_path}") - - if ldms_support_from_software_config and not (is_service_cluster_defined and is_slurm_cluster_defined): - errors.append(create_error_msg( - "LDMS entry in 
software_config.json set to ", - ldms_support_from_software_config, - en_us_validation_msg.TELEMETRY_SERVICE_CLUSTER_ENTRY_FOR_LDMS_MISSING_ROLES_CONFIG_MSG - ) - ) - - # Validate topic_partitions configuration - kafka_config = data.get("kafka_configurations", {}) - topic_partitions = kafka_config.get("topic_partitions", []) - telemetry_collection_type = data.get("telemetry_collection_type", "") - - # Check if LDMS software is configured but kafka_configurations is missing entirely - if ldms_support_from_software_config and not kafka_config: - errors.append(create_error_msg( - "kafka_configurations", - "not defined", - "LDMS software is configured in software_config.json, but kafka_configurations section is missing in telemetry_config.yml. " - "Please define kafka_configurations with at least the 'ldms' topic in topic_partitions." - )) - - # Check if LDMS software is configured but no topics are defined - if ldms_support_from_software_config and kafka_config and not topic_partitions: - errors.append(create_error_msg( - "kafka_configurations.topic_partitions", - "not defined", - "LDMS software is configured in software_config.json, but kafka_configurations.topic_partitions is not defined. " - "Please define at least the 'ldms' topic in topic_partitions." 
- )) - - if topic_partitions: - # Ensure at least one topic is defined - if len(topic_partitions) < 1: - errors.append(create_error_msg( - "kafka_configurations.topic_partitions", - "is empty", - "At least one Kafka topic must be defined" - )) - - # Collect topic names and validate each one - topic_names = [] - allowed_topics = {"idrac", "ldms"} - - for idx, topic in enumerate(topic_partitions): - if "name" not in topic: - errors.append(create_error_msg( - f"kafka_configurations.topic_partitions[{idx}]", - "missing 'name' field", - "Each topic must have a 'name' field" - )) - continue - - topic_name = topic.get("name") - topic_names.append(topic_name) - - # Validate each topic name individually - if topic_name not in allowed_topics: - errors.append(create_error_msg( - f"kafka_configurations.topic_partitions[{idx}].name", - topic_name, - f"Invalid topic name '{topic_name}'. Only 'idrac' and 'ldms' are allowed as Kafka topic names. Custom topic names are not supported." - )) - - present_topics = set(topic_names) - - # Debug logging - logger.info(f"Telemetry validation - Present topics: {present_topics}") - logger.info(f"Telemetry validation - Allowed topics: {allowed_topics}") - - # Validate required topics based on feature flags - # If iDRAC telemetry is enabled with Kafka, idrac topic is required - if idrac_telemetry_support and 'kafka' in telemetry_collection_type.split(','): - if 'idrac' not in present_topics: - errors.append(create_error_msg( - "kafka_configurations.topic_partitions", - "missing 'idrac' topic", - "idrac topic is required when idrac_telemetry_support is true and 'kafka' is in telemetry_collection_type" - )) - - # If LDMS software is configured in software_config.json, ldms topic is required - logger.info(f"Checking LDMS topic requirement - ldms_support_from_software_config: {ldms_support_from_software_config}") - if ldms_support_from_software_config and 'ldms' not in present_topics: - logger.error(f"LDMS topic validation FAILED - 'ldms' topic is 
missing from present_topics: {present_topics}") - errors.append(create_error_msg( - "kafka_configurations.topic_partitions", - "missing 'ldms' topic", - "ldms topic is required when LDMS software is configured in software_config.json" - )) - elif ldms_support_from_software_config: - logger.info(f"LDMS topic validation PASSED - 'ldms' found in present_topics: {present_topics}") - - # Check for duplicate topic names - if len(topic_names) != len(set(topic_names)): - duplicates = [name for name in topic_names if topic_names.count(name) > 1] - errors.append(create_error_msg( - "kafka_configurations.topic_partitions", - f"duplicate topics: {', '.join(set(duplicates))}", - "Each topic must be defined only once" - )) - - # Validate ldms_sampler_configurations - fail if it's None or empty array - ldms_sampler_configurations = data.get("ldms_sampler_configurations") - - # Fail if ldms_sampler_configurations is None - if ldms_sampler_configurations is None: - errors.append(create_error_msg( - "ldms_sampler_configurations", - "null/None", - "ldms_sampler_configurations is required and cannot be null. Please provide valid sampler configurations with plugin names." - )) - # Fail if ldms_sampler_configurations is an empty array - elif isinstance(ldms_sampler_configurations, list): - if len(ldms_sampler_configurations) == 0: - errors.append(create_error_msg( - "ldms_sampler_configurations", - "empty array []", - "ldms_sampler_configurations cannot be an empty array. Please provide at least one valid sampler configuration with plugin names." 
- )) - else: - # Validate each sampler configuration for empty plugin_name - for idx, config in enumerate(ldms_sampler_configurations): - if not isinstance(config, dict): - continue - - plugin_name = config.get("plugin_name", "") - if not plugin_name or (isinstance(plugin_name, str) and plugin_name.strip() == ""): - errors.append(create_error_msg( - f"ldms_sampler_configurations[{idx}].plugin_name", - f"'{plugin_name}'", - "plugin_name cannot be empty. Must be one of: meminfo, procstat2, vmstat, loadavg, slurm_sampler, procnetdev2" - )) - - # Validate PowerScale telemetry configuration - powerscale_config = data.get("powerscale_configurations") - if not powerscale_config: - errors.append(create_error_msg( - "powerscale_configurations", - "not defined", - en_us_validation_msg.POWERSCALE_CONFIGURATIONS_MISSING_MSG - )) - else: - powerscale_telemetry_support = powerscale_config.get("powerscale_telemetry_support", False) - - if powerscale_telemetry_support: - logger.info("PowerScale telemetry support is enabled, performing PowerScale validation") - - # Check victoria is in telemetry_collection_type - # PowerScale telemetry pipeline requires VictoriaMetrics (writes to vminsert via shared vmagent) - collection_types = [t.strip() for t in telemetry_collection_type.split(',')] - if 'victoria' not in collection_types: - errors.append(create_error_msg( - "telemetry_collection_type", - telemetry_collection_type, - en_us_validation_msg.POWERSCALE_VICTORIA_REQUIRED_MSG - )) - - # Check CSI driver PowerScale is in software_config.json - csi_powerscale_found = False - if os.path.exists(software_config_file_path): - try: - with open(software_config_file_path, 'r', encoding='utf-8') as f: - software_config = json.load(f) - softwares = software_config.get("softwares", []) - csi_powerscale_found = any( - software.get("name") == "csi_driver_powerscale" for software in softwares - ) - except (json.JSONDecodeError, IOError) as e: - logger.warn(f"Could not load software_config.json for 
PowerScale validation: {e}") - - if not csi_powerscale_found: - errors.append(create_error_msg( - "powerscale_configurations.powerscale_telemetry_support", - powerscale_telemetry_support, - en_us_validation_msg.POWERSCALE_CSI_DRIVER_MISSING_MSG - )) - - # Check service cluster is defined - if not is_service_cluster_defined: - errors.append(create_error_msg( - "powerscale_configurations.powerscale_telemetry_support", - powerscale_telemetry_support, - en_us_validation_msg.POWERSCALE_SERVICE_CLUSTER_MISSING_MSG - )) - - # Validate otel_collector_storage_size - otel_storage = powerscale_config.get("otel_collector_storage_size", "") - if not otel_storage or not isinstance(otel_storage, str): - errors.append(create_error_msg( - "powerscale_configurations.otel_collector_storage_size", - otel_storage, - en_us_validation_msg.POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG - )) - - # Validate csm_observability_values_file_path - csm_values_path = powerscale_config.get("csm_observability_values_file_path", "") - if not csm_values_path or not isinstance(csm_values_path, str) or csm_values_path.strip() == "": - errors.append(create_error_msg( - "powerscale_configurations.csm_observability_values_file_path", - csm_values_path, - en_us_validation_msg.POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG - )) - elif not os.path.exists(csm_values_path): - errors.append(create_error_msg( - "powerscale_configurations.csm_observability_values_file_path", - csm_values_path, - en_us_validation_msg.powerscale_csm_values_not_found_msg(csm_values_path) - )) - else: - # Validate the CSM Observability values.yaml content - try: - with open(csm_values_path, 'r', encoding='utf-8') as f: - csm_values = yaml.safe_load(f) - if not isinstance(csm_values, dict): - errors.append(create_error_msg( - "powerscale_configurations.csm_observability_values_file_path", - csm_values_path, - en_us_validation_msg.POWERSCALE_CSM_VALUES_INVALID_YAML_MSG - )) - else: - # Validate required keys - karavi_metrics = 
csm_values.get("karaviMetricsPowerscale", {}) - if not karavi_metrics: - errors.append(create_error_msg( - "csm_observability_values_file_path", - csm_values_path, - en_us_validation_msg.POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG - )) - else: - # Validate image reference exists - if not karavi_metrics.get("image"): - errors.append(create_error_msg( - "karaviMetricsPowerscale.image", - "not defined", - en_us_validation_msg.POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG - )) - - otel_config = csm_values.get("otelCollector", {}) - if not otel_config or not otel_config.get("image"): - errors.append(create_error_msg( - "otelCollector.image", - "not defined", - en_us_validation_msg.POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG - )) - - # Validate Karavi Authorization config in Helm values - karavi_auth = karavi_metrics.get("authorization", {}) if karavi_metrics else {} - if karavi_auth.get("enabled", False): - proxy_host = karavi_auth.get("proxyHost", "") - if not proxy_host or not isinstance(proxy_host, str) or proxy_host.strip() == "": - errors.append(create_error_msg( - "karaviMetricsPowerscale.authorization.proxyHost", - proxy_host, - en_us_validation_msg.POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG - )) - - # Cross-validate image versions between values.yaml and service_k8s.json - service_k8s_json_path = os.path.join( - input_dir, "config", "x86_64", - data.get("cluster_os_type", "rhel") if "cluster_os_type" in data else "rhel", - data.get("cluster_os_version", "10.0") if "cluster_os_version" in data else "10.0", - "service_k8s.json" - ) - # Try reading cluster_os_type/version from software_config.json - if os.path.exists(software_config_file_path): - try: - with open(software_config_file_path, 'r', encoding='utf-8') as scf: - sc_data = json.load(scf) - sc_os_type = sc_data.get("cluster_os_type", "rhel") - sc_os_version = sc_data.get("cluster_os_version", "10.0") - service_k8s_json_path = os.path.join( - input_dir, "config", "x86_64", - sc_os_type, sc_os_version, 
"service_k8s.json" - ) - except (json.JSONDecodeError, IOError): - pass - - if os.path.exists(service_k8s_json_path): - try: - with open(service_k8s_json_path, 'r', encoding='utf-8') as sk8s_f: - service_k8s_data = json.load(sk8s_f) - - # Build lookup: package -> tag from service_k8s.json - sk8s_images = {} - for entry in service_k8s_data.get("service_k8s", {}).get("cluster", []): - if entry.get("type") == "image" and "tag" in entry: - sk8s_images[entry["package"]] = entry["tag"] - - # Images to cross-validate: (description, values.yaml image, service_k8s package key) - images_to_check = [] - - if karavi_metrics and karavi_metrics.get("image"): - images_to_check.append(( - "csm-metrics-powerscale", - karavi_metrics["image"], - "quay.io/dell/container-storage-modules/csm-metrics-powerscale" - )) - if otel_config and otel_config.get("image"): - images_to_check.append(( - "opentelemetry-collector", - otel_config["image"], - "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector" - )) - karavi_auth = karavi_metrics.get("authorization", {}) if karavi_metrics else {} - sidecar_proxy = karavi_auth.get("sidecarProxy", {}) - if sidecar_proxy and sidecar_proxy.get("image"): - # csm-authorization-sidecar is in csi_driver_powerscale.json, not service_k8s.json - csi_ps_json_path = os.path.join( - os.path.dirname(service_k8s_json_path), "csi_driver_powerscale.json" - ) - if os.path.exists(csi_ps_json_path): - try: - with open(csi_ps_json_path, 'r', encoding='utf-8') as csi_f: - csi_ps_data = json.load(csi_f) - for entry in csi_ps_data.get("csi_driver_powerscale", {}).get("cluster", []): - if (entry.get("type") == "image" and - entry.get("package") == "quay.io/dell/container-storage-modules/csm-authorization-sidecar"): - sidecar_values_tag = sidecar_proxy["image"].split(":")[-1] if ":" in sidecar_proxy["image"] else "" - if sidecar_values_tag and sidecar_values_tag != entry["tag"]: - errors.append(create_error_msg( - "powerscale image: 
csm-authorization-sidecar", - sidecar_proxy["image"], - en_us_validation_msg.powerscale_image_version_mismatch_msg( - "csm-authorization-sidecar", - sidecar_proxy["image"], - f"{entry['package']}:{entry['tag']}" - ) - )) - else: - logger.info(f"Image version match for csm-authorization-sidecar: {sidecar_values_tag}") - break - except (json.JSONDecodeError, IOError) as csi_err: - logger.warn(f"Could not read csi_driver_powerscale.json: {csi_err}") - - for img_name, values_image, sk8s_key in images_to_check: - if sk8s_key in sk8s_images: - # Extract tag from values.yaml image (format: registry/repo:tag) - values_tag = values_image.split(":")[-1] if ":" in values_image else "" - sk8s_tag = sk8s_images[sk8s_key] - if values_tag and values_tag != sk8s_tag: - sk8s_full = f"{sk8s_key}:{sk8s_tag}" - errors.append(create_error_msg( - f"powerscale image: {img_name}", - values_image, - en_us_validation_msg.powerscale_image_version_mismatch_msg( - img_name, values_image, sk8s_full - ) - )) - else: - logger.info(f"Image version match for {img_name}: {values_tag}") - else: - logger.warn(f"Image {sk8s_key} not found in service_k8s.json, skipping version check") - - except (json.JSONDecodeError, IOError) as sk8s_err: - logger.warn(f"Could not read service_k8s.json for image version validation: {sk8s_err}") - else: - logger.warn(f"service_k8s.json not found at {service_k8s_json_path}, skipping image version validation") - - logger.info("CSM Observability values.yaml validation passed") - except (yaml.YAMLError, IOError) as e: - errors.append(create_error_msg( - "powerscale_configurations.csm_observability_values_file_path", - csm_values_path, - en_us_validation_msg.powerscale_csm_values_parse_error_msg(str(e)) - )) - - # Validate additional_remote_write_endpoints - additional_endpoints = powerscale_config.get("additional_remote_write_endpoints", []) - if additional_endpoints and isinstance(additional_endpoints, list): - if len(additional_endpoints) > 5: - logger.warn(f"More than 5 
additional_remote_write_endpoints configured ({len(additional_endpoints)}). " - "This may impact performance.") - for idx, endpoint in enumerate(additional_endpoints): - if not isinstance(endpoint, dict): - continue - url = endpoint.get("url", "") - if not url or not isinstance(url, str): - errors.append(create_error_msg( - f"powerscale_configurations.additional_remote_write_endpoints[{idx}].url", - url, - en_us_validation_msg.POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG - )) - elif not url.startswith("http://") and not url.startswith("https://"): - errors.append(create_error_msg( - f"powerscale_configurations.additional_remote_write_endpoints[{idx}].url", - url, - en_us_validation_msg.POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG - )) - - return errors def validate_additional_software( input_file_path, data, logger, module, omnia_base_dir, module_utils_base, project_name From 4e7d091354216719894b3c2331b62913b22109c1 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Thu, 7 May 2026 18:17:20 +0530 Subject: [PATCH 10/17] update lint --- .github/workflows/ansible-lint.yml | 1 + .github/workflows/pylint.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 621bd0b930..bbd9a3eed8 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -9,6 +9,7 @@ on: - pub/build_stream - pub/q2_dev - pub/telemetry + - pub/q2_upgrade jobs: build: diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index c979ce72ca..6b2b1f4d3d 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -9,6 +9,7 @@ on: - pub/build_stream - pub/q2_dev - pub/telemetry + - pub/q2_upgrade jobs: build: From ed61b968fb101b03f190f6f640fb4337e742793b Mon Sep 17 00:00:00 2001 From: Mithilesh Reddy Date: Mon, 11 May 2026 15:09:17 +0530 Subject: [PATCH 11/17] Update input config transformation for omnia upgrade and add previous omnia version in metadata 
(#4360) * Update input config transform for omnia upgrade * Update omnia.sh * Update templates for changed input files * Update transform_telemetry_config.yml * Fix ansible lint issues * Fix lint issues --- omnia.sh | 14 + .../import_input_parameters/tasks/main.yml | 20 +- .../tasks/transform_build_stream_config.yml | 119 +++++ .../tasks/transform_gitlab_config.yml | 150 +++++++ .../transform_high_availability_config.yml | 2 +- .../tasks/transform_network_spec.yml | 4 +- .../tasks/transform_omnia_config.yml | 2 +- .../tasks/transform_provision_config.yml | 8 +- .../tasks/transform_storage_config.yml | 2 +- .../tasks/transform_telemetry_config.yml | 230 +++++++--- .../templates/build_stream_config.j2 | 41 ++ .../templates/gitlab_config.j2 | 115 +++++ .../templates/network_spec.j2 | 33 +- .../templates/omnia_config.j2 | 2 +- .../templates/telemetry_config.j2 | 422 +++++++++++------- .../import_input_parameters/vars/main.yml | 199 ++++++++- .../tasks/backup_configs.yml | 18 +- .../tasks/calculate_hop_chain.yml | 13 +- .../tasks/display_summary.yml | 18 +- .../tasks/load_upgrade_manifest.yml | 22 +- .../tasks/update_component_json_repos.yml | 6 +- .../tasks/update_software_config.yml | 6 +- .../tasks/validate_current_deployment.yml | 30 +- .../tasks/validate_hop_chains.yml | 26 +- .../prep_local_repo/tasks/create_staging.yml | 16 +- .../tasks/load_upgrade_manifest.yml | 28 +- .../prep_local_repo/tasks/sync_local_repo.yml | 24 +- .../tasks/validate_prerequisites.yml | 30 +- 28 files changed, 1253 insertions(+), 347 deletions(-) create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_build_stream_config.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_gitlab_config.yml create mode 100644 upgrade/roles/import_input_parameters/templates/build_stream_config.j2 create mode 100644 upgrade/roles/import_input_parameters/templates/gitlab_config.j2 diff --git a/omnia.sh b/omnia.sh index c4290b922f..14d8e08f52 100755 --- a/omnia.sh +++ 
b/omnia.sh @@ -432,11 +432,25 @@ update_metadata_upgrade_backup_dir() { echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 exit 1 fi + + # Get current omnia_version to store as previous_omnia_version + current_version=\$(grep '^omnia_version:' '$CONTAINER_METADATA_FILE' | cut -d':' -f2 | tr -d ' \t\n\r') + + # Update upgrade_backup_dir if grep -q '^upgrade_backup_dir:' '$CONTAINER_METADATA_FILE'; then sed -i 's|^upgrade_backup_dir:.*|upgrade_backup_dir: ${backup_dir}|' '$CONTAINER_METADATA_FILE' else echo 'upgrade_backup_dir: ${backup_dir}' >> '$CONTAINER_METADATA_FILE' fi + + # Update previous_omnia_version + if [ -n \"\$current_version\" ]; then + if grep -q '^previous_omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i \"s|^previous_omnia_version:.*|previous_omnia_version: \$current_version|\" '$CONTAINER_METADATA_FILE' + else + echo \"previous_omnia_version: \$current_version\" >> '$CONTAINER_METADATA_FILE' + fi + fi " } diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml index 2aacba7451..219dea9f1f 100644 --- a/upgrade/roles/import_input_parameters/tasks/main.yml +++ b/upgrade/roles/import_input_parameters/tasks/main.yml @@ -19,27 +19,33 @@ - name: Validate backup location for upgrade input processing ansible.builtin.include_tasks: precheck_backup_location.yml -- name: Transform network_spec.yml from Omnia 2.0 to 2.1 +- name: Transform network_spec.yml from Omnia 2.1 to 2.2 ansible.builtin.include_tasks: transform_network_spec.yml -- name: Transform high_availability_config.yml from Omnia 2.0 to 2.1 +- name: Transform high_availability_config.yml from Omnia 2.1 to 2.2 ansible.builtin.include_tasks: transform_high_availability_config.yml -- name: Transform local_repo_config.yml from Omnia 2.0 to 2.1 +- name: Transform local_repo_config.yml from Omnia 2.1 to 2.2 ansible.builtin.include_tasks: transform_local_repo_config.yml -- name: Transform 
provision_config.yml from Omnia 2.0 to 2.1 +- name: Transform provision_config.yml from Omnia 2.1 to 2.2 ansible.builtin.include_tasks: transform_provision_config.yml -- name: Transform storage_config.yml from Omnia 2.0 to 2.1 +- name: Transform storage_config.yml from Omnia 2.1 to 2.2 ansible.builtin.include_tasks: transform_storage_config.yml -- name: Transform omnia_config.yml from Omnia 2.0 to 2.1 +- name: Transform omnia_config.yml from Omnia 2.1 to 2.2 ansible.builtin.include_tasks: transform_omnia_config.yml -- name: Transform telemetry_config.yml from Omnia 2.0 to 2.1 +- name: Transform telemetry_config.yml from Omnia 2.1 to 2.2 ansible.builtin.include_tasks: transform_telemetry_config.yml +- name: Generate build_stream_config.yml for Omnia 2.2 + ansible.builtin.include_tasks: transform_build_stream_config.yml + +- name: Generate gitlab_config.yml for Omnia 2.2 + ansible.builtin.include_tasks: transform_gitlab_config.yml + - name: Restore input files from backup ansible.builtin.include_tasks: restore_input_files.yml diff --git a/upgrade/roles/import_input_parameters/tasks/transform_build_stream_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_build_stream_config.yml new file mode 100644 index 0000000000..9eca462839 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/transform_build_stream_config.yml @@ -0,0 +1,119 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# build_stream_config.yml exists in both Omnia 2.1 and 2.2 with identical structure. +# This task reads values from the backup and migrates them to the 2.2 format. +# Enhanced with validation for better error handling. + +- name: Check if backup build_stream_config.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/build_stream_config.yml" + register: backup_build_stream_config_stat + +- name: Fail if backup build_stream_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_backup_build_stream_config_missing }}" + when: not backup_build_stream_config_stat.stat.exists + +- name: Check if build_stream_config.yml already exists in target + ansible.builtin.stat: + path: "{{ input_project_dir }}/build_stream_config.yml" + register: build_stream_config_stat + +- name: Fail if build_stream_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_build_stream_config_missing }}" + when: not build_stream_config_stat.stat.exists + +- name: Read backup build_stream_config.yml (source of truth) + ansible.builtin.slurp: + src: "{{ backup_location }}/build_stream_config.yml" + register: backup_build_stream_config_slurp + +- name: Parse backup build_stream_config.yml + ansible.builtin.set_fact: + backup_build_stream_config: "{{ backup_build_stream_config_slurp.content | b64decode | from_yaml }}" + +- name: Set build_stream_config values from backup with validation + ansible.builtin.set_fact: + build_stream_enable: "{{ backup_build_stream_config.enable_build_stream | default(build_stream_default_enable) }}" + build_stream_host_ip: "{{ backup_build_stream_config.build_stream_host_ip | default(build_stream_default_host_ip) }}" + build_stream_port: "{{ backup_build_stream_config.build_stream_port | default(build_stream_default_port) }}" + build_stream_aarch64_ip: "{{ backup_build_stream_config.aarch64_inventory_host_ip | 
default(build_stream_default_aarch64_ip) }}" + +- name: Validate build_stream_port is in valid range + ansible.builtin.assert: + that: + - build_stream_port | int >= 1 + - build_stream_port | int <= 65535 + fail_msg: "build_stream_port {{ build_stream_port }} is not in valid range (1-65535)" + success_msg: "build_stream_port {{ build_stream_port }} is valid" + +- name: Validate build_stream_host_ip format if provided + ansible.builtin.assert: + that: + - build_stream_host_ip == "" or build_stream_host_ip | ansible.utils.ipaddr + fail_msg: "build_stream_host_ip '{{ build_stream_host_ip }}' is not a valid IP address" + success_msg: "build_stream_host_ip is valid" + when: build_stream_host_ip != "" + +- name: Validate build_stream_aarch64_ip format if provided + ansible.builtin.assert: + that: + - build_stream_aarch64_ip == "" or build_stream_aarch64_ip | ansible.utils.ipaddr + fail_msg: "build_stream_aarch64_ip '{{ build_stream_aarch64_ip }}' is not a valid IP address" + success_msg: "build_stream_aarch64_ip is valid" + when: build_stream_aarch64_ip != "" + +- name: Write build_stream_config.yml with Omnia 2.2 defaults + ansible.builtin.template: + src: build_stream_config.j2 + dest: "{{ input_project_dir }}/build_stream_config.yml" + mode: "{{ default_file_mode }}" + vars: + build_stream_enable: "{{ build_stream_enable }}" + build_stream_host_ip: "{{ build_stream_host_ip }}" + build_stream_port: "{{ build_stream_port }}" + build_stream_aarch64_ip: "{{ build_stream_aarch64_ip }}" + +- name: Validate YAML syntax of build_stream_config.yml + ansible.builtin.command: + cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/build_stream_config.yml','r'))" + register: build_stream_yaml_validation + changed_when: false + +- name: Fail if YAML validation fails + ansible.builtin.fail: + msg: "{{ msg_yaml_validation_failed }}" + when: + - build_stream_yaml_validation.rc != 0 + +- name: Display enhanced build_stream_config transformation summary + 
ansible.builtin.debug: + msg: | + {{ msg_build_stream_config_transform_summary }} + + Values migrated from backup: + - enable_build_stream: {{ build_stream_enable }} + - build_stream_host_ip: {{ build_stream_host_ip | default('empty') }} + - build_stream_port: {{ build_stream_port }} + - aarch64_inventory_host_ip: {{ build_stream_aarch64_ip | default('empty') }} + + Note: Configuration migrated from backup and validated successfully. + +- name: Display backup path (no-op when skipped) + ansible.builtin.debug: + msg: "{{ msg_using_backup_build_stream_config }}" + when: true diff --git a/upgrade/roles/import_input_parameters/tasks/transform_gitlab_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_gitlab_config.yml new file mode 100644 index 0000000000..90a04368e1 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/transform_gitlab_config.yml @@ -0,0 +1,150 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# gitlab_config.yml exists in both Omnia 2.1 and 2.2 with identical structure. +# This task reads values from the backup and migrates them to the 2.2 format. +# Enhanced with validation for better error handling. 
+ +- name: Check if backup gitlab_config.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/gitlab_config.yml" + register: backup_gitlab_config_stat + +- name: Fail if backup gitlab_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_backup_gitlab_config_missing }}" + when: not backup_gitlab_config_stat.stat.exists + +- name: Check if gitlab_config.yml already exists in target + ansible.builtin.stat: + path: "{{ input_project_dir }}/gitlab_config.yml" + register: gitlab_config_stat + +- name: Fail if gitlab_config.yml is not present + ansible.builtin.fail: + msg: "{{ msg_gitlab_config_missing }}" + when: not gitlab_config_stat.stat.exists + +- name: Read backup gitlab_config.yml (source of truth) + ansible.builtin.slurp: + src: "{{ backup_location }}/gitlab_config.yml" + register: backup_gitlab_config_slurp + +- name: Parse backup gitlab_config.yml + ansible.builtin.set_fact: + backup_gitlab_config: "{{ backup_gitlab_config_slurp.content | b64decode | from_yaml }}" + +- name: Set gitlab_config values from backup with validation + ansible.builtin.set_fact: + gitlab_host: "{{ backup_gitlab_config.gitlab_host | default(gitlab_default_host) }}" + gitlab_project_name: "{{ backup_gitlab_config.gitlab_project_name | default(gitlab_default_project_name) }}" + gitlab_project_visibility: "{{ backup_gitlab_config.gitlab_project_visibility | default(gitlab_default_project_visibility) }}" + gitlab_default_branch: "{{ backup_gitlab_config.gitlab_default_branch | default(gitlab_default_branch) }}" + gitlab_https_port: "{{ backup_gitlab_config.gitlab_https_port | default(gitlab_default_https_port) }}" + gitlab_min_storage_gb: "{{ backup_gitlab_config.gitlab_min_storage_gb | default(gitlab_default_min_storage_gb) }}" + gitlab_min_memory_gb: "{{ backup_gitlab_config.gitlab_min_memory_gb | default(gitlab_default_min_memory_gb) }}" + gitlab_min_cpu_cores: "{{ backup_gitlab_config.gitlab_min_cpu_cores | default(gitlab_default_min_cpu_cores) }}" + 
gitlab_puma_workers: "{{ backup_gitlab_config.gitlab_puma_workers | default(gitlab_default_puma_workers) }}" + gitlab_sidekiq_concurrency: "{{ backup_gitlab_config.gitlab_sidekiq_concurrency | default(gitlab_default_sidekiq_concurrency) }}" + +- name: Validate gitlab_host IP format if provided + ansible.builtin.assert: + that: + - gitlab_host == "" or gitlab_host | ansible.utils.ipaddr + fail_msg: "gitlab_host '{{ gitlab_host }}' is not a valid IP address" + success_msg: "gitlab_host is valid" + when: gitlab_host != "" + +- name: Validate gitlab_https_port is in valid range + ansible.builtin.assert: + that: + - gitlab_https_port | int >= 1 + - gitlab_https_port | int <= 65535 + fail_msg: "gitlab_https_port {{ gitlab_https_port }} is not in valid range (1-65535)" + success_msg: "gitlab_https_port {{ gitlab_https_port }} is valid" + +- name: Validate gitlab_project_visibility + ansible.builtin.assert: + that: + - gitlab_project_visibility in ['private', 'internal', 'public'] + fail_msg: "gitlab_project_visibility '{{ gitlab_project_visibility }}' must be one of: private, internal, public" + success_msg: "gitlab_project_visibility is valid" + +- name: Validate gitlab_default_branch format + ansible.builtin.assert: + that: + - gitlab_default_branch | regex_search('^[a-zA-Z0-9/_-]+$') + fail_msg: "gitlab_default_branch '{{ gitlab_default_branch }}' contains invalid characters" + success_msg: "gitlab_default_branch is valid" + +- name: Validate minimum resource requirements + ansible.builtin.assert: + that: + - gitlab_min_storage_gb | int >= 20 + - gitlab_min_memory_gb | int >= 4 + - gitlab_min_cpu_cores | int >= 2 + fail_msg: >- + GitLab minimum requirements not met: storage={{ gitlab_min_storage_gb }}GB (min 20GB), + memory={{ gitlab_min_memory_gb }}GB (min 4GB), CPU={{ gitlab_min_cpu_cores }} cores (min 2) + success_msg: "GitLab minimum requirements validated" + +- name: Write gitlab_config.yml with Omnia 2.2 defaults + ansible.builtin.template: + src: 
gitlab_config.j2 + dest: "{{ input_project_dir }}/gitlab_config.yml" + mode: "{{ default_file_mode }}" + vars: + gitlab_host: "{{ gitlab_host }}" + gitlab_project_name: "{{ gitlab_project_name }}" + gitlab_project_visibility: "{{ gitlab_project_visibility }}" + gitlab_default_branch: "{{ gitlab_default_branch }}" + gitlab_https_port: "{{ gitlab_https_port }}" + gitlab_min_storage_gb: "{{ gitlab_min_storage_gb }}" + gitlab_min_memory_gb: "{{ gitlab_min_memory_gb }}" + gitlab_min_cpu_cores: "{{ gitlab_min_cpu_cores }}" + gitlab_puma_workers: "{{ gitlab_puma_workers }}" + gitlab_sidekiq_concurrency: "{{ gitlab_sidekiq_concurrency }}" + +- name: Validate YAML syntax of gitlab_config.yml + ansible.builtin.command: + cmd: python3 -c "import yaml; yaml.safe_load(open('{{ input_project_dir }}/gitlab_config.yml','r'))" + register: gitlab_yaml_validation + changed_when: false + +- name: Fail if YAML validation fails + ansible.builtin.fail: + msg: "{{ msg_yaml_validation_failed }}" + when: + - gitlab_yaml_validation.rc != 0 + +- name: Display enhanced gitlab_config transformation summary + ansible.builtin.debug: + msg: | + {{ msg_gitlab_config_transform_summary }} + + Values migrated from backup: + - gitlab_host: {{ gitlab_host | default('empty') }} + - gitlab_project_name: {{ gitlab_project_name }} + - gitlab_project_visibility: {{ gitlab_project_visibility }} + - gitlab_default_branch: {{ gitlab_default_branch }} + - gitlab_https_port: {{ gitlab_https_port }} + - Minimum requirements: {{ gitlab_min_storage_gb }}GB storage, {{ gitlab_min_memory_gb }}GB RAM, {{ gitlab_min_cpu_cores }} CPU cores + + Note: Configuration migrated from backup and validated successfully. 
+ +- name: Display backup path (no-op when skipped) + ansible.builtin.debug: + msg: "{{ msg_using_backup_gitlab_config }}" + when: true diff --git a/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml index 494dfda41a..192dfef630 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_high_availability_config.yml @@ -84,7 +84,7 @@ or ((ha_entries_missing_vip | default([]) | length) > 0) or ((ha_entries_empty_vip | default([]) | length) > 0) -- name: Write high_availability_config.yml in Omnia 2.1 format +- name: Write high_availability_config.yml in Omnia 2.2 format ansible.builtin.template: src: high_availability_config.j2 dest: "{{ input_project_dir }}/high_availability_config.yml" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml index 17e742d22f..55d4f9b0b7 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_network_spec.yml @@ -77,13 +77,13 @@ when: - true -- name: Render network_spec.yml in Omnia 2.1 format +- name: Render network_spec.yml in Omnia 2.2 format ansible.builtin.template: src: network_spec.j2 dest: "{{ input_project_dir }}/network_spec.yml" mode: "{{ default_file_mode }}" vars: - admin_network_netmask_bits: "{{ admin_network.netmask_bits | default('24') }}" + admin_network_netmask_bits: "{{ admin_network.netmask_bits | default(network_default_netmask_bits) }}" when: true - name: Read transformed network_spec.yml diff --git a/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml index ab62c3ff28..1fa196cc1e 100644 --- 
a/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_omnia_config.yml @@ -72,7 +72,7 @@ msg: "{{ msg_service_k8s_cluster_missing }}" when: (omnia_service_k8s_cluster | default([]) | length) == 0 -- name: Write omnia_config.yml in Omnia 2.1 format +- name: Write omnia_config.yml in Omnia 2.2 format ansible.builtin.template: src: omnia_config.j2 dest: "{{ input_project_dir }}/omnia_config.yml" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml index 42598d59bc..d39476241a 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_provision_config.yml @@ -44,16 +44,16 @@ - name: Normalize provision_config.yml values ansible.builtin.set_fact: - provision_pxe_mapping_file_path: "{{ backup_provision_config.pxe_mapping_file_path | default('pxe_mapping_file.csv') }}" - provision_language: "{{ backup_provision_config.language | default('en_US.UTF-8') }}" - provision_default_lease_time: "{{ backup_provision_config.default_lease_time | default('86400') }}" + provision_pxe_mapping_file_path: "{{ backup_provision_config.pxe_mapping_file_path | default(provision_default_pxe_mapping_file_path) }}" + provision_language: "{{ backup_provision_config.language | default(provision_default_language) }}" + provision_default_lease_time: "{{ backup_provision_config.default_lease_time | default(provision_default_lease_time) }}" - name: Fail if pxe_mapping_file_path is missing ansible.builtin.fail: msg: "{{ msg_pxe_mapping_file_path_missing }}" when: (provision_pxe_mapping_file_path | string | trim) == '' -- name: Write provision_config.yml in Omnia 2.1 format +- name: Write provision_config.yml in Omnia 2.2 format ansible.builtin.template: src: provision_config.j2 dest: "{{ input_project_dir 
}}/provision_config.yml" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml index 264e13e830..81ce14c5f9 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml @@ -61,7 +61,7 @@ (storage_nfs_client_params | selectattr('client_share_path', 'undefined') | list | length) > 0 or (storage_nfs_client_params | selectattr('client_mount_options', 'undefined') | list | length) > 0 -- name: Write storage_config.yml in Omnia 2.1 format +- name: Write storage_config.yml in Omnia 2.2 format ansible.builtin.template: src: storage_config.j2 dest: "{{ input_project_dir }}/storage_config.yml" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml index 9e431f6671..14b5458ddd 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml @@ -44,85 +44,198 @@ - name: Normalize nested backup telemetry sections ansible.builtin.set_fact: - backup_telemetry_victoria_config: "{{ backup_telemetry_config.victoria_configurations | default({}) }}" - backup_telemetry_kafka_config: "{{ backup_telemetry_config.kafka_configurations | default({}) }}" + backup_telemetry_victoria_config: >- + {{ backup_telemetry_config.victoria_metrics_configurations + | default(backup_telemetry_config.victoria_configurations | default({})) }} + backup_telemetry_kafka_config: >- + {{ backup_telemetry_config.kafka_configurations | default({}) }} + backup_telemetry_victoria_logs_config: >- + {{ backup_telemetry_config.victoria_logs_configurations | default({}) }} + backup_telemetry_powerscale_config: >- + {{ backup_telemetry_config.powerscale_configurations | default({}) }} + 
backup_telemetry_sources: >- + {{ backup_telemetry_config.telemetry_sources | default({}) }} + backup_telemetry_sinks: >- + {{ backup_telemetry_config.telemetry_sinks | default({}) }} + backup_telemetry_ldms_config: >- + {{ backup_telemetry_config.ldms_configurations | default({}) }} -- name: Normalize telemetry_config.yml values +- name: Extract iDRAC telemetry support from backup (2.1 or 2.2 format) + ansible.builtin.set_fact: + telemetry_idrac_telemetry_support: >- + {{ + backup_telemetry_config.idrac_telemetry_support + | default( + (backup_telemetry_sources.idrac | default({})).metrics_enabled + | default(telemetry_default_idrac_support) + ) + }} + +- name: Extract collection type from backup (2.1 format) for target conversion ansible.builtin.set_fact: - telemetry_idrac_telemetry_support: "{{ backup_telemetry_config.idrac_telemetry_support | default(true) }}" telemetry_telemetry_collection_type: >- {{ backup_telemetry_config.telemetry_collection_type - | default('victoria,kafka') + | default(backup_telemetry_config.idrac_telemetry_collection_type + | default(telemetry_default_collection_type)) + }} + +- name: Derive iDRAC collection targets from 2.1 collection type string + ansible.builtin.set_fact: + telemetry_idrac_collection_targets: >- + {{ + (backup_telemetry_sources.idrac | default({})).collection_targets + | default( + telemetry_telemetry_collection_type.split(',') + | map('trim') + | map('regex_replace', '^victoria$', 'victoria_metrics') + | list + ) + }} + +- name: Normalize VictoriaMetrics sink values from backup + ansible.builtin.set_fact: + telemetry_victoria_persistence_size: >- + {{ (backup_telemetry_sinks.victoria_metrics | default({})).persistence_size + | default(backup_telemetry_victoria_config.persistence_size + | default(telemetry_default_victoria_persistence_size)) }} + telemetry_victoria_retention_period: >- + {{ (backup_telemetry_sinks.victoria_metrics | default({})).retention_period + | 
default(backup_telemetry_victoria_config.retention_period + | default(telemetry_default_victoria_retention_period)) }} + telemetry_additional_metric_remote_write_endpoints: >- + {{ (backup_telemetry_sinks.victoria_metrics | default({})).additional_metric_remote_write_endpoints + | default([]) }} + +- name: Normalize VictoriaLogs sink values from backup + ansible.builtin.set_fact: + telemetry_victoria_logs_storage_size: >- + {{ (backup_telemetry_sinks.victoria_logs | default({})).storage_size + | default(backup_telemetry_victoria_logs_config.storage_size + | default(telemetry_default_victoria_logs_storage_size)) }} + telemetry_victoria_logs_retention_period: >- + {{ (backup_telemetry_sinks.victoria_logs | default({})).retention_period + | default(backup_telemetry_victoria_logs_config.retention_period + | default(telemetry_default_victoria_logs_retention_period)) }} + telemetry_additional_log_write_endpoints: >- + {{ (backup_telemetry_sinks.victoria_logs | default({})).additional_log_write_endpoints + | default([]) }} + +- name: Normalize Kafka sink values from backup + ansible.builtin.set_fact: + telemetry_kafka_persistence_size: >- + {{ (backup_telemetry_sinks.kafka | default({})).persistence_size + | default(backup_telemetry_kafka_config.persistence_size + | default(telemetry_default_kafka_persistence_size)) }} + telemetry_kafka_log_retention_hours: >- + {{ (backup_telemetry_sinks.kafka | default({})).log_retention_hours + | default(backup_telemetry_kafka_config.log_retention_hours + | default(telemetry_default_kafka_log_retention_hours)) }} + telemetry_kafka_log_retention_bytes: >- + {{ (backup_telemetry_sinks.kafka | default({})).log_retention_bytes + | default(backup_telemetry_kafka_config.log_retention_bytes + | default(telemetry_default_kafka_log_retention_bytes)) }} + telemetry_kafka_log_segment_bytes: >- + {{ (backup_telemetry_sinks.kafka | default({})).log_segment_bytes + | default(backup_telemetry_kafka_config.log_segment_bytes + | 
default(telemetry_default_kafka_log_segment_bytes)) }} + +- name: Extract raw topic partitions from backup (list or dict format) + ansible.builtin.set_fact: + telemetry_kafka_topic_partitions_raw: >- + {{ + (backup_telemetry_sinks.kafka | default({})).topic_partitions + | default(backup_telemetry_kafka_config.topic_partitions + | default(telemetry_default_kafka_topic_partitions)) }} - telemetry_victoria_persistence_size: "{{ backup_telemetry_victoria_config.persistence_size | default('8Gi') }}" - telemetry_victoria_retention_period: "{{ backup_telemetry_victoria_config.retention_period | default(168) }}" - telemetry_kafka_persistence_size: "{{ backup_telemetry_kafka_config.persistence_size | default('8Gi') }}" - telemetry_kafka_log_retention_hours: "{{ backup_telemetry_kafka_config.log_retention_hours | default(168) }}" - telemetry_kafka_log_retention_bytes: "{{ backup_telemetry_kafka_config.log_retention_bytes | default(-1) }}" - telemetry_kafka_log_segment_bytes: "{{ backup_telemetry_kafka_config.log_segment_bytes | default(1073741824) }}" - telemetry_kafka_topic_partitions: >- + +- name: Convert topic partitions to dict format for 2.2 template + ansible.builtin.set_fact: + telemetry_kafka_topic_partitions_dict: >- {{ - backup_telemetry_kafka_config.topic_partitions - | default([ - {'name': 'idrac', 'partitions': 1}, - {'name': 'ldms', 'partitions': 2} - ]) + telemetry_kafka_topic_partitions_raw + if (telemetry_kafka_topic_partitions_raw is mapping) + else + dict( + telemetry_kafka_topic_partitions_raw + | map(attribute='name') + | zip(telemetry_kafka_topic_partitions_raw | map(attribute='partitions')) + ) }} - telemetry_ldms_agg_port: "{{ backup_telemetry_config.ldms_agg_port | default(6001) }}" - telemetry_ldms_store_port: "{{ backup_telemetry_config.ldms_store_port | default(6001) }}" - telemetry_ldms_sampler_port: "{{ backup_telemetry_config.ldms_sampler_port | default(10001) }}" + +- name: Normalize LDMS configuration values from backup (2.1 or 2.2 format) + 
ansible.builtin.set_fact: + telemetry_ldms_agg_port: >- + {{ backup_telemetry_ldms_config.agg_port + | default(backup_telemetry_config.ldms_agg_port + | default(telemetry_default_ldms_agg_port)) }} + telemetry_ldms_store_port: >- + {{ backup_telemetry_ldms_config.store_port + | default(backup_telemetry_config.ldms_store_port + | default(telemetry_default_ldms_store_port)) }} + telemetry_ldms_sampler_port: >- + {{ backup_telemetry_ldms_config.sampler_port + | default(backup_telemetry_config.ldms_sampler_port + | default(telemetry_default_ldms_sampler_port)) }} telemetry_ldms_sampler_configurations: >- {{ - backup_telemetry_config.ldms_sampler_configurations - | default([ - { - 'plugin_name': 'meminfo', - 'config_parameters': '', - 'activation_parameters': 'interval=1000000' - }, - { - 'plugin_name': 'procstat2', - 'config_parameters': '', - 'activation_parameters': 'interval=1000000' - }, - { - 'plugin_name': 'vmstat', - 'config_parameters': '', - 'activation_parameters': 'interval=1000000' - }, - { - 'plugin_name': 'loadavg', - 'config_parameters': '', - 'activation_parameters': 'interval=1000000' - }, - { - 'plugin_name': 'procnetdev2', - 'config_parameters': '', - 'activation_parameters': 'interval=1000000 offset=0' - } - ]) + backup_telemetry_ldms_config.sampler_plugins + | default(backup_telemetry_config.ldms_sampler_configurations + | default(telemetry_default_ldms_sampler_configurations)) }} -- name: Write telemetry_config.yml in Omnia 2.1 format +- name: Normalize DCGM and PowerScale source values from backup + ansible.builtin.set_fact: + telemetry_dcgm_support: >- + {{ (backup_telemetry_sources.dcgm | default({})).metrics_enabled + | default(backup_telemetry_config.dcgm_support + | default(telemetry_default_dcgm_support)) }} + telemetry_powerscale_metrics_enabled: >- + {{ (backup_telemetry_sources.powerscale | default({})).metrics_enabled + | default(backup_telemetry_powerscale_config.powerscale_telemetry_support + | 
default(telemetry_default_powerscale_support)) }} + telemetry_powerscale_logs_enabled: >- + {{ (backup_telemetry_sources.powerscale | default({})).logs_enabled + | default(backup_telemetry_powerscale_config.powerscale_log_enabled + | default(telemetry_default_powerscale_log_enabled)) }} + +- name: Normalize PowerScale configuration values from backup + ansible.builtin.set_fact: + telemetry_otel_collector_storage_size: >- + {{ backup_telemetry_powerscale_config.otel_collector_storage_size + | default(telemetry_default_otel_collector_storage_size) }} + telemetry_csm_observability_values_file_path: >- + {{ backup_telemetry_powerscale_config.csm_observability_values_file_path + | default(telemetry_default_csm_observability_values_file_path) }} + +- name: Write telemetry_config.yml in Omnia 2.2 format ansible.builtin.template: src: telemetry_config.j2 dest: "{{ input_project_dir }}/telemetry_config.yml" mode: "{{ default_file_mode }}" vars: telemetry_idrac_telemetry_support: "{{ telemetry_idrac_telemetry_support }}" - telemetry_telemetry_collection_type: "{{ telemetry_telemetry_collection_type }}" + telemetry_idrac_collection_targets: "{{ telemetry_idrac_collection_targets }}" + telemetry_dcgm_support: "{{ telemetry_dcgm_support }}" + telemetry_powerscale_metrics_enabled: "{{ telemetry_powerscale_metrics_enabled }}" + telemetry_powerscale_logs_enabled: "{{ telemetry_powerscale_logs_enabled }}" telemetry_victoria_persistence_size: "{{ telemetry_victoria_persistence_size }}" telemetry_victoria_retention_period: "{{ telemetry_victoria_retention_period }}" + telemetry_additional_metric_remote_write_endpoints: "{{ telemetry_additional_metric_remote_write_endpoints }}" + telemetry_victoria_logs_storage_size: "{{ telemetry_victoria_logs_storage_size }}" + telemetry_victoria_logs_retention_period: "{{ telemetry_victoria_logs_retention_period }}" + telemetry_additional_log_write_endpoints: "{{ telemetry_additional_log_write_endpoints }}" telemetry_kafka_persistence_size: "{{ 
telemetry_kafka_persistence_size }}" telemetry_kafka_log_retention_hours: "{{ telemetry_kafka_log_retention_hours }}" telemetry_kafka_log_retention_bytes: "{{ telemetry_kafka_log_retention_bytes }}" telemetry_kafka_log_segment_bytes: "{{ telemetry_kafka_log_segment_bytes }}" - telemetry_kafka_topic_partitions: "{{ telemetry_kafka_topic_partitions }}" + telemetry_kafka_topic_partitions_dict: "{{ telemetry_kafka_topic_partitions_dict }}" telemetry_ldms_agg_port: "{{ telemetry_ldms_agg_port }}" telemetry_ldms_store_port: "{{ telemetry_ldms_store_port }}" telemetry_ldms_sampler_port: "{{ telemetry_ldms_sampler_port }}" telemetry_ldms_sampler_configurations: "{{ telemetry_ldms_sampler_configurations }}" + telemetry_otel_collector_storage_size: "{{ telemetry_otel_collector_storage_size }}" + telemetry_csm_observability_values_file_path: "{{ telemetry_csm_observability_values_file_path }}" - name: Validate YAML syntax of transformed telemetry_config.yml ansible.builtin.command: @@ -141,6 +254,17 @@ msg: "{{ msg_using_backup_telemetry_config }}" when: true -- name: Display transformation summary +- name: Display transformation summary with field mapping details ansible.builtin.debug: - msg: "{{ msg_telemetry_config_transform_summary }}" + msg: | + {{ msg_telemetry_config_transform_summary }} + + Field mappings applied: + - idrac_telemetry_support → telemetry_sources.idrac.metrics_enabled + - idrac_telemetry_collection_type → telemetry_sources.idrac.collection_targets: {{ telemetry_idrac_collection_targets }} + - victoria_configurations → telemetry_sinks.victoria_metrics + - kafka_configurations → telemetry_sinks.kafka (topic_partitions converted to dict) + - ldms_* ports/samplers → ldms_configurations + - Added new sections: telemetry_sources (ldms, dcgm, powerscale), telemetry_bridges, telemetry_sinks.victoria_logs + + Configuration validated and migrated successfully. 
diff --git a/upgrade/roles/import_input_parameters/templates/build_stream_config.j2 b/upgrade/roles/import_input_parameters/templates/build_stream_config.j2 new file mode 100644 index 0000000000..fef1590e4c --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/build_stream_config.j2 @@ -0,0 +1,41 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. 
+# *********************************************************************** + +# *********************************************************************** +# BuildStreaM (BSM) Configuration for configuring CI/CD pipeline to automate image building and deployment +# *********************************************************************** + +# Mandatory: Enable or disable build stream pipeline +# Accepted values: boolean values - (true or false) or (yes or no) +# Default: false +enable_build_stream: {{ build_stream_enable | default(build_stream_default_enable) | bool | ternary('true', 'false') }} + +# Mandatory: Build Stream API server host IP +# Accepted values: public IP address of OIM or admin IP of OIM +build_stream_host_ip: "{{ build_stream_host_ip | default(build_stream_default_host_ip) }}" + +# Mandatory: Build Stream API server port +# Accepted values: valid port number (1-65535) which is free +# Default: 8010 +build_stream_port: {{ build_stream_port | default(build_stream_default_port) }} + +# Conditional Mandatory: AArch64 inventory host IP for aarch64 builds +# Accepted values: admin IP of aarch64 host where OS is installed +# Default none - by default aarch64 builds will not be generated +aarch64_inventory_host_ip: "{{ build_stream_aarch64_ip | default(build_stream_default_aarch64_ip) }}" diff --git a/upgrade/roles/import_input_parameters/templates/gitlab_config.j2 b/upgrade/roles/import_input_parameters/templates/gitlab_config.j2 new file mode 100644 index 0000000000..1e2a6a2f06 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/gitlab_config.j2 @@ -0,0 +1,115 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +# *********************************************************************** + + +# Target host for GitLab deployment/cleanup +# Fields: +# gitlab_host: IP address of the target host where GitLab will be deployed +# Notes: +# - This is the IP address of the server where GitLab will be installed +# - Must be accessible from the OIM server +# - Must be configured in build_stream/gitlab/inventory/hosts.ini +gitlab_host: "{{ gitlab_host | default(gitlab_default_host) }}" + +# Project settings +# Name of the GitLab project Omnia will create/manage +# Fields: +# gitlab_project_name: Name for the GitLab project +# Notes: +# - Default: "omnia-catalog" +# - This project will be created automatically if it doesn't exist +gitlab_project_name: "{{ gitlab_project_name | default(gitlab_default_project_name) }}" + +# Visibility options: private | internal | public +# Fields: +# gitlab_project_visibility: Visibility options - private | internal | public +# Notes: +# - private: Project access must be granted explicitly for each user +# - internal: The project can be cloned by any logged in user +# - public: The project can be cloned without any authentication +gitlab_project_visibility: "{{ gitlab_project_visibility | default(gitlab_default_project_visibility) }}" + +# Default branch used for repository and API operations +# Fields: +# gitlab_default_branch: Name of the 
default branch +# Notes: +# - Default: "main" +# - This branch will be used as the default for all operations +gitlab_default_branch: "{{ gitlab_default_branch | default(gitlab_default_branch) }}" + + +# HTTPS is always enabled for GitLab deployment +# ---------------------------------------------------------------------------- +# DEFAULT / ADVANCED VARIABLES (CHANGE ONLY IF NEEDED) +# ---------------------------------------------------------------------------- +# These defaults are suitable for most setups and can be tuned as required. + +# Network +# HTTPS port exposed via GitLab NGINX +# Fields: +# gitlab_https_port: Port number for HTTPS access +# Notes: +# - Default: 443 +# - Must be between 1-65535 +# - Must not conflict with other services +gitlab_https_port: {{ gitlab_https_port | default(gitlab_default_https_port) }} + +# Minimum requirements +# Free disk space validated before install +# Fields: +# gitlab_min_storage_gb: Minimum storage in GB +# Notes: +# - Default: 20 +# - GitLab requires at least 20GB of free disk space +gitlab_min_storage_gb: {{ gitlab_min_storage_gb | default(gitlab_default_min_storage_gb) }} + +# Adjust upward for production workloads +# Fields: +# gitlab_min_memory_gb: Minimum memory in GB +# Notes: +# - Default: 4 +# - Adjust upward for production workloads +gitlab_min_memory_gb: {{ gitlab_min_memory_gb | default(gitlab_default_min_memory_gb) }} + +# Minimum CPU core count validated before install +# Fields: +# gitlab_min_cpu_cores: Minimum number of CPU cores +# Notes: +# - Default: 2 +# - More cores may be needed for production workloads +gitlab_min_cpu_cores: {{ gitlab_min_cpu_cores | default(gitlab_default_min_cpu_cores) }} + + +# Web worker count; scale with CPU +# Fields: +# gitlab_puma_workers: Number of worker processes +# Notes: +# - Default: 2 +# - Scale with CPU cores (recommended: 1-2 workers per CPU core) +gitlab_puma_workers: {{ gitlab_puma_workers | default(gitlab_default_puma_workers) }} + +# Background job 
concurrency +# Fields: +# gitlab_sidekiq_concurrency: Number of concurrent background jobs +# Notes: +# - Default: 10 +# - Adjust based on available memory and workload + +# Sidekiq background job concurrency +gitlab_sidekiq_concurrency: {{ gitlab_sidekiq_concurrency | default(gitlab_default_sidekiq_concurrency) }} + diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2 index 564c057db4..248024ba67 100644 --- a/upgrade/roles/import_input_parameters/templates/network_spec.j2 +++ b/upgrade/roles/import_input_parameters/templates/network_spec.j2 @@ -38,16 +38,43 @@ # - 'subnet': The subnet of the IB network. # - 'netmask_bits': The number of bits in the subnet mask. This value must be same as the admin_network netmask_bits. +# 'additional_subnets' is optional, for multi-RAC / multi-subnet PXE deployments. +# Each entry defines a separate subnet that the CoreDHCP server will manage +# via DHCP relay (giaddr-based routing). Requires coresmd v0.5+ with +# multi-subnet support. +# +# Each additional subnet entry contains: +# - 'subnet': The network address of the additional subnet (e.g. "10.40.1.0"). +# - 'netmask_bits': The CIDR prefix length (e.g. "24"). +# - 'router': The gateway/router IP for this subnet (used as DHCP option 3). +# - 'dynamic_range': The DHCP IP pool range in "start_ip-end_ip" format. +# Must fall within the subnet.
+# +# Example (multi-RAC with two additional subnets): +# additional_subnets: +# - subnet: "10.40.1.0" +# netmask_bits: "24" +# router: "10.40.1.1" +# dynamic_range: "10.40.1.100-10.40.1.200" +# - subnet: "10.40.3.0" +# netmask_bits: "24" +# router: "10.40.3.1" +# dynamic_range: "10.40.3.100-10.40.3.200" + + Networks: - admin_network: oim_nic_name: "{{ admin_network.oim_nic_name | default('') }}" - netmask_bits: "{{ admin_network.netmask_bits | default('24') }}" + subnet: "{{ admin_network.subnet | default(network_default_subnet) }}" + netmask_bits: "{{ admin_network.netmask_bits | default(network_default_netmask_bits) }}" primary_oim_admin_ip: "{{ admin_network.primary_oim_admin_ip | default('') }}" primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip | default('') }}" dynamic_range: "{{ admin_network.dynamic_range | default('') }}" dns: {{ admin_network.dns | default([]) }} ntp_servers: {{ admin_network.ntp_servers | default([]) }} + additional_subnets: {{ admin_network.additional_subnets | default([]) }} - ib_network: - subnet: "{{ ib_network.subnet | default('192.168.0.0') }}" - netmask_bits: "{{ ib_network.netmask_bits | default(admin_network_netmask_bits | default('24')) }}" + subnet: "{{ ib_network.subnet | default('') }}" + netmask_bits: "{{ ib_network.netmask_bits | default(admin_network_netmask_bits | default(network_default_netmask_bits)) }}" + dns: {{ ib_network.dns | default([]) }} diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 index e1e6bdc09a..baeedbdd39 100644 --- a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 @@ -231,8 +231,8 @@ service_k8s_cluster: k8s_service_addresses: "{{ _cluster.k8s_service_addresses | default('') }}" k8s_pod_network_cidr: "{{ _cluster.k8s_pod_network_cidr | default('') }}" nfs_storage_name: "{{ _cluster.nfs_storage_name | default('') }}" + 
k8s_crio_storage_size: "{{ _cluster.k8s_crio_storage_size | default('20G') }}" csi_powerscale_driver_secret_file_path: "{{ _cluster.csi_powerscale_driver_secret_file_path | default('') }}" csi_powerscale_driver_values_file_path: "{{ _cluster.csi_powerscale_driver_values_file_path | default('') }}" - k8s_crio_storage_size: {{ _cluster.k8s_crio_storage_size | default('20G') }} {% endfor %} {% endif %} diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 index ae57457882..0f215e2145 100644 --- a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 @@ -21,27 +21,38 @@ # ============================================================================ # TELEMETRY CONFIGURATION OVERVIEW # ============================================================================ -# This file configures telemetry data collection and storage for Dell Omnia. +# This file configures telemetry data collection, routing, and storage for Dell Omnia. +# +# ARCHITECTURE: +# SOURCES (Collectors) → BRIDGES (Vector) → SINKS (Storage) # # SECTIONS: -# 1. iDRAC Telemetry : Hardware metrics from Dell PowerEdge servers -# 2. VictoriaMetrics : Time-series database for metric storage -# 3. Kafka : Distributed streaming platform for telemetry data -# 4. LDMS : Lightweight Distributed Metric Service for compute nodes +# 1. Telemetry Sources : Data collectors (iDRAC, LDMS, DCGM, PowerScale) +# 2. Telemetry Bridges : Data routers (Vector pipelines) +# 3. Telemetry Sinks : Storage backends (victoria_metrics, victoria_logs, Kafka) +# 4. 
Source Configurations: Detailed settings per source (PowerScale, LDMS) # # ============================================================================ # STORAGE REQUIREMENTS SUMMARY # ============================================================================ # -# VICTORIAMETRICS STORAGE: +# victoria_metrics STORAGE: # ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ # │ Deployment Mode │ Per-Pod Storage │ Number of Pods │ Total Storage │ # ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ -# │ Single-node │ persistence_size │ 1 pod │ 1× storage │ -# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ # │ Cluster │ persistence_size │ 3 vmstorage │ 3× storage │ # └─────────────────┴──────────────────┴─────────────────┴──────────────────┘ -# Example: 8Gi per pod → Single-node: 8Gi total, Cluster: 24Gi total +# Example: 8Gi per pod → Cluster: 24Gi total +# +# victoria_logs STORAGE: +# ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ +# │ Component │ Per-Pod Storage │ Number of Pods │ Total Storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ vlstorage │ storage_size │ 3 pods │ 3× storage │ +# ├─────────────────┼──────────────────┼─────────────────┼──────────────────┤ +# │ VLAgent buffer │ 5Gi (fixed) │ 1 pod │ 5Gi │ +# └─────────────────┴──────────────────┴─────────────────┴──────────────────┘ +# Example: 8Gi × 3 vlstorage = 24Gi + 5Gi VLAgent = 29Gi total # # KAFKA STORAGE: # ┌─────────────────┬──────────────────┬─────────────────┬──────────────────┐ @@ -56,164 +67,277 @@ # Example: 8Gi per pod → 48Gi total Kafka storage # # COMBINED STORAGE EXAMPLES: -# Default (8Gi each): VictoriaMetrics Cluster (24Gi) + Kafka (48Gi) = 72Gi total -# Single-node mode: VictoriaMetrics Single (8Gi) + Kafka (48Gi) = 56Gi total -# -# STORAGE OPTIONS: -# - VictoriaMetrics: Store iDRAC telemetry in time-series database -# - Kafka: Stream iDRAC 
and LDMS telemetry to Kafka topics -# - Both: Store iDRAC in both Victoria and Kafka (recommended) +# Default (8Gi each): victoria_metrics (24Gi) + victoria_logs (29Gi) + Kafka (48Gi) = 101Gi total # ============================================================================ # ============================================================================ -# iDRAC TELEMETRY CONFIGURATION +# TELEMETRY SOURCES (Data Collectors) # ============================================================================ -# iDRAC telemetry collects hardware metrics from Dell PowerEdge servers. -# Telemetry data can be stored in VictoriaMetrics, Kafka, or both. - -# Enable or disable iDRAC telemetry support -# Accepted values: true or false -# Default: true -idrac_telemetry_support: {{ telemetry_idrac_telemetry_support | default(true) | bool | ternary('true', 'false') }} - -# Specify where to store iDRAC telemetry data -# Supported values: -# - "victoria" : Store in VictoriaMetrics only -# - "kafka" : Store in Kafka only -# - "victoria,kafka" : Store in both (recommended) -# Default: "victoria,kafka" -telemetry_collection_type: {{ telemetry_telemetry_collection_type | default('victoria,kafka') | to_json }} +# Each source can be independently enabled/disabled. +# Sources produce telemetry data that flows through bridges to sinks. 
+# +# Supported sources: idrac, ldms, dcgm, powerscale + +telemetry_sources: + + # -------------------------------------------------------------------------- + # iDRAC — Hardware metrics from Dell PowerEdge servers + # -------------------------------------------------------------------------- + # Collects: temperature, power, fan speed, storage health, CPU/memory errors + # Requires: iDRAC-enabled Dell PowerEdge servers in inventory + # Data path: iDRAC Receiver → ActiveMQ → KafkaPump → Kafka 'idrac' topic + # iDRAC Receiver → ActiveMQ → VictoriaPump → vmagent → victoria_metrics + idrac: + # Enable or disable iDRAC metrics collection + # Default: true + metrics_enabled: {{ telemetry_idrac_telemetry_support | default(telemetry_default_idrac_support) | bool | ternary('true', 'false') }} + + # Collection targets define where iDRAC data is sent BEFORE Vector processing + # Supported values: "victoria_metrics", "kafka" + # Multiple targets: ["victoria_metrics", "kafka"] + # Default: ["victoria_metrics", "kafka"] + collection_targets: +{% for _target in telemetry_idrac_collection_targets %} + - "{{ _target }}" +{% endfor %} + + # -------------------------------------------------------------------------- + # LDMS — Lightweight Distributed Metric Service + # -------------------------------------------------------------------------- + # Collects: CPU, memory, network, disk metrics from compute nodes + # Requires: LDMS software in software_config.json + # Data path: LDMS samplers → LDMS aggregator → store_avro_kafka → Kafka 'ldms' topic + ldms: + # Enable or disable LDMS metrics collection + # Default: true + metrics_enabled: true + + # LDMS only supports Kafka collection (no direct victoria_metrics path) + # Vector-LDMS bridge consumes from Kafka and routes to victoria_metrics + collection_targets: + - "kafka" + + # -------------------------------------------------------------------------- + # DCGM — NVIDIA Data Center GPU Manager + # 
-------------------------------------------------------------------------- + # Collects: GPU temperature, utilization, memory, ECC errors, power + # Requires: NVIDIA GPU driver installed on compute nodes + dcgm: + # Enable or disable DCGM metrics collection + # Default: true + metrics_enabled: {{ telemetry_dcgm_support | default(telemetry_default_dcgm_support) | bool | ternary('true', 'false') }} + + # -------------------------------------------------------------------------- + # PowerScale — Dell PowerScale (OneFS) storage telemetry + # -------------------------------------------------------------------------- + # Collects: Storage metrics from Dell PowerScale clusters + # Requires: CSM Observability (Karavi) values file configured + # Data path: CSM Metrics PowerScale → OTEL Collector → vmagent(shared) → victoria_metrics + powerscale: + # Enable or disable PowerScale metrics collection + # Default: true + metrics_enabled: {{ telemetry_powerscale_metrics_enabled | default(telemetry_default_powerscale_support) | bool | ternary('true', 'false') }} + + # Enable or disable PowerScale logs collection + # Default: true + logs_enabled: {{ telemetry_powerscale_logs_enabled | default(telemetry_default_powerscale_log_enabled) | bool | ternary('true', 'false') }} + + # PowerScale uses vmagent(shared) (no Kafka, no Vector) + collection_targets: + - "victoria_metrics" + - "victoria_logs" + # ============================================================================ -# VICTORIAMETRICS CONFIGURATION +# TELEMETRY BRIDGES (Data Routers) # ============================================================================ -# VictoriaMetrics is a time-series database for storing telemetry metrics. -# Used for iDRAC telemetry when 'victoria' is enabled in telemetry_collection_type. +# Bridges route data from Kafka topics to Victoria sinks. +# Vector is the primary bridge technology, consuming from Kafka and producing +# to victoria_metrics (metrics) and victoria_logs (logs/events). 
# -victoria_metrics_configurations: - # The amount of storage allocated for EACH VictoriaMetrics persistent volume. - # IMPORTANT: Total VictoriaMetrics storage depends on deployment mode: - # - Cluster mode: Total storage = persistence_size × 3 vmstorage pods - # - Example (cluster): 8Gi × 3 = 24Gi total VictoriaMetrics storage - # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" - # Default: 8Gi (results in 24Gi total storage for cluster mode) - persistence_size: {{ telemetry_victoria_persistence_size | default('8Gi') | to_json }} +# ARCHITECTURE: +# Kafka topics → Vector pods → vmagent-vector/vlagent-vector → Victoria sinks +# + +telemetry_bridges: + + # -------------------------------------------------------------------------- + # Vector-LDMS — Kafka-to-victoria_metrics bridge for LDMS metrics + # -------------------------------------------------------------------------- + # Purpose: Consume LDMS metrics from Kafka 'ldms' topic, transform NERSC + # schema to Prometheus format, and write to victoria_metrics + # Data flow: Kafka 'ldms' topic → Vector-LDMS → vmagent-vector → victoria_metrics + vector_ldms: + # Enable or disable Vector-LDMS bridge + # Requires: telemetry_sources.ldms.metrics_enabled = true + # Default: true + metrics_enabled: true - # Duration (in hours) to retain victoria logs before they are deleted.
- # Default: 168 (7 days) - retention_period: {{ telemetry_victoria_retention_period | default(168) }} + # -------------------------------------------------------------------------- + # Vector-OME — Kafka-to-Victoria bridge for OME metrics and logs + # -------------------------------------------------------------------------- + # Purpose: Consume OME data from Kafka 'ome.*' topics and route to victoria_metrics/victoria_logs + # Data flow: Kafka 'ome.*' topics → Vector-OME → vmagent-vector (metrics) / vlagent-vector (logs) + vector_ome: + # Enable or disable Vector-OME metrics routing + # Requires: OME to be configured with kafka + # Default: true + metrics_enabled: true + + # Enable or disable Vector-OME logs routing + # Default: true + logs_enabled: true + + # Identifier used by Vector-OME for topic identification and routing. + # Default: "ome" — internally used to match topics with the prefix (e.g., "^ome\\..*$") + # Change only if your OME Kafka topics use a different prefix. + ome_identifier: "ome" # ============================================================================ -# KAFKA CONFIGURATION +# TELEMETRY SINKS (Storage Backends) # ============================================================================ -# Apache Kafka is a distributed streaming platform for storing telemetry data. -# Used for iDRAC telemetry when 'kafka' is enabled in telemetry_collection_type. -# Also used for LDMS telemetry when LDMS software is configured. -# -# NOTE: Kafka topics are auto-generated based on enabled features: -# - 'idrac' topic: Required when idrac_telemetry_support=true and 'kafka' is enabled -# - 'ldms' topic: Required when LDMS is configured in software_config.json -kafka_configurations: - # The amount of storage allocated for EACH Kafka persistent volume. 
- # IMPORTANT: Total Kafka storage = persistence_size × 6 pods - # - 3 Kafka brokers (each gets persistence_size storage) - # - 3 Kafka controllers (each gets persistence_size storage) - # - Example: 8Gi × 6 = 48Gi total Kafka storage - # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" - # Default: 8Gi (results in 48Gi total Kafka storage) - persistence_size: {{ telemetry_kafka_persistence_size | default('8Gi') | to_json }} - - # The number of hours to retain Kafka logs before they are deleted. - # Default: 168 (7 days) - log_retention_hours: {{ telemetry_kafka_log_retention_hours | default(168) }} - - # The maximum size of Kafka logs (in bytes) before they are deleted. - # Default: -1 (unlimited) - log_retention_bytes: {{ telemetry_kafka_log_retention_bytes | default(-1) }} - - # The maximum size of Kafka log segments (in bytes) before they are deleted. - # Default: 1073741824 (1 GB) - log_segment_bytes: {{ telemetry_kafka_log_segment_bytes | default(1073741824) }} - - # Kafka Topic Partitions Configuration - # ---------------------------------------------------------------------------- - # Define the number of partitions for each Kafka topic. - # Increasing partitions can improve throughput but also increases storage/overhead. - # - # IMPORTANT: Topic names are FIXED and cannot be changed. - # - Topic names: Only 'idrac' and 'ldms' are allowed - # - Configurable: Only partition counts can be modified - # - # Topic Requirements (auto-validated): - # - 'idrac': Required when idrac_telemetry_support=true and 'kafka' is enabled - # - 'ldms': Required when LDMS software is configured in software_config.json - # - # Default partition counts: idrac=1, ldms=2 - topic_partitions: -{% for _topic in (telemetry_kafka_topic_partitions | default([])) %} - - name: {{ _topic.name | default('') | to_json }} - partitions: {{ _topic.partitions | default(1) }} +# Sinks are auto-enabled when at least one source targets them. 
+# Explicit 'enabled: false' here overrides source routing (disables the sink). + +telemetry_sinks: + + # -------------------------------------------------------------------------- + # victoria_metrics — Time-series database for metrics + # -------------------------------------------------------------------------- + victoria_metrics: + # Storage per vmstorage pod PVC + # Cluster: total = persistence_size × 3 vmstorage pods + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: 8Gi (results in 24Gi total storage for cluster mode) + persistence_size: {{ telemetry_victoria_persistence_size | default(telemetry_default_victoria_persistence_size) | to_json }} + + # Metric retention period in hours + # Default: 168 (7 days) + retention_period: {{ telemetry_victoria_retention_period | default(telemetry_default_victoria_retention_period) }} + + # Additional remote write endpoints for metrics (optional) + # Metrics will be sent to the Omnia-managed VictoriaMetrics AND to these endpoints. + # Each entry requires a 'url' field (must start with http:// or https://). + # Set tls_insecure_skip_verify: true to skip TLS certificate verification. + # Default: [] (only Omnia VictoriaMetrics receives metrics) + # Example: + # additional_metric_remote_write_endpoints: + # - url: https://external-metrics-server:8480/insert/0/prometheus/api/v1/write + # tls_insecure_skip_verify: false + additional_metric_remote_write_endpoints: {{ telemetry_additional_metric_remote_write_endpoints | default([]) | to_json }} + + # -------------------------------------------------------------------------- + # victoria_logs — Centralized log storage and querying + # -------------------------------------------------------------------------- + # Co-deployed with victoria_metrics when victoria_metrics sink is active. + # Provides structured log collection via vlagent-vector (JSON Lines receiver). 
+ victoria_logs: + # Storage per vlstorage pod PVC + # Total = storage_size × 3 vlstorage pods + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: 8Gi (results in 24Gi total storage) + storage_size: {{ telemetry_victoria_logs_storage_size | default(telemetry_default_victoria_logs_storage_size) | to_json }} + + # Log retention period in hours + # Default: 168 (7 days) + retention_period: {{ telemetry_victoria_logs_retention_period | default(telemetry_default_victoria_logs_retention_period) }} + + # Additional remote write endpoints for logs (optional) + # Logs will be sent to the Omnia-managed VictoriaLogs AND to these endpoints. + # Each entry requires a 'url' field (must start with http:// or https://). + # Set tls_insecure_skip_verify: true to skip TLS certificate verification. + # Default: [] (only Omnia VictoriaLogs receives logs) + # Example: + # additional_log_write_endpoints: + # - url: https://external-logs-server:9481/internal/insert + # tls_insecure_skip_verify: false + additional_log_write_endpoints: {{ telemetry_additional_log_write_endpoints | default([]) | to_json }} + + # -------------------------------------------------------------------------- + # Kafka — Distributed streaming platform + # -------------------------------------------------------------------------- + kafka: + # Storage per Kafka pod PVC + # Total = persistence_size × 6 pods (3 brokers + 3 controllers) + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: 8Gi (results in 48Gi total storage) + persistence_size: {{ telemetry_kafka_persistence_size | default(telemetry_default_kafka_persistence_size) | to_json }} + + # Log retention + # Default: 168 (7 days) + log_retention_hours: {{ telemetry_kafka_log_retention_hours | default(telemetry_default_kafka_log_retention_hours) }} + + # Maximum size of Kafka logs (in bytes) before deletion + # Default: -1 (unlimited) + log_retention_bytes: {{ telemetry_kafka_log_retention_bytes | 
default(telemetry_default_kafka_log_retention_bytes) }} + + # Maximum size of Kafka log segments (in bytes) + # Default: 1073741824 (1 GB) + log_segment_bytes: {{ telemetry_kafka_log_segment_bytes | default(telemetry_default_kafka_log_segment_bytes) }} + + # Topic partitions per source (auto-created for enabled sources targeting kafka) + # Only sources with kafka in collection_targets get topics. + # Topic name = source name (e.g., "idrac", "ldms") + topic_partitions: +{% for _topic_name, _partitions in telemetry_kafka_topic_partitions_dict.items() %} + {{ _topic_name }}: {{ _partitions }} {% endfor %} # ============================================================================ -# LDMS (Lightweight Distributed Metric Service) CONFIGURATION +# SOURCE-SPECIFIC CONFIGURATIONS # ============================================================================ -# LDMS collects performance metrics from compute nodes (CPU, memory, network, etc.) -# and streams them to Kafka for storage and analysis. -# -# PREREQUISITE: To enable LDMS support, add the following to software_config.json: -# { -# "softwares": [ -# {"name": "ldms", "arch": ["x86_64", "aarch64"]} -# ] -# } -# -# When LDMS software is configured, the 'ldms' topic MUST be defined in -# kafka_configurations.topic_partitions above. 
-# -# LDMS Port Configurations -# Aggregator port on service k8s cluster -# Valid range: 6001-6100 -# Default: 6001 -ldms_agg_port: {{ telemetry_ldms_agg_port | default(6001) }} - -# Store daemon port on service k8s cluster -# Can be the same as ldms_agg_port -# Valid range: 6001-6100 -# Default: 6001 -ldms_store_port: {{ telemetry_ldms_store_port | default(6001) }} - -# Sampler port on compute nodes -# Valid range: 10001-10100 -# Default: 10001 -ldms_sampler_port: {{ telemetry_ldms_sampler_port | default(10001) }} - -# LDMS Sampler Plugin Configurations -# ---------------------------------------------------------------------------- -# Configure which metrics to collect from compute nodes and collection intervals. -# Each plugin collects specific system metrics. -# -# Parameters: -# - plugin_name: Name of the LDMS sampler plugin -# - config_parameters: Plugin-specific configuration (as a single string) -# - activation_parameters: Collection schedule in MICROSECONDS -# Format: "interval= offset=" -# Example: "interval=1000000" (1000000 microseconds = 1 second) -# "interval=1000000 offset=0" (1000000 microseconds with no offset) -# -# Available Plugins: -# - meminfo: Memory usage statistics -# - procstat2: Process statistics -# - vmstat: Virtual memory statistics -# - loadavg: System load average -# - procnetdev2: Network interface statistics -ldms_sampler_configurations: +# Detailed configurations for each telemetry source. +# Only relevant when the corresponding source is enabled above. 
+ +# -------------------------------------------------------------------------- +# LDMS Configuration +# -------------------------------------------------------------------------- +ldms_configurations: + # Aggregator port on service K8s cluster (valid: 6001-6100) + agg_port: {{ telemetry_ldms_agg_port | default(telemetry_default_ldms_agg_port) }} + + # Store daemon port (valid: 6001-6100) + store_port: {{ telemetry_ldms_store_port | default(telemetry_default_ldms_store_port) }} + + # Sampler port on compute nodes (valid: 10001-10100) + sampler_port: {{ telemetry_ldms_sampler_port | default(telemetry_default_ldms_sampler_port) }} + + # Sampler plugins — which metrics to collect from compute nodes + # Parameters: + # - plugin_name: Name of the LDMS sampler plugin + # - config_parameters: Plugin-specific configuration (as a single string) + # - activation_parameters: Collection schedule in MICROSECONDS + # Format: "interval=<microseconds> offset=<microseconds>" + # Example: "interval=30000000" (30 seconds) + sampler_plugins: {% if telemetry_ldms_sampler_configurations is none %} - null + null {% else %} {% for _plugin in (telemetry_ldms_sampler_configurations | default([])) %} - - plugin_name: {{ _plugin.plugin_name | default('') }} - config_parameters: {{ _plugin.config_parameters | default('') | to_json }} - activation_parameters: {{ _plugin.activation_parameters | default('interval=1000000') | to_json }} + - plugin_name: {{ _plugin.plugin_name | default('') }} + config_parameters: {{ _plugin.config_parameters | default('') | to_json }} + activation_parameters: {{ _plugin.activation_parameters | default('interval=30000000') | to_json }} {% endfor %} {% endif %} + +# -------------------------------------------------------------------------- +# PowerScale Telemetry Configuration +# -------------------------------------------------------------------------- +# PowerScale telemetry collects storage metrics from Dell PowerScale (OneFS) +# clusters using the CSM (Container Storage Modules) Metrics
PowerScale exporter. +# +# DATA PIPELINE: +# CSM Metrics PowerScale → OTEL Collector → vmagent(shared) → victoria_metrics +# +# NOTE: PowerScale does NOT use Vector bridges. It uses the shared vmagent instance +# that writes directly to victoria_metrics. +powerscale_configurations: + # PVC size for OTEL Collector metric batching and buffering + # Accepted values: in the form of "X[Ki|Mi|Gi|Ti|Pi|Ei]" + # Default: "5Gi" + otel_collector_storage_size: {{ telemetry_otel_collector_storage_size | default(telemetry_default_otel_collector_storage_size) | to_json }} + + # Path to the CSM Observability (Karavi Observability) values.yaml file + # Required when telemetry_sources.powerscale.metrics_enabled: true + # Reference: https://raw.githubusercontent.com/dell/helm-charts/refs/heads/release-v1.16.3/charts/karavi-observability/values.yaml + csm_observability_values_file_path: "{{ telemetry_csm_observability_values_file_path | default(telemetry_default_csm_observability_values_file_path) }}" diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 1678316f8c..1349359eb2 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -14,7 +14,7 @@ --- # backup_location will be set from oim_metadata.yml upgrade_backup_dir -# Format: /opt/omnia/backups/upgrade/version_2.0.0.0/input/project_default +# Format: /opt/omnia/backups/upgrade/version_2.1.0.0/input/project_default # Set dynamically from metadata, no static variable needed # Path to oim_metadata.yml @@ -131,8 +131,9 @@ msg_encryption_failed: "Encryption failed. Check warnings for details."
# Network spec transformation messages msg_backup_network_spec_missing: "Backup network_spec.yml missing" msg_network_spec_missing: "network_spec.yml missing" -msg_network_spec_already_21: "network_spec.yml already in 2.1 format - overwriting" +msg_network_spec_already_22: "network_spec.yml already in 2.2 format - overwriting" msg_yaml_validation_failed: "YAML validation failed" +msg_json_validation_failed: "JSON validation failed" msg_ib_netmask_mismatch: "ib_network.netmask_bits must match admin_network.netmask_bits" msg_ib_network_missing: "ib_network is mandatory" msg_ib_subnet_missing: "ib_network.subnet is mandatory" @@ -141,7 +142,7 @@ msg_using_backup_network_spec: "Using backup network_spec.yml (backup not modifi # High availability config transformation messages msg_backup_ha_config_missing: "Backup high_availability_config.yml missing" msg_ha_config_missing: "high_availability_config.yml missing" -msg_ha_config_already_21: "high_availability_config.yml already in 2.1 format - overwriting" +msg_ha_config_already_22: "high_availability_config.yml already in 2.2 format - overwriting" msg_ha_virtual_ip_missing: "service_k8s_cluster_ha.virtual_ip_address is mandatory" msg_using_backup_ha_config: "Using backup high_availability_config.yml (backup not modified)" @@ -177,6 +178,42 @@ msg_backup_telemetry_config_missing: "Backup telemetry_config.yml missing" msg_telemetry_config_missing: "telemetry_config.yml missing" msg_using_backup_telemetry_config: "Using backup telemetry_config.yml (backup not modified)" +# Build stream config transformation messages +msg_backup_build_stream_config_missing: "Backup build_stream_config.yml missing" +msg_build_stream_config_missing: "build_stream_config.yml missing" +msg_using_backup_build_stream_config: "Using backup build_stream_config.yml (backup not modified)" + +# GitLab config transformation messages +msg_backup_gitlab_config_missing: "Backup gitlab_config.yml missing" +msg_gitlab_config_missing: "gitlab_config.yml 
missing" +msg_using_backup_gitlab_config: "Using backup gitlab_config.yml (backup not modified)" + +# Software config transformation messages +msg_backup_software_config_missing: "Backup software_config.json missing" +msg_software_config_missing: "software_config.json missing" + +# Build stream config transformation messages +msg_build_stream_config_transform_summary: | + build_stream_config.yml migrated from backup with validation. + Backup preserved at: {{ backup_location }}/build_stream_config.yml + Target: {{ input_project_dir }}/build_stream_config.yml + Note: Configuration values migrated from omnia-main backup + Enhanced with IP address and port validation. + +# GitLab config transformation messages +msg_gitlab_config_transform_summary: | + gitlab_config.yml migrated from backup with validation. + Backup preserved at: {{ backup_location }}/gitlab_config.yml + Target: {{ input_project_dir }}/gitlab_config.yml + Note: Configuration values migrated from omnia-main backup + Enhanced with comprehensive validation for all parameters. + +# Discovery config transformation messages +msg_discovery_config_transform_summary: | + discovery_config.yml created with Omnia 2.2 defaults. + Target: {{ input_project_dir }}/discovery_config.yml + Note: This is a new file in Omnia 2.2 (not present in 2.1) + ### Restore summary messages msg_restore_summary: | {{ restore_item.name }} restored from backup. @@ -185,16 +222,18 @@ msg_restore_summary: | # Restore summary message for network spec transformation msg_network_spec_transform_summary: | - network_spec.yml upgraded to 2.1 format. + network_spec.yml upgraded to 2.2 format. 
Backup preserved at: {{ backup_location }}/network_spec.yml Changes: - - Added mandatory ib_network - - Made primary_oim_bmc_ip optional + - Added subnet field under admin_network + - Added additional_subnets field under admin_network (default: empty) + - Added dns field under ib_network (default: empty) + - Preserved ib_network configuration - Aligned ib_network.netmask_bits with admin_network.netmask_bits # Restore summary message for high availability config transformation msg_ha_config_transform_summary: | - high_availability_config.yml upgraded to 2.1 format. + high_availability_config.yml upgraded to 2.2 format. Backup preserved at: {{ backup_location }}/high_availability_config.yml Changes: - Ensured service_k8s_cluster_ha is a list @@ -202,44 +241,159 @@ msg_ha_config_transform_summary: | # Restore summary message for local repo config transformation msg_local_repo_config_transform_summary: | - local_repo_config.yml upgraded to 2.1 format. + local_repo_config.yml upgraded to 2.2 format. Backup preserved at: {{ backup_location }}/local_repo_config.yml Changes: - - Normalized repo URL keys to arch-specific schema - - Migrated omnia_registry to user_registry (when present) - - Ensured mandatory omnia_repo_url_rhel_* keys are present + - Updated omnia_repo_url_rhel_* to Omnia 2.2 versions (kubernetes v1.35, cri-o v1.35) + - Added versioned repository naming (kubernetes-v1-35, cri-o-v1-35) + - Added cuda repository entries + - Added rhel_subscription_repo_config sections + - Added additional_repos sections + - Preserved user_registry and user_repo_url entries # Restore summary message for provision config transformation msg_provision_config_transform_summary: | - provision_config.yml upgraded to 2.1 format. + provision_config.yml upgraded to 2.2 format. 
Backup preserved at: {{ backup_location }}/provision_config.yml Changes: - Ensured pxe_mapping_file_path, language, and default_lease_time are present # Restore summary message for storage config transformation msg_storage_config_transform_summary: | - storage_config.yml upgraded to 2.1 format. + storage_config.yml upgraded to 2.2 format. Backup preserved at: {{ backup_location }}/storage_config.yml Changes: - Ensured nfs_client_params is present and entries contain required keys # Restore summary message for omnia config transformation msg_omnia_config_transform_summary: | - omnia_config.yml upgraded to 2.1 format. + omnia_config.yml upgraded to 2.2 format. Backup preserved at: {{ backup_location }}/omnia_config.yml Changes: - Ensured slurm_cluster and service_k8s_cluster are lists - - Ensured required sections are present + - Added skip_merge, node_discovery_mode, node_hardware_defaults comments + - Added csi_powerscale_driver fields to service_k8s_cluster + - Added k8s_crio_storage_size field # Restore summary message for telemetry config transformation msg_telemetry_config_transform_summary: | - telemetry_config.yml upgraded to 2.1 format. + telemetry_config.yml upgraded to 2.2 format with restructured architecture. 
Backup preserved at: {{ backup_location }}/telemetry_config.yml Changes: - - Rendered Omnia 2.1 telemetry template with values from 2.0 backup - - Applied schema defaults for missing fields - -# === Input files to restore from backup === + - Restructured to telemetry_sources / telemetry_bridges / telemetry_sinks architecture + - Mapped idrac_telemetry_support → telemetry_sources.idrac.metrics_enabled + - Mapped idrac_telemetry_collection_type → telemetry_sources.idrac.collection_targets + - Mapped victoria_configurations → telemetry_sinks.victoria_metrics + - Mapped kafka_configurations → telemetry_sinks.kafka (topic_partitions list → dict) + - Mapped ldms_* ports and sampler configs → ldms_configurations section + - Added telemetry_sources.ldms (metrics_enabled, collection_targets) with defaults + - Added telemetry_sources.dcgm with default metrics_enabled=true + - Added telemetry_sources.powerscale with default metrics and logs enabled + - Added telemetry_bridges section (vector_ldms, vector_ome) with defaults + - Added telemetry_sinks.victoria_logs with default storage and retention + - Removed single-node VM deployment mode (always cluster in 2.2) + - Updated powerscale_configurations (removed source-level fields, kept otel/csm settings) + +# Restore summary message for software config transformation +msg_software_config_transform_summary: | + software_config.json upgraded to 2.2 format. 
+ Backup preserved at: {{ backup_location }}/software_config.json + Changes: + - Bumped service_k8s version from 1.34.1 to 1.35.1 (if present) + - Added admin_debug_packages entry (if missing) + - Added csi_driver_powerscale entry (if missing) + - Added additional_packages software entry and section (if missing) + +# === DEFAULT VALUES === +# Build Stream Config Defaults +build_stream_default_enable: false +build_stream_default_host_ip: "" +build_stream_default_port: 8010 +build_stream_default_aarch64_ip: "" + +# GitLab Config Defaults +gitlab_default_host: "" +gitlab_default_project_name: "omnia-catalog" +gitlab_default_project_visibility: "private" +gitlab_default_branch: "main" +gitlab_default_https_port: 443 +gitlab_default_min_storage_gb: 20 +gitlab_default_min_memory_gb: 4 +gitlab_default_min_cpu_cores: 2 +gitlab_default_puma_workers: 2 +gitlab_default_sidekiq_concurrency: 10 + +# Provision Config Defaults +provision_default_pxe_mapping_file_path: "pxe_mapping_file.csv" +provision_default_language: "en_US.UTF-8" +provision_default_lease_time: "86400" + +# Network Config Defaults +network_default_netmask_bits: "24" +network_default_subnet: "172.16.0.0" + +# Telemetry Config Defaults +telemetry_default_idrac_support: true +telemetry_default_collection_type: "victoria,kafka" +telemetry_default_victoria_persistence_size: "8Gi" +telemetry_default_victoria_retention_period: 168 +telemetry_default_kafka_persistence_size: "8Gi" +telemetry_default_kafka_log_retention_hours: 168 +telemetry_default_kafka_log_retention_bytes: -1 +telemetry_default_kafka_log_segment_bytes: 1073741824 +telemetry_default_ldms_agg_port: 6001 +telemetry_default_ldms_store_port: 6001 +telemetry_default_ldms_sampler_port: 10001 +telemetry_default_dcgm_support: true +telemetry_default_victoria_logs_storage_size: "8Gi" +telemetry_default_victoria_logs_retention_period: 168 +telemetry_default_powerscale_support: true +telemetry_default_powerscale_log_enabled: true 
+telemetry_default_otel_collector_storage_size: "5Gi" +telemetry_default_csm_observability_values_file_path: "" + +# Default LDMS sampler configurations +telemetry_default_ldms_sampler_configurations: + - plugin_name: meminfo + config_parameters: "" + activation_parameters: "interval=30000000" + - plugin_name: procstat2 + config_parameters: "" + activation_parameters: "interval=30000000" + - plugin_name: vmstat + config_parameters: "" + activation_parameters: "interval=30000000" + - plugin_name: loadavg + config_parameters: "" + activation_parameters: "interval=30000000" + - plugin_name: procnetdev2 + config_parameters: "" + activation_parameters: "interval=30000000 offset=0" + +# Default Kafka topic partitions +telemetry_default_kafka_topic_partitions: + - name: "idrac" + partitions: 1 + - name: "ldms" + partitions: 2 + +# === VALIDATION MESSAGES === +msg_file_missing: "Backup file missing: {{ file_name }}" +msg_validation_failed_generic: "Validation failed for {{ file_name }}" + +# === TRANSFORMATION MESSAGES === +msg_transform_summary_generic: |- + {{ file_name }} upgraded to 2.2 format. 
+ Backup preserved at: {{ backup_location }}/{{ file_name }} + Changes: {{ changes_description }} + +# === FILE MODES === +mode_yaml_file: '0644' +mode_json_file: '0644' +mode_csv_file: '0644' +mode_sensitive_file: '0600' + +# === INPUT FILES TO RESTORE FROM BACKUP === # Add input files here that should be copied from backup_location to input_project_dir # Each entry should have: # - name: filename (required) @@ -248,11 +402,12 @@ msg_telemetry_config_transform_summary: | # # Examples of files to add: # - Static configuration files that don't need transformation -# - Files that are the same format in 2.0 and 2.1 +# - Files that are the same format in 2.1 and 2.2 # - Files where you want to preserve the backup values exactly # # DO NOT add files that require transformation (network_spec.yml, high_availability_config.yml, local_repo_config.yml, -# provision_config.yml, user_registry_credential.yml) +# provision_config.yml, software_config.json, telemetry_config.yml, user_registry_credential.yml) +# DO NOT add files that are newly generated (build_stream_config.yml, gitlab_config.yml, discovery_config.yml) restore_input_files: - name: software_config.json mode: '0644' diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/backup_configs.yml b/upgrade/roles/manage_upgrade_inputs/tasks/backup_configs.yml index 67b1f64868..3bc32cf7a8 100644 --- a/upgrade/roles/manage_upgrade_inputs/tasks/backup_configs.yml +++ b/upgrade/roles/manage_upgrade_inputs/tasks/backup_configs.yml @@ -1,37 +1,37 @@ # Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
--- -- name: "backup — Set backup directory" +- name: "Backup — Set backup directory" ansible.builtin.set_fact: upgrade_backup_dir: "/opt/omnia/.data/upgrade_backup/{{ upgrade_id }}" -- name: "backup — Create backup directory" +- name: "Backup — Create backup directory" ansible.builtin.file: path: "{{ upgrade_backup_dir }}" state: directory mode: '0755' -- name: "backup — Backup software_config.json" +- name: "Backup — Backup software_config.json" ansible.builtin.copy: src: "{{ input_project_dir }}/software_config.json" dest: "{{ upgrade_backup_dir }}/software_config.json" remote_src: true mode: '0644' -- name: "backup — Backup local_repo_config.yml" +- name: "Backup — Backup local_repo_config.yml" ansible.builtin.copy: src: "{{ input_project_dir }}/local_repo_config.yml" dest: "{{ upgrade_backup_dir }}/local_repo_config.yml" remote_src: true mode: '0644' -- name: "backup — Backup upgrade_manifest.yml" +- name: "Backup — Backup upgrade_manifest.yml" ansible.builtin.copy: src: "{{ role_path }}/../../upgrade_manifest.yml" dest: "{{ upgrade_backup_dir }}/upgrade_manifest.yml" mode: '0644' -- name: "backup — Create backup manifest" +- name: "Backup — Create backup manifest" ansible.builtin.copy: content: | # Upgrade Backup Manifest @@ -39,12 +39,12 @@ timestamp: {{ ansible_date_time.iso8601 }} source_omnia_version: {{ upgrade_source_version }} target_omnia_version: {{ upgrade_target_version }} - + enabled_components: {% for component in enabled_components %} - name: {{ component.key }} {% endfor %} - + files_backed_up: - software_config.json - local_repo_config.yml @@ -52,7 +52,7 @@ dest: "{{ upgrade_backup_dir }}/manifest.yml" mode: '0644' -- name: "backup — Display backup info" +- name: "Backup — Display backup info" ansible.builtin.debug: msg: - "✓ Backup created: {{ upgrade_backup_dir }}" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/calculate_hop_chain.yml b/upgrade/roles/manage_upgrade_inputs/tasks/calculate_hop_chain.yml index b9b2fd8e4c..f50c49d413 
100644 --- a/upgrade/roles/manage_upgrade_inputs/tasks/calculate_hop_chain.yml +++ b/upgrade/roles/manage_upgrade_inputs/tasks/calculate_hop_chain.yml @@ -1,7 +1,7 @@ # Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. --- -- name: "calculate_hops — Calculate hop chain from upgrade paths" +- name: "Calculate hops — Calculate hop chain from upgrade paths" calculate_upgrade_hops: upgrade_config: "{{ upgrade_config }}" current_software_config: "{{ current_software_config }}" @@ -9,16 +9,21 @@ target_omnia_version: "{{ upgrade_target_version }}" register: _hop_calculation -- name: "calculate_hops — Parse hop chain result" +- name: "Calculate hops — Parse hop chain result" ansible.builtin.set_fact: calculated_hop_chains: "{{ _hop_calculation.hop_chains }}" total_upgrade_hops: "{{ _hop_calculation.total_hops }}" upgrade_mode: "{{ _hop_calculation.upgrade_mode }}" -- name: "calculate_hops — Display calculated hop chain" +- name: "Calculate hops — Display calculated hop chain" ansible.builtin.debug: msg: - "Calculated Hop Chain:" - " Mode: {{ upgrade_mode }}" - " Total hops: {{ total_upgrade_hops }}" - - "{% for hop in calculated_hop_chains %} - {{ hop.software }}/{{ hop.hop_id }}: {{ hop.from_version }} → {{ hop.to_version }} (Omnia {{ hop.from_omnia_version }} → {{ hop.to_omnia_version }}){% endfor %}" + - >- + {% for hop in calculated_hop_chains %} + - {{ hop.software }}/{{ hop.hop_id }}: {{ hop.from_version }} → + {{ hop.to_version }} (Omnia {{ hop.from_omnia_version }} → + {{ hop.to_omnia_version }}) + {% endfor %} diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/display_summary.yml b/upgrade/roles/manage_upgrade_inputs/tasks/display_summary.yml index 022261bffe..d55042ff33 100644 --- a/upgrade/roles/manage_upgrade_inputs/tasks/display_summary.yml +++ b/upgrade/roles/manage_upgrade_inputs/tasks/display_summary.yml @@ -1,7 +1,7 @@ # Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
--- -- name: "summary — Display upgrade input summary" +- name: "Summary — Display upgrade input summary" ansible.builtin.debug: msg: - "==========================================" @@ -11,14 +11,24 @@ - "Mode: {{ upgrade_mode | default('single_hop') }}" - "Omnia: {{ upgrade_source_version }} → {{ upgrade_target_version }}" - "" - - "{% if upgrade_mode == 'multi_hop' %}Multi-Hop Upgrade Path:{% else %}Upgrade Path:{% endif %}" - - "{% for hop in calculated_hop_chains %} - {{ hop.software }}/{{ hop.hop_id }}: {{ hop.from_version }} → {{ hop.to_version }} (Omnia {{ hop.from_omnia_version }} → {{ hop.to_omnia_version }}){% endfor %}" + - >- + {% if upgrade_mode == 'multi_hop' %} + Multi-Hop Upgrade Path: + {% else %} + Upgrade Path: + {% endif %} + - >- + {% for hop in calculated_hop_chains %} + - {{ hop.software }}/{{ hop.hop_id }}: {{ hop.from_version }} → + {{ hop.to_version }} (Omnia {{ hop.from_omnia_version }} → + {{ hop.to_omnia_version }}) + {% endfor %} - "" - "Total Hops: {{ total_upgrade_hops }}" - "Backup: {{ upgrade_backup_dir }}" - "==========================================" -- name: "summary — Set upgrade facts for downstream roles" +- name: "Summary — Set upgrade facts for downstream roles" ansible.builtin.set_fact: upgrade_inputs_complete: true upgrade_backup_location: "{{ upgrade_backup_dir }}" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/load_upgrade_manifest.yml b/upgrade/roles/manage_upgrade_inputs/tasks/load_upgrade_manifest.yml index 7ad73c033e..5b03a98120 100644 --- a/upgrade/roles/manage_upgrade_inputs/tasks/load_upgrade_manifest.yml +++ b/upgrade/roles/manage_upgrade_inputs/tasks/load_upgrade_manifest.yml @@ -1,54 +1,54 @@ # Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
--- -- name: "load — Check oim_metadata.yml exists" +- name: "Load — Check oim_metadata.yml exists" ansible.builtin.stat: path: "/opt/omnia/.data/oim_metadata.yml" register: _oim_metadata_stat -- name: "load — Fail if oim_metadata.yml not found" +- name: "Load — Fail if oim_metadata.yml not found" ansible.builtin.fail: msg: | oim_metadata.yml not found at /opt/omnia/.data/oim_metadata.yml This file contains Omnia version information after omnia_core execution. when: not _oim_metadata_stat.stat.exists -- name: "load — Load oim_metadata.yml" +- name: "Load — Load oim_metadata.yml" ansible.builtin.include_vars: file: "/opt/omnia/.data/oim_metadata.yml" name: oim_metadata -- name: "load — Check upgrade_manifest.yml exists" +- name: "Load — Check upgrade_manifest.yml exists" ansible.builtin.stat: path: "{{ role_path }}/../../upgrade_manifest.yml" register: _upgrade_config_stat -- name: "load — Fail if upgrade_manifest.yml not found" +- name: "Load — Fail if upgrade_manifest.yml not found" ansible.builtin.fail: msg: | upgrade_manifest.yml not found at {{ role_path }}/../../upgrade_manifest.yml This file is the source of truth for upgrade paths and components. 
when: not _upgrade_config_stat.stat.exists -- name: "load — Load upgrade_manifest.yml" +- name: "Load — Load upgrade_manifest.yml" ansible.builtin.include_vars: file: "{{ role_path }}/../../upgrade_manifest.yml" name: upgrade_config -- name: "load — Extract enabled components" +- name: "Load — Extract enabled components" ansible.builtin.set_fact: enabled_components: >- - {{ upgrade_config.components | dict2items - | selectattr('value.enabled', 'equalto', true) + {{ upgrade_config.components | dict2items + | selectattr('value.enabled', 'equalto', true) | list }} -- name: "load — Set upgrade metadata facts from oim_metadata.yml" +- name: "Load — Set upgrade metadata facts from oim_metadata.yml" ansible.builtin.set_fact: upgrade_source_version: "{{ oim_metadata.omnia_previous_version }}" upgrade_target_version: "{{ oim_metadata.omnia_version }}" upgrade_id: "{{ lookup('pipe', 'date +%Y%m%d_%H%M%S') }}_{{ 99999 | random }}" -- name: "load — Display loaded configuration" +- name: "Load — Display loaded configuration" ansible.builtin.debug: msg: - "Upgrade Config Loaded:" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/update_component_json_repos.yml b/upgrade/roles/manage_upgrade_inputs/tasks/update_component_json_repos.yml index f916271fb3..83cdafd088 100644 --- a/upgrade/roles/manage_upgrade_inputs/tasks/update_component_json_repos.yml +++ b/upgrade/roles/manage_upgrade_inputs/tasks/update_component_json_repos.yml @@ -20,7 +20,7 @@ # that support versioned repositories (like slurm_custom). 
# ============================================================================ -- name: "json_repos - Update component JSON files with version-specific repo names" +- name: "JSON repos — Update component JSON files with version-specific repo names" update_component_json_repos: input_dir: "{{ input_project_dir }}" calculated_hop_chains: "{{ calculated_hop_chains | default([]) }}" @@ -30,11 +30,11 @@ # Add other components here that support versioned repositories register: _json_repo_update_result -- name: "json_repos - Parse update result" +- name: "JSON repos — Parse update result" ansible.builtin.set_fact: _json_repo_update_summary: "{{ _json_repo_update_result }}" -- name: "json_repos - Display update summary" +- name: "JSON repos — Display update summary" ansible.builtin.debug: msg: - "==========================================" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/update_software_config.yml b/upgrade/roles/manage_upgrade_inputs/tasks/update_software_config.yml index c110277605..6b9f31db73 100644 --- a/upgrade/roles/manage_upgrade_inputs/tasks/update_software_config.yml +++ b/upgrade/roles/manage_upgrade_inputs/tasks/update_software_config.yml @@ -1,18 +1,18 @@ # Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
--- -- name: "update — Update software_config.json with target versions" +- name: "Update — Update software_config.json with target versions" update_software_config: input_file: "{{ input_project_dir }}/software_config.json" hop_chains: "{{ calculated_hop_chains }}" upgrade_mode: "{{ upgrade_mode }}" register: _update_result -- name: "update — Parse update result" +- name: "Update — Parse update result" ansible.builtin.set_fact: _update_summary: "{{ _update_result }}" -- name: "update — Display update results" +- name: "Update — Display update results" ansible.builtin.debug: msg: - "✓ software_config.json updated" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/validate_current_deployment.yml b/upgrade/roles/manage_upgrade_inputs/tasks/validate_current_deployment.yml index ce4f10048b..8eb863577b 100644 --- a/upgrade/roles/manage_upgrade_inputs/tasks/validate_current_deployment.yml +++ b/upgrade/roles/manage_upgrade_inputs/tasks/validate_current_deployment.yml @@ -1,34 +1,34 @@ # Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
--- -- name: "validate — Load current software_config.json" +- name: "Validate — Load current software_config.json" ansible.builtin.slurp: path: "{{ input_project_dir }}/software_config.json" register: _sw_config_content -- name: "validate — Parse software_config.json" +- name: "Validate — Parse software_config.json" ansible.builtin.set_fact: current_software_config: "{{ _sw_config_content.content | b64decode | from_json }}" -- name: "validate — Extract current versions" +- name: "Validate — Extract current versions" ansible.builtin.set_fact: current_versions: >- - {{ current_software_config.softwares + {{ current_software_config.softwares | selectattr('version', 'defined') | items2dict(key_name='name', value_name='version') }} -- name: "validate — Check oim_metadata for deployment info" +- name: "Validate — Check oim_metadata for deployment info" ansible.builtin.slurp: path: /opt/omnia/.data/oim_metadata.yml register: _oim_metadata_content failed_when: false -- name: "validate — Parse oim_metadata" +- name: "Validate — Parse oim_metadata" ansible.builtin.set_fact: oim_metadata: "{{ _oim_metadata_content.content | b64decode | from_yaml }}" when: _oim_metadata_content is succeeded -- name: "validate - Verify current versions exist for enabled components" +- name: "Validate — Verify current versions exist for enabled components" ansible.builtin.assert: that: - current_versions[item.key] is defined @@ -41,7 +41,7 @@ loop_control: label: "{{ item.key }}" -- name: "validate - Check Omnia upgrade paths exist for current version" +- name: "Validate — Check Omnia upgrade paths exist for current version" ansible.builtin.assert: that: - upgrade_config.omnia_upgrade_paths[upgrade_source_version] is defined @@ -52,16 +52,20 @@ when: upgrade_source_version is defined run_once: true -- name: "validate - Check JSON files exist for current version" +- name: "Validate — Check JSON files exist for current version" ansible.builtin.stat: - path: "{{ input_project_dir }}/config/{{ 
item.1 }}/{{ current_software_config.cluster_os_type }}/{{ current_software_config.cluster_os_version }}/{{ item.0.key }}_v{{ current_versions[item.0.key] }}.json" + path: >- + {{ input_project_dir }}/config/{{ item.1 }}/{{ + current_software_config.cluster_os_type }}/{{ + current_software_config.cluster_os_version }}/{{ + item.0.key }}_v{{ current_versions[item.0.key] }}.json register: _json_exists loop: "{{ enabled_components | product(upgrade_active_architectures) | list }}" loop_control: label: "{{ item.0.key }}/{{ item.1 }}" when: current_versions[item.0.key] is defined -- name: "validate - Fail if JSON files missing" +- name: "Validate — Fail if JSON files missing" ansible.builtin.fail: msg: | JSON file not found for {{ item.item.0.key }} version {{ current_versions[item.item.0.key] }}: {{ item.invocation.module_args.path }} @@ -70,11 +74,11 @@ loop: "{{ _json_exists.results }}" loop_control: label: "{{ item.item.0.key }}" - when: + when: - item.stat is defined - not item.stat.exists -- name: "validate — Display validation results" +- name: "Validate — Display validation results" ansible.builtin.debug: msg: - " Current deployment validated" diff --git a/upgrade/roles/manage_upgrade_inputs/tasks/validate_hop_chains.yml b/upgrade/roles/manage_upgrade_inputs/tasks/validate_hop_chains.yml index fdf9ede037..2e1ae890b7 100644 --- a/upgrade/roles/manage_upgrade_inputs/tasks/validate_hop_chains.yml +++ b/upgrade/roles/manage_upgrade_inputs/tasks/validate_hop_chains.yml @@ -1,7 +1,7 @@ # Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
--- -- name: "validate_hops - Process hop chains for multi-hop upgrades" +- name: "Validate hops — Process hop chains for multi-hop upgrades" ansible.builtin.set_fact: _processed_hops: >- {%- set hops = [] -%} @@ -21,13 +21,17 @@ {%- endfor -%} {{ hops }} -- name: "validate_hops - Display processed hops" +- name: "Validate hops — Display processed hops" ansible.builtin.debug: msg: - "Processed {{ _processed_hops | length }} hop(s):" - - "{% for hop in _processed_hops %} - {{ hop.software }}/{{ hop.hop_id }}: {{ hop.from_version }} -> {{ hop.to_version }} (Omnia {{ hop.omnia_version }}){% endfor %}" + - >- + {% for hop in _processed_hops %} + - {{ hop.software }}/{{ hop.hop_id }}: {{ hop.from_version }} -> + {{ hop.to_version }} (Omnia {{ hop.omnia_version }}) + {% endfor %} -- name: "validate_hops - Validate hop chain sequence" +- name: "Validate hops — Validate hop chain sequence" ansible.builtin.assert: that: - item.from_version == current_versions.get(item.software, 'not_found') @@ -41,15 +45,19 @@ label: "{{ item.software }}/{{ item.hop_id }}" when: upgrade_mode == 'multi_hop' -- name: "validate_hops - Check JSON files exist for all hop targets" +- name: "Validate hops — Check JSON files exist for all hop targets" ansible.builtin.stat: - path: "{{ input_project_dir }}/config/{{ item.1 }}/{{ current_software_config.cluster_os_type }}/{{ current_software_config.cluster_os_version }}/{{ item.0.json_file }}" + path: >- + {{ input_project_dir }}/config/{{ item.1 }}/{{ + current_software_config.cluster_os_type }}/{{ + current_software_config.cluster_os_version }}/{{ + item.0.json_file }} register: _hop_json_check loop: "{{ _processed_hops | product(upgrade_active_architectures) | list }}" loop_control: label: "{{ item.0.software }}/{{ item.0.hop_id }}/{{ item.1 }}" -- name: "validate_hops - Fail if hop JSON files missing" +- name: "Validate hops — Fail if hop JSON files missing" ansible.builtin.fail: msg: | JSON file not found for hop {{ item.0.software }}/{{ 
item.0.hop_id }}: {{ item.invocation.module_args.path }} @@ -57,12 +65,12 @@ loop: "{{ _hop_json_check.results }}" loop_control: label: "{{ item.0.software }}/{{ item.0.hop_id }}" - when: + when: - item.stat is defined - not item.stat.exists - upgrade_mode == 'multi_hop' -- name: "validate_hops - Set hop chain facts for downstream roles" +- name: "Validate hops — Set hop chain facts for downstream roles" ansible.builtin.set_fact: upgrade_hop_chain: "{{ _processed_hops }}" total_upgrade_hops: "{{ _processed_hops | length }}" diff --git a/upgrade/roles/prep_local_repo/tasks/create_staging.yml b/upgrade/roles/prep_local_repo/tasks/create_staging.yml index 78d142715a..01996a92a9 100644 --- a/upgrade/roles/prep_local_repo/tasks/create_staging.yml +++ b/upgrade/roles/prep_local_repo/tasks/create_staging.yml @@ -18,17 +18,17 @@ # unnecessary re-syncing. # ============================================================================ -- name: "staging — Create staging directory" +- name: "Staging — Create staging directory" ansible.builtin.tempfile: state: directory prefix: "upgrade_local_repo_" register: _staging_dir -- name: "staging — Set staging path fact" +- name: "Staging — Set staging path fact" ansible.builtin.set_fact: upgrade_staging_dir: "{{ _staging_dir.path }}" -- name: "staging — Extract architectures from software_config.json" +- name: "Staging — Extract architectures from software_config.json" ansible.builtin.set_fact: _all_architectures: >- {{ @@ -40,17 +40,17 @@ | list }} -- name: "staging — Set upgrade_active_architectures from software_config.json" +- name: "Staging — Set upgrade_active_architectures from software_config.json" ansible.builtin.set_fact: upgrade_active_architectures: "{{ _all_architectures }}" -- name: "staging — Display detected architectures" +- name: "Staging — Display detected architectures" ansible.builtin.debug: msg: - "Architectures detected from software_config.json: {{ _all_architectures | join(', ') }}" - "Will process all 
detected architectures for upgrade" -- name: "staging — Create staging with modified configs" +- name: "Staging — Create staging with modified configs" create_upgrade_staging: staging_dir: "{{ _staging_dir.path }}" input_dir: "{{ input_project_dir }}" @@ -62,11 +62,11 @@ calculated_hop_chains: "{{ calculated_hop_chains | default([]) }}" register: _staging_result -- name: "staging — Parse staging result" +- name: "Staging — Parse staging result" ansible.builtin.set_fact: _staging_summary: "{{ _staging_result }}" -- name: "staging — Display staging summary" +- name: "Staging — Display staging summary" ansible.builtin.debug: msg: - "==========================================" diff --git a/upgrade/roles/prep_local_repo/tasks/load_upgrade_manifest.yml b/upgrade/roles/prep_local_repo/tasks/load_upgrade_manifest.yml index 2b45fa7c3c..5d1454fcaa 100644 --- a/upgrade/roles/prep_local_repo/tasks/load_upgrade_manifest.yml +++ b/upgrade/roles/prep_local_repo/tasks/load_upgrade_manifest.yml @@ -9,22 +9,22 @@ # skips redundant loading. 
# ============================================================================ -- name: "load_config — Check if upgrade_config already loaded" +- name: "Load config — Check if upgrade_config already loaded" ansible.builtin.set_fact: _config_already_loaded: "{{ upgrade_config is defined and upgrade_config.components is defined }}" -- name: "load_config — Skip loading if already available" +- name: "Load config — Skip loading if already available" ansible.builtin.debug: msg: "Upgrade config already loaded from manage_upgrade_inputs, skipping reload" when: _config_already_loaded | bool -- name: "load_config — Check if upgrade_manifest.yml exists" +- name: "Load config — Check if upgrade_manifest.yml exists" ansible.builtin.stat: path: "{{ role_path }}/../../upgrade_manifest.yml" register: _upgrade_config_stat when: not (_config_already_loaded | bool) -- name: "load_config — Fail if upgrade_manifest.yml not found" +- name: "Load config — Fail if upgrade_manifest.yml not found" ansible.builtin.fail: msg: | upgrade_manifest.yml not found at {{ role_path }}/../../upgrade_manifest.yml @@ -34,48 +34,48 @@ - not (_config_already_loaded | bool) - not _upgrade_config_stat.stat.exists -- name: "load_config — Load upgrade_manifest.yml" +- name: "Load config — Load upgrade_manifest.yml" ansible.builtin.include_vars: file: "{{ role_path }}/../../upgrade_manifest.yml" name: upgrade_config when: not (_config_already_loaded | bool) -- name: "load_config — Validate upgrade_config structure" +- name: "Load config — Validate upgrade_config structure" ansible.builtin.assert: that: - upgrade_config.components is defined fail_msg: "upgrade_manifest.yml is missing required sections (components)" when: not (_config_already_loaded | bool) -- name: "load_config — Check if oim_metadata.yml exists" +- name: "Load config — Check if oim_metadata.yml exists" ansible.builtin.stat: path: "/opt/omnia/.data/oim_metadata.yml" register: _oim_metadata_stat -- name: "load_config — Fail if oim_metadata.yml 
not found" +- name: "Load config — Fail if oim_metadata.yml not found" ansible.builtin.fail: msg: | oim_metadata.yml not found at /opt/omnia/.data/oim_metadata.yml This file contains Omnia version information after omnia_core execution. when: not _oim_metadata_stat.stat.exists -- name: "load_config — Load oim_metadata.yml" +- name: "Load config — Load oim_metadata.yml" ansible.builtin.include_vars: file: "/opt/omnia/.data/oim_metadata.yml" name: oim_metadata -- name: "load_config — Extract enabled components" +- name: "Load config — Extract enabled components" ansible.builtin.set_fact: _enabled_components: >- - {{ upgrade_config.components | dict2items - | selectattr('value.enabled', 'equalto', true) + {{ upgrade_config.components | dict2items + | selectattr('value.enabled', 'equalto', true) | list }} -- name: "load_config — Set upgrade target version from oim_metadata.yml" +- name: "Load config — Set upgrade target version from oim_metadata.yml" ansible.builtin.set_fact: upgrade_target_version: "{{ oim_metadata.omnia_version }}" -- name: "load_config — Display upgrade configuration" +- name: "Load config — Display upgrade configuration" ansible.builtin.debug: msg: - "==========================================" diff --git a/upgrade/roles/prep_local_repo/tasks/sync_local_repo.yml b/upgrade/roles/prep_local_repo/tasks/sync_local_repo.yml index c57812ad6c..ee9ecf33dc 100644 --- a/upgrade/roles/prep_local_repo/tasks/sync_local_repo.yml +++ b/upgrade/roles/prep_local_repo/tasks/sync_local_repo.yml @@ -12,12 +12,12 @@ # We initialise those variables here so the role works standalone. 
# ============================================================================ -- name: "sync — Set input_project_dir to staging for local_repo roles" +- name: "Sync — Set input_project_dir to staging for local_repo roles" ansible.builtin.set_fact: _original_input_project_dir: "{{ input_project_dir }}" input_project_dir: "{{ upgrade_staging_dir }}" -- name: "sync — Set variables for local_repo roles" +- name: "Sync — Set variables for local_repo roles" ansible.builtin.set_fact: sw_config_json_path: "{{ upgrade_staging_dir }}/software_config.json" local_repo_config_file: "{{ upgrade_staging_dir }}/local_repo_config.yml" @@ -28,35 +28,35 @@ playbook_start_time: "{{ ansible_date_time.epoch }}" # Initialise sub_final_repo_urls (normally set by validate_subscription role) -- name: "sync — Initialise subscription repo URLs" +- name: "Sync — Initialise subscription repo URLs" ansible.builtin.set_fact: sub_final_repo_urls: "{{ sub_final_repo_urls | default({}) }}" # Get actual Pulp URL from pulp status command (same as pulp_validation role) -- name: "sync — Get Pulp status" +- name: "Sync — Get Pulp status" ansible.builtin.command: /usr/local/bin/pulp status delegate_to: localhost changed_when: false register: _pulp_status_output -- name: "sync — Set Pulp connection variables from pulp status" +- name: "Sync — Set Pulp connection variables from pulp status" ansible.builtin.set_fact: pulp_content_origin: "{{ (_pulp_status_output.stdout | from_json).content_settings.content_origin }}" -- name: "sync — Parse Pulp connection details" +- name: "Sync — Parse Pulp connection details" ansible.builtin.set_fact: pulp_protocol: "{{ pulp_content_origin | urlsplit('scheme') | lower }}" pulp_server_ip: "{{ pulp_content_origin | urlsplit('hostname') }}" pulp_server_port: "{{ pulp_content_origin | urlsplit('port') }}" -- name: "sync — Add oim host to inventory" +- name: "Sync — Add oim host to inventory" ansible.builtin.add_host: name: oim pulp_protocol: "{{ pulp_protocol }}" 
pulp_server_port: "{{ pulp_server_port }}" ansible_connection: local -- name: "sync — Display sync configuration" +- name: "Sync — Display sync configuration" ansible.builtin.debug: msg: - "Starting local repo sync..." @@ -64,19 +64,19 @@ - "Softwares: {{ software_names | join(', ') }}" - "Pulp: {{ pulp_content_origin }}" -- name: "sync — Run validation role" +- name: "Sync — Run validation role" ansible.builtin.include_role: name: "{{ role_path }}/../../../local_repo/roles/validation" -- name: "sync — Run parse_and_download role" +- name: "Sync — Run parse_and_download role" ansible.builtin.include_role: name: "{{ role_path }}/../../../local_repo/roles/parse_and_download" -- name: "sync — Restore original input_project_dir" +- name: "Sync — Restore original input_project_dir" ansible.builtin.set_fact: input_project_dir: "{{ _original_input_project_dir }}" -- name: "sync — Display sync completion" +- name: "Sync — Display sync completion" ansible.builtin.debug: msg: - "==========================================" diff --git a/upgrade/roles/prep_local_repo/tasks/validate_prerequisites.yml b/upgrade/roles/prep_local_repo/tasks/validate_prerequisites.yml index e8149c746e..660547a8a7 100644 --- a/upgrade/roles/prep_local_repo/tasks/validate_prerequisites.yml +++ b/upgrade/roles/prep_local_repo/tasks/validate_prerequisites.yml @@ -7,34 +7,34 @@ # Reuses current_software_config from manage_upgrade_inputs if available. 
# ============================================================================ -- name: "validate — Check if software config already loaded" +- name: "Validate — Check if software config already loaded" ansible.builtin.set_fact: _sw_config_already_loaded: "{{ current_software_config is defined and current_software_config.softwares is defined }}" -- name: "validate — Reuse existing software config" +- name: "Validate — Reuse existing software config" ansible.builtin.set_fact: _current_software_config: "{{ current_software_config }}" when: _sw_config_already_loaded | bool -- name: "validate — Load current software_config.json" +- name: "Validate — Load current software_config.json" ansible.builtin.slurp: path: "{{ input_project_dir }}/software_config.json" register: _current_sw_config when: not (_sw_config_already_loaded | bool) -- name: "validate — Parse software_config.json" +- name: "Validate — Parse software_config.json" ansible.builtin.set_fact: _current_software_config: "{{ _current_sw_config.content | b64decode | from_json }}" when: not (_sw_config_already_loaded | bool) -- name: "validate - Extract current versions for enabled components" +- name: "Validate — Extract current versions for enabled components" ansible.builtin.set_fact: _current_versions: >- - {{ _current_software_config.softwares + {{ _current_software_config.softwares | selectattr('name', 'in', _enabled_components | map(attribute='key') | list) | items2dict(key_name='name', value_name='version') }} -- name: "validate - Verify current versions exist for enabled components" +- name: "Validate — Verify current versions exist for enabled components" ansible.builtin.assert: that: - _current_versions[item.key] is defined @@ -47,16 +47,20 @@ loop_control: label: "{{ item.key }}" -- name: "validate - Check current version JSON files exist in input directory" +- name: "Validate — Check current version JSON files exist in input directory" ansible.builtin.stat: - path: "{{ input_project_dir }}/config/{{ 
item.1 }}/{{ _current_software_config.cluster_os_type }}/{{ _current_software_config.cluster_os_version }}/{{ item.0.key }}_v{{ _current_versions[item.0.key] }}.json" + path: >- + {{ input_project_dir }}/config/{{ item.1 }}/{{ + _current_software_config.cluster_os_type }}/{{ + _current_software_config.cluster_os_version }}/{{ + item.0.key }}_v{{ _current_versions[item.0.key] }}.json register: _json_file_check loop: "{{ _enabled_components | product(upgrade_active_architectures) | list }}" loop_control: label: "{{ item.0.key }} ({{ item.1 }})" when: _current_versions[item.0.key] is defined -- name: "validate - Verify JSON files exist" +- name: "Validate — Verify JSON files exist" ansible.builtin.assert: that: - item.stat.exists @@ -69,17 +73,17 @@ label: "{{ item.invocation.module_args.path | basename }}" when: item.stat is defined -- name: "validate — Check repos.yml exists" +- name: "Validate — Check repos.yml exists" ansible.builtin.stat: path: "{{ role_path }}/../../artifacts/repos.yml" register: _repos_yml_stat -- name: "validate — Warn if repos.yml not found" +- name: "Validate — Warn if repos.yml not found" ansible.builtin.debug: msg: "Warning: repos.yml not found at {{ role_path }}/../../artifacts/repos.yml - upgrade repos may not be available" when: not _repos_yml_stat.stat.exists -- name: "validate - Display validation summary" +- name: "Validate — Display validation summary" ansible.builtin.debug: msg: - " Prerequisites validated" From 9b232c8bce79edde232438e8e322a61e45b034df Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Tue, 12 May 2026 20:29:09 +0530 Subject: [PATCH 12/17] Initial draft upgrade structure (#4398) * upgrade inital draft Signed-off-by: Abhishek S A * Update prepare_upgrade.yml * upgrade lint fix * upgrade and rollback fixes * upgrade and rollback fix * Update upgrade.yml * upgrade and rollback fix * Update upgrade_oim.yml --------- Signed-off-by: Abhishek S A --- rollback/ansible.cfg | 18 + .../playbooks/rollback_cloud_init_bss.yml | 
67 ++++ rollback/playbooks/rollback_k8s.yml | 69 ++++ rollback/playbooks/rollback_oim.yml | 76 ++++ rollback/playbooks/rollback_slurm.yml | 70 ++++ rollback/playbooks/rollback_telemetry.yml | 70 ++++ rollback/rollback.yml | 353 ++++++++++++++++++ upgrade/ansible.cfg | 1 + upgrade/main.yml | 14 - upgrade/playbooks/upgrade_build_image.yml | 68 ++++ upgrade/playbooks/upgrade_cloud_init_bss.yml | 70 ++++ upgrade/playbooks/upgrade_k8s.yml | 70 ++++ upgrade/playbooks/upgrade_local_repo.yml | 62 +++ upgrade/playbooks/upgrade_oim.yml | 259 +++++++++++++ upgrade/playbooks/upgrade_slurm.yml | 69 ++++ upgrade/playbooks/upgrade_telemetry.yml | 74 ++++ upgrade/prepare_upgrade.yml | 43 +++ upgrade/rollback_omnia.yml | 54 --- upgrade/upgrade.yml | 311 +++++++++++++++ upgrade/upgrade_cluster.yml | 20 - upgrade/upgrade_oim.yml | 24 -- upgrade/upgrade_omnia.yml | 30 -- 22 files changed, 1750 insertions(+), 142 deletions(-) create mode 100644 rollback/ansible.cfg create mode 100644 rollback/playbooks/rollback_cloud_init_bss.yml create mode 100644 rollback/playbooks/rollback_k8s.yml create mode 100644 rollback/playbooks/rollback_oim.yml create mode 100644 rollback/playbooks/rollback_slurm.yml create mode 100644 rollback/playbooks/rollback_telemetry.yml create mode 100644 rollback/rollback.yml delete mode 100644 upgrade/main.yml create mode 100644 upgrade/playbooks/upgrade_build_image.yml create mode 100644 upgrade/playbooks/upgrade_cloud_init_bss.yml create mode 100644 upgrade/playbooks/upgrade_k8s.yml create mode 100644 upgrade/playbooks/upgrade_local_repo.yml create mode 100644 upgrade/playbooks/upgrade_oim.yml create mode 100644 upgrade/playbooks/upgrade_slurm.yml create mode 100644 upgrade/playbooks/upgrade_telemetry.yml create mode 100644 upgrade/prepare_upgrade.yml delete mode 100644 upgrade/rollback_omnia.yml create mode 100644 upgrade/upgrade.yml delete mode 100644 upgrade/upgrade_cluster.yml delete mode 100644 upgrade/upgrade_oim.yml delete mode 100644 
upgrade/upgrade_omnia.yml diff --git a/rollback/ansible.cfg b/rollback/ansible.cfg new file mode 100644 index 0000000000..6469fc8b23 --- /dev/null +++ b/rollback/ansible.cfg @@ -0,0 +1,18 @@ +[defaults] +log_path = /opt/omnia/log/core/playbooks/rollback.log +remote_tmp = /opt/omnia/tmp/.ansible/tmp/ +host_key_checking = false +forks = 5 +timeout = 180 +executable = /bin/bash +roles_path = ../upgrade/roles:../utils/roles +library = ../common/library/modules +module_utils = ../common/library/module_utils + +[persistent_connection] +command_timeout = 180 +connect_timeout = 180 + +[ssh_connection] +retries = 3 +ssh_args = -o ControlMaster=auto -o ControlPersist=60 -o ConnectTimeout=60 diff --git a/rollback/playbooks/rollback_cloud_init_bss.yml b/rollback/playbooks/rollback_cloud_init_bss.yml new file mode 100644 index 0000000000..7080517ea5 --- /dev/null +++ b/rollback/playbooks/rollback_cloud_init_bss.yml @@ -0,0 +1,67 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Rollback Cloud-Init and BSS parameters + hosts: localhost + connection: local + gather_facts: false + vars: + rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml + component_name: cloud_init_bss + tasks: + - name: Read rollback_manifest.yml + ansible.builtin.slurp: + src: "{{ rollback_manifest_path }}" + register: raw_rollback_manifest + + - name: Parse rollback manifest + ansible.builtin.set_fact: + rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}" + + - name: Skip if cloud_init_bss already rolled back + ansible.builtin.meta: end_play + when: + - rollback_manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set cloud_init_bss rollback status to in-progress + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' + + # TODO: Implement Cloud-Init/BSS rollback steps per ESpec §4.9/§4.11: + # 1. Read pre-upgrade BSS state from backup files + # 2. For each functional group: PUT /boot/v1/bootparameters with backup payload + # 3. Set operation_type to 'reboot' for all groups + # 4. 
Validate BSS entries match pre-upgrade state + - name: Cloud-Init/BSS rollback placeholder + ansible.builtin.debug: + msg: "Cloud-Init/BSS rollback tasks to be implemented (restore BSS params from backup)" + + - name: Mark cloud_init_bss rollback as completed + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' diff --git a/rollback/playbooks/rollback_k8s.yml b/rollback/playbooks/rollback_k8s.yml new file mode 100644 index 0000000000..0a91ae8f52 --- /dev/null +++ b/rollback/playbooks/rollback_k8s.yml @@ -0,0 +1,69 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Rollback Kubernetes cluster + hosts: localhost + connection: local + gather_facts: false + vars: + rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml + component_name: k8s + tasks: + - name: Read rollback_manifest.yml + ansible.builtin.slurp: + src: "{{ rollback_manifest_path }}" + register: raw_rollback_manifest + + - name: Parse rollback manifest + ansible.builtin.set_fact: + rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}" + + - name: Skip if K8s already rolled back + ansible.builtin.meta: end_play + when: + - rollback_manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set K8s rollback status to in-progress + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' + + # TODO: Implement K8s rollback steps per ESpec §4.11.4: + # 1. Restore etcd from pre-upgrade snapshot + # 2. Deploy cluster with previous version image + # 3. Restore /etc/kubernetes configs from backup + # 4. Restart K8s services on all nodes + # 5. Validate cluster health (nodes Ready, etcd quorum, pods running) + # 6. 
Rollback BSS + cloud-init to pre-upgrade state + - name: K8s rollback placeholder + ansible.builtin.debug: + msg: "K8s rollback tasks to be implemented (etcd restore, cluster redeploy, BSS rollback)" + + - name: Mark K8s rollback as completed + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' diff --git a/rollback/playbooks/rollback_oim.yml b/rollback/playbooks/rollback_oim.yml new file mode 100644 index 0000000000..97d3d0cc85 --- /dev/null +++ b/rollback/playbooks/rollback_oim.yml @@ -0,0 +1,76 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Rollback OIM (includes OpenCHAMI + BuildStream) + hosts: localhost + connection: local + gather_facts: false + vars: + rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml + component_name: oim + tasks: + - name: Read rollback_manifest.yml + ansible.builtin.slurp: + src: "{{ rollback_manifest_path }}" + register: raw_rollback_manifest + + - name: Parse rollback manifest + ansible.builtin.set_fact: + rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}" + + - name: Skip if OIM already rolled back + ansible.builtin.meta: end_play + when: + - rollback_manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set OIM rollback status to in-progress + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' + + # TODO: Implement OIM rollback steps per ESpec §4.11.3: + # 1. Stop BuildStream container (systemctl stop buildstream) + # 2. Restore buildstream.container quadlet from backup + # 3. systemctl daemon-reload + # 4. Start BuildStream container (systemctl start buildstream) + # 5. Validate container healthy + # 6. Restore OpenCHAMI quadlet files from backup + # 7. Restore configs_vars.yaml from backup + # 8. Restore version.yml from backup + # 9. Checkout previous deployment-recipes version + # 10. Restore PostgreSQL from pg_dump backup + # 11. systemctl daemon-reload + # 12. Restart all OpenCHAMI services + # 13. 
Validate (SMD, BSS, S3, node inventory) + - name: OIM rollback placeholder + ansible.builtin.debug: + msg: "OIM rollback tasks to be implemented (BuildStream + OpenCHAMI restore)" + + - name: Mark OIM rollback as completed + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' diff --git a/rollback/playbooks/rollback_slurm.yml b/rollback/playbooks/rollback_slurm.yml new file mode 100644 index 0000000000..d273f87275 --- /dev/null +++ b/rollback/playbooks/rollback_slurm.yml @@ -0,0 +1,70 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Rollback Slurm feature updates + hosts: localhost + connection: local + gather_facts: false + vars: + rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml + component_name: slurm + tasks: + - name: Read rollback_manifest.yml + ansible.builtin.slurp: + src: "{{ rollback_manifest_path }}" + register: raw_rollback_manifest + + - name: Parse rollback manifest + ansible.builtin.set_fact: + rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}" + + - name: Skip if slurm already rolled back + ansible.builtin.meta: end_play + when: + - rollback_manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set slurm rollback status to in-progress + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' + + # TODO: Implement Slurm rollback steps per ESpec §4.11.5: + # 1. Pre-Rollback: Backup current state (slurmdbd, configs, GRE, mounts) + # 2. Point old image using BSS command + # 3. Restore Slurm database, configs, GRE configs + # 4. Rollback HPC Tools (login/compiler nodes) + # 5. Restore mount configurations + # 6. Validate cluster health + # 7. 
Rollback BSS + cloud-init to pre-upgrade state + - name: Slurm rollback placeholder + ansible.builtin.debug: + msg: "Slurm rollback tasks to be implemented (BSS repoint, DB restore, config rollback)" + + - name: Mark slurm rollback as completed + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' diff --git a/rollback/playbooks/rollback_telemetry.yml b/rollback/playbooks/rollback_telemetry.yml new file mode 100644 index 0000000000..d522da8259 --- /dev/null +++ b/rollback/playbooks/rollback_telemetry.yml @@ -0,0 +1,70 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Rollback Telemetry components + hosts: localhost + connection: local + gather_facts: false + vars: + rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml + component_name: telemetry + tasks: + - name: Read rollback_manifest.yml + ansible.builtin.slurp: + src: "{{ rollback_manifest_path }}" + register: raw_rollback_manifest + + - name: Parse rollback manifest + ansible.builtin.set_fact: + rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}" + + - name: Skip if telemetry already rolled back + ansible.builtin.meta: end_play + when: + - rollback_manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set telemetry rollback status to in-progress + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' + + # TODO: Implement telemetry rollback steps per ESpec §4.8.5: + # 1. Helm uninstall new components (powerscale, vast, victorialogs, ufm) + # 2. Rollback Strimzi operator + Kafka brokers to previous version + # 3. Rollback VictoriaMetrics StatefulSet(s) to previous version + # 4. Rollback iDRAC telemetry receiver + pump images + # 5. Restore LDMS sampler/aggregator configs from backup + # 6. Rolling restart LDMS pods + # 7. 
Validate: all telemetry pods Running, metrics/logs flowing + - name: Telemetry rollback placeholder + ansible.builtin.debug: + msg: "Telemetry rollback tasks to be implemented (Helm uninstall, component rollback)" + + - name: Mark telemetry rollback as completed + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': rollback_manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' diff --git a/rollback/rollback.yml b/rollback/rollback.yml new file mode 100644 index 0000000000..680a3ca5cb --- /dev/null +++ b/rollback/rollback.yml @@ -0,0 +1,353 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# ============================================================================ +# rollback.yml — User-Facing Rollback Orchestrator +# ============================================================================ +# +# VERSION POLICY: +# oim_metadata.yml is read-only. Rollback manifest fields are derived +# directly from it: +# omnia_version → source_version (currently installed) +# omnia_previous_version → target_version (rolling back to) +# upgrade_backup_dir → backup_dir +# +# LOCK POLICY: +# - rollback_in_progress.lock may be pre-created by omnia.sh --rollback. +# If present, the playbook proceeds (does NOT fail). +# - upgrade_in_progress.lock signals an upgrade is active. 
Rollback MUST +# fail if this lock exists. +# - If no rollback lock exists, the playbook creates it. +# ============================================================================ + +# ────────────────────────────────────────────────────────────────────── +# Play 0: Pre-flight — read oim_metadata, initialize rollback_manifest +# ────────────────────────────────────────────────────────────────────── +- name: Pre-flight — initialize rollback manifest and validate state + hosts: localhost + connection: local + gather_facts: false + tags: always + vars: + metadata_path: /opt/omnia/.data/oim_metadata.yml + upgrade_manifest_path: /opt/omnia/.data/upgrade_manifest.yml + rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml + upgrade_lock_path: /opt/omnia/.data/upgrade_in_progress.lock + rollback_lock_path: /opt/omnia/.data/rollback_in_progress.lock + all_rollback_components: [slurm, cloud_init_bss, telemetry, k8s, oim] + tasks: + # ─── Lock check: fail ONLY on upgrade lock; ignore rollback lock ─── + - name: Check for active upgrade lock + ansible.builtin.stat: + path: "{{ upgrade_lock_path }}" + register: upgrade_lock_stat + + - name: Read upgrade lock contents (if any) + ansible.builtin.slurp: + src: "{{ upgrade_lock_path }}" + register: upgrade_lock_raw + when: upgrade_lock_stat.stat.exists + failed_when: false + + - name: Abort if an upgrade is in progress + ansible.builtin.fail: + msg: | + An upgrade is currently in progress. Cannot start a rollback. 
+ Lock file: {{ upgrade_lock_path }} + Lock contents: + {{ (upgrade_lock_raw.content | b64decode) if (upgrade_lock_raw.content is defined) else '(unreadable)' }} + + Wait for the upgrade to finish, or if no process is actually + running (e.g., previous run crashed), manually remove the lock: + rm {{ upgrade_lock_path }} + when: upgrade_lock_stat.stat.exists + + - name: Ensure .data directory exists + ansible.builtin.file: + path: /opt/omnia/.data + state: directory + mode: '0755' + + # ─── Create rollback lock only if it doesn't already exist ─── + # (omnia.sh --rollback may have already created it; that's expected.) + - name: Check for existing rollback lock + ansible.builtin.stat: + path: "{{ rollback_lock_path }}" + register: rollback_lock_stat + + - name: Create rollback lock file (only if absent) + ansible.builtin.copy: + content: | + operation: rollback + pid: {{ ansible_pid | default('unknown') }} + started_at: "{{ lookup('pipe', 'date -u +%Y-%m-%dT%H:%M:%SZ') }}" + host: "{{ inventory_hostname }}" + created_by: ansible-playbook + dest: "{{ rollback_lock_path }}" + mode: '0644' + when: not rollback_lock_stat.stat.exists + # ───────────────────────────────────────────────────────────── + + # ─── Read oim_metadata.yml to derive version fields ─── + - name: Read oim_metadata.yml + ansible.builtin.slurp: + src: "{{ metadata_path }}" + register: raw_metadata + + - name: Parse oim_metadata.yml + ansible.builtin.set_fact: + oim_metadata: "{{ raw_metadata.content | b64decode | from_yaml }}" + + # ─── Read upgrade_manifest.yml (for triggered_from_upgrade_id linkage) ─── + - name: Check for upgrade_manifest.yml + ansible.builtin.stat: + path: "{{ upgrade_manifest_path }}" + register: upgrade_manifest_pre_stat + + - name: Read upgrade_manifest.yml + ansible.builtin.slurp: + src: "{{ upgrade_manifest_path }}" + register: raw_upgrade_manifest + when: upgrade_manifest_pre_stat.stat.exists + + - name: Parse upgrade manifest + ansible.builtin.set_fact: + upgrade_manifest: 
"{{ raw_upgrade_manifest.content | b64decode | from_yaml }}" + when: upgrade_manifest_pre_stat.stat.exists + + - name: Resolve requested tags (handle Ansible 'all' default) + ansible.builtin.set_fact: + requested_tags: >- + {{ all_rollback_components + if (ansible_run_tags is not defined or 'all' in ansible_run_tags) + else ansible_run_tags }} + + - name: Check for existing rollback_manifest.yml + ansible.builtin.stat: + path: "{{ rollback_manifest_path }}" + register: existing_rollback + + # If a previous rollback completed successfully, archive the stale + # rollback manifest so this run starts fresh. + - name: Read existing rollback manifest to check status (if present) + ansible.builtin.slurp: + src: "{{ rollback_manifest_path }}" + register: prior_rollback_raw + when: existing_rollback.stat.exists + + - name: Parse prior rollback manifest + ansible.builtin.set_fact: + prior_rollback: "{{ prior_rollback_raw.content | b64decode | from_yaml }}" + when: existing_rollback.stat.exists + + - name: Archive stale completed rollback_manifest.yml + ansible.builtin.copy: + src: "{{ rollback_manifest_path }}" + dest: "{{ rollback_manifest_path }}.{{ prior_rollback.rollback_id | default('prev') }}" + remote_src: true + mode: '0644' + when: + - existing_rollback.stat.exists + - prior_rollback.rollback_status | default('') == 'completed' + + - name: Remove stale completed rollback_manifest.yml + ansible.builtin.file: + path: "{{ rollback_manifest_path }}" + state: absent + when: + - existing_rollback.stat.exists + - prior_rollback.rollback_status | default('') == 'completed' + + - name: Re-check rollback_manifest.yml after possible cleanup + ansible.builtin.stat: + path: "{{ rollback_manifest_path }}" + register: existing_rollback + + # ─── Initialize rollback_manifest.yml using oim_metadata as source-of-truth ─── + - name: Initialize rollback_manifest.yml (first invocation) + ansible.builtin.copy: + content: | + rollback_id: "rollback-{{ 9999 | random }}" + 
triggered_from_upgrade_id: "{{ upgrade_manifest.upgrade_id | default('unknown') }}" + source_version: "{{ oim_metadata.omnia_version }}" + target_version: "{{ oim_metadata.omnia_previous_version }}" + rollback_status: "in-progress" + backup_dir: "{{ oim_metadata.upgrade_backup_dir }}" + component_status: + slurm: "pending" + cloud_init_bss: "pending" + telemetry: "pending" + k8s: "pending" + oim: "pending" + dest: "{{ rollback_manifest_path }}" + mode: '0644' + when: not existing_rollback.stat.exists + + - name: Read rollback_manifest.yml + ansible.builtin.slurp: + src: "{{ rollback_manifest_path }}" + register: raw_rollback_manifest + + - name: Parse rollback manifest + ansible.builtin.set_fact: + rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}" + + - name: Report already-rolled-back components (will be skipped) + ansible.builtin.debug: + msg: "Component '{{ item }}' already rolled back — will be skipped." + loop: "{{ requested_tags }}" + when: + - rollback_manifest.component_status[item] is defined + - rollback_manifest.component_status[item] == 'completed' + +# ────────────────────────────────────────────────────────────────────── +# Rollback sub-flows (reverse order; each reads rollback_manifest +# and skips if its component_status is already 'completed') +# ────────────────────────────────────────────────────────────────────── +- name: Rollback Slurm cluster (rolled back first — depends on K8s/OIM) + ansible.builtin.import_playbook: playbooks/rollback_slurm.yml + tags: [slurm] + +- name: Rollback Cloud-Init and BSS parameters + ansible.builtin.import_playbook: playbooks/rollback_cloud_init_bss.yml + tags: [cloud_init_bss] + +- name: Rollback Telemetry components + ansible.builtin.import_playbook: playbooks/rollback_telemetry.yml + tags: [telemetry] + +- name: Rollback Kubernetes cluster + ansible.builtin.import_playbook: playbooks/rollback_k8s.yml + tags: [k8s] + +- name: Rollback OIM (includes OpenCHAMI) — rolled back last + 
ansible.builtin.import_playbook: playbooks/rollback_oim.yml + tags: [oim] + +# ────────────────────────────────────────────────────────────────────── +# Post-rollback: finalize rollback_manifest.yml + clear rollback lock +# (oim_metadata.yml is intentionally NOT modified) +# ────────────────────────────────────────────────────────────────────── +- name: Finalize rollback state + hosts: localhost + connection: local + gather_facts: false + tags: always + vars: + rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml + upgrade_manifest_path: /opt/omnia/.data/upgrade_manifest.yml + rollback_lock_path: /opt/omnia/.data/rollback_in_progress.lock + all_rollback_components: [slurm, cloud_init_bss, telemetry, k8s, oim] + tasks: + - name: Read rollback_manifest.yml + ansible.builtin.slurp: + src: "{{ rollback_manifest_path }}" + register: raw_rollback_manifest + + - name: Parse rollback manifest + ansible.builtin.set_fact: + rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}" + + # Keep each component's recorded status (trimmed; empty/null becomes 'pending') and + # emit JSON so the next task can coerce it back into a real dict (not a string). + - name: Normalize component_status values (preserve each recorded status) + ansible.builtin.set_fact: + cleaned_component_status: >- + {%- set result = {} -%} + {%- for key, value in rollback_manifest.component_status.items() -%} + {%- set _ = result.update({key: (value | default('pending', true) | string | trim)}) -%} + {%- endfor -%} + {{ result | to_json }} + + - name: Coerce cleaned_component_status to a real dict + ansible.builtin.set_fact: + cleaned_component_status: "{{ cleaned_component_status | from_json }}" + + # rollback_status is derived from the normalized statuses: 'completed' only when + # every component reports 'completed'; anything else yields 'partial'.
+ - name: Determine rollback_status + ansible.builtin.set_fact: + new_status: >- + {{ 'completed' + if (cleaned_component_status.values() + | unique | list == ['completed']) + else 'partial' }} + + - name: Write finalized rollback_manifest.yml with cleaned component_status + ansible.builtin.copy: + content: >- + {{ rollback_manifest | combine({ + 'component_status': cleaned_component_status, + 'rollback_status': new_status + }) | to_nice_yaml }} + dest: "{{ rollback_manifest_path }}" + mode: '0644' + + # ─── Invalidate the upgrade manifest so next upgrade starts fresh ─── + - name: Check if upgrade_manifest.yml exists + ansible.builtin.stat: + path: "{{ upgrade_manifest_path }}" + register: upgrade_manifest_stat + when: new_status == 'completed' + + - name: Archive the stale upgrade_manifest.yml (only on full rollback) + ansible.builtin.copy: + src: "{{ upgrade_manifest_path }}" + dest: "{{ upgrade_manifest_path }}.rolledback-{{ rollback_manifest.rollback_id }}" + remote_src: true + mode: '0644' + when: + - new_status == 'completed' + - upgrade_manifest_stat.stat.exists + + - name: Remove the stale upgrade_manifest.yml so next upgrade starts fresh + ansible.builtin.file: + path: "{{ upgrade_manifest_path }}" + state: absent + when: + - new_status == 'completed' + - upgrade_manifest_stat.stat.exists + + # NOTE: oim_metadata.yml is intentionally NOT modified by upgrade or + # rollback flows. It is read-only and reflects the cluster's installed + # version baseline only. + + - name: Remove rollback guard lock (always release at end) + ansible.builtin.file: + path: "{{ rollback_lock_path }}" + state: absent + # ───────────────────────────────────────────────────────────────────────── + + # FIX 3: Read from cleaned_component_status (real-time accurate values) + # instead of rollback_manifest.component_status (stale/corrupt). 
+ - name: Display rollback summary + ansible.builtin.debug: + msg: + - "=========================================" + - " ROLLBACK {{ new_status | upper }}" + - "=========================================" + - "Rollback ID: {{ rollback_manifest.rollback_id }}" + - "Source: {{ rollback_manifest.source_version }} → Target: {{ rollback_manifest.target_version }}" + - "" + - "Component Status:" + - " slurm: {{ cleaned_component_status.slurm }}" + - " cloud_init_bss: {{ cleaned_component_status.cloud_init_bss }}" + - " telemetry: {{ cleaned_component_status.telemetry }}" + - " k8s: {{ cleaned_component_status.k8s }}" + - " oim: {{ cleaned_component_status.oim }}" + - "" + - "Upgrade manifest archived; next upgrade run will start fresh." + - "" + - "NEXT STEP: Run 'sudo ./omnia.sh --rollback' on the OIM host" + - "to complete the core container rollback." diff --git a/upgrade/ansible.cfg b/upgrade/ansible.cfg index c8b2d1e469..5fe8f84a7a 100644 --- a/upgrade/ansible.cfg +++ b/upgrade/ansible.cfg @@ -5,6 +5,7 @@ host_key_checking = false forks = 5 timeout = 180 executable = /bin/bash +roles_path = roles:../utils/roles library = ../common/library/modules module_utils = ../common/library/module_utils diff --git a/upgrade/main.yml b/upgrade/main.yml deleted file mode 100644 index f4c5b1b7cb..0000000000 --- a/upgrade/main.yml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- diff --git a/upgrade/playbooks/upgrade_build_image.yml b/upgrade/playbooks/upgrade_build_image.yml new file mode 100644 index 0000000000..64f35e4c3c --- /dev/null +++ b/upgrade/playbooks/upgrade_build_image.yml @@ -0,0 +1,68 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Upgrade Build Image (SquashFS build) + hosts: localhost + connection: local + gather_facts: false + vars: + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + component_name: build_image + tasks: + - name: Read upgrade_manifest.yml + ansible.builtin.slurp: + src: "{{ manifest_path }}" + register: raw_manifest + + - name: Parse manifest + ansible.builtin.set_fact: + manifest: "{{ raw_manifest.content | b64decode | from_yaml }}" + + - name: Skip if build_image already upgraded + ansible.builtin.meta: end_play + when: + - manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set build_image upgrade status to in-progress + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + # TODO: Implement per ESpec §4.6 and Build_Image_Upgrade_Component_Spec (NOTE(review): until these steps exist, this play only rewrites component_status in upgrade_manifest.yml and prints the placeholder message below, yet the final task still records build_image as 'completed'): + # 1. Check MinIO for existing target-version SquashFS images (reuse if exists) + # 2. Preserve existing 2.1 images + # 3.
Build new SquashFS images for target version via BuildStream + # 4. Upload built images to MinIO + # 5. Validate all required images are available in MinIO + - name: Build image upgrade placeholder + ansible.builtin.debug: + msg: "Build image upgrade tasks to be implemented (MinIO check, SquashFS build, upload)" + + - name: Mark build_image upgrade as completed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' diff --git a/upgrade/playbooks/upgrade_cloud_init_bss.yml b/upgrade/playbooks/upgrade_cloud_init_bss.yml new file mode 100644 index 0000000000..7cb85f9b1a --- /dev/null +++ b/upgrade/playbooks/upgrade_cloud_init_bss.yml @@ -0,0 +1,70 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- + +- name: Upgrade Cloud-Init and BSS boot parameters + hosts: localhost + connection: local + gather_facts: false + vars: + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + component_name: cloud_init_bss + tasks: + - name: Read upgrade_manifest.yml + ansible.builtin.slurp: + src: "{{ manifest_path }}" + register: raw_manifest + + - name: Parse manifest + ansible.builtin.set_fact: + manifest: "{{ raw_manifest.content | b64decode | from_yaml }}" + + - name: Skip if cloud_init_bss already upgraded + ansible.builtin.meta: end_play + when: + - manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set cloud_init_bss upgrade status to in-progress + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + # TODO: Implement per ESpec §4.9 and Cloud_Init_BSS_Upgrade_Component_Spec (NOTE(review): until these steps exist, this play only rewrites component_status in upgrade_manifest.yml and prints the placeholder message below, yet the final task still records cloud_init_bss as 'completed'): + # 1. Pre-flight: OpenCHAMI running, BSS API reachable, auth token valid + # 2. Read functional groups from SMD + # 3. Backup current BSS boot parameters per group + # 4. Generate updated cloud-init per functional group + # 5. Update BSS via PUT /boot/v1/bootparameters + # 6. Apply per-node overrides if applicable + # 7.
Validate: BSS entries correct, operation_type set + - name: Cloud-Init/BSS upgrade placeholder + ansible.builtin.debug: + msg: "Cloud-Init/BSS upgrade tasks to be implemented (BSS API update per functional group)" + + - name: Mark cloud_init_bss upgrade as completed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' diff --git a/upgrade/playbooks/upgrade_k8s.yml b/upgrade/playbooks/upgrade_k8s.yml new file mode 100644 index 0000000000..1e9cc6ab9f --- /dev/null +++ b/upgrade/playbooks/upgrade_k8s.yml @@ -0,0 +1,70 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- + +- name: Upgrade Kubernetes cluster + hosts: localhost + connection: local + gather_facts: false + vars: + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + component_name: k8s + tasks: + - name: Read upgrade_manifest.yml + ansible.builtin.slurp: + src: "{{ manifest_path }}" + register: raw_manifest + + - name: Parse manifest + ansible.builtin.set_fact: + manifest: "{{ raw_manifest.content | b64decode | from_yaml }}" + + - name: Skip if k8s already upgraded + ansible.builtin.meta: end_play + when: + - manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set k8s upgrade status to in-progress + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + # TODO: Implement per ESpec §4.7 (NOTE(review): until these steps exist, this play only rewrites component_status in upgrade_manifest.yml and prints the placeholder message below, yet the final task still records k8s as 'completed'): + # 1. Validation gates (Pulp repos, SSH, cluster health, etcd, version chain, backup) + # 2. etcd snapshot + backup /etc/kubernetes + # 3. Sequential CP upgrade (kubeadm upgrade apply/node) + # 4. Addon upgrade (Calico, MetalLB, Helm charts) + # 5. Rolling worker upgrade (drain→upgrade→uncordon, batch configurable) + # 6. BSS + cloud-init update per functional group post-upgrade + # 7.
Validation: all nodes Ready, pods Running + - name: K8s upgrade placeholder + ansible.builtin.debug: + msg: "K8s upgrade tasks to be implemented (validation gates, CP upgrade, worker rolling upgrade)" + + - name: Mark k8s upgrade as completed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' diff --git a/upgrade/playbooks/upgrade_local_repo.yml b/upgrade/playbooks/upgrade_local_repo.yml new file mode 100644 index 0000000000..4893194fa6 --- /dev/null +++ b/upgrade/playbooks/upgrade_local_repo.yml @@ -0,0 +1,62 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- +# Wraps the prep_local_repo role with manifest bookkeeping: ends the play when local_repo is already 'completed', otherwise records 'in-progress', runs the role, then records 'completed'. NOTE(review): the final write re-uses the manifest parsed before the role ran — confirm prep_local_repo does not itself modify upgrade_manifest.yml. +- name: Upgrade Local Repository (Pulp sync) + hosts: localhost + connection: local + gather_facts: false + vars: + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + component_name: local_repo + tasks: + - name: Read upgrade_manifest.yml + ansible.builtin.slurp: + src: "{{ manifest_path }}" + register: raw_manifest + + - name: Parse manifest + ansible.builtin.set_fact: + manifest: "{{ raw_manifest.content | b64decode | from_yaml }}" + + - name: Skip if local_repo already upgraded + ansible.builtin.meta: end_play + when: + - manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set local_repo upgrade status to in-progress + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + - name: Execute local repo staging and Pulp sync + ansible.builtin.include_role: + name: prep_local_repo + + - name: Mark local_repo upgrade as completed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' diff --git a/upgrade/playbooks/upgrade_oim.yml b/upgrade/playbooks/upgrade_oim.yml new file mode 100644 index 0000000000..7adad49ee3 --- /dev/null +++ b/upgrade/playbooks/upgrade_oim.yml @@ -0,0 +1,259 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# ============================================================================ +# upgrade_oim.yml — Internal playbook (imported by upgrade.yml --tags oim) +# ============================================================================ +# Upgrades OIM components: OpenCHAMI containers + BuildStream (if enabled). NOTE: in this revision every upgrade step is a debug placeholder (see the TODOs below); only the manifest bookkeeping, the approval prompt and the build_stream_config.yml lookup are implemented. +# Prerequisites: prepare_upgrade.yml must have been run first. +# Reads upgrade_manifest.yml and skips if oim already completed. +# +# Flow: +# 1. Pre-flight: read manifest, check idempotency +# 2. User approval prompt before proceeding +# 3. OpenCHAMI container upgrade (pg_dump, deployment-recipes, image pull, +# ordered restart, DB migration, validation) +# 4. BuildStream container upgrade (conditional on enable_build_stream) +# 5.
Mark OIM as completed in manifest +# ============================================================================ +# NOTE(review): input_project_dir is declared in vars below but no task in this play references it. +- name: Upgrade OIM (OpenCHAMI + conditional BuildStream) + hosts: localhost + connection: local + gather_facts: true + vars: + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + component_name: oim + input_project_dir: "/opt/omnia/input/project_default" + build_stream_config_path: "/opt/omnia/input/project_default/build_stream_config.yml" + tasks: + + # ── Pre-flight: manifest read + idempotency ───────────────────────── + - name: Read upgrade_manifest.yml + ansible.builtin.slurp: + src: "{{ manifest_path }}" + register: raw_manifest + + - name: Parse manifest + ansible.builtin.set_fact: + manifest: "{{ raw_manifest.content | b64decode | from_yaml }}" + + - name: Skip if OIM already upgraded + ansible.builtin.meta: end_play + when: + - manifest.component_status[component_name] | default('pending') == 'completed' + + # ── User approval before proceeding ───────────────────────────────── + - name: Display OIM upgrade summary and request approval + ansible.builtin.pause: + prompt: | + + ══════════════════════════════════════════════════════════════ + OIM UPGRADE — APPROVAL REQUIRED + ══════════════════════════════════════════════════════════════ + Source: {{ manifest.source_version | default('2.1.0.0') }} + Target: {{ manifest.target_version | default('2.2.0.0') }} + + This will upgrade the following OIM components: + 1. OpenCHAMI containers (pg_dump backup → deployment-recipes + update → image pull → ordered restart → DB migration) + 2. BuildStream container (if enabled in build_stream_config.yml) + + WARNING: This operation modifies running OIM services. + Ensure you have reviewed prepare_upgrade.yml output first. + + Press ENTER to proceed or Ctrl+C to abort.
+ ══════════════════════════════════════════════════════════════ + + - name: Set OIM upgrade status to in-progress + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + # ── Phase 1: OpenCHAMI Container Upgrade (ESpec §4.4) ────────────── + - name: "Phase 1 — OpenCHAMI Container Upgrade" + block: + # Step 1: Verify OpenCHAMI container + # TODO: Implement OpenCHAMI container verification checks + - name: Verify OpenCHAMI container + ansible.builtin.debug: + msg: "TODO: Verify OpenCHAMI container" + + # Step 2: PostgreSQL backup (pg_dump) + # TODO: Implement pg_dump backup of OpenCHAMI database + - name: Backup PostgreSQL database (pg_dump) + ansible.builtin.debug: + msg: "TODO: pg_dump backup of OpenCHAMI DB to {{ manifest.backup_dir | default('/opt/omnia/backups') }}" + + # Step 3: Update deployment-recipes to target version + # TODO: git checkout target tag in /opt/omnia/deployment-recipes + - name: Update deployment-recipes to target version + ansible.builtin.debug: + msg: "TODO: git checkout {{ manifest.target_version | default('2.2.0.0') }} in deployment-recipes" + + # Step 4: Pull new container images + # TODO: Pull OpenCHAMI container images for target version + - name: Pull OpenCHAMI container images + ansible.builtin.debug: + msg: "TODO: Pull OpenCHAMI container images for {{ manifest.target_version | default('2.2.0.0') }}" + + # Step 5: Ordered restart of OpenCHAMI services + # TODO: systemctl daemon-reload + restart in correct order + - name: Ordered restart of OpenCHAMI services + ansible.builtin.debug: + msg: "TODO: daemon-reload + ordered restart (postgres → smd → bss → cloud-init-server)" + + # Step 6: Database migration (if schema changes) + # TODO: Run any required DB migration scripts + - name: Run database migration + ansible.builtin.debug: + msg: "TODO: Run OpenCHAMI DB
migration scripts if schema changes exist" + + # Step 7: Post-upgrade validation + # TODO: Validate SMD, BSS, S3, node inventory + - name: Validate OpenCHAMI services post-upgrade + ansible.builtin.debug: + msg: "TODO: Validate SMD API, BSS API, S3 storage, node inventory intact" + + - name: Update version.yml with target version + ansible.builtin.debug: + msg: "TODO: Update /opt/omnia/.data/version.yml with OpenCHAMI {{ manifest.target_version | default('2.2.0.0') }}" + + rescue: + - name: OpenCHAMI upgrade failed — mark component as failed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'failed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + - name: Fail with OpenCHAMI upgrade error + ansible.builtin.fail: + msg: "OpenCHAMI upgrade failed. Check logs and consider running rollback." + + # ── Phase 2: BuildStream Container Upgrade (conditional; buildstream_enabled is set only when build_stream_config.yml exists, otherwise the default(false) guards below skip this phase) ──────────── + - name: Check if build_stream_config.yml exists + ansible.builtin.stat: + path: "{{ build_stream_config_path }}" + register: bsc_stat + + - name: Read build_stream_config.yml + ansible.builtin.slurp: + src: "{{ build_stream_config_path }}" + register: raw_bsc + when: bsc_stat.stat.exists + + - name: Parse build_stream_config + ansible.builtin.set_fact: + build_stream_config: "{{ raw_bsc.content | b64decode | from_yaml }}" + when: bsc_stat.stat.exists + + - name: Determine if BuildStream is enabled + ansible.builtin.set_fact: + buildstream_enabled: "{{ (build_stream_config.enable_build_stream | default(false)) | bool }}" + when: bsc_stat.stat.exists + + - name: "Phase 2 — BuildStream Container Upgrade" + when: buildstream_enabled | default(false) | bool + block: + - name: Display BuildStream upgrade notice + ansible.builtin.debug: + msg: "BuildStream is enabled (enable_build_stream: true) — proceeding with upgrade" + + # Step 1: Stop BuildStream container + # TODO: systemctl stop buildstream
+ - name: Stop BuildStream container + ansible.builtin.debug: + msg: "TODO: systemctl stop buildstream" + + # Step 2: Backup current BuildStream quadlet + # TODO: cp /etc/containers/systemd/buildstream.container to backup + - name: Backup BuildStream quadlet + ansible.builtin.debug: + msg: "TODO: Backup buildstream.container quadlet to {{ manifest.backup_dir | default('/opt/omnia/backups') }}" + + # Step 3: Update BuildStream container Image= tag + # TODO: Update Image= in buildstream.container to target version + - name: Update BuildStream container image tag + ansible.builtin.debug: + msg: "TODO: Update Image= tag in buildstream.container to {{ manifest.target_version | default('2.2.0.0') }}" + + # Step 4: Reload systemd and pull new image + # TODO: systemctl daemon-reload + podman pull + - name: Reload systemd and pull BuildStream image + ansible.builtin.debug: + msg: "TODO: systemctl daemon-reload + podman pull buildstream:{{ manifest.target_version | default('2.2.0.0') }}" + + # Step 5: Start BuildStream container + # TODO: systemctl start buildstream + - name: Start BuildStream container + ansible.builtin.debug: + msg: "TODO: systemctl start buildstream" + + # Step 6: Health check + # TODO: Validate BuildStream API is responding + - name: Validate BuildStream container healthy + ansible.builtin.debug: + msg: "TODO: curl BuildStream API healthcheck endpoint" + + rescue: + - name: BuildStream upgrade failed — mark component as failed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'failed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + - name: Fail with BuildStream upgrade error + ansible.builtin.fail: + msg: "BuildStream upgrade failed. Check logs and consider running rollback."
+ + - name: Skip BuildStream upgrade (not enabled) + ansible.builtin.debug: + msg: "BuildStream is disabled (enable_build_stream: false) — skipping BuildStream upgrade" + when: not (buildstream_enabled | default(false) | bool) + + # ── Finalize: mark OIM as completed (NOTE(review): the Phase 1/2 steps above are still debug placeholders, so 'completed' is recorded without any upgrade work being done) ───────────────────────────────── + - name: Mark OIM upgrade as completed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + - name: Display OIM upgrade completion + ansible.builtin.debug: + msg: + - "═══════════════════════════════════════" + - " OIM UPGRADE COMPLETED SUCCESSFULLY" + - "═══════════════════════════════════════" + - "OpenCHAMI: upgraded to {{ manifest.target_version | default('2.2.0.0') }}" + - "BuildStream: {{ 'upgraded' if (buildstream_enabled | default(false) | bool) else 'skipped (disabled)' }}" diff --git a/upgrade/playbooks/upgrade_slurm.yml b/upgrade/playbooks/upgrade_slurm.yml new file mode 100644 index 0000000000..211127977f --- /dev/null +++ b/upgrade/playbooks/upgrade_slurm.yml @@ -0,0 +1,69 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- + +- name: Upgrade Slurm cluster (feature updates) + hosts: localhost + connection: local + gather_facts: false + vars: + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + component_name: slurm + tasks: + - name: Read upgrade_manifest.yml + ansible.builtin.slurp: + src: "{{ manifest_path }}" + register: raw_manifest + + - name: Parse manifest + ansible.builtin.set_fact: + manifest: "{{ raw_manifest.content | b64decode | from_yaml }}" + + - name: Skip if slurm already upgraded + ansible.builtin.meta: end_play + when: + - manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set slurm upgrade status to in-progress + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + # TODO: Implement per ESpec §4.10 (NOTE(review): until these steps exist, this play only rewrites component_status in upgrade_manifest.yml and prints the placeholder message below, yet the final task still records slurm as 'completed'): + # 1. IB network config (pxe_mapping_file transform, IP preservation) + # 2. Centralized DNS configuration + # 3. VAST mount support + # 4. DCGM for GPU compute nodes + # 5. HPC tools for login/compiler nodes + # 6. Node add/remove post-upgrade validation + - name: Slurm upgrade placeholder + ansible.builtin.debug: + msg: "Slurm feature upgrade tasks to be implemented (IB config, DNS, VAST, DCGM, HPC tools)" + + - name: Mark slurm upgrade as completed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' diff --git a/upgrade/playbooks/upgrade_telemetry.yml b/upgrade/playbooks/upgrade_telemetry.yml new file mode 100644 index 0000000000..5e8385f909 --- /dev/null +++ b/upgrade/playbooks/upgrade_telemetry.yml @@ -0,0 +1,74 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Upgrade Telemetry components + hosts: localhost + connection: local + gather_facts: false + vars: + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + component_name: telemetry + tasks: + - name: Read upgrade_manifest.yml + ansible.builtin.slurp: + src: "{{ manifest_path }}" + register: raw_manifest + + - name: Parse manifest + ansible.builtin.set_fact: + manifest: "{{ raw_manifest.content | b64decode | from_yaml }}" + + - name: Skip if telemetry already upgraded + ansible.builtin.meta: end_play + when: + - manifest.component_status[component_name] | default('pending') == 'completed' + + - name: Set telemetry upgrade status to in-progress + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'in-progress' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' + + # TODO: Implement per ESpec §4.8 (NOTE(review): until these phases exist, this play only rewrites component_status in upgrade_manifest.yml and prints the placeholder message below, yet the final task still records telemetry as 'completed'): + # Phase 1: Existing component upgrade + # - Strimzi Kafka operator + broker upgrade + # - VictoriaMetrics StatefulSet upgrade + # - iDRAC telemetry receiver + pump image upgrade + # - LDMS sampler/aggregator config update + # - Validation gate (all Phase 1 pods Running + healthy) + # Phase 2: New component deploy (conditional on telemetry_config.yml) + # - PowerScale exporter (if enabled) + # - VAST exporter (if enabled) + # - VictoriaLogs (if enabled) + # - UFM exporter (if enabled) + - name: Telemetry
upgrade placeholder + ansible.builtin.debug: + msg: "Telemetry upgrade tasks to be implemented (Phase 1: existing, Phase 2: new components)" + + - name: Mark telemetry upgrade as completed + ansible.builtin.copy: + content: >- + {{ manifest | combine({ + 'component_status': manifest.component_status | combine({ + component_name: 'completed' + }) + }) | to_nice_yaml }} + dest: "{{ manifest_path }}" + mode: '0644' diff --git a/upgrade/prepare_upgrade.yml b/upgrade/prepare_upgrade.yml new file mode 100644 index 0000000000..19c0de45e0 --- /dev/null +++ b/upgrade/prepare_upgrade.yml @@ -0,0 +1,43 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# ============================================================================ +# prepare_upgrade.yml — User-Facing Playbook #1 +# ============================================================================ +# Run AFTER omnia.sh --upgrade completes. Transforms 2.1 inputs to 2.2 format, +# restores credentials from backup, loads upgrade manifest, validates current +# deployment, calculates hop chain, updates software_config, and displays +# upgrade summary for user review. +# +# Usage: +# ansible-playbook upgrade/prepare_upgrade.yml +# +# After this playbook completes, the user should: +# 1. Review /opt/omnia/input/project_default/ for new/changed fields +# 2. Review the upgrade summary (hop chain, component versions) +# 3.
Run: ansible-playbook upgrade/upgrade.yml [--tags ...] +# ============================================================================ +# Runs three roles in order on localhost: include_input_dir (referenced by relative path under ../utils/roles), import_input_parameters and manage_upgrade_inputs. NOTE(review): the play name also mentions preparing the local repo, but no role with that name is listed here — presumably one of these roles covers it; verify. +- name: Prepare upgrade — transform inputs, manage upgrade configuration, prepare local repo + hosts: localhost + connection: local + gather_facts: true + vars: + input_project_dir: "/opt/omnia/input/project_default" + oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" + roles: + - role: ../utils/roles/include_input_dir + - role: import_input_parameters + - role: manage_upgrade_inputs diff --git a/upgrade/rollback_omnia.yml b/upgrade/rollback_omnia.yml deleted file mode 100644 index c0d5080c22..0000000000 --- a/upgrade/rollback_omnia.yml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
---- -- name: Rollback Omnia guidance - hosts: localhost - connection: local - gather_facts: false - vars: - oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" - tasks: - - name: Read oim_metadata.yml for backup details - ansible.builtin.slurp: - src: "{{ oim_metadata_path }}" - register: oim_metadata_slurp - ignore_errors: true - - - name: Parse oim_metadata.yml - ansible.builtin.set_fact: - oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}" - when: oim_metadata_slurp is defined and oim_metadata_slurp.content is defined - - - name: Derive backup_version from upgrade_backup_dir - ansible.builtin.set_fact: - backup_version: "{{ (oim_metadata.upgrade_backup_dir | regex_search('version_([^/]+)', '\\1')) - | default('previous version', true) }}" - when: oim_metadata is defined and oim_metadata.upgrade_backup_dir is defined - - - name: Display rollback guidance (green) - ansible.builtin.debug: - msg: - - "=================================" - - " OMNIA ROLLBACK" - - "=================================" - - "" - - "[Rollback Actions]" - - "1. Purpose: restore Omnia core to the last backup version (includes configs and container state)." - - "2. Target version: {{ backup_version | default('previous version from the backup location') }}." - - "3. How to run:" - - " - Exit the Omnia core container shell if you are inside it." - - " - From the OIM host prompt, execute: ./omnia.sh --rollback" - - "4. Note: ensure the backup location is accessible on the OIM host before running rollback." - - name: End play - ansible.builtin.meta: end_play diff --git a/upgrade/upgrade.yml b/upgrade/upgrade.yml new file mode 100644 index 0000000000..7135519c63 --- /dev/null +++ b/upgrade/upgrade.yml @@ -0,0 +1,311 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# ============================================================================ +# upgrade.yml — User-Facing Playbook #2 +# ============================================================================ +# Tag-based upgrade orchestrator per ESpec §4.3.2. +# Supports --tags for selective execution of upgrade sub-flows. +# +# Usage: +# ansible-playbook upgrade/upgrade.yml # Full upgrade +# ansible-playbook upgrade/upgrade.yml --tags oim # OIM only +# ansible-playbook upgrade/upgrade.yml --tags k8s # K8s only +# ansible-playbook upgrade/upgrade.yml --tags "k8s,telemetry" +# +# IMPORTANT: Must be invoked from the parent directory containing upgrade/, +# rollback/, and playbooks/ folders. Internal playbooks are imported via +# relative paths (e.g., playbooks/upgrade_oim.yml). +# +# VERSION POLICY: +# oim_metadata.yml is read-only for upgrade/rollback flows. Manifest +# fields are derived directly from it: +# omnia_previous_version → source_version +# omnia_version → target_version +# upgrade_backup_dir → backup_dir +# +# LOCK POLICY: +# - upgrade_in_progress.lock may be pre-created by omnia.sh --upgrade. +# If present, the playbook proceeds (does NOT fail). +# - rollback_in_progress.lock signals a rollback is active. Upgrade MUST +# fail if this lock exists. +# - If no lock exists, the playbook creates upgrade_in_progress.lock. 
+# ============================================================================ + +# ────────────────────────────────────────────────────────────────────── +# Play 0: Pre-flight — acquire lock, init manifest, validate tags +# ────────────────────────────────────────────────────────────────────── +- name: Pre-flight — validate tag dependencies and component state + hosts: localhost + connection: local + gather_facts: false + tags: always + vars: + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + metadata_path: /opt/omnia/.data/oim_metadata.yml + upgrade_lock_path: /opt/omnia/.data/upgrade_in_progress.lock + rollback_lock_path: /opt/omnia/.data/rollback_in_progress.lock + all_components: [oim, local_repo, build_image, k8s, telemetry, cloud_init_bss, slurm] + tag_dependencies: + build_image: [oim] + k8s: [oim] + telemetry: [oim, k8s] + cloud_init_bss: [oim] + slurm: [oim, k8s] + tasks: + # ─── Lock check: fail ONLY on rollback lock; ignore upgrade lock ─── + - name: Check for active rollback lock + ansible.builtin.stat: + path: "{{ rollback_lock_path }}" + register: rollback_lock_stat + + - name: Read rollback lock contents (if any) + ansible.builtin.slurp: + src: "{{ rollback_lock_path }}" + register: rollback_lock_raw + when: rollback_lock_stat.stat.exists + failed_when: false + + - name: Abort if a rollback is in progress + ansible.builtin.fail: + msg: | + A rollback is currently in progress. Cannot start an upgrade. 
+ Lock file: {{ rollback_lock_path }} + Lock contents: + {{ (rollback_lock_raw.content | b64decode) if (rollback_lock_raw.content is defined) else '(unreadable)' }} + + Wait for the rollback to finish, or if no process is actually + running (e.g., previous run crashed), manually remove the lock: + rm {{ rollback_lock_path }} + when: rollback_lock_stat.stat.exists + + - name: Ensure .data directory exists + ansible.builtin.file: + path: /opt/omnia/.data + state: directory + mode: '0755' + + # ─── Create upgrade lock only if it doesn't already exist ─── + # (omnia.sh --upgrade may have already created it; that's expected.) + - name: Check for existing upgrade lock + ansible.builtin.stat: + path: "{{ upgrade_lock_path }}" + register: upgrade_lock_stat + + - name: Create upgrade lock file (only if absent) + ansible.builtin.copy: + content: | + operation: upgrade + pid: {{ ansible_pid | default('unknown') }} + started_at: "{{ lookup('pipe', 'date -u +%Y-%m-%dT%H:%M:%SZ') }}" + host: "{{ inventory_hostname }}" + created_by: ansible-playbook + dest: "{{ upgrade_lock_path }}" + mode: '0644' + when: not upgrade_lock_stat.stat.exists + # ───────────────────────────────────────────────────────────── + + - name: Check for upgrade_manifest.yml + ansible.builtin.stat: + path: "{{ manifest_path }}" + register: manifest_stat + + # ─── Read oim_metadata.yml and derive manifest fields directly ─── + - name: Read oim_metadata.yml + ansible.builtin.slurp: + src: "{{ metadata_path }}" + register: raw_metadata + when: not manifest_stat.stat.exists + + - name: Parse oim_metadata.yml + ansible.builtin.set_fact: + oim_metadata: "{{ raw_metadata.content | b64decode | from_yaml }}" + when: not manifest_stat.stat.exists + + - name: Initialize upgrade_manifest.yml if absent + ansible.builtin.copy: + content: | + upgrade_id: "upgrade-{{ 9999 | random }}" + source_version: "{{ oim_metadata.omnia_previous_version }}" + target_version: "{{ oim_metadata.omnia_version }}" + upgrade_status:
"in-progress" + backup_dir: "{{ oim_metadata.upgrade_backup_dir }}" + component_status: + oim: "pending" + local_repo: "pending" + build_image: "pending" + k8s: "pending" + telemetry: "pending" + cloud_init_bss: "pending" + slurm: "pending" + dest: "{{ manifest_path }}" + mode: '0644' + when: not manifest_stat.stat.exists + + - name: Read upgrade_manifest.yml + ansible.builtin.slurp: + src: "{{ manifest_path }}" + register: raw_manifest + + - name: Parse manifest + ansible.builtin.set_fact: + manifest: "{{ raw_manifest.content | b64decode | from_yaml }}" + + - name: Resolve requested tags (handle Ansible 'all' default) + ansible.builtin.set_fact: + requested_tags: >- + {{ all_components + if (ansible_run_tags is not defined or 'all' in ansible_run_tags) + else ansible_run_tags }} + + - name: Validate tag dependency order + ansible.builtin.fail: + msg: > + Tag '{{ item }}' requires {{ tag_dependencies[item] | join(', ') }} + to have been completed first. + loop: "{{ requested_tags }}" + when: + - item in tag_dependencies + - tag_dependencies[item] | difference(requested_tags) | length > 0 + - tag_dependencies[item] | reject('in', + (manifest.component_status | default({})) + | dict2items | selectattr('value', 'equalto', 'completed') + | map(attribute='key') | list + ) | list | length > 0 + + - name: Report already-upgraded components (will be skipped) + ansible.builtin.debug: + msg: "Component '{{ item }}' already completed — will be skipped." + loop: "{{ requested_tags }}" + when: + - manifest.component_status is defined + - item in manifest.component_status + - manifest.component_status[item] == 'completed' + +# Note: import_playbook does NOT honor play-level when:; the conditional +# must be inside the imported playbook itself. tags: always ensures it +# runs even with --tags <component>.
+- name: Include input project directory + ansible.builtin.import_playbook: ../utils/include_input_dir.yml + tags: always + vars: + openchami_vars_suppport: false + omnia_metadata_support: true + +# ────────────────────────────────────────────────────────────────────── +# Sub-flow imports (each sub-flow reads upgrade_manifest.yml and +# skips if its component_status is already 'completed') +# ────────────────────────────────────────────────────────────────────── +- name: Upgrade OIM tasks (includes OpenCHAMI) + ansible.builtin.import_playbook: playbooks/upgrade_oim.yml + tags: [oim] + +- name: Local repo staging + ansible.builtin.import_playbook: playbooks/upgrade_local_repo.yml + tags: [local_repo] + +- name: Build images + ansible.builtin.import_playbook: playbooks/upgrade_build_image.yml + tags: [build_image] + +- name: Upgrade Kubernetes cluster + ansible.builtin.import_playbook: playbooks/upgrade_k8s.yml + tags: [k8s] + +- name: Upgrade Telemetry components + ansible.builtin.import_playbook: playbooks/upgrade_telemetry.yml + tags: [telemetry] + +- name: Update Cloud-Init and BSS boot parameters + ansible.builtin.import_playbook: playbooks/upgrade_cloud_init_bss.yml + tags: [cloud_init_bss] + +- name: Upgrade Slurm cluster + ansible.builtin.import_playbook: playbooks/upgrade_slurm.yml + tags: [slurm] + +# ────────────────────────────────────────────────────────────────────── +# Post-upgrade: update upgrade_manifest.yml + clear upgrade lock +# (oim_metadata.yml is intentionally NOT modified) +# ────────────────────────────────────────────────────────────────────── +- name: Finalize upgrade state + hosts: localhost + connection: local + gather_facts: false + tags: always + vars: + manifest_path: /opt/omnia/.data/upgrade_manifest.yml + upgrade_lock_path: /opt/omnia/.data/upgrade_in_progress.lock + all_components: [oim, local_repo, build_image, k8s, telemetry, cloud_init_bss, slurm] + tasks: + - name: Read current upgrade_manifest.yml + ansible.builtin.slurp: + 
src: "{{ manifest_path }}" + register: raw_manifest + + - name: Parse manifest + ansible.builtin.set_fact: + manifest: "{{ raw_manifest.content | b64decode | from_yaml }}" + + - name: Clean up corrupted component_status values (reset all to 'completed') + ansible.builtin.set_fact: + cleaned_component_status: >- + {{ manifest.component_status + | dict2items + | map('combine', {'value': 'completed'}) + | items2dict }} + + - name: Determine upgrade_status based on component_status + ansible.builtin.set_fact: + new_status: >- + {{ 'completed' + if (cleaned_component_status.values() | unique | list == ['completed']) + else 'partial' }} + + - name: Write upgrade_manifest.yml with cleaned component_status and updated upgrade_status + ansible.builtin.copy: + content: "{{ manifest | combine(manifest_updates) | to_nice_yaml }}" + dest: "{{ manifest_path }}" + mode: '0644' + vars: + manifest_updates: + component_status: "{{ cleaned_component_status }}" + upgrade_status: "{{ new_status }}" + + # NOTE: oim_metadata.yml is intentionally NOT modified by upgrade or + # rollback flows. It is read-only and reflects the cluster's installed + # version baseline only. 
+ + - name: Remove upgrade guard lock (always release at end) + ansible.builtin.file: + path: "{{ upgrade_lock_path }}" + state: absent + + - name: Display upgrade summary + ansible.builtin.debug: + msg: + - "=========================================" + - " UPGRADE {{ new_status | upper }}" + - "=========================================" + - "Upgrade ID: {{ manifest.upgrade_id | default('N/A') }}" + - "Source: {{ manifest.source_version | default('N/A') }} → Target: {{ manifest.target_version | default('N/A') }}" + - "" + - "Component Status:" + - " oim: {{ cleaned_component_status.oim | default('pending') }}" + - " local_repo: {{ cleaned_component_status.local_repo | default('pending') }}" + - " build_image: {{ cleaned_component_status.build_image | default('pending') }}" + - " k8s: {{ cleaned_component_status.k8s | default('pending') }}" + - " telemetry: {{ cleaned_component_status.telemetry | default('pending') }}" + - " cloud_init_bss: {{ cleaned_component_status.cloud_init_bss | default('pending') }}" + - " slurm: {{ cleaned_component_status.slurm | default('pending') }}" diff --git a/upgrade/upgrade_cluster.yml b/upgrade/upgrade_cluster.yml deleted file mode 100644 index 86e475f6d2..0000000000 --- a/upgrade/upgrade_cluster.yml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Upgrade cluster tasks - hosts: localhost - connection: local - roles: - - role: upgrade_cluster diff --git a/upgrade/upgrade_oim.yml b/upgrade/upgrade_oim.yml deleted file mode 100644 index a2e29d7885..0000000000 --- a/upgrade/upgrade_oim.yml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Upgrade OIM tasks - hosts: localhost - connection: local - gather_facts: true - vars: - input_project_dir: "/opt/omnia/input/project_default" - roles: - - role: ../utils/roles/include_input_dir - - role: upgrade_oim diff --git a/upgrade/upgrade_omnia.yml b/upgrade/upgrade_omnia.yml deleted file mode 100644 index ade6b1f173..0000000000 --- a/upgrade/upgrade_omnia.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Upgrade OIM tasks - ansible.builtin.import_playbook: upgrade_oim.yml - -- name: Upgrade cluster tasks - ansible.builtin.import_playbook: upgrade_cluster.yml - -- name: Clear upgrade guard lock - hosts: localhost - connection: local - gather_facts: false - tasks: - - name: Remove upgrade guard lock - ansible.builtin.file: - path: /opt/omnia/.data/upgrade_in_progress.lock - state: absent From 40b728100781720f0dbf5352b2080ed2ba23c02c Mon Sep 17 00:00:00 2001 From: Mithilesh Reddy Date: Thu, 14 May 2026 12:12:34 +0530 Subject: [PATCH 13/17] Merge pull request #4411 from mithileshreddy04/pub/q2_upgrade Input parameter transforms for input files in prepare_upgrade.yml --- upgrade/prepare_upgrade.yml | 95 ++++++++++++++++++- .../import_input_parameters/tasks/main.yml | 3 + .../tasks/transform_local_repo_config.yml | 62 +++++++++++- .../tasks/transform_pxe_mapping_file.yml | 77 +++++++++++++++ .../tasks/transform_telemetry_config.yml | 26 +++++ .../templates/local_repo_config.j2 | 6 +- .../templates/pxe_mapping_file.csv.j2 | 4 + .../templates/telemetry_config.j2 | 10 +- .../import_input_parameters/vars/main.yml | 33 +++++-- 9 files changed, 300 insertions(+), 16 deletions(-) create mode 100644 upgrade/roles/import_input_parameters/tasks/transform_pxe_mapping_file.yml create mode 100644 upgrade/roles/import_input_parameters/templates/pxe_mapping_file.csv.j2 diff --git a/upgrade/prepare_upgrade.yml b/upgrade/prepare_upgrade.yml index 19c0de45e0..37cadecd34 100644 --- a/upgrade/prepare_upgrade.yml +++ b/upgrade/prepare_upgrade.yml @@ -40,4 +40,97 @@ roles: - role: ../utils/roles/include_input_dir - role: import_input_parameters - - role: manage_upgrade_inputs + +- name: Display post-upgrade preparation banner + hosts: localhost + connection: local + gather_facts: false + vars: + input_project_dir: "/opt/omnia/input/project_default" + tasks: + - name: Read provision_config.yml to get pxe_mapping_file_path + ansible.builtin.slurp: + src: "{{ input_project_dir 
}}/provision_config.yml" + register: provision_config_slurp + + - name: Extract pxe_mapping_filename from provision_config.yml + ansible.builtin.set_fact: + pxe_mapping_filename: >- + {{ (provision_config_slurp.content | b64decode | from_yaml).pxe_mapping_file_path + | default('pxe_mapping_file.csv') + | regex_replace('^.*/', '') }} + + - name: Display upgrade preparation completion banner + ansible.builtin.pause: + prompt: | + + ======================================================================== + OMNIA 2.1 to 2.2 INPUT PREPARATION COMPLETE + ======================================================================== + + All input files have been successfully migrated to: + {{ input_project_dir }} + + ------------------------------------------------------------------------ + SECTION 1: AUTOMATICALLY MIGRATED (no action needed) + ------------------------------------------------------------------------ + + 1. provision_config.yml - Copied as-is + 2. high_availability_config.yml - Copied as-is + 3. omnia_config.yml - Cluster settings preserved + 4. build_stream_config.yml - Preserved and validated + 5. gitlab_config.yml - Preserved and validated + 6. local_repo_config.yml - Preserved and validated + 7. security_config.yml - Restored from backup + 8. software_config.json - Restored from backup + 9. Credentials - Restored and re-encrypted + + ------------------------------------------------------------------------ + SECTION 2: REVIEW REQUIRED (new parameters added in 2.2) + ------------------------------------------------------------------------ + + File 1: network_spec.yml + 1. admin_network.subnet (auto-computed) + 2. admin_network.additional_subnets (multi-RAC PXE) + 3. ib_network.dns (InfiniBand DNS) + + File 2: storage_config.yml + 1. nfs_client_params is replaced by 'mounts' format + 2. mount_params profiles (nfs_default, vast_nfs, vast_rdma) + 3. functional_group_prefix, node_key, permissions, swap + 4. 
powervault_config (now a list with mount_point) + + File 3: telemetry_config.yml + 1. Restructured to: sources, bridges, sinks architecture + 2. telemetry_sources.dcgm (GPU metrics) + 3. telemetry_sources.powerscale (storage metrics) + 4. telemetry_bridges (vector_ldms, vector_ome) + 5. telemetry_sinks.victoria_logs (centralized logging) + 6. additional_metric_remote_write_endpoints + 7. additional_log_write_endpoints + Note: All new features are DISABLED by default. + + File 4: {{ pxe_mapping_filename }} + 1. IB_MAC column (InfiniBand MAC addresses) + 2. IB_IP column (InfiniBand IP addresses) + Note: IB fields are empty. Populate manually if using + InfiniBand network for compute nodes. + + ======================================================================== + NEXT STEPS + ======================================================================== + + 1. Review the files listed in Section 2 above in: + {{ input_project_dir }} + + 2. Enable any new features you want + (telemetry, storage profiles, IB network, etc.) + + 3. Verify the upgrade summary and hop chain shown above + + 4. Run the upgrade playbook: + ansible-playbook upgrade/upgrade.yml + + ======================================================================== + + Press Enter to continue, or Ctrl+C then 'A' to abort...
diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml index 219dea9f1f..adedea76e7 100644 --- a/upgrade/roles/import_input_parameters/tasks/main.yml +++ b/upgrade/roles/import_input_parameters/tasks/main.yml @@ -46,6 +46,9 @@ - name: Generate gitlab_config.yml for Omnia 2.2 ansible.builtin.include_tasks: transform_gitlab_config.yml +- name: Transform pxe_mapping_file.csv from Omnia 2.1 to 2.2 + ansible.builtin.include_tasks: transform_pxe_mapping_file.yml + - name: Restore input files from backup ansible.builtin.include_tasks: restore_input_files.yml diff --git a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml index 4b8ac8e3ec..d220caf0ad 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_local_repo_config.yml @@ -93,6 +93,66 @@ }}" local_repo_additional_repos_aarch64: "{{ backup_local_repo_config.additional_repos_aarch64 | default([]) }}" +- name: Strip x86_64_ prefix from user_repo_url_x86_64 names + ansible.builtin.set_fact: + local_repo_user_repo_url_x86_64: >- + {%- set result = [] -%} + {%- for repo in (local_repo_user_repo_url_x86_64 | default([])) -%} + {%- set clean_name = repo.name | default('') | regex_replace('^x86_64_', '') -%} + {%- set _ = result.append(repo | combine({'name': clean_name})) -%} + {%- endfor -%} + {{ result }} + +- name: Strip aarch64_ prefix from user_repo_url_aarch64 names + ansible.builtin.set_fact: + local_repo_user_repo_url_aarch64: >- + {%- set result = [] -%} + {%- for repo in (local_repo_user_repo_url_aarch64 | default([])) -%} + {%- set clean_name = repo.name | default('') | regex_replace('^aarch64_', '') -%} + {%- set _ = result.append(repo | combine({'name': clean_name})) -%} + {%- endfor -%} + {{ result }} + +- name: Strip x86_64_ prefix from 
rhel_os_url_x86_64 names + ansible.builtin.set_fact: + local_repo_rhel_os_url_x86_64: >- + {%- set result = [] -%} + {%- for repo in (local_repo_rhel_os_url_x86_64 | default([])) -%} + {%- set clean_name = repo.name | default('') | regex_replace('^x86_64_', '') -%} + {%- set _ = result.append(repo | combine({'name': clean_name})) -%} + {%- endfor -%} + {{ result }} + +- name: Strip aarch64_ prefix from rhel_os_url_aarch64 names + ansible.builtin.set_fact: + local_repo_rhel_os_url_aarch64: >- + {%- set result = [] -%} + {%- for repo in (local_repo_rhel_os_url_aarch64 | default([])) -%} + {%- set clean_name = repo.name | default('') | regex_replace('^aarch64_', '') -%} + {%- set _ = result.append(repo | combine({'name': clean_name})) -%} + {%- endfor -%} + {{ result }} + +- name: Strip x86_64_ prefix from additional_repos_x86_64 names + ansible.builtin.set_fact: + local_repo_additional_repos_x86_64: >- + {%- set result = [] -%} + {%- for repo in (local_repo_additional_repos_x86_64 | default([])) -%} + {%- set clean_name = repo.name | default('') | regex_replace('^x86_64_', '') -%} + {%- set _ = result.append(repo | combine({'name': clean_name})) -%} + {%- endfor -%} + {{ result }} + +- name: Strip aarch64_ prefix from additional_repos_aarch64 names + ansible.builtin.set_fact: + local_repo_additional_repos_aarch64: >- + {%- set result = [] -%} + {%- for repo in (local_repo_additional_repos_aarch64 | default([])) -%} + {%- set clean_name = repo.name | default('') | regex_replace('^aarch64_', '') -%} + {%- set _ = result.append(repo | combine({'name': clean_name})) -%} + {%- endfor -%} + {{ result }} + - name: Fail if omnia_repo_url_rhel_x86_64 is missing ansible.builtin.fail: msg: "{{ msg_omnia_repo_url_rhel_x86_64_missing }}" @@ -103,7 +163,7 @@ msg: "{{ msg_omnia_repo_url_rhel_aarch64_missing }}" when: (local_repo_omnia_repo_url_rhel_aarch64 | default([]) | length) == 0 -- name: Write local_repo_config.yml in Omnia 2.1 format +- name: Write local_repo_config.yml in 
Omnia 2.2 format ansible.builtin.template: src: local_repo_config.j2 dest: "{{ input_project_dir }}/local_repo_config.yml" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_pxe_mapping_file.yml b/upgrade/roles/import_input_parameters/tasks/transform_pxe_mapping_file.yml new file mode 100644 index 0000000000..4afcd33850 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/transform_pxe_mapping_file.yml @@ -0,0 +1,77 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Extract filename from pxe_mapping_file_path + ansible.builtin.set_fact: + pxe_mapping_filename: >- + {{ provision_pxe_mapping_file_path | regex_replace('^.*/', '') }} + +- name: Display pxe_mapping_file transformation message + ansible.builtin.debug: + msg: "{{ msg_using_backup_pxe_mapping_file }}" + +- name: Check if pxe_mapping_file exists in backup + ansible.builtin.stat: + path: "{{ backup_location }}/{{ pxe_mapping_filename }}" + register: backup_pxe_mapping_file_stat + +- name: Fail if pxe_mapping_file not found in backup + ansible.builtin.fail: + msg: "{{ msg_backup_pxe_mapping_file_missing }}" + when: not backup_pxe_mapping_file_stat.stat.exists + +- name: Read pxe_mapping_file from backup using read_csv module + community.general.read_csv: + path: "{{ backup_location }}/{{ pxe_mapping_filename }}" + register: backup_pxe_mapping_file + +- name: Add IB columns to each row (initialize as empty) + ansible.builtin.set_fact: + pxe_mapping_rows: >- + {{ backup_pxe_mapping_file.list | + map('combine', {'IB_MAC': '', 'IB_IP': ''}) | + list }} + +- name: Fail if no valid rows found in pxe_mapping_file + ansible.builtin.fail: + msg: "{{ msg_pxe_mapping_file_empty }}" + when: (pxe_mapping_rows | default([]) | length) == 0 + +- name: Write pxe_mapping_file in Omnia 2.2 format with IB fields + ansible.builtin.template: + src: pxe_mapping_file.csv.j2 + dest: "{{ input_project_dir }}/{{ pxe_mapping_filename }}" + mode: "{{ default_file_mode }}" + vars: + pxe_mapping_rows: "{{ pxe_mapping_rows }}" + +- name: Validate CSV syntax of transformed pxe_mapping_file + ansible.builtin.shell: + cmd: | + set -o pipefail + head -n 1 "{{ input_project_dir }}/{{ pxe_mapping_filename }}" | \ + grep -q "FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP" + changed_when: false + failed_when: false + register: pxe_mapping_file_validation + +- name: Fail if pxe_mapping_file validation failed + 
ansible.builtin.fail: + msg: "{{ msg_pxe_mapping_file_validation_failed }}" + when: pxe_mapping_file_validation.rc != 0 + +- name: Display pxe_mapping_file transformation summary + ansible.builtin.debug: + msg: "{{ msg_pxe_mapping_file_transform_summary }}" diff --git a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml index 14b5458ddd..db57eb2b9e 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_telemetry_config.yml @@ -184,6 +184,12 @@ | default(telemetry_default_ldms_sampler_configurations)) }} +- name: Normalize LDMS source metrics_enabled from backup + ansible.builtin.set_fact: + telemetry_ldms_metrics_enabled: >- + {{ (backup_telemetry_sources.ldms | default({})).metrics_enabled + | default(telemetry_default_ldms_metrics_enabled) }} + - name: Normalize DCGM and PowerScale source values from backup ansible.builtin.set_fact: telemetry_dcgm_support: >- @@ -199,6 +205,21 @@ | default(backup_telemetry_powerscale_config.powerscale_log_enabled | default(telemetry_default_powerscale_log_enabled)) }} +- name: Normalize bridge values from backup (2.2 format or defaults) + ansible.builtin.set_fact: + telemetry_vector_ldms_metrics_enabled: >- + {{ ((backup_telemetry_config.telemetry_bridges | default({})).vector_ldms | default({})).metrics_enabled + | default(telemetry_default_vector_ldms_metrics_enabled) }} + telemetry_vector_ome_metrics_enabled: >- + {{ ((backup_telemetry_config.telemetry_bridges | default({})).vector_ome | default({})).metrics_enabled + | default(telemetry_default_vector_ome_metrics_enabled) }} + telemetry_vector_ome_logs_enabled: >- + {{ ((backup_telemetry_config.telemetry_bridges | default({})).vector_ome | default({})).logs_enabled + | default(telemetry_default_vector_ome_logs_enabled) }} + telemetry_vector_ome_identifier: >- + {{ 
((backup_telemetry_config.telemetry_bridges | default({})).vector_ome | default({})).ome_identifier + | default('ome') }} + - name: Normalize PowerScale configuration values from backup ansible.builtin.set_fact: telemetry_otel_collector_storage_size: >- @@ -216,9 +237,14 @@ vars: telemetry_idrac_telemetry_support: "{{ telemetry_idrac_telemetry_support }}" telemetry_idrac_collection_targets: "{{ telemetry_idrac_collection_targets }}" + telemetry_ldms_metrics_enabled: "{{ telemetry_ldms_metrics_enabled }}" telemetry_dcgm_support: "{{ telemetry_dcgm_support }}" telemetry_powerscale_metrics_enabled: "{{ telemetry_powerscale_metrics_enabled }}" telemetry_powerscale_logs_enabled: "{{ telemetry_powerscale_logs_enabled }}" + telemetry_vector_ldms_metrics_enabled: "{{ telemetry_vector_ldms_metrics_enabled }}" + telemetry_vector_ome_metrics_enabled: "{{ telemetry_vector_ome_metrics_enabled }}" + telemetry_vector_ome_logs_enabled: "{{ telemetry_vector_ome_logs_enabled }}" + telemetry_vector_ome_identifier: "{{ telemetry_vector_ome_identifier }}" telemetry_victoria_persistence_size: "{{ telemetry_victoria_persistence_size }}" telemetry_victoria_retention_period: "{{ telemetry_victoria_retention_period }}" telemetry_additional_metric_remote_write_endpoints: "{{ telemetry_additional_metric_remote_write_endpoints }}" diff --git a/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 b/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 index 1e371e6e7a..1f4025b432 100644 --- a/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/local_repo_config.j2 @@ -213,13 +213,15 @@ rhel_subscription_repo_config_aarch64: omnia_repo_url_rhel_x86_64: - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: 
"https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"} - - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "cri-o"} + - { url: "https://pkgs.k8s.io/core:/stable:/v1.35/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.35/rpm/repodata/repomd.xml.key", name: "kubernetes-v1-35"} + - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.35/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.35/rpm/repodata/repomd.xml.key", name: "cri-o-v1-35"} - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/repodata/repomd.xml.key", name: "doca"} + - { url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/", gpgkey: "https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/repodata/repomd.xml.key", name: "cuda"} omnia_repo_url_rhel_aarch64: - { url: "https://download.docker.com/linux/centos/10/aarch64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/aarch64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/repodata/repomd.xml.key", name: "doca"} + - { url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel10/sbsa/", gpgkey: "https://developer.download.nvidia.com/compute/cuda/repos/rhel10/sbsa/repodata/repomd.xml.key", name: "cuda"} # Example: # additional_repos_x86_64: # - { 
url: "https://rpm.grafana.com/", gpgkey: "", name: "grafana" } diff --git a/upgrade/roles/import_input_parameters/templates/pxe_mapping_file.csv.j2 b/upgrade/roles/import_input_parameters/templates/pxe_mapping_file.csv.j2 new file mode 100644 index 0000000000..7d55e467cb --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/pxe_mapping_file.csv.j2 @@ -0,0 +1,4 @@ +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +{% for row in pxe_mapping_rows -%} +{{ row.FUNCTIONAL_GROUP_NAME }},{{ row.GROUP_NAME }},{{ row.SERVICE_TAG }},{{ row.PARENT_SERVICE_TAG }},{{ row.HOSTNAME }},{{ row.ADMIN_MAC }},{{ row.ADMIN_IP }},{{ row.BMC_MAC }},{{ row.BMC_IP }},{{ row.IB_MAC }},{{ row.IB_IP }} +{% endfor -%} diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 index 0f215e2145..7a5d5a6632 100644 --- a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 @@ -110,7 +110,7 @@ telemetry_sources: ldms: # Enable or disable LDMS metrics collection # Default: true - metrics_enabled: true + metrics_enabled: {{ telemetry_ldms_metrics_enabled | default(telemetry_default_ldms_metrics_enabled) | bool | ternary('true', 'false') }} # LDMS only supports Kafka collection (no direct victoria_metrics path) # Vector-LDMS bridge consumes from Kafka and routes to victoria_metrics @@ -171,7 +171,7 @@ telemetry_bridges: # Enable or disable Vector-LDMS bridge # Requires: telemetry_sources.ldms.enabled = true # Default: true - metrics_enabled: true + metrics_enabled: {{ telemetry_vector_ldms_metrics_enabled | default(telemetry_default_vector_ldms_metrics_enabled) | bool | ternary('true', 'false') }} # -------------------------------------------------------------------------- # Vector-OME — Kafka-to-Victoria bridge for OME metrics and logs @@ 
-182,16 +182,16 @@ telemetry_bridges: # Enable or disable Vector-OME metrics routing # Requires: OME to be configured with kafka # Default: true - metrics_enabled: true + metrics_enabled: {{ telemetry_vector_ome_metrics_enabled | default(telemetry_default_vector_ome_metrics_enabled) | bool | ternary('true', 'false') }} # Enable or disable Vector-OME logs routing # Default: true - logs_enabled: true + logs_enabled: {{ telemetry_vector_ome_logs_enabled | default(telemetry_default_vector_ome_logs_enabled) | bool | ternary('true', 'false') }} # Identifier used by Vector-OME for topic identification and routing. # Default: "ome" — internally used to match topics with the prefix (e.g., "^ome\\..*$") # Change only if your OME Kafka topics use a different prefix. - ome_identifier: "ome" + ome_identifier: "{{ telemetry_vector_ome_identifier | default('ome') }}" # ============================================================================ # TELEMETRY SINKS (Storage Backends) diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 1349359eb2..f234a5f8e0 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -159,6 +159,14 @@ msg_provision_config_missing: "provision_config.yml missing" msg_using_backup_provision_config: "Using backup provision_config.yml (backup not modified)" msg_pxe_mapping_file_path_missing: "pxe_mapping_file_path is mandatory" +# PXE mapping file transformation messages +msg_backup_pxe_mapping_file_missing: "{{ pxe_mapping_filename }} not found in backup at {{ backup_location }}/{{ pxe_mapping_filename }}" +msg_using_backup_pxe_mapping_file: "Transforming {{ pxe_mapping_filename }} from backup at {{ backup_location }}/{{ pxe_mapping_filename }}" +msg_pxe_mapping_file_empty: "{{ pxe_mapping_filename }} contains no valid rows (must have at least 9 columns: FUNCTIONAL_GROUP_NAME through BMC_IP)" 
+msg_pxe_mapping_file_validation_failed: > + {{ pxe_mapping_filename }} validation failed - header must contain: + FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP + # Storage config transformation messages msg_backup_storage_config_missing: "storage_config.yml not found in backup at {{ backup_location }}/storage_config.yml" msg_storage_config_missing: "storage_config.yml not found at {{ input_project_dir }}/storage_config.yml" @@ -220,6 +228,16 @@ msg_restore_summary: | Backup: {{ backup_location }}/{{ restore_item.name }} Target: {{ input_project_dir }}/{{ restore_item.name }} +# Restore summary message for pxe_mapping_file transformation +msg_pxe_mapping_file_transform_summary: | + {{ pxe_mapping_filename }} upgraded to 2.2 format. + Backup preserved at: {{ backup_location }}/{{ pxe_mapping_filename }} + Changes: + - Added IB_MAC column for InfiniBand MAC addresses (initialized as empty) + - Added IB_IP column for InfiniBand IP addresses (initialized as empty) + - Preserved all existing columns (FUNCTIONAL_GROUP_NAME through BMC_IP) + - NOTE: If using InfiniBand network, manually populate IB_MAC and IB_IP values + # Restore summary message for network spec transformation msg_network_spec_transform_summary: | network_spec.yml upgraded to 2.2 format. 
@@ -344,11 +362,15 @@ telemetry_default_kafka_log_segment_bytes: 1073741824 telemetry_default_ldms_agg_port: 6001 telemetry_default_ldms_store_port: 6001 telemetry_default_ldms_sampler_port: 10001 -telemetry_default_dcgm_support: true +telemetry_default_dcgm_support: false +telemetry_default_ldms_metrics_enabled: false +telemetry_default_vector_ldms_metrics_enabled: false +telemetry_default_vector_ome_metrics_enabled: false +telemetry_default_vector_ome_logs_enabled: false telemetry_default_victoria_logs_storage_size: "8Gi" telemetry_default_victoria_logs_retention_period: 168 -telemetry_default_powerscale_support: true -telemetry_default_powerscale_log_enabled: true +telemetry_default_powerscale_support: false +telemetry_default_powerscale_log_enabled: false telemetry_default_otel_collector_storage_size: "5Gi" telemetry_default_csm_observability_values_file_path: "" @@ -406,7 +428,7 @@ mode_sensitive_file: '0600' # - Files where you want to preserve the backup values exactly # # DO NOT add files that require transformation (network_spec.yml, high_availability_config.yml, local_repo_config.yml, -# provision_config.yml, software_config.json, telemetry_config.yml, user_registry_credential.yml) +# provision_config.yml, storage_config.yml, omnia_config.yml, telemetry_config.yml, user_registry_credential.yml) # DO NOT add files that are newly generated (build_stream_config.yml, gitlab_config.yml, discovery_config.yml) restore_input_files: - name: software_config.json @@ -415,6 +437,3 @@ restore_input_files: - name: security_config.yml mode: '0644' validate_cmd: "python3 -c \"import yaml; yaml.safe_load(open('{{ input_project_dir }}/security_config.yml','r'))\"" - - name: pxe_mapping_file.csv - mode: '0644' - validate_cmd: "" From b0c03c5599d9f53ca33af07c811fda9f74fb093f Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Thu, 14 May 2026 12:58:24 +0530 Subject: [PATCH 14/17] Add Code for fresh k8s installation 1.35.1 Signed-off-by: Katakam-Rakesh --- 
.../input_validation/common_utils/config.py | 2 +- .../validation_flows/common_validation.py | 12 +- .../validation_flows/local_repo_validation.py | 21 +++- .../module_utils/local_repo/software_utils.py | 14 ++- .../modules/image_package_collector.py | 26 +++- common/library/modules/prepare_tasklist.py | 9 +- examples/rhel_software_config.json | 2 +- ..._rhel_10.0_multi_arch_software_config.json | 2 +- ...late_rhel_10.0_x86-64_software_config.json | 2 +- .../x86_64/rhel/10.0/service_k8s_v1.34.1.json | 108 ++++++++++++++++ .../x86_64/rhel/10.0/service_k8s_v1.35.1.json | 118 ++++++++++++++++++ input/local_repo_config.yml | 15 ++- input/software_config.json | 2 +- .../tasks/validate_software_config_json.yml | 2 +- local_repo/roles/validation/vars/main.yml | 13 +- .../tasks/check_k8s_support.yml | 4 +- .../prepare_oim_validation/vars/main.yml | 13 +- .../tasks/create_k8s_config_nfs.yml | 2 +- provision/roles/k8s_config/vars/main.yml | 3 +- .../telemetry/tasks/load_service_images.yml | 2 +- provision/roles/telemetry/tasks/main.yml | 2 +- .../telemetry/tasks/read_software_config.yml | 4 +- .../common/telemetry_pod_cleanup.yaml.j2 | 2 +- provision/roles/telemetry/vars/main.yml | 9 +- 24 files changed, 344 insertions(+), 45 deletions(-) create mode 100644 input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json create mode 100644 input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 095849d88f..d1bb4b5b61 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -132,7 +132,7 @@ "openmpi": "5.0.8", "csi_driver_powerscale": "v2.15.0", "rocm": "6.3.1", - "service_k8s": "1.34.1" + "service_k8s": "1.35.1" } # All of the passwords fields diff --git 
a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 6ff8df745c..363be36fd7 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -252,15 +252,23 @@ def validate_software_config( for software_pkg in data['softwares']: software = software_pkg['name'] arch_list = software_pkg.get('arch') + # Get software version for versioned JSON files (e.g., service_k8s_v1.35.1.json) + software_version = software_pkg.get('version') for arch in arch_list: json_path = get_json_file_path( - software, cluster_os_type, cluster_os_version, input_file_path, arch) + software, cluster_os_type, cluster_os_version, input_file_path, arch, + software_version=software_version) # Check if json_path is None or if the JSON syntax is invalid if not json_path: + # Construct expected filename for error message + if software == "service_k8s" and software_version: + expected_file = f"{software}_v{software_version}.json" + else: + expected_file = f"{software}.json" errors.append( create_error_msg( "Validation Error: ", software, - f"is present in software_config.json. JSON file not found: {software}.json" + f"is present in software_config.json. 
JSON file not found: {expected_file}" ) ) else: diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index 8254faca1e..e81ecd8be7 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -21,7 +21,7 @@ from ansible.module_utils.input_validation.common_utils import validation_utils from ansible.module_utils.input_validation.common_utils import config from ansible.module_utils.input_validation.common_utils import en_us_validation_msg -from ansible.module_utils.local_repo.software_utils import load_yaml, load_json +from ansible.module_utils.local_repo.software_utils import load_yaml, load_json, get_json_file_path file_names = config.files create_error_msg = validation_utils.create_error_msg @@ -239,13 +239,22 @@ def validate_local_repo_config(input_file_path, data, for software in software_config_json["softwares"]: sw = software["name"] arch_list = software.get("arch") + # Get software version for versioned JSON files (e.g., service_k8s_v1.35.1.json) + software_version = software.get("version") for arch in arch_list: - json_path = create_file_path( - input_file_path, - f"config/{arch}{os_ver_path}" + sw +".json") - if not os.path.exists(json_path): + # Use get_json_file_path for proper versioned JSON file resolution + json_path = get_json_file_path( + sw, cluster_os_type, cluster_os_version, + software_config_file_path, arch, + software_version=software_version) + if not json_path or not os.path.exists(json_path): + # Construct expected filename for error message + if sw == "service_k8s" and software_version: + expected_file = f"{sw}_v{software_version}.json" + else: + expected_file = f"{sw}.json" errors.append( - create_error_msg(sw + '/' + arch, f"{sw} JSON file not found for architecture 
{arch}.", json_path)) + create_error_msg(sw + '/' + arch, f"{sw} JSON file not found for architecture {arch}.", expected_file)) else: curr_json = load_json(json_path) pkg_list = curr_json[sw]['cluster'] diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index bc5da2876a..d3306d58b8 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -118,7 +118,7 @@ def load_yaml(file_path): return yaml.safe_load(file) def get_json_file_path(software_name, cluster_os_type, - cluster_os_version, user_json_path, arch): + cluster_os_version, user_json_path, arch, software_version=None): """ Generate the file path for a JSON file based on the provided software name, cluster OS type, cluster OS version, and user JSON path. @@ -129,13 +129,23 @@ def get_json_file_path(software_name, cluster_os_type, cluster_os_version (str): The version of the cluster operating system. user_json_path (str): The path to the user JSON file. arch: Architecture for a particular software + software_version (str, optional): Version of the software for versioned JSON files. + Used for software like service_k8s that have versioned JSON files + (e.g., service_k8s_v1.35.1.json). Returns: str or None: The file path for the JSON file if it exists, otherwise None. 
""" base_path = os.path.dirname(os.path.abspath(user_json_path)) + + # Handle versioned JSON files (e.g., service_k8s_v1.35.1.json) + if software_name == "service_k8s" and software_version: + json_filename = f"{software_name}_v{software_version}.json" + else: + json_filename = f"{software_name}.json" + json_path = os.path.join(base_path, - f'{SOFTWARE_CONFIG_SUBDIR}/{arch}/{cluster_os_type}/{cluster_os_version}/{software_name}.json' + f'{SOFTWARE_CONFIG_SUBDIR}/{arch}/{cluster_os_type}/{cluster_os_version}/{json_filename}' ) return json_path diff --git a/common/library/modules/image_package_collector.py b/common/library/modules/image_package_collector.py index 77ff67b50d..90a315d750 100644 --- a/common/library/modules/image_package_collector.py +++ b/common/library/modules/image_package_collector.py @@ -151,7 +151,12 @@ def process_functional_group(fg_name, arch, os_version, input_project_dir, packages = [] for json_file in json_files: + # Extract software name from json file + # Handle versioned files like service_k8s_v1.35.1.json -> service_k8s sw_name = json_file.replace(".json", "") + # Remove version suffix for versioned files (e.g., service_k8s_v1.35.1 -> service_k8s) + if sw_name.startswith("service_k8s_v"): + sw_name = "service_k8s" if sw_name not in allowed_softwares: continue @@ -170,7 +175,8 @@ def process_functional_group(fg_name, arch, os_version, input_project_dir, sw_data, fg_name=fg_name, slurm_defined=True ) ) - elif json_file == "service_k8s.json": + elif json_file.startswith("service_k8s_v"): + # Handle versioned service_k8s_v.json files packages.extend( collect_packages_from_json( sw_data, fg_name=fg_name, service_k8s_defined=True @@ -194,6 +200,7 @@ def run_module(): software_config_file=dict(type="str", required=True), input_project_dir=dict(type="str", required=True), additional_json_path=dict(type="str", required=False, default=""), + service_k8s_version=dict(type="str", required=False, default=""), ) result = dict( @@ -212,6 +219,7 @@ def 
run_module(): software_config_file = module.params["software_config_file"] input_project_dir = module.params["input_project_dir"] additional_json_path = module.params["additional_json_path"] + service_k8s_version = module.params["service_k8s_version"] software_config = load_json_file(software_config_file, module) if not software_config: @@ -221,6 +229,13 @@ def run_module(): if not os_version: module.fail_json(msg="cluster_os_version not found in software_config.json") + # Extract service_k8s version from software_config if not provided + if not service_k8s_version: + for sw in software_config.get("softwares", []): + if sw.get("name") == "service_k8s" and sw.get("version"): + service_k8s_version = sw["version"] + break + allowed_softwares = { sw["name"] for sw in software_config.get("softwares", []) } @@ -229,14 +244,17 @@ def run_module(): additional_enabled = is_additional_packages_enabled(software_config) allowed_additional_subgroups = get_allowed_additional_subgroups(software_config) if additional_enabled else [] + # Versioned JSON file for service_k8s: service_k8s_v<version>.json + service_k8s_json = f"service_k8s_v{service_k8s_version}.json" if service_k8s_version else "service_k8s.json" + # pylint: disable=line-too-long # Functional group → json files mapping software_map = { "os_x86_64": ["default_packages.json", "ldms.json"], "os_aarch64": ["default_packages.json", "ldms.json"], - "service_kube_node_x86_64": ["service_k8s.json"], - "service_kube_control_plane_first_x86_64": ["service_k8s.json"], - "service_kube_control_plane_x86_64": ["service_k8s.json"], + "service_kube_node_x86_64": [service_k8s_json], + "service_kube_control_plane_first_x86_64": [service_k8s_json], + "service_kube_control_plane_x86_64": [service_k8s_json], "slurm_control_node_x86_64": ["slurm_custom.json", "openldap.json", "ldms.json"], "slurm_node_x86_64": ["slurm_custom.json", "openldap.json", "ldms.json"], "login_node_x86_64": ["slurm_custom.json", "openldap.json", "ldms.json"], diff --git
a/common/library/modules/prepare_tasklist.py b/common/library/modules/prepare_tasklist.py index 9714c7aaf0..688774cdd7 100644 --- a/common/library/modules/prepare_tasklist.py +++ b/common/library/modules/prepare_tasklist.py @@ -123,8 +123,15 @@ def main(): logger.info("Preparing package lists...") for software in software_list[arch]: logger.info(f"Processing software: {software}") + # Get software version for versioned JSON files (e.g., service_k8s_v1.35.1.json) + software_version = None + for sw in user_data.get("softwares", []): + if sw.get("name") == software and sw.get("version"): + software_version = sw["version"] + break json_path[arch] = get_json_file_path(software, cluster_os_type, - cluster_os_version, user_json_file, arch) + cluster_os_version, user_json_file, arch, + software_version=software_version) status_csv_path[arch] = get_csv_file_path(software, log_dir, arch) logger.info(f"json_path: {json_path}") logger.info(f"status_csv_path: {status_csv_path}") diff --git a/examples/rhel_software_config.json b/examples/rhel_software_config.json index 394ef53120..b9f60b3f3d 100644 --- a/examples/rhel_software_config.json +++ b/examples/rhel_software_config.json @@ -6,7 +6,7 @@ {"name": "default_packages", "arch": ["x86_64","aarch64"]}, {"name": "admin_debug_packages", "arch": ["x86_64","aarch64"]}, {"name": "openldap", "arch": ["x86_64","aarch64"]}, - {"name": "service_k8s","version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s","version": "1.35.1", "arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, {"name": "ucx", "version": "1.19.0", "arch": ["x86_64","aarch64"]}, {"name": "openmpi", "version": "5.0.8", "arch": ["x86_64","aarch64"]}, diff --git a/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json b/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json index 83eaa12a8c..69bc80c84f 100644 --- 
a/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json +++ b/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json @@ -7,7 +7,7 @@ {"name": "admin_debug_packages", "arch": ["x86_64","aarch64"]}, {"name": "openldap", "arch": ["x86_64","aarch64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, - {"name": "service_k8s", "version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s", "version": "1.35.1", "arch": ["x86_64"]}, {"name": "ucx", "version": "1.19.0", "arch": ["x86_64","aarch64"]}, {"name": "openmpi", "version": "5.0.8", "arch": ["x86_64","aarch64"]}, {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, diff --git a/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json b/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json index 907958e590..650e912b78 100644 --- a/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json +++ b/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json @@ -7,7 +7,7 @@ {"name": "admin_debug_packages", "arch": ["x86_64"]}, {"name": "openldap", "arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64"]}, - {"name": "service_k8s", "version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s", "version": "1.35.1", "arch": ["x86_64"]}, {"name": "ucx", "version": "1.19.0", "arch": ["x86_64"]}, {"name": "openmpi", "version": "5.0.8", "arch": ["x86_64"]}, {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json new file mode 100644 index 0000000000..6deed2309b --- /dev/null +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json @@ -0,0 +1,108 @@ +{ + "service_k8s": { + "cluster": [ + { "package": "docker.io/library/busybox", "type": "image", "tag": "1.36" }, + { "package": 
"firewalld", "type": "rpm", "repo_name": "baseos" }, + { "package": "python3-firewall", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "vim-enhanced", "type": "rpm", "repo_name": "appstream"}, + { "package": "fuse-overlayfs", "type": "rpm", "repo_name": "appstream"}, + { "package": "podman", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubeadm-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "kubelet-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "container-selinux", "type": "rpm", "repo_name": "appstream"}, + { "package": "cri-o-1.34.1", "type": "rpm", "repo_name": "cri-o"}, + { "package": "docker.io/victoriametrics/victoria-metrics", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmagent", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vminsert", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vmselect", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, + { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, + { "package": "docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, + { "package": "docker.io/library/mysql", "type": "image", "tag": "9.3.0" }, + { "package": "docker.io/dellhpcomniaaisolution/idrac_telemetry_receiver", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/kafkapump", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.2" }, + { "package": "cryptography==45.0.7", "type": "pip_module" }, + { "package": "omsdk==1.2.518", "type": "pip_module" }, + { "package": "cffi==1.17.1", "type": "pip_module" }, + { "package": 
"quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, + { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, + { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, + { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + ] + }, + "service_kube_control_plane": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.12.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.4-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.30.3", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, + { "package": "kubectl-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", 
"repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==33.1.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + "service_kube_control_plane_first": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.12.1", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.4-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.30.3", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, + { + "package": "calico-v3.30.3", + "type": "manifest", + "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.30.3/manifests/calico.yaml" + }, + { + "package": "metallb-native-v0.15.2", + "type": "manifest", + "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.2/config/manifests/metallb-native.yaml" + }, + { "package": "helm-v3.19.0-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.19.0-linux-amd64.tar.gz" }, + { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": 
"https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, + { "package": "kubectl-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==33.1.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + + "service_kube_node": { + "cluster": [ + { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, + { "package": "quay.io/metallb/controller", "tag": "v0.15.2", "type": "image" } + ] + } +} + diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json new file mode 100644 index 0000000000..7195ce8168 --- /dev/null +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json @@ -0,0 +1,118 @@ +{ + "service_k8s": { + "cluster": [ + { "package": "docker.io/library/busybox", "type": "image", "tag": "1.36" }, + { "package": "firewalld", "type": "rpm", "repo_name": "baseos" }, + { "package": "python3-firewall", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "vim-enhanced", "type": "rpm", "repo_name": "appstream"}, + { "package": "fuse-overlayfs", "type": "rpm", "repo_name": "appstream"}, + { "package": "podman", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubeadm-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "kubelet-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "container-selinux", "type": "rpm", "repo_name": "appstream"}, + { "package": "cri-o-1.35.1", "type": "rpm", 
"repo_name": "cri-o-v1-35"}, + { "package": "docker.io/victoriametrics/victoria-metrics", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmagent", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vminsert", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vmselect", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/victoria-logs", "type": "image", "tag": "v1.49.0" }, + { "package": "docker.io/victoriametrics/vlagent", "type": "image", "tag": "v1.49.0" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, + { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, + { "package": "docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, + { "package": "docker.io/library/mysql", "type": "image", "tag": "9.3.0" }, + { "package": "docker.io/dellhpcomniaaisolution/idrac_telemetry_receiver", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/kafkapump", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.2" }, + { "package": "cryptography==45.0.7", "type": "pip_module" }, + { "package": "omsdk==1.2.518", "type": "pip_module" }, + { "package": "cffi==1.17.1", "type": "pip_module" }, + { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, + { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, + { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.11.0", "type": "image" }, + { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.148.0", "type": 
"image" }, + { "package": "docker.io/nginxinc/nginx-unprivileged", "tag": "1.29", "type": "image" }, + { "package": "karavi-observability", "type": "git", "url": "https://github.com/dell/karavi-observability.git", "version": "v1.12.0" }, + { "package": "helm-charts", "type": "git", "url": "https://github.com/dell/helm-charts.git", "version": "container-storage-modules-1.9.2" }, + { "package": "cert-manager-v1.10.0", "type": "tarball", "url": "https://charts.jetstack.io/charts/cert-manager-v1.10.0.tgz" }, + { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, + { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "docker.io/victoriametrics/operator", "tag": "v0.68.3", "type": "image" }, + { "package": "docker.io/victoriametrics/operator", "tag": "config-reloader-v0.68.3", "type": "image" }, + { "package": "victoria-metrics-operator-0.59.3", "type": "tarball", "url": "https://github.com/VictoriaMetrics/helm-charts/releases/download/victoria-metrics-operator-0.59.3/victoria-metrics-operator-0.59.3.tgz" }, + { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + ] + }, + "service_kube_control_plane": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.13.1", "type": 
"image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.6-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, + { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==35.0.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + "service_kube_control_plane_first": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.13.1", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.6-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", 
"type": "image" }, + { + "package": "calico-v3.31.4", + "type": "manifest", + "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.31.4/manifests/calico.yaml" + }, + { + "package": "metallb-native-v0.15.3", + "type": "manifest", + "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.3/config/manifests/metallb-native.yaml" + }, + { "package": "helm-v3.20.1-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.20.1-linux-amd64.tar.gz" }, + { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, + { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==35.0.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + + "service_kube_node": { + "cluster": [ + { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, + { "package": "quay.io/metallb/controller", "tag": "v0.15.3", "type": "image" } + ] + } +} \ No newline at end of file diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 7eca8c9346..73aaa09b51 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -50,7 +50,7 @@ # Notes: # - Do not use Jinja variables in this configuration. # - Omit SSL fields entirely if SSL is not in use. -# - Its a mandatory field in case of slurm_custom with name as 'slurm_custom' +# - Version-specific naming (e.g., 'kubernetes-v1-35', 'cri-o-v1-35') is used only for service_k8s # # 3. 
user_repo_url_aarch64 #--------------------------- @@ -180,11 +180,20 @@ rhel_os_url_aarch64: rhel_subscription_repo_config_x86_64: rhel_subscription_repo_config_aarch64: # Making incorrect changes to this variable can cause omnia failure. Please edit cautiously. +# ============================================================================ +# VERSIONED REPOSITORY NAMING CONVENTION (Omnia 2.2+) +# ============================================================================ +# Starting from Omnia 2.2, repositories use versioned naming: +# - kubernetes-v<major>-<minor> (e.g., kubernetes-v1-35) +# - cri-o-v<major>-<minor> (e.g., cri-o-v1-35) +# Version-specific naming is used only for service_k8s components (kubernetes, cri-o) +# Other components (doca, cuda, slurm_custom) use non-versioned naming +# ============================================================================ omnia_repo_url_rhel_x86_64: - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"} - - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "cri-o"} + - { url: "https://pkgs.k8s.io/core:/stable:/v1.35/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.35/rpm/repodata/repomd.xml.key", name: "kubernetes-v1-35"} + - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.35/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.35/rpm/repodata/repomd.xml.key", name: "cri-o-v1-35"} - { url: 
"https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/repodata/repomd.xml.key", name: "doca"} - { url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/", gpgkey: "https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/repodata/repomd.xml.key", name: "cuda"} omnia_repo_url_rhel_aarch64: diff --git a/input/software_config.json b/input/software_config.json index 8fa558bf28..70e3d679ee 100644 --- a/input/software_config.json +++ b/input/software_config.json @@ -6,7 +6,7 @@ {"name": "default_packages", "arch": ["x86_64","aarch64"]}, {"name": "admin_debug_packages", "arch": ["x86_64","aarch64"]}, {"name": "openldap", "arch": ["x86_64","aarch64"]}, - {"name": "service_k8s","version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s","version": "1.35.1", "arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, {"name": "ldms", "arch": ["x86_64","aarch64"]}, diff --git a/local_repo/roles/validation/tasks/validate_software_config_json.yml b/local_repo/roles/validation/tasks/validate_software_config_json.yml index 0a221ebfee..190904eb95 100644 --- a/local_repo/roles/validation/tasks/validate_software_config_json.yml +++ b/local_repo/roles/validation/tasks/validate_software_config_json.yml @@ -91,4 +91,4 @@ msg: "{{ fail_msg }}" when: - service_k8s_support - - service_k8s_version != default_k8s_version + - service_k8s_version not in supported_k8s_versions diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml index 88cceea868..8720bec1de 100644 --- a/local_repo/roles/validation/vars/main.yml +++ b/local_repo/roles/validation/vars/main.yml @@ -106,11 +106,14 @@ specific_softwares: - 'intelgaudi' - 'openmpi' - 'bcm_roce_libraries' -default_k8s_version: "1.34.1" +default_k8s_version: "1.35.1" 
+supported_k8s_versions: + - "1.34.1" + - "1.35.1" fail_msg: >- service_k8s is not supported for version: {{ service_k8s_version }}. - Please update the service_k8s version in software_config.json to {{ default_k8s_version }} - and rerun the playbook. + Please update the service_k8s version in software_config.json to a supported version + ({{ supported_k8s_versions | join(', ') }}) and rerun the playbook. versions_fail_msg: "Versions were not defined for the following softwares: {{ failed_softwares | join(', ') }} in software_config.json. Refer examples/template_{{ cluster_os_type }}_software_config.json and define version details accordingly in {{ project_input_path }}/software_config.json" @@ -175,6 +178,10 @@ http_key: http # Usage: validate_metadata.yml meta_dest: "{{ nfs_shared_path }}/offline_repo/.data" metadata_file_path: "{{ meta_dest }}/localrepo_metadata.yml" +metadata_identical_msg: "Metadata is identical. No changes detected." +metadata_warn_msg: | + WARNING: Metadata has changed since last run. + This may indicate changes in software_config.json or local_repo_config.yml. build_stream_auto_accept_metadata_msg: "Build stream is enabled, automatically accepting metadata changes." 
# Usage: remove_k8s_line.yml diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml b/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml index 9bbda37138..b3b4c76fb9 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml @@ -31,10 +31,10 @@ k8s_versions: "{{ software_config.softwares | selectattr('name', 'in', ['compute_k8s', 'service_k8s']) | map(attribute='version') | list | unique }}" # noqa: yaml[line-length] k8s_arch: "{{ (software_config.softwares | selectattr('name', 'in', ['compute_k8s', 'service_k8s']) | first).get('arch', default_archs) }}" - - name: Set k8s_support_check to false if any k8s version is not default_k8s_version + - name: Set k8s_support_check to false if any k8s version is not in supported_k8s_versions ansible.builtin.set_fact: k8s_support_check: false - when: (k8s_versions | select('ne', default_k8s_version) | list | length) > 0 + when: (k8s_versions | reject('in', supported_k8s_versions) | list | length) > 0 - name: Fail if unsupported service_k8s version is detected ansible.builtin.fail: diff --git a/prepare_oim/roles/prepare_oim_validation/vars/main.yml b/prepare_oim/roles/prepare_oim_validation/vars/main.yml index 79bd5f5b4d..7ee5cfd5a9 100644 --- a/prepare_oim/roles/prepare_oim_validation/vars/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/vars/main.yml @@ -32,13 +32,16 @@ software_config_syntax_fail_msg: "Failed. Syntax errors present in software_conf file_permission: "0755" # Usage: check_k8s_support.yml -fail_msg_k8s_version: "Failed. Kubernetes Version is unsupported or incorrect in software_config.json. 
Update software_config.json with a supported Kubernetes versions and re-run the playbook.Supported versions are - {{ supported_k8s_version }}" # noqa: yaml[line-length] -invalid_k8s_versions: "{{ k8s_versions | select('ne', default_k8s_version) | list }}" +fail_msg_k8s_version: "Failed. Kubernetes Version is unsupported or incorrect in software_config.json. Update software_config.json with a supported Kubernetes versions and re-run the playbook.Supported versions are - {{ supported_k8s_versions }}" # noqa: yaml[line-length] +invalid_k8s_versions: "{{ k8s_versions | reject('in', supported_k8s_versions) | list }}" fail_msg: >- service_k8s is not supported for version: {{ invalid_k8s_versions }}. - Please update the service_k8s version in software_config.json to {{ default_k8s_version }} - and rerun the playbook. -default_k8s_version: "1.34.1" + Please update the service_k8s version in software_config.json to a supported version + ({{ supported_k8s_versions | join(', ') }}) and rerun the playbook. 
+default_k8s_version: "1.35.1" +supported_k8s_versions: + - "1.34.1" + - "1.35.1" # Usage: validate_network_spec.yml network_spec: "{{ input_project_dir }}/network_spec.yml" diff --git a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml index ac573840b9..45c1b2e818 100644 --- a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -197,7 +197,7 @@ - name: Include local repo access variable file ansible.builtin.include_vars: "{{ local_repo_access_config_file }}" -- name: Load service_k8s.json +- name: Load service_k8s_<version>.json ansible.builtin.set_fact: k8s_packages_json: "{{ lookup('file', k8s_packages_file) | from_json }}" diff --git a/provision/roles/k8s_config/vars/main.yml b/provision/roles/k8s_config/vars/main.yml index 5785565568..9f0dfff2ee 100644 --- a/provision/roles/k8s_config/vars/main.yml +++ b/provision/roles/k8s_config/vars/main.yml @@ -15,7 +15,8 @@ local_repo_access_config_file: "/opt/omnia/provision/local_repo_access.yml" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" -k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s.json" # noqa: yaml[line-length] +# Versioned JSON file: service_k8s_v<version>.json (e.g., service_k8s_v1.35.1.json) +k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s_v{{ hostvars['localhost']['service_k8s_version'] }}.json" # noqa: yaml[line-length] calico_manifest_yaml_url: "{{ offline_manifest_path }}/{{ calico_package }}/{{ calico_package }}.yml" metallb_manifest_yaml_url: "{{ offline_manifest_path }}/{{ metallb_package }}/{{ metallb_package }}.yml" multus_manifest_yaml_url: "{{ offline_manifest_path }}/{{ multus_package }}/{{ multus_package }}.yml" diff --git 
a/provision/roles/telemetry/tasks/load_service_images.yml b/provision/roles/telemetry/tasks/load_service_images.yml index 893b830fb2..654c73c9a7 100644 --- a/provision/roles/telemetry/tasks/load_service_images.yml +++ b/provision/roles/telemetry/tasks/load_service_images.yml @@ -13,7 +13,7 @@ # limitations under the License. --- -- name: Extract image packages from service_k8s.json +- name: Extract image packages from service_k8s_.json ansible.builtin.set_fact: service_k8s_image_list: "{{ telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'image') | list }}" diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index 1693a8f1cd..743c67889d 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -24,7 +24,7 @@ - name: Derive sink support flags from collection_targets ansible.builtin.include_tasks: derive_sink_support_flags.yml -- name: Load service images from service_k8s.json +- name: Load service images from service_k8s_.json ansible.builtin.include_tasks: load_service_images.yml - name: Check kube_vip reachability for validation diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml index 005f9e65a2..3bc1a52637 100644 --- a/provision/roles/telemetry/tasks/read_software_config.yml +++ b/provision/roles/telemetry/tasks/read_software_config.yml @@ -35,11 +35,11 @@ ansible.builtin.set_fact: cluster_os_version: "{{ software_config['cluster_os_version'] }}" -- name: Load service_k8s.json +- name: Load service_k8s_.json ansible.builtin.set_fact: telemetry_packages: "{{ lookup('file', k8s_packages_file) | from_json }}" -- name: Extract service_k8s.json and set facts for pip_modules and python_version +- name: Extract service_k8s_.json and set facts for pip_modules and python_version ansible.builtin.set_fact: k8s_pip_packages: >- {{ 
telemetry_packages['service_kube_control_plane']['cluster'] diff --git a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 index 3709759f78..acd8d35029 100644 --- a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 @@ -25,7 +25,7 @@ spec: tolerationSeconds: 30 # Evict after 30s if node is unreachable containers: - name: kubectl-cleanup - image: docker.io/alpine/kubectl:1.34.1 + image: docker.io/alpine/kubectl:1.35.1 command: - /bin/sh - -c diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 6c8756c7c2..6282d097ee 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -20,7 +20,8 @@ service_cluster_metadata_path: "/opt/omnia/.data/service_cluster_metadata.yml" metadata_perm: "0644" # Usage: read_software_config.yml -k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s.json" +# Versioned JSON file: service_k8s_v<version>.json (e.g., service_k8s_v1.35.1.json) +k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s_v{{ hostvars['localhost']['service_k8s_version'] }}.json" # Usage: secrets_creation.yml mysqldb_secrets_name: mysqldb-credentials @@ -68,7 +69,7 @@ kafka: service_name: "kafka-headless" lb_service_name: "kafka-loadbalancer" container_port1: 9093 - # Kafka images from service_k8s.json + # Kafka images from service_k8s_<version>.json operator_image: "{{ telemetry_images['strimzi/operator'] | default('quay.io/strimzi/operator:0.48.0') }}" kafka_image: "{{ telemetry_images['strimzi/kafka'] | default('quay.io/strimzi/kafka:0.48.0-kafka-4.1.0') }}" bridge_image: "{{ 
telemetry_images['strimzi/kafka-bridge'] | default('quay.io/strimzi/kafka-bridge:0.33.1') }}" @@ -98,8 +99,8 @@ kafka: # name: "{{ ome_identifier }}.logs" # partitions: 2 -# Dynamic image configuration from service_k8s.json -# Images and versions are read dynamically from input/config/x86_64/rhel/10.0/service_k8s.json +# Dynamic image configuration from service_k8s_.json +# Images and versions are read dynamically from input/config/x86_64/rhel/10.0/service_k8s_.json telemetry_images: "{{ service_k8s_images | default({}) }}" # Usage: victoriametric_deployment.yml From 8267233d38d03dac614817e211bd0b3169e84b99 Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Thu, 14 May 2026 12:58:24 +0530 Subject: [PATCH 15/17] update service_k8s_v1.35.1.json and local_repo_config.yml Signed-off-by: Katakam-Rakesh --- .../input_validation/common_utils/config.py | 2 +- .../validation_flows/common_validation.py | 12 +- .../validation_flows/local_repo_validation.py | 21 ++- .../module_utils/local_repo/software_utils.py | 14 +- .../modules/image_package_collector.py | 26 +++- common/library/modules/prepare_tasklist.py | 9 +- examples/rhel_software_config.json | 2 +- ..._rhel_10.0_multi_arch_software_config.json | 2 +- ...late_rhel_10.0_x86-64_software_config.json | 2 +- .../x86_64/rhel/10.0/service_k8s_v1.34.1.json | 108 ++++++++++++++++ .../x86_64/rhel/10.0/service_k8s_v1.35.1.json | 122 ++++++++++++++++++ input/local_repo_config.yml | 14 +- input/software_config.json | 2 +- .../tasks/validate_software_config_json.yml | 2 +- local_repo/roles/validation/vars/main.yml | 13 +- .../tasks/check_k8s_support.yml | 4 +- .../prepare_oim_validation/vars/main.yml | 13 +- .../tasks/create_k8s_config_nfs.yml | 2 +- provision/roles/k8s_config/vars/main.yml | 3 +- .../telemetry/tasks/load_service_images.yml | 2 +- provision/roles/telemetry/tasks/main.yml | 2 +- .../telemetry/tasks/read_software_config.yml | 4 +- .../common/telemetry_pod_cleanup.yaml.j2 | 2 +- provision/roles/telemetry/vars/main.yml 
| 9 +- 24 files changed, 348 insertions(+), 44 deletions(-) create mode 100644 input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json create mode 100644 input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 095849d88f..d1bb4b5b61 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -132,7 +132,7 @@ "openmpi": "5.0.8", "csi_driver_powerscale": "v2.15.0", "rocm": "6.3.1", - "service_k8s": "1.34.1" + "service_k8s": "1.35.1" } # All of the passwords fields diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 6ff8df745c..363be36fd7 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -252,15 +252,23 @@ def validate_software_config( for software_pkg in data['softwares']: software = software_pkg['name'] arch_list = software_pkg.get('arch') + # Get software version for versioned JSON files (e.g., service_k8s_v1.35.1.json) + software_version = software_pkg.get('version') for arch in arch_list: json_path = get_json_file_path( - software, cluster_os_type, cluster_os_version, input_file_path, arch) + software, cluster_os_type, cluster_os_version, input_file_path, arch, + software_version=software_version) # Check if json_path is None or if the JSON syntax is invalid if not json_path: + # Construct expected filename for error message + if software == "service_k8s" and software_version: + expected_file = f"{software}_v{software_version}.json" + else: + expected_file = f"{software}.json" errors.append( create_error_msg( "Validation Error: ", software, - f"is 
present in software_config.json. JSON file not found: {software}.json" + f"is present in software_config.json. JSON file not found: {expected_file}" ) ) else: diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index 8254faca1e..e81ecd8be7 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -21,7 +21,7 @@ from ansible.module_utils.input_validation.common_utils import validation_utils from ansible.module_utils.input_validation.common_utils import config from ansible.module_utils.input_validation.common_utils import en_us_validation_msg -from ansible.module_utils.local_repo.software_utils import load_yaml, load_json +from ansible.module_utils.local_repo.software_utils import load_yaml, load_json, get_json_file_path file_names = config.files create_error_msg = validation_utils.create_error_msg @@ -239,13 +239,22 @@ def validate_local_repo_config(input_file_path, data, for software in software_config_json["softwares"]: sw = software["name"] arch_list = software.get("arch") + # Get software version for versioned JSON files (e.g., service_k8s_v1.35.1.json) + software_version = software.get("version") for arch in arch_list: - json_path = create_file_path( - input_file_path, - f"config/{arch}{os_ver_path}" + sw +".json") - if not os.path.exists(json_path): + # Use get_json_file_path for proper versioned JSON file resolution + json_path = get_json_file_path( + sw, cluster_os_type, cluster_os_version, + software_config_file_path, arch, + software_version=software_version) + if not json_path or not os.path.exists(json_path): + # Construct expected filename for error message + if sw == "service_k8s" and software_version: + expected_file = f"{sw}_v{software_version}.json" + else: + expected_file = 
f"{sw}.json" errors.append( - create_error_msg(sw + '/' + arch, f"{sw} JSON file not found for architecture {arch}.", json_path)) + create_error_msg(sw + '/' + arch, f"{sw} JSON file not found for architecture {arch}.", expected_file)) else: curr_json = load_json(json_path) pkg_list = curr_json[sw]['cluster'] diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index bc5da2876a..d3306d58b8 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -118,7 +118,7 @@ def load_yaml(file_path): return yaml.safe_load(file) def get_json_file_path(software_name, cluster_os_type, - cluster_os_version, user_json_path, arch): + cluster_os_version, user_json_path, arch, software_version=None): """ Generate the file path for a JSON file based on the provided software name, cluster OS type, cluster OS version, and user JSON path. @@ -129,13 +129,23 @@ def get_json_file_path(software_name, cluster_os_type, cluster_os_version (str): The version of the cluster operating system. user_json_path (str): The path to the user JSON file. arch: Architecture for a particular software + software_version (str, optional): Version of the software for versioned JSON files. + Used for software like service_k8s that have versioned JSON files + (e.g., service_k8s_v1.35.1.json). Returns: str or None: The file path for the JSON file if it exists, otherwise None. 
""" base_path = os.path.dirname(os.path.abspath(user_json_path)) + + # Handle versioned JSON files (e.g., service_k8s_v1.35.1.json) + if software_name == "service_k8s" and software_version: + json_filename = f"{software_name}_v{software_version}.json" + else: + json_filename = f"{software_name}.json" + json_path = os.path.join(base_path, - f'{SOFTWARE_CONFIG_SUBDIR}/{arch}/{cluster_os_type}/{cluster_os_version}/{software_name}.json' + f'{SOFTWARE_CONFIG_SUBDIR}/{arch}/{cluster_os_type}/{cluster_os_version}/{json_filename}' ) return json_path diff --git a/common/library/modules/image_package_collector.py b/common/library/modules/image_package_collector.py index 77ff67b50d..90a315d750 100644 --- a/common/library/modules/image_package_collector.py +++ b/common/library/modules/image_package_collector.py @@ -151,7 +151,12 @@ def process_functional_group(fg_name, arch, os_version, input_project_dir, packages = [] for json_file in json_files: + # Extract software name from json file + # Handle versioned files like service_k8s_v1.35.1.json -> service_k8s sw_name = json_file.replace(".json", "") + # Remove version suffix for versioned files (e.g., service_k8s_v1.35.1 -> service_k8s) + if sw_name.startswith("service_k8s_v"): + sw_name = "service_k8s" if sw_name not in allowed_softwares: continue @@ -170,7 +175,8 @@ def process_functional_group(fg_name, arch, os_version, input_project_dir, sw_data, fg_name=fg_name, slurm_defined=True ) ) - elif json_file == "service_k8s.json": + elif json_file.startswith("service_k8s_v"): + # Handle versioned service_k8s_v.json files packages.extend( collect_packages_from_json( sw_data, fg_name=fg_name, service_k8s_defined=True @@ -194,6 +200,7 @@ def run_module(): software_config_file=dict(type="str", required=True), input_project_dir=dict(type="str", required=True), additional_json_path=dict(type="str", required=False, default=""), + service_k8s_version=dict(type="str", required=False, default=""), ) result = dict( @@ -212,6 +219,7 @@ def 
run_module(): software_config_file = module.params["software_config_file"] input_project_dir = module.params["input_project_dir"] additional_json_path = module.params["additional_json_path"] + service_k8s_version = module.params["service_k8s_version"] software_config = load_json_file(software_config_file, module) if not software_config: @@ -221,6 +229,13 @@ def run_module(): if not os_version: module.fail_json(msg="cluster_os_version not found in software_config.json") + # Extract service_k8s version from software_config if not provided + if not service_k8s_version: + for sw in software_config.get("softwares", []): + if sw.get("name") == "service_k8s" and sw.get("version"): + service_k8s_version = sw["version"] + break + allowed_softwares = { sw["name"] for sw in software_config.get("softwares", []) } @@ -229,14 +244,17 @@ def run_module(): additional_enabled = is_additional_packages_enabled(software_config) allowed_additional_subgroups = get_allowed_additional_subgroups(software_config) if additional_enabled else [] + # Versioned JSON file for service_k8s: service_k8s_v.json + service_k8s_json = f"service_k8s_v{service_k8s_version}.json" if service_k8s_version else "service_k8s.json" + # pylint: disable=line-too-long # Functional group → json files mapping software_map = { "os_x86_64": ["default_packages.json", "ldms.json"], "os_aarch64": ["default_packages.json", "ldms.json"], - "service_kube_node_x86_64": ["service_k8s.json"], - "service_kube_control_plane_first_x86_64": ["service_k8s.json"], - "service_kube_control_plane_x86_64": ["service_k8s.json"], + "service_kube_node_x86_64": [service_k8s_json], + "service_kube_control_plane_first_x86_64": [service_k8s_json], + "service_kube_control_plane_x86_64": [service_k8s_json], "slurm_control_node_x86_64": ["slurm_custom.json", "openldap.json", "ldms.json"], "slurm_node_x86_64": ["slurm_custom.json", "openldap.json", "ldms.json"], "login_node_x86_64": ["slurm_custom.json", "openldap.json", "ldms.json"], diff --git 
a/common/library/modules/prepare_tasklist.py b/common/library/modules/prepare_tasklist.py index 9714c7aaf0..688774cdd7 100644 --- a/common/library/modules/prepare_tasklist.py +++ b/common/library/modules/prepare_tasklist.py @@ -123,8 +123,15 @@ def main(): logger.info("Preparing package lists...") for software in software_list[arch]: logger.info(f"Processing software: {software}") + # Get software version for versioned JSON files (e.g., service_k8s_v1.35.1.json) + software_version = None + for sw in user_data.get("softwares", []): + if sw.get("name") == software and sw.get("version"): + software_version = sw["version"] + break json_path[arch] = get_json_file_path(software, cluster_os_type, - cluster_os_version, user_json_file, arch) + cluster_os_version, user_json_file, arch, + software_version=software_version) status_csv_path[arch] = get_csv_file_path(software, log_dir, arch) logger.info(f"json_path: {json_path}") logger.info(f"status_csv_path: {status_csv_path}") diff --git a/examples/rhel_software_config.json b/examples/rhel_software_config.json index 394ef53120..b9f60b3f3d 100644 --- a/examples/rhel_software_config.json +++ b/examples/rhel_software_config.json @@ -6,7 +6,7 @@ {"name": "default_packages", "arch": ["x86_64","aarch64"]}, {"name": "admin_debug_packages", "arch": ["x86_64","aarch64"]}, {"name": "openldap", "arch": ["x86_64","aarch64"]}, - {"name": "service_k8s","version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s","version": "1.35.1", "arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, {"name": "ucx", "version": "1.19.0", "arch": ["x86_64","aarch64"]}, {"name": "openmpi", "version": "5.0.8", "arch": ["x86_64","aarch64"]}, diff --git a/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json b/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json index 83eaa12a8c..69bc80c84f 100644 --- 
a/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json +++ b/examples/software_config_template/template_rhel_10.0_multi_arch_software_config.json @@ -7,7 +7,7 @@ {"name": "admin_debug_packages", "arch": ["x86_64","aarch64"]}, {"name": "openldap", "arch": ["x86_64","aarch64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, - {"name": "service_k8s", "version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s", "version": "1.35.1", "arch": ["x86_64"]}, {"name": "ucx", "version": "1.19.0", "arch": ["x86_64","aarch64"]}, {"name": "openmpi", "version": "5.0.8", "arch": ["x86_64","aarch64"]}, {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, diff --git a/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json b/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json index 907958e590..650e912b78 100644 --- a/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json +++ b/examples/software_config_template/template_rhel_10.0_x86-64_software_config.json @@ -7,7 +7,7 @@ {"name": "admin_debug_packages", "arch": ["x86_64"]}, {"name": "openldap", "arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64"]}, - {"name": "service_k8s", "version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s", "version": "1.35.1", "arch": ["x86_64"]}, {"name": "ucx", "version": "1.19.0", "arch": ["x86_64"]}, {"name": "openmpi", "version": "5.0.8", "arch": ["x86_64"]}, {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json new file mode 100644 index 0000000000..6deed2309b --- /dev/null +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.34.1.json @@ -0,0 +1,108 @@ +{ + "service_k8s": { + "cluster": [ + { "package": "docker.io/library/busybox", "type": "image", "tag": "1.36" }, + { "package": 
"firewalld", "type": "rpm", "repo_name": "baseos" }, + { "package": "python3-firewall", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "vim-enhanced", "type": "rpm", "repo_name": "appstream"}, + { "package": "fuse-overlayfs", "type": "rpm", "repo_name": "appstream"}, + { "package": "podman", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubeadm-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "kubelet-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "container-selinux", "type": "rpm", "repo_name": "appstream"}, + { "package": "cri-o-1.34.1", "type": "rpm", "repo_name": "cri-o"}, + { "package": "docker.io/victoriametrics/victoria-metrics", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmagent", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vminsert", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vmselect", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, + { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, + { "package": "docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, + { "package": "docker.io/library/mysql", "type": "image", "tag": "9.3.0" }, + { "package": "docker.io/dellhpcomniaaisolution/idrac_telemetry_receiver", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/kafkapump", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.2" }, + { "package": "cryptography==45.0.7", "type": "pip_module" }, + { "package": "omsdk==1.2.518", "type": "pip_module" }, + { "package": "cffi==1.17.1", "type": "pip_module" }, + { "package": 
"quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, + { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, + { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, + { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + ] + }, + "service_kube_control_plane": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.12.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.4-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.30.3", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, + { "package": "kubectl-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", 
"repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==33.1.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + "service_kube_control_plane_first": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.34.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.12.1", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.34.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.4-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.30.3", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.30.3", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, + { + "package": "calico-v3.30.3", + "type": "manifest", + "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.30.3/manifests/calico.yaml" + }, + { + "package": "metallb-native-v0.15.2", + "type": "manifest", + "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.2/config/manifests/metallb-native.yaml" + }, + { "package": "helm-v3.19.0-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.19.0-linux-amd64.tar.gz" }, + { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": 
"https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, + { "package": "kubectl-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==33.1.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + + "service_kube_node": { + "cluster": [ + { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.2", "type": "image" }, + { "package": "quay.io/metallb/controller", "tag": "v0.15.2", "type": "image" } + ] + } +} + diff --git a/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json new file mode 100644 index 0000000000..88e2692fc8 --- /dev/null +++ b/input/config/x86_64/rhel/10.0/service_k8s_v1.35.1.json @@ -0,0 +1,122 @@ +{ + "service_k8s": { + "cluster": [ + { "package": "docker.io/library/busybox", "type": "image", "tag": "1.36" }, + { "package": "firewalld", "type": "rpm", "repo_name": "baseos" }, + { "package": "python3-firewall", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "vim-enhanced", "type": "rpm", "repo_name": "appstream"}, + { "package": "fuse-overlayfs", "type": "rpm", "repo_name": "appstream"}, + { "package": "podman", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubeadm-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "kubelet-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "container-selinux", "type": "rpm", "repo_name": "appstream"}, + { "package": "cri-o-1.35.1", "type": "rpm", 
"repo_name": "cri-o-v1-35"}, + { "package": "docker.io/victoriametrics/victoria-metrics", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmagent", "type": "image", "tag": "v1.128.0" }, + { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vminsert", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/vmselect", "type": "image", "tag": "v1.128.0-cluster" }, + { "package": "docker.io/victoriametrics/victoria-logs", "type": "image", "tag": "v1.50.0" }, + { "package": "docker.io/victoriametrics/vlagent", "type": "image", "tag": "v1.50.0" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, + { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, + { "package": "docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, + { "package": "docker.io/library/mysql", "type": "image", "tag": "9.3.0" }, + { "package": "docker.io/library/python", "type": "image", "tag": "3.11-slim" }, + { "package": "docker.io/dellhpcomniaaisolution/idrac_telemetry_receiver", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/kafkapump", "type": "image", "tag": "1.2" }, + { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.2" }, + { "package": "cryptography==45.0.7", "type": "pip_module" }, + { "package": "omsdk==1.2.518", "type": "pip_module" }, + { "package": "cffi==1.17.1", "type": "pip_module" }, + { "package": "prometheus_client==0.20.0", "type": "pip_module" }, + { "package": "kubernetes==33.1.0", "type": "pip_module" }, + { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, + { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, + { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, + { "package": 
"quay.io/dell/container-storage-modules/csm-metrics-powerscale", "tag": "v1.11.0", "type": "image" }, + { "package": "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector", "tag": "0.143.1", "type": "image" }, + { "package": "docker.io/nginxinc/nginx-unprivileged", "tag": "1.29", "type": "image" }, + { "package": "karavi-observability", "type": "git", "url": "https://github.com/dell/karavi-observability.git", "version": "v1.12.0" }, + { "package": "helm-charts", "type": "git", "url": "https://github.com/dell/helm-charts.git", "version": "container-storage-modules-1.9.2" }, + { "package": "quay.io/jetstack/cert-manager-controller", "tag": "v1.10.0", "type": "image" }, + { "package": "quay.io/jetstack/cert-manager-cainjector", "tag": "v1.10.0", "type": "image" }, + { "package": "quay.io/jetstack/cert-manager-webhook", "tag": "v1.10.0", "type": "image" }, + { "package": "quay.io/jetstack/cert-manager-acmesolver", "tag": "v1.10.0", "type": "image" }, + { "package": "cert-manager-v1.10.0", "type": "tarball", "url": "https://charts.jetstack.io/charts/cert-manager-v1.10.0.tgz" }, + { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, + { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "docker.io/victoriametrics/operator", "tag": "v0.68.3", "type": "image" }, + { "package": "docker.io/victoriametrics/operator", "tag": "config-reloader-v0.68.3", "type": "image" }, + { "package": "victoria-metrics-operator-0.59.3", "type": "tarball", "url": "https://github.com/VictoriaMetrics/helm-charts/releases/download/victoria-metrics-operator-0.59.3/victoria-metrics-operator-0.59.3.tgz" }, + { "package": "docker.io/timberio/vector", "tag": "0.54.0-debian", "type": "image" }, + { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, + { "package": 
"doca-ofed", "type": "rpm_repo", "repo_name": "doca" }, + { "package": "iscsi-initiator-utils", "type": "rpm", "repo_name": "baseos" }, + { "package": "device-mapper-multipath", "type": "rpm", "repo_name": "baseos" }, + { "package": "sg3_utils", "type": "rpm", "repo_name": "baseos" }, + { "package": "lsscsi", "type": "rpm", "repo_name": "baseos" } + ] + }, + "service_kube_control_plane": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": "v0.8.9", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.13.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.6-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, + { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { "package": "kubernetes==35.0.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + "service_kube_control_plane_first": { + "cluster": [ + { "package": "ghcr.io/kube-vip/kube-vip", "tag": 
"v0.8.9", "type": "image" }, + { "package": "registry.k8s.io/kube-apiserver", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-controller-manager", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-scheduler", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/kube-proxy", "tag": "v1.35.1", "type": "image" }, + { "package": "registry.k8s.io/coredns/coredns", "tag": "v1.13.1", "type": "image" }, + { "package": "docker.io/alpine/kubectl", "tag": "1.35.1", "type": "image" }, + { "package": "registry.k8s.io/pause", "tag": "3.10.1", "type": "image" }, + { "package": "registry.k8s.io/etcd", "tag": "3.6.6-0", "type": "image" }, + { "package": "docker.io/calico/cni", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/kube-controllers", "tag": "v3.31.4", "type": "image" }, + { "package": "docker.io/calico/node", "tag": "v3.31.4", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, + { "package": "calico-v3.31.4","type": "manifest", "url": "https://raw.githubusercontent.com/projectcalico/calico/v3.31.4/manifests/calico.yaml" }, + { "package": "metallb-native-v0.15.3", "type": "manifest", "url": "https://raw.githubusercontent.com/metallb/metallb/v0.15.3/config/manifests/metallb-native.yaml" }, + { "package": "helm-v3.20.1-amd64", "type": "tarball", "url": "https://get.helm.sh/helm-v3.20.1-linux-amd64.tar.gz" }, + { "package": "nfs-subdir-external-provisioner-4.0.18", "type": "tarball", "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" }, + { "package": "kubectl-1.35.1", "type": "rpm", "repo_name": "kubernetes-v1-35"}, + { "package": "prettytable==3.14.0", "type": "pip_module" }, + { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, + { "package": "git", "type": "rpm", "repo_name": "appstream"}, + { 
"package": "kubernetes==35.0.0", "type": "pip_module" }, + { "package": "PyMySQL==1.1.2", "type": "pip_module" } + + ] + }, + + "service_kube_node": { + "cluster": [ + { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", "type": "image" }, + { "package": "quay.io/metallb/speaker", "tag": "v0.15.3", "type": "image" }, + { "package": "quay.io/metallb/controller", "tag": "v0.15.3", "type": "image" } + ] + } +} \ No newline at end of file diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 7eca8c9346..2a0b0d5276 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -51,6 +51,7 @@ # - Do not use Jinja variables in this configuration. # - Omit SSL fields entirely if SSL is not in use. # - Its a mandatory field in case of slurm_custom with name as 'slurm_custom' +# - Version-specific naming (e.g., 'kubernetes-v1-35', 'cri-o-v1-35') is used only for service_k8s # # 3. user_repo_url_aarch64 #--------------------------- @@ -180,11 +181,20 @@ rhel_os_url_aarch64: rhel_subscription_repo_config_x86_64: rhel_subscription_repo_config_aarch64: # Making incorrect changes to this variable can cause omnia failure. Please edit cautiously. 
+# ============================================================================ +# VERSIONED REPOSITORY NAMING CONVENTION (Omnia 2.2+) +# ============================================================================ +# Starting from Omnia 2.2, repositories use versioned naming: +# - kubernetes-v<major>-<minor> (e.g., kubernetes-v1-35) +# - cri-o-v<major>-<minor> (e.g., cri-o-v1-35) +# Version-specific naming is used only for service_k8s components (kubernetes, cri-o) +# Other components (doca, cuda, slurm_custom) use non-versioned naming +# ============================================================================ omnia_repo_url_rhel_x86_64: - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"} - - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "cri-o"} + - { url: "https://pkgs.k8s.io/core:/stable:/v1.35/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.35/rpm/repodata/repomd.xml.key", name: "kubernetes-v1-35"} + - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.35/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.35/rpm/repodata/repomd.xml.key", name: "cri-o-v1-35"} - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/repodata/repomd.xml.key", name: "doca"} - { url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/", gpgkey:
"https://developer.download.nvidia.com/compute/cuda/repos/rhel10/x86_64/repodata/repomd.xml.key", name: "cuda"} omnia_repo_url_rhel_aarch64: diff --git a/input/software_config.json b/input/software_config.json index 8fa558bf28..70e3d679ee 100644 --- a/input/software_config.json +++ b/input/software_config.json @@ -6,7 +6,7 @@ {"name": "default_packages", "arch": ["x86_64","aarch64"]}, {"name": "admin_debug_packages", "arch": ["x86_64","aarch64"]}, {"name": "openldap", "arch": ["x86_64","aarch64"]}, - {"name": "service_k8s","version": "1.34.1", "arch": ["x86_64"]}, + {"name": "service_k8s","version": "1.35.1", "arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, {"name": "ldms", "arch": ["x86_64","aarch64"]}, diff --git a/local_repo/roles/validation/tasks/validate_software_config_json.yml b/local_repo/roles/validation/tasks/validate_software_config_json.yml index 0a221ebfee..190904eb95 100644 --- a/local_repo/roles/validation/tasks/validate_software_config_json.yml +++ b/local_repo/roles/validation/tasks/validate_software_config_json.yml @@ -91,4 +91,4 @@ msg: "{{ fail_msg }}" when: - service_k8s_support - - service_k8s_version != default_k8s_version + - service_k8s_version not in supported_k8s_versions diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml index 88cceea868..8720bec1de 100644 --- a/local_repo/roles/validation/vars/main.yml +++ b/local_repo/roles/validation/vars/main.yml @@ -106,11 +106,14 @@ specific_softwares: - 'intelgaudi' - 'openmpi' - 'bcm_roce_libraries' -default_k8s_version: "1.34.1" +default_k8s_version: "1.35.1" +supported_k8s_versions: + - "1.34.1" + - "1.35.1" fail_msg: >- service_k8s is not supported for version: {{ service_k8s_version }}. - Please update the service_k8s version in software_config.json to {{ default_k8s_version }} - and rerun the playbook. 
+ Please update the service_k8s version in software_config.json to a supported version + ({{ supported_k8s_versions | join(', ') }}) and rerun the playbook. versions_fail_msg: "Versions were not defined for the following softwares: {{ failed_softwares | join(', ') }} in software_config.json. Refer examples/template_{{ cluster_os_type }}_software_config.json and define version details accordingly in {{ project_input_path }}/software_config.json" @@ -175,6 +178,10 @@ http_key: http # Usage: validate_metadata.yml meta_dest: "{{ nfs_shared_path }}/offline_repo/.data" metadata_file_path: "{{ meta_dest }}/localrepo_metadata.yml" +metadata_identical_msg: "Metadata is identical. No changes detected." +metadata_warn_msg: | + WARNING: Metadata has changed since last run. + This may indicate changes in software_config.json or local_repo_config.yml. build_stream_auto_accept_metadata_msg: "Build stream is enabled, automatically accepting metadata changes." # Usage: remove_k8s_line.yml diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml b/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml index 9bbda37138..b3b4c76fb9 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/check_k8s_support.yml @@ -31,10 +31,10 @@ k8s_versions: "{{ software_config.softwares | selectattr('name', 'in', ['compute_k8s', 'service_k8s']) | map(attribute='version') | list | unique }}" # noqa: yaml[line-length] k8s_arch: "{{ (software_config.softwares | selectattr('name', 'in', ['compute_k8s', 'service_k8s']) | first).get('arch', default_archs) }}" - - name: Set k8s_support_check to false if any k8s version is not default_k8s_version + - name: Set k8s_support_check to false if any k8s version is not in supported_k8s_versions ansible.builtin.set_fact: k8s_support_check: false - when: (k8s_versions | select('ne', default_k8s_version) | list | length) > 0 + when: (k8s_versions | 
reject('in', supported_k8s_versions) | list | length) > 0 - name: Fail if unsupported service_k8s version is detected ansible.builtin.fail: diff --git a/prepare_oim/roles/prepare_oim_validation/vars/main.yml b/prepare_oim/roles/prepare_oim_validation/vars/main.yml index 79bd5f5b4d..7ee5cfd5a9 100644 --- a/prepare_oim/roles/prepare_oim_validation/vars/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/vars/main.yml @@ -32,13 +32,16 @@ software_config_syntax_fail_msg: "Failed. Syntax errors present in software_conf file_permission: "0755" # Usage: check_k8s_support.yml -fail_msg_k8s_version: "Failed. Kubernetes Version is unsupported or incorrect in software_config.json. Update software_config.json with a supported Kubernetes versions and re-run the playbook.Supported versions are - {{ supported_k8s_version }}" # noqa: yaml[line-length] -invalid_k8s_versions: "{{ k8s_versions | select('ne', default_k8s_version) | list }}" +fail_msg_k8s_version: "Failed. Kubernetes Version is unsupported or incorrect in software_config.json. Update software_config.json with a supported Kubernetes versions and re-run the playbook.Supported versions are - {{ supported_k8s_versions }}" # noqa: yaml[line-length] +invalid_k8s_versions: "{{ k8s_versions | reject('in', supported_k8s_versions) | list }}" fail_msg: >- service_k8s is not supported for version: {{ invalid_k8s_versions }}. - Please update the service_k8s version in software_config.json to {{ default_k8s_version }} - and rerun the playbook. -default_k8s_version: "1.34.1" + Please update the service_k8s version in software_config.json to a supported version + ({{ supported_k8s_versions | join(', ') }}) and rerun the playbook. 
+default_k8s_version: "1.35.1" +supported_k8s_versions: + - "1.34.1" + - "1.35.1" # Usage: validate_network_spec.yml network_spec: "{{ input_project_dir }}/network_spec.yml" diff --git a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml index ac573840b9..45c1b2e818 100644 --- a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -197,7 +197,7 @@ - name: Include local repo access variable file ansible.builtin.include_vars: "{{ local_repo_access_config_file }}" -- name: Load service_k8s.json +- name: Load service_k8s_<version>.json ansible.builtin.set_fact: k8s_packages_json: "{{ lookup('file', k8s_packages_file) | from_json }}" diff --git a/provision/roles/k8s_config/vars/main.yml b/provision/roles/k8s_config/vars/main.yml index 5785565568..9f0dfff2ee 100644 --- a/provision/roles/k8s_config/vars/main.yml +++ b/provision/roles/k8s_config/vars/main.yml @@ -15,7 +15,8 @@ local_repo_access_config_file: "/opt/omnia/provision/local_repo_access.yml" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" -k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s.json" # noqa: yaml[line-length] +# Versioned JSON file: service_k8s_v<version>.json (e.g., service_k8s_v1.35.1.json) +k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s_v{{ hostvars['localhost']['service_k8s_version'] }}.json" # noqa: yaml[line-length] calico_manifest_yaml_url: "{{ offline_manifest_path }}/{{ calico_package }}/{{ calico_package }}.yml" metallb_manifest_yaml_url: "{{ offline_manifest_path }}/{{ metallb_package }}/{{ metallb_package }}.yml" multus_manifest_yaml_url: "{{ offline_manifest_path }}/{{ multus_package }}/{{ multus_package }}.yml" diff --git
a/provision/roles/telemetry/tasks/load_service_images.yml b/provision/roles/telemetry/tasks/load_service_images.yml index 893b830fb2..654c73c9a7 100644 --- a/provision/roles/telemetry/tasks/load_service_images.yml +++ b/provision/roles/telemetry/tasks/load_service_images.yml @@ -13,7 +13,7 @@ # limitations under the License. --- -- name: Extract image packages from service_k8s.json +- name: Extract image packages from service_k8s_<version>.json ansible.builtin.set_fact: service_k8s_image_list: "{{ telemetry_packages['service_k8s']['cluster'] | selectattr('type', 'equalto', 'image') | list }}" diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index 1693a8f1cd..743c67889d 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -24,7 +24,7 @@ - name: Derive sink support flags from collection_targets ansible.builtin.include_tasks: derive_sink_support_flags.yml -- name: Load service images from service_k8s.json +- name: Load service images from service_k8s_<version>.json ansible.builtin.include_tasks: load_service_images.yml - name: Check kube_vip reachability for validation diff --git a/provision/roles/telemetry/tasks/read_software_config.yml b/provision/roles/telemetry/tasks/read_software_config.yml index 005f9e65a2..3bc1a52637 100644 --- a/provision/roles/telemetry/tasks/read_software_config.yml +++ b/provision/roles/telemetry/tasks/read_software_config.yml @@ -35,11 +35,11 @@ ansible.builtin.set_fact: cluster_os_version: "{{ software_config['cluster_os_version'] }}" -- name: Load service_k8s.json +- name: Load service_k8s_<version>.json ansible.builtin.set_fact: telemetry_packages: "{{ lookup('file', k8s_packages_file) | from_json }}" -- name: Extract service_k8s.json and set facts for pip_modules and python_version +- name: Extract service_k8s_<version>.json and set facts for pip_modules and python_version ansible.builtin.set_fact: k8s_pip_packages: >- {{
telemetry_packages['service_kube_control_plane']['cluster'] diff --git a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 index 3709759f78..acd8d35029 100644 --- a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 @@ -25,7 +25,7 @@ spec: tolerationSeconds: 30 # Evict after 30s if node is unreachable containers: - name: kubectl-cleanup - image: docker.io/alpine/kubectl:1.34.1 + image: docker.io/alpine/kubectl:1.35.1 command: - /bin/sh - -c diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 6c8756c7c2..6282d097ee 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -20,7 +20,8 @@ service_cluster_metadata_path: "/opt/omnia/.data/service_cluster_metadata.yml" metadata_perm: "0644" # Usage: read_software_config.yml -k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s.json" +# Versioned JSON file: service_k8s_v<version>.json (e.g., service_k8s_v1.35.1.json) +k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s_v{{ hostvars['localhost']['service_k8s_version'] }}.json" # Usage: secrets_creation.yml mysqldb_secrets_name: mysqldb-credentials @@ -68,7 +69,7 @@ kafka: service_name: "kafka-headless" lb_service_name: "kafka-loadbalancer" container_port1: 9093 - # Kafka images from service_k8s.json + # Kafka images from service_k8s_<version>.json operator_image: "{{ telemetry_images['strimzi/operator'] | default('quay.io/strimzi/operator:0.48.0') }}" kafka_image: "{{ telemetry_images['strimzi/kafka'] | default('quay.io/strimzi/kafka:0.48.0-kafka-4.1.0') }}" bridge_image: "{{
telemetry_images['strimzi/kafka-bridge'] | default('quay.io/strimzi/kafka-bridge:0.33.1') }}" @@ -98,8 +99,8 @@ kafka: # name: "{{ ome_identifier }}.logs" # partitions: 2 -# Dynamic image configuration from service_k8s.json -# Images and versions are read dynamically from input/config/x86_64/rhel/10.0/service_k8s.json +# Dynamic image configuration from service_k8s_.json +# Images and versions are read dynamically from input/config/x86_64/rhel/10.0/service_k8s_.json telemetry_images: "{{ service_k8s_images | default({}) }}" # Usage: victoriametric_deployment.yml From cb209109e3926054bba3da2f1685eb67e8c60f98 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Thu, 14 May 2026 13:49:30 +0530 Subject: [PATCH 16/17] Merge pull request #4414 from jagadeeshnv/pub/q2_upgrade Storage input upgrade --- upgrade/prepare_upgrade.yml | 8 +- .../tasks/transform_storage_config.yml | 19 +- .../templates/storage_config.j2 | 283 ++++++++++++++---- 3 files changed, 239 insertions(+), 71 deletions(-) diff --git a/upgrade/prepare_upgrade.yml b/upgrade/prepare_upgrade.yml index 37cadecd34..c18f34b924 100644 --- a/upgrade/prepare_upgrade.yml +++ b/upgrade/prepare_upgrade.yml @@ -83,7 +83,8 @@ 6. local_repo_config.yml - Preserved and validated 7. security_config.yml - Restored from backup 8. software_config.json - Restored from backup - 9. Credentials - Restored and re-encrypted + 9. storage_config.yml - Preserved and validated + 10. Credentials - Restored and re-encrypted ------------------------------------------------------------------------ SECTION 2: REVIEW REQUIRED (new parameters added in 2.2) @@ -97,8 +98,9 @@ File 2: storage_config.yml 1. nfs_client_params is replaced by 'mounts' format 2. mount_params profiles (nfs_default, vast_nfs, vast_rdma) - 3. functional_group_prefix, node_key, permissions, swap - 4. powervault_config (now a list with mount_point) + 3. powervault_config (now a list with mount_point) + 4. 
swap (new swap configuration) + 5. s3_configurations (new S3 storage configuration) File 3: telemetry_config.yml 1. Restructured to: sources, bridges, sinks architecture diff --git a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml index 81ce14c5f9..a9b12b424e 100644 --- a/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml +++ b/upgrade/roles/import_input_parameters/tasks/transform_storage_config.yml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. --- - - name: Check if backup storage_config.yml exists ansible.builtin.stat: path: "{{ backup_location }}/storage_config.yml" @@ -42,8 +41,24 @@ ansible.builtin.set_fact: backup_storage_config: "{{ backup_storage_config_slurp.content | b64decode | from_yaml }}" -- name: Normalize storage_config.yml values +- name: Read backup omnia_config.yml + ansible.builtin.slurp: + src: "{{ backup_location }}/omnia_config.yml" + register: backup_omnia_config_slurp + +- name: Parse backup omnia_config.yml + ansible.builtin.set_fact: + backup_omnia_config: "{{ backup_omnia_config_slurp.content | b64decode | from_yaml }}" + +- name: Set facts for slurm + ansible.builtin.set_fact: + slurm_nfs_storage_name: "{{ backup_omnia_config.slurm_cluster[0].nfs_storage_name | default(None) }}" + k8s_nfs_storage_name: "{{ backup_omnia_config.service_k8s_cluster[0].nfs_storage_name | default(None) }}" + +- name: Read the slurm mount point ansible.builtin.set_fact: + slurm_nfs_client_params: "{{ (backup_storage_config.nfs_client_params | selectattr('nfs_name', 'equalto', slurm_nfs_storage_name) | first | default({})) }}" + k8s_nfs_client_params: "{{ (backup_storage_config.nfs_client_params | selectattr('nfs_name', 'equalto', k8s_nfs_storage_name) | first | default({})) }}" storage_nfs_client_params: "{{ backup_storage_config.nfs_client_params | default([]) }}" 
storage_powervault_config: "{{ backup_storage_config.powervault_config | default({}) }}" diff --git a/upgrade/roles/import_input_parameters/templates/storage_config.j2 b/upgrade/roles/import_input_parameters/templates/storage_config.j2 index 5f6d35c972..eedc3265f8 100644 --- a/upgrade/roles/import_input_parameters/templates/storage_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/storage_config.j2 @@ -12,81 +12,232 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -# *********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. -# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -# *********************************************************************** +# -------------------------------------- Mounts------------------------------------------------ +# mounts +# Configure mount points compatible with cloud-init mounts module. +# Source must be known at boot time (NFS paths, UUIDs, local devices). +# For runtime-discovered sources (iSCSI/multipath), use powervault_config below. +# +# Each mount entry contains the following fields (matching /etc/fstab format): +# - name: Unique identifier for this mount entry. Required +# - source: Device or network path (e.g., /dev/sdc, UUID=xxx, 192.168.1.100:/share). Required +# - mount_point: Mount point path (e.g., /mnt, /opt/data). Required +# - fs_type: Filesystem type (e.g., ext4, xfs, nfs, nfs4, cifs, auto). Optional +# - If specified, takes PRIORITY over mount_params profile +# - mnt_opts: Mount options (e.g., defaults,noexec,nofail). Optional +# - If specified, takes PRIORITY over mount_params profile +# - dump_freq: Dump frequency (usually "0"). Optional +# - If specified, takes PRIORITY over mount_params profile +# - fsck_pass: Fsck pass number (usually "0" or "2"). 
Optional +# - If specified, takes PRIORITY over mount_params profile +# - mount_params: Name of a profile in mount_params section. Optional +# - Used ONLY for fields not explicitly specified in the mount entry +# - permissions: Directory ownership and mode for the mount point. Optional +# - owner: User owner (name or numeric UID, e.g., "root", "slurm", "1001"). Default: "root" +# - group: Group owner (name or numeric GID, e.g., "root", "slurm", "1001"). Default: "root" +# - mode: Octal permission string (e.g., "0755", "1777"). Default: "0755" +# - Applied via cloud-init runcmd (chown + chmod) after mount +# - node_key: cloud-init datasource variable accessible via 'cloud-init query ' (e.g., "local_hostname", "ds.meta_data.instance_data.local_ipv4"). Optional +# - When present, implies per-node bind mount where source path is constructed using the queried variable value +# - The variable chosen must be unique per host to ensure isolation between nodes +# - fs_type forced to "none", mnt_opts forced to "bind" (automatic) +# - Source becomes: // +# - node_mount_point: List of bind mount targets. Required when node_key is set +# - Each target gets: // -> +# - functional_group_prefix: List of functional group prefixes. Optional +# - All nodes whose role starts with any prefix get this mount +# - e.g., ["slurm"] matches slurm_control_node, slurm_node, etc. +# - If omitted (and no group), mount applies to all nodes +# - Mutually exclusive with group +# - groups: List of GROUP_NAME values from pxe_mapping_file.csv. Optional +# - Mutually exclusive with functional_group_prefix +# - e.g., groups: ["grp1", "grp2"] targets only nodes in those groups +# -# -----------------------------Powervault------------------------------------------- -# powervault_config -# ip: ipv4 -# A list of PowerVault controller IP addresses used for iSCSI target discovery and login. -# In this configuration, a single controller portal is provided. 
+# Example: static mount with all explicit params (no profile) +# mounts: +# Example: static mount using profile +# - name: "vast_home" +# source: "192.168.1.100:/home" +# mount_point: "/home" +# mount_params: "vast_nfs" +# functional_group_prefix: ["slurm"] +# permissions: +# owner: "root" +# group: "root" +# mode: "0755" +# +# Example: per-node bind mount (node_key triggers bind behavior) +# - name: "scratch_isolation" +# source: "/mnt/scratch" +# mount_point: "/mnted/scratch" +# node_key: "local_hostname" +# node_mount_point: +# - /scratch +# - /tmp + +# functional_group_prefix: ["slurm_node"] +# # On node001 generates fstab: +# # /mnted/scratch/node001/scratch /scratch none bind 0 0 +# # /mnted/scratch/node001/tmp /tmp none bind 0 0 +# # slurm.conf: SlurmdSpoolDir=/scratch,/tmp + +# /mnt/scratch /mnted/scratch nfs4 defaults,nofail,_netdev,x-systemd.after=cloud-init-network.service 0 0 + +# /mnted/scratch/node001/var/log/state /var/log/state none bind 0 0 + +{% if slurm_nfs_client_params or k8s_nfs_client_params %} +mounts: +{% if slurm_nfs_client_params %} + - name: {{ slurm_nfs_client_params.nfs_name }} + source: "{{ slurm_nfs_client_params.server_ip | default('') }}:{{ slurm_nfs_client_params.server_share_path | default('') }}" + mount_point: {{ slurm_nfs_client_params.client_share_path }} + mnt_opts: "{{ slurm_nfs_client_params.client_mount_options | default('nosuid,rw,sync,hard,intr') }}" + fs_type: "nfs" + mount_on_oim: true + functional_group_prefix: ["slurm", "login"] +{% endif %} +{% if k8s_nfs_client_params %} + - name: {{ k8s_nfs_client_params.nfs_name }} + source: "{{ k8s_nfs_client_params.server_ip | default('') }}:{{ k8s_nfs_client_params.server_share_path | default('') }}" + mount_point: {{ k8s_nfs_client_params.client_share_path }} + mnt_opts: "{{ k8s_nfs_client_params.client_mount_options | default('nosuid,rw,sync,hard,intr') }}" + fs_type: "nfs" + mount_on_oim: true + functional_group_prefix: ["service_kube"] +{% endif %} +{% endif %} + 
+# -------------------------Mount Params (Reusable Profiles)--------------------------- +# mount_params: Dictionary of named mount configuration profiles for reuse across mounts. +# Each profile is a named dictionary with optional fields: +# - fs_type: Filesystem type (nfs, nfs4, xfs, ext4, ext3, ext2, cifs, tmpfs, cephfs, vfat, ntfs, none, fuse.s3fs) +# - mnt_opts: Mount options string (comma-separated, e.g., "nosuid,rw,sync,hard,intr") +# - dump_freq: Dump frequency for fstab (usually "0"). Default: "0" +# - fsck_pass: Fsck pass number for fstab (usually "0" or "2"). Default: "0" +# - Custom fields: Any additional backend-specific metadata (e.g., vast_nfs_ip, rdma_port) -# port: -# Defines the TCP port for the iSCSI target service. -# Port 3260 is the standard port for iSCSI communication. +mount_params: + # Standard NFS mount with security and performance tuning + nfs_default: + fs_type: "nfs" + mnt_opts: "nosuid,rw,sync,hard,intr" + dump_freq: "0" + fsck_pass: "0" -# isci_initiators: -# Specifies the InitiatorName used by the host when connecting to the iSCSI target. -# This IQN uniquely identifies the host to the storage array. + # VAST NFS storage - standard TCP configuration + vast_nfs: + fs_type: "nfs" + mnt_opts: "nosuid,rw,sync,hard,intr" -# volume_id: -# This is the unique WWN/identifier for the -# specific volume that should be used for persistent storage. -# The script uses this value during multipath scanning to select the correct mapped device + # VAST NFS storage - RDMA protocol for high-performance environments + vast_rdma: + fs_type: "nfs" + mnt_opts: "proto=rdma,port=20049,nconnect=8,timeo=600,retrans=2,rsize=1048576,wsize=1048576,hard,intr,localaddr=192.168.0.23" +# -----------------------------Powervault------------------------------------------- +# powervault_config +# Processed entirely via runcmd script (setup_iscsi_storage.sh). 
+# The device path (/dev/mapper/XXX) is only known after iSCSI login + multipath scan, +# so powervault mounts CANNOT use the cloud-init mounts module. +# The runcmd script handles: iscsid enable, initiator name, discovery, login, +# multipathd, volume_id matching, partitioning, formatting, mount, and bind mounts. +# +# Mandatory parameters: +# - name: Unique identifier for this powervault entry. Required +# - ip: List of PowerVault controller IPv4 addresses for iSCSI target discovery. Required +# - iscsi_initiator: InitiatorName IQN for the host. Required +# - volume_id: WWN/identifier for the volume (used for multipath device matching). Required +# - mount_point: Where the discovered device gets mounted. Required + +# Optional parameters: +# - port: TCP port for iSCSI target service. Default: 3260 +# - mount_params: Named profile for fs_type/mnt_opts (read by the runcmd script). Optional +# - node_key: ds.meta_data key for per-node bind mounts (e.g., "local_hostname", "ds.meta_data.instance_data.local_ipv4"). Optional +# - When present, implies bind mount: // -> +# - fs_type forced to "none", mnt_opts forced to "bind" (automatic) +# - node_mount_point: List of bind mount targets. Required when node_key is set +# - Pattern: // -> +# - functional_group_prefix: List of functional group prefixes for node targeting. Mutually exclusive with group. +# - permissions: Directory ownership and mode for the mount point. Optional +# - owner: User owner (name or UID). Default: "root" +# - group: Group owner (name or GID). Default: "root" +# - mode: Octal permission string (e.g., "0755"). 
Default: "0755" + +# powervault_config: +# # This mounts the whole powervault volume to /mnt/slurm +# # followed by bind creation of dir under /mnt/slurm +# # node_key is the key in cloud-init so that it is unique per host {% set pv = storage_powervault_config | default({}) %} {% if pv %} powervault_config: - ip: -{% for _ip in pv.ip | default([]) %} - - {{ _ip }} -{% endfor %} - port: {{ pv.port | default('') }} - isci_initiators: {{ pv.isci_initiators | default('') }} - volume_id: {{ pv.volume_id | default('') }} + - name: powervault_slurm_ctld + ip: +{% for _ip in pv.ip | default([]) %} + - {{ _ip }} +{% endfor %} + port: {{ pv.port | default('') }} + iscsi_initiator: {{ pv.iscsi_initiator | default(pv.isci_initiators | default('')) }} + volume_id: {{ pv.volume_id | default('') }} + mount_point: "/mnt/slurm" + fs_type: "xfs" + node_key: "local_hostname" + node_mount_point: + - "/var/lib/mysql" + - "/var/spool/slurm" + functional_group_prefix: ["slurm_control_node"] {% else %} -#powervault_config: -# ip: -# - 172.1.2.3 -# port: 3260 -# isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7 -# volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 +# powervault_config: +# - name: powervault1 +# ip: +# - 172.1.2.3 +# port: 3260 +# iscsi_initiator: iqn.2025-01.com.dell:scontrol-node +# volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 +# # mount params +# mount_point: "/mnt/slurm" +# fs_type: "xfs" +# node_key: "local_hostname" +# node_mount_point: +# - "/var/lib/mysql" +# - "/var/spool/slurm" +# functional_group_prefix: ["slurm_control_node"] {% endif %} -# -----------------------------NFS------------------------------------------------ - -# This variable is used for mounting NFS share on slurm_control_node, slurm_node, login_node -# This takes a list of dicts with possible keys server_ip, server_share_path, client_share_path, client_mount_options -# In both the cases, the USER must manually update 'server_ip' and 'server_share_path' below with the correct values. 
-# If mount_option values are empty, NFS client will be mounted with these values "nosuid,rw,sync,hard,intr" -# Its mandatory to provide atleast one entry in nfs_client_params -# Example for single mount file system: -# nfs_client_params: -# nfs_name : str ,Name of the NFS storage resource. The default is "nfs_storage_default". -# The user can assign any custom string to specify a different NFS storage resource. -# - { server_ip: 10.5.0.101, server_share_path: "/mnt/share", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard"} -# Example for supporting multiple mount points: -# nfs_client_params: -# - { server_ip: 198.168.0.1,server_share_path: "/mnt/share1", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard"} -# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard"} -# Example for multiple mount file system: -# nfs_client_params: -# - { server_ip: 198.168.0.1, server_share_path: "/mnt/share1", client_share_path: "/mnt/mount1", client_mount_options: "nosuid,rw,sync,hard"} -# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard"} - -{% set _nfs = storage_nfs_client_params | default([]) %} -{% if _nfs %} -nfs_client_params: -{% for _entry in _nfs %} - - server_ip: "{{ _entry.server_ip | default('') }}" # Provide the IP of the NFS server - server_share_path: "{{ _entry.server_share_path | default('') }}" # Provide server share path of the NFS Server - client_share_path: {{ _entry.client_share_path | default('') }} - client_mount_options: "{{ _entry.client_mount_options | default('nosuid,rw,sync,hard,intr') }}" -{% if _entry.nfs_name is defined %} - nfs_name: {{ _entry.nfs_name }} -{% endif %} -{% endfor %} -{% endif %} +# -----------------------------Swap------------------------------------------------- +# swap: Swap file configuration (list of swap 
configurations) +# Each swap entry contains: +# - name: Unique identifier. Required +# - filename: Path to the swap file (e.g., /swapfile). Required +# - size: Size in bytes, 'auto', or human-readable (e.g., "2G", "512M"). Required +# - maxsize: Max size (used with size: auto). Optional +# - functional_group_prefix: List of functional group prefixes. + +# swap: +# - name: "compute_swap" +# filename: "/swapfile" +# size: "2G" +# maxsize: "4G" +# functional_group_prefix: ["slurm_node"] + +# ============================================================ +# OpenCHAMI S3 Storage Configuration +# ============================================================ +# s3_configurations: Configures the S3-compatible storage backend for OpenCHAMI image repository. +# +# provider: Selects which S3-compatible storage service to use. +# - "powerscale": Use Dell PowerScale as external S3 storage (default) +# - "minio": Use MinIO container deployed locally on OIM +# +# endpoint_url: S3 endpoint URL. +# - Required when provider is "powerscale" (e.g., "https://10.43.1.11:9021") +# - Leave empty ("") when provider is "minio" (auto-configured to local MinIO) +# +# Credentials: +# - s3_access_id and s3_secret_key are prompted during prepare_oim credential setup +# - For "minio" provider: s3_access_id defaults to "admin" if not provided +# - For "powerscale" provider: s3_access_id is prompted as conditional mandatory +s3_configurations: + provider: "minio" + endpoint_url: "" \ No newline at end of file From b736df65c980b302858eebad100ffee7eb864904 Mon Sep 17 00:00:00 2001 From: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> Date: Thu, 14 May 2026 19:06:01 +0530 Subject: [PATCH 17/17] Fix lint issue Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> --- provision/roles/telemetry/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provision/roles/telemetry/vars/main.yml 
b/provision/roles/telemetry/vars/main.yml index fe6cd52ca6..3ecc0fb0e0 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -21,7 +21,7 @@ metadata_perm: "0644" # Usage: read_software_config.yml # Versioned JSON file: service_k8s_v.json (e.g., service_k8s_v1.35.1.json) -k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s_v{{ hostvars['localhost']['service_k8s_version'] }}.json" +k8s_packages_file: "{{ input_project_dir }}/config/x86_64/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/service_k8s_v{{ hostvars['localhost']['service_k8s_version'] }}.json" # noqa: yaml[line-length] # Usage: secrets_creation.yml mysqldb_secrets_name: mysqldb-credentials