From dc94ba868c2925bc4618becb442b1eedb2758bbc Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 30 Apr 2026 18:22:28 +0530 Subject: [PATCH 01/63] pub telemetry changes --- .../input_validation/common_utils/config.py | 4 +- .../common_utils/en_us_validation_msg.py | 363 ++++++++++++------ .../common_utils/logical_validation.py | 3 +- .../input_validation/schema/network_spec.json | 16 - .../input_validation/schema/omnia_config.json | 88 ++++- .../validation_flows/provision_validation.py | 67 ---- .../library/modules/fetch_telemetry_status.py | 4 +- .../library/modules/generate_pxe_mapping.py | 46 +-- .../library/modules/ome_server_inventory.py | 47 +-- discovery/discovery.yml | 34 +- .../ome_discovery/tasks/collect_inventory.yml | 19 +- discovery/roles/ome_discovery/vars/main.yml | 12 +- .../pxe_mapping_file.csv | 12 +- .../catalog_rhel_json/pxe_mapping_file.csv | 24 +- .../pxe_mapping_file.csv | 22 +- .../pxe_mapping_file.csv | 12 +- examples/pxe_mapping_file.csv | 22 +- gitlab/roles/hosted_gitlab/vars/main.yml | 6 +- input/discovery_config.yml | 6 +- input/network_spec.yml | 1 - input/omnia_config.yml | 49 ++- input/pxe_mapping_file.csv | 27 +- prepare_oim/prepare_oim.yml | 19 - .../deploy_containers/common/vars/main.yml | 7 +- .../deploy_containers/openchami/vars/main.yml | 18 +- ...-group-login_compiler_node_aarch64.yaml.j2 | 18 +- ...i-group-login_compiler_node_x86_64.yaml.j2 | 18 +- .../ci-group-login_node_aarch64.yaml.j2 | 16 +- .../ci-group-login_node_x86_64.yaml.j2 | 12 +- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 3 +- ...-service_kube_control_plane_x86_64.yaml.j2 | 3 +- .../ci-group-service_kube_node_x86_64.yaml.j2 | 3 +- ...ci-group-slurm_control_node_x86_64.yaml.j2 | 3 +- .../ci-group-slurm_node_aarch64.yaml.j2 | 10 +- .../ci-group-slurm_node_x86_64.yaml.j2 | 13 +- .../doca-ofed/configure-ib-network.sh.j2 | 63 ++- .../roles/configure_ochami/vars/main.yml | 2 +- 
.../tasks/include_software_config.yml | 1 - .../common/telemetry_pod_cleanup.yaml.j2 | 75 +++- .../oim_container_cleanup/vars/main.yml | 4 - 40 files changed, 593 insertions(+), 579 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 47990cafdc..7f26f692e4 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -58,8 +58,7 @@ "telemetry_config": "telemetry_config.yml", "high_availability_config": "high_availability_config.yml", "build_stream_config": "build_stream_config.yml", - "gitlab_config": "gitlab_config.yml", - "discovery_config": "discovery_config.yml" + "gitlab_config": "gitlab_config.yml" # "additional_software": "additional_software.json" } @@ -104,7 +103,6 @@ # "high_availability": [files["high_availability_config"]], # "additional_software": [files["additional_software"]], "build_stream": [files["build_stream_config"]], - "discovery": [files["discovery_config"]], "gitlab": [files["gitlab_config"], files["build_stream_config"]], "all": [ files["local_repo_config"], diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index fe1baa69e2..0e32c3bdae 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -92,28 +92,34 @@ " but service_node entry missing in sofware_config.json, " "Please rerun local repo with service_node entry in software_config.json " "to deploy service nodes successfully") -SERVICE_K8S_ENTRY_MISSING_SOFTWARE_CONFIG_MSG = ("The role service_kube_control_plane is defined in roles_config.yml, " +SERVICE_K8S_ENTRY_MISSING_SOFTWARE_CONFIG_MSG = ( + "The role 
service_kube_control_plane is defined in roles_config.yml, " "but the service_k8s package entry is missing in software_config.json. " - "To deploy Kubernetes in the service_k8s cluster, the package must be added to software_config.json.") + "To deploy Kubernetes in the service_k8s cluster, the package " + "must be added to software_config.json." +) SERVICE_NODE_ENTRY_INVALID_ROLES_CONFIG_MSG = ("The 'service_node' role defined in roles_config.yml" - " is not currently supported and is reserved for future use. Please remove or update this role" + " is not currently supported and is reserved for future use. Please remove or update this role" " to avoid configuration errors.") # Functional Groups Config Validation Messages EMPTY_OR_SYNTAX_ERROR_FUNCTIONAL_GROUPS_CONFIG_MSG = ( - "The functional_groups_config.yml file is empty or has syntax errors." + "The functional_groups_config.yml file is empty or has syntax errors." "It must contain a valid 'functional_groups' section with proper YAML formatting." "Check the file content and rerun the playbook." ) MISSING_GROUPS_SECTION_MSG = ( - "The functional_groups_config.yml file is empty or has syntax errors." + "The functional_groups_config.yml file is empty or has syntax errors." "It must contain a valid 'groups' section with proper YAML formatting." "Check the file content and rerun the playbook." ) +# pylint: disable=invalid-name MISSING_FUNCTIONAL_GROUPS_SECTION_MSG = ( - "The functional_groups_config.yml file must contain a valid 'functional_groups' section. It must be a non-empty list." + "The functional_groups_config.yml file must contain a valid " + "'functional_groups' section. It must be a non-empty list." ) +# pylint: enable=invalid-name NON_EMPTY_CLUSTER_NAME_MSG = "Cluster name must not be empty for '{name}' functional group." FUNCTIONAL_GROUPS_NOT_LIST_MSG = ( "The 'functional_groups' key must be associated with a list of functional group definitions." 
@@ -130,10 +136,7 @@ "Please make sure cluster name is same for slurm cluster and login_node functional groups." ) SLURM_NODE_PARENT_MISSING_MSG = ( - "Functional group '{name}' must have a non-empty 'parent' field." -) -MISSING_FUNCTIONAL_GROUPS_SECTION_MSG = ( - "The 'functional_groups' section is missing or null. It must be a non-empty list." + "Functional group '{name}' must have a non-empty 'parent' field." ) SLURM_NODE_WITHOUT_CONTROL_MSG = ( "Slurm node defined for cluster '{cluster}' but no corresponding slurm_control_node exists. " @@ -161,23 +164,33 @@ PRIMARY_ADMIN_BMC_IP_SAME_MSG = "primary_oim_admin_ip and primary_oim_bmc_ip should not be the same." PRIMARY_ADMIN_IP_INVALID_MSG = "primary_oim_admin_ip is not a valid IPv4 address." PRIMARY_BMC_IP_INVALID_MSG = "primary_oim_bmc_ip is not a valid IPv4 address." -PRIMARY_ADMIN_IP_IN_DYNAMIC_RANGE_MSG = "primary_oim_admin_ip should not be within the dynamic_range." -PRIMARY_BMC_IP_IN_DYNAMIC_RANGE_MSG = "primary_oim_bmc_ip should not be within the dynamic_range." +PRIMARY_ADMIN_IP_IN_DYNAMIC_RANGE_MSG = ( + "primary_oim_admin_ip should not be within the dynamic_range." +) +PRIMARY_BMC_IP_IN_DYNAMIC_RANGE_MSG = ( + "primary_oim_bmc_ip should not be within the dynamic_range." +) DEFAULT_LEASE_TIME_FAIL_MSG = "Please provide a valid default_lease_time." ENABLE_SWITCH_BASED_FAIL_MSG = "enable_switch_based must be set to either true or false." LANGUAGE_FAIL_MSG = "Only en_US.UTF-8 language supported" LANGUAGE_EMPTY_MSG = "Language setting cannot be empty" PUBLIC_NIC_FAIL_MSG = "public_nic is empty. Please provide a public_nic value." -PXE_MAPPING_FILE_PATH_FAIL_MSG = ("File path is invalid. Please ensure the file path specified in " - "pxe_mapping_file_path exists and points to a valid file, " - "not a directory.") -PXE_MAPPING_FILE_EXT_FAIL_MSG = ("File path is invalid. 
Please ensure that the file ends with " - ".csv extension") -PXE_MAPPING_AARCH64_LOCAL_PATH_MSG = ("aarch64 nodes are present in pxe_mapping_file.csv but " - "local share path selected for omnia core container deployment. " - "aarch64 nodes require NFS share path. " - "Please redeploy omnia core container with NFS share path option or remove aarch64 nodes " - "from pxe_mapping_file.csv.") +PXE_MAPPING_FILE_PATH_FAIL_MSG = ( + "File path is invalid. Please ensure the file path specified in " + "pxe_mapping_file_path exists and points to a valid file, " + "not a directory." +) +PXE_MAPPING_FILE_EXT_FAIL_MSG = ( + "File path is invalid. Please ensure that the file ends with " + ".csv extension" +) +PXE_MAPPING_AARCH64_LOCAL_PATH_MSG = ( + "aarch64 nodes are present in pxe_mapping_file.csv but " + "local share path selected for omnia core container deployment. " + "aarch64 nodes require NFS share path. " + "Please redeploy omnia core container with NFS share path option " + "or remove aarch64 nodes from pxe_mapping_file.csv." +) CLUSTER_OS_FAIL_MSG = "Cluster OS must be 'rhel' for RHEL Omnia Infrastructure Manager" # local_repo.yml @@ -186,89 +199,129 @@ RHEL_OS_URL_MSG = "is empty. Please provide a rhel_os_url value." UBUNTU_OS_URL_MSG = "ubuntu_os_url is empty. Please provide a ubuntu_os_url value." LDMS_REQUIRES_SERVICE_K8S_MSG = ( - "requires service_k8s to be present in the 'softwares' list in software_config.json." + "requires service_k8s to be present in the 'softwares' list " + "in software_config.json." ) LDMS_REQUIRES_SLURM_MSG = ( - "requires Slurm package 'slurm_custom' to be present in the 'softwares' list in software_config.json." + "requires Slurm package 'slurm_custom' to be present in the " + "'softwares' list in software_config.json." ) USER_REPO_NAME_PREFIX_FAIL_MSG = ( - "Repository name '{repo_name}' in {repo_key} must start with '{expected_prefix}'. " - "Please update the name to '{expected_prefix}{repo_name}'." 
+ "Repository name '{repo_name}' in {repo_key} must start with " + "'{expected_prefix}'. Please update the name to " + "'{expected_prefix}{repo_name}'." ) # omnia_config.yml -INVALID_PASSWORD_MSG = ("Provided password is invalid. Password must meet the specified " - "requirements: should not be empty, must have a length of at least " - "8 characters, and should not contain the following characters: " - "'-', '\\', \"'\", or '\"'") -K8S_CNI_FAIL_MSG = "k8s_cni is empty or invalid. k8s_cni must be set to either calico or flannel. " -POD_EXTERNAL_IP_RANGE_FAIL_MSG = ("pod_external_ip_range value is either empty or invalid. Please " - "provide one of the following acceptable formats: '10.11.0.100-" - "10.11.0.150' (range between start and end IP addresses) or " - "'10.11.0.0/16' (CIDR notation).") -SLURM_INSTALLATION_TYPE_FAIL_MSG = ("slurm_installation_type is empty or invalid. " - "slurm_installation_type_fail_msg must either be set to " - "nfs_share or configless.") -RESTART_SLURM_SERVICES_FAIL_MSG = ("restart_slurm_services is empty or invalid. " - "restart_slurm_services must be set to either true or false.") -K8S_SERVICE_ADDRESSES_FAIL_MSG = ("k8s_service_addresses are empty. " - "Please provide k8s_service_addresses value.") -K8S_POD_NETWORK_CIDR_FAIL_MSG = ("k8s_pod_network_cidr is empty. " - "Please provide a k8s_pod_network_cidr value.") -INTEL_GAUDI_FAIL_MSG = "should not be false as intel_gaudi exists in software_config.json" +INVALID_PASSWORD_MSG = ( + "Provided password is invalid. Password must meet the specified " + "requirements: should not be empty, must have a length of at least " + "8 characters, and should not contain the following characters: " + "'-', '\\', \"'\", or '\"'" +) +K8S_CNI_FAIL_MSG = "k8s_cni is empty or invalid. k8s_cni must be set to either calico or flannel." +POD_EXTERNAL_IP_RANGE_FAIL_MSG = ( + "pod_external_ip_range value is either empty or invalid. 
Please " + "provide one of the following acceptable formats: '10.11.0.100-" + "10.11.0.150' (range between start and end IP addresses) or " + "'10.11.0.0/16' (CIDR notation)." +) +SLURM_INSTALLATION_TYPE_FAIL_MSG = ( + "slurm_installation_type is empty or invalid. " + "slurm_installation_type must either be set to " + "nfs_share or configless." +) +RESTART_SLURM_SERVICES_FAIL_MSG = ( + "restart_slurm_services is empty or invalid. " + "restart_slurm_services must be set to either true or false." +) +K8S_SERVICE_ADDRESSES_FAIL_MSG = ( + "k8s_service_addresses are empty. " + "Please provide k8s_service_addresses value." +) +K8S_POD_NETWORK_CIDR_FAIL_MSG = ( + "k8s_pod_network_cidr is empty. " + "Please provide a k8s_pod_network_cidr value." +) CSI_DRIVER_SECRET_FAIL_MSG = "CSI Powerscale driver secret file path should not be empty." CSI_DRIVER_VALUES_FAIL_MSG = "CSI Powerscale driver values file path should not be empty." # provision_config_credentials.yml -PROVISION_PASSWORD_FAIL_MSG = ("Incorrect provision_password format. Password must meet the " - "specified requirements: should not be empty, must have a " - "length of at least 8 characters, and should not contain the " - "following characters: '-', '\\', \"'\", or '\"'") -POSTGRESDB_PASSWORD_FAIL_MSG = ("Failed. postgresdb_password should contain only alphanumeric " - "characters and minimum length 8") +PROVISION_PASSWORD_FAIL_MSG = ( + "Incorrect provision_password format. Password must meet the " + "specified requirements: should not be empty, must have a " + "length of at least 8 characters, and should not contain the " + "following characters: '-', '\\', \"'\", or '\"'" +) +POSTGRESDB_PASSWORD_FAIL_MSG = ( + "Failed. 
postgresdb_password should contain only alphanumeric " + "characters and minimum length 8" +) def bmc_username_fail_msg(min_username_length, max_length): """Returns a formatted message indicating bmc_username_fail_msg.""" - return (f"bmc_username length must be between {min_username_length} and " - f"{max_length} characters. Must not contain '-', '\\', \"'\", or '\"'") + return ( + f"bmc_username length must be between {min_username_length} and " + f"{max_length} characters. Must not contain '-', '\\', \"'\", or '\"'" + ) -BMC_PASSWORD_FAIL_MSG = ("Incorrect bmc_password format. Password must meet the specified " - "requirements: should not be empty, must have a length of at least " - "3 characters, and should not contain the following characters: " - "'-', '\\', \"'\", or '\"'") +BMC_PASSWORD_FAIL_MSG = ( + "Incorrect bmc_password format. Password must meet the specified " + "requirements: should not be empty, must have a length of at least " + "3 characters, and should not contain the following characters: " + "'-', '\\', \"'\", or '\"'" +) DOCKER_PASSWORD_FAIL_MSG = "Docker password must not be empty." -SWITCH_SNMP3_USERNAME_EMPTY_MSG = ("enabled_switch_based is set to true, " - "switch_snmp3_username must not be empty") -SWITCH_SNMP3_PASSWORD_EMPTY_MSG = ("enabled_switch_based is set to true, " - "switch_snmp3_password must not be empty") +SWITCH_SNMP3_USERNAME_EMPTY_MSG = ( + "enabled_switch_based is set to true, " + "switch_snmp3_username must not be empty" +) +SWITCH_SNMP3_PASSWORD_EMPTY_MSG = ( + "enabled_switch_based is set to true, " + "switch_snmp3_password must not be empty" +) def switch_snmp3_username_fail_msg(min_username_length, max_length): """Returns a formatted message indicating switch_snmp3_username_fail_msg.""" - return (f"switch_snmp3_username length must be between {min_username_length} " - f"and {max_length} characters. 
Must not contain '-', '\\', \"'\", or '\"'") -SWITCH_SNMP3_PASSWORD_FAIL_MSG = ("switch_snmp3_password must be at least 3 characters. " - "Must not contain '-', '\\', \"'\", or '\"'") - + return ( + f"switch_snmp3_username length must be between {min_username_length} " + f"and {max_length} characters. Must not contain '-', '\\', \"'\", or '\"'" + ) +SWITCH_SNMP3_PASSWORD_FAIL_MSG = ( + "switch_snmp3_password must be at least 3 characters. " + "Must not contain '-', '\\', \"'\", or '\"'" +) # telemetry_config.yml -KAFKA_ENABLE_FEDERATED_IDRAC_TELEMETRY_COLLECTION= ("requires federated_idrac_telemetry_collection " - "to be enabled. Please rerun the playbook " - "with federated_idrac_telemetry_collection true" - "in telemetry_config.yml.") -TELEMETRY_SERVICE_CLUSTER_ENTRY_MISSING_ROLES_CONFIG_MSG= ("requires service k8s roles(service_kube_control_plane and service_kube_node)" - " to be defined in 'pxe_mapping_file.csv'. Please either configure " - "service k8s roles in the mapping file " - "or disable idrac_telemetry_support in in telemetry_config.yml " - "and rerun the playbook.") -TELEMETRY_SERVICE_CLUSTER_ENTRY_FOR_LDMS_MISSING_ROLES_CONFIG_MSG= ("requires service k8s roles(service_kube_control_plane " - "and service_kube_node) or slurm nodes(slurm_control_node_x86_64 and slurm_node) " - " to be defined in 'pxe_mapping_file.csv'. Please either configure " - "service k8s/slurm roles in the mapping file or remove ldms from " - "software_config.json and rerun the playbook.") +TELEMETRY_SERVICE_CLUSTER_ENTRY_MISSING_ROLES_CONFIG_MSG = ( + "requires service k8s roles(service_kube_control_plane and service_kube_node)" + " to be defined in 'pxe_mapping_file.csv'. Please either configure " + "service k8s roles in the mapping file " + "or disable idrac_telemetry_support in in telemetry_config.yml " + "and rerun the playbook." 
+) +TELEMETRY_SERVICE_CLUSTER_ENTRY_FOR_LDMS_MISSING_ROLES_CONFIG_MSG = ( + "requires service k8s roles(service_kube_control_plane " + "and service_kube_node) or slurm nodes(slurm_control_node_x86_64 and slurm_node) " + " to be defined in 'pxe_mapping_file.csv'. Please either configure " + "service k8s/slurm roles in the mapping file or remove ldms from " + "software_config.json and rerun the playbook." +) # PowerScale telemetry validation messages POWERSCALE_VICTORIA_REQUIRED_MSG = ( "PowerScale telemetry requires VictoriaMetrics to be deployed. " - "When powerscale_configurations.powerscale_telemetry_support is true, 'victoria' must be included in " - "telemetry_collection_type (e.g., 'victoria' or 'victoria,kafka')." + "When telemetry_sources.powerscale.metrics_enabled is true, " + "'victoria_metrics' must be included in collection_targets " + "(e.g., 'victoria_metrics' or 'victoria_metrics,victoria_logs')." +) +POWERSCALE_VICTORIA_LOGS_REQUIRED_MSG = ( + "PowerScale logs collection requires VictoriaLogs to be deployed. " + "When telemetry_sources.powerscale.logs_enabled is true, " + "'victoria_logs' must be included in collection_targets " + "(e.g., 'victoria_metrics,victoria_logs')." +) +POWERSCALE_SYSLOG_SOURCE_IP_INVALID_MSG = ( + "Invalid IP address in powerscale_syslog_source_ips. " + "Each entry must be a valid IPv4 address (e.g., '192.168.55.11')." ) POWERSCALE_CSI_DRIVER_MISSING_MSG = ( "csi_driver_powerscale is not configured in software_config.json. " @@ -279,18 +332,22 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length): "PowerScale telemetry requires a service cluster." ) POWERSCALE_CONFIGURATIONS_MISSING_MSG = ( - "powerscale_configurations section is required and must contain powerscale_telemetry_support." + "powerscale_configurations section is required when " + "telemetry_sources.powerscale.metrics_enabled is true. " + "It must contain csm_observability_values_file_path." 
) POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG = ( "must be a non-empty string in format 'XGi' (e.g., '5Gi')" ) POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG = ( - "csm_observability_values_file_path is required when powerscale_configurations.powerscale_telemetry_support is true. " + "csm_observability_values_file_path is required when " + "telemetry_sources.powerscale.metrics_enabled is true. " "Please provide the path to the CSM Observability values.yaml file." ) POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG = ( - "karaviMetricsPowerscale.authorization.proxyHost is required in the CSM Observability values file " - "when karaviMetricsPowerscale.authorization.enabled is true. " + "karaviMetricsPowerscale.authorization.proxyHost is required in the " + "CSM Observability values file when " + "karaviMetricsPowerscale.authorization.enabled is true. " "Please provide the hostname or IP of the CSM Authorization Proxy server." ) def powerscale_csm_values_not_found_msg(path): @@ -329,20 +386,54 @@ def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_ f"Please update service_k8s.json to match the values.yaml version " f"and re-run local_repo.yml to mirror the correct image to Pulp." ) -POWERSCALE_SERVICE_K8S_JSON_NOT_FOUND_MSG = ( - "service_k8s.json not found. Cannot validate PowerScale telemetry image versions. " - "Please ensure local_repo.yml has been executed." + +# PowerScale CSM Authorization validation messages +POWERSCALE_AUTH_CSI_DRIVER_MISSING_MSG = ( + "PowerScale CSM Authorization requires 'csi_driver_powerscale' to be present in software_config.json." +) +POWERSCALE_AUTH_SERVICE_CLUSTER_MISSING_MSG = ( + "PowerScale CSM Authorization requires service cluster nodes " + "(service_kube_node_*, service_kube_control_plane_*) to be defined " + "in the PXE mapping file." +) +POWERSCALE_AUTH_CSM_VALUES_PATH_REQUIRED_MSG = ( + "csm_authorization_values_file_path is required when powerscale_authorization.enabled is true." 
+) +def powerscale_auth_csm_values_not_found_msg(path): + """Returns error message when CSM Authorization values.yaml file is not found.""" + return ( + f"CSM Authorization values file does not exist at path: {path}. " + "Please verify the file path is correct." + ) +def powerscale_auth_csm_values_validation_error_msg(error): + """Returns error message when CSM Authorization values.yaml validation fails.""" + return f"Error validating CSM Authorization image versions: {error}" +POWERSCALE_AUTH_TENANTS_REQUIRED_MSG = ( + "At least one tenant must be defined when powerscale_authorization.enabled is true." +) +def powerscale_auth_tenant_roles_required_msg(tenant_name): + """Returns error message when a tenant has no roles defined.""" + return ( + f"At least one role must be defined for tenant '{tenant_name}'." + ) +def powerscale_auth_image_version_mismatch_msg( + image_name, values_version, csi_version +): + """Returns error message when CSM Authorization image version doesn't match csi_driver_powerscale.json.""" + return ( + f"Image version for {image_name} in CSM Authorization values.yaml " + f"({values_version}) does not match csi_driver_powerscale.json " + f"({csi_version}). Please ensure both files use the same version." + ) +POWERSCALE_AUTH_CSI_JSON_NOT_FOUND_MSG = ( + "csi_driver_powerscale.json not found. Cannot validate CSM Authorization " + "image versions. Please ensure the file exists at " + "input/config/x86_64/rhel/10.0/csi_driver_powerscale.json." ) + def boolean_fail_msg(value): """Returns a formatted message indicating boolean_fail_msg.""" return f"{value} must be set to either true or false." -APPLIANCE_K8S_POD_NET_CIDR_FAIL_MSG = ("appliance_k8s_pod_net_cidr value is either empty or " - "invalid. 
Please provide CIDR notation such as " - "192.168.0.0/16") -K8S_PROMETHEUS_SUPPORT_FAIL_MSG = ("k8s_prometheus_support must be True when " - "prometheus_gaudi_support is True.") -PROMETHEUS_SCRAPE_INTERVAL_FAIL_MSG = ("prometheus_scrape_interval must be at least 15 when " - "prometheus_gaudi_support is True.") # security_config.yml DOMAIN_NAME_FAIL_MSG = "domain_name is empty. Please provide a domain_name value." @@ -425,20 +516,16 @@ def json_file_mandatory(file_path): ) NETWORK_SPEC_FILE_NOT_FOUND_MSG = "network_spec.yml file not found in input folder." IB_NETMASK_BITS_MISMATCH_MSG = ( - "netmask_bits configured for ib_network must match admin_network netmask_bits in network_spec.yml." + "netmask_bits configured for ib_network must match admin_network " + "netmask_bits in network_spec.yml." ) IB_SUBNET_IN_ADMIN_RANGE_MSG = ( - "ib_network subnet must be outside the admin network range derived from primary_oim_admin_ip/netmask_bits in network_spec.yml." + "ib_network subnet must be outside the admin network range derived " + "from primary_oim_admin_ip/netmask_bits in network_spec.yml." ) # telemetry MANDATORY_FIELD_FAIL_MSG = "must not be empty" -MYSQLDB_USER_FAIL_MSG = "username should not be kept 'root'." 
-FUZZY_OFFSET_FAIL_MSG = "should be between 60 and omnia_telemetry_collection_interval value" -METRIC_COLLECTION_TIMEOUT_FAIL_MSG = ("should be greater than 0 and less than " - "omnia_telemetry_collection_interval value") -MOUNT_LOCATION_FAIL_MSG = "should have '/' at the end of the path" -GRAFANA_PASSWORD_FAIL_MSG = "should not be kept 'admin'" # security FILE_PATH_FAIL_MSG = "path does not exist" @@ -452,18 +539,16 @@ def tls_ext_fail_msg(valid_extensions): CLIENT_MOUNT_OPTIONS_FAIL_MSG = "should only contain nosuid,rw,sync,hard as options" SLURM_SHARE_FAIL_MSG = "Exactly one entry should be present in nfs_client_params with slurm_share as true in storage_config.yml" K8S_SHARE_FAIL_MSG = "Exactly one entry should be present in nfs_client_params with k8s_share as true in storage_config.yml" -BENCHMARK_TOOLS_FAIL_MSG = "Atleast one out of k8s_share or slurm_share in storage_config.yml should be true \ - when ucx/openmpi mentioned in software_config.json." -MULT_SHARE_FAIL_MSG = "Exactly one entry should be present in nfs_client_params with slurm_share as true or \ - k8s_share as true in storage_config.yml" +BENCHMARK_TOOLS_FAIL_MSG = ( + "Atleast one out of k8s_share or slurm_share in storage_config.yml " + "should be true when ucx/openmpi mentioned in software_config.json." +) +MULT_SHARE_FAIL_MSG = ( + "Exactly one entry should be present in nfs_client_params with " + "slurm_share as true or k8s_share as true in storage_config.yml" +) BEEGFS_UMOUNT_CLIENT_FAIL_MSG = "should be set to true since beegfs_mounts value has been changed" -# server_spec -SERVER_SPEC_NICNETWORKS_FAIL_MSG = ("in server_spec.yml must exist within network_spec.yml as a " - "network name. 
Please check both files") -def server_spec_network_key_fail_msg(nic_device): - """Returns a formatted message indicating server_spec_network_key_fail_msg.""" - return f"in server_spec.yml does not start with '{nic_device}' (nicdevices)" IP_OVERLAP_FAIL_MSG = ("admin network, bmc network and k8 network and IP ranges should " "not have any IP overlap. Check omnia_config.yml and network_spec.yml") TELEMETRY_IP_OVERLAP_FAIL_MSG = ("admin network, telemetry network and IP ranges should " @@ -482,15 +567,22 @@ def server_spec_network_key_fail_msg(nic_device): "roles_config.yml") FEILD_MUST_BE_EMPTY = "feild must be empty." DUPLICATE_VIRTUAL_IP = "is already used. Please give unique virtual ip address" -VIRTUAL_IP_SAME_AS_PRIMARY_OIM_ADMIN_IP = ("virtual_ip_address provided in high_availability_config.yml must not be the same as primary_oim_admin_ip in network_spec.yml. " - "Please provide a different virtual IP address.") +VIRTUAL_IP_SAME_AS_PRIMARY_OIM_ADMIN_IP = ( + "virtual_ip_address provided in high_availability_config.yml must not be " + "the same as primary_oim_admin_ip in network_spec.yml. " + "Please provide a different virtual IP address." +) INVALID_PASSIVE_NODE_SERVICE_TAG = "active node and passive node service tag cannot be same." GROUP_NOT_FOUND = "is not defined in the roles_config.yml. Please define the group in roles_config.yml" ROLE_NODE_FOUND = "is not defined in roles_config.yml. Please define the role in roles_config.yml" -DUPLICATE_ACTIVE_NODE_SERVICE_TAG = ("the service tag configured for a active node is already " - "present elsewhere in the config file. ") -DUPLICATE_PASSIVE_NODE_SERVICE_TAG = ("the service tag configured for a passive node is already " - "present elsewhere in the config file. ") +DUPLICATE_ACTIVE_NODE_SERVICE_TAG = ( + "the service tag configured for a active node is already " + "present elsewhere in the config file. 
" +) +DUPLICATE_PASSIVE_NODE_SERVICE_TAG = ( + "the service tag configured for a passive node is already " + "present elsewhere in the config file. " +) # build_stream_config.yml ENABLE_BUILD_STREAM_REQUIRED_MSG = "Field 'enable_build_stream' is required in build_stream_config.yml." @@ -505,13 +597,15 @@ def server_spec_network_key_fail_msg(nic_device): ) AARCH64_INVENTORY_HOST_IP_REQUIRED_MSG = ( - "Field 'aarch64_inventory_host_ip' is required when PXE mapping file contains aarch64 functional groups. " - "Provide the admin IP of the aarch64 inventory host or remove aarch64 groups from PXE mapping." + "Field 'aarch64_inventory_host_ip' is required when PXE mapping file " + "contains aarch64 functional groups. Provide the admin IP of the " + "aarch64 inventory host or remove aarch64 groups from PXE mapping." ) AARCH64_INVENTORY_HOST_IP_NOT_REACHABLE_MSG = ( "aarch64 inventory host IP {0} is not reachable on SSH port 22. " - "Ensure the host is online, SSH service is running, and accessible from OIM." + "Ensure the host is online, SSH service is running, and accessible " + "from OIM." ) AARCH64_INVENTORY_HOST_IP_REACHABILITY_CHECK_FAILED_MSG = ( @@ -521,9 +615,10 @@ def server_spec_network_key_fail_msg(nic_device): BUILD_STREAM_PORT_RANGE_MSG = "build_stream_port must be an integer between 1 and 65535." BUILD_STREAM_PORT_INUSE_MSG = ( - "Port {port} is already in use and is not serving build_stream on {host_ip}. Please choose another free port." + "Port {port} is already in use and is not serving build_stream on " + "{host_ip}. Please choose another free port." ) - + BUILD_STREAM_HOST_IP_REQUIRED_MSG = ( "Field 'build_stream_host_ip' is mandatory in build_stream_config.yml. " "Please provide a valid IPv4 address (OIM admin IP or OIM public IP)." @@ -556,9 +651,11 @@ def build_stream_host_ip_not_oim_ip_msg(ip, allowed_ips): "private, internal, public.") GITLAB_DEFAULT_BRANCH_EMPTY_MSG = ("Field 'gitlab_default_branch' is required and cannot be empty. 
" "Provide a valid git branch name. Default: main") -GITLAB_DEFAULT_BRANCH_INVALID_MSG = ("Field 'gitlab_default_branch' contains invalid characters. " +GITLAB_DEFAULT_BRANCH_INVALID_MSG = ( + "Field 'gitlab_default_branch' contains invalid characters. " "Branch name must start with alphanumeric and may contain " - "letters, digits, dots, hyphens, underscores, or slashes.") + "letters, digits, dots, hyphens, underscores, or slashes." +) GITLAB_HTTPS_PORT_INVALID_MSG = ("Field 'gitlab_https_port' must be a valid port number between " "1 and 65535. Default: 443") GITLAB_SSH_PORT_INVALID_MSG = ("Field 'gitlab_ssh_port' must be a valid port number between " @@ -623,3 +720,15 @@ def get_logic_failed(input_file_path): def get_logic_success(input_file_path): """Returns a formatted message indicating logic validation success for a file.""" return f"{'#' * 10} Logic validation successful for {input_file_path} {'#' * 10}" + +# ============================================================================ +# Vector Bridge Validation Messages +# ============================================================================ + +# Vector-LDMS validation messages +VECTOR_LDMS_SOURCE_DISABLED_MSG = ( + "Vector-LDMS bridge cannot be enabled when telemetry_sources.ldms.metrics_enabled is 'false'. " + "Vector-LDMS consumes LDMS metrics from Kafka topic 'ldms'. " + "To fix: Either set telemetry_sources.ldms.metrics_enabled=true to enable LDMS data collection, " + "or set telemetry_bridges.vector_ldms.metrics_enabled=false to disable the Vector-LDMS bridge." 
+) diff --git a/common/library/module_utils/input_validation/common_utils/logical_validation.py b/common/library/module_utils/input_validation/common_utils/logical_validation.py index ed46d07ea9..2cb5c5d37b 100644 --- a/common/library/module_utils/input_validation/common_utils/logical_validation.py +++ b/common/library/module_utils/input_validation/common_utils/logical_validation.py @@ -21,6 +21,7 @@ from ansible.module_utils.input_validation.validation_flows import provision_validation from ansible.module_utils.input_validation.validation_flows import common_validation +from ansible.module_utils.input_validation.validation_flows import telemetry_validation from ansible.module_utils.input_validation.validation_flows import high_availability_validation from ansible.module_utils.input_validation.validation_flows import local_repo_validation from ansible.module_utils.input_validation.validation_flows import build_stream_validation @@ -61,7 +62,7 @@ def validate_input_logic( "network_spec.yml": provision_validation.validate_network_spec, "omnia_config.yml": common_validation.validate_omnia_config, "local_repo_config.yml": local_repo_validation.validate_local_repo_config, - "telemetry_config.yml": common_validation.validate_telemetry_config, + "telemetry_config.yml": telemetry_validation.validate_telemetry_config, "security_config.yml": common_validation.validate_security_config, "storage_config.yml": common_validation.validate_storage_config, "high_availability_config.yml": diff --git a/common/library/module_utils/input_validation/schema/network_spec.json b/common/library/module_utils/input_validation/schema/network_spec.json index c3d52e5fe0..536246af41 100644 --- a/common/library/module_utils/input_validation/schema/network_spec.json +++ b/common/library/module_utils/input_validation/schema/network_spec.json @@ -124,22 +124,6 @@ "netmask_bits": { "type": "string", "pattern": "^(1[0-9]|2[0-9]|[1-9])$|^3[0-2]$" - }, - "dns": { - "oneOf": [ - { - "type": "array", - 
"maxItems": 0 - }, - { - "type": "array", - "minItems": 1, - "items": { - "type": "string", - "pattern": "^(?:(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})\\.){3}(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})$" - } - } - ] } }, "additionalProperties": false diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index 92dacf1232..1f7824fc20 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -138,7 +138,66 @@ "description": "File path for the values.yaml file.", "type": "string", "pattern": "^(|/?([a-zA-Z0-9._-]+/)*[a-zA-Z0-9._-]+\\.yaml)$" - + }, + "powerscale_authorization": { + "type": "object", + "description": "PowerScale CSM Authorization configuration for multi-tenancy.", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable PowerScale CSM Authorization for multi-tenant storage." + }, + "csm_authorization_values_file_path": { + "description": "Absolute file path for the CSM Authorization values.yaml file.", + "type": "string", + "pattern": "^(|/?([a-zA-Z0-9._-]+/)*[a-zA-Z0-9._-]+\\.yaml)$" + }, + "tenants": { + "type": ["array", "null"], + "description": "List of tenant configurations.", + "items": { + "type": "object", + "properties": { + "name": { + "type": ["string", "null"], + "minLength": 1, + "pattern": "^[a-zA-Z0-9_-]+$", + "description": "Tenant name (alphanumeric, hyphens, underscores only)." + }, + "roles": { + "type": ["array", "null"], + "default": [], + "description": "List of roles for this tenant.", + "items": { + "type": "object", + "properties": { + "name": { + "type": ["string", "null"], + "minLength": 1, + "pattern": "^[a-zA-Z0-9_-]+$", + "description": "Role name (alphanumeric, hyphens, underscores only)." 
+ }, + "storage_pool": { + "type": ["string", "null"], + "minLength": 1, + "pattern": "^/ifs(/[a-zA-Z0-9._-]+)+$", + "description": "PowerScale storage pool path (must start with /ifs and exist on PowerScale)." + }, + "quota_limit": { + "type": ["string", "null"], + "pattern": "^[1-9][0-9]*(Gi|Ti|Mi)$", + "description": "Storage quota limit (e.g., 200Gi, 1Ti, 500Mi)." + } + }, + "required": ["name", "storage_pool", "quota_limit"] + } + } + }, + "required": ["name"] + } + } + }, + "required": ["enabled"] }, "k8s_crio_storage_size": { "description": "Storage size for CRI-O in Gigabytes only (example: 10G, 15G, 100G)", @@ -166,6 +225,33 @@ "then": { "required": ["csi_powerscale_driver_values_file_path"] } + }, + { + "if": { + "properties": { + "powerscale_authorization": { + "type": "object", + "properties": { + "enabled": { + "const": true + } + }, + "required": ["enabled"] + } + }, + "required": ["powerscale_authorization"] + }, + "then": { + "required": [ + "csi_powerscale_driver_secret_file_path", + "csi_powerscale_driver_values_file_path" + ], + "properties": { + "powerscale_authorization": { + "required": ["csm_authorization_values_file_path", "tenants"] + } + } + } } ] } diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 48e40a16cf..16263c7b48 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -30,7 +30,6 @@ file_names = config.files create_error_msg = validation_utils.create_error_msg create_file_path = validation_utils.create_file_path -ib_mac_re = re.compile(r"^([0-9A-Fa-f]{2}:){7}[0-9A-Fa-f]{2}$") # Expected header columns (case-insensitive) required_headers = [ @@ -272,52 +271,6 @@ def validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path): raise 
ValueError(f"Duplicate ADMIN_IP found in PXE mapping file: {'; '.join(duplicates)}") -def validate_duplicate_ib_ips_in_mapping_file(pxe_mapping_file_path): - """Validates that IB_IP values in the mapping file are unique.""" - if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): - raise ValueError(f"PXE mapping file not found: {pxe_mapping_file_path}") - - with open(pxe_mapping_file_path, "r", encoding="utf-8") as fh: - raw_lines = fh.readlines() - - non_comment_lines = [ln for ln in raw_lines if ln.strip()] - reader = csv.DictReader(non_comment_lines) - - fieldname_map = {fn.strip().upper(): fn for fn in reader.fieldnames} - ib_ip_col = fieldname_map.get("IB_IP") - hostname_col = fieldname_map.get("HOSTNAME") - - if not ib_ip_col: - return - - seen_ib_ips = {} - duplicates = [] - - for row_idx, row in enumerate(reader, start=2): - ib_ip = row.get(ib_ip_col, "").strip() if row.get(ib_ip_col) else "" - hostname = "" - if hostname_col: - hostname = row.get(hostname_col, "").strip() if row.get(hostname_col) else "" - - if not ib_ip: - continue - - if ib_ip in seen_ib_ips: - first_row = seen_ib_ips[ib_ip]["row"] - first_host = seen_ib_ips[ib_ip]["hostname"] - dup_host = hostname or "" - first_host_disp = first_host or "" - duplicates.append( - f"'{ib_ip}' at CSV rows {first_row} ({first_host_disp}) and {row_idx} ({dup_host})" - ) - continue - - seen_ib_ips[ib_ip] = {"row": row_idx, "hostname": hostname} - - if duplicates: - raise ValueError(f"Duplicate IB_IP found in PXE mapping file: {'; '.join(duplicates)}") - - def validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path): """Validates that GROUP_NAME has a consistent PARENT_SERVICE_TAG across the mapping file.""" if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): @@ -468,25 +421,6 @@ def validate_mapping_file_entries(mapping_file_path): if bmc_ip and not validation_utils.validate_ipv4(bmc_ip): raise ValueError(f"Invalid BMC_IP: '{bmc_ip}' at 
CSV row {row_idx} in mapping file.") - ib_mac_col = fieldname_map.get("IB_MAC") - ib_ip_col = fieldname_map.get("IB_IP") - ib_mac = row.get(ib_mac_col, "").strip() if ib_mac_col and row.get(ib_mac_col) else "" - ib_ip = row.get(ib_ip_col, "").strip() if ib_ip_col and row.get(ib_ip_col) else "" - - if bool(ib_mac) != bool(ib_ip): - raise ValueError( - f"IB_MAC and IB_IP must both be provided or both be empty at CSV row {row_idx} in mapping file." - ) - - if ib_mac and not ib_mac_re.match(ib_mac): - raise ValueError( - f"Invalid IB_MAC: '{ib_mac}' at CSV row {row_idx} in mapping file. " - "Expected format: xx:xx:xx:xx:xx:xx:xx:xx." - ) - - if ib_ip and not validation_utils.validate_ipv4(ib_ip): - raise ValueError(f"Invalid IB_IP: '{ib_ip}' at CSV row {row_idx} in mapping file.") - if not row_seen: raise ValueError("Please provide details in mapping file.") @@ -925,7 +859,6 @@ def validate_provision_config( validate_duplicate_service_tags_in_mapping_file(pxe_mapping_file_path) validate_duplicate_hostnames_in_mapping_file(pxe_mapping_file_path) validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path) - validate_duplicate_ib_ips_in_mapping_file(pxe_mapping_file_path) validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path) validate_functional_groups_separation(pxe_mapping_file_path) validate_parent_service_tag_hierarchy(pxe_mapping_file_path) diff --git a/common/library/modules/fetch_telemetry_status.py b/common/library/modules/fetch_telemetry_status.py index c0996a7a05..d6db54ab97 100644 --- a/common/library/modules/fetch_telemetry_status.py +++ b/common/library/modules/fetch_telemetry_status.py @@ -67,7 +67,9 @@ def main(): telemetry_status_list = [] - if telemetry_config_data["idrac_telemetry_support"]: + telemetry_sources = telemetry_config_data.get("telemetry_sources", {}) + + if telemetry_sources.get("idrac", {}).get("metrics_enabled", False): telemetry_status_list.append("idrac_telemetry") module.exit_json( diff --git 
a/common/library/modules/generate_pxe_mapping.py b/common/library/modules/generate_pxe_mapping.py index 666a3f794e..b50cd0775a 100644 --- a/common/library/modules/generate_pxe_mapping.py +++ b/common/library/modules/generate_pxe_mapping.py @@ -103,29 +103,6 @@ DEFAULT_FUNCTIONAL_GROUP = "slurm_node_aarch64" SERVICE_CONTROL_PLANE_GROUP = "service_kube_control_plane_x86_64" -# Omnia-supported functional group names. -# Only servers whose OME static group matches one of these will be -# included in the PXE mapping file. -SUPPORTED_FUNCTIONAL_GROUPS = { - "service_kube_control_plane_x86_64", - "service_kube_node_x86_64", - "login_node_x86_64", - "login_node_aarch64", - "login_compiler_node_x86_64", - "login_compiler_node_aarch64", - "slurm_control_node_x86_64", - "slurm_node_x86_64", - "slurm_node_aarch64", - "os_x86_64", - "os_aarch64", -} - -# Roles that have a parent-child relationship with the control plane. -# Only these roles should receive PARENT_SERVICE_TAG. -CHILD_ROLES_OF_CONTROL_PLANE = { - "service_kube_node_x86_64", -} - def extract_su_from_hostname(bmc_hostname): """ @@ -134,7 +111,7 @@ def extract_su_from_hostname(bmc_hostname): idrac-SUA99R999OU30C2 -> SUA99 SU1R2OU1C5 -> SU1 idrac-JCGT033 -> '' (service tag pattern, not an SU hostname) - The lookahead (?=R\\d+) ensures only genuine SU hostnames match; + The lookahead (?=R\d+) ensures only genuine SU hostnames match; service-tag-only hostnames like idrac-JCGT033 are ignored. Returns empty string when no SU pattern is found; caller defaults to grp0. 
""" @@ -250,24 +227,10 @@ def main(): # Use group_name from OME if available, else fall back to module param default server_group = server.get('group_name', '').strip() - - # Skip servers whose OME group is not a supported Omnia functional group - if server_group and server_group not in SUPPORTED_FUNCTIONAL_GROUPS: - svc_tag = server.get('service_tag', 'unknown') - module.warn( - f"Skipping device {svc_tag}: OME static group '{server_group}' " - f"is not a supported Omnia functional group. " - f"Supported groups: {', '.join(sorted(SUPPORTED_FUNCTIONAL_GROUPS))}" - ) - continue - resolved_functional_group = server_group if server_group else functional_group - # Derive GROUP_NAME: try SU from BMC hostname first, - # then from OME group name, then fall back to module default (grp0) + # Derive GROUP_NAME from SU extracted from BMC hostname su_name = extract_su_from_hostname(bmc_hostname) - if not su_name: - su_name = extract_su_from_hostname(server_group) resolved_group_name = su_name if su_name else group_name row = { @@ -293,11 +256,8 @@ def main(): if su and su not in su_control_plane_map: su_control_plane_map[su] = row["SERVICE_TAG"] - # Assign PARENT_SERVICE_TAG only to child roles of the control plane - # within the same GROUP_NAME + # Assign PARENT_SERVICE_TAG from control plane node of the same SU for row in rows: - if row["FUNCTIONAL_GROUP_NAME"] not in CHILD_ROLES_OF_CONTROL_PLANE: - continue su = row["GROUP_NAME"] if su in su_control_plane_map: row["PARENT_SERVICE_TAG"] = su_control_plane_map[su] diff --git a/common/library/modules/ome_server_inventory.py b/common/library/modules/ome_server_inventory.py index a8a0efabb1..a9ed7a3fb0 100644 --- a/common/library/modules/ome_server_inventory.py +++ b/common/library/modules/ome_server_inventory.py @@ -174,15 +174,8 @@ def build_device_group_map(self): all groups whose ParentId matches that container's Id. Fallback: skip well-known OME system/container group names and use any group that has at least one device. 
- - Returns (device_group_map, conflicts, debug): - device_group_map: dict mapping device_id -> first group_name - conflicts: dict mapping device_id -> list of all group_names (only for - devices found in more than one static group) - debug: diagnostic info dict """ device_group_map = {} - device_all_groups = {} all_groups_url = f"{self.base_url}/api/GroupService/Groups" all_groups = self.get_paginated(all_groups_url) @@ -228,19 +221,9 @@ def build_device_group_map(self): group_devices = self.get_paginated(devices_url) for gd in group_devices: dev_id = gd.get("Id") - if not dev_id: - continue - device_all_groups.setdefault(dev_id, []).append(group_name) - if dev_id not in device_group_map: + if dev_id and dev_id not in device_group_map: device_group_map[dev_id] = group_name - # Detect devices present in multiple static groups - conflicts = { - dev_id: groups - for dev_id, groups in device_all_groups.items() - if len(groups) > 1 - } - empty_groups = [g.get("Name") for g in target_groups if g.get("Name") not in [device_group_map.get(d) for d in device_group_map]] @@ -251,9 +234,8 @@ def build_device_group_map(self): "target_group_names": [g.get("Name") for g in target_groups], "device_ids_mapped": list(device_group_map.keys()), "empty_groups": empty_groups, - "conflicting_device_count": len(conflicts), } - return device_group_map, conflicts, debug + return device_group_map, debug def extract_server_info(client, device, device_group_map=None): @@ -378,14 +360,10 @@ def main(): try: if not client.authenticate(): - module.fail_json(msg=( - f"Failed to authenticate with OME at {ome_ip}. " - "Please verify the ome_username and ome_password provided in " - "omnia_config_credentials.yml (managed via prepare_oim.yml) and rerun the playbook." 
- )) + module.fail_json(msg=f"Failed to authenticate with OME at {ome_ip}") devices = client.get_all_devices(device_type) - device_group_map, conflicts, group_debug = client.build_device_group_map() + device_group_map, group_debug = client.build_device_group_map() if not group_debug["static_container_found"]: module.warn("OME: 'Static Groups' container not found under Custom Groups. " @@ -398,23 +376,6 @@ def main(): module.warn(f"OME: Static group '{grp}' exists but has no devices assigned. " f"Devices in this group will fall back to the default functional group.") - # Fail if any device belongs to multiple static groups - if conflicts: - # Build a human-readable summary keyed by service tag - svc_tag_map = {d.get("Id"): d.get("Identifier") or d.get("DeviceServiceTag", str(d.get("Id"))) - for d in devices} - conflict_lines = [] - for dev_id, groups in conflicts.items(): - tag = svc_tag_map.get(dev_id, str(dev_id)) - conflict_lines.append(f" Device {tag}: member of groups [{', '.join(groups)}]") - module.fail_json(msg=( - "Conflicting OME static group assignments detected. " - "Each server must belong to exactly one static group. " - "The following devices are assigned to multiple groups:\n" - + "\n".join(conflict_lines) - + "\nPlease fix the group assignments in OME and rerun discovery." 
- )) - server_info_list = [] for device in devices: info = extract_server_info(client, device, device_group_map) diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 8adfa0e5d1..aa60caf33c 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -21,19 +21,6 @@ when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../utils/include_input_dir.yml -- name: Set discovery validation tags - hosts: localhost - connection: local - tasks: - - name: Set dynamic run tags for discovery validation - ansible.builtin.set_fact: - omnia_run_tags: "{{ (ansible_run_tags | default([]) + ['discovery']) | unique }}" - cacheable: true - -- name: Invoke validate_config.yml to perform L1 and L2 validations with discovery tag - ansible.builtin.import_playbook: ../input_validation/validate_config.yml - tags: always - - name: Load discovery configuration hosts: localhost connection: local @@ -57,15 +44,15 @@ - name: Display usage information when no discovery_mechanism is specified when: discovery_mechanism == "" block: - - name: Fail when no discovery_mechanism specified - ansible.builtin.fail: + - name: Show usage prompt + ansible.builtin.debug: msg: - "============================================================" - - "ERROR: discovery_mechanism is required but not provided." - - "" - "Usage: ansible-playbook discovery.yml" - " -e discovery_mechanism=" - "" + - "Please specify a discovery_mechanism parameter." 
+ - "" - "Supported discovery mechanisms:" - " - ome : Dell OpenManage Enterprise (OME)" - " - magellan : Magellan (upcoming, not yet supported)" @@ -75,6 +62,9 @@ - " ansible-playbook discovery.yml -e discovery_mechanism=magellan" - "============================================================" + - name: End play when no discovery_mechanism specified + ansible.builtin.meta: end_play + - name: Validate discovery_mechanism parameter ansible.builtin.fail: msg: | @@ -99,17 +89,9 @@ - name: End play for magellan ansible.builtin.meta: end_play - - name: Validate OME inputs before discovery + - name: Handle OME discovery mechanism when: discovery_mechanism == 'ome' block: - - name: Fail when ome_ip is not configured - ansible.builtin.fail: - msg: >- - ome_ip must be provided in discovery_config.yml when using OME discovery. - Please set 'enable_bmc_discovery: true' and provide a valid 'ome_ip' in - {{ input_project_dir }}/discovery_config.yml. - when: ome_ip | default('') | length == 0 - - name: Include OME discovery role ansible.builtin.include_role: name: ome_discovery diff --git a/discovery/roles/ome_discovery/tasks/collect_inventory.yml b/discovery/roles/ome_discovery/tasks/collect_inventory.yml index 20a0c47301..e8651f5180 100644 --- a/discovery/roles/ome_discovery/tasks/collect_inventory.yml +++ b/discovery/roles/ome_discovery/tasks/collect_inventory.yml @@ -14,20 +14,11 @@ --- - name: Verify OME is reachable - block: - - name: Wait for OME HTTPS port - ansible.builtin.wait_for: - host: "{{ ome_ip }}" - port: 443 - timeout: 30 - register: ome_reachability - rescue: - - name: Fail with actionable message when OME is unreachable - ansible.builtin.fail: - msg: >- - Unable to reach OME at {{ ome_ip }}:443 within 30 seconds. - Please verify that ome_ip in {{ input_project_dir | default('input') }}/discovery_config.yml - is correct and that the OME appliance is powered on and network-accessible. 
+ ansible.builtin.wait_for: + host: "{{ ome_ip }}" + port: 443 + timeout: 30 + register: ome_reachability - name: Collect OME server inventory ome_server_inventory: diff --git a/discovery/roles/ome_discovery/vars/main.yml b/discovery/roles/ome_discovery/vars/main.yml index 936d95c4b2..c2fcf8cc4e 100644 --- a/discovery/roles/ome_discovery/vars/main.yml +++ b/discovery/roles/ome_discovery/vars/main.yml @@ -42,16 +42,12 @@ discovery_complete_msg: - "Total servers discovered: {{ discovered_servers | length }}" - "" - "Next Steps:" - - "1. Review and edit the generated PXE mapping file:" + - "1. Review and edit the generated file:" - " {{ pxe_mapping_output_file }}" - - "" - "2. Update HOSTNAME, FUNCTIONAL_GROUP_NAME, GROUP_NAME as needed." - - "" - - "3. Update the following parameter in provision_config.yml:" - - " pxe_mapping_file_path: {{ pxe_mapping_output_file }}" - - "" - - "4. Run:" - - " ansible-playbook provision/provision.yml" + - "3. Rename or copy the file to:" + - " input/pxe_mapping_file.csv" + - "4. 
Run: ansible-playbook provision/provision.yml" - "============================================================" no_servers_found_msg: | diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv index 295e7615af..0a350bc72d 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv @@ -1,6 +1,6 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 
+slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 \ No newline at end of file diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv index 5226b0a19e..6e3e4c6e63 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv @@ -1,13 +1,11 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 
-service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 -os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60,94:6d:ae:03:00:8c:12:ae,192.168.0.110 -os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61,94:6d:ae:03:00:8c:12:bf,192.168.0.111 +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 
+service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv index 01360b424b..6e3e4c6e63 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv @@ -1,11 +1,11 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 
-service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 
+service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv index 65ceac6ada..98ad5ab134 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv @@ -1,6 +1,6 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 -slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 -slurm_node_x86_64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 -login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 +slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 
+slurm_node_x86_64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 +login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 \ No newline at end of file diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index 01360b424b..f4d41e2a77 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,11 +1,11 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 
-service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 diff --git a/gitlab/roles/hosted_gitlab/vars/main.yml b/gitlab/roles/hosted_gitlab/vars/main.yml index c4f3a62f8f..9fbcdb31ba 100644 --- 
a/gitlab/roles/hosted_gitlab/vars/main.yml +++ b/gitlab/roles/hosted_gitlab/vars/main.yml @@ -230,10 +230,8 @@ gitlab_disable_grafana: true retry_count: "5" delay_time: "10" podman_login_fail_msg: > - Podman login failed. Please ensure the podman login credentials in the - {{ hostvars['localhost']['input_project_dir'] }}/omnia_config_credentials.yml are valid. - If they are, this error can occur due to a pull limit issue or multiple requests. - Please try running the playbook again after waiting for a while. + Podman login failed. Please ensure the podman login credentials in the input/omnia_config_credentials.yml are valid. + If they are, this error can occur due to a pull limit issue or multiple requests. Please try running the playbook again after waiting for a while. # Image pull configuration gitlab_image_pull_retries: 5 diff --git a/input/discovery_config.yml b/input/discovery_config.yml index 3cc563b069..1e8df8f2a6 100644 --- a/input/discovery_config.yml +++ b/input/discovery_config.yml @@ -18,11 +18,7 @@ # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. # *********************************************************************** -#### BMC Discovery -# Set to true to enable BMC discovery via OME. -# When false, OME credentials will not be prompted during prepare_oim. -enable_bmc_discovery: false - +#### OME Discovery # IP address of the Dell OpenManage Enterprise (OME) instance used for # server discovery and inventory collection. 
# Credentials (ome_username, ome_password) are managed separately via diff --git a/input/network_spec.yml b/input/network_spec.yml index dc7dc3cbbb..92f03276a5 100644 --- a/input/network_spec.yml +++ b/input/network_spec.yml @@ -53,4 +53,3 @@ Networks: - ib_network: subnet: "192.168.0.0" netmask_bits: "24" - dns: ["192.168.10.10"] diff --git a/input/omnia_config.yml b/input/omnia_config.yml index b0e9cb8850..4eef108cc7 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -172,6 +172,42 @@ slurm_cluster: # csi_powerscale_driver_values_file_path: User need to download values.yaml file and fill required data in values.yaml file. Provided the path of the values.yaml file here. # mention configurable values +# ----------------------------PowerScale CSM Authorization------------------------------------ +# PowerScale CSM Authorization enables multi-tenant storage access control for CSI PowerScale driver. +# This feature is optional and requires CSI PowerScale driver to be installed. +# When enabled, tenants can be provisioned with specific storage pools and quota limits. +# The following fields are mandatory only if powerscale_authorization.enabled is set to true: +# +# powerscale_authorization: Configuration for PowerScale multi-tenant authorization. +# enabled: Set to true to enable PowerScale CSM Authorization (default: false). +# csm_authorization_values_file_path: Absolute file path for the CSM Authorization values.yaml file. +# Required when enabled is true. +# tenants: List of tenant configurations (at least one tenant required when enabled). +# name: Tenant name (alphanumeric, hyphens, underscores only, e.g., "team-omnia"). 
+# roles: List of roles for this tenant (at least one role required). +# name: Role name (alphanumeric, hyphens, underscores only, e.g., "role-omnia"). +# storage_pool: PowerScale storage pool path (must start with /ifs, e.g., "/ifs/data/csi/team-omnia"). +# IMPORTANT: This path must already exist on the PowerScale cluster. +# Omnia and CSI driver will NOT create this path automatically. +# quota_limit: Storage quota limit for this role (e.g., "200Gi", "1Ti", "500Mi"). +# +# Prerequisites for enabling PowerScale Authorization: +# 1. csi_driver_powerscale must be present in software_config.json +# 2. Service cluster nodes (service_kube_node_*, service_kube_control_plane_*) must be defined in PXE mapping +# 3. All three file paths must be provided and files must exist: +# - csi_powerscale_driver_secret_file_path +# - csi_powerscale_driver_values_file_path +# - csm_authorization_values_file_path +# 4. Image versions in csm_authorization_values_file_path must match the versions in input/config/x86_64/rhel/10.0/csi_driver_powerscale.json +# The following CSM Authorization images are validated: +# - quay.io/dell/container-storage-modules/csm-authorization-proxy +# - quay.io/dell/container-storage-modules/csm-authorization-tenant +# - quay.io/dell/container-storage-modules/csm-authorization-role +# - quay.io/dell/container-storage-modules/csm-authorization-storage +# - quay.io/dell/container-storage-modules/csm-authorization-controller +# - quay.io/dell/container-storage-modules/csm-authorization-sidecar +# Note: These images are already included in csi_driver_powerscale.json + # - k8s_crio_storage_size: Specifies the disk size allocated for CRI-O container storage. # This storage is used to store container images, writable layers, and runtime data. 
# Acceptable formats: "10G", "15G", "50G" (Only positive values in Gigabytes are allowed) @@ -186,6 +222,15 @@ service_k8s_cluster: k8s_service_addresses: "10.233.0.0/18" k8s_pod_network_cidr: "10.233.64.0/18" nfs_storage_name: "nfs_k8s" + k8s_crio_storage_size: "20G" csi_powerscale_driver_secret_file_path: "" csi_powerscale_driver_values_file_path: "" - k8s_crio_storage_size: "20G" + powerscale_authorization: + enabled: false +# csm_authorization_values_file_path: "" # Required when enabled: true - "/path/to/your/updated/csm-authorization-values.yaml" +# tenants: # Required when enabled: true +# - name: "team-omnia" +# roles: +# - name: "role-omnia" +# storage_pool: "/ifs/data/csi/team-omnia" +# quota_limit: "200Gi" diff --git a/input/pxe_mapping_file.csv b/input/pxe_mapping_file.csv index e9b5a893f2..abb6fc5fe8 100644 --- a/input/pxe_mapping_file.csv +++ b/input/pxe_mapping_file.csv @@ -1,14 +1,13 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,, -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,, -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,, -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,, -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,, -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,, -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,, 
-service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,, -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,, -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,, -os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60,, -os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61,, - +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 
+os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60 +os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61 diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index f26e280126..942ab57527 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -99,25 +99,6 @@ - telemetry_config_stat.stat.exists - telemetry_config.idrac_telemetry_support | default(false) | bool - - name: Check discovery configuration for OME - block: - - name: Check if discovery_config.yml exists - ansible.builtin.stat: - path: "{{ input_project_dir }}/discovery_config.yml" - register: discovery_config_stat - - - name: Load discovery_config.yml - ansible.builtin.include_vars: - file: "{{ input_project_dir }}/discovery_config.yml" - name: discovery_config - when: discovery_config_stat.stat.exists - failed_when: false - - - name: Set ome_discovery_enabled flag - ansible.builtin.set_fact: - ome_discovery_enabled: "{{ discovery_config_stat.stat.exists and (discovery_config.enable_bmc_discovery | default(false) | bool) }}" - cacheable: true - - name: Invoke validate_config.yml to perform L1 and L2 validations with prepare_oim tag ansible.builtin.import_playbook: ../input_validation/validate_config.yml tags: always diff --git a/prepare_oim/roles/deploy_containers/common/vars/main.yml b/prepare_oim/roles/deploy_containers/common/vars/main.yml index a01fad7002..4b0cdeaab7 100644 --- a/prepare_oim/roles/deploy_containers/common/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/common/vars/main.yml @@ -67,11 +67,8 @@ prepare_oim_completion_msg_build_stream: | login_cmd: "podman login docker.io -u {{ docker_username }} -p {{ docker_password }}" retry_count: "5" delay_time: "10" -podman_login_fail_msg: > - Podman login failed. Please ensure the podman login credentials in the - {{ hostvars['localhost']['input_project_dir'] }}/omnia_config_credentials.yml are valid. 
- If they are, this error can occur due to a pull limit issue or multiple requests. - Please try running the playbook again after waiting for a while. +podman_login_fail_msg: "Podman login failed. Please ensure the podman login credentials in the input/omnia_config_credentials.yml are valid. + If they are, this error can occur due to a pull limit issue or multiple requests. Please try running the playbook again after waiting for a while." # Usage: add_known_hosts.yml ssh_config: "/root/.ssh/config" diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 9f6254a0b3..2d7db2ca85 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -17,7 +17,7 @@ openchami_git_repo: https://github.com/OpenCHAMI/deployment-recipes.git openchami_share_dir: /opt/omnia/openchami openchami_clone_path: "{{ openchami_share_dir }}/deployment-recipes" -openchami_git_version: bf1f6dfdc7f6107a4227568987faedb1b79b95fa +openchami_git_version: main clone_retry: "5" clone_delay: "10" dir_permissions_755: "0755" @@ -43,16 +43,16 @@ pull_image_delay: 10 # OpenCHAMI image tags openchami_local_ca_tag: "v0.2.2" openchami_opaal_tag: "v0.3.10" -openchami_smd_tag: "v2.19.0" -openchami_bss_tag: "v1.32.1" -openchami_cloud_init_tag: "v1.3.0" -openchami_coresmd_tag: "v0.4.0" +openchami_smd_tag: "v2.18.0" +openchami_bss_tag: "v1.32.0" +openchami_cloud_init_tag: "v1.2.3" +openchami_coredhcp_tag: "v0.3.0" # Third-party image tags for OpenCHAMI -minio_release_tag: "RELEASE.2026-04-17T00-00-00Z" +minio_tag: "latest" postgres_tag: "11.5-alpine" hydra_tag: "v2.3" haproxy_tag: "latest" -registry_tag: "3.1.0" +registry_tag: "latest" curl_tag: "latest" acme_tag: "3.1.1" @@ -63,8 +63,8 @@ openchami_images: - "ghcr.io/openchami/smd:{{ openchami_smd_tag }}" - "ghcr.io/openchami/bss:{{ openchami_bss_tag }}" - "ghcr.io/openchami/cloud-init:{{ 
openchami_cloud_init_tag }}" - - "ghcr.io/openchami/coresmd:{{ openchami_coresmd_tag }}" - - "docker.io/pgsty/minio:{{ minio_release_tag }}" + - "ghcr.io/openchami/coredhcp:{{ openchami_coredhcp_tag }}" + - "docker.io/minio/minio:{{ minio_tag }}" - "docker.io/library/postgres:{{ postgres_tag }}" - "docker.io/oryd/hydra:{{ hydra_tag }}" - "cgr.dev/chainguard/haproxy:{{ haproxy_tag }}" diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 710edfc39c..a9726e18a2 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -182,7 +182,7 @@ {{ lookup('template', 'templates/openldap/update_ldap_conf.sh.j2') | indent(12) }} {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - path: /root/ldms_sampler.sh owner: root:root permissions: '0755' @@ -221,12 +221,6 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} - - path: /usr/local/bin/configure_vast_installation.sh - owner: root:root - permissions: '{{ file_mode_755 }}' - content: | - {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -260,12 +254,11 @@ - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab + - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - /usr/local/bin/configure_vast_installation.sh - - mount -a -{% if 
hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or ldms_support %} # Add NFS entry and mount - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -291,13 +284,12 @@ # - echo "NFS must be mounted at {{ client_mount_path }} before running." {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - /root/ldms_sampler.sh {% endif %} - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 904f7f5da2..2004543003 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -182,7 +182,7 @@ {{ lookup('template', 'templates/openldap/update_ldap_conf.sh.j2') | indent(12) }} {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - path: /root/ldms_sampler.sh owner: root:root permissions: '0755' @@ -221,12 +221,6 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} - - path: /usr/local/bin/configure_vast_installation.sh - owner: root:root - permissions: 
'{{ file_mode_755 }}' - content: | - {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -262,12 +256,11 @@ - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab + - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - /usr/local/bin/configure_vast_installation.sh - - mount -a -{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or hostvars['localhost']['ldms_support'] %} +{% if hostvars['localhost']['ucx_support'] or hostvars['localhost']['openmpi_support'] or ldms_support %} # Add NFS entry and mount - mkdir -p {{ client_mount_path }} - echo "{{ cloud_init_slurm_nfs_path }} {{ client_mount_path }} nfs defaults,_netdev 0 0" >> /etc/fstab @@ -293,13 +286,12 @@ # - echo "NFS must be mounted at {{ client_mount_path }} before running." 
{% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - /root/ldms_sampler.sh {% endif %} - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 0db88c90f9..126f3873b7 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -94,7 +94,7 @@ {{ lookup('template', 'templates/openldap/update_ldap_conf.sh.j2') | indent(12) }} {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - path: /root/ldms_sampler.sh owner: root:root permissions: '0755' @@ -121,12 +121,6 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} - - path: /usr/local/bin/configure_vast_installation.sh - owner: root:root - permissions: '{{ file_mode_755 }}' - content: | - {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -148,12 +142,10 @@ - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab + - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && 
update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - /usr/local/bin/configure_vast_installation.sh - - mount -a - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} @@ -250,7 +242,7 @@ {% endif %} -{% if hostvars['localhost']['ldms_support'] %} +{% if ldms_support %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log # Add NFS entry and mount diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 593cef9d00..4e34ff4868 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -120,12 +120,6 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} - - path: /usr/local/bin/configure_vast_installation.sh - owner: root:root - permissions: '{{ file_mode_755 }}' - content: | - {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -150,12 +144,10 @@ - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab + - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - /usr/local/bin/configure_vast_installation.sh - - mount -a - - bash /usr/local/bin/doca-install.sh || true - - bash 
/usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 0d01edee47..3490a360b6 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -424,8 +424,7 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index 71f8be3033..922f63f852 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -332,8 +332,7 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 
's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index e363187b58..df98035baa 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -234,8 +234,7 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 2f0c16b577..388c587159 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -494,8 +494,7 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh || true - - 
bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh {% if powervault_config is defined %} - /usr/local/bin/setup_iscsi_storage.sh {% endif %} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 30a388d7ef..b1baca5d97 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -305,12 +305,6 @@ content: | {{ lookup('template', 'templates/ldms/ldms_sampler.sh.j2') | indent(12) }} {% endif %} - - path: /usr/local/bin/configure_vast_installation.sh - owner: root:root - permissions: '{{ file_mode_755 }}' - content: | - {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' content: | @@ -563,12 +557,10 @@ - /usr/local/bin/setup_dcgm.sh {% endif %} - - /usr/local/bin/configure_vast_installation.sh - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 4756e8f1d3..c3fdc01ab2 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ 
b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -320,12 +320,6 @@ content: | SLURMD_OPTIONS="{{ conf_server }}" - - path: /usr/local/bin/configure_vast_installation.sh - owner: root:root - permissions: '{{ file_mode_755 }}' - content: | - {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} - - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' content: | @@ -567,13 +561,12 @@ - /usr/local/bin/setup_dcgm.sh {% endif %} # slurm user and group created in the users module - - /usr/local/bin/configure_vast_installation.sh + - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh @@ -630,4 +623,4 @@ - /usr/local/bin/export_nvhpc_env.sh - systemctl restart slurmd - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." \ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 index d87720e495..249b90b6a5 100644 --- a/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 +++ b/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 @@ -7,33 +7,34 @@ if ! lspci | grep -i 'mellanox'; then exit 0 fi -# Ensure IPoIB + Mellanox IB kernel modules are loaded before interface detection. -# This avoids boot-time races where the IB device exists (lspci) but no ib* link is present yet. 
-modprobe mlx5_ib || true -modprobe ib_ipoib || true -modprobe ib_umad || true -modprobe ib_uverbs || true - ADMIN_NIC_IP="{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}" NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" -declare -A IB_IP_MAP=( -{% for mac, node in hostvars['localhost']['read_mapping_file']['dict'].items() -%} -{% if node.IB_IP is defined and node.IB_IP | trim | length > 0 %} - ["{{ node.ADMIN_IP }}"]="{{ node.IB_IP }}" -{%- endif %} -{%- endfor %} -) - -IB_IP="${IB_IP_MAP[$ADMIN_NIC_IP]:-}" - -if [ -n "$IB_IP" ]; then - echo "Using explicit IB IP : $IB_IP/$NETMASK_BITS" -else - echo "INFO: No explicit IB IP found in mapping file for node with ADMIN_IP: $ADMIN_NIC_IP" - echo "INFO: Skipping IB IP assignment. If IB networking is required for this node, please add IB_IP to the PXE mapping file." - echo "INFO: IB network interface will remain unconfigured." - exit 0 -fi +IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" + +ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) +} + +int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" +} + + +ADMIN_IP_INT=$(ip_to_int "$ADMIN_NIC_IP") +IB_NET_INT=$(ip_to_int "$IB_NETWORK_SUBNET") + +HOST_BITS=$(( 32 - NETMASK_BITS )) +HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + +HOST_OFFSET=$(( ADMIN_IP_INT & HOST_MASK )) +IB_IP_INT=$(( IB_NET_INT + HOST_OFFSET )) + +IB_IP=$(int_to_ip "$IB_IP_INT") + +echo "Derived IB IP : $IB_IP/$NETMASK_BITS" MAX_WAIT=120 # total wait time in seconds (2 minutes) INTERVAL=10 # check every 10 seconds @@ -74,16 +75,4 @@ else fi echo "SUCCESS: Assigned $IB_IP/$NETMASK_BITS to $IB_NIC" - -# Configure DNS for InfiniBand network -if [ -n "$IB_IP" ]; then - echo "Configuring DNS for InfiniBand interface" - - # Add VAST DNS servers (completely safe - handles empty arrays) - {% for dns_server in hostvars['localhost']['ib_network_dns'] %} - echo "nameserver {{ dns_server }}" >> /etc/resolv.conf - {% endfor %} - - echo "SUCCESS: DNS configured for IB network" -fi diff --git a/provision/roles/configure_ochami/vars/main.yml b/provision/roles/configure_ochami/vars/main.yml index 40590bd1a6..9be62ddcbe 100644 --- a/provision/roles/configure_ochami/vars/main.yml +++ b/provision/roles/configure_ochami/vars/main.yml @@ -110,7 +110,7 @@ cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cud # NVIDIA DCGM (Data Center GPU Manager) configuration dcgm_service_name: "nvidia-dcgm" dcgm_health_check_retries: 3 -dcgm_support: "{{ hostvars['localhost']['dcgm_support'] | default(true) }}" +dcgm_support: "{{ telemetry_config.telemetry_sources.dcgm.metrics_enabled | default(true) }}" # Usage: fetch_additional_images.yml input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" diff --git a/provision/roles/provision_validations/tasks/include_software_config.yml 
b/provision/roles/provision_validations/tasks/include_software_config.yml index b2480d2c6e..9ba81a2f22 100644 --- a/provision/roles/provision_validations/tasks/include_software_config.yml +++ b/provision/roles/provision_validations/tasks/include_software_config.yml @@ -42,7 +42,6 @@ admin_nic: "{{ network_data.admin_network.oim_nic_name }}" admin_netmask_bits: "{{ network_data.admin_network.netmask_bits }}" ib_network_subnet: "{{ network_data.ib_network.subnet }}" - ib_network_dns: "{{ network_data.ib_network.dns | default([]) }}" dns: "{{ network_data.admin_network.dns }}" - name: Initialise variables diff --git a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 index 3709759f78..cf02934ef3 100644 --- a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 @@ -33,14 +33,20 @@ spec: apk add --no-cache coreutils set -e + echo "=== Checking for stuck pods ===" + # Get all terminating pods terminating=$(kubectl get pods -n telemetry -o jsonpath='{range .items[?(@.metadata.deletionTimestamp)]}{.metadata.name}{"\n"}{end}') - if [ -z "$terminating" ]; then - echo "No terminating pods found" - else + # Get all CrashLoopBackOff pods + crashloop=$(kubectl get pods -n telemetry -o jsonpath='{range .items[?(@.status.containerStatuses[*].state.waiting.reason=="CrashLoopBackOff")]}{.metadata.name}{"\n"}{end}') + now=$(date +%s) + processed=0 + # Process terminating pods + if [ -n "$terminating" ]; then + echo "→ Found terminating pods: $terminating" for pod in $terminating; do deletion_ts=$(kubectl get pod "$pod" -n telemetry -o jsonpath='{.metadata.deletionTimestamp}' 2>/dev/null) if [ -z "$deletion_ts" ]; then @@ -89,13 +95,72 @@ spec: done echo " Cleaned PVCs for $pod" fi - else + processed=$((processed + 1)) + else echo "Pod $pod terminating 
for $age seconds (threshold: ${threshold}s). Skipping." fi done + else + echo "No terminating pods found" + fi + + # Process CrashLoopBackOff pods + if [ -n "$crashloop" ]; then + echo "→ Found CrashLoopBackOff pods: $crashloop" + for pod in $crashloop; do + restart_count=$(kubectl get pod "$pod" -n telemetry -o jsonpath='{.status.containerStatuses[0].restartCount}' 2>/dev/null || echo 0) + + # Only process if restart count >= 5 (persistent crash loop) + if [ $restart_count -ge 5 ]; then + echo "→ Pod $pod in CrashLoopBackOff ($restart_count restarts). Processing..." + + # Check for tablespace corruption - skip cleanup if detected + if echo "$pod" | grep -q "idrac-telemetry"; then + pod_logs=$(kubectl logs "$pod" -n telemetry -c mysqldb --tail=50 2>/dev/null || echo "") + if echo "$pod_logs" | grep -q "space=4294967294\|nonexisting or being-dropped tablespace\|no existing undo tablespaces\|Data Dictionary initialization failed"; then + echo " → Tablespace corruption detected. Skipping cleanup - PVC recreation required." + processed=$((processed + 1)) + continue + fi + fi + + # Get PVCs + pvcs=$(kubectl get pod "$pod" -n telemetry -o jsonpath='{.spec.volumes[*].persistentVolumeClaim.claimName}' 2>/dev/null) + + # Clean PVCs if any + if [ -n "$pvcs" ]; then + for pvc in $pvcs; do + echo " → Cleaning $pvc..." 
+ cleanup_pod="pvc-clean-$RANDOM" + + # Create cleanup pod - remove lock files (including MySQL-specific files) + kubectl run $cleanup_pod --image=busybox:1.36 -n telemetry --restart=Never \ + --overrides="{\"spec\":{\"containers\":[{\"name\":\"cleanup\",\"image\":\"busybox:1.36\",\"command\":[\"sh\",\"-c\",\"echo 'Cleaning lock files in /data...'; find /data -type f \\\\( -name '.lock' -o -name '*.lock' -o -name '*.sock' -o -name '*.pid' -o -name 'ib_buffer_pool' -o -name 'ibtmp1' \\\\) -exec rm -fv {} \\\\; 2>/dev/null || true; echo 'Done'\"],\"volumeMounts\":[{\"name\":\"data\",\"mountPath\":\"/data\"}]}],\"volumes\":[{\"name\":\"data\",\"persistentVolumeClaim\":{\"claimName\":\"$pvc\"}}]}}" \ + 2>/dev/null || echo " Failed to create $cleanup_pod" + + # Wait for it to complete (max 20s), then show logs and delete + if kubectl wait --for=condition=Ready pod/$cleanup_pod -n telemetry --timeout=20s 2>/dev/null; then + kubectl logs $cleanup_pod -n telemetry 2>/dev/null | head -20 + else + echo "$cleanup_pod timed out (PVC may be in use)" + fi + kubectl delete pod $cleanup_pod -n telemetry 2>/dev/null || true + done + echo " Cleaned PVCs for $pod" + fi + + # Delete pod to trigger restart with clean state + kubectl delete pod "$pod" -n telemetry --grace-period=0 --force 2>/dev/null || true + processed=$((processed + 1)) + else + echo "Pod $pod in CrashLoopBackOff but only $restart_count restarts (threshold: 5). Skipping." 
+ fi + done + else + echo "No CrashLoopBackOff pods found" fi - echo "Cleanup complete" + echo "=== Cleanup complete (processed $processed pods) ===" exit 0 restartPolicy: Never diff --git a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml index ae2a86d511..be275fd870 100644 --- a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml +++ b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml @@ -59,8 +59,6 @@ openchami_containers: - cloud-init-server - haproxy - coresmd - - coresmd-coredhcp - - coresmd-coredns openchami_volumes: - haproxy-certs @@ -80,7 +78,6 @@ openchami_secrets: - bss_postgres_password tcp_ports: - - 53 - 9000 - 9001 - 5000 @@ -91,7 +88,6 @@ tcp_ports: - 8443 udp_ports: - - 53 - 69 - 67 - 68 From 5cedc5a5c10df8804e428db7f1bd1820532e78d1 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 4 May 2026 13:17:28 +0530 Subject: [PATCH 02/63] service to scrape metrics from OTEL collector --- .../roles/fetch_packages/vars/main.yml | 14 +++++- .../deploy_powerscale_telemetry.sh.j2 | 8 ++- .../telemetry/cleanup_telemetry.sh.j2 | 4 ++ .../templates/telemetry/kustomization.yaml.j2 | 4 ++ ...perator-vmservicescrape-powerscale.yaml.j2 | 50 +++++++++++++++++++ 5 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 diff --git a/build_image_x86_64/roles/fetch_packages/vars/main.yml b/build_image_x86_64/roles/fetch_packages/vars/main.yml index ffad7b5b31..439d02931b 100644 --- a/build_image_x86_64/roles/fetch_packages/vars/main.yml +++ b/build_image_x86_64/roles/fetch_packages/vars/main.yml @@ -23,8 +23,20 @@ functional_groups_file_path: "{{ hostvars['localhost']['functional_groups_config software_config_file_path: "{{ input_project_dir }}/software_config.json" x86_64_build_image_completion_msg: | The 
playbook build_image_x86_64.yml has been completed successfully. - To boot x86_64 nodes execute discovery/discovery.yml playbook. To build image for aarch64 nodes execute build_image_aarch64/build_image_aarch64.yml playbook. + To boot x86_64 nodes execute discovery/discovery.yml playbook with discovery_mechanism parameter. + + Usage: ansible-playbook discovery.yml -e discovery_mechanism= + + Supported discovery mechanisms: + - ome : Dell OpenManage Enterprise (OME) + - magellan : Magellan (upcoming, not yet supported) + + Examples: + ansible-playbook discovery.yml -e discovery_mechanism=ome + + Otherwise, provide PXE mapping and execute provision/provision.yml. + functional_group_absent_msg: | Failure: No x86_64 functional groups found in functional_group_config.yml input file. Please make sure x86_64 functional_group should be present in input file functional_group_config.yml diff --git a/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 index d1017bc76c..6f7332e566 100644 --- a/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 +++ b/provision/roles/configure_ochami/templates/powerscale/deploy_powerscale_telemetry.sh.j2 @@ -131,11 +131,17 @@ if [ "$PS_TEL_FAILED" -eq 0 ]; then echo "===== PowerScale Telemetry (CSM Observability) deployed successfully =====" - # Step 6: Patch OTEL Collector service to expose Prometheus metrics port + # Step 6: Patch OTEL Collector service to expose Prometheus metrics port and add labels for vmagent discovery echo "Patching OTEL Collector service to expose port 8889 for Prometheus metrics..." kubectl patch svc otel-collector -n "${CSM_NS}" --patch '{"spec":{"ports":[{"name":"prometheus","port":8889,"targetPort":8889,"protocol":"TCP"}]}}' || { echo "WARNING: Failed to patch OTEL Collector service for Prometheus metrics." 
} + + # Step 6b: Add labels to OTEL Collector service for VMServiceScrape discovery + echo "Adding labels to OTEL Collector service for vmagent discovery..." + kubectl label svc otel-collector -n "${CSM_NS}" app.kubernetes.io/name=otel-collector app.kubernetes.io/component=collector --overwrite || { + echo "WARNING: Failed to add labels to OTEL Collector service." + } # Step 7: Create PVC for OTEL Collector persistent buffering OTEL_PVC_SIZE="{{ hostvars['localhost']['telemetry_config']['powerscale_configurations']['otel_collector_storage_size'] | default('5Gi') }}" diff --git a/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 b/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 index a4b391519f..7f5f34beed 100644 --- a/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 +++ b/provision/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2 @@ -482,6 +482,10 @@ if [ "$CLEAN_POWERSCALE" = true ]; then delete_all configmap "app.kubernetes.io/name=otel-collector" delete_all pod "app.kubernetes.io/name=otel-collector" + # Delete VMServiceScrape for PowerScale OTEL Collector + echo "Deleting VMServiceScrape for PowerScale OTEL Collector..." + delete_resource vmservicescrape otel-collector-powerscale-scrape + # Delete cert-manager resources deployed by karavi-observability sub-chart echo "Deleting cert-manager resources (Helm sub-chart)..." 
delete_all deployment "app.kubernetes.io/instance=karavi-observability,app.kubernetes.io/name=cert-manager" diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index 6b8c159a10..54753cf0ce 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -19,6 +19,10 @@ resources: - victoria-operator-vmagent.yaml # VMPodScrape CR (native operator-based pod discovery for metrics) - victoria-operator-vmpodscrape.yaml +{% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} + # VMServiceScrape CR for PowerScale OTEL Collector + - victoria-operator-vmservicescrape-powerscale.yaml +{% endif %} {% endif %} {% if victoria_logs_support | default(false) %} # victoria_logs Resources (Logs Only) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 new file mode 100644 index 0000000000..1cdd71796a --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 @@ -0,0 +1,50 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# VMServiceScrape - Native operator-based service discovery for PowerScale OTEL Collector +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: otel-collector-powerscale-scrape + namespace: {{ telemetry_namespace }} +spec: + # Target service selector + selector: + matchLabels: + app.kubernetes.io/name: otel-collector + + # Namespace selector + namespaceSelector: + matchNames: + - {{ telemetry_namespace }} + + # Service metrics endpoints + serviceEndpoints: + - port: prometheus + interval: {{ vmagent.global.scrape_interval }} + honorLabels: true + + # Add PowerScale-specific labels + relabelConfigs: + - sourceLabels: [__meta_kubernetes_service_name] + targetLabel: source + replacement: powerscale + + - sourceLabels: [__meta_kubernetes_service_name] + targetLabel: job + replacement: otel-collector-powerscale + + # Add namespace label + - sourceLabels: [__meta_kubernetes_namespace] + targetLabel: namespace From edeab4f12688433673a3b469f144a826b807179c Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 4 May 2026 15:27:09 +0530 Subject: [PATCH 03/63] vmservice to scrape metrics from otel collector --- provision/roles/telemetry/vars/main.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 2aa95a2cc1..a1c8f2fad9 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -323,6 +323,8 @@ victoria_templates_operator_single: dest: 'victoria-operator-vmagent.yaml' - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' dest: 'victoria-operator-vmpodscrape.yaml' + - src: 'telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2' + dest: 'victoria-operator-vmservicescrape-powerscale.yaml' # Cluster operator template (used when victoria_cluster.enabled: true) victoria_templates_operator_cluster: @@ -332,6 +334,8 @@ 
victoria_templates_operator_cluster: dest: 'victoria-operator-vmagent.yaml' - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' dest: 'victoria-operator-vmpodscrape.yaml' + - src: 'telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2' + dest: 'victoria-operator-vmservicescrape-powerscale.yaml' # Legacy manual deployment templates (removed - use operator-based templates above) # Raw victoria-cluster-vminsert/vmselect/vmstorage.yaml.j2 files have been removed From cf00983fd81be4ac66a87f738ee4fc01f936bdb1 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 4 May 2026 21:32:08 +0530 Subject: [PATCH 04/63] update endpoints --- .../victoria-operator-vmservicescrape-powerscale.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 index 1cdd71796a..20a4b209ce 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 @@ -30,7 +30,7 @@ spec: - {{ telemetry_namespace }} # Service metrics endpoints - serviceEndpoints: + endpoints: - port: prometheus interval: {{ vmagent.global.scrape_interval }} honorLabels: true From b5c05d00e62059eba5f8cbfc88cd43492c16d62f Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 00:01:20 +0530 Subject: [PATCH 05/63] revert other changes --- .../input_validation/common_utils/config.py | 4 +- .../common_utils/en_us_validation_msg.py | 125 ++++++++++++++++++ .../common_utils/logical_validation.py | 2 +- .../input_validation/schema/network_spec.json | 16 +++ 
.../validation_flows/provision_validation.py | 67 ++++++++++ .../library/modules/generate_pxe_mapping.py | 48 ++++++- .../library/modules/ome_server_inventory.py | 47 ++++++- discovery/discovery.yml | 36 +++-- .../ome_discovery/tasks/collect_inventory.yml | 70 ++++++++++ discovery/roles/ome_discovery/vars/main.yml | 14 +- .../pxe_mapping_file.csv | 12 +- .../catalog_rhel_json/pxe_mapping_file.csv | 24 ++-- .../pxe_mapping_file.csv | 22 +-- .../pxe_mapping_file.csv | 12 +- gitlab/roles/hosted_gitlab/vars/main.yml | 8 +- input/discovery_config.yml | 8 +- input/network_spec.yml | 1 + input/pxe_mapping_file.csv | 26 ++-- prepare_oim/prepare_oim.yml | 21 ++- .../deploy_containers/common/vars/main.yml | 9 +- .../deploy_containers/openchami/vars/main.yml | 20 +-- ...-group-login_compiler_node_aarch64.yaml.j2 | 12 +- ...i-group-login_compiler_node_x86_64.yaml.j2 | 11 +- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 5 +- ...-service_kube_control_plane_x86_64.yaml.j2 | 5 +- .../ci-group-service_kube_node_x86_64.yaml.j2 | 5 +- ...ci-group-slurm_control_node_x86_64.yaml.j2 | 5 +- .../ci-group-slurm_node_aarch64.yaml.j2 | 4 +- .../ci-group-slurm_node_x86_64.yaml.j2 | 11 +- 29 files changed, 545 insertions(+), 105 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 7f26f692e4..47990cafdc 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -58,7 +58,8 @@ "telemetry_config": "telemetry_config.yml", "high_availability_config": "high_availability_config.yml", "build_stream_config": "build_stream_config.yml", - "gitlab_config": "gitlab_config.yml" + "gitlab_config": "gitlab_config.yml", + "discovery_config": "discovery_config.yml" # "additional_software": "additional_software.json" } @@ -103,6 +104,7 @@ # "high_availability": [files["high_availability_config"]], 
# "additional_software": [files["additional_software"]], "build_stream": [files["build_stream_config"]], + "discovery": [files["discovery_config"]], "gitlab": [files["gitlab_config"], files["build_stream_config"]], "all": [ files["local_repo_config"], diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 0e32c3bdae..f8adccfb7a 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -431,6 +431,131 @@ def powerscale_auth_image_version_mismatch_msg( "input/config/x86_64/rhel/10.0/csi_driver_powerscale.json." ) +# PowerScale telemetry validation messages +POWERSCALE_VICTORIA_REQUIRED_MSG = ( + "PowerScale telemetry requires VictoriaMetrics to be deployed. " + "When telemetry_sources.powerscale.metrics_enabled is true, " + "'victoria_metrics' must be included in collection_targets " + "(e.g., 'victoria_metrics' or 'victoria_metrics,victoria_logs')." +) +POWERSCALE_VICTORIA_LOGS_REQUIRED_MSG = ( + "PowerScale logs collection requires VictoriaLogs to be deployed. " + "When telemetry_sources.powerscale.logs_enabled is true, " + "'victoria_logs' must be included in collection_targets " + "(e.g., 'victoria_metrics,victoria_logs')." +) +POWERSCALE_SYSLOG_SOURCE_IP_INVALID_MSG = ( + "Invalid IP address in powerscale_syslog_source_ips. " + "Each entry must be a valid IPv4 address (e.g., '192.168.55.11')." +) +POWERSCALE_CSI_DRIVER_MISSING_MSG = ( + "csi_driver_powerscale is not configured in software_config.json. " + "PowerScale telemetry requires the CSI driver for PowerScale to be configured." +) +POWERSCALE_SERVICE_CLUSTER_MISSING_MSG = ( + "service cluster is not defined in functional_groups_config.yml. " + "PowerScale telemetry requires a service cluster." 
+) +POWERSCALE_CONFIGURATIONS_MISSING_MSG = ( + "powerscale_configurations section is required when " + "telemetry_sources.powerscale.metrics_enabled is true. " + "It must contain csm_observability_values_file_path." +) +POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG = ( + "must be a non-empty string in format 'XGi' (e.g., '5Gi')" +) +POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG = ( + "csm_observability_values_file_path is required when " + "telemetry_sources.powerscale.metrics_enabled is true. " + "Please provide the path to the CSM Observability values.yaml file." +) +POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG = ( + "karaviMetricsPowerscale.authorization.proxyHost is required in the " + "CSM Observability values file when " + "karaviMetricsPowerscale.authorization.enabled is true. " + "Please provide the hostname or IP of the CSM Authorization Proxy server." +) +def powerscale_csm_values_not_found_msg(path): + """Returns error message when CSM Observability values.yaml file is not found.""" + return ( + f"CSM Observability values.yaml file not found at '{path}'. " + "Please verify the file path is correct." + ) +POWERSCALE_CSM_VALUES_INVALID_YAML_MSG = ( + "CSM Observability values.yaml must contain a valid YAML dictionary." +) +def powerscale_csm_values_parse_error_msg(error): + """Returns error message when CSM Observability values.yaml fails to parse.""" + return f"Failed to parse CSM Observability values.yaml: {error}" +POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG = ( + "CSM Observability values.yaml is missing 'karaviMetricsPowerscale' section." +) +POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG = ( + "CSM Metrics PowerScale image is required in CSM Observability values.yaml." +) +POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG = ( + "OTEL Collector image is required in CSM Observability values.yaml." +) +POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG = ( + "Each additional_remote_write_endpoint must have a non-empty 'url' field." 
+) +POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG = ( + "URL must start with 'http://' or 'https://'." +) +def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_image): + """Returns error message when CSM values.yaml image version doesn't match service_k8s.json.""" + return ( + f"Image version mismatch for '{image_name}': " + f"CSM Observability values.yaml has '{values_image}' but " + f"service_k8s.json has '{service_k8s_image}'. " + f"Please update service_k8s.json to match the values.yaml version " + f"and re-run local_repo.yml to mirror the correct image to Pulp." + ) + +# PowerScale CSM Authorization validation messages +POWERSCALE_AUTH_CSI_DRIVER_MISSING_MSG = ( + "PowerScale CSM Authorization requires 'csi_driver_powerscale' to be present in software_config.json." +) +POWERSCALE_AUTH_SERVICE_CLUSTER_MISSING_MSG = ( + "PowerScale CSM Authorization requires service cluster nodes " + "(service_kube_node_*, service_kube_control_plane_*) to be defined " + "in the PXE mapping file." +) +POWERSCALE_AUTH_CSM_VALUES_PATH_REQUIRED_MSG = ( + "csm_authorization_values_file_path is required when powerscale_authorization.enabled is true." +) +def powerscale_auth_csm_values_not_found_msg(path): + """Returns error message when CSM Authorization values.yaml file is not found.""" + return ( + f"CSM Authorization values file does not exist at path: {path}. " + "Please verify the file path is correct." + ) +def powerscale_auth_csm_values_validation_error_msg(error): + """Returns error message when CSM Authorization values.yaml validation fails.""" + return f"Error validating CSM Authorization image versions: {error}" +POWERSCALE_AUTH_TENANTS_REQUIRED_MSG = ( + "At least one tenant must be defined when powerscale_authorization.enabled is true." +) +def powerscale_auth_tenant_roles_required_msg(tenant_name): + """Returns error message when a tenant has no roles defined.""" + return ( + f"At least one role must be defined for tenant '{tenant_name}'." 
+ ) +def powerscale_auth_image_version_mismatch_msg( + image_name, values_version, csi_version +): + """Returns error message when CSM Authorization image version doesn't match csi_driver_powerscale.json.""" + return ( + f"Image version for {image_name} in CSM Authorization values.yaml " + f"({values_version}) does not match csi_driver_powerscale.json " + f"({csi_version}). Please ensure both files use the same version." + ) +POWERSCALE_AUTH_CSI_JSON_NOT_FOUND_MSG = ( + "csi_driver_powerscale.json not found. Cannot validate CSM Authorization " + "image versions. Please ensure the file exists at " + "input/config/x86_64/rhel/10.0/csi_driver_powerscale.json." +) + def boolean_fail_msg(value): """Returns a formatted message indicating boolean_fail_msg.""" return f"{value} must be set to either true or false." diff --git a/common/library/module_utils/input_validation/common_utils/logical_validation.py b/common/library/module_utils/input_validation/common_utils/logical_validation.py index 2cb5c5d37b..d77042bb44 100644 --- a/common/library/module_utils/input_validation/common_utils/logical_validation.py +++ b/common/library/module_utils/input_validation/common_utils/logical_validation.py @@ -21,11 +21,11 @@ from ansible.module_utils.input_validation.validation_flows import provision_validation from ansible.module_utils.input_validation.validation_flows import common_validation -from ansible.module_utils.input_validation.validation_flows import telemetry_validation from ansible.module_utils.input_validation.validation_flows import high_availability_validation from ansible.module_utils.input_validation.validation_flows import local_repo_validation from ansible.module_utils.input_validation.validation_flows import build_stream_validation from ansible.module_utils.input_validation.validation_flows import gitlab_validation +from ansible.module_utils.input_validation.validation_flows import telemetry_validation # L2 Validation Code - validate anything that could not have been 
validated with JSON schema diff --git a/common/library/module_utils/input_validation/schema/network_spec.json b/common/library/module_utils/input_validation/schema/network_spec.json index 536246af41..c3d52e5fe0 100644 --- a/common/library/module_utils/input_validation/schema/network_spec.json +++ b/common/library/module_utils/input_validation/schema/network_spec.json @@ -124,6 +124,22 @@ "netmask_bits": { "type": "string", "pattern": "^(1[0-9]|2[0-9]|[1-9])$|^3[0-2]$" + }, + "dns": { + "oneOf": [ + { + "type": "array", + "maxItems": 0 + }, + { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "pattern": "^(?:(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})\\.){3}(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})$" + } + } + ] } }, "additionalProperties": false diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 16263c7b48..e68be5696f 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -30,6 +30,7 @@ file_names = config.files create_error_msg = validation_utils.create_error_msg create_file_path = validation_utils.create_file_path +ib_mac_re = re.compile(r"^([0-9A-Fa-f]{2}:){7}[0-9A-Fa-f]{2}$") # Expected header columns (case-insensitive) required_headers = [ @@ -270,6 +271,52 @@ def validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path): if duplicates: raise ValueError(f"Duplicate ADMIN_IP found in PXE mapping file: {'; '.join(duplicates)}") +def validate_duplicate_ib_ips_in_mapping_file(pxe_mapping_file_path): + """Validates that IB_IP values in the mapping file are unique.""" + if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): + raise ValueError(f"PXE mapping file not found: {pxe_mapping_file_path}") + + with open(pxe_mapping_file_path, "r", 
encoding="utf-8") as fh: + raw_lines = fh.readlines() + + non_comment_lines = [ln for ln in raw_lines if ln.strip()] + reader = csv.DictReader(non_comment_lines) + + fieldname_map = {fn.strip().upper(): fn for fn in reader.fieldnames} + ib_ip_col = fieldname_map.get("IB_IP") + hostname_col = fieldname_map.get("HOSTNAME") + + if not ib_ip_col: + return + + seen_ib_ips = {} + duplicates = [] + + for row_idx, row in enumerate(reader, start=2): + ib_ip = row.get(ib_ip_col, "").strip() if row.get(ib_ip_col) else "" + hostname = "" + if hostname_col: + hostname = row.get(hostname_col, "").strip() if row.get(hostname_col) else "" + + if not ib_ip: + continue + + if ib_ip in seen_ib_ips: + first_row = seen_ib_ips[ib_ip]["row"] + first_host = seen_ib_ips[ib_ip]["hostname"] + dup_host = hostname or "" + first_host_disp = first_host or "" + duplicates.append( + f"'{ib_ip}' at CSV rows {first_row} ({first_host_disp}) and {row_idx} ({dup_host})" + ) + continue + + seen_ib_ips[ib_ip] = {"row": row_idx, "hostname": hostname} + + + if duplicates: + raise ValueError(f"Duplicate IB_IP found in PXE mapping file: {'; '.join(duplicates)}") + def validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path): """Validates that GROUP_NAME has a consistent PARENT_SERVICE_TAG across the mapping file.""" @@ -421,6 +468,25 @@ def validate_mapping_file_entries(mapping_file_path): if bmc_ip and not validation_utils.validate_ipv4(bmc_ip): raise ValueError(f"Invalid BMC_IP: '{bmc_ip}' at CSV row {row_idx} in mapping file.") + ib_mac_col = fieldname_map.get("IB_MAC") + ib_ip_col = fieldname_map.get("IB_IP") + ib_mac = row.get(ib_mac_col, "").strip() if ib_mac_col and row.get(ib_mac_col) else "" + ib_ip = row.get(ib_ip_col, "").strip() if ib_ip_col and row.get(ib_ip_col) else "" + + if bool(ib_mac) != bool(ib_ip): + raise ValueError( + f"IB_MAC and IB_IP must both be provided or both be empty at CSV row {row_idx} in mapping file." 
+ ) + + if ib_mac and not ib_mac_re.match(ib_mac): + raise ValueError( + f"Invalid IB_MAC: '{ib_mac}' at CSV row {row_idx} in mapping file. " + "Expected format: xx:xx:xx:xx:xx:xx:xx:xx." + ) + + if ib_ip and not validation_utils.validate_ipv4(ib_ip): + raise ValueError(f"Invalid IB_IP: '{ib_ip}' at CSV row {row_idx} in mapping file.") + if not row_seen: raise ValueError("Please provide details in mapping file.") @@ -859,6 +925,7 @@ def validate_provision_config( validate_duplicate_service_tags_in_mapping_file(pxe_mapping_file_path) validate_duplicate_hostnames_in_mapping_file(pxe_mapping_file_path) validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path) + validate_duplicate_ib_ips_in_mapping_file(pxe_mapping_file_path) validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path) validate_functional_groups_separation(pxe_mapping_file_path) validate_parent_service_tag_hierarchy(pxe_mapping_file_path) diff --git a/common/library/modules/generate_pxe_mapping.py b/common/library/modules/generate_pxe_mapping.py index 3e45b39b1d..0e64f10cac 100644 --- a/common/library/modules/generate_pxe_mapping.py +++ b/common/library/modules/generate_pxe_mapping.py @@ -103,6 +103,29 @@ DEFAULT_FUNCTIONAL_GROUP = "slurm_node_aarch64" SERVICE_CONTROL_PLANE_GROUP = "service_kube_control_plane_x86_64" +# Omnia-supported functional group names. +# Only servers whose OME static group matches one of these will be +# included in the PXE mapping file. +SUPPORTED_FUNCTIONAL_GROUPS = { + "service_kube_control_plane_x86_64", + "service_kube_node_x86_64", + "login_node_x86_64", + "login_node_aarch64", + "login_compiler_node_x86_64", + "login_compiler_node_aarch64", + "slurm_control_node_x86_64", + "slurm_node_x86_64", + "slurm_node_aarch64", + "os_x86_64", + "os_aarch64", +} + +# Roles that have a parent-child relationship with the control plane. +# Only these roles should receive PARENT_SERVICE_TAG. 
+CHILD_ROLES_OF_CONTROL_PLANE = { + "service_kube_node_x86_64", +} + def extract_su_from_hostname(bmc_hostname): """ @@ -111,7 +134,7 @@ def extract_su_from_hostname(bmc_hostname): idrac-SUA99R999OU30C2 -> SUA99 SU1R2OU1C5 -> SU1 idrac-JCGT033 -> '' (service tag pattern, not an SU hostname) - The lookahead (?=R\d+) ensures only genuine SU hostnames match; + The lookahead (?=R\\d+) ensures only genuine SU hostnames match; service-tag-only hostnames like idrac-JCGT033 are ignored. Returns empty string when no SU pattern is found; caller defaults to grp0. """ @@ -229,10 +252,24 @@ def main(): # Use group_name from OME if available, else fall back to module param default server_group = server.get('group_name', '').strip() + + # Skip servers whose OME group is not a supported Omnia functional group + if server_group and server_group not in SUPPORTED_FUNCTIONAL_GROUPS: + svc_tag = server.get('service_tag', 'unknown') + module.warn( + f"Skipping device {svc_tag}: OME static group '{server_group}' " + f"is not a supported Omnia functional group. 
" + f"Supported groups: {', '.join(sorted(SUPPORTED_FUNCTIONAL_GROUPS))}" + ) + continue + resolved_functional_group = server_group if server_group else functional_group - # Derive GROUP_NAME from SU extracted from BMC hostname + # Derive GROUP_NAME: try SU from BMC hostname first, + # then from OME group name, then fall back to module default (grp0) su_name = extract_su_from_hostname(bmc_hostname) + if not su_name: + su_name = extract_su_from_hostname(server_group) resolved_group_name = su_name if su_name else group_name row = { @@ -260,8 +297,11 @@ def main(): if su and su not in su_control_plane_map: su_control_plane_map[su] = row["SERVICE_TAG"] - # Assign PARENT_SERVICE_TAG from control plane node of the same SU + # Assign PARENT_SERVICE_TAG only to child roles of the control plane + # within the same GROUP_NAME for row in rows: + if row["FUNCTIONAL_GROUP_NAME"] not in CHILD_ROLES_OF_CONTROL_PLANE: + continue su = row["GROUP_NAME"] if su in su_control_plane_map: row["PARENT_SERVICE_TAG"] = su_control_plane_map[su] @@ -284,4 +324,4 @@ def main(): if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/common/library/modules/ome_server_inventory.py b/common/library/modules/ome_server_inventory.py index 4044429eb0..a178ac0d6f 100644 --- a/common/library/modules/ome_server_inventory.py +++ b/common/library/modules/ome_server_inventory.py @@ -300,8 +300,15 @@ def build_device_group_map(self): all groups whose ParentId matches that container's Id. Fallback: skip well-known OME system/container group names and use any group that has at least one device. 
+ + Returns (device_group_map, conflicts, debug): + device_group_map: dict mapping device_id -> first group_name + conflicts: dict mapping device_id -> list of all group_names (only for + devices found in more than one static group) + debug: diagnostic info dict """ device_group_map = {} + device_all_groups = {} all_groups_url = f"{self.base_url}/api/GroupService/Groups" all_groups, _ = self.get_paginated(all_groups_url) @@ -347,9 +354,19 @@ def build_device_group_map(self): group_devices, _ = self.get_paginated(devices_url) for gd in group_devices: dev_id = gd.get("Id") - if dev_id and dev_id not in device_group_map: + if not dev_id: + continue + device_all_groups.setdefault(dev_id, []).append(group_name) + if dev_id not in device_group_map: device_group_map[dev_id] = group_name + # Detect devices present in multiple static groups + conflicts = { + dev_id: groups + for dev_id, groups in device_all_groups.items() + if len(groups) > 1 + } + empty_groups = [g.get("Name") for g in target_groups if g.get("Name") not in [device_group_map.get(d) for d in device_group_map]] @@ -360,8 +377,9 @@ def build_device_group_map(self): "target_group_names": [g.get("Name") for g in target_groups], "device_ids_mapped": list(device_group_map.keys()), "empty_groups": empty_groups, + "conflicting_device_count": len(conflicts), } - return device_group_map, debug + return device_group_map, conflicts, debug def extract_server_info(client, device, device_group_map=None): @@ -500,7 +518,11 @@ def main(): try: if not client.authenticate(): - module.fail_json(msg=f"Failed to authenticate with OME at {ome_ip}") + module.fail_json(msg=( + f"Failed to authenticate with OME at {ome_ip}. " + "Please verify the ome_username and ome_password provided in " + "omnia_config_credentials.yml (managed via prepare_oim.yml) and rerun the playbook." 
+ )) devices, pagination_stats = client.get_all_devices(device_type) device_group_map, conflicts, group_debug = client.build_device_group_map() @@ -516,6 +538,23 @@ def main(): module.warn(f"OME: Static group '{grp}' exists but has no devices assigned. " f"Devices in this group will fall back to the default functional group.") + # Fail if any device belongs to multiple static groups + if conflicts: + # Build a human-readable summary keyed by service tag + svc_tag_map = {d.get("Id"): d.get("Identifier") or d.get("DeviceServiceTag", str(d.get("Id"))) + for d in devices} + conflict_lines = [] + for dev_id, groups in conflicts.items(): + tag = svc_tag_map.get(dev_id, str(dev_id)) + conflict_lines.append(f" Device {tag}: member of groups [{', '.join(groups)}]") + module.fail_json(msg=( + "Conflicting OME static group assignments detected. " + "Each server must belong to exactly one static group. " + "The following devices are assigned to multiple groups:\n" + + "\n".join(conflict_lines) + + "\nPlease fix the group assignments in OME and rerun discovery." 
+ )) + server_info_list = [] for device in devices: info = extract_server_info(client, device, device_group_map) @@ -537,4 +576,4 @@ def main(): if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/discovery/discovery.yml b/discovery/discovery.yml index aa60caf33c..39db0ae989 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -21,6 +21,19 @@ when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../utils/include_input_dir.yml +- name: Set discovery validation tags + hosts: localhost + connection: local + tasks: + - name: Set dynamic run tags for discovery validation + ansible.builtin.set_fact: + omnia_run_tags: "{{ (ansible_run_tags | default([]) + ['discovery']) | unique }}" + cacheable: true + +- name: Invoke validate_config.yml to perform L1 and L2 validations with discovery tag + ansible.builtin.import_playbook: ../input_validation/validate_config.yml + tags: always + - name: Load discovery configuration hosts: localhost connection: local @@ -44,15 +57,15 @@ - name: Display usage information when no discovery_mechanism is specified when: discovery_mechanism == "" block: - - name: Show usage prompt - ansible.builtin.debug: + - name: Fail when no discovery_mechanism specified + ansible.builtin.fail: msg: - "============================================================" + - "ERROR: discovery_mechanism is required but not provided." + - "" - "Usage: ansible-playbook discovery.yml" - " -e discovery_mechanism=" - "" - - "Please specify a discovery_mechanism parameter." 
- - "" - "Supported discovery mechanisms:" - " - ome : Dell OpenManage Enterprise (OME)" - " - magellan : Magellan (upcoming, not yet supported)" @@ -62,9 +75,6 @@ - " ansible-playbook discovery.yml -e discovery_mechanism=magellan" - "============================================================" - - name: End play when no discovery_mechanism specified - ansible.builtin.meta: end_play - - name: Validate discovery_mechanism parameter ansible.builtin.fail: msg: | @@ -89,9 +99,17 @@ - name: End play for magellan ansible.builtin.meta: end_play - - name: Handle OME discovery mechanism + - name: Validate OME inputs before discovery when: discovery_mechanism == 'ome' block: + - name: Fail when ome_ip is not configured + ansible.builtin.fail: + msg: >- + ome_ip must be provided in discovery_config.yml when using OME discovery. + Please set 'enable_bmc_discovery: true' and provide a valid 'ome_ip' in + {{ input_project_dir }}/discovery_config.yml. + when: ome_ip | default('') | length == 0 + - name: Include OME discovery role ansible.builtin.include_role: - name: ome_discovery + name: ome_discovery \ No newline at end of file diff --git a/discovery/roles/ome_discovery/tasks/collect_inventory.yml b/discovery/roles/ome_discovery/tasks/collect_inventory.yml index 6f180ee92b..ccc7ded21c 100644 --- a/discovery/roles/ome_discovery/tasks/collect_inventory.yml +++ b/discovery/roles/ome_discovery/tasks/collect_inventory.yml @@ -44,7 +44,77 @@ | default(ome_inventory_result.pagination.devices_retrieved) }} - name: Display connection status + ansible.builtin.debug:# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Verify OME is reachable + block: + - name: Wait for OME HTTPS port + ansible.builtin.wait_for: + host: "{{ ome_ip }}" + port: 443 + timeout: 30 + register: ome_reachability + rescue: + - name: Fail with actionable message when OME is unreachable + ansible.builtin.fail: + msg: >- + Unable to reach OME at {{ ome_ip }}:443 within 30 seconds. + Please verify that ome_ip in {{ input_project_dir | default('input') }}/discovery_config.yml + is correct and that the OME appliance is powered on and network-accessible. + +- name: Collect OME server inventory + ome_server_inventory: + ome_ip: "{{ ome_ip }}" + ome_username: "{{ ome_username }}" + ome_password: "{{ ome_password }}" + device_type: "{{ ome_server_device_type }}" + page_size: "{{ ome_page_size }}" + register: ome_inventory_result + +- name: Display OME pagination summary ansible.builtin.debug: + msg: + - "OME Pagination Summary:" + - " Total devices in OME: {{ ome_inventory_result.pagination.total_devices_in_ome }}" + - " Page size: {{ ome_inventory_result.pagination.page_size }}" + - " Total pages: {{ ome_inventory_result.pagination.total_pages }}" + - " Pages fetched: {{ ome_inventory_result.pagination.pages_fetched }}" + - " Devices retrieved: {{ ome_inventory_result.pagination.devices_retrieved }}" + - >- + Devices after filtering: + {{ ome_inventory_result.pagination.devices_after_type_filter + | default(ome_inventory_result.pagination.devices_retrieved) }} + +- name: Display connection status + ansible.builtin.debug: + msg: "{{ ome_connection_success_msg }}" + when: 
ome_inventory_result.devices | length > 0 + +- name: Fail if no servers found + ansible.builtin.fail: + msg: "{{ no_servers_found_msg }}" + when: ome_inventory_result.devices | length == 0 + +- name: Set discovered servers fact + ansible.builtin.set_fact: + discovered_servers: "{{ ome_inventory_result.devices }}" + +- name: Display discovered server count + ansible.builtin.debug: + msg: "Discovered {{ discovered_servers | length }} servers from OME" msg: "{{ ome_connection_success_msg }}" when: ome_inventory_result.devices | length > 0 diff --git a/discovery/roles/ome_discovery/vars/main.yml b/discovery/roles/ome_discovery/vars/main.yml index a6682274e6..637dde1d66 100644 --- a/discovery/roles/ome_discovery/vars/main.yml +++ b/discovery/roles/ome_discovery/vars/main.yml @@ -43,14 +43,18 @@ discovery_complete_msg: - "Total servers discovered: {{ discovered_servers | length }}" - "" - "Next Steps:" - - "1. Review and edit the generated file:" + - "1. Review and edit the generated PXE mapping file:" - " {{ pxe_mapping_output_file }}" + - "" - "2. Update HOSTNAME, FUNCTIONAL_GROUP_NAME, GROUP_NAME as needed." - - "3. Rename or copy the file to:" - - " input/pxe_mapping_file.csv" - - "4. Run: ansible-playbook provision/provision.yml" + - "" + - "3. Update the following parameter in provision_config.yml:" + - " pxe_mapping_file_path: {{ pxe_mapping_output_file }}" + - "" + - "4. Run:" + - " ansible-playbook provision/provision.yml" - "============================================================" no_servers_found_msg: | No servers found in OME inventory. - Please verify that servers are discovered and managed by OME. + Please verify that servers are discovered and managed by OME. 
\ No newline at end of file diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv index 0a350bc72d..295e7615af 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv @@ -1,6 +1,6 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 
+login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 \ No newline at end of file diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv index 6e3e4c6e63..e6f4059d0b 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv @@ -1,11 +1,13 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 
-service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 
+os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60,94:6d:ae:03:00:8c:12:ae,192.168.0.110 +os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61,94:6d:ae:03:00:8c:12:bf,192.168.0.111 \ No newline at end of file diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv index 6e3e4c6e63..8b4150ca89 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv @@ -1,11 +1,11 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 
-service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 
+service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 \ No newline at end of file diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv index 98ad5ab134..e29490a096 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv @@ -1,6 +1,6 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_x86_64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 \ No newline at end of file +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 
+slurm_node_x86_64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 \ No newline at end of file diff --git a/gitlab/roles/hosted_gitlab/vars/main.yml b/gitlab/roles/hosted_gitlab/vars/main.yml index 9fbcdb31ba..daec64ff40 100644 --- a/gitlab/roles/hosted_gitlab/vars/main.yml +++ b/gitlab/roles/hosted_gitlab/vars/main.yml @@ -230,8 +230,10 @@ gitlab_disable_grafana: true retry_count: "5" delay_time: "10" podman_login_fail_msg: > - Podman login failed. Please ensure the podman login credentials in the input/omnia_config_credentials.yml are valid. - If they are, this error can occur due to a pull limit issue or multiple requests. Please try running the playbook again after waiting for a while. + Podman login failed. Please ensure the podman login credentials in the + {{ hostvars['localhost']['input_project_dir'] }}/omnia_config_credentials.yml are valid. + If they are, this error can occur due to a pull limit issue or multiple requests. + Please try running the playbook again after waiting for a while. 
# Image pull configuration gitlab_image_pull_retries: 5 @@ -326,4 +328,4 @@ gitlab_deployment_complete_msg: - "" - "IMPORTANT: For any new GitLab reconfiguration, run cleanup_gitlab.yml before running gitlab.yml" - "" - - "============================================" + - "============================================" \ No newline at end of file diff --git a/input/discovery_config.yml b/input/discovery_config.yml index 1e8df8f2a6..dec5aee952 100644 --- a/input/discovery_config.yml +++ b/input/discovery_config.yml @@ -18,7 +18,11 @@ # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. # *********************************************************************** -#### OME Discovery +#### BMC Discovery +# Set to true to enable BMC discovery via OME. +# When false, OME credentials will not be prompted during prepare_oim. +enable_bmc_discovery: false + # IP address of the Dell OpenManage Enterprise (OME) instance used for # server discovery and inventory collection. # Credentials (ome_username, ome_password) are managed separately via @@ -27,4 +31,4 @@ ome_ip: "" #### Magellan Discovery -# Reserved for future Magellan discovery configuration parameters. +# Reserved for future Magellan discovery configuration parameters. 
\ No newline at end of file diff --git a/input/network_spec.yml b/input/network_spec.yml index 92f03276a5..25939e3fd9 100644 --- a/input/network_spec.yml +++ b/input/network_spec.yml @@ -53,3 +53,4 @@ Networks: - ib_network: subnet: "192.168.0.0" netmask_bits: "24" + dns: ["192.168.10.10"] \ No newline at end of file diff --git a/input/pxe_mapping_file.csv b/input/pxe_mapping_file.csv index abb6fc5fe8..7524d2d655 100644 --- a/input/pxe_mapping_file.csv +++ b/input/pxe_mapping_file.csv @@ -1,13 +1,13 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 -os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60 
-os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61 +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,, +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,, +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,, +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,, +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,, +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,, +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,, +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,, +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,, +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,, +os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60,, +os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61,, diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index 942ab57527..ee5ae06f6a 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -99,6 +99,25 @@ - telemetry_config_stat.stat.exists - telemetry_config.idrac_telemetry_support | default(false) | bool + - name: 
Check discovery configuration for OME + block: + - name: Check if discovery_config.yml exists + ansible.builtin.stat: + path: "{{ input_project_dir }}/discovery_config.yml" + register: discovery_config_stat + + - name: Load discovery_config.yml + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/discovery_config.yml" + name: discovery_config + when: discovery_config_stat.stat.exists + failed_when: false + + - name: Set ome_discovery_enabled flag + ansible.builtin.set_fact: + ome_discovery_enabled: "{{ discovery_config_stat.stat.exists and (discovery_config.enable_bmc_discovery | default(false) | bool) }}" + cacheable: true + - name: Invoke validate_config.yml to perform L1 and L2 validations with prepare_oim tag ansible.builtin.import_playbook: ../input_validation/validate_config.yml tags: always @@ -273,4 +292,4 @@ - name: Prepare oim has completed # noqa:role-name[path] ansible.builtin.include_role: name: deploy_containers/common - tasks_from: prepare_oim_completion.yml + tasks_from: prepare_oim_completion.yml \ No newline at end of file diff --git a/prepare_oim/roles/deploy_containers/common/vars/main.yml b/prepare_oim/roles/deploy_containers/common/vars/main.yml index 4b0cdeaab7..3f582194f8 100644 --- a/prepare_oim/roles/deploy_containers/common/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/common/vars/main.yml @@ -67,8 +67,11 @@ prepare_oim_completion_msg_build_stream: | login_cmd: "podman login docker.io -u {{ docker_username }} -p {{ docker_password }}" retry_count: "5" delay_time: "10" -podman_login_fail_msg: "Podman login failed. Please ensure the podman login credentials in the input/omnia_config_credentials.yml are valid. - If they are, this error can occur due to a pull limit issue or multiple requests. Please try running the playbook again after waiting for a while." +podman_login_fail_msg: > + Podman login failed. 
Please ensure the podman login credentials in the + {{ hostvars['localhost']['input_project_dir'] }}/omnia_config_credentials.yml are valid. + If they are, this error can occur due to a pull limit issue or multiple requests. + Please try running the playbook again after waiting for a while. # Usage: add_known_hosts.yml ssh_config: "/root/.ssh/config" @@ -86,4 +89,4 @@ chrony_no_sources_msg: "No chrony sources are reachable. Please give a valid NTP # Usage: aarch64_prereq.yml ochami_aarch64_dir: "/opt/omnia/openchami/aarch64" -regctl_aarch64_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" +regctl_aarch64_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" \ No newline at end of file diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 2d7db2ca85..72addfd759 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -17,7 +17,7 @@ openchami_git_repo: https://github.com/OpenCHAMI/deployment-recipes.git openchami_share_dir: /opt/omnia/openchami openchami_clone_path: "{{ openchami_share_dir }}/deployment-recipes" -openchami_git_version: main +openchami_git_version: bf1f6dfdc7f6107a4227568987faedb1b79b95fa clone_retry: "5" clone_delay: "10" dir_permissions_755: "0755" @@ -43,16 +43,16 @@ pull_image_delay: 10 # OpenCHAMI image tags openchami_local_ca_tag: "v0.2.2" openchami_opaal_tag: "v0.3.10" -openchami_smd_tag: "v2.18.0" -openchami_bss_tag: "v1.32.0" -openchami_cloud_init_tag: "v1.2.3" -openchami_coredhcp_tag: "v0.3.0" +openchami_smd_tag: "v2.19.0" +openchami_bss_tag: "v1.32.1" +openchami_cloud_init_tag: "v1.3.0" +openchami_coresmd_tag: "v0.4.0" # Third-party image tags for OpenCHAMI -minio_tag: "latest" +minio_release_tag: "RELEASE.2026-04-17T00-00-00Z" postgres_tag: "11.5-alpine" hydra_tag: "v2.3" haproxy_tag: "latest" 
-registry_tag: "latest" +registry_tag: "3.1.0" curl_tag: "latest" acme_tag: "3.1.1" @@ -63,8 +63,8 @@ openchami_images: - "ghcr.io/openchami/smd:{{ openchami_smd_tag }}" - "ghcr.io/openchami/bss:{{ openchami_bss_tag }}" - "ghcr.io/openchami/cloud-init:{{ openchami_cloud_init_tag }}" - - "ghcr.io/openchami/coredhcp:{{ openchami_coredhcp_tag }}" - - "docker.io/minio/minio:{{ minio_tag }}" + - "ghcr.io/openchami/coresmd:{{ openchami_coresmd_tag }}" + - "docker.io/pgsty/minio:{{ minio_release_tag }}" - "docker.io/library/postgres:{{ postgres_tag }}" - "docker.io/oryd/hydra:{{ hydra_tag }}" - "cgr.dev/chainguard/haproxy:{{ haproxy_tag }}" @@ -73,4 +73,4 @@ openchami_images: - "docker.io/neilpang/acme.sh:{{ acme_tag }}" # Usage: verify_openchami.yml -cluster_env_key: "{{ oim_node_name | upper }}_ACCESS_TOKEN" +cluster_env_key: "{{ oim_node_name | upper }}_ACCESS_TOKEN" \ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index e6e77da8c1..1750e50be0 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -150,6 +150,12 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -182,9 +188,10 @@ - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ 
file_mode }} /etc/fstab - - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - /usr/local/bin/configure_vast_installation.sh + - mount -a {% if login_compiler_node_present %} @@ -224,7 +231,8 @@ - /root/ldms_sampler.sh {% endif %} - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index b7d5a6c297..48a3887394 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -150,6 +150,12 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -226,7 +232,8 @@ - /root/ldms_sampler.sh {% endif %} - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ 
slurm_user }} {{ slurm_slurmd_pid_dir_effective }} @@ -328,4 +335,4 @@ # nvidia sdk install - /usr/local/bin/install_nvhpc_sdk.sh - /usr/local/bin/configure_nvhpc_env.sh - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." \ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 3490a360b6..dfc1b035c2 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -424,7 +424,8 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service @@ -1107,4 +1108,4 @@ kubectl get pods --all-namespaces -o wide echo "Cloud-Init finished successfully after the reboot." 
- fi + fi \ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index 922f63f852..7b81a6dfe3 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -332,7 +332,8 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service @@ -523,4 +524,4 @@ echo "Listing all Kubernetes pods in all namespaces:" kubectl get pods --all-namespaces -o wide echo "Cloud-Init finished successfully after the reboot." 
- fi + fi \ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index df98035baa..283282e1de 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -234,7 +234,8 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - mkdir -p /etc/containers/registries.conf.d - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service @@ -317,4 +318,4 @@ # CRI and kubelet already enabled above systemctl status kubelet echo "Cloud-Init finished successfully after the reboot." 
- fi + fi \ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 388c587159..76fb7bfcc3 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -494,7 +494,8 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh {% if powervault_config is defined %} - /usr/local/bin/setup_iscsi_storage.sh {% endif %} @@ -560,4 +561,4 @@ {% endif %} - systemctl restart slurmdbd - systemctl restart slurmctld - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." 
\ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 393f8ad1ab..c884c40dc9 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -630,10 +630,12 @@ - /usr/local/bin/set-ssh.sh # slurm user and group created in the users module + - /usr/local/bin/configure_vast_installation.sh - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index bda33ef511..9540b404d3 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -387,6 +387,12 @@ content: | SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' content: | @@ -626,12 +632,13 @@ - /usr/local/bin/set-ssh.sh # slurm user and group created in the users module - + - 
/usr/local/bin/configure_vast_installation.sh - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh From 57bd382bc1277f03d0ba1c29239f65b4bb29083b Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 06:23:27 +0530 Subject: [PATCH 06/63] revert merge changes as per head --- .../roles/fetch_packages/vars/main.yml | 5 +- .../validation_flows/provision_validation.py | 2 +- .../library/modules/generate_pxe_mapping.py | 2 +- .../library/modules/ome_server_inventory.py | 2 +- discovery/discovery.yml | 2 +- .../ome_discovery/tasks/collect_inventory.yml | 61 --------------- discovery/roles/ome_discovery/vars/main.yml | 2 +- .../pxe_mapping_file.csv | 2 +- .../catalog_rhel_json/pxe_mapping_file.csv | 2 +- .../pxe_mapping_file.csv | 2 +- .../pxe_mapping_file.csv | 2 +- examples/pxe_mapping_file.csv | 22 +++--- gitlab/roles/hosted_gitlab/vars/main.yml | 2 +- input/discovery_config.yml | 2 +- input/network_spec.yml | 2 +- prepare_oim/prepare_oim.yml | 2 +- .../deploy_containers/common/vars/main.yml | 2 +- .../deploy_containers/openchami/vars/main.yml | 2 +- ...-group-login_compiler_node_aarch64.yaml.j2 | 2 +- ...i-group-login_compiler_node_x86_64.yaml.j2 | 2 +- .../ci-group-login_node_aarch64.yaml.j2 | 12 ++- .../ci-group-login_node_x86_64.yaml.j2 | 12 ++- .../doca-ofed/configure-ib-network.sh.j2 | 63 +++++++++------- .../tasks/include_software_config.yml | 1 + .../common/telemetry_pod_cleanup.yaml.j2 | 75 ++----------------- .../oim_container_cleanup/vars/main.yml | 4 + 26 files changed, 96 insertions(+), 193 deletions(-) 
diff --git a/build_image_x86_64/roles/fetch_packages/vars/main.yml b/build_image_x86_64/roles/fetch_packages/vars/main.yml index 439d02931b..a604e9b815 100644 --- a/build_image_x86_64/roles/fetch_packages/vars/main.yml +++ b/build_image_x86_64/roles/fetch_packages/vars/main.yml @@ -30,10 +30,7 @@ x86_64_build_image_completion_msg: | Supported discovery mechanisms: - ome : Dell OpenManage Enterprise (OME) - - magellan : Magellan (upcoming, not yet supported) - - Examples: - ansible-playbook discovery.yml -e discovery_mechanism=ome + - magellan : Magellan Otherwise, provide PXE mapping and execute provision/provision.yml. diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index e68be5696f..48e40a16cf 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -271,6 +271,7 @@ def validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path): if duplicates: raise ValueError(f"Duplicate ADMIN_IP found in PXE mapping file: {'; '.join(duplicates)}") + def validate_duplicate_ib_ips_in_mapping_file(pxe_mapping_file_path): """Validates that IB_IP values in the mapping file are unique.""" if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): @@ -313,7 +314,6 @@ def validate_duplicate_ib_ips_in_mapping_file(pxe_mapping_file_path): seen_ib_ips[ib_ip] = {"row": row_idx, "hostname": hostname} - if duplicates: raise ValueError(f"Duplicate IB_IP found in PXE mapping file: {'; '.join(duplicates)}") diff --git a/common/library/modules/generate_pxe_mapping.py b/common/library/modules/generate_pxe_mapping.py index 0e64f10cac..58d7c85db4 100644 --- a/common/library/modules/generate_pxe_mapping.py +++ b/common/library/modules/generate_pxe_mapping.py @@ -324,4 +324,4 @@ def main(): if __name__ == 
'__main__': - main() \ No newline at end of file + main() diff --git a/common/library/modules/ome_server_inventory.py b/common/library/modules/ome_server_inventory.py index a178ac0d6f..73c8b795eb 100644 --- a/common/library/modules/ome_server_inventory.py +++ b/common/library/modules/ome_server_inventory.py @@ -576,4 +576,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 39db0ae989..8adfa0e5d1 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -112,4 +112,4 @@ - name: Include OME discovery role ansible.builtin.include_role: - name: ome_discovery \ No newline at end of file + name: ome_discovery diff --git a/discovery/roles/ome_discovery/tasks/collect_inventory.yml b/discovery/roles/ome_discovery/tasks/collect_inventory.yml index ccc7ded21c..ebe8cd5183 100644 --- a/discovery/roles/ome_discovery/tasks/collect_inventory.yml +++ b/discovery/roles/ome_discovery/tasks/collect_inventory.yml @@ -13,52 +13,6 @@ # limitations under the License. 
--- -- name: Verify OME is reachable - ansible.builtin.wait_for: - host: "{{ ome_ip }}" - port: 443 - timeout: 30 - register: ome_reachability - -- name: Collect OME server inventory - ome_server_inventory: - ome_ip: "{{ ome_ip }}" - ome_username: "{{ ome_username }}" - ome_password: "{{ ome_password }}" - device_type: "{{ ome_server_device_type }}" - page_size: "{{ ome_page_size }}" - register: ome_inventory_result - -- name: Display OME pagination summary - ansible.builtin.debug: - msg: - - "OME Pagination Summary:" - - " Total devices in OME: {{ ome_inventory_result.pagination.total_devices_in_ome }}" - - " Page size: {{ ome_inventory_result.pagination.page_size }}" - - " Total pages: {{ ome_inventory_result.pagination.total_pages }}" - - " Pages fetched: {{ ome_inventory_result.pagination.pages_fetched }}" - - " Devices retrieved: {{ ome_inventory_result.pagination.devices_retrieved }}" - - >- - Devices after filtering: - {{ ome_inventory_result.pagination.devices_after_type_filter - | default(ome_inventory_result.pagination.devices_retrieved) }} - -- name: Display connection status - ansible.builtin.debug:# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - - name: Verify OME is reachable block: - name: Wait for OME HTTPS port @@ -112,21 +66,6 @@ ansible.builtin.set_fact: discovered_servers: "{{ ome_inventory_result.devices }}" -- name: Display discovered server count - ansible.builtin.debug: - msg: "Discovered {{ discovered_servers | length }} servers from OME" - msg: "{{ ome_connection_success_msg }}" - when: ome_inventory_result.devices | length > 0 - -- name: Fail if no servers found - ansible.builtin.fail: - msg: "{{ no_servers_found_msg }}" - when: ome_inventory_result.devices | length == 0 - -- name: Set discovered servers fact - ansible.builtin.set_fact: - discovered_servers: "{{ ome_inventory_result.devices }}" - - name: Display discovered server count ansible.builtin.debug: msg: "Discovered {{ discovered_servers | length }} servers from OME" diff --git a/discovery/roles/ome_discovery/vars/main.yml b/discovery/roles/ome_discovery/vars/main.yml index 637dde1d66..ddcc1868f9 100644 --- a/discovery/roles/ome_discovery/vars/main.yml +++ b/discovery/roles/ome_discovery/vars/main.yml @@ -57,4 +57,4 @@ discovery_complete_msg: no_servers_found_msg: | No servers found in OME inventory. - Please verify that servers are discovered and managed by OME. \ No newline at end of file + Please verify that servers are discovered and managed by OME. 
diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv index 295e7615af..87ff7ced94 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv @@ -3,4 +3,4 @@ slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172 slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 \ No newline at end of file +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv index e6f4059d0b..5226b0a19e 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_json/pxe_mapping_file.csv @@ -10,4 +10,4 @@ service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb 
service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 os_x86_64,grp6,ABEF56,,os-node1,xx:yy:zz:aa:bb:ff,172.16.107.60,xx:yy:zz:aa:bb:ee,172.17.107.60,94:6d:ae:03:00:8c:12:ae,192.168.0.110 -os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61,94:6d:ae:03:00:8c:12:bf,192.168.0.111 \ No newline at end of file +os_aarch64,grp7,ABEF78,,os-node2,xx:yy:zz:aa:bb:ab,172.16.107.61,xx:yy:zz:aa:bb:ac,172.17.107.61,94:6d:ae:03:00:8c:12:bf,192.168.0.111 diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv index 8b4150ca89..01360b424b 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_with_ucx_openmpi_json/pxe_mapping_file.csv @@ -8,4 +8,4 @@ service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 \ No 
newline at end of file +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv index e29490a096..65ceac6ada 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_x86_64_with_slurm_only_json/pxe_mapping_file.csv @@ -3,4 +3,4 @@ slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172 slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 slurm_node_x86_64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 \ No newline at end of file +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index f4d41e2a77..01360b424b 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,11 +1,11 @@ -FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP -slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 
-slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 -service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 -service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 -service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 -service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 +FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP,IB_MAC,IB_IP +slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52,94:6d:ae:03:00:8c:12:2c,192.168.0.100 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 
+login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 +service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53,94:6d:ae:03:00:8c:12:5f,192.168.0.105 +service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54,94:6d:ae:03:00:8c:12:6a,192.168.0.106 +service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55,94:6d:ae:03:00:8c:12:7b,192.168.0.107 +service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56,94:6d:ae:03:00:8c:12:8c,192.168.0.108 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57,94:6d:ae:03:00:8c:12:9d,192.168.0.109 diff --git a/gitlab/roles/hosted_gitlab/vars/main.yml b/gitlab/roles/hosted_gitlab/vars/main.yml index daec64ff40..c4f3a62f8f 100644 --- a/gitlab/roles/hosted_gitlab/vars/main.yml +++ b/gitlab/roles/hosted_gitlab/vars/main.yml @@ -328,4 +328,4 @@ gitlab_deployment_complete_msg: - "" - "IMPORTANT: For any new GitLab reconfiguration, run cleanup_gitlab.yml before running gitlab.yml" - "" - - "============================================" \ No newline at end of file + - "============================================" diff --git a/input/discovery_config.yml b/input/discovery_config.yml index dec5aee952..3cc563b069 100644 --- a/input/discovery_config.yml +++ b/input/discovery_config.yml @@ -31,4 +31,4 @@ enable_bmc_discovery: false ome_ip: "" #### Magellan Discovery -# Reserved for future Magellan discovery configuration parameters. \ No newline at end of file +# Reserved for future Magellan discovery configuration parameters. 
diff --git a/input/network_spec.yml b/input/network_spec.yml index 25939e3fd9..dc7dc3cbbb 100644 --- a/input/network_spec.yml +++ b/input/network_spec.yml @@ -53,4 +53,4 @@ Networks: - ib_network: subnet: "192.168.0.0" netmask_bits: "24" - dns: ["192.168.10.10"] \ No newline at end of file + dns: ["192.168.10.10"] diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index ee5ae06f6a..f26e280126 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -292,4 +292,4 @@ - name: Prepare oim has completed # noqa:role-name[path] ansible.builtin.include_role: name: deploy_containers/common - tasks_from: prepare_oim_completion.yml \ No newline at end of file + tasks_from: prepare_oim_completion.yml diff --git a/prepare_oim/roles/deploy_containers/common/vars/main.yml b/prepare_oim/roles/deploy_containers/common/vars/main.yml index 3f582194f8..a01fad7002 100644 --- a/prepare_oim/roles/deploy_containers/common/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/common/vars/main.yml @@ -89,4 +89,4 @@ chrony_no_sources_msg: "No chrony sources are reachable. 
Please give a valid NTP # Usage: aarch64_prereq.yml ochami_aarch64_dir: "/opt/omnia/openchami/aarch64" -regctl_aarch64_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" \ No newline at end of file +regctl_aarch64_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" diff --git a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml index 72addfd759..9f6254a0b3 100644 --- a/prepare_oim/roles/deploy_containers/openchami/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/openchami/vars/main.yml @@ -73,4 +73,4 @@ openchami_images: - "docker.io/neilpang/acme.sh:{{ acme_tag }}" # Usage: verify_openchami.yml -cluster_env_key: "{{ oim_node_name | upper }}_ACCESS_TOKEN" \ No newline at end of file +cluster_env_key: "{{ oim_node_name | upper }}_ACCESS_TOKEN" diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 1750e50be0..3de6f1e35d 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -332,4 +332,4 @@ # nvidia sdk install - /usr/local/bin/install_nvhpc_sdk.sh - /usr/local/bin/configure_nvhpc_env.sh - - echo "Cloud-Init has completed successfully." \ No newline at end of file + - echo "Cloud-Init has completed successfully." 
diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 48a3887394..7ee7580733 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -335,4 +335,4 @@ # nvidia sdk install - /usr/local/bin/install_nvhpc_sdk.sh - /usr/local/bin/configure_nvhpc_env.sh - - echo "Cloud-Init has completed successfully." \ No newline at end of file + - echo "Cloud-Init has completed successfully." diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 126f3873b7..317ff1aa26 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -121,6 +121,12 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -142,10 +148,12 @@ - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash 
/usr/local/bin/configure-ib-network.sh + - /usr/local/bin/configure_vast_installation.sh + - mount -a + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 4e34ff4868..593cef9d00 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -120,6 +120,12 @@ content: | {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure_vast_installation.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/vast/configure_vast_installation.sh.j2') | indent(12) }} + - path: /tmp/apptainer_mirror.conf permissions: '0644' content: | @@ -144,10 +150,12 @@ - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - /usr/local/bin/configure_vast_installation.sh + - mount -a + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ 
slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 index 249b90b6a5..d87720e495 100644 --- a/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 +++ b/provision/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 @@ -7,34 +7,33 @@ if ! lspci | grep -i 'mellanox'; then exit 0 fi +# Ensure IPoIB + Mellanox IB kernel modules are loaded before interface detection. +# This avoids boot-time races where the IB device exists (lspci) but no ib* link is present yet. +modprobe mlx5_ib || true +modprobe ib_ipoib || true +modprobe ib_umad || true +modprobe ib_uverbs || true + ADMIN_NIC_IP="{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}" NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" -IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" - -ip_to_int() { - local IFS=. 
- read -r a b c d <<< "$1" - echo $(( (a << 24) + (b << 16) + (c << 8) + d )) -} - -int_to_ip() { - local ip=$1 - echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" -} - - -ADMIN_IP_INT=$(ip_to_int "$ADMIN_NIC_IP") -IB_NET_INT=$(ip_to_int "$IB_NETWORK_SUBNET") - -HOST_BITS=$(( 32 - NETMASK_BITS )) -HOST_MASK=$(( (1 << HOST_BITS) - 1 )) - -HOST_OFFSET=$(( ADMIN_IP_INT & HOST_MASK )) -IB_IP_INT=$(( IB_NET_INT + HOST_OFFSET )) - -IB_IP=$(int_to_ip "$IB_IP_INT") - -echo "Derived IB IP : $IB_IP/$NETMASK_BITS" +declare -A IB_IP_MAP=( +{% for mac, node in hostvars['localhost']['read_mapping_file']['dict'].items() -%} +{% if node.IB_IP is defined and node.IB_IP | trim | length > 0 %} + ["{{ node.ADMIN_IP }}"]="{{ node.IB_IP }}" +{%- endif %} +{%- endfor %} +) + +IB_IP="${IB_IP_MAP[$ADMIN_NIC_IP]:-}" + +if [ -n "$IB_IP" ]; then + echo "Using explicit IB IP : $IB_IP/$NETMASK_BITS" +else + echo "INFO: No explicit IB IP found in mapping file for node with ADMIN_IP: $ADMIN_NIC_IP" + echo "INFO: Skipping IB IP assignment. If IB networking is required for this node, please add IB_IP to the PXE mapping file." + echo "INFO: IB network interface will remain unconfigured." 
+ exit 0 +fi MAX_WAIT=120 # total wait time in seconds (2 minutes) INTERVAL=10 # check every 10 seconds @@ -75,4 +74,16 @@ else fi echo "SUCCESS: Assigned $IB_IP/$NETMASK_BITS to $IB_NIC" + +# Configure DNS for InfiniBand network +if [ -n "$IB_IP" ]; then + echo "Configuring DNS for InfiniBand interface" + + # Add VAST DNS servers (completely safe - handles empty arrays) + {% for dns_server in hostvars['localhost']['ib_network_dns'] %} + echo "nameserver {{ dns_server }}" >> /etc/resolv.conf + {% endfor %} + + echo "SUCCESS: DNS configured for IB network" +fi diff --git a/provision/roles/provision_validations/tasks/include_software_config.yml b/provision/roles/provision_validations/tasks/include_software_config.yml index 9ba81a2f22..b2480d2c6e 100644 --- a/provision/roles/provision_validations/tasks/include_software_config.yml +++ b/provision/roles/provision_validations/tasks/include_software_config.yml @@ -42,6 +42,7 @@ admin_nic: "{{ network_data.admin_network.oim_nic_name }}" admin_netmask_bits: "{{ network_data.admin_network.netmask_bits }}" ib_network_subnet: "{{ network_data.ib_network.subnet }}" + ib_network_dns: "{{ network_data.ib_network.dns | default([]) }}" dns: "{{ network_data.admin_network.dns }}" - name: Initialise variables diff --git a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 index cf02934ef3..3709759f78 100644 --- a/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/common/telemetry_pod_cleanup.yaml.j2 @@ -33,20 +33,14 @@ spec: apk add --no-cache coreutils set -e - echo "=== Checking for stuck pods ===" - # Get all terminating pods terminating=$(kubectl get pods -n telemetry -o jsonpath='{range .items[?(@.metadata.deletionTimestamp)]}{.metadata.name}{"\n"}{end}') - # Get all CrashLoopBackOff pods - crashloop=$(kubectl get pods -n telemetry 
-o jsonpath='{range .items[?(@.status.containerStatuses[*].state.waiting.reason=="CrashLoopBackOff")]}{.metadata.name}{"\n"}{end}') - + if [ -z "$terminating" ]; then + echo "No terminating pods found" + else now=$(date +%s) - processed=0 - # Process terminating pods - if [ -n "$terminating" ]; then - echo "→ Found terminating pods: $terminating" for pod in $terminating; do deletion_ts=$(kubectl get pod "$pod" -n telemetry -o jsonpath='{.metadata.deletionTimestamp}' 2>/dev/null) if [ -z "$deletion_ts" ]; then @@ -95,72 +89,13 @@ spec: done echo " Cleaned PVCs for $pod" fi - processed=$((processed + 1)) - else - echo "Pod $pod terminating for $age seconds (threshold: ${threshold}s). Skipping." - fi - done else - echo "No terminating pods found" - fi - - # Process CrashLoopBackOff pods - if [ -n "$crashloop" ]; then - echo "→ Found CrashLoopBackOff pods: $crashloop" - for pod in $crashloop; do - restart_count=$(kubectl get pod "$pod" -n telemetry -o jsonpath='{.status.containerStatuses[0].restartCount}' 2>/dev/null || echo 0) - - # Only process if restart count >= 5 (persistent crash loop) - if [ $restart_count -ge 5 ]; then - echo "→ Pod $pod in CrashLoopBackOff ($restart_count restarts). Processing..." - - # Check for tablespace corruption - skip cleanup if detected - if echo "$pod" | grep -q "idrac-telemetry"; then - pod_logs=$(kubectl logs "$pod" -n telemetry -c mysqldb --tail=50 2>/dev/null || echo "") - if echo "$pod_logs" | grep -q "space=4294967294\|nonexisting or being-dropped tablespace\|no existing undo tablespaces\|Data Dictionary initialization failed"; then - echo " → Tablespace corruption detected. Skipping cleanup - PVC recreation required." - processed=$((processed + 1)) - continue - fi - fi - - # Get PVCs - pvcs=$(kubectl get pod "$pod" -n telemetry -o jsonpath='{.spec.volumes[*].persistentVolumeClaim.claimName}' 2>/dev/null) - - # Clean PVCs if any - if [ -n "$pvcs" ]; then - for pvc in $pvcs; do - echo " → Cleaning $pvc..." 
- cleanup_pod="pvc-clean-$RANDOM" - - # Create cleanup pod - remove lock files (including MySQL-specific files) - kubectl run $cleanup_pod --image=busybox:1.36 -n telemetry --restart=Never \ - --overrides="{\"spec\":{\"containers\":[{\"name\":\"cleanup\",\"image\":\"busybox:1.36\",\"command\":[\"sh\",\"-c\",\"echo 'Cleaning lock files in /data...'; find /data -type f \\\\( -name '.lock' -o -name '*.lock' -o -name '*.sock' -o -name '*.pid' -o -name 'ib_buffer_pool' -o -name 'ibtmp1' \\\\) -exec rm -fv {} \\\\; 2>/dev/null || true; echo 'Done'\"],\"volumeMounts\":[{\"name\":\"data\",\"mountPath\":\"/data\"}]}],\"volumes\":[{\"name\":\"data\",\"persistentVolumeClaim\":{\"claimName\":\"$pvc\"}}]}}" \ - 2>/dev/null || echo " Failed to create $cleanup_pod" - - # Wait for it to complete (max 20s), then show logs and delete - if kubectl wait --for=condition=Ready pod/$cleanup_pod -n telemetry --timeout=20s 2>/dev/null; then - kubectl logs $cleanup_pod -n telemetry 2>/dev/null | head -20 - else - echo "$cleanup_pod timed out (PVC may be in use)" - fi - kubectl delete pod $cleanup_pod -n telemetry 2>/dev/null || true - done - echo " Cleaned PVCs for $pod" - fi - - # Delete pod to trigger restart with clean state - kubectl delete pod "$pod" -n telemetry --grace-period=0 --force 2>/dev/null || true - processed=$((processed + 1)) - else - echo "Pod $pod in CrashLoopBackOff but only $restart_count restarts (threshold: 5). Skipping." + echo "Pod $pod terminating for $age seconds (threshold: ${threshold}s). Skipping." 
fi done - else - echo "No CrashLoopBackOff pods found" fi - echo "=== Cleanup complete (processed $processed pods) ===" + echo "Cleanup complete" exit 0 restartPolicy: Never diff --git a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml index be275fd870..ae2a86d511 100644 --- a/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml +++ b/utils/roles/oim_cleanup/oim_container_cleanup/vars/main.yml @@ -59,6 +59,8 @@ openchami_containers: - cloud-init-server - haproxy - coresmd + - coresmd-coredhcp + - coresmd-coredns openchami_volumes: - haproxy-certs @@ -78,6 +80,7 @@ openchami_secrets: - bss_postgres_password tcp_ports: + - 53 - 9000 - 9001 - 5000 @@ -88,6 +91,7 @@ tcp_ports: - 8443 udp_ports: + - 53 - 69 - 67 - 68 From d46e67e84d966c4657ab3ffc4887cea055391461 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 06:31:19 +0530 Subject: [PATCH 07/63] revert variable set --- provision/roles/configure_ochami/vars/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/provision/roles/configure_ochami/vars/main.yml b/provision/roles/configure_ochami/vars/main.yml index 58cab67375..8de6ceb69a 100644 --- a/provision/roles/configure_ochami/vars/main.yml +++ b/provision/roles/configure_ochami/vars/main.yml @@ -110,7 +110,6 @@ dcgm_support: "{{ hostvars['localhost'].get('telemetry_sources', {}).get('dcgm', # NVIDIA DCGM (Data Center GPU Manager) configuration dcgm_service_name: "nvidia-dcgm" dcgm_health_check_retries: 3 -dcgm_support: "{{ telemetry_config.telemetry_sources.dcgm.metrics_enabled | default(true) }}" # Usage: fetch_additional_images.yml input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" From 83bd12c6cfddb5c44046eb781cf8904ce2cc7953 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 06:35:05 +0530 Subject: [PATCH 08/63] revert changes --- 
.../cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 | 2 +- .../ci-group-service_kube_control_plane_first_x86_64.yaml.j2 | 2 +- .../ci-group-service_kube_control_plane_x86_64.yaml.j2 | 2 +- .../cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 | 2 +- .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 | 2 +- .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 3de6f1e35d..1750e50be0 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -332,4 +332,4 @@ # nvidia sdk install - /usr/local/bin/install_nvhpc_sdk.sh - /usr/local/bin/configure_nvhpc_env.sh - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." \ No newline at end of file diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index dfc1b035c2..0d01edee47 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -1108,4 +1108,4 @@ kubectl get pods --all-namespaces -o wide echo "Cloud-Init finished successfully after the reboot." 
- fi \ No newline at end of file + fi diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index 7b81a6dfe3..71f8be3033 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -524,4 +524,4 @@ echo "Listing all Kubernetes pods in all namespaces:" kubectl get pods --all-namespaces -o wide echo "Cloud-Init finished successfully after the reboot." - fi \ No newline at end of file + fi diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index 283282e1de..e363187b58 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -318,4 +318,4 @@ # CRI and kubelet already enabled above systemctl status kubelet echo "Cloud-Init finished successfully after the reboot." - fi \ No newline at end of file + fi diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 76fb7bfcc3..2f0c16b577 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -561,4 +561,4 @@ {% endif %} - systemctl restart slurmdbd - systemctl restart slurmctld - - echo "Cloud-Init has completed successfully." 
\ No newline at end of file + - echo "Cloud-Init has completed successfully." diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 9540b404d3..401108acae 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -705,4 +705,4 @@ {% endif %} - systemctl restart slurmd - - echo "Cloud-Init has completed successfully." \ No newline at end of file + - echo "Cloud-Init has completed successfully." From a09467e2d924f26a2bb184a001367733422b5266 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 06:36:41 +0530 Subject: [PATCH 09/63] revert changes --- .../pxe_mapping_file.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv index 87ff7ced94..295e7615af 100644 --- a/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv +++ b/examples/catalog/mapping_file_software_config/catalog_rhel_aarch64_with_slurm_only_json/pxe_mapping_file.csv @@ -3,4 +3,4 @@ slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172 slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43,94:6d:ae:03:00:8c:10:8c,192.168.0.101 slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44,94:6d:ae:03:00:8c:11:fc,192.168.0.102 
login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41,94:6d:ae:03:00:8c:12:3d,192.168.0.103 -login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 +login_node_x86_64,grp9,ABFG78,,login-node1,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42,94:6d:ae:03:00:8c:12:4e,192.168.0.104 \ No newline at end of file From 45346750f7cd28c01fdd0ff4d344dad8a0bb6df2 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 06:40:18 +0530 Subject: [PATCH 10/63] pylint fixes --- .../common_utils/en_us_validation_msg.py | 127 +----------------- 1 file changed, 2 insertions(+), 125 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index f8adccfb7a..71fd578080 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -306,6 +306,7 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length): "software_config.json and rerun the playbook." ) +# pylint: disable=invalid-name # PowerScale telemetry validation messages POWERSCALE_VICTORIA_REQUIRED_MSG = ( "PowerScale telemetry requires VictoriaMetrics to be deployed. " @@ -430,131 +431,7 @@ def powerscale_auth_image_version_mismatch_msg( "image versions. Please ensure the file exists at " "input/config/x86_64/rhel/10.0/csi_driver_powerscale.json." ) - -# PowerScale telemetry validation messages -POWERSCALE_VICTORIA_REQUIRED_MSG = ( - "PowerScale telemetry requires VictoriaMetrics to be deployed. 
" - "When telemetry_sources.powerscale.metrics_enabled is true, " - "'victoria_metrics' must be included in collection_targets " - "(e.g., 'victoria_metrics' or 'victoria_metrics,victoria_logs')." -) -POWERSCALE_VICTORIA_LOGS_REQUIRED_MSG = ( - "PowerScale logs collection requires VictoriaLogs to be deployed. " - "When telemetry_sources.powerscale.logs_enabled is true, " - "'victoria_logs' must be included in collection_targets " - "(e.g., 'victoria_metrics,victoria_logs')." -) -POWERSCALE_SYSLOG_SOURCE_IP_INVALID_MSG = ( - "Invalid IP address in powerscale_syslog_source_ips. " - "Each entry must be a valid IPv4 address (e.g., '192.168.55.11')." -) -POWERSCALE_CSI_DRIVER_MISSING_MSG = ( - "csi_driver_powerscale is not configured in software_config.json. " - "PowerScale telemetry requires the CSI driver for PowerScale to be configured." -) -POWERSCALE_SERVICE_CLUSTER_MISSING_MSG = ( - "service cluster is not defined in functional_groups_config.yml. " - "PowerScale telemetry requires a service cluster." -) -POWERSCALE_CONFIGURATIONS_MISSING_MSG = ( - "powerscale_configurations section is required when " - "telemetry_sources.powerscale.metrics_enabled is true. " - "It must contain csm_observability_values_file_path." -) -POWERSCALE_OTEL_STORAGE_SIZE_INVALID_MSG = ( - "must be a non-empty string in format 'XGi' (e.g., '5Gi')" -) -POWERSCALE_CSM_VALUES_PATH_REQUIRED_MSG = ( - "csm_observability_values_file_path is required when " - "telemetry_sources.powerscale.metrics_enabled is true. " - "Please provide the path to the CSM Observability values.yaml file." -) -POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG = ( - "karaviMetricsPowerscale.authorization.proxyHost is required in the " - "CSM Observability values file when " - "karaviMetricsPowerscale.authorization.enabled is true. " - "Please provide the hostname or IP of the CSM Authorization Proxy server." 
-) -def powerscale_csm_values_not_found_msg(path): - """Returns error message when CSM Observability values.yaml file is not found.""" - return ( - f"CSM Observability values.yaml file not found at '{path}'. " - "Please verify the file path is correct." - ) -POWERSCALE_CSM_VALUES_INVALID_YAML_MSG = ( - "CSM Observability values.yaml must contain a valid YAML dictionary." -) -def powerscale_csm_values_parse_error_msg(error): - """Returns error message when CSM Observability values.yaml fails to parse.""" - return f"Failed to parse CSM Observability values.yaml: {error}" -POWERSCALE_CSM_VALUES_MISSING_KARAVI_SECTION_MSG = ( - "CSM Observability values.yaml is missing 'karaviMetricsPowerscale' section." -) -POWERSCALE_CSM_METRICS_IMAGE_MISSING_MSG = ( - "CSM Metrics PowerScale image is required in CSM Observability values.yaml." -) -POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG = ( - "OTEL Collector image is required in CSM Observability values.yaml." -) -POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG = ( - "Each additional_remote_write_endpoint must have a non-empty 'url' field." -) -POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG = ( - "URL must start with 'http://' or 'https://'." -) -def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_image): - """Returns error message when CSM values.yaml image version doesn't match service_k8s.json.""" - return ( - f"Image version mismatch for '{image_name}': " - f"CSM Observability values.yaml has '{values_image}' but " - f"service_k8s.json has '{service_k8s_image}'. " - f"Please update service_k8s.json to match the values.yaml version " - f"and re-run local_repo.yml to mirror the correct image to Pulp." - ) - -# PowerScale CSM Authorization validation messages -POWERSCALE_AUTH_CSI_DRIVER_MISSING_MSG = ( - "PowerScale CSM Authorization requires 'csi_driver_powerscale' to be present in software_config.json." 
-) -POWERSCALE_AUTH_SERVICE_CLUSTER_MISSING_MSG = ( - "PowerScale CSM Authorization requires service cluster nodes " - "(service_kube_node_*, service_kube_control_plane_*) to be defined " - "in the PXE mapping file." -) -POWERSCALE_AUTH_CSM_VALUES_PATH_REQUIRED_MSG = ( - "csm_authorization_values_file_path is required when powerscale_authorization.enabled is true." -) -def powerscale_auth_csm_values_not_found_msg(path): - """Returns error message when CSM Authorization values.yaml file is not found.""" - return ( - f"CSM Authorization values file does not exist at path: {path}. " - "Please verify the file path is correct." - ) -def powerscale_auth_csm_values_validation_error_msg(error): - """Returns error message when CSM Authorization values.yaml validation fails.""" - return f"Error validating CSM Authorization image versions: {error}" -POWERSCALE_AUTH_TENANTS_REQUIRED_MSG = ( - "At least one tenant must be defined when powerscale_authorization.enabled is true." -) -def powerscale_auth_tenant_roles_required_msg(tenant_name): - """Returns error message when a tenant has no roles defined.""" - return ( - f"At least one role must be defined for tenant '{tenant_name}'." - ) -def powerscale_auth_image_version_mismatch_msg( - image_name, values_version, csi_version -): - """Returns error message when CSM Authorization image version doesn't match csi_driver_powerscale.json.""" - return ( - f"Image version for {image_name} in CSM Authorization values.yaml " - f"({values_version}) does not match csi_driver_powerscale.json " - f"({csi_version}). Please ensure both files use the same version." - ) -POWERSCALE_AUTH_CSI_JSON_NOT_FOUND_MSG = ( - "csi_driver_powerscale.json not found. Cannot validate CSM Authorization " - "image versions. Please ensure the file exists at " - "input/config/x86_64/rhel/10.0/csi_driver_powerscale.json." 
-) +# pylint: enable=invalid-name def boolean_fail_msg(value): """Returns a formatted message indicating boolean_fail_msg.""" From fb86981cb1eb91ac18f03141ddab96e74bb1c004 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 06:49:48 +0530 Subject: [PATCH 11/63] ansible lint fixes --- build_image_x86_64/roles/fetch_packages/vars/main.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/build_image_x86_64/roles/fetch_packages/vars/main.yml b/build_image_x86_64/roles/fetch_packages/vars/main.yml index a604e9b815..894baadb7c 100644 --- a/build_image_x86_64/roles/fetch_packages/vars/main.yml +++ b/build_image_x86_64/roles/fetch_packages/vars/main.yml @@ -25,15 +25,14 @@ x86_64_build_image_completion_msg: | The playbook build_image_x86_64.yml has been completed successfully. To build image for aarch64 nodes execute build_image_aarch64/build_image_aarch64.yml playbook. To boot x86_64 nodes execute discovery/discovery.yml playbook with discovery_mechanism parameter. - + Usage: ansible-playbook discovery.yml -e discovery_mechanism= - Supported discovery mechanisms: - ome : Dell OpenManage Enterprise (OME) - magellan : Magellan - + Otherwise, provide PXE mapping and execute provision/provision.yml. - + functional_group_absent_msg: | Failure: No x86_64 functional groups found in functional_group_config.yml input file. 
Please make sure x86_64 functional_group should be present in input file functional_group_config.yml From b0d44cc67970a3fe3793374d4357823513616cbd Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 11:10:43 +0530 Subject: [PATCH 12/63] updating completion messaage --- build_image_x86_64/roles/fetch_packages/vars/main.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/build_image_x86_64/roles/fetch_packages/vars/main.yml b/build_image_x86_64/roles/fetch_packages/vars/main.yml index 894baadb7c..59e67fa991 100644 --- a/build_image_x86_64/roles/fetch_packages/vars/main.yml +++ b/build_image_x86_64/roles/fetch_packages/vars/main.yml @@ -24,14 +24,7 @@ software_config_file_path: "{{ input_project_dir }}/software_config.json" x86_64_build_image_completion_msg: | The playbook build_image_x86_64.yml has been completed successfully. To build image for aarch64 nodes execute build_image_aarch64/build_image_aarch64.yml playbook. - To boot x86_64 nodes execute discovery/discovery.yml playbook with discovery_mechanism parameter. - - Usage: ansible-playbook discovery.yml -e discovery_mechanism= - Supported discovery mechanisms: - - ome : Dell OpenManage Enterprise (OME) - - magellan : Magellan - - Otherwise, provide PXE mapping and execute provision/provision.yml. + To boot x86_64 nodes execute provision/provision.yml playbook. functional_group_absent_msg: | Failure: No x86_64 functional groups found in functional_group_config.yml input file. 
From 61d8b68c2d018c2acae9732f537f6bdb048c5719 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 11:38:25 +0530 Subject: [PATCH 13/63] telemetry validation while prepare oim --- prepare_oim/prepare_oim.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index f26e280126..31ea49cc3c 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -89,7 +89,7 @@ name: telemetry_config when: telemetry_config_stat.stat.exists - - name: Add telemetry tag if idrac_telemetry_support is enabled + - name: Add telemetry tag if any telemetry source is enabled ansible.builtin.set_fact: omnia_run_tags: >- {{ @@ -97,7 +97,11 @@ }} when: - telemetry_config_stat.stat.exists - - telemetry_config.idrac_telemetry_support | default(false) | bool + - >- + telemetry_config.idrac_telemetry_support | default(false) | bool or + (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.dcgm.metrics_enabled | default(false) | bool) - name: Check discovery configuration for OME block: From f91f826d73d4ffd9c074ff6df67fd36af3466119 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 11:59:00 +0530 Subject: [PATCH 14/63] update condition --- prepare_oim/prepare_oim.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index 31ea49cc3c..d1dc6d62fe 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -98,7 +98,7 @@ when: - telemetry_config_stat.stat.exists - >- - telemetry_config.idrac_telemetry_support | default(false) | bool or + (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or 
(telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or (telemetry_config.telemetry_sources.dcgm.metrics_enabled | default(false) | bool) From e8ba7258c875cae79c8e59cfcd1b09d714e59f2d Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Tue, 5 May 2026 12:04:21 +0530 Subject: [PATCH 15/63] added check for LDMS --- prepare_oim/prepare_oim.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index d1dc6d62fe..842c5af11a 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -99,6 +99,7 @@ - telemetry_config_stat.stat.exists - >- (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or (telemetry_config.telemetry_sources.dcgm.metrics_enabled | default(false) | bool) From 65dd6bdc7273ecfaaa325cfb8fb119415903e1f7 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 6 May 2026 06:25:36 +0530 Subject: [PATCH 16/63] Fix for crashloopback state on node reboot --- .../victoria-operator-vmcluster.yaml.j2 | 48 +++++++++++++++++-- .../victorialogs-operator-vlcluster.yaml.j2 | 43 +++++++++++++++++ 2 files changed, 88 insertions(+), 3 deletions(-) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 index 8d181a8225..f137f00f74 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 +++ 
b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 @@ -31,7 +31,19 @@ spec: repository: {{ victoria_cluster.vmstorage.image.split(':')[0] }} tag: {{ victoria_cluster.vmstorage.image.split(':')[1] }} pullPolicy: IfNotPresent - + + # Graceful shutdown - gives storage engine time to flush data and clean up lock files + terminationGracePeriodSeconds: 120 + + # Init container to remove stale lock files after node reboot + initContainers: + - name: remove-stale-locks + image: busybox:1.35 + command: ["sh", "-c", "rm -f /vmstorage-data/flock.lock || true"] + volumeMounts: + - name: vmstorage-db + mountPath: /vmstorage-data + # Storage configuration per pod storageDataPath: /vmstorage-data storage: @@ -43,7 +55,7 @@ spec: requests: storage: {{ telemetry_config.telemetry_sinks.victoria_metrics.persistence_size }} - # Resource limits + # Resource limits - Guaranteed QoS to prevent OOM on reconnection resources: requests: memory: {{ victoria_cluster.vmstorage.resources.requests.memory }} @@ -51,7 +63,37 @@ spec: limits: memory: {{ victoria_cluster.vmstorage.resources.limits.memory }} cpu: {{ victoria_cluster.vmstorage.resources.limits.cpu }} - + + # Startup probe - gives pod up to 300s to initialize (30 x 10s) + startupProbe: + httpGet: + path: /health + port: 8482 + scheme: HTTPS + initialDelaySeconds: 30 + periodSeconds: 10 + failureThreshold: 30 + timeoutSeconds: 5 +{% if victoria_cluster.tls_enabled %} + tlsConfig: + insecureSkipVerify: true +{% endif %} + + # Readiness probe - increased tolerance for WAL replay + readinessProbe: + httpGet: + path: /health + port: 8482 + scheme: HTTPS + initialDelaySeconds: 15 + periodSeconds: 5 + failureThreshold: 10 + timeoutSeconds: 5 +{% if victoria_cluster.tls_enabled %} + tlsConfig: + insecureSkipVerify: true +{% endif %} + # Pod anti-affinity affinity: podAntiAffinity: diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 
b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 index 0e8c893e1c..e0986163d8 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 @@ -36,6 +36,18 @@ spec: tag: {{ victoria_logs_cluster.vlstorage.image.split(':')[1] }} pullPolicy: IfNotPresent + # Graceful shutdown - gives storage engine time to flush data and clean up lock files + terminationGracePeriodSeconds: 120 + + # Init container to remove stale lock files after node reboot + initContainers: + - name: remove-stale-locks + image: busybox:1.35 + command: ["sh", "-c", "rm -f /vlstorage-data/flock.lock || true"] + volumeMounts: + - name: vlstorage-db + mountPath: /vlstorage-data + # Ports are managed by operator defaults (9491, 9400, 9401) storageDataPath: /vlstorage-data @@ -48,6 +60,7 @@ spec: requests: storage: {{ telemetry_config.telemetry_sinks.victoria_logs.storage_size }} + # Resource limits - Guaranteed QoS to prevent OOM on reconnection resources: requests: memory: {{ victoria_logs_cluster.vlstorage.resources.requests.memory }} @@ -56,6 +69,36 @@ spec: memory: {{ victoria_logs_cluster.vlstorage.resources.limits.memory }} cpu: {{ victoria_logs_cluster.vlstorage.resources.limits.cpu }} + # Startup probe - gives pod up to 300s to initialize (30 x 10s) + startupProbe: + httpGet: + path: /health + port: 9491 + scheme: HTTPS + initialDelaySeconds: 30 + periodSeconds: 10 + failureThreshold: 30 + timeoutSeconds: 5 +{% if victoria_logs_cluster.tls_enabled %} + tlsConfig: + insecureSkipVerify: true +{% endif %} + + # Readiness probe - increased tolerance for WAL replay + readinessProbe: + httpGet: + path: /health + port: 9491 + scheme: HTTPS + initialDelaySeconds: 15 + periodSeconds: 5 + failureThreshold: 10 + timeoutSeconds: 5 +{% if victoria_logs_cluster.tls_enabled %} + tlsConfig: + insecureSkipVerify: true 
+{% endif %} + affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: From 6842efdbd8176fb4e259cef42fd52e878f5d662e Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 7 May 2026 05:14:01 +0530 Subject: [PATCH 17/63] addressing review comment to move into vars --- .../victoria-operator-vmcluster.yaml.j2 | 29 +++--- .../victorialogs-operator-vlcluster.yaml.j2 | 24 ++--- provision/roles/telemetry/vars/main.yml | 89 ++++++++++++------- 3 files changed, 86 insertions(+), 56 deletions(-) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 index f137f00f74..ac5f3f4256 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmcluster.yaml.j2 @@ -32,13 +32,14 @@ spec: tag: {{ victoria_cluster.vmstorage.image.split(':')[1] }} pullPolicy: IfNotPresent - # Graceful shutdown - gives storage engine time to flush data and clean up lock files - terminationGracePeriodSeconds: 120 + # Graceful shutdown - gives VMStorage time to flush data and clean up lock files + terminationGracePeriodSeconds: {{ victoria_cluster.termination_grace_period }} - # Init container to remove stale lock files after node reboot + # Init container to remove stale VMStorage lock files after node reboot + # Required because POSIX flock is not reliably released on NFS-backed PVCs initContainers: - name: remove-stale-locks - image: busybox:1.35 + image: "{{ telemetry_images['docker.io/library/busybox'] | default('docker.io/library/busybox:1.36') }}" command: ["sh", "-c", "rm -f /vmstorage-data/flock.lock || true"] volumeMounts: - name: vmstorage-db @@ -70,10 +71,10 @@ spec: path: /health port: 8482 scheme: HTTPS - initialDelaySeconds: 30 - periodSeconds: 10 - 
failureThreshold: 30 - timeoutSeconds: 5 + initialDelaySeconds: {{ victoria_cluster.startup_probe.initial_delay }} + periodSeconds: {{ victoria_cluster.startup_probe.period }} + failureThreshold: {{ victoria_cluster.startup_probe.failure_threshold }} + timeoutSeconds: {{ victoria_cluster.startup_probe.timeout }} {% if victoria_cluster.tls_enabled %} tlsConfig: insecureSkipVerify: true @@ -85,10 +86,10 @@ spec: path: /health port: 8482 scheme: HTTPS - initialDelaySeconds: 15 - periodSeconds: 5 - failureThreshold: 10 - timeoutSeconds: 5 + initialDelaySeconds: {{ victoria_cluster.readiness_probe.initial_delay }} + periodSeconds: {{ victoria_cluster.readiness_probe.period }} + failureThreshold: {{ victoria_cluster.readiness_probe.failure_threshold }} + timeoutSeconds: {{ victoria_cluster.readiness_probe.timeout }} {% if victoria_cluster.tls_enabled %} tlsConfig: insecureSkipVerify: true @@ -113,11 +114,11 @@ spec: - effect: NoExecute key: node.kubernetes.io/not-ready operator: Exists - tolerationSeconds: 5 + tolerationSeconds: {{ victoria_cluster.toleration_seconds }} - effect: NoExecute key: node.kubernetes.io/unreachable operator: Exists - tolerationSeconds: 5 + tolerationSeconds: {{ victoria_cluster.toleration_seconds }} {% if victoria_cluster.vmstorage.dedup_min_scrape_interval or victoria_cluster.tls_enabled %} extraArgs: diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 index e0986163d8..e25ac73265 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlcluster.yaml.j2 @@ -37,12 +37,12 @@ spec: pullPolicy: IfNotPresent # Graceful shutdown - gives storage engine time to flush data and clean up lock files - terminationGracePeriodSeconds: 120 + terminationGracePeriodSeconds: {{ 
victoria_logs_cluster.vlstorage.termination_grace_period }} # Init container to remove stale lock files after node reboot initContainers: - name: remove-stale-locks - image: busybox:1.35 + image: "{{ telemetry_images['docker.io/library/busybox'] | default('docker.io/library/busybox:1.36') }}" command: ["sh", "-c", "rm -f /vlstorage-data/flock.lock || true"] volumeMounts: - name: vlstorage-db @@ -75,10 +75,10 @@ spec: path: /health port: 9491 scheme: HTTPS - initialDelaySeconds: 30 - periodSeconds: 10 - failureThreshold: 30 - timeoutSeconds: 5 + initialDelaySeconds: {{ victoria_logs_cluster.vlstorage.startup_probe.initial_delay }} + periodSeconds: {{ victoria_logs_cluster.vlstorage.startup_probe.period }} + failureThreshold: {{ victoria_logs_cluster.vlstorage.startup_probe.failure_threshold }} + timeoutSeconds: {{ victoria_logs_cluster.vlstorage.startup_probe.timeout }} {% if victoria_logs_cluster.tls_enabled %} tlsConfig: insecureSkipVerify: true @@ -90,10 +90,10 @@ spec: path: /health port: 9491 scheme: HTTPS - initialDelaySeconds: 15 - periodSeconds: 5 - failureThreshold: 10 - timeoutSeconds: 5 + initialDelaySeconds: {{ victoria_logs_cluster.vlstorage.readiness_probe.initial_delay }} + periodSeconds: {{ victoria_logs_cluster.vlstorage.readiness_probe.period }} + failureThreshold: {{ victoria_logs_cluster.vlstorage.readiness_probe.failure_threshold }} + timeoutSeconds: {{ victoria_logs_cluster.vlstorage.readiness_probe.timeout }} {% if victoria_logs_cluster.tls_enabled %} tlsConfig: insecureSkipVerify: true @@ -116,11 +116,11 @@ spec: - effect: NoExecute key: node.kubernetes.io/not-ready operator: Exists - tolerationSeconds: 5 + tolerationSeconds: {{ victoria_logs_cluster.vlstorage.toleration_seconds }} - effect: NoExecute key: node.kubernetes.io/unreachable operator: Exists - tolerationSeconds: 5 + tolerationSeconds: {{ victoria_logs_cluster.vlstorage.toleration_seconds }} extraArgs: retentionPeriod: "{{ 
telemetry_config.telemetry_sinks.victoria_logs.retention_period }}h" diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index a1c8f2fad9..14388def4f 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -107,6 +107,19 @@ victoria_cluster: # true = cluster mode, false = single-node mode enabled: "{{ true if victoria_deployment_mode == 'cluster' else false }}" tls_enabled: true # Set to true to enable TLS for cluster components + # Health probe and timing configuration + termination_grace_period: 120 + startup_probe: + initial_delay: 30 + period: 10 + failure_threshold: 30 + timeout: 5 + readiness_probe: + initial_delay: 15 + period: 5 + failure_threshold: 10 + timeout: 5 + toleration_seconds: 5 # VMStorage: Stores raw data and returns query results vmstorage: replicas: 3 @@ -178,6 +191,18 @@ victoria_logs_cluster: limits: memory: "2Gi" cpu: "1000m" + termination_grace_period: 120 + startup_probe: + initial_delay: 30 + period: 10 + failure_threshold: 30 + timeout: 5 + readiness_probe: + initial_delay: 15 + period: 5 + failure_threshold: 10 + timeout: 5 + toleration_seconds: 5 # vlinsert: Log ingestion gateway (Deployment managed by operator via VLCluster CR) vlinsert: @@ -191,6 +216,17 @@ victoria_logs_cluster: limits: memory: "1Gi" cpu: "1000m" + liveness_probe: + initial_delay: 30 + period: 30 + failure_threshold: 3 + timeout: 5 + readiness_probe: + initial_delay: 5 + period: 10 + failure_threshold: 3 + timeout: 5 + termination_grace_period: 30 # vlselect: Log query gateway (Deployment managed by operator via VLCluster CR) vlselect: @@ -209,6 +245,18 @@ victoria_logs_cluster: replicas: 1 image: "{{ telemetry_images['victoriametrics/vlagent'] | default('docker.io/victoriametrics/vlagent:v1.49.0') }}" pvc_size: "5Gi" # Buffer storage for retry during vlinsert unavailability + termination_grace_period: 30 + liveness_probe: + initial_delay: 30 + period: 30 + failure_threshold: 3 
+ timeout: 5 + readiness_probe: + initial_delay: 5 + period: 10 + failure_threshold: 3 + timeout: 5 + toleration_seconds: 5 resources: requests: memory: "128Mi" @@ -238,25 +286,12 @@ victoria_tls_cert_days: 3650 victoria_cert_dir: "{{ telemetry_share_path }}/victoria-certs" syslog_tls_cert_dir: "{{ telemetry_share_path }}/syslog-tls-certs" -# PowerScale syslog source IPs requirement message -powerscale_syslog_source_ips_msg: >- - powerscale_configurations.syslog_source_ips is required when logs_enabled is true. - Provide the IP(s) from which PowerScale sends syslog. - This is often the data pool IP (e.g., 40gige-1 interface), NOT the CSI management IP. - Check OneFS Network Configuration → External Network → Pool IPs. - -# PowerScale syslog source IPs display message -powerscale_syslog_source_ips_display_msg: >- - PowerScale syslog source IP(s): {{ powerscale_management_ips | join(', ') }}. - rsyslog $fromhost-ip filter will match these IPs and forward to VLAgent. - # PowerScale log configuration status message powerscale_log_config_status_msg: >- PowerScale syslog configuration script staged on NFS share. - During cloud-init, the script will configure rsyslog on all K8s nodes, - open firewall port 514/udp, and forward PowerScale syslog to VLAgent. - For complete PowerScale syslog configuration details, including manual - PowerScale setup steps, refer to Omnia documentation. + During cloud-init, VLAgent LoadBalancer IP will be retrieved and + PowerScale configuration instructions will be provided. + PowerScale sends syslog directly to VLAgent LoadBalancer IP:514 (UDP/TCP). 
# PowerScale VictoriaLogs validation fail message powerscale_victoria_logs_validation_fail_msg: >- @@ -313,6 +348,8 @@ common_mode: "0755" victoria_templates_common: - src: 'telemetry/victoria/victoria-vmagent-rbac.yaml.j2' dest: 'victoria-vmagent-rbac.yaml' + - src: 'telemetry/victoria/vmagent-scrape-config.yaml.j2' + dest: 'vmagent-scrape-config.yaml' # Operator-based templates (new default) # Single-node operator template (used when victoria_cluster.enabled: false) @@ -323,8 +360,6 @@ victoria_templates_operator_single: dest: 'victoria-operator-vmagent.yaml' - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' dest: 'victoria-operator-vmpodscrape.yaml' - - src: 'telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2' - dest: 'victoria-operator-vmservicescrape-powerscale.yaml' # Cluster operator template (used when victoria_cluster.enabled: true) victoria_templates_operator_cluster: @@ -334,8 +369,6 @@ victoria_templates_operator_cluster: dest: 'victoria-operator-vmagent.yaml' - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' dest: 'victoria-operator-vmpodscrape.yaml' - - src: 'telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2' - dest: 'victoria-operator-vmservicescrape-powerscale.yaml' # Legacy manual deployment templates (removed - use operator-based templates above) # Raw victoria-cluster-vminsert/vmselect/vmstorage.yaml.j2 files have been removed @@ -520,23 +553,19 @@ ps_dependency_fail_msg: >- # ============================================================================ # Usage: deploy_powerscale_logs.yml # Gated by: telemetry_sources.powerscale.logs_enabled -# Pipeline: PowerScale (UDP:514) → K8s rsyslog → VLAgent LB (TCP:514) → VictoriaLogs -# PS syslog source IPs from user config (powerscale_configurations.syslog_source_ips) -# NOTE: Syslog source IP != CSI endpoint (mgmt IP). Often the data pool IP (40gige-1). 
+# Pipeline: PowerScale (UDP/TCP:514) → VLAgent LoadBalancer IP:514 → VictoriaLogs configure_ps_syslog_template: "{{ role_path }}/../configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2" ps_log_enabled_msg: >- PowerScale log collection enabled (telemetry_sources.powerscale.logs_enabled: true). - rsyslog will be configured on all K8s nodes to relay PowerScale audit syslog to VLAgent. - Syslog source IPs from powerscale_configurations.syslog_source_ips. + VLAgent will be deployed with LoadBalancer service to receive PowerScale syslog. + PowerScale sends syslog directly to VLAgent LoadBalancer IP:514 (UDP/TCP). ps_log_disabled_msg: >- PowerScale log collection disabled (telemetry_sources.powerscale.logs_enabled: false). - rsyslog relay will not be configured. VLAgent will not include PowerScale labels. + VLAgent will not be configured for PowerScale syslog collection. ps_log_deployed_msg: >- PowerScale syslog configuration script staged on NFS share. - During cloud-init, the script will configure rsyslog on all K8s nodes, - open firewall port 514/udp, and forward PowerScale syslog to VLAgent. - For complete PowerScale syslog configuration details, including manual - PowerScale setup steps, refer to Omnia documentation. + During cloud-init, VLAgent LoadBalancer IP will be retrieved and + PowerScale configuration instructions will be provided. 
# Vector Kafka-to-Victoria Ingestion Pipeline Configuration # ============================================================================ # Usage: deploy_vector_ldms.yml, deploy_vector_ome.yml From 304026b2a031f8588c559b36a0c85f6252d02948 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 7 May 2026 05:25:45 +0530 Subject: [PATCH 18/63] remove rsyslog layer, update vmscraper and enabled external health monitor of csi driver --- .../common_utils/en_us_validation_msg.py | 4 - .../schema/telemetry_config.json | 9 - .../powerscale_telemetry_validation.py | 16 - input/telemetry_config.yml | 21 -- .../configure_powerscale_syslog.sh.j2 | 351 ++++++------------ ...t.sh.j2 => verify_powerscale_syslog.sh.j2} | 2 +- .../tasks/validate_telemetry_config.yml | 12 + .../roles/provision_validations/vars/main.yml | 7 + .../tasks/deploy_powerscale_logs.yml | 34 +- .../tasks/derive_sink_support_flags.yml | 1 - .../templates/telemetry/kustomization.yaml.j2 | 4 +- .../victoria-operator-vmagent.yaml.j2 | 9 +- ...perator-vmservicescrape-powerscale.yaml.j2 | 50 --- .../victorialogs-operator-vlagent.yaml.j2 | 28 +- .../victoria/vmagent-scrape-config.yaml.j2 | 42 ++- 15 files changed, 210 insertions(+), 380 deletions(-) rename provision/roles/configure_ochami/templates/powerscale/{verify_powerscale_syslog_ut.sh.j2 => verify_powerscale_syslog.sh.j2} (97%) delete mode 100644 provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 71fd578080..8404a743e9 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -320,10 +320,6 @@ def 
switch_snmp3_username_fail_msg(min_username_length, max_length): "'victoria_logs' must be included in collection_targets " "(e.g., 'victoria_metrics,victoria_logs')." ) -POWERSCALE_SYSLOG_SOURCE_IP_INVALID_MSG = ( - "Invalid IP address in powerscale_syslog_source_ips. " - "Each entry must be a valid IPv4 address (e.g., '192.168.55.11')." -) POWERSCALE_CSI_DRIVER_MISSING_MSG = ( "csi_driver_powerscale is not configured in software_config.json. " "PowerScale telemetry requires the CSI driver for PowerScale to be configured." diff --git a/common/library/module_utils/input_validation/schema/telemetry_config.json b/common/library/module_utils/input_validation/schema/telemetry_config.json index 124aaaa543..4bf4de865e 100644 --- a/common/library/module_utils/input_validation/schema/telemetry_config.json +++ b/common/library/module_utils/input_validation/schema/telemetry_config.json @@ -374,15 +374,6 @@ "required": ["url"] }, "description": "Additional victoria_metrics remote_write endpoints." - }, - "syslog_source_ips": { - "type": "array", - "default": [], - "items": { - "type": "string", - "format": "ipv4" - }, - "description": "PowerScale IP address(es) from which syslog packets arrive. Optional: if empty, rsyslog accepts syslog from any source IP. If provided, rsyslog filters by these IPs for security." 
} }, "required": ["otel_collector_storage_size", "csm_observability_values_file_path"] diff --git a/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py b/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py index 8ac4fa1824..312e859875 100644 --- a/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py @@ -341,22 +341,6 @@ def validate_powerscale_telemetry_config( powerscale_collection_targets, en_us_validation_msg.POWERSCALE_VICTORIA_LOGS_REQUIRED_MSG )) - # Validate syslog_source_ips when logs_enabled (optional field) - # If empty, rsyslog will accept from any source IP - syslog_source_ips = powerscale_config.get( - "syslog_source_ips", [] - ) - # Only validate IP format if provided (not required) - if syslog_source_ips and len(syslog_source_ips) > 0: - for idx, ip_str in enumerate(syslog_source_ips): - try: - ipaddress.ip_address(str(ip_str).strip()) - except ValueError: - errors.append(create_error_msg( - f"powerscale_configurations.syslog_source_ips[{idx}]", - ip_str, - en_us_validation_msg.POWERSCALE_SYSLOG_SOURCE_IP_INVALID_MSG - )) # Validate additional_remote_write_endpoints # (applies to metrics deployment) diff --git a/input/telemetry_config.yml b/input/telemetry_config.yml index be63e2a1d7..acacf2c26b 100644 --- a/input/telemetry_config.yml +++ b/input/telemetry_config.yml @@ -331,24 +331,3 @@ powerscale_configurations: # Additional victoria_metrics remote_write endpoints (optional) # Default: [] (empty — only the primary Omnia victoria_metrics endpoint is used) additional_remote_write_endpoints: [] - - # -------------------------------------------------------------------------- - # PowerScale Syslog Log Collection - # -------------------------------------------------------------------------- - # When 
telemetry_sources.powerscale.logs_enabled is true, Omnia configures - # rsyslog on all K8s nodes to receive and relay PowerScale audit syslog. - # - # IMPORTANT: The syslog source IP is the IP address PowerScale uses to SEND - # syslog packets. This is NOT necessarily the CSI driver endpoint (management IP). - # Check OneFS Network Configuration → External Network → Pool IPs to find - # which pool/interface PowerScale will route syslog traffic from. - # - # Example: OneFS management IP = 192.168.55.2 (pool0, mgmt-1 interface) - # OneFS data/NFS IP = 192.168.55.11 (pool1, 40gige-1 interface) - # Syslog may be sent from 192.168.55.11, NOT 192.168.55.2 - # - # Example: - # syslog_source_ips: - # - "192.168.55.11" # cluster1 data pool IP - # - "192.168.55.20" # cluster2 data pool IP - syslog_source_ips: [] diff --git a/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 index 1919018cd3..4f2d2b4815 100644 --- a/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 +++ b/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 @@ -13,35 +13,36 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# PowerScale Syslog Collection — rsyslog relay + VLAgent +# PowerScale Syslog Collection — Direct to VLAgent LoadBalancer # # DATA PIPELINE: -# PowerScale (UDP/TCP:514) -> rsyslog on K8s nodes (KUBE_VIP:514) -> VLAgent (LoadBalancer IP:514) -> VictoriaLogs +# PowerScale (UDP/TCP:514) -> VLAgent LoadBalancer IP:514 -> VictoriaLogs set -euo pipefail NAMESPACE="{{ telemetry_namespace }}" -KUBE_VIP="{{ kube_vip }}" -POWERSCALE_IPS=( -{% for ip in powerscale_management_ips %} - "{{ ip }}" -{% endfor %} -) +# Read PowerScale credentials from CSI secret.yaml +SECRET_FILE="{{ hostvars['localhost']['k8s_client_share_path'] }}/csi-driver-powerscale/secret.yaml" +if [ -f "$SECRET_FILE" ]; then + echo "Reading credentials from CSI secret.yaml" + CSI_USERNAME=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'username:' | head -1 | awk -F':' '{gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2); print $2}' | base64 --decode 2>/dev/null) + CSI_PASSWORD=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'password:' | head -1 | awk -F':' '{gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2); print $2}' | base64 --decode 2>/dev/null) +else + echo "ERROR: CSI secret file not found: $SECRET_FILE" + exit 1 +fi echo "==========================================" echo "PowerScale Syslog Collection Setup" echo "==========================================" -echo "Pipeline: PowerScale (UDP/TCP:514) -> rsyslog (KUBE_VIP:514) -> VLAgent (ClusterIP:514) -> VictoriaLogs" -{% raw %} -echo "PowerScale Syslog Source IPs: ${POWERSCALE_IPS[*]}" -{% endraw %} +echo "Pipeline: PowerScale (UDP/TCP:514) -> VLAgent LoadBalancer -> VictoriaLogs" echo "" # ============================================================================ -# Phase 1: Get VLAgent ClusterIP for internal forwarding +# Phase 1: Get VLAgent LoadBalancer IP # ============================================================================ -echo "===== Phase 1: Getting VLAgent ClusterIP =====" +echo "===== Phase 1: Getting VLAgent LoadBalancer IP =====" 
VLAGENT_SERVICE=$(kubectl get svc vlagent-vlagent -n "$NAMESPACE" -o json 2>/dev/null) || { echo "ERROR: VLAgent service not found in namespace $NAMESPACE" @@ -51,256 +52,118 @@ VLAGENT_SERVICE=$(kubectl get svc vlagent-vlagent -n "$NAMESPACE" -o json 2>/dev SERVICE_TYPE=$(echo "$VLAGENT_SERVICE" | jq -r '.spec.type') echo "VLAgent service type: $SERVICE_TYPE" -# rsyslog runs in host network - use ClusterIP for internal access -VLAGENT_CLUSTERIP=$(echo "$VLAGENT_SERVICE" | jq -r '.spec.clusterIP') - -if [ -z "$VLAGENT_CLUSTERIP" ] || [ "$VLAGENT_CLUSTERIP" == "null" ]; then - echo "ERROR: Could not determine VLAgent ClusterIP." - echo "Ensure VLAgent service has a ClusterIP assigned." +if [ "$SERVICE_TYPE" != "LoadBalancer" ]; then + echo "ERROR: VLAgent service is not type LoadBalancer (found: $SERVICE_TYPE)" + echo "PowerScale syslog requires LoadBalancer service type for external access" exit 0 fi -# Use ClusterIP for rsyslog forwarding -VLAGENT_IP="$VLAGENT_CLUSTERIP" -VLAGENT_PORT=514 - -echo "VLAgent forwarding target: ${VLAGENT_IP}:${VLAGENT_PORT}" -echo "" - -# ============================================================================ -# Phase 2: Generate rsyslog configuration -# ============================================================================ -echo "===== Phase 2: Generating rsyslog Configuration =====" - -RSYSLOG_CONF="/etc/rsyslog.d/60-omnia-powerscale-syslog.conf" - -generate_rsyslog_config() { - # Static templates — QUOTED heredoc (no Bash interpretation) - cat <<'RSYSLOG_STATIC' -# Omnia PowerScale Syslog Relay Configuration -# Auto-generated — DO NOT EDIT manually -# Pipeline: PowerScale (UDP/TCP:514) -> rsyslog on K8s nodes -> VLAgent (ClusterIP:514) -> VictoriaLogs - -# === UDP/TCP Listeners on port 514 === -$ModLoad imudp -input(type="imudp" port="514") - -$ModLoad imtcp -input(type="imtcp" port="514") +# Get LoadBalancer external IP +SYSLOG_ENDPOINT=$(echo "$VLAGENT_SERVICE" | jq -r '.status.loadBalancer.ingress[0].ip // empty') -# 
=== JSON template for VLAgent === -template(name="VLAgentJSON" type="list") { - constant(value="{\"_time\":\"") - property(name="timereported" dateFormat="rfc3339") - constant(value="\",\"host\":\"") - property(name="hostname") - constant(value="\",\"app\":\"") - property(name="programname") - constant(value="\",\"facility\":\"") - property(name="syslogfacility") - constant(value="\",\"severity\":\"") - property(name="syslogseverity") - constant(value="\",\"msg\":\"") - property(name="msg" format="json") - constant(value="\"}\n") -} -RSYSLOG_STATIC - -{% raw %} - if [ ${#POWERSCALE_IPS[@]} -eq 0 ]; then -{% endraw %} - # Action block — UNQUOTED heredoc (Bash expands ${VLAGENT_IP} etc.) - cat < "$RSYSLOG_CONF" +echo "===== Phase 2: Verification =====" - # Open firewall ports - if command -v firewall-cmd &>/dev/null; then - firewall-cmd --add-port=514/udp --permanent 2>/dev/null || true - firewall-cmd --add-port=514/tcp --permanent 2>/dev/null || true - firewall-cmd --reload 2>/dev/null || true - echo " Firewall: ports 514/udp and 514/tcp opened" - fi - - # Validate config — capture output for diagnostics - VALIDATION_OUTPUT=$(rsyslogd -N 1 2>&1) || { - echo " ERROR: rsyslog config validation failed!" - echo " --- rsyslogd -N 1 output ---" - echo "$VALIDATION_OUTPUT" - echo " --- end output ---" - return 1 - } - echo " rsyslog config validation passed" - - systemctl restart rsyslog 2>/dev/null || { - echo " WARNING: Failed to restart rsyslog on local node" - return 1 - } - echo " rsyslog restarted successfully on local node" - else - echo " Configuring remote node: ${NODE_IP}..." 
- - ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "$NODE_IP" \ - "cat > $RSYSLOG_CONF" <<< "$RSYSLOG_CONFIG" 2>/dev/null || { - echo " WARNING: Failed to write rsyslog config on ${NODE_IP}" - return 1 - } - - # Setup log file, firewall, validate, restart - ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "$NODE_IP" " \ - if command -v firewall-cmd &>/dev/null; then \ - firewall-cmd --add-port=514/udp --permanent 2>/dev/null || true && \ - firewall-cmd --add-port=514/tcp --permanent 2>/dev/null || true && \ - firewall-cmd --reload 2>/dev/null || true; \ - fi && \ - rsyslogd -N 1 2>/dev/null && \ - systemctl restart rsyslog" 2>/dev/null || { - echo " WARNING: Failed to configure rsyslog on ${NODE_IP}" - return 1 - } - echo " rsyslog configured on ${NODE_IP}" - fi - return 0 -} +VLAGENT_PODS=$(kubectl get pods -n "$NAMESPACE" -l app=vlagent --no-headers 2>/dev/null | wc -l) +echo "VLAgent pods running: $VLAGENT_PODS" -configure_rsyslog_on_node "localhost" "true" +if [ "$VLAGENT_PODS" -eq 0 ]; then + echo "WARNING: No VLAgent pods are running" +fi # ============================================================================ -# Phase 4: Configure rsyslog on all other K8s nodes +# Phase 3: Configure PowerScale Syslog via SSH # ============================================================================ echo "" -echo "===== Phase 4: Configuring rsyslog on Other K8s Nodes =====" - -LOCAL_IP=$(hostname -I | awk '{print $1}') -NODE_IPS=$(kubectl get nodes -o wide --no-headers 2>/dev/null | awk '{print $6}') -REMOTE_SUCCESS=0 -REMOTE_FAIL=0 +echo "===== Phase 3: Configuring PowerScale Syslog =====" + +# Validate credentials +if [ -z "$CSI_USERNAME" ] || [ -z "$CSI_PASSWORD" ]; then + echo "WARNING: CSI credentials not available. Skipping automatic PowerScale configuration." + echo "" + echo "Manual configuration steps:" + echo "1. SSH to PowerScale: ssh @" + echo "2. 
Enable audit syslog forwarding:" + echo " isi audit settings global modify --protocol-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --protocol-syslog-tls-enabled=0" + echo " isi audit settings global modify --config-syslog-enabled=1 --config-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --config-syslog-tls-enabled=0" + echo " isi audit settings global modify --system-syslog-enabled=1 --system-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --system-syslog-tls-enabled=0" + echo "3. Verify: isi audit settings global view" + echo "" + echo "==========================================" + echo "PowerScale syslog collection setup completed!" + echo "==========================================" + exit 0 +fi -for NODE_IP in $NODE_IPS; do - if [ "$NODE_IP" == "$LOCAL_IP" ] || [ "$NODE_IP" == "$KUBE_VIP" ]; then - continue - fi - if configure_rsyslog_on_node "$NODE_IP" "false"; then - REMOTE_SUCCESS=$((REMOTE_SUCCESS + 1)) +# PowerScale clusters from CSI secret +{% for cluster in ps_clusters | default([]) %} +CLUSTER_ENDPOINT="{{ cluster.endpoint | default('') }}" +CLUSTER_NAME="{{ cluster.clusterName | default('') }}" + +if [ -n "$CLUSTER_ENDPOINT" ]; then + echo "Configuring PowerScale cluster: $CLUSTER_NAME ($CLUSTER_ENDPOINT)" + + # Extract management IP from endpoint (remove protocol and path) + CLUSTER_IP=$(echo "$CLUSTER_ENDPOINT" | sed -e 's|https\?://||' -e 's|/.*||') + + echo "SSH to PowerScale: $CSI_USERNAME@$CLUSTER_IP" + + # Configure syslog forwarding via SSH + sshpass -p "$CSI_PASSWORD" ssh -o StrictHostKeyChecking=no "$CSI_USERNAME@$CLUSTER_IP" << 'EOF' + set -e + + echo "Enabling audit syslog forwarding..." 
+ + # Enable protocol events (file access) + isi audit settings global modify \ + --protocol-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} \ + --protocol-syslog-tls-enabled=0 + + # Enable config events + isi audit settings global modify \ + --config-syslog-enabled=1 \ + --config-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} \ + --config-syslog-tls-enabled=0 + + # Enable system events + isi audit settings global modify \ + --system-syslog-enabled=1 \ + --system-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} \ + --system-syslog-tls-enabled=0 + + echo "Verifying configuration..." + isi audit settings global view + + echo "Testing connectivity to VLAgent..." + ping -c 3 ${SYSLOG_ENDPOINT} + + echo "PowerScale syslog configuration completed successfully!" +EOF + + if [ $? -eq 0 ]; then + echo "✓ Successfully configured PowerScale cluster: $CLUSTER_NAME" else - REMOTE_FAIL=$((REMOTE_FAIL + 1)) + echo "✗ Failed to configure PowerScale cluster: $CLUSTER_NAME" fi -done - -TOTAL=$((REMOTE_SUCCESS + 1)) -echo "" -echo "rsyslog configured on ${TOTAL} node(s) total (1 local + ${REMOTE_SUCCESS} remote), ${REMOTE_FAIL} failure(s)" - -# ============================================================================ -# Phase 5: Verification -# ============================================================================ -echo "" -echo "===== Phase 5: Verification =====" - -VLAGENT_PODS=$(kubectl get pods -n "$NAMESPACE" -l app=vlagent --no-headers 2>/dev/null | wc -l) -echo "VLAgent pods running: $VLAGENT_PODS" - -echo "Checking rsyslog status on local node..." 
-systemctl is-active rsyslog && echo " rsyslog is active" || echo " WARNING: rsyslog is not active" -echo " Config file: $RSYSLOG_CONF" - -# ============================================================================ -# Phase 6: PowerScale Configuration Instructions -# ============================================================================ -echo "" -echo "==========================================" -echo "NEXT STEPS: Configure PowerScale OneFS" -echo "==========================================" + echo "" +else + echo "WARNING: Empty endpoint for cluster $CLUSTER_NAME, skipping..." +fi +{% endfor %} -echo "Configure PowerScale to send syslog to: ${KUBE_VIP}:514" -echo "Kubernetes kube-vIP: ${KUBE_VIP}" -echo "Syslog Port: 514 (UDP/TCP)" -echo "" -echo "Steps for each PowerScale cluster:" -echo "" -echo "1. SSH to PowerScale:" -echo " ssh @" -echo "" -echo "2. Enable audit syslog forwarding:" -echo "" -echo " # For protocol events (file access):" -echo " isi audit settings global modify \\" -echo " --protocol-syslog-enabled=1 \\" -echo " --protocol-syslog-servers=${KUBE_VIP}:514 \\" -echo " --protocol-syslog-tls-enabled=0" -echo "" -echo " # For config events:" -echo " isi audit settings global modify \\" -echo " --config-syslog-enabled=1 \\" -echo " --config-syslog-servers=${KUBE_VIP}:514 \\" -echo " --config-syslog-tls-enabled=0" -echo "" -echo " # For system events:" -echo " isi audit settings global modify \\" -echo " --system-syslog-enabled=1 \\" -echo " --system-syslog-servers=${KUBE_VIP}:514 \\" -echo " --system-syslog-tls-enabled=0" -echo "" -echo "3. Verify configuration:" -echo " isi audit settings global view" echo "" echo "==========================================" echo "PowerScale syslog collection setup completed!" 
diff --git a/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_syslog_ut.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_syslog.sh.j2 similarity index 97% rename from provision/roles/configure_ochami/templates/powerscale/verify_powerscale_syslog_ut.sh.j2 rename to provision/roles/configure_ochami/templates/powerscale/verify_powerscale_syslog.sh.j2 index 5c3ca6db22..797dc7e7ae 100644 --- a/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_syslog_ut.sh.j2 +++ b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_syslog.sh.j2 @@ -4,7 +4,7 @@ NAMESPACE="telemetry" VLAGENT_SVC="vlagent-vlagent" -PS_HOSTNAME="{{ powerscale_syslog_source_ips[0] | default('bdcdap-1') }}" # PowerScale hostname or IP for query +PS_HOSTNAME="powerscale" # PowerScale hostname for log query (update if needed) echo "==========================================" echo "PowerScale Syslog Feature UT Verification" diff --git a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml index 3aa0669e30..c853d3ff4c 100644 --- a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml +++ b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml @@ -25,6 +25,18 @@ prompt: "{{ warning_idrac_telemetry_support_true }}" when: telemetry_sources.idrac.metrics_enabled | default(false) | bool +- name: Warning for PowerScale health monitor configuration + ansible.builtin.pause: + seconds: "{{ pause_time_15 }}" + prompt: "{{ warning_powerscale_health_monitor_disabled }}" + when: + - telemetry_sources.powerscale.metrics_enabled | default(false) | bool + - csi_powerscale_driver_values_file_path is defined + - csi_values.controller.healthMonitor.enabled | default(false) == false + vars: + csi_values: "{{ lookup('file', csi_powerscale_driver_values_file_path) | from_yaml }}" + failed_when: false + - 
name: Get k8s cluster details ansible.builtin.set_fact: service_cluster_info: >- diff --git a/provision/roles/provision_validations/vars/main.yml b/provision/roles/provision_validations/vars/main.yml index ceee665ce2..3eef6c7661 100644 --- a/provision/roles/provision_validations/vars/main.yml +++ b/provision/roles/provision_validations/vars/main.yml @@ -72,6 +72,13 @@ warning_idrac_telemetry_support_true: | Confirm that all BMC IPs are reachable from the respective service cluster nodes for telemetry to function properly. Make sure that Redfish is enabled and the iDRAC has a datacenter license. Also, ensure that the firmware version is greater than 4 for iDRAC9 or greater than 1 for iDRAC10." + +warning_powerscale_health_monitor_disabled: | + "[WARNING] telemetry_sources.powerscale.metrics_enabled is set to true in telemetry_config.yml, + but CSI driver health monitor is disabled (controller.healthMonitor.enabled=false in CSI values.yaml). + Health monitor metrics will not be stored in VictoriaMetrics. + To enable health monitor metrics, set controller.healthMonitor.enabled=true in CSI values.yaml." + pause_time_15: 15 bmc_group_data_filename: "/opt/omnia/telemetry/bmc_group_data.csv" diff --git a/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml b/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml index 8a523a8486..cbc48861be 100644 --- a/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml +++ b/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml @@ -13,21 +13,21 @@ # limitations under the License. 
--- -# Configure PowerScale syslog collection via rsyslog relay + shared VLAgent +# Configure PowerScale syslog collection via direct VLAgent LoadBalancer # Gated by: telemetry_sources.powerscale.logs_enabled # -# DATA PIPELINE (UDP mode only): -# PowerScale (UDP:514) → K8s node rsyslog → VLAgent LB (TCP:514) → VictoriaLogs +# DATA PIPELINE: +# PowerScale (UDP/TCP:514) → VLAgent LoadBalancer IP:514 → VictoriaLogs # # What Omnia does: -# - Uses syslog_source_ips from user config (syslog source != CSI endpoint) -# - Configures rsyslog on ALL K8s nodes (UDP listener, filter by PS IP, forward to VLAgent) -# - Opens firewall ports 514/udp and 514/tcp on all K8s nodes -# - VLAgent listens on TCP:514 (rsyslog relay) +# - Deploys VLAgent with LoadBalancer service (MetalLB assigns external IP) +# - VLAgent listens on UDP:514 and TCP:514 for syslog messages +# - Automatically configures PowerScale syslog forwarding via SSH (if CSI credentials available) +# - Falls back to manual configuration instructions if credentials unavailable # # What Omnia does NOT do: -# - Omnia does NOT configure PowerScale directly -# - User must configure PowerScale to send UDP syslog to K8s node IPs +# - Omnia does NOT configure PowerScale directly if CSI credentials are unavailable +# - User must manually configure PowerScale to send syslog to VLAgent LoadBalancer IP - name: Configure PowerScale syslog collection when: powerscale_log_enabled | default(false) | bool @@ -38,9 +38,21 @@ - "'victoria_logs' in telemetry_config.telemetry_sources.powerscale.collection_targets | default([])" fail_msg: "{{ powerscale_victoria_logs_validation_fail_msg }}" - - name: Set PowerScale syslog source IPs + - name: Read CSI PowerScale secret for cluster information + ansible.builtin.slurp: + src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/csi-driver-powerscale/secret.yaml" + register: csi_powerscale_secret_content + ignore_errors: true + + - name: Parse CSI PowerScale secret + ansible.builtin.set_fact: 
+ csi_powerscale_secret: "{{ csi_powerscale_secret_content.content | b64decode | from_yaml }}" + when: csi_powerscale_secret_content.skipped is not defined + + - name: Extract PowerScale clusters ansible.builtin.set_fact: - powerscale_management_ips: "{{ telemetry_config.powerscale_configurations.syslog_source_ips | default([]) }}" + ps_clusters: "{{ csi_powerscale_secret.isilonClusters | default([]) }}" + when: csi_powerscale_secret.skipped is not defined - name: Populate PowerScale syslog configuration script ansible.builtin.template: diff --git a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml index de7a40e36d..78d1bd41eb 100644 --- a/provision/roles/telemetry/tasks/derive_sink_support_flags.yml +++ b/provision/roles/telemetry/tasks/derive_sink_support_flags.yml @@ -53,7 +53,6 @@ powerscale_configurations: powerscale_telemetry_support: "{{ telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(true) | bool }}" powerscale_log_enabled: "{{ telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool }}" - syslog_source_ips: "{{ telemetry_config.powerscale_configurations.syslog_source_ips | default([]) }}" otel_collector_storage_size: "{{ telemetry_config.powerscale_configurations.otel_collector_storage_size | default('5Gi') }}" csm_observability_values_file_path: "{{ telemetry_config.powerscale_configurations.csm_observability_values_file_path | default('') }}" additional_remote_write_endpoints: "{{ telemetry_config.powerscale_configurations.additional_remote_write_endpoints | default([]) }}" diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index 54753cf0ce..afd457f3d0 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -3,6 +3,7 @@ resources: 
{% if victoria_metrics_support | default(false) %} # victoria_metrics Resources (Metrics Only) - victoria-vmagent-rbac.yaml + - vmagent-scrape-config.yaml {% if victoria_cluster.tls_enabled | default(false) %} # TLS secret for Victoria components (shared by metrics and logs) - victoria-tls-secret.yaml @@ -19,9 +20,6 @@ resources: - victoria-operator-vmagent.yaml # VMPodScrape CR (native operator-based pod discovery for metrics) - victoria-operator-vmpodscrape.yaml -{% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} - # VMServiceScrape CR for PowerScale OTEL Collector - - victoria-operator-vmservicescrape-powerscale.yaml {% endif %} {% endif %} {% if victoria_logs_support | default(false) %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 index 970c9b20aa..b11261c670 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 @@ -60,11 +60,10 @@ spec: memory: "{{ victoria_cluster.vmagent.resources.limits.memory}}" cpu: "{{ victoria_cluster.vmagent.resources.limits.cpu}}" - # Service discovery configs - operator uses VMServiceScrape/VMPodScrape CRDs - serviceScrapeNamespaceSelector: {} - serviceScrapeSelector: {} - podScrapeNamespaceSelector: {} - podScrapeSelector: {} + # ConfigMap-based scrape configuration + configSecret: + name: {{ vmagent.configmap_name }} + key: prometheus.yml # Extra args extraArgs: diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 deleted file mode 100644 index 20a4b209ce..0000000000 --- 
a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmservicescrape-powerscale.yaml.j2 +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# VMServiceScrape - Native operator-based service discovery for PowerScale OTEL Collector -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMServiceScrape -metadata: - name: otel-collector-powerscale-scrape - namespace: {{ telemetry_namespace }} -spec: - # Target service selector - selector: - matchLabels: - app.kubernetes.io/name: otel-collector - - # Namespace selector - namespaceSelector: - matchNames: - - {{ telemetry_namespace }} - - # Service metrics endpoints - endpoints: - - port: prometheus - interval: {{ vmagent.global.scrape_interval }} - honorLabels: true - - # Add PowerScale-specific labels - relabelConfigs: - - sourceLabels: [__meta_kubernetes_service_name] - targetLabel: source - replacement: powerscale - - - sourceLabels: [__meta_kubernetes_service_name] - targetLabel: job - replacement: otel-collector-powerscale - - # Add namespace label - - sourceLabels: [__meta_kubernetes_namespace] - targetLabel: namespace diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 index 44e15bc708..da0c3ad06c 100644 --- 
a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 @@ -53,10 +53,10 @@ spec: # The VictoriaMetrics operator does NOT translate configSecret into # syslog CLI flags. Syslog listeners MUST be enabled via extraArgs. # These map directly to VLAgent CLI flags: - # -syslog.listenAddr.tcp → plaintext syslog TCP receiver (rsyslog relay forwards here) + # -syslog.listenAddr.tcp → plaintext syslog TCP receiver # -syslog.listenAddr.udp → plaintext syslog UDP receiver # Pipeline: - # UDP mode: PowerScale → K8s node rsyslog (UDP:514) → VLAgent (TCP:514) → VictoriaLogs + # PowerScale → VLAgent LoadBalancer (UDP/TCP:514) → VictoriaLogs extraArgs: syslog.listenAddr.tcp: ":514" syslog.listenAddr.udp: ":514" @@ -155,7 +155,7 @@ spec: # ======================================== # Service type: LoadBalancer (MetalLB for external access) # Provides single external IP for all log sources (OME, SFM, PowerScale) - # PowerScale syslog flows: PowerScale → VLAgent (LoadBalancer IP:514) → VictoriaLogs + # PowerScale sends syslog directly to VLAgent LoadBalancer IP:514 (UDP/TCP) serviceSpec: useAsDefault: true spec: @@ -179,19 +179,19 @@ spec: httpGet: path: /health port: 9429 - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - failureThreshold: 3 + initialDelaySeconds: {{ victoria_logs_cluster.vlagent.liveness_probe.initial_delay }} + periodSeconds: {{ victoria_logs_cluster.vlagent.liveness_probe.period }} + timeoutSeconds: {{ victoria_logs_cluster.vlagent.liveness_probe.timeout }} + failureThreshold: {{ victoria_logs_cluster.vlagent.liveness_probe.failure_threshold }} readinessProbe: httpGet: path: /health port: 9429 - initialDelaySeconds: 5 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 3 + initialDelaySeconds: {{ victoria_logs_cluster.vlagent.readiness_probe.initial_delay }} + periodSeconds: {{ 
victoria_logs_cluster.vlagent.readiness_probe.period }} + timeoutSeconds: {{ victoria_logs_cluster.vlagent.readiness_probe.timeout }} + failureThreshold: {{ victoria_logs_cluster.vlagent.readiness_probe.failure_threshold }} # ======================================== # Pod Scheduling and Affinity @@ -202,14 +202,14 @@ spec: - effect: NoExecute key: node.kubernetes.io/not-ready operator: Exists - tolerationSeconds: 5 + tolerationSeconds: {{ victoria_logs_cluster.vlagent.toleration_seconds }} - effect: NoExecute key: node.kubernetes.io/unreachable operator: Exists - tolerationSeconds: 5 + tolerationSeconds: {{ victoria_logs_cluster.vlagent.toleration_seconds }} # ======================================== # Termination Grace Period # ======================================== # Allow time for graceful shutdown and buffer flush - terminationGracePeriodSeconds: 30 + terminationGracePeriodSeconds: {{ victoria_logs_cluster.vlagent.termination_grace_period }} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 index fe8f086c22..fbf2f20960 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 @@ -60,7 +60,7 @@ data: # Add Pod IP label - source_labels: [__meta_kubernetes_pod_ip] target_label: pod_ip -{% if hostvars['localhost']['telemetry_config']['telemetry_sources']['powerscale']['metrics_enabled'] | default(false) | bool %} +{% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool %} # PowerScale OTEL Collector scrape targets (per cluster) {% for cluster in ps_clusters %} @@ -74,4 +74,44 @@ data: cluster: "{{ cluster.clusterName }}" cluster_endpoint: "{{ cluster.endpoint }}" {% endfor %} + + # CSI PowerScale Health Monitor scrape targets + - job_name: 
"csi-powerscale-health-monitor" + honor_labels: true + scrape_interval: {{ vmagent.global.scrape_interval }} + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - isilon + relabel_configs: + # Keep only CSI PowerScale pods + - source_labels: [__meta_kubernetes_pod_label_app] + regex: csi-isilon + action: keep + # Keep only health monitor container + - source_labels: [__meta_kubernetes_pod_container_name] + regex: csi-health-monitor + action: keep + # Set scrape address to health monitor port (9445) + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + replacement: "$1:9445" + # Add labels + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + action: replace + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: node + action: replace + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + action: replace + # Add custom labels + - target_label: source + replacement: powerscale + action: replace + - target_label: component + replacement: health-monitor + action: replace {% endif %} From 4c7fcc1f96d71399db9aa1ceafdbd3e26b6e4f1a Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 7 May 2026 06:06:26 +0530 Subject: [PATCH 19/63] fix for k8s_server_ip undefined variable --- provision/roles/provision_validations/tasks/main.yml | 4 ---- .../provision_validations/tasks/validate_telemetry_config.yml | 3 +++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/provision/roles/provision_validations/tasks/main.yml b/provision/roles/provision_validations/tasks/main.yml index 503792ee99..30b59c4c96 100644 --- a/provision/roles/provision_validations/tasks/main.yml +++ b/provision/roles/provision_validations/tasks/main.yml @@ -54,7 +54,3 @@ - name: Validate telemetry config ansible.builtin.include_tasks: validate_telemetry_config.yml - when: - - (telemetry_sources.idrac.metrics_enabled | default(false) | bool) or - 
(telemetry_sources.ldms.metrics_enabled | default(false) | bool) or - (ldms_support | default(false) | bool) diff --git a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml index c853d3ff4c..8894f3ec07 100644 --- a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml +++ b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml @@ -50,10 +50,12 @@ - name: Set cluster configuration facts ansible.builtin.set_fact: k8s_nfs_storage_name: "{{ service_cluster_info.nfs_storage_name }}" + cacheable: true - name: Find matching NFS client param ansible.builtin.set_fact: k8s_nfs_storage_details: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', k8s_nfs_storage_name) | list | first) | default({}) }}" + cacheable: true - name: Set share_path from service_k8s_cluster client_share_path ansible.builtin.set_fact: @@ -61,3 +63,4 @@ k8s_server_share_path: "{{ k8s_nfs_storage_details.server_share_path }}" k8s_server_ip: "{{ k8s_nfs_storage_details.server_ip }}" k8s_mount_options: "{{ k8s_nfs_storage_details.client_mount_options }}" + cacheable: true From 8c78056ed788e85110b9a4190eca12445109ebc0 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 7 May 2026 06:22:53 +0530 Subject: [PATCH 20/63] fix for syntax error --- .../roles/telemetry/templates/telemetry/kustomization.yaml.j2 | 1 - 1 file changed, 1 deletion(-) diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index afd457f3d0..93e413cf5b 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -21,7 +21,6 @@ resources: # VMPodScrape CR (native operator-based pod discovery for metrics) - victoria-operator-vmpodscrape.yaml {% endif %} -{% 
endif %} {% if victoria_logs_support | default(false) %} # victoria_logs Resources (Logs Only) {% if victoria_cluster.tls_enabled | default(false) and not victoria_metrics_support | default(false) %} From 531a9a6fb5b76dcbcc60a645eacd540731c72653 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 7 May 2026 12:05:17 +0530 Subject: [PATCH 21/63] fix for UT issues - DNS resolution and keep powerscale configuration manual --- .../configure_powerscale_syslog.sh.j2 | 83 ++++--------------- .../victorialogs-operator-vlagent.yaml.j2 | 5 +- 2 files changed, 20 insertions(+), 68 deletions(-) diff --git a/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 index 4f2d2b4815..0ee6d5fa04 100644 --- a/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 +++ b/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 @@ -85,87 +85,38 @@ if [ "$VLAGENT_PODS" -eq 0 ]; then fi # ============================================================================ -# Phase 3: Configure PowerScale Syslog via SSH +# Phase 3: Manual Configuration Instructions # ============================================================================ echo "" -echo "===== Phase 3: Configuring PowerScale Syslog =====" - -# Validate credentials -if [ -z "$CSI_USERNAME" ] || [ -z "$CSI_PASSWORD" ]; then - echo "WARNING: CSI credentials not available. Skipping automatic PowerScale configuration." - echo "" - echo "Manual configuration steps:" - echo "1. SSH to PowerScale: ssh @" - echo "2. 
Enable audit syslog forwarding:" - echo " isi audit settings global modify --protocol-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --protocol-syslog-tls-enabled=0" - echo " isi audit settings global modify --config-syslog-enabled=1 --config-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --config-syslog-tls-enabled=0" - echo " isi audit settings global modify --system-syslog-enabled=1 --system-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --system-syslog-tls-enabled=0" - echo "3. Verify: isi audit settings global view" - echo "" - echo "==========================================" - echo "PowerScale syslog collection setup completed!" - echo "==========================================" - exit 0 -fi - -# PowerScale clusters from CSI secret +echo "===== Phase 3: Manual Configuration Instructions =====" +echo "" +echo "PowerScale syslog requires manual configuration." +echo "" +echo "Manual configuration steps:" +echo "" {% for cluster in ps_clusters | default([]) %} CLUSTER_ENDPOINT="{{ cluster.endpoint | default('') }}" CLUSTER_NAME="{{ cluster.clusterName | default('') }}" if [ -n "$CLUSTER_ENDPOINT" ]; then - echo "Configuring PowerScale cluster: $CLUSTER_NAME ($CLUSTER_ENDPOINT)" - # Extract management IP from endpoint (remove protocol and path) CLUSTER_IP=$(echo "$CLUSTER_ENDPOINT" | sed -e 's|https\?://||' -e 's|/.*||') - - echo "SSH to PowerScale: $CSI_USERNAME@$CLUSTER_IP" - - # Configure syslog forwarding via SSH - sshpass -p "$CSI_PASSWORD" ssh -o StrictHostKeyChecking=no "$CSI_USERNAME@$CLUSTER_IP" << 'EOF' - set -e - - echo "Enabling audit syslog forwarding..." 
- - # Enable protocol events (file access) - isi audit settings global modify \ - --protocol-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} \ - --protocol-syslog-tls-enabled=0 - - # Enable config events - isi audit settings global modify \ - --config-syslog-enabled=1 \ - --config-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} \ - --config-syslog-tls-enabled=0 - - # Enable system events - isi audit settings global modify \ - --system-syslog-enabled=1 \ - --system-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} \ - --system-syslog-tls-enabled=0 - - echo "Verifying configuration..." - isi audit settings global view - - echo "Testing connectivity to VLAgent..." - ping -c 3 ${SYSLOG_ENDPOINT} - - echo "PowerScale syslog configuration completed successfully!" -EOF - - if [ $? -eq 0 ]; then - echo "✓ Successfully configured PowerScale cluster: $CLUSTER_NAME" - else - echo "✗ Failed to configure PowerScale cluster: $CLUSTER_NAME" - fi + echo "For PowerScale cluster: $CLUSTER_NAME ($CLUSTER_IP)" + echo " 1. SSH to PowerScale: ssh $CSI_USERNAME@$CLUSTER_IP" + echo " 2. Enable audit syslog forwarding:" + echo " isi audit settings global modify --protocol-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --protocol-syslog-tls-enabled=0" + echo " isi audit settings global modify --config-syslog-enabled=1 --config-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --config-syslog-tls-enabled=0" + echo " isi audit settings global modify --system-syslog-enabled=1 --system-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --system-syslog-tls-enabled=0" + echo " 3. Verify: isi audit settings global view" echo "" else echo "WARNING: Empty endpoint for cluster $CLUSTER_NAME, skipping..." + echo "" fi {% endfor %} - +echo "Ensure firewall allows UDP/TCP:514 from PowerScale data pool IPs to ${SYSLOG_ENDPOINT}:${SYSLOG_PORT}" echo "" echo "==========================================" -echo "PowerScale syslog collection setup completed!" 
+echo "PowerScale syslog collection setup instructions completed!" echo "==========================================" exit 0 diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 index da0c3ad06c..a42f32fb9e 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 @@ -69,11 +69,12 @@ spec: # ======================================== # Forward logs to VictoriaLogs vlinsert endpoint # Supports JSON Lines format with optional TLS + # Using short service name (same namespace) to avoid DNS resolution issues remoteWrite: {% if victoria_logs_cluster.tls_enabled %} - - url: https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert + - url: https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert {% else %} - - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert + - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert {% endif %} # ======================================== From 769297220716275ead7174ff0882715276aeb5af Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 7 May 2026 17:24:14 +0530 Subject: [PATCH 22/63] ansible lint fixes --- .../powerscale/configure_powerscale_syslog.sh.j2 | 13 +------------ .../tasks/validate_telemetry_config.yml | 2 +- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 index 0ee6d5fa04..22e5c51836 100644 --- 
a/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 +++ b/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 @@ -22,17 +22,6 @@ set -euo pipefail NAMESPACE="{{ telemetry_namespace }}" -# Read PowerScale credentials from CSI secret.yaml -SECRET_FILE="{{ hostvars['localhost']['k8s_client_share_path'] }}/csi-driver-powerscale/secret.yaml" -if [ -f "$SECRET_FILE" ]; then - echo "Reading credentials from CSI secret.yaml" - CSI_USERNAME=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'username:' | head -1 | awk -F':' '{gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2); print $2}' | base64 --decode 2>/dev/null) - CSI_PASSWORD=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'password:' | head -1 | awk -F':' '{gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2); print $2}' | base64 --decode 2>/dev/null) -else - echo "ERROR: CSI secret file not found: $SECRET_FILE" - exit 1 -fi - echo "==========================================" echo "PowerScale Syslog Collection Setup" echo "==========================================" @@ -102,7 +91,7 @@ if [ -n "$CLUSTER_ENDPOINT" ]; then # Extract management IP from endpoint (remove protocol and path) CLUSTER_IP=$(echo "$CLUSTER_ENDPOINT" | sed -e 's|https\?://||' -e 's|/.*||') echo "For PowerScale cluster: $CLUSTER_NAME ($CLUSTER_IP)" - echo " 1. SSH to PowerScale: ssh $CSI_USERNAME@$CLUSTER_IP" + echo " 1. SSH to PowerScale: ssh @$CLUSTER_IP" echo " 2. 
Enable audit syslog forwarding:" echo " isi audit settings global modify --protocol-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --protocol-syslog-tls-enabled=0" echo " isi audit settings global modify --config-syslog-enabled=1 --config-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --config-syslog-tls-enabled=0" diff --git a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml index 8894f3ec07..47e4a54cd6 100644 --- a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml +++ b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml @@ -32,7 +32,7 @@ when: - telemetry_sources.powerscale.metrics_enabled | default(false) | bool - csi_powerscale_driver_values_file_path is defined - - csi_values.controller.healthMonitor.enabled | default(false) == false + - not (csi_values.controller.healthMonitor.enabled | default(false) | bool) vars: csi_values: "{{ lookup('file', csi_powerscale_driver_values_file_path) | from_yaml }}" failed_when: false From 3d45aeafbb21e27d9ed7ab1fb7f547b95d82392c Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 01:53:19 +0530 Subject: [PATCH 23/63] NAtive operator based vmscraper --- .../tasks/validate_telemetry_config.yml | 12 -- .../roles/provision_validations/vars/main.yml | 5 - .../tasks/deploy_powerscale_logs.yml | 5 + provision/roles/telemetry/tasks/main.yml | 1 + .../telemetry/tasks/telemetry_prereq.yml | 2 + .../templates/telemetry/kustomization.yaml.j2 | 6 +- .../victoria-operator-vmagent.yaml.j2 | 9 +- ....j2 => victoria-operator-vmscrape.yaml.j2} | 38 ++++++ .../victoria/vmagent-scrape-config.yaml.j2 | 117 ------------------ provision/roles/telemetry/vars/main.yml | 14 +-- 10 files changed, 59 insertions(+), 150 deletions(-) rename provision/roles/telemetry/templates/telemetry/victoria/{victoria-operator-vmpodscrape.yaml.j2 => 
victoria-operator-vmscrape.yaml.j2} (59%) delete mode 100644 provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 diff --git a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml index 47e4a54cd6..04c7e5fbd0 100644 --- a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml +++ b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml @@ -25,18 +25,6 @@ prompt: "{{ warning_idrac_telemetry_support_true }}" when: telemetry_sources.idrac.metrics_enabled | default(false) | bool -- name: Warning for PowerScale health monitor configuration - ansible.builtin.pause: - seconds: "{{ pause_time_15 }}" - prompt: "{{ warning_powerscale_health_monitor_disabled }}" - when: - - telemetry_sources.powerscale.metrics_enabled | default(false) | bool - - csi_powerscale_driver_values_file_path is defined - - not (csi_values.controller.healthMonitor.enabled | default(false) | bool) - vars: - csi_values: "{{ lookup('file', csi_powerscale_driver_values_file_path) | from_yaml }}" - failed_when: false - - name: Get k8s cluster details ansible.builtin.set_fact: service_cluster_info: >- diff --git a/provision/roles/provision_validations/vars/main.yml b/provision/roles/provision_validations/vars/main.yml index 3eef6c7661..26cc71092a 100644 --- a/provision/roles/provision_validations/vars/main.yml +++ b/provision/roles/provision_validations/vars/main.yml @@ -73,11 +73,6 @@ warning_idrac_telemetry_support_true: | Make sure that Redfish is enabled and the iDRAC has a datacenter license. Also, ensure that the firmware version is greater than 4 for iDRAC9 or greater than 1 for iDRAC10." -warning_powerscale_health_monitor_disabled: | - "[WARNING] telemetry_sources.powerscale.metrics_enabled is set to true in telemetry_config.yml, - but CSI driver health monitor is disabled (controller.healthMonitor.enabled=false in CSI values.yaml). 
- Health monitor metrics will not be stored in VictoriaMetrics. - To enable health monitor metrics, set controller.healthMonitor.enabled=true in CSI values.yaml." pause_time_15: 15 bmc_group_data_filename: "/opt/omnia/telemetry/bmc_group_data.csv" diff --git a/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml b/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml index cbc48861be..3110e40e0a 100644 --- a/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml +++ b/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml @@ -54,6 +54,11 @@ ps_clusters: "{{ csi_powerscale_secret.isilonClusters | default([]) }}" when: csi_powerscale_secret.skipped is not defined + - name: Set empty ps_clusters if secret not available + ansible.builtin.set_fact: + ps_clusters: [] + when: csi_powerscale_secret.skipped is defined + - name: Populate PowerScale syslog configuration script ansible.builtin.template: src: "{{ configure_ps_syslog_template }}" diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index a3436c6a62..2cbe852288 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -40,6 +40,7 @@ (telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool) or (telemetry_config.telemetry_sources.ldms.metrics_enabled | default(false) | bool) or (telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool) or + (telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool) or ldms_support | default(false) | bool block: - name: Set NFS info fact diff --git a/provision/roles/telemetry/tasks/telemetry_prereq.yml b/provision/roles/telemetry/tasks/telemetry_prereq.yml index 3def1610ee..069bc57d8d 100644 --- a/provision/roles/telemetry/tasks/telemetry_prereq.yml +++ b/provision/roles/telemetry/tasks/telemetry_prereq.yml @@ -44,12 +44,14 @@ mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" - name: 
Ensure iDRAC Telemetry scripting destination exists + when: telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool ansible.builtin.file: path: "{{ idrac_telemetry_scripting_git_clone_path }}" state: directory mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" - name: Copy iDRAC Telemetry Scripting to NFS share + when: telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool block: - name: Copy pre-cloned iDRAC Telemetry Scripting directory ansible.builtin.copy: diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index 93e413cf5b..a99b501ce6 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -3,7 +3,6 @@ resources: {% if victoria_metrics_support | default(false) %} # victoria_metrics Resources (Metrics Only) - victoria-vmagent-rbac.yaml - - vmagent-scrape-config.yaml {% if victoria_cluster.tls_enabled | default(false) %} # TLS secret for Victoria components (shared by metrics and logs) - victoria-tls-secret.yaml @@ -18,8 +17,9 @@ resources: {% endif %} # VMAgent CR (operator-managed scraper for metrics) - victoria-operator-vmagent.yaml - # VMPodScrape CR (native operator-based pod discovery for metrics) - - victoria-operator-vmpodscrape.yaml + # VMScrape CR (native operator-based service/pod discovery for metrics) + - victoria-operator-vmscrape.yaml +{% endif %} {% endif %} {% if victoria_logs_support | default(false) %} # victoria_logs Resources (Logs Only) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 index b11261c670..970c9b20aa 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 +++ 
b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 @@ -60,10 +60,11 @@ spec: memory: "{{ victoria_cluster.vmagent.resources.limits.memory}}" cpu: "{{ victoria_cluster.vmagent.resources.limits.cpu}}" - # ConfigMap-based scrape configuration - configSecret: - name: {{ vmagent.configmap_name }} - key: prometheus.yml + # Service discovery configs - operator uses VMServiceScrape/VMPodScrape CRDs + serviceScrapeNamespaceSelector: {} + serviceScrapeSelector: {} + podScrapeNamespaceSelector: {} + podScrapeSelector: {} # Extra args extraArgs: diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 similarity index 59% rename from provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 rename to provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 index 4ed5c9c72d..aca205f4b6 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+{% if telemetry_sources.idrac.metrics_enabled | default(false) %} # VMPodScrape - Native operator-based pod discovery for idrac-telemetry apiVersion: operator.victoriametrics.com/v1beta1 kind: VMPodScrape @@ -44,3 +45,40 @@ spec: # Add pod IP label - sourceLabels: [__meta_kubernetes_pod_ip] targetLabel: pod_ip +{% endif %} + +--- +{% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} +# VMServiceScrape for PowerScale OTEL Collector +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: otel-collector-powerscale + namespace: {{ telemetry_namespace }} +spec: + # Target OTEL collector service + selector: + matchLabels: + app: otel-collector + + # Service metrics endpoints + endpoints: + - port: "8889" + interval: {{ vmagent.global.scrape_interval }} + path: /metrics + + # Relabel configs + relabelConfigs: +{% for cluster in ps_clusters %} + - sourceLabels: [__address__] + targetLabel: cluster + replacement: "{{ cluster.clusterName }}" + - sourceLabels: [__address__] + targetLabel: cluster_endpoint + replacement: "{{ cluster.endpoint }}" +{% endfor %} + - targetLabel: source + replacement: powerscale + - targetLabel: component + replacement: otel-collector +{% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 deleted file mode 100644 index fbf2f20960..0000000000 --- a/provision/roles/telemetry/templates/telemetry/victoria/vmagent-scrape-config.yaml.j2 +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: ConfigMap -metadata: - name: "{{ vmagent.configmap_name }}" - namespace: "{{ telemetry_namespace }}" -data: - prometheus.yml: | - global: - scrape_interval: {{ vmagent.global.scrape_interval }} - - scrape_configs: - - job_name: "{{ vmagent.job_name }}" - honor_labels: true - - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - {{ vmagent.kubernetes_sd_namespace }} - - relabel_configs: - - # Keep only pods with correct label - - source_labels: [__meta_kubernetes_pod_label_app] - regex: {{ vmagent.target_pod_label }} - action: keep - - # Keep only the metrics container - - source_labels: [__meta_kubernetes_pod_container_name] - regex: {{ vmagent.metrics_container_name }} - action: keep - - # Set actual scrape address (container port) - - source_labels: [__meta_kubernetes_pod_ip] - target_label: __address__ - replacement: "$1:{{ vmagent.metrics_port }}" - - # Unique instance using pod name - - source_labels: [__meta_kubernetes_pod_name] - target_label: instance - - # Add namespace label - - source_labels: [__meta_kubernetes_namespace] - target_label: namespace - - # Add Pod IP label - - source_labels: [__meta_kubernetes_pod_ip] - target_label: pod_ip -{% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool %} - - # PowerScale OTEL Collector scrape targets (per cluster) -{% for cluster in ps_clusters %} - - job_name: "otel-collector-powerscale-cluster{{ loop.index0 }}" - honor_labels: true - scrape_interval: {{ vmagent.global.scrape_interval }} - static_configs: - - targets: 
['otel-collector.{{ telemetry_namespace }}.svc.cluster.local:8889'] - labels: - source: powerscale - cluster: "{{ cluster.clusterName }}" - cluster_endpoint: "{{ cluster.endpoint }}" -{% endfor %} - - # CSI PowerScale Health Monitor scrape targets - - job_name: "csi-powerscale-health-monitor" - honor_labels: true - scrape_interval: {{ vmagent.global.scrape_interval }} - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - isilon - relabel_configs: - # Keep only CSI PowerScale pods - - source_labels: [__meta_kubernetes_pod_label_app] - regex: csi-isilon - action: keep - # Keep only health monitor container - - source_labels: [__meta_kubernetes_pod_container_name] - regex: csi-health-monitor - action: keep - # Set scrape address to health monitor port (9445) - - source_labels: [__meta_kubernetes_pod_ip] - target_label: __address__ - replacement: "$1:9445" - # Add labels - - source_labels: [__meta_kubernetes_pod_name] - target_label: pod - action: replace - - source_labels: [__meta_kubernetes_pod_node_name] - target_label: node - action: replace - - source_labels: [__meta_kubernetes_namespace] - target_label: namespace - action: replace - # Add custom labels - - target_label: source - replacement: powerscale - action: replace - - target_label: component - replacement: health-monitor - action: replace -{% endif %} diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 14388def4f..dde804ad15 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -298,9 +298,8 @@ powerscale_victoria_logs_validation_fail_msg: >- PowerScale log collection requires 'victoria_logs' in telemetry_sources.powerscale.collection_targets. 
-# Usage: vmagent-scrape-config.yaml +# Usage: victoria-operator-vmagent.yaml.j2 (operator-native) vmagent: - configmap_name: "vmagent-scrape-config" global: scrape_interval: "10s" job_name: "idrac-telemetry" @@ -314,7 +313,6 @@ vmagent: app_name: "vmagent" container_name: "vmagent" image: "{{ telemetry_images['victoriametrics/vmagent'] | default('victoriametrics/vmagent:v1.128.0') }}" - scrape_config_path: "/etc/vmagent/prometheus.yml" # Single-node URL remote_write_url: "https://victoria-loadbalancer.telemetry.svc.cluster.local:8443/api/v1/write" # Cluster URL (used when victoria_cluster.enabled: true) @@ -348,8 +346,6 @@ common_mode: "0755" victoria_templates_common: - src: 'telemetry/victoria/victoria-vmagent-rbac.yaml.j2' dest: 'victoria-vmagent-rbac.yaml' - - src: 'telemetry/victoria/vmagent-scrape-config.yaml.j2' - dest: 'vmagent-scrape-config.yaml' # Operator-based templates (new default) # Single-node operator template (used when victoria_cluster.enabled: false) @@ -358,8 +354,8 @@ victoria_templates_operator_single: dest: 'victoria-operator-vmsingle.yaml' - src: 'telemetry/victoria/victoria-operator-vmagent.yaml.j2' dest: 'victoria-operator-vmagent.yaml' - - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' - dest: 'victoria-operator-vmpodscrape.yaml' + - src: 'telemetry/victoria/victoria-operator-vmscrape.yaml.j2' + dest: 'victoria-operator-vmscrape.yaml' # Cluster operator template (used when victoria_cluster.enabled: true) victoria_templates_operator_cluster: @@ -367,8 +363,8 @@ victoria_templates_operator_cluster: dest: 'victoria-operator-vmcluster.yaml' - src: 'telemetry/victoria/victoria-operator-vmagent.yaml.j2' dest: 'victoria-operator-vmagent.yaml' - - src: 'telemetry/victoria/victoria-operator-vmpodscrape.yaml.j2' - dest: 'victoria-operator-vmpodscrape.yaml' + - src: 'telemetry/victoria/victoria-operator-vmscrape.yaml.j2' + dest: 'victoria-operator-vmscrape.yaml' # Legacy manual deployment templates (removed - use operator-based 
templates above) # Raw victoria-cluster-vminsert/vmselect/vmstorage.yaml.j2 files have been removed From e0cb65dbdd86a4ac61d18f5ca851fcdddeaabedc Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 06:39:23 +0530 Subject: [PATCH 24/63] remove unused syslog template --- .../configure_powerscale_syslog.sh.j2 | 111 ------------------ .../tasks/deploy_powerscale_logs.yml | 6 - provision/roles/telemetry/vars/main.yml | 21 +--- 3 files changed, 2 insertions(+), 136 deletions(-) delete mode 100644 provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 diff --git a/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 deleted file mode 100644 index 22e5c51836..0000000000 --- a/provision/roles/configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2 +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# PowerScale Syslog Collection — Direct to VLAgent LoadBalancer -# -# DATA PIPELINE: -# PowerScale (UDP/TCP:514) -> VLAgent LoadBalancer IP:514 -> VictoriaLogs - -set -euo pipefail - -NAMESPACE="{{ telemetry_namespace }}" - -echo "==========================================" -echo "PowerScale Syslog Collection Setup" -echo "==========================================" -echo "Pipeline: PowerScale (UDP/TCP:514) -> VLAgent LoadBalancer -> VictoriaLogs" -echo "" - -# ============================================================================ -# Phase 1: Get VLAgent LoadBalancer IP -# ============================================================================ -echo "===== Phase 1: Getting VLAgent LoadBalancer IP =====" - -VLAGENT_SERVICE=$(kubectl get svc vlagent-vlagent -n "$NAMESPACE" -o json 2>/dev/null) || { - echo "ERROR: VLAgent service not found in namespace $NAMESPACE" - exit 0 -} - -SERVICE_TYPE=$(echo "$VLAGENT_SERVICE" | jq -r '.spec.type') -echo "VLAgent service type: $SERVICE_TYPE" - -if [ "$SERVICE_TYPE" != "LoadBalancer" ]; then - echo "ERROR: VLAgent service is not type LoadBalancer (found: $SERVICE_TYPE)" - echo "PowerScale syslog requires LoadBalancer service type for external access" - exit 0 -fi - -# Get LoadBalancer external IP -SYSLOG_ENDPOINT=$(echo "$VLAGENT_SERVICE" | jq -r '.status.loadBalancer.ingress[0].ip // empty') - -if [ -z "$SYSLOG_ENDPOINT" ]; then - echo "ERROR: LoadBalancer IP not assigned to VLAgent service" - echo "Waiting for MetalLB to assign external IP..." 
- exit 0 -fi - -SYSLOG_PORT=514 - -echo "VLAgent LoadBalancer IP: ${SYSLOG_ENDPOINT}:${SYSLOG_PORT}" -echo "" - -# ============================================================================ -# Phase 2: Verification -# ============================================================================ -echo "===== Phase 2: Verification =====" - -VLAGENT_PODS=$(kubectl get pods -n "$NAMESPACE" -l app=vlagent --no-headers 2>/dev/null | wc -l) -echo "VLAgent pods running: $VLAGENT_PODS" - -if [ "$VLAGENT_PODS" -eq 0 ]; then - echo "WARNING: No VLAgent pods are running" -fi - -# ============================================================================ -# Phase 3: Manual Configuration Instructions -# ============================================================================ -echo "" -echo "===== Phase 3: Manual Configuration Instructions =====" -echo "" -echo "PowerScale syslog requires manual configuration." -echo "" -echo "Manual configuration steps:" -echo "" -{% for cluster in ps_clusters | default([]) %} -CLUSTER_ENDPOINT="{{ cluster.endpoint | default('') }}" -CLUSTER_NAME="{{ cluster.clusterName | default('') }}" - -if [ -n "$CLUSTER_ENDPOINT" ]; then - # Extract management IP from endpoint (remove protocol and path) - CLUSTER_IP=$(echo "$CLUSTER_ENDPOINT" | sed -e 's|https\?://||' -e 's|/.*||') - echo "For PowerScale cluster: $CLUSTER_NAME ($CLUSTER_IP)" - echo " 1. SSH to PowerScale: ssh @$CLUSTER_IP" - echo " 2. Enable audit syslog forwarding:" - echo " isi audit settings global modify --protocol-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --protocol-syslog-tls-enabled=0" - echo " isi audit settings global modify --config-syslog-enabled=1 --config-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --config-syslog-tls-enabled=0" - echo " isi audit settings global modify --system-syslog-enabled=1 --system-syslog-servers=${SYSLOG_ENDPOINT}:${SYSLOG_PORT} --system-syslog-tls-enabled=0" - echo " 3. 
Verify: isi audit settings global view" - echo "" -else - echo "WARNING: Empty endpoint for cluster $CLUSTER_NAME, skipping..." - echo "" -fi -{% endfor %} -echo "Ensure firewall allows UDP/TCP:514 from PowerScale data pool IPs to ${SYSLOG_ENDPOINT}:${SYSLOG_PORT}" -echo "" -echo "==========================================" -echo "PowerScale syslog collection setup instructions completed!" -echo "==========================================" -exit 0 diff --git a/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml b/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml index 3110e40e0a..892d13b18f 100644 --- a/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml +++ b/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml @@ -58,9 +58,3 @@ ansible.builtin.set_fact: ps_clusters: [] when: csi_powerscale_secret.skipped is defined - - - name: Populate PowerScale syslog configuration script - ansible.builtin.template: - src: "{{ configure_ps_syslog_template }}" - dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/configure_powerscale_syslog.sh" - mode: "{{ hostvars['localhost']['file_permissions_755'] }}" diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index dde804ad15..bcec203a34 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -286,13 +286,6 @@ victoria_tls_cert_days: 3650 victoria_cert_dir: "{{ telemetry_share_path }}/victoria-certs" syslog_tls_cert_dir: "{{ telemetry_share_path }}/syslog-tls-certs" -# PowerScale log configuration status message -powerscale_log_config_status_msg: >- - PowerScale syslog configuration script staged on NFS share. - During cloud-init, VLAgent LoadBalancer IP will be retrieved and - PowerScale configuration instructions will be provided. - PowerScale sends syslog directly to VLAgent LoadBalancer IP:514 (UDP/TCP). 
- # PowerScale VictoriaLogs validation fail message powerscale_victoria_logs_validation_fail_msg: >- PowerScale log collection requires 'victoria_logs' in @@ -550,18 +543,8 @@ ps_dependency_fail_msg: >- # Usage: deploy_powerscale_logs.yml # Gated by: telemetry_sources.powerscale.logs_enabled # Pipeline: PowerScale (UDP/TCP:514) → VLAgent LoadBalancer IP:514 → VictoriaLogs -configure_ps_syslog_template: "{{ role_path }}/../configure_ochami/templates/powerscale/configure_powerscale_syslog.sh.j2" -ps_log_enabled_msg: >- - PowerScale log collection enabled (telemetry_sources.powerscale.logs_enabled: true). - VLAgent will be deployed with LoadBalancer service to receive PowerScale syslog. - PowerScale sends syslog directly to VLAgent LoadBalancer IP:514 (UDP/TCP). -ps_log_disabled_msg: >- - PowerScale log collection disabled (telemetry_sources.powerscale.logs_enabled: false). - VLAgent will not be configured for PowerScale syslog collection. -ps_log_deployed_msg: >- - PowerScale syslog configuration script staged on NFS share. - During cloud-init, VLAgent LoadBalancer IP will be retrieved and - PowerScale configuration instructions will be provided. 
+# Note: configure_powerscale_syslog.sh.j2 template removed - syslog configuration now manual + # Vector Kafka-to-Victoria Ingestion Pipeline Configuration # ============================================================================ # Usage: deploy_vector_ldms.yml, deploy_vector_ome.yml From af1649a09d269a6148b0303697aedfacad0e6b9f Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 06:54:54 +0530 Subject: [PATCH 25/63] fix for nfs_client_param check in telemetry config --- .../tasks/validate_telemetry_config.yml | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml index 04c7e5fbd0..019102b505 100644 --- a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml +++ b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml @@ -40,15 +40,21 @@ k8s_nfs_storage_name: "{{ service_cluster_info.nfs_storage_name }}" cacheable: true -- name: Find matching NFS client param +- name: Find matching NFS mount ansible.builtin.set_fact: - k8s_nfs_storage_details: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', k8s_nfs_storage_name) | list | first) | default({}) }}" + k8s_nfs_storage_details: "{{ (storage_config.mounts | selectattr('name', 'equalto', k8s_nfs_storage_name) | list | first) | default({}) }}" cacheable: true -- name: Set share_path from service_k8s_cluster client_share_path +- name: Parse NFS source to extract server and path ansible.builtin.set_fact: - k8s_client_share_path: "{{ k8s_nfs_storage_details.client_share_path }}" - k8s_server_share_path: "{{ k8s_nfs_storage_details.server_share_path }}" - k8s_server_ip: "{{ k8s_nfs_storage_details.server_ip }}" - k8s_mount_options: "{{ k8s_nfs_storage_details.client_mount_options }}" + k8s_nfs_source_parts: "{{ 
k8s_nfs_storage_details.source.split(':') }}" + when: k8s_nfs_storage_details.source is defined + +- name: Set share_path from service_k8s_cluster mount configuration + ansible.builtin.set_fact: + k8s_client_share_path: "{{ k8s_nfs_storage_details.mount_point }}" + k8s_server_share_path: "{{ k8s_nfs_source_parts[1] if k8s_nfs_source_parts | length > 1 else '' }}" + k8s_server_ip: "{{ k8s_nfs_source_parts[0] if k8s_nfs_source_parts | length > 0 else '' }}" + k8s_mount_options: "{{ k8s_nfs_storage_details.mnt_opts }}" cacheable: true + when: k8s_nfs_storage_details.source is defined From 93ab4f05059d7cbb6a01937f6e0befb64395fdc6 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 06:57:30 +0530 Subject: [PATCH 26/63] update storage_config variable --- .../tasks/validate_telemetry_config.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml index 019102b505..9754a814a2 100644 --- a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml +++ b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml @@ -40,6 +40,11 @@ k8s_nfs_storage_name: "{{ service_cluster_info.nfs_storage_name }}" cacheable: true +- name: Load storage_config.yml + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['input_project_dir'] }}/storage_config.yml" + name: storage_config + - name: Find matching NFS mount ansible.builtin.set_fact: k8s_nfs_storage_details: "{{ (storage_config.mounts | selectattr('name', 'equalto', k8s_nfs_storage_name) | list | first) | default({}) }}" From 8acbd412a9c2fd3fe7c4a177ad66691e9efbf3cf Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 07:02:55 +0530 Subject: [PATCH 27/63] fix for k8s_nfs_server_path undefined variable --- 
.../provision_validations/tasks/validate_telemetry_config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml index 9754a814a2..9bd54a52b0 100644 --- a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml +++ b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml @@ -61,5 +61,6 @@ k8s_server_share_path: "{{ k8s_nfs_source_parts[1] if k8s_nfs_source_parts | length > 1 else '' }}" k8s_server_ip: "{{ k8s_nfs_source_parts[0] if k8s_nfs_source_parts | length > 0 else '' }}" k8s_mount_options: "{{ k8s_nfs_storage_details.mnt_opts }}" + k8s_nfs_server_path: "{{ k8s_nfs_storage_details.source }}" cacheable: true when: k8s_nfs_storage_details.source is defined From 2ae3050e934ec7c2747a8cca0c485c311f5df54d Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 07:13:46 +0530 Subject: [PATCH 28/63] fix for kustomization error --- .../roles/telemetry/templates/telemetry/kustomization.yaml.j2 | 1 - 1 file changed, 1 deletion(-) diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index a99b501ce6..09cc0ae1e6 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -20,7 +20,6 @@ resources: # VMScrape CR (native operator-based service/pod discovery for metrics) - victoria-operator-vmscrape.yaml {% endif %} -{% endif %} {% if victoria_logs_support | default(false) %} # victoria_logs Resources (Logs Only) {% if victoria_cluster.tls_enabled | default(false) and not victoria_metrics_support | default(false) %} From 901b7d171fdf525ecbf820f5355bffa9f2185334 Mon Sep 17 00:00:00 2001 From: priti-parate 
<140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 08:11:38 +0530 Subject: [PATCH 29/63] remove powerscale syslog configuration --- ...p-service_kube_control_plane_first_x86_64.yaml.j2 | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 875e47bd56..75e89f20d2 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -1079,18 +1079,6 @@ /root/telemetry.sh {% endif %} -{% if powerscale_log_enabled | default(false) | bool %} - echo "===== Configuring PowerScale Syslog Forwarding =====" - if [ -f "{{ k8s_client_mount_path }}/telemetry/deployments/configure_powerscale_syslog.sh" ]; then - echo "Running PowerScale syslog configuration script..." - bash "{{ k8s_client_mount_path }}/telemetry/deployments/configure_powerscale_syslog.sh" 2>&1 - echo "PowerScale syslog configuration completed (exit code: $?)" - else - echo "WARNING: PowerScale syslog configuration script not found." - echo "Ensure deploy_powerscale_logs.yml ran successfully during provisioning." 
- fi -{% endif %} - echo "Rollout and Restart coredns" kubectl rollout restart deployment coredns -n kube-system sleep 30 From 90f36ab94741699d0a77bfd7e4bfc6495de7f1f6 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 13:53:18 +0530 Subject: [PATCH 30/63] support for external health monitor metrics --- .../tasks/validate_telemetry_config.yml | 37 +++++++++++++++++++ .../roles/provision_validations/vars/main.yml | 4 ++ .../victoria-operator-vmscrape.yaml.j2 | 32 ++++++++++++++++ 3 files changed, 73 insertions(+) diff --git a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml index 9bd54a52b0..2466ee6ea1 100644 --- a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml +++ b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml @@ -64,3 +64,40 @@ k8s_nfs_server_path: "{{ k8s_nfs_storage_details.source }}" cacheable: true when: k8s_nfs_storage_details.source is defined + +- name: Read CSI PowerScale driver values.yaml to detect Volume Health Monitoring + when: hostvars['localhost']['csi_driver_powerscale_support'] | default(false) | bool + block: + - name: Check if CSI PowerScale values file exists + ansible.builtin.stat: + path: "{{ service_cluster_info.csi_powerscale_driver_values_file_path }}" + register: csi_values_file_stat + when: service_cluster_info.csi_powerscale_driver_values_file_path is defined + + - name: Load CSI PowerScale values.yaml + ansible.builtin.include_vars: + file: "{{ service_cluster_info.csi_powerscale_driver_values_file_path }}" + name: csi_powerscale_values + when: + - service_cluster_info.csi_powerscale_driver_values_file_path is defined + - csi_values_file_stat.stat.exists | default(false) + + - name: Set Volume Health Monitoring status from CSI driver values + ansible.builtin.set_fact: + powerscale_volume_health_enabled: "{{ 
csi_powerscale_values.node.healthMonitor.enabled | default(false) | bool }}" + cacheable: true + when: + - csi_powerscale_values is defined + + - name: Set Volume Health Monitoring to false if not configured + ansible.builtin.set_fact: + powerscale_volume_health_enabled: false + cacheable: true + when: powerscale_volume_health_enabled is not defined + + - name: Warn if PowerScale metrics enabled but Volume Health Monitoring is disabled + when: + - telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool + - not powerscale_volume_health_enabled | default(false) | bool + ansible.builtin.debug: + msg: "{{ warning_powerscale_volume_health_disabled }}" diff --git a/provision/roles/provision_validations/vars/main.yml b/provision/roles/provision_validations/vars/main.yml index 26cc71092a..901e914db7 100644 --- a/provision/roles/provision_validations/vars/main.yml +++ b/provision/roles/provision_validations/vars/main.yml @@ -73,6 +73,10 @@ warning_idrac_telemetry_support_true: | Make sure that Redfish is enabled and the iDRAC has a datacenter license. Also, ensure that the firmware version is greater than 4 for iDRAC9 or greater than 1 for iDRAC10." +warning_powerscale_volume_health_disabled: | + WARNING: PowerScale metrics collection is enabled, but Volume Health Monitoring is disabled in CSI driver. + To enable Volume Health Monitoring, set node.healthMonitor.enabled: true in CSI driver values.yaml. + Volume Health Monitoring exposes capacity metrics (used/free capacity, used/free inodes) via Kubernetes metrics API. 
pause_time_15: 15 bmc_group_data_filename: "/opt/omnia/telemetry/bmc_group_data.csv" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 index aca205f4b6..f4a71fdc81 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 @@ -82,3 +82,35 @@ spec: - targetLabel: component replacement: otel-collector {% endif %} + +--- +{% if hostvars['localhost']['powerscale_volume_health_enabled'] | default(false) %} +# VMNodeScrape for Kubelet Volume Health Monitoring metrics +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMNodeScrape +metadata: + name: kubelet-volume-health + namespace: {{ telemetry_namespace }} +spec: + # Scrape interval + interval: {{ vmagent.global.scrape_interval }} + + # Kubelet HTTPS endpoint + scheme: https + + # TLS configuration with proper CA verification + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + + # Bearer token for authentication + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Relabel configs + relabelConfigs: + - sourceLabels: [__meta_kubernetes_node_name] + targetLabel: node + - targetLabel: source + replacement: powerscale + - targetLabel: component + replacement: volume-health +{% endif %} From e173f4f7b3d8e65da299264d7f230c2e1cd316c2 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 15:10:13 +0530 Subject: [PATCH 31/63] CSM authorization support --- .../rhel/10.0/csi_driver_powerscale.json | 5 + input/omnia_config.yml | 16 +- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 8 + .../templates/powerscale/csm_role_cr.yaml.j2 | 10 + .../powerscale/csm_storage_cr.yaml.j2 | 10 + .../powerscale/csm_tenant_cr.yaml.j2 | 15 + 
.../powerscale/deploy_csm_authorization.sh.j2 | 317 ++++++++++++++++++ .../verify_powerscale_authorization.sh.j2 | 205 +++++++++++ .../tasks/create_k8s_config_nfs.yml | 10 +- .../tasks/deploy_powerscale_authorization.yml | 95 ++++++ provision/roles/k8s_config/vars/main.yml | 42 +++ 11 files changed, 718 insertions(+), 15 deletions(-) create mode 100644 provision/roles/configure_ochami/templates/powerscale/csm_role_cr.yaml.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/csm_storage_cr.yaml.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/csm_tenant_cr.yaml.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/verify_powerscale_authorization.sh.j2 create mode 100644 provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml diff --git a/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json b/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json index 6d6abf76c6..7d45d935a8 100644 --- a/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json +++ b/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json @@ -128,6 +128,11 @@ "package": "docker.io/dellemc/csm-encryption", "tag": "v0.6.0", "type": "image" + }, + { + "package": "dellctl", + "url": "https://github.com/dell/dellctl/releases/download/v1.7.0/dellctl-linux-amd64.tar.gz", + "type": "tarball" } ] } diff --git a/input/omnia_config.yml b/input/omnia_config.yml index 4eef108cc7..d25fea2d60 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -182,31 +182,21 @@ slurm_cluster: # enabled: Set to true to enable PowerScale CSM Authorization (default: false). # csm_authorization_values_file_path: Absolute file path for the CSM Authorization values.yaml file. # Required when enabled is true. 
+# Download from: https://github.com/dell/helm-charts/blob/main/charts/csm-authorization-v2.0/values.yaml # tenants: List of tenant configurations (at least one tenant required when enabled). # name: Tenant name (alphanumeric, hyphens, underscores only, e.g., "team-omnia"). # roles: List of roles for this tenant (at least one role required). # name: Role name (alphanumeric, hyphens, underscores only, e.g., "role-omnia"). # storage_pool: PowerScale storage pool path (must start with /ifs, e.g., "/ifs/data/csi/team-omnia"). # IMPORTANT: This path must already exist on the PowerScale cluster. -# Omnia and CSI driver will NOT create this path automatically. # quota_limit: Storage quota limit for this role (e.g., "200Gi", "1Ti", "500Mi"). -# # Prerequisites for enabling PowerScale Authorization: # 1. csi_driver_powerscale must be present in software_config.json -# 2. Service cluster nodes (service_kube_node_*, service_kube_control_plane_*) must be defined in PXE mapping -# 3. All three file paths must be provided and files must exist: +# 2. Service cluster nodes must be defined in PXE mapping +# 3. File paths must be provided and files must exist: # - csi_powerscale_driver_secret_file_path # - csi_powerscale_driver_values_file_path # - csm_authorization_values_file_path -# 4. 
Image versions in csm_authorization_values_file_path must match the versions in input/config/x86_64/rhel/10.0/csi_driver_powerscale.json -# The following CSM Authorization images are validated: -# - quay.io/dell/container-storage-modules/csm-authorization-proxy -# - quay.io/dell/container-storage-modules/csm-authorization-tenant -# - quay.io/dell/container-storage-modules/csm-authorization-role -# - quay.io/dell/container-storage-modules/csm-authorization-storage -# - quay.io/dell/container-storage-modules/csm-authorization-controller -# - quay.io/dell/container-storage-modules/csm-authorization-sidecar -# Note: These images are already included in csi_driver_powerscale.json # - k8s_crio_storage_size: Specifies the disk size allocated for CRI-O container storage. # This storage is used to store container images, writable layers, and runtime data. diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 75e89f20d2..b01449bcfd 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -976,6 +976,14 @@ echo "Done updating poll rate." +{% if hostvars['localhost']['service_cluster_info'].powerscale_authorization.enabled | default(false) | bool %} +{% include 'powerscale/deploy_csm_authorization.sh.j2' %} +{% else %} + echo "INFO: CSM Authorization is DISABLED. CSI PowerScale will be deployed in direct mode (no multi-tenant RBAC)." + echo " To enable authorization, set powerscale_authorization.enabled: true in omnia_config.yml" + echo " and configure tenants, roles, storage pools, and quota limits." 
+{% endif %} + if [ "$POWERSCALE_DEPLOYMENT_FAILED" -eq 0 ]; then echo "===== Running CSI PowerScale installation script =====" INSTALL_SCRIPT="/opt/omnia/csi-driver-powerscale/csi-powerscale/dell-csi-helm-installer/csi-install.sh" diff --git a/provision/roles/configure_ochami/templates/powerscale/csm_role_cr.yaml.j2 b/provision/roles/configure_ochami/templates/powerscale/csm_role_cr.yaml.j2 new file mode 100644 index 0000000000..89bc9dd363 --- /dev/null +++ b/provision/roles/configure_ochami/templates/powerscale/csm_role_cr.yaml.j2 @@ -0,0 +1,10 @@ +apiVersion: csm-authorization.storage.dell.com/v1alpha1 +kind: CSMRole +metadata: + name: {{ role.name }} + namespace: {{ authz_namespace }} +spec: + systemID: {{ cluster_name }} + systemType: isilon + pool: {{ role.storage_pool }} + quota: {{ role.quota_limit }} diff --git a/provision/roles/configure_ochami/templates/powerscale/csm_storage_cr.yaml.j2 b/provision/roles/configure_ochami/templates/powerscale/csm_storage_cr.yaml.j2 new file mode 100644 index 0000000000..9fdd557cc4 --- /dev/null +++ b/provision/roles/configure_ochami/templates/powerscale/csm_storage_cr.yaml.j2 @@ -0,0 +1,10 @@ +apiVersion: csm-authorization.storage.dell.com/v1alpha1 +kind: Storage +metadata: + name: powerscale-{{ cluster_name }} + namespace: {{ authz_namespace }} +spec: + type: isilon + endpoint: {{ endpoint }}:{{ port }} + systemID: {{ cluster_name }} + skipCertificateValidation: true diff --git a/provision/roles/configure_ochami/templates/powerscale/csm_tenant_cr.yaml.j2 b/provision/roles/configure_ochami/templates/powerscale/csm_tenant_cr.yaml.j2 new file mode 100644 index 0000000000..e05620b3f2 --- /dev/null +++ b/provision/roles/configure_ochami/templates/powerscale/csm_tenant_cr.yaml.j2 @@ -0,0 +1,15 @@ +apiVersion: csm-authorization.storage.dell.com/v1alpha1 +kind: CSMTenant +metadata: + name: {{ tenant.name }} + namespace: {{ authz_namespace }} +spec: +{% if tenant.roles | default([]) %} + roles: +{% for role in tenant.roles %} + - 
{{ role.name }} +{% endfor %} +{% else %} + roles: [] +{% endif %} + revoke: false diff --git a/provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 new file mode 100644 index 0000000000..61b67d3acd --- /dev/null +++ b/provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 @@ -0,0 +1,317 @@ +{# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#} +{# CSM Authorization Proxy Server + CRs + Token + Sidecar (cloud-init fragment) + This template is included by ci-group-service_kube_control_plane_first_x86_64.yaml.j2 + when powerscale_authorization.enabled is true. 
+ + Deployment order (must run BEFORE csi-install.sh): + P1: Create authorization namespace + P2: Helm install/upgrade CSM Authorization Proxy Server + P3: Wait for proxy server pods (max 5 min) + P4: Extract original endpoint/systemID from secret.yaml + P5: Apply Storage CR (from pre-rendered csm-storage-cr.yaml) + P6: Apply Role CRs (from pre-rendered csm-role-cr-*.yaml files) + P7: Apply Tenant CRs (from pre-rendered csm-tenant-cr-*.yaml files) + P8: Generate token via dellctl + P9: Apply token to isilon namespace + A1: Create proxy-server-root-certificate secret + A2: Build + create karavi-authorization-config secret + A3: Modify secret.yaml (endpoint → localhost:9400, add mountEndpoint) + A4: Re-create isilon-creds secret with sidecar configuration + A5: Modify CSI driver values.yaml (authorization.enabled: true) +#} +{% set authz_skip_cert = hostvars['localhost']['service_cluster_info'].powerscale_authorization.get('authorization_skip_certificate_validation', true) | default(true) %} +{% set authz_tenants = hostvars['localhost']['service_cluster_info'].powerscale_authorization.tenants | default([]) %} + # ============================================================ + # CSM Authorization Proxy Server Deployment (Steps P1-P3) + # ============================================================ + if [ "$POWERSCALE_DEPLOYMENT_FAILED" -eq 0 ]; then + echo "===== CSM Authorization: Deploying Proxy Server =====" + AUTHZ_NS="authorization" + AUTHZ_CHART_PATH="/opt/omnia/csi-driver-powerscale/csi-powerscale/helm-charts/charts/csm-authorization-v2.0" + AUTHZ_VALUES_FILE="/opt/omnia/csi-driver-powerscale/csm-authorization/csm-authorization-values.yaml" + + # Step P1: Create authorization namespace + kubectl create namespace "${AUTHZ_NS}" --dry-run=client -o yaml | kubectl apply -f - || { + echo "WARNING: Failed to create authorization namespace." 
+ } + + # Step P2: Helm install/upgrade CSM Authorization Proxy Server + if [ -d "$AUTHZ_CHART_PATH" ] && [ -f "$AUTHZ_VALUES_FILE" ]; then + if helm list -n "${AUTHZ_NS}" --filter csm-authorization -q 2>/dev/null | grep -q csm-authorization; then + echo "Upgrading existing CSM Authorization release..." + helm upgrade csm-authorization "$AUTHZ_CHART_PATH" \ + -n "${AUTHZ_NS}" -f "$AUTHZ_VALUES_FILE" --wait --timeout 10m || { + echo "ERROR: Helm upgrade for CSM Authorization failed." + POWERSCALE_DEPLOYMENT_FAILED=1 + } + else + echo "Fresh install of CSM Authorization Proxy Server..." + helm install csm-authorization "$AUTHZ_CHART_PATH" \ + -n "${AUTHZ_NS}" -f "$AUTHZ_VALUES_FILE" --wait --timeout 10m || { + echo "ERROR: Helm install for CSM Authorization failed." + POWERSCALE_DEPLOYMENT_FAILED=1 + } + fi + else + echo "ERROR: CSM Authorization Helm chart or values file not found." + echo " Chart path: $AUTHZ_CHART_PATH" + echo " Values file: $AUTHZ_VALUES_FILE" + POWERSCALE_DEPLOYMENT_FAILED=1 + fi + + # Step P3: Wait for proxy server pods (max 5 min) + if [ "$POWERSCALE_DEPLOYMENT_FAILED" -eq 0 ]; then + echo "Waiting for CSM Authorization Proxy Server pods..." + MAX_ATTEMPTS=30; WAIT_TIME=10 + for ((i=1; i<=MAX_ATTEMPTS; i++)); do + NOT_READY=$(kubectl get pods -n "${AUTHZ_NS}" --no-headers 2>/dev/null | grep -vE 'Running|Completed' | wc -l) + TOTAL=$(kubectl get pods -n "${AUTHZ_NS}" --no-headers 2>/dev/null | wc -l) + if [ "$NOT_READY" -eq 0 ] && [ "$TOTAL" -gt 0 ]; then + echo "All CSM Authorization Proxy Server pods are running." + break + fi + echo "[$i/$MAX_ATTEMPTS] Waiting... 
(${NOT_READY} not ready)" + sleep $WAIT_TIME + done + fi + fi + + # ============================================================ + # CSM Authorization CRs + Token Generation (Steps P4-P9) + # ============================================================ + if [ "$POWERSCALE_DEPLOYMENT_FAILED" -eq 0 ]; then + echo "===== CSM Authorization: Creating CRs and Generating Token =====" + SECRET_FILE="/opt/omnia/csi-driver-powerscale/secret.yaml" + + # Step P4: Extract original endpoint/systemID from secret.yaml (before modification) + ORIGINAL_ENDPOINT=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'endpoint:' | head -1 | awk -F'"' '{print $2}') + ORIGINAL_PORT=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'endpointPort:' | head -1 | awk '{print $2}') + CLUSTER_NAME=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'clusterName:' | head -1 | awk -F'"' '{print $2}') + [ -z "$ORIGINAL_PORT" ] && ORIGINAL_PORT="8080" + [ -z "$CLUSTER_NAME" ] && CLUSTER_NAME="cluster1" + echo "PowerScale endpoint: ${ORIGINAL_ENDPOINT}:${ORIGINAL_PORT}, systemID: ${CLUSTER_NAME}" + + # Step P5: Create Storage CR + echo "Creating Storage CR for PowerScale array..." + if [ -f "/opt/omnia/csi-driver-powerscale/csm-storage-cr.yaml" ]; then + kubectl apply -f /opt/omnia/csi-driver-powerscale/csm-storage-cr.yaml + echo " Storage CR created" + else + echo " WARNING: csm-storage-cr.yaml not found, skipping Storage CR creation" + fi + + # Step P6: Create Role CRs (from pre-rendered files) + echo "Creating Role CRs..." + for role_file in /opt/omnia/csi-driver-powerscale/csm-role-cr-*.yaml; do + if [ -f "$role_file" ]; then + kubectl apply -f "$role_file" + echo " Role CR applied from $(basename $role_file)" + fi + done + + # Step P7: Create Tenant CRs (from pre-rendered files) + echo "Creating Tenant CRs..." 
+ for tenant_file in /opt/omnia/csi-driver-powerscale/csm-tenant-cr-*.yaml; do + if [ -f "$tenant_file" ]; then + kubectl apply -f "$tenant_file" + echo " Tenant CR applied from $(basename $tenant_file)" + fi + done + + # Wait for CRDs to be processed by authorization controller + echo "Waiting for CRs to be processed..." + sleep 15 + + # Step P8: Create karavi-config-secret with JWT signing secret + JWT_SECRET_FILE="/opt/omnia/csi-driver-powerscale/csm-authorization/authz_jwt_secret.env" + AUTHZ_JWT_SECRET="" + if [ -f "$JWT_SECRET_FILE" ]; then + source "$JWT_SECRET_FILE" + echo "JWT signing secret loaded." + else + echo "WARNING: JWT secret file not found. Generating random secret..." + AUTHZ_JWT_SECRET=$(head -c 32 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 32) + fi + + # Create config.yaml for karavi-config-secret + cat > /tmp/karavi-config.yaml </dev/null | grep -i proxy | head -1) + if [ -z "$PROXY_SVC" ]; then + PROXY_SVC=$(kubectl get svc -n "${AUTHZ_NS}" -o name 2>/dev/null | head -1) + fi + + if [ -n "$PROXY_SVC" ]; then + echo "Starting port-forward to proxy server..." + kubectl port-forward -n "${AUTHZ_NS}" "${PROXY_SVC}" 9443:443 &>/dev/null & + PF_PID=$! + sleep 5 + +{% for tenant in authz_tenants %} + echo "Generating token for tenant '{{ tenant.name }}'..." + "$DELLCTL_BIN" admin token generate \ + --admin-name "admin" \ + --jwt-signing-secret "$AUTHZ_JWT_SECRET" \ + --tenant {{ tenant.name }} \ + --access-token-expiration 30m \ + --refresh-token-expiration 720h \ + --addr "localhost:9443" \ + --insecure true \ + --output "${TOKEN_FILE}" 2>/dev/null && { + TOKEN_GENERATED=1 + echo " Token generated for tenant '{{ tenant.name }}'." + } || { + echo " WARNING: dellctl token generation failed for tenant '{{ tenant.name }}'." + } +{% endfor %} + + # Kill port-forward + kill $PF_PID 2>/dev/null || true + wait $PF_PID 2>/dev/null || true + else + echo "WARNING: No proxy server service found for port-forward." + fi + else + [ ! 
-f "$DELLCTL_BIN" ] && echo "WARNING: dellctl binary not found at ${DELLCTL_BIN}." + echo " Token generation skipped. Manual steps required after deployment:" +{% for tenant in authz_tenants %} + echo " dellctl admin token generate --admin-name admin --jwt-signing-secret --tenant {{ tenant.name }} --access-token-expiration 30m --refresh-token-expiration 720h --addr --insecure true --output /tmp/token.yaml" + echo " kubectl apply -f /tmp/token.yaml -n isilon" +{% endfor %} + fi + + # Step P10: Apply token to isilon namespace + if [ "$TOKEN_GENERATED" -eq 1 ] && [ -f "$TOKEN_FILE" ]; then + echo "Applying authorization token to isilon namespace..." + kubectl apply -f "$TOKEN_FILE" -n isilon || { + echo "WARNING: Failed to apply token. Manual step required:" + echo " kubectl apply -f ${TOKEN_FILE} -n isilon" + } + rm -f "$TOKEN_FILE" + fi + + # Cleanup: remove JWT secret from NFS (sensitive) + rm -f "$JWT_SECRET_FILE" + fi + + # ============================================================ + # CSM Authorization Sidecar Configuration (Steps A1-A5) + # ============================================================ + if [ "$POWERSCALE_DEPLOYMENT_FAILED" -eq 0 ]; then + echo "===== CSM Authorization: Configuring Sidecar =====" + AUTHZ_DIR="/opt/omnia/csi-driver-powerscale/csm-authorization" + SECRET_FILE="/opt/omnia/csi-driver-powerscale/secret.yaml" + VALUES_FILE="/opt/omnia/csi-driver-powerscale/values.yaml" + + # Re-read original values (P4 already extracted these but re-extract for safety) + ORIGINAL_ENDPOINT=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'endpoint:' | head -1 | awk -F'"' '{print $2}') + ORIGINAL_PORT=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'endpointPort:' | head -1 | awk '{print $2}') + CLUSTER_NAME=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'clusterName:' | head -1 | awk -F'"' '{print $2}') + [ -z "$ORIGINAL_PORT" ] && ORIGINAL_PORT="8080" + [ -z "$CLUSTER_NAME" ] && CLUSTER_NAME="cluster1" + + # Step A1: Create 
proxy-server-root-certificate secret + ROOT_CERT_FILE="${AUTHZ_DIR}/rootCertificate.pem" + if [ -f "$ROOT_CERT_FILE" ] && [ -s "$ROOT_CERT_FILE" ]; then + echo "Creating proxy-server-root-certificate secret (secure mode)..." + kubectl -n isilon create secret generic proxy-server-root-certificate \ + --from-file=rootCertificate.pem="$ROOT_CERT_FILE" \ + -o yaml --dry-run=client | kubectl apply -f - + else + echo "Creating proxy-server-root-certificate secret (insecure mode - empty cert)..." + kubectl -n isilon create secret generic proxy-server-root-certificate \ + --from-literal=rootCertificate.pem= \ + -o yaml --dry-run=client | kubectl apply -f - + fi + + # Step A2: Create karavi-authorization-config secret + echo "Building karavi-authorization-config from CSI driver secret..." + printf '%s\n' '{' > /tmp/karavi-authorization-config.json + printf '%s\n' ' "storages": [' >> /tmp/karavi-authorization-config.json + printf '%s\n' ' {' >> /tmp/karavi-authorization-config.json + printf '%s\n' ' "username": "ignored",' >> /tmp/karavi-authorization-config.json + printf '%s\n' ' "password": "ignored",' >> /tmp/karavi-authorization-config.json + printf ' "intendedEndpoint": "%s:%s"\n' "${ORIGINAL_ENDPOINT}" "${ORIGINAL_PORT}" >> /tmp/karavi-authorization-config.json + printf '%s\n' ' "endpoint": "https://localhost:9400",' >> /tmp/karavi-authorization-config.json + printf ' "systemID": "%s"\n' "${CLUSTER_NAME}" >> /tmp/karavi-authorization-config.json + printf '%s\n' ' "skipCertificateValidation": true,' >> /tmp/karavi-authorization-config.json + printf '%s\n' ' "isDefault": true' >> /tmp/karavi-authorization-config.json + printf '%s\n' ' }' >> /tmp/karavi-authorization-config.json + printf '%s\n' ' ]' >> /tmp/karavi-authorization-config.json + printf '%s\n' '}' >> /tmp/karavi-authorization-config.json + + kubectl -n isilon create secret generic karavi-authorization-config \ + --from-file=config=/tmp/karavi-authorization-config.json \ + -o yaml --dry-run=client | kubectl 
apply -f - + rm -f /tmp/karavi-authorization-config.json + + # Step A3: Modify secret.yaml for sidecar communication + echo "Modifying CSI driver secret for authorization sidecar..." + ORIGINAL_HOST=$(echo "$ORIGINAL_ENDPOINT" | sed -E 's#https?://##' | sed -E 's#:[0-9]+.*##' | sed -E 's#/.*##') + + sed -i 's/^\([[:space:]]*\)endpoint:.*/\1endpoint: "https:\/\/localhost"/' "$SECRET_FILE" + sed -i 's/^\([[:space:]]*\)endpointPort:.*/\1endpointPort: 9400/' "$SECRET_FILE" + + if ! grep -q 'mountEndpoint:' "$SECRET_FILE"; then + sed -i "/endpointPort:/a\\ mountEndpoint: ${ORIGINAL_HOST}" "$SECRET_FILE" + else + sed -i "s/^\([[:space:]]*\)mountEndpoint:.*/\1mountEndpoint: ${ORIGINAL_HOST}/" "$SECRET_FILE" + fi + + sed -i 's/^\([[:space:]]*\)skipCertificateValidation:.*/\1skipCertificateValidation: true/' "$SECRET_FILE" + + # Step A4: Re-create isilon-creds secret + kubectl delete secret isilon-creds -n isilon 2>/dev/null || true + kubectl create secret generic isilon-creds -n isilon --from-file=config="$SECRET_FILE" + echo "isilon-creds secret re-created with sidecar configuration." + + # Step A5: Modify CSI driver values.yaml to enable authorization + echo "Enabling authorization in CSI driver values.yaml..." + if grep -q 'authorization:' "$VALUES_FILE"; then + sed -i '/^authorization:/,/^[^ ]/ { + s/^\([[:space:]]*\)enabled:.*/\1enabled: true/ + }' "$VALUES_FILE" + else + echo "" >> "$VALUES_FILE" + echo "authorization:" >> "$VALUES_FILE" + echo " enabled: true" >> "$VALUES_FILE" + fi + + if grep -q 'skipCertificateValidation:' "$VALUES_FILE"; then + sed -i '/^authorization:/,/^[^ ]/ { + s/^\([[:space:]]*\)skipCertificateValidation:.*/\1skipCertificateValidation: {{ authz_skip_cert | lower }}/ + }' "$VALUES_FILE" + fi + + echo "CSM Authorization sidecar configuration complete." 
+ fi diff --git a/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_authorization.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_authorization.sh.j2 new file mode 100644 index 0000000000..ad47ddf70b --- /dev/null +++ b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_authorization.sh.j2 @@ -0,0 +1,205 @@ +#!/bin/bash +{# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#} + +# CSM Authorization Proxy Server Verification Script +# Generated by Omnia provision playbook +# Validates CSM Authorization deployment, CRs, token, and sidecar + +{% set authz_tenants = hostvars['localhost']['service_cluster_info'].powerscale_authorization.tenants | default([]) %} +{% set authz_admin_name = hostvars['localhost']['service_cluster_info'].powerscale_authorization.admin_name | default('') %} +{% set authz_admin_secret = hostvars['localhost']['service_cluster_info'].powerscale_authorization.admin_secret | default('') %} +AUTHZ_NS="authorization" +ISILON_NS="isilon" +PASS=0 +FAIL=0 +WARN=0 + +echo "==============================================" +echo " CSM Authorization Verification" +echo "==============================================" +echo "" + +# 1. Check Kubernetes connectivity +echo "[1/9] Checking Kubernetes connectivity..." 
+if kubectl cluster-info &>/dev/null; then + echo " PASS: Kubernetes cluster is reachable" + ((PASS++)) +else + echo " FAIL: Cannot connect to Kubernetes cluster" + ((FAIL++)) +fi + +# 2. Check authorization namespace +echo "[2/9] Checking authorization namespace..." +if kubectl get namespace "$AUTHZ_NS" &>/dev/null; then + echo " PASS: Namespace '$AUTHZ_NS' exists" + ((PASS++)) +else + echo " FAIL: Namespace '$AUTHZ_NS' does not exist" + ((FAIL++)) +fi + +# 3. Check CSM Authorization Helm release +echo "[3/9] Checking CSM Authorization Helm release..." +AUTHZ_HELM_STATUS=$(helm status csm-authorization -n "${AUTHZ_NS}" -o json 2>/dev/null | grep -o '"status":"[^"]*"' | head -1) +if echo "$AUTHZ_HELM_STATUS" | grep -q "deployed"; then + echo " PASS: Helm release 'csm-authorization' is deployed" + ((PASS++)) +else + echo " FAIL: Helm release 'csm-authorization' not found or not deployed" + ((FAIL++)) +fi + +# 4. Check proxy server pods +echo "[4/9] Checking CSM Authorization Proxy Server pods..." +AUTHZ_PODS=$(kubectl get pods -n "${AUTHZ_NS}" --no-headers 2>/dev/null | wc -l) +AUTHZ_RUNNING=$(kubectl get pods -n "${AUTHZ_NS}" --no-headers 2>/dev/null | awk '$3=="Running"' | wc -l) +if [ "$AUTHZ_PODS" -gt 0 ] && [ "$AUTHZ_RUNNING" -eq "$AUTHZ_PODS" ]; then + echo " PASS: All proxy server pods running (${AUTHZ_RUNNING}/${AUTHZ_PODS})" + ((PASS++)) +elif [ "$AUTHZ_PODS" -gt 0 ]; then + echo " FAIL: Some proxy server pods not running (${AUTHZ_RUNNING}/${AUTHZ_PODS})" + kubectl get pods -n "${AUTHZ_NS}" --no-headers 2>/dev/null | awk '$3!="Running" && $3!="Completed" {print " " $1 " - " $3}' + ((FAIL++)) +else + echo " FAIL: No proxy server pods found in namespace '${AUTHZ_NS}'" + ((FAIL++)) +fi + +# 5. Check authorization CRs (Storage, Role, Tenant) +echo "[5/9] Checking Authorization Custom Resources..." 
+CR_PASS=0 +STORAGE_COUNT=$(kubectl get storage -n "${AUTHZ_NS}" --no-headers 2>/dev/null | wc -l) +if [ "$STORAGE_COUNT" -gt 0 ]; then + echo " PASS: ${STORAGE_COUNT} Storage CR(s) found" + CR_PASS=$((CR_PASS + 1)) +else + echo " FAIL: No Storage CRs found" +fi + +ROLE_COUNT=$(kubectl get csmrole -n "${AUTHZ_NS}" --no-headers 2>/dev/null | wc -l) +if [ "$ROLE_COUNT" -gt 0 ]; then + echo " PASS: ${ROLE_COUNT} CSMRole CR(s) found" + CR_PASS=$((CR_PASS + 1)) +else + echo " FAIL: No CSMRole CRs found" +fi + +TENANT_COUNT=$(kubectl get csmtenant -n "${AUTHZ_NS}" --no-headers 2>/dev/null | wc -l) +if [ "$TENANT_COUNT" -gt 0 ]; then + echo " PASS: ${TENANT_COUNT} CSMTenant CR(s) found" + CR_PASS=$((CR_PASS + 1)) +else + echo " FAIL: No CSMTenant CRs found" +fi + +if [ "$CR_PASS" -eq 3 ]; then + ((PASS++)) +else + ((FAIL++)) +fi + +# 6. Check authorization secrets in isilon namespace +echo "[6/9] Checking authorization secrets in isilon namespace..." +AUTHZ_SECRET_COUNT=0 +for secret_name in karavi-authorization-config proxy-server-root-certificate; do + if kubectl get secret "$secret_name" -n "$ISILON_NS" &>/dev/null; then + echo " PASS: Secret '$secret_name' exists" + AUTHZ_SECRET_COUNT=$((AUTHZ_SECRET_COUNT + 1)) + else + echo " FAIL: Secret '$secret_name' NOT found" + fi +done +if [ "$AUTHZ_SECRET_COUNT" -eq 2 ]; then + ((PASS++)) +else + ((FAIL++)) +fi + +# 7. Check proxy-authz-tokens secret (token applied) +echo "[7/9] Checking authorization token..." +if kubectl get secret proxy-authz-tokens -n "$ISILON_NS" &>/dev/null; then + echo " PASS: Token secret 'proxy-authz-tokens' exists in '$ISILON_NS'" + ((PASS++)) +else + echo " WARN: Token secret 'proxy-authz-tokens' NOT found in '$ISILON_NS'" + echo " Token may not have been generated yet. 
Manual steps:" +{% for tenant in authz_tenants %} + echo " dellctl admin token generate --admin-name {{ authz_admin_name }} --admin-secret {{ authz_admin_secret }} --tenant {{ tenant.name }} --access-token-expiration 30m --refresh-token-expiration 720h --addr --insecure true --output /tmp/token.yaml" +{% endfor %} + echo " kubectl apply -f /tmp/token.yaml -n $ISILON_NS" + ((WARN++)) +fi + +# 8. Check authorization sidecar in CSI driver pods +echo "[8/9] Checking authorization sidecar in CSI driver pods..." +SIDECAR_FOUND=$(kubectl get pods -n "$ISILON_NS" -l app=isilon-controller \ + -o jsonpath='{.items[0].spec.containers[*].name}' 2>/dev/null \ + | tr ' ' '\n' | grep -c "karavi-authorization-proxy") +if [ "$SIDECAR_FOUND" -gt 0 ]; then + echo " PASS: Authorization sidecar container found in CSI driver pods" + ((PASS++)) +else + echo " FAIL: Authorization sidecar container NOT found in CSI driver pods" + echo " Expected container: karavi-authorization-proxy" + echo " Actual containers:" + kubectl get pods -n "$ISILON_NS" -l app=isilon-controller \ + -o jsonpath='{.items[0].spec.containers[*].name}' 2>/dev/null | tr ' ' '\n' | sed 's/^/ /' + ((FAIL++)) +fi + +# 9. Functional test - check CSI driver can communicate via sidecar +echo "[9/9] Checking sidecar health..." 
+SIDECAR_LOG=$(kubectl logs -n "$ISILON_NS" -l app=isilon-controller -c karavi-authorization-proxy --tail=5 2>/dev/null) +if [ -n "$SIDECAR_LOG" ]; then + if echo "$SIDECAR_LOG" | grep -qi "error\|fatal\|panic"; then + echo " WARN: Sidecar has error logs (may need token)" + ((WARN++)) + else + echo " PASS: Sidecar running without critical errors" + ((PASS++)) + fi +else + echo " WARN: No sidecar logs available" + ((WARN++)) +fi + +# Summary +echo "" +echo "==============================================" +echo " Authorization Verification Summary" +echo "==============================================" +echo " PASSED: $PASS" +echo " FAILED: $FAIL" +echo " WARNINGS: $WARN" +echo "" + +if [ $FAIL -eq 0 ]; then + echo " STATUS: ALL CHECKS PASSED" +else + echo " STATUS: SOME CHECKS FAILED" + echo "" + echo " Debug Commands:" + echo " kubectl get pods -n $AUTHZ_NS" + echo " kubectl get storage,csmrole,csmtenant -n $AUTHZ_NS" + echo " kubectl get secret -n $ISILON_NS | grep -E 'karavi|proxy|authz'" + echo " kubectl logs -n $ISILON_NS -l app=isilon-controller -c karavi-authorization-proxy --tail=50" + echo " helm status csm-authorization -n $AUTHZ_NS" +fi + +echo "" +echo "==============================================" +exit $FAIL diff --git a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml index 95fcb72fa4..08b35b79a9 100644 --- a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -284,6 +284,12 @@ mode: "{{ file_mode }}" become: true -- name: Include PowerScale CSI dependency tasks +- name: Get CSI PowerScale driver dependencies + when: hostvars['localhost']['csi_driver_powerscale_support'] ansible.builtin.include_tasks: get_powerscale_dependencies.yml - when: hostvars['localhost']['csi_driver_powerscale_support'] | bool + +- name: Deploy CSM Authorization Proxy Server (file validation + NFS staging) + when: + - 
hostvars['localhost']['csi_driver_powerscale_support'] | default(false) | bool + - hostvars['localhost']['service_cluster_info'].powerscale_authorization.enabled | default(false) | bool + ansible.builtin.include_tasks: deploy_powerscale_authorization.yml diff --git a/provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml b/provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml new file mode 100644 index 0000000000..ec6541ea46 --- /dev/null +++ b/provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml @@ -0,0 +1,95 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# CSM Authorization Proxy Server: File Validation and NFS Staging +# This task runs independently of powerscale_telemetry_support. 
+# Gated by: powerscale_authorization.enabled AND csi_driver_powerscale_support + +- name: Validate CSI driver PowerScale is configured (required for authorization) + ansible.builtin.assert: + that: + - hostvars['localhost']['csi_driver_powerscale_support'] | default(false) | bool + fail_msg: "{{ ps_csi_driver_not_configured_msg }}" + +- name: Display CSM Authorization status + ansible.builtin.debug: + msg: "{{ ps_authz_enabled_msg }}" + verbosity: 2 + +- name: Set CSM Authorization configuration facts + ansible.builtin.set_fact: + ps_authz_values_file: >- + {{ hostvars['localhost']['service_cluster_info'].powerscale_authorization.csm_authorization_values_file_path | default('') }} + ps_authz_tenants: >- + {{ hostvars['localhost']['service_cluster_info'].powerscale_authorization.tenants | default([]) }} + no_log: true + +- name: Validate CSM Authorization values file path is provided + ansible.builtin.assert: + that: + - ps_authz_values_file | length > 0 + fail_msg: "{{ ps_authz_values_missing_msg }}" + +- name: Verify CSM Authorization values file exists + ansible.builtin.stat: + path: "{{ ps_authz_values_file }}" + register: authz_values_stat + delegate_to: localhost + +- name: Fail if CSM Authorization values file does not exist + ansible.builtin.fail: + msg: "{{ ps_authz_values_not_found_msg }}" + when: not authz_values_stat.stat.exists + +- name: Validate at least one tenant is defined + ansible.builtin.assert: + that: + - ps_authz_tenants | length > 0 + fail_msg: "{{ ps_authz_no_tenants_msg }}" + +- name: Create CSM Authorization directory on NFS share + ansible.builtin.file: + path: "{{ csm_authz_nfs_dir }}" + state: directory + mode: '0755' + +- name: Copy CSM Authorization values file to NFS share + ansible.builtin.copy: + src: "{{ ps_authz_values_file }}" + dest: "{{ csm_authz_nfs_dir }}/csm-authorization-values.yaml" + mode: '0600' + +- name: Create empty root certificate placeholder (insecure mode) + ansible.builtin.copy: + content: "" + dest: "{{ 
csm_authz_nfs_dir }}/rootCertificate.pem" + mode: '0600' + +- name: Generate JWT signing secret for authorization + ansible.builtin.set_fact: + ps_authz_jwt_secret: "{{ lookup('password', '/dev/null chars=ascii_letters,digits length=32') }}" + no_log: true + +- name: Stage JWT signing secret for cloud-init + ansible.builtin.copy: + content: | + AUTHZ_JWT_SECRET='{{ ps_authz_jwt_secret }}' + dest: "{{ csm_authz_nfs_dir }}/authz_jwt_secret.env" + mode: '0600' + no_log: true + +- name: Display CSM Authorization staging status + ansible.builtin.debug: + msg: "{{ ps_authz_staged_msg }}" + verbosity: 2 diff --git a/provision/roles/k8s_config/vars/main.yml b/provision/roles/k8s_config/vars/main.yml index 5785565568..03b16db51a 100644 --- a/provision/roles/k8s_config/vars/main.yml +++ b/provision/roles/k8s_config/vars/main.yml @@ -86,3 +86,45 @@ print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item. offline_path_x86_64: [] offline_path_aarch64: [] ssh_private_key_path: /root/.ssh/oim_rsa + +# ============================================================================ +# CSM Authorization Proxy Server Variables +# ============================================================================ +# Usage: deploy_powerscale_authorization.yml (file validation + NFS staging) +# Usage: cloud-init template (Helm install + sidecar config) +# Conditional: powerscale_authorization.enabled: true in omnia_config.yml + +# Namespace and Helm release constants +csm_authorization_isilon_namespace: "isilon" +csm_authz_proxy_namespace: "authorization" +csm_authz_proxy_helm_release: "csm-authorization" +csm_authz_proxy_helm_chart_path: "{{ k8s_client_mount_path }}/csi-driver-powerscale/csi-powerscale/helm-charts/charts/csm-authorization-v2.0" +csm_authz_nfs_dir: "{{ k8s_client_mount_path }}/csi-driver-powerscale/csm-authorization" + +# Messages +ps_authz_enabled_msg: "CSM Authorization module is ENABLED. Proxy server and sidecar will be deployed." 
+ps_authz_disabled_msg: "CSM Authorization module is DISABLED. Skipping authorization configuration." +ps_authz_values_missing_msg: >- + csm_authorization_values_file_path is required in omnia_config.yml + when powerscale_authorization.enabled is true. +ps_authz_values_not_found_msg: >- + CSM Authorization values file not found at '{{ ps_authz_values_file | default('') }}'. + Download from https://github.com/dell/helm-charts/blob/main/charts/csm-authorization-v2.0/values.yaml + and set the path in omnia_config.yml (powerscale_authorization.csm_authorization_values_file_path). +ps_authz_values_parse_fail_msg: >- + Failed to parse CSM Authorization values file at '{{ ps_authz_values_file | default('') }}'. + Please verify the file contains valid YAML. +ps_authz_no_tenants_msg: >- + At least one tenant must be defined in powerscale_authorization.tenants + when powerscale_authorization.enabled is true. +ps_authz_staged_msg: >- + CSM Authorization files staged on NFS share at {{ csm_authz_nfs_dir }}. + JWT signing secret auto-generated. + Proxy server Helm chart: {{ csm_authz_proxy_helm_chart_path }} + Deployment will occur during cloud-init (PXE boot) on the control plane node. +ps_authz_direct_mode_msg: >- + CSI PowerScale will be deployed in direct mode (no multi-tenant RBAC). + To enable CSM Authorization, set powerscale_authorization.enabled: true in omnia_config.yml. +ps_csi_driver_not_configured_msg: >- + PowerScale authorization requires csi_driver_powerscale to be configured in software_config.json. + Please add csi_driver_powerscale to software_config.json and re-run. 
From 79f72ddb799240cbb8f73820fab9ac12df1aaf4b Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 15:53:46 +0530 Subject: [PATCH 32/63] deploy using template --- input/omnia_config.yml | 14 +++++- .../powerscale/deploy_csm_authorization.sh.j2 | 23 +++------ .../powerscale/karavi_auth_config.yaml.j2 | 15 ++++++ .../powerscale/karavi_config.yaml.j2 | 2 + .../tasks/deploy_powerscale_authorization.yml | 49 +++++++++++++++++++ 5 files changed, 84 insertions(+), 19 deletions(-) create mode 100644 provision/roles/configure_ochami/templates/powerscale/karavi_auth_config.yaml.j2 create mode 100644 provision/roles/configure_ochami/templates/powerscale/karavi_config.yaml.j2 diff --git a/input/omnia_config.yml b/input/omnia_config.yml index d25fea2d60..16f2a31ad1 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -176,20 +176,30 @@ slurm_cluster: # PowerScale CSM Authorization enables multi-tenant storage access control for CSI PowerScale driver. # This feature is optional and requires CSI PowerScale driver to be installed. # When enabled, tenants can be provisioned with specific storage pools and quota limits. -# The following fields are mandatory only if powerscale_authorization.enabled is set to true: +# +# For detailed configuration guide, see: docs/CSM_Authorization_Guide.md # # powerscale_authorization: Configuration for PowerScale multi-tenant authorization. # enabled: Set to true to enable PowerScale CSM Authorization (default: false). # csm_authorization_values_file_path: Absolute file path for the CSM Authorization values.yaml file. # Required when enabled is true. 
-# Download from: https://github.com/dell/helm-charts/blob/main/charts/csm-authorization-v2.0/values.yaml +# Download from: https://raw.githubusercontent.com/dell/helm-charts/refs/heads/release-v1.16.3/charts/csm-authorization-v2.0/values.yaml +# # tenants: List of tenant configurations (at least one tenant required when enabled). # name: Tenant name (alphanumeric, hyphens, underscores only, e.g., "team-omnia"). +# This maps to Kubernetes namespaces for storage isolation. # roles: List of roles for this tenant (at least one role required). # name: Role name (alphanumeric, hyphens, underscores only, e.g., "role-omnia"). # storage_pool: PowerScale storage pool path (must start with /ifs, e.g., "/ifs/data/csi/team-omnia"). # IMPORTANT: This path must already exist on the PowerScale cluster. +# Omnia and CSI driver will NOT create this path automatically. +# See docs/CSM_Authorization_Guide.md for instructions on how to find/create storage pools. # quota_limit: Storage quota limit for this role (e.g., "200Gi", "1Ti", "500Mi"). +# See docs/CSM_Authorization_Guide.md for guidance on determining quota limits. +# +# NOTE: JWT signing secret, Redis credentials, and container images are auto-generated by Omnia. +# See docs/CSM_Authorization_Guide.md for complete list of auto-populated fields. +# # Prerequisites for enabling PowerScale Authorization: # 1. csi_driver_powerscale must be present in software_config.json # 2. 
Service cluster nodes must be defined in PXE mapping diff --git a/provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 index 61b67d3acd..1a5c7e9f35 100644 --- a/provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 +++ b/provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 @@ -146,10 +146,9 @@ AUTHZ_JWT_SECRET=$(head -c 32 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 32) fi - # Create config.yaml for karavi-config-secret - cat > /tmp/karavi-config.yaml < /tmp/karavi-config.yaml <<'EOF' +{% include 'powerscale/karavi_config.yaml.j2' %} EOF kubectl create secret generic karavi-config-secret -n "${AUTHZ_NS}" \ --from-file=config.yaml=/tmp/karavi-config.yaml \ @@ -256,19 +255,9 @@ EOF # Step A2: Create karavi-authorization-config secret echo "Building karavi-authorization-config from CSI driver secret..." 
- printf '%s\n' '{' > /tmp/karavi-authorization-config.json - printf '%s\n' ' "storages": [' >> /tmp/karavi-authorization-config.json - printf '%s\n' ' {' >> /tmp/karavi-authorization-config.json - printf '%s\n' ' "username": "ignored",' >> /tmp/karavi-authorization-config.json - printf '%s\n' ' "password": "ignored",' >> /tmp/karavi-authorization-config.json - printf ' "intendedEndpoint": "%s:%s"\n' "${ORIGINAL_ENDPOINT}" "${ORIGINAL_PORT}" >> /tmp/karavi-authorization-config.json - printf '%s\n' ' "endpoint": "https://localhost:9400",' >> /tmp/karavi-authorization-config.json - printf ' "systemID": "%s"\n' "${CLUSTER_NAME}" >> /tmp/karavi-authorization-config.json - printf '%s\n' ' "skipCertificateValidation": true,' >> /tmp/karavi-authorization-config.json - printf '%s\n' ' "isDefault": true' >> /tmp/karavi-authorization-config.json - printf '%s\n' ' }' >> /tmp/karavi-authorization-config.json - printf '%s\n' ' ]' >> /tmp/karavi-authorization-config.json - printf '%s\n' '}' >> /tmp/karavi-authorization-config.json + cat > /tmp/karavi-authorization-config.json <<'EOF' +{% include 'powerscale/karavi_auth_config.yaml.j2' %} +EOF kubectl -n isilon create secret generic karavi-authorization-config \ --from-file=config=/tmp/karavi-authorization-config.json \ diff --git a/provision/roles/configure_ochami/templates/powerscale/karavi_auth_config.yaml.j2 b/provision/roles/configure_ochami/templates/powerscale/karavi_auth_config.yaml.j2 new file mode 100644 index 0000000000..31e2693c30 --- /dev/null +++ b/provision/roles/configure_ochami/templates/powerscale/karavi_auth_config.yaml.j2 @@ -0,0 +1,15 @@ +{ + "username": "admin", + "password": "", + "vCenter": "", + "vCenterPort": "", + "type": "PowerScale", + "arrays": [ + { + "endpoint": "${ORIGINAL_ENDPOINT}", + "port": "${ORIGINAL_PORT}", + "systemId": "${CLUSTER_NAME}", + "isDefault": true + } + ] +} diff --git a/provision/roles/configure_ochami/templates/powerscale/karavi_config.yaml.j2 
b/provision/roles/configure_ochami/templates/powerscale/karavi_config.yaml.j2 new file mode 100644 index 0000000000..280a5b2737 --- /dev/null +++ b/provision/roles/configure_ochami/templates/powerscale/karavi_config.yaml.j2 @@ -0,0 +1,2 @@ +web: + jwtsigningsecret: ${AUTHZ_JWT_SECRET} diff --git a/provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml b/provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml index ec6541ea46..474490911e 100644 --- a/provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml +++ b/provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml @@ -58,6 +58,55 @@ - ps_authz_tenants | length > 0 fail_msg: "{{ ps_authz_no_tenants_msg }}" +- name: Check if PowerScale telemetry is enabled + ansible.builtin.set_fact: + ps_telemetry_enabled: false + when: hostvars['localhost']['telemetry_config'] is not defined + +- name: Determine PowerScale telemetry status + ansible.builtin.set_fact: + ps_telemetry_enabled: >- + {{ hostvars['localhost']['telemetry_config'].telemetry_sources.powerscale.telemetry_enabled | default(false) | bool }} + when: hostvars['localhost']['telemetry_config'] is defined + +- name: Auto-generate OTLP address if PowerScale telemetry is enabled + block: + - name: Get PowerScale cluster name from telemetry config + ansible.builtin.set_fact: + ps_cluster_name: >- + {{ hostvars['localhost']['telemetry_config'].telemetry_sources.powerscale.configurations[0].cluster_name | default('powerscale') }} + + - name: Construct OTLP collector address + ansible.builtin.set_fact: + otlp_collector_address: "otel-collector.{{ ps_cluster_name }}-observability.svc.cluster.local:4317" + + - name: Display auto-generated OTLP address + ansible.builtin.debug: + msg: "PowerScale telemetry enabled. 
Auto-configuring OTLP address: {{ otlp_collector_address }}" + verbosity: 1 + + - name: Inject OTLP address into CSM Authorization values file + ansible.builtin.replace: + path: "{{ ps_authz_values_file }}" + regexp: 'openTelemetryCollectorAddress:\s*""' + replace: 'openTelemetryCollectorAddress: "{{ otlp_collector_address }}"' + backup: yes + + - name: Inject OTLP address (alternative format - with null) + ansible.builtin.replace: + path: "{{ ps_authz_values_file }}" + regexp: 'openTelemetryCollectorAddress:\s*null' + replace: 'openTelemetryCollectorAddress: "{{ otlp_collector_address }}"' + when: ansible_check_mode is not defined + + when: ps_telemetry_enabled | default(false) | bool + +- name: Display message when telemetry is not enabled + ansible.builtin.debug: + msg: "PowerScale telemetry is not enabled. OTLP address will remain empty in values.yaml." + verbosity: 1 + when: not ps_telemetry_enabled | default(false) | bool + - name: Create CSM Authorization directory on NFS share ansible.builtin.file: path: "{{ csm_authz_nfs_dir }}" From 47f6db6f23a5379101fe170ecfa7fe2790f1eb57 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 17:38:19 +0530 Subject: [PATCH 33/63] REmove powerscale authorization support --- .../common_utils/en_us_validation_msg.py | 50 --- .../input_validation/schema/omnia_config.json | 87 ----- .../validation_flows/common_validation.py | 10 - .../powerscale_authorization_validation.py | 337 ------------------ input/omnia_config.yml | 45 --- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 9 - .../powerscale/deploy_csm_authorization.sh.j2 | 306 ---------------- .../powerscale/karavi_auth_config.yaml.j2 | 15 - .../powerscale/karavi_config.yaml.j2 | 2 - .../verify_powerscale_authorization.sh.j2 | 205 ----------- .../tasks/create_k8s_config_nfs.yml | 6 - .../tasks/deploy_powerscale_authorization.yml | 144 -------- provision/roles/k8s_config/vars/main.yml | 42 --- 13 files changed, 
1258 deletions(-) delete mode 100644 common/library/module_utils/input_validation/validation_flows/powerscale_authorization_validation.py delete mode 100644 provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 delete mode 100644 provision/roles/configure_ochami/templates/powerscale/karavi_auth_config.yaml.j2 delete mode 100644 provision/roles/configure_ochami/templates/powerscale/karavi_config.yaml.j2 delete mode 100644 provision/roles/configure_ochami/templates/powerscale/verify_powerscale_authorization.sh.j2 delete mode 100644 provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 8404a743e9..28dec89fec 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -341,12 +341,6 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length): "telemetry_sources.powerscale.metrics_enabled is true. " "Please provide the path to the CSM Observability values.yaml file." ) -POWERSCALE_AUTH_PROXY_HOST_MISSING_MSG = ( - "karaviMetricsPowerscale.authorization.proxyHost is required in the " - "CSM Observability values file when " - "karaviMetricsPowerscale.authorization.enabled is true. " - "Please provide the hostname or IP of the CSM Authorization Proxy server." -) def powerscale_csm_values_not_found_msg(path): """Returns error message when CSM Observability values.yaml file is not found.""" return ( @@ -383,50 +377,6 @@ def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_ f"Please update service_k8s.json to match the values.yaml version " f"and re-run local_repo.yml to mirror the correct image to Pulp." 
) - -# PowerScale CSM Authorization validation messages -POWERSCALE_AUTH_CSI_DRIVER_MISSING_MSG = ( - "PowerScale CSM Authorization requires 'csi_driver_powerscale' to be present in software_config.json." -) -POWERSCALE_AUTH_SERVICE_CLUSTER_MISSING_MSG = ( - "PowerScale CSM Authorization requires service cluster nodes " - "(service_kube_node_*, service_kube_control_plane_*) to be defined " - "in the PXE mapping file." -) -POWERSCALE_AUTH_CSM_VALUES_PATH_REQUIRED_MSG = ( - "csm_authorization_values_file_path is required when powerscale_authorization.enabled is true." -) -def powerscale_auth_csm_values_not_found_msg(path): - """Returns error message when CSM Authorization values.yaml file is not found.""" - return ( - f"CSM Authorization values file does not exist at path: {path}. " - "Please verify the file path is correct." - ) -def powerscale_auth_csm_values_validation_error_msg(error): - """Returns error message when CSM Authorization values.yaml validation fails.""" - return f"Error validating CSM Authorization image versions: {error}" -POWERSCALE_AUTH_TENANTS_REQUIRED_MSG = ( - "At least one tenant must be defined when powerscale_authorization.enabled is true." -) -def powerscale_auth_tenant_roles_required_msg(tenant_name): - """Returns error message when a tenant has no roles defined.""" - return ( - f"At least one role must be defined for tenant '{tenant_name}'." - ) -def powerscale_auth_image_version_mismatch_msg( - image_name, values_version, csi_version -): - """Returns error message when CSM Authorization image version doesn't match csi_driver_powerscale.json.""" - return ( - f"Image version for {image_name} in CSM Authorization values.yaml " - f"({values_version}) does not match csi_driver_powerscale.json " - f"({csi_version}). Please ensure both files use the same version." - ) -POWERSCALE_AUTH_CSI_JSON_NOT_FOUND_MSG = ( - "csi_driver_powerscale.json not found. Cannot validate CSM Authorization " - "image versions. 
Please ensure the file exists at " - "input/config/x86_64/rhel/10.0/csi_driver_powerscale.json." -) # pylint: enable=invalid-name def boolean_fail_msg(value): diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index 1f7824fc20..01b6039cff 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -139,66 +139,6 @@ "type": "string", "pattern": "^(|/?([a-zA-Z0-9._-]+/)*[a-zA-Z0-9._-]+\\.yaml)$" }, - "powerscale_authorization": { - "type": "object", - "description": "PowerScale CSM Authorization configuration for multi-tenancy.", - "properties": { - "enabled": { - "type": "boolean", - "description": "Enable PowerScale CSM Authorization for multi-tenant storage." - }, - "csm_authorization_values_file_path": { - "description": "Absolute file path for the CSM Authorization values.yaml file.", - "type": "string", - "pattern": "^(|/?([a-zA-Z0-9._-]+/)*[a-zA-Z0-9._-]+\\.yaml)$" - }, - "tenants": { - "type": ["array", "null"], - "description": "List of tenant configurations.", - "items": { - "type": "object", - "properties": { - "name": { - "type": ["string", "null"], - "minLength": 1, - "pattern": "^[a-zA-Z0-9_-]+$", - "description": "Tenant name (alphanumeric, hyphens, underscores only)." - }, - "roles": { - "type": ["array", "null"], - "default": [], - "description": "List of roles for this tenant.", - "items": { - "type": "object", - "properties": { - "name": { - "type": ["string", "null"], - "minLength": 1, - "pattern": "^[a-zA-Z0-9_-]+$", - "description": "Role name (alphanumeric, hyphens, underscores only)." - }, - "storage_pool": { - "type": ["string", "null"], - "minLength": 1, - "pattern": "^/ifs(/[a-zA-Z0-9._-]+)+$", - "description": "PowerScale storage pool path (must start with /ifs and exist on PowerScale)." 
- }, - "quota_limit": { - "type": ["string", "null"], - "pattern": "^[1-9][0-9]*(Gi|Ti|Mi)$", - "description": "Storage quota limit (e.g., 200Gi, 1Ti, 500Mi)." - } - }, - "required": ["name", "storage_pool", "quota_limit"] - } - } - }, - "required": ["name"] - } - } - }, - "required": ["enabled"] - }, "k8s_crio_storage_size": { "description": "Storage size for CRI-O in Gigabytes only (example: 10G, 15G, 100G)", "type": "string", @@ -225,33 +165,6 @@ "then": { "required": ["csi_powerscale_driver_values_file_path"] } - }, - { - "if": { - "properties": { - "powerscale_authorization": { - "type": "object", - "properties": { - "enabled": { - "const": true - } - }, - "required": ["enabled"] - } - }, - "required": ["powerscale_authorization"] - }, - "then": { - "required": [ - "csi_powerscale_driver_secret_file_path", - "csi_powerscale_driver_values_file_path" - ], - "properties": { - "powerscale_authorization": { - "required": ["csm_authorization_values_file_path", "tenants"] - } - } - } } ] } diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index ddfd94d1e1..57a5d81f53 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -26,7 +26,6 @@ import yaml import ansible.module_utils.input_validation.common_utils.data_fetch as fetch from ansible.module_utils.input_validation.validation_flows import csi_driver_validation -from ansible.module_utils.input_validation.validation_flows import powerscale_authorization_validation import ansible.module_utils.input_validation.common_utils.data_validation as validate from ansible.module_utils.input_validation.common_utils import ( config, @@ -1543,15 +1542,6 @@ def validate_k8s(data, admin_networks, softwares, ha_config, tag_names, errors, ) 
csi_driver_validation.validate_powerscale_secret_and_values_file(csi_secret_file_path,csi_values_file_path, errors, input_file_path) - # PowerScale Authorization validation - input_dir = os.path.dirname(input_file_path) - software_config_file_path = os.path.join(input_dir, "software_config.json") - config_paths = get_config_file_paths(input_dir, data, software_config_file_path) - - powerscale_authorization_validation.validate_powerscale_authorization( - kluster, softwares, input_file_path, config_paths, logger, errors - ) - def validate_omnia_config( input_file_path, data, diff --git a/common/library/module_utils/input_validation/validation_flows/powerscale_authorization_validation.py b/common/library/module_utils/input_validation/validation_flows/powerscale_authorization_validation.py deleted file mode 100644 index 8d4487c298..0000000000 --- a/common/library/module_utils/input_validation/validation_flows/powerscale_authorization_validation.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -PowerScale CSM Authorization validation module. -Contains validation logic for PowerScale multi-tenant storage authorization configuration. 
-""" - -import csv -import json -import os -import yaml -from ansible.module_utils.input_validation.common_utils import en_us_validation_msg -from ansible.module_utils.input_validation.common_utils.validation_utils import create_error_msg - - -def check_is_service_cluster_functional_groups_defined( - errors, input_file_path, logger -): - """ - Checks if service_kube_node_* and service_kube_control_plane_* - are configured in the mapping file. - - Args: - errors (list): A list to store error messages. - input_file_path (str): The path to the input file. - logger (object): Logger object. - - Returns: - bool: True if service cluster functional groups are defined, False otherwise. - """ - # Get the directory containing the input file - input_dir = os.path.dirname(input_file_path) - provision_config_path = os.path.join(input_dir, "provision_config.yml") - - # Check if provision_config.yml exists - if not os.path.exists(provision_config_path): - errors.append( - create_error_msg( - "provision_config.yml", - provision_config_path, - en_us_validation_msg.PROVISION_CONFIG_NOT_FOUND - ) - ) - return False - - try: - # Load provision_config.yml to get pxe_mapping_file_path - with open(provision_config_path, 'r', encoding='utf-8') as f: - provision_config = yaml.safe_load(f) - - pxe_mapping_file_path = provision_config.get('pxe_mapping_file_path', '') - - if not pxe_mapping_file_path or not os.path.exists(pxe_mapping_file_path): - errors.append( - create_error_msg( - "pxe_mapping_file_path", - pxe_mapping_file_path, - en_us_validation_msg.PXE_MAPPING_FILE_NOT_FOUND - ) - ) - return False - - # Read the mapping file and check for service_kube_node functional groups - with open(pxe_mapping_file_path, 'r', encoding='utf-8') as fh: - raw_lines = fh.readlines() - - # Remove blank lines - non_comment_lines = [ln for ln in raw_lines if ln.strip()] - - if not non_comment_lines: - errors.append( - create_error_msg( - "pxe_mapping_file_path", - pxe_mapping_file_path, - 
en_us_validation_msg.PXE_MAPPING_FILE_EMPTY_SERVICE_CLUSTER_MSG - ) - ) - return False - - # Use csv.DictReader to parse the mapping file - reader = csv.DictReader(non_comment_lines) - - # Check if all required service cluster functional groups are present - # Required: service_kube_node_, service_kube_control_plane_ - has_kube_node = False - has_control_plane = False - - for row in reader: - functional_group = row.get('FUNCTIONAL_GROUP_NAME', '').strip() - if functional_group.startswith('service_kube_node_'): - has_kube_node = True - logger.info(f"Service cluster functional group found: {functional_group}") - elif functional_group.startswith('service_kube_control_plane_'): - has_control_plane = True - logger.info(f"Service cluster functional group found: {functional_group}") - - # Both must be present for a complete service cluster - service_cluster_found = has_kube_node and has_control_plane - - if not service_cluster_found: - missing = [] - if not has_kube_node: - missing.append('service_kube_node_*') - if not has_control_plane: - missing.append('service_kube_control_plane_*') - missing_groups = ', '.join(missing) - logger.info( - f"Service cluster incomplete. Missing functional groups: {missing_groups}" - ) - - return service_cluster_found - - except (yaml.YAMLError, IOError, csv.Error) as e: - errors.append( - create_error_msg( - "pxe_mapping_file_path", - pxe_mapping_file_path if 'pxe_mapping_file_path' in locals() else "unknown", - f"Error reading mapping file: {str(e)}" - ) - ) - return False - - -def validate_csm_auth_image_versions(csm_auth_values_path, config_paths, logger, errors): - """ - Validates that CSM Authorization image versions in values.yaml match csi_driver_powerscale.json. - - Args: - csm_auth_values_path (str): Path to the CSM Authorization values.yaml file. - config_paths (dict): Dictionary containing resolved config file paths. - logger (object): Logger object. - errors (list): List to store error messages. 
- """ - try: - # Load CSM Authorization values.yaml - with open(csm_auth_values_path, 'r', encoding='utf-8') as f: - csm_auth_values = yaml.safe_load(f) - - # Extract image versions from values.yaml - # Map of image names to their expected version - csm_images = {} - if 'authorization' in csm_auth_values and 'images' in csm_auth_values['authorization']: - auth_images = csm_auth_values['authorization']['images'] - csm_images['csm-authorization-proxy'] = ( - auth_images.get('proxyService', {}).get('image', '')) - csm_images['csm-authorization-tenant'] = ( - auth_images.get('tenantService', {}).get('image', '')) - csm_images['csm-authorization-role'] = ( - auth_images.get('roleService', {}).get('image', '')) - csm_images['csm-authorization-storage'] = ( - auth_images.get('storageService', {}).get('image', '')) - csm_images['csm-authorization-controller'] = ( - auth_images.get('authorizationController', {}).get('image', '')) - csm_images['opa'] = auth_images.get('opa', {}).get('image', '') - csm_images['kube-mgmt'] = auth_images.get('opaKubeMgmt', {}).get('image', '') - - if 'redis' in csm_auth_values and 'images' in csm_auth_values['redis']: - redis_images = csm_auth_values['redis']['images'] - csm_images['redis'] = redis_images.get('redis', {}).get('image', '') - csm_images['redis-commander'] = redis_images.get('commander', {}).get('image', '') - - # Load csi_driver_powerscale.json from config_paths - csi_json_path = config_paths.get("csi_driver_powerscale_json_path") - - if not csi_json_path or not os.path.exists(csi_json_path): - logger.warning(en_us_validation_msg.POWERSCALE_AUTH_CSI_JSON_NOT_FOUND_MSG) - return - - with open(csi_json_path, 'r', encoding='utf-8') as f: - csi_config = json.load(f) - - # Extract image versions from csi_driver_powerscale.json - csi_images = {} - csi_ps_data = csi_config.get('csi_driver_powerscale', {}) - if csi_ps_data and 'cluster' in csi_ps_data: - for item in csi_ps_data['cluster']: - if item.get('type') == 'image': - package = 
item.get('package', '') - tag = item.get('tag', '') - # Store full image with tag - csi_images[package] = tag - - # Validate CSM Authorization images - # Expected images to check (only CSM Authorization specific ones) - _img_base = 'quay.io/dell/container-storage-modules' - csm_auth_image_map = { - f'{_img_base}/csm-authorization-proxy': 'csm-authorization-proxy', - f'{_img_base}/csm-authorization-tenant': 'csm-authorization-tenant', - f'{_img_base}/csm-authorization-role': 'csm-authorization-role', - f'{_img_base}/csm-authorization-storage': 'csm-authorization-storage', - f'{_img_base}/csm-authorization-controller': 'csm-authorization-controller', - f'{_img_base}/csm-authorization-sidecar': 'csm-authorization-sidecar' - } - - for image_path, csm_name in csm_auth_image_map.items(): - values_image = csm_images.get(csm_name, '') - - if values_image: - # Extract version from values.yaml image - values_version = values_image.split(':')[-1] if ':' in values_image else '' - - # Get version from csi_driver_powerscale.json - csi_version = csi_images.get(image_path, '') - - if values_version and csi_version and values_version != csi_version: - errors.append( - create_error_msg( - f"CSM Authorization image version mismatch: {csm_name}", - (f"values.yaml: {values_version}," - f" csi_driver_powerscale.json: {csi_version}"), - en_us_validation_msg.powerscale_auth_image_version_mismatch_msg( - csm_name, values_version, csi_version - ) - ) - ) - - logger.info("CSM Authorization image version validation completed") - - except (yaml.YAMLError, json.JSONDecodeError, IOError) as e: - errors.append( - create_error_msg( - "csm_authorization_values_file_path", - csm_auth_values_path, - en_us_validation_msg.powerscale_auth_csm_values_validation_error_msg(str(e)) - ) - ) - - -def validate_powerscale_authorization( - kluster, softwares, input_file_path, config_paths, logger, errors -): - """ - Validates PowerScale CSM Authorization configuration. 
- - Args: - kluster (dict): Kubernetes cluster configuration from omnia_config.yml. - softwares (list): List of software names from software_config.json. - input_file_path (str): Path to omnia_config.yml. - config_paths (dict): Dictionary containing resolved config file paths. - logger (object): Logger object. - errors (list): List to store error messages. - """ - powerscale_auth = kluster.get("powerscale_authorization", {}) - if not powerscale_auth.get("enabled", False): - return - - # Check if CSI driver is in software_config - if "csi_driver_powerscale" not in softwares: - errors.append( - create_error_msg( - "powerscale_authorization.enabled", - "true", - en_us_validation_msg.POWERSCALE_AUTH_CSI_DRIVER_MISSING_MSG - ) - ) - - # Check if service nodes are defined in PXE mapping - service_cluster_defined = check_is_service_cluster_functional_groups_defined( - errors, input_file_path, logger - ) - if not service_cluster_defined: - errors.append( - create_error_msg( - "powerscale_authorization.enabled", - "true", - en_us_validation_msg.POWERSCALE_AUTH_SERVICE_CLUSTER_MISSING_MSG - ) - ) - - # Validate csm_authorization_values_file_path (inside powerscale_authorization) - csm_auth_values_path = powerscale_auth.get("csm_authorization_values_file_path") - if not csm_auth_values_path or not csm_auth_values_path.strip(): - errors.append( - create_error_msg( - "powerscale_authorization.csm_authorization_values_file_path", - csm_auth_values_path, - en_us_validation_msg.POWERSCALE_AUTH_CSM_VALUES_PATH_REQUIRED_MSG - ) - ) - elif not os.path.exists(csm_auth_values_path.strip()): - errors.append( - create_error_msg( - "powerscale_authorization.csm_authorization_values_file_path", - csm_auth_values_path, - en_us_validation_msg.powerscale_auth_csm_values_not_found_msg(csm_auth_values_path) - ) - ) - else: - # Validate image versions match between values.yaml and csi_driver_powerscale.json - validate_csm_auth_image_versions(csm_auth_values_path.strip(), config_paths, logger, 
errors) - - # Validate tenants and roles - tenants = powerscale_auth.get("tenants") or [] - if not tenants: - errors.append( - create_error_msg( - "powerscale_authorization.tenants", - "[]", - en_us_validation_msg.POWERSCALE_AUTH_TENANTS_REQUIRED_MSG - ) - ) - else: - for tenant in tenants: - tenant_name = tenant.get("name", "") - roles = tenant.get("roles") or [] - - if not roles: - errors.append( - create_error_msg( - f"powerscale_authorization.tenants[{tenant_name}].roles", - "[]", - en_us_validation_msg.powerscale_auth_tenant_roles_required_msg(tenant_name) - ) - ) - else: - for role in roles: - storage_pool = role.get("storage_pool", "") - # Log warning about storage pool path requirement - if storage_pool: - logger.warning( - f"PowerScale Authorization: Storage pool path '{storage_pool}' " - f"for tenant '{tenant_name}', role '{role.get('name', '')}' " - "must already exist on the PowerScale cluster. " - "Omnia and CSI driver will NOT create this path automatically." - ) diff --git a/input/omnia_config.yml b/input/omnia_config.yml index 16f2a31ad1..52e304775d 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -172,42 +172,6 @@ slurm_cluster: # csi_powerscale_driver_values_file_path: User need to download values.yaml file and fill required data in values.yaml file. Provided the path of the values.yaml file here. # mention configurable values -# ----------------------------PowerScale CSM Authorization------------------------------------ -# PowerScale CSM Authorization enables multi-tenant storage access control for CSI PowerScale driver. -# This feature is optional and requires CSI PowerScale driver to be installed. -# When enabled, tenants can be provisioned with specific storage pools and quota limits. -# -# For detailed configuration guide, see: docs/CSM_Authorization_Guide.md -# -# powerscale_authorization: Configuration for PowerScale multi-tenant authorization. 
-# enabled: Set to true to enable PowerScale CSM Authorization (default: false). -# csm_authorization_values_file_path: Absolute file path for the CSM Authorization values.yaml file. -# Required when enabled is true. -# Download from: https://raw.githubusercontent.com/dell/helm-charts/refs/heads/release-v1.16.3/charts/csm-authorization-v2.0/values.yaml -# -# tenants: List of tenant configurations (at least one tenant required when enabled). -# name: Tenant name (alphanumeric, hyphens, underscores only, e.g., "team-omnia"). -# This maps to Kubernetes namespaces for storage isolation. -# roles: List of roles for this tenant (at least one role required). -# name: Role name (alphanumeric, hyphens, underscores only, e.g., "role-omnia"). -# storage_pool: PowerScale storage pool path (must start with /ifs, e.g., "/ifs/data/csi/team-omnia"). -# IMPORTANT: This path must already exist on the PowerScale cluster. -# Omnia and CSI driver will NOT create this path automatically. -# See docs/CSM_Authorization_Guide.md for instructions on how to find/create storage pools. -# quota_limit: Storage quota limit for this role (e.g., "200Gi", "1Ti", "500Mi"). -# See docs/CSM_Authorization_Guide.md for guidance on determining quota limits. -# -# NOTE: JWT signing secret, Redis credentials, and container images are auto-generated by Omnia. -# See docs/CSM_Authorization_Guide.md for complete list of auto-populated fields. -# -# Prerequisites for enabling PowerScale Authorization: -# 1. csi_driver_powerscale must be present in software_config.json -# 2. Service cluster nodes must be defined in PXE mapping -# 3. File paths must be provided and files must exist: -# - csi_powerscale_driver_secret_file_path -# - csi_powerscale_driver_values_file_path -# - csm_authorization_values_file_path - # - k8s_crio_storage_size: Specifies the disk size allocated for CRI-O container storage. # This storage is used to store container images, writable layers, and runtime data. 
# Acceptable formats: "10G", "15G", "50G" (Only positive values in Gigabytes are allowed) @@ -225,12 +189,3 @@ service_k8s_cluster: k8s_crio_storage_size: "20G" csi_powerscale_driver_secret_file_path: "" csi_powerscale_driver_values_file_path: "" - powerscale_authorization: - enabled: false -# csm_authorization_values_file_path: "" # Required when enabled: true - "/path/to/your/updated/csm-authorization-values.yaml" -# tenants: # Required when enabled: true -# - name: "team-omnia" -# roles: -# - name: "role-omnia" -# storage_pool: "/ifs/data/csi/team-omnia" -# quota_limit: "200Gi" diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index b01449bcfd..fb549a10fa 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -975,15 +975,6 @@ echo "Done updating poll rate." - -{% if hostvars['localhost']['service_cluster_info'].powerscale_authorization.enabled | default(false) | bool %} -{% include 'powerscale/deploy_csm_authorization.sh.j2' %} -{% else %} - echo "INFO: CSM Authorization is DISABLED. CSI PowerScale will be deployed in direct mode (no multi-tenant RBAC)." - echo " To enable authorization, set powerscale_authorization.enabled: true in omnia_config.yml" - echo " and configure tenants, roles, storage pools, and quota limits." 
-{% endif %} - if [ "$POWERSCALE_DEPLOYMENT_FAILED" -eq 0 ]; then echo "===== Running CSI PowerScale installation script =====" INSTALL_SCRIPT="/opt/omnia/csi-driver-powerscale/csi-powerscale/dell-csi-helm-installer/csi-install.sh" diff --git a/provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 deleted file mode 100644 index 1a5c7e9f35..0000000000 --- a/provision/roles/configure_ochami/templates/powerscale/deploy_csm_authorization.sh.j2 +++ /dev/null @@ -1,306 +0,0 @@ -{# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#} -{# CSM Authorization Proxy Server + CRs + Token + Sidecar (cloud-init fragment) - This template is included by ci-group-service_kube_control_plane_first_x86_64.yaml.j2 - when powerscale_authorization.enabled is true. 
- - Deployment order (must run BEFORE csi-install.sh): - P1: Create authorization namespace - P2: Helm install/upgrade CSM Authorization Proxy Server - P3: Wait for proxy server pods (max 5 min) - P4: Extract original endpoint/systemID from secret.yaml - P5: Apply Storage CR (from pre-rendered csm-storage-cr.yaml) - P6: Apply Role CRs (from pre-rendered csm-role-cr-*.yaml files) - P7: Apply Tenant CRs (from pre-rendered csm-tenant-cr-*.yaml files) - P8: Generate token via dellctl - P9: Apply token to isilon namespace - A1: Create proxy-server-root-certificate secret - A2: Build + create karavi-authorization-config secret - A3: Modify secret.yaml (endpoint → localhost:9400, add mountEndpoint) - A4: Re-create isilon-creds secret with sidecar configuration - A5: Modify CSI driver values.yaml (authorization.enabled: true) -#} -{% set authz_skip_cert = hostvars['localhost']['service_cluster_info'].powerscale_authorization.get('authorization_skip_certificate_validation', true) | default(true) %} -{% set authz_tenants = hostvars['localhost']['service_cluster_info'].powerscale_authorization.tenants | default([]) %} - # ============================================================ - # CSM Authorization Proxy Server Deployment (Steps P1-P3) - # ============================================================ - if [ "$POWERSCALE_DEPLOYMENT_FAILED" -eq 0 ]; then - echo "===== CSM Authorization: Deploying Proxy Server =====" - AUTHZ_NS="authorization" - AUTHZ_CHART_PATH="/opt/omnia/csi-driver-powerscale/csi-powerscale/helm-charts/charts/csm-authorization-v2.0" - AUTHZ_VALUES_FILE="/opt/omnia/csi-driver-powerscale/csm-authorization/csm-authorization-values.yaml" - - # Step P1: Create authorization namespace - kubectl create namespace "${AUTHZ_NS}" --dry-run=client -o yaml | kubectl apply -f - || { - echo "WARNING: Failed to create authorization namespace." 
- } - - # Step P2: Helm install/upgrade CSM Authorization Proxy Server - if [ -d "$AUTHZ_CHART_PATH" ] && [ -f "$AUTHZ_VALUES_FILE" ]; then - if helm list -n "${AUTHZ_NS}" --filter csm-authorization -q 2>/dev/null | grep -q csm-authorization; then - echo "Upgrading existing CSM Authorization release..." - helm upgrade csm-authorization "$AUTHZ_CHART_PATH" \ - -n "${AUTHZ_NS}" -f "$AUTHZ_VALUES_FILE" --wait --timeout 10m || { - echo "ERROR: Helm upgrade for CSM Authorization failed." - POWERSCALE_DEPLOYMENT_FAILED=1 - } - else - echo "Fresh install of CSM Authorization Proxy Server..." - helm install csm-authorization "$AUTHZ_CHART_PATH" \ - -n "${AUTHZ_NS}" -f "$AUTHZ_VALUES_FILE" --wait --timeout 10m || { - echo "ERROR: Helm install for CSM Authorization failed." - POWERSCALE_DEPLOYMENT_FAILED=1 - } - fi - else - echo "ERROR: CSM Authorization Helm chart or values file not found." - echo " Chart path: $AUTHZ_CHART_PATH" - echo " Values file: $AUTHZ_VALUES_FILE" - POWERSCALE_DEPLOYMENT_FAILED=1 - fi - - # Step P3: Wait for proxy server pods (max 5 min) - if [ "$POWERSCALE_DEPLOYMENT_FAILED" -eq 0 ]; then - echo "Waiting for CSM Authorization Proxy Server pods..." - MAX_ATTEMPTS=30; WAIT_TIME=10 - for ((i=1; i<=MAX_ATTEMPTS; i++)); do - NOT_READY=$(kubectl get pods -n "${AUTHZ_NS}" --no-headers 2>/dev/null | grep -vE 'Running|Completed' | wc -l) - TOTAL=$(kubectl get pods -n "${AUTHZ_NS}" --no-headers 2>/dev/null | wc -l) - if [ "$NOT_READY" -eq 0 ] && [ "$TOTAL" -gt 0 ]; then - echo "All CSM Authorization Proxy Server pods are running." - break - fi - echo "[$i/$MAX_ATTEMPTS] Waiting... 
(${NOT_READY} not ready)" - sleep $WAIT_TIME - done - fi - fi - - # ============================================================ - # CSM Authorization CRs + Token Generation (Steps P4-P9) - # ============================================================ - if [ "$POWERSCALE_DEPLOYMENT_FAILED" -eq 0 ]; then - echo "===== CSM Authorization: Creating CRs and Generating Token =====" - SECRET_FILE="/opt/omnia/csi-driver-powerscale/secret.yaml" - - # Step P4: Extract original endpoint/systemID from secret.yaml (before modification) - ORIGINAL_ENDPOINT=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'endpoint:' | head -1 | awk -F'"' '{print $2}') - ORIGINAL_PORT=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'endpointPort:' | head -1 | awk '{print $2}') - CLUSTER_NAME=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'clusterName:' | head -1 | awk -F'"' '{print $2}') - [ -z "$ORIGINAL_PORT" ] && ORIGINAL_PORT="8080" - [ -z "$CLUSTER_NAME" ] && CLUSTER_NAME="cluster1" - echo "PowerScale endpoint: ${ORIGINAL_ENDPOINT}:${ORIGINAL_PORT}, systemID: ${CLUSTER_NAME}" - - # Step P5: Create Storage CR - echo "Creating Storage CR for PowerScale array..." - if [ -f "/opt/omnia/csi-driver-powerscale/csm-storage-cr.yaml" ]; then - kubectl apply -f /opt/omnia/csi-driver-powerscale/csm-storage-cr.yaml - echo " Storage CR created" - else - echo " WARNING: csm-storage-cr.yaml not found, skipping Storage CR creation" - fi - - # Step P6: Create Role CRs (from pre-rendered files) - echo "Creating Role CRs..." - for role_file in /opt/omnia/csi-driver-powerscale/csm-role-cr-*.yaml; do - if [ -f "$role_file" ]; then - kubectl apply -f "$role_file" - echo " Role CR applied from $(basename $role_file)" - fi - done - - # Step P7: Create Tenant CRs (from pre-rendered files) - echo "Creating Tenant CRs..." 
- for tenant_file in /opt/omnia/csi-driver-powerscale/csm-tenant-cr-*.yaml; do - if [ -f "$tenant_file" ]; then - kubectl apply -f "$tenant_file" - echo " Tenant CR applied from $(basename $tenant_file)" - fi - done - - # Wait for CRDs to be processed by authorization controller - echo "Waiting for CRs to be processed..." - sleep 15 - - # Step P8: Create karavi-config-secret with JWT signing secret - JWT_SECRET_FILE="/opt/omnia/csi-driver-powerscale/csm-authorization/authz_jwt_secret.env" - AUTHZ_JWT_SECRET="" - if [ -f "$JWT_SECRET_FILE" ]; then - source "$JWT_SECRET_FILE" - echo "JWT signing secret loaded." - else - echo "WARNING: JWT secret file not found. Generating random secret..." - AUTHZ_JWT_SECRET=$(head -c 32 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 32) - fi - - # Step P8: Create karavi-config-secret with JWT signing secret - cat > /tmp/karavi-config.yaml <<'EOF' -{% include 'powerscale/karavi_config.yaml.j2' %} -EOF - kubectl create secret generic karavi-config-secret -n "${AUTHZ_NS}" \ - --from-file=config.yaml=/tmp/karavi-config.yaml \ - -o yaml --dry-run=client | kubectl apply -f - - rm -f /tmp/karavi-config.yaml - echo "karavi-config-secret created with auto-generated JWT signing secret." - - # Step P9: Generate token via dellctl - DELLCTL_BIN="/opt/omnia/csi-driver-powerscale/dellctl" - TOKEN_FILE="/tmp/proxy-authz-token.yaml" - TOKEN_GENERATED=0 - - if [ -f "$DELLCTL_BIN" ] && [ -n "$AUTHZ_JWT_SECRET" ]; then - chmod +x "$DELLCTL_BIN" - - # Find proxy server service for port-forward - PROXY_SVC=$(kubectl get svc -n "${AUTHZ_NS}" -o name 2>/dev/null | grep -i proxy | head -1) - if [ -z "$PROXY_SVC" ]; then - PROXY_SVC=$(kubectl get svc -n "${AUTHZ_NS}" -o name 2>/dev/null | head -1) - fi - - if [ -n "$PROXY_SVC" ]; then - echo "Starting port-forward to proxy server..." - kubectl port-forward -n "${AUTHZ_NS}" "${PROXY_SVC}" 9443:443 &>/dev/null & - PF_PID=$! 
- sleep 5 - -{% for tenant in authz_tenants %} - echo "Generating token for tenant '{{ tenant.name }}'..." - "$DELLCTL_BIN" admin token generate \ - --admin-name "admin" \ - --jwt-signing-secret "$AUTHZ_JWT_SECRET" \ - --tenant {{ tenant.name }} \ - --access-token-expiration 30m \ - --refresh-token-expiration 720h \ - --addr "localhost:9443" \ - --insecure true \ - --output "${TOKEN_FILE}" 2>/dev/null && { - TOKEN_GENERATED=1 - echo " Token generated for tenant '{{ tenant.name }}'." - } || { - echo " WARNING: dellctl token generation failed for tenant '{{ tenant.name }}'." - } -{% endfor %} - - # Kill port-forward - kill $PF_PID 2>/dev/null || true - wait $PF_PID 2>/dev/null || true - else - echo "WARNING: No proxy server service found for port-forward." - fi - else - [ ! -f "$DELLCTL_BIN" ] && echo "WARNING: dellctl binary not found at ${DELLCTL_BIN}." - echo " Token generation skipped. Manual steps required after deployment:" -{% for tenant in authz_tenants %} - echo " dellctl admin token generate --admin-name admin --jwt-signing-secret --tenant {{ tenant.name }} --access-token-expiration 30m --refresh-token-expiration 720h --addr --insecure true --output /tmp/token.yaml" - echo " kubectl apply -f /tmp/token.yaml -n isilon" -{% endfor %} - fi - - # Step P10: Apply token to isilon namespace - if [ "$TOKEN_GENERATED" -eq 1 ] && [ -f "$TOKEN_FILE" ]; then - echo "Applying authorization token to isilon namespace..." - kubectl apply -f "$TOKEN_FILE" -n isilon || { - echo "WARNING: Failed to apply token. 
Manual step required:" - echo " kubectl apply -f ${TOKEN_FILE} -n isilon" - } - rm -f "$TOKEN_FILE" - fi - - # Cleanup: remove JWT secret from NFS (sensitive) - rm -f "$JWT_SECRET_FILE" - fi - - # ============================================================ - # CSM Authorization Sidecar Configuration (Steps A1-A5) - # ============================================================ - if [ "$POWERSCALE_DEPLOYMENT_FAILED" -eq 0 ]; then - echo "===== CSM Authorization: Configuring Sidecar =====" - AUTHZ_DIR="/opt/omnia/csi-driver-powerscale/csm-authorization" - SECRET_FILE="/opt/omnia/csi-driver-powerscale/secret.yaml" - VALUES_FILE="/opt/omnia/csi-driver-powerscale/values.yaml" - - # Re-read original values (P4 already extracted these but re-extract for safety) - ORIGINAL_ENDPOINT=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'endpoint:' | head -1 | awk -F'"' '{print $2}') - ORIGINAL_PORT=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'endpointPort:' | head -1 | awk '{print $2}') - CLUSTER_NAME=$(grep -v '^[[:space:]]*#' "$SECRET_FILE" | grep 'clusterName:' | head -1 | awk -F'"' '{print $2}') - [ -z "$ORIGINAL_PORT" ] && ORIGINAL_PORT="8080" - [ -z "$CLUSTER_NAME" ] && CLUSTER_NAME="cluster1" - - # Step A1: Create proxy-server-root-certificate secret - ROOT_CERT_FILE="${AUTHZ_DIR}/rootCertificate.pem" - if [ -f "$ROOT_CERT_FILE" ] && [ -s "$ROOT_CERT_FILE" ]; then - echo "Creating proxy-server-root-certificate secret (secure mode)..." - kubectl -n isilon create secret generic proxy-server-root-certificate \ - --from-file=rootCertificate.pem="$ROOT_CERT_FILE" \ - -o yaml --dry-run=client | kubectl apply -f - - else - echo "Creating proxy-server-root-certificate secret (insecure mode - empty cert)..." 
- kubectl -n isilon create secret generic proxy-server-root-certificate \ - --from-literal=rootCertificate.pem= \ - -o yaml --dry-run=client | kubectl apply -f - - fi - - # Step A2: Create karavi-authorization-config secret - echo "Building karavi-authorization-config from CSI driver secret..." - cat > /tmp/karavi-authorization-config.json <<'EOF' -{% include 'powerscale/karavi_auth_config.yaml.j2' %} -EOF - - kubectl -n isilon create secret generic karavi-authorization-config \ - --from-file=config=/tmp/karavi-authorization-config.json \ - -o yaml --dry-run=client | kubectl apply -f - - rm -f /tmp/karavi-authorization-config.json - - # Step A3: Modify secret.yaml for sidecar communication - echo "Modifying CSI driver secret for authorization sidecar..." - ORIGINAL_HOST=$(echo "$ORIGINAL_ENDPOINT" | sed -E 's#https?://##' | sed -E 's#:[0-9]+.*##' | sed -E 's#/.*##') - - sed -i 's/^\([[:space:]]*\)endpoint:.*/\1endpoint: "https:\/\/localhost"/' "$SECRET_FILE" - sed -i 's/^\([[:space:]]*\)endpointPort:.*/\1endpointPort: 9400/' "$SECRET_FILE" - - if ! grep -q 'mountEndpoint:' "$SECRET_FILE"; then - sed -i "/endpointPort:/a\\ mountEndpoint: ${ORIGINAL_HOST}" "$SECRET_FILE" - else - sed -i "s/^\([[:space:]]*\)mountEndpoint:.*/\1mountEndpoint: ${ORIGINAL_HOST}/" "$SECRET_FILE" - fi - - sed -i 's/^\([[:space:]]*\)skipCertificateValidation:.*/\1skipCertificateValidation: true/' "$SECRET_FILE" - - # Step A4: Re-create isilon-creds secret - kubectl delete secret isilon-creds -n isilon 2>/dev/null || true - kubectl create secret generic isilon-creds -n isilon --from-file=config="$SECRET_FILE" - echo "isilon-creds secret re-created with sidecar configuration." - - # Step A5: Modify CSI driver values.yaml to enable authorization - echo "Enabling authorization in CSI driver values.yaml..." 
- if grep -q 'authorization:' "$VALUES_FILE"; then - sed -i '/^authorization:/,/^[^ ]/ { - s/^\([[:space:]]*\)enabled:.*/\1enabled: true/ - }' "$VALUES_FILE" - else - echo "" >> "$VALUES_FILE" - echo "authorization:" >> "$VALUES_FILE" - echo " enabled: true" >> "$VALUES_FILE" - fi - - if grep -q 'skipCertificateValidation:' "$VALUES_FILE"; then - sed -i '/^authorization:/,/^[^ ]/ { - s/^\([[:space:]]*\)skipCertificateValidation:.*/\1skipCertificateValidation: {{ authz_skip_cert | lower }}/ - }' "$VALUES_FILE" - fi - - echo "CSM Authorization sidecar configuration complete." - fi diff --git a/provision/roles/configure_ochami/templates/powerscale/karavi_auth_config.yaml.j2 b/provision/roles/configure_ochami/templates/powerscale/karavi_auth_config.yaml.j2 deleted file mode 100644 index 31e2693c30..0000000000 --- a/provision/roles/configure_ochami/templates/powerscale/karavi_auth_config.yaml.j2 +++ /dev/null @@ -1,15 +0,0 @@ -{ - "username": "admin", - "password": "", - "vCenter": "", - "vCenterPort": "", - "type": "PowerScale", - "arrays": [ - { - "endpoint": "${ORIGINAL_ENDPOINT}", - "port": "${ORIGINAL_PORT}", - "systemId": "${CLUSTER_NAME}", - "isDefault": true - } - ] -} diff --git a/provision/roles/configure_ochami/templates/powerscale/karavi_config.yaml.j2 b/provision/roles/configure_ochami/templates/powerscale/karavi_config.yaml.j2 deleted file mode 100644 index 280a5b2737..0000000000 --- a/provision/roles/configure_ochami/templates/powerscale/karavi_config.yaml.j2 +++ /dev/null @@ -1,2 +0,0 @@ -web: - jwtsigningsecret: ${AUTHZ_JWT_SECRET} diff --git a/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_authorization.sh.j2 b/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_authorization.sh.j2 deleted file mode 100644 index ad47ddf70b..0000000000 --- a/provision/roles/configure_ochami/templates/powerscale/verify_powerscale_authorization.sh.j2 +++ /dev/null @@ -1,205 +0,0 @@ -#!/bin/bash -{# Copyright 2026 Dell Inc. 
or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#} - -# CSM Authorization Proxy Server Verification Script -# Generated by Omnia provision playbook -# Validates CSM Authorization deployment, CRs, token, and sidecar - -{% set authz_tenants = hostvars['localhost']['service_cluster_info'].powerscale_authorization.tenants | default([]) %} -{% set authz_admin_name = hostvars['localhost']['service_cluster_info'].powerscale_authorization.admin_name | default('') %} -{% set authz_admin_secret = hostvars['localhost']['service_cluster_info'].powerscale_authorization.admin_secret | default('') %} -AUTHZ_NS="authorization" -ISILON_NS="isilon" -PASS=0 -FAIL=0 -WARN=0 - -echo "==============================================" -echo " CSM Authorization Verification" -echo "==============================================" -echo "" - -# 1. Check Kubernetes connectivity -echo "[1/9] Checking Kubernetes connectivity..." -if kubectl cluster-info &>/dev/null; then - echo " PASS: Kubernetes cluster is reachable" - ((PASS++)) -else - echo " FAIL: Cannot connect to Kubernetes cluster" - ((FAIL++)) -fi - -# 2. Check authorization namespace -echo "[2/9] Checking authorization namespace..." -if kubectl get namespace "$AUTHZ_NS" &>/dev/null; then - echo " PASS: Namespace '$AUTHZ_NS' exists" - ((PASS++)) -else - echo " FAIL: Namespace '$AUTHZ_NS' does not exist" - ((FAIL++)) -fi - -# 3. 
Check CSM Authorization Helm release -echo "[3/9] Checking CSM Authorization Helm release..." -AUTHZ_HELM_STATUS=$(helm status csm-authorization -n "${AUTHZ_NS}" -o json 2>/dev/null | grep -o '"status":"[^"]*"' | head -1) -if echo "$AUTHZ_HELM_STATUS" | grep -q "deployed"; then - echo " PASS: Helm release 'csm-authorization' is deployed" - ((PASS++)) -else - echo " FAIL: Helm release 'csm-authorization' not found or not deployed" - ((FAIL++)) -fi - -# 4. Check proxy server pods -echo "[4/9] Checking CSM Authorization Proxy Server pods..." -AUTHZ_PODS=$(kubectl get pods -n "${AUTHZ_NS}" --no-headers 2>/dev/null | wc -l) -AUTHZ_RUNNING=$(kubectl get pods -n "${AUTHZ_NS}" --no-headers 2>/dev/null | awk '$3=="Running"' | wc -l) -if [ "$AUTHZ_PODS" -gt 0 ] && [ "$AUTHZ_RUNNING" -eq "$AUTHZ_PODS" ]; then - echo " PASS: All proxy server pods running (${AUTHZ_RUNNING}/${AUTHZ_PODS})" - ((PASS++)) -elif [ "$AUTHZ_PODS" -gt 0 ]; then - echo " FAIL: Some proxy server pods not running (${AUTHZ_RUNNING}/${AUTHZ_PODS})" - kubectl get pods -n "${AUTHZ_NS}" --no-headers 2>/dev/null | awk '$3!="Running" && $3!="Completed" {print " " $1 " - " $3}' - ((FAIL++)) -else - echo " FAIL: No proxy server pods found in namespace '${AUTHZ_NS}'" - ((FAIL++)) -fi - -# 5. Check authorization CRs (Storage, Role, Tenant) -echo "[5/9] Checking Authorization Custom Resources..." 
-CR_PASS=0 -STORAGE_COUNT=$(kubectl get storage -n "${AUTHZ_NS}" --no-headers 2>/dev/null | wc -l) -if [ "$STORAGE_COUNT" -gt 0 ]; then - echo " PASS: ${STORAGE_COUNT} Storage CR(s) found" - CR_PASS=$((CR_PASS + 1)) -else - echo " FAIL: No Storage CRs found" -fi - -ROLE_COUNT=$(kubectl get csmrole -n "${AUTHZ_NS}" --no-headers 2>/dev/null | wc -l) -if [ "$ROLE_COUNT" -gt 0 ]; then - echo " PASS: ${ROLE_COUNT} CSMRole CR(s) found" - CR_PASS=$((CR_PASS + 1)) -else - echo " FAIL: No CSMRole CRs found" -fi - -TENANT_COUNT=$(kubectl get csmtenant -n "${AUTHZ_NS}" --no-headers 2>/dev/null | wc -l) -if [ "$TENANT_COUNT" -gt 0 ]; then - echo " PASS: ${TENANT_COUNT} CSMTenant CR(s) found" - CR_PASS=$((CR_PASS + 1)) -else - echo " FAIL: No CSMTenant CRs found" -fi - -if [ "$CR_PASS" -eq 3 ]; then - ((PASS++)) -else - ((FAIL++)) -fi - -# 6. Check authorization secrets in isilon namespace -echo "[6/9] Checking authorization secrets in isilon namespace..." -AUTHZ_SECRET_COUNT=0 -for secret_name in karavi-authorization-config proxy-server-root-certificate; do - if kubectl get secret "$secret_name" -n "$ISILON_NS" &>/dev/null; then - echo " PASS: Secret '$secret_name' exists" - AUTHZ_SECRET_COUNT=$((AUTHZ_SECRET_COUNT + 1)) - else - echo " FAIL: Secret '$secret_name' NOT found" - fi -done -if [ "$AUTHZ_SECRET_COUNT" -eq 2 ]; then - ((PASS++)) -else - ((FAIL++)) -fi - -# 7. Check proxy-authz-tokens secret (token applied) -echo "[7/9] Checking authorization token..." -if kubectl get secret proxy-authz-tokens -n "$ISILON_NS" &>/dev/null; then - echo " PASS: Token secret 'proxy-authz-tokens' exists in '$ISILON_NS'" - ((PASS++)) -else - echo " WARN: Token secret 'proxy-authz-tokens' NOT found in '$ISILON_NS'" - echo " Token may not have been generated yet. 
Manual steps:" -{% for tenant in authz_tenants %} - echo " dellctl admin token generate --admin-name {{ authz_admin_name }} --admin-secret {{ authz_admin_secret }} --tenant {{ tenant.name }} --access-token-expiration 30m --refresh-token-expiration 720h --addr --insecure true --output /tmp/token.yaml" -{% endfor %} - echo " kubectl apply -f /tmp/token.yaml -n $ISILON_NS" - ((WARN++)) -fi - -# 8. Check authorization sidecar in CSI driver pods -echo "[8/9] Checking authorization sidecar in CSI driver pods..." -SIDECAR_FOUND=$(kubectl get pods -n "$ISILON_NS" -l app=isilon-controller \ - -o jsonpath='{.items[0].spec.containers[*].name}' 2>/dev/null \ - | tr ' ' '\n' | grep -c "karavi-authorization-proxy") -if [ "$SIDECAR_FOUND" -gt 0 ]; then - echo " PASS: Authorization sidecar container found in CSI driver pods" - ((PASS++)) -else - echo " FAIL: Authorization sidecar container NOT found in CSI driver pods" - echo " Expected container: karavi-authorization-proxy" - echo " Actual containers:" - kubectl get pods -n "$ISILON_NS" -l app=isilon-controller \ - -o jsonpath='{.items[0].spec.containers[*].name}' 2>/dev/null | tr ' ' '\n' | sed 's/^/ /' - ((FAIL++)) -fi - -# 9. Functional test - check CSI driver can communicate via sidecar -echo "[9/9] Checking sidecar health..." 
-SIDECAR_LOG=$(kubectl logs -n "$ISILON_NS" -l app=isilon-controller -c karavi-authorization-proxy --tail=5 2>/dev/null) -if [ -n "$SIDECAR_LOG" ]; then - if echo "$SIDECAR_LOG" | grep -qi "error\|fatal\|panic"; then - echo " WARN: Sidecar has error logs (may need token)" - ((WARN++)) - else - echo " PASS: Sidecar running without critical errors" - ((PASS++)) - fi -else - echo " WARN: No sidecar logs available" - ((WARN++)) -fi - -# Summary -echo "" -echo "==============================================" -echo " Authorization Verification Summary" -echo "==============================================" -echo " PASSED: $PASS" -echo " FAILED: $FAIL" -echo " WARNINGS: $WARN" -echo "" - -if [ $FAIL -eq 0 ]; then - echo " STATUS: ALL CHECKS PASSED" -else - echo " STATUS: SOME CHECKS FAILED" - echo "" - echo " Debug Commands:" - echo " kubectl get pods -n $AUTHZ_NS" - echo " kubectl get storage,csmrole,csmtenant -n $AUTHZ_NS" - echo " kubectl get secret -n $ISILON_NS | grep -E 'karavi|proxy|authz'" - echo " kubectl logs -n $ISILON_NS -l app=isilon-controller -c karavi-authorization-proxy --tail=50" - echo " helm status csm-authorization -n $AUTHZ_NS" -fi - -echo "" -echo "==============================================" -exit $FAIL diff --git a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml index 08b35b79a9..ac573840b9 100644 --- a/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/provision/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -287,9 +287,3 @@ - name: Get CSI PowerScale driver dependencies when: hostvars['localhost']['csi_driver_powerscale_support'] ansible.builtin.include_tasks: get_powerscale_dependencies.yml - -- name: Deploy CSM Authorization Proxy Server (file validation + NFS staging) - when: - - hostvars['localhost']['csi_driver_powerscale_support'] | default(false) | bool - - hostvars['localhost']['service_cluster_info'].powerscale_authorization.enabled | 
default(false) | bool - ansible.builtin.include_tasks: deploy_powerscale_authorization.yml diff --git a/provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml b/provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml deleted file mode 100644 index 474490911e..0000000000 --- a/provision/roles/k8s_config/tasks/deploy_powerscale_authorization.yml +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- -# CSM Authorization Proxy Server: File Validation and NFS Staging -# This task runs independently of powerscale_telemetry_support. 
-# Gated by: powerscale_authorization.enabled AND csi_driver_powerscale_support - -- name: Validate CSI driver PowerScale is configured (required for authorization) - ansible.builtin.assert: - that: - - hostvars['localhost']['csi_driver_powerscale_support'] | default(false) | bool - fail_msg: "{{ ps_csi_driver_not_configured_msg }}" - -- name: Display CSM Authorization status - ansible.builtin.debug: - msg: "{{ ps_authz_enabled_msg }}" - verbosity: 2 - -- name: Set CSM Authorization configuration facts - ansible.builtin.set_fact: - ps_authz_values_file: >- - {{ hostvars['localhost']['service_cluster_info'].powerscale_authorization.csm_authorization_values_file_path | default('') }} - ps_authz_tenants: >- - {{ hostvars['localhost']['service_cluster_info'].powerscale_authorization.tenants | default([]) }} - no_log: true - -- name: Validate CSM Authorization values file path is provided - ansible.builtin.assert: - that: - - ps_authz_values_file | length > 0 - fail_msg: "{{ ps_authz_values_missing_msg }}" - -- name: Verify CSM Authorization values file exists - ansible.builtin.stat: - path: "{{ ps_authz_values_file }}" - register: authz_values_stat - delegate_to: localhost - -- name: Fail if CSM Authorization values file does not exist - ansible.builtin.fail: - msg: "{{ ps_authz_values_not_found_msg }}" - when: not authz_values_stat.stat.exists - -- name: Validate at least one tenant is defined - ansible.builtin.assert: - that: - - ps_authz_tenants | length > 0 - fail_msg: "{{ ps_authz_no_tenants_msg }}" - -- name: Check if PowerScale telemetry is enabled - ansible.builtin.set_fact: - ps_telemetry_enabled: false - when: hostvars['localhost']['telemetry_config'] is not defined - -- name: Determine PowerScale telemetry status - ansible.builtin.set_fact: - ps_telemetry_enabled: >- - {{ hostvars['localhost']['telemetry_config'].telemetry_sources.powerscale.telemetry_enabled | default(false) | bool }} - when: hostvars['localhost']['telemetry_config'] is defined - -- name: 
Auto-generate OTLP address if PowerScale telemetry is enabled - block: - - name: Get PowerScale cluster name from telemetry config - ansible.builtin.set_fact: - ps_cluster_name: >- - {{ hostvars['localhost']['telemetry_config'].telemetry_sources.powerscale.configurations[0].cluster_name | default('powerscale') }} - - - name: Construct OTLP collector address - ansible.builtin.set_fact: - otlp_collector_address: "otel-collector.{{ ps_cluster_name }}-observability.svc.cluster.local:4317" - - - name: Display auto-generated OTLP address - ansible.builtin.debug: - msg: "PowerScale telemetry enabled. Auto-configuring OTLP address: {{ otlp_collector_address }}" - verbosity: 1 - - - name: Inject OTLP address into CSM Authorization values file - ansible.builtin.replace: - path: "{{ ps_authz_values_file }}" - regexp: 'openTelemetryCollectorAddress:\s*""' - replace: 'openTelemetryCollectorAddress: "{{ otlp_collector_address }}"' - backup: yes - - - name: Inject OTLP address (alternative format - with null) - ansible.builtin.replace: - path: "{{ ps_authz_values_file }}" - regexp: 'openTelemetryCollectorAddress:\s*null' - replace: 'openTelemetryCollectorAddress: "{{ otlp_collector_address }}"' - when: ansible_check_mode is not defined - - when: ps_telemetry_enabled | default(false) | bool - -- name: Display message when telemetry is not enabled - ansible.builtin.debug: - msg: "PowerScale telemetry is not enabled. OTLP address will remain empty in values.yaml." 
- verbosity: 1 - when: not ps_telemetry_enabled | default(false) | bool - -- name: Create CSM Authorization directory on NFS share - ansible.builtin.file: - path: "{{ csm_authz_nfs_dir }}" - state: directory - mode: '0755' - -- name: Copy CSM Authorization values file to NFS share - ansible.builtin.copy: - src: "{{ ps_authz_values_file }}" - dest: "{{ csm_authz_nfs_dir }}/csm-authorization-values.yaml" - mode: '0600' - -- name: Create empty root certificate placeholder (insecure mode) - ansible.builtin.copy: - content: "" - dest: "{{ csm_authz_nfs_dir }}/rootCertificate.pem" - mode: '0600' - -- name: Generate JWT signing secret for authorization - ansible.builtin.set_fact: - ps_authz_jwt_secret: "{{ lookup('password', '/dev/null chars=ascii_letters,digits length=32') }}" - no_log: true - -- name: Stage JWT signing secret for cloud-init - ansible.builtin.copy: - content: | - AUTHZ_JWT_SECRET='{{ ps_authz_jwt_secret }}' - dest: "{{ csm_authz_nfs_dir }}/authz_jwt_secret.env" - mode: '0600' - no_log: true - -- name: Display CSM Authorization staging status - ansible.builtin.debug: - msg: "{{ ps_authz_staged_msg }}" - verbosity: 2 diff --git a/provision/roles/k8s_config/vars/main.yml b/provision/roles/k8s_config/vars/main.yml index 03b16db51a..5785565568 100644 --- a/provision/roles/k8s_config/vars/main.yml +++ b/provision/roles/k8s_config/vars/main.yml @@ -86,45 +86,3 @@ print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item. 
offline_path_x86_64: [] offline_path_aarch64: [] ssh_private_key_path: /root/.ssh/oim_rsa - -# ============================================================================ -# CSM Authorization Proxy Server Variables -# ============================================================================ -# Usage: deploy_powerscale_authorization.yml (file validation + NFS staging) -# Usage: cloud-init template (Helm install + sidecar config) -# Conditional: powerscale_authorization.enabled: true in omnia_config.yml - -# Namespace and Helm release constants -csm_authorization_isilon_namespace: "isilon" -csm_authz_proxy_namespace: "authorization" -csm_authz_proxy_helm_release: "csm-authorization" -csm_authz_proxy_helm_chart_path: "{{ k8s_client_mount_path }}/csi-driver-powerscale/csi-powerscale/helm-charts/charts/csm-authorization-v2.0" -csm_authz_nfs_dir: "{{ k8s_client_mount_path }}/csi-driver-powerscale/csm-authorization" - -# Messages -ps_authz_enabled_msg: "CSM Authorization module is ENABLED. Proxy server and sidecar will be deployed." -ps_authz_disabled_msg: "CSM Authorization module is DISABLED. Skipping authorization configuration." -ps_authz_values_missing_msg: >- - csm_authorization_values_file_path is required in omnia_config.yml - when powerscale_authorization.enabled is true. -ps_authz_values_not_found_msg: >- - CSM Authorization values file not found at '{{ ps_authz_values_file | default('') }}'. - Download from https://github.com/dell/helm-charts/blob/main/charts/csm-authorization-v2.0/values.yaml - and set the path in omnia_config.yml (powerscale_authorization.csm_authorization_values_file_path). -ps_authz_values_parse_fail_msg: >- - Failed to parse CSM Authorization values file at '{{ ps_authz_values_file | default('') }}'. - Please verify the file contains valid YAML. -ps_authz_no_tenants_msg: >- - At least one tenant must be defined in powerscale_authorization.tenants - when powerscale_authorization.enabled is true. 
-ps_authz_staged_msg: >- - CSM Authorization files staged on NFS share at {{ csm_authz_nfs_dir }}. - JWT signing secret auto-generated. - Proxy server Helm chart: {{ csm_authz_proxy_helm_chart_path }} - Deployment will occur during cloud-init (PXE boot) on the control plane node. -ps_authz_direct_mode_msg: >- - CSI PowerScale will be deployed in direct mode (no multi-tenant RBAC). - To enable CSM Authorization, set powerscale_authorization.enabled: true in omnia_config.yml. -ps_csi_driver_not_configured_msg: >- - PowerScale authorization requires csi_driver_powerscale to be configured in software_config.json. - Please add csi_driver_powerscale to software_config.json and re-run. From dd7e76591b09d2bc64e4fd9f3498db5d66d00b9e Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 18:19:42 +0530 Subject: [PATCH 34/63] remove authorization specific files --- .../rhel/10.0/csi_driver_powerscale.json | 50 ------------------- .../templates/powerscale/csm_role_cr.yaml.j2 | 10 ---- .../powerscale/csm_storage_cr.yaml.j2 | 10 ---- .../powerscale/csm_tenant_cr.yaml.j2 | 15 ------ 4 files changed, 85 deletions(-) delete mode 100644 provision/roles/configure_ochami/templates/powerscale/csm_role_cr.yaml.j2 delete mode 100644 provision/roles/configure_ochami/templates/powerscale/csm_storage_cr.yaml.j2 delete mode 100644 provision/roles/configure_ochami/templates/powerscale/csm_tenant_cr.yaml.j2 diff --git a/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json b/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json index 7d45d935a8..7470b45cf5 100644 --- a/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json +++ b/input/config/x86_64/rhel/10.0/csi_driver_powerscale.json @@ -69,51 +69,6 @@ "tag": "v2.4.0", "type": "image" }, - { - "package": "quay.io/dell/container-storage-modules/csm-authorization-proxy", - "tag": "v2.4.0", - "type": "image" - }, - { - "package":
"quay.io/dell/container-storage-modules/csm-authorization-tenant", - "tag": "v2.4.0", - "type": "image" - }, - { - "package": "quay.io/dell/container-storage-modules/csm-authorization-role", - "tag": "v2.4.0", - "type": "image" - }, - { - "package": "quay.io/dell/container-storage-modules/csm-authorization-storage", - "tag": "v2.4.0", - "type": "image" - }, - { - "package": "quay.io/dell/container-storage-modules/csm-authorization-controller", - "tag": "v2.4.0", - "type": "image" - }, - { - "package": "docker.io/openpolicyagent/opa", - "tag": "0.70.0", - "type": "image" - }, - { - "package": "docker.io/openpolicyagent/kube-mgmt", - "tag": "9.3.0", - "type": "image" - }, - { - "package": "docker.io/library/redis", - "tag": "8.4.0-alpine", - "type": "image" - }, - { - "package": "docker.io/rediscommander/redis-commander", - "tag": "latest", - "type": "image" - }, { "package": "quay.io/dell/container-storage-modules/csi-metadata-retriever", "tag": "v1.12.0", @@ -128,11 +83,6 @@ "package": "docker.io/dellemc/csm-encryption", "tag": "v0.6.0", "type": "image" - }, - { - "package": "dellctl", - "url": "https://github.com/dell/dellctl/releases/download/v1.7.0/dellctl-linux-amd64.tar.gz", - "type": "tarball" } ] } diff --git a/provision/roles/configure_ochami/templates/powerscale/csm_role_cr.yaml.j2 b/provision/roles/configure_ochami/templates/powerscale/csm_role_cr.yaml.j2 deleted file mode 100644 index 89bc9dd363..0000000000 --- a/provision/roles/configure_ochami/templates/powerscale/csm_role_cr.yaml.j2 +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: csm-authorization.storage.dell.com/v1alpha1 -kind: CSMRole -metadata: - name: {{ role.name }} - namespace: {{ authz_namespace }} -spec: - systemID: {{ cluster_name }} - systemType: isilon - pool: {{ role.storage_pool }} - quota: {{ role.quota_limit }} diff --git a/provision/roles/configure_ochami/templates/powerscale/csm_storage_cr.yaml.j2 b/provision/roles/configure_ochami/templates/powerscale/csm_storage_cr.yaml.j2 deleted file 
mode 100644 index 9fdd557cc4..0000000000 --- a/provision/roles/configure_ochami/templates/powerscale/csm_storage_cr.yaml.j2 +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: csm-authorization.storage.dell.com/v1alpha1 -kind: Storage -metadata: - name: powerscale-{{ cluster_name }} - namespace: {{ authz_namespace }} -spec: - type: isilon - endpoint: {{ endpoint }}:{{ port }} - systemID: {{ cluster_name }} - skipCertificateValidation: true diff --git a/provision/roles/configure_ochami/templates/powerscale/csm_tenant_cr.yaml.j2 b/provision/roles/configure_ochami/templates/powerscale/csm_tenant_cr.yaml.j2 deleted file mode 100644 index e05620b3f2..0000000000 --- a/provision/roles/configure_ochami/templates/powerscale/csm_tenant_cr.yaml.j2 +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: csm-authorization.storage.dell.com/v1alpha1 -kind: CSMTenant -metadata: - name: {{ tenant.name }} - namespace: {{ authz_namespace }} -spec: -{% if tenant.roles | default([]) %} - roles: -{% for role in tenant.roles %} - - {{ role.name }} -{% endfor %} -{% else %} - roles: [] -{% endif %} - revoke: false From d5576e21a28fb9519a3957dca31ed91b7312211d Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Fri, 8 May 2026 21:25:46 +0530 Subject: [PATCH 35/63] fix for DNS resolution --- provision/roles/telemetry/vars/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index bcec203a34..2b391c2657 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -311,9 +311,10 @@ vmagent: # Cluster URL (used when victoria_cluster.enabled: true) # Operator creates service as vminsert-victoria-cluster (not vminsert) # Protocol depends on tls_enabled: https when TLS is on, http otherwise + # Use service name without FQDN for better DNS resolution within cluster remote_write_url_cluster: >- {{ 'https' if 
victoria_cluster.tls_enabled else 'http' }}://vminsert-victoria-cluster.{{ - telemetry_namespace }}.svc.cluster.local:8480/insert/0/prometheus/api/v1/write + telemetry_namespace }}.svc:8480/insert/0/prometheus/api/v1/write strmzi_kafka_tarball_url: "{{ offline_tarball_path }}/{{ strimzi_kafka_pkg }}/{{ strimzi_kafka_pkg }}.tar.gz" From d8c3521c6c77e4f88e6bcbdbfdf445410270be51 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 9 May 2026 16:32:16 +0530 Subject: [PATCH 36/63] UT issue fixes --- .../common_utils/en_us_validation_msg.py | 12 +++- .../schema/telemetry_config.json | 63 ++++++++++++------ .../powerscale_telemetry_validation.py | 32 +--------- .../validation_flows/telemetry_validation.py | 64 +++++++++++++++++++ input/telemetry_config.yml | 26 ++++++-- .../victoria-agent-deployment.yaml.j2 | 4 +- .../victoria-operator-vmagent.yaml.j2 | 9 ++- .../victoria-operator-vmscrape.yaml.j2 | 4 +- .../victorialogs-operator-vlagent.yaml.j2 | 11 +++- provision/roles/telemetry/vars/main.yml | 18 +----- 10 files changed, 161 insertions(+), 82 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 28dec89fec..1f1685d81a 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -362,10 +362,16 @@ def powerscale_csm_values_parse_error_msg(error): POWERSCALE_OTEL_COLLECTOR_IMAGE_MISSING_MSG = ( "OTEL Collector image is required in CSM Observability values.yaml." ) -POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG = ( - "Each additional_remote_write_endpoint must have a non-empty 'url' field." +ADDITIONAL_METRIC_ENDPOINTS_URL_EMPTY_MSG = ( + "Each additional_metric_remote_write_endpoint must have a non-empty 'url' field."
) -POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG = ( +ADDITIONAL_METRIC_ENDPOINTS_URL_INVALID_MSG = ( + "URL must start with 'http://' or 'https://'." +) +ADDITIONAL_LOG_ENDPOINTS_URL_EMPTY_MSG = ( + "Each additional_log_write_endpoint must have a non-empty 'url' field." +) +ADDITIONAL_LOG_ENDPOINTS_URL_INVALID_MSG = ( "URL must start with 'http://' or 'https://'." ) def powerscale_image_version_mismatch_msg(image_name, values_image, service_k8s_image): diff --git a/common/library/module_utils/input_validation/schema/telemetry_config.json b/common/library/module_utils/input_validation/schema/telemetry_config.json index 4bf4de865e..28260bc42f 100644 --- a/common/library/module_utils/input_validation/schema/telemetry_config.json +++ b/common/library/module_utils/input_validation/schema/telemetry_config.json @@ -159,6 +159,27 @@ "minimum": 24, "default": 168, "description": "Metric retention period in hours. Default: 168 (7 days)." + }, + "additional_metric_remote_write_endpoints": { + "type": "array", + "default": [], + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "pattern": "^https?://", + "description": "Metrics remote_write endpoint URL." + }, + "tls_insecure_skip_verify": { + "type": "boolean", + "default": false, + "description": "Skip TLS certificate verification for this endpoint." + } + }, + "required": ["url"] + }, + "description": "Additional metrics remote_write endpoints. Metrics are sent to Omnia VictoriaMetrics AND these endpoints." } }, "required": ["persistence_size", "retention_period"] @@ -179,6 +200,27 @@ "minimum": 24, "default": 168, "description": "Log retention period in hours. Default: 168 (7 days)." + }, + "additional_log_write_endpoints": { + "type": "array", + "default": [], + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "pattern": "^https?://", + "description": "Logs remote_write endpoint URL." 
+ }, + "tls_insecure_skip_verify": { + "type": "boolean", + "default": false, + "description": "Skip TLS certificate verification for this endpoint." + } + }, + "required": ["url"] + }, + "description": "Additional log remote_write endpoints. Logs are sent to Omnia VictoriaLogs AND these endpoints." } }, "required": ["storage_size", "retention_period"] @@ -353,27 +395,6 @@ "csm_observability_values_file_path": { "type": "string", "description": "Path to the CSM Observability (Karavi) values.yaml file." - }, - "additional_remote_write_endpoints": { - "type": "array", - "default": [], - "items": { - "type": "object", - "properties": { - "url": { - "type": "string", - "pattern": "^https?://", - "description": "victoria_metrics remote_write endpoint URL." - }, - "tls_insecure_skip_verify": { - "type": "boolean", - "default": false, - "description": "Skip TLS certificate verification for this endpoint." - } - }, - "required": ["url"] - }, - "description": "Additional victoria_metrics remote_write endpoints." 
} }, "required": ["otel_collector_storage_size", "csm_observability_values_file_path"] diff --git a/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py b/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py index 312e859875..9b0ef57217 100644 --- a/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py @@ -342,32 +342,6 @@ def validate_powerscale_telemetry_config( en_us_validation_msg.POWERSCALE_VICTORIA_LOGS_REQUIRED_MSG )) - # Validate additional_remote_write_endpoints - # (applies to metrics deployment) - additional_endpoints = powerscale_config.get( - "additional_remote_write_endpoints", [] - ) - if additional_endpoints and isinstance(additional_endpoints, list): - if len(additional_endpoints) > 5: - logger.warning( - f"More than 5 additional_remote_write_endpoints " - f"configured ({len(additional_endpoints)}). " - "This may impact performance." - ) - for idx, endpoint in enumerate(additional_endpoints): - if not isinstance(endpoint, dict): - continue - url = endpoint.get("url", "") - if not url or not isinstance(url, str): - errors.append(create_error_msg( - f"powerscale_configurations.additional_remote_write_endpoints[{idx}].url", - url, - en_us_validation_msg.POWERSCALE_ADDITIONAL_ENDPOINTS_URL_EMPTY_MSG - )) - elif (not url.startswith("http://") and - not url.startswith("https://")): - errors.append(create_error_msg( - f"powerscale_configurations.additional_remote_write_endpoints[{idx}].url", - url, - en_us_validation_msg.POWERSCALE_ADDITIONAL_ENDPOINTS_URL_INVALID_MSG - )) + # NOTE: additional_remote_write_endpoints and additional_log_write_endpoints + # have been moved to telemetry_sinks.victoria_metrics and + # telemetry_sinks.victoria_logs. 
Validation is now handled in telemetry_validation.py diff --git a/common/library/module_utils/input_validation/validation_flows/telemetry_validation.py b/common/library/module_utils/input_validation/validation_flows/telemetry_validation.py index 4b14351b21..cd2ee50b4d 100644 --- a/common/library/module_utils/input_validation/validation_flows/telemetry_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/telemetry_validation.py @@ -579,6 +579,70 @@ def validate_telemetry_config( f"ldms_source.metrics_enabled={ldms_source_enabled}" ) + # ========================================================================= + # Validate additional_metric_remote_write_endpoints (victoria_metrics) + # ========================================================================= + victoria_metrics_sink = telemetry_sinks.get("victoria_metrics", {}) + additional_metric_endpoints = victoria_metrics_sink.get( + "additional_metric_remote_write_endpoints", [] + ) + if additional_metric_endpoints and isinstance(additional_metric_endpoints, list): + if len(additional_metric_endpoints) > 5: + logger.warning( + f"More than 5 additional_metric_remote_write_endpoints " + f"configured ({len(additional_metric_endpoints)}). " + "This may impact performance." 
+ ) + for idx, endpoint in enumerate(additional_metric_endpoints): + if not isinstance(endpoint, dict): + continue + url = endpoint.get("url", "") + if not url or not isinstance(url, str): + errors.append(create_error_msg( + f"telemetry_sinks.victoria_metrics.additional_metric_remote_write_endpoints[{idx}].url", + url, + en_us_validation_msg.ADDITIONAL_METRIC_ENDPOINTS_URL_EMPTY_MSG + )) + elif (not url.startswith("http://") and + not url.startswith("https://")): + errors.append(create_error_msg( + f"telemetry_sinks.victoria_metrics.additional_metric_remote_write_endpoints[{idx}].url", + url, + en_us_validation_msg.ADDITIONAL_METRIC_ENDPOINTS_URL_INVALID_MSG + )) + + # ========================================================================= + # Validate additional_log_write_endpoints (victoria_logs) + # ========================================================================= + victoria_logs_sink = telemetry_sinks.get("victoria_logs", {}) + additional_log_endpoints = victoria_logs_sink.get( + "additional_log_write_endpoints", [] + ) + if additional_log_endpoints and isinstance(additional_log_endpoints, list): + if len(additional_log_endpoints) > 5: + logger.warning( + f"More than 5 additional_log_write_endpoints " + f"configured ({len(additional_log_endpoints)}). " + "This may impact performance." 
+ ) + for idx, endpoint in enumerate(additional_log_endpoints): + if not isinstance(endpoint, dict): + continue + url = endpoint.get("url", "") + if not url or not isinstance(url, str): + errors.append(create_error_msg( + f"telemetry_sinks.victoria_logs.additional_log_write_endpoints[{idx}].url", + url, + en_us_validation_msg.ADDITIONAL_LOG_ENDPOINTS_URL_EMPTY_MSG + )) + elif (not url.startswith("http://") and + not url.startswith("https://")): + errors.append(create_error_msg( + f"telemetry_sinks.victoria_logs.additional_log_write_endpoints[{idx}].url", + url, + en_us_validation_msg.ADDITIONAL_LOG_ENDPOINTS_URL_INVALID_MSG + )) + # ========================================================================= # Validate PowerScale telemetry configuration # ========================================================================= diff --git a/input/telemetry_config.yml b/input/telemetry_config.yml index acacf2c26b..a66c418ecb 100644 --- a/input/telemetry_config.yml +++ b/input/telemetry_config.yml @@ -214,6 +214,17 @@ telemetry_sinks: # Default: 168 (7 days) retention_period: 168 + # Additional remote write endpoints for metrics (optional) + # Metrics will be sent to the Omnia-managed VictoriaMetrics AND to these endpoints. + # Each entry requires a 'url' field (must start with http:// or https://). + # Set tls_insecure_skip_verify: true to skip TLS certificate verification. 
+ # Default: [] (only Omnia VictoriaMetrics receives metrics) + # Example: + # additional_metric_remote_write_endpoints: + # - url: https://external-metrics-server:8480/insert/0/prometheus/api/v1/write + # tls_insecure_skip_verify: false + additional_metric_remote_write_endpoints: [] + # -------------------------------------------------------------------------- # victoria_logs — Centralized log storage and querying # -------------------------------------------------------------------------- @@ -230,6 +241,17 @@ telemetry_sinks: # Default: 168 (7 days) retention_period: 168 + # Additional remote write endpoints for logs (optional) + # Logs will be sent to the Omnia-managed VictoriaLogs AND to these endpoints. + # Each entry requires a 'url' field (must start with http:// or https://). + # Set tls_insecure_skip_verify: true to skip TLS certificate verification. + # Default: [] (only Omnia VictoriaLogs receives logs) + # Example: + # additional_log_write_endpoints: + # - url: https://external-logs-server:9481/internal/insert + # tls_insecure_skip_verify: false + additional_log_write_endpoints: [] + # -------------------------------------------------------------------------- # Kafka — Distributed streaming platform # -------------------------------------------------------------------------- @@ -327,7 +349,3 @@ powerscale_configurations: # Required when powerscale_configurations.powerscale_telemetry_support: true # Reference: https://raw.githubusercontent.com/dell/helm-charts/refs/heads/release-v1.16.3/charts/karavi-observability/values.yaml csm_observability_values_file_path: "" - - # Additional victoria_metrics remote_write endpoints (optional) - # Default: [] (empty — only the primary Omnia victoria_metrics endpoint is used) - additional_remote_write_endpoints: [] diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 index 
3fb8ad791f..5fa8381565 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-agent-deployment.yaml.j2 @@ -44,14 +44,12 @@ spec: - -remoteWrite.tlsCAFile=/etc/victoria/certs/ca.crt - -remoteWrite.tlsInsecureSkipVerify=false {% endif %} -{% if hostvars['localhost']['telemetry_config']['telemetry_sources']['powerscale']['metrics_enabled'] | default(false) | bool %} -{% for endpoint in telemetry_config.powerscale_configurations.additional_remote_write_endpoints | default([]) %} +{% for endpoint in telemetry_config.telemetry_sinks.victoria_metrics.additional_metric_remote_write_endpoints | default([]) %} - -remoteWrite.url={{ endpoint.url }} {% if endpoint.tls_insecure_skip_verify | default(false) %} - -remoteWrite.tlsInsecureSkipVerify=true {% endif %} {% endfor %} -{% endif %} volumeMounts: - name: scrape-config mountPath: "/etc/vmagent" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 index 970c9b20aa..93efb4007b 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmagent.yaml.j2 @@ -50,6 +50,11 @@ spec: tlsConfig: insecureSkipVerify: true {% endif %} +{% for endpoint in telemetry_config.telemetry_sinks.victoria_metrics.additional_metric_remote_write_endpoints | default([]) %} + - url: {{ endpoint.url }} + tlsConfig: + insecureSkipVerify: {{ endpoint.tls_insecure_skip_verify | default(false) | lower }} +{% endfor %} # Resource limits resources: @@ -60,11 +65,13 @@ spec: memory: "{{ victoria_cluster.vmagent.resources.limits.memory}}" cpu: "{{ victoria_cluster.vmagent.resources.limits.cpu}}" - # Service discovery configs - operator uses VMServiceScrape/VMPodScrape CRDs + # Service 
discovery configs - operator uses VMServiceScrape/VMPodScrape/VMNodeScrape CRDs serviceScrapeNamespaceSelector: {} serviceScrapeSelector: {} podScrapeNamespaceSelector: {} podScrapeSelector: {} + nodeScrapeNamespaceSelector: {} + nodeScrapeSelector: {} # Extra args extraArgs: diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 index f4a71fdc81..b1a4c58b03 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 @@ -59,11 +59,11 @@ spec: # Target OTEL collector service selector: matchLabels: - app: otel-collector + app.kubernetes.io/name: otel-collector # Service metrics endpoints endpoints: - - port: "8889" + - port: prometheus interval: {{ vmagent.global.scrape_interval }} path: /metrics diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 index a42f32fb9e..1c8accfe4d 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 @@ -69,13 +69,16 @@ spec: # ======================================== # Forward logs to VictoriaLogs vlinsert endpoint # Supports JSON Lines format with optional TLS - # Using short service name (same namespace) to avoid DNS resolution issues + # Using service name only (same namespace) to avoid DNS resolution issues remoteWrite: {% if victoria_logs_cluster.tls_enabled %} - - url: https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert + - url: https://vlinsert-victoria-logs-cluster:9481/internal/insert {% else %} - - url: 
http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert + - url: http://vlinsert-victoria-logs-cluster:9481/internal/insert {% endif %} +{% for endpoint in telemetry_config.telemetry_sinks.victoria_logs.additional_log_write_endpoints | default([]) %} + - url: {{ endpoint.url }} +{% endfor %} # ======================================== # Resource Allocation @@ -166,10 +169,12 @@ spec: port: 514 targetPort: 514 protocol: TCP + nodePort: 32399 - name: syslog-udp port: 514 targetPort: 514 protocol: UDP + nodePort: 32400 # ======================================== # Health Probes diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 2b391c2657..509bc95e54 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -265,19 +265,6 @@ victoria_logs_cluster: memory: "512Mi" cpu: "250m" -# VictoriaLogs ports (operator v0.66.1 defaults) -# Note: VictoriaMetrics operator uses these default ports and ignores custom port specifications -# will uncomment after verify that victoria oprator does not takes default port -# victoria_logs_ports: -# vlinsert_http: 9481 # Ingestion clients → vlinsert (HTTPS, LoadBalancer) - operator default -# vlselect_http: 9471 # Query clients → vlselect (HTTPS, LoadBalancer) - operator default -# vlstorage_http: 9491 # Health checks and admin API (internal) - operator default -# vlstorage_insert: 9400 # vlinsert → vlstorage data sharding (HTTPS, internal) -# vlstorage_select: 9401 # vlselect → vlstorage query fan-out (HTTPS, internal) -# vlagent_syslog: 514 # Syslog receiver plaintext (TCP+UDP) -# vlagent_syslog_tls: 6514 # Syslog receiver TLS (TCP, RFC 5425) -# vlagent_http: 9429 # VLAgent health checks (distinct from vmagent 8429) - # Telemetry shared path configuration telemetry_share_path: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/telemetry" @@ -311,10 +298,9 @@ vmagent: # Cluster URL (used when 
victoria_cluster.enabled: true) # Operator creates service as vminsert-victoria-cluster (not vminsert) # Protocol depends on tls_enabled: https when TLS is on, http otherwise - # Use service name without FQDN for better DNS resolution within cluster + # Use service name only since vmagent and vminsert are in same namespace remote_write_url_cluster: >- - {{ 'https' if victoria_cluster.tls_enabled else 'http' }}://vminsert-victoria-cluster.{{ - telemetry_namespace }}.svc:8480/insert/0/prometheus/api/v1/write + {{ 'https' if victoria_cluster.tls_enabled else 'http' }}://vminsert-victoria-cluster:8480/insert/0/prometheus/api/v1/write strmzi_kafka_tarball_url: "{{ offline_tarball_path }}/{{ strimzi_kafka_pkg }}/{{ strimzi_kafka_pkg }}.tar.gz" From b3091f9ecfdbffbed98ccec4ee6febfc070a2355 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 9 May 2026 17:31:09 +0530 Subject: [PATCH 37/63] FQDN for log collection --- .../victoria/victorialogs-operator-vlagent.yaml.j2 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 index 1c8accfe4d..4057aee91f 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 @@ -69,12 +69,12 @@ spec: # ======================================== # Forward logs to VictoriaLogs vlinsert endpoint # Supports JSON Lines format with optional TLS - # Using service name only (same namespace) to avoid DNS resolution issues + # Using FQDN (fully qualified domain name) to ensure DNS resolution remoteWrite: {% if victoria_logs_cluster.tls_enabled %} - - url: https://vlinsert-victoria-logs-cluster:9481/internal/insert + - url:
https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert {% else %} - - url: http://vlinsert-victoria-logs-cluster:9481/internal/insert + - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert {% endif %} {% for endpoint in telemetry_config.telemetry_sinks.victoria_logs.additional_log_write_endpoints | default([]) %} - url: {{ endpoint.url }} From 77bb185b09635a0d757badb4dde48912e240421d Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 9 May 2026 17:52:57 +0530 Subject: [PATCH 38/63] controller side external health monitor --- .../victoria-operator-vmscrape.yaml.j2 | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 index b1a4c58b03..6abf26b38b 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 @@ -83,6 +83,34 @@ spec: replacement: otel-collector {% endif %} +--- +{% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} +# VMServiceScrape for CSI PowerScale Controller Health Monitor +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMServiceScrape +metadata: + name: csi-powerscale-health-monitor + namespace: {{ telemetry_namespace }} +spec: + # Target CSI PowerScale controller service + selector: + matchLabels: + app: isilon + + # Service metrics endpoints + endpoints: + - port: health-monitor + interval: {{ vmagent.global.scrape_interval }} + path: /metrics + + # Relabel configs + relabelConfigs: + - targetLabel: source + replacement: powerscale + - targetLabel: component + replacement: csi-health-monitor +{% endif %} + --- {% if 
hostvars['localhost']['powerscale_volume_health_enabled'] | default(false) %} # VMNodeScrape for Kubelet Volume Health Monitoring metrics From 2fbcd19679e1d62b5a8bc24d6186783b07cf04a9 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 9 May 2026 21:40:39 +0530 Subject: [PATCH 39/63] custom exporter for scraping powerscale health monitor metrics --- .../config/x86_64/rhel/10.0/service_k8s.json | 3 + .../tasks/generate_telemetry_deployments.yml | 23 ++ .../templates/telemetry/kustomization.yaml.j2 | 4 + .../victoria/csi-volume-exporter.yaml.j2 | 378 ++++++++++++++++++ .../victoria-operator-vmscrape.yaml.j2 | 49 +-- .../victoria/victoria-vmagent-rbac.yaml.j2 | 32 ++ .../victorialogs-operator-vlagent.yaml.j2 | 6 +- 7 files changed, 449 insertions(+), 46 deletions(-) create mode 100644 provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index 086ced3db4..7300c7ebf7 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -23,12 +23,15 @@ { "package": "docker.io/curlimages/curl", "type": "image", "tag": "8.17.0" }, { "package": "docker.io/rmohr/activemq", "type": "image", "tag": "5.15.9" }, { "package": "docker.io/library/mysql", "type": "image", "tag": "9.3.0" }, + { "package": "docker.io/library/python", "type": "image", "tag": "3.11-slim" }, { "package": "docker.io/dellhpcomniaaisolution/idrac_telemetry_receiver", "type": "image", "tag": "1.2" }, { "package": "docker.io/dellhpcomniaaisolution/kafkapump", "type": "image", "tag": "1.2" }, { "package": "docker.io/dellhpcomniaaisolution/victoriapump", "type": "image", "tag": "1.2" }, { "package": "cryptography==45.0.7", "type": "pip_module" }, { "package": "omsdk==1.2.518", "type": "pip_module" }, { "package": "cffi==1.17.1", "type": "pip_module" }, + { "package":
"prometheus_client==0.20.0", "type": "pip_module" }, + { "package": "kubernetes==29.0.0", "type": "pip_module" }, { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml index 729a1db05c..0b262ff6a6 100644 --- a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml @@ -129,6 +129,29 @@ dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/{{ victoria_operator_pkg }}.tar.gz" mode: "{{ hostvars['localhost']['file_permissions_644'] }}" +- name: CSI Volume Exporter Python packages configuration + when: + - victoria_metrics_support | default(false) | bool + - telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool + block: + - name: Create python-packages directory + ansible.builtin.file: + path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/python-packages" + state: directory + mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" + + - name: Extract Python package names from service_k8s.json (pip_module type) + ansible.builtin.set_fact: + csi_pip_modules: "{{ k8s_packages_json['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', 'prometheus_client|kubernetes') | map(attribute='package') | list }}" + + - name: Download Python pip_modules from Pulp + ansible.builtin.get_url: + url: "{{ offline_tarball_path }}/{{ item }}/{{ item }}" + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/python-packages/{{ item }}" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + loop: "{{ csi_pip_modules }}" 
+ when: csi_pip_modules | length > 0 + - name: Populate common telemetry deployment configs ansible.builtin.template: src: "{{ item.src }}" diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index 09cc0ae1e6..ef8411f789 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -19,6 +19,10 @@ resources: - victoria-operator-vmagent.yaml # VMScrape CR (native operator-based service/pod discovery for metrics) - victoria-operator-vmscrape.yaml +{% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} + # CSI Volume Exporter - PowerScale PV status metrics + - csi-volume-exporter.yaml +{% endif %} {% endif %} {% if victoria_logs_support | default(false) %} # victoria_logs Resources (Logs Only) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 new file mode 100644 index 0000000000..71742dc55e --- /dev/null +++ b/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 @@ -0,0 +1,378 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# CSI Volume Exporter - Exposes PowerScale volume health metrics as Prometheus metrics +# Deployed when powerscale metrics are enabled +# Covers all CSI external-health-monitor metrics: PV status, PVC binding, capacity, +# volume health events, node failure, storage class info, and aggregate counts +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ csi_volume_exporter.service_account_name }} + namespace: {{ telemetry_namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ csi_volume_exporter.cluster_role_name }} +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["events"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses", "csinodes"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ csi_volume_exporter.cluster_role_binding_name }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ csi_volume_exporter.cluster_role_name }} +subjects: + - kind: ServiceAccount + name: {{ csi_volume_exporter.service_account_name }} + namespace: {{ telemetry_namespace }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ csi_volume_exporter.app_name }} + namespace: {{ telemetry_namespace }} + labels: + app: {{ csi_volume_exporter.app_name }} + source: powerscale + component: volume-status +spec: + replicas: 1 + selector: + matchLabels: + app: {{ csi_volume_exporter.app_name }} + template: + metadata: + labels: + app: {{ csi_volume_exporter.app_name }} + source: powerscale + component: volume-status + spec: + serviceAccountName: {{ csi_volume_exporter.service_account_name }} + initContainers: + - name: install-deps + 
image: {{ csi_volume_exporter.image }} + command: ["/bin/sh", "-c"] + args: + - | + pip install --no-index --find-links=/packages \ + prometheus_client kubernetes || \ + pip install prometheus_client kubernetes + volumeMounts: + - name: python-packages + mountPath: /packages + - name: app-dir + mountPath: /usr/local/lib/python3.11/site-packages-shared + containers: + - name: exporter + image: {{ csi_volume_exporter.image }} + command: ["/bin/sh", "-c"] + args: + - | + cp -r /usr/local/lib/python3.11/site-packages-shared/* /usr/local/lib/python3.11/site-packages/ 2>/dev/null || true + mkdir -p /app + cat > /app/exporter.py <<'PYEOF' + from prometheus_client import start_http_server, Gauge, Counter, Info + from kubernetes import client, config + import time + import datetime + + PROVISIONER = 'csi-isilon.dellemc.com' + + config.load_incluster_config() + v1 = client.CoreV1Api() + storage_v1 = client.StorageV1Api() + + # ── PV metrics ── + pv_status = Gauge('powerscale_volume_status', + 'PV phase (1=Bound, 0=Other)', + ['pv_name', 'phase']) + pv_count = Gauge('powerscale_volume_count', + 'Total PowerScale PVs by phase', + ['phase']) + pv_capacity_bytes = Gauge('powerscale_volume_capacity_bytes', + 'PV capacity in bytes', + ['pv_name']) + pv_info = Gauge('powerscale_volume_info', + 'PV metadata (always 1)', + ['pv_name', 'phase', 'storage_class', 'reclaim_policy', + 'access_modes', 'volume_handle', 'pvc_name', 'pvc_namespace']) + pv_age_seconds = Gauge('powerscale_volume_age_seconds', + 'Seconds since PV creation', + ['pv_name']) + + # ── PVC metrics ── + pvc_status = Gauge('powerscale_pvc_status_phase', + 'PVC phase (1=Bound, 0=Other)', + ['pvc_name', 'pvc_namespace', 'phase']) + pvc_requested_bytes = Gauge('powerscale_pvc_requested_bytes', + 'PVC requested storage in bytes', + ['pvc_name', 'pvc_namespace']) + pvc_count = Gauge('powerscale_pvc_count', + 'Total PowerScale PVCs by phase', + ['phase']) + + # ── Health event metrics (from CSI 
external-health-monitor-controller) ── + volume_condition_abnormal = Gauge( + 'powerscale_volume_health_abnormal', + 'Volume condition abnormal (1=abnormal, 0=healthy)', + ['pvc_name', 'pvc_namespace', 'pv_name']) + volume_abnormal_events_total = Gauge( + 'powerscale_volume_abnormal_events_total', + 'Total VolumeConditionAbnormal events for PVC', + ['pvc_name', 'pvc_namespace']) + node_failure_events_total = Gauge( + 'powerscale_node_failure_events_total', + 'Total node failure events affecting PowerScale PVCs', + ['node']) + + # ── Node metrics ── + node_condition_ready = Gauge( + 'powerscale_node_ready', + 'Node Ready condition (1=True, 0=False)', + ['node']) + + # ── Storage class metrics ── + sc_info = Gauge('powerscale_storageclass_info', + 'StorageClass metadata (always 1)', + ['storageclass', 'provisioner', 'reclaim_policy', + 'volume_binding_mode', 'allow_volume_expansion']) + + # ── Aggregate summary ── + total_capacity_bytes = Gauge( + 'powerscale_total_capacity_bytes', + 'Total capacity of all PowerScale PVs in bytes') + + def parse_k8s_quantity(q): + if q is None: + return 0 + suffixes = {'Ki': 2**10, 'Mi': 2**20, 'Gi': 2**30, 'Ti': 2**40, + 'Pi': 2**50, 'k': 10**3, 'M': 10**6, 'G': 10**9, + 'T': 10**12, 'P': 10**15} + for s, mul in suffixes.items(): + if q.endswith(s): + return int(float(q[:-len(s)]) * mul) + return int(q) + + def collect_pv_metrics(): + pvs = v1.list_persistent_volume() + pv_phases = {} + total_cap = 0 + pv_to_pvc = {} + now = datetime.datetime.now(datetime.timezone.utc) + for pv in pvs.items: + ann = pv.metadata.annotations or {} + prov = ann.get('pv.kubernetes.io/provisioned-by', '') + if PROVISIONER not in prov: + continue + name = pv.metadata.name + phase = pv.status.phase or 'Unknown' + sc = pv.spec.storage_class_name or '' + reclaim = pv.spec.persistent_volume_reclaim_policy or '' + access = ','.join(pv.spec.access_modes or []) + vol_handle = '' + if pv.spec.csi and pv.spec.csi.volume_handle: + vol_handle = 
pv.spec.csi.volume_handle + pvc_name = '' + pvc_ns = '' + if pv.spec.claim_ref: + pvc_name = pv.spec.claim_ref.name or '' + pvc_ns = pv.spec.claim_ref.namespace or '' + pv_to_pvc[name] = (pvc_name, pvc_ns) + cap = parse_k8s_quantity( + pv.spec.capacity.get('storage', '0') if pv.spec.capacity else '0') + total_cap += cap + pv_status.labels(pv_name=name, phase=phase).set( + 1 if phase == 'Bound' else 0) + pv_capacity_bytes.labels(pv_name=name).set(cap) + pv_info.labels(pv_name=name, phase=phase, storage_class=sc, + reclaim_policy=reclaim, access_modes=access, + volume_handle=vol_handle, pvc_name=pvc_name, + pvc_namespace=pvc_ns).set(1) + if pv.metadata.creation_timestamp: + age = (now - pv.metadata.creation_timestamp).total_seconds() + pv_age_seconds.labels(pv_name=name).set(age) + pv_phases[phase] = pv_phases.get(phase, 0) + 1 + for phase, count in pv_phases.items(): + pv_count.labels(phase=phase).set(count) + total_capacity_bytes.set(total_cap) + return pv_to_pvc + + def collect_pvc_metrics(pv_to_pvc): + pvcs = v1.list_persistent_volume_claim_for_all_namespaces() + pvc_phases = {} + pvc_set = set((v[0], v[1]) for v in pv_to_pvc.values() if v[0]) + for pvc in pvcs.items: + key = (pvc.metadata.name, pvc.metadata.namespace) + if key not in pvc_set: + ann = pvc.metadata.annotations or {} + prov = ann.get('volume.kubernetes.io/storage-provisioner', '') + if PROVISIONER not in prov: + continue + name = pvc.metadata.name + ns = pvc.metadata.namespace + phase = pvc.status.phase or 'Unknown' + pvc_status.labels(pvc_name=name, pvc_namespace=ns, + phase=phase).set(1 if phase == 'Bound' else 0) + req = '0' + if pvc.spec.resources and pvc.spec.resources.requests: + req = pvc.spec.resources.requests.get('storage', '0') + pvc_requested_bytes.labels(pvc_name=name, + pvc_namespace=ns).set(parse_k8s_quantity(req)) + pvc_phases[phase] = pvc_phases.get(phase, 0) + 1 + for phase, count in pvc_phases.items(): + pvc_count.labels(phase=phase).set(count) + + def 
collect_health_events(pv_to_pvc): + pvc_set = set((v[0], v[1]) for v in pv_to_pvc.values() if v[0]) + abnormal_counts = {} + node_fail_counts = {} + healthy_pvcs = set(pvc_set) + try: + events = v1.list_event_for_all_namespaces( + field_selector='reason=VolumeConditionAbnormal') + for ev in events.items: + if ev.involved_object.kind != 'PersistentVolumeClaim': + continue + pvc_name = ev.involved_object.name + pvc_ns = ev.involved_object.namespace or '' + key = (pvc_name, pvc_ns) + if key in pvc_set: + cnt = ev.count or 1 + abnormal_counts[key] = abnormal_counts.get(key, 0) + cnt + healthy_pvcs.discard(key) + except Exception as e: + print(f"Error fetching VolumeConditionAbnormal events: {e}") + try: + events = v1.list_event_for_all_namespaces( + field_selector='reason=NodeFailure') + for ev in events.items: + node = ev.involved_object.name or '' + cnt = ev.count or 1 + node_fail_counts[node] = node_fail_counts.get(node, 0) + cnt + except Exception as e: + print(f"Error fetching NodeFailure events: {e}") + pvc_to_pv = {} + for pv_name, (pvc_name, pvc_ns) in pv_to_pvc.items(): + if pvc_name: + pvc_to_pv[(pvc_name, pvc_ns)] = pv_name + for key, cnt in abnormal_counts.items(): + pv_name = pvc_to_pv.get(key, '') + volume_condition_abnormal.labels(pvc_name=key[0], + pvc_namespace=key[1], pv_name=pv_name).set(1) + volume_abnormal_events_total.labels(pvc_name=key[0], + pvc_namespace=key[1]).set(cnt) + for key in healthy_pvcs: + pv_name = pvc_to_pv.get(key, '') + volume_condition_abnormal.labels(pvc_name=key[0], + pvc_namespace=key[1], pv_name=pv_name).set(0) + for node, cnt in node_fail_counts.items(): + node_failure_events_total.labels(node=node).set(cnt) + + def collect_node_metrics(): + nodes = v1.list_node() + for n in nodes.items: + name = n.metadata.name + ready = 0 + for cond in (n.status.conditions or []): + if cond.type == 'Ready': + ready = 1 if cond.status == 'True' else 0 + break + node_condition_ready.labels(node=name).set(ready) + + def 
collect_storageclass_metrics(): + scs = storage_v1.list_storage_class() + for sc in scs.items: + if PROVISIONER not in (sc.provisioner or ''): + continue + sc_info.labels( + storageclass=sc.metadata.name, + provisioner=sc.provisioner or '', + reclaim_policy=sc.reclaim_policy or '', + volume_binding_mode=sc.volume_binding_mode or '', + allow_volume_expansion=str( + sc.allow_volume_expansion or False).lower() + ).set(1) + + def collect_all(): + try: + pv_to_pvc = collect_pv_metrics() + collect_pvc_metrics(pv_to_pvc) + collect_health_events(pv_to_pvc) + collect_node_metrics() + collect_storageclass_metrics() + except Exception as e: + print(f"Error in collection cycle: {e}") + + if __name__ == '__main__': + start_http_server({{ csi_volume_exporter.metrics_port }}) + print("CSI Volume Exporter started on port {{ csi_volume_exporter.metrics_port }}") + collect_all() + while True: + time.sleep({{ csi_volume_exporter.scrape_interval_seconds }}) + collect_all() + PYEOF + python /app/exporter.py + ports: + - containerPort: {{ csi_volume_exporter.metrics_port }} + resources: + requests: + cpu: {{ csi_volume_exporter.resources.requests.cpu }} + memory: {{ csi_volume_exporter.resources.requests.memory }} + limits: + cpu: {{ csi_volume_exporter.resources.limits.cpu }} + memory: {{ csi_volume_exporter.resources.limits.memory }} + volumeMounts: + - name: app-dir + mountPath: /usr/local/lib/python3.11/site-packages-shared + volumes: + - name: python-packages + hostPath: + path: {{ hostvars['localhost']['k8s_client_mount_path'] }}/telemetry/python-packages + type: DirectoryOrCreate + - name: app-dir + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ csi_volume_exporter.service_name }} + namespace: {{ telemetry_namespace }} + labels: + app: {{ csi_volume_exporter.app_name }} +spec: + selector: + app: {{ csi_volume_exporter.app_name }} + ports: + - name: "{{ csi_volume_exporter.metrics_port }}" + port: {{ csi_volume_exporter.metrics_port }} + targetPort: {{ 
csi_volume_exporter.metrics_port }} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 index 6abf26b38b..63bdeca910 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 @@ -85,60 +85,23 @@ spec: --- {% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} -# VMServiceScrape for CSI PowerScale Controller Health Monitor +# VMServiceScrape for CSI Volume Exporter - PowerScale PV status metrics apiVersion: operator.victoriametrics.com/v1beta1 kind: VMServiceScrape metadata: - name: csi-powerscale-health-monitor + name: {{ csi_volume_exporter.app_name }} namespace: {{ telemetry_namespace }} spec: - # Target CSI PowerScale controller service selector: matchLabels: - app: isilon - - # Service metrics endpoints + app: {{ csi_volume_exporter.app_name }} endpoints: - - port: health-monitor - interval: {{ vmagent.global.scrape_interval }} + - port: "{{ csi_volume_exporter.metrics_port }}" path: /metrics - - # Relabel configs - relabelConfigs: - - targetLabel: source - replacement: powerscale - - targetLabel: component - replacement: csi-health-monitor -{% endif %} - ---- -{% if hostvars['localhost']['powerscale_volume_health_enabled'] | default(false) %} -# VMNodeScrape for Kubelet Volume Health Monitoring metrics -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMNodeScrape -metadata: - name: kubelet-volume-health - namespace: {{ telemetry_namespace }} -spec: - # Scrape interval - interval: {{ vmagent.global.scrape_interval }} - - # Kubelet HTTPS endpoint - scheme: https - - # TLS configuration with proper CA verification - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - # Bearer token for authentication - bearerTokenFile: 
/var/run/secrets/kubernetes.io/serviceaccount/token - - # Relabel configs + interval: {{ csi_volume_exporter.scrape_interval }} relabelConfigs: - - sourceLabels: [__meta_kubernetes_node_name] - targetLabel: node - targetLabel: source replacement: powerscale - targetLabel: component - replacement: volume-health + replacement: volume-status {% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 index e84877af56..4b0dc485ae 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-vmagent-rbac.yaml.j2 @@ -45,3 +45,35 @@ roleRef: kind: Role name: "{{ vmagent.role_name }}" apiGroup: rbac.authorization.k8s.io + +--- +# ClusterRole for vmagent service discovery across namespaces +# Required by VictoriaMetrics operator for VMServiceScrape/VMPodScrape/VMNodeScrape +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: "{{ vmagent.service_account_name }}" +rules: + - apiGroups: [""] + resources: ["nodes", "nodes/proxy", "services", "endpoints", "pods"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: "{{ vmagent.service_account_name }}" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: "{{ vmagent.service_account_name }}" +subjects: + - kind: ServiceAccount + name: "{{ vmagent.service_account_name }}" + namespace: "{{ telemetry_namespace }}" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 index 
4057aee91f..e6f5268125 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 @@ -69,12 +69,12 @@ spec: # ======================================== # Forward logs to VictoriaLogs vlinsert endpoint # Supports JSON Lines format with optional TLS - # Using FQDN (fully qualified domain name) to ensure DNS resolution + # Use FQDN without .cluster.local for DNS resolution remoteWrite: {% if victoria_logs_cluster.tls_enabled %} - - url: https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert + - url: "https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert?version=v1" {% else %} - - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert + - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert {% endif %} {% for endpoint in telemetry_config.telemetry_sinks.victoria_logs.additional_log_write_endpoints | default([]) %} - url: {{ endpoint.url }} From 55e500e4cc44fe6f053916e9531ba6bafec69fc6 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 9 May 2026 21:51:32 +0530 Subject: [PATCH 40/63] revert vlinsert url --- input/config/x86_64/rhel/10.0/service_k8s.json | 4 +--- .../telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index 7300c7ebf7..92a06a11d6 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -31,7 +31,7 @@ { "package": "omsdk==1.2.518", "type": "pip_module" }, { "package": "cffi==1.17.1", "type": "pip_module" }, { "package": "prometheus_client==0.20.0", "type": 
"pip_module" }, - { "package": "kubernetes==29.0.0", "type": "pip_module" }, + { "package": "kubernetes==33.1.0", "type": "pip_module" }, { "package": "quay.io/strimzi/operator", "tag": "0.48.0", "type": "image" }, { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, @@ -78,7 +78,6 @@ { "package": "prettytable==3.14.0", "type": "pip_module" }, { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, { "package": "git", "type": "rpm", "repo_name": "appstream"}, - { "package": "kubernetes==33.1.0", "type": "pip_module" }, { "package": "PyMySQL==1.1.2", "type": "pip_module" } ] }, @@ -105,7 +104,6 @@ { "package": "prettytable==3.14.0", "type": "pip_module" }, { "package": "python3-3.12.9", "type": "rpm", "repo_name": "baseos" }, { "package": "git", "type": "rpm", "repo_name": "appstream"}, - { "package": "kubernetes==33.1.0", "type": "pip_module" }, { "package": "PyMySQL==1.1.2", "type": "pip_module" } ] }, diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 index e6f5268125..524fe05287 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 @@ -69,10 +69,10 @@ spec: # ======================================== # Forward logs to VictoriaLogs vlinsert endpoint # Supports JSON Lines format with optional TLS - # Use FQDN without .cluster.local for DNS resolution + # Using short service name (same namespace) to avoid DNS resolution issues remoteWrite: {% if victoria_logs_cluster.tls_enabled %} - - url: "https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert?version=v1" + - url: 
https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert {% else %} - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert {% endif %} From 63f3b239c17122fc7fb27c1e88ee8aaee4e9de3e Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 9 May 2026 22:09:44 +0530 Subject: [PATCH 41/63] template rendering for CSI volume exporter --- provision/roles/telemetry/vars/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 509bc95e54..fe0ec74128 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -326,6 +326,8 @@ common_mode: "0755" victoria_templates_common: - src: 'telemetry/victoria/victoria-vmagent-rbac.yaml.j2' dest: 'victoria-vmagent-rbac.yaml' + - src: 'telemetry/victoria/csi-volume-exporter.yaml.j2' + dest: 'csi-volume-exporter.yaml' # Operator-based templates (new default) # Single-node operator template (used when victoria_cluster.enabled: false) From 0c64fd73506da7f082a12e1674a17ed68dae6e9e Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 9 May 2026 22:25:08 +0530 Subject: [PATCH 42/63] update condition for csi volume exporter --- .../tasks/generate_telemetry_deployments.yml | 17 +++++++++++- .../templates/telemetry/kustomization.yaml.j2 | 2 +- .../victoria-operator-vmscrape.yaml.j2 | 2 +- provision/roles/telemetry/vars/main.yml | 26 +++++++++++++++++-- 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml index 0b262ff6a6..1d5376bd97 100644 --- a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml @@ -142,7 +142,12
@@ - name: Extract Python package names from service_k8s.json (pip_module type) ansible.builtin.set_fact: - csi_pip_modules: "{{ k8s_packages_json['service_k8s']['cluster'] | selectattr('type', 'equalto', 'pip_module') | selectattr('package', 'search', 'prometheus_client|kubernetes') | map(attribute='package') | list }}" + csi_pip_modules: >- + {{ k8s_packages_json['service_k8s']['cluster'] + | selectattr('type', 'equalto', 'pip_module') + | selectattr('package', 'search', 'prometheus_client|kubernetes') + | map(attribute='package') + | list }} - name: Download Python pip_modules from Pulp ansible.builtin.get_url: @@ -169,6 +174,16 @@ when: telemetry_config.telemetry_sources.idrac.metrics_enabled | default(false) | bool tags: telemetry_deployment +- name: Populate CSI volume exporter deployment + ansible.builtin.template: + src: 'telemetry/victoria/csi-volume-exporter.yaml.j2' + dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/csi-volume-exporter.yaml" + mode: "{{ hostvars['localhost']['file_permissions_644'] }}" + when: + - powerscale_volume_health_enabled | default(false) | bool + - telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool + tags: telemetry_deployment + - name: Deploy telemetry cleanup script ansible.builtin.template: src: 'telemetry/cleanup_telemetry.sh.j2' diff --git a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 index ef8411f789..faf97140b5 100644 --- a/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/kustomization.yaml.j2 @@ -19,7 +19,7 @@ resources: - victoria-operator-vmagent.yaml # VMScrape CR (native operator-based service/pod discovery for metrics) - victoria-operator-vmscrape.yaml -{% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} +{% if powerscale_volume_health_enabled | 
default(false) and telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} # CSI Volume Exporter - PowerScale PV status metrics - csi-volume-exporter.yaml {% endif %} diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 index 63bdeca910..74ed4a80a3 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victoria-operator-vmscrape.yaml.j2 @@ -84,7 +84,7 @@ spec: {% endif %} --- -{% if telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} +{% if powerscale_volume_health_enabled | default(false) and telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) %} # VMServiceScrape for CSI Volume Exporter - PowerScale PV status metrics apiVersion: operator.victoriametrics.com/v1beta1 kind: VMServiceScrape diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index fe0ec74128..9846b1c3e4 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -278,6 +278,30 @@ powerscale_victoria_logs_validation_fail_msg: >- PowerScale log collection requires 'victoria_logs' in telemetry_sources.powerscale.collection_targets. 
+# ============================================================================ +# CSI Volume Exporter Configuration +# ============================================================================ +# Usage: csi-volume-exporter.yaml.j2, victoria-operator-vmscrape.yaml.j2 +# Deployed when telemetry_sources.powerscale.metrics_enabled: true +# Exposes PowerScale PersistentVolume status as Prometheus metrics +csi_volume_exporter: + app_name: "csi-volume-exporter" + service_name: "csi-volume-exporter" + service_account_name: "csi-volume-exporter" + cluster_role_name: "csi-volume-exporter" + cluster_role_binding_name: "csi-volume-exporter" + image: "{{ telemetry_images['library/python'] | default('docker.io/library/python:3.11-slim') }}" + metrics_port: 8080 + scrape_interval: "30s" + scrape_interval_seconds: 30 + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "256Mi" + # Usage: victoria-operator-vmagent.yaml.j2 (operator-native) vmagent: global: @@ -326,8 +350,6 @@ common_mode: "0755" victoria_templates_common: - src: 'telemetry/victoria/victoria-vmagent-rbac.yaml.j2' dest: 'victoria-vmagent-rbac.yaml' - - src: 'telemetry/victoria/csi-volume-exporter.yaml.j2' - dest: 'csi-volume-exporter.yaml' # Operator-based templates (new default) # Single-node operator template (used when victoria_cluster.enabled: false) From 881524e2d5e211cb48cde827b79a0e0b9836e389 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 9 May 2026 22:37:13 +0530 Subject: [PATCH 43/63] UT issue fixes --- .../tasks/generate_telemetry_deployments.yml | 28 ----------------- .../victoria/csi-volume-exporter.yaml.j2 | 30 ++++--------------- 2 files changed, 5 insertions(+), 53 deletions(-) diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml index 1d5376bd97..ed8a5cdf60 100644 --- 
a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml @@ -129,34 +129,6 @@ dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/{{ victoria_operator_pkg }}.tar.gz" mode: "{{ hostvars['localhost']['file_permissions_644'] }}" -- name: CSI Volume Exporter Python packages configuration - when: - - victoria_metrics_support | default(false) | bool - - telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool - block: - - name: Create python-packages directory - ansible.builtin.file: - path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/python-packages" - state: directory - mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" - - - name: Extract Python package names from service_k8s.json (pip_module type) - ansible.builtin.set_fact: - csi_pip_modules: >- - {{ k8s_packages_json['service_k8s']['cluster'] - | selectattr('type', 'equalto', 'pip_module') - | selectattr('package', 'search', 'prometheus_client|kubernetes') - | map(attribute='package') - | list }} - - - name: Download Python pip_modules from Pulp - ansible.builtin.get_url: - url: "{{ offline_tarball_path }}/{{ item }}/{{ item }}" - dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/python-packages/{{ item }}" - mode: "{{ hostvars['localhost']['file_permissions_644'] }}" - loop: "{{ csi_pip_modules }}" - when: csi_pip_modules | length > 0 - - name: Populate common telemetry deployment configs ansible.builtin.template: src: "{{ item.src }}" diff --git a/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 index 71742dc55e..b4ba525407 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 @@ -78,27 
+78,17 @@ spec: component: volume-status spec: serviceAccountName: {{ csi_volume_exporter.service_account_name }} - initContainers: - - name: install-deps - image: {{ csi_volume_exporter.image }} - command: ["/bin/sh", "-c"] - args: - - | - pip install --no-index --find-links=/packages \ - prometheus_client kubernetes || \ - pip install prometheus_client kubernetes - volumeMounts: - - name: python-packages - mountPath: /packages - - name: app-dir - mountPath: /usr/local/lib/python3.11/site-packages-shared containers: - name: exporter image: {{ csi_volume_exporter.image }} command: ["/bin/sh", "-c"] args: - | - cp -r /usr/local/lib/python3.11/site-packages-shared/* /usr/local/lib/python3.11/site-packages/ 2>/dev/null || true + pip3 install prometheus_client==0.20.0 kubernetes==33.1.0 \ + --find-links="{{ offline_pip_module_path }}/prometheus_client==0.20.0/" \ + --trusted-host "{{ pulp_server_ip }}" \ + --no-index || \ + pip3 install prometheus_client kubernetes mkdir -p /app cat > /app/exporter.py <<'PYEOF' from prometheus_client import start_http_server, Gauge, Counter, Info @@ -351,16 +341,6 @@ spec: limits: cpu: {{ csi_volume_exporter.resources.limits.cpu }} memory: {{ csi_volume_exporter.resources.limits.memory }} - volumeMounts: - - name: app-dir - mountPath: /usr/local/lib/python3.11/site-packages-shared - volumes: - - name: python-packages - hostPath: - path: {{ hostvars['localhost']['k8s_client_mount_path'] }}/telemetry/python-packages - type: DirectoryOrCreate - - name: app-dir - emptyDir: {} --- apiVersion: v1 kind: Service From a532ec54df4cd8776d2f10247a06c6c6756161d1 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sat, 9 May 2026 23:55:30 +0530 Subject: [PATCH 44/63] variable declaration in telemetry --- .../tasks/validate_telemetry_config.yml | 4 ++- .../tasks/generate_telemetry_deployments.yml | 33 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git 
a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml index 2466ee6ea1..9deada8961 100644 --- a/provision/roles/provision_validations/tasks/validate_telemetry_config.yml +++ b/provision/roles/provision_validations/tasks/validate_telemetry_config.yml @@ -84,7 +84,9 @@ - name: Set Volume Health Monitoring status from CSI driver values ansible.builtin.set_fact: - powerscale_volume_health_enabled: "{{ csi_powerscale_values.node.healthMonitor.enabled | default(false) | bool }}" + powerscale_volume_health_enabled: >- + {{ (csi_powerscale_values.node.healthMonitor.enabled | default(false) | bool) + or (csi_powerscale_values.controller.healthMonitor.enabled | default(false) | bool) }} cacheable: true when: - csi_powerscale_values is defined diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml index ed8a5cdf60..e57eb52d30 100644 --- a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml @@ -20,6 +20,39 @@ mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" tags: telemetry_deployment +- name: Read CSI PowerScale driver values.yaml to detect Volume Health Monitoring + when: hostvars['localhost']['csi_driver_powerscale_support'] | default(false) | bool + block: + - name: Check if CSI PowerScale values file exists + ansible.builtin.stat: + path: "{{ service_cluster_info.csi_powerscale_driver_values_file_path }}" + register: csi_values_file_stat + when: service_cluster_info.csi_powerscale_driver_values_file_path is defined + + - name: Load CSI PowerScale values.yaml + ansible.builtin.include_vars: + file: "{{ service_cluster_info.csi_powerscale_driver_values_file_path }}" + name: csi_powerscale_values + when: + - service_cluster_info.csi_powerscale_driver_values_file_path is defined + - 
csi_values_file_stat.stat.exists | default(false) + + - name: Set Volume Health Monitoring status from CSI driver values + ansible.builtin.set_fact: + powerscale_volume_health_enabled: >- + {{ (csi_powerscale_values.node.healthMonitor.enabled | default(false) | bool) + or (csi_powerscale_values.controller.healthMonitor.enabled | default(false) | bool) }} + cacheable: true + when: + - csi_powerscale_values is defined + + - name: Set Volume Health Monitoring to false if not configured + ansible.builtin.set_fact: + powerscale_volume_health_enabled: false + cacheable: true + when: powerscale_volume_health_enabled is not defined + tags: telemetry_deployment + - name: Create test directory for TLS test jobs ansible.builtin.file: path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/deployments/test" From 1f40d673a28c3ef3096361182b8fed6ca8659578 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sun, 10 May 2026 00:57:48 +0530 Subject: [PATCH 45/63] correct setting of volume monitoring enabled --- .../tasks/generate_telemetry_deployments.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml index e57eb52d30..9f7a63ca9c 100644 --- a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml @@ -21,21 +21,22 @@ tags: telemetry_deployment - name: Read CSI PowerScale driver values.yaml to detect Volume Health Monitoring - when: hostvars['localhost']['csi_driver_powerscale_support'] | default(false) | bool block: - name: Check if CSI PowerScale values file exists ansible.builtin.stat: - path: "{{ service_cluster_info.csi_powerscale_driver_values_file_path }}" + path: "{{ csi_powerscale_driver_values_file_path }}" register: csi_values_file_stat - when: 
service_cluster_info.csi_powerscale_driver_values_file_path is defined + when: csi_powerscale_driver_values_file_path is defined + delegate_to: localhost - name: Load CSI PowerScale values.yaml ansible.builtin.include_vars: - file: "{{ service_cluster_info.csi_powerscale_driver_values_file_path }}" + file: "{{ csi_powerscale_driver_values_file_path }}" name: csi_powerscale_values when: - - service_cluster_info.csi_powerscale_driver_values_file_path is defined + - csi_powerscale_driver_values_file_path is defined - csi_values_file_stat.stat.exists | default(false) + delegate_to: localhost - name: Set Volume Health Monitoring status from CSI driver values ansible.builtin.set_fact: From 719277bd4fff1164c6d1f870f833885dfeb5ecab Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sun, 10 May 2026 01:41:16 +0530 Subject: [PATCH 46/63] DNS reolution issue for syslog --- .../victoria/victorialogs-operator-vlagent.yaml.j2 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 index 524fe05287..814f462067 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/victorialogs-operator-vlagent.yaml.j2 @@ -69,12 +69,12 @@ spec: # ======================================== # Forward logs to VictoriaLogs vlinsert endpoint # Supports JSON Lines format with optional TLS - # Using short service name (same namespace) to avoid DNS resolution issues + # Using full FQDN for DNS resolution remoteWrite: {% if victoria_logs_cluster.tls_enabled %} - - url: https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert + - url: https://vlinsert-victoria-logs-cluster.{{ telemetry_namespace 
}}.svc.cluster.local:9481/internal/insert {% else %} - - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc:9481/internal/insert + - url: http://vlinsert-victoria-logs-cluster.{{ telemetry_namespace }}.svc.cluster.local:9481/internal/insert {% endif %} {% for endpoint in telemetry_config.telemetry_sinks.victoria_logs.additional_log_write_endpoints | default([]) %} - url: {{ endpoint.url }} From 39d969c380d8ed64f3ad3090d8eb6411e04a4cb5 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sun, 10 May 2026 06:56:02 +0530 Subject: [PATCH 47/63] collect volume stat metrics --- .../victoria/csi-volume-exporter.yaml.j2 | 90 +++++++++++++++++++ provision/roles/telemetry/vars/main.yml | 2 +- 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 index b4ba525407..b80ba8e274 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 @@ -39,6 +39,9 @@ rules: - apiGroups: [""] resources: ["nodes"] verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["nodes/proxy"] + verbs: ["get", "list"] - apiGroups: ["storage.k8s.io"] resources: ["storageclasses", "csinodes"] verbs: ["get", "list", "watch"] @@ -95,6 +98,7 @@ spec: from kubernetes import client, config import time import datetime + import re PROVISIONER = 'csi-isilon.dellemc.com' @@ -131,6 +135,23 @@ spec: 'Total PowerScale PVCs by phase', ['phase']) + # ── Volume usage metrics (from kubelet) ── + volume_used_bytes = Gauge('powerscale_volume_used_bytes', + 'Actual bytes used on the volume mount point', + ['pv_name', 'node']) + volume_available_bytes = Gauge('powerscale_volume_available_bytes', + 'Actual bytes available on the volume mount point', + 
['pv_name', 'node']) + volume_inodes = Gauge('powerscale_volume_inodes', + 'Total inodes available on the volume', + ['pv_name', 'node']) + volume_inodes_used = Gauge('powerscale_volume_inodes_used', + 'Inodes currently used on the volume', + ['pv_name', 'node']) + volume_inodes_free = Gauge('powerscale_volume_inodes_free', + 'Remaining inodes on the volume', + ['pv_name', 'node']) + # ── Health event metrics (from CSI external-health-monitor-controller) ── volume_condition_abnormal = Gauge( 'powerscale_volume_health_abnormal', @@ -313,6 +334,74 @@ spec: sc.allow_volume_expansion or False).lower() ).set(1) + def collect_volume_stats(pv_to_pvc): + # Query kubelet volume stats from each node + nodes = v1.list_node() + pv_name_to_handle = {} + # Build mapping of PV name to volume handle + pvs = v1.list_persistent_volume() + for pv in pvs.items: + ann = pv.metadata.annotations or {} + prov = ann.get('pv.kubernetes.io/provisioned-by', '') + if PROVISIONER not in prov: + continue + vol_handle = '' + if pv.spec.csi and pv.spec.csi.volume_handle: + vol_handle = pv.spec.csi.volume_handle + pv_name_to_handle[pv.metadata.name] = vol_handle + + for node in nodes.items: + node_name = node.metadata.name + try: + # Query kubelet volume stats via node proxy API + path = f"/api/v1/nodes/{node_name}/proxy/metrics/volume_stats" + response = v1.connect_get_namespaced_pod_proxy_with_path( + namespace="", name="", path=path, _preload_content=False) + if response.status != 200: + print(f"Failed to get volume stats from node {node_name}: {response.status}") + continue + metrics_text = response.data.decode('utf-8') + + # Parse Prometheus metrics format + # Expected format: kubelet_volume_stats_*{volume_name="..."} value + stats = {} + for line in metrics_text.split('\n'): + line = line.strip() + if not line or line.startswith('#'): + continue + # Parse: metric_name{labels} value + match = re.match(r'(\w+)\{(.+)\}\s+(\d+)', line) + if match: + metric_name = match.group(1) + labels_str = 
match.group(2) + value = int(match.group(3)) + # Extract volume_name from labels + vol_name_match = re.search(r'volume_name="([^"]+)"', labels_str) + if vol_name_match: + vol_name = vol_name_match.group(1) + if vol_name not in stats: + stats[vol_name] = {} + stats[vol_name][metric_name] = value + + # Match volume stats to PVs and update metrics + for pv_name, vol_handle in pv_name_to_handle.items(): + # Try to match by volume handle (which is the volume name in stats) + if vol_handle in stats: + s = stats[vol_handle] + used = s.get('kubelet_volume_stats_used_bytes', 0) + avail = s.get('kubelet_volume_stats_available_bytes', 0) + inodes = s.get('kubelet_volume_stats_inodes', 0) + inodes_used = s.get('kubelet_volume_stats_inodes_used', 0) + inodes_free = s.get('kubelet_volume_stats_inodes_free', 0) + + volume_used_bytes.labels(pv_name=pv_name, node=node_name).set(used) + volume_available_bytes.labels(pv_name=pv_name, node=node_name).set(avail) + volume_inodes.labels(pv_name=pv_name, node=node_name).set(inodes) + volume_inodes_used.labels(pv_name=pv_name, node=node_name).set(inodes_used) + volume_inodes_free.labels(pv_name=pv_name, node=node_name).set(inodes_free) + except Exception as e: + print(f"Error collecting volume stats from node {node_name}: {e}") + def collect_all(): try: pv_to_pvc = collect_pv_metrics() @@ -320,6 +409,7 @@ spec: collect_health_events(pv_to_pvc) collect_node_metrics() collect_storageclass_metrics() + collect_volume_stats(pv_to_pvc) except Exception as e: print(f"Error in collection cycle: {e}") diff --git a/provision/roles/telemetry/vars/main.yml b/provision/roles/telemetry/vars/main.yml index 9846b1c3e4..53202e31ae 100644 --- a/provision/roles/telemetry/vars/main.yml +++ b/provision/roles/telemetry/vars/main.yml @@ -291,7 +291,7 @@ csi_volume_exporter: cluster_role_name: "csi-volume-exporter" cluster_role_binding_name: "csi-volume-exporter" image: "{{ telemetry_images['library/python'] | default('docker.io/library/python:3.11-slim') }}" - 
metrics_port: 8080 + metrics_port: 9090 scrape_interval: "30s" scrape_interval_seconds: 30 resources: From 3ae01b1d9906b9eeb61451b31d4e1ba37ff36628 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sun, 10 May 2026 07:03:21 +0530 Subject: [PATCH 48/63] ansible lint fixes --- .../roles/telemetry/tasks/generate_telemetry_deployments.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml index 9f7a63ca9c..ae1108b2a1 100644 --- a/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml +++ b/provision/roles/telemetry/tasks/generate_telemetry_deployments.yml @@ -21,6 +21,7 @@ tags: telemetry_deployment - name: Read CSI PowerScale driver values.yaml to detect Volume Health Monitoring + tags: telemetry_deployment block: - name: Check if CSI PowerScale values file exists ansible.builtin.stat: @@ -52,7 +53,6 @@ powerscale_volume_health_enabled: false cacheable: true when: powerscale_volume_health_enabled is not defined - tags: telemetry_deployment - name: Create test directory for TLS test jobs ansible.builtin.file: From 50d68580f32ffb52a5353733fa0d1599c25985c0 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Sun, 10 May 2026 21:18:55 +0530 Subject: [PATCH 49/63] update csi volume exporter --- .../victoria/csi-volume-exporter.yaml.j2 | 87 ------------------- 1 file changed, 87 deletions(-) diff --git a/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 index b80ba8e274..2bb7220e09 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 @@ -39,9 +39,6 @@ rules: - apiGroups: 
[""] resources: ["nodes"] verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: ["nodes/proxy"] - verbs: ["get", "list"] - apiGroups: ["storage.k8s.io"] resources: ["storageclasses", "csinodes"] verbs: ["get", "list", "watch"] @@ -135,22 +132,6 @@ spec: 'Total PowerScale PVCs by phase', ['phase']) - # ── Volume usage metrics (from kubelet) ── - volume_used_bytes = Gauge('powerscale_volume_used_bytes', - 'Actual bytes used on the volume mount point', - ['pv_name', 'node']) - volume_available_bytes = Gauge('powerscale_volume_available_bytes', - 'Actual bytes available on the volume mount point', - ['pv_name', 'node']) - volume_inodes = Gauge('powerscale_volume_inodes', - 'Total inodes available on the volume', - ['pv_name', 'node']) - volume_inodes_used = Gauge('powerscale_volume_inodes_used', - 'Inodes currently used on the volume', - ['pv_name', 'node']) - volume_inodes_free = Gauge('powerscale_volume_inodes_free', - 'Remaining inodes on the volume', - ['pv_name', 'node']) # ── Health event metrics (from CSI external-health-monitor-controller) ── volume_condition_abnormal = Gauge( @@ -334,73 +315,6 @@ spec: sc.allow_volume_expansion or False).lower() ).set(1) - def collect_volume_stats(pv_to_pvc): - # Query kubelet volume stats from each node - nodes = v1.list_node() - pv_name_to_handle = {} - # Build mapping of PV name to volume handle - pvs = v1.list_persistent_volume() - for pv in pvs.items: - ann = pv.metadata.annotations or {} - prov = ann.get('pv.kubernetes.io/provisioned-by', '') - if PROVISIONER not in prov: - continue - vol_handle = '' - if pv.spec.csi and pv.spec.csi.volume_handle: - vol_handle = pv.spec.csi.volume_handle - pv_name_to_handle[pv.metadata.name] = vol_handle - - for node in nodes.items: - node_name = node.metadata.name - try: - # Query kubelet volume stats via node proxy API - path = f"/api/v1/nodes/{node_name}/proxy/metrics/volume_stats" - response = v1.connect_get_namespaced_pod_proxy_with_path( - namespace="", name="", 
path=path, _preload_content=False) - if response.status != 200: - print(f"Failed to get volume stats from node {node_name}: {response.status}") - continue - metrics_text = response.data.decode('utf-8') - - # Parse Prometheus metrics format - # Expected format: kubelet_volume_stats_*{volume_name="..."} value - stats = {} - for line in metrics_text.split('\n'): - line = line.strip() - if not line or line.startswith('#'): - continue - # Parse: metric_name{labels} value - match = re.match(r'(\w+)\{(.+)\}\s+(\d+)', line) - if match: - metric_name = match.group(1) - labels_str = match.group(2) - value = int(match.group(3)) - # Extract volume_name from labels - vol_name_match = re.search(r'volume_name="([^"]+)"', labels_str) - if vol_name_match: - vol_name = vol_name_match.group(1) - if vol_name not in stats: - stats[vol_name] = {} - stats[vol_name][metric_name] = value - - # Match volume stats to PVs and update metrics - for pv_name, vol_handle in pv_name_to_handle.items(): - # Try to match by volume handle (which is the volume name in stats) - if vol_handle in stats: - s = stats[vol_handle] - used = s.get('kubelet_volume_stats_used_bytes', 0) - avail = s.get('kubelet_volume_stats_available_bytes', 0) - inodes = s.get('kubelet_volume_stats_inodes', 0) - inodes_used = s.get('kubelet_volume_stats_inodes_used', 0) - inodes_free = s.get('kubelet_volume_stats_inodes_free', 0) - - volume_used_bytes.labels(pv_name=pv_name, node=node_name).set(used) - volume_available_bytes.labels(pv_name=pv_name, node=node_name).set(avail) - volume_inodes.labels(pv_name=pv_name, node=node_name).set(inodes) - volume_inodes_used.labels(pv_name=pv_name, node=node_name).set(inodes_used) - volume_inodes_free.labels(pv_name=pv_name, node=node_name).set(inodes_free) - except Exception as e: - print(f"Error collecting volume stats from node {node_name}: {e}") def collect_all(): try: @@ -409,7 +323,6 @@ spec: collect_health_events(pv_to_pvc) collect_node_metrics() collect_storageclass_metrics() - 
collect_volume_stats(pv_to_pvc) except Exception as e: print(f"Error in collection cycle: {e}") From b9531704e3dafa1a23d0d1404dbdac61e4373287 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Mon, 11 May 2026 23:28:57 +0530 Subject: [PATCH 50/63] input validation of CSI secret file presence when syslog is enabled --- .../common_utils/en_us_validation_msg.py | 5 ++++ .../powerscale_telemetry_validation.py | 23 ++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 1f1685d81a..fd18fc9c39 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -324,6 +324,11 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length): "csi_driver_powerscale is not configured in software_config.json. " "PowerScale telemetry requires the CSI driver for PowerScale to be configured." ) +POWERSCALE_CSI_SECRET_FILE_MISSING_MSG = ( + "CSI PowerScale secret.yaml file not found at {secret_path}. " + "PowerScale logs deployment will proceed with manual configuration instructions. " + "For automatic PowerScale syslog configuration, ensure CSI driver is deployed and secret.yaml exists." +) POWERSCALE_SERVICE_CLUSTER_MISSING_MSG = ( "service cluster is not defined in functional_groups_config.yml. " "PowerScale telemetry requires a service cluster." 
diff --git a/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py b/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py index 9b0ef57217..e6d35ea770 100644 --- a/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py @@ -342,6 +342,23 @@ def validate_powerscale_telemetry_config( en_us_validation_msg.POWERSCALE_VICTORIA_LOGS_REQUIRED_MSG )) - # NOTE: additional_remote_write_endpoints and additional_log_write_endpoints - # have been moved to telemetry_sinks.victoria_metrics and - # telemetry_sinks.victoria_logs. Validation is now handled in telemetry_validation.py + # Check CSI PowerScale secret.yaml file presence (optional/warning) + # Secret is used for automatic PowerScale syslog configuration via SSH + # Deployment handles missing secret gracefully with manual instructions + k8s_client_share_path = config_paths.get("k8s_client_share_path", "") + if k8s_client_share_path: + secret_file_path = os.path.join( + k8s_client_share_path, + "csi-driver-powerscale", + "secret.yaml" + ) + if not os.path.exists(secret_file_path): + logger.warning( + en_us_validation_msg.POWERSCALE_CSI_SECRET_FILE_MISSING_MSG.format( + secret_path=secret_file_path + ) + ) + + # Note: CSI driver check removed for logs + # Logs use syslog (no credentials) and deployment handles missing CSI secret gracefully + # by falling back to manual configuration instructions From fba93afa8a46553f9be89804dd8ee884e040f7db Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 13 May 2026 07:29:43 +0530 Subject: [PATCH 51/63] remove unwanted task --- .../powerscale_telemetry_validation.py | 21 ------- .../tasks/deploy_powerscale_logs.yml | 60 ------------------- provision/roles/telemetry/tasks/main.yml | 5 -- 3 files changed, 86 
deletions(-) delete mode 100644 provision/roles/telemetry/tasks/deploy_powerscale_logs.yml diff --git a/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py b/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py index e6d35ea770..985a6de72d 100644 --- a/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/powerscale_telemetry_validation.py @@ -341,24 +341,3 @@ def validate_powerscale_telemetry_config( powerscale_collection_targets, en_us_validation_msg.POWERSCALE_VICTORIA_LOGS_REQUIRED_MSG )) - - # Check CSI PowerScale secret.yaml file presence (optional/warning) - # Secret is used for automatic PowerScale syslog configuration via SSH - # Deployment handles missing secret gracefully with manual instructions - k8s_client_share_path = config_paths.get("k8s_client_share_path", "") - if k8s_client_share_path: - secret_file_path = os.path.join( - k8s_client_share_path, - "csi-driver-powerscale", - "secret.yaml" - ) - if not os.path.exists(secret_file_path): - logger.warning( - en_us_validation_msg.POWERSCALE_CSI_SECRET_FILE_MISSING_MSG.format( - secret_path=secret_file_path - ) - ) - - # Note: CSI driver check removed for logs - # Logs use syslog (no credentials) and deployment handles missing CSI secret gracefully - # by falling back to manual configuration instructions diff --git a/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml b/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml deleted file mode 100644 index 892d13b18f..0000000000 --- a/provision/roles/telemetry/tasks/deploy_powerscale_logs.yml +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# Configure PowerScale syslog collection via direct VLAgent LoadBalancer -# Gated by: telemetry_sources.powerscale.logs_enabled -# -# DATA PIPELINE: -# PowerScale (UDP/TCP:514) → VLAgent LoadBalancer IP:514 → VictoriaLogs -# -# What Omnia does: -# - Deploys VLAgent with LoadBalancer service (MetalLB assigns external IP) -# - VLAgent listens on UDP:514 and TCP:514 for syslog messages -# - Automatically configures PowerScale syslog forwarding via SSH (if CSI credentials available) -# - Falls back to manual configuration instructions if credentials unavailable -# -# What Omnia does NOT do: -# - Omnia does NOT configure PowerScale directly if CSI credentials are unavailable -# - User must manually configure PowerScale to send syslog to VLAgent LoadBalancer IP - -- name: Configure PowerScale syslog collection - when: powerscale_log_enabled | default(false) | bool - block: - - name: Validate VictoriaLogs is in collection_targets - ansible.builtin.assert: - that: - - "'victoria_logs' in telemetry_config.telemetry_sources.powerscale.collection_targets | default([])" - fail_msg: "{{ powerscale_victoria_logs_validation_fail_msg }}" - - - name: Read CSI PowerScale secret for cluster information - ansible.builtin.slurp: - src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/csi-driver-powerscale/secret.yaml" - register: csi_powerscale_secret_content - ignore_errors: true - - - name: Parse CSI PowerScale secret - ansible.builtin.set_fact: - csi_powerscale_secret: "{{ csi_powerscale_secret_content.content | b64decode | from_yaml }}" - when: 
csi_powerscale_secret_content.skipped is not defined - - - name: Extract PowerScale clusters - ansible.builtin.set_fact: - ps_clusters: "{{ csi_powerscale_secret.isilonClusters | default([]) }}" - when: csi_powerscale_secret.skipped is not defined - - - name: Set empty ps_clusters if secret not available - ansible.builtin.set_fact: - ps_clusters: [] - when: csi_powerscale_secret.skipped is defined diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index bc7b25e1b2..1693a8f1cd 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -55,11 +55,6 @@ - telemetry_config.telemetry_sources.powerscale.metrics_enabled | default(false) | bool ansible.builtin.include_tasks: deploy_powerscale_metrics.yml -- name: Deploy PowerScale syslog log collection - when: - - telemetry_config.telemetry_sources.powerscale.logs_enabled | default(false) | bool - ansible.builtin.include_tasks: deploy_powerscale_logs.yml - - name: Generate telemetry deployments (idrac/ldms/powerscale/vector) when: - >- From 59a84d5b6b80412a1a93ba9911cbefff21826b01 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 13 May 2026 07:32:56 +0530 Subject: [PATCH 52/63] remove unused message --- .../input_validation/common_utils/en_us_validation_msg.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 199375ba0e..6cea78848b 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -324,11 +324,6 @@ def switch_snmp3_username_fail_msg(min_username_length, max_length): "csi_driver_powerscale is not configured in software_config.json. 
" "PowerScale telemetry requires the CSI driver for PowerScale to be configured." ) -POWERSCALE_CSI_SECRET_FILE_MISSING_MSG = ( - "CSI PowerScale secret.yaml file not found at {secret_path}. " - "PowerScale logs deployment will proceed with manual configuration instructions. " - "For automatic PowerScale syslog configuration, ensure CSI driver is deployed and secret.yaml exists." -) POWERSCALE_SERVICE_CLUSTER_MISSING_MSG = ( "service cluster is not defined in functional_groups_config.yml. " "PowerScale telemetry requires a service cluster." From 9b551f5c9a31fe0a0fe27be9dec0e2d2d879b21f Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 13 May 2026 17:36:26 +0530 Subject: [PATCH 53/63] disable and enable telemetry --- .../check_kube_vip_reachability.yml | 0 common/tasks/telemetry/load_ha_config.yml | 25 ++++++ provision/roles/telemetry/tasks/main.yml | 2 +- .../tasks/disable_powerscale_metrics.yml | 89 +++++++++++++++++++ .../tasks/enable_powerscale_metrics.yml | 82 +++++++++++++++++ .../roles/telemetry_management/tasks/main.yml | 47 ++++++++++ .../roles/telemetry_management/vars/main.yml | 28 ++++++ telemetry/telemetry_disable.yml | 46 ++++++++++ telemetry/telemetry_enable.yml | 46 ++++++++++ 9 files changed, 364 insertions(+), 1 deletion(-) rename {provision/roles/telemetry/tasks => common/tasks/telemetry}/check_kube_vip_reachability.yml (100%) create mode 100644 common/tasks/telemetry/load_ha_config.yml create mode 100644 telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml create mode 100644 telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml create mode 100644 telemetry/roles/telemetry_management/tasks/main.yml create mode 100644 telemetry/roles/telemetry_management/vars/main.yml create mode 100644 telemetry/telemetry_disable.yml create mode 100644 telemetry/telemetry_enable.yml diff --git a/provision/roles/telemetry/tasks/check_kube_vip_reachability.yml 
b/common/tasks/telemetry/check_kube_vip_reachability.yml similarity index 100% rename from provision/roles/telemetry/tasks/check_kube_vip_reachability.yml rename to common/tasks/telemetry/check_kube_vip_reachability.yml diff --git a/common/tasks/telemetry/load_ha_config.yml b/common/tasks/telemetry/load_ha_config.yml new file mode 100644 index 0000000000..db45cb7003 --- /dev/null +++ b/common/tasks/telemetry/load_ha_config.yml @@ -0,0 +1,25 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Load high_availability_config.yml + ansible.builtin.include_vars: + file: "{{ ha_config_file }}" + register: ha_config_loaded + ignore_errors: true + +- name: Set kube_vip fact + ansible.builtin.set_fact: + kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" + when: ha_config_loaded is succeeded diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index 1693a8f1cd..d14c960b6d 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -28,7 +28,7 @@ ansible.builtin.include_tasks: load_service_images.yml - name: Check kube_vip reachability for validation - ansible.builtin.include_tasks: check_kube_vip_reachability.yml + ansible.builtin.include_tasks: "{{ playbook_dir }}/../../common/tasks/telemetry/check_kube_vip_reachability.yml" when: - victoria_metrics_support | default(false) | bool - kube_vip is defined diff --git a/telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml b/telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml new file mode 100644 index 0000000000..e020bc2576 --- /dev/null +++ b/telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml @@ -0,0 +1,89 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Get current replica counts for PowerScale deployments + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: Deployment + namespace: "{{ telemetry_namespace }}" + label_selectors: + - "app.kubernetes.io/name=karavi-metrics-powerscale" + - "app.kubernetes.io/name=otel-collector" + - "app=csi-volume-exporter" + delegate_to: "{{ kube_vip }}" + register: powerscale_deployments + ignore_errors: true + +- name: Save original replica counts + ansible.builtin.set_fact: + original_replicas: {} + ignore_errors: true + +- name: Store original replica counts + ansible.builtin.set_fact: + original_replicas: "{{ original_replicas | combine({item.metadata.name: item.spec.replicas | default(1)}) }}" + loop: "{{ powerscale_deployments.resources }}" + loop_control: + label: "{{ item.metadata.name }}" + ignore_errors: true + +- name: Save original replica counts to file + ansible.builtin.copy: + content: "{{ original_replicas | to_nice_yaml }}" + dest: /tmp/powerscale_replicas_backup.yml + mode: '0644' + delegate_to: "{{ kube_vip }}" + ignore_errors: true + +- name: Scale down OTEL Collector + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: otel-collector + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: 0 + delegate_to: "{{ kube_vip }}" + ignore_errors: true + +- name: Scale down karavi-metrics-powerscale + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: karavi-metrics-powerscale + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: 0 + delegate_to: "{{ kube_vip }}" + ignore_errors: true + +- name: Scale down csi-volume-exporter + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: csi-volume-exporter + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: 0 + delegate_to: "{{ kube_vip }}" + ignore_errors: true + +- name: Display PowerScale metric workloads scaled down + ansible.builtin.debug: + msg: "{{ 
powerscale_metrics_scaled_down_msg }}" + diff --git a/telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml b/telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml new file mode 100644 index 0000000000..0fd19a10ae --- /dev/null +++ b/telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml @@ -0,0 +1,82 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Load original replica counts from backup file + ansible.builtin.slurp: + src: /tmp/powerscale_replicas_backup.yml + delegate_to: "{{ kube_vip }}" + register: replicas_backup + ignore_errors: true + +- name: Parse replica counts from backup + ansible.builtin.set_fact: + original_replicas: "{{ replicas_backup.content | b64decode | from_yaml | default({}) }}" + ignore_errors: true + +- name: Scale up OTEL Collector to original replica count + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: otel-collector + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: "{{ original_replicas['otel-collector'] | default(1) }}" + delegate_to: "{{ kube_vip }}" + ignore_errors: true + +- name: Scale up karavi-metrics-powerscale to original replica count + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: karavi-metrics-powerscale + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: "{{ 
original_replicas['karavi-metrics-powerscale'] | default(1) }}" + delegate_to: "{{ kube_vip }}" + ignore_errors: true + +- name: Scale up csi-volume-exporter to original replica count + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: csi-volume-exporter + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: "{{ original_replicas['csi-volume-exporter'] | default(1) }}" + delegate_to: "{{ kube_vip }}" + ignore_errors: true + +- name: Wait for OTEL Collector to be ready + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: Deployment + name: otel-collector + namespace: "{{ telemetry_namespace }}" + register: otel_deployment + until: > + otel_deployment.resources | length > 0 and + (otel_deployment.resources[0].status.readyReplicas | default(0)) >= 1 + retries: 30 + delay: 10 + delegate_to: "{{ kube_vip }}" + ignore_errors: true + +- name: Display PowerScale metric workloads scaled up + ansible.builtin.debug: + msg: "{{ powerscale_metrics_scaled_up_msg }}" + diff --git a/telemetry/roles/telemetry_management/tasks/main.yml b/telemetry/roles/telemetry_management/tasks/main.yml new file mode 100644 index 0000000000..005ec42820 --- /dev/null +++ b/telemetry/roles/telemetry_management/tasks/main.yml @@ -0,0 +1,47 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Fail if no tags provided + ansible.builtin.fail: + msg: "{{ tags_required_msg }}" + when: ansible_run_tags | default([]) | length == 0 + +- name: Load telemetry configuration + ansible.builtin.include_vars: + file: "{{ telemetry_config_file }}" + +- name: Load HA configuration + ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/load_ha_config.yml" + +- name: Check kube_vip reachability + ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/check_kube_vip_reachability.yml" + +# ============================================================================ +# DISABLE TASKS +# ============================================================================ +- name: Disable PowerScale metrics + ansible.builtin.include_tasks: disable_powerscale_metrics.yml + when: telemetry_operation == "disable" + tags: + - powerscale + +# ============================================================================ +# ENABLE TASKS +# ============================================================================ +- name: Enable PowerScale metrics + ansible.builtin.include_tasks: enable_powerscale_metrics.yml + when: telemetry_operation == "enable" + tags: + - powerscale diff --git a/telemetry/roles/telemetry_management/vars/main.yml b/telemetry/roles/telemetry_management/vars/main.yml new file mode 100644 index 0000000000..e67b720b5a --- /dev/null +++ b/telemetry/roles/telemetry_management/vars/main.yml @@ -0,0 +1,28 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# ============================================================================ +# TELEMETRY MANAGEMENT MESSAGES +# ============================================================================ + +# Error messages +tags_required_msg: "ERROR: No tags provided. Please specify --tags to select which telemetry source to manage. Valid tags: powerscale" + +kube_vip_unreachable_msg: "ERROR: Kube VIP {{ kube_vip }} is not reachable via SSH" + +# Success messages +powerscale_metrics_scaled_down_msg: "PowerScale metric collection disabled. Scaled down: otel-collector, karavi-metrics-powerscale in {{ csm_namespace }} namespace. To re-enable: ansible-playbook telemetry/enable_telemetry.yml --tags powerscale" + +powerscale_metrics_scaled_up_msg: "PowerScale metric collection re-enabled. Scaled up: otel-collector, karavi-metrics-powerscale in {{ csm_namespace }} namespace." diff --git a/telemetry/telemetry_disable.yml b/telemetry/telemetry_disable.yml new file mode 100644 index 0000000000..49b89895e6 --- /dev/null +++ b/telemetry/telemetry_disable.yml @@ -0,0 +1,46 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +# ============================================================================ +# DISABLE TELEMETRY PLAYBOOK +# ============================================================================ +# Selectively disable telemetry collection by source. +# +# USAGE: +# Disable PowerScale metric collection: +# ansible-playbook telemetry/disable_telemetry.yml --tags powerscale +# +# WHAT IT DOES: +# 1. Scales down the corresponding Kubernetes workloads +# 2. Leaves storage components (VictoriaMetrics, VictoriaLogs) running +# +# TO RE-ENABLE: +# ansible-playbook telemetry/enable_telemetry.yml --tags powerscale +# +# NOTE: PowerScale syslog must be disabled on the PowerScale cluster itself: +# isi audit settings modify --config-syslog-enabled=0 +# ============================================================================ + +- name: Disable telemetry collection + hosts: localhost + connection: local + gather_facts: false + vars: + telemetry_config_file: "{{ playbook_dir }}/../input/telemetry_config.yml" + ha_config_file: "{{ playbook_dir }}/../input/high_availability_config.yml" + telemetry_namespace: telemetry + telemetry_operation: disable + roles: + - telemetry_management diff --git a/telemetry/telemetry_enable.yml b/telemetry/telemetry_enable.yml new file mode 100644 index 0000000000..4965cd0f4a --- /dev/null +++ b/telemetry/telemetry_enable.yml @@ -0,0 +1,46 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# ============================================================================ +# ENABLE TELEMETRY PLAYBOOK +# ============================================================================ +# Selectively re-enable telemetry collection by source. +# +# USAGE: +# Re-enable PowerScale metric collection: +# ansible-playbook telemetry/enable_telemetry.yml --tags powerscale +# +# WHAT IT DOES: +# 1. Scales up the corresponding Kubernetes workloads +# +# TO DISABLE: +# ansible-playbook telemetry/disable_telemetry.yml --tags powerscale +# +# NOTE: PowerScale syslog must be enabled on the PowerScale cluster itself: +# isi audit settings modify --config-syslog-enabled=1 +# isi audit settings modify --config-syslog-servers=:514 +# ============================================================================ + +- name: Enable telemetry collection + hosts: localhost + connection: local + gather_facts: false + vars: + telemetry_config_file: "{{ playbook_dir }}/../input/telemetry_config.yml" + ha_config_file: "{{ playbook_dir }}/../input/high_availability_config.yml" + telemetry_namespace: telemetry + telemetry_operation: enable + roles: + - telemetry_management From aebd242a034fef6a8c1fe887ec658d46c819022b Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 13 May 2026 18:47:26 +0530 Subject: [PATCH 54/63] fix for pxe mapping validation --- .../validation_flows/provision_validation.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index 98efc3637f..bc46073d65 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ 
b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -394,6 +394,11 @@ def validate_mapping_file_entries(mapping_file_path): if not reader.fieldnames: raise ValueError("CSV header not found in mapping file.") + # Check for leading/trailing whitespace in header names + for fn in reader.fieldnames: + if fn != fn.strip(): + raise ValueError(f"Header '{fn}' has leading or trailing whitespace. Please remove all whitespace from header names in mapping file.") + # Map header names case-insensitively to original names fieldname_map = {fn.strip().upper(): fn for fn in reader.fieldnames} @@ -411,6 +416,12 @@ def validate_mapping_file_entries(mapping_file_path): row_seen = False for row_idx, row in enumerate(reader, start=2): # start=2 approximates CSV row number row_seen = True + + # Check for leading/trailing whitespace in all field values + for col, val in row.items(): + if val is not None and val != val.strip(): + raise ValueError(f"Field '{col}' at CSV row {row_idx} has leading or trailing whitespace. 
Please remove all whitespace from field values in mapping file.") + # Check presence and non-empty for all required headers for hdr in required_headers: col = fieldname_map[hdr] From 382583f6fcab3541de1cdc2d94261f3bc9cf0e24 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Wed, 13 May 2026 18:55:43 +0530 Subject: [PATCH 55/63] fix for credentials files parsing is getting overwritten with flat variables --- telemetry/roles/telemetry_management/vars/main.yml | 2 -- utils/credential_utility/roles/update_config/tasks/main.yml | 2 +- utils/credential_utility/roles/update_config/vars/main.yml | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/telemetry/roles/telemetry_management/vars/main.yml b/telemetry/roles/telemetry_management/vars/main.yml index e67b720b5a..3e515a6c33 100644 --- a/telemetry/roles/telemetry_management/vars/main.yml +++ b/telemetry/roles/telemetry_management/vars/main.yml @@ -20,8 +20,6 @@ # Error messages tags_required_msg: "ERROR: No tags provided. Please specify --tags to select which telemetry source to manage. Valid tags: powerscale" -kube_vip_unreachable_msg: "ERROR: Kube VIP {{ kube_vip }} is not reachable via SSH" - # Success messages powerscale_metrics_scaled_down_msg: "PowerScale metric collection disabled. Scaled down: otel-collector, karavi-metrics-powerscale in {{ csm_namespace }} namespace. 
To re-enable: ansible-playbook telemetry/enable_telemetry.yml --tags powerscale" diff --git a/utils/credential_utility/roles/update_config/tasks/main.yml b/utils/credential_utility/roles/update_config/tasks/main.yml index 66d56c3b0e..fd73561782 100644 --- a/utils/credential_utility/roles/update_config/tasks/main.yml +++ b/utils/credential_utility/roles/update_config/tasks/main.yml @@ -42,7 +42,7 @@ - name: Fetch credentials ansible.builtin.include_tasks: fetch_credentials.yml - loop: "{{ omnia_credentials | dict2items }}" + loop: "{{ omnia_credentials_schema | dict2items }}" loop_control: loop_var: service diff --git a/utils/credential_utility/roles/update_config/vars/main.yml b/utils/credential_utility/roles/update_config/vars/main.yml index 6c60ad6110..9cca7d9405 100644 --- a/utils/credential_utility/roles/update_config/vars/main.yml +++ b/utils/credential_utility/roles/update_config/vars/main.yml @@ -58,7 +58,7 @@ docker_hub_warning: | Proceed to enter your Docker credentials if you want to avoid pull rate limits. Press Enter. 
-omnia_credentials: +omnia_credentials_schema: provision: mandatory: - { password: provision_password } From 1b9142bcfacac17fa2f7431f3ec17788db5314ba Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 14 May 2026 14:50:15 +0530 Subject: [PATCH 56/63] ansible lint fixes --- provision/roles/telemetry/tasks/main.yml | 2 +- .../templates/telemetry/telemetry.sh.j2 | 46 ++++++++++++ .../tasks/disable_powerscale_metrics.yml | 72 ++++++++++--------- .../tasks/enable_powerscale_metrics.yml | 64 +++++++++++------ .../roles/update_config/tasks/main.yml | 6 +- 5 files changed, 133 insertions(+), 57 deletions(-) diff --git a/provision/roles/telemetry/tasks/main.yml b/provision/roles/telemetry/tasks/main.yml index d14c960b6d..a02ee8b188 100644 --- a/provision/roles/telemetry/tasks/main.yml +++ b/provision/roles/telemetry/tasks/main.yml @@ -28,7 +28,7 @@ ansible.builtin.include_tasks: load_service_images.yml - name: Check kube_vip reachability for validation - ansible.builtin.include_tasks: "{{ playbook_dir }}/../../common/tasks/telemetry/check_kube_vip_reachability.yml" + ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/check_kube_vip_reachability.yml" when: - victoria_metrics_support | default(false) | bool - kube_vip is defined diff --git a/provision/roles/telemetry/templates/telemetry/telemetry.sh.j2 b/provision/roles/telemetry/templates/telemetry/telemetry.sh.j2 index 85e3002105..580d616f27 100644 --- a/provision/roles/telemetry/templates/telemetry/telemetry.sh.j2 +++ b/provision/roles/telemetry/templates/telemetry/telemetry.sh.j2 @@ -100,4 +100,50 @@ else fi {% endif %} +{% if victoria_logs_support %} +# Check reachability of additional log write endpoints +{% if telemetry_config.telemetry_sinks.victoria_logs.additional_log_write_endpoints | default([]) %} +echo "Checking reachability of additional log write endpoints..." 
+# Wait for VLAgent to be ready before checking endpoint reachability +echo " Waiting for VLAgent to be ready..." +kubectl wait --for=condition=ready --timeout=300s statefulset/vlagent -n telemetry || echo " WARNING: VLAgent not ready within timeout" + +VLAGENT_POD=$(kubectl get pod -n telemetry -l app.kubernetes.io/name=vlagent -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) +if [ -n "$VLAGENT_POD" ]; then + {% for endpoint in telemetry_config.telemetry_sinks.victoria_logs.additional_log_write_endpoints %} + echo " Testing connectivity to: {{ endpoint.url }}" + # Test connectivity using wget (more reliable than curl in minimal containers) + kubectl exec -n telemetry "$VLAGENT_POD" -- wget -T 5 -q --spider "{{ endpoint.url }}" 2>/dev/null && \ + echo " ✓ Endpoint reachable" || \ + echo " WARNING: Endpoint unreachable - logs may not be forwarded to {{ endpoint.url }}" + {% endfor %} +else + echo " WARNING: Could not find VLAgent pod to check endpoint reachability" +fi +{% endif %} +{% endif %} + +{% if victoria_metrics_support %} +# Check reachability of additional metric remote write endpoints +{% if telemetry_config.telemetry_sinks.victoria_metrics.additional_metric_remote_write_endpoints | default([]) %} +echo "Checking reachability of additional metric remote write endpoints..." +# Wait for vmagent to be ready before checking endpoint reachability +echo " Waiting for vmagent to be ready..." 
+kubectl wait --for=condition=ready --timeout=300s deployment/vmagent -n telemetry || echo " WARNING: vmagent not ready within timeout" + +VMAGENT_POD=$(kubectl get pod -n telemetry -l app.kubernetes.io/name=vmagent -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) +if [ -n "$VMAGENT_POD" ]; then + {% for endpoint in telemetry_config.telemetry_sinks.victoria_metrics.additional_metric_remote_write_endpoints %} + echo " Testing connectivity to: {{ endpoint.url }}" + # Test connectivity using wget (more reliable than curl in minimal containers) + kubectl exec -n telemetry "$VMAGENT_POD" -- wget -T 5 -q --spider "{{ endpoint.url }}" 2>/dev/null && \ + echo " ✓ Endpoint reachable" || \ + echo " WARNING: Endpoint unreachable - metrics may not be forwarded to {{ endpoint.url }}" + {% endfor %} +else + echo " WARNING: Could not find vmagent pod to check endpoint reachability" +fi +{% endif %} +{% endif %} + echo "===== Telemetry Stack Deployment Complete =====" diff --git a/telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml b/telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml index e020bc2576..25fe916b46 100644 --- a/telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml +++ b/telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml @@ -13,75 +13,77 @@ # limitations under the License. 
--- -- name: Get current replica counts for PowerScale deployments - kubernetes.core.k8s_info: +- name: Scale down OTEL Collector + kubernetes.core.k8s: api_version: apps/v1 kind: Deployment + name: otel-collector namespace: "{{ telemetry_namespace }}" - label_selectors: - - "app.kubernetes.io/name=karavi-metrics-powerscale" - - "app.kubernetes.io/name=otel-collector" - - "app=csi-volume-exporter" + definition: + spec: + replicas: 0 delegate_to: "{{ kube_vip }}" - register: powerscale_deployments - ignore_errors: true + failed_when: false -- name: Save original replica counts - ansible.builtin.set_fact: - original_replicas: {} - ignore_errors: true - -- name: Store original replica counts - ansible.builtin.set_fact: - original_replicas: "{{ original_replicas | combine({item.metadata.name: item.spec.replicas | default(1)}) }}" - loop: "{{ powerscale_deployments.resources }}" - loop_control: - label: "{{ item.metadata.name }}" - ignore_errors: true +- name: Scale down karavi-metrics-powerscale + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: karavi-metrics-powerscale + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: 0 + delegate_to: "{{ kube_vip }}" + failed_when: false -- name: Save original replica counts to file - ansible.builtin.copy: - content: "{{ original_replicas | to_nice_yaml }}" - dest: /tmp/powerscale_replicas_backup.yml - mode: '0644' +- name: Scale down csi-volume-exporter + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: csi-volume-exporter + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: 0 delegate_to: "{{ kube_vip }}" - ignore_errors: true + failed_when: false -- name: Scale down OTEL Collector +- name: Scale down karavi-observability-cert-manager kubernetes.core.k8s: api_version: apps/v1 kind: Deployment - name: otel-collector + name: karavi-observability-cert-manager namespace: "{{ telemetry_namespace }}" definition: spec: replicas: 0 delegate_to: "{{ 
kube_vip }}" - ignore_errors: true + failed_when: false -- name: Scale down karavi-metrics-powerscale +- name: Scale down karavi-observability-cert-manager-cainjector kubernetes.core.k8s: api_version: apps/v1 kind: Deployment - name: karavi-metrics-powerscale + name: karavi-observability-cert-manager-cainjector namespace: "{{ telemetry_namespace }}" definition: spec: replicas: 0 delegate_to: "{{ kube_vip }}" - ignore_errors: true + failed_when: false -- name: Scale down csi-volume-exporter +- name: Scale down karavi-observability-cert-manager-webhook kubernetes.core.k8s: api_version: apps/v1 kind: Deployment - name: csi-volume-exporter + name: karavi-observability-cert-manager-webhook namespace: "{{ telemetry_namespace }}" definition: spec: replicas: 0 delegate_to: "{{ kube_vip }}" - ignore_errors: true + failed_when: false - name: Display PowerScale metric workloads scaled down ansible.builtin.debug: diff --git a/telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml b/telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml index 0fd19a10ae..54c386c92f 100644 --- a/telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml +++ b/telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml @@ -13,31 +13,43 @@ # limitations under the License. 
--- -- name: Load original replica counts from backup file - ansible.builtin.slurp: - src: /tmp/powerscale_replicas_backup.yml +- name: Scale up karavi-observability-cert-manager to replica count 1 + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: karavi-observability-cert-manager + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: 1 delegate_to: "{{ kube_vip }}" - register: replicas_backup - ignore_errors: true + failed_when: false -- name: Parse replica counts from backup - ansible.builtin.set_fact: - original_replicas: "{{ replicas_backup.content | b64decode | from_yaml | default({}) }}" - ignore_errors: true +- name: Scale up karavi-observability-cert-manager-cainjector to replica count 1 + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: karavi-observability-cert-manager-cainjector + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: 1 + delegate_to: "{{ kube_vip }}" + failed_when: false -- name: Scale up OTEL Collector to original replica count +- name: Scale up karavi-observability-cert-manager-webhook to replica count 1 kubernetes.core.k8s: api_version: apps/v1 kind: Deployment - name: otel-collector + name: karavi-observability-cert-manager-webhook namespace: "{{ telemetry_namespace }}" definition: spec: - replicas: "{{ original_replicas['otel-collector'] | default(1) }}" + replicas: 1 delegate_to: "{{ kube_vip }}" - ignore_errors: true + failed_when: false -- name: Scale up karavi-metrics-powerscale to original replica count +- name: Scale up karavi-metrics-powerscale to replica count 1 kubernetes.core.k8s: api_version: apps/v1 kind: Deployment @@ -45,11 +57,11 @@ namespace: "{{ telemetry_namespace }}" definition: spec: - replicas: "{{ original_replicas['karavi-metrics-powerscale'] | default(1) }}" + replicas: 1 delegate_to: "{{ kube_vip }}" - ignore_errors: true + failed_when: false -- name: Scale up csi-volume-exporter to original replica count +- name: Scale up 
csi-volume-exporter to replica count 1 kubernetes.core.k8s: api_version: apps/v1 kind: Deployment @@ -57,9 +69,21 @@ namespace: "{{ telemetry_namespace }}" definition: spec: - replicas: "{{ original_replicas['csi-volume-exporter'] | default(1) }}" + replicas: 1 + delegate_to: "{{ kube_vip }}" + failed_when: false + +- name: Scale up OTEL Collector to replica count 1 + kubernetes.core.k8s: + api_version: apps/v1 + kind: Deployment + name: otel-collector + namespace: "{{ telemetry_namespace }}" + definition: + spec: + replicas: 1 delegate_to: "{{ kube_vip }}" - ignore_errors: true + failed_when: false - name: Wait for OTEL Collector to be ready kubernetes.core.k8s_info: @@ -74,7 +98,7 @@ retries: 30 delay: 10 delegate_to: "{{ kube_vip }}" - ignore_errors: true + failed_when: false - name: Display PowerScale metric workloads scaled up ansible.builtin.debug: diff --git a/utils/credential_utility/roles/update_config/tasks/main.yml b/utils/credential_utility/roles/update_config/tasks/main.yml index fd73561782..b89ba6a6ce 100644 --- a/utils/credential_utility/roles/update_config/tasks/main.yml +++ b/utils/credential_utility/roles/update_config/tasks/main.yml @@ -40,9 +40,13 @@ build_stream_auth_password_hash: "{{ auth_registration.password_hash | default('') }}" no_log: true +- name: Load update_config role vars (credential schema) + ansible.builtin.include_vars: + file: "{{ role_path }}/vars/main.yml" + - name: Fetch credentials ansible.builtin.include_tasks: fetch_credentials.yml - loop: "{{ omnia_credentials_schema | dict2items }}" + loop: "{{ (omnia_credentials_schema | default({})) | dict2items }}" loop_control: loop_var: service From 1d7cbafeee8c790fa1b2ac2d97d5da528f0ad866 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 14 May 2026 15:36:47 +0530 Subject: [PATCH 57/63] update packages path --- .../telemetry/victoria/csi-volume-exporter.yaml.j2 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff 
--git a/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 b/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 index 2bb7220e09..d24eacbb8f 100644 --- a/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 +++ b/provision/roles/telemetry/templates/telemetry/victoria/csi-volume-exporter.yaml.j2 @@ -84,8 +84,9 @@ spec: command: ["/bin/sh", "-c"] args: - | - pip3 install prometheus_client==0.20.0 kubernetes==33.1.0 \ - --find-links="{{ offline_pip_module_path }}/prometheus_client==0.20.0/" \ + pip3 install \ + "{{ offline_pip_module_path }}/prometheus_client==0.20.0/prometheus_client-0.20.0-py3-none-any.whl" \ + "{{ offline_pip_module_path }}/kubernetes==33.1.0/kubernetes-33.1.0-py2.py3-none-any.whl" \ --trusted-host "{{ pulp_server_ip }}" \ --no-index || \ pip3 install prometheus_client kubernetes @@ -132,7 +133,6 @@ spec: 'Total PowerScale PVCs by phase', ['phase']) - # ── Health event metrics (from CSI external-health-monitor-controller) ── volume_condition_abnormal = Gauge( 'powerscale_volume_health_abnormal', From 0e951157d5bbbc69fcf00ae388a381cae46bee21 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 14 May 2026 16:17:43 +0530 Subject: [PATCH 58/63] create seperate role --- .../telemetry/check_kube_vip_reachability.yml | 1 - common/tasks/telemetry/load_ha_config.yml | 13 +++++++ .../tasks/disable_powerscale_metrics.yml | 11 +++++- .../tasks/main.yml | 19 +++------- .../tasks/enable_powerscale_metrics.yml | 16 ++++---- .../roles/telemetry_enable/tasks/main.yml | 38 +++++++++++++++++++ .../roles/telemetry_management/vars/main.yml | 26 ------------- telemetry/telemetry_disable.yml | 2 +- telemetry/telemetry_enable.yml | 2 +- 9 files changed, 77 insertions(+), 51 deletions(-) rename telemetry/roles/{telemetry_management => telemetry_disable}/tasks/disable_powerscale_metrics.yml (87%) rename 
telemetry/roles/{telemetry_management => telemetry_disable}/tasks/main.yml (67%) rename telemetry/roles/{telemetry_management => telemetry_enable}/tasks/enable_powerscale_metrics.yml (89%) create mode 100644 telemetry/roles/telemetry_enable/tasks/main.yml delete mode 100644 telemetry/roles/telemetry_management/vars/main.yml diff --git a/common/tasks/telemetry/check_kube_vip_reachability.yml b/common/tasks/telemetry/check_kube_vip_reachability.yml index 015150abc6..e7e0588706 100644 --- a/common/tasks/telemetry/check_kube_vip_reachability.yml +++ b/common/tasks/telemetry/check_kube_vip_reachability.yml @@ -17,7 +17,6 @@ when: - kube_vip is defined - kube_vip | length > 0 - tags: telemetry_deployment block: - name: Set kube_vip reachability fact to false initially ansible.builtin.set_fact: diff --git a/common/tasks/telemetry/load_ha_config.yml b/common/tasks/telemetry/load_ha_config.yml index db45cb7003..961a50b24f 100644 --- a/common/tasks/telemetry/load_ha_config.yml +++ b/common/tasks/telemetry/load_ha_config.yml @@ -23,3 +23,16 @@ ansible.builtin.set_fact: kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" when: ha_config_loaded is succeeded + +- name: Debug kube_vip value + ansible.builtin.debug: + msg: "kube_vip is set to: {{ kube_vip | default('NOT SET') }}" + when: ha_config_loaded is succeeded + +- name: Fail if kube_vip is empty + ansible.builtin.fail: + msg: "kube_vip is not set in high_availability_config.yml. 
Please configure service_k8s_cluster_ha[0].virtual_ip_address" + when: + - ha_config_loaded is succeeded + - kube_vip is defined + - kube_vip | length == 0 diff --git a/telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml b/telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml similarity index 87% rename from telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml rename to telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml index 25fe916b46..dc2eff6377 100644 --- a/telemetry/roles/telemetry_management/tasks/disable_powerscale_metrics.yml +++ b/telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml @@ -13,6 +13,15 @@ # limitations under the License. --- +- name: Add kube_vip to inventory + ansible.builtin.add_host: + name: "{{ kube_vip }}" + ansible_host: "{{ kube_vip }}" + ansible_user: "{{ ansible_user | default('root') }}" + ansible_ssh_private_key_file: "{{ ansible_ssh_private_key_file | default(omit) }}" + groups: kube_vip_group + when: kube_vip is defined and kube_vip | length > 0 + - name: Scale down OTEL Collector kubernetes.core.k8s: api_version: apps/v1 @@ -71,7 +80,7 @@ spec: replicas: 0 delegate_to: "{{ kube_vip }}" - failed_when: false + failed_when: false - name: Scale down karavi-observability-cert-manager-webhook kubernetes.core.k8s: diff --git a/telemetry/roles/telemetry_management/tasks/main.yml b/telemetry/roles/telemetry_disable/tasks/main.yml similarity index 67% rename from telemetry/roles/telemetry_management/tasks/main.yml rename to telemetry/roles/telemetry_disable/tasks/main.yml index 005ec42820..2d1f1c898a 100644 --- a/telemetry/roles/telemetry_management/tasks/main.yml +++ b/telemetry/roles/telemetry_disable/tasks/main.yml @@ -16,7 +16,7 @@ - name: Fail if no tags provided ansible.builtin.fail: msg: "{{ tags_required_msg }}" - when: ansible_run_tags | default([]) | length == 0 + when: ansible_run_tags | default(['all']) | length == 1 and 'all' in 
ansible_run_tags | default(['all']) - name: Load telemetry configuration ansible.builtin.include_vars: @@ -25,23 +25,14 @@ - name: Load HA configuration ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/load_ha_config.yml" +- name: Debug kube_vip value + ansible.builtin.debug: + msg: "kube_vip is set to: {{ kube_vip | default('NOT SET') }}" + - name: Check kube_vip reachability ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/check_kube_vip_reachability.yml" -# ============================================================================ -# DISABLE TASKS -# ============================================================================ - name: Disable PowerScale metrics ansible.builtin.include_tasks: disable_powerscale_metrics.yml - when: telemetry_operation == "disable" - tags: - - powerscale - -# ============================================================================ -# ENABLE TASKS -# ============================================================================ -- name: Enable PowerScale metrics - ansible.builtin.include_tasks: enable_powerscale_metrics.yml - when: telemetry_operation == "enable" tags: - powerscale diff --git a/telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml b/telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml similarity index 89% rename from telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml rename to telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml index 54c386c92f..a59802d3f9 100644 --- a/telemetry/roles/telemetry_management/tasks/enable_powerscale_metrics.yml +++ b/telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml @@ -13,6 +13,15 @@ # limitations under the License. 
--- +- name: Add kube_vip to inventory + ansible.builtin.add_host: + name: "{{ kube_vip }}" + ansible_host: "{{ kube_vip }}" + ansible_user: "{{ ansible_user | default('root') }}" + ansible_ssh_private_key_file: "{{ ansible_ssh_private_key_file | default(omit) }}" + groups: kube_vip_group + when: kube_vip is defined and kube_vip | length > 0 + - name: Scale up karavi-observability-cert-manager to replica count 1 kubernetes.core.k8s: api_version: apps/v1 @@ -22,7 +31,6 @@ definition: spec: replicas: 1 - delegate_to: "{{ kube_vip }}" failed_when: false - name: Scale up karavi-observability-cert-manager-cainjector to replica count 1 @@ -34,7 +42,6 @@ definition: spec: replicas: 1 - delegate_to: "{{ kube_vip }}" failed_when: false - name: Scale up karavi-observability-cert-manager-webhook to replica count 1 @@ -46,7 +53,6 @@ definition: spec: replicas: 1 - delegate_to: "{{ kube_vip }}" failed_when: false - name: Scale up karavi-metrics-powerscale to replica count 1 @@ -58,7 +64,6 @@ definition: spec: replicas: 1 - delegate_to: "{{ kube_vip }}" failed_when: false - name: Scale up csi-volume-exporter to replica count 1 @@ -70,7 +75,6 @@ definition: spec: replicas: 1 - delegate_to: "{{ kube_vip }}" failed_when: false - name: Scale up OTEL Collector to replica count 1 @@ -82,7 +86,6 @@ definition: spec: replicas: 1 - delegate_to: "{{ kube_vip }}" failed_when: false - name: Wait for OTEL Collector to be ready @@ -103,4 +106,3 @@ - name: Display PowerScale metric workloads scaled up ansible.builtin.debug: msg: "{{ powerscale_metrics_scaled_up_msg }}" - diff --git a/telemetry/roles/telemetry_enable/tasks/main.yml b/telemetry/roles/telemetry_enable/tasks/main.yml new file mode 100644 index 0000000000..faa0d96f2e --- /dev/null +++ b/telemetry/roles/telemetry_enable/tasks/main.yml @@ -0,0 +1,38 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Fail if no tags provided + ansible.builtin.fail: + msg: "{{ tags_required_msg }}" + when: ansible_run_tags | default(['all']) | length == 1 and 'all' in ansible_run_tags | default(['all']) + +- name: Load telemetry configuration + ansible.builtin.include_vars: + file: "{{ telemetry_config_file }}" + +- name: Load HA configuration + ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/load_ha_config.yml" + +- name: Debug kube_vip value + ansible.builtin.debug: + msg: "kube_vip is set to: {{ kube_vip | default('NOT SET') }}" + +- name: Check kube_vip reachability + ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/check_kube_vip_reachability.yml" + +- name: Enable PowerScale metrics + ansible.builtin.include_tasks: enable_powerscale_metrics.yml + tags: + - powerscale diff --git a/telemetry/roles/telemetry_management/vars/main.yml b/telemetry/roles/telemetry_management/vars/main.yml deleted file mode 100644 index 3e515a6c33..0000000000 --- a/telemetry/roles/telemetry_management/vars/main.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# ============================================================================ -# TELEMETRY MANAGEMENT MESSAGES -# ============================================================================ - -# Error messages -tags_required_msg: "ERROR: No tags provided. Please specify --tags to select which telemetry source to manage. Valid tags: powerscale" - -# Success messages -powerscale_metrics_scaled_down_msg: "PowerScale metric collection disabled. Scaled down: otel-collector, karavi-metrics-powerscale in {{ csm_namespace }} namespace. To re-enable: ansible-playbook telemetry/enable_telemetry.yml --tags powerscale" - -powerscale_metrics_scaled_up_msg: "PowerScale metric collection re-enabled. Scaled up: otel-collector, karavi-metrics-powerscale in {{ csm_namespace }} namespace." 
diff --git a/telemetry/telemetry_disable.yml b/telemetry/telemetry_disable.yml index 49b89895e6..0d8a0f2048 100644 --- a/telemetry/telemetry_disable.yml +++ b/telemetry/telemetry_disable.yml @@ -43,4 +43,4 @@ telemetry_namespace: telemetry telemetry_operation: disable roles: - - telemetry_management + - telemetry_disable diff --git a/telemetry/telemetry_enable.yml b/telemetry/telemetry_enable.yml index 4965cd0f4a..ffe362eaa8 100644 --- a/telemetry/telemetry_enable.yml +++ b/telemetry/telemetry_enable.yml @@ -43,4 +43,4 @@ telemetry_namespace: telemetry telemetry_operation: enable roles: - - telemetry_management + - telemetry_enable From cbf6bc51c0f6b708222d199046ffa8e913edc5ec Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 14 May 2026 16:56:52 +0530 Subject: [PATCH 59/63] Signed-off-by: priti-parate <140157516+priti-parate@users.noreply.github.com> fix UT issues --- .../check_kube_vip_reachability.yml | 0 .../{telemetry => common}/load_ha_config.yml | 7 +--- .../tasks/disable_powerscale_metrics.yml | 12 ++++++- .../roles/telemetry_disable/tasks/main.yml | 32 +++++++++---------- .../roles/telemetry_disable/vars/main.yml | 17 ++++++++++ .../tasks/enable_powerscale_metrics.yml | 4 +++ .../roles/telemetry_enable/tasks/main.yml | 32 +++++++++---------- .../roles/telemetry_enable/vars/main.yml | 17 ++++++++++ 8 files changed, 80 insertions(+), 41 deletions(-) rename common/tasks/{telemetry => common}/check_kube_vip_reachability.yml (100%) rename common/tasks/{telemetry => common}/load_ha_config.yml (82%) create mode 100644 telemetry/roles/telemetry_disable/vars/main.yml create mode 100644 telemetry/roles/telemetry_enable/vars/main.yml diff --git a/common/tasks/telemetry/check_kube_vip_reachability.yml b/common/tasks/common/check_kube_vip_reachability.yml similarity index 100% rename from common/tasks/telemetry/check_kube_vip_reachability.yml rename to common/tasks/common/check_kube_vip_reachability.yml diff --git 
a/common/tasks/telemetry/load_ha_config.yml b/common/tasks/common/load_ha_config.yml similarity index 82% rename from common/tasks/telemetry/load_ha_config.yml rename to common/tasks/common/load_ha_config.yml index 961a50b24f..6a575c2606 100644 --- a/common/tasks/telemetry/load_ha_config.yml +++ b/common/tasks/common/load_ha_config.yml @@ -21,12 +21,7 @@ - name: Set kube_vip fact ansible.builtin.set_fact: - kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" - when: ha_config_loaded is succeeded - -- name: Debug kube_vip value - ansible.builtin.debug: - msg: "kube_vip is set to: {{ kube_vip | default('NOT SET') }}" + kube_vip: "{{ service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" when: ha_config_loaded is succeeded - name: Fail if kube_vip is empty diff --git a/telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml b/telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml index dc2eff6377..fa77f98cd4 100644 --- a/telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml +++ b/telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml @@ -13,6 +13,10 @@ # limitations under the License. 
--- +- name: Load role variables + ansible.builtin.include_vars: + file: vars/main.yml + - name: Add kube_vip to inventory ansible.builtin.add_host: name: "{{ kube_vip }}" @@ -33,6 +37,7 @@ replicas: 0 delegate_to: "{{ kube_vip }}" failed_when: false + when: kube_vip is defined and kube_vip | length > 0 - name: Scale down karavi-metrics-powerscale kubernetes.core.k8s: @@ -45,6 +50,7 @@ replicas: 0 delegate_to: "{{ kube_vip }}" failed_when: false + when: kube_vip is defined and kube_vip | length > 0 - name: Scale down csi-volume-exporter kubernetes.core.k8s: @@ -57,6 +63,7 @@ replicas: 0 delegate_to: "{{ kube_vip }}" failed_when: false + when: kube_vip is defined and kube_vip | length > 0 - name: Scale down karavi-observability-cert-manager kubernetes.core.k8s: @@ -69,6 +76,7 @@ replicas: 0 delegate_to: "{{ kube_vip }}" failed_when: false + when: kube_vip is defined and kube_vip | length > 0 - name: Scale down karavi-observability-cert-manager-cainjector kubernetes.core.k8s: @@ -80,7 +88,8 @@ spec: replicas: 0 delegate_to: "{{ kube_vip }}" - failed_when: false + failed_when: false + when: kube_vip is defined and kube_vip | length > 0 - name: Scale down karavi-observability-cert-manager-webhook kubernetes.core.k8s: @@ -93,6 +102,7 @@ replicas: 0 delegate_to: "{{ kube_vip }}" failed_when: false + when: kube_vip is defined and kube_vip | length > 0 - name: Display PowerScale metric workloads scaled down ansible.builtin.debug: diff --git a/telemetry/roles/telemetry_disable/tasks/main.yml b/telemetry/roles/telemetry_disable/tasks/main.yml index 2d1f1c898a..35414ad63b 100644 --- a/telemetry/roles/telemetry_disable/tasks/main.yml +++ b/telemetry/roles/telemetry_disable/tasks/main.yml @@ -13,26 +13,24 @@ # limitations under the License. 
--- -- name: Fail if no tags provided - ansible.builtin.fail: - msg: "{{ tags_required_msg }}" - when: ansible_run_tags | default(['all']) | length == 1 and 'all' in ansible_run_tags | default(['all']) +- name: Prerequisite setup + tags: always + block: + - name: Fail if no tags provided + ansible.builtin.fail: + msg: "{{ tags_required_msg }}" + when: ansible_run_tags | default(['all']) | length == 1 and 'all' in ansible_run_tags | default(['all']) -- name: Load telemetry configuration - ansible.builtin.include_vars: - file: "{{ telemetry_config_file }}" + - name: Load telemetry configuration + ansible.builtin.include_vars: + file: "{{ telemetry_config_file }}" -- name: Load HA configuration - ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/load_ha_config.yml" - -- name: Debug kube_vip value - ansible.builtin.debug: - msg: "kube_vip is set to: {{ kube_vip | default('NOT SET') }}" - -- name: Check kube_vip reachability - ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/check_kube_vip_reachability.yml" + - name: Load HA configuration + ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/common/load_ha_config.yml" - name: Disable PowerScale metrics - ansible.builtin.include_tasks: disable_powerscale_metrics.yml tags: - powerscale + block: + - name: Disable PowerScale metrics + ansible.builtin.include_tasks: disable_powerscale_metrics.yml diff --git a/telemetry/roles/telemetry_disable/vars/main.yml b/telemetry/roles/telemetry_disable/vars/main.yml new file mode 100644 index 0000000000..3c1c3acfc0 --- /dev/null +++ b/telemetry/roles/telemetry_disable/vars/main.yml @@ -0,0 +1,17 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +tags_required_msg: "No tags provided. Please run this playbook with the --tags flag. Example: ansible-playbook telemetry_disable.yml --tags powerscale" +powerscale_metrics_scaled_down_msg: "PowerScale metrics workloads have been scaled down" diff --git a/telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml b/telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml index a59802d3f9..64c60aa4d5 100644 --- a/telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml +++ b/telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Load role variables + ansible.builtin.include_vars: + file: vars/main.yml + - name: Add kube_vip to inventory ansible.builtin.add_host: name: "{{ kube_vip }}" diff --git a/telemetry/roles/telemetry_enable/tasks/main.yml b/telemetry/roles/telemetry_enable/tasks/main.yml index faa0d96f2e..65f87fdcf8 100644 --- a/telemetry/roles/telemetry_enable/tasks/main.yml +++ b/telemetry/roles/telemetry_enable/tasks/main.yml @@ -13,26 +13,24 @@ # limitations under the License. 
--- -- name: Fail if no tags provided - ansible.builtin.fail: - msg: "{{ tags_required_msg }}" - when: ansible_run_tags | default(['all']) | length == 1 and 'all' in ansible_run_tags | default(['all']) +- name: Prerequisite setup + tags: always + block: + - name: Fail if no tags provided + ansible.builtin.fail: + msg: "{{ tags_required_msg }}" + when: ansible_run_tags | default(['all']) | length == 1 and 'all' in ansible_run_tags | default(['all']) -- name: Load telemetry configuration - ansible.builtin.include_vars: - file: "{{ telemetry_config_file }}" + - name: Load telemetry configuration + ansible.builtin.include_vars: + file: "{{ telemetry_config_file }}" -- name: Load HA configuration - ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/load_ha_config.yml" - -- name: Debug kube_vip value - ansible.builtin.debug: - msg: "kube_vip is set to: {{ kube_vip | default('NOT SET') }}" - -- name: Check kube_vip reachability - ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/telemetry/check_kube_vip_reachability.yml" + - name: Load HA configuration + ansible.builtin.include_tasks: "{{ playbook_dir }}/../common/tasks/common/load_ha_config.yml" - name: Enable PowerScale metrics - ansible.builtin.include_tasks: enable_powerscale_metrics.yml tags: - powerscale + block: + - name: Enable PowerScale metrics + ansible.builtin.include_tasks: enable_powerscale_metrics.yml diff --git a/telemetry/roles/telemetry_enable/vars/main.yml b/telemetry/roles/telemetry_enable/vars/main.yml new file mode 100644 index 0000000000..c04d19da9f --- /dev/null +++ b/telemetry/roles/telemetry_enable/vars/main.yml @@ -0,0 +1,17 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +tags_required_msg: "No tags provided. Please run this playbook with the --tags flag. Example: ansible-playbook telemetry_enable.yml --tags powerscale" +powerscale_metrics_scaled_up_msg: "PowerScale metrics workloads have been scaled up" From e532e3a86339956562322ecce8e89fd502058d93 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 14 May 2026 17:07:35 +0530 Subject: [PATCH 60/63] update input vars --- telemetry/telemetry_disable.yml | 8 ++++++-- telemetry/telemetry_enable.yml | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/telemetry/telemetry_disable.yml b/telemetry/telemetry_disable.yml index 0d8a0f2048..a0f035dfa9 100644 --- a/telemetry/telemetry_disable.yml +++ b/telemetry/telemetry_disable.yml @@ -33,13 +33,17 @@ # isi audit settings modify --config-syslog-enabled=0 # ============================================================================ +- name: Include input directory + ansible.builtin.include_role: + name: include_input_dir + - name: Disable telemetry collection hosts: localhost connection: local gather_facts: false vars: - telemetry_config_file: "{{ playbook_dir }}/../input/telemetry_config.yml" - ha_config_file: "{{ playbook_dir }}/../input/high_availability_config.yml" + telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" + ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" telemetry_namespace: telemetry telemetry_operation: disable roles: diff --git a/telemetry/telemetry_enable.yml b/telemetry/telemetry_enable.yml 
index ffe362eaa8..e8aadf89a1 100644 --- a/telemetry/telemetry_enable.yml +++ b/telemetry/telemetry_enable.yml @@ -33,13 +33,17 @@ # isi audit settings modify --config-syslog-servers=:514 # ============================================================================ +- name: Include input directory + ansible.builtin.include_role: + name: include_input_dir + - name: Enable telemetry collection hosts: localhost connection: local gather_facts: false vars: - telemetry_config_file: "{{ playbook_dir }}/../input/telemetry_config.yml" - ha_config_file: "{{ playbook_dir }}/../input/high_availability_config.yml" + telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" + ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" telemetry_namespace: telemetry telemetry_operation: enable roles: From 79afdeceaa1c538d80c4357ab4b705ae1134c080 Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 14 May 2026 17:09:58 +0530 Subject: [PATCH 61/63] update input_dir task --- telemetry/telemetry_disable.yml | 26 +++++++++++++++----------- telemetry/telemetry_enable.yml | 26 +++++++++++++++----------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/telemetry/telemetry_disable.yml b/telemetry/telemetry_disable.yml index a0f035dfa9..2afd0021fe 100644 --- a/telemetry/telemetry_disable.yml +++ b/telemetry/telemetry_disable.yml @@ -33,18 +33,22 @@ # isi audit settings modify --config-syslog-enabled=0 # ============================================================================ -- name: Include input directory - ansible.builtin.include_role: - name: include_input_dir - - name: Disable telemetry collection hosts: localhost connection: local gather_facts: false - vars: - telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" - ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" - telemetry_namespace: telemetry - telemetry_operation: disable - roles: - - 
telemetry_disable + tasks: + - name: Include input directory + ansible.builtin.include_role: + name: include_input_dir + + - name: Set config file paths + ansible.builtin.set_fact: + telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" + ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" + telemetry_namespace: telemetry + telemetry_operation: disable + + - name: Disable telemetry collection + ansible.builtin.include_role: + name: telemetry_disable diff --git a/telemetry/telemetry_enable.yml b/telemetry/telemetry_enable.yml index e8aadf89a1..786ded92ab 100644 --- a/telemetry/telemetry_enable.yml +++ b/telemetry/telemetry_enable.yml @@ -33,18 +33,22 @@ # isi audit settings modify --config-syslog-servers=:514 # ============================================================================ -- name: Include input directory - ansible.builtin.include_role: - name: include_input_dir - - name: Enable telemetry collection hosts: localhost connection: local gather_facts: false - vars: - telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" - ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" - telemetry_namespace: telemetry - telemetry_operation: enable - roles: - - telemetry_enable + tasks: + - name: Include input directory + ansible.builtin.include_role: + name: include_input_dir + + - name: Set config file paths + ansible.builtin.set_fact: + telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" + ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" + telemetry_namespace: telemetry + telemetry_operation: enable + + - name: Enable telemetry collection + ansible.builtin.include_role: + name: telemetry_enable From a391355194becfc69c07e5f409331712ebe038ba Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 14 May 2026 17:12:27 +0530 Subject: [PATCH 62/63] include input_dir --- telemetry/telemetry_disable.yml | 26 
+++++++++++--------------- telemetry/telemetry_enable.yml | 26 +++++++++++--------------- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/telemetry/telemetry_disable.yml b/telemetry/telemetry_disable.yml index 2afd0021fe..288b39bb05 100644 --- a/telemetry/telemetry_disable.yml +++ b/telemetry/telemetry_disable.yml @@ -33,22 +33,18 @@ # isi audit settings modify --config-syslog-enabled=0 # ============================================================================ +- name: Include input directory + when: not project_dir_status | default(false) | bool + ansible.builtin.import_playbook: ../utils/include_input_dir.yml + - name: Disable telemetry collection hosts: localhost connection: local gather_facts: false - tasks: - - name: Include input directory - ansible.builtin.include_role: - name: include_input_dir - - - name: Set config file paths - ansible.builtin.set_fact: - telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" - ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" - telemetry_namespace: telemetry - telemetry_operation: disable - - - name: Disable telemetry collection - ansible.builtin.include_role: - name: telemetry_disable + vars: + telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" + ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" + telemetry_namespace: telemetry + telemetry_operation: disable + roles: + - telemetry_disable diff --git a/telemetry/telemetry_enable.yml b/telemetry/telemetry_enable.yml index 786ded92ab..e01d817fce 100644 --- a/telemetry/telemetry_enable.yml +++ b/telemetry/telemetry_enable.yml @@ -33,22 +33,18 @@ # isi audit settings modify --config-syslog-servers=:514 # ============================================================================ +- name: Include input directory + when: not project_dir_status | default(false) | bool + ansible.builtin.import_playbook: ../utils/include_input_dir.yml + - name: Enable telemetry collection hosts: 
localhost connection: local gather_facts: false - tasks: - - name: Include input directory - ansible.builtin.include_role: - name: include_input_dir - - - name: Set config file paths - ansible.builtin.set_fact: - telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" - ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" - telemetry_namespace: telemetry - telemetry_operation: enable - - - name: Enable telemetry collection - ansible.builtin.include_role: - name: telemetry_enable + vars: + telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" + ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" + telemetry_namespace: telemetry + telemetry_operation: enable + roles: + - telemetry_enable From 2a9040044f5f864e9c1045cb2c4bc990151e5bfa Mon Sep 17 00:00:00 2001 From: priti-parate <140157516+priti-parate@users.noreply.github.com> Date: Thu, 14 May 2026 18:28:57 +0530 Subject: [PATCH 63/63] update telemetry enable and disable tasks --- .../tasks/disable_powerscale_metrics.yml | 86 +++----------- .../roles/telemetry_disable/tasks/main.yml | 1 + .../tasks/enable_powerscale_metrics.yml | 106 ++++++------------ .../roles/telemetry_enable/tasks/main.yml | 1 + telemetry/telemetry_disable.yml | 33 ++++-- telemetry/telemetry_enable.yml | 31 +++-- utils/roles/include_input_dir/tasks/main.yml | 8 ++ 7 files changed, 103 insertions(+), 163 deletions(-) diff --git a/telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml b/telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml index fa77f98cd4..22e585252e 100644 --- a/telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml +++ b/telemetry/roles/telemetry_disable/tasks/disable_powerscale_metrics.yml @@ -13,98 +13,48 @@ # limitations under the License. 
--- -- name: Load role variables - ansible.builtin.include_vars: - file: vars/main.yml - -- name: Add kube_vip to inventory - ansible.builtin.add_host: - name: "{{ kube_vip }}" - ansible_host: "{{ kube_vip }}" - ansible_user: "{{ ansible_user | default('root') }}" - ansible_ssh_private_key_file: "{{ ansible_ssh_private_key_file | default(omit) }}" - groups: kube_vip_group - when: kube_vip is defined and kube_vip | length > 0 - - name: Scale down OTEL Collector - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: otel-collector - namespace: "{{ telemetry_namespace }}" - definition: - spec: - replicas: 0 + ansible.builtin.command: + kubectl scale deployment --replicas=0 -n {{ telemetry_namespace }} otel-collector delegate_to: "{{ kube_vip }}" failed_when: false - when: kube_vip is defined and kube_vip | length > 0 + changed_when: false - name: Scale down karavi-metrics-powerscale - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: karavi-metrics-powerscale - namespace: "{{ telemetry_namespace }}" - definition: - spec: - replicas: 0 + ansible.builtin.command: + kubectl scale deployment --replicas=0 -n {{ telemetry_namespace }} karavi-metrics-powerscale delegate_to: "{{ kube_vip }}" failed_when: false - when: kube_vip is defined and kube_vip | length > 0 + changed_when: false - name: Scale down csi-volume-exporter - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: csi-volume-exporter - namespace: "{{ telemetry_namespace }}" - definition: - spec: - replicas: 0 + ansible.builtin.command: + kubectl scale deployment --replicas=0 -n {{ telemetry_namespace }} csi-volume-exporter delegate_to: "{{ kube_vip }}" failed_when: false - when: kube_vip is defined and kube_vip | length > 0 + changed_when: false - name: Scale down karavi-observability-cert-manager - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: karavi-observability-cert-manager - namespace: "{{ telemetry_namespace }}" - definition: 
- spec: - replicas: 0 + ansible.builtin.command: + kubectl scale deployment --replicas=0 -n {{ telemetry_namespace }} karavi-observability-cert-manager delegate_to: "{{ kube_vip }}" failed_when: false - when: kube_vip is defined and kube_vip | length > 0 + changed_when: false - name: Scale down karavi-observability-cert-manager-cainjector - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: karavi-observability-cert-manager-cainjector - namespace: "{{ telemetry_namespace }}" - definition: - spec: - replicas: 0 + ansible.builtin.command: + kubectl scale deployment --replicas=0 -n {{ telemetry_namespace }} karavi-observability-cert-manager-cainjector delegate_to: "{{ kube_vip }}" failed_when: false - when: kube_vip is defined and kube_vip | length > 0 + changed_when: false - name: Scale down karavi-observability-cert-manager-webhook - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: karavi-observability-cert-manager-webhook - namespace: "{{ telemetry_namespace }}" - definition: - spec: - replicas: 0 + ansible.builtin.command: + kubectl scale deployment --replicas=0 -n {{ telemetry_namespace }} karavi-observability-cert-manager-webhook delegate_to: "{{ kube_vip }}" failed_when: false - when: kube_vip is defined and kube_vip | length > 0 + changed_when: false - name: Display PowerScale metric workloads scaled down ansible.builtin.debug: msg: "{{ powerscale_metrics_scaled_down_msg }}" - diff --git a/telemetry/roles/telemetry_disable/tasks/main.yml b/telemetry/roles/telemetry_disable/tasks/main.yml index 35414ad63b..ef0122c36f 100644 --- a/telemetry/roles/telemetry_disable/tasks/main.yml +++ b/telemetry/roles/telemetry_disable/tasks/main.yml @@ -31,6 +31,7 @@ - name: Disable PowerScale metrics tags: - powerscale + when: kube_vip is defined and kube_vip | length > 0 block: - name: Disable PowerScale metrics ansible.builtin.include_tasks: disable_powerscale_metrics.yml diff --git 
a/telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml b/telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml index 64c60aa4d5..50173614e4 100644 --- a/telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml +++ b/telemetry/roles/telemetry_enable/tasks/enable_powerscale_metrics.yml @@ -13,99 +13,61 @@ # limitations under the License. --- -- name: Load role variables - ansible.builtin.include_vars: - file: vars/main.yml - -- name: Add kube_vip to inventory - ansible.builtin.add_host: - name: "{{ kube_vip }}" - ansible_host: "{{ kube_vip }}" - ansible_user: "{{ ansible_user | default('root') }}" - ansible_ssh_private_key_file: "{{ ansible_ssh_private_key_file | default(omit) }}" - groups: kube_vip_group - when: kube_vip is defined and kube_vip | length > 0 - - name: Scale up karavi-observability-cert-manager to replica count 1 - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: karavi-observability-cert-manager - namespace: "{{ telemetry_namespace }}" - definition: - spec: - replicas: 1 + ansible.builtin.command: + kubectl scale deployment --replicas=1 -n {{ telemetry_namespace }} karavi-observability-cert-manager + delegate_to: "{{ kube_vip }}" failed_when: false + changed_when: false - name: Scale up karavi-observability-cert-manager-cainjector to replica count 1 - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: karavi-observability-cert-manager-cainjector - namespace: "{{ telemetry_namespace }}" - definition: - spec: - replicas: 1 + ansible.builtin.command: + kubectl scale deployment --replicas=1 -n {{ telemetry_namespace }} karavi-observability-cert-manager-cainjector + delegate_to: "{{ kube_vip }}" failed_when: false + changed_when: false - name: Scale up karavi-observability-cert-manager-webhook to replica count 1 - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: karavi-observability-cert-manager-webhook - namespace: "{{ telemetry_namespace }}" - 
definition: - spec: - replicas: 1 + ansible.builtin.command: + kubectl scale deployment --replicas=1 -n {{ telemetry_namespace }} karavi-observability-cert-manager-webhook + delegate_to: "{{ kube_vip }}" failed_when: false + changed_when: false - name: Scale up karavi-metrics-powerscale to replica count 1 - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: karavi-metrics-powerscale - namespace: "{{ telemetry_namespace }}" - definition: - spec: - replicas: 1 + ansible.builtin.command: + kubectl scale deployment --replicas=1 -n {{ telemetry_namespace }} karavi-metrics-powerscale + delegate_to: "{{ kube_vip }}" failed_when: false + changed_when: false - name: Scale up csi-volume-exporter to replica count 1 - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: csi-volume-exporter - namespace: "{{ telemetry_namespace }}" - definition: - spec: - replicas: 1 + ansible.builtin.command: + kubectl scale deployment --replicas=1 -n {{ telemetry_namespace }} csi-volume-exporter + delegate_to: "{{ kube_vip }}" failed_when: false + changed_when: false + +- name: Wait for csi-volume-exporter to be ready + ansible.builtin.command: + kubectl wait deployment csi-volume-exporter -n {{ telemetry_namespace }} --for condition=available --timeout=5m + delegate_to: "{{ kube_vip }}" + failed_when: false + changed_when: false - name: Scale up OTEL Collector to replica count 1 - kubernetes.core.k8s: - api_version: apps/v1 - kind: Deployment - name: otel-collector - namespace: "{{ telemetry_namespace }}" - definition: - spec: - replicas: 1 + ansible.builtin.command: + kubectl scale deployment --replicas=1 -n {{ telemetry_namespace }} otel-collector + delegate_to: "{{ kube_vip }}" failed_when: false + changed_when: false - name: Wait for OTEL Collector to be ready - kubernetes.core.k8s_info: - api_version: apps/v1 - kind: Deployment - name: otel-collector - namespace: "{{ telemetry_namespace }}" - register: otel_deployment - until: > - 
otel_deployment.resources | length > 0 and - (otel_deployment.resources[0].status.readyReplicas | default(0)) >= 1 - retries: 30 - delay: 10 + ansible.builtin.command: + kubectl wait deployment otel-collector -n {{ telemetry_namespace }} --for condition=available --timeout=5m delegate_to: "{{ kube_vip }}" failed_when: false + changed_when: false - name: Display PowerScale metric workloads scaled up ansible.builtin.debug: diff --git a/telemetry/roles/telemetry_enable/tasks/main.yml b/telemetry/roles/telemetry_enable/tasks/main.yml index 65f87fdcf8..2897085dee 100644 --- a/telemetry/roles/telemetry_enable/tasks/main.yml +++ b/telemetry/roles/telemetry_enable/tasks/main.yml @@ -31,6 +31,7 @@ - name: Enable PowerScale metrics tags: - powerscale + when: kube_vip is defined and kube_vip | length > 0 block: - name: Enable PowerScale metrics ansible.builtin.include_tasks: enable_powerscale_metrics.yml diff --git a/telemetry/telemetry_disable.yml b/telemetry/telemetry_disable.yml index 288b39bb05..74d7761ccf 100644 --- a/telemetry/telemetry_disable.yml +++ b/telemetry/telemetry_disable.yml @@ -33,18 +33,27 @@ # isi audit settings modify --config-syslog-enabled=0 # ============================================================================ -- name: Include input directory - when: not project_dir_status | default(false) | bool - ansible.builtin.import_playbook: ../utils/include_input_dir.yml - - name: Disable telemetry collection hosts: localhost - connection: local + connection: ssh gather_facts: false - vars: - telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" - ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" - telemetry_namespace: telemetry - telemetry_operation: disable - roles: - - telemetry_disable + tasks: + - name: Include input directory + ansible.builtin.include_role: + name: ../utils/roles/include_input_dir + tags: always + + - name: Set config file paths + ansible.builtin.set_fact: + telemetry_config_file: "{{ 
input_project_dir }}/telemetry_config.yml" + ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" + telemetry_namespace: telemetry + telemetry_operation: disable + cacheable: true + tags: always + + - name: Disable telemetry collection + ansible.builtin.include_role: + name: telemetry_disable + tags: + - powerscale diff --git a/telemetry/telemetry_enable.yml b/telemetry/telemetry_enable.yml index e01d817fce..25b1bafd8c 100644 --- a/telemetry/telemetry_enable.yml +++ b/telemetry/telemetry_enable.yml @@ -33,18 +33,27 @@ # isi audit settings modify --config-syslog-servers=:514 # ============================================================================ -- name: Include input directory - when: not project_dir_status | default(false) | bool - ansible.builtin.import_playbook: ../utils/include_input_dir.yml - - name: Enable telemetry collection hosts: localhost connection: local gather_facts: false - vars: - telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" - ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" - telemetry_namespace: telemetry - telemetry_operation: enable - roles: - - telemetry_enable + tasks: + - name: Include input directory + ansible.builtin.include_role: + name: ../utils/roles/include_input_dir + tags: always + + - name: Set config file paths + ansible.builtin.set_fact: + telemetry_config_file: "{{ input_project_dir }}/telemetry_config.yml" + ha_config_file: "{{ input_project_dir }}/high_availability_config.yml" + telemetry_namespace: telemetry + telemetry_operation: enable + cacheable: true + tags: always + + - name: Enable telemetry collection + ansible.builtin.include_role: + name: telemetry_enable + tags: + - powerscale diff --git a/utils/roles/include_input_dir/tasks/main.yml b/utils/roles/include_input_dir/tasks/main.yml index 6027137737..497febae7c 100644 --- a/utils/roles/include_input_dir/tasks/main.yml +++ b/utils/roles/include_input_dir/tasks/main.yml @@ -14,6 +14,7 @@ --- - name: 
Fetch omnia project configs + tags: always block: - name: Include omnia project config file ansible.builtin.include_vars: "{{ omnia_input_config_file }}" @@ -26,24 +27,31 @@ - name: Set input_project_dir ansible.builtin.set_fact: input_project_dir: "{{ omnia_input_dir }}/{{ project_name }}" + cacheable: true + tags: always - name: Verify the project directory exists ansible.builtin.stat: path: "{{ input_project_dir }}" register: verify_project_dir + tags: always - name: Fail if project directory does not exist ansible.builtin.fail: msg: "{{ project_dir_not_exist_fail_msg }}" when: not verify_project_dir.stat + tags: always - name: Include common vars ansible.builtin.include_vars: "{{ role_path }}/../../../common/vars/common_vars.yml" + tags: always - name: Include openchami vars ansible.builtin.include_vars: "{{ role_path }}/../../../common/vars/openchami_vars.yml" when: openchami_vars_suppport | default(false) + tags: always - name: Include oim metadata vars ansible.builtin.include_vars: "{{ omnia_metadata_file_path }}" when: omnia_metadata_support | default(false) + tags: always