From 5b95c919f5ed9b6c1aac6eb52feadbbbf228f273 Mon Sep 17 00:00:00 2001 From: Nupur Goyal Date: Fri, 12 Jul 2024 12:15:04 +0530 Subject: [PATCH] updating test cases and fixing existing bastion bug --- .gitignore | 3 + .tekton/scripts/ssh_create_delete.sh | 4 +- .tekton/scripts/suites.sh | 2 +- modules/landing_zone_vsi/main.tf | 3 +- modules/landing_zone_vsi/variables.tf | 6 + solutions/hpc/main.tf | 1 + tests/README.md | 355 +++++++++++--- tests/common_utils/deploy_utils.go | 2 + tests/common_utils/ssh_utils.go | 129 +++--- tests/common_utils/utils.go | 196 ++++++++ tests/constants.go | 24 +- tests/lsf/lsf_cluster_test_utils.go | 19 +- tests/lsf/lsf_cluster_test_validation.go | 564 ++++++++++++++++------- tests/lsf/lsf_cluster_utils.go | 64 +-- tests/other_test.go | 123 ++++- tests/pr_test.go | 4 +- tests/test_config.yml | 1 + 17 files changed, 1163 insertions(+), 337 deletions(-) diff --git a/.gitignore b/.gitignore index 1ae564ec..aad210b0 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,6 @@ terraform.rc # tweaks used locally localtweak__*.tf + +# tests folder log file +*.log diff --git a/.tekton/scripts/ssh_create_delete.sh b/.tekton/scripts/ssh_create_delete.sh index d09ab857..a6d4b220 100644 --- a/.tekton/scripts/ssh_create_delete.sh +++ b/.tekton/scripts/ssh_create_delete.sh @@ -6,7 +6,7 @@ CICD_SSH_KEY=$(echo $CICD_SSH_KEY-"$REVISION") elif [ "${PR_REVISION}" ] && [ -z "${REVISION}" ]; then CICD_SSH_KEY=$(echo $CICD_SSH_KEY-"$PR_REVISION") else -resource_group=$CICD_SSH_KEY-tekton +CICD_SSH_KEY=$CICD_SSH_KEY-tekton fi ssh_key_create() { @@ -35,7 +35,7 @@ for region in "${REGIONS[@]}"; check_key=$(eval "ibmcloud is keys | grep $CICD_SSH_KEY | awk '{print $2}'") if [[ -z "$check_key" ]]; then echo "$CICD_SSH_KEY creating in $region" - ssh_key_create=$(eval "ibmcloud is key-create $CICD_SSH_KEY @/artifacts/.ssh/id_rsa.pub --resource-group-name $resource_group") + ssh_key_create=$(eval "ibmcloud is key-create $CICD_SSH_KEY @/artifacts/.ssh/id_rsa.pub --resource-group-name ${resource_group:?}") if [[ $ssh_key_create = *Created* ]]; then echo "$CICD_SSH_KEY created in $region" else diff --git a/.tekton/scripts/suites.sh b/.tekton/scripts/suites.sh index 2f08e952..84da4d89 100644 --- a/.tekton/scripts/suites.sh +++ b/.tekton/scripts/suites.sh @@ -133,7 +133,7 @@ ubuntu_suite_3() { # regions based suite on regions-suite regions_suite() { suite=regions-suite - test_cases="TestRunInUsEastRegion,TestRunInEuDeRegion,TestRunInUSSouthRegion,TestRunCIDRsAsNonDefault,TestRunExistingPACEnvironment" + test_cases="TestRunInUsEastRegion,TestRunInEuDeRegion,TestRunInUSSouthRegion,TestRunCIDRsAsNonDefault" new_line="${test_cases//,/$'\n'}" echo "************** Going to run ${suite} ${new_line} **************" common_suite "${test_cases}" "${suite}" "${compute_image_name_rhel:?}" diff --git a/modules/landing_zone_vsi/main.tf b/modules/landing_zone_vsi/main.tf index 39667e96..cd85c984 100644 --- a/modules/landing_zone_vsi/main.tf +++ b/modules/landing_zone_vsi/main.tf @@ -123,7 +123,8 @@ module "login_vsi" { vpc_id = var.vpc_id kms_encryption_enabled = var.kms_encryption_enabled boot_volume_encryption_key = var.boot_volume_encryption_key - skip_iam_authorization_policy = local.skip_iam_authorization_policy + skip_iam_authorization_policy = var.bastion_instance_name != null ? 
false : local.skip_iam_authorization_policy + existing_kms_instance_guid = var.existing_kms_instance_guid } module "ldap_vsi" { diff --git a/modules/landing_zone_vsi/variables.tf b/modules/landing_zone_vsi/variables.tf index 4345e7de..46cc9549 100644 --- a/modules/landing_zone_vsi/variables.tf +++ b/modules/landing_zone_vsi/variables.tf @@ -409,3 +409,9 @@ variable "ce_project_guid" { description = "The GUID of the Code Engine Project associated to this cluster Reservation" type = string } + +variable "existing_kms_instance_guid" { + type = string + default = null + description = "GUID of boot volume encryption key" +} diff --git a/solutions/hpc/main.tf b/solutions/hpc/main.tf index b7b4a0fe..59c1bfa1 100644 --- a/solutions/hpc/main.tf +++ b/solutions/hpc/main.tf @@ -128,6 +128,7 @@ module "landing_zone_vsi" { cloud_monitoring_prws_url = var.observability_monitoring_enable ? module.cloud_monitoring_instance_creation.cloud_monitoring_prws_url : "" bastion_instance_name = var.bastion_instance_name ce_project_guid = module.ce_project.guid + existing_kms_instance_guid = local.existing_kms_instance_guid depends_on = [ module.validate_ldap_server_connection ] diff --git a/tests/README.md b/tests/README.md index 7cded2a4..ea0a71dd 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,115 +1,350 @@ -# IBM Cloud HPC - Running Tests with Terratest + +# HPC Automation + +## Overview + +This repository contains automation tests for High-Performance Computing as a Service (HPCaaS) using the `ibmcloud-terratest-wrapper/testhelper` library and the Terratest framework in Golang. This guide provides instructions for setting up the environment, running tests, and troubleshooting issues. + +## Table of Contents + +1. [Prerequisites](#prerequisites) +2. [Cloning the Repository](#cloning-the-repository) +3. [Setting Up the Go Project](#setting-up-the-go-project) +4. [Running the Tests](#running-the-tests) + - [Passing Input Parameters](#passing-input-parameters) + - [Updating `test_config.yml`](#updating-test_configyml) + - [Command-Line Overrides](#command-line-overrides) + - [Using Default Parameters](#using-default-parameters) + - [Overriding Parameters](#overriding-parameters) + - [Running Multiple Tests](#running-multiple-tests) +5. [Exporting API Key](#exporting-api-key) +6. [Analyzing Test Results](#analyzing-test-results) + - [Reviewing Test Output](#reviewing-test-output) + - [Viewing Test Output Logs](#viewing-test-output-logs) +7. [Troubleshooting](#troubleshooting) + - [Common Issues](#common-issues) +8. [Project Structure](#project-structure) +9. [Utilities](#utilities) + - [LSF Utilities](#lsf-utilities) + - [LSF Cluster Test Utilities](#lsf-cluster-test-utilities) + - [Test Validation Utilities](#test-validation-utilities) + - [SSH Utilities](#ssh-utilities) + - [Logger Utilities](#logger-utilities) + - [Common Utilities](#common-utilities) + - [Deploy Utilities](#deploy-utilities) +10. 
[Acknowledgments](#acknowledgments) ## Prerequisites -Ensure the following tools and utilities are installed and configured on your system: +Ensure you have the following tools and utilities installed: -- **Go Programming Language** -- **Git** -- **Terraform** +- **Go Programming Language**: [Install Go](https://golang.org/doc/install) +- **Git**: [Install Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) +- **Terraform**: [Install Terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) +- **IBM Cloud CLI**: [Install IBM Cloud CLI](https://cloud.ibm.com/docs/cli?topic=cli-install-ibmcloud-cli) - **IBM Cloud Plugins**: ```sh ibmcloud plugin install cloud-object-storage + ibmcloud plugin install vpc-infrastructure + ibmcloud plugin install dns ibmcloud plugin install key-protect -r "IBM Cloud" ``` -- **Initialize Git Submodules**: - ```sh - git submodule update --init - ``` -## Clone the Repository -Clone the repository containing your Go project: +## Cloning the Repository +Clone the repository to your local machine: ```sh git clone https://github.ibm.com/workload-eng-services/HPCaaS.git ``` -## Set up Your Go Project +## Setting Up the Go Project -1. Navigate to the project directory: - ```sh - cd HPCaaS/tests - ``` +Navigate to the project directory: +```sh +cd HPCaaS/tests +``` -2. Install project dependencies using Go modules: - ```sh - go mod tidy - ``` +Install project dependencies using Go modules: +```sh +go mod tidy +``` +Initialize Git Submodules: + ```sh + git submodule update --init + ``` ## Running the Tests -### Option 1: Use Default Parameters from YAML File +### Passing Input Parameters + +#### Updating `test_config.yml` -You can run the tests using the default parameter values specified in the YAML file: +You can update the `test_config.yml` file to provide input parameters. This file contains default values for various parameters used during testing. Modify the values as needed to suit your testing requirements. +#### Command-Line Overrides + +If you want to override the values in `test_config.yml`, you can pass the input parameters through the command line. Example: ```sh -go test -v -timeout 900m -parallel 4 -run "TestRunBasic" | tee test_output.log +SSH_KEY=your_ssh_key ZONE=your_zone RESOURCE_GROUP=your_resource_group go test -v -timeout 900m -parallel 4 -run "TestRunBasic" | tee test_output.log ``` +Replace placeholders (e.g., `your_ssh_key`, `your_zone`, etc.) with actual values. -### Option 2: Override Parameters - -If you want to override the default values, you can pass only the parameters you need to change, or you can override all the values based on your requirements. 
To do this, execute the following command with your desired parameter values: +### Using Default Parameters +Run tests with default parameter values from the `test_config.yml` file: ```sh -SSH_KEY=your_ssh_key ZONE=your_zone RESOURCE_GROUP=your_resource_group RESERVATION_ID=your_reservation_id KMS_INSTANCE_ID=kms_instance_id KMS_KEY_NAME=kms_key_name IMAGE_NAME=image_name CLUSTER=your_cluster_id DEFAULT_RESOURCE_GROUP=default_resource_group NON_DEFAULT_RESOURCE_GROUP=non_default_resource_group LOGIN_NODE_INSTANCE_TYPE=login_node_instance_type MANAGEMENT_IMAGE_NAME=management_image_name COMPUTE_IMAGE_NAME=compute_image_name MANAGEMENT_NODE_INSTANCE_TYPE=management_node_instance_type MANAGEMENT_NODE_COUNT=management_node_count ENABLE_VPC_FLOW_LOGS=enable_vpc_flow_logs KEY_MANAGEMENT=key_management KMS_INSTANCE_NAME=kms_instance_name HYPERTHREADING_ENABLED=hyperthreading_enabled US_EAST_ZONE=us_east_zone US_EAST_RESERVATION_ID=us_east_reservation_id US_EAST_CLUSTER_ID=us_east_cluster_id US_SOUTH_ZONE=us_south_zone US_SOUTH_RESERVATION_ID=us_south_reservation_id US_SOUTH_CLUSTER_ID=us_south_cluster_id EU_DE_ZONE=eu_de_zone EU_DE_RESERVATION_ID=eu_de_reservation_id EU_DE_CLUSTER_ID=eu_de_cluster_id SSH_FILE_PATH=ssh_file_path go test -v -timeout 900m -parallel 4 -run "TestRunBasic" | tee test_output.log +go test -v -timeout 900m -parallel 4 -run "TestRunBasic" | tee test_output.log ``` -Replace placeholders (e.g., `your_ssh_key`, `your_zone`) with actual values. +### Overriding Parameters -### Running Multiple Tests Simultaneously +To override default values, pass the necessary parameters in the command. Example: +```sh +SSH_KEY=your_ssh_key ZONE=your_zone RESOURCE_GROUP=your_resource_group RESERVATION_ID=your_reservation_id KMS_INSTANCE_ID=kms_instance_id KMS_KEY_NAME=kms_key_name IMAGE_NAME=image_name CLUSTER=your_cluster_id DEFAULT_RESOURCE_GROUP=default_resource_group NON_DEFAULT_RESOURCE_GROUP=non_default_resource_group LOGIN_NODE_INSTANCE_TYPE=login_node_instance_type MANAGEMENT_IMAGE_NAME=management_image_name COMPUTE_IMAGE_NAME=compute_image_name MANAGEMENT_NODE_INSTANCE_TYPE=management_node_instance_type MANAGEMENT_NODE_COUNT=management_node_count ENABLE_VPC_FLOW_LOGS=enable_vpc_flow_logs KEY_MANAGEMENT=key_management KMS_INSTANCE_NAME=kms_instance_name HYPERTHREADING_ENABLED=hyperthreading_enabled US_EAST_ZONE=us_east_zone US_EAST_RESERVATION_ID=us_east_reservation_id US_EAST_CLUSTER_ID=us_east_cluster_id US_SOUTH_ZONE=us_south_zone US_SOUTH_RESERVATION_ID=us_south_reservation_id US_SOUTH_CLUSTER_ID=us_south_cluster_id EU_DE_ZONE=eu_de_zone EU_DE_RESERVATION_ID=eu_de_reservation_id EU_DE_CLUSTER_ID=eu_de_cluster_id SSH_FILE_PATH=ssh_file_path go test -v -timeout 900m -parallel 4 -run "TestRunBasic" | tee test_output.log +``` +Replace placeholders (e.g., `your_ssh_key`, `your_zone`, etc.) with actual values. -To run multiple tests at the same time: +### Running Multiple Tests +Execute multiple tests simultaneously: ```sh go test -v -timeout 900m -parallel 10 -run="TestRunDefault|TestRunBasic|TestRunLDAP|TestRunAppCenter" | tee test_output.log ``` -### Export API Key +### Specific Test Files -Before running tests, export the IBM Cloud API key: +- `pr_test.go`: Contains tests that are run for any Pull Request (PR) raised. It ensures that changes proposed in a PR do not break existing functionality. +- `other_test.go`: Includes all P0, P1, and P2 test cases, covering all functional testing. It ensures comprehensive testing of all core functionalities. 
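For orientation, the test entry points referenced above are standard Go test functions built on the `ibmcloud-terratest-wrapper/testhelper` library. The sketch below shows the general shape such an entry point takes; the Terraform directory, prefix, and variable values are placeholders, and only `TestOptions`, `TerraformVars`, `RunTestConsistency`, and the `require` assertions reflect usage that actually appears in this patch's test code.

```go
package tests

import (
	"testing"

	"github.com/stretchr/testify/require"
	"github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper/testhelper"
)

// TestRunBasicSketch is an illustrative outline only: the directory, prefix,
// and variable values below are placeholders, not the repository's defaults.
func TestRunBasicSketch(t *testing.T) {
	t.Parallel()

	options := &testhelper.TestOptions{
		Testing:      t,
		TerraformDir: "solutions/hpc", // placeholder: solution directory under test
		Prefix:       "hpc-basic",     // placeholder: prefix applied to created resources
		TerraformVars: map[string]interface{}{
			// Normally sourced from test_config.yml or environment overrides.
			"hyperthreading_enabled": "true",
		},
	}

	// Provision the cluster, check idempotency, and destroy it afterwards.
	output, err := options.RunTestConsistency()
	require.NoError(t, err, "consistency test failed")
	require.NotNil(t, output, "expected non-nil output, but got nil")
}
```

Real tests in this repository wire these options up from `test_config.yml` (or environment overrides) and the helper utilities described later in this README.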
+## Exporting API Key
+
+Before running tests, export your IBM Cloud API key:
 ```sh
-export TF_VAR_ibmcloud_api_key=your_api_key //pragma: allowlist secret
+export TF_VAR_ibmcloud_api_key=your_api_key //pragma: allowlist secret
 ```
-
-Replace `your_api_key` with your actual API key. //pragma: allowlist secret
+Replace `your_api_key` with your actual IBM Cloud API key. //pragma: allowlist secret

 ## Analyzing Test Results

-### Review Test Output
+### Reviewing Test Output

-- **Passing Test Example**:
-  ```sh
-  --- PASS: TestRunHpcBasicExample (514.35s)
-  PASS
-  ok github.com/terraform-ibm-modules/terraform-ibmcloud-hpc 514.369s
-  ```
+Passing Test Example:
+```sh
+--- PASS: TestRunHpcBasicExample (514.35s)
+PASS
+ok github.com/terraform-ibm-modules/terraform-ibmcloud-hpc 514.369s
+```

-- **Failing Test Example**:
-  ```sh
-  --- FAIL: TestRunHpcBasicExample (663.30s)
-  FAIL
-  exit status 1
-  FAIL github.com/terraform-ibm-modules/terraform-ibmcloud-hpc 663.323s
-  ```
+Failing Test Example:
+```sh
+--- FAIL: TestRunHpcBasicExample (663.30s)
+FAIL
+exit status 1
+FAIL github.com/terraform-ibm-modules/terraform-ibmcloud-hpc 663.323s
+```

-### Test Output Logs
+### Viewing Test Output Logs

-- **Console Output**: Check the console for detailed test results.
-- **Log Files**: Review `test_output.log` and custom logs in the `/tests/test_output` folder with a timestamp for detailed analysis and troubleshooting. For example, a log file might be named `log_20XX-MM-DD_HH-MM-SS.log`.
+- **Console Output**: Check the console for immediate test results.
+- **Log Files**: Detailed logs are saved in `test_output.log` and custom logs in the `/tests/test_output` folder. Logs are timestamped for easier tracking (e.g., `log_20XX-MM-DD_HH-MM-SS.log`).

 ## Troubleshooting

 ### Common Issues

-- **Missing Test Directories**: Verify the project structure and required files.
-- **Invalid API Key**: Ensure `TF_VAR_ibmcloud_api_key` is correct.
-- **Invalid SSH Key**: Check the `SSH_KEY` value.
-- **Invalid Zone**: Ensure `ZONE` is set correctly.
-- **Remote IP Configuration**: Customize `REMOTE_ALLOWED_IPS` if needed.
-- **Terraform Initialization**: Ensure Terraform modules and plugins are up-to-date.
-- **Test Output Logs**: Review logs for errors and failure messages.
+- **Missing Test Directories**: Ensure the project directory structure is correct.
+- **Invalid API Key**: Verify the `TF_VAR_ibmcloud_api_key` environment variable.
+- **Invalid SSH Key**: Confirm that `SSH_KEY` is set properly.
+- **Invalid Zone**: Check the `ZONE` environment variable.
+- **Remote IP Configuration**: Adjust `REMOTE_ALLOWED_IPS` as needed.
+- **Terraform Initialization**: Make sure Terraform modules and plugins are up-to-date.
+- **Test Output Logs**: Inspect logs for errors and failure messages.
+
+For additional help, contact the project maintainers.
+
+## Project Structure
+
+```plaintext
+/root/HPCAAS/HPCaaS/tests
+├── README.md
+├── common_utils
+│   ├── deploy_utils.go
+│   ├── log_utils.go
+│   ├── ssh_utils.go
+│   └── utils.go
+├── constants.go
+├── go.mod
+├── go.sum
+├── lsf
+│   ├── lsf_cluster_test_utils.go
+│   ├── lsf_cluster_test_validation.go
+│   ├── lsf_cluster_utils.go
+│   └── lsf_constants.go
+├── other_test.go
+├── pr_test.go
+├── test_config.yml
+└── test_output
+```

-For additional assistance, contact the project maintainers.
+## Utilities
+
+### LSF Utilities: `lsf_cluster_utils.go`
+
+- **CheckLSFVersion**: Verify the LSF version.
+- **LSFCheckSSHKeyForComputeNodes**: Check SSH key for compute nodes.
+- **LSFCheckSSHKeyForComputeNode**: Check SSH key for a specific compute node. +- **LSFCheckSSHKeyForManagementNodes**: Check SSH key for management nodes. +- **LSFCheckSSHKeyForManagementNode**: Check SSH key for a specific management node. +- **LSFCheckHyperthreading**: Verify hyperthreading configuration. +- **LSFDaemonsStatus**: Check the status of LSF daemons. +- **LSFGETDynamicComputeNodeIPs**: Retrieve IPs of dynamic compute nodes. +- **HPCCheckFileMount**: Verify file mount configuration. +- **LSFAPPCenterConfiguration**: Check APPCenter configuration. +- **LSFWaitForDynamicNodeDisappearance**: Wait for dynamic nodes to disappear. +- **LSFExtractJobID**: Extract job ID from LSF. +- **LSFRunJobs**: Run jobs on LSF. +- **LSFCheckBhostsResponse**: Check the response from `bhosts`. +- **LSFRebootInstance**: Reboot an LSF instance. +- **LSFCheckIntelOneMpiOnComputeNodes**: Check Intel MPI installation on compute nodes. +- **LSFControlBctrld**: Control `bctrld` service. +- **LSFRestartDaemons**: Restart LSF daemons. +- **LSFCheckManagementNodeCount**: Verify the count of management nodes. +- **HPCCheckContractID**: Check the contract ID for HPC. +- **LSFCheckMasterName**: Verify the master node name. +- **LSFCheckClusterID**: Check the cluster ID. +- **LSFIPRouteCheck**: Verify IP routing in LSF. +- **LSFMTUCheck**: Check the MTU settings. +- **IsDynamicNodeAvailable**: Check if a dynamic node is available. +- **verifyDirectories**: Verify the existence of directories. +- **VerifyTerraformOutputs**: Validate Terraform outputs. +- **LSFCheckSSHConnectivityToNodesFromLogin**: Verify SSH connectivity from the login node. +- **HPCCheckNoVNC**: Check NoVNC configuration. +- **GetJobCommand**: Get the command to run a job. +- **ValidateEncryption**: Validate file encryption. +- **ValidateRequiredEnvironmentVariables**: Check required environment variables. +- **LSFRunJobsAsLDAPUser**: Run jobs as an LDAP user. +- **HPCCheckFileMountAsLDAPUser**: Check file mount as an LDAP user. +- **verifyDirectoriesAsLdapUser**: Verify directories as an LDAP user. +- **VerifyLSFCommands**: Verify LSF commands. +- **VerifyLDAPConfig**: Check LDAP configuration. +- **VerifyLDAPServerConfig**: Validate LDAP server configuration. +- **runSSHCommandAndGetPaths**: Run an SSH command and retrieve file paths. +- **GetOSNameOfNode**: Get the OS name of a node. +- **verifyPTRRecords**: Verify PTR records. +- **CreateServiceInstanceAndReturnGUID**: Create a service instance and return its GUID. +- **DeleteServiceInstance**: Delete a service instance. +- **CreateKey**: Create a key. +- **LSFDNSCheck**: Verify LSF DNS settings. +- **HPCAddNewLDAPUser**: Add a new LDAP user. +- **VerifyLSFCommandsAsLDAPUser**: Verify LSF commands as an LDAP user. +- **VerifyCosServiceInstance**: Validate COS service instance. +- **HPCGenerateFilePathMap**: Generate a file path map. +- **ValidateFlowLogs**: Validate flow logs configuration. + +### LSF Cluster Test Utilities: `lsf_cluster_test_utils.go` + +- **VerifyManagementNodeConfig**: Verify configurations for management nodes. +- **VerifySSHKey**: Check if the SSH key is set correctly. +- **FailoverAndFailback**: Handle failover and failback processes. +- **RestartLsfDaemon**: Restart the LSF daemon. +- **RebootInstance**: Reboot an instance. +- **VerifyComputeNodeConfig**: Verify configurations for compute nodes. +- **VerifyLoginNodeConfig**: Verify configurations for login nodes. 
+- **VerifySSHConnectivityToNodesFromLogin**: Check SSH connectivity from the login node to other nodes. +- **VerifyTestTerraformOutputs**: Validate Terraform outputs for testing. +- **VerifyNoVNCConfig**: Verify NoVNC configurations. +- **VerifyAPPCenterConfig**: Check APPCenter configurations. +- **VerifyFileShareEncryption**: Validate file share encryption. +- **VerifyJobs**: Verify job statuses and configurations. +- **VerifyManagementNodeLDAPConfig**: Verify LDAP configurations for management nodes. +- **VerifyLoginNodeLDAPConfig**: Verify LDAP configurations for login nodes. +- **VerifyComputeNodeLDAPConfig**: Verify LDAP configurations for compute nodes. +- **CheckLDAPServerStatus**: Check the status of the LDAP server. +- **VerifyPTRRecordsForManagementAndLoginNodes**: Verify PTR records for management and login nodes. +- **CreateServiceInstanceAndKmsKey**: Create a service instance and KMS key. +- **DeleteServiceInstanceAndAssociatedKeys**: Delete a service instance and its associated keys. +- **VerifyCreateNewLdapUserAndManagementNodeLDAPConfig**: Verify the creation of a new LDAP user and management node LDAP configuration. +- **ValidateCosServiceInstanceAndVpcFlowLogs**: Validate COS service instance and VPC flow logs. +- **VerifyLSFDNS**: Check LSF DNS settings. + +### Test Validation Utilities: `lsf_cluster_test_validation.go` + +- **ValidateClusterConfigurationWithAPPCenter**: Validate cluster configuration with APPCenter. +- **ValidateClusterConfiguration**: Validate overall cluster configuration. +- **ValidateBasicClusterConfiguration**: Check basic cluster settings. +- **ValidateLDAPClusterConfiguration**: Verify LDAP configurations in the cluster. +- **ValidatePACANDLDAPClusterConfiguration**: Validate PAC and LDAP configurations. +- **ValidateClusterConfigurationWithAPPCenterForExistingEnv**: Validate cluster setup with APPCenter for an existing environment. +- **ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos**: Check basic cluster configuration with VPC flow logs and COS. +- **ValidateClusterConfigurationWithMultipleKeys**: Validate cluster configuration with multiple keys. +- **ValidateExistingLDAPClusterConfig**: Check configurations for existing LDAP clusters. + +### SSH Utilities + +- **ConnectToHost**: Connect to a host via SSH. +- **ConnectToHostE**: Connect to a host via SSH with error handling. +- **ConnectToHostsWithMultipleUsers**: Connect to multiple hosts with different users. + +### Logger Utilities + +- **NewAggregatedLogger**: Create a custom logger with aggregated log levels. +- **getLogArgs**: Retrieve log arguments. + +### Common Utilities + +- **GetValueFromIniFile**: Retrieve values from an INI file. +- **ToCreateFile**: Create a file. +- **IsFileExist**: Check if a file exists. +- **IsPathExist**: Check if a path exists. +- **GetDirList**: Get a list of directories. +- **GetDirectoryFileList**: Get a list of files in a directory. +- **ToDeleteFile**: Delete a file. +- **ToCreateFileWithContent**: Create a file with specified content. +- **ReadRemoteFileContents**: Read contents from a remote file. +- **VerifyDataContains**: Verify if data contains specific values. +- **CountStringOccurrences**: Count occurrences of a string. +- **SplitString**: Split a string into substrings. +- **StringToInt**: Convert a string to an integer. +- **RemoveNilValues**: Remove nil values from a list. +- **LogVerificationResult**: Log the results of verification. +- **ParsePropertyValue**: Parse a property value. 
+- **FindImageNamesByCriteria**: Find image names based on criteria. +- **LoginIntoIBMCloudUsingCLI**: Log in to IBM Cloud using CLI. +- **CreateVPC**: Create a VPC. +- **IsVPCExist**: Check if a VPC exists. +- **GetRegion**: Get the region information. +- **SplitAndTrim**: Split and trim a string. +- **RemoveKeys**: Remove keys from a map. +- **GetBastionServerIP**: Get the IP address of the bastion server. +- **GetManagementNodeIPs**: Get IP addresses of management nodes. +- **GetLoginNodeIP**: Get the IP address of the login node. +- **GetLdapServerIP**: Get the IP address of the LDAP server. +- **GetServerIPs**: Get IP addresses of servers. +- **GetServerIPsWithLDAP**: Get IP addresses of servers with LDAP. +- **GenerateTimestampedClusterPrefix**: Generate a cluster prefix with a timestamp. +- **GetPublicIP**: Get the public IP address. +- **GetOrDefault**: Retrieve a value or default if not present. +- **GenerateRandomString**: Generate a random string. +- **GetSecretsManagerKey**: Retrieve a key from the secrets manager. +- **GetValueForKey**: Get a value for a specified key. +- **GetSubnetIds**: Get IDs of subnets. +- **GetDnsCustomResolverIds**: Get IDs of DNS custom resolvers. +- **ParseConfig**: Parse configuration files. +- **GetClusterSecurityID**: Get the security ID of the cluster. +- **UpdateSecurityGroupRules**: Update security group rules. +- **GetCustomResolverID**: Get the ID of the custom resolver. +- **RetrieveAndUpdateSecurityGroup**: Retrieve and update security group settings. +- **GetLdapIP**: Get the IP address of the LDAP server. +- **GetBastionIP**: Get the IP address of the bastion server. + +### Deploy Utilities + +- **GetConfigFromYAML**: Retrieve configuration from a YAML file. +- **SetEnvFromConfig**: Set environment variables from a configuration file. + +## Acknowledgments + +- [Terratest](https://terratest.gruntwork.io/) +- [ibmcloud-terratest-wrapper](https://github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper) diff --git a/tests/common_utils/deploy_utils.go b/tests/common_utils/deploy_utils.go index 65a3de7e..17ef8ae3 100644 --- a/tests/common_utils/deploy_utils.go +++ b/tests/common_utils/deploy_utils.go @@ -57,6 +57,7 @@ type Config struct { USSouthClusterID string `yaml:"us_south_cluster_id"` USSouthReservationID string `yaml:"us_south_reservation_id"` SSHFilePath string `yaml:"ssh_file_path"` + SSHFilePathTwo string `yaml:"ssh_file_path_two"` } // GetConfigFromYAML reads configuration from a YAML file and sets environment variables based on the configuration. @@ -148,6 +149,7 @@ func setEnvFromConfig(config *Config) error { "US_SOUTH_RESERVATION_ID": config.USSouthReservationID, "US_SOUTH_CLUSTER_ID": config.USSouthClusterID, "SSH_FILE_PATH": config.SSHFilePath, + "SSH_FILE_PATH_TWO": config.SSHFilePathTwo, } for key, value := range envVars { diff --git a/tests/common_utils/ssh_utils.go b/tests/common_utils/ssh_utils.go index b1adab23..f92fd36f 100644 --- a/tests/common_utils/ssh_utils.go +++ b/tests/common_utils/ssh_utils.go @@ -167,73 +167,6 @@ func ConnectionE(t *testing.T, publicHostName, publicHostIP, privateHostName, pr return output, err } -// connectToHostsWithMultipleUsers establishes SSH connections to a host using multiple user credentials. -// It takes the public and private IP addresses and host names for two different users. -// Returns two SSH clients for the respective users, along with any errors encountered during the process. 
-func ConnectToHostsWithMultipleUsers(publicHostName, publicHostIP, privateHostName, privateHostIP string) (*ssh.Client, *ssh.Client, error, error) { - // Get the SSH private key file path for the first user from the environment variable - sshKeyFilePathUserOne := os.Getenv("SSHFILEPATH") - // Check if the file exists - if _, err := os.Stat(sshKeyFilePathUserOne); os.IsNotExist(err) { - return nil, nil, fmt.Errorf("SSH private key file '%s' does not exist", sshKeyFilePathUserOne), nil - } else if err != nil { - return nil, nil, fmt.Errorf("error checking SSH private key file: %v", err), nil - } - sshKeyUserOne, errUserOne := getSshKeyFile(sshKeyFilePathUserOne) - if errUserOne != nil { - return nil, nil, fmt.Errorf("failed to get SSH key for user one: %w", errUserOne), nil - } - - // Get the SSH private key file path for the second user from the environment variable - sshKeyFilePathUserTwo := os.Getenv("SSHFILEPATHTWO") - // Check if the file exists - if _, err := os.Stat(sshKeyFilePathUserTwo); os.IsNotExist(err) { - return nil, nil, nil, fmt.Errorf("SSH private key file '%s' does not exist", sshKeyFilePathUserTwo) - } else if err != nil { - return nil, nil, nil, fmt.Errorf("error checking SSH private key file: %v", err) - } - sshKeyUserTwo, errUserTwo := getSshKeyFile(sshKeyFilePathUserTwo) - if errUserTwo != nil { - return nil, nil, nil, fmt.Errorf("failed to get SSH key for user two: %w", errUserTwo) - } - - // Combine errors for better readability - var combinedErrUserOne error - if errUserOne != nil { - combinedErrUserOne = fmt.Errorf("user one SSH key error: %v", errUserOne) - } - var combinedErrUserTwo error - if errUserTwo != nil { - combinedErrUserTwo = fmt.Errorf("user two SSH key error: %v", errUserTwo) - } - - if combinedErrUserOne != nil && combinedErrUserTwo != nil { - return nil, nil, combinedErrUserOne, combinedErrUserTwo - } - - // Create SSH configurations for each user and host combination - sshConfigUserOnePrivate := getSshConfig(sshKeyUserOne, privateHostName) - sshConfigUserOnePublic := getSshConfig(sshKeyUserOne, publicHostName) - sshConfigUserTwoPrivate := getSshConfig(sshKeyUserTwo, privateHostName) - sshConfigUserTwoPublic := getSshConfig(sshKeyUserTwo, publicHostName) - - // Establish SSH connections for each user to the host - clientUserOne, errUserOne := sshClientJumpHost(sshConfigUserOnePrivate, sshConfigUserOnePublic, publicHostIP+":22", privateHostIP+":22") - clientUserTwo, errUserTwo := sshClientJumpHost(sshConfigUserTwoPrivate, sshConfigUserTwoPublic, publicHostIP+":22", privateHostIP+":22") - - // Combine errors for better readability - var combinedErrClientUserOne error - if errUserOne != nil { - combinedErrClientUserOne = fmt.Errorf("user one unable to log in to the node: %v", errUserOne) - } - var combinedErrClientUserTwo error - if errUserTwo != nil { - combinedErrClientUserTwo = fmt.Errorf("user two unable to log in to the node: %v", errUserTwo) - } - - return clientUserOne, clientUserTwo, combinedErrClientUserOne, combinedErrClientUserTwo -} - func ConnectToHostAsLDAPUser(publicHostName, publicHostIP, privateHostIP, ldapUser, ldapPassword string) (*ssh.Client, error) { sshFilePath := os.Getenv("SSH_FILE_PATH") @@ -267,3 +200,65 @@ func ConnectToHostAsLDAPUser(publicHostName, publicHostIP, privateHostIP, ldapUs } return sClient, nil } + +// ConnectToHostsWithMultipleUsers establishes SSH connections to a host using multiple user credentials. +// It takes the public and private IP addresses and host names for two different users. 
+// Returns two SSH clients for the respective users, along with any errors encountered during the process. +func ConnectToHostsWithMultipleUsers(publicHostName, publicHostIP, privateHostName, privateHostIP string) (*ssh.Client, *ssh.Client, error, error) { + + // Get the SSH private key file path for the first user from the environment variable + sshFilePath := os.Getenv("SSH_FILE_PATH") + + // Check if the file exists + _, err := os.Stat(sshFilePath) + if os.IsNotExist(err) { + return nil, nil, nil, fmt.Errorf("SSH private key file '%s' does not exist", sshFilePath) + } else if err != nil { + return nil, nil, nil, fmt.Errorf("error checking SSH private key file: %v", err) + } + + // Get the SSH key for the first user + key, err := getSshKeyFile(sshFilePath) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to get SSH key: %w", err) + } + + // Create SSH configurations for the first user + sshConfigUserOnePrivate := getSshConfig(key, publicHostName) + sshConfigUserOnePublic := getSshConfig(key, privateHostName) + + // Establish SSH connection for the first user + clientUserOne, combinedErrClientUserOne := sshClientJumpHost(sshConfigUserOnePrivate, sshConfigUserOnePublic, publicHostIP+":22", privateHostIP+":22") + if combinedErrClientUserOne != nil { + return nil, nil, nil, fmt.Errorf("unable to log in to the node: %w", combinedErrClientUserOne) + } + + // Get the SSH private key file path for the second user from the environment variable + sshFilePathTwo := os.Getenv("SSH_FILE_PATH_TWO") + + // Check if the file exists + _, err = os.Stat(sshFilePathTwo) + if os.IsNotExist(err) { + return nil, nil, nil, fmt.Errorf("SSH private key file '%s' does not exist", sshFilePathTwo) + } else if err != nil { + return nil, nil, nil, fmt.Errorf("error checking SSH private key file: %v", err) + } + + // Get the SSH key for the second user + key2, err2 := getSshKeyFile(sshFilePathTwo) + if err2 != nil { + return nil, nil, nil, fmt.Errorf("failed to get SSH key: %w", err2) + } + + // Create SSH configurations for the second user + sshConfigUserTwoPrivate := getSshConfig(key2, publicHostName) + sshConfigUserTwoPublic := getSshConfig(key2, privateHostName) + + // Establish SSH connection for the second user + clientUserTwo, combinedErrClientUserTwo := sshClientJumpHost(sshConfigUserTwoPrivate, sshConfigUserTwoPublic, publicHostIP+":22", privateHostIP+":22") + if combinedErrClientUserTwo != nil { + return nil, nil, nil, fmt.Errorf("unable to log in to the node: %w", combinedErrClientUserTwo) + } + + return clientUserOne, clientUserTwo, combinedErrClientUserOne, combinedErrClientUserTwo +} diff --git a/tests/common_utils/utils.go b/tests/common_utils/utils.go index a39db8b7..3c268973 100644 --- a/tests/common_utils/utils.go +++ b/tests/common_utils/utils.go @@ -765,3 +765,199 @@ func ParseConfig(filePath string) (*Configuration, error) { // Return the configuration struct and nil error on success return &config, nil } + +// GetClusterSecurityID retrieves the security group ID for a cluster based on the provided parameters. +// It logs in to IBM Cloud, executes a command to find the security group ID associated with the cluster prefix, +// and returns the security group ID or an error if any step fails. +func GetClusterSecurityID(t *testing.T, apiKey, region, resourceGroup, clusterPrefix string, logger *AggregatedLogger) (securityGroupID string, err error) { + // If the resource group is "null", set a custom resource group based on the cluster prefix. 
+ if strings.Contains(resourceGroup, "null") { + resourceGroup = fmt.Sprintf("%s-workload-rg", clusterPrefix) + } + + // Log in to IBM Cloud using the API key, region, and resource group. + if err := LoginIntoIBMCloudUsingCLI(t, apiKey, region, resourceGroup); err != nil { + return "", fmt.Errorf("failed to log in to IBM Cloud: %w", err) + } + + // Determine the command to get the security group ID based on the cluster prefix. + cmd := fmt.Sprintf("ibmcloud is security-groups | grep %s-cluster-sg | awk '{print $1}'", clusterPrefix) + + // Execute the command to retrieve the security group ID. + output, err := exec.Command("bash", "-c", cmd).CombinedOutput() + if err != nil { + return "", fmt.Errorf("failed to retrieve security group ID: %w", err) + } + + // Trim and check if the result is empty. + securityGroupID = strings.TrimSpace(string(output)) + if securityGroupID == "" { + return "", fmt.Errorf("no security group ID found for cluster prefix %s", clusterPrefix) + } + + logger.Info(t, "securityGroupID: "+securityGroupID) + + return securityGroupID, nil +} + +// UpdateSecurityGroupRules updates the security group with specified port and CIDR based on the provided parameters. +// It logs in to IBM Cloud, determines the appropriate command, and executes it to update the security group. +// Returns an error if any step fails. +func UpdateSecurityGroupRules(t *testing.T, apiKey, region, resourceGroup, clusterPrefix, securityGroupId, cidr, minPort, maxPort string, logger *AggregatedLogger) (err error) { + // If the resource group is "null", set a custom resource group based on the cluster prefix. + if strings.Contains(resourceGroup, "null") { + resourceGroup = fmt.Sprintf("%s-workload-rg", clusterPrefix) + } + + // Log in to IBM Cloud using the API key, region, and resource group. + if err := LoginIntoIBMCloudUsingCLI(t, apiKey, region, resourceGroup); err != nil { + return fmt.Errorf("failed to log in to IBM Cloud: %w", err) + } + + // Determine the command to add a rule to the security group with the specified port and CIDR. + addRuleCmd := fmt.Sprintf("ibmcloud is security-group-rule-add %s inbound tcp --remote %s --port-min %s --port-max %s", securityGroupId, cidr, minPort, maxPort) + + // Execute the command to update the security group. + output, err := exec.Command("bash", "-c", addRuleCmd).CombinedOutput() + if err != nil { + return fmt.Errorf("failed to update security group with port and CIDR: %w", err) + } + + logger.Info(t, "security group updated output: "+strings.TrimSpace(string(output))) + + // Verify if the output contains the expected CIDR. + if !VerifyDataContains(t, strings.TrimSpace(string(output)), cidr, logger) { + return fmt.Errorf("failed to update security group CIDR: %s", string(output)) + } + + // Verify if the output contains the expected minimum port. + if !VerifyDataContains(t, strings.TrimSpace(string(output)), minPort, logger) { + return fmt.Errorf("failed to update security group port: %s", string(output)) + } + + return nil +} + +// GetCustomResolverID retrieves the custom resolver ID for a VPC based on the provided cluster prefix. +// It logs in to IBM Cloud, retrieves the DNS instance ID, and then fetches the custom resolver ID. +// Returns the custom resolver ID and any error encountered. +func GetCustomResolverID(t *testing.T, apiKey, region, resourceGroup, clusterPrefix string, logger *AggregatedLogger) (customResolverID string, err error) { + // If the resource group is "null", set a custom resource group based on the cluster prefix. 
+ if strings.Contains(resourceGroup, "null") { + resourceGroup = fmt.Sprintf("%s-workload-rg", clusterPrefix) + } + + // Log in to IBM Cloud using the API key, region, and resource group. + if err := LoginIntoIBMCloudUsingCLI(t, apiKey, region, resourceGroup); err != nil { + return "", fmt.Errorf("failed to log in to IBM Cloud: %w", err) + } + + // Command to get the DNS instance ID based on the cluster prefix. + dnsInstanceCmd := fmt.Sprintf("ibmcloud dns instances | grep %s | awk '{print $2}'", clusterPrefix) + dnsInstanceIDOutput, err := exec.Command("bash", "-c", dnsInstanceCmd).CombinedOutput() + if err != nil { + return "", fmt.Errorf("failed to retrieve DNS instance ID: %w", err) + } + + // Trim whitespace and check if we received a valid DNS instance ID. + dnsInstanceID := strings.TrimSpace(string(dnsInstanceIDOutput)) + if dnsInstanceID == "" { + return "", fmt.Errorf("no DNS instance ID found for cluster prefix %s", clusterPrefix) + } + + // Command to get custom resolvers for the DNS instance ID. + customResolverCmd := fmt.Sprintf("ibmcloud dns custom-resolvers -i %s | awk 'NR>3 {print $1}'", dnsInstanceID) + customResolverIDOutput, err := exec.Command("bash", "-c", customResolverCmd).CombinedOutput() + if err != nil { + return "", fmt.Errorf("failed to retrieve custom resolver ID: %w", err) + } + + // Trim whitespace and check if we received a valid custom resolver ID. + customResolverID = strings.TrimSpace(string(customResolverIDOutput)) + if customResolverID == "" { + return "", fmt.Errorf("no custom resolver ID found for DNS instance ID %s", dnsInstanceID) + } + logger.Info(t, "customResolverID: "+customResolverID) + + return customResolverID, nil +} + +// RetrieveAndUpdateSecurityGroup retrieves the security group ID based on the provided cluster prefix, +// then updates the security group with the specified port and CIDR. +// It logs in to IBM Cloud, determines the appropriate commands, and executes them. +// Returns an error if any step fails. +func RetrieveAndUpdateSecurityGroup(t *testing.T, apiKey, region, resourceGroup, clusterPrefix, cidr, minPort, maxPort string, logger *AggregatedLogger) error { + // If the resource group is "null", set a custom resource group based on the cluster prefix. + if strings.Contains(resourceGroup, "null") { + resourceGroup = fmt.Sprintf("%s-workload-rg", clusterPrefix) + } + + // Log in to IBM Cloud using the API key, region, and resource group. + if err := LoginIntoIBMCloudUsingCLI(t, apiKey, region, resourceGroup); err != nil { + return fmt.Errorf("failed to log in to IBM Cloud: %w", err) + } + + // Command to get the security group ID based on the cluster prefix. + getSecurityGroupIDCmd := fmt.Sprintf("ibmcloud is security-groups | grep %s-cluster-sg | awk '{print $1}'", clusterPrefix) + securityGroupIDBytes, err := exec.Command("bash", "-c", getSecurityGroupIDCmd).CombinedOutput() + if err != nil { + return fmt.Errorf("failed to retrieve security group ID: %w", err) + } + + securityGroupID := strings.TrimSpace(string(securityGroupIDBytes)) + if securityGroupID == "" { + return fmt.Errorf("no security group ID found for cluster prefix %s", clusterPrefix) + } + + logger.Info(t, "securityGroupID: "+securityGroupID) + + // Command to add a rule to the security group with the specified port and CIDR. 
+ addRuleCmd := fmt.Sprintf("ibmcloud is security-group-rule-add %s inbound tcp --remote %s --port-min %s --port-max %s", securityGroupID, cidr, minPort, maxPort) + outputBytes, err := exec.Command("bash", "-c", addRuleCmd).CombinedOutput() + if err != nil { + return fmt.Errorf("failed to update security group with port and CIDR: %w", err) + } + + output := strings.TrimSpace(string(outputBytes)) + logger.Info(t, "security group updated output: "+output) + + // Combine output verification steps. + if !VerifyDataContains(t, output, cidr, logger) || !VerifyDataContains(t, output, minPort, logger) { + return fmt.Errorf("failed to update security group with CIDR %s and port %s: %s", cidr, minPort, output) + } + + return nil +} + +// GetLdapIP retrieves the IP addresses of various servers, including the LDAP server, +// from the specified file path in the provided test options, using the provided logger for logging. +// It returns the LDAP server IP and any error encountered. +func GetLdapIP(t *testing.T, options *testhelper.TestOptions, logger *AggregatedLogger) (ldapIP string, err error) { + // Retrieve the Terraform directory from the options. + filePath := options.TerraformOptions.TerraformDir + + // Get the LDAP server IP and handle errors. + ldapIP, err = GetLdapServerIP(t, filePath, logger) + if err != nil { + return "", fmt.Errorf("error getting LDAP server IP: %w", err) + } + + // Return the retrieved IP address and any error. + return ldapIP, nil +} + +// GetBastionIP retrieves the bastion server IP address based on the provided test options. +// It returns the bastion IP address and an error if any step fails. +func GetBastionIP(t *testing.T, options *testhelper.TestOptions, logger *AggregatedLogger) (bastionIP string, err error) { + // Retrieve the Terraform directory from the options. + filePath := options.TerraformOptions.TerraformDir + + // Get the bastion server IP and handle errors. + bastionIP, err = GetBastionServerIP(t, filePath, logger) + if err != nil { + return "", fmt.Errorf("error getting bastion server IP: %w", err) + } + + // Return the bastion IP address. 
+ return bastionIP, nil +} diff --git a/tests/constants.go b/tests/constants.go index c731c773..f2c47d4b 100644 --- a/tests/constants.go +++ b/tests/constants.go @@ -1,13 +1,19 @@ package tests const ( - IMAGE_NAME_PATH = "modules/landing_zone_vsi/image_map.tf" - HYPERTHREADTING_TRUE = "true" - HYPERTHREADTING_FALSE = "false" - LSF_DEFAULT_RESOURCE_GROUP = "Default" - LSF_CUSTOM_RESOURCE_GROUP_VALUE_AS_NULL = "null" - LSF_CUSTOM_RESOURCE_GROUP_OTHER_THAN_DEFAULT = "WES_TEST" - PRIVATE_KEY_SM_ID = "103a2267-c682-42e5-9393-4ed8f8e738c2" - PRIVATE_KEY_SM_REGION = "us-south" - PRIVATE_KEY_SECRET_ID_SOUTH = "48d56d5d-78fc-dc41-3089-5be6041bc00f" + IMAGE_NAME_PATH = "modules/landing_zone_vsi/image_map.tf" + HYPERTHREADTING_TRUE = "true" + HYPERTHREADTING_FALSE = "false" + LSF_DEFAULT_RESOURCE_GROUP = "Default" + LSF_CUSTOM_RESOURCE_GROUP_VALUE_AS_NULL = "null" + LSF_CUSTOM_RESOURCE_GROUP_OTHER_THAN_DEFAULT = "WES_TEST" + PRIVATE_KEY_SM_ID = "103a2267-c682-42e5-9393-4ed8f8e738c2" + PRIVATE_KEY_SM_REGION = "us-south" + PRIVATE_KEY_SECRET_ID_SOUTH = "48d56d5d-78fc-dc41-3089-5be6041bc00f" + CLUSTER_ONE_VPC_CIDR = "10.241.0.0/18" + CLUSTER_ONE_VPC_CLUSTER_PRIVATE_SUBNETS_CIDR_BLOCKS = "10.241.0.0/20" + CLUSTER_ONE_VPC_CLUSTER_LOGIN_PRIVATE_SUBNETS_CIDR_BLOCKS = "10.241.16.0/28" + CLUSTER_TWO_VPC_CLUSTER_PRIVATE_SUBNETS_CIDR_BLOCKS = "10.241.17.0/24" + CLUSTER_TWO_VPC_CLUSTER_LOGIN_PRIVATE_SUBNETS_CIDR_BLOCKS = "10.241.18.0/24" + CLUSTER_TWO_DNS_DOMAIN_NAME = "clustertwo.com" ) diff --git a/tests/lsf/lsf_cluster_test_utils.go b/tests/lsf/lsf_cluster_test_utils.go index 13dd96a6..992b9fcc 100644 --- a/tests/lsf/lsf_cluster_test_utils.go +++ b/tests/lsf/lsf_cluster_test_utils.go @@ -62,10 +62,10 @@ func VerifyManagementNodeConfig( } // VerifySSHKey verifies SSH keys for both management and compute nodes. -// It checks the SSH keys for a specified node type (management or compute) on a list of nodes. -// The function fails if the node list is empty, or if an invalid value (other than 'management' or 'compute') is provided for the node type. -// The verification results are logged using the provided logger. -func VerifySSHKey(t *testing.T, sshMgmtClient *ssh.Client, publicHostIP, publicHostName, privateHostName string, nodeType string, nodeList []string, logger *utils.AggregatedLogger) { +// It checks SSH keys for a specified node type (management or compute) on a list of nodes. +// Logs errors if the node list is empty or an invalid node type is provided. +// Verification results are logged using the provided logger. 
+func VerifySSHKey(t *testing.T, sshMgmtClient *ssh.Client, publicHostIP, publicHostName, privateHostName string, nodeType string, nodeList []string, numOfKeys int, logger *utils.AggregatedLogger) { // Check if the node list is empty if len(nodeList) == 0 { @@ -74,21 +74,20 @@ func VerifySSHKey(t *testing.T, sshMgmtClient *ssh.Client, publicHostIP, publicH return } - // Convert nodeType to lowercase for consistency + // Normalize nodeType to lowercase nodeType = strings.ToLower(nodeType) - var sshKeyCheckErr error switch nodeType { case "management": - sshKeyCheckErr = LSFCheckSSHKeyForManagementNodes(t, publicHostName, publicHostIP, privateHostName, nodeList, logger) + sshKeyCheckErr = LSFCheckSSHKeyForManagementNodes(t, publicHostName, publicHostIP, privateHostName, nodeList, numOfKeys, logger) case "compute": sshKeyCheckErr = LSFCheckSSHKeyForComputeNodes(t, sshMgmtClient, nodeList, logger) default: - // Log an error if the node type is unknown errorMsg := fmt.Sprintf("unknown node type for SSH key verification: %s", nodeType) utils.LogVerificationResult(t, fmt.Errorf(errorMsg), fmt.Sprintf("%s node SSH check", nodeType), logger) return } + // Log the result of the SSH key check utils.LogVerificationResult(t, sshKeyCheckErr, fmt.Sprintf("%s node SSH check", nodeType), logger) } @@ -149,11 +148,11 @@ func RebootInstance(t *testing.T, sshMgmtClient *ssh.Client, publicHostIP, publi } -// VerifyComputetNodeConfig verifies the configuration of compute nodes by performing various checks +// VerifyComputeNodeConfig verifies the configuration of compute nodes by performing various checks // It checks the cluster ID,such as MTU, IP route, hyperthreading, file mount, and Intel One MPI. // The results of the checks are logged using the provided logger. // NOTE : Compute Node nothing but worker node -func VerifyComputetNodeConfig( +func VerifyComputeNodeConfig( t *testing.T, sshMgmtClient *ssh.Client, expectedHyperthreadingStatus bool, diff --git a/tests/lsf/lsf_cluster_test_validation.go b/tests/lsf/lsf_cluster_test_validation.go index a26c135a..fd95cf74 100644 --- a/tests/lsf/lsf_cluster_test_validation.go +++ b/tests/lsf/lsf_cluster_test_validation.go @@ -5,10 +5,8 @@ import ( "strconv" "testing" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/terraform-ibm-modules/ibmcloud-terratest-wrapper/testhelper" - utils "github.com/terraform-ibm-modules/terraform-ibm-hpc/common_utils" ) @@ -22,36 +20,42 @@ import ( // Additionally, this function logs detailed information throughout the validation process. // This function doesn't return any value but logs errors and validation steps during the process. 
func ValidateClusterConfiguration(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve cluster information + // Retrieve cluster information from options expectedClusterID := options.TerraformVars["cluster_id"].(string) expectedReservationID := options.TerraformVars["reservation_id"].(string) expectedMasterName := options.TerraformVars["cluster_prefix"].(string) expectedResourceGroup := options.TerraformVars["resource_group"].(string) expectedKeyManagement := options.TerraformVars["key_management"].(string) expectedZone := options.TerraformVars["zones"].([]string)[0] + expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - assert.False(t, !ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") + require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") + + expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) + require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) - expectedHyperthreadingEnabled, _ := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) JOB_COMMAND_LOW := GetJobCommand(expectedZone, "low") JOB_COMMAND_MED := GetJobCommand(expectedZone, "med") - // Run the test and handle errors + // Run the test consistency check output, err := options.RunTestConsistency() require.NoError(t, err, "Error running consistency test: %v", err) require.NotNil(t, output, "Expected non-nil output, but got nil") - // Proceed with SSH connection and verification if there are no errors + // Log successful cluster creation testLogger.Info(t, t.Name()+" Cluster created successfully") + // Retrieve server IPs bastionIP, managementNodeIPList, loginNodeIP, ipRetrievalError := utils.GetServerIPs(t, options, testLogger) require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Log validation start testLogger.Info(t, t.Name()+" Validation started ......") // Connect to the master node via SSH and handle connection errors sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + defer sshClient.Close() testLogger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. 
Please wait...") @@ -59,10 +63,10 @@ func ValidateClusterConfiguration(t *testing.T, options *testhelper.TestOptions, // Verify management node configuration VerifyManagementNodeConfig(t, sshClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) - // Verify SSH key - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, testLogger) + // Verify SSH key on management nodes + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) - //VerifyLSFDNS on management nodes + // Verify LSF DNS on management nodes VerifyLSFDNS(t, sshClient, managementNodeIPList, expectedDnsDomainName, testLogger) // Perform failover and failback @@ -74,10 +78,12 @@ func ValidateClusterConfiguration(t *testing.T, options *testhelper.TestOptions, // Reboot instance RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], JOB_COMMAND_MED, testLogger) - // Connect to the master node via SSH and handle connection errors + // Reconnect to the management node after reboot sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH: %v", connectionErr) defer sshClient.Close() + + // Wait for dynamic node disappearance and handle potential errors defer func() { if err := LSFWaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { t.Errorf("Error in LSFWaitForDynamicNodeDisappearance: %v", err) @@ -86,18 +92,18 @@ func ValidateClusterConfiguration(t *testing.T, options *testhelper.TestOptions, // Get dynamic compute node IPs and handle errors computeNodeIPList, computeIPErr := LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.Nil(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) // Verify compute node configuration - VerifyComputetNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - // Verify SSH key - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, testLogger) + // Verify SSH key on compute nodes + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) - //VerifyLSFDNS on compute nodes + // Verify LSF DNS on compute nodes VerifyLSFDNS(t, sshClient, computeNodeIPList, expectedDnsDomainName, testLogger) - // Connect to the login node via SSH and handle connection errors + // Verify SSH connectivity from login node and handle connection errors sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) require.NoError(t, connectionErr, "Failed to connect to the login node via SSH: %v", connectionErr) defer sshLoginNodeClient.Close() @@ -107,63 +113,74 @@ func ValidateClusterConfiguration(t *testing.T, options *testhelper.TestOptions, // Get dynamic compute node IPs and 
handle errors computeNodeIPList, computeIPErr = LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.Nil(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) - // Verify SSH connectivity to nodes from login + // Verify SSH connectivity from login node VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) // Verify PTR records VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) - //VerifyLSFDNS on login node + // Verify LSF DNS on login node VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) // Verify file share encryption VerifyFileShareEncryption(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, testLogger) + + // Log validation end testLogger.Info(t, t.Name()+" Validation ended") } // ValidateClusterConfigurationWithAPPCenter performs validation tasks on the cluster configuration -// with additional verification for an application center. -// It extends the validation performed by ValidateClusterConfiguration to include checks for the application center configuration. -// It connects to various cluster components via SSH and verifies their configurations and functionality. -// This includes the following validations: +// with additional verification for an application center and noVNC configurations. +// It extends the validation performed by ValidateClusterConfiguration to include checks for these additional components. +// This function connects to various cluster components via SSH and verifies their configurations and functionality. +// It includes the following validations: // - Management Node: Verifies the configuration of the management node, including failover and failback procedures. // - Compute Nodes: Ensures proper configuration and SSH connectivity to compute nodes. // - Login Node: Validates the configuration and SSH connectivity to the login node. // - Dynamic Compute Nodes: Verifies the proper setup and functionality of dynamic compute nodes. +// - Application Center: Validates the configuration of the application center. +// - noVNC: Verifies the noVNC configuration. // Additionally, this function logs detailed information throughout the validation process. // This function doesn't return any value but logs errors and validation steps during the process. 
func ValidateClusterConfigurationWithAPPCenter(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve cluster information + // Retrieve cluster information from options expectedClusterID := options.TerraformVars["cluster_id"].(string) expectedReservationID := options.TerraformVars["reservation_id"].(string) expectedMasterName := options.TerraformVars["cluster_prefix"].(string) expectedResourceGroup := options.TerraformVars["resource_group"].(string) expectedKeyManagement := options.TerraformVars["key_management"].(string) expectedZone := options.TerraformVars["zones"].([]string)[0] + expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - assert.False(t, !ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") - expectedHyperthreadingEnabled, _ := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) + require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") + + expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) + require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) + JOB_COMMAND_LOW := GetJobCommand(expectedZone, "low") JOB_COMMAND_MED := GetJobCommand(expectedZone, "med") - // Run the test and handle errors + // Run the test consistency check output, err := options.RunTestConsistency() require.NoError(t, err, "Error running consistency test: %v", err) require.NotNil(t, output, "Expected non-nil output, but got nil") - // Proceed with SSH connection and verification if there are no errors + // Log successful cluster creation testLogger.Info(t, t.Name()+" Cluster created successfully") + // Retrieve server IPs bastionIP, managementNodeIPList, loginNodeIP, ipRetrievalError := utils.GetServerIPs(t, options, testLogger) require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Log validation start testLogger.Info(t, t.Name()+" Validation started ......") // Connect to the master node via SSH and handle connection errors sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + defer sshClient.Close() testLogger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. 
Please wait...") @@ -171,10 +188,10 @@ func ValidateClusterConfigurationWithAPPCenter(t *testing.T, options *testhelper // Verify management node configuration VerifyManagementNodeConfig(t, sshClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) - // Verify SSH key - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, testLogger) + // Verify SSH key on management nodes + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) - //VerifyLSFDNS on management nodes + // Verify LSF DNS on management nodes VerifyLSFDNS(t, sshClient, managementNodeIPList, expectedDnsDomainName, testLogger) // Perform failover and failback @@ -186,10 +203,12 @@ func ValidateClusterConfigurationWithAPPCenter(t *testing.T, options *testhelper // Reboot instance RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], JOB_COMMAND_MED, testLogger) - // Connect to the master node via SSH and handle connection errors + // Reconnect to the management node after reboot sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH: %v", connectionErr) defer sshClient.Close() + + // Wait for dynamic node disappearance and handle potential errors defer func() { if err := LSFWaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { t.Errorf("Error in LSFWaitForDynamicNodeDisappearance: %v", err) @@ -198,18 +217,18 @@ func ValidateClusterConfigurationWithAPPCenter(t *testing.T, options *testhelper // Get dynamic compute node IPs and handle errors computeNodeIPList, computeIPErr := LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.Nil(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) // Verify compute node configuration - VerifyComputetNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - // Verify SSH key - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, testLogger) + // Verify SSH key on compute nodes + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) - //VerifyLSFDNS on compute nodes + // Verify LSF DNS on compute nodes VerifyLSFDNS(t, sshClient, computeNodeIPList, expectedDnsDomainName, testLogger) - // Connect to the login node via SSH and handle connection errors + // Verify SSH connectivity from login node and handle connection errors sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) require.NoError(t, connectionErr, "Failed to connect to the login node via SSH: %v", connectionErr) defer sshLoginNodeClient.Close() @@ -219,9 +238,9 @@ func ValidateClusterConfigurationWithAPPCenter(t *testing.T, options *testhelper // Get dynamic compute node IPs 
and handle errors computeNodeIPList, computeIPErr = LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.Nil(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) - // Verify SSH connectivity to nodes from login + // Verify SSH connectivity from login node VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) // Verify application center configuration @@ -233,12 +252,13 @@ func ValidateClusterConfigurationWithAPPCenter(t *testing.T, options *testhelper // Verify PTR records VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) - //VerifyLSFDNS on login node + // Verify LSF DNS on login node VerifyLSFDNS(t, sshClient, []string{loginNodeIP}, expectedDnsDomainName, testLogger) // Verify file share encryption VerifyFileShareEncryption(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, testLogger) + // Log validation end testLogger.Info(t, t.Name()+" Validation ended") } @@ -248,7 +268,7 @@ func ValidateClusterConfigurationWithAPPCenter(t *testing.T, options *testhelper // Additionally, it ensures proper connectivity and functionality. // This function doesn't return any value but logs errors and validation steps during the process. func ValidateBasicClusterConfiguration(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve cluster information + // Retrieve cluster information from options expectedClusterID := options.TerraformVars["cluster_id"].(string) expectedReservationID := options.TerraformVars["reservation_id"].(string) expectedMasterName := options.TerraformVars["cluster_prefix"].(string) @@ -256,26 +276,30 @@ func ValidateBasicClusterConfiguration(t *testing.T, options *testhelper.TestOpt expectedKeyManagement := options.TerraformVars["key_management"].(string) expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedHyperthreadingEnabled, _ := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) + expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) + require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) JOB_COMMAND_LOW := GetJobCommand(expectedZone, "low") - // Run the test and handle errors - output, err := options.RunTest() + // Run the test consistency check + output, err := options.RunTestConsistency() require.NoError(t, err, "Error running consistency test: %v", err) require.NotNil(t, output, "Expected non-nil output, but got nil") - // Proceed with SSH connection and verification if there are no errors + // Log successful cluster creation testLogger.Info(t, t.Name()+" Cluster created successfully") + // Retrieve server IPs bastionIP, managementNodeIPList, loginNodeIP, ipRetrievalError := utils.GetServerIPs(t, options, testLogger) require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Log validation start testLogger.Info(t, t.Name()+" Validation started ......") // Connect to the master node via SSH and handle connection errors sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, 
connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + defer sshClient.Close() testLogger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") @@ -283,10 +307,12 @@ func ValidateBasicClusterConfiguration(t *testing.T, options *testhelper.TestOpt // Verify management node configuration VerifyManagementNodeConfig(t, sshClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) - // Connect to the master node via SSH and handle connection errors + // Reconnect to the management node after reboot sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH: %v", connectionErr) defer sshClient.Close() + + // Wait for dynamic node disappearance and handle potential errors defer func() { if err := LSFWaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { t.Errorf("Error in LSFWaitForDynamicNodeDisappearance: %v", err) @@ -295,12 +321,12 @@ func ValidateBasicClusterConfiguration(t *testing.T, options *testhelper.TestOpt // Get dynamic compute node IPs and handle errors computeNodeIPList, computeIPErr := LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.Nil(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) // Verify compute node configuration - VerifyComputetNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - // Connect to the login node via SSH and handle connection errors + // Verify SSH connectivity from login node and handle connection errors sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) require.NoError(t, connectionErr, "Failed to connect to the login node via SSH: %v", connectionErr) defer sshLoginNodeClient.Close() @@ -311,55 +337,56 @@ func ValidateBasicClusterConfiguration(t *testing.T, options *testhelper.TestOpt // Verify file share encryption VerifyFileShareEncryption(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, testLogger) + // Log validation end testLogger.Info(t, t.Name()+" Validation ended") } // ValidateLDAPClusterConfiguration performs comprehensive validation on the cluster setup. // It connects to various cluster components via SSH and verifies their configurations and functionality. -// This includes the following validations: -// - Management Node: Verifies the configuration of the management node, including failover and failback procedures. -// - Compute Nodes: Ensures proper configuration and SSH connectivity to compute nodes. -// - Login Node: Validates the configuration and SSH connectivity to the login node. -// - Dynamic Compute Nodes: Verifies the proper setup and functionality of dynamic compute nodes. 
+// This includes validations for management nodes, compute nodes, login nodes, dynamic compute nodes, and LDAP integration. // Additionally, this function logs detailed information throughout the validation process. // This function doesn't return any value but logs errors and validation steps during the process. func ValidateLDAPClusterConfiguration(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve cluster information + // Retrieve cluster information from options expectedClusterID := options.TerraformVars["cluster_id"].(string) expectedReservationID := options.TerraformVars["reservation_id"].(string) expectedMasterName := options.TerraformVars["cluster_prefix"].(string) expectedResourceGroup := options.TerraformVars["resource_group"].(string) expectedKeyManagement := options.TerraformVars["key_management"].(string) + expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) expectedLdapDomain := options.TerraformVars["ldap_basedns"].(string) ldapAdminPassword := options.TerraformVars["ldap_admin_password"].(string) ldapUserName := options.TerraformVars["ldap_user_name"].(string) ldapUserPassword := options.TerraformVars["ldap_user_password"].(string) expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - assert.False(t, !ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") + require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedHyperthreadingEnabled, _ := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) + expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) + require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) JOB_COMMAND_LOW := GetJobCommand(expectedZone, "low") JOB_COMMAND_MED := GetJobCommand(expectedZone, "med") - // Run the test and handle errors + // Run the test consistency check output, err := options.RunTestConsistency() require.NoError(t, err, "Error running consistency test: %v", err) require.NotNil(t, output, "Expected non-nil output, but got nil") - // Proceed with SSH connection and verification if there are no errors + // Log successful cluster creation testLogger.Info(t, t.Name()+" Cluster created successfully") - // Get server IPs and handle errors - bastionIP, managementNodeIPList, loginNodeIP, LdapServerIP, ipRetrievalError := utils.GetServerIPsWithLDAP(t, options, testLogger) + // Retrieve server IPs + bastionIP, managementNodeIPList, loginNodeIP, ldapServerIP, ipRetrievalError := utils.GetServerIPsWithLDAP(t, options, testLogger) require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Log validation start testLogger.Info(t, t.Name()+" Validation started") // Connect to the master node via SSH and handle connection errors sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + defer sshClient.Close() testLogger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. 
Please wait...") @@ -367,8 +394,8 @@ func ValidateLDAPClusterConfiguration(t *testing.T, options *testhelper.TestOpti // Verify management node configuration VerifyManagementNodeConfig(t, sshClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) - // Verify SSH key - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, testLogger) + // Verify SSH key on management nodes + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) // Perform failover and failback FailoverAndFailback(t, sshClient, JOB_COMMAND_MED, testLogger) @@ -379,10 +406,12 @@ func ValidateLDAPClusterConfiguration(t *testing.T, options *testhelper.TestOpti // Reboot instance RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], JOB_COMMAND_MED, testLogger) - // Connect to the master node via SSH and handle connection errors + // Reconnect to the management node after reboot sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH: %v", connectionErr) defer sshClient.Close() + + // Wait for dynamic node disappearance and handle potential errors defer func() { if err := LSFWaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { t.Errorf("Error in LSFWaitForDynamicNodeDisappearance: %v", err) @@ -391,101 +420,100 @@ func ValidateLDAPClusterConfiguration(t *testing.T, options *testhelper.TestOpti // Get dynamic compute node IPs and handle errors computeNodeIPList, computeIPErr := LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.Nil(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) // Verify compute node configuration - VerifyComputetNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - // Verify SSH key - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, testLogger) + // Verify SSH key on compute nodes + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) - // Connect to the login node via SSH and handle connection errors + // Verify SSH connectivity from login node and handle connection errors sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) require.NoError(t, connectionErr, "Failed to connect to the login node via SSH: %v", connectionErr) + defer sshLoginNodeClient.Close() // Verify login node configuration VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) - // Verify SSH connectivity to nodes from login + // Verify SSH connectivity from login node VerifySSHConnectivityToNodesFromLogin(t, 
sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) // Verify file share encryption VerifyFileShareEncryption(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, testLogger) - // Connect to the ldap server via SSH and handle connection errors - sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_LDAP_HOST_NAME, LdapServerIP) - require.NoError(t, connectionErr, "Failed to connect to the ldap server via SSH: %v", connectionErr) + // Connect to the LDAP server via SSH and handle connection errors + sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_LDAP_HOST_NAME, ldapServerIP) + require.NoError(t, connectionErr, "Failed to connect to the LDAP server via SSH: %v", connectionErr) + defer sshLdapClient.Close() - // Check ldap server status + // Check LDAP server status CheckLDAPServerStatus(t, sshLdapClient, ldapAdminPassword, expectedLdapDomain, ldapUserName, testLogger) - // Verify management node ldap config - VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, LdapServerIP, managementNodeIPList, JOB_COMMAND_LOW, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + // Verify management node LDAP config + VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, ldapServerIP, managementNodeIPList, JOB_COMMAND_LOW, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) - // Verify compute node ldap config - VerifyComputeNodeLDAPConfig(t, bastionIP, LdapServerIP, computeNodeIPList, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + // Verify compute node LDAP config + VerifyComputeNodeLDAPConfig(t, bastionIP, ldapServerIP, computeNodeIPList, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) - // Verify login node ldap config - VerifyLoginNodeLDAPConfig(t, sshClient, bastionIP, loginNodeIP, LdapServerIP, JOB_COMMAND_LOW, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + // Verify login node LDAP config + VerifyLoginNodeLDAPConfig(t, sshLoginNodeClient, bastionIP, loginNodeIP, ldapServerIP, JOB_COMMAND_LOW, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) - // Verify able to create LDAP User on LDAP Server and can able to perform LSF actions using new user - VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, sshLdapClient, bastionIP, LdapServerIP, managementNodeIPList, JOB_COMMAND_LOW, ldapAdminPassword, expectedLdapDomain, ldapUserName, ldapUserPassword, "user2", testLogger) + // Verify ability to create LDAP user and perform LSF actions using new user + VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, sshLdapClient, bastionIP, ldapServerIP, managementNodeIPList, JOB_COMMAND_LOW, ldapAdminPassword, expectedLdapDomain, ldapUserName, ldapUserPassword, "user2", testLogger) // Verify PTR records VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) + // Log validation end testLogger.Info(t, t.Name()+" Validation ended") } // ValidatePACANDLDAPClusterConfiguration performs comprehensive validation on the PAC and LDAP cluster setup. // It connects to various cluster components via SSH and verifies their configurations and functionality. -// This includes the following validations: -// - Management Node: Verifies the configuration of the management node, including failover and failback procedures. 
-// - Compute Nodes: Ensures proper configuration and SSH connectivity to compute nodes. -// - Login Node: Validates the configuration and SSH connectivity to the login node. -// - Dynamic Compute Nodes: Verifies the proper setup and functionality of dynamic compute nodes. -// - LDAP Server: Checks the LDAP server status and verifies LDAP configurations across nodes. -// - Application Center: Verifies the application center configuration. -// - noVNC: Verifies the noVNC configuration. +// This includes validations for management nodes, compute nodes, login nodes, dynamic compute nodes, LDAP server, application center, and noVNC. // Additionally, this function logs detailed information throughout the validation process. // This function doesn't return any value but logs errors and validation steps during the process. func ValidatePACANDLDAPClusterConfiguration(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve cluster information + // Retrieve cluster information from options expectedClusterID := options.TerraformVars["cluster_id"].(string) expectedReservationID := options.TerraformVars["reservation_id"].(string) expectedMasterName := options.TerraformVars["cluster_prefix"].(string) expectedResourceGroup := options.TerraformVars["resource_group"].(string) expectedKeyManagement := options.TerraformVars["key_management"].(string) + expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) expectedLdapDomain := options.TerraformVars["ldap_basedns"].(string) ldapAdminPassword := options.TerraformVars["ldap_admin_password"].(string) ldapUserName := options.TerraformVars["ldap_user_name"].(string) ldapUserPassword := options.TerraformVars["ldap_user_password"].(string) expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] - assert.False(t, !ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") + require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") expectedZone := options.TerraformVars["zones"].([]string)[0] - expectedHyperthreadingEnabled, _ := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) + expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) + require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err) JOB_COMMAND_LOW := GetJobCommand(expectedZone, "low") JOB_COMMAND_MED := GetJobCommand(expectedZone, "med") - // Run the test and handle errors + // Run the test consistency check output, err := options.RunTestConsistency() require.NoError(t, err, "Error running consistency test: %v", err) require.NotNil(t, output, "Expected non-nil output, but got nil") - // Proceed with SSH connection and verification if there are no errors + // Log successful cluster creation testLogger.Info(t, t.Name()+" Cluster created successfully") - // Get server IPs and handle errors - bastionIP, managementNodeIPList, loginNodeIP, LdapServerIP, ipRetrievalError := utils.GetServerIPsWithLDAP(t, options, testLogger) + // Retrieve server IPs + bastionIP, managementNodeIPList, loginNodeIP, ldapServerIP, ipRetrievalError := utils.GetServerIPsWithLDAP(t, options, testLogger) require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) testLogger.Info(t, t.Name()+" Validation started") // Connect to the master node via SSH and handle 
connection errors sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + defer sshClient.Close() testLogger.Info(t, "SSH connection to the master successful") t.Log("Validation in progress. Please wait...") @@ -493,8 +521,8 @@ func ValidatePACANDLDAPClusterConfiguration(t *testing.T, options *testhelper.Te // Verify management node configuration VerifyManagementNodeConfig(t, sshClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) - // Verify SSH key - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, testLogger) + // Verify SSH key on management nodes + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) // Perform failover and failback FailoverAndFailback(t, sshClient, JOB_COMMAND_MED, testLogger) @@ -505,10 +533,12 @@ func ValidatePACANDLDAPClusterConfiguration(t *testing.T, options *testhelper.Te // Reboot instance RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], JOB_COMMAND_MED, testLogger) - // Connect to the master node via SSH and handle connection errors + // Reconnect to the management node after reboot sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH: %v", connectionErr) defer sshClient.Close() + + // Wait for dynamic node disappearance and handle potential errors defer func() { if err := LSFWaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { t.Errorf("Error in LSFWaitForDynamicNodeDisappearance: %v", err) @@ -517,22 +547,23 @@ func ValidatePACANDLDAPClusterConfiguration(t *testing.T, options *testhelper.Te // Get dynamic compute node IPs and handle errors computeNodeIPList, computeIPErr := LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.Nil(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) // Verify compute node configuration - VerifyComputetNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - // Verify SSH key - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, testLogger) + // Verify SSH key on compute nodes + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) - // Connect to the login node via SSH and handle connection errors + // Verify SSH connectivity from login node and handle connection errors sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) require.NoError(t, connectionErr, "Failed to connect to the 
login node via SSH: %v", connectionErr) + defer sshLoginNodeClient.Close() // Verify login node configuration VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) - // Verify SSH connectivity to nodes from login + // Verify SSH connectivity from login node VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) // Verify file share encryption @@ -544,28 +575,30 @@ func ValidatePACANDLDAPClusterConfiguration(t *testing.T, options *testhelper.Te // Verify noVNC configuration VerifyNoVNCConfig(t, sshClient, testLogger) - // Connect to the ldap server via SSH and handle connection errors - sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_LDAP_HOST_NAME, LdapServerIP) - require.NoError(t, connectionErr, "Failed to connect to the ldap server via SSH: %v", connectionErr) + // Connect to the LDAP server via SSH and handle connection errors + sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_LDAP_HOST_NAME, ldapServerIP) + require.NoError(t, connectionErr, "Failed to connect to the LDAP server via SSH: %v", connectionErr) + defer sshLdapClient.Close() - // Check ldap server status + // Check LDAP server status CheckLDAPServerStatus(t, sshLdapClient, ldapAdminPassword, expectedLdapDomain, ldapUserName, testLogger) - // Verify management node ldap config - VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, LdapServerIP, managementNodeIPList, JOB_COMMAND_LOW, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + // Verify management node LDAP config + VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, ldapServerIP, managementNodeIPList, JOB_COMMAND_LOW, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) - // Verify compute node ldap config - VerifyComputeNodeLDAPConfig(t, bastionIP, LdapServerIP, computeNodeIPList, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + // Verify compute node LDAP config + VerifyComputeNodeLDAPConfig(t, bastionIP, ldapServerIP, computeNodeIPList, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) - // Verify login node ldap config - VerifyLoginNodeLDAPConfig(t, sshClient, bastionIP, loginNodeIP, LdapServerIP, JOB_COMMAND_LOW, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + // Verify login node LDAP config + VerifyLoginNodeLDAPConfig(t, sshLoginNodeClient, bastionIP, loginNodeIP, ldapServerIP, JOB_COMMAND_LOW, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) - // Verify able to create LDAP User on LDAP Server and can able to perform LSF actions using new user - VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, sshLdapClient, bastionIP, LdapServerIP, managementNodeIPList, JOB_COMMAND_LOW, ldapAdminPassword, expectedLdapDomain, ldapUserName, ldapUserPassword, "user2", testLogger) + // Verify ability to create LDAP user and perform LSF actions using new user + VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, sshLdapClient, bastionIP, ldapServerIP, managementNodeIPList, JOB_COMMAND_LOW, ldapAdminPassword, expectedLdapDomain, ldapUserName, ldapUserPassword, "user2", testLogger) // Verify PTR records VerifyPTRRecordsForManagementAndLoginNodes(t, sshClient, LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList, loginNodeIP, expectedDnsDomainName, testLogger) + // Log 
validation end testLogger.Info(t, t.Name()+" Validation ended") } @@ -573,9 +606,10 @@ func ValidatePACANDLDAPClusterConfiguration(t *testing.T, options *testhelper.Te // It verifies various aspects including management node configuration, SSH keys, failover and failback, LSF daemon restart, dynamic compute node configuration, // login node configuration, SSH connectivity, application center configuration, noVNC configuration, PTR records, and file share encryption. // -// testLogger: *utils.AggregatedLogger - The logger for the test. +// testLogger: *utils.AggregatedLogger - The logger for the test. func ValidateClusterConfigurationWithAPPCenterForExistingEnv( t *testing.T, + expectedNumOfKeys int, bastionIP, loginNodeIP, expectedClusterID, expectedReservationID, expectedMasterName, expectedResourceGroup, expectedKeyManagement, expectedZone, expectedDnsDomainName string, managementNodeIPList []string, @@ -586,13 +620,12 @@ func ValidateClusterConfigurationWithAPPCenterForExistingEnv( JOB_COMMAND_LOW := GetJobCommand(expectedZone, "low") JOB_COMMAND_MED := GetJobCommand(expectedZone, "med") - // Log the start of validation - testLogger.Info(t, t.Name()+" Cluster created successfully") + // Log validation start testLogger.Info(t, t.Name()+" Validation started ......") // Connect to the master node via SSH sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) defer sshClient.Close() testLogger.Info(t, "SSH connection to the master successful") @@ -601,8 +634,8 @@ func ValidateClusterConfigurationWithAPPCenterForExistingEnv( // Verify management node configuration VerifyManagementNodeConfig(t, sshClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) - // Verify SSH key - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, testLogger) + // Verify SSH key on management nodes + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) // Perform failover and failback FailoverAndFailback(t, sshClient, JOB_COMMAND_MED, testLogger) @@ -615,10 +648,10 @@ func ValidateClusterConfigurationWithAPPCenterForExistingEnv( // Reconnect to the master node via SSH after reboot sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH: %v", connectionErr) defer sshClient.Close() - // Wait for dynamic node disappearance + // Wait for dynamic node disappearance and handle potential errors defer func() { if err := LSFWaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { t.Errorf("Error in LSFWaitForDynamicNodeDisappearance: %v", err) @@ -627,15 +660,15 @@ func ValidateClusterConfigurationWithAPPCenterForExistingEnv( // Get dynamic compute node IPs computeNodeIPList, computeIPErr := LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.Nil(t, computeIPErr, "Error getting dynamic compute node IPs: %v", 
computeIPErr) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) // Verify compute node configuration - VerifyComputetNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) // Verify SSH key for compute nodes - VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, testLogger) + VerifySSHKey(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) - // Connect to the login node via SSH + // Verify SSH connectivity from login node sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) require.NoError(t, connectionErr, "Failed to connect to the login node via SSH: %v", connectionErr) defer sshLoginNodeClient.Close() @@ -645,9 +678,9 @@ func ValidateClusterConfigurationWithAPPCenterForExistingEnv( // Re-fetch dynamic compute node IPs computeNodeIPList, computeIPErr = LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.Nil(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) - // Verify SSH connectivity to nodes from login + // Verify SSH connectivity from login node VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) // Verify application center configuration @@ -662,7 +695,7 @@ func ValidateClusterConfigurationWithAPPCenterForExistingEnv( // Verify file share encryption VerifyFileShareEncryption(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, testLogger) - // Log the end of validation + // Log validation end testLogger.Info(t, t.Name()+" Validation ended") } @@ -673,7 +706,7 @@ func ValidateClusterConfigurationWithAPPCenterForExistingEnv( // Additionally, it ensures proper connectivity and functionality. // This function doesn't return any value but logs errors and validation steps during the process. 
func ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { - // Retrieve cluster information from Terraform variables from Terraform variables + // Retrieve cluster information from options expectedClusterID := options.TerraformVars["cluster_id"].(string) expectedReservationID := options.TerraformVars["reservation_id"].(string) expectedMasterName := options.TerraformVars["cluster_prefix"].(string) @@ -693,15 +726,16 @@ func ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos(t *testing.T, option // Log successful cluster creation testLogger.Info(t, t.Name()+" Cluster created successfully") - // Retrieve server IPs and handle errors + // Retrieve server IPs bastionIP, managementNodeIPList, loginNodeIP, ipRetrievalError := utils.GetServerIPs(t, options, testLogger) require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + // Log validation start testLogger.Info(t, t.Name()+" Validation started ......") // Connect to the master node via SSH and handle connection errors sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) - require.Nil(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) + require.NoError(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr) defer sshClient.Close() testLogger.Info(t, "SSH connection to the master successful") @@ -710,22 +744,21 @@ func ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos(t *testing.T, option // Verify management node configuration VerifyManagementNodeConfig(t, sshClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) - defer sshClient.Close() - // Wait for dynamic node disappearance + // Wait for dynamic node disappearance and handle potential errors defer func() { if err := LSFWaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { t.Errorf("Error in LSFWaitForDynamicNodeDisappearance: %v", err) } }() - // Get dynamic compute node IPs and handle errors + // Get dynamic compute node IPs computeNodeIPList, computeIPErr := LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) - require.Nil(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) // Verify compute node configuration - VerifyComputetNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) - // Connect to the login node via SSH and handle connection errors + // Verify SSH connectivity from login node sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) require.NoError(t, connectionErr, "Failed to connect to the login node via SSH: %v", connectionErr) defer sshLoginNodeClient.Close() @@ -739,5 +772,228 @@ func ValidateBasicClusterConfigurationWithVPCFlowLogsAndCos(t *testing.T, option // Validate COS service instance and VPC flow logs ValidateCosServiceInstanceAndVpcFlowLogs(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, testLogger) + // Log validation end + testLogger.Info(t, t.Name()+" Validation ended") +} 
+ +// ValidateClusterConfigurationWithMultipleKeys performs a comprehensive validation on the cluster setup. +// It connects to various cluster components via SSH and verifies their configurations and functionality, +// including management nodes, compute nodes, login nodes, and dynamic compute nodes. It also performs +// additional validation checks like failover procedures, SSH key verification, and DNS verification. +// The function logs detailed information throughout the validation process but does not return any value. +func ValidateClusterConfigurationWithMultipleKeys(t *testing.T, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) { + // Retrieve cluster information from options + expectedClusterID := options.TerraformVars["cluster_id"].(string) + expectedReservationID := options.TerraformVars["reservation_id"].(string) + expectedMasterName := options.TerraformVars["cluster_prefix"].(string) + expectedResourceGroup := options.TerraformVars["resource_group"].(string) + expectedKeyManagement := options.TerraformVars["key_management"].(string) + expectedZone := options.TerraformVars["zones"].([]string)[0] + expectedNumOfKeys := len(options.TerraformVars["compute_ssh_keys"].([]string)) + expectedDnsDomainName, ok := options.TerraformVars["dns_domain_name"].(map[string]string)["compute"] + require.True(t, ok, "Key 'compute' does not exist in dns_domain_name map or dns_domain_name is not of type map[string]string") + + expectedHyperthreadingEnabled, _ := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string)) + JOB_COMMAND_LOW := GetJobCommand(expectedZone, "low") + JOB_COMMAND_MED := GetJobCommand(expectedZone, "med") + + // Run the test consistency check + output, err := options.RunTestConsistency() + require.NoError(t, err, "Error running consistency test: %v", err) + require.NotNil(t, output, "Expected non-nil output, but got nil") + + // Log successful cluster creation + testLogger.Info(t, t.Name()+" Cluster created successfully") + + // Retrieve server IPs + bastionIP, managementNodeIPList, loginNodeIP, ipRetrievalError := utils.GetServerIPs(t, options, testLogger) + require.NoError(t, ipRetrievalError, "Error occurred while getting server IPs: %v", ipRetrievalError) + + // Log validation start + testLogger.Info(t, t.Name()+" Validation started ......") + + // Connect to the management node via SSH + sshClientOne, sshClientTwo, connectionErrOne, connectionErrTwo := utils.ConnectToHostsWithMultipleUsers(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) + require.NoError(t, connectionErrOne, "Failed to connect to the master via SSH: %v", connectionErrOne) + require.NoError(t, connectionErrTwo, "Failed to connect to the master via SSH: %v", connectionErrTwo) + defer sshClientOne.Close() + defer sshClientTwo.Close() + + testLogger.Info(t, "SSH connection to the master successful") + t.Log("Validation in progress. 
Please wait...") + + // Verify management node configuration + VerifyManagementNodeConfig(t, sshClientOne, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) + VerifyManagementNodeConfig(t, sshClientTwo, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPList, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) + + // Verify SSH key on management node + VerifySSHKey(t, sshClientOne, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "management", managementNodeIPList, expectedNumOfKeys, testLogger) + + // Perform failover and failback + FailoverAndFailback(t, sshClientOne, JOB_COMMAND_MED, testLogger) + + // Restart LSF daemon + RestartLsfDaemon(t, sshClientOne, JOB_COMMAND_LOW, testLogger) + + // Reboot instance + RebootInstance(t, sshClientOne, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0], JOB_COMMAND_MED, testLogger) + + // Reconnect to the management node after reboot + sshClientOne, connectionErrOne = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPList[0]) + require.NoError(t, connectionErrOne, "Failed to reconnect to the master via SSH: %v", connectionErrOne) + defer sshClientOne.Close() + + // Wait for dynamic node disappearance and handle errors + defer func() { + if err := LSFWaitForDynamicNodeDisappearance(t, sshClientOne, testLogger); err != nil { + t.Errorf("Error in LSFWaitForDynamicNodeDisappearance: %v", err) + } + }() + + // Get dynamic compute node IPs and handle errors + computeNodeIPList, computeIPErr := LSFGETDynamicComputeNodeIPs(t, sshClientOne, testLogger) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + + // Verify compute node configuration + VerifyComputeNodeConfig(t, sshClientOne, expectedHyperthreadingEnabled, computeNodeIPList, testLogger) + + // Verify SSH key on compute nodes + VerifySSHKey(t, sshClientOne, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, "compute", computeNodeIPList, expectedNumOfKeys, testLogger) + + // Verify LSF DNS on compute nodes + VerifyLSFDNS(t, sshClientOne, computeNodeIPList, expectedDnsDomainName, testLogger) + + // Verify SSH connectivity from login node + sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) + require.NoError(t, connectionErr, "Failed to connect to the login node via SSH: %v", connectionErr) + defer sshLoginNodeClient.Close() + + // Verify login node configuration + VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, JOB_COMMAND_LOW, EXPECTED_LSF_VERSION, testLogger) + + // Get dynamic compute node IPs again + computeNodeIPList, computeIPErr = LSFGETDynamicComputeNodeIPs(t, sshClientOne, testLogger) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + + // Verify SSH connectivity from login node + VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPList, computeNodeIPList, testLogger) + + // Verify LSF DNS on login node + VerifyLSFDNS(t, sshClientOne, []string{loginNodeIP}, expectedDnsDomainName, testLogger) + + // Verify file share encryption + VerifyFileShareEncryption(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, 
expectedMasterName, expectedKeyManagement, testLogger)
+
+	// Log validation end
+	testLogger.Info(t, t.Name()+" Validation ended")
+}
+
+// ValidateExistingLDAPClusterConfig performs comprehensive validation on an existing LDAP cluster configuration.
+// It connects to various cluster components via SSH to verify their configurations and functionality,
+// including management nodes, compute nodes, login nodes, dynamic compute nodes, and LDAP integration.
+// This function logs detailed information throughout the validation process and does not return any value.
+func ValidateExistingLDAPClusterConfig(t *testing.T, ldapServerBastionIP, ldapServerIP, expectedLdapDomain, ldapAdminPassword, ldapUserName, ldapUserPassword string, options *testhelper.TestOptions, testLogger *utils.AggregatedLogger) {
+	// Retrieve cluster information from options
+	expectedClusterID := options.TerraformVars["cluster_id"].(string)
+	expectedReservationID := options.TerraformVars["reservation_id"].(string)
+	expectedMasterName := options.TerraformVars["cluster_prefix"].(string)
+	expectedResourceGroup := options.TerraformVars["resource_group"].(string)
+	expectedKeyManagement := options.TerraformVars["key_management"].(string)
+	expectedZone := options.TerraformVars["zones"].([]string)[0]
+
+	// Parse hyperthreading enabled flag
+	expectedHyperthreadingEnabled, err := strconv.ParseBool(options.TerraformVars["hyperthreading_enabled"].(string))
+	require.NoError(t, err, "Error parsing hyperthreading_enabled: %v", err)
+
+	// Define job commands for different priority levels
+	jobCommandLow := GetJobCommand(expectedZone, "low")
+	jobCommandMed := GetJobCommand(expectedZone, "med")
+
+	// Run the test consistency check
+	output, err := options.RunTestConsistency()
+	require.NoError(t, err, "Error running consistency test: %v", err)
+	require.NotNil(t, output, "Expected non-nil output, but got nil")
+
+	// Log successful cluster creation
+	testLogger.Info(t, t.Name()+" Cluster created successfully")
+
+	// Retrieve server IPs
+	bastionIP, managementNodeIPs, loginNodeIP, ipRetrievalErr := utils.GetServerIPs(t, options, testLogger)
+	require.NoError(t, ipRetrievalErr, "Error occurred while getting server IPs: %v", ipRetrievalErr)
+
+	// Log validation start
+	testLogger.Info(t, t.Name()+" Validation started")
+
+	// Connect to the master node via SSH
+	sshClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0])
+	require.NoError(t, connectionErr, "Failed to connect to the master via SSH: %v", connectionErr)
+	defer sshClient.Close()
+
+	testLogger.Info(t, "SSH connection to the master successful")
+	t.Log("Validation in progress. 
Please wait...") + + // Verify management node configuration + VerifyManagementNodeConfig(t, sshClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, managementNodeIPs, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + + // Restart LSF daemon + RestartLsfDaemon(t, sshClient, jobCommandLow, testLogger) + + // Reboot instance + RebootInstance(t, sshClient, bastionIP, LSF_PUBLIC_HOST_NAME, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0], jobCommandMed, testLogger) + + // Reconnect to the management node after reboot + sshClient, connectionErr = utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, managementNodeIPs[0]) + require.NoError(t, connectionErr, "Failed to reconnect to the master via SSH: %v", connectionErr) + defer sshClient.Close() + + // Wait for dynamic node disappearance and handle potential errors + defer func() { + if err := LSFWaitForDynamicNodeDisappearance(t, sshClient, testLogger); err != nil { + t.Errorf("Error in LSFWaitForDynamicNodeDisappearance: %v", err) + } + }() + + // Get dynamic compute node IPs + computeNodeIPs, computeIPErr := LSFGETDynamicComputeNodeIPs(t, sshClient, testLogger) + require.NoError(t, computeIPErr, "Error getting dynamic compute node IPs: %v", computeIPErr) + + // Verify compute node configuration + VerifyComputeNodeConfig(t, sshClient, expectedHyperthreadingEnabled, computeNodeIPs, testLogger) + + // Verify SSH connectivity from login node + sshLoginNodeClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, bastionIP, LSF_PRIVATE_HOST_NAME, loginNodeIP) + require.NoError(t, connectionErr, "Failed to connect to the login node via SSH: %v", connectionErr) + defer sshLoginNodeClient.Close() + + // Verify login node configuration + VerifyLoginNodeConfig(t, sshLoginNodeClient, expectedClusterID, expectedMasterName, expectedReservationID, expectedHyperthreadingEnabled, loginNodeIP, jobCommandLow, EXPECTED_LSF_VERSION, testLogger) + + // Verify SSH connectivity to nodes from login node + VerifySSHConnectivityToNodesFromLogin(t, sshLoginNodeClient, managementNodeIPs, computeNodeIPs, testLogger) + + // Verify file share encryption + VerifyFileShareEncryption(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(expectedZone), expectedResourceGroup, expectedMasterName, expectedKeyManagement, testLogger) + + // Connect to the LDAP server via SSH + sshLdapClient, connectionErr := utils.ConnectToHost(LSF_PUBLIC_HOST_NAME, ldapServerBastionIP, LSF_LDAP_HOST_NAME, ldapServerIP) + require.NoError(t, connectionErr, "Failed to connect to the LDAP server via SSH: %v", connectionErr) + defer sshLdapClient.Close() + + // Check LDAP server status + CheckLDAPServerStatus(t, sshLdapClient, ldapAdminPassword, expectedLdapDomain, ldapUserName, testLogger) + + // Verify management node LDAP configuration + VerifyManagementNodeLDAPConfig(t, sshClient, bastionIP, ldapServerIP, managementNodeIPs, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + + // Verify compute node LDAP configuration + VerifyComputeNodeLDAPConfig(t, bastionIP, ldapServerIP, computeNodeIPs, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + + // Verify login node LDAP configuration + VerifyLoginNodeLDAPConfig(t, sshLoginNodeClient, bastionIP, loginNodeIP, ldapServerIP, jobCommandLow, expectedLdapDomain, ldapUserName, ldapUserPassword, testLogger) + + // Verify LDAP user creation and LSF actions using the new user + VerifyCreateNewLdapUserAndManagementNodeLDAPConfig(t, 
sshLdapClient, bastionIP, ldapServerIP, managementNodeIPs, jobCommandLow, ldapAdminPassword, expectedLdapDomain, ldapUserName, ldapUserPassword, "user2", testLogger) + + // Log validation end testLogger.Info(t, t.Name()+" Validation ended") } diff --git a/tests/lsf/lsf_cluster_utils.go b/tests/lsf/lsf_cluster_utils.go index 72417d56..8e677239 100644 --- a/tests/lsf/lsf_cluster_utils.go +++ b/tests/lsf/lsf_cluster_utils.go @@ -686,10 +686,9 @@ func runSSHCommandAndGetPaths(sClient *ssh.Client) ([]string, error) { } // LSFCheckSSHKeyForManagementNode checks the SSH key configurations on a management server. -// It retrieves a list of authorized_keys paths, compares them with expected paths, -// and validates the occurrences of SSH keys in each path. -func LSFCheckSSHKeyForManagementNode(t *testing.T, sClient *ssh.Client, logger *utils.AggregatedLogger) error { - // Run a command to get a list of authorized_keys paths +// Validates the number of SSH keys in each authorized_keys file against expected values. +func LSFCheckSSHKeyForManagementNode(t *testing.T, sClient *ssh.Client, numOfKeys int, logger *utils.AggregatedLogger) error { + // Retrieve authorized_keys paths from the management node pathList, err := runSSHCommandAndGetPaths(sClient) if err != nil { return fmt.Errorf("failed to retrieve authorized_keys paths: %w", err) @@ -698,14 +697,10 @@ func LSFCheckSSHKeyForManagementNode(t *testing.T, sClient *ssh.Client, logger * // Log the list of authorized_keys paths logger.Info(t, fmt.Sprintf("List of authorized_keys paths: %q", pathList)) - // Create a map with paths as keys and expected values - filePathMap := map[string]int{ - "/home/vpcuser/.ssh/authorized_keys": 1, - "/home/lsfadmin/.ssh/authorized_keys": 2, - "/root/.ssh/authorized_keys": 2, - } + // Generate expected number of SSH keys for each file path + filePathMap := HPCGenerateFilePathMap(numOfKeys) - // Iterate through the list of paths and check SSH key occurrences + // Verify SSH key occurrences for each path for _, path := range pathList { cmd := fmt.Sprintf("sudo su -l root -c 'cat %s'", path) out, err := utils.RunCommandInSSHSession(sClient, cmd) @@ -713,50 +708,59 @@ func LSFCheckSSHKeyForManagementNode(t *testing.T, sClient *ssh.Client, logger * return fmt.Errorf("failed to run command on %s: %w", path, err) } - // Log information about SSH key occurrences - value := filePathMap[path] - occur := utils.CountStringOccurrences(out, "ssh-rsa ") - logger.Info(t, fmt.Sprintf("Value: %d, Occurrences: %d, Path: %s", value, occur, path)) + // Count occurrences of SSH keys and log information + expectedCount := filePathMap[path] + actualCount := utils.CountStringOccurrences(out, "ssh-rsa ") + logger.Info(t, fmt.Sprintf("Expected: %d, Occurrences: %d, Path: %s", expectedCount, actualCount, path)) - // Check for mismatch in occurrences - if value != occur { - return fmt.Errorf("mismatch in occurrences for path %s: expected %d, got %d", path, value, occur) + // Validate the number of occurrences + if expectedCount != actualCount { + return fmt.Errorf("mismatch in occurrences for path %s: expected %d, got %d", path, expectedCount, actualCount) } } - // Log success for SSH key check + // Log success logger.Info(t, "SSH key check successful") return nil } -// LSFCheckSSHKeyForManagementNodes checks SSH key configurations for each management node in the provided list. -// It ensures the expected paths and occurrences of SSH keys are consistent. 
-func LSFCheckSSHKeyForManagementNodes(t *testing.T, publicHostName, publicHostIP, privateHostName string, managementNodeIPList []string, logger *utils.AggregatedLogger) error { - // Check if the node list is empty +// LSFCheckSSHKeyForManagementNodes verifies SSH key configurations for each management node in the provided list. +// Ensures that the number of keys in authorized_keys files match the expected values. +func LSFCheckSSHKeyForManagementNodes(t *testing.T, publicHostName, publicHostIP, privateHostName string, managementNodeIPList []string, numOfKeys int, logger *utils.AggregatedLogger) error { if len(managementNodeIPList) == 0 { return fmt.Errorf("management node IPs cannot be empty") } for _, mgmtIP := range managementNodeIPList { // Connect to the management node via SSH - mgmtSshClient, connectionErr := utils.ConnectToHost(publicHostName, publicHostIP, privateHostName, mgmtIP) - if connectionErr != nil { - return fmt.Errorf("failed to connect to the management node %s via SSH: %v", mgmtIP, connectionErr) + mgmtSshClient, err := utils.ConnectToHost(publicHostName, publicHostIP, privateHostName, mgmtIP) + if err != nil { + return fmt.Errorf("failed to connect to the management node %s via SSH: %w", mgmtIP, err) } defer mgmtSshClient.Close() logger.Info(t, fmt.Sprintf("SSH connection to the management node %s successful", mgmtIP)) - // SSH key check for management node - sshKeyErr := LSFCheckSSHKeyForManagementNode(t, mgmtSshClient, logger) - if sshKeyErr != nil { - return fmt.Errorf("management node %s SSH key check failed: %v", mgmtIP, sshKeyErr) + // Verify SSH keys for the management node + if err := LSFCheckSSHKeyForManagementNode(t, mgmtSshClient, numOfKeys, logger); err != nil { + return fmt.Errorf("management node %s SSH key check failed: %w", mgmtIP, err) } } return nil } +// HPCGenerateFilePathMap returns a map of authorized_keys paths to their expected +// number of SSH key occurrences based on the number of SSH keys provided (`numKeys`). +// It adjusts the expected values to account for default key counts. +func HPCGenerateFilePathMap(numKeys int) map[string]int { + return map[string]int{ + "/home/vpcuser/.ssh/authorized_keys": numKeys, // Default value plus number of keys + "/home/lsfadmin/.ssh/authorized_keys": numKeys + 1, // Default value plus number of keys + "/root/.ssh/authorized_keys": numKeys + 1, // Default value plus number of keys + } +} + // LSFCheckSSHKeyForComputeNode checks the SSH key configurations on a compute server. // It considers OS variations, retrieves a list of authorized_keys paths, and validates SSH key occurrences. func LSFCheckSSHKeyForComputeNode(t *testing.T, sClient *ssh.Client, computeIP string, logger *utils.AggregatedLogger) error { diff --git a/tests/other_test.go b/tests/other_test.go index 81640ba4..5a0b2ad8 100644 --- a/tests/other_test.go +++ b/tests/other_test.go @@ -21,7 +21,7 @@ const ( createVpcTerraformDir = "examples/create_vpc/solutions/hpc" // Brand new VPC ) -// TestRunBasic validates the cluster configuration and creation of an HPC cluster. +// TestRunBasic validates the cluster configuration. 
func TestRunBasic(t *testing.T) { // Parallelize the test @@ -788,7 +788,7 @@ func TestRunExistingPACEnvironment(t *testing.T) { require.NoError(t, err, "Error parsing JSON configuration: %v", err) // Validate the cluster configuration - lsf.ValidateClusterConfigurationWithAPPCenterForExistingEnv(t, config.BastionIP, config.LoginNodeIP, config.ClusterID, config.ReservationID, config.ClusterPrefixName, config.ResourceGroup, + lsf.ValidateClusterConfigurationWithAPPCenterForExistingEnv(t, 1, config.BastionIP, config.LoginNodeIP, config.ClusterID, config.ReservationID, config.ClusterPrefixName, config.ResourceGroup, config.KeyManagement, config.Zones, config.DnsDomainName, config.ManagementNodeIPList, config.HyperthreadingEnabled, testLogger) } @@ -1523,3 +1523,122 @@ func TestRunInvalidSubnetCIDR(t *testing.T) { // Cleanup resources defer terraform.Destroy(t, terraformOptions) } + +// TestRunMultipleSSHKeys validates the cluster configuration. +func TestRunMultipleSSHKeys(t *testing.T) { + + // Parallelize the test + t.Parallel() + + // Setup test suite + setupTestSuite(t) + + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // HPC cluster prefix + hpcClusterPrefix := utils.GenerateRandomString() + + // Retrieve cluster information from environment variables + envVars := GetEnvVars() + + // Create test options + options, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultResourceGroup, ignoreDestroys) + require.NoError(t, err, "Error setting up test options: %v", err) + + options.SkipTestTearDown = true + defer options.TestTearDown() + + lsf.ValidateClusterConfigurationWithMultipleKeys(t, options, testLogger) + +} + +// TestRunExistingLDAP validates the creation and configuration of HPC clusters with LDAP integration, including setup, validation, and error handling for both clusters. +func TestRunExistingLDAP(t *testing.T) { + // Parallelize the test to run concurrently with others + t.Parallel() + + // Setup the test suite + setupTestSuite(t) + + // Initialize logger + testLogger.Info(t, "Cluster creation process initiated for "+t.Name()) + + // Generate random prefix for HPC cluster + hpcClusterPrefix := utils.GenerateRandomString() + + // Retrieve environment variables + envVars := GetEnvVars() + + // Ensure LDAP is enabled and credentials are provided + if strings.ToLower(envVars.EnableLdap) == "true" { + if len(envVars.LdapAdminPassword) == 0 || len(envVars.LdapUserName) == 0 || len(envVars.LdapUserPassword) == 0 { + require.FailNow(t, "LDAP credentials are missing. Ensure LDAP admin password, LDAP user name, and LDAP user password are provided.") + } + } else { + require.FailNow(t, "LDAP is not enabled. 
Set the 'enable_ldap' environment variable to 'true' to enable LDAP.") + } + + // Create test options for the first cluster + options1, err := setupOptions(t, hpcClusterPrefix, terraformDir, envVars.DefaultResourceGroup, ignoreDestroys) + require.NoError(t, err, "Error setting up test options for the first cluster: %v", err) + + // Set Terraform variables for the first cluster + options1.TerraformVars["enable_ldap"] = strings.ToLower(envVars.EnableLdap) + options1.TerraformVars["ldap_basedns"] = envVars.LdapBaseDns + options1.TerraformVars["ldap_admin_password"] = envVars.LdapAdminPassword // pragma: allowlist secret + options1.TerraformVars["ldap_user_name"] = envVars.LdapUserName + options1.TerraformVars["ldap_user_password"] = envVars.LdapUserPassword // pragma: allowlist secret + options1.TerraformVars["key_management"] = "null" + options1.TerraformVars["management_node_count"] = 1 + + // Skip test teardown for further inspection + options1.SkipTestTearDown = true + defer options1.TestTearDown() + + // Run the test and validate output + output, err := options1.RunTest() + require.NoError(t, err, "Error running test: %v", err) + require.NotNil(t, output, "Expected non-nil output, but got nil") + + // Retrieve custom resolver ID + customResolverID, err := utils.GetCustomResolverID(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultResourceGroup, hpcClusterPrefix, testLogger) + require.NoError(t, err, "Error retrieving custom resolver ID: %v", err) + + // Retrieve LDAP IP and Bastion IP + ldapIP, err := utils.GetLdapIP(t, options1, testLogger) + require.NoError(t, err, "Error retrieving LDAP IP address: %v", err) + + ldapServerBastionIP, err := utils.GetBastionIP(t, options1, testLogger) + require.NoError(t, err, "Error retrieving LDAP server bastion IP address: %v", err) + + // Update security group for LDAP + err = utils.RetrieveAndUpdateSecurityGroup(t, os.Getenv("TF_VAR_ibmcloud_api_key"), utils.GetRegion(envVars.Zone), envVars.DefaultResourceGroup, hpcClusterPrefix, "10.241.0.0/18", "389", "389", testLogger) + require.NoError(t, err, "Error updating security group: %v", err) + + testLogger.Info(t, "Cluster creation process for the second cluster initiated for "+t.Name()) + + // Generate random prefix for the second HPC cluster + hpcClusterPrefix2 := utils.GenerateRandomString() + + // Create test options for the second cluster + options2, err := setupOptions(t, hpcClusterPrefix2, terraformDir, envVars.DefaultResourceGroup, ignoreDestroys) + require.NoError(t, err, "Error setting up test options for the second cluster: %v", err) + + // Set Terraform variables for the second cluster + options2.TerraformVars["vpc_name"] = options1.TerraformVars["cluster_prefix"].(string) + "-hpc-vpc" + options2.TerraformVars["vpc_cluster_private_subnets_cidr_blocks"] = []string{CLUSTER_TWO_VPC_CLUSTER_PRIVATE_SUBNETS_CIDR_BLOCKS} + options2.TerraformVars["vpc_cluster_login_private_subnets_cidr_blocks"] = []string{CLUSTER_TWO_VPC_CLUSTER_LOGIN_PRIVATE_SUBNETS_CIDR_BLOCKS} + options2.TerraformVars["management_node_count"] = 2 + options2.TerraformVars["dns_domain_name"] = map[string]string{"compute": CLUSTER_TWO_DNS_DOMAIN_NAME} + options2.TerraformVars["dns_custom_resolver_id"] = customResolverID + options2.TerraformVars["enable_ldap"] = strings.ToLower(envVars.EnableLdap) + options2.TerraformVars["ldap_basedns"] = envVars.LdapBaseDns + options2.TerraformVars["ldap_server"] = ldapIP + + // Skip test teardown for further inspection + options2.SkipTestTearDown = true + 
defer options2.TestTearDown() + + // Validate LDAP configuration for the second cluster + lsf.ValidateExistingLDAPClusterConfig(t, ldapServerBastionIP, ldapIP, envVars.LdapBaseDns, envVars.LdapAdminPassword, envVars.LdapUserName, envVars.LdapUserPassword, options2, testLogger) +} diff --git a/tests/pr_test.go b/tests/pr_test.go index 94282308..d0a942af 100644 --- a/tests/pr_test.go +++ b/tests/pr_test.go @@ -22,7 +22,6 @@ const ( terraformDir = "solutions/hpc" ) -// Terraform resource names to ignore during consistency checks var ignoreDestroys = []string{ "module.landing_zone_vsi.module.hpc.module.check_cluster_status.null_resource.remote_exec[0]", "module.landing_zone_vsi.module.hpc.module.check_node_status.null_resource.remote_exec[1]", @@ -37,6 +36,9 @@ var ignoreDestroys = []string{ "module.landing_zone_vsi.module.hpc.module.landing_zone_vsi.module.wait_management_vsi_booted.null_resource.remote_exec[0]", "module.landing_zone_vsi.module.do_management_vsi_configuration.null_resource.remote_exec_script_cp_files[1]", "module.landing_zone_vsi.module.do_management_vsi_configuration.null_resource.remote_exec_script_new_file[0]", + "module.landing_zone_vsi.module.do_management_vsi_configuration.null_resource.remote_exec_script_cp_files[0]", + "module.landing_zone_vsi.module.do_management_candidate_vsi_configuration.null_resource.remote_exec_script_new_file[0]", + "module.landing_zone_vsi.module.do_management_candidate_vsi_configuration.null_resource.remote_exec_script_run[0]", } // EnvVars stores environment variable values. diff --git a/tests/test_config.yml b/tests/test_config.yml index cf66852c..d7baf9e3 100644 --- a/tests/test_config.yml +++ b/tests/test_config.yml @@ -34,3 +34,4 @@ us_south_zone: us-south-1 us_south_reservation_id: us_south_cluster_id: HPC-LSF-1 ssh_file_path: /artifacts/.ssh/id_rsa +ssh_file_path_two: /artifacts/.ssh/id_rsa
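
A minimal, standalone Go sketch (illustrative only, not part of this patch) of the expected-count check that the new `numOfKeys` parameter drives. `generateFilePathMap` below mirrors `HPCGenerateFilePathMap`: `/home/vpcuser/.ssh/authorized_keys` is expected to hold only the user-supplied keys, while the `lsfadmin` and `root` files also carry one pre-existing key, hence `numKeys + 1` (this reading follows the patch's own comments). The `observed` contents are hypothetical sample data standing in for the `sudo cat` output that `LSFCheckSSHKeyForManagementNode` reads over SSH.

```go
// Illustrative sketch of the per-path SSH key count comparison, assuming two
// user-supplied keys (numKeys == 2). Not part of the patch; sample data only.
package main

import (
	"fmt"
	"strings"
)

// generateFilePathMap mirrors HPCGenerateFilePathMap from the patch:
// vpcuser holds only the provided keys; lsfadmin and root also hold one
// pre-existing key, so their expected count is numKeys + 1.
func generateFilePathMap(numKeys int) map[string]int {
	return map[string]int{
		"/home/vpcuser/.ssh/authorized_keys":  numKeys,
		"/home/lsfadmin/.ssh/authorized_keys": numKeys + 1,
		"/root/.ssh/authorized_keys":          numKeys + 1,
	}
}

func main() {
	// Hypothetical authorized_keys contents, standing in for the output of
	// `sudo su -l root -c 'cat <path>'` gathered over SSH in the real test.
	observed := map[string]string{
		"/home/vpcuser/.ssh/authorized_keys":  "ssh-rsa AAA...\nssh-rsa BBB...\n",
		"/home/lsfadmin/.ssh/authorized_keys": "ssh-rsa AAA...\nssh-rsa BBB...\nssh-rsa CCC...\n",
		"/root/.ssh/authorized_keys":          "ssh-rsa AAA...\nssh-rsa BBB...\nssh-rsa CCC...\n",
	}

	expected := generateFilePathMap(2)
	for path, want := range expected {
		// Count "ssh-rsa " occurrences, as the test does via CountStringOccurrences.
		got := strings.Count(observed[path], "ssh-rsa ")
		if got != want {
			fmt.Printf("mismatch for %s: expected %d, got %d\n", path, want, got)
			continue
		}
		fmt.Printf("ok: %s has %d key(s)\n", path, got)
	}
}
```

Assuming standard Go tooling, the new scenario can be exercised on its own with something like `go test -v -run TestRunMultipleSSHKeys` from the `tests` directory; exact flags, timeouts, and required environment variables follow the suites documented in `tests/README.md`.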